R分析流程
最后发布时间:2022-11-08 10:03:21
浏览量:
{
library("optparse")
library(TCGAbiolinks)
library(tidyverse)
library(reshape2)
option_list = list(make_option(c("-p", "--project"), type = "character", default = "TCGA-ESCA", help = "TCGA project"),
make_option(c("-w","--workflow.type"), type = "character", default = "HTSeq - Counts", help = "File path to CpG site R data."),
make_option(c("-d","--directory"), type = "character", help = "test"))
args <- parse_args(OptionParser(option_list=option_list))
if(!is.null(args$directory)){
if(!file.exists(args$directory)){
dir.create(file.path(args$directory))
}
setwd(args$directory)
message("切换工作目录到:",args$directory)
}
if(is.null(args$project)){
message("请输入 TCGA Project!!")
q()
}
}
{
IS_DEBUG=T
LOG_FILENAME= "report_download.txt"
IS_EMPTY=F
source("/home/wangyang/workspace/bioinfo_analysis/Rscript/tools/tools.R")
}
####################################################
## 环境准备
###################################################
{
project <- args$project
message("TCGA项目为: ",project)
info("分析的TCGA项目为: ",project)
if(!file.exists("result")){
dir.create(file.path("result"))
}
}
公共函数123446
{
TCGA_query <- function(project,data.type,workflow.type,filename){
if(!file.exists(paste0("result/",filename,".rda"))){
message("重新处理:",paste0("result/",filename,".rda"))
query<- GDCquery(project = project,
data.category = "Transcriptome Profiling",
data.type = data.type,
workflow.type = workflow.type)
saveRDS(query,file = paste0("result/",filename,".rda"))
return(query)
}else{
message("加载本地数据: ",paste0("result/",filename,".rda"))
return(readRDS(paste0("result/",filename,".rda")))
}
}
#### 获得metadata信息
TCAG_prepare <- function(query,filename){
if(!file.exists(paste0("result/",filename,".rda"))){
message("重新处理:",paste0("result/",filename,".rda"))
data <- GDCprepare(query = query,
summarizedExperiment = F,
save = F)
saveRDS(data,file =paste0("result/",filename,".rda"))
return(data)
}else{
message("加载本地数据: ",paste0("result/",filename,".rda"))
return(readRDS(paste0("result/",filename,".rda")))
}
}
DESeq2_Analysis <- function(expr,filename,metadata){
if(!file.exists(paste0("result/",filename,".rda"))){
## 差异基因表达分析
library(DESeq2)
if(!identical(colnames(expr),rownames(metadata))){
message("expr的列名与metadata行名不一致!!")
message("expr 列数:",dim(expr)[2])
message("metadata行数:",dim(metadata)[1])
expr <- expr[,rownames(metadata)]
}
metadata$group <- factor(metadata$group)
dds <- DESeqDataSetFromMatrix(countData = expr,
colData = metadata,
design = ~ group)
## PCA图
#vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
#vsd_df <- assay(vsd)
# pdf(file = paste0("result/",filename,".pdf"))
#DESeq2::plotPCA(vsd, intgroup = "group")
#dev.off()
#message("写入图片:",paste0("result/",filename,".pdf"))
keep <- rowSums(counts(dds) > 0) >= min(table(metadata$group))
dds_filt <- dds[keep, ]
dds2 <- DESeq(dds_filt, parallel = T)
message(resultsNames(dds2))
res <- results(dds2)
deg <- as.data.frame(res)
message("写入文件:",paste0("result/",filename,".rda")," ",paste0("result/",filename,".csv"))
saveRDS(deg,file =paste0("result/",filename,".rda"))
write.csv(deg,file=paste0("result/",filename,".csv"),quote = F,row.names = F)
return(deg)
}else{
message("加载本地数据: ",paste0("result/",filename,".rda"))
return(readRDS(paste0("result/",filename,".rda")))
}
}
TCGA_metadata <- function(path,filename) {
metadata <- jsonlite::read_json(path, simplifyVector = T)
metadata <- tibble::tibble(
file_name = metadata$file_name,
md5sum = metadata$md5sum,
cases = bind_rows(metadata$associated_entities)$entity_submitter_id,
TCGA_id_full = cases,
TCGA_id = stringr::str_sub(cases, 1, 16),
patient_id = stringr::str_sub(TCGA_id, 1, 12),
tissue_type_id = stringr::str_sub(TCGA_id, 14, 15),
tissue_type = sapply(tissue_type_id, function(x) {
switch(x,
"01" = "Primary Solid Tumor",
"02" = "Recurrent Solid Tumor",
"03" = "Primary Blood Derived Cancer - Peripheral Blood",
"05" = "Additional - New Primary",
"06" = "Metastatic",
"07" = "Additional Metastatic",
"11" = "Solid Tissue Normal")}),
group = ifelse(tissue_type_id == "11", "Normal", "Tumor"))%>%
column_to_rownames("TCGA_id_full")
saveRDS(metadata,file =paste0("result/",filename,".rda"))
write.csv(metadata,file = paste0("result/",filename,".csv"),quote = F)
return(metadata)
}
get_metadata <- function(metadata,filename){
message("重新处理:",paste0("result/",filename,".rda"))
metadata <- tibble::tibble(
TCGA_id_full = metadata$cases,
cases=metadata$cases,
TCGA_id = stringr::str_sub(TCGA_id_full, 1, 16),
patient_id = stringr::str_sub(TCGA_id, 1, 12),
tissue_type_id = stringr::str_sub(TCGA_id, 14, 15),
tissue_type = sapply(tissue_type_id, function(x) {
switch(x,
"01" = "Primary Solid Tumor",
"02" = "Recurrent Solid Tumor",
"03" = "Primary Blood Derived Cancer - Peripheral Blood",
"05" = "Additional - New Primary",
"06" = "Metastatic",
"07" = "Additional Metastatic",
"11" = "Solid Tissue Normal")}),
group = ifelse(tissue_type_id == "11", "Normal", "Tumor"))%>%
column_to_rownames("TCGA_id_full")
saveRDS(metadata,file =paste0("result/",filename,".rda"))
write.csv(metadata,file = paste0("result/",filename,".csv"),quote = F)
return(metadata)
}
}
####################################################
TCGA 数据下载
###################################################
#### 下载临床数据
(function(){
if(!file.exists("result/clinical.rda")){
clinical <- GDCquery_clinic(project = project, type = "clinical")%>%
mutate(futime=ifelse(vital_status=='Alive',days_to_last_follow_up,days_to_death),
fustat = factor(vital_status,levels = c("Alive","Dead"),labels = c(0,1)),
fustat=as.numeric(as.character(fustat)),
futime_year=futime/365,
patient_id=submitter_id,
tumor_stage=case_when(tumor_stage=="stage i"~"i",
grepl("stage i[^i ^v]+",tumor_stage)~"i",
tumor_stage=="stage ii"~"ii",
grepl("stage ii[^i]+",tumor_stage)~"ii",
grepl("stage iii",tumor_stage)~"iii",
grepl("stage iv",tumor_stage)~"iv",
tumor_stage=="not reported"~"unknow"
))%>%
dplyr::select(c("patient_id","futime","fustat","futime_year","tumor_stage","tumor_stage"))
saveRDS(clinical,file = "result/clinical.rda")
}else{
clinical <- readRDS("result/clinical.rda")
}
info("临床资料有 ",length(unique(clinical$patient_id))," 个样本, 矩阵的大小为", paste0(dim(clinical),collapse = ","))
message("加载本地数据result/clinical.rda")
})()
#### 下载基因数据
(function(){
if(T){
clinical <- readRDS("result/clinical.rda")
query <- TCGA_query(project=project,data.type="Gene Expression Quantification",workflow.type="HTSeq - Counts",filename = "mRNA_query")
info("TCGA Query Project: ",query$project[[1]])
info("TCGA data.category: ",query$data.category)
info("TCGA data.type: ",query$data.type)
info("TCGA workflow.type: ",query$workflow.type)
metadata <- query[[1]][[1]]%>%
get_metadata("gene_metadata")
}else{
metadata <- TCGA_metadata("data/metadata.cart.2021-05-20.json","gene_metadata")
query<- GDCquery(project = project,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts",
barcode = c(metadata$cases))
info("TCGA Query Project: ",query$project[[1]])
info("TCGA data.category: ",query$data.category)
info("TCGA data.type: ",query$data.type)
info("TCGA workflow.type: ",query$workflow.type)
}
if(T){
metadata_sample <- metadata%>%
dplyr::select("cases")%>%
mutate(sample= substr(cases,14,15))
info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ","))
info("TCGA 样本信息[01 原发实体瘤, 11正常组织]: ",paste0(names(table(metadata_sample$sample)),collapse = ","),"对应数量",paste0(table(metadata_sample$sample),collapse = ","))
}
mRNA_lnRNA_clinical <- clinical%>%
filter(patient_id %in% metadata$patient_id)
info("在metadata中包含临床资料的有 ",length(unique(mRNA_lnRNA_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(mRNA_lnRNA_clinical),collapse = ","))
saveRDS(mRNA_lnRNA_clinical,file = "result/mRNA_lnRNA_clinical.rda")
GDCdownload(query)
data <- TCAG_prepare(query = query,filename = "mRNA_TCAG_prepare")%>%
column_to_rownames("X1")
data <- data[,rownames(metadata)]
info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","),
"表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data)))
})()
#### 下载miRNA数据
(function(){
clinical <- readRDS("result/clinical.rda")
query <- TCGA_query(project=project,data.type="miRNA Expression Quantification",workflow.type="BCGSC miRNA Profiling",filename = "miRNA_query")
info("TCGA Query Project: ",query$project[[1]])
info("TCGA data.category: ",query$data.category)
info("TCGA data.type: ",query$data.type)
info("TCGA workflow.type: ",query$workflow.type)
metadata <- query[[1]][[1]]
metadata <- get_metadata(metadata,"miRNA_metadata")
info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ","))
miRNA_clinical <- clinical <- clinical%>%
filter(patient_id %in% metadata$patient_id)
info("在metadata中包含临床资料的有 ",length(unique(miRNA_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(miRNA_clinical),collapse = ","))
saveRDS(miRNA_clinical,file = "result/miRNA_clinical.rda")
GDCdownload(query)
data <- TCAG_prepare(query = query,filename = "miRNA_TCAG_prepare")%>%
dplyr::select("miRNA_ID",starts_with("read_count"))%>%
rename_at(vars(contains("read_count")), ~ substr(.,12,length(.)))%>%
column_to_rownames("miRNA_ID")
data <- data[,rownames(metadata)]
saveRDS(data,file ="result/miRNA_expr.rda")
info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","),
"表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data)))
# sum(is.na(expr_miRNA))
})()
#### 下载miRNA前体数据
(function(){
clinical <- readRDS("result/clinical.rda")
query <- TCGA_query(project=project,data.type="Isoform Expression Quantification",workflow.type="BCGSC miRNA Profiling",filename = "miRNA_iso_query")
info("TCGA Query Project: ",query$project[[1]])
info("TCGA data.category: ",query$data.category)
info("TCGA data.type: ",query$data.type)
info("TCGA workflow.type: ",query$workflow.type)
metadata <- query[[1]][[1]]
metadata <- get_metadata(metadata,"miRNA_iso_metadata")
info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ","))
miRNA_iso_clinical <- clinical <- clinical%>%
filter(patient_id %in% metadata$patient_id)
saveRDS(miRNA_iso_clinical,file = "result/miRNA_iso_clinical.rda")
info("在metadata中包含临床资料的有 ",length(unique(miRNA_iso_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(miRNA_iso_clinical),collapse = ","))
# 下载数据
GDCdownload(query)
miRNA_anno <- read_lines("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/mature.fa.gz") %>%
as_tibble() %>%
dplyr::filter(str_detect(value, pattern = "Homo")) %>%
tidyr::separate(col = value, into = paste0("ID", 1:5), sep = " ") %>%
dplyr::select(isoform_id = ID2, miRNA_name = ID5)
data <- TCAG_prepare(query = query,filename = "miRNA_ios_TCAG_prepare")%>%
dplyr::select(miRNA_region,barcode,read_count)%>%
filter(grepl("mature",miRNA_region))%>%
dcast(miRNA_region~barcode,value.var = "read_count")%>%
plyr::rename(c(miRNA_region="isoform_id"))%>%
mutate(isoform_id=substr(isoform_id,8,length(.)))%>%
inner_join(miRNA_anno,by="isoform_id")%>%
column_to_rownames("miRNA_name")%>%
dplyr::select(-isoform_id)
data <- data[,rownames(metadata)]
saveRDS(data,file ="result/miRNA_iso_expr.rda")
info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","),
"表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data)))
})()
####################################################
TCGA 数据处理
###################################################
#### 获取注释文件
(function(){
if(!file.exists("result/gff_v22.rda")){
gff_v22 <- read_tsv("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/gencode.gene.info.v22.tsv")
saveRDS(gff_v22,"result/gff_v22.rda")
}
})()
#### 注释基因数据
(function(){
gff_v22 <- readRDS("result/gff_v22.rda")
data <- readRDS("result/mRNA_TCAG_prepare.rda")
id2symbol <- gff_v22 %>%
dplyr::select(1, 2)
expr_data <- data[1:(nrow(data)-5),]%>%
plyr::rename(c(X1="gene_id"))%>%
inner_join(id2symbol,by = "gene_id")
saveRDS(expr_data,file = "result/expr_data.rda")
})()
#### 提取mRNA和lnRNA
(function(){
expr_data <- readRDS("result/expr_data.rda")
gff_v22 <- readRDS("result/gff_v22.rda")
lncRNA_types <- "3prime_overlapping_ncrna, antisense, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping"
lncRNA_types <- unlist(str_split(lncRNA_types, pattern = ", "))
gene_type <- gff_v22 %>%
dplyr::select(gene_id, gene_type) %>%
arrange(gene_id)
identical(gene_type$gene_id,expr_data$gene_id)
expr_mRNA <- expr_data[gene_type$gene_type=="protein_coding",]%>%
{.[!duplicated(.$gene_name),]}%>%
column_to_rownames("gene_name")%>%
dplyr::select(-gene_id)
png(filename = "test.png")
boxplot(log2(expr_mRNA+1))
dev.off()
write.csv(expr_mRNA,file = "result/mRNA_expr.csv",quote = F)
saveRDS(expr_mRNA,file ="result/mRNA_expr.rda")
expr_lncRNA <- expr_data[gene_type$gene_type %in% lncRNA_types,]%>%
{.[!duplicated(.$gene_name),]}%>%
column_to_rownames("gene_name")%>%
dplyr::select(-gene_id)
write.csv(expr_lncRNA,file = "result/lncRNA_expr.csv",quote = F)
saveRDS(expr_lncRNA,file ="result/lncRNA_expr.rda")
})()
鉴定差异表达
(function(){
expr_mRNA<- readRDS("result/mRNA_expr.rda")
#boxplot(log2(expr_mRNA+1))
expr_lncRNA<- readRDS("result/lncRNA_expr.rda")
data_expr_miRNA<- readRDS("result/miRNA_expr.rda")
data_expr_ios_miRNA <- readRDS("result/miRNA_iso_expr.rda")
gene_metadata <- readRDS("result/gene_metadata.rda")
miRNA_metadata <- readRDS("result/miRNA_metadata.rda")
miRNA_iso_metadata <- readRDS("result/miRNA_iso_metadata.rda")
if(F){
paired_metadata <- gene_metadata%>%
filter(tissue_type_id %in% c("01","11"))%>%
add_count(patient_id,name = "n_patient")%>%
filter(n_patient==2)%>%
arrange(patient_id,tissue_type_id)
paired_count<- expr_mRNA%>%
dplyr::select(paired_metadata$cases)
group <- paired_metadata$group%>%as.factor()
colData <- data.frame(sample_Id = colnames(paired_count),group=group)
dds <- DESeqDataSetFromMatrix(countData =paired_count,
colData = paired_metadata,
design = ~ group)
vst <- vst(dds)
count_vst <- assay(vst)
boxplot(log2(paired_count+1),col=group)
limma::plotDensities(log2(paired_count+1),legend=F)
boxplot(count_vst,col=group)
limma::plotDensities(count_vst,legend=F)
pivot_longer(as.data.frame(count_vst[1:2,]),cols=everything(),names_to = "cases")
data.frame(value=as.vector(t( count_vst["TSPAN6",])),cases = colnames(count_vst))%>%
left_join(paired_metadata%>%select("cases","group","patient_id"),by="cases")%>%
ggplot(aes(x=group,y=value,fill=group))+
geom_boxplot(outlier.shape = NA)+
geom_point(size=1, alpha=0.5)+
geom_line(aes(group=patient_id), colour="black", linetype="11")
gene <- data.frame(value=as.vector(t( count_vst["TSPAN6",])),cases = colnames(count_vst))%>%
left_join(paired_metadata%>%select("cases","group","patient_id"),by="cases")%>%
select(-cases)
spread(gene,group,value)%>%
filter(Normal<Tumor)
}
if(F){
identical(rownames(gene_metadata),colnames(expr_mRNA))
group <- gene_metadata$group
data.frame(gene=as.vector(t(expr_mRNA[1,])),group=group,row.names = rownames(gene_metadata))%>%
write.csv(file = "expr_mRNA_group.csv",quote = F)
df <- read.csv("expr_mRNA_group.csv",row.names = 1)
boxplot(log2(gene)~group,data=df,col=c("green","red"))
## 计算p
wilcoxTest <- wilcox.test(gene~group,data=df)
pValue <- wilcoxTest$p.value
conGeneMeans <- mean( subset(df,group=="Normal")$gene)
treatGeneMeans <- mean( subset(df,group=="Tumor")$gene)
## 计算logFC
logFC <- log2(treatGeneMeans/conGeneMeans)
}
if(F){
library(DESeq2)
gene_metadata$group <- factor(gene_metadata$group)
dds <- DESeqDataSetFromMatrix(countData = expr_mRNA[1,],
colData = gene_metadata,
design = ~ group)
## PCA图
vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
boxplot(log2(expr_mRNA[,1:10]+1))
boxplot(assay(vsd))
dds2 <- DESeq(dds, parallel = T)
message(resultsNames(dds2))
res <- results(dds2)
deg <- as.data.frame(res)
}
mRNA_deg<- DESeq2_Analysis(expr_mRNA,"mRNA_expr_DESeq",gene_metadata)
info("鉴定差异mRNA个数: ",length(unique(rownames(mRNA_deg))))
lnRNA_deg <- DESeq2_Analysis(expr_lncRNA,"lnRNA_expr_DESeq",gene_metadata)
info("鉴定差异lnRNA个数: ",length(unique(rownames(lnRNA_deg))))
miRNA_deg <- DESeq2_Analysis(data_expr_miRNA,"miRNA_expr_DESeq",miRNA_metadata)
info("鉴定差异miRNA个数: ",length(unique(rownames(miRNA_deg))))
miRNA_deg_iso <- DESeq2_Analysis(data_expr_ios_miRNA,"miRNA_iso_expr_DESeq",miRNA_iso_metadata)
info("鉴定差异miRNA_iso个数: ",length(unique(rownames(miRNA_deg_iso))))
})()
########################备注########################################
#GDC_projects <- getGDCprojects()
#write.csv(GDC_projects[[1]],file = "data/GDC_projects.csv",quote = F,row.names = F)
#GDC_projects$project_id[grepl("TCGA",GDC_projects$project_id)]
#sort(table(gff_v22$gene_type),decreasing = T)
#sort(unique(gff_v22$gene_type))
#table(lncRNA_types %in% unique(gff_v22$gene_type))
if(F){
metadata$group <- factor(metadata$group,levels = c("Normal", "Tumor"))
#--test
library(edgeR)
library(limma)
y <- DGEList(counts = data_expr_miRNA,group = metadata$group)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y,pair = levels(metadata$group))
gene1 <- decideTestsDGE(et,p.value = 0.01,lfc=2)
summary(gene1)
ordered_targs <- topTags(et,n=1000)
allDEG <- ordered_targs$table
diff_signif <- allDEG%>%
filter(PValue<0.01,abs(logFC)>2)%>%
arrange(logFC)
newData <- y$pseudo.counts
heatmapData <- newData[rownames(diff_signif),]
hmExp <- log2(heatmapData + 0.01)
library(gplots)
hmMat <- as.matrix(hmExp)
#par(oma=c(10,3,3,7))
heatmap.2(hmMat,col = "bluered",trace = "none")
library(pheatmap)
pheatmap(hmMat)
}