{ library("optparse") library(TCGAbiolinks) library(tidyverse) library(reshape2) option_list = list(make_option(c("-p", "--project"), type = "character", default = "TCGA-ESCA", help = "TCGA project"), make_option(c("-w","--workflow.type"), type = "character", default = "HTSeq - Counts", help = "File path to CpG site R data."), make_option(c("-d","--directory"), type = "character", help = "test")) args <- parse_args(OptionParser(option_list=option_list)) if(!is.null(args$directory)){ if(!file.exists(args$directory)){ dir.create(file.path(args$directory)) } setwd(args$directory) message("切换工作目录到:",args$directory) } if(is.null(args$project)){ message("请输入 TCGA Project!!") q() } } { IS_DEBUG=T LOG_FILENAME= "report_download.txt" IS_EMPTY=F source("/home/wangyang/workspace/bioinfo_analysis/Rscript/tools/tools.R") } #################################################### ## 环境准备 ################################################### { project <- args$project message("TCGA项目为: ",project) info("分析的TCGA项目为: ",project) if(!file.exists("result")){ dir.create(file.path("result")) } }
{ TCGA_query <- function(project,data.type,workflow.type,filename){ if(!file.exists(paste0("result/",filename,".rda"))){ message("重新处理:",paste0("result/",filename,".rda")) query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = data.type, workflow.type = workflow.type) saveRDS(query,file = paste0("result/",filename,".rda")) return(query) }else{ message("加载本地数据: ",paste0("result/",filename,".rda")) return(readRDS(paste0("result/",filename,".rda"))) } } #### 获得metadata信息 TCAG_prepare <- function(query,filename){ if(!file.exists(paste0("result/",filename,".rda"))){ message("重新处理:",paste0("result/",filename,".rda")) data <- GDCprepare(query = query, summarizedExperiment = F, save = F) saveRDS(data,file =paste0("result/",filename,".rda")) return(data) }else{ message("加载本地数据: ",paste0("result/",filename,".rda")) return(readRDS(paste0("result/",filename,".rda"))) } } DESeq2_Analysis <- function(expr,filename,metadata){ if(!file.exists(paste0("result/",filename,".rda"))){ ## 差异基因表达分析 library(DESeq2) if(!identical(colnames(expr),rownames(metadata))){ message("expr的列名与metadata行名不一致!!") message("expr 列数:",dim(expr)[2]) message("metadata行数:",dim(metadata)[1]) expr <- expr[,rownames(metadata)] } metadata$group <- factor(metadata$group) dds <- DESeqDataSetFromMatrix(countData = expr, colData = metadata, design = ~ group) ## PCA图 #vsd <- varianceStabilizingTransformation(dds, blind = TRUE) #vsd_df <- assay(vsd) # pdf(file = paste0("result/",filename,".pdf")) #DESeq2::plotPCA(vsd, intgroup = "group") #dev.off() #message("写入图片:",paste0("result/",filename,".pdf")) keep <- rowSums(counts(dds) > 0) >= min(table(metadata$group)) dds_filt <- dds[keep, ] dds2 <- DESeq(dds_filt, parallel = T) message(resultsNames(dds2)) res <- results(dds2) deg <- as.data.frame(res) message("写入文件:",paste0("result/",filename,".rda")," ",paste0("result/",filename,".csv")) saveRDS(deg,file =paste0("result/",filename,".rda")) write.csv(deg,file=paste0("result/",filename,".csv"),quote = F,row.names = F) return(deg) }else{ message("加载本地数据: ",paste0("result/",filename,".rda")) return(readRDS(paste0("result/",filename,".rda"))) } } TCGA_metadata <- function(path,filename) { metadata <- jsonlite::read_json(path, simplifyVector = T) metadata <- tibble::tibble( file_name = metadata$file_name, md5sum = metadata$md5sum, cases = bind_rows(metadata$associated_entities)$entity_submitter_id, TCGA_id_full = cases, TCGA_id = stringr::str_sub(cases, 1, 16), patient_id = stringr::str_sub(TCGA_id, 1, 12), tissue_type_id = stringr::str_sub(TCGA_id, 14, 15), tissue_type = sapply(tissue_type_id, function(x) { switch(x, "01" = "Primary Solid Tumor", "02" = "Recurrent Solid Tumor", "03" = "Primary Blood Derived Cancer - Peripheral Blood", "05" = "Additional - New Primary", "06" = "Metastatic", "07" = "Additional Metastatic", "11" = "Solid Tissue Normal")}), group = ifelse(tissue_type_id == "11", "Normal", "Tumor"))%>% column_to_rownames("TCGA_id_full") saveRDS(metadata,file =paste0("result/",filename,".rda")) write.csv(metadata,file = paste0("result/",filename,".csv"),quote = F) return(metadata) } get_metadata <- function(metadata,filename){ message("重新处理:",paste0("result/",filename,".rda")) metadata <- tibble::tibble( TCGA_id_full = metadata$cases, cases=metadata$cases, TCGA_id = stringr::str_sub(TCGA_id_full, 1, 16), patient_id = stringr::str_sub(TCGA_id, 1, 12), tissue_type_id = stringr::str_sub(TCGA_id, 14, 15), tissue_type = sapply(tissue_type_id, function(x) { switch(x, "01" = "Primary Solid Tumor", "02" = "Recurrent Solid Tumor", "03" = "Primary Blood Derived Cancer - Peripheral Blood", "05" = "Additional - New Primary", "06" = "Metastatic", "07" = "Additional Metastatic", "11" = "Solid Tissue Normal")}), group = ifelse(tissue_type_id == "11", "Normal", "Tumor"))%>% column_to_rownames("TCGA_id_full") saveRDS(metadata,file =paste0("result/",filename,".rda")) write.csv(metadata,file = paste0("result/",filename,".csv"),quote = F) return(metadata) } }
####################################################
###################################################
#### 下载临床数据 (function(){ if(!file.exists("result/clinical.rda")){ clinical <- GDCquery_clinic(project = project, type = "clinical")%>% mutate(futime=ifelse(vital_status=='Alive',days_to_last_follow_up,days_to_death), fustat = factor(vital_status,levels = c("Alive","Dead"),labels = c(0,1)), fustat=as.numeric(as.character(fustat)), futime_year=futime/365, patient_id=submitter_id, tumor_stage=case_when(tumor_stage=="stage i"~"i", grepl("stage i[^i ^v]+",tumor_stage)~"i", tumor_stage=="stage ii"~"ii", grepl("stage ii[^i]+",tumor_stage)~"ii", grepl("stage iii",tumor_stage)~"iii", grepl("stage iv",tumor_stage)~"iv", tumor_stage=="not reported"~"unknow" ))%>% dplyr::select(c("patient_id","futime","fustat","futime_year","tumor_stage","tumor_stage")) saveRDS(clinical,file = "result/clinical.rda") }else{ clinical <- readRDS("result/clinical.rda") } info("临床资料有 ",length(unique(clinical$patient_id))," 个样本, 矩阵的大小为", paste0(dim(clinical),collapse = ",")) message("加载本地数据result/clinical.rda") })() #### 下载基因数据 (function(){ if(T){ clinical <- readRDS("result/clinical.rda") query <- TCGA_query(project=project,data.type="Gene Expression Quantification",workflow.type="HTSeq - Counts",filename = "mRNA_query") info("TCGA Query Project: ",query$project[[1]]) info("TCGA data.category: ",query$data.category) info("TCGA data.type: ",query$data.type) info("TCGA workflow.type: ",query$workflow.type) metadata <- query[[1]][[1]]%>% get_metadata("gene_metadata") }else{ metadata <- TCGA_metadata("data/metadata.cart.2021-05-20.json","gene_metadata") query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts", barcode = c(metadata$cases)) info("TCGA Query Project: ",query$project[[1]]) info("TCGA data.category: ",query$data.category) info("TCGA data.type: ",query$data.type) info("TCGA workflow.type: ",query$workflow.type) } if(T){ metadata_sample <- metadata%>% dplyr::select("cases")%>% mutate(sample= substr(cases,14,15)) info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ",")) info("TCGA 样本信息[01 原发实体瘤, 11正常组织]: ",paste0(names(table(metadata_sample$sample)),collapse = ","),"对应数量",paste0(table(metadata_sample$sample),collapse = ",")) } mRNA_lnRNA_clinical <- clinical%>% filter(patient_id %in% metadata$patient_id) info("在metadata中包含临床资料的有 ",length(unique(mRNA_lnRNA_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(mRNA_lnRNA_clinical),collapse = ",")) saveRDS(mRNA_lnRNA_clinical,file = "result/mRNA_lnRNA_clinical.rda") GDCdownload(query) data <- TCAG_prepare(query = query,filename = "mRNA_TCAG_prepare")%>% column_to_rownames("X1") data <- data[,rownames(metadata)] info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","), "表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data))) })() #### 下载miRNA数据 (function(){ clinical <- readRDS("result/clinical.rda") query <- TCGA_query(project=project,data.type="miRNA Expression Quantification",workflow.type="BCGSC miRNA Profiling",filename = "miRNA_query") info("TCGA Query Project: ",query$project[[1]]) info("TCGA data.category: ",query$data.category) info("TCGA data.type: ",query$data.type) info("TCGA workflow.type: ",query$workflow.type) metadata <- query[[1]][[1]] metadata <- get_metadata(metadata,"miRNA_metadata") info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ",")) miRNA_clinical <- clinical <- clinical%>% filter(patient_id %in% metadata$patient_id) info("在metadata中包含临床资料的有 ",length(unique(miRNA_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(miRNA_clinical),collapse = ",")) saveRDS(miRNA_clinical,file = "result/miRNA_clinical.rda") GDCdownload(query) data <- TCAG_prepare(query = query,filename = "miRNA_TCAG_prepare")%>% dplyr::select("miRNA_ID",starts_with("read_count"))%>% rename_at(vars(contains("read_count")), ~ substr(.,12,length(.)))%>% column_to_rownames("miRNA_ID") data <- data[,rownames(metadata)] saveRDS(data,file ="result/miRNA_expr.rda") info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","), "表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data))) # sum(is.na(expr_miRNA)) })() #### 下载miRNA前体数据 (function(){ clinical <- readRDS("result/clinical.rda") query <- TCGA_query(project=project,data.type="Isoform Expression Quantification",workflow.type="BCGSC miRNA Profiling",filename = "miRNA_iso_query") info("TCGA Query Project: ",query$project[[1]]) info("TCGA data.category: ",query$data.category) info("TCGA data.type: ",query$data.type) info("TCGA workflow.type: ",query$workflow.type) metadata <- query[[1]][[1]] metadata <- get_metadata(metadata,"miRNA_iso_metadata") info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ",")) miRNA_iso_clinical <- clinical <- clinical%>% filter(patient_id %in% metadata$patient_id) saveRDS(miRNA_iso_clinical,file = "result/miRNA_iso_clinical.rda") info("在metadata中包含临床资料的有 ",length(unique(miRNA_iso_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(miRNA_iso_clinical),collapse = ",")) # 下载数据 GDCdownload(query) miRNA_anno <- read_lines("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/mature.fa.gz") %>% as_tibble() %>% dplyr::filter(str_detect(value, pattern = "Homo")) %>% tidyr::separate(col = value, into = paste0("ID", 1:5), sep = " ") %>% dplyr::select(isoform_id = ID2, miRNA_name = ID5) data <- TCAG_prepare(query = query,filename = "miRNA_ios_TCAG_prepare")%>% dplyr::select(miRNA_region,barcode,read_count)%>% filter(grepl("mature",miRNA_region))%>% dcast(miRNA_region~barcode,value.var = "read_count")%>% plyr::rename(c(miRNA_region="isoform_id"))%>% mutate(isoform_id=substr(isoform_id,8,length(.)))%>% inner_join(miRNA_anno,by="isoform_id")%>% column_to_rownames("miRNA_name")%>% dplyr::select(-isoform_id) data <- data[,rownames(metadata)] saveRDS(data,file ="result/miRNA_iso_expr.rda") info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","), "表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data))) })()
#### 获取注释文件 (function(){ if(!file.exists("result/gff_v22.rda")){ gff_v22 <- read_tsv("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/gencode.gene.info.v22.tsv") saveRDS(gff_v22,"result/gff_v22.rda") } })() #### 注释基因数据 (function(){ gff_v22 <- readRDS("result/gff_v22.rda") data <- readRDS("result/mRNA_TCAG_prepare.rda") id2symbol <- gff_v22 %>% dplyr::select(1, 2) expr_data <- data[1:(nrow(data)-5),]%>% plyr::rename(c(X1="gene_id"))%>% inner_join(id2symbol,by = "gene_id") saveRDS(expr_data,file = "result/expr_data.rda") })() #### 提取mRNA和lnRNA (function(){ expr_data <- readRDS("result/expr_data.rda") gff_v22 <- readRDS("result/gff_v22.rda") lncRNA_types <- "3prime_overlapping_ncrna, antisense, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping" lncRNA_types <- unlist(str_split(lncRNA_types, pattern = ", ")) gene_type <- gff_v22 %>% dplyr::select(gene_id, gene_type) %>% arrange(gene_id) identical(gene_type$gene_id,expr_data$gene_id) expr_mRNA <- expr_data[gene_type$gene_type=="protein_coding",]%>% {.[!duplicated(.$gene_name),]}%>% column_to_rownames("gene_name")%>% dplyr::select(-gene_id) png(filename = "test.png") boxplot(log2(expr_mRNA+1)) dev.off() write.csv(expr_mRNA,file = "result/mRNA_expr.csv",quote = F) saveRDS(expr_mRNA,file ="result/mRNA_expr.rda") expr_lncRNA <- expr_data[gene_type$gene_type %in% lncRNA_types,]%>% {.[!duplicated(.$gene_name),]}%>% column_to_rownames("gene_name")%>% dplyr::select(-gene_id) write.csv(expr_lncRNA,file = "result/lncRNA_expr.csv",quote = F) saveRDS(expr_lncRNA,file ="result/lncRNA_expr.rda") })()
(function(){ expr_mRNA<- readRDS("result/mRNA_expr.rda") #boxplot(log2(expr_mRNA+1)) expr_lncRNA<- readRDS("result/lncRNA_expr.rda") data_expr_miRNA<- readRDS("result/miRNA_expr.rda") data_expr_ios_miRNA <- readRDS("result/miRNA_iso_expr.rda") gene_metadata <- readRDS("result/gene_metadata.rda") miRNA_metadata <- readRDS("result/miRNA_metadata.rda") miRNA_iso_metadata <- readRDS("result/miRNA_iso_metadata.rda") if(F){ paired_metadata <- gene_metadata%>% filter(tissue_type_id %in% c("01","11"))%>% add_count(patient_id,name = "n_patient")%>% filter(n_patient==2)%>% arrange(patient_id,tissue_type_id) paired_count<- expr_mRNA%>% dplyr::select(paired_metadata$cases) group <- paired_metadata$group%>%as.factor() colData <- data.frame(sample_Id = colnames(paired_count),group=group) dds <- DESeqDataSetFromMatrix(countData =paired_count, colData = paired_metadata, design = ~ group) vst <- vst(dds) count_vst <- assay(vst) boxplot(log2(paired_count+1),col=group) limma::plotDensities(log2(paired_count+1),legend=F) boxplot(count_vst,col=group) limma::plotDensities(count_vst,legend=F) pivot_longer(as.data.frame(count_vst[1:2,]),cols=everything(),names_to = "cases") data.frame(value=as.vector(t( count_vst["TSPAN6",])),cases = colnames(count_vst))%>% left_join(paired_metadata%>%select("cases","group","patient_id"),by="cases")%>% ggplot(aes(x=group,y=value,fill=group))+ geom_boxplot(outlier.shape = NA)+ geom_point(size=1, alpha=0.5)+ geom_line(aes(group=patient_id), colour="black", linetype="11") gene <- data.frame(value=as.vector(t( count_vst["TSPAN6",])),cases = colnames(count_vst))%>% left_join(paired_metadata%>%select("cases","group","patient_id"),by="cases")%>% select(-cases) spread(gene,group,value)%>% filter(Normal<Tumor) } if(F){ identical(rownames(gene_metadata),colnames(expr_mRNA)) group <- gene_metadata$group data.frame(gene=as.vector(t(expr_mRNA[1,])),group=group,row.names = rownames(gene_metadata))%>% write.csv(file = "expr_mRNA_group.csv",quote = F) df <- read.csv("expr_mRNA_group.csv",row.names = 1) boxplot(log2(gene)~group,data=df,col=c("green","red")) ## 计算p wilcoxTest <- wilcox.test(gene~group,data=df) pValue <- wilcoxTest$p.value conGeneMeans <- mean( subset(df,group=="Normal")$gene) treatGeneMeans <- mean( subset(df,group=="Tumor")$gene) ## 计算logFC logFC <- log2(treatGeneMeans/conGeneMeans) } if(F){ library(DESeq2) gene_metadata$group <- factor(gene_metadata$group) dds <- DESeqDataSetFromMatrix(countData = expr_mRNA[1,], colData = gene_metadata, design = ~ group) ## PCA图 vsd <- varianceStabilizingTransformation(dds, blind = TRUE) boxplot(log2(expr_mRNA[,1:10]+1)) boxplot(assay(vsd)) dds2 <- DESeq(dds, parallel = T) message(resultsNames(dds2)) res <- results(dds2) deg <- as.data.frame(res) } mRNA_deg<- DESeq2_Analysis(expr_mRNA,"mRNA_expr_DESeq",gene_metadata) info("鉴定差异mRNA个数: ",length(unique(rownames(mRNA_deg)))) lnRNA_deg <- DESeq2_Analysis(expr_lncRNA,"lnRNA_expr_DESeq",gene_metadata) info("鉴定差异lnRNA个数: ",length(unique(rownames(lnRNA_deg)))) miRNA_deg <- DESeq2_Analysis(data_expr_miRNA,"miRNA_expr_DESeq",miRNA_metadata) info("鉴定差异miRNA个数: ",length(unique(rownames(miRNA_deg)))) miRNA_deg_iso <- DESeq2_Analysis(data_expr_ios_miRNA,"miRNA_iso_expr_DESeq",miRNA_iso_metadata) info("鉴定差异miRNA_iso个数: ",length(unique(rownames(miRNA_deg_iso)))) })()
########################备注######################################## #GDC_projects <- getGDCprojects() #write.csv(GDC_projects[[1]],file = "data/GDC_projects.csv",quote = F,row.names = F) #GDC_projects$project_id[grepl("TCGA",GDC_projects$project_id)] #sort(table(gff_v22$gene_type),decreasing = T) #sort(unique(gff_v22$gene_type)) #table(lncRNA_types %in% unique(gff_v22$gene_type)) if(F){ metadata$group <- factor(metadata$group,levels = c("Normal", "Tumor")) #--test library(edgeR) library(limma) y <- DGEList(counts = data_expr_miRNA,group = metadata$group) y <- calcNormFactors(y) y <- estimateCommonDisp(y) y <- estimateTagwiseDisp(y) et <- exactTest(y,pair = levels(metadata$group)) gene1 <- decideTestsDGE(et,p.value = 0.01,lfc=2) summary(gene1) ordered_targs <- topTags(et,n=1000) allDEG <- ordered_targs$table diff_signif <- allDEG%>% filter(PValue<0.01,abs(logFC)>2)%>% arrange(logFC) newData <- y$pseudo.counts heatmapData <- newData[rownames(diff_signif),] hmExp <- log2(heatmapData + 0.01) library(gplots) hmMat <- as.matrix(hmExp) #par(oma=c(10,3,3,7)) heatmap.2(hmMat,col = "bluered",trace = "none") library(pheatmap) pheatmap(hmMat) }