R分析流程

最后发布时间:2022-11-08 10:03:21 浏览量:
{
  library("optparse")
  library(TCGAbiolinks)
  library(tidyverse)
  library(reshape2)
  option_list = list(make_option(c("-p", "--project"), type = "character", default = "TCGA-ESCA", help  = "TCGA project"),
                     make_option(c("-w","--workflow.type"), type = "character", default = "HTSeq - Counts", help = "File path to CpG site R data."),
                     make_option(c("-d","--directory"), type = "character", help = "test"))
  
  args <- parse_args(OptionParser(option_list=option_list))
  if(!is.null(args$directory)){
    if(!file.exists(args$directory)){
      dir.create(file.path(args$directory))
    }
    setwd(args$directory)
    message("切换工作目录到:",args$directory)
  }
  if(is.null(args$project)){
    message("请输入 TCGA Project!!")
    q()
  }
}
{
  IS_DEBUG=T
  LOG_FILENAME= "report_download.txt"
  IS_EMPTY=F
  source("/home/wangyang/workspace/bioinfo_analysis/Rscript/tools/tools.R")
}
####################################################
## 环境准备
###################################################
{
  project <- args$project
  message("TCGA项目为: ",project)
  info("分析的TCGA项目为: ",project)
  if(!file.exists("result")){
    dir.create(file.path("result"))
  }
}

公共函数123446

{
  TCGA_query <- function(project,data.type,workflow.type,filename){
    if(!file.exists(paste0("result/",filename,".rda"))){
      message("重新处理:",paste0("result/",filename,".rda"))
      query<- GDCquery(project = project,
                       data.category = "Transcriptome Profiling",
                       data.type = data.type,
                       workflow.type = workflow.type) 
      saveRDS(query,file = paste0("result/",filename,".rda"))
      return(query)
    }else{
      message("加载本地数据: ",paste0("result/",filename,".rda"))
      return(readRDS(paste0("result/",filename,".rda")))
    }
  }
  #### 获得metadata信息
  
  TCAG_prepare <- function(query,filename){
    if(!file.exists(paste0("result/",filename,".rda"))){
      message("重新处理:",paste0("result/",filename,".rda"))
      data <- GDCprepare(query = query,
                         summarizedExperiment = F,
                         save = F)
      saveRDS(data,file =paste0("result/",filename,".rda"))
      return(data)
    }else{
      message("加载本地数据: ",paste0("result/",filename,".rda"))
      return(readRDS(paste0("result/",filename,".rda")))
    }
    
  }
  DESeq2_Analysis <- function(expr,filename,metadata){
    if(!file.exists(paste0("result/",filename,".rda"))){
      ## 差异基因表达分析
      library(DESeq2)
      if(!identical(colnames(expr),rownames(metadata))){
        message("expr的列名与metadata行名不一致!!")
        message("expr 列数:",dim(expr)[2])
        message("metadata行数:",dim(metadata)[1])
        expr <- expr[,rownames(metadata)]
      }
      
      metadata$group <- factor(metadata$group)
      dds <- DESeqDataSetFromMatrix(countData = expr, 
                                    colData = metadata,
                                    design = ~ group)
      ## PCA图 
      #vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
      #vsd_df <- assay(vsd)
      # pdf(file = paste0("result/",filename,".pdf"))
      #DESeq2::plotPCA(vsd, intgroup = "group")
      #dev.off()
      #message("写入图片:",paste0("result/",filename,".pdf"))
      keep <- rowSums(counts(dds) > 0) >= min(table(metadata$group))
      dds_filt <- dds[keep, ]
      dds2 <- DESeq(dds_filt, parallel = T)
      message(resultsNames(dds2))
      res <- results(dds2)
      deg <- as.data.frame(res) 
      message("写入文件:",paste0("result/",filename,".rda")," ",paste0("result/",filename,".csv"))
      saveRDS(deg,file =paste0("result/",filename,".rda"))
      write.csv(deg,file=paste0("result/",filename,".csv"),quote = F,row.names = F)
      return(deg)
    }else{
      message("加载本地数据: ",paste0("result/",filename,".rda"))
      return(readRDS(paste0("result/",filename,".rda")))
    }
    
  }
  TCGA_metadata <- function(path,filename) {
      metadata <- jsonlite::read_json(path, simplifyVector = T)
      metadata <- tibble::tibble(
        file_name = metadata$file_name,
        md5sum = metadata$md5sum,
        cases = bind_rows(metadata$associated_entities)$entity_submitter_id,
        TCGA_id_full = cases,
        TCGA_id = stringr::str_sub(cases, 1, 16),
        patient_id = stringr::str_sub(TCGA_id, 1, 12),
        tissue_type_id = stringr::str_sub(TCGA_id, 14, 15),
        tissue_type = sapply(tissue_type_id, function(x) {
          switch(x,
                 "01" = "Primary Solid Tumor",
                 "02" = "Recurrent Solid Tumor",
                 "03" = "Primary Blood Derived Cancer - Peripheral Blood",
                 "05" = "Additional - New Primary",
                 "06" = "Metastatic",
                 "07" = "Additional Metastatic",
                 "11" = "Solid Tissue Normal")}),   
        group = ifelse(tissue_type_id == "11", "Normal", "Tumor"))%>%
        column_to_rownames("TCGA_id_full")
      saveRDS(metadata,file =paste0("result/",filename,".rda"))
      write.csv(metadata,file = paste0("result/",filename,".csv"),quote = F)
      return(metadata)
  }
  get_metadata <- function(metadata,filename){
      message("重新处理:",paste0("result/",filename,".rda"))
      metadata <- tibble::tibble(
        TCGA_id_full = metadata$cases,
        cases=metadata$cases,
        TCGA_id = stringr::str_sub(TCGA_id_full, 1, 16),
        patient_id = stringr::str_sub(TCGA_id, 1, 12),
        tissue_type_id = stringr::str_sub(TCGA_id, 14, 15),
        tissue_type = sapply(tissue_type_id, function(x) {
          switch(x,
                 "01" = "Primary Solid Tumor",
                 "02" = "Recurrent Solid Tumor",
                 "03" = "Primary Blood Derived Cancer - Peripheral Blood",
                 "05" = "Additional - New Primary",
                 "06" = "Metastatic",
                 "07" = "Additional Metastatic",
                 "11" = "Solid Tissue Normal")}),   
        group = ifelse(tissue_type_id == "11", "Normal", "Tumor"))%>%
        column_to_rownames("TCGA_id_full")
      saveRDS(metadata,file =paste0("result/",filename,".rda"))
      write.csv(metadata,file = paste0("result/",filename,".csv"),quote = F)
      return(metadata)
  }
}

####################################################

TCGA 数据下载

###################################################


#### 下载临床数据
(function(){
  if(!file.exists("result/clinical.rda")){
    clinical <- GDCquery_clinic(project = project, type = "clinical")%>%
      mutate(futime=ifelse(vital_status=='Alive',days_to_last_follow_up,days_to_death),
             fustat = factor(vital_status,levels = c("Alive","Dead"),labels = c(0,1)),
             fustat=as.numeric(as.character(fustat)),
             futime_year=futime/365,
             patient_id=submitter_id,
             tumor_stage=case_when(tumor_stage=="stage i"~"i",
                                   grepl("stage i[^i ^v]+",tumor_stage)~"i",
                                   tumor_stage=="stage ii"~"ii",
                                   grepl("stage ii[^i]+",tumor_stage)~"ii",
                                   grepl("stage iii",tumor_stage)~"iii",
                                   grepl("stage iv",tumor_stage)~"iv",
                                   tumor_stage=="not reported"~"unknow"
             ))%>%
      dplyr::select(c("patient_id","futime","fustat","futime_year","tumor_stage","tumor_stage"))
    saveRDS(clinical,file = "result/clinical.rda")
  }else{
    clinical <- readRDS("result/clinical.rda")
  }
  info("临床资料有 ",length(unique(clinical$patient_id))," 个样本, 矩阵的大小为", paste0(dim(clinical),collapse = ","))
  message("加载本地数据result/clinical.rda")
})()

#### 下载基因数据
(function(){
  if(T){
    clinical <- readRDS("result/clinical.rda")
    query <- TCGA_query(project=project,data.type="Gene Expression Quantification",workflow.type="HTSeq - Counts",filename = "mRNA_query")
    info("TCGA Query Project: ",query$project[[1]])
    info("TCGA data.category: ",query$data.category)
    info("TCGA data.type: ",query$data.type)
    info("TCGA workflow.type: ",query$workflow.type)
    metadata <- query[[1]][[1]]%>%
      get_metadata("gene_metadata")
  }else{
    metadata <- TCGA_metadata("data/metadata.cart.2021-05-20.json","gene_metadata")
    query<- GDCquery(project = project,
                     data.category = "Transcriptome Profiling",
                     data.type = "Gene Expression Quantification",
                     workflow.type = "HTSeq - Counts",
                     barcode = c(metadata$cases)) 
    info("TCGA Query Project: ",query$project[[1]])
    info("TCGA data.category: ",query$data.category)
    info("TCGA data.type: ",query$data.type)
    info("TCGA workflow.type: ",query$workflow.type)
  }

  
  if(T){
    metadata_sample <- metadata%>%
      dplyr::select("cases")%>%
      mutate(sample= substr(cases,14,15))
    info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ","))
    info("TCGA 样本信息[01 原发实体瘤, 11正常组织]: ",paste0(names(table(metadata_sample$sample)),collapse = ","),"对应数量",paste0(table(metadata_sample$sample),collapse = ","))
  }
  
  mRNA_lnRNA_clinical <- clinical%>%
    filter(patient_id %in% metadata$patient_id)
  info("在metadata中包含临床资料的有 ",length(unique(mRNA_lnRNA_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(mRNA_lnRNA_clinical),collapse = ","))
  saveRDS(mRNA_lnRNA_clinical,file = "result/mRNA_lnRNA_clinical.rda")
  GDCdownload(query)
  data <- TCAG_prepare(query = query,filename = "mRNA_TCAG_prepare")%>%
    column_to_rownames("X1")
  data <- data[,rownames(metadata)]
  info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","),
       "表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data)))
})()

#### 下载miRNA数据
(function(){
  clinical <- readRDS("result/clinical.rda")
  query <- TCGA_query(project=project,data.type="miRNA Expression Quantification",workflow.type="BCGSC miRNA Profiling",filename = "miRNA_query")
  info("TCGA Query Project: ",query$project[[1]])
  info("TCGA data.category: ",query$data.category)
  info("TCGA data.type: ",query$data.type)
  info("TCGA workflow.type: ",query$workflow.type)
  metadata <- query[[1]][[1]]
  metadata <- get_metadata(metadata,"miRNA_metadata")
  info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ","))
  miRNA_clinical <- clinical <- clinical%>%
    filter(patient_id %in% metadata$patient_id)
  info("在metadata中包含临床资料的有 ",length(unique(miRNA_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(miRNA_clinical),collapse = ","))
  saveRDS(miRNA_clinical,file = "result/miRNA_clinical.rda")
  GDCdownload(query)
  
  data <- TCAG_prepare(query = query,filename = "miRNA_TCAG_prepare")%>%
    dplyr::select("miRNA_ID",starts_with("read_count"))%>%
    rename_at(vars(contains("read_count")), ~ substr(.,12,length(.)))%>%
    column_to_rownames("miRNA_ID")
  data <- data[,rownames(metadata)]
  saveRDS(data,file ="result/miRNA_expr.rda")
  info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","),
       "表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data)))
  
 # sum(is.na(expr_miRNA))
})()

#### 下载miRNA前体数据
(function(){
  clinical <- readRDS("result/clinical.rda")
  query <- TCGA_query(project=project,data.type="Isoform Expression Quantification",workflow.type="BCGSC miRNA Profiling",filename = "miRNA_iso_query")
  info("TCGA Query Project: ",query$project[[1]])
  info("TCGA data.category: ",query$data.category)
  info("TCGA data.type: ",query$data.type)
  info("TCGA workflow.type: ",query$workflow.type)
  metadata <- query[[1]][[1]]
  metadata <- get_metadata(metadata,"miRNA_iso_metadata")
  info("metadata中样本有 ",length(unique(rownames(metadata)))," 个, 矩阵大小为: ",paste0(dim(metadata),collapse = ","))
  miRNA_iso_clinical <- clinical <- clinical%>%
    filter(patient_id %in% metadata$patient_id)
  saveRDS(miRNA_iso_clinical,file = "result/miRNA_iso_clinical.rda")
  info("在metadata中包含临床资料的有 ",length(unique(miRNA_iso_clinical$patient_id))," 个, 矩阵大小为: ",paste0(dim(miRNA_iso_clinical),collapse = ","))
  
  # 下载数据
  GDCdownload(query)
  miRNA_anno <- read_lines("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/mature.fa.gz") %>% 
    as_tibble() %>% 
    dplyr::filter(str_detect(value, pattern = "Homo")) %>% 
    tidyr::separate(col = value, into = paste0("ID", 1:5), sep = " ") %>% 
    dplyr::select(isoform_id = ID2, miRNA_name = ID5)
  data <- TCAG_prepare(query = query,filename = "miRNA_ios_TCAG_prepare")%>%
    dplyr::select(miRNA_region,barcode,read_count)%>%
    filter(grepl("mature",miRNA_region))%>%
    dcast(miRNA_region~barcode,value.var = "read_count")%>%
    plyr::rename(c(miRNA_region="isoform_id"))%>%
    mutate(isoform_id=substr(isoform_id,8,length(.)))%>%
    inner_join(miRNA_anno,by="isoform_id")%>%
    column_to_rownames("miRNA_name")%>%
    dplyr::select(-isoform_id)
  data <- data[,rownames(metadata)]
  saveRDS(data,file ="result/miRNA_iso_expr.rda")
  info("表达矩阵中有样本 ",length(unique(colnames(data)))," 个, 矩阵大小为: ",paste0(dim(data),collapse = ","),
       "表达矩阵的列名与metadata行名: ",identical(rownames(metadata),colnames(data)))
})()

####################################################

TCGA 数据处理

###################################################

#### 获取注释文件
(function(){
  if(!file.exists("result/gff_v22.rda")){
    gff_v22 <- read_tsv("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/gencode.gene.info.v22.tsv")
    saveRDS(gff_v22,"result/gff_v22.rda")
  }
})()

#### 注释基因数据
(function(){
  gff_v22 <- readRDS("result/gff_v22.rda")
  data <- readRDS("result/mRNA_TCAG_prepare.rda")
  id2symbol <- gff_v22 %>% 
    dplyr::select(1, 2)
  expr_data <- data[1:(nrow(data)-5),]%>%
    plyr::rename(c(X1="gene_id"))%>%
    inner_join(id2symbol,by = "gene_id")

  saveRDS(expr_data,file = "result/expr_data.rda")
})()


#### 提取mRNA和lnRNA
(function(){
  expr_data <- readRDS("result/expr_data.rda")
  gff_v22 <- readRDS("result/gff_v22.rda")
  lncRNA_types <- "3prime_overlapping_ncrna, antisense, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping"
  lncRNA_types <- unlist(str_split(lncRNA_types, pattern = ", "))
  gene_type <- gff_v22 %>% 
    dplyr::select(gene_id, gene_type) %>% 
    arrange(gene_id)
  identical(gene_type$gene_id,expr_data$gene_id)
  expr_mRNA <- expr_data[gene_type$gene_type=="protein_coding",]%>%
    {.[!duplicated(.$gene_name),]}%>%
    column_to_rownames("gene_name")%>%
    dplyr::select(-gene_id)
  png(filename = "test.png")
  boxplot(log2(expr_mRNA+1))
  dev.off()
  write.csv(expr_mRNA,file = "result/mRNA_expr.csv",quote = F)
  saveRDS(expr_mRNA,file ="result/mRNA_expr.rda")
  
  expr_lncRNA <- expr_data[gene_type$gene_type %in% lncRNA_types,]%>%
    {.[!duplicated(.$gene_name),]}%>%
    column_to_rownames("gene_name")%>%
    dplyr::select(-gene_id)
  write.csv(expr_lncRNA,file = "result/lncRNA_expr.csv",quote = F)
  saveRDS(expr_lncRNA,file ="result/lncRNA_expr.rda")
})()

鉴定差异表达

(function(){
  expr_mRNA<-  readRDS("result/mRNA_expr.rda")
  #boxplot(log2(expr_mRNA+1))
  expr_lncRNA<-  readRDS("result/lncRNA_expr.rda")
  data_expr_miRNA<-  readRDS("result/miRNA_expr.rda")
  data_expr_ios_miRNA <-  readRDS("result/miRNA_iso_expr.rda")
  gene_metadata <- readRDS("result/gene_metadata.rda")
  miRNA_metadata <- readRDS("result/miRNA_metadata.rda")
  miRNA_iso_metadata <- readRDS("result/miRNA_iso_metadata.rda")
  
  
  if(F){
    paired_metadata <- gene_metadata%>%
      filter(tissue_type_id %in% c("01","11"))%>%
      add_count(patient_id,name = "n_patient")%>%
      filter(n_patient==2)%>%
      arrange(patient_id,tissue_type_id)
    paired_count<- expr_mRNA%>%
      dplyr::select(paired_metadata$cases)
    group <- paired_metadata$group%>%as.factor()
    colData <- data.frame(sample_Id = colnames(paired_count),group=group)
    dds <- DESeqDataSetFromMatrix(countData =paired_count, 
                                         colData = paired_metadata,
                                         design = ~ group)
    vst <- vst(dds)
    count_vst <- assay(vst)
    boxplot(log2(paired_count+1),col=group)
    limma::plotDensities(log2(paired_count+1),legend=F)
    boxplot(count_vst,col=group)
    limma::plotDensities(count_vst,legend=F)
    
    pivot_longer(as.data.frame(count_vst[1:2,]),cols=everything(),names_to = "cases")
    data.frame(value=as.vector(t( count_vst["TSPAN6",])),cases = colnames(count_vst))%>%
      left_join(paired_metadata%>%select("cases","group","patient_id"),by="cases")%>%
      ggplot(aes(x=group,y=value,fill=group))+
      geom_boxplot(outlier.shape = NA)+
      geom_point(size=1, alpha=0.5)+
      geom_line(aes(group=patient_id), colour="black", linetype="11") 
    
    gene <-  data.frame(value=as.vector(t( count_vst["TSPAN6",])),cases = colnames(count_vst))%>%
      left_join(paired_metadata%>%select("cases","group","patient_id"),by="cases")%>%
      select(-cases)
    spread(gene,group,value)%>%
      filter(Normal<Tumor)
    
    }
  if(F){
    identical(rownames(gene_metadata),colnames(expr_mRNA))
    group <- gene_metadata$group
    data.frame(gene=as.vector(t(expr_mRNA[1,])),group=group,row.names = rownames(gene_metadata))%>%
      write.csv(file = "expr_mRNA_group.csv",quote = F)
    
    df <- read.csv("expr_mRNA_group.csv",row.names = 1)
    boxplot(log2(gene)~group,data=df,col=c("green","red"))
    ## 计算p
    wilcoxTest <- wilcox.test(gene~group,data=df)
    pValue <- wilcoxTest$p.value
    conGeneMeans <- mean( subset(df,group=="Normal")$gene)
    treatGeneMeans <- mean( subset(df,group=="Tumor")$gene)
    ## 计算logFC
    logFC <- log2(treatGeneMeans/conGeneMeans)

  }
  if(F){
    library(DESeq2)
    gene_metadata$group <- factor(gene_metadata$group)
    dds <- DESeqDataSetFromMatrix(countData = expr_mRNA[1,], 
                                  colData = gene_metadata,
                                  design = ~ group)
    ## PCA图 
    vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
    boxplot(log2(expr_mRNA[,1:10]+1))
    boxplot(assay(vsd))
    dds2 <- DESeq(dds, parallel = T)
    message(resultsNames(dds2))
    res <- results(dds2)
    deg <- as.data.frame(res) 
  }
  
  mRNA_deg<- DESeq2_Analysis(expr_mRNA,"mRNA_expr_DESeq",gene_metadata)
  info("鉴定差异mRNA个数: ",length(unique(rownames(mRNA_deg))))
  lnRNA_deg <- DESeq2_Analysis(expr_lncRNA,"lnRNA_expr_DESeq",gene_metadata)
  info("鉴定差异lnRNA个数: ",length(unique(rownames(lnRNA_deg))))
  miRNA_deg <- DESeq2_Analysis(data_expr_miRNA,"miRNA_expr_DESeq",miRNA_metadata)
  info("鉴定差异miRNA个数: ",length(unique(rownames(miRNA_deg))))
  miRNA_deg_iso <- DESeq2_Analysis(data_expr_ios_miRNA,"miRNA_iso_expr_DESeq",miRNA_iso_metadata)
  info("鉴定差异miRNA_iso个数: ",length(unique(rownames(miRNA_deg_iso))))
})()
########################备注########################################
#GDC_projects <- getGDCprojects()
#write.csv(GDC_projects[[1]],file = "data/GDC_projects.csv",quote = F,row.names = F)
#GDC_projects$project_id[grepl("TCGA",GDC_projects$project_id)]
#sort(table(gff_v22$gene_type),decreasing = T)
#sort(unique(gff_v22$gene_type))
#table(lncRNA_types %in% unique(gff_v22$gene_type))

if(F){
  metadata$group <- factor(metadata$group,levels = c("Normal", "Tumor"))
  #--test
  library(edgeR)
  library(limma)
  y <- DGEList(counts = data_expr_miRNA,group = metadata$group)
  y <- calcNormFactors(y)
  y <- estimateCommonDisp(y)
  y <- estimateTagwiseDisp(y)
  et <- exactTest(y,pair = levels(metadata$group))
  gene1 <- decideTestsDGE(et,p.value = 0.01,lfc=2)
  summary(gene1)
  ordered_targs <- topTags(et,n=1000)
  allDEG <- ordered_targs$table
  diff_signif <- allDEG%>%
    filter(PValue<0.01,abs(logFC)>2)%>%
    arrange(logFC)
  
  newData <- y$pseudo.counts
  heatmapData <- newData[rownames(diff_signif),]
  hmExp <- log2(heatmapData + 0.01)
  library(gplots)
  hmMat <- as.matrix(hmExp)
  #par(oma=c(10,3,3,7))
  heatmap.2(hmMat,col = "bluered",trace = "none")
  library(pheatmap)
  pheatmap(hmMat)
}