GEO数据分析

最后发布时间:2022-02-11 11:15:19 浏览量:

开始你的创作:GEO2R使用

library(GEOquery)
library(limma)
library(edgeR)
library(tidyverse)

(function(){
  GSE ="GSE126094"
  message("开始分析: ",GSE," 	Hsa_circ_101555 functions as a competing endogenous RNA of miR-597-5p to promote colorectal cancer progression [Circular RNAs]")
  if(!file.exists(paste0("GEO-COAD/",GSE,".rds"))){
    gset <- getGEO(GSE,AnnotGPL=T,destdir = "GEO-COAD")
    saveRDS(gset,paste0("GEO-COAD/",GSE,".rds"))
  }
  
  gset <- readRDS(paste0("GEO-COAD/",GSE,".rds"))
  gset <- gset[[1]]
  gset_expr <- exprs(gset)%>%
    as.data.frame()
  message(GSE," 使用GPL平台 ",gset@annotation)
  #boxplot(log2(gset_expr),outline=FALSE)
  #title(main=GSE)
  gset_pd <- pData(gset)%>%
    dplyr::select(sample_id=geo_accession,sample_name=title)%>%
    mutate(group=case_when(grepl("normal",sample_name)~"Normal",
                           T ~"Tumor"))
  gset_annot <- gset@featureData@data 
  
  gset_expr_annot <- gset_expr %>% 
    mutate(symbol=gset_annot$circRNA)%>%
    rownames_to_column("id")%>%
    dplyr::select(-id) %>%
    filter(symbol!="") %>%
    stats::aggregate(. ~ symbol, data =., FUN = mean)%>%
    tibble::column_to_rownames("symbol")

  saveRDS(gset_expr_annot,paste0("GEO-COAD/gset_expr_annot_circRNA_",GSE,".rda"))
  saveRDS(gset_pd,paste0("GEO-COAD/metadata_circRNA_",GSE,".rda"))
})()


(function(){
  GSE ="GSE138589"
  message("开始分析: ",GSE,"circRNA Microarray for human colorectal adenocarcinoma and normal mucosa")
  if(!file.exists(paste0("GEO-COAD/",GSE,".rds"))){
    gset <- getGEO(GSE,AnnotGPL=T,destdir = "GEO-COAD")
    saveRDS(gset,paste0("GEO-COAD/",GSE,".rds"))
  }
  
  gset <- readRDS(paste0("GEO-COAD/",GSE,".rds"))
  gset <- gset[[1]]
  gset_expr <- exprs(gset)%>%
    as.data.frame()
  message(GSE," 使用GPL平台 ",gset@annotation)
  #boxplot(gset_expr,outline=FALSE)
  #title(main=GSE)
  gset_pd <- pData(gset)%>%
    dplyr::select(sample_id=geo_accession,sample_name=title)%>%
    mutate(group=case_when(grepl("normal",sample_name)~"Normal",
                           T ~"Tumor"))
  gset_annot <- gset@featureData@data 
  
  gset_expr_annot <- gset_expr %>% 
    mutate(symbol=gset_annot$circRNA)%>%
    rownames_to_column("id")%>%
    dplyr::select(-id) %>%
    filter(symbol!="") %>%
    stats::aggregate(. ~ symbol, data =., FUN = mean)%>%
    tibble::column_to_rownames("symbol")
  
  saveRDS(gset_expr_annot,paste0("GEO-COAD/gset_expr_annot_circRNA_",GSE,".rda"))
  saveRDS(gset_pd,paste0("GEO-COAD/metadata_circRNA_",GSE,".rda"))
})

(function(){
  GSE ="GSE142837"
  message("开始分析: ",GSE,"	circRNA expression data from human colorectal cancer tissues and the correponding non-cancerous tissues")
  if(!file.exists(paste0("GEO-COAD/",GSE,".rds"))){
    gset <- getGEO(GSE,AnnotGPL=T,destdir = "GEO-COAD")
    saveRDS(gset,paste0("GEO-COAD/",GSE,".rds"))
  }
  
  gset <- readRDS(paste0("GEO-COAD/",GSE,".rds"))
  gset <- gset[[1]]
  gset_expr <- exprs(gset)%>%
    as.data.frame()
  message(GSE," 使用GPL平台 ",gset@annotation)
  #boxplot(log2(gset_expr),outline=FALSE)
  #title(main=GSE)
  gset_pd <- pData(gset)%>%
    dplyr::select(sample_id=geo_accession,sample_name=title)%>%
    mutate(group=case_when(grepl("non-cancerous",sample_name)~"Normal",
                           T ~"Tumor"))
  gset_annot <- gset@featureData@data 
  
  gset_expr_annot <- gset_expr %>% 
    mutate(symbol=gset_annot$circRNA)%>%
    rownames_to_column("id")%>%
    dplyr::select(-id) %>%
    filter(symbol!="") %>%
    stats::aggregate(. ~ symbol, data =., FUN = mean)%>%
    tibble::column_to_rownames("symbol")
  
  saveRDS(gset_expr_annot,paste0("GEO-COAD/gset_expr_annot_circRNA_",GSE,".rda"))
  saveRDS(gset_pd,paste0("GEO-COAD/metadata_circRNA_",GSE,".rda"))
})()
library("maptools")  # point labels without overlaps
library(umap)
(function(){
  GEOS <- c("GSE126094","GSE142837")
  Normalized <- c(F,F)
  for(i in 1:length(GEOS)){
    GEO <- GEOS[i]
    message("Normalized:",Normalized[i])
    message("开始分析:",GEO)
    expr <- readRDS(paste0("GEO-COAD/gset_expr_annot_circRNA_",GEO,".rda"))
    metadata <- readRDS(paste0("GEO-COAD/metadata_circRNA_",GEO,".rda"))
    message("分组信息列名与表达矩阵的行名相同",identical(colnames(expr),rownames(metadata)))
    group <- factor(metadata$group, levels = c("Normal", "Tumor")) 
    
    ## 检测是否需要log2转换
    qx <- as.numeric(quantile(expr, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
    LogC <- (qx[5] > 100) ||
      (qx[6]-qx[1] > 50 && qx[2] > 0)
    if (LogC) {
      message("##############",GEO,"log2 转换")
      expr[which(expr <= 0)] <- NaN
      expr <- log2(expr) 
    }
    
    ## 质控
    if(T){
      par(mfrow=c(1,3),mar=c(7,2,1,6))
      ord <- order(group) 
      boxplot(expr[,ord], boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
      plotDensities(expr, group=group, main=GEO, legend ="topright")
      ump <- umap(t(expr), n_neighbors = 9, random_state = 123)
      plot(ump$layout, main="UMAP plot, nbrs=9", xlab="", ylab="", col=group, pch=20, cex=1.5)
      legend("topright", legend=levels(group), pch=20,
             col=1:nlevels(group), title="Group", pt.cex=1.5,inset=c(-0.6,0))
      pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
    }
    
    
    design <- model.matrix( ~ 0 + group)
    colnames(design) <- levels(group)
    message("Tumor-Normal")
    contrast.matrix <- makeContrasts("Tumor-Normal", levels = design)
    
    ## 是否强制标准化
    if(!Normalized[i]){
      message(GEO," 进行Normalized")
      if(F){
        dge <- DGEList(counts=expr)
        dge <- calcNormFactors(dge)   
        v <- voom(dge,design,plot=TRUE, normalize="quantile") 
      }
      if(F){
        expr <- expr[complete.cases(expr), ]
        v <- vooma(log2(expr), design, plot=T)
        expr <- normalizeBetweenArrays(expr) # normalize data
        par(mfrow=c(1,1),mar=c(7,3,2,2))
        boxplot(log2(v[['E']]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
      }

      
      expr<- normalizeBetweenArrays(expr)
      par(mfrow=c(1,3),mar=c(7,2,1,6))
      ord <- order(group) 
      boxplot(expr[,ord], boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
      plotDensities(expr, group=group, main=GEO, legend ="topright")
      ump <- umap(t(expr), n_neighbors = 9, random_state = 123)
      plot(ump$layout, main="UMAP plot, nbrs=9", xlab="", ylab="", col=group, pch=20, cex=1.5)
      legend("topright", legend=levels(group), pch=20,
             col=1:nlevels(group), title="Group", pt.cex=1.5, inset=c(-0.6,0))
      pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
    }
    fit <- lmFit(expr, design)%>%
      contrasts.fit(contrast.matrix)%>%
      eBayes()
    
    #contrast <- paste0(rev(levels(group)), collapse = "-")
 
    res <- topTable(fit, coef = 1, n = Inf) 
    saveRDS(res,file = paste0("GEO-COAD/limma_",GEO,".rda"))
  }
  if(F){
    GEO <- GEOS[1]
    expr <- readRDS(paste0("GEO-COAD/gset_expr_annot_circRNA_",GEO,".rda"))
    metadata <- readRDS(paste0("GEO-COAD/metadata_circRNA_",GEO,".rda"))
    message("分组信息列名与表达矩阵的行名相同",identical(colnames(expr),rownames(metadata)))
    group <- factor(metadata$group, levels = c("Normal", "Tumor"))  
    
    par(mfrow=c(1,3),mar=c(7,2,1,5))
    ord <- order(group) 
    boxplot(log2(expr[,ord]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
    plotDensities(log2(expr), group=group, main=GEO, legend ="topright")
    #par(mfrow=c(1,1),mar=c(2,2,2,2))
    ump <- umap(t(log2(expr)), n_neighbors = 9, random_state = 123)
    #par(mar=c(3,3,2,6), xpd=TRUE)
    plot(ump$layout, main="UMAP plot, nbrs=9", xlab="", ylab="", col=group, pch=20, cex=1.5)
    legend("topright", legend=levels(group), pch=20,
           col=1:nlevels(group), title="Group", pt.cex=1.5, inset=c(-0.3,0))
    pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
    
    design <- model.matrix( ~ 0 + group)
    colnames(design) <- levels(group)
    contrast <- paste0(rev(levels(group)), collapse = "-")
    message(contrast)
    contrast.matrix <- makeContrasts(contrast, levels = design)
    if(F){
      dge <- DGEList(counts=log2(expr))
      dge <- calcNormFactors(dge)   
      v <- voom(dge,design,plot=TRUE, normalize="quantile") 
      
      v <- vooma(expr, design, plot=T)
      par(mfrow=c(1,1),mar=c(7,3,2,2))
      boxplot(log2(v[['E']]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
    }
    expr<- normalizeBetweenArrays(log2(expr))
    boxplot(expr, boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
    fit <- lmFit(expr, design)%>%
      contrasts.fit(contrast.matrix)%>%
      eBayes()
    res <- topTable(fit, coef = 1, n = Inf) 
    saveRDS(res,file = paste0("GEO-COAD/limma_",GEO,".rda"))
  }
 
})()