GEO数据分析
最后发布时间:2022-02-11 11:15:19
浏览量:
开始你的创作:GEO2R使用
library(GEOquery)
library(limma)
library(edgeR)
library(tidyverse)
(function(){
GSE ="GSE126094"
message("开始分析: ",GSE," Hsa_circ_101555 functions as a competing endogenous RNA of miR-597-5p to promote colorectal cancer progression [Circular RNAs]")
if(!file.exists(paste0("GEO-COAD/",GSE,".rds"))){
gset <- getGEO(GSE,AnnotGPL=T,destdir = "GEO-COAD")
saveRDS(gset,paste0("GEO-COAD/",GSE,".rds"))
}
gset <- readRDS(paste0("GEO-COAD/",GSE,".rds"))
gset <- gset[[1]]
gset_expr <- exprs(gset)%>%
as.data.frame()
message(GSE," 使用GPL平台 ",gset@annotation)
#boxplot(log2(gset_expr),outline=FALSE)
#title(main=GSE)
gset_pd <- pData(gset)%>%
dplyr::select(sample_id=geo_accession,sample_name=title)%>%
mutate(group=case_when(grepl("normal",sample_name)~"Normal",
T ~"Tumor"))
gset_annot <- gset@featureData@data
gset_expr_annot <- gset_expr %>%
mutate(symbol=gset_annot$circRNA)%>%
rownames_to_column("id")%>%
dplyr::select(-id) %>%
filter(symbol!="") %>%
stats::aggregate(. ~ symbol, data =., FUN = mean)%>%
tibble::column_to_rownames("symbol")
saveRDS(gset_expr_annot,paste0("GEO-COAD/gset_expr_annot_circRNA_",GSE,".rda"))
saveRDS(gset_pd,paste0("GEO-COAD/metadata_circRNA_",GSE,".rda"))
})()
(function(){
GSE ="GSE138589"
message("开始分析: ",GSE,"circRNA Microarray for human colorectal adenocarcinoma and normal mucosa")
if(!file.exists(paste0("GEO-COAD/",GSE,".rds"))){
gset <- getGEO(GSE,AnnotGPL=T,destdir = "GEO-COAD")
saveRDS(gset,paste0("GEO-COAD/",GSE,".rds"))
}
gset <- readRDS(paste0("GEO-COAD/",GSE,".rds"))
gset <- gset[[1]]
gset_expr <- exprs(gset)%>%
as.data.frame()
message(GSE," 使用GPL平台 ",gset@annotation)
#boxplot(gset_expr,outline=FALSE)
#title(main=GSE)
gset_pd <- pData(gset)%>%
dplyr::select(sample_id=geo_accession,sample_name=title)%>%
mutate(group=case_when(grepl("normal",sample_name)~"Normal",
T ~"Tumor"))
gset_annot <- gset@featureData@data
gset_expr_annot <- gset_expr %>%
mutate(symbol=gset_annot$circRNA)%>%
rownames_to_column("id")%>%
dplyr::select(-id) %>%
filter(symbol!="") %>%
stats::aggregate(. ~ symbol, data =., FUN = mean)%>%
tibble::column_to_rownames("symbol")
saveRDS(gset_expr_annot,paste0("GEO-COAD/gset_expr_annot_circRNA_",GSE,".rda"))
saveRDS(gset_pd,paste0("GEO-COAD/metadata_circRNA_",GSE,".rda"))
})
(function(){
GSE ="GSE142837"
message("开始分析: ",GSE," circRNA expression data from human colorectal cancer tissues and the correponding non-cancerous tissues")
if(!file.exists(paste0("GEO-COAD/",GSE,".rds"))){
gset <- getGEO(GSE,AnnotGPL=T,destdir = "GEO-COAD")
saveRDS(gset,paste0("GEO-COAD/",GSE,".rds"))
}
gset <- readRDS(paste0("GEO-COAD/",GSE,".rds"))
gset <- gset[[1]]
gset_expr <- exprs(gset)%>%
as.data.frame()
message(GSE," 使用GPL平台 ",gset@annotation)
#boxplot(log2(gset_expr),outline=FALSE)
#title(main=GSE)
gset_pd <- pData(gset)%>%
dplyr::select(sample_id=geo_accession,sample_name=title)%>%
mutate(group=case_when(grepl("non-cancerous",sample_name)~"Normal",
T ~"Tumor"))
gset_annot <- gset@featureData@data
gset_expr_annot <- gset_expr %>%
mutate(symbol=gset_annot$circRNA)%>%
rownames_to_column("id")%>%
dplyr::select(-id) %>%
filter(symbol!="") %>%
stats::aggregate(. ~ symbol, data =., FUN = mean)%>%
tibble::column_to_rownames("symbol")
saveRDS(gset_expr_annot,paste0("GEO-COAD/gset_expr_annot_circRNA_",GSE,".rda"))
saveRDS(gset_pd,paste0("GEO-COAD/metadata_circRNA_",GSE,".rda"))
})()
library("maptools") # point labels without overlaps
library(umap)
(function(){
GEOS <- c("GSE126094","GSE142837")
Normalized <- c(F,F)
for(i in 1:length(GEOS)){
GEO <- GEOS[i]
message("Normalized:",Normalized[i])
message("开始分析:",GEO)
expr <- readRDS(paste0("GEO-COAD/gset_expr_annot_circRNA_",GEO,".rda"))
metadata <- readRDS(paste0("GEO-COAD/metadata_circRNA_",GEO,".rda"))
message("分组信息列名与表达矩阵的行名相同",identical(colnames(expr),rownames(metadata)))
group <- factor(metadata$group, levels = c("Normal", "Tumor"))
## 检测是否需要log2转换
qx <- as.numeric(quantile(expr, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0)
if (LogC) {
message("##############",GEO,"log2 转换")
expr[which(expr <= 0)] <- NaN
expr <- log2(expr)
}
## 质控
if(T){
par(mfrow=c(1,3),mar=c(7,2,1,6))
ord <- order(group)
boxplot(expr[,ord], boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
plotDensities(expr, group=group, main=GEO, legend ="topright")
ump <- umap(t(expr), n_neighbors = 9, random_state = 123)
plot(ump$layout, main="UMAP plot, nbrs=9", xlab="", ylab="", col=group, pch=20, cex=1.5)
legend("topright", legend=levels(group), pch=20,
col=1:nlevels(group), title="Group", pt.cex=1.5,inset=c(-0.6,0))
pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
}
design <- model.matrix( ~ 0 + group)
colnames(design) <- levels(group)
message("Tumor-Normal")
contrast.matrix <- makeContrasts("Tumor-Normal", levels = design)
## 是否强制标准化
if(!Normalized[i]){
message(GEO," 进行Normalized")
if(F){
dge <- DGEList(counts=expr)
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE, normalize="quantile")
}
if(F){
expr <- expr[complete.cases(expr), ]
v <- vooma(log2(expr), design, plot=T)
expr <- normalizeBetweenArrays(expr) # normalize data
par(mfrow=c(1,1),mar=c(7,3,2,2))
boxplot(log2(v[['E']]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
}
expr<- normalizeBetweenArrays(expr)
par(mfrow=c(1,3),mar=c(7,2,1,6))
ord <- order(group)
boxplot(expr[,ord], boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
plotDensities(expr, group=group, main=GEO, legend ="topright")
ump <- umap(t(expr), n_neighbors = 9, random_state = 123)
plot(ump$layout, main="UMAP plot, nbrs=9", xlab="", ylab="", col=group, pch=20, cex=1.5)
legend("topright", legend=levels(group), pch=20,
col=1:nlevels(group), title="Group", pt.cex=1.5, inset=c(-0.6,0))
pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
}
fit <- lmFit(expr, design)%>%
contrasts.fit(contrast.matrix)%>%
eBayes()
#contrast <- paste0(rev(levels(group)), collapse = "-")
res <- topTable(fit, coef = 1, n = Inf)
saveRDS(res,file = paste0("GEO-COAD/limma_",GEO,".rda"))
}
if(F){
GEO <- GEOS[1]
expr <- readRDS(paste0("GEO-COAD/gset_expr_annot_circRNA_",GEO,".rda"))
metadata <- readRDS(paste0("GEO-COAD/metadata_circRNA_",GEO,".rda"))
message("分组信息列名与表达矩阵的行名相同",identical(colnames(expr),rownames(metadata)))
group <- factor(metadata$group, levels = c("Normal", "Tumor"))
par(mfrow=c(1,3),mar=c(7,2,1,5))
ord <- order(group)
boxplot(log2(expr[,ord]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
plotDensities(log2(expr), group=group, main=GEO, legend ="topright")
#par(mfrow=c(1,1),mar=c(2,2,2,2))
ump <- umap(t(log2(expr)), n_neighbors = 9, random_state = 123)
#par(mar=c(3,3,2,6), xpd=TRUE)
plot(ump$layout, main="UMAP plot, nbrs=9", xlab="", ylab="", col=group, pch=20, cex=1.5)
legend("topright", legend=levels(group), pch=20,
col=1:nlevels(group), title="Group", pt.cex=1.5, inset=c(-0.3,0))
pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
design <- model.matrix( ~ 0 + group)
colnames(design) <- levels(group)
contrast <- paste0(rev(levels(group)), collapse = "-")
message(contrast)
contrast.matrix <- makeContrasts(contrast, levels = design)
if(F){
dge <- DGEList(counts=log2(expr))
dge <- calcNormFactors(dge)
v <- voom(dge,design,plot=TRUE, normalize="quantile")
v <- vooma(expr, design, plot=T)
par(mfrow=c(1,1),mar=c(7,3,2,2))
boxplot(log2(v[['E']]), boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
}
expr<- normalizeBetweenArrays(log2(expr))
boxplot(expr, boxwex=0.6, notch=T, main=GEO, outline=FALSE, las=2, col=group[ord])
fit <- lmFit(expr, design)%>%
contrasts.fit(contrast.matrix)%>%
eBayes()
res <- topTable(fit, coef = 1, n = Inf)
saveRDS(res,file = paste0("GEO-COAD/limma_",GEO,".rda"))
}
})()