GEOquery R包源码解读

最后发布时间:2021-08-23 15:57:46 浏览量:

GEO 网站

使用data/GSE161784_series_matrix.txt.gz

library(GEOquery)
# Parse the local series matrix file; with getGPL/AnnotGPL enabled the platform
# (GPL) file is fetched as well and stored under destdir.
expr <- getGEO(filename="data/GSE161784_series_matrix.txt.gz",destdir="data",getGPL=TRUE,AnnotGPL=TRUE)
# Download the SOFT-format file for a GEO accession into destdir.
# (Excerpt: the function body continues in the code fragments further down.)
#
# GEO      accession string; it is upper-cased and its first three characters
#          select the branch taken (GDS / GSE / GPL / GSM).
# destdir  directory the file is written to; defaults to tempdir().
# AnnotGPL only consulted in the GPL branch.
# amount   one of 'full', 'brief', 'quick', 'data' (checked by match.arg()).
getGEOfile <- function(GEO,destdir=tempdir(),AnnotGPL=FALSE,
                       amount=c('full','brief','quick','data')) {
    amount <- match.arg(amount)
    geotype <- toupper(substr(GEO,1,3)) # accession type prefix, e.g. GSE
    mode <- 'wb' # default download mode; the text-format branches switch to 'w'
    GEO <- toupper(GEO)
    # Directory stub: the trailing 1-3 digits become 'nnn', e.g. GSE161784 -> GSE161nnn
    stub = gsub('\\d{1,3}$','nnn',GEO,perl=TRUE)
    if (geotype == 'GDS') {
      # GDS: gzipped SOFT file from the FTP "datasets" tree
      gdsurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/datasets/%s/%s/soft/%s'
      myurl <- sprintf(gdsurl,stub,GEO,paste0(GEO,'.soft.gz'))
      destfile <- file.path(destdir,paste0(GEO,'.soft.gz'))
    }
    if (geotype == 'GSE' & amount=='full') {
      # GSE, full amount: the "_family" SOFT file from the FTP "series" tree, e.g.
      # https://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161784/soft/GSE161784_family.soft.gz
      gseurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/series/%s/%s/soft/%s'
      myurl <- sprintf(gseurl,stub,GEO,paste0(GEO,'_family.soft.gz')) 
      # Stored locally without the "_family" suffix, e.g. data/GSE161784.soft.gz
      destfile <- file.path(destdir,paste(GEO,'.soft.gz',sep=""))
    }
    # GSE, any other amount: text-format record from the acc.cgi query endpoint.
    # NOTE(review): 'table' is not one of the match.arg() choices above, so the
    # amount!='table' comparison is always TRUE here.
    if (geotype == 'GSE' & amount!='full' & amount!='table') {
      gseurl <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
      myurl <- paste(gseurl,'?targ=self&acc=',GEO,'&form=text&view=',amount,sep='')
      destfile <- file.path(destdir,paste(GEO,'.soft',sep=""))
      mode <- 'w'
    }

如果 AnnotGPL=T,则先从 https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL19nnn/GPL19117/annot/GPL19117.annot.gz 下载 GPL19117.annot.gz;如果这个链接不存在(此时会提示 Annotation GPL not available, so will use submitter GPL instead),则从 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full 下载。

    if (geotype == 'GPL') {
      # GPL branch. Example call that reaches it:
      # getGEO(filename="data/GSE161784_series_matrix.txt.gz",destdir="data",getGPL=T,AnnotGPL=T)
      # With AnnotGPL the annotation file is tried first, e.g.
      # https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL19nnn/GPL19117/annot/GPL19117.annot.gz
      if (AnnotGPL) {
        gplurl <- 'https://ftp.ncbi.nlm.nih.gov/geo/platforms/%s/%s/annot/%s'
        myurl <- sprintf(gplurl,stub,GEO,paste0(GEO,'.annot.gz'))
        destfile <- file.path(destdir,paste(GEO,'.annot.gz',sep=""))
        # check to see if Annotation GPL is present.  If so,
        # use it, else move on to submitter GPL
        # (downloadFile is a helper defined outside this excerpt; any error it
        # raises is captured by try() instead of aborting the function.)
        res=try({
          if(!file.exists(destfile)) {
            downloadFile(myurl, destfile, mode)
            message('File stored at: ')
            message(destfile)
          } else {
            message(sprintf('Using locally cached version of %s found here:\n%s ',GEO,destfile))
          }
        },silent=TRUE)
        if(!inherits(res,'try-error')) {
          # Annotation file downloaded or already cached: done
          return(invisible(destfile))
        } else {
          message('Annotation GPL not available, so will use submitter GPL instead')
        }
      } 
      # Submitter GPL: text-format record from the acc.cgi query endpoint
      gseurl <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
      myurl <- paste(gseurl,'?targ=self&acc=',GEO,'&form=text&view=',amount,sep='')
      destfile <- file.path(destdir,paste(GEO,'.soft.gz',sep=""))
      mode <- 'w'
      if(!file.exists(destfile)) {
        # Example values for GPL19117:
        #   myurl    = https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full
        #   destfile = data/GPL19117.soft.gz
        #   mode     = w
        #   getOption('download.file.method.GEOquery') = auto
        # The request asks for a gzip-encoded response; presumably the bytes are
        # saved as received, hence the .soft.gz file name -- TODO confirm.
        download.file(myurl,destfile,mode=mode,quiet=TRUE,method=getOption('download.file.method.GEOquery'),
                      headers = c("accept-encoding"="gzip"))
        message('File stored at: ')
        message(destfile)
      } else {
        message(sprintf('Using locally cached version of %s found here:\n%s ',GEO,destfile))
      }
      return(invisible(destfile))
    }

由于soft文件经常很大,导致下面错误,解决方法就是:在浏览器打开https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full,下载文件GPL19117.txt, 执行以下命令, 之后替换当前目录data/GPL19117.soft.gz

# Compress the file saved from the browser, then rename it to the
# <GPL>.soft.gz name that getGEOfile looks for in destdir.
gzip GPL19117.txt
mv GPL19117.txt.gz GPL19117.soft.gz

注:GPL文件的下载路径需要使用getGEO的destdir="data"参数指定

# Stand-alone version of the download getGEOfile performs for GPL19117 in its
# submitter-GPL branch (same URL, mode, method and request header).
download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full",
                "data/GPL19117.soft.gz",mode="w",quiet=TRUE,method="auto",headers = c("accept-encoding"="gzip"))

图片alt

图片alt

    if (geotype == 'GSM') {
      # GSM: text-format record from the acc.cgi query endpoint
      gseurl <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
      myurl <- paste(gseurl,'?targ=self&acc=',GEO,'&form=text&view=',amount,sep='')
      destfile <- file.path(destdir,paste(GEO,'.soft',sep=""))
      mode <- 'w'
    }
    # Download only when the destination file is not already present locally;
    # otherwise reuse the cached copy.
    if(!file.exists(destfile)) {
      # downloadFile is a helper defined outside this excerpt
      downloadFile(myurl, destfile, mode)
      message('File stored at: ')
      message(destfile)
    } else {
      message(sprintf('Using locally cached version of %s found here:\n%s ',GEO,destfile))
    }
  }

解析data/GSE161784_series_matrix.txt.gz获取GPLXXX号

# Abridged excerpt of the series-matrix parser: only the platform (GPL) lookup
# and the ExpressionSet construction are shown.
# NOTE(review): GPL, sampledat, datamat, fd and ed are not defined in this
# excerpt; presumably the omitted code derives them from `fname` -- confirm
# against the full GEOquery source.
#
# fname                 path of the series matrix file, e.g.
#                       data/GSE161784_series_matrix.txt.gz
# AnnotGPL, destdir     passed through to getGEO() when the platform is fetched
# getGPL                when TRUE, getGEO() is called for the platform accession
# parseCharacteristics  not used in the part of the body shown here
#
# Returns a list with GPL (platform id read from the first row of sampledat)
# and eset (the ExpressionSet).
parseGSEMatrix <- function(fname,AnnotGPL=FALSE,destdir=tempdir(),getGPL=TRUE,parseCharacteristics=TRUE) {
    # The GPL accession has been parsed from the series matrix; getGEO is called
    # again to obtain the platform (downloaded unless a local copy exists).
    if(getGPL) {
        gpl <- getGEO(GPL,AnnotGPL=AnnotGPL,destdir=destdir)
    }

    eset <- new('ExpressionSet',
                phenoData=as(sampledat,'AnnotatedDataFrame'),
                annotation=GPL,
                featureData=fd,
                experimentData=ed,
                exprs=as.matrix(datamat))
    return(list(GPL=as.character(sampledat[1,grep('platform_id',colnames(sampledat),ignore.case=TRUE)]),eset=eset))
}

数据下载流程

使用getGEO('GSE161784',GSEMatrix=T, destdir="data")下载文件流程

使用getGEO('GSE161784', GSEMatrix=F, destdir="data")下载文件流程

使用getGEO(filename="data/GSE161784_series_matrix.txt.gz",destdir="data",getGPL=T,AnnotGPL=T)下载文件流程

使用getGEO(filename="data/GSE161784_series_matrix.txt.gz",destdir="data",getGPL=F,AnnotGPL=T)下载文件流程

不会下载东西

FAQ

https://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161784/soft/GSE161784_family.soft.gz 与 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full 两个soft文件的区别?

  • 第一个下载得到 GSE161784_family.soft.gz,这是一个表达矩阵文件,GEOQuery会解析该表达矩阵并且自动下载其对应的GPL文件
  • 第二个下载得到 data/GPL19117.soft.gz,这是平台 GPL19117 的 soft 文件(提交者提供的 GPL 注释)

getGEO('GSE161784',GSEMatrix=T, destdir="data")与getGEO(filename="data/GSE161784_series_matrix.txt.gz",destdir="data")作用相同,第二个添加GSEMatrix=F没有作用

getGEO('GSE161784', GSEMatrix=F, destdir="data")

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full 从浏览器下载与使用下面的 download.file 下载的区别

# Same request that getGEOfile issues for GPL19117: the response is written to
# data/GPL19117.soft.gz and a gzip-encoded response is requested via the header.
download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL19117&form=text&view=full",
                "data/GPL19117.soft.gz",mode="w",quiet=TRUE,method="auto",headers = c("accept-encoding"="gzip"))

GPL的下载方式

gse <- getGEO("GSE58469",destdir="data")

# Submitter GPL, fetched from
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL8179&form=text&view=full
gpl97 <- getGEO('GPL8179',destdir=".")
# Manual download of the same URL:
download.file("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL8179&form=text&view=full",
                "GPL8179.soft.gz",mode="w",quiet=TRUE,method="auto",headers = c("accept-encoding"="gzip"))

# With AnnotGPL=TRUE the annotation file is tried first:
# https://ftp.ncbi.nlm.nih.gov/geo/platforms/GPL8nnn/GPL8179/annot/GPL8179.annot.gz
# For GPL8179 this reports
# "Annotation GPL not available, so will use submitter GPL instead".
gpl97 <- getGEO('GPL8179',destdir=".",AnnotGPL=TRUE)

参考