browseVignettes("TCGAbiolinks") 查看帮助
下载count数据
library(TCGAbiolinks) GDC_projects <- getGDCprojects() # 查看TCGA项目 GDC_projects # 查询数据 query_LIHC_counts <- GDCquery(project = "TCGA-LIHC", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts") # 下载数据 GDCdownload(query_LIHC_counts) # 合并数据 LIHC_rnaseq_biolinks <- GDCprepare(query = query_LIHC_counts, summarizedExperiment = FALSE, save = TRUE, save.filename = "data/LIHC_TCGAbiolinks/LIHC_rnaseq_biolinks.rda") load(file = "data/LIHC_TCGAbiolinks/LIHC_rnaseq_biolinks.rda") tail(LIHC_rnaseq_biolinks, 10)[ ,1:5]
summarizedExperiment = T
library(SummarizedExperiment) summary(RT_rnaseq_SE) col_data <- colData(RT_rnaseq_SE) row_data <- rowData(RT_rnaseq_SE) expr_data <- assay(RT_rnaseq_SE)
图片alt
下载临床信息
# clinical data query_clin_LIHC <- GDCquery(project = "TCGA-LIHC", data.category = "Clinical", data.type = "Clinical Supplement", data.format = "BCR Biotab") # GDCdownload(query_clin_LIHC) clinical.BCRtab.all <- GDCprepare(query_clin_LIHC) names(clinical.BCRtab.all) sort(colnames(clinical.BCRtab.all$clinical_patient_lihc))
# 方式二 LIHC_clin <- GDCquery_clinic(project = "TCGA-LIHC", type = "clinical") LIHC_clin_xena <- readr::read_tsv("data/LIHC_UCSC/TCGA-LIHC.GDC_phenotype.tsv.gz")
TCGA BarcodeSample Type Codes
json_file <- "data/LIHC_GDC/metadata.cart.2020-07-15.json" TCGA_metadata <- function(path = json_file) { metadata <- jsonlite::read_json(path, simplifyVector = T) metadata <- tibble::tibble( file_name = metadata$file_name, md5sum = metadata$md5sum, TCGA_id_full = bind_rows(metadata$associated_entities)$entity_submitter_id, TCGA_id = stringr::str_sub(TCGA_id_full, 1, 16), patient_id = stringr::str_sub(TCGA_id, 1, 12), tissue_type_id = stringr::str_sub(TCGA_id, 14, 15), tissue_type = sapply(tissue_type_id, function(x) { switch(x, "01" = "Primary Solid Tumor", "02" = "Recurrent Solid Tumor", "03" = "Primary Blood Derived Cancer - Peripheral Blood", "05" = "Additional - New Primary", "06" = "Metastatic", "07" = "Additional Metastatic", "11" = "Solid Tissue Normal")}), group = ifelse(tissue_type_id == "11", "Normal", "Tumor")) return(metadata) } metadata2 <- TCGA_metadata()
file_path <- "data/LIHC_GDC/gdc_download_20200715_050611.695593" counts_files <- list.files(path = file_path, pattern = "counts.gz", full.names = T, recursive = T) head(counts_files) # 验证下载文件的md5sum值 pkgs_in("tools") all(tools::md5sum(counts_files) %in% metadata$md5sum) ## first file test <- readr::read_tsv(counts_files[1], col_names = F) head(test) tail(test) ## merged counts data counts_df <- counts_files %>% lapply(function(x) { tmp <- read_tsv(x, col_names = F) %>% purrr::set_names("gene_id", basename(x)) cat(which(counts_files == x), "of", length(counts_files), "\n") return(tmp) }) %>% reduce(function(x, y) full_join(x, y, by = "gene_id")) %>% dplyr::select(gene_id, metadata$file_name) %>% set_names("gene_id", metadata$TCGA_id_full) %>% dplyr::slice(1:(nrow(.)-5)) LIHC_counts <- counts_df LIHC_metadata <- metadata save(LIHC_counts, LIHC_metadata, file = "data/LIHC_GDC/LIHC_counts.rda")
ID转换downlaod
gtf_v22 <- read_tsv("data/gencode.gene.info.v22.tsv") id2symbol <- gtf_v22 %>% dplyr::select(1, 2) LIHC_counts_filtered_anno <- id2symbol %>% inner_join(LIHC_counts_filtered, by = "gene_id")
sample_anno <- readr::read_tsv(file = "data/merged_sample_quality_annotations.tsv") LIHC_anno <- sample_anno %>% dplyr::filter(aliquot_barcode %in% colnames(LIHC_counts)) %>% write_tsv("data/LIHC_anno.tsv") sample_good <- read_tsv("data/LIHC_anno_examined.tsv") %>% pull(aliquot_barcode) %>% na.omit() sample_rm <- setdiff(LIHC_anno$aliquot_barcode, sample_good) LIHC_counts_filtered <- LIHC_counts %>% select(gene_id, all_of(sample_good))
gtf_v22 <- read_tsv("data/gencode.gene.info.v22.tsv") ## lncRNA sort(table(gtf_v22$gene_type), decreasing = T) sort(unique(gtf_v22$gene_type)) ## lncRNA 范围 from ensembl.org ## https://m.ensembl.org/info/genome/genebuild/biotypes.html # Long non-coding RNA (lncRNA): A non-coding gene/transcript >200bp in length # 3' overlapping ncRNA: Transcripts where ditag and/or published experimental data strongly supports the existence of long (>200bp) non-coding transcripts that overlap the 3'UTR of a protein-coding locus on the same strand. # Antisense: Transcripts that overlap the genomic span (i.e. exon or introns) of a protein-coding locus on the opposite strand. # Macro lncRNA: Unspliced lncRNAs that are several kb in size. # Non coding: Transcripts which are known from the literature to not be protein coding. # Retained intron: An alternatively spliced transcript believed to contain intronic sequence relative to other, coding, transcripts of the same gene. # Sense intronic: A long non-coding transcript in introns of a coding gene that does not overlap any exons. # Sense overlapping: A long non-coding transcript that contains a coding gene in its intron on the same strand. # lincRNA (long intergenic ncRNA): Transcripts that are long intergenic non-coding RNA locus with a length >200bp. Requires lack of coding potential and may not be conserved between species. lncRNA_types <- "3prime_overlapping_ncrna, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic, sense_overlapping" lncRNA_types <- unlist(str_split(lncRNA_types, pattern = ", ")) lncRNA_types lncRNA_types %in% unique(gtf_v22$gene_type) ## gene type info gene_type <- gtf_v22 %>% select(gene_id, gene_type) %>% arrange(gene_id) dim(gene_type) identical(gene_type$gene_id, LIHC_counts$gene_id) ## 提取mRNA expr_mRNA <- LIHC_counts[gene_type$gene_type == "protein_coding", ] ## 提取lncRNA expr_lncRNA <- LIHC_counts[gene_type$gene_type %in% lncRNA_types, ]
作用弥补TCGA数据库中缺少癌旁组织样
#### 1 miRNA expression ---- ## import json file source("codes/custom_functions.R") pkgs_in("jsonlite") library(tidyverse) json_file <- "data/LIHC_miRNA/miRNA_Expression/metadata.cart.LIHC.miRNA.json" metadata <- TCGA_metadata() ## miRNA_files file_path <- "data/LIHC_miRNA/miRNA_Expression/" miRNA_files <- list.files(path = file_path, pattern = "mirnas.quantification.txt", full.names = T, recursive = T) head(miRNA_files) ## 验证下载文件的md5sum值 pkgs_in("tools") all(tools::md5sum(miRNA_files) %in% metadata$md5sum) ## first file test <- readr::read_tsv(miRNA_files[1]) dim(test) head(test) tail(test) tail(test$read_count/sum(test$read_count)*10^6) ## merged miRNA counts data miRNA_df <- miRNA_files %>% lapply(function(x) { tmp <- read_tsv(x, col_names = T)[ ,1:2] %>% purrr::set_names("miRNA_id", basename(x)) cat(which(miRNA_files == x), "of", length(miRNA_files), "\n") return(tmp) }) %>% purrr::reduce(.f = function(x, y) full_join(x, y, by = "miRNA_id")) %>% dplyr::select(miRNA_id, metadata$file_name) %>% purrr::set_names("miRNA_id", metadata$TCGA_id_full) sum(is.na(miRNA_df)) ## save data LIHC_miRNA <- miRNA_df LIHC_metadata_miRNA <- metadata save(LIHC_miRNA, LIHC_metadata_miRNA, file = "data/LIHC_miRNA/LIHC_miRNA.rda") #### 2 miRNA isoform expression ---- ## import json file rm(list = ls()) source("codes/custom_functions.R") pkgs_in("jsonlite") library(tidyverse) json_file <- "data/LIHC_miRNA/miRNA_Isoform/metadata.cart.LIHC.miRNA.Isoform.json" metadata <- TCGA_metadata() ## miRNA_isoform_files file_path <- "data/LIHC_miRNA/miRNA_Isoform/" miRNA_isoform_files <- list.files(path = file_path, pattern = "isoforms.quantification.txt", full.names = T, recursive = T) head(miRNA_isoform_files, 3) ## 验证下载文件的md5sum值 pkgs_in("tools") all(tools::md5sum(miRNA_isoform_files) %in% metadata$md5sum) ## first file test <- readr::read_tsv(miRNA_isoform_files[1], col_names = T) head(test) tail(test) ## miRNA注释信息 # ftp://mirbase.org/pub/mirbase/21/ miRNA_anno <- read_lines("data/LIHC_miRNA/mature.fa.gz") %>% as_tibble() %>% dplyr::filter(str_detect(value, pattern = "Homo")) %>% tidyr::separate(col = value, into = paste0("ID", 1:5), sep = " ") %>% dplyr::select(isoform_id = ID2, miRNA_name = ID5) ## merged miRNA isoform counts data isoform_list <- miRNA_isoform_files %>% lapply(function(x) { tmp <- read_tsv(x) %>% dplyr::select(isoform_id = miRNA_region, read_count) %>% mutate(isoform_id = str_extract(isoform_id, "MIMAT.*")) %>% na.omit() %>% aggregate(. ~ isoform_id, data = ., sum) %>% # aggregate(read_count ~ isoform_id, data = ., sum) %>% purrr::set_names("isoform_id", basename(x)) cat(which(miRNA_isoform_files == x), "of", length(miRNA_isoform_files), "\n") return(tmp) }) ## aggregate # aggregate(weight ~ feed, data = chickwts, mean) # aggregate(breaks ~ wool + tension, data = warpbreaks, mean) # aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, mean) # aggregate(cbind(ncases, ncontrols) ~ alcgp + tobgp, data = esoph, sum) # iris # aggregate(. ~ Species, data = iris, mean) # ToothGrowth # aggregate(len ~ ., data = ToothGrowth, mean) test2 <- full_join(x = isoform_list[[1]], y = isoform_list[[2]], by = "isoform_id") isoform_df <- isoform_list %>% reduce(function(x, y) full_join(x, y, by = "isoform_id")) %>% select(isoform_id, metadata$file_name) %>% set_names("isoform_id", metadata$TCGA_id_full) %>% mutate(across(.fns = ~ ifelse(is.na(.x), 0, .x))) %>% right_join(miRNA_anno, ., by = "isoform_id") sum(is.na(isoform_df)) ## save data LIHC_miRNA_iso <- isoform_df LIHC_metadata_miRNA_iso <- metadata save(LIHC_miRNA_iso, LIHC_metadata_miRNA_iso, file = "data/LIHC_miRNA/LIHC_miRNA_isoform.rda") #### End ----
get_mircode <- function(){ if(!file.exists("result/mircode.rda")){ mircode <- read.table("/home/wangyang/workspace/bioinfo_analysis/Rscript/data/mircode_highconsfamilies.txt",sep = "\t",header = T) table(mircode$gene_class) lncmiRcode = mircode[mircode$gene_class %in%c("lncRNA, intergenic","lncRNA, overlapping"),1:4] head(lncmiRcode$microrna) library(stringr) p1 = str_starts(lncmiRcode$microrna,"miR-") table(p1) p2 = str_starts(lncmiRcode$microrna,"let-") table(p2) lncmiRcode_let <- lncmiRcode[p2,]%>% dplyr::select(c("gene_symbol","microrna"))%>% mutate(miRNA=str_replace_all(microrna,"let-",""))%>% separate_rows(miRNA,sep = "/",convert = T)%>% mutate_at(vars(contains("miRNA")), ~ paste0("hsa-let-",.))%>% mutate(miRNA = str_extract(miRNA,"hsa-let-[0-9]+[a-z]?"))%>% dplyr::select(c("gene_symbol","miRNA")) lncmiRcode_miR <- lncmiRcode[p1,]%>% dplyr::select(c("gene_symbol","microrna"))%>% mutate(miRNA=str_replace_all(microrna,"miR-",""))%>% separate_rows(miRNA,sep = "/",convert = T)%>% mutate_at(vars(contains("miRNA")), ~ paste0("hsa-mir-",.))%>% mutate(miRNA = str_extract(miRNA,"hsa-mir-[0-9]+[a-z]?"))%>% dplyr::select(c("gene_symbol","miRNA")) lncmiRcode_f <- rbind(lncmiRcode_let,lncmiRcode_miR) saveRDS(lncmiRcode_f,file = "result/mircode.rda") return(lncmiRcode_f) } return(readRDS("result/mircode.rda")) }
#### load data ---- load_data <- load("data/LIHC_GDC/LIHC_counts.rda") #### 样本过滤 ---- # Merged Sample Quality Annotations - merged_sample_quality_annotations.tsv # https://gdc.cancer.gov/about-data/publications/pancanatlas library(tidyverse) sample_anno <- readr::read_tsv(file = "data/merged_sample_quality_annotations.tsv") LIHC_anno <- sample_anno %>% dplyr::filter(aliquot_barcode %in% colnames(LIHC_counts)) %>% write_tsv("data/LIHC_anno.tsv") sample_good <- read_tsv("data/LIHC_anno_examined.tsv") %>% pull(aliquot_barcode) %>% na.omit() sample_rm <- setdiff(LIHC_anno$aliquot_barcode, sample_good) LIHC_counts_filtered <- LIHC_counts %>% select(gene_id, all_of(sample_good)) #### 准备数据 ---- ### metadata ---- metadata <- LIHC_metadata %>% filter(tissue_type_id %in% c("01", "11")) %>% add_count(patient_id, name = "n_patient") %>% filter(n_patient == 2) %>% arrange(patient_id) %>% as.data.frame() rownames(metadata) <- metadata$TCGA_id ### counts ---- paired_counts <- LIHC_counts %>% set_names("gene_id", str_sub(colnames(.)[-1], 1,16)) %>% column_to_rownames("gene_id") %>% select(metadata$TCGA_id) keep <- rowSums(paired_counts > 0) >= 50 head(keep) paired_counts <- paired_counts[keep,] ### vst ---- library(DESeq2) group <- metadata$group %>% as.factor() colData = data.frame(sample_id = colnames(paired_counts), group = group) DDS <- DESeq2::DESeqDataSetFromMatrix(paired_counts, colData = metadata, design = ~ group) vst <- DESeq2::vst(DDS) counts_vst <- assay(vst) #### boxplot & density plot ---- boxplot(log2(paired_counts+1), las = 2, outline = F, col = c("red", "blue")) limma::plotDensities(log2(paired_counts+1), legend = F) boxplot(counts_vst, las = 2, outline = F, col = group) limma::plotDensities(counts_vst , legend = F) ## ggplot2 data data_counts <- log2(paired_counts + 1) %>% pivot_longer(cols = everything(), names_to = "TCGA_id") %>% left_join(metadata %>% select(TCGA_id, group), by = "TCGA_id") data_vst <- counts_vst %>% as.data.frame() %>% pivot_longer(cols = everything(), names_to = "TCGA_id") %>% left_join(metadata %>% select(TCGA_id, group), by = "TCGA_id") ## ggplot2 box plot ggplot(data_counts, aes(x = TCGA_id, y = value, fill = group)) + geom_boxplot(outlier.shape = NA) + theme_bw() + xlab("samples") + ylab("log2 count value") + theme(axis.text.x = element_blank(), axis.ticks = element_blank()) ggplot(data_vst, aes(x = TCGA_id, y = value, fill = group)) + geom_boxplot(outlier.shape = NA) + theme_bw() + xlab("samples") + ylab("vst transformed value") + theme(axis.text.x = element_blank(), axis.ticks = element_blank()) ## ggplot2 density plot ggplot(data_counts, aes(x = value, col = TCGA_id)) + geom_density() + theme_bw() + xlab("samples") + xlim(0, 20) + theme(legend.position = "none") ggplot(data_vst, aes(x = value, col = TCGA_id)) + geom_density() + theme_bw() + xlim(0, 20) + theme(legend.position = "none") sample_rm <- str_sub(sample_rm, 1, 16) sum(colnames(counts_vst) %in% sample_rm) data_vst_filter <- data_vst %>% dplyr::filter(!(TCGA_id %in% sample_rm)) ggplot(data_vst_filter, aes(x = value, col = TCGA_id)) + geom_density() + theme_bw() + xlim(0, 20) + theme(legend.position = "none") #### PCA plot ---- source("codes/custom_functions.R") PCA_new(log2(paired_counts+1), group = metadata$group) PCA_new(counts_vst, group = metadata$group) # 加载包 library("factoextra") library("FactoMineR") # 准备数据 pca <- PCA(t(log2(paired_counts+1)), graph = FALSE) # 作图 fviz_pca_ind(pca, label = "none", habillage = as.factor(metadata$group), palette = c("#00AFBB", "#E7B800"), addEllipses = TRUE) #### 聚类树状图 ---- expr <- as.matrix(counts_vst) dist <- dist(t(expr)) hc = hclust(dist) library(factoextra) library(RColorBrewer) pdf("data/clust.pdf", height = 10, width = 20) fviz_dend(hc, k = 6, cex = 0.5, k_colors = brewer.pal(4, "Set2"), color_labels_by_k = TRUE, ggtheme = theme_classic()) dev.off() #### End ----
metadata <- query_COAD_counts[[1]][[1]] metadata <- tibble::tibble( TCGA_id_full = metadata$cases, TCGA_id = stringr::str_sub(TCGA_id_full, 1, 16), patient_id = stringr::str_sub(TCGA_id, 1, 12), tissue_type_id = stringr::str_sub(TCGA_id, 14, 15), tissue_type = sapply(tissue_type_id, function(x) { switch(x, "01" = "Primary Solid Tumor", "02" = "Recurrent Solid Tumor", "03" = "Primary Blood Derived Cancer - Peripheral Blood", "05" = "Additional - New Primary", "06" = "Metastatic", "07" = "Additional Metastatic", "11" = "Solid Tissue Normal")}), group = ifelse(tissue_type_id == "11", "Normal", "Tumor")) rownames(metadata) <- metadata$TCGA_id_full
TCGA_metadata <- function(path = json_file) { metadata <- jsonlite::read_json(path, simplifyVector = T) metadata <- tibble::tibble( file_name = metadata$file_name, md5sum = metadata$md5sum, TCGA_id_full = bind_rows(metadata$associated_entities)$entity_submitter_id, TCGA_id = stringr::str_sub(TCGA_id_full, 1, 16), patient_id = stringr::str_sub(TCGA_id, 1, 12), tissue_type_id = stringr::str_sub(TCGA_id, 14, 15), tissue_type = sapply(tissue_type_id, function(x) { switch(x, "01" = "Primary Solid Tumor", "02" = "Recurrent Solid Tumor", "03" = "Primary Blood Derived Cancer - Peripheral Blood", "05" = "Additional - New Primary", "06" = "Metastatic", "07" = "Additional Metastatic", "11" = "Solid Tissue Normal")}), group = ifelse(tissue_type_id == "11", "Normal", "Tumor")) return(metadata) } metadata2 <- TCGA_metadata()
counts <- LIHC_counts_filtered %>% column_to_rownames("gene_id") %>% dplyr::select(metadata$TCGA_id_full) identical(colnames(counts), rownames(metadata))
### 2.1 构建dds对象 ---- library(DESeq2) ?DESeqDataSetFromMatrix() dds <- DESeqDataSetFromMatrix(countData = counts, colData = metadata, design = ~ group) ## PCA图 vsd <- vst(dds, blind = TRUE) vsd_df <- assay(vsd) head(vsd_df, 5)[, 1:2] DESeq2::plotPCA(vsd, intgroup = "group") source("codes/custom_functions.R") PCA_new(expr = vsd_df, group = metadata$group) ## remove batch effect vsd_batch_rm <- limma::removeBatchEffect(x = vsd_df, batch = metadata$batch) PCA_new(expr = vsd_batch_rm, group = metadata$group) ### 2.2 基因过滤 ---- table(metadata$group) keep <- rowSums(counts(dds) > 0) >= 44 dds_filt <- dds[keep, ] ### 2.3 DESeq一步完成差异分析 ---- time1 <- Sys.time() dds2 <- DESeq(dds_filt) runtime1 <- Sys.time() - time1 runtime1 # Time difference of 4.238453 mins time2 <- Sys.time() dds2 <- DESeq(dds_filt, parallel = T) runtime2 <- Sys.time() - time2 runtime2 # Time difference of 3.506404 mins # save(dds2, file = "data/dds2.Rda") load(file = "data/dds2.Rda") ### 2.4 提取差异分析结果 ---- resultsNames(dds2) res <- results(dds2) # 基因ID对应的基因名 gtf_v22 <- read_tsv("data/gencode.gene.info.v22.tsv") %>% dplyr::select(gene_id, gene_name) res_deseq2 <- as.data.frame(res) %>% rownames_to_column("gene_id") %>% left_join(gtf_v22, by = "gene_id") %>% relocate(gene_name, .after = "gene_id") %>% arrange(padj) %>% dplyr::filter(abs(log2FoldChange) > 1, padj < 0.05) # Note on p-values set to NA: some values in the results table can be set to NA for one of the following reasons: # If within a row, all samples have zero counts, the baseMean column will be zero, and the log2 fold change estimates, p value and adjusted p value will all be set to NA. # If a row contains a sample with an extreme count outlier then the p value and adjusted p value will be set to NA. These outlier counts are detected by Cook’s distance. # If a row is filtered by automatic independent filtering, for having a low mean normalized count, then only the adjusted p value will be set to NA. # baseMean # https://support.bioconductor.org/p/75244/ head(rowMeans(DESeq2::counts(dds2, normalized=TRUE, replaced = TRUE)))
dds_batch <- DESeqDataSetFromMatrix(countData = counts, colData = metadata, design = ~ batch + group) keep <- rowSums(counts(dds_batch) > 0) >= 44 dds_filt2 <- dds_batch[keep, ] dds_batch2 <- DESeq2::DESeq(dds_filt2, parallel = T) # save(dds_batch2, file = "data/dds_batch2.Rda") load("data/dds_batch2.Rda") resultsNames(dds_batch2) res_batch <- results(dds_batch2) res_deseq2_batch <- as.data.frame(res_batch) %>% rownames_to_column("gene_id") %>% left_join(gtf_v22, by = "gene_id") %>% relocate(gene_name, .after = "gene_id") %>% arrange(padj) %>% dplyr::filter(abs(log2FoldChange) > 1, padj < 0.05)
length(intersect(res_deseq2_batch$gene_id, res_deseq2$gene_id)) library(VennDiagram) venn.diagram(x = list("rm_batch" = res_deseq2_batch$gene_id, "batch" = res_deseq2$gene_id), filename = "results/Venn_2.jpeg", fill = c("blue", "red"), scaled = T, cex = 1.5, cat.cex = 1.5, main = "Venn Diagram", main.cex = 2)
library(TCGAbiolinks) project_df <- (function(){ projects <- TCGAbiolinks:::getGDCprojects()$project_id projects_full <- projects[grepl('^TCGA',projects,perl=T)] projects_ID <- sapply(projects_full,function(x){stringi::stri_sub(x,6,10)}) res <- data.frame(cancer=projects_ID,TCGA_ID=projects_full) return(res) })() readr::write_csv(project_df,file = "TCGA_PROJECT.csv")
import pandas as pd df = pd.read_csv("TCGA_PROJECT.csv") for item in df['TCGA_ID']: print("./DownloadFPKM.R FPKM {0}".format(item)) print("./DownloadFPKM.R Counts {0}".format(item)) print("./DownloadFPKM.R miRNA {0}".format(item))
#!/usr/bin/Rscript library(TCGAbiolinks) getwd() args=commandArgs(T) print(args[1]) #type="FPKM" #project = "TCGA-CHOL" type=args[1] project = args[2] #####下载FPKM (function(){ if(!file.exists("GDCdata/FPKM")){ dir.create("GDCdata/FPKM",recursive = T) } if(!file.exists("GDCdata/Counts")){ dir.create("GDCdata/Counts",recursive = T) } if(!file.exists("result")){ dir.create("result") } if(!file.exists("GDCdata/miRNA")){ dir.create("GDCdata/miRNA",recursive = T) } # match.file.cases.all <- NULL # for(project in project_df[,2]){ filename <- gsub("-","_",paste0("result/",project,"_FPKM.tsv")) if(file.exists(filename)){ message("###########",filename,"存在#######################################################") }else{ if(type=="FPKM"){ query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - FPKM") GDCdownload(query,directory = "GDCdata/FPKM",method = "api") data <- GDCprepare(query, directory = "GDCdata/FPKM", summarizedExperiment = F, save = F) readr::write_tsv(data,file =filename ) }else if(type=="Counts"){ query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts") GDCdownload(query,directory = "GDCdata/Counts",method = "api") data <- GDCprepare(query, directory = "GDCdata/Counts", summarizedExperiment = F, save = F) readr::write_tsv(data,file =filename ) }else if(type=="miRNA"){ query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "miRNA Expression Quantification", workflow.type = "BCGSC miRNA Profiling") GDCdownload(query,directory = "GDCdata/miRNA",method = "api") data <- GDCprepare(query, directory = "GDCdata/miRNA", summarizedExperiment = F, save = F) readr::write_tsv(data,file =filename ) } } })()
#!/usr/bin/Rscript library(TCGAbiolinks) getwd() project_df <- (function(){ projects <- TCGAbiolinks:::getGDCprojects()$project_id projects_full <- projects[grepl('^TCGA',projects,perl=T)] projects_ID <- sapply(projects_full,function(x){stringi::stri_sub(x,6,10)}) res <- data.frame(cancer=projects_ID,TCGA_ID=projects_full) return(res) })() #####下载FPKM (function(){ if(!file.exists("GDCdata/FPKM")){ dir.create("GDCdata/FPKM",recursive = T) } if(!file.exists("result")){ dir.create("result") } match.file.cases.all <- NULL for(project in project_df[,2]){ filename <- gsub("-","_",paste0("result/",project,"_FPKM.tsv")) if(file.exists(filename)){ message("###########",filename,"存在#######################################################") next }else{ query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - FPKM") match.file.cases <- getResults(query,cols=c("cases","file_name")) match.file.cases$project <- project match.file.cases.all <- rbind(match.file.cases.all,match.file.cases) GDCdownload(query,directory = "GDCdata/FPKM",method = "api") table(match.file.cases.all$project) data <- GDCprepare(query, directory = "GDCdata/FPKM", summarizedExperiment = F, save = F) readr::write_tsv(data,file =filename ) } } readr::write_tsv(match.file.cases.all, path = "TCGA_FPKM.tsv") })() #####下载COUNT (function(){ if(!file.exists("GDCdata/Counts")){ dir.create("GDCdata/Counts",recursive = T) } if(!file.exists("result")){ dir.create("result") } match.file.cases.all <- NULL for(project in project_df[,2]){ filename <- gsub("-","_",paste0("result/",project,"_Counts.tsv")) if(file.exists(filename)){ message("###########",filename,"存在#######################################################") next }else{ query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts") match.file.cases <- getResults(query,cols=c("cases","file_name")) match.file.cases$project <- project match.file.cases.all <- rbind(match.file.cases.all,match.file.cases) GDCdownload(query,directory = "GDCdata/Counts",method = "api") table(match.file.cases.all$project) data <- GDCprepare(query, directory = "GDCdata/Counts", summarizedExperiment = F, save = F) readr::write_tsv(data,file =filename ) } # project <- "TCGA-BRCA" } readr::write_tsv(match.file.cases.all, path = "TCGA_Counts.tsv") })() #####下载miRNA (function(){ if(!file.exists("GDCdata/miRNA")){ dir.create("GDCdata/miRNA",recursive = T) } if(!file.exists("result")){ dir.create("result") } match.file.cases.all <- NULL for(project in project_df[,2]){ filename <- gsub("-","_",paste0("result/",project,"_miRNA.tsv")) if(file.exists(filename)){ message("###########",filename,"存在#######################################################") next }else{ #project <- "TCGA-BRCA" query<- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "miRNA Expression Quantification", workflow.type = "BCGSC miRNA Profiling") match.file.cases <- getResults(query,cols=c("cases","file_name")) match.file.cases$project <- project match.file.cases.all <- rbind(match.file.cases.all,match.file.cases) GDCdownload(query,directory = "GDCdata/miRNA",method = "api") table(match.file.cases.all$project) data <- GDCprepare(query, directory = "GDCdata/miRNA", summarizedExperiment = F, save = F) readr::write_tsv(data,file =filename ) } } readr::write_tsv(match.file.cases.all, path = "TCGA_miRNA.tsv") })()