学习资料
Trinotate 是一个全面的注释套件,设计用于模型或非模型生物的转录组,特别是从头组装的转录组的自动功能注释。Trinotate 利用多种不同的参考方法进行功能注释,包括对已知序列数据(BLAST +/SwissProt)进行同源性搜索,蛋白质结构域鉴定(HMMER/PFAM) ,蛋白质信号肽和跨膜结构域预测(signalP/tmHMM) ,并利用各种注释数据库(eggNOG/GO/Kegg 数据库)。所有来自转录本分析的功能注释数据集成到 SQLite 数据库中,该数据库允许快速有效地搜索与所需的科学假设相关的具有特定质量的术语,或者为转录组创建整个注释报告的方法。
Trinotate 包括 TrinotateWeb,它提供了一个本地驱动的基于 Web 的图形界面,用于导航转录组注释并使用 Trinity/RSEM/Bioiconductor 分析框架分析转录本表达和差异表达。
Trinotate 在很大程度上依赖于 SwissProt 和 Pfam,定制的蛋白质文件生成如下所述,专门用于 Trinotate。您可以通过运行这个 Trinotate 构建过程来获得蛋白质数据库文件。此步骤将下载几个数据资源,包括 swissprot、 pfam 和其他相关资源的最新版本,创建并填充 Trinotate 样板 sqlite 数据库(Trinotate.sqlite) ,并生成与 BLAST 一起使用的 uniprot_sprot.pep文件,以及用于 Pfam 搜索的Pfam-A.hmm.gz文件。像这样运行构建过程:
uniprot_sprot.pep
Pfam-A.hmm.gz
$TRINOTATE_HOME/Trinotate --create \ --db myTrinotate.sqlite \ --trinotate_data_dir /path/to/TRINOTATE_DATA_DIR \ --use_diamond
docker run --rm -it \ --user $(id -u):$(id -g) \ -v $PWD:/data \ -v /tmp:/tmp \ -e TRINOTATE_HOME=/usr/local/src/Trinotate \ -w $PWD \ trinityrnaseq/trinotate bash $TRINOTATE_HOME/Trinotate --create \ --db /data/myTrinotate.sqlite \ --trinotate_data_dir /data/db \ --use_diamond
读取结果
trinotate <- read_tsv("results/annotation/trinotate_annotation_report.txt",show_col_types = FALSE)
提取Pfam结果
trinotate |> select(gene_id="#gene_id",Pfam ) |> filter(Pfam!=".") %>% aggregate(Pfam~gene_id, data = .,paste,collapse = ", ") |> mutate(id=str_split(gene_id,pattern = "\\.")%>% map_chr(., 2) , id=as.integer(id) ) |> arrange(id) |> select(-id)
gene_id Pfam MSTRG.5 PF10551.12^MULE^MULE transposase domain^106-202^E:5e-13 MSTRG.13 PF14529.9^Exo_endo_phos_2^Endonuclease-reverse transcriptase^93-215^E:1.2e-16`PF03372.26^Exo_endo_phos^Endonuclease/Exonuclease/phosphatase family^120-212^E:4e-08`PF00078.30^RVT_1^Reverse transcriptase (RNA-dependent DNA polymerase)^503-751^E:5.9e-44, PF00098.26^zf-CCHC^Zinc knuckle^422-439^E:0.0021 MSTRG.16 PF00078.30^RVT_1^Reverse transcriptase (RNA-dependent DNA polymerase)^25-172^E:2e-19 MSTRG.21 PF14529.9^Exo_endo_phos_2^Endonuclease-reverse transcriptase^107-190^E:3.2e-11, PF07530.14^PRE_C2HC^Associated with zinc fingers^205-268^E:8.9e-16
提取UniProt结果
trinotate |> select(gene_id="#gene_id",UniProt="sprot_Top_BLASTP_hit") |> filter(UniProt!=".") |> mutate(UniProt=str_split(UniProt,pattern = "\\^")%>% map_chr(., 1))
gene_id UniProt MSTRG.32 AIMP1_CRIGR MSTRG.35 CID16_SCHPO MSTRG.22 ALPL_ARATH MSTRG.70 BRE4_CAEBR
UniProt_symbol <- read_tsv("resources/bioDBnet_db2db_221019032316_1638756.txt",show_col_types = FALSE) UniProt_symbol <- UniProt_symbol[,c(1,2)] colnames(UniProt_symbol) <- c("UniProt","Symbol") head(UniProt_symbol)
UniProt Symbol AIMP1_CRIGR AIMP1 CID16_SCHPO cid16 ALPL_ARATH AT3G55350 BRE4_CAEBR bre-4
trinotate |> select(gene_id="#gene_id",UniProt="sprot_Top_BLASTP_hit") |> filter(UniProt!=".") |> mutate(UniProt=str_split(UniProt,pattern = "\\^")%>% map_chr(., 1)) |> inner_join(UniProt_symbol,by="UniProt")
gene_id UniProt Symbol MSTRG.32 AIMP1_CRIGR AIMP1 MSTRG.35 CID16_SCHPO cid16 MSTRG.22 ALPL_ARATH AT3G55350
提取GO注释
get_go_tab <- function(data,col){ data |> dplyr::select(gene_id="#gene_id",go=all_of(col) ) |> filter(go!=".") |> separate_rows(go, sep = "`") |> mutate(goid=str_split(go,pattern = "\\^")%>% map_chr(., 1), ontology=str_split(go,pattern = "\\^")%>% map_chr(., 2), term=str_split(go,pattern = "\\^")%>% map_chr(., 3)) |> select(-go) }
get_go_tab(trinotate,col="gene_ontology_Pfam") gene_id goid ontology term MSTRG.13 GO:0003824 molecular_function catalytic activity MSTRG.13 GO:0003676 molecular_function nucleic acid binding
get_go_tab(trinotate,col="gene_ontology_BLASTX") gene_id goid ontology term MSTRG.32 GO:0017101 cellular_component aminoacyl-tRNA synthetase multienzyme complex MSTRG.32 GO:0005829 cellular_component cytosol MSTRG.32 GO:0005783 cellular_component endoplasmic reticulum
get_go_tab(trinotate,col="gene_ontology_BLASTP") gene_id goid ontology term MSTRG.32 GO:0017101 cellular_component aminoacyl-tRNA synthetase multienzyme complex MSTRG.32 GO:0005829 cellular_component cytosol MSTRG.32 GO:0005783 cellular_component endoplasmic reticulum
go_tab <- lapply(c("gene_ontology_Pfam","gene_ontology_BLASTX","gene_ontology_BLASTP"), function(x){get_go_tab(trinotate,col=x)}) |> reduce(function(x,y)rbind(x,y)) |> unique() head(go_tab) gene_id goid ontology term MSTRG.13 GO:0003824 molecular_function catalytic activity MSTRG.13 GO:0003676 molecular_function nucleic acid binding MSTRG.13 GO:0008270 molecular_function zinc ion binding
生成基因与GO映射关系
go_tab |> select(gene_id,go_id= goid) %>% aggregate(go_id~gene_id, data = .,paste,collapse = ", ") |> mutate(id=str_split(gene_id,pattern = "\\.")%>% map_chr(., 2) , id=as.integer(id) ) |> arrange(id) |> select(-id) gene_id go_id MSTRG.13 GO:0003824, GO:0003676, GO:0008270 MSTRG.21 GO:0003824 MSTRG.22 GO:0000118, GO:0046872, GO:0004518
提取kegg注释
kegg <- trinotate |> dplyr::select(transcript_id,gene_id="#gene_id",kegg=all_of("Kegg") ) |> filter(kegg!=".") |> unique() |> mutate(kegg_org=str_split(kegg,pattern = ":")%>% map_chr(., 2), kegg_id=str_split(kegg,pattern = ":")%>% map_chr(., 3), kegg=paste0(kegg_org,":",kegg_id)) |> unique() head(kegg) transcript_id gene_id kegg kegg_org kegg_id MSTRG.32.1 MSTRG.32 cge:100689238 cge 100689238 MSTRG.35.1 MSTRG.35 spo:SPAC17H9.01 spo SPAC17H9.01 MSTRG.22.1 MSTRG.22 ath:AT3G55350 ath AT3G55350
ko_org.0 <- readRDS("/data/wangyang/reproducibility/SitobionAvenae/resources/ko_org.rds") head(ko_org.0) X1 X2 cge:100768823 ko:K25539 cge:100772829 ko:K25533 cge:100755390 ko:K25530 cge:107978677 ko:K02980
ko_list <- readRDS("/data/wangyang/reproducibility/SitobionAvenae/resources/ko_list.rds") colnames(ko_list) <- c("ko","desc") head(ko_list) ko desc ko:K00001 E1.1.1.1, adh; alcohol dehydrogenase [EC:1.1.1.1] ko:K00002 AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2] ko:K00003 hom; homoserine dehydrogenase [EC:1.1.1.3]
ko_list <- ko_list |> mutate(ko=gsub("ko:","",ko)) ko_list ko desc K00001 E1.1.1.1, adh; alcohol dehydrogenase [EC:1.1.1.1] K00002 AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2] K00003 hom; homoserine dehydrogenase [EC:1.1.1.3]
ko_org<- ko_org.0 |> reduce(function(x,y)rbind(x,y)) head(ko_org) X1 X2 cge:100768823 ko:K25539 cge:100772829 ko:K25533 cge:100755390 ko:K25530
colnames(ko_org) <- c("kegg","ko") ko_org <- ko_org |> mutate(ko=gsub("ko:","",ko)) head(ko_org) kegg ko cge:100768823 K25539 cge:100772829 K25533 cge:100755390 K25530
kegg_tab <- ko_org |> inner_join(kegg,by = "kegg") |> inner_join(ko_list,by="ko") |> select(gene_id,org=kegg,ko,desc) head(kegg_tab) gene_id org ko desc MSTRG.5239 cge:100689274 K11496 CENPB; centromere protein B MSTRG.10917 cge:100689274 K11496 CENPB; centromere protein B MSTRG.5269 cge:100689402 K00236 SDHC, SDH3; succinate dehydrogenase (ubiquinone) cytochrome b560 subunit
kegg_map <- kegg_tab |> mutate(KO_Database_GenesAnno=paste0(ko,"|",org,"|",desc)) |> select(gene_id, KO_Database_GenesAnno) |> unique() %>% aggregate(KO_Database_GenesAnno~gene_id, data = .,paste, collapse=" ") head(kegg_map) gene_id KO_Database_GenesAnno 1 MSTRG.10001 K16756|dme:Dmel_CG32137|BICDL, CCDC64; BICD family-like cargo adapter 2 MSTRG.10013 K01648|rno:24159|ACLY; ATP citrate (pro-S)-lyase [EC:2.3.3.8] 3 MSTRG.10019 K01897|hsa:2182|ACSL, fadD; long-chain acyl-CoA synthetase [EC:6.2.1.3] K01897|pon:100174659|ACSL, fadD; long-chain acyl-CoA synthetase [EC:6.2.1.3]
x <- bitr_kegg(kegg_tab$ko,"kegg", "Path",organism= "ko") y <- ko2name(x$Path) pathway <- merge(x, y, by.x='Path', by.y='ko') |> select(pathway=Path,ko=kegg,name) |> unique() x <- bitr_kegg(kegg_tab$ko,"kegg", "Path",organism= "ko") y <- ko2name(x$Path) pathway <- merge(x, y, by.x='Path', by.y='ko') |> select(pathway=Path,ko=kegg,name) |> unique() gene_id ko org pathway name MSTRG.5269 K00236 cge:100689402 ko00020 Citrate cycle (TCA cycle) MSTRG.5269 K00236 cge:100689402 ko00190 Oxidative phosphorylation MSTRG.5269 K00236 cge:100689402 ko01100 Metabolic pathways MSTRG.5269 K00236 cge:100689402 ko01110 Biosynthesis of secondary metabolites MSTRG.5269 K00236 cge:100689402 ko01120 Microbial metabolism in diverse environments
go_anno.0 <- read_tsv("/data/wangyang/reproducibility/SitobionAvenae/results/enrichment/go.tsv", show_col_types = FALSE) |>as.data.frame() |> mutate(ontology=case_when(ontology=="molecular_function"~"MF", ontology=="biological_process"~"BP", ontology=="cellular_component"~"CC")) head(go_anno.0) gene_id goid ontology term 1 MSTRG.13 GO:0003824 MF catalytic activity 2 MSTRG.13 GO:0003676 MF nucleic acid binding 3 MSTRG.13 GO:0008270 MF zinc ion binding 4 MSTRG.26 GO:0046983 MF protein dimerization activity
go_anno.0[c('goid','gene_id')] goid gene_id GO:0003824 MSTRG.13 GO:0003676 MSTRG.13 GO:0008270 MSTRG.13
go_anno.0[c('goid','term')] goid term GO:0003824 catalytic activity GO:0003676 nucleic acid binding GO:0008270 zinc ion binding
go_rich_list <- lapply(c("BP","MF","CC"),function(x){ go_anno <- subset(go_anno.0, ontology==x) go_rich <- enricher(gene = deg_gene, TERM2GENE = go_anno[c('goid','gene_id')], TERM2NAME = go_anno[c('goid','term')], pvalueCutoff = 1000, pAdjustMethod = 'BH', qvalueCutoff = 100, maxGSSize = 200) go_rich@ontology <- x go_rich }) names(go_rich_list) <- c("BP","MF","CC")
kegg_anno.0 <- read_tsv("/data/wangyang/reproducibility/SitobionAvenae/results/enrichment/kegg.tsv", show_col_types = FALSE) |>as.data.frame() head(kegg_anno.0) kegg_anno <- kegg_anno.0 gene_id ko org pathway name 1 MSTRG.5269 K00236 cge:100689402 ko00020 Citrate cycle (TCA cycle) 2 MSTRG.5269 K00236 cge:100689402 ko00190 Oxidative phosphorylation 3 MSTRG.5269 K00236 cge:100689402 ko01100 Metabolic pathways
kegg_anno[c('pathway','gene_id')] pathway gene_id ko00020 MSTRG.5269 ko00190 MSTRG.5269 ko01100 MSTRG.5269 ko01110 MSTRG.5269
kegg_anno[c('pathway','name')] pathway name ko00020 Citrate cycle (TCA cycle) ko00190 Oxidative phosphorylation ko01100 Metabolic pathways ko01110 Biosynthesis of secondary metabolites
kegg_rich <- enricher(gene = deg_gene, TERM2GENE = kegg_anno[c('pathway','gene_id')], TERM2NAME = kegg_anno[c('pathway','name')], pvalueCutoff = 54, pAdjustMethod = 'BH', qvalueCutoff = 100, maxGSSize = 200)