功能基因注释
最后发布时间 : 2023-08-06 21:20:14
浏览量 :
3.3 功能基因注释
# 输入数据:上一步预测的蛋白序列 result/NR/protein.fa
# 中间结果:temp/eggnog/protein.emapper.seed_orthologs
# temp/eggnog/output.emapper.annotations
# temp/eggnog/output
# COG定量表:result/eggnog/cogtab.count
# result/eggnog/cogtab.count.spf (用于STAMP)
# KO定量表:result/eggnog/kotab.count
# result/eggnog/kotab.count.spf (用于STAMP)
# CAZy碳水化合物注释和定量:result/dbcan2/cazytab.count
# result/dbcan2/cazytab.count.spf (用于STAMP)
# 抗生素抗性:result/resfam/resfam.count
# result/resfam/resfam.count.spf (用于STAMP)
# 这部分可以拓展到其它数据库
3.3.1 基因注释eggNOG(COG/KEGG/CAZy)
# https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2
# 记录软件版本
conda activate eggnog
emapper.py --version # 2.1.6
# diamond比对基因至eggNOG 5.0数据库, 9p11m, 1~9h,默认diamond 1e-3
mkdir -p temp/eggnog
time emapper.py --no_annot --no_file_comments --override \
--data_dir ${db}/eggnog \
-i result/NR/protein.fa \
--cpu 9 -m diamond \
-o temp/eggnog/protein
# 比对结果功能注释, 1h # sqlite3.OperationalError: no such table: prots是数据库不配套,重新下载即可
emapper.py \
--annotate_hits_table temp/eggnog/protein.emapper.seed_orthologs \
--data_dir ${db}/eggnog \
--cpu 9 --no_file_comments --override \
-o temp/eggnog/output
# 2.1较2.0结果又有新变化,添加了#号表头,减少了列
sed '1 s/^#//' temp/eggnog/output.emapper.annotations \
> temp/eggnog/output
csvtk -t headers -v temp/eggnog/output
summarizeAbundance生成COG/KO/CAZy丰度汇总表
mkdir -p result/eggnog
# 显示帮助,需要Python3环境,可修改软件第一行指定python位置,如指定某Python执行脚本 /mnt/bai/yongxin/miniconda2/envs/humann3/bin/python3 /db/EasyMicrobiome/script/summarizeAbundance.py
summarizeAbundance.py -h
# 汇总,7列COG_category按字母分隔,12列KEGG_ko和19列CAZy按逗号分隔,原始值累加
# 指定humann3中的Python 3.7.6运行正常,qiime2中的Python 3.6.13报错
summarizeAbundance.py \
-i result/salmon/gene.TPM \
-m temp/eggnog/output \
-c '7,12,19' -s '*+,+,' -n raw \
-o result/eggnog/eggnog
sed -i 's/^ko://' result/eggnog/eggnog.KEGG_ko.raw.txt
sed -i '/^-/d' result/eggnog/eggnog*
# eggnog.CAZy.raw.txt eggnog.COG_category.raw.txt eggnog.KEGG_ko.raw.txt
# 添加注释生成STAMP的spf格式,结合metadata.txt进行差异比较
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \
/db/EasyMicrobiome/kegg/KO_description.txt \
result/eggnog/eggnog.KEGG_ko.raw.txt | \
sed 's/^\t/Unannotated\t/' \
> result/eggnog/eggnog.KEGG_ko.TPM.spf
# KO to level 1/2/3
summarizeAbundance.py \
-i result/eggnog/eggnog.KEGG_ko.raw.txt \
-m /db/EasyMicrobiome/kegg/KO1-4.txt \
-c 2,3,4 -s ',+,+,' -n raw \
-o result/eggnog/KEGG
# CAZy
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \
/db/EasyMicrobiome/dbcan2/CAZy_description.txt result/eggnog/eggnog.CAZy.raw.txt | \
sed 's/^\t/Unannotated\t/' > result/eggnog/eggnog.CAZy.TPM.spf
# COG
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2"\t"$3} NR>FNR{print a[$1],$0}' \
/db/EasyMicrobiome/eggnog/COG.anno result/eggnog/eggnog.COG_category.raw.txt > \
result/eggnog/eggnog.COG_category.TPM.spf
### 3.3.2 (可选)碳水化合物dbCAN2
# 比对CAZy数据库, 用时2~18m
mkdir -p temp/dbcan2
# --sensitive慢10倍,dbCAN2推荐e值为1e-102,此处结果3条太少,以1e-3为例演示
diamond blastp \
--db /db/dbcan2/CAZyDB.09242021 \
--query result/NR/protein.fa \
--threads 9 -e 1e-3 --outfmt 6 --max-target-seqs 1 --quiet \
--out temp/dbcan2/gene_diamond.f6
wc -l temp/dbcan2/gene_diamond.f6
# 整理比对数据为表格
mkdir -p result/dbcan2
# 提取基因与dbcan分类对应表
format_dbcan2list.pl \
-i temp/dbcan2/gene_diamond.f6 \
-o temp/dbcan2/gene.list
# 按对应表累计丰度,依赖
summarizeAbundance.py \
-i result/salmon/gene.TPM \
-m temp/dbcan2/gene.list \
-c 2 -s ',' -n raw \
-o result/dbcan2/TPM
# 添加注释生成STAMP的spf格式,结合metadata.txt进行差异比较
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \
/db/EasyMicrobiome/dbcan2/CAZy_description.txt result/dbcan2/TPM.CAZy.raw.txt | \
sed 's/^\t/Unannotated\t/' > result/dbcan2/TPM.CAZy.raw.spf
# 检查未注释数量,有则需要检查原因
# grep 'Unannotated' result/dbcan2/TPM.CAZy.raw.spf|wc -l
3.3.3 抗生素抗性CARD
数据库:https://card.mcmaster.ca/ ,有在线分析平台,本地代码供参考
# 参考文献:http://doi.org/10.1093/nar/gkz935
# 软件使用Github: https://github.com/arpcard/rgi
# 启动rgi环境
conda activate rgi
rgi -h # 5.2.1
# 蛋白注释
mkdir -p result/card
cut -f 1 -d ' ' result/NR/protein.fa > temp/protein.fa
rgi main -i temp/protein.fa -t protein \
-n 9 -a DIAMOND --include_loose --clean \
-o result/card/protein
结果说明:
- protein.json,在线可视化
- protein.txt,注释基因列表
3.4 基因物种注释
# Generate report in default taxid output
conda activate meta
memusg -t kraken2 --db /db/kraken2/mini \
result/NR/nucleotide.fa \
--threads 3 \
--report temp/NRgene.report \
--output temp/NRgene.output
# Genes & taxid list
grep '^C' temp/NRgene.output|cut -f 2,3|sed '1 i Name\ttaxid' \
> temp/NRgene.taxid
# Add taxonomy
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$0} NR>FNR{print $1,a[$2]}' \
/db/EasyMicrobiome/kraken2/taxonomy.txt \
temp/NRgene.taxid \
> result/NR/nucleotide.tax
memusg -t /conda2/envs/humann3/bin/python3 /db/EasyMicrobiome/script/summarizeAbundance.py \
-i result/salmon/gene.TPM \
-m result/NR/nucleotide.tax \
-c '2,3,4,5,6,7,8,9' -s ',+,+,+,+,+,+,+,' -n raw \
-o result/NR/tax
wc -l result/NR/tax*|sort -n