# 输入文件:拼装好的序列文件 result/megahit/final.contigs.fa # 输出文件:prodigal预测的基因序列 temp/prodigal/gene.fa # 基因文件大,可参考附录prodigal拆分基因文件,并行计算 mkdir -p temp/prodigal # prodigal的meta模式预测基因,35s,>和2>&1记录分析过程至gene.log prodigal -i result/megahit/final.contigs.fa \ -d temp/prodigal/gene.fa \ -o temp/prodigal/gene.gff \ -p meta -f gff > temp/prodigal/gene.log 2>&1 # 查看日志是否运行完成,有无错误 tail temp/prodigal/gene.log # 统计基因数量 seqkit stat temp/prodigal/gene.fa # 统计完整基因数量,数据量大可只用完整基因部分 grep -c 'partial=00' temp/prodigal/gene.fa # 提取完整基因(完整片段获得的基因全为完整,如成环的细菌基因组) grep 'partial=00' temp/prodigal/gene.fa | cut -f1 -d ' '| sed 's/>//' > temp/prodigal/full_length.id seqkit grep -f temp/prodigal/full_length.id temp/prodigal/gene.fa > temp/prodigal/full_length.fa seqkit stat temp/prodigal/full_length.fa
# 输入文件:prodigal预测的基因序列 temp/prodigal/gene.fa # 输出文件:去冗余后的基因和蛋白序列:result/NR/nucleotide.fa, result/NR/protein.fa mkdir -p result/NR # aS覆盖度,c相似度,G局部比对,g最优解,T多线程,M内存0不限制 # 2万基因2m,2千万需要2000h,多线程可加速 cd-hit-est -i temp/prodigal/gene.fa \ -o result/NR/nucleotide.fa \ -aS 0.9 -c 0.95 -G 0 -g 0 -T 0 -M 0 # 统计非冗余基因数量,单次拼接结果数量下降不大,多批拼接冗余度高 grep -c '>' result/NR/nucleotide.fa # 翻译核酸为对应蛋白序列, --trim去除结尾的* seqkit translate --trim result/NR/nucleotide.fa \ > result/NR/protein.fa # 两批数据去冗余使用cd-hit-est-2d加速,见附录
# 输入文件:去冗余后的基因和蛋白序列:result/NR/nucleotide.fa # 输出文件:Salmon定量后的结果:result/salmon/gene.count, gene.TPM mkdir -p temp/salmon salmon -v # 1.4.0 # 建索引, -t序列, -i 索引,10s salmon index \ -t result/NR/nucleotide.fa \ -p 9 \ -i temp/salmon/index # 定量,l文库类型自动选择,p线程,--meta宏基因组模式, 2个任务并行2个样 # 注意parallel中待并行的命令必须是双引号,内部变量需要使用原始绝对路径 tail -n+2 result/metadata.txt|cut -f1|rush -j 2 \ "salmon quant \ -i temp/salmon/index -l A -p 3 --meta \ -1 temp/qc/{1}_1.fastq \ -2 temp/qc/{1}_2.fastq \ -o temp/salmon/{1}.quant" # 合并 mkdir -p result/salmon salmon quantmerge --quants temp/salmon/*.quant \ -o result/salmon/gene.TPM salmon quantmerge --quants temp/salmon/*.quant \ --column NumReads -o result/salmon/gene.count sed -i '1 s/.quant//g' result/salmon/gene.* # 预览结果表格 head -n3 result/salmon/gene.*
# 输入数据:上一步预测的蛋白序列 result/NR/protein.fa # 中间结果:temp/eggnog/protein.emapper.seed_orthologs # temp/eggnog/output.emapper.annotations # temp/eggnog/output # COG定量表:result/eggnog/cogtab.count # result/eggnog/cogtab.count.spf (用于STAMP) # KO定量表:result/eggnog/kotab.count # result/eggnog/kotab.count.spf (用于STAMP) # CAZy碳水化合物注释和定量:result/dbcan2/cazytab.count # result/dbcan2/cazytab.count.spf (用于STAMP) # 抗生素抗性:result/resfam/resfam.count # result/resfam/resfam.count.spf (用于STAMP) # 这部分可以拓展到其它数据库
# https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2 # 记录软件版本 conda activate eggnog emapper.py --version # 2.1.6 # diamond比对基因至eggNOG 5.0数据库, 9p11m, 1~9h,默认diamond 1e-3 mkdir -p temp/eggnog time emapper.py --no_annot --no_file_comments --override \ --data_dir ${db}/eggnog \ -i result/NR/protein.fa \ --cpu 9 -m diamond \ -o temp/eggnog/protein # 比对结果功能注释, 1h # sqlite3.OperationalError: no such table: prots是数据库不配套,重新下载即可 emapper.py \ --annotate_hits_table temp/eggnog/protein.emapper.seed_orthologs \ --data_dir ${db}/eggnog \ --cpu 9 --no_file_comments --override \ -o temp/eggnog/output # 2.1较2.0结果又有新变化,添加了#号表头,减少了列 sed '1 s/^#//' temp/eggnog/output.emapper.annotations \ > temp/eggnog/output csvtk -t headers -v temp/eggnog/output summarizeAbundance生成COG/KO/CAZy丰度汇总表 mkdir -p result/eggnog # 显示帮助,需要Python3环境,可修改软件第一行指定python位置,如指定某Python执行脚本 /mnt/bai/yongxin/miniconda2/envs/humann3/bin/python3 /db/EasyMicrobiome/script/summarizeAbundance.py summarizeAbundance.py -h # 汇总,7列COG_category按字母分隔,12列KEGG_ko和19列CAZy按逗号分隔,原始值累加 # 指定humann3中的Python 3.7.6运行正常,qiime2中的Python 3.6.13报错 summarizeAbundance.py \ -i result/salmon/gene.TPM \ -m temp/eggnog/output \ -c '7,12,19' -s '*+,+,' -n raw \ -o result/eggnog/eggnog sed -i 's/^ko://' result/eggnog/eggnog.KEGG_ko.raw.txt sed -i '/^-/d' result/eggnog/eggnog* # eggnog.CAZy.raw.txt eggnog.COG_category.raw.txt eggnog.KEGG_ko.raw.txt # 添加注释生成STAMP的spf格式,结合metadata.txt进行差异比较 awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \ /db/EasyMicrobiome/kegg/KO_description.txt \ result/eggnog/eggnog.KEGG_ko.raw.txt | \ sed 's/^\t/Unannotated\t/' \ > result/eggnog/eggnog.KEGG_ko.TPM.spf # KO to level 1/2/3 summarizeAbundance.py \ -i result/eggnog/eggnog.KEGG_ko.raw.txt \ -m /db/EasyMicrobiome/kegg/KO1-4.txt \ -c 2,3,4 -s ',+,+,' -n raw \ -o result/eggnog/KEGG # CAZy awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \ /db/EasyMicrobiome/dbcan2/CAZy_description.txt result/eggnog/eggnog.CAZy.raw.txt | \ sed 's/^\t/Unannotated\t/' > result/eggnog/eggnog.CAZy.TPM.spf # COG awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2"\t"$3} NR>FNR{print a[$1],$0}' \ /db/EasyMicrobiome/eggnog/COG.anno result/eggnog/eggnog.COG_category.raw.txt > \ result/eggnog/eggnog.COG_category.TPM.spf ### 3.3.2 (可选)碳水化合物dbCAN2 # 比对CAZy数据库, 用时2~18m mkdir -p temp/dbcan2 # --sensitive慢10倍,dbCAN2推荐e值为1e-102,此处结果3条太少,以1e-3为例演示 diamond blastp \ --db /db/dbcan2/CAZyDB.09242021 \ --query result/NR/protein.fa \ --threads 9 -e 1e-3 --outfmt 6 --max-target-seqs 1 --quiet \ --out temp/dbcan2/gene_diamond.f6 wc -l temp/dbcan2/gene_diamond.f6 # 整理比对数据为表格 mkdir -p result/dbcan2 # 提取基因与dbcan分类对应表 format_dbcan2list.pl \ -i temp/dbcan2/gene_diamond.f6 \ -o temp/dbcan2/gene.list # 按对应表累计丰度,依赖 summarizeAbundance.py \ -i result/salmon/gene.TPM \ -m temp/dbcan2/gene.list \ -c 2 -s ',' -n raw \ -o result/dbcan2/TPM # 添加注释生成STAMP的spf格式,结合metadata.txt进行差异比较 awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \ /db/EasyMicrobiome/dbcan2/CAZy_description.txt result/dbcan2/TPM.CAZy.raw.txt | \ sed 's/^\t/Unannotated\t/' > result/dbcan2/TPM.CAZy.raw.spf # 检查未注释数量,有则需要检查原因 # grep 'Unannotated' result/dbcan2/TPM.CAZy.raw.spf|wc -l
数据库:https://card.mcmaster.ca/ ,有在线分析平台,本地代码供参考
# 参考文献:http://doi.org/10.1093/nar/gkz935 # 软件使用Github: https://github.com/arpcard/rgi # 启动rgi环境 conda activate rgi rgi -h # 5.2.1 # 蛋白注释 mkdir -p result/card cut -f 1 -d ' ' result/NR/protein.fa > temp/protein.fa rgi main -i temp/protein.fa -t protein \ -n 9 -a DIAMOND --include_loose --clean \ -o result/card/protein
结果说明:
# Generate report in default taxid output conda activate meta memusg -t kraken2 --db /db/kraken2/mini \ result/NR/nucleotide.fa \ --threads 3 \ --report temp/NRgene.report \ --output temp/NRgene.output # Genes & taxid list grep '^C' temp/NRgene.output|cut -f 2,3|sed '1 i Name\ttaxid' \ > temp/NRgene.taxid # Add taxonomy awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$0} NR>FNR{print $1,a[$2]}' \ /db/EasyMicrobiome/kraken2/taxonomy.txt \ temp/NRgene.taxid \ > result/NR/nucleotide.tax memusg -t /conda2/envs/humann3/bin/python3 /db/EasyMicrobiome/script/summarizeAbundance.py \ -i result/salmon/gene.TPM \ -m result/NR/nucleotide.tax \ -c '2,3,4,5,6,7,8,9' -s ',+,+,+,+,+,+,+,' -n raw \ -o result/NR/tax wc -l result/NR/tax*|sort -n