展开

MetaPhlAn源码解读

最后发布时间 : 2024-05-23 14:29:22 浏览量 :

学习资料

生信小木屋

生信小木屋

测试数据集:http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/

markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'], pars['nreads'], pars['subsampling'], pars['subsampling_seed'])

Statistical approach for converting marker abundances into clade(分支) abundances

 wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])

metaphlan自带的数据库

metaphlan_databases
tar -tf mpa_vJan21_CHOCOPhlAnSGB_202103.tar

mpa_vJan21_CHOCOPhlAnSGB_202103.pkl
mpa_vJan21_CHOCOPhlAnSGB_202103_SGB.fna.bz2
mpa_vJan21_CHOCOPhlAnSGB_202103_VINFO.csv
mpa_vJan21_CHOCOPhlAnSGB_202103_VSG.fna.bz2
import pickle
import bz2

db = pickle.load(bz2.open('mpa_vJan21_CHOCOPhlAnSGB_202103.pkl', 'r'))
>>> type(db)
<class 'dict'>
>>> db.keys()
dict_keys(['taxonomy', 'markers', 'merged_taxon'])
>>> list(db['taxonomy'].items())[0]
(
    'k__Bacteria|p__Fusobacteria|c__CFGB15|o__OFGB15|f__FGB15|g__GGB18|s__GGB18_SGB19|t__SGB19', 
    ('2|32066||||||', 1057586)
)
>>> list(db['markers'].items())[0]
(
    'SGB231__BALAOFEP_02157', 
    {
        'clade': 't__SGB231', 
        'ext': [], 
        'len': 3879, 
        'taxon': 'k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'
    }
)
>>> list(db['merged_taxon'].items())[0]
(
    ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_marisnigri|t__SGB177_group', '2157|28890|224756|2191|2194|45989|2198|'), 
    [
        ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_sp_MH98A', '2157|28890|224756|2191|2194|45989|1495314', 1), 
        ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_chikugoensis', '2157|28890|224756|2191|2194|45989|118126', 1), 
        ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_horonobensis', '2157|28890|224756|2191|2194|45989|528314', 1), 
        ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_sediminis', '2157|28890|224756|2191|2194|45989|1550566', 1)
    ]
)

创建marker和taxonomy

db['markers']["test_new_marker"] = {
                                   'clade': "t__SGB231",
                                   'ext': {},
                                   'len': 3879,
                                   'taxon': 'k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'
                                }
db['markers']["test_new_marker"]
db['taxonomy']['k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'] = ('2|32066||||||', 1057586)

研究一下mpa_vJan21_CHOCOPhlAnSGB_202103.pkl

import bz2
import pickle

with bz2.BZ2File( '/ssd1/wy/workspace/MetaPhlAn/test_data/databases/mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl', 'r' ) as a:
    mpa_pkl = pickle.load( a )

mpa = mpa_pkl
df = pd.DataFrame(columns=['clade','value'])
for clade, value in mpa['taxonomy'].items():
    # print(clade, value)
    df.loc[len(df), df.columns] = clade, value
df.to_csv("taxonomy.tsv", sep='\t',index=False)
df

cladevalue
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_pallens|t__SGB1564('2|976|200643|171549|171552|838|60133|', 3107191)
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543('2|976|200643|171549|171552|838|470565|', 3033525)
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidales_unclassified|g__Phocaeicola|s__Phocaeicola_vulgatus|t__SGB1814('2|976|200643|171549||909656|821|', 5224998)
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_caccae|t__SGB1877('2|976|200643|171549|815|816|47678|', 5218573)
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Tannerellaceae|g__Parabacteroides|s__Parabacteroides_distasonis|t__SGB1934('2|976|200643|171549|2005525|375288|823|', 5285465)
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Rikenellaceae|g__Alistipes|s__Alistipes_putredinis|t__SGB2318('2|976|200643|171549|171550|239759|28117|', 2562921)
k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Veillonella|s__Veillonella_dispar|t__SGB6952('2|1239|909932|1843489|31977|29465|39778|', 2133369)
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners|t__SGB7025('2|1239|91061|186826|33958|1578|147802|', 1336395)
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Dolosigranulum|s__Dolosigranulum_pigrum|t__SGB7017('2|1239|91061|186826|186828|29393|29394|', 1947067)
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_jensenii|t__SGB7035('2|1239|91061|186826|33958|1578|109790|', 1692677)
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_crispatus|t__SGB7045('2|1239|91061|186826|33958|1578|47770|', 2205466)
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_parasanguinis|t__SGB8071('2|1239|91061|186826|1300|1301|1318|', 2130341)
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_mitis|t__SGB8163('2|1239|91061|186826|1300|1301|28037|', 2074591)
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pasteurellales|f__Pasteurellaceae|g__Haemophilus|s__Haemophilus_haemolyticus|t__SGB9649('2|1224|1236|135625|712|724|726|', 1943941)
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_nonliquefaciens|t__SGB10426('2|1224|1236|72274|468|475|478|', 2266725)
k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Lautropia|s__Lautropia_dentalis|t__SGB13164('2|1224|28216|80840|119060|47670|2490857|', 3907984)
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_SGB15893|t__SGB15893('2|201174|1760|2037|2049|1654||', 2721100)
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_pseudodiphtheriticum|t__SGB17013('2|201174|1760|85007|1653|1716|37637|', 2333902)
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_matruchotii|t__SGB17007('2|201174|1760|85007|1653|1716|43768|', 2911552)
k__Bacteria|p__Candidatus_Saccharibacteria|c__Candidatus_Saccharibacteria_unclassified|o__Candidatus_Saccharibacteria_unclassified|f__Candidatus_Saccharibacteria_unclassified|g__Candidatus_Saccharibacteria_unclassified|s__Candidatus_Saccharibacteria_unclassified_SGB19850|t__SGB19850_group('2|95818||||||', 926884)
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Lachnospiraceae_unclassified|s__Eubacterium_rectale|t__SGB4933_group('2|1239|186801|186802|186803||39491|', 3333464)
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micrococcales|f__Micrococcaceae|g__Rothia|s__Rothia_dentocariosa|t__SGB16988_group('2|201174|1760|85006|1268|32207|2047|', 2512541)

db['taxonomy']['7-levels taxonomy with clade names of genome1'] = ('7-levels NCBI taxonomy id of genome1', length of genome1)
关于NCBI taxonomy id https://www.ncbi.nlm.nih.gov/taxonomy/?term=60133

df = pd.DataFrame(columns=['marker','clade','ext','len','taxon'])
print(df.columns)
for k,p in mpa['markers'].items():
    # print(p)
    # clade,ext,len,taxon = tuple(p.values())
    df.loc[len(df), df.columns] = k,p['clade'],p['ext'],p['len'],p['taxon']
df.to_csv("marker.tsv", sep='\t',index=False)
df
markercladeextlentaxon
SGB1543__OFGJFGGB_00388t__SGB1543[]3174k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543
SGB1543__JCHLCMHF_00138t__SGB1543[]2970k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543
SGB1543__CDNDMBFI_01014t__SGB1543[]2736k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543

db['markers'][new_marker_name] = {
'clade': the clade that the marker belongs to,
'ext': {the GCA of the first external genome where the marker appears,
the GCA of the second external genome where the marker appears,
},
'len': length of the marker,
'taxon': the taxon of the marker
}

参数

tree = TaxTree( mpa_pkl, ignore_markers )
tree.set_min_cu_len( pars['min_cu_len'] )
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])

  • --min_cu_len: minimum total nucleotide length for the markers in a clade for estimating the abundance without considering sub-clade abundances(最小总核苷酸长度的标记在一个进化枝估计丰度而不考虑子进化枝丰度)
  • --stat': Statistical approach for converting marker abundances into clade abundances(default tavg_g)(标记丰度转换为分支丰度的统计方法)
    • avg_g : clade global (i.e. normalizing all markers together) average
    • avg_l : average of length-normalized marker counts
    • tavg_g : truncated clade global average at --stat_q quantile
    • tavg_l : truncated average of length-normalized marker counts (at --stat_q)
    • wavg_g : winsorized clade global average (at --stat_q)
    • wavg_l : winsorized average of length-normalized marker counts (at --stat_q)
    • med : median of length-normalized marker counts
  • --stat_q: Quantile value for the robust average
  • --perc_nonzero: Percentage of markers with a non zero relative abundance for misidentify a species
  • --avoid_disqm: Deactivate the procedure of disambiguating the quasi-markers based on the marker abundance pattern found in the sample. It is generally recommended to keep the disambiguation procedure in order to minimize false positives
from metaphlan.metaphlan import TaxTree,TaxClade
tree = TaxTree( mpa_pkl, {} )

for key in tree.all_clades:
    print(key,tree.all_clades[key].nreads)
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
    if marker not in tree.markers2lens:
        continue
    print(marker,reads)
    print(marker,len(reads))

提取核心代码

import bz2
import pickle
from metaphlan.metaphlan import map2bbh
import numpy as np
import pandas as pd

with bz2.BZ2File( '/ssd1/wy/workspace/MetaPhlAn/test_data/databases/mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl', 'r' ) as a:
    mpa_pkl = pickle.load( a )
mpa = mpa_pkl
len(mpa['taxonomy'])
from metaphlan.metaphlan import TaxTree,map2bbh
tree = TaxTree( mpa_pkl, {} )

tree.set_min_cu_len( 2000 )



markers2reads, n_metagenome_reads, avg_read_length = map2bbh("/ssd1/wy/workspace/MetaPhlAn/test_data/01.fasta.bowtie2out.txt", 
                                                           5, "bowtie2out", None, None, None, '1992')
tree.set_stat( 'tavg_g',0.2, 0.33, avg_read_length,False)

map_out = []
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
    if marker not in tree.markers2lens:
        continue
    tax_seq, ids_seq = tree.add_reads( marker, len(reads),
                                add_viruses = False,
                                ignore_eukaryotes = False,
                                ignore_bacteria = False,
                                ignore_archaea = False,
                                ignore_ksgbs = False,
                                ignore_usgbs = False
                                )
    if tax_seq:
        map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)]

tax_lev =None
SGB_ANALYSIS = True
tax_units = "kpcofgst"

clade2abundance_n = dict([(tax_label, clade) for tax_label, clade in tree.all_clades.items()
                    if tax_label.startswith("k__") and not clade.uncl])

clade2abundance, clade2est_nreads, tot_ab, tot_reads = {}, {}, 0.0, 0



def compute_abundance( clade ):
    if clade.abundance is not None: return clade.abundance

    sum_ab = sum([compute_abundance(c) for c in clade.children.values()])


    rat_nreads, removed = [], []
    for marker, n_reads in sorted(clade.markers2nreads.items(),key=lambda x:x[0]):
        misidentified = False

        # if not clade.avoid_disqm:
        #     for ext in clade.markers2exts[marker]:
        #         ext_clade = clade.taxa2clades[ext]
        #         m2nr = ext_clade.markers2nreads
                
        #         tocladetmp = ext_clade
        #         while len(tocladetmp.children) == 1:
        #             tocladetmp = list(tocladetmp.children.values())[0]
        #             m2nr = tocladetmp.markers2nreads

        #         nonzeros = sum([v>0 for v in m2nr.values()])
        #         if len(m2nr):
        #             if float(nonzeros) / len(m2nr) > clade.perc_nonzero:
        #                 misidentified = True
        #                 removed.append( (clade.markers2lens[marker],n_reads) )
        #                 break
        if not misidentified:
            rat_nreads.append( (clade.markers2lens[marker],n_reads) )
        # if not clade.avoid_disqm and len(removed):
        #     n_rat_nreads = float(len(rat_nreads))
        #     n_removed = float(len(removed))
        #     n_tot = n_rat_nreads + n_removed
        #     n_ripr = 10

        #     if len(clade.get_terminals()) < 2:
        #         n_ripr = 0

        #     if "k__Viruses" in clade.get_full_name():
        #         n_ripr = 0

        #     if n_rat_nreads < n_ripr and n_tot > n_rat_nreads:
        #         rat_nreads += removed[:n_ripr-int(n_rat_nreads)]       
    rat_nreads = sorted(rat_nreads, key = lambda x: x[1])
    # rat_v: marker 基因的长度  nreads_v: 每一个marker基因对应的reads个数
    rat_v,nreads_v = zip(*rat_nreads) if rat_nreads else ([],[])
    # rat: marker 基因的长度总和 nrawreads:reads的个数
    rat, nrawreads, loc_ab = float(sum(rat_v)) or -1.0, sum(nreads_v), 0.0
    quant = int(clade.quantile*len(rat_nreads))
    ql,qr,qn = (quant,-quant,quant) if quant else (None,None,0)


    if rat < 0.0:
        pass
    elif clade.stat == 'tavg_g':
        # n: 每一个marker基因对应的reads个数; r: 每一个marker基因的长度; avg_read_length: 平均reads的长度
        wnreads = sorted([(float(n)/(np.absolute(r-clade.avg_read_length)+1),(np.absolute(r - clade.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
        den,num = zip(*[v[1:] for v in wnreads[ql:qr]])
        if sum(num) >0:
            print("111")
        loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0

    clade.abundance = loc_ab
    if rat < clade.min_cu_len and clade.children:
        clade.abundance = sum_ab
    elif loc_ab < sum_ab:
        clade.abundance = sum_ab

    if clade.abundance > sum_ab and clade.children: # *1.1??
        clade.uncl_abundance = clade.abundance - sum_ab
    clade.subcl_uncl = not clade.children and clade.name[0] not in tax_units[-2:]

    return clade.abundance


for tax_label, clade in clade2abundance_n.items():
    tot_ab += compute_abundance(clade)

def get_all_abundances(clade ):

    ret = [(clade.name, clade.tax_id, clade.abundance)]
    if clade.uncl_abundance > 0.0:
        lchild = list(clade.children.values())[0].name[:3]
        ret += [(lchild+clade.name[3:]+"_unclassified", "", clade.uncl_abundance)]
    if clade.subcl_uncl and clade.name[0] != tax_units[-2]:
        cind = tax_units.index( clade.name[0] )
        ret += [(   tax_units[cind+1]+clade.name[1:]+"_unclassified","",
                    clade.abundance)]
    for c in clade.children.values():
        ret += get_all_abundances(c)
    return ret


for tax_label, clade in clade2abundance_n.items():
    # print(clade.get_all_abundances())

    # print(get_all_abundances(clade))

    for clade_label, tax_id, abundance in sorted(clade.get_all_abundances(), key=lambda pars:pars[0]):
       clade2abundance[(clade_label, tax_id)] = abundance

ret_d = dict([( tax, float(abundance) / tot_ab if tot_ab else 0.0) for tax, abundance in clade2abundance.items()])
outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in ret_d.items() if relab > 0.0]
for clade, taxid, relab in sorted(  outpred, reverse=True,
                                        key=lambda x:x[2]+(100.0*(8-(x[0].count("|"))))):
    add_repr = ''
    print( "\t".join( [clade, 
                                taxid, 
                                str(relab*1.0), 
                                add_repr
                            ] ) + "\n" )
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
    if marker not in tree.markers2lens:
        continue
    print(marker,reads)
    print(marker,len(reads))
for key in tree.all_clades:
    print(key,tree.all_clades[key].markers2nreads)
for k,p in mpa['markers'].items():
    print(k,p)

Marker level analysis

metaphlan -t marker_ab_table /ssd1/wy/workspace/metagenomics/strainphlan/output/bowtie2/SRS013951.fastq.bowtie2 --input_type bowtie2out -o marker_abundance_table.txt

mark count

metaphlan  --bowtie2db /ssd1/wy/workspace/MetaPhlAn/test_data/databases --index mpa_vJan21_TOY_CHOCOPhlAnSGB_202103 /ssd1/wy/workspace/MetaPhlAn/test_data/SRS014476-Supragingival_plaque.fasta --input_type fasta -t marker_counts