MetaPhlAn源码解读
最后发布时间 : 2024-05-23 14:29:22
浏览量 :
学习资料
测试数据集:http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/
markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'], pars['nreads'], pars['subsampling'], pars['subsampling_seed'])
Statistical approach for converting marker abundances into clade(分支) abundances
wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
metaphlan自带的数据库
metaphlan_databasestar -tf mpa_vJan21_CHOCOPhlAnSGB_202103.tar
mpa_vJan21_CHOCOPhlAnSGB_202103.pkl
mpa_vJan21_CHOCOPhlAnSGB_202103_SGB.fna.bz2
mpa_vJan21_CHOCOPhlAnSGB_202103_VINFO.csv
mpa_vJan21_CHOCOPhlAnSGB_202103_VSG.fna.bz2
import pickle
import bz2
db = pickle.load(bz2.open('mpa_vJan21_CHOCOPhlAnSGB_202103.pkl', 'r'))
>>> type(db)
<class 'dict'>
>>> db.keys()
dict_keys(['taxonomy', 'markers', 'merged_taxon'])
>>> list(db['taxonomy'].items())[0]
(
'k__Bacteria|p__Fusobacteria|c__CFGB15|o__OFGB15|f__FGB15|g__GGB18|s__GGB18_SGB19|t__SGB19',
('2|32066||||||', 1057586)
)
>>> list(db['markers'].items())[0]
(
'SGB231__BALAOFEP_02157',
{
'clade': 't__SGB231',
'ext': [],
'len': 3879,
'taxon': 'k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'
}
)
>>> list(db['merged_taxon'].items())[0]
(
('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_marisnigri|t__SGB177_group', '2157|28890|224756|2191|2194|45989|2198|'),
[
('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_sp_MH98A', '2157|28890|224756|2191|2194|45989|1495314', 1),
('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_chikugoensis', '2157|28890|224756|2191|2194|45989|118126', 1),
('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_horonobensis', '2157|28890|224756|2191|2194|45989|528314', 1),
('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_sediminis', '2157|28890|224756|2191|2194|45989|1550566', 1)
]
)
创建marker和taxonomy
db['markers']["test_new_marker"] = {
'clade': "t__SGB231",
'ext': {},
'len': 3879,
'taxon': 'k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'
}
db['markers']["test_new_marker"]
db['taxonomy']['k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'] = ('2|32066||||||', 1057586)
研究一下mpa_vJan21_CHOCOPhlAnSGB_202103.pkl
import bz2
import pickle
with bz2.BZ2File( '/ssd1/wy/workspace/MetaPhlAn/test_data/databases/mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl', 'r' ) as a:
mpa_pkl = pickle.load( a )
mpa = mpa_pkl
df = pd.DataFrame(columns=['clade','value'])
for clade, value in mpa['taxonomy'].items():
# print(clade, value)
df.loc[len(df), df.columns] = clade, value
df.to_csv("taxonomy.tsv", sep='\t',index=False)
df
clade | value |
---|---|
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_pallens|t__SGB1564 | ('2|976|200643|171549|171552|838|60133|', 3107191) |
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543 | ('2|976|200643|171549|171552|838|470565|', 3033525) |
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidales_unclassified|g__Phocaeicola|s__Phocaeicola_vulgatus|t__SGB1814 | ('2|976|200643|171549||909656|821|', 5224998) |
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_caccae|t__SGB1877 | ('2|976|200643|171549|815|816|47678|', 5218573) |
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Tannerellaceae|g__Parabacteroides|s__Parabacteroides_distasonis|t__SGB1934 | ('2|976|200643|171549|2005525|375288|823|', 5285465) |
k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Rikenellaceae|g__Alistipes|s__Alistipes_putredinis|t__SGB2318 | ('2|976|200643|171549|171550|239759|28117|', 2562921) |
k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Veillonella|s__Veillonella_dispar|t__SGB6952 | ('2|1239|909932|1843489|31977|29465|39778|', 2133369) |
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_iners|t__SGB7025 | ('2|1239|91061|186826|33958|1578|147802|', 1336395) |
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Dolosigranulum|s__Dolosigranulum_pigrum|t__SGB7017 | ('2|1239|91061|186826|186828|29393|29394|', 1947067) |
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_jensenii|t__SGB7035 | ('2|1239|91061|186826|33958|1578|109790|', 1692677) |
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_crispatus|t__SGB7045 | ('2|1239|91061|186826|33958|1578|47770|', 2205466) |
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_parasanguinis|t__SGB8071 | ('2|1239|91061|186826|1300|1301|1318|', 2130341) |
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_mitis|t__SGB8163 | ('2|1239|91061|186826|1300|1301|28037|', 2074591) |
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pasteurellales|f__Pasteurellaceae|g__Haemophilus|s__Haemophilus_haemolyticus|t__SGB9649 | ('2|1224|1236|135625|712|724|726|', 1943941) |
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_nonliquefaciens|t__SGB10426 | ('2|1224|1236|72274|468|475|478|', 2266725) |
k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Lautropia|s__Lautropia_dentalis|t__SGB13164 | ('2|1224|28216|80840|119060|47670|2490857|', 3907984) |
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_SGB15893|t__SGB15893 | ('2|201174|1760|2037|2049|1654||', 2721100) |
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_pseudodiphtheriticum|t__SGB17013 | ('2|201174|1760|85007|1653|1716|37637|', 2333902) |
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_matruchotii|t__SGB17007 | ('2|201174|1760|85007|1653|1716|43768|', 2911552) |
k__Bacteria|p__Candidatus_Saccharibacteria|c__Candidatus_Saccharibacteria_unclassified|o__Candidatus_Saccharibacteria_unclassified|f__Candidatus_Saccharibacteria_unclassified|g__Candidatus_Saccharibacteria_unclassified|s__Candidatus_Saccharibacteria_unclassified_SGB19850|t__SGB19850_group | ('2|95818||||||', 926884) |
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Lachnospiraceae_unclassified|s__Eubacterium_rectale|t__SGB4933_group | ('2|1239|186801|186802|186803||39491|', 3333464) |
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micrococcales|f__Micrococcaceae|g__Rothia|s__Rothia_dentocariosa|t__SGB16988_group | ('2|201174|1760|85006|1268|32207|2047|', 2512541) |
db['taxonomy']['7-levels taxonomy with clade names of genome1'] = ('7-levels NCBI taxonomy id of genome1', length of genome1)
关于NCBI taxonomy id https://www.ncbi.nlm.nih.gov/taxonomy/?term=60133
df = pd.DataFrame(columns=['marker','clade','ext','len','taxon'])
print(df.columns)
for k,p in mpa['markers'].items():
# print(p)
# clade,ext,len,taxon = tuple(p.values())
df.loc[len(df), df.columns] = k,p['clade'],p['ext'],p['len'],p['taxon']
df.to_csv("marker.tsv", sep='\t',index=False)
df
marker | clade | ext | len | taxon |
---|---|---|---|---|
SGB1543__OFGJFGGB_00388 | t__SGB1543 | [] | 3174 | k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543 |
SGB1543__JCHLCMHF_00138 | t__SGB1543 | [] | 2970 | k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543 |
SGB1543__CDNDMBFI_01014 | t__SGB1543 | [] | 2736 | k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella|s__Prevotella_histicola|t__SGB1543 |
db['markers'][new_marker_name] = {
'clade': the clade that the marker belongs to,
'ext': {the GCA of the first external genome where the marker appears,
the GCA of the second external genome where the marker appears,
},
'len': length of the marker,
'taxon': the taxon of the marker
}
参数
tree = TaxTree( mpa_pkl, ignore_markers )
tree.set_min_cu_len( pars['min_cu_len'] )
tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])
--min_cu_len
: minimum total nucleotide length for the markers in a clade for estimating the abundance without considering sub-clade abundances(最小总核苷酸长度的标记在一个进化枝估计丰度而不考虑子进化枝丰度)--stat'
: Statistical approach for converting marker abundances into clade abundances(default tavg_g)(标记丰度转换为分支丰度的统计方法)avg_g
: clade global (i.e. normalizing all markers together) averageavg_l
: average of length-normalized marker countstavg_g
: truncated clade global average at --stat_q quantiletavg_l
: truncated average of length-normalized marker counts (at --stat_q)wavg_g
: winsorized clade global average (at --stat_q)wavg_l
: winsorized average of length-normalized marker counts (at --stat_q)med
: median of length-normalized marker counts
--stat_q
: Quantile value for the robust average--perc_nonzero
: Percentage of markers with a non zero relative abundance for misidentify a species--avoid_disqm
: Deactivate the procedure of disambiguating the quasi-markers based on the marker abundance pattern found in the sample. It is generally recommended to keep the disambiguation procedure in order to minimize false positives
from metaphlan.metaphlan import TaxTree,TaxClade
tree = TaxTree( mpa_pkl, {} )
for key in tree.all_clades:
print(key,tree.all_clades[key].nreads)
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
if marker not in tree.markers2lens:
continue
print(marker,reads)
print(marker,len(reads))
提取核心代码
import bz2
import pickle
from metaphlan.metaphlan import map2bbh
import numpy as np
import pandas as pd
with bz2.BZ2File( '/ssd1/wy/workspace/MetaPhlAn/test_data/databases/mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl', 'r' ) as a:
mpa_pkl = pickle.load( a )
mpa = mpa_pkl
len(mpa['taxonomy'])
from metaphlan.metaphlan import TaxTree,map2bbh
tree = TaxTree( mpa_pkl, {} )
tree.set_min_cu_len( 2000 )
markers2reads, n_metagenome_reads, avg_read_length = map2bbh("/ssd1/wy/workspace/MetaPhlAn/test_data/01.fasta.bowtie2out.txt",
5, "bowtie2out", None, None, None, '1992')
tree.set_stat( 'tavg_g',0.2, 0.33, avg_read_length,False)
map_out = []
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
if marker not in tree.markers2lens:
continue
tax_seq, ids_seq = tree.add_reads( marker, len(reads),
add_viruses = False,
ignore_eukaryotes = False,
ignore_bacteria = False,
ignore_archaea = False,
ignore_ksgbs = False,
ignore_usgbs = False
)
if tax_seq:
map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)]
tax_lev =None
SGB_ANALYSIS = True
tax_units = "kpcofgst"
clade2abundance_n = dict([(tax_label, clade) for tax_label, clade in tree.all_clades.items()
if tax_label.startswith("k__") and not clade.uncl])
clade2abundance, clade2est_nreads, tot_ab, tot_reads = {}, {}, 0.0, 0
def compute_abundance( clade ):
if clade.abundance is not None: return clade.abundance
sum_ab = sum([compute_abundance(c) for c in clade.children.values()])
rat_nreads, removed = [], []
for marker, n_reads in sorted(clade.markers2nreads.items(),key=lambda x:x[0]):
misidentified = False
# if not clade.avoid_disqm:
# for ext in clade.markers2exts[marker]:
# ext_clade = clade.taxa2clades[ext]
# m2nr = ext_clade.markers2nreads
# tocladetmp = ext_clade
# while len(tocladetmp.children) == 1:
# tocladetmp = list(tocladetmp.children.values())[0]
# m2nr = tocladetmp.markers2nreads
# nonzeros = sum([v>0 for v in m2nr.values()])
# if len(m2nr):
# if float(nonzeros) / len(m2nr) > clade.perc_nonzero:
# misidentified = True
# removed.append( (clade.markers2lens[marker],n_reads) )
# break
if not misidentified:
rat_nreads.append( (clade.markers2lens[marker],n_reads) )
# if not clade.avoid_disqm and len(removed):
# n_rat_nreads = float(len(rat_nreads))
# n_removed = float(len(removed))
# n_tot = n_rat_nreads + n_removed
# n_ripr = 10
# if len(clade.get_terminals()) < 2:
# n_ripr = 0
# if "k__Viruses" in clade.get_full_name():
# n_ripr = 0
# if n_rat_nreads < n_ripr and n_tot > n_rat_nreads:
# rat_nreads += removed[:n_ripr-int(n_rat_nreads)]
rat_nreads = sorted(rat_nreads, key = lambda x: x[1])
# rat_v: marker 基因的长度 nreads_v: 每一个marker基因对应的reads个数
rat_v,nreads_v = zip(*rat_nreads) if rat_nreads else ([],[])
# rat: marker 基因的长度总和 nrawreads:reads的个数
rat, nrawreads, loc_ab = float(sum(rat_v)) or -1.0, sum(nreads_v), 0.0
quant = int(clade.quantile*len(rat_nreads))
ql,qr,qn = (quant,-quant,quant) if quant else (None,None,0)
if rat < 0.0:
pass
elif clade.stat == 'tavg_g':
# n: 每一个marker基因对应的reads个数; r: 每一个marker基因的长度; avg_read_length: 平均reads的长度
wnreads = sorted([(float(n)/(np.absolute(r-clade.avg_read_length)+1),(np.absolute(r - clade.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
den,num = zip(*[v[1:] for v in wnreads[ql:qr]])
if sum(num) >0:
print("111")
loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0
clade.abundance = loc_ab
if rat < clade.min_cu_len and clade.children:
clade.abundance = sum_ab
elif loc_ab < sum_ab:
clade.abundance = sum_ab
if clade.abundance > sum_ab and clade.children: # *1.1??
clade.uncl_abundance = clade.abundance - sum_ab
clade.subcl_uncl = not clade.children and clade.name[0] not in tax_units[-2:]
return clade.abundance
for tax_label, clade in clade2abundance_n.items():
tot_ab += compute_abundance(clade)
def get_all_abundances(clade ):
ret = [(clade.name, clade.tax_id, clade.abundance)]
if clade.uncl_abundance > 0.0:
lchild = list(clade.children.values())[0].name[:3]
ret += [(lchild+clade.name[3:]+"_unclassified", "", clade.uncl_abundance)]
if clade.subcl_uncl and clade.name[0] != tax_units[-2]:
cind = tax_units.index( clade.name[0] )
ret += [( tax_units[cind+1]+clade.name[1:]+"_unclassified","",
clade.abundance)]
for c in clade.children.values():
ret += get_all_abundances(c)
return ret
for tax_label, clade in clade2abundance_n.items():
# print(clade.get_all_abundances())
# print(get_all_abundances(clade))
for clade_label, tax_id, abundance in sorted(clade.get_all_abundances(), key=lambda pars:pars[0]):
clade2abundance[(clade_label, tax_id)] = abundance
ret_d = dict([( tax, float(abundance) / tot_ab if tot_ab else 0.0) for tax, abundance in clade2abundance.items()])
outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in ret_d.items() if relab > 0.0]
for clade, taxid, relab in sorted( outpred, reverse=True,
key=lambda x:x[2]+(100.0*(8-(x[0].count("|"))))):
add_repr = ''
print( "\t".join( [clade,
taxid,
str(relab*1.0),
add_repr
] ) + "\n" )
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]):
if marker not in tree.markers2lens:
continue
print(marker,reads)
print(marker,len(reads))
for key in tree.all_clades:
print(key,tree.all_clades[key].markers2nreads)
for k,p in mpa['markers'].items():
print(k,p)
Marker level analysis
metaphlan -t marker_ab_table /ssd1/wy/workspace/metagenomics/strainphlan/output/bowtie2/SRS013951.fastq.bowtie2 --input_type bowtie2out -o marker_abundance_table.txt
mark count
metaphlan --bowtie2db /ssd1/wy/workspace/MetaPhlAn/test_data/databases --index mpa_vJan21_TOY_CHOCOPhlAnSGB_202103 /ssd1/wy/workspace/MetaPhlAn/test_data/SRS014476-Supragingival_plaque.fasta --input_type fasta -t marker_counts