mpa_vJan21_CHOCOPhlAnSGB_202103.pkl
学习资料
markers2reads, n_metagenome_reads, avg_read_length = map2bbh(pars['inp'], pars['min_mapq_val'], pars['input_type'], pars['min_alignment_len'], pars['nreads'], pars['subsampling'], pars['subsampling_seed'])
wnreads = sorted([(float(n)/(np.absolute(r-self.avg_read_length)+1),(np.absolute(r - self.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0])
--bowtie2db METAPHLAN_BOWTIE2_DB Folder containing the MetaPhlAn database. You can specify the location by exporting the DEFAULT_DB_FOLDER variable in the shell.[default /opt/miniforge3/envs/gmwi2_env/lib/python3.8/site-packages/metaphlan/metaphlan_databases]
metaphlan_databasestar -tf mpa_vJan21_CHOCOPhlAnSGB_202103.tar
tar -tf mpa_vJan21_CHOCOPhlAnSGB_202103.tar
mpa_vJan21_CHOCOPhlAnSGB_202103.pkl mpa_vJan21_CHOCOPhlAnSGB_202103_SGB.fna.bz2 mpa_vJan21_CHOCOPhlAnSGB_202103_VINFO.csv mpa_vJan21_CHOCOPhlAnSGB_202103_VSG.fna.bz2
import pickle import bz2 db = pickle.load(bz2.open('mpa_vJan21_CHOCOPhlAnSGB_202103.pkl', 'r'))
>>> type(db) <class 'dict'> >>> db.keys() dict_keys(['taxonomy', 'markers', 'merged_taxon'])
>>> list(db['taxonomy'].items())[0] ( 'k__Bacteria|p__Fusobacteria|c__CFGB15|o__OFGB15|f__FGB15|g__GGB18|s__GGB18_SGB19|t__SGB19', ('2|32066||||||', 1057586) )
>>> list(db['markers'].items())[0] ( 'SGB231__BALAOFEP_02157', { 'clade': 't__SGB231', 'ext': [], 'len': 3879, 'taxon': 'k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231' } )
>>> list(db['merged_taxon'].items())[0] ( ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_marisnigri|t__SGB177_group', '2157|28890|224756|2191|2194|45989|2198|'), [ ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_sp_MH98A', '2157|28890|224756|2191|2194|45989|1495314', 1), ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_chikugoensis', '2157|28890|224756|2191|2194|45989|118126', 1), ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_horonobensis', '2157|28890|224756|2191|2194|45989|528314', 1), ('k__Archaea|p__Euryarchaeota|c__Methanomicrobia|o__Methanomicrobiales|f__Methanomicrobiaceae|g__Methanoculleus|s__Methanoculleus_sediminis', '2157|28890|224756|2191|2194|45989|1550566', 1) ] )
db['markers']["test_new_marker"] = { 'clade': "t__SGB231", 'ext': {}, 'len': 3879, 'taxon': 'k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231' } db['markers']["test_new_marker"]
db['taxonomy']['k__Archaea|p__Euryarchaeota|c__Halobacteria|o__Halobacteriales|f__Haloarculaceae|g__Halomicrobium|s__Halomicrobium_katesii|t__SGB231'] = ('2|32066||||||', 1057586)
import bz2 import pickle with bz2.BZ2File( '/ssd1/wy/workspace/MetaPhlAn/test_data/databases/mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl', 'r' ) as a: mpa_pkl = pickle.load( a ) mpa = mpa_pkl df = pd.DataFrame(columns=['clade','value']) for clade, value in mpa['taxonomy'].items(): # print(clade, value) df.loc[len(df), df.columns] = clade, value df.to_csv("taxonomy.tsv", sep='\t',index=False) df
db['taxonomy']['7-levels taxonomy with clade names of genome1'] = ('7-levels NCBI taxonomy id of genome1', length of genome1)关于NCBI taxonomy id https://www.ncbi.nlm.nih.gov/taxonomy/?term=60133
df = pd.DataFrame(columns=['marker','clade','ext','len','taxon']) print(df.columns) for k,p in mpa['markers'].items(): # print(p) # clade,ext,len,taxon = tuple(p.values()) df.loc[len(df), df.columns] = k,p['clade'],p['ext'],p['len'],p['taxon'] df.to_csv("marker.tsv", sep='\t',index=False) df
db['markers'][new_marker_name] = {'clade': the clade that the marker belongs to,'ext': {the GCA of the first external genome where the marker appears,the GCA of the second external genome where the marker appears,},'len': length of the marker,'taxon': the taxon of the marker}
tree = TaxTree( mpa_pkl, ignore_markers ) tree.set_min_cu_len( pars['min_cu_len'] ) tree.set_stat( pars['stat'], pars['stat_q'], pars['perc_nonzero'], avg_read_length, pars['avoid_disqm'])
--min_cu_len
--stat'
avg_g
avg_l
tavg_g
tavg_l
wavg_g
wavg_l
med
--stat_q
--perc_nonzero
--avoid_disqm
from metaphlan.metaphlan import TaxTree,TaxClade tree = TaxTree( mpa_pkl, {} ) for key in tree.all_clades: print(key,tree.all_clades[key].nreads)
for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]): if marker not in tree.markers2lens: continue print(marker,reads) print(marker,len(reads))
import bz2 import pickle from metaphlan.metaphlan import map2bbh import numpy as np import pandas as pd with bz2.BZ2File( '/ssd1/wy/workspace/MetaPhlAn/test_data/databases/mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl', 'r' ) as a: mpa_pkl = pickle.load( a ) mpa = mpa_pkl len(mpa['taxonomy']) from metaphlan.metaphlan import TaxTree,map2bbh tree = TaxTree( mpa_pkl, {} ) tree.set_min_cu_len( 2000 ) markers2reads, n_metagenome_reads, avg_read_length = map2bbh("/ssd1/wy/workspace/MetaPhlAn/test_data/01.fasta.bowtie2out.txt", 5, "bowtie2out", None, None, None, '1992') tree.set_stat( 'tavg_g',0.2, 0.33, avg_read_length,False) map_out = [] for marker,reads in sorted(markers2reads.items(), key=lambda pars: pars[0]): if marker not in tree.markers2lens: continue tax_seq, ids_seq = tree.add_reads( marker, len(reads), add_viruses = False, ignore_eukaryotes = False, ignore_bacteria = False, ignore_archaea = False, ignore_ksgbs = False, ignore_usgbs = False ) if tax_seq: map_out +=["\t".join([r,tax_seq, ids_seq]) for r in sorted(reads)] tax_lev =None SGB_ANALYSIS = True tax_units = "kpcofgst" clade2abundance_n = dict([(tax_label, clade) for tax_label, clade in tree.all_clades.items() if tax_label.startswith("k__") and not clade.uncl]) clade2abundance, clade2est_nreads, tot_ab, tot_reads = {}, {}, 0.0, 0 def compute_abundance( clade ): if clade.abundance is not None: return clade.abundance sum_ab = sum([compute_abundance(c) for c in clade.children.values()]) rat_nreads, removed = [], [] for marker, n_reads in sorted(clade.markers2nreads.items(),key=lambda x:x[0]): misidentified = False # if not clade.avoid_disqm: # for ext in clade.markers2exts[marker]: # ext_clade = clade.taxa2clades[ext] # m2nr = ext_clade.markers2nreads # tocladetmp = ext_clade # while len(tocladetmp.children) == 1: # tocladetmp = list(tocladetmp.children.values())[0] # m2nr = tocladetmp.markers2nreads # nonzeros = sum([v>0 for v in m2nr.values()]) # if len(m2nr): # if float(nonzeros) / len(m2nr) > clade.perc_nonzero: # misidentified = True # removed.append( (clade.markers2lens[marker],n_reads) ) # break if not misidentified: rat_nreads.append( (clade.markers2lens[marker],n_reads) ) # if not clade.avoid_disqm and len(removed): # n_rat_nreads = float(len(rat_nreads)) # n_removed = float(len(removed)) # n_tot = n_rat_nreads + n_removed # n_ripr = 10 # if len(clade.get_terminals()) < 2: # n_ripr = 0 # if "k__Viruses" in clade.get_full_name(): # n_ripr = 0 # if n_rat_nreads < n_ripr and n_tot > n_rat_nreads: # rat_nreads += removed[:n_ripr-int(n_rat_nreads)] rat_nreads = sorted(rat_nreads, key = lambda x: x[1]) # rat_v: marker 基因的长度 nreads_v: 每一个marker基因对应的reads个数 rat_v,nreads_v = zip(*rat_nreads) if rat_nreads else ([],[]) # rat: marker 基因的长度总和 nrawreads:reads的个数 rat, nrawreads, loc_ab = float(sum(rat_v)) or -1.0, sum(nreads_v), 0.0 quant = int(clade.quantile*len(rat_nreads)) ql,qr,qn = (quant,-quant,quant) if quant else (None,None,0) if rat < 0.0: pass elif clade.stat == 'tavg_g': # n: 每一个marker基因对应的reads个数; r: 每一个marker基因的长度; avg_read_length: 平均reads的长度 wnreads = sorted([(float(n)/(np.absolute(r-clade.avg_read_length)+1),(np.absolute(r - clade.avg_read_length)+1) ,n) for r,n in rat_nreads], key=lambda x:x[0]) den,num = zip(*[v[1:] for v in wnreads[ql:qr]]) if sum(num) >0: print("111") loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0 clade.abundance = loc_ab if rat < clade.min_cu_len and clade.children: clade.abundance = sum_ab elif loc_ab < sum_ab: clade.abundance = sum_ab if clade.abundance > sum_ab and clade.children: # *1.1?? clade.uncl_abundance = clade.abundance - sum_ab clade.subcl_uncl = not clade.children and clade.name[0] not in tax_units[-2:] return clade.abundance for tax_label, clade in clade2abundance_n.items(): tot_ab += compute_abundance(clade) def get_all_abundances(clade ): ret = [(clade.name, clade.tax_id, clade.abundance)] if clade.uncl_abundance > 0.0: lchild = list(clade.children.values())[0].name[:3] ret += [(lchild+clade.name[3:]+"_unclassified", "", clade.uncl_abundance)] if clade.subcl_uncl and clade.name[0] != tax_units[-2]: cind = tax_units.index( clade.name[0] ) ret += [( tax_units[cind+1]+clade.name[1:]+"_unclassified","", clade.abundance)] for c in clade.children.values(): ret += get_all_abundances(c) return ret for tax_label, clade in clade2abundance_n.items(): # print(clade.get_all_abundances()) # print(get_all_abundances(clade)) for clade_label, tax_id, abundance in sorted(clade.get_all_abundances(), key=lambda pars:pars[0]): clade2abundance[(clade_label, tax_id)] = abundance ret_d = dict([( tax, float(abundance) / tot_ab if tot_ab else 0.0) for tax, abundance in clade2abundance.items()]) outpred = [(taxstr, taxid,round(relab*100.0,5)) for (taxstr, taxid), relab in ret_d.items() if relab > 0.0] for clade, taxid, relab in sorted( outpred, reverse=True, key=lambda x:x[2]+(100.0*(8-(x[0].count("|"))))): add_repr = '' print( "\t".join( [clade, taxid, str(relab*1.0), add_repr ] ) + "\n" )
for key in tree.all_clades: print(key,tree.all_clades[key].markers2nreads)
for k,p in mpa['markers'].items(): print(k,p)
metaphlan -t marker_ab_table /ssd1/wy/workspace/metagenomics/strainphlan/output/bowtie2/SRS013951.fastq.bowtie2 --input_type bowtie2out -o marker_abundance_table.txt
metaphlan --bowtie2db /ssd1/wy/workspace/MetaPhlAn/test_data/databases --index mpa_vJan21_TOY_CHOCOPhlAnSGB_202103 /ssd1/wy/workspace/MetaPhlAn/test_data/SRS014476-Supragingival_plaque.fasta --input_type fasta -t marker_counts