需求:构建13个物种的13个基因的系统进化树
12个
from Bio import Entrez import os Entrez.email = "1749748955@qq.com" def downloadByNucleotide(ids,path,file_type,endName): if not os.path.exists(path): os.makedirs(path) print("共下载序列",len(ids),"条") for id in ids: print("准备下载",id,"....") hd_efetch_fa = Entrez.efetch(db='nucleotide', id=id, rettype=file_type) read_efetch_fa = hd_efetch_fa.read() with open("{0}/{1}.{2}".format(path,id,endName),"w") as file: file.write(read_efetch_fa) print(id,"下载完成....") # download genbank sequencess def downloadByNucleotideAndGenbank(ids,path): downloadByNucleotide(ids,path,"gb","gb") if __name__ == "__main__": ids=["NC_037234.1","NC_029326.1","NC_024028.1","NC_030265.1","KY689117.1","KY689119.1","NC_034282.1","NC_037207.1","NC_037235.1","NC_030266.1","KY689125.1","NC_037205.1"] downloadByNucleotideAndGenbank(ids,"temp/source")
. ├── KY689117.1.gb ├── KY689119.1.gb ├── KY689125.1.gb ├── NC_024028.1.gb ├── NC_029326.1.gb ├── NC_030265.1.gb ├── NC_030266.1.gb ├── NC_034282.1.gb ├── NC_037205.1.gb ├── NC_037207.1.gb ├── NC_037234.1.gb └── NC_037235.1.gb
拷贝需要比对的基因到 temp/source
temp/source
from Bio import SeqIO import os list_sort=["ND2","COX1","COX2","ATP8","ATP6","COX3","ND3","ND5","ND4","ND4L","ND6","CYTB","ND1"] def linkGene(input_path,output_path): # 如果不存在就创建文件夹 if not os.path.exists(input_path): os.makedirs(input_path) if not os.path.exists(output_path): os.makedirs(output_path) # 遍历该目录下的每一个文件夹 for file_name in os.listdir(input_path): # 使用biopython读取genbank文件的完整序列 records = SeqIO.read("{0}/{1}".format(input_path,file_name),"genbank") #从Genbank获取完整序列 complete_seq = str(records.seq) # 打印基因名称 print(">"+records.annotations["source"]+" "+records.id) gene_dict = {} for feature in records.features: if feature.type == "CDS": # print(feature.qualifiers.get("gene")) # 获取基因的名称eg.[ND1、ND2] gene_name = feature.qualifiers.get("gene")[0] # print(type(feature.location.parts)) gene_seq="" for gene in feature.location.parts: # print(complete_seq[gene.start:gene.end]) # 获取对应的基因序列 gene_seq = gene_seq + complete_seq[gene.start:gene.end] # gene_dict[gene_name]=gene_seq #按照顺序遍历基因 print("定义了",len(list_sort),"个基因","序列有",len(gene_dict),"个基因") for gene in list_sort: # 第一次循环创建文件 f = open("{0}/{1}.fasta".format(output_path,gene),"a") if gene in gene_dict: # 添加基因到文件,有12个物种,1个基因文件有12个序列 print("找到基因:",gene,"长度为:",len(gene_dict[gene])) f.write(">"+records.annotations["source"]+" "+records.name+"\n"+gene_dict[gene]+"\n") else: print("error----------","没有找到基因:",gene) print("*******写入完成*******") if __name__ == "__main__": linkGene("temp/source","temp/result")
. ├── ATP6.fasta ├── ATP8.fasta ├── COX1.fasta ├── COX2.fasta ├── COX3.fasta ├── CYTB.fasta ├── ND1.fasta ├── ND2.fasta ├── ND3.fasta ├── ND4.fasta ├── ND4L.fasta ├── ND5.fasta └── ND6.fasta
megacc -a temp/mao/clustal_align_nucleotide.mao -d [输入文件] -f Fasta -o [输出文件夹]
.mao文件的的生成见: https://www.megasoftware.net/web_help_10/index.htm#t=Quick_Start_Tutorial.htm
.mao
import os def alignFasta(input_seq,output_seq): # 如果不存在就创建文件夹 if not os.path.exists(input_seq): os.makedirs(input_seq) if not os.path.exists(output_seq): os.makedirs(output_seq) os.system("megacc -a temp/mao/clustal_align_nucleotide.mao -d {0} -f Fasta -o {1}".format(input_seq,output_seq)) def generate_align(): for file_name in os.listdir("temp/result"): print("比对:"+file_name+"...........") alignFasta("temp/result/{0}".format(file_name),"temp/align_output/") if __name__ == "__main__": generate_align()
. ├── ATP6-10469.fasta ├── ATP6-10469_summary.txt ├── ATP8-10415.fasta ├── ATP8-10415_summary.txt ├── COX1-10259.fasta ├── COX1-10259_summary.txt ├── COX2-10525.fasta ├── COX2-10525_summary.txt ├── COX3-10055.fasta ├── COX3-10055_summary.txt ├── CYTB-10197.fasta ├── CYTB-10197_summary.txt ├── ND1-10421.fasta ├── ND1-10421_summary.txt ├── ND2-10556.fasta ├── ND2-10556_summary.txt ├── ND3-10653.fasta ├── ND3-10653_summary.txt ├── ND4-10095.fasta ├── ND4-10095_summary.txt ├── ND4L-10418.fasta ├── ND4L-10418_summary.txt ├── ND5-10646.fasta ├── ND5-10646_summary.txt ├── ND6-10386.fasta └── ND6-10386_summary.txt
align_result.fasta