pipleline
最后发布时间 : 2023-08-06 21:24:54
浏览量 :
Metaphor:A workflow for streamlined assembly and binning of metagenomes
rule map_reads:
input:
catalogue_idx="output/mapping/{binning_group}/{binning_group}_{kind}_catalogue.mmi",
fastq1=get_map_reads_input_R1,
fastq2=get_map_reads_input_R2,
output:
# This one needs wildcard named 'sample' because of the input functions.
bam=temp("output/mapping/bam/{binning_group}/{sample}-to-{kind}.map.bam"),
params:
N=50,
preset="sr",
flags=3584,
split_prefix="output/mapping/bam/{binning_group}/{sample}-to-{kind}",
threads: get_threads_per_task_size("big")
resources:
mem_mb=get_mb_per_cores,
wildcard_constraints:
binning_group="cobinning"
if config["cobinning"]
else "|".join(binning_group_names),
sample="|".join(sample_IDs),
kind="contigs|genes",
log:
"output/logs/mapping/map_reads/{binning_group}-{sample}-to-{kind}.log",
benchmark:
"output/benchmarks/mapping/map_reads/{binning_group}-{sample}-to-{kind}.txt"
conda:
"../envs/samtools.yaml"
shell:
"""
{{ minimap2 -t {threads} \
-N {params.N} \
-ax {params.preset} \
--split-prefix {params.split_prefix}\
{input.catalogue_idx} \
{input.fastq1} \
{input.fastq2} ; }} 2>> {log} |
{{ samtools view \
-F {params.flags} \
-b --threads \
{threads} > {output.bam} ; }} 2>> {log}
"""
zyd
##############################################
# Metagenome
# 1.fastp
# 2.bowties2
# 3,metaphlan
# 4.cat
#i 5.humann
# 6.bowties2
# 7.grep
# 8.python
# 9.perl
# Author:zhangyudan
# Date: 2022.6.13
##############################################
#rule all:
# input:
# "plots/quals.svg"
(SAMPLES1,SAMPLES) = glob_wildcards("/usr/data2/zyd/111/{sample1}/{sample}.2.fastq.gz")
#configfile: "config.yaml"
#samples=config["samples"]
rule all:
input:
expand("{sample}/metaphlan3/{sample}.profiled_metagenome.txt", sample=SAMPLES),
expand("{sample}/fastqc/{sample}.clean.fastq.1_fastqc.html", sample=SAMPLES),
#expand("{sample}/humann/{sample}.clean_pathabundance.tsv", sample=SAMPLES),
expand("{sample}/abundance/{sample}.OUTPUT_DIR.gene.profile", sample=SAMPLES),
expand("{sample}/abundance/accumulation.pr", sample=SAMPLES)
rule fastp:
input:
rawdatafq1="/usr/data2/zyd/111/{sample}/{sample}.1.fastq.gz",
rawdatafq2="/usr/data2/zyd/111/{sample}/{sample}.2.fastq.gz"
#rawdatafq1=expand("/usr/data2/zyd/111/{sample}/{sample}.1.fastq.gz",sample=config["samples"]),
#rawdatafq2=expand("/usr/data2/zyd/111/{sample}/{sample}.2.fastq.gz",sample=config["samples"])
output:
fastpdata1="{sample}/fastp/{sample}.filter_R1.fastq.gz",
fastpdata2="{sample}/fastp/{sample}.filter_R2.fastq.gz",
fastpjson="{sample}/fastp/{sample}.fastp.json",
fastphtml="{sample}/fastp/{sample}.fastp.html"
shell:
"fastp -i {input.rawdatafq1} -I {input.rawdatafq2} -o {output.fastpdata1} -O {output.fastpdata2} -Q --thread=8 --length_required=50 --n_base_limit=2 --compression=6 -h {output.fastphtml} -j {output.fastpjson}"
rule bowties2_human38:
input:
fastpdata1="{sample}/fastp/{sample}.filter_R1.fastq.gz",
fastpdata2="{sample}/fastp/{sample}.filter_R2.fastq.gz"
output:
cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz",
cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz"
shell:
"bowtie2 -p 8 -x /home/zyd/metagenome/bowtie2-2.4.5-linux-x86_64/index/human/human38 -1 {input.fastpdata1} -2 {input.fastpdata2} --un-conc-gz {wildcards.sample}/rm_host/{wildcards.sample}.clean.fastq.gz -S /dev/null 2> {wildcards.sample}/rm_host/{wildcards.sample}.bowtie2.log"
rule fastqc:
input:
cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz",
cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz"
output:
"{sample}/fastqc/{sample}.clean.fastq.1_fastqc.html",
"{sample}/fastqc/{sample}.clean.fastq.2_fastqc.html"
shell:
"fastqc -o {wildcards.sample}/fastqc -t 8 {input.cleandata1} {input.cleandata2}"
rule metaphlan3:
input:
cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz",
cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz"
output:
bz2="{sample}/metaphlan3/{sample}.metagenome.bowtie2.bz2",
txt="{sample}/metaphlan3/{sample}.profiled_metagenome.txt"
shell:
"metaphlan {input.cleandata1},{input.cleandata2} --bowtie2out {output.bz2} --nproc 8 --input_type fastq -o {output.txt}"
rule cat:
input:
cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz",
cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz"
output:
cleandata="{sample}/{sample}.clean.fastq.gz"
shell:
"cat {input.cleandata1} {input.cleandata2} > {output.cleandata}"
rule humann:
input:
cleandata="{sample}/{sample}.clean.fastq.gz"
output:
"{sample}/humann/{sample}.clean_pathabundance.tsv",
shell:
"humann --threads 8 --input {input.cleandata} --output {wildcards.sample}/humann"
rule bowties2_twins:
input:
cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz",
cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz"
output:
DIRsam="{sample}/twins/{sample}.OUTPUT_DIR.sam",
DIRlog="{sample}/twins/{sample}.OUTPUT_DIR.log"
shell:
"bowtie2 -p 8 -x /home/zyd/metagenome/bowtie2-2.4.5-linux-x86_64/index/twins/11.4M_index --very-sensitive --no-unal --no-sq -1 {input.cleandata1} -2 {input.cleandata2} -S {output.DIRsam} 2> {output.DIRlog}"
rule uniquesam1:
input:
DIRsam="{sample}/twins/{sample}.OUTPUT_DIR.sam",
output:
uniquesam="{sample}/twins/{sample}.OUTPUT_DIR.pair.unique.sam"
shell:
"grep 'AS\:' {input.DIRsam} | grep -v 'XS\:' | grep 'YS\:' > {output.uniquesam}"
rule abundance:
input:
uniquesam="{sample}/twins/{sample}.OUTPUT_DIR.pair.unique.sam"
output:
abundance="{sample}/abundance/{sample}.OUTPUT_DIR.gene.profile"
shell:
"python From_reads_to_abundance.py twins_IGC.geneset.out.renew.fa.length {input.uniquesam} {output.abundance}"
rule perl:
input:
abundance="{sample}/abundance/{sample}.OUTPUT_DIR.gene.profile"
output:
"{sample}/abundance/accumulation.pr",
"{sample}/abundance/normalization.pr",
"{sample}/abundance/sample.abundance.total.rate"
shell:
"perl get_functional_profile.pl -k KO_geneid.txt -f {input.abundance} -o {wildcards.sample}/abundance"
(SAMPLES1,SAMPLES) = glob_wildcards("/data/mrna/kraken2/{sample1}/{sample}_R1.fastq.gz")
rule kraken2_bracken_pluspf:
conda: "virus"
input:
cleandata1="/data/mrna/kraken2/{sample}/{sample}_R1.fastq.gz",
cleandata2="/data/mrna/kraken2/{sample}/{sample}_R2.fastq.gz"
output:
kraken2_output="{sample}/kraken2_bracken_pluspf/{sample}.output",
report="{sample}/kraken2_bracken_pluspf/{sample}.report"
threads: 16
shell:
"kraken2 {input.cleandata1} {input.cleandata2} --db /db/pluspf/ --output {output.kraken2_output} --report {output.report} --use-names --report-zero-counts --threads {threads}"
rule kraken2_bracken_out_pluspf:
conda: "virus"
input:
report="{sample}/kraken2_bracken_pluspf/{sample}.report"
output:
bracken_species="{sample}/kraken2_bracken_pluspf/{sample}.species",
bracken_genus="{sample}/kraken2_bracken_pluspf/{sample}.genus",
bracken_family="{sample}/kraken2_bracken_pluspf/{sample}.family",
bracken_order="{sample}/kraken2_bracken_pluspf/{sample}.order",
bracken_class="{sample}/kraken2_bracken_pluspf/{sample}.class",
bracken_phylum="{sample}/kraken2_bracken_pluspf/{sample}.phylum",
bracken_domain="{sample}/kraken2_bracken_pluspf/{sample}.domain"
shell:
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l S -t 0 -o {output.bracken_species} && "
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l G -t 0 -o {output.bracken_genus} && "
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l F -t 0 -o {output.bracken_family} && "
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l O -t 0 -o {output.bracken_order} && "
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l C -t 0 -o {output.bracken_class} && "
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l P -t 0 -o {output.bracken_phylum} && "
"bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l D -t 0 -o {output.bracken_domain}"
./pluspf/
├── database100mers.kmer_distrib
├── database150mers.kmer_distrib
├── database200mers.kmer_distrib
├── database250mers.kmer_distrib
├── database300mers.kmer_distrib
├── database50mers.kmer_distrib
├── database75mers.kmer_distrib
├── hash.k2d
├── inspect.txt
├── k2_pluspf_20220908.tar.gz
├── opts.k2d
├── seqid2taxid.map
└── taxo.k2d