Metagenomic
rule map_reads: input: catalogue_idx="output/mapping/{binning_group}/{binning_group}_{kind}_catalogue.mmi", fastq1=get_map_reads_input_R1, fastq2=get_map_reads_input_R2, output: # This one needs wildcard named 'sample' because of the input functions. bam=temp("output/mapping/bam/{binning_group}/{sample}-to-{kind}.map.bam"), params: N=50, preset="sr", flags=3584, split_prefix="output/mapping/bam/{binning_group}/{sample}-to-{kind}", threads: get_threads_per_task_size("big") resources: mem_mb=get_mb_per_cores, wildcard_constraints: binning_group="cobinning" if config["cobinning"] else "|".join(binning_group_names), sample="|".join(sample_IDs), kind="contigs|genes", log: "output/logs/mapping/map_reads/{binning_group}-{sample}-to-{kind}.log", benchmark: "output/benchmarks/mapping/map_reads/{binning_group}-{sample}-to-{kind}.txt" conda: "../envs/samtools.yaml" shell: """ {{ minimap2 -t {threads} \ -N {params.N} \ -ax {params.preset} \ --split-prefix {params.split_prefix}\ {input.catalogue_idx} \ {input.fastq1} \ {input.fastq2} ; }} 2>> {log} | {{ samtools view \ -F {params.flags} \ -b --threads \ {threads} > {output.bam} ; }} 2>> {log} """
############################################## # Metagenome # 1.fastp # 2.bowties2 # 3,metaphlan # 4.cat #i 5.humann # 6.bowties2 # 7.grep # 8.python # 9.perl # Author:zhangyudan # Date: 2022.6.13 ############################################## #rule all: # input: # "plots/quals.svg" (SAMPLES1,SAMPLES) = glob_wildcards("/usr/data2/zyd/111/{sample1}/{sample}.2.fastq.gz") #configfile: "config.yaml" #samples=config["samples"] rule all: input: expand("{sample}/metaphlan3/{sample}.profiled_metagenome.txt", sample=SAMPLES), expand("{sample}/fastqc/{sample}.clean.fastq.1_fastqc.html", sample=SAMPLES), #expand("{sample}/humann/{sample}.clean_pathabundance.tsv", sample=SAMPLES), expand("{sample}/abundance/{sample}.OUTPUT_DIR.gene.profile", sample=SAMPLES), expand("{sample}/abundance/accumulation.pr", sample=SAMPLES) rule fastp: input: rawdatafq1="/usr/data2/zyd/111/{sample}/{sample}.1.fastq.gz", rawdatafq2="/usr/data2/zyd/111/{sample}/{sample}.2.fastq.gz" #rawdatafq1=expand("/usr/data2/zyd/111/{sample}/{sample}.1.fastq.gz",sample=config["samples"]), #rawdatafq2=expand("/usr/data2/zyd/111/{sample}/{sample}.2.fastq.gz",sample=config["samples"]) output: fastpdata1="{sample}/fastp/{sample}.filter_R1.fastq.gz", fastpdata2="{sample}/fastp/{sample}.filter_R2.fastq.gz", fastpjson="{sample}/fastp/{sample}.fastp.json", fastphtml="{sample}/fastp/{sample}.fastp.html" shell: "fastp -i {input.rawdatafq1} -I {input.rawdatafq2} -o {output.fastpdata1} -O {output.fastpdata2} -Q --thread=8 --length_required=50 --n_base_limit=2 --compression=6 -h {output.fastphtml} -j {output.fastpjson}" rule bowties2_human38: input: fastpdata1="{sample}/fastp/{sample}.filter_R1.fastq.gz", fastpdata2="{sample}/fastp/{sample}.filter_R2.fastq.gz" output: cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz", cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz" shell: "bowtie2 -p 8 -x /home/zyd/metagenome/bowtie2-2.4.5-linux-x86_64/index/human/human38 -1 {input.fastpdata1} -2 {input.fastpdata2} --un-conc-gz {wildcards.sample}/rm_host/{wildcards.sample}.clean.fastq.gz -S /dev/null 2> {wildcards.sample}/rm_host/{wildcards.sample}.bowtie2.log" rule fastqc: input: cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz", cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz" output: "{sample}/fastqc/{sample}.clean.fastq.1_fastqc.html", "{sample}/fastqc/{sample}.clean.fastq.2_fastqc.html" shell: "fastqc -o {wildcards.sample}/fastqc -t 8 {input.cleandata1} {input.cleandata2}" rule metaphlan3: input: cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz", cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz" output: bz2="{sample}/metaphlan3/{sample}.metagenome.bowtie2.bz2", txt="{sample}/metaphlan3/{sample}.profiled_metagenome.txt" shell: "metaphlan {input.cleandata1},{input.cleandata2} --bowtie2out {output.bz2} --nproc 8 --input_type fastq -o {output.txt}" rule cat: input: cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz", cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz" output: cleandata="{sample}/{sample}.clean.fastq.gz" shell: "cat {input.cleandata1} {input.cleandata2} > {output.cleandata}" rule humann: input: cleandata="{sample}/{sample}.clean.fastq.gz" output: "{sample}/humann/{sample}.clean_pathabundance.tsv", shell: "humann --threads 8 --input {input.cleandata} --output {wildcards.sample}/humann" rule bowties2_twins: input: cleandata1="{sample}/rm_host/{sample}.clean.fastq.1.gz", cleandata2="{sample}/rm_host/{sample}.clean.fastq.2.gz" output: DIRsam="{sample}/twins/{sample}.OUTPUT_DIR.sam", DIRlog="{sample}/twins/{sample}.OUTPUT_DIR.log" shell: "bowtie2 -p 8 -x /home/zyd/metagenome/bowtie2-2.4.5-linux-x86_64/index/twins/11.4M_index --very-sensitive --no-unal --no-sq -1 {input.cleandata1} -2 {input.cleandata2} -S {output.DIRsam} 2> {output.DIRlog}" rule uniquesam1: input: DIRsam="{sample}/twins/{sample}.OUTPUT_DIR.sam", output: uniquesam="{sample}/twins/{sample}.OUTPUT_DIR.pair.unique.sam" shell: "grep 'AS\:' {input.DIRsam} | grep -v 'XS\:' | grep 'YS\:' > {output.uniquesam}" rule abundance: input: uniquesam="{sample}/twins/{sample}.OUTPUT_DIR.pair.unique.sam" output: abundance="{sample}/abundance/{sample}.OUTPUT_DIR.gene.profile" shell: "python From_reads_to_abundance.py twins_IGC.geneset.out.renew.fa.length {input.uniquesam} {output.abundance}" rule perl: input: abundance="{sample}/abundance/{sample}.OUTPUT_DIR.gene.profile" output: "{sample}/abundance/accumulation.pr", "{sample}/abundance/normalization.pr", "{sample}/abundance/sample.abundance.total.rate" shell: "perl get_functional_profile.pl -k KO_geneid.txt -f {input.abundance} -o {wildcards.sample}/abundance"
(SAMPLES1,SAMPLES) = glob_wildcards("/data/mrna/kraken2/{sample1}/{sample}_R1.fastq.gz") rule kraken2_bracken_pluspf: conda: "virus" input: cleandata1="/data/mrna/kraken2/{sample}/{sample}_R1.fastq.gz", cleandata2="/data/mrna/kraken2/{sample}/{sample}_R2.fastq.gz" output: kraken2_output="{sample}/kraken2_bracken_pluspf/{sample}.output", report="{sample}/kraken2_bracken_pluspf/{sample}.report" threads: 16 shell: "kraken2 {input.cleandata1} {input.cleandata2} --db /db/pluspf/ --output {output.kraken2_output} --report {output.report} --use-names --report-zero-counts --threads {threads}" rule kraken2_bracken_out_pluspf: conda: "virus" input: report="{sample}/kraken2_bracken_pluspf/{sample}.report" output: bracken_species="{sample}/kraken2_bracken_pluspf/{sample}.species", bracken_genus="{sample}/kraken2_bracken_pluspf/{sample}.genus", bracken_family="{sample}/kraken2_bracken_pluspf/{sample}.family", bracken_order="{sample}/kraken2_bracken_pluspf/{sample}.order", bracken_class="{sample}/kraken2_bracken_pluspf/{sample}.class", bracken_phylum="{sample}/kraken2_bracken_pluspf/{sample}.phylum", bracken_domain="{sample}/kraken2_bracken_pluspf/{sample}.domain" shell: "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l S -t 0 -o {output.bracken_species} && " "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l G -t 0 -o {output.bracken_genus} && " "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l F -t 0 -o {output.bracken_family} && " "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l O -t 0 -o {output.bracken_order} && " "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l C -t 0 -o {output.bracken_class} && " "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l P -t 0 -o {output.bracken_phylum} && " "bracken -d /ssd1/zyd/virus/db/kraken2/20220908/pluspf/ -i {input.report} -r 100 -l D -t 0 -o {output.bracken_domain}"
./pluspf/ ├── database100mers.kmer_distrib ├── database150mers.kmer_distrib ├── database200mers.kmer_distrib ├── database250mers.kmer_distrib ├── database300mers.kmer_distrib ├── database50mers.kmer_distrib ├── database75mers.kmer_distrib ├── hash.k2d ├── inspect.txt ├── k2_pluspf_20220908.tar.gz ├── opts.k2d ├── seqid2taxid.map └── taxo.k2d