diff --git a/workflow/rules/read_clipping.smk b/workflow/rules/read_clipping.smk index 8d93acc74..e7fd00cfd 100644 --- a/workflow/rules/read_clipping.smk +++ b/workflow/rules/read_clipping.smk @@ -20,63 +20,48 @@ rule samtools_sort: "v1.15.1/bio/samtools/sort" -rule bed_to_bedpe: +rule bed_to_tsv: input: check_bed_for_URL(config["preprocessing"]["amplicon-primers"]), output: - "resources/primer.bedpe", + "resources/primer.tsv", log: "logs/bed-to-bedpe.log", conda: "../envs/python.yaml" script: - "../scripts/bed-to-bedpe.py" + "../scripts/bed-to-tsv.py" -rule bamclipper: +rule trim_primers_fgbio: input: bam="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam", bai="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam.bai", - bedpe="resources/primer.bedpe", - output: - temp( - "results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam" + ref="resources/genomes/{reference}.fasta".format( + reference=config["preprocessing"]["amplicon-reference"] ), - params: - output_dir=get_output_dir, - cwd=lambda w: os.getcwd(), - bed_path=lambda w, input: os.path.join(os.getcwd(), input.bedpe), - bam=lambda w, input: os.path.basename(input.bam), + primers="resources/primer.tsv", + output: + "results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam", log: - "logs/{date}/bamclipper/{read_type}/{sample}.log", + "logs/{date}/fgbio_primer_clipping/{read_type}/{sample}.log", conda: - "../envs/bamclipper.yaml" - threads: 6 + "../envs/fgbio.yaml" shell: - "(cp {input.bam} {params.output_dir} &&" - " cp {input.bai} {params.output_dir} &&" - " cd {params.output_dir} &&" - " bamclipper.sh -b {params.bam} -p {params.bed_path} -n {threads} -u 5 -d 5) " - " > {params.cwd}/{log} 2>&1" + "fgbio TrimPrimers -i {input.bam} -p {input.primers} -o {output} -H true -r {input.ref} > {log} 2>&1" -rule fgbio: +rule filter_bam_fgbio: input: - bam="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam", - bai="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam.bai", - ref="resources/genomes/{reference}.fasta".format( - reference=config["preprocessing"]["amplicon-reference"] - ), + bam="results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam", output: - temp( - "results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam" - ), + "results/{date}/read-clipping/hc_filtered/{read_type}/{sample}/{sample}.bam", log: - "logs/{date}/fgbio/{read_type}/{sample}.log", + "logs/{date}/fgbio_filter_bam/{read_type}/{sample}.log", conda: "../envs/fgbio.yaml" shell: - "fgbio --sam-validation-stringency=LENIENT ClipBam -i {input.bam} -o {output} -H true -r {input.ref} > {log} 2>&1" + "fgbio FilterBam -i {input.bam} -o {output} --min-insert-size 100 --remove-single-end-mappings > {log} 2>&1" rule samtools_fastq_pe: @@ -131,7 +116,7 @@ rule plot_primer_clipping: ), params: samples=lambda wildcards: get_samples_for_date(wildcards.date), - bedpe="resources/primer.bedpe", + bedpe="resources/primer.tsv", log: "logs/{date}/plot-primer-clipping.log", conda: diff --git a/workflow/scripts/bed-to-bedpe.py b/workflow/scripts/bed-to-tsv.py similarity index 85% rename from workflow/scripts/bed-to-bedpe.py rename to workflow/scripts/bed-to-tsv.py index 4a3b343df..098a18cb7 100644 --- a/workflow/scripts/bed-to-bedpe.py +++ b/workflow/scripts/bed-to-tsv.py @@ -23,10 +23,9 @@ df_sense["chrom"], df_sense["start"], df_sense["end"], - df_antisense["chrom"], df_antisense["start"], df_antisense["end"], ] -headers = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"] +headers = ["chrom", "left_start", "left_end", "right_start", "right_end"] df_bedpe = pd.concat(data, axis=1, keys=headers) -df_bedpe.to_csv(snakemake.output[0], header=None, index=None, sep="\t", mode="a") +df_bedpe.to_csv(snakemake.output[0], index=None, sep="\t", mode="a") diff --git a/workflow/scripts/plot-primer-clipping.py b/workflow/scripts/plot-primer-clipping.py index c6dabb8e3..a99b55a8f 100644 --- a/workflow/scripts/plot-primer-clipping.py +++ b/workflow/scripts/plot-primer-clipping.py @@ -13,8 +13,10 @@ from intervaltree import IntervalTree # read primer bedpe to df -PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) -PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) +PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=0) +print(PRIMER) +PRIMER.drop(PRIMER.columns[[0]], axis=1, inplace=True) +print(PRIMER) PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] # convert df to interval trees @@ -116,7 +118,7 @@ def count_intervals(file): "uncut primer within", "cut primer exact", "cut primer within", - "no mathing win", + "no matching win", ], } )