Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 18 additions & 33 deletions workflow/rules/read_clipping.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,63 +20,48 @@ rule samtools_sort:
"v1.15.1/bio/samtools/sort"


rule bed_to_bedpe:
rule bed_to_tsv:
input:
check_bed_for_URL(config["preprocessing"]["amplicon-primers"]),
output:
"resources/primer.bedpe",
"resources/primer.tsv",
log:
"logs/bed-to-bedpe.log",
conda:
"../envs/python.yaml"
script:
"../scripts/bed-to-bedpe.py"
"../scripts/bed-to-tsv.py"


rule bamclipper:
rule trim_primers_fgbio:
input:
bam="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam",
bai="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam.bai",
bedpe="resources/primer.bedpe",
output:
temp(
"results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam"
ref="resources/genomes/{reference}.fasta".format(
reference=config["preprocessing"]["amplicon-reference"]
),
params:
output_dir=get_output_dir,
cwd=lambda w: os.getcwd(),
bed_path=lambda w, input: os.path.join(os.getcwd(), input.bedpe),
bam=lambda w, input: os.path.basename(input.bam),
primers="resources/primer.tsv",
output:
"results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam",
log:
"logs/{date}/bamclipper/{read_type}/{sample}.log",
"logs/{date}/fgbio_primer_clipping/{read_type}/{sample}.log",
conda:
"../envs/bamclipper.yaml"
threads: 6
"../envs/fgbio.yaml"
shell:
"(cp {input.bam} {params.output_dir} &&"
" cp {input.bai} {params.output_dir} &&"
" cd {params.output_dir} &&"
" bamclipper.sh -b {params.bam} -p {params.bed_path} -n {threads} -u 5 -d 5) "
" > {params.cwd}/{log} 2>&1"
"fgbio TrimPrimers -i {input.bam} -p {input.primers} -o {output} -H true -r {input.ref} > {log} 2>&1"


rule fgbio:
rule filter_bam_fgbio:
input:
bam="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam",
bai="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam.bai",
ref="resources/genomes/{reference}.fasta".format(
reference=config["preprocessing"]["amplicon-reference"]
),
bam="results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam",
output:
temp(
"results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam"
),
"results/{date}/read-clipping/hc_filtered/{read_type}/{sample}/{sample}.bam",
log:
"logs/{date}/fgbio/{read_type}/{sample}.log",
"logs/{date}/fgbio_filter_bam/{read_type}/{sample}.log",
conda:
"../envs/fgbio.yaml"
shell:
"fgbio --sam-validation-stringency=LENIENT ClipBam -i {input.bam} -o {output} -H true -r {input.ref} > {log} 2>&1"
"fgbio FilterBam -i {input.bam} -o {output} --min-insert-size 100 --remove-single-end-mappings > {log} 2>&1"


rule samtools_fastq_pe:
Expand Down Expand Up @@ -131,7 +116,7 @@ rule plot_primer_clipping:
),
params:
samples=lambda wildcards: get_samples_for_date(wildcards.date),
bedpe="resources/primer.bedpe",
bedpe="resources/primer.tsv",
log:
"logs/{date}/plot-primer-clipping.log",
conda:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
df_sense["chrom"],
df_sense["start"],
df_sense["end"],
df_antisense["chrom"],
df_antisense["start"],
df_antisense["end"],
]
headers = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
headers = ["chrom", "left_start", "left_end", "right_start", "right_end"]
df_bedpe = pd.concat(data, axis=1, keys=headers)
df_bedpe.to_csv(snakemake.output[0], header=None, index=None, sep="\t", mode="a")
df_bedpe.to_csv(snakemake.output[0], index=None, sep="\t", mode="a")
8 changes: 5 additions & 3 deletions workflow/scripts/plot-primer-clipping.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
from intervaltree import IntervalTree

# read primer bedpe to df
PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None)
PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True)
PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=0)
print(PRIMER)
PRIMER.drop(PRIMER.columns[[0]], axis=1, inplace=True)
print(PRIMER)
PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"]

# convert df to interval trees
Expand Down Expand Up @@ -116,7 +118,7 @@ def count_intervals(file):
"uncut primer within",
"cut primer exact",
"cut primer within",
"no mathing win",
"no matching win",
],
}
)
Expand Down