Skip to content

Commit cb45687

Browse files
authored
fixing syndna filtering (#16)
* fixing syndna filtering * +f * output -> out_dir * fix error * rm _insert_counts * missing new line * check sum * add awk to change filename to samplename * filename -> fn * ignore 1st line awk * bam * failures parsing * type -> merge-type * use completed * update resources * 14400 * 14400 * 60G -> 4G * add extra comments to syndna.sbatch
1 parent 2d60bfd commit cb45687

File tree

7 files changed

+133
-78
lines changed

7 files changed

+133
-78
lines changed

qp_pacbio/data/resources.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2):
6262
syndna:
6363
node_count: 1
6464
nprocs: 16
65-
wall_time_limit: 10:00:00
66-
mem_in_gb: 60
65+
wall_time_limit: 4:00:00
66+
mem_in_gb: 20
6767
max_tasks: 16
6868
finish:
6969
node_count: 1
70-
nprocs: 16
71-
wall_time_limit: 1-00:00:00
72-
mem_in_gb: 120
70+
nprocs: 1
71+
wall_time_limit: 4:00:00
72+
mem_in_gb: 4

qp_pacbio/data/templates/syndna.sbatch

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
#SBATCH -n {{nprocs}}
66
#SBATCH --time {{wall_time_limit}}
77
#SBATCH --mem {{mem_in_gb}}G
8-
#SBATCH -o {{output}}/minimap2/logs/%x-%A_%a.out
9-
#SBATCH -e {{output}}/minimap2/logs/%x-%A_%a.err
8+
#SBATCH -o {{output}}/syndna/logs/%x-%A_%a.out
9+
#SBATCH -e {{output}}/syndna/logs/%x-%A_%a.err
1010
#SBATCH --array {{array_params}}
1111

1212
source ~/.bashrc
1313
set -e
1414
{{conda_environment}}
1515
out_folder={{output}}/syndna
16-
mkdir -p
16+
mkdir -p ${out_folder}
1717
cd ${out_folder}
1818
db_folder=/scratch/qp-pacbio/minimap2/syndna/
1919

@@ -28,23 +28,49 @@ mkdir -p ${out_folder}/filtered/
2828
sn_folder=${out_folder}/bioms/${sample_name}
2929
mkdir -p ${sn_folder}
3030

31+
txt=${sn_folder}/${sample_name}.txt
32+
tsv=${txt/.txt/.tsv}
3133
coverm contig --single $filename --reference ${db_folder}/All_synDNA_inserts.fasta --mapper minimap2-hifi \
3234
--min-read-percent-identity 0.95 --min-read-aligned-percent 0.0 -m mean count --threads {{nprocs}} \
33-
--output-file ${sn_folder}/${sample_name}.txt
34-
cat ${sn_folder}/${sample_name}_insert_counts.txt | sed 's/Contig/\#OTU ID/' | \
35-
sed 's/ Read Count//' > ${sn_folder}/${sample_name}.tsv
36-
biom convert -i ${sn_folder}/${sample_name}.txt -o ${sn_folder}/${sample_name}.biom --to-hdf5
35+
--output-file ${txt}
36+
37+
awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}' ${txt} | \
38+
sed 's/Contig/\#OTU ID/' | sed 's/All_synDNA_inserts.fasta\///' | \
39+
sed 's/ Read Count//' | sed "s/${fn}/${sample_name}/" > ${tsv}
40+
41+
# if counts is zero mark it as missing and stop
42+
counts=`tail -n +2 ${tsv} | awk '{sum += $NF} END {print sum}'`
43+
if [[ "$counts" == "0" ]]; then
44+
echo ${sample_name} > {{output}}/failed_${SLURM_ARRAY_TASK_ID}.log
45+
exit 0
46+
fi
47+
48+
biom convert -i ${tsv} -o ${sn_folder}/syndna.biom --to-hdf5
3749

3850
# removing AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta not coverm
51+
# ---- original commands ----
52+
# minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
53+
# samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
54+
# seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq
55+
# ---- original commands ----
3956
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
4057
samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
4158
seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq
4259

4360
# removing GCF_000184185.1_ASM18418v1_genomic_chroso.fna use coverm
44-
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq
45-
samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.sam
46-
coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.sam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
47-
samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_no_inserts.bam
61+
# ---- original commands ----
62+
# minimap2 -x map-hifi -t 8 -a --MD --eqx -o reads.sam ecoli_genome.fna reads.fastq
63+
# samtools view -bS -@ 8 reads.fastq | samtools sort -@ 24 -O bam -o reads.sorted.bam
64+
# coverm filter --bam-files reads.sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads 8 -o reads_filtered.sorted.bam
65+
# samtools view -O SAM -o reads_filtered.sam ./reads_filtered.sorted.bam
66+
# awk '{print $1}' reads_filtered.sam > reads_filtered.txt
67+
# seqkit grep -v -f reads_filtered.txt reads.fastq > reads_no_ecoli.fastq
68+
# ---- original commands ----
69+
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid.fastq
70+
samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.bam
71+
coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
72+
samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_GCF_000184185.bam
4873
awk '{print $1}' ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam > ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt
49-
seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_GCF_000184185.fastq | gz > ${out_folder}/filtered/${fn}
50-
awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}'
74+
seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_no_plasmid.fastq | gzip > ${out_folder}/filtered/${fn}
75+
76+
touch {{output}}/completed_${SLURM_ARRAY_TASK_ID}.log

qp_pacbio/data/templates/syndna_finish.sbatch

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
#SBATCH -n {{nprocs}}
66
#SBATCH --time {{wall_time_limit}}
77
#SBATCH --mem {{mem_in_gb}}G
8-
#SBATCH -o {{output}}/merge/logs/%x-%A_%a.out
9-
#SBATCH -e {{output}}/merge/logs/%x-%A_%a.err
8+
#SBATCH -o {{output}}/finish/logs/%x-%A_%a.out
9+
#SBATCH -e {{output}}/finish/logs/%x-%A_%a.err
1010

1111
source ~/.bashrc
1212
set -e
1313
{{conda_environment}}
1414
cd {{output}}/
1515

16-
biom_merge_pacbio --base {{output}} --type syndna
16+
biom_merge_pacbio --base {{output}}/syndna --merge-type syndna
1717

1818
# find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
1919
# micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz

qp_pacbio/data/templates/woltka_minimap2_merge.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ for f in `ls bioms/*/per-gene.biom`; do
3939
done | parallel --halt now,fail=1 -j {{nprocs}}
4040
wait
4141

42-
biom_merge_pacbio --base {{output}} --type woltka
42+
biom_merge_pacbio --base {{output}} --merge-type woltka
4343

4444
find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
4545
micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz

qp_pacbio/qp_pacbio.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,9 @@ def generate_sample_list(qclient, artifact_id, out_dir):
229229
with open(out_fp, "w", encoding="utf-8") as f:
230230
f.write("\n".join(lines))
231231

232+
preparation_information = join(out_dir, "prep_info.tsv")
233+
prep.set_index("sample_name").to_csv(preparation_information, sep="\t")
234+
232235
return len(lines)
233236

234237

@@ -454,11 +457,26 @@ def syndna_processing(qclient, job_id, parameters, out_dir):
454457

455458
errors = []
456459
ainfo = []
457-
fp_biom = f"{out_dir}/syndna.biom"
460+
461+
failures = glob(f"{out_dir}/failed_*.log")
462+
if failures:
463+
errors.append("Samples failed: ")
464+
for f in failures:
465+
with open(f, "r") as fp:
466+
errors.append(fp.read())
467+
return False, ainfo, "\n".join(errors)
468+
469+
completed = len(glob(f"{out_dir}/completed_*.log"))
470+
with open(f"{out_dir}/sample_list.txt") as fp:
471+
samples = len(fp.readlines())
472+
473+
if completed != samples:
474+
errors.append(f"There are {samples - completed} missing samples.")
475+
476+
fp_biom = f"{out_dir}/syndna/syndna.biom"
458477
# do we need to stor alignments?
459478
# fp_alng = f'{out_dir}/sams/final/alignment.tar'
460-
461-
if exists(fp_biom): # and exists(fp_alng):
479+
if not errors and exists(fp_biom): # and exists(fp_alng):
462480
# if we got to this point a preparation file should exist in
463481
# the output folder
464482
prep = pd.read_csv(f"{out_dir}/prep_info.tsv", index_col=None, sep="\t")
@@ -492,7 +510,7 @@ def syndna_processing(qclient, job_id, parameters, out_dir):
492510
"contact [email protected] for more information"
493511
)
494512

495-
fp_seqs = f"{out_dir}/filtered"
513+
fp_seqs = f"{out_dir}/syndna/filtered"
496514
reads = []
497515
for f in glob(f"{fp_seqs}/*.fastq.gz"):
498516
reads.append((f, "raw_forward_seqs"))
@@ -558,7 +576,7 @@ def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
558576
"mem_in_gb": step_resources["mem_in_gb"],
559577
"array_params": f"1-{njobs}%{step_resources['max_tasks']}",
560578
}
561-
minimap2_script = _write_slurm(f"{out_dir}/minimap2", m2t, **params)
579+
minimap2_script = _write_slurm(f"{out_dir}/syndna", m2t, **params)
562580

563581
m2mt = JGT("syndna_finish.sbatch")
564582
step_resources = resources["finish"]
@@ -570,6 +588,6 @@ def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
570588
"mem_in_gb": step_resources["mem_in_gb"],
571589
"url": url,
572590
}
573-
minimap2_merge_script = _write_slurm(f"{out_dir}/merge", m2mt, **params)
591+
minimap2_finish_script = _write_slurm(f"{out_dir}/finish", m2mt, **params)
574592

575-
return minimap2_script, minimap2_merge_script
593+
return minimap2_script, minimap2_finish_script

qp_pacbio/scripts.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
#
77
# The full license is in the file LICENSE, distributed with this software.
88
# -----------------------------------------------------------------------------
9-
import enum
109
from glob import glob
1110
from os import makedirs
1211
from os.path import join
@@ -21,6 +20,7 @@
2120
PACBIO_PROCESSING_STEPS,
2221
generate_minimap2_processing,
2322
generate_sample_list,
23+
generate_syndna_processing,
2424
pacbio_generate_templates,
2525
)
2626
from qp_pacbio.util import client_connect
@@ -57,18 +57,23 @@ def execute(url, job_id, output_dir):
5757
command = job_info["command"]
5858
artifact_id = parameters["artifact"]
5959

60-
if command == "Woltka v0.1.7, minimap2":
61-
main_fp, merge_fp = generate_minimap2_processing(
60+
regular_commands = {
61+
"Woltka v0.1.7, minimap2": generate_minimap2_processing,
62+
"Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2)": generate_syndna_processing,
63+
}
64+
65+
if command in regular_commands.keys():
66+
first_fp, second_fp = regular_commands[command](
6267
qclient, job_id, output_dir, parameters, url
6368
)
6469

6570
# Submitting jobs and returning id
66-
main_job = run(["sbatch", main_fp], stdout=PIPE)
67-
main_job_id = main_job.stdout.decode("utf8").split()[-1]
68-
cmd = ["sbatch", "-d", f"afterok:{main_job_id}", merge_fp]
69-
merge_job = run(cmd, stdout=PIPE)
70-
merge_job_id = merge_job.stdout.decode("utf8").split()[-1]
71-
print(f"{main_job_id}, {merge_job_id}")
71+
first_job = run(["sbatch", first_fp], stdout=PIPE)
72+
first_job_id = first_job.stdout.decode("utf8").split()[-1]
73+
cmd = ["sbatch", "-d", f"afterok:{first_job_id}", second_fp]
74+
second_job = run(cmd, stdout=PIPE)
75+
second_job_id = second_job.stdout.decode("utf8").split()[-1]
76+
print(f"{first_job_id}, {second_job_id}")
7277
qclient.update_job_step(job_id, "Step 2 of 4: Aligning sequences")
7378
elif command == "PacBio processing":
7479
frp = join(output_dir, "results")
@@ -151,22 +156,20 @@ def _biom_merge(tables):
151156
return full
152157

153158

154-
class BIOMMergeOptions(enum.Enum):
155-
SYNDNA = enum.auto()
156-
WOLTKA = enum.auto()
157-
158-
159159
@click.command()
160160
@click.option("--base", type=click.Path(exists=True), required=True)
161-
@click.option("--type", type=click.Choice(BIOMMergeOptions, case_sensitive=False))
162-
def biom_merge(base, type: BIOMMergeOptions):
161+
@click.option(
162+
"--merge-type", type=click.Choice(["syndna", "woltka"], case_sensitive=False)
163+
)
164+
def biom_merge(base, merge_type):
163165
"""Merges all PacBio biom tables"""
164-
if type == BIOMMergeOptions.SYNDNA:
166+
merge_type = merge_type.lower()
167+
if merge_type == "syndna":
165168
ranks = ["syndna"]
166-
elif type == BIOMMergeOptions.WOLTKA:
169+
elif merge_type == "woltka":
167170
ranks = ["none", "per-gene", "ko", "ec", "pathway"]
168171
else:
169-
raise ValueError(f"Type '{type}' not supported")
172+
raise ValueError(f"Type '{merge_type}' not supported")
170173

171174
for rank in ranks:
172175
rank = rank + ".biom"

0 commit comments

Comments
 (0)