fixing syndna filtering (#16)

antgonza · web-flow · commit cb456877cd3a · 2025-12-02T14:44:36.000-08:00
* fixing syndna filtering

* +f

* output -&gt; out_dir

* fix error

* rm _insert_counts

* missing new line

* check sum

* add awk to change filename to samplename

* filename -&gt; fn

* ignore 1st line awk

* bam

* failures parsing

* type -&gt; merge-type

* use completed

* update resources

* 14400

* 14400

* 60G -&gt; 4G

* add extra comments to syndna.sbatch
diff --git a/qp_pacbio/data/resources.yaml b/qp_pacbio/data/resources.yaml
@@ -62,11 +62,11 @@ Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2):
   syndna:
     node_count: 1
     nprocs: 16
-    wall_time_limit: 10:00:00
-    mem_in_gb: 60
+    wall_time_limit: 4:00:00
+    mem_in_gb: 20
     max_tasks: 16
   finish:
     node_count: 1
-    nprocs: 16
-    wall_time_limit: 1-00:00:00
-    mem_in_gb: 120
+    nprocs: 1
+    wall_time_limit: 4:00:00
+    mem_in_gb: 4
diff --git a/qp_pacbio/data/templates/syndna.sbatch b/qp_pacbio/data/templates/syndna.sbatch
@@ -5,15 +5,15 @@
 #SBATCH -n {{nprocs}}
 #SBATCH --time {{wall_time_limit}}
 #SBATCH --mem {{mem_in_gb}}G
-#SBATCH -o {{output}}/minimap2/logs/%x-%A_%a.out
-#SBATCH -e {{output}}/minimap2/logs/%x-%A_%a.err
+#SBATCH -o {{output}}/syndna/logs/%x-%A_%a.out
+#SBATCH -e {{output}}/syndna/logs/%x-%A_%a.err
 #SBATCH --array {{array_params}}
 
 source ~/.bashrc
 set -e
 {{conda_environment}}
 out_folder={{output}}/syndna
-mkdir -p
+mkdir -p ${out_folder}
 cd ${out_folder}
 db_folder=/scratch/qp-pacbio/minimap2/syndna/
 
@@ -28,23 +28,49 @@ mkdir -p ${out_folder}/filtered/
 sn_folder=${out_folder}/bioms/${sample_name}
 mkdir -p ${sn_folder}
 
+txt=${sn_folder}/${sample_name}.txt
+tsv=${txt/.txt/.tsv}
 coverm contig --single $filename --reference ${db_folder}/All_synDNA_inserts.fasta --mapper minimap2-hifi \
     --min-read-percent-identity 0.95 --min-read-aligned-percent 0.0 -m mean count --threads {{nprocs}} \
-    --output-file ${sn_folder}/${sample_name}.txt
-cat ${sn_folder}/${sample_name}_insert_counts.txt | sed 's/Contig/\#OTU ID/' | \
-    sed 's/ Read Count//' > ${sn_folder}/${sample_name}.tsv
-biom convert -i ${sn_folder}/${sample_name}.txt -o ${sn_folder}/${sample_name}.biom --to-hdf5
+    --output-file ${txt}
+
+awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}' ${txt} | \
+    sed 's/Contig/\#OTU ID/' | sed  's/All_synDNA_inserts.fasta\///' | \
+    sed  's/ Read Count//' | sed "s/${fn}/${sample_name}/" > ${tsv}
+
+# if counts is zero mark it as missing and stop
+counts=`tail -n +2 ${tsv} | awk '{sum += $NF} END {print sum}'`
+if [[ "$counts" == "0" ]]; then
+    echo ${sample_name} > {{output}}/failed_${SLURM_ARRAY_TASK_ID}.log
+    exit 0
+fi
+
+biom convert -i ${tsv} -o ${sn_folder}/syndna.biom --to-hdf5
 
 # removing AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta not coverm
+# ---- original commands ----
+# minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
+# samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
+# seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq
+# ---- original commands ----
 minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
 samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
 seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq
 
 # removing GCF_000184185.1_ASM18418v1_genomic_chroso.fna use coverm
-minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq
-samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.sam
-coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.sam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
-samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_no_inserts.bam
+# ---- original commands ----
+# minimap2 -x map-hifi -t 8 -a --MD --eqx -o reads.sam ecoli_genome.fna reads.fastq
+# samtools view -bS -@ 8 reads.fastq | samtools sort -@ 24 -O bam -o reads.sorted.bam
+# coverm filter --bam-files reads.sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads 8 -o reads_filtered.sorted.bam
+# samtools view -O SAM -o reads_filtered.sam ./reads_filtered.sorted.bam
+# awk '{print $1}' reads_filtered.sam > reads_filtered.txt
+# seqkit grep -v -f reads_filtered.txt reads.fastq > reads_no_ecoli.fastq
+# ---- original commands ----
+minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid.fastq
+samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.bam
+coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
+samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_GCF_000184185.bam
 awk '{print $1}' ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam > ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt
-seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_GCF_000184185.fastq | gz > ${out_folder}/filtered/${fn}
-awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}'
+seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_no_plasmid.fastq | gzip > ${out_folder}/filtered/${fn}
+
+touch {{output}}/completed_${SLURM_ARRAY_TASK_ID}.log
diff --git a/qp_pacbio/data/templates/syndna_finish.sbatch b/qp_pacbio/data/templates/syndna_finish.sbatch
@@ -5,15 +5,15 @@
 #SBATCH -n {{nprocs}}
 #SBATCH --time {{wall_time_limit}}
 #SBATCH --mem {{mem_in_gb}}G
-#SBATCH -o {{output}}/merge/logs/%x-%A_%a.out
-#SBATCH -e {{output}}/merge/logs/%x-%A_%a.err
+#SBATCH -o {{output}}/finish/logs/%x-%A_%a.out
+#SBATCH -e {{output}}/finish/logs/%x-%A_%a.err
 
 source ~/.bashrc
 set -e
 {{conda_environment}}
 cd {{output}}/
 
-biom_merge_pacbio --base {{output}} --type syndna
+biom_merge_pacbio --base {{output}}/syndna --merge-type syndna
 
 # find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
 # micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz
diff --git a/qp_pacbio/data/templates/woltka_minimap2_merge.sbatch b/qp_pacbio/data/templates/woltka_minimap2_merge.sbatch
@@ -39,7 +39,7 @@ for f in `ls bioms/*/per-gene.biom`; do
 done | parallel --halt now,fail=1 -j {{nprocs}}
 wait
 
-biom_merge_pacbio --base {{output}} --type woltka
+biom_merge_pacbio --base {{output}} --merge-type woltka
 
 find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
 micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz
diff --git a/qp_pacbio/qp_pacbio.py b/qp_pacbio/qp_pacbio.py
@@ -229,6 +229,9 @@ def generate_sample_list(qclient, artifact_id, out_dir):
     with open(out_fp, "w", encoding="utf-8") as f:
         f.write("\n".join(lines))
 
+    preparation_information = join(out_dir, "prep_info.tsv")
+    prep.set_index("sample_name").to_csv(preparation_information, sep="\t")
+
     return len(lines)
 
 
@@ -454,11 +457,26 @@ def syndna_processing(qclient, job_id, parameters, out_dir):
 
     errors = []
     ainfo = []
-    fp_biom = f"{out_dir}/syndna.biom"
+
+    failures = glob(f"{out_dir}/failed_*.log")
+    if failures:
+        errors.append("Samples failed: ")
+        for f in failures:
+            with open(f, "r") as fp:
+                errors.append(fp.read())
+        return False, ainfo, "\n".join(errors)
+
+    completed = len(glob(f"{out_dir}/completed_*.log"))
+    with open(f"{out_dir}/sample_list.txt") as fp:
+        samples = len(fp.readlines())
+
+    if completed != samples:
+        errors.append(f"There are {samples - completed} missing samples.")
+
+    fp_biom = f"{out_dir}/syndna/syndna.biom"
     # do we need to stor alignments?
     # fp_alng = f'{out_dir}/sams/final/alignment.tar'
-
-    if exists(fp_biom):  # and exists(fp_alng):
+    if not errors and exists(fp_biom):  # and exists(fp_alng):
         # if we got to this point a preparation file should exist in
         # the output folder
         prep = pd.read_csv(f"{out_dir}/prep_info.tsv", index_col=None, sep="\t")
@@ -492,7 +510,7 @@ def syndna_processing(qclient, job_id, parameters, out_dir):
             "contact qiita.help@gmail.com for more information"
         )
 
-    fp_seqs = f"{out_dir}/filtered"
+    fp_seqs = f"{out_dir}/syndna/filtered"
     reads = []
     for f in glob(f"{fp_seqs}/*.fastq.gz"):
         reads.append((f, "raw_forward_seqs"))
@@ -558,7 +576,7 @@ def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
         "mem_in_gb": step_resources["mem_in_gb"],
         "array_params": f"1-{njobs}%{step_resources['max_tasks']}",
     }
-    minimap2_script = _write_slurm(f"{out_dir}/minimap2", m2t, **params)
+    minimap2_script = _write_slurm(f"{out_dir}/syndna", m2t, **params)
 
     m2mt = JGT("syndna_finish.sbatch")
     step_resources = resources["finish"]
@@ -570,6 +588,6 @@ def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
         "mem_in_gb": step_resources["mem_in_gb"],
         "url": url,
     }
-    minimap2_merge_script = _write_slurm(f"{out_dir}/merge", m2mt, **params)
+    minimap2_finish_script = _write_slurm(f"{out_dir}/finish", m2mt, **params)
 
-    return minimap2_script, minimap2_merge_script
+    return minimap2_script, minimap2_finish_script
diff --git a/qp_pacbio/scripts.py b/qp_pacbio/scripts.py
@@ -6,7 +6,6 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # -----------------------------------------------------------------------------
-import enum
 from glob import glob
 from os import makedirs
 from os.path import join
@@ -21,6 +20,7 @@
     PACBIO_PROCESSING_STEPS,
     generate_minimap2_processing,
     generate_sample_list,
+    generate_syndna_processing,
     pacbio_generate_templates,
 )
 from qp_pacbio.util import client_connect
@@ -57,18 +57,23 @@ def execute(url, job_id, output_dir):
         command = job_info["command"]
         artifact_id = parameters["artifact"]
 
-        if command == "Woltka v0.1.7, minimap2":
-            main_fp, merge_fp = generate_minimap2_processing(
+        regular_commands = {
+            "Woltka v0.1.7, minimap2": generate_minimap2_processing,
+            "Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2)": generate_syndna_processing,
+        }
+
+        if command in regular_commands.keys():
+            first_fp, second_fp = regular_commands[command](
                 qclient, job_id, output_dir, parameters, url
             )
 
             # Submitting jobs and returning id
-            main_job = run(["sbatch", main_fp], stdout=PIPE)
-            main_job_id = main_job.stdout.decode("utf8").split()[-1]
-            cmd = ["sbatch", "-d", f"afterok:{main_job_id}", merge_fp]
-            merge_job = run(cmd, stdout=PIPE)
-            merge_job_id = merge_job.stdout.decode("utf8").split()[-1]
-            print(f"{main_job_id}, {merge_job_id}")
+            first_job = run(["sbatch", first_fp], stdout=PIPE)
+            first_job_id = first_job.stdout.decode("utf8").split()[-1]
+            cmd = ["sbatch", "-d", f"afterok:{first_job_id}", second_fp]
+            second_job = run(cmd, stdout=PIPE)
+            second_job_id = second_job.stdout.decode("utf8").split()[-1]
+            print(f"{first_job_id}, {second_job_id}")
             qclient.update_job_step(job_id, "Step 2 of 4: Aligning sequences")
         elif command == "PacBio processing":
             frp = join(output_dir, "results")
@@ -151,22 +156,20 @@ def _biom_merge(tables):
     return full
 
 
-class BIOMMergeOptions(enum.Enum):
-    SYNDNA = enum.auto()
-    WOLTKA = enum.auto()
-
-
 @click.command()
 @click.option("--base", type=click.Path(exists=True), required=True)
-@click.option("--type", type=click.Choice(BIOMMergeOptions, case_sensitive=False))
-def biom_merge(base, type: BIOMMergeOptions):
+@click.option(
+    "--merge-type", type=click.Choice(["syndna", "woltka"], case_sensitive=False)
+)
+def biom_merge(base, merge_type):
     """Merges all PacBio biom tables"""
-    if type == BIOMMergeOptions.SYNDNA:
+    merge_type = merge_type.lower()
+    if merge_type == "syndna":
         ranks = ["syndna"]
-    elif type == BIOMMergeOptions.WOLTKA:
+    elif merge_type == "woltka":
         ranks = ["none", "per-gene", "ko", "ec", "pathway"]
     else:
-        raise ValueError(f"Type '{type}' not supported")
+        raise ValueError(f"Type '{merge_type}' not supported")
 
     for rank in ranks:
         rank = rank + ".biom"
diff --git a/qp_pacbio/tests/test_pacbio.py b/qp_pacbio/tests/test_pacbio.py