markdups and molcov (#210)

* default is 0 * speed up gzipping by using gnu * add sequencer for optical dup buffer * better text * nicer formatting * add wildcard to rule * use pre-binned Counter * much faster mol cov * rm the stupid _Inline folder perl makes * fix to increment bins between start and end * swap the manual CLI for an intelligent per-sample search * fix reference to sequencer * rm unnecessary call to cut * rash of cli updates * better text * fix cli * fix test call
pdimens · Feb 24, 2025 · 2579cd5 · 2579cd5
1 parent 07c227b
commit 2579cd5
Show file tree

Hide file tree

Showing 32 changed files with 165 additions and 203 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -234,7 +234,7 @@ jobs:
         run: harpy qc -x "--low_complexity_filter" --quiet 2 test/fastq
       - name: harpy qc all options
         shell: micromamba-shell {0}
-        run: harpy qc -a auto -d -c 21,40,3,0 --quiet 2 test/fastq
+        run: harpy qc -a auto -d -c 21 40 3 0 --quiet 2 test/fastq
   deconvolve:
     needs: [changes]
     if: ${{ needs.changes.outputs.deconvolve == 'true' }}

diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py
@@ -9,27 +9,6 @@ def convert_to_int(ctx, param, value):
         return None
     return int(value)
 
-class IntList(click.ParamType):
-    """A class for a click type which accepts an arbitrary number of integers, separated by a comma."""
-    name = "int_list"
-    def __init__(self, entries):
-        super().__init__()
-        self.entries = entries
-
-    def convert(self, value, param, ctx):
-        try:
-            parts = [i.strip() for i in value.split(',')]
-            if len(parts) != self.entries:
-                raise ValueError
-            for i in parts:
-                try:
-                    int(i)
-                except:
-                    raise ValueError
-            return [int(i) for i in parts]
-        except ValueError:
-            self.fail(f"{value} is not a valid list of integers. The value should be {self.entries} integers separated by a comma.", param, ctx)
-
 class KParam(click.ParamType):
     """A class for a click type which accepts any number of odd integers separated by a comma, or the word auto."""
     name = "k_param"

diff --git a/harpy/align.py b/harpy/align.py
@@ -45,7 +45,7 @@ def align():
     "harpy align ema": [
         {
             "name": "Parameters",
-            "options": ["--ema-bins", "--extra-params", "--fragment-density", "--genome", "--keep-unmapped", "--platform", "--min-quality", "--barcode-list"],
+            "options": [ "--barcode-list", "--ema-bins", "--extra-params", "--fragment-density", "--genome", "--keep-unmapped", "--platform", "--min-quality"],
             "panel_styles": {"border_style": "blue"}
         },
         {
@@ -68,15 +68,15 @@ def align():
     ]
 }
 
-@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/")
+@click.command(epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/")
 @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping')
 @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-x', '--extra-params', type = BwaParams(), help = 'Additional bwa mem parameters, in quotes')
 @click.option('-u', '--keep-unmapped',  is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
-@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
-@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Distance cutoff to split molecules (bp)')
+@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(0, 40, clamp = True), help = 'Minimum mapping quality to pass filtering')
+@click.option('-d', '--molecule-distance', default = 0, show_default = True, type = click.IntRange(min = 0), help = 'Distance cutoff for molecule assignment (bp)')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/bwa", show_default=True,  help = 'Output directory name')
-@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use')
+@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(4,999, clamp = True), help = 'Number of threads to use')
 @click.option('--container',  is_flag = True, default = False, help = 'Use a container instead of conda')
 @click.option('--contigs',  type = ContigList(), help = 'File or list of contigs to plot')
 @click.option('--setup-only',  is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
@@ -95,8 +95,8 @@ def bwa(inputs, output_dir, genome, depth_window, ignore_bx, threads, keep_unmap
     
     BWA is a fast, robust, and reliable aligner that does not use barcodes when mapping.
     Harpy will post-processes the alignments using the specified `--molecule-distance`
-    to assign alignments to unique molecules. Use `--molecule-distance 0` to ignore
-    alignment distance during molecule assignment.
+    to assign alignments to unique molecules. Use a value >`0` for `--molecule-distance` to have
+    harpy perform alignment-distance based barcode deconvolution.
     """
     output_dir = output_dir.rstrip("/")
     workflowdir = os.path.join(output_dir, 'workflow')
@@ -156,17 +156,17 @@ def bwa(inputs, output_dir, genome, depth_window, ignore_bx, threads, keep_unmap
     )
     launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet, "workflow/align.bwa.summary")
 
-@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema")
+@click.command(context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema")
 @click.option('-x', '--extra-params', type = EmaParams(), help = 'Additional ema align parameters, in quotes')
 @click.option('-d', '--fragment-density',  is_flag = True, show_default = True, default = False, help = 'Perform read fragment density optimization')
 @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
-@click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins")
+@click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000, clamp = True), help="Number of barcode bins")
 @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping')
 @click.option('-u', '--keep-unmapped',  is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
-@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
+@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(0, 40, clamp = True), help = 'Minimum mapping quality to pass filtering')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True,  help = 'Output directory name')
 @click.option('-p', '--platform', type = click.Choice(['haplotag', '10x'], case_sensitive=False), default = "haplotag", show_default=True, help = "Linked read bead technology\n[haplotag, 10x]")
-@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use')
+@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(4,999, clamp = True), help = 'Number of threads to use')
 @click.option('-l', '--barcode-list', type = click.Path(exists=True, dir_okay=False), help = "File of known barcodes for 10x linked reads")
 @click.option('--setup-only',  is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
 @click.option('--contigs',  type = ContigList(), help = 'File or list of contigs to plot')
@@ -266,16 +266,16 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
     )
     launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary")
 
-@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/")
+@click.command(epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/")
 @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping')
 @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-x', '--extra-params', type = StrobeAlignParams(), help = 'Additional aligner parameters, in quotes')
 @click.option('-u', '--keep-unmapped',  is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
-@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
-@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Distance cutoff to split molecules (bp)')
+@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(0, 40, clamp = True), help = 'Minimum mapping quality to pass filtering')
+@click.option('-d', '--molecule-distance', default = 0, show_default = True, type = click.IntRange(min = 0), help = 'Distance cutoff for molecule assignment (bp)')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/strobealign", show_default=True,  help = 'Output directory name')
 @click.option('-l', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
-@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use')
+@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(4,999, clamp = True), help = 'Number of threads to use')
 @click.option('--contigs',  type = ContigList(), help = 'File or list of contigs to plot')
 @click.option('--container',  is_flag = True, default = False, help = 'Use a container instead of conda')
 @click.option('--setup-only',  is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
@@ -293,9 +293,8 @@ def strobe(inputs, output_dir, genome, read_length, ignore_bx, keep_unmapped, de
     files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
     
     strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp and does 
-    not use barcodes when mapping, so Harpy will post-processes the alignments using the
-    specified `--molecule-distance` to assign alignments to unique molecules. Use 
-    `--molecule-distance 0` to ignore alignment distance during molecule assignment.
+    not use barcodes when mapping. Use a value >`0` for `--molecule-distance` to have
+    harpy perform alignment-distance based barcode deconvolution.
     
     The `--read-length` is an *approximate* parameter and should be one of [`auto`, `50`, `75`, `100`, `125`, `150`, `250`, `400`].
     The alignment process will be faster and take up less disk/RAM if you specify an `-l` value that isn't

diff --git a/harpy/assembly.py b/harpy/assembly.py
@@ -32,26 +32,26 @@
     ]
 }
 
-@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/assembly")
+@click.command(context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/assembly")
 # SPADES
 @click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode [`BX`,`BC`]")
 @click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)')
-@click.option('-r', '--max-memory',  type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes')
+@click.option('-r', '--max-memory',  type = click.IntRange(min = 1000), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes')
 @click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes')
 # TIGMINT/ARCS/LINKS
 @click.option('-y', '--arcs-extra', type = ArcsParams(), help = 'Additional ARCS parameters, in quotes (`option=arg` format)')
 @click.option("-c","--contig-length", type = int, default = 500, show_default = True, help = "Minimum contig length")
 @click.option("-n", "--links", type = int, default = 5, show_default = True, help = "Minimum number of links to compute scaffold")
 @click.option("-a", "--min-aligned", type = int, default = 5, show_default = True, help = "Minimum aligned read pairs per barcode")
-@click.option("-q", "--min-quality", type = click.IntRange(0,40), default = 0, show_default = True, help = "Minimum mapping quality")
+@click.option("-q", "--min-quality", type = click.IntRange(0,40, clamp = True), default = 0, show_default = True, help = "Minimum mapping quality")
 @click.option("-m", "--mismatch", type = int, default = 5, show_default = True, help = "Maximum number of mismatches")
 @click.option("-d", "--molecule-distance", type = int, default = 50000, show_default = True, help = "Distance cutoff to split molecules (bp)")
 @click.option("-l", "--molecule-length", type = int, default = 2000, show_default = True, help = "Minimum molecule length (bp)")
-@click.option("-i", "--seq-identity", type = click.IntRange(0,100), default = 98, show_default = True, help = "Minimum sequence identity") 
+@click.option("-i", "--seq-identity", type = click.IntRange(0,100, clamp = True), default = 98, show_default = True, help = "Minimum sequence identity") 
 @click.option("-s", "--span", type = int, default = 20, show_default = True, help = "Minimum number of spanning molecules to be considered assembled")
 # Other Options
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Assembly", show_default=True,  help = 'Output directory name')
-@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use')
+@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(1, 999, clamp = True), help = 'Number of threads to use')
 @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]")
 @click.option('--container',  is_flag = True, default = False, help = 'Use a container instead of conda')
 @click.option('--hpc',  type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file')

diff --git a/harpy/bin/inline_to_haplotag.py b/harpy/bin/inline_to_haplotag.py
@@ -5,6 +5,7 @@
 import gzip
 import sqlite3
 import argparse
+import subprocess
 from itertools import zip_longest
 import pysam
 
@@ -119,15 +120,17 @@ def process_record(fw_rec, rv_rec, barcode_database, bc_len):
 with (
     pysam.FastxFile(args.forward) as fw,
     pysam.FastxFile(args.reverse) as rv,
-    gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) as fw_out,
-    gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) as rv_out,
+    open(f"{args.prefix}.R1.fq.gz", "wb") as fw_out,
+    open(f"{args.prefix}.R2.fq.gz", "wb") as rv_out,
 ):
+    gzip_R1 = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=fw_out)
+    gzip_R2 = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=rv_out)
     for fw_record, rv_record in zip_longest(fw, rv):
         new_fw, new_rv = process_record(fw_record, rv_record, bc_db, bc_len)
         if new_fw:
-            fw_out.write(new_fw.encode("utf-8"))
+            gzip_R1.stdin.write(new_fw.encode("utf-8"))
         if new_rv:
-            rv_out.write(new_rv.encode("utf-8"))
+            gzip_R2.stdin.write(new_rv.encode("utf-8"))
 
 bc_db.cursor().close()