Skip to content

Commit

Permalink
markdups and molcov (#210)
Browse files Browse the repository at this point in the history
* default is 0

* speed up gzipping by using gnu

* add sequencer for optical dup buffer

* better text

* nicer formatting

* add wildcard to rule

* use pre-binned Counter

* much faster mol cov

* rm the stupid _Inline folder perl makes

* fix to increment bins between start and end

* swap the manual CLI for an intelligent per-sample search

* fix reference to sequencer

* rm unnecessary call to cut

* rash of cli updates

* better text

* fix cli

* fix test call
  • Loading branch information
pdimens authored Feb 24, 2025
1 parent 07c227b commit 2579cd5
Show file tree
Hide file tree
Showing 32 changed files with 165 additions and 203 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ jobs:
run: harpy qc -x "--low_complexity_filter" --quiet 2 test/fastq
- name: harpy qc all options
shell: micromamba-shell {0}
run: harpy qc -a auto -d -c 21,40,3,0 --quiet 2 test/fastq
run: harpy qc -a auto -d -c 21 40 3 0 --quiet 2 test/fastq
deconvolve:
needs: [changes]
if: ${{ needs.changes.outputs.deconvolve == 'true' }}
Expand Down
21 changes: 0 additions & 21 deletions harpy/_cli_types_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,6 @@ def convert_to_int(ctx, param, value):
return None
return int(value)

class IntList(click.ParamType):
"""A class for a click type which accepts an arbitrary number of integers, separated by a comma."""
name = "int_list"
def __init__(self, entries):
super().__init__()
self.entries = entries

def convert(self, value, param, ctx):
try:
parts = [i.strip() for i in value.split(',')]
if len(parts) != self.entries:
raise ValueError
for i in parts:
try:
int(i)
except:
raise ValueError
return [int(i) for i in parts]
except ValueError:
self.fail(f"{value} is not a valid list of integers. The value should be {self.entries} integers separated by a comma.", param, ctx)

class KParam(click.ParamType):
"""A class for a click type which accepts any number of odd integers separated by a comma, or the word auto."""
name = "k_param"
Expand Down
35 changes: 17 additions & 18 deletions harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def align():
"harpy align ema": [
{
"name": "Parameters",
"options": ["--ema-bins", "--extra-params", "--fragment-density", "--genome", "--keep-unmapped", "--platform", "--min-quality", "--barcode-list"],
"options": [ "--barcode-list", "--ema-bins", "--extra-params", "--fragment-density", "--genome", "--keep-unmapped", "--platform", "--min-quality"],
"panel_styles": {"border_style": "blue"}
},
{
Expand All @@ -68,15 +68,15 @@ def align():
]
}

@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/")
@click.command(epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/")
@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping')
@click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
@click.option('-x', '--extra-params', type = BwaParams(), help = 'Additional bwa mem parameters, in quotes')
@click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Distance cutoff to split molecules (bp)')
@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(0, 40, clamp = True), help = 'Minimum mapping quality to pass filtering')
@click.option('-d', '--molecule-distance', default = 0, show_default = True, type = click.IntRange(min = 0), help = 'Distance cutoff for molecule assignment (bp)')
@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/bwa", show_default=True, help = 'Output directory name')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(4,999, clamp = True), help = 'Number of threads to use')
@click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda')
@click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot')
@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
Expand All @@ -95,8 +95,8 @@ def bwa(inputs, output_dir, genome, depth_window, ignore_bx, threads, keep_unmap
BWA is a fast, robust, and reliable aligner that does not use barcodes when mapping.
Harpy will post-processes the alignments using the specified `--molecule-distance`
to assign alignments to unique molecules. Use `--molecule-distance 0` to ignore
alignment distance during molecule assignment.
to assign alignments to unique molecules. Use a value >`0` for `--molecule-distance` to have
harpy perform alignment-distance based barcode deconvolution.
"""
output_dir = output_dir.rstrip("/")
workflowdir = os.path.join(output_dir, 'workflow')
Expand Down Expand Up @@ -156,17 +156,17 @@ def bwa(inputs, output_dir, genome, depth_window, ignore_bx, threads, keep_unmap
)
launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet, "workflow/align.bwa.summary")

@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema")
@click.command(context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema")
@click.option('-x', '--extra-params', type = EmaParams(), help = 'Additional ema align parameters, in quotes')
@click.option('-d', '--fragment-density', is_flag = True, show_default = True, default = False, help = 'Perform read fragment density optimization')
@click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
@click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins")
@click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000, clamp = True), help="Number of barcode bins")
@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping')
@click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(0, 40, clamp = True), help = 'Minimum mapping quality to pass filtering')
@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True, help = 'Output directory name')
@click.option('-p', '--platform', type = click.Choice(['haplotag', '10x'], case_sensitive=False), default = "haplotag", show_default=True, help = "Linked read bead technology\n[haplotag, 10x]")
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(4,999, clamp = True), help = 'Number of threads to use')
@click.option('-l', '--barcode-list', type = click.Path(exists=True, dir_okay=False), help = "File of known barcodes for 10x linked reads")
@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
@click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot')
Expand Down Expand Up @@ -266,16 +266,16 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
)
launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary")

@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/")
@click.command(epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/")
@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping')
@click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
@click.option('-x', '--extra-params', type = StrobeAlignParams(), help = 'Additional aligner parameters, in quotes')
@click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Distance cutoff to split molecules (bp)')
@click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(0, 40, clamp = True), help = 'Minimum mapping quality to pass filtering')
@click.option('-d', '--molecule-distance', default = 0, show_default = True, type = click.IntRange(min = 0), help = 'Distance cutoff for molecule assignment (bp)')
@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/strobealign", show_default=True, help = 'Output directory name')
@click.option('-l', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(4,999, clamp = True), help = 'Number of threads to use')
@click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot')
@click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda')
@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
Expand All @@ -293,9 +293,8 @@ def strobe(inputs, output_dir, genome, read_length, ignore_bx, keep_unmapped, de
files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp and does
not use barcodes when mapping, so Harpy will post-processes the alignments using the
specified `--molecule-distance` to assign alignments to unique molecules. Use
`--molecule-distance 0` to ignore alignment distance during molecule assignment.
not use barcodes when mapping. Use a value >`0` for `--molecule-distance` to have
harpy perform alignment-distance based barcode deconvolution.
The `--read-length` is an *approximate* parameter and should be one of [`auto`, `50`, `75`, `100`, `125`, `150`, `250`, `400`].
The alignment process will be faster and take up less disk/RAM if you specify an `-l` value that isn't
Expand Down
10 changes: 5 additions & 5 deletions harpy/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,26 @@
]
}

@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/assembly")
@click.command(context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/assembly")
# SPADES
@click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode [`BX`,`BC`]")
@click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)')
@click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes')
@click.option('-r', '--max-memory', type = click.IntRange(min = 1000), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes')
@click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes')
# TIGMINT/ARCS/LINKS
@click.option('-y', '--arcs-extra', type = ArcsParams(), help = 'Additional ARCS parameters, in quotes (`option=arg` format)')
@click.option("-c","--contig-length", type = int, default = 500, show_default = True, help = "Minimum contig length")
@click.option("-n", "--links", type = int, default = 5, show_default = True, help = "Minimum number of links to compute scaffold")
@click.option("-a", "--min-aligned", type = int, default = 5, show_default = True, help = "Minimum aligned read pairs per barcode")
@click.option("-q", "--min-quality", type = click.IntRange(0,40), default = 0, show_default = True, help = "Minimum mapping quality")
@click.option("-q", "--min-quality", type = click.IntRange(0,40, clamp = True), default = 0, show_default = True, help = "Minimum mapping quality")
@click.option("-m", "--mismatch", type = int, default = 5, show_default = True, help = "Maximum number of mismatches")
@click.option("-d", "--molecule-distance", type = int, default = 50000, show_default = True, help = "Distance cutoff to split molecules (bp)")
@click.option("-l", "--molecule-length", type = int, default = 2000, show_default = True, help = "Minimum molecule length (bp)")
@click.option("-i", "--seq-identity", type = click.IntRange(0,100), default = 98, show_default = True, help = "Minimum sequence identity")
@click.option("-i", "--seq-identity", type = click.IntRange(0,100, clamp = True), default = 98, show_default = True, help = "Minimum sequence identity")
@click.option("-s", "--span", type = int, default = 20, show_default = True, help = "Minimum number of spanning molecules to be considered assembled")
# Other Options
@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Assembly", show_default=True, help = 'Output directory name')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(1, 999, clamp = True), help = 'Number of threads to use')
@click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]")
@click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda')
@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file')
Expand Down
11 changes: 7 additions & 4 deletions harpy/bin/inline_to_haplotag.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import gzip
import sqlite3
import argparse
import subprocess
from itertools import zip_longest
import pysam

Expand Down Expand Up @@ -119,15 +120,17 @@ def process_record(fw_rec, rv_rec, barcode_database, bc_len):
with (
pysam.FastxFile(args.forward) as fw,
pysam.FastxFile(args.reverse) as rv,
gzip.open(f"{args.prefix}.R1.fq.gz", "wb", 6) as fw_out,
gzip.open(f"{args.prefix}.R2.fq.gz", "wb", 6) as rv_out,
open(f"{args.prefix}.R1.fq.gz", "wb") as fw_out,
open(f"{args.prefix}.R2.fq.gz", "wb") as rv_out,
):
gzip_R1 = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=fw_out)
gzip_R2 = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=rv_out)
for fw_record, rv_record in zip_longest(fw, rv):
new_fw, new_rv = process_record(fw_record, rv_record, bc_db, bc_len)
if new_fw:
fw_out.write(new_fw.encode("utf-8"))
gzip_R1.stdin.write(new_fw.encode("utf-8"))
if new_rv:
rv_out.write(new_rv.encode("utf-8"))
gzip_R2.stdin.write(new_rv.encode("utf-8"))

bc_db.cursor().close()

Loading

0 comments on commit 2579cd5

Please sign in to comment.