diff --git a/.github/filters.yml b/.github/filters.yml index f5be11d4..a49e1790 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -142,7 +142,7 @@ phase: &phase simvars: &simvars - *common - *container - - 'harpy/simulate.py' + - 'harpy/simulate_variants.py' - 'harpy/snakefiles/simulate_snpindel.smk' - 'harpy/snakefiles/simulate_variants.smk' - 'test/vcf/test.bcf' @@ -150,7 +150,7 @@ simvars: &simvars simreads: &simreads - *common - *container - - 'harpy/simulate.py' + - 'harpy/simulate_linkedreads.py' - 'harpy/snakefiles/simulate_linkedreads.smk' - 'test/genome**gz' - 'extractReads.cpp' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 50adf14c..1e0ae9d9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -109,7 +109,7 @@ jobs: # if: ${{ needs.changes.outputs.modules == 'true' }} # run: | # export APPTAINER_TMPDIR=$PWD/test/ -# harpy qc --skip-reports --quiet test/fastq/sample1.*.fq.gz +# harpy qc --skip-reports --quiet 2 test/fastq/sample1.*.fq.gz # - name: Create Singularity Artifact # if: ${{ steps.singularity.outcome == 'success' }} # uses: actions/upload-artifact@v4 @@ -151,7 +151,7 @@ jobs: # path: .snakemake/singularity - name: harpy demultiplex shell: micromamba-shell {0} - run: harpy demultiplex gen1 --quiet --schema test/demux/samples.schema test/demux/Undetermined_S0_L004_R* test/demux/Undetermined_S0_L004_I* + run: harpy demultiplex gen1 --quiet 2 --schema test/demux/samples.schema test/demux/Undetermined_S0_L004_R* test/demux/Undetermined_S0_L004_I* preflight: needs: [changes] @@ -193,7 +193,7 @@ jobs: - name: test preflight bam if: always() shell: micromamba-shell {0} - run: harpy preflight bam --quiet test/bam + run: harpy preflight bam --quiet 2 test/bam qc: needs: [changes] @@ -231,10 +231,10 @@ jobs: # path: .snakemake/singularity - name: harpy qc shell: micromamba-shell {0} - run: harpy qc -x "--low_complexity_filter" --quiet test/fastq + run: harpy qc -x "--low_complexity_filter" --quiet 2 test/fastq - name: harpy qc all options shell: micromamba-shell {0} - run: harpy qc -a auto -d -c 21,40,3,0 --quiet test/fastq + run: harpy qc -a auto -d -c 21,40,3,0 --quiet 2 test/fastq deconvolve: needs: [changes] if: ${{ needs.changes.outputs.deconvolve == 'true' }} @@ -271,7 +271,7 @@ jobs: # path: .snakemake/singularity - name: harpy deconvolve shell: micromamba-shell {0} - run: harpy deconvolve --quiet test/fastq + run: harpy deconvolve --quiet 2 test/fastq bwa: needs: [changes] if: ${{ needs.changes.outputs.bwa == 'true' }} @@ -308,7 +308,7 @@ jobs: # path: .snakemake/singularity - name: test bwa shell: micromamba-shell {0} - run: harpy align bwa --quiet -g test/genome/genome.fasta.gz -x "-A 2" test/fastq + run: harpy align bwa --quiet 2 -g test/genome/genome.fasta.gz -x "-A 2" test/fastq ema: needs: [changes] @@ -346,7 +346,7 @@ jobs: # path: .snakemake/singularity - name: test ema shell: micromamba-shell {0} - run: harpy align ema --quiet --ema-bins 150 -g test/genome/genome.fasta.gz test/fastq + run: harpy align ema --quiet 2 --ema-bins 150 -g test/genome/genome.fasta.gz test/fastq strobe: needs: [changes] @@ -384,7 +384,7 @@ jobs: # path: .snakemake/singularity - name: test strobealign shell: micromamba-shell {0} - run: harpy align strobe --quiet -l 125 -g test/genome/genome.fasta.gz test/fastq + run: harpy align strobe --quiet 2 -l 125 -g test/genome/genome.fasta.gz test/fastq mpileup: needs: [changes] @@ -422,10 +422,10 @@ jobs: # path: .snakemake/singularity - name: snp mpileup shell: micromamba-shell {0} 
- run: harpy snp mpileup --quiet -r test/positions.bed -g test/genome/genome.fasta.gz -x "--ignore-RG" test/bam + run: harpy snp mpileup --quiet 2 -r test/positions.bed -g test/genome/genome.fasta.gz -x "--ignore-RG" test/bam - name: snp mpileup-pop shell: micromamba-shell {0} - run: harpy snp mpileup --quiet -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam + run: harpy snp mpileup --quiet 2 -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam freebayes: needs: [changes] @@ -463,10 +463,10 @@ jobs: # path: .snakemake/singularity - name: snp freebayes shell: micromamba-shell {0} - run: harpy snp freebayes --quiet -r test/positions.bed -g test/genome/genome.fasta.gz -x "-g 200" test/bam + run: harpy snp freebayes --quiet 2 -r test/positions.bed -g test/genome/genome.fasta.gz -x "-g 200" test/bam - name: snp freebayes-pop shell: micromamba-shell {0} - run: harpy snp freebayes --quiet -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam + run: harpy snp freebayes --quiet 2 -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam impute: needs: [changes] @@ -504,11 +504,11 @@ jobs: # path: .snakemake/singularity - name: impute shell: micromamba-shell {0} - run: harpy impute --quiet --vcf test/vcf/test.bcf -p test/stitch.params test/bam + run: harpy impute --quiet 2 --vcf test/vcf/test.bcf -p test/stitch.params test/bam - name: impute from vcf shell: micromamba-shell {0} if: always() - run: harpy impute --quiet --vcf-samples -o vcfImpute --vcf test/vcf/test.bcf -p test/stitch.params test/bam + run: harpy impute --quiet 2 --vcf-samples -o vcfImpute --vcf test/vcf/test.bcf -p test/stitch.params test/bam phase: needs: [changes] @@ -546,17 +546,17 @@ jobs: # path: .snakemake/singularity - name: phase shell: micromamba-shell {0} - run: harpy phase --quiet --vcf test/vcf/test.bcf -x "--max_iter 10001" test/bam + run: harpy phase --quiet 2 --vcf test/vcf/test.bcf -x "--max_iter 10001" test/bam - name: phase with indels shell: micromamba-shell {0} if: always() - run: harpy phase --quiet --vcf test/vcf/test.bcf -o phaseindel -g test/genome/genome.fasta.gz test/bam + run: harpy phase --quiet 2 --vcf test/vcf/test.bcf -o phaseindel -g test/genome/genome.fasta.gz test/bam - name: phase from vcf shell: micromamba-shell {0} if: always() run: | cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam.py -d pineapple1 test/bam/pineapple.bam - harpy phase --quiet --vcf-samples -o phasevcf --vcf test/vcf/test.bcf test/bam + harpy phase --quiet 2 --vcf-samples -o phasevcf --vcf test/vcf/test.bcf test/bam leviathan: needs: [changes] @@ -594,12 +594,12 @@ jobs: # path: .snakemake/singularity - name: leviathan shell: micromamba-shell {0} - run: harpy sv leviathan --quiet -s 100 -b 1 -g test/genome/genome.fasta.gz -x "-M 2002" test/bam + run: harpy sv leviathan --quiet 2 -s 100 -b 1 -g test/genome/genome.fasta.gz -x "-M 2002" test/bam continue-on-error: true - name: leviathan-pop if: always() shell: micromamba-shell {0} - run: harpy sv leviathan --quiet -s 100 -b 1 -g test/genome/genome.fasta.gz -o SV/leviathanpop -p test/samples.groups test/bam + run: harpy sv leviathan --quiet 2 -s 100 -b 1 -g test/genome/genome.fasta.gz -o SV/leviathanpop -p test/samples.groups test/bam naibr: needs: [changes] @@ -637,20 +637,20 @@ jobs: # path: .snakemake/singularity - name: naibr shell: micromamba-shell {0} - run: harpy sv naibr --quiet -g 
test/genome/genome.fasta.gz -o SV/naibr -x "-min_sv 5000" test/bam_phased && rm -r Genome + run: harpy sv naibr --quiet 2 -g test/genome/genome.fasta.gz -o SV/naibr -x "-min_sv 5000" test/bam_phased && rm -r Genome - name: naibr pop if: always() shell: micromamba-shell {0} - run: harpy sv naibr --quiet -g test/genome/genome.fasta.gz -o SV/pop -p test/samples.groups test/bam_phased && rm -r Genome + run: harpy sv naibr --quiet 2 -g test/genome/genome.fasta.gz -o SV/pop -p test/samples.groups test/bam_phased && rm -r Genome - name: naibr with phasing if: always() shell: micromamba-shell {0} run: | - harpy sv naibr --quiet -g test/genome/genome.fasta.gz -o SV/phase -v test/vcf/test.phased.bcf test/bam && rm -r Genome + harpy sv naibr --quiet 2 -g test/genome/genome.fasta.gz -o SV/phase -v test/vcf/test.phased.bcf test/bam && rm -r Genome - name: naibr pop with phasing if: always() shell: micromamba-shell {0} - run: harpy sv naibr --quiet -g test/genome/genome.fasta.gz -o SV/phasepop -v test/vcf/test.phased.bcf -p test/samples.groups test/bam && rm -r Genome + run: harpy sv naibr --quiet 2 -g test/genome/genome.fasta.gz -o SV/phasepop -v test/vcf/test.phased.bcf -p test/samples.groups test/bam && rm -r Genome simulate_variants: @@ -690,26 +690,26 @@ jobs: - name: simulate random snps/indels shell: micromamba-shell {0} run: | - harpy simulate snpindel --quiet --snp-count 10 --indel-count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate snpindel --quiet --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/haplotype_1/sim.hap1.snp.vcf --indel-vcf Simulate/snpindel/haplotype_1/sim.hap1.indel.vcf test/genome/genome.fasta.gz + harpy simulate snpindel --quiet 2 --snp-count 10 --indel-count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate snpindel --quiet 2 --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/haplotype_1/sim.hap1.snp.vcf --indel-vcf Simulate/snpindel/haplotype_1/sim.hap1.indel.vcf test/genome/genome.fasta.gz - name: simulate inversions shell: micromamba-shell {0} if: always() run: | - harpy simulate inversion --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate inversion --quiet --prefix Simulate/invvcf --vcf Simulate/inversion/haplotype_1/sim.hap1.inversion.vcf test/genome/genome.fasta.gz + harpy simulate inversion --quiet 2 --count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate inversion --quiet 2 --prefix Simulate/invvcf --vcf Simulate/inversion/haplotype_1/sim.hap1.inversion.vcf test/genome/genome.fasta.gz - name: simulate cnv shell: micromamba-shell {0} if: always() run: | - harpy simulate cnv --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate cnv --quiet --prefix Simulate/cnvvcf --vcf Simulate/cnv/haplotype_1/sim.hap1.cnv.vcf test/genome/genome.fasta.gz + harpy simulate cnv --quiet 2 --count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate cnv --quiet 2 --prefix Simulate/cnvvcf --vcf Simulate/cnv/haplotype_1/sim.hap1.cnv.vcf test/genome/genome.fasta.gz - name: simulate translocations shell: micromamba-shell {0} if: always() run: | - harpy simulate translocation --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate translocation --quiet --prefix Simulate/transvcf --vcf Simulate/translocation/haplotype_1/sim.hap1.translocation.vcf test/genome/genome.fasta.gz + harpy simulate translocation --quiet 2 --count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate translocation --quiet 2 --prefix Simulate/transvcf --vcf Simulate/translocation/haplotype_1/sim.hap1.translocation.vcf 
test/genome/genome.fasta.gz

  simulate_linkedreads:
    needs: [changes]
@@ -749,12 +749,12 @@ jobs:
        shell: micromamba-shell {0}
        run: |
          haplotag_barcodes.py -n 14000000 > test/haplotag.bc
-          harpy simulate linkedreads --quiet -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz
+          harpy simulate linkedreads --quiet 2 -t 4 -n 2 -b test/haplotag.bc -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz

  assembly:
    needs: [changes]
    if: ${{ needs.changes.outputs.assembly == 'true' }}
-    name: metassembly
+    name: assembly
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@@ -787,13 +787,13 @@ jobs:
#          path: .snakemake/singularity
      - name: test assembly
        shell: micromamba-shell {0}
-        run: harpy assembly --quiet -r 4000 test/fastq/sample1.*
+        run: harpy assembly --quiet 2 -r 4000 test/fastq/sample1.*
      - name: test metassembly
        shell: micromamba-shell {0}
-        run: harpy metassembly --quiet -r 4000 test/fastq/sample1.*
+        run: harpy metassembly --quiet 2 -r 4000 test/fastq/sample1.*
      - name: test metassembly without barcodes
        shell: micromamba-shell {0}
-        run: harpy metassembly --ignore-bx --quiet -r 4000 test/fastq/sample1.*
+        run: harpy metassembly --ignore-bx --quiet 2 -r 4000 test/fastq/sample1.*

  extras:
    needs: [changes]
@@ -832,10 +832,10 @@ jobs:
        run: harpy popgroup test/fastq
      - name: harpy downsample bam
        shell: micromamba-shell {0}
-        run: harpy downsample -d 1 --random-seed 699 --quiet test/bam/sample1.bam
+        run: harpy downsample -d 1 --random-seed 699 --quiet 2 test/bam/sample1.bam
      - name: harpy downsample fastq
        shell: micromamba-shell {0}
-        run: harpy downsample -d 1 --quiet test/fastq/sample1.*gz
+        run: harpy downsample -d 1 --quiet 2 test/fastq/sample1.*gz
      - name: harpy hpc
        shell: micromamba-shell {0}
        run: |
diff --git a/harpy/__main__.py b/harpy/__main__.py
index 2a39a6ff..955f676f 100644
--- a/harpy/__main__.py
+++ b/harpy/__main__.py
@@ -2,24 +2,22 @@
 import rich_click as click
 from . import align
+from . import diagnose, resume, view
 from . import deconvolve
 from . import demultiplex
 from . import container
 from . import hpc
 from . import impute
-from . import assembly
-from . import metassembly
+from . import assembly, metassembly
 from . import qc
 from . import phase
 from . import preflight
-from . import resume
-from . import simulate
+from . import simulate_linkedreads, simulate_variants
 from . import snp
 from . import sv
 from .popgroup import popgroup
 from . import downsample
 from .imputeparams import imputeparams
-from . import view

 click.rich_click.USE_MARKDOWN = True
 click.rich_click.SHOW_ARGUMENTS = False
@@ -43,6 +41,38 @@ def cli():
     **Documentation**: [https://pdimens.github.io/harpy/](https://pdimens.github.io/harpy/)
     """

+## unify simulate commands
+@click.group(options_metavar='', context_settings={"help_option_names" : ["-h", "--help"]})
+def simulate():
+    """
+    Simulate variants or linked-reads from a genome
+
+    To simulate genomic variants, provide an additional subcommand {`snpindel`,`inversion`,`cnv`,`translocation`}
+    to get more information about that workflow. The variant simulator (`simuG`) can only simulate
+    one type of variant at a time, so you may need to run it a few times if you want multiple variant types.
+    Use `simulate linkedreads` to simulate haplotag linked-reads from a diploid genome, which you can create by simulating
+    genomic variants.
+ """ + +simulate_commandstring = { + "harpy simulate": [ + { + "name": "Linked Read Sequences", + "commands": ["linkedreads"], + }, + { + "name": "Genomic Variants", + "commands": ["cnv", "inversion", "snpindel", "translocation"], + } + ] +} + +simulate.add_command(simulate_linkedreads.linkedreads) +simulate.add_command(simulate_variants.snpindel) +simulate.add_command(simulate_variants.inversion) +simulate.add_command(simulate_variants.cnv) +simulate.add_command(simulate_variants.translocation) + # main program cli.add_command(downsample.downsample) cli.add_command(popgroup) @@ -56,28 +86,32 @@ def cli(): cli.add_command(sv.sv) cli.add_command(impute.impute) cli.add_command(phase.phase) -cli.add_command(simulate.simulate) +cli.add_command(simulate) cli.add_command(container.containerize) cli.add_command(hpc.hpc) cli.add_command(resume.resume) cli.add_command(deconvolve.deconvolve) cli.add_command(metassembly.metassembly) cli.add_command(assembly.assembly) - +cli.add_command(diagnose.diagnose) ## the workflows ## click.rich_click.COMMAND_GROUPS = { "harpy": [ { - "name": "workflows", - "commands": sorted(["demultiplex","qc", "align","snp","sv","impute","phase", "simulate", "assembly", "metassembly"]), + "name": "Workflows", + "commands": sorted(["demultiplex","qc", "align","snp","sv","impute","phase", "simulate", "assembly", "metassembly"]) }, { "name": "Other Commands", - "commands": sorted(["deconvolve", "downsample", "hpc", "imputeparams", "popgroup","preflight","resume", "view"]) + "commands": sorted(["deconvolve", "downsample", "hpc", "imputeparams", "popgroup"]) + }, + { + "name": "Troubleshoot", + "commands": sorted(["view", "resume", "diagnose", "preflight"]) } ], - } | simulate.commandstring | hpc.docstring + } | simulate_commandstring | hpc.docstring -for i in [align, deconvolve, downsample, demultiplex, impute, phase, preflight, qc, simulate, snp, sv, assembly, metassembly]: +for i in [align, deconvolve, downsample, demultiplex, impute, phase, preflight, qc, simulate_linkedreads, simulate_variants, snp, sv, assembly, metassembly]: click.rich_click.OPTION_GROUPS |= i.docstring diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py index ace195b3..aef15ba9 100644 --- a/harpy/_cli_types_generic.py +++ b/harpy/_cli_types_generic.py @@ -3,6 +3,12 @@ import os import click +def convert_to_int(ctx, param, value): + # This function converts the string choice to an integer + if value is None: + return None + return int(value) + class IntList(click.ParamType): """A class for a click type which accepts an arbitrary number of integers, separated by a comma.""" name = "int_list" @@ -92,11 +98,13 @@ class SnakemakeParams(click.ParamType): name = "snakemake_params" def convert(self, value, param, ctx): forbidden = "--rerun-incomplete --ri --show-failed-logs --rerun-triggers --nolock --software-deployment-method --smd --deployment --deployment-method --conda-prefix --cores -c --directory -d --snakefile -s --configfile --configfiles".split() - available = "--dry-run --dryrun -n --profile --cache --jobs -j --local-cores --resources --res --set-threads --max-threads --set-resources --set-scatter --set-resource-scopes --default-resources --default-res --preemptible-rules --preemptible-retries --envvars --touch -t --keep-going -k --force -f --executor -e --forceall -F --forcerun -R --prioritize -P --batch --until -U --omit-from -O --shadow-prefixDIR --scheduler --wms-monitor --wms-monitor-arg --scheduler-ilp-solver --conda-base-path --no-subworkflows --nosw --precommand --groups 
--group-components --report --report-stylesheet --reporterPLUGIN --draft-notebook --edit-notebook --notebook-listen --lint --generate-unit-tests --containerize --export-cwl --list-rules --list -l --list-target-rules --lt --dag --rulegraph --filegraph --d3dag --summary -S --detailed-summary -D --archive --cleanup-metadata --cmFILE --cleanup-shadow --skip-script-cleanup --unlock --list-changes --lc --list-input-changes --li --list-params-changes --lp --list-untracked --lu --delete-all-output --delete-temp-output --keep-incomplete --drop-metadata --version -v --printshellcmds -p --debug-dag --nocolor --quiet -q --print-compilation --verbose --force-use-threads --allow-ambiguity -a --ignore-incomplete --ii --max-inventory-time --latency-wait --output-wait -w --wait-for-files --wait-for-files-file --queue-input-wait-time --notemp --nt --all-temp --unneeded-temp-files --keep-storage-local-copies --target-files-omit-workdir-adjustment --allowed-rules --max-jobs-per-timespan --max-jobs-per-second --max-status-checks-per-second --seconds-between-status-checks --retries --restart-times -T --wrapper-prefix --default-storage-provider --default-storage-prefix --local-storage-prefix --remote-job-local-storage-prefix --shared-fs-usage --scheduler-greediness --greediness --no-hooks --debug --runtime-profile --local-groupid --attempt --log-handler-script --log-service --job-deploy-sources --benchmark-extended --container-image --immediate-submit --is --jobscript --js --jobname --jn --flux --container-cleanup-images --use-conda --conda-not-block-search-path-envvars --list-conda-envs --conda-cleanup-envs --conda-cleanup-pkgs --conda-create-envs-only --conda-frontend --use-apptainer --use-singularity --apptainer-prefix --singularity-prefix --apptainer-args --singularity-args --use-envmodules --scheduler-solver-path --deploy-sources --target-jobs --mode --report-html-path --report-html-stylesheet-path".split() + available = "--profile --cache --jobs -j --local-cores --resources --res --set-threads --max-threads --set-resources --set-scatter --set-resource-scopes --default-resources --default-res --preemptible-rules --preemptible-retries --envvars --touch -t --keep-going -k --force -f --executor -e --forceall -F --forcerun -R --prioritize -P --batch --until -U --omit-from -O --shadow-prefixDIR --scheduler --wms-monitor --wms-monitor-arg --scheduler-ilp-solver --conda-base-path --no-subworkflows --nosw --precommand --groups --group-components --report --report-stylesheet --reporterPLUGIN --draft-notebook --edit-notebook --notebook-listen --lint --generate-unit-tests --containerize --export-cwl --list-rules --list -l --list-target-rules --lt --dag --rulegraph --filegraph --d3dag --summary -S --detailed-summary -D --archive --cleanup-metadata --cmFILE --cleanup-shadow --skip-script-cleanup --unlock --list-changes --lc --list-input-changes --li --list-params-changes --lp --list-untracked --lu --delete-all-output --delete-temp-output --keep-incomplete --drop-metadata --version -v --printshellcmds -p --debug-dag --nocolor --quiet -q --print-compilation --verbose --force-use-threads --allow-ambiguity -a --ignore-incomplete --ii --max-inventory-time --latency-wait --output-wait -w --wait-for-files --wait-for-files-file --queue-input-wait-time --notemp --nt --all-temp --unneeded-temp-files --keep-storage-local-copies --target-files-omit-workdir-adjustment --allowed-rules --max-jobs-per-timespan --max-jobs-per-second --max-status-checks-per-second --seconds-between-status-checks --retries --restart-times -T 
--wrapper-prefix --default-storage-provider --default-storage-prefix --local-storage-prefix --remote-job-local-storage-prefix --shared-fs-usage --scheduler-greediness --greediness --no-hooks --debug --runtime-profile --local-groupid --attempt --log-handler-script --log-service --job-deploy-sources --benchmark-extended --container-image --immediate-submit --is --jobscript --js --jobname --jn --flux --container-cleanup-images --use-conda --conda-not-block-search-path-envvars --list-conda-envs --conda-cleanup-envs --conda-cleanup-pkgs --conda-create-envs-only --conda-frontend --use-apptainer --use-singularity --apptainer-prefix --singularity-prefix --apptainer-args --singularity-args --use-envmodules --scheduler-solver-path --deploy-sources --target-jobs --mode --report-html-path --report-html-stylesheet-path".split() for i in value.split(): if i.startswith("-"): if i in forbidden: self.fail(f"{i} is a forbidden option because it is already used by Harpy to call Snakemake.", param, ctx) + if i == "--dry-run" or i == "-n": + self.fail(f"The dry run option ({i}) is incompatible with the way Harpy launches Snakemake with a progress bar. Please use `harpy diagnose` instead.") if i not in available: self.fail(f"{i} is not a valid Snakemake option. Run \'snakemake --help\' for a list of all Snakemake command line options.", param, ctx) return value diff --git a/harpy/_launch.py b/harpy/_launch.py index 815f3f74..c3891672 100644 --- a/harpy/_launch.py +++ b/harpy/_launch.py @@ -16,6 +16,10 @@ EXIT_CODE_CONDA_ERROR = 2 EXIT_CODE_RUNTIME_ERROR = 3 SNAKEMAKE_CMD = "snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --conda-prefix .environments --conda-cleanup-pkgs cache --apptainer-prefix .environments --directory ." +# quiet = 0 : print all things, full progressbar +# quiet = 1 : print all text, only "Total" progressbar +# quiet = 2 : print nothing, no progressbar + # logic to properly refresh progress bar for jupyter sessions try: __IPYTHON__ @@ -57,7 +61,7 @@ def purge_empty_logs(target_dir): def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, summaryfile = None): """launch snakemake with the given commands""" start_time = datetime.now() - if not quiet: + if quiet < 2: print_onstart(starttext, workflow.replace("_", " ")) exitcode = None try: @@ -72,7 +76,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, su exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_GENERIC_ERROR exitcode = EXIT_CODE_CONDA_ERROR if "Conda" in output else exitcode break - if not quiet: + if quiet < 2: console = Console() with console.status("[dim]Preparing workflow", spinner = "point", spinner_style="yellow") as status: while output.startswith("Building DAG of jobs...") or output.startswith("Assuming"): @@ -137,7 +141,8 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, su break if process.poll() or exitcode: break - task_ids = {"total_progress" : progress.add_task("[bold blue]Total", total=job_inventory["total"][1])} + total_text = "[bold blue]Total" if quiet == 0 else "[bold blue]Progress" + task_ids = {"total_progress" : progress.add_task(total_text, total=job_inventory["total"][1])} while output: output = process.stderr.readline() @@ -154,7 +159,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, su if rulematch: rule = rulematch.group().replace(":","").split()[-1] if rule not in task_ids: - task_ids[rule] = 
progress.add_task(job_inventory[rule][0], total=job_inventory[rule][1]) + task_ids[rule] = progress.add_task(job_inventory[rule][0], total=job_inventory[rule][1], visible = quiet != 1) continue # store the job id in the inventory so we can later look up which rule it's associated with jobidmatch = re.search(r"jobid:\s\d+", string = output) @@ -179,7 +184,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, su if process.returncode < 1: gzip_file(sm_logfile) purge_empty_logs(outdir) - if not quiet: + if quiet < 2: end_time = datetime.now() elapsed_time = end_time - start_time print_onsuccess(outdir, summaryfile, elapsed_time) @@ -209,7 +214,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, su purge_empty_logs(outdir) sys.exit(1) except KeyboardInterrupt: - # Handle the keyboard interrupt + # Handle a keyboard interrupt console = Console(stderr=True) console.print("") console.rule("[bold]Terminating Harpy", style = "yellow") diff --git a/harpy/_misc.py b/harpy/_misc.py index 7ebb44b8..9856b709 100644 --- a/harpy/_misc.py +++ b/harpy/_misc.py @@ -9,27 +9,29 @@ from datetime import datetime from pathlib import Path from importlib_resources import files -from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn +from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn, TaskProgressColumn import harpy.scripts import harpy.reports import harpy.snakefiles from ._printing import print_error, print_solution -def harpy_progressbar(quiet): +def harpy_progressbar(quiet: int) -> Progress: """ The pre-configured transient progress bar that workflows and validations use """ + progress_text = "[progress.percentage]{task.percentage:>3.0f}%" if quiet == 1 else "[progress.remaining]{task.completed}/{task.total}" + # TextColumn("[progress.remaining]{task.completed}/{task.total}", style = "magenta") return Progress( SpinnerColumn(spinner_name = "arc", style = "dim"), TextColumn("[progress.description]{task.description}"), BarColumn(complete_style="yellow", finished_style="blue"), - TextColumn("[progress.remaining]{task.completed}/{task.total}", style = "magenta"), + TaskProgressColumn(progress_text, style="magenta"), TimeElapsedColumn(), - transient=True, - disable=quiet + transient = True, + disable = quiet == 2 ) -def harpy_pulsebar(quiet, desc_text): +def harpy_pulsebar(quiet: int, desc_text: str) -> Progress: """ The pre-configured transient pulsing progress bar that workflows use, typically for installing the software dependencies/container @@ -38,16 +40,16 @@ def harpy_pulsebar(quiet, desc_text): TextColumn("[progress.description]{task.description}"), BarColumn(bar_width= 70 - len(desc_text), pulse_style = "grey46"), TimeElapsedColumn(), - transient=True, - disable=quiet + transient = True, + disable = quiet == 2 ) -def symlink(original, destination): +def symlink(original: str, destination: str) -> None: """Create a symbolic link from original -> destination if the destination doesn't already exist.""" if not (Path(destination).is_symlink() or Path(destination).exists()): Path(destination).symlink_to(Path(original).resolve()) -def fetch_script(workdir, target): +def fetch_script(workdir: str, target: str) -> None: """ Retrieve the target harpy script and write it into workdir/scripts """ @@ -60,7 +62,7 @@ def fetch_script(workdir, target): print_solution("There may be an issue with your Harpy installation, which would require reinstalling Harpy. 
Alternatively, there may be in a issue with your conda/mamba environment or configuration.") sys.exit(1) -def fetch_rule(workdir, target): +def fetch_rule(workdir: str, target: str) -> None: """ Retrieve the target harpy rule and write it into the workdir """ @@ -73,7 +75,7 @@ def fetch_rule(workdir, target): print_solution("There may be an issue with your Harpy installation, which would require reinstalling Harpy. Alternatively, there may be in a issue with your conda/mamba environment or configuration.") sys.exit(1) -def fetch_report(workdir, target): +def fetch_report(workdir: str, target: str) -> None: """ Retrieve the target harpy report and write it into workdir/report """ @@ -114,7 +116,7 @@ def fetch_report(workdir, target): print_solution("There may be an issue with your internet connection or Harpy installation, that latter of which would require reinstalling Harpy. Alternatively, there may be in a issue with your conda/mamba environment or configuration.") sys.exit(1) -def snakemake_log(outdir, workflow): +def snakemake_log(outdir: str, workflow: str) -> str: """Return a snakemake logfile name. Iterates logfile run number if one exists.""" attempts = glob.glob(f"{outdir}/logs/snakemake/*.log*") if not attempts: @@ -122,14 +124,14 @@ def snakemake_log(outdir, workflow): increment = sorted([int(i.split(".")[1]) for i in attempts])[-1] + 1 return f"{outdir}/logs/snakemake/{workflow}.{increment}." + datetime.now().strftime("%d_%m_%Y") + ".log" -def gzip_file(infile): +def gzip_file(infile: str) -> None: """gzip a file and delete the original, using only python""" if os.path.exists(infile): with open(infile, 'rb') as f_in, gzip.open(infile + '.gz', 'wb', 6) as f_out: shutil.copyfileobj(f_in, f_out) os.remove(infile) -def safe_read(file_path): +def safe_read(file_path: str): """returns the proper file opener for reading if a file_path is gzipped""" try: with gzip.open(file_path, 'rt') as f: diff --git a/harpy/_parsers.py b/harpy/_parsers.py index b23feb03..fe6eae1c 100644 --- a/harpy/_parsers.py +++ b/harpy/_parsers.py @@ -4,12 +4,13 @@ import os import sys import subprocess +from typing import Tuple from pathlib import Path from rich.markdown import Markdown import rich_click as click from ._printing import print_error, print_solution_with_culprits -def getnames(directory, ext): +def getnames(directory: str, ext: str) -> list[str]: """Find all files in 'directory' that end with 'ext'""" samplenames = set([i.split(ext)[0] for i in os.listdir(directory) if i.endswith(ext)]) if len(samplenames) < 1: @@ -17,7 +18,7 @@ def getnames(directory, ext): sys.exit(1) return samplenames -def parse_fastq_inputs(inputs): +def parse_fastq_inputs(inputs: list[str]) -> Tuple[list[str], int]: """ Parse the command line input FASTQ arguments to generate a clean list of input files. Returns the number of unique samples, i.e. forward and reverse reads for one sample = 1 sample. @@ -65,7 +66,7 @@ def parse_fastq_inputs(inputs): # return the filenames and # of unique samplenames return infiles, n -def parse_alignment_inputs(inputs): +def parse_alignment_inputs(inputs:list[str]) -> Tuple[list[str], int]: """ Parse the command line input sam/bam arguments to generate a clean list of input files and return the number of unique samples. 
@@ -117,7 +118,7 @@ def parse_alignment_inputs(inputs): sys.exit(1) return bam_infiles, len(uniqs) -def biallelic_contigs(vcf, workdir): +def biallelic_contigs(vcf: str, workdir: str) -> Tuple[str,int]: """Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic""" vbn = os.path.basename(vcf) os.makedirs(f"{workdir}/", exist_ok = True) diff --git a/harpy/_printing.py b/harpy/_printing.py index 4020704d..9149c042 100644 --- a/harpy/_printing.py +++ b/harpy/_printing.py @@ -9,7 +9,7 @@ console = Console(stderr=True) -def print_error(errortitle, errortext): +def print_error(errortitle: str, errortext: str) -> None: """Print a yellow panel with error text""" rprint( Panel( @@ -22,7 +22,7 @@ def print_error(errortitle, errortext): file = sys.stderr ) -def print_solution(solutiontext): +def print_solution(solutiontext: str) -> None: """Print a blue panel with solution text""" rprint( Panel(solutiontext, @@ -34,7 +34,7 @@ def print_solution(solutiontext): file = sys.stderr ) -def print_solution_with_culprits(solutiontext, culprittext): +def print_solution_with_culprits(solutiontext: str, culprittext: str) -> None: """Print a blue panel with solution text and culprittext as the subtitle to introducethe list of offenders below it.""" rprint( Panel(solutiontext, @@ -47,7 +47,7 @@ def print_solution_with_culprits(solutiontext, culprittext): file = sys.stderr ) -def print_notice(noticetext): +def print_notice(noticetext: str) -> None: """Print a white panel with information text text""" rprint( Panel( @@ -60,13 +60,13 @@ def print_notice(noticetext): file = sys.stderr ) -def print_onstart(text, title): +def print_onstart(text: str, title: str) -> None: """Print a panel of info on workflow run""" rprint("") console.rule(f"[bold]harpy {title}", style = "light_steel_blue") console.print(text) -def print_setup_error(exitcode): +def print_setup_error(exitcode: int) -> None: """Print a red panel with snakefile or conda/singularity error text""" if exitcode == 1: errortext = "Something is wrong with the Snakefile for this workflow. If you manually edited the Snakefile, see the error below for troubleshooting. If you didn't, it's probably a bug (oops!) and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues" @@ -78,7 +78,7 @@ def print_setup_error(exitcode): console.print(errortext) console.rule("[bold]Error Reported by Snakemake", style = "red") -def print_onsuccess(outdir, summary = None, time = None): +def print_onsuccess(outdir: str, summary = None, time = None) -> None: """Print a green panel with success text. To be used in place of onsuccess: inside a snakefile""" days = time.days seconds = time.seconds @@ -95,7 +95,7 @@ def print_onsuccess(outdir, summary = None, time = None): console.rule("[bold]Workflow Finished!", style="green") console.print(datatable) -def print_onerror(logfile): +def print_onerror(logfile: str) -> None: """Print a red panel with error text. To be used in place of onerror: inside a snakefile. Expects the erroring rule printed after it.""" console.rule("[bold]Workflow Error", style = "red") console.print(f"The workflow stopped because of an error. 
Full workflow log:\n[bold]{logfile}[/bold]") diff --git a/harpy/_validations.py b/harpy/_validations.py index c53ffe40..2087470a 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -21,7 +21,7 @@ except NameError: _in_ipython_session = False -def is_gzip(file_path): +def is_gzip(file_path: str) -> bool: """helper function to determine if a file is gzipped""" try: with gzip.open(file_path, 'rt') as f: @@ -30,7 +30,7 @@ def is_gzip(file_path): except gzip.BadGzipFile: return False -def is_plaintext(file_path): +def is_plaintext(file_path: str) -> bool: """helper function to determine if a file is plaintext""" try: with open(file_path, 'r') as f: @@ -39,7 +39,7 @@ def is_plaintext(file_path): except UnicodeDecodeError: return False -def check_impute_params(parameters): +def check_impute_params(parameters: str) -> dict: """Validate the STITCH parameter file for column names, order, types, missing values, etc.""" with open(parameters, "r", encoding="utf-8") as paramfile: header = paramfile.readline().rstrip().lower() @@ -140,7 +140,7 @@ def check_impute_params(parameters): sys.exit(1) return data -def validate_bam_RG(bamlist, threads, quiet): +def validate_bam_RG(bamlist: str, threads: int, quiet: int) -> None: """Validate BAM files bamlist to make sure the sample name inferred from the file matches the @RG tag within the file""" culpritfiles = [] culpritIDs = [] @@ -170,7 +170,7 @@ def check_RG(bamfile): click.echo(f"{i}\t{j}", file = sys.stderr) sys.exit(1) -def check_phase_vcf(infile): +def check_phase_vcf(infile: str) -> None: """Check to see if the input VCf file is phased or not, govered by the presence of ID=PS or ID=HP tags""" vcfheader = subprocess.run(f"bcftools view -h {infile}".split(), stdout = subprocess.PIPE, check = False).stdout.decode('utf-8') if ("##FORMAT= None: """Validate the input population file to make sure there are two entries per row""" with open(infile, "r", encoding="utf-8") as f: rows = [i for i in f.readlines() if i != "\n" and not i.lstrip().startswith("#")] @@ -195,7 +195,7 @@ def validate_popfile(infile): _ = [click.echo(f"{i[0]+1}\t{i[1]}", file = sys.stderr) for i in invalids] sys.exit(1) -def vcf_sample_match(vcf, bamlist, vcf_samples): +def vcf_sample_match(vcf: str, bamlist: list[str], vcf_samples: bool) -> list[str]: """Validate that the input VCF file and the samples in the list of BAM files. 
The directionality of this check is determined by 'vcf_samples', which prioritizes the sample list in the vcf file, rather than bamlist.""" bcfquery = subprocess.Popen(["bcftools", "query", "-l", vcf], stdout=subprocess.PIPE) vcfsamples = bcfquery.stdout.read().decode().split() @@ -219,7 +219,7 @@ def vcf_sample_match(vcf, bamlist, vcf_samples): sys.exit(1) return(query) -def vcf_contig_match(contigs, vcf): +def vcf_contig_match(contigs: list[str], vcf: str) -> None: vcf_contigs = [] vcf_header = subprocess.run(["bcftools", "head", vcf], capture_output = True, text = True) for i in vcf_header.stdout.split("\n"): @@ -237,14 +237,14 @@ def vcf_contig_match(contigs, vcf): click.echo(",".join([i for i in bad_names]), file = sys.stderr) sys.exit(1) -def validate_popsamples(infiles, popfile, quiet): +def validate_popsamples(infiles: list[str], popfile: str, quiet: int) -> None: """Validate the presence of samples listed in 'populations' to be in the input files""" with open(popfile, "r", encoding="utf-8") as f: popsamples = [i.split()[0] for i in f.readlines() if i != "\n" and not i.lstrip().startswith("#")] in_samples = [Path(i).stem for i in infiles] missing_samples = [x for x in popsamples if x not in in_samples] overlooked = [x for x in in_samples if x not in popsamples] - if len(overlooked) > 0 and not quiet: + if len(overlooked) > 0 and not quiet != 2: print_notice(f"There are [bold]{len(overlooked)}[/bold] samples found in the inputs that weren\'t included in [blue]{popfile}[/blue]. This will [bold]not[/bold] cause errors and can be ignored if it was deliberate. Commenting or removing these lines will avoid this message. The samples are:\n" + ", ".join(overlooked)) if len(missing_samples) > 0: print_error("mismatched inputs", f"There are [bold]{len(missing_samples)}[/bold] samples included in [blue]{popfile}[/blue] that weren\'t found in in the inputs. Terminating Harpy to avoid downstream errors.") @@ -255,10 +255,11 @@ def validate_popsamples(infiles, popfile, quiet): click.echo(", ".join(sorted(missing_samples)), file = sys.stderr) sys.exit(1) -def validate_demuxschema(infile): - """Validate the file format of the demultiplex schema""" +def validate_demuxschema(infile:str, return_len: bool = False) -> None | int: + """Validate the file format of the demultiplex schema. Set return_len to True to return the number of samples""" code_letters = set() #codes can be Axx, Bxx, Cxx, Dxx segment_ids = set() + samples = set() segment_pattern = re.compile(r'^[A-D]\d{2}$') with open(infile, 'r') as file: for line in file: @@ -269,6 +270,7 @@ def validate_demuxschema(infile): print_solution("This haplotagging design expects segments to follow the format of letter [green bold]A-D[/green bold] followed by [bold]two digits[/bold], e.g. [green bold]C51[/green bold]). 
Check that your ID segments or formatted correctly and that you are attempting to demultiplex with a workflow appropriate for your data design.") sys.exit(1) code_letters.add(segment_id[0]) + samples.add(sample) if segment_id in segment_ids: print_error("ambiguous segment ID", "An ID segment must only be associated with a single sample.") print_solution_with_culprits( @@ -293,22 +295,22 @@ def validate_demuxschema(infile): ) click.echo(", ".join(code_letters)) sys.exit(1) + if return_len: + return len(samples) -def validate_regions(regioninput, genome): +def validate_regions(regioninput: int | str, genome: str) -> str: """validates the --regions input of harpy snp to infer whether it's an integer, region, or file""" try: # is an int region = int(regioninput) - except: + if region < 10: + print_error("window size too small", "Input for [green bold]--regions[/green bold] was interpreted as an integer to create equal windows to call variants. Integer input for [green bold]--regions[/green bold] must be greater than or equal to [green bold]10[/green bold].") + sys.exit(1) + else: + return "windows" + except ValueError: region = regioninput - if isinstance(region, int) and region < 10: - print_error("window size too small", "Input for [green bold]--regions[/green bold] was interpreted as an integer to create equal windows to call variants. Integer input for [green bold]--regions[/green bold] must be greater than or equal to [green bold]10[/green bold].") - sys.exit(1) - elif isinstance(region, int) and region >= 10: - return "windows" - else: - pass - # is a string + # is a string reg = re.split(r"[\:-]", regioninput) if len(reg) == 3: # is a single region, check the types to be [str, int, int] @@ -375,7 +377,7 @@ def validate_regions(regioninput, genome): sys.exit(1) return "file" -def check_fasta(genofile): +def check_fasta(genofile: str) -> str: """perform validations on fasta file for extensions and file contents""" # validate fasta file contents line_num = 0 @@ -420,7 +422,7 @@ def check_fasta(genofile): print_solution(solutiontext + "\nSee the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/") sys.exit(1) -def fasta_contig_match(contigs, fasta): +def fasta_contig_match(contigs: str, fasta: str) -> None: """Checks whether a list of contigs are present in a fasta file""" valid_contigs = [] with safe_read(fasta) as gen_open: @@ -440,7 +442,7 @@ def fasta_contig_match(contigs, fasta): click.echo(",".join([i for i in bad_names]), file = sys.stderr) sys.exit(1) -def validate_fastq_bx(fastq_list, threads, quiet): +def validate_fastq_bx(fastq_list: list[str], threads: int, quiet: int) -> None: """ Parse a list of fastq files to verify that they have BX/BC tag, and only one of those two types per file """ @@ -470,7 +472,7 @@ def validate(fastq): for future in as_completed(futures): progress.update(task_progress, advance=1, refresh = _in_ipython_session) -def validate_barcodefile(infile, return_len = False, quiet = False, limit = 140): +def validate_barcodefile(infile: str, return_len: bool = False, quiet: int = 0, limit: int = 60) -> None | int: """Does validations to make sure it's one length, within a length limit, one per line, and nucleotides""" if is_gzip(infile): print_error("incorrect format", f"The input file must be in uncompressed format. 
Please decompress [blue]{infile}[/blue] and try again.") diff --git a/harpy/align.py b/harpy/align.py index c21c5226..fe5dfcb5 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -7,7 +7,7 @@ import rich_click as click from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types_generic import ContigList, InputFile, HPCProfile, SnakemakeParams +from ._cli_types_generic import convert_to_int, ContigList, InputFile, HPCProfile, SnakemakeParams from ._cli_types_params import BwaParams, EmaParams, StrobeAlignParams from ._launch import launch_snakemake, SNAKEMAKE_CMD from ._parsers import parse_fastq_inputs @@ -70,13 +70,13 @@ def align(): @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/bwa", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--ignore-bx', is_flag = True, default = False, help = 'Ignore parts of the workflow specific to linked-read sequences') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) @@ -160,13 +160,13 @@ def bwa(inputs, output_dir, genome, depth_window, ignore_bx, threads, keep_unmap @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True, help = 'Output directory name') @click.option('-p', '--platform', type = click.Choice(['haplotag', '10x'], case_sensitive=False), default = "haplotag", show_default=True, help = "Linked read bead technology\n[haplotag, 10x]") -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('-l', '--barcode-list', type = click.Path(exists=True, dir_okay=False), help = "File of known barcodes for 10x linked reads") @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) @@ -269,13 +269,13 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/strobealign", show_default=True, help = 'Output directory name') @click.option('-l', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--ignore-bx', is_flag = True, default = False, help = 'Ignore parts of the workflow specific to linked-read sequences') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) diff --git a/harpy/assembly.py b/harpy/assembly.py index 094e7c6d..92647c2d 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -4,7 +4,7 @@ import sys import yaml import rich_click as click -from ._cli_types_generic import KParam, HPCProfile, SnakemakeParams +from ._cli_types_generic import convert_to_int, KParam, HPCProfile, SnakemakeParams from ._cli_types_params import SpadesParams, ArcsParams from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD @@ -48,11 +48,11 @@ @click.option("-s", "--span", type = int, default = 20, show_default = True, help = "Minimum number of spanning molecules to be considered assembled") # Other Options @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Assembly", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use') @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]") @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppresses all output')
 @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
 @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
 @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes')
diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py
index d0296584..54dc3536 100644
--- a/harpy/deconvolve.py
+++ b/harpy/deconvolve.py
@@ -4,7 +4,7 @@
 import sys
 import yaml
 import rich_click as click
-from ._cli_types_generic import HPCProfile, SnakemakeParams
+from ._cli_types_generic import convert_to_int, HPCProfile, SnakemakeParams
 from ._conda import create_conda_recipes
 from ._launch import launch_snakemake, SNAKEMAKE_CMD
 from ._misc import fetch_rule, snakemake_log
@@ -29,17 +29,17 @@
 @click.option('-w', '--window-size', default = 40, show_default = True, type=int, help = 'Size of window guaranteed to contain at least one kmer')
 @click.option('-d', '--density', default = 3, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'On average, 1/2^d kmers are indexed')
 @click.option('-a', '--dropout', default = 0, show_default = True, type = click.IntRange(min = 0, max_open = True), help = 'Minimum cloud size to deconvolve')
-@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use')
+@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Deconvolve", show_default=True, help = 'Output directory name')
 @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda')
 @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit')
 @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file')
-@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
+@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppresses all output')
 @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes')
 @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1)
 def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, threads, snakemake, quiet, hpc, container, setup_only):
     """
-    Resolve clashing barcodes from different molecules
+    Resolve shared barcodes between unrelated molecules

     Provide the input fastq files and/or directories at the end of the command as individual
     files/folders, using shell wildcards (e.g. `data/acronotus*.fq`), or both.
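[Editor's note, not part of the patch] The change repeated across these modules converts `--quiet` from a boolean flag into a three-level verbosity setting (`0` = full output, `1` = only an overall progress bar, `2` = silent), with the `convert_to_int` callback added in `harpy/_cli_types_generic.py` coercing the string choice to an integer before the command function sees it. A minimal standalone sketch of that pattern is below; the `demo` command name is hypothetical and only the option wiring mirrors the diff:

```python
import rich_click as click

def convert_to_int(ctx, param, value):
    # Mirrors the callback in harpy/_cli_types_generic.py: cast the string choice to an int
    if value is None:
        return None
    return int(value)

@click.command()
@click.option('--quiet', show_default = True, default = "0",
              type = click.Choice(["0", "1", "2"]), callback = convert_to_int,
              help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppresses all output')
def demo(quiet):
    # Hypothetical command: downstream Harpy code compares integers, e.g. `if quiet < 2:` in _launch.py
    click.echo(f"quiet level: {quiet} ({type(quiet).__name__})")

if __name__ == "__main__":
    demo()
```

Invoking this sketch with `--quiet 1` prints `quiet level: 1 (int)`, which is what allows the integer comparisons (`quiet < 2`, `quiet == 1`) used in `_launch.py` and `_misc.py`.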
diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py
index 1a3147cc..da6076d5 100644
--- a/harpy/demultiplex.py
+++ b/harpy/demultiplex.py
@@ -5,7 +5,7 @@
 import yaml
 from pathlib import Path
 import rich_click as click
-from ._cli_types_generic import HPCProfile, SnakemakeParams
+from ._cli_types_generic import convert_to_int, HPCProfile, SnakemakeParams
 from ._conda import create_conda_recipes
 from ._launch import launch_snakemake, SNAKEMAKE_CMD
 from ._misc import fetch_rule, fetch_script, snakemake_log
@@ -41,12 +41,12 @@ def demultiplex():
 @click.option('-u', '--keep-unknown', is_flag = True, default = False, help = 'Keep reads that could not be demultiplexed')
 @click.option('-q', '--qx-rx', is_flag = True, default = False, help = 'Include the `QX:Z` and `RX:Z` tags in the read header')
 @click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'File of `sample`\\`barcode`')
-@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max_open = True), help = 'Number of threads to use')
+@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max = 999), help = 'Number of threads to use')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name')
 @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda')
 @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
 @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file')
-@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
+@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output.
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('R1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @@ -74,7 +74,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, keep_unknown, schema, qx_rx, th if snakemake: command += f" {snakemake}" - validate_demuxschema(schema) + validate_demuxschema(schema, return_len = False) os.makedirs(f"{workflowdir}", exist_ok=True) fetch_rule(workflowdir, "demultiplex_gen1.smk") fetch_script(workflowdir, "demultiplex_gen1.py") diff --git a/harpy/diagnose.py b/harpy/diagnose.py new file mode 100644 index 00000000..612cc7f4 --- /dev/null +++ b/harpy/diagnose.py @@ -0,0 +1,52 @@ +import os +import sys +import yaml +import subprocess +import rich_click as click +from rich.console import Console +from rich import print as rprint +from ._printing import print_error + +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False)) +@click.argument('directory', required=True, type=click.Path(exists=True, file_okay=False)) +def diagnose(directory): + """ + Run the Snakemake debugger to identify hang-ups + """ + directory = directory.rstrip("/") + if not os.path.exists(f"{directory}/workflow/config.yaml"): + print_error("missing config file", f"Target directory [blue]{directory}[/blue] does not contain the file [bold]workflow/config.yaml[/bold]") + sys.exit(1) + with open(f"{directory}/workflow/config.yaml", 'r', encoding="utf-8") as f: + harpy_config = yaml.full_load(f) + + command = harpy_config["workflow_call"] + " --dry-run --debug-dag" + console = Console(stderr=True) + console.rule("[bold]Diagnosing Snakemake Job Graph", style = "green") + try: + process = subprocess.Popen( + command.split(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding='utf-8' + ) + while True: + output = process.stdout.readline() + error = process.stderr.readline() + if not output and not error and process.poll() is not None: + break + if error: + rprint(f"[red]{error}", end="", file=sys.stderr) + if output: + if output.startswith("This was a dry-run"): + process.terminate() + exit(0) + else: + rprint(f"[yellow]{output}", end="", file=sys.stderr) + except: + console.print("") + console.rule("[bold]Terminating Snakemake", style = "yellow") + process.terminate() + process.wait() + sys.exit(1) \ No newline at end of file diff --git a/harpy/downsample.py b/harpy/downsample.py index 667b386e..757866d3 100644 --- a/harpy/downsample.py +++ b/harpy/downsample.py @@ -6,7 +6,7 @@ import yaml from pathlib import Path import rich_click as click -from ._cli_types_generic import SnakemakeParams +from ._cli_types_generic import convert_to_int, SnakemakeParams from ._launch import launch_snakemake, SNAKEMAKE_CMD from ._misc import fetch_rule, snakemake_log from ._printing import workflow_info @@ -29,8 +29,8 @@ @click.option('-p', '--prefix', type = click.Path(exists = False), default = "downsampled", show_default = True, help = 'Prefix for output file(s)') @click.option('--random-seed', type = click.IntRange(min = 1), help = "Random seed for sampling") @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Downsample", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 
4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('input', required=True, type=click.Path(exists=True, readable=True, dir_okay=False), nargs=-1) diff --git a/harpy/impute.py b/harpy/impute.py index d137aa39..d1577cbf 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -5,7 +5,7 @@ import yaml from pathlib import Path import rich_click as click -from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import convert_to_int, HPCProfile, InputFile, SnakemakeParams from ._cli_types_params import StitchParams from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD @@ -31,12 +31,12 @@ @click.option('-x', '--extra-params', type = StitchParams(), help = 'Additional STITCH parameters, in quotes') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Impute", show_default=True, help = 'Output directory name') @click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'STITCH parameter file (tab-delimited)') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('-v', '--vcf', required = True, type = InputFile("vcf", gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for imputation rather than those found in the inputs') diff --git a/harpy/metassembly.py b/harpy/metassembly.py index 7112ddd4..9102b75a 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -7,7 +7,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD from ._misc import fetch_rule, snakemake_log -from ._cli_types_generic import HPCProfile, KParam, SnakemakeParams +from ._cli_types_generic import convert_to_int, HPCProfile, KParam, SnakemakeParams from ._cli_types_params import SpadesParams from ._printing import workflow_info from ._validations import validate_fastq_bx @@ -34,11 +34,11 @@ @click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes') # Common Workflow @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use') @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]") @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output.
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/phase.py b/harpy/phase.py index 94c47c76..f1ca316c 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -7,7 +7,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import convert_to_int, ContigList, HPCProfile, InputFile, SnakemakeParams from ._cli_types_params import HapCutParams from ._parsers import parse_alignment_inputs from ._printing import workflow_info @@ -33,13 +33,13 @@ @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Phase", show_default=True, help = 'Output directory name') @click.option('-p', '--prune-threshold', default = 7, show_default = True, type = click.IntRange(0,100), help = 'PHRED-scale threshold (%) for pruning low-confidence SNPs (larger prunes more.)') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max = 999), help = 'Number of threads to use') @click.option('-v', '--vcf', required = True, type = InputFile("vcf", gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for phasing rather than those found in the inputs') diff --git a/harpy/preflight.py b/harpy/preflight.py index 5a51c245..5290a3e5 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -4,7 +4,7 @@ import sys import yaml import rich_click as click -from ._cli_types_generic import HPCProfile, SnakemakeParams +from ._cli_types_generic import convert_to_int, HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD from ._misc import fetch_rule, fetch_report, snakemake_log @@ -37,8 +37,8 @@ def preflight(): } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/preflight/") -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/bam", show_default=True, help = 'Output directory name') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @@ -99,11 +99,11 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, container, setup_onl @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/preflight/") @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/fastq", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output.
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1) def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, container, setup_only): diff --git a/harpy/qc.py b/harpy/qc.py index 79f0d251..b72a679f 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -7,7 +7,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types_generic import HPCProfile, IntList, SnakemakeParams +from ._cli_types_generic import convert_to_int, HPCProfile, IntList, SnakemakeParams from ._cli_types_params import FastpParams from ._parsers import parse_fastq_inputs from ._printing import workflow_info @@ -33,13 +33,13 @@ @click.option('-m', '--max-length', default = 150, show_default = True, type=int, help = 'Maximum length to trim sequences down to') @click.option('-n', '--min-length', default = 30, show_default = True, type=int, help = 'Discard reads shorter than this length') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "QC", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('-a', '--trim-adapters', type = str, help = 'Detect and trim adapters') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--ignore-bx', is_flag = True, default = False, help = 'Ignore parts of the workflow specific to linked-read sequences') -@click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.option('--skip-reports', is_flag = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) diff --git a/harpy/resume.py b/harpy/resume.py index 5bd42448..a87ca496 100644 --- a/harpy/resume.py +++ b/harpy/resume.py @@ -6,6 +6,7 @@ import yaml import rich_click as click from ._conda import check_environments +from ._cli_types_generic import convert_to_int from ._printing import print_error, workflow_info from ._launch import launch_snakemake from ._misc import snakemake_log @@ -13,8 +14,8 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/other") @click.option('-c', '--conda', is_flag = True, default = False, help = 'Recreate the conda environments') -@click.option('-t', '--threads', type = click.IntRange(min = 2, max_open = True), help = 'Change the number of threads (>1)') -@click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') +@click.option('-t', '--threads', type = click.IntRange(min = 2, max = 999), help = 'Change the number of threads (>1)') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.argument('directory', required=True, type=click.Path(exists=True, file_okay=False, readable=True), nargs=1) def resume(directory, conda, threads, quiet): """ diff --git a/harpy/simulate_linkedreads.py b/harpy/simulate_linkedreads.py new file mode 100644 index 00000000..433a24c6 --- /dev/null +++ b/harpy/simulate_linkedreads.py @@ -0,0 +1,114 @@ +"""Harpy workflow to simulate linked-reads from a genome""" +import os +import sys +import yaml +from pathlib import Path +import rich_click as click +from ._cli_types_generic import convert_to_int, HPCProfile, InputFile, SnakemakeParams +from ._conda import create_conda_recipes +from ._launch import launch_snakemake, SNAKEMAKE_CMD +from ._misc import fetch_rule, fetch_script, snakemake_log +from ._printing import workflow_info +from ._validations import check_fasta, validate_barcodefile + +docstring = { + "harpy simulate linkedreads": [ + { + "name": "Parameters", + "options": ["--barcodes", "--distance-sd", "--outer-distance", "--molecule-length", "--molecules-per", "--mutation-rate", "--partitions", "--read-pairs"] + }, + { + "name": "Workflow Controls", + "options": ["--container", "--hpc", "--output-dir", "--quiet", "--snakemake", "--threads", "--help"] + } + ] +} + +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") +@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads") +@click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") +@click.option('-m', '--molecules-per', type = click.IntRange(min = 1, max = 4700), default = 10, show_default=True, help = "Average number of
molecules per partition") +@click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True, help = "Mean molecule length (kbp)") +@click.option('-r', '--mutation-rate', type = click.FloatRange(min = 0), default=0.001, show_default=True, help = "Random mutation rate for simulating reads") +@click.option('-d', '--outer-distance', type = click.IntRange(min = 100), default = 350, show_default= True, help = "Outer distance between paired-end reads (bp)") +@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Simulate/linkedreads", help = 'Output directory name') +@click.option('-p', '--partitions', type = click.IntRange(min = 1), default=1500, show_default=True, help = "Number (in thousands) of partitions/beads to generate") +@click.option('-n', '--read-pairs', type = click.FloatRange(min = 0.001), default = 600, show_default=True, help = "Number (in millions) of read pairs to simulate") +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max = 999), help = 'Number of threads to use') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') +@click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppressess all output') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.argument('genome_hap1', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) +@click.argument('genome_hap2', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) +def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_rate, distance_sd, barcodes, read_pairs, molecule_length, partitions, molecules_per, threads, snakemake, quiet, hpc, container, setup_only): + """ + Create linked reads from a genome + + Two haplotype genomes (un/compressed fasta) need to be provided as inputs at the end of the command. If + you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/blog/simulate_diploid/). + + If not providing a file for `--barcodes`, Harpy will generate a file containing the original + (96^4) set of 24-basepair haplotagging barcodes (~2GB disk space). The `--barcodes` file is expected to have one + linked-read barcode per line, given as nucleotides. 
+ """ + output_dir = output_dir.rstrip("/") + workflowdir = os.path.join(output_dir, 'workflow') + sdm = "conda" if not container else "conda apptainer" + command = f'{SNAKEMAKE_CMD} --software-deployment-method {sdm} --cores {threads}' + command += f" --snakefile {workflowdir}/simulate_linkedreads.smk" + command += f" --configfile {workflowdir}/config.yaml" + if hpc: + command += f" --workflow-profile {hpc}" + if snakemake: + command += f" {snakemake}" + + check_fasta(genome_hap1) + check_fasta(genome_hap2) + if barcodes: + bc_len = validate_barcodefile(barcodes, True, quiet) + os.makedirs(f"{workflowdir}/", exist_ok= True) + fetch_rule(workflowdir, "simulate_linkedreads.smk") + fetch_script(workflowdir, "HaploSim.pl") + os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) + sm_log = snakemake_log(output_dir, "simulate_linkedreads") + conda_envs = ["simulations"] + configs = { + "workflow" : "simulate linkedreads", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "outer_distance" : outer_distance, + "distance_sd" : distance_sd, + "read_pairs" : read_pairs, + "mutation_rate" : mutation_rate, + "molecule_length" : molecule_length, + "partitions" : partitions, + "molecules_per_partition" : molecules_per, + "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, + 'barcodes': { + "file": Path(barcodes).resolve().as_posix() if barcodes else f"{workflowdir}/input/haplotag_barcodes.txt", + "length": bc_len if barcodes else 24 + }, + "inputs" : { + "genome_hap1" : Path(genome_hap1).resolve().as_posix(), + "genome_hap2" : Path(genome_hap2).resolve().as_posix(), + } + } + with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: + yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) + + create_conda_recipes(output_dir, conda_envs) + if setup_only: + sys.exit(0) + + start_text = workflow_info( + ("Genome Haplotype 1:", os.path.basename(genome_hap1)), + ("Genome Haplotype 2:", os.path.basename(genome_hap2)), + ("Barcodes:", os.path.basename(barcodes) if barcodes else "Haplotagging Default"), + ("Output Folder:", output_dir + "/"), + ("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") + ) + launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") + diff --git a/harpy/simulate.py b/harpy/simulate_variants.py similarity index 78% rename from harpy/simulate.py rename to harpy/simulate_variants.py index 426608ef..38216636 100644 --- a/harpy/simulate.py +++ b/harpy/simulate_variants.py @@ -4,211 +4,97 @@ import yaml from pathlib import Path import rich_click as click -from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import convert_to_int, HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD -from ._misc import fetch_rule, fetch_script, snakemake_log +from ._misc import fetch_rule, snakemake_log from ._printing import print_error, workflow_info -from ._validations import check_fasta, validate_barcodefile - -@click.group(options_metavar='', context_settings={"help_option_names" : ["-h", "--help"]}) -def simulate(): - """ - Simulate variants or linked-reads from a genome - - To simulate genomic variants, provide an additional subcommand {`snpindel`,`inversion`,`cnv`,`translocation`} - to get more information about that workflow. 
The variant simulator (`simuG`) can only simulate - one type of variant at a time, so you may need to run it a few times if you want multiple variant types. - Use `simulate linkedreads` to simulate haplotag linked-reads from a diploid genome, which you can create by simulating - genomic variants. - """ +from ._validations import check_fasta commandstring = { "harpy simulate": [ - { - "name": "Linked Read Sequences", - "commands": ["linkedreads"], - }, { "name": "Genomic Variants", - "commands": ["cnv", "inversion", "snpindel", "translocation"], + "commands": ["cnv", "inversion", "snpindel", "translocation"] } ] } docstring = { - "harpy simulate linkedreads": [ - { - "name": "Parameters", - "options": ["--barcodes", "--distance-sd", "--outer-distance", "--molecule-length", "--molecules-per", "--mutation-rate", "--partitions", "--read-pairs"], - }, - { - "name": "Workflow Controls", - "options": ["--container", "--hpc", "--output-dir", "--quiet", "--snakemake", "--threads", "--help"], - }, - ], "harpy simulate snpindel": [ { "name": "Known Variants", - "options": ["--indel-vcf", "--snp-vcf"], + "options": ["--indel-vcf", "--snp-vcf"] }, { "name": "Random Variants", - "options": ["--centromeres", "--exclude-chr", "--genes", "--indel-count", "--indel-ratio", "--snp-count", "--snp-gene-constraints", "--titv-ratio"], + "options": ["--centromeres", "--exclude-chr", "--genes", "--indel-count", "--indel-ratio", "--snp-count", "--snp-gene-constraints", "--titv-ratio"] }, { "name": "Diploid Options", - "options": ["--heterozygosity", "--only-vcf"], + "options": ["--heterozygosity", "--only-vcf"] }, { "name": "Workflow Controls", - "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"], + "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"] }, ], "harpy simulate inversion": [ { "name": "Known Variants", - "options": ["--vcf"], + "options": ["--vcf"] }, { "name": "Random Variants", - "options": ["--centromeres", "--count", "--exclude-chr", "--genes", "--max-size", "--min-size"], + "options": ["--centromeres", "--count", "--exclude-chr", "--genes", "--max-size", "--min-size"] }, { "name": "Diploid Options", - "options": ["--heterozygosity", "--only-vcf"], + "options": ["--heterozygosity", "--only-vcf"] }, { "name": "Workflow Controls", - "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"], + "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"] }, ], "harpy simulate cnv": [ { "name": "Known Variants", - "options": ["--vcf"], + "options": ["--vcf"] }, { "name": "Random Variants", - "options": ["--centromeres", "--count", "--dup-ratio", "--exclude-chr", "--gain-ratio", "--genes", "--max-copy", "--max-size", "--min-size"], + "options": ["--centromeres", "--count", "--dup-ratio", "--exclude-chr", "--gain-ratio", "--genes", "--max-copy", "--max-size", "--min-size"] }, { "name": "Diploid Options", - "options": ["--heterozygosity", "--only-vcf"], + "options": ["--heterozygosity", "--only-vcf"] }, { "name": "Workflow Controls", - "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"], + "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"] }, ], "harpy simulate translocation": [ { "name": "Known Variants", - "options": ["--vcf"], + 
"options": ["--vcf"] }, { "name": "Random Variants", - "options": ["--centromeres", "--count", "--exclude-chr", "--genes"], + "options": ["--centromeres", "--count", "--exclude-chr", "--genes"] }, { "name": "Diploid Options", - "options": ["--heterozygosity", "--only-vcf"], + "options": ["--heterozygosity", "--only-vcf"] }, { "name": "Workflow Controls", - "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"], + "options": ["--container", "--hpc", "--output-dir", "--prefix", "--quiet", "--random-seed", "--snakemake", "--help"] }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") -@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads") -@click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") -@click.option('-m', '--molecules-per', type = click.IntRange(min = 1, max = 4700), default = 10, show_default=True, help = "Average number of molecules per partition") -@click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True, help = "Mean molecule length (kbp)") -@click.option('-r', '--mutation-rate', type = click.FloatRange(min = 0), default=0.001, show_default=True, help = "Random mutation rate for simulating reads") -@click.option('-d', '--outer-distance', type = click.IntRange(min = 100), default = 350, show_default= True, help = "Outer distance between paired-end reads (bp)") -@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Simulate/linkedreads", help = 'Output directory name') -@click.option('-p', '--partitions', type = click.IntRange(min = 1), default=1500, show_default=True, help = "Number (in thousands) of partitions/beads to generate") -@click.option('-n', '--read-pairs', type = click.FloatRange(min = 0.001), default = 600, show_default=True, help = "Number (in millions) of read pairs to simulate") -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') -@click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome_hap1', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) -@click.argument('genome_hap2', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) -def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_rate, distance_sd, barcodes, read_pairs, molecule_length, partitions, molecules_per, threads, snakemake, quiet, hpc, container, setup_only): - """ - Create linked reads from a genome - - Two haplotype genomes (un/compressed fasta) need to be provided as inputs at the end of the command. 
If - you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/blog/simulate_diploid/). - - If not providing a file for `--barcodes`, Harpy will generate a file containing the original - (96^4) set of 24-basepair haplotagging barcodes (~2GB disk space). The `--barcodes` file is expected to have one - linked-read barcode per line, given as nucleotides. - """ - output_dir = output_dir.rstrip("/") - workflowdir = os.path.join(output_dir, 'workflow') - sdm = "conda" if not container else "conda apptainer" - command = f'{SNAKEMAKE_CMD} --software-deployment-method {sdm} --cores {threads}' - command += f" --snakefile {workflowdir}/simulate_linkedreads.smk" - command += f" --configfile {workflowdir}/config.yaml" - if hpc: - command += f" --workflow-profile {hpc}" - if snakemake: - command += f" {snakemake}" - - check_fasta(genome_hap1) - check_fasta(genome_hap2) - if barcodes: - bc_len = validate_barcodefile(barcodes, True, quiet) - os.makedirs(f"{workflowdir}/", exist_ok= True) - fetch_rule(workflowdir, "simulate_linkedreads.smk") - fetch_script(workflowdir, "HaploSim.pl") - os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) - sm_log = snakemake_log(output_dir, "simulate_linkedreads") - conda_envs = ["simulations"] - configs = { - "workflow" : "simulate linkedreads", - "snakemake_log" : sm_log, - "output_directory" : output_dir, - "outer_distance" : outer_distance, - "distance_sd" : distance_sd, - "read_pairs" : read_pairs, - "mutation_rate" : mutation_rate, - "molecule_length" : molecule_length, - "partitions" : partitions, - "molecules_per_partition" : molecules_per, - "workflow_call" : command.rstrip(), - "conda_environments" : conda_envs, - 'barcodes': { - "file": Path(barcodes).resolve().as_posix() if barcodes else f"{workflowdir}/input/haplotag_barcodes.txt", - "length": bc_len if barcodes else 24 - }, - "inputs" : { - "genome_hap1" : Path(genome_hap1).resolve().as_posix(), - "genome_hap2" : Path(genome_hap2).resolve().as_posix(), - } - } - with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: - yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - - create_conda_recipes(output_dir, conda_envs) - if setup_only: - sys.exit(0) - - start_text = workflow_info( - ("Genome Haplotype 1:", os.path.basename(genome_hap1)), - ("Genome Haplotype 2:", os.path.basename(genome_hap2)), - ("Barcodes:", os.path.basename(barcodes) if barcodes else "Haplotagging Default"), - ("Output Folder:", output_dir + "/"), - ("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - ) - launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") - @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-s', '--snp-vcf', type=InputFile("vcf", gzip_ok = False), help = 'VCF file of known snps to simulate') @click.option('-i', '--indel-vcf', type=InputFile("vcf", gzip_ok = False), help = 'VCF file of known indels to simulate') @@ -229,7 +115,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, 
hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--random-seed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) @@ -352,7 +238,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--random-seed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) @@ -455,7 +341,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--random-seed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) @@ -562,7 +448,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--random-seed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) @@ -644,8 +530,3 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, ) launch_snakemake(command, "simulate_translocation", start_text, output_dir, sm_log, quiet, "workflow/simulate.translocation.summary") -simulate.add_command(linkedreads) -simulate.add_command(snpindel) -simulate.add_command(inversion) -simulate.add_command(cnv) -simulate.add_command(translocation) diff --git a/harpy/snakefiles/align_bwa.smk b/harpy/snakefiles/align_bwa.smk index dcd0aee7..de65c4cd 100644 --- a/harpy/snakefiles/align_bwa.smk +++ b/harpy/snakefiles/align_bwa.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index ed006696..545e93a1 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] nbins = config["EMA_bins"] diff --git a/harpy/snakefiles/align_strobealign.smk b/harpy/snakefiles/align_strobealign.smk index a38a79f4..f716e288 100644 --- a/harpy/snakefiles/align_strobealign.smk +++ b/harpy/snakefiles/align_strobealign.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/deconvolve.smk b/harpy/snakefiles/deconvolve.smk index 2bee9657..53a9607f 100644 --- a/harpy/snakefiles/deconvolve.smk +++ b/harpy/snakefiles/deconvolve.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" fqlist = config["inputs"] outdir = 
config["output_directory"] diff --git a/harpy/snakefiles/demultiplex_gen1.smk b/harpy/snakefiles/demultiplex_gen1.smk index 8defee1b..55a699ef 100644 --- a/harpy/snakefiles/demultiplex_gen1.smk +++ b/harpy/snakefiles/demultiplex_gen1.smk @@ -17,8 +17,9 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - FR = "[12]", - part = "\d+" + sample = r"[a-zA-Z0-9._-]+", + FR = r"[12]", + part = r"\d{3}" def parse_schema(smpl, keep_unknown): d = {} @@ -40,6 +41,7 @@ def parse_schema(smpl, keep_unknown): samples = parse_schema(samplefile, keep_unknown) samplenames = [i for i in samples] +print(samplenames) fastq_parts = [f"{i:03d}" for i in range(1, min(workflow.cores, 999) + 1)] rule barcode_segments: @@ -119,26 +121,14 @@ rule demultiplex: rule merge_partitions: input: collect(outdir + "/{{sample}}.{part}.R{{FR}}.fq", part = fastq_parts) - output: - outdir + "/{sample}.R{FR}.fq" - log: - outdir + "/logs/{sample}.{FR}.concat.log" - container: - None - shell: - "cat {input} > {output} 2> {log}" - -rule compress_fastq: - input: - outdir + "/{sample}.R{FR}.fq" output: outdir + "/{sample}.R{FR}.fq.gz" log: - outdir + "/logs/{sample}.{FR}.compress.log" + outdir + "/logs/{sample}.{FR}.concat.log" container: None shell: - "gzip {input} 2> {log}" + "cat {input} | gzip > {output} 2> {log}" rule merge_barcode_logs: input: diff --git a/harpy/snakefiles/impute.smk b/harpy/snakefiles/impute.smk index fa784fd1..fdda8746 100644 --- a/harpy/snakefiles/impute.smk +++ b/harpy/snakefiles/impute.smk @@ -9,9 +9,9 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+", - paramset = "[^/]+", - contig = "[^/]+" + sample = r"[a-zA-Z0-9._-]+", + paramset = r"[^/]+", + contig = r"[^/]+" bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index 3ef7a86f..93f9cbc3 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -12,7 +12,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" ##TODO MANUAL PRUNING OF SWITCH ERRORS # https://github.com/vibansal/HapCUT2/blob/master/outputformat.md diff --git a/harpy/snakefiles/preflight_bam.smk b/harpy/snakefiles/preflight_bam.smk index 080f8bc0..d356d034 100644 --- a/harpy/snakefiles/preflight_bam.smk +++ b/harpy/snakefiles/preflight_bam.smk @@ -12,7 +12,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/preflight_fastq.smk b/harpy/snakefiles/preflight_fastq.smk index fb1116ab..e6f0e821 100644 --- a/harpy/snakefiles/preflight_fastq.smk +++ b/harpy/snakefiles/preflight_fastq.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" fqlist = config["inputs"] outdir = config["output_directory"] diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index 3e740a1e..1e9379a8 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" fqlist = config["inputs"] outdir = config["output_directory"] diff --git a/harpy/snakefiles/simulate_linkedreads.smk 
b/harpy/snakefiles/simulate_linkedreads.smk index df240684..1e8a862b 100644 --- a/harpy/snakefiles/simulate_linkedreads.smk +++ b/harpy/snakefiles/simulate_linkedreads.smk @@ -14,7 +14,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - hap = "[01]" + hap = r"[01]" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/snp_freebayes.smk b/harpy/snakefiles/snp_freebayes.smk index 80ed7d47..8bbc730a 100644 --- a/harpy/snakefiles/snp_freebayes.smk +++ b/harpy/snakefiles/snp_freebayes.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/snp_mpileup.smk b/harpy/snakefiles/snp_mpileup.smk index a1d03adc..18d0e1ee 100644 --- a/harpy/snakefiles/snp_mpileup.smk +++ b/harpy/snakefiles/snp_mpileup.smk @@ -11,7 +11,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/sv_leviathan.smk b/harpy/snakefiles/sv_leviathan.smk index 7ae21b3c..5c7ce084 100644 --- a/harpy/snakefiles/sv_leviathan.smk +++ b/harpy/snakefiles/sv_leviathan.smk @@ -12,7 +12,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk index 94180586..dc45fb83 100644 --- a/harpy/snakefiles/sv_leviathan_pop.smk +++ b/harpy/snakefiles/sv_leviathan_pop.smk @@ -11,8 +11,8 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+", - population = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+", + population = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/sv_naibr.smk b/harpy/snakefiles/sv_naibr.smk index 8d69c31e..6250b6ec 100644 --- a/harpy/snakefiles/sv_naibr.smk +++ b/harpy/snakefiles/sv_naibr.smk @@ -12,7 +12,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/sv_naibr_phase.smk b/harpy/snakefiles/sv_naibr_phase.smk index 5e75bc21..88520f12 100644 --- a/harpy/snakefiles/sv_naibr_phase.smk +++ b/harpy/snakefiles/sv_naibr_phase.smk @@ -12,7 +12,7 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/sv_naibr_pop.smk b/harpy/snakefiles/sv_naibr_pop.smk index f58dc6a2..0fa504b6 100644 --- a/harpy/snakefiles/sv_naibr_pop.smk +++ b/harpy/snakefiles/sv_naibr_pop.smk @@ -12,8 +12,8 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+", - population = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+", + population = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = 
os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snakefiles/sv_naibr_pop_phase.smk b/harpy/snakefiles/sv_naibr_pop_phase.smk index 611a70b7..9aa16102 100644 --- a/harpy/snakefiles/sv_naibr_pop_phase.smk +++ b/harpy/snakefiles/sv_naibr_pop_phase.smk @@ -12,8 +12,8 @@ onsuccess: onerror: os.remove(logger.logfile) wildcard_constraints: - sample = "[a-zA-Z0-9._-]+", - population = "[a-zA-Z0-9._-]+" + sample = r"[a-zA-Z0-9._-]+", + population = r"[a-zA-Z0-9._-]+" outdir = config["output_directory"] envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") diff --git a/harpy/snp.py b/harpy/snp.py index d0592b42..b36b62cf 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -5,7 +5,7 @@ import yaml from pathlib import Path import rich_click as click -from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import convert_to_int, HPCProfile, InputFile, SnakemakeParams from ._cli_types_params import MpileupParams, FreebayesParams from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD @@ -54,11 +54,11 @@ def snp(): @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`') @click.option('-r', '--regions', type=str, default=50000, show_default=True, help = "Regions where to call variants") -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) @@ -156,11 +156,11 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`') @click.option('-r', '--regions', type=str, default=50000, show_default=True, help = "Regions where to call variants") -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppressess all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) diff --git a/harpy/sv.py b/harpy/sv.py index 581694d0..dfb127c1 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -5,7 +5,7 @@ import yaml from pathlib import Path import rich_click as click -from ._cli_types_generic import ContigList, HPCProfile, InputFile, IntList, SnakemakeParams +from ._cli_types_generic import convert_to_int, ContigList, HPCProfile, InputFile, IntList, SnakemakeParams from ._cli_types_params import LeviathanParams, NaibrParams from ._conda import create_conda_recipes from ._launch import launch_snakemake, SNAKEMAKE_CMD @@ -61,12 +61,12 @@ def sv(): @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SV/leviathan", show_default=True, help = 'Output directory name') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. 
`0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) @@ -159,13 +159,13 @@ def leviathan(inputs, output_dir, genome, min_size, min_barcodes, iterations, du @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance delineating separate molecules') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SV/naibr", show_default=True, help = 'Output directory name') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max = 999), help = 'Number of threads to use') @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to phased bcf/vcf file') @click.option('--container', is_flag = True, default = False, help = 'Use a container instead of conda') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppresses all output') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1)
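The same quiet/verbosity pattern is repeated on every workflow command touched by this diff. A self-contained sketch of the pattern end to end (standalone demo; `demo` is not a real harpy command and the `convert_to_int` body is re-sketched here rather than imported, since its implementation is not shown in the diff):

    import rich_click as click

    def convert_to_int(ctx, param, value):
        # stand-in for the callback imported from harpy/_cli_types_generic.py
        return int(value)

    @click.command()
    @click.option('--quiet', show_default = True, default = "0", type = click.Choice(["0", "1", "2"]), callback = convert_to_int, help = 'Verbosity of output. `0` shows all output, `1` shows single progress bar, `2` suppresses all output')
    def demo(quiet):
        # by the time the function body runs, quiet is an int: 0, 1, or 2
        click.echo(f"verbosity level: {quiet}")

    if __name__ == "__main__":
        demo()

Running `python demo.py --quiet 2` prints `verbosity level: 2`, while an out-of-range value such as `--quiet 3` is rejected by the Choice type with a usage error.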