diff --git a/.prettierignore b/.prettierignore
index 7dc8ef0e9..34eded85f 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -3,6 +3,7 @@ adaptivecard.json
 slackreport.json
 .nextflow*
 work/
+docs/development/manual_tests.md
 data/
 results/
 .DS_Store
diff --git a/conf/modules.config b/conf/modules.config
index 5c4ba94be..f1bb3215f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -1752,4 +1752,11 @@ process {
             ]
         ]
     }
+
+    withName: 'NFCORE_EAGER:EAGER:CLASSIFY_MTDNA_HAPLOGROUP:HAPLOGREP3_CLASSIFY' {
+        ext.args = {
+            def phylotree = params.human_mtdna_phylotree ?: (params.human_mtdna_reference.toLowerCase() == 'rsrs' ? 'phylotree-rsrs@17.1' : 'phylotree-fu-rcrs@1.2')
+            "--tree ${phylotree}"
+        }
+    }
 }
diff --git a/docs/development/code_conventions.md b/docs/development/code_conventions.md
index d4870a141..55c3d2ae1 100644
--- a/docs/development/code_conventions.md
+++ b/docs/development/code_conventions.md
@@ -27,11 +27,11 @@ The alias should ideally make it intuitive to understand which subworkflow the m
 - The unique module names specified above should make it possible to always configure modules without the need for a regex/glob when using `withName`. Exception to this is modules named within nf-core subworkflows, which should be configured with a regex/glob.
 - The order of attributes within configuration blocks should always be the following:
-   1. tag (mandatory)
-   2. ext.args\* (optional. Followed by ext.args{2,3,...} in ascending order)
-   3. ext.prefix (optional)
-   4. publishDir (optional)
-   5. any other attributes go to the end.
+  1. tag (mandatory)
+  2. ext.args\* (optional. Followed by ext.args{2,3,...} in ascending order)
+  3. ext.prefix (optional)
+  4. publishDir (optional)
+  5. any other attributes go to the end.
 - NEVER use `meta.id` in module configuration (`tag`,`ext.*`), but instead the full explicit combination of unique attributes expected. `meta.sample_id` is fine to use and is equivalent to `meta.id`, but should be supplemented by `meta.library_id` and `meta.lane` etc, as required.
 - Every process that is reference-specific MUST include `${meta.reference}` in its `tag` and `ext.prefix` attributes. This is to avoid confusion when running the pipeline with multiple references.
 - Tags that include reference and sample information should be formatted as `${meta.reference}|${meta.sample_id}_*`. Reference specific attributes go on the left-hand-side of the tag, data-specific attributes on the right-hand-side.
diff --git a/docs/development/dev_docs.md b/docs/development/dev_docs.md
index 870318370..474ca0834 100644
--- a/docs/development/dev_docs.md
+++ b/docs/development/dev_docs.md
@@ -16,7 +16,7 @@ To add new input files or options to the reference sheet, you have to complete a
 ### Multi-reference input workflow
 
-1. Add new column named and test data to the test reference sheet (https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv).
+1. Add new column named and test data to the test reference sheet (<https://github.com/nf-core/test-datasets/blob/eager/reference/reference_sheet_multiref.csv>).
 2. Read in new input via nf-validation plugin within the reference_indexing_multi local subworkflow.
    1. Add new "property" to the fasta validation schema (assets/schema_fasta.json).
    1. Add "type" of your object, e.g. `"type": "string"` for file paths and `"type": "integer"` for numbers.
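The `ext.args` closure in the `conf/modules.config` hunk above resolves to `--tree phylotree-fu-rcrs@1.2` by default, to `--tree phylotree-rsrs@17.1` when `--human_mtdna_reference rsrs` is used, and to whatever `--human_mtdna_phylotree` contains when that parameter is set. A minimal sketch of how a user could pin a specific tree from a custom config supplied with `-c` (the selector is copied from the hunk above; the file name and tree value are only illustrative):

```groovy
// custom.config (hypothetical) -- supplied as: nextflow run nf-core/eager ... -c custom.config
process {
    withName: 'NFCORE_EAGER:EAGER:CLASSIFY_MTDNA_HAPLOGROUP:HAPLOGREP3_CLASSIFY' {
        // Any tree name accepted by Haplogrep3 can go here; a user config overrides the pipeline default.
        ext.args = '--tree phylotree-rsrs@17.1'
    }
}
```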
diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
index 2bec03850..604fdd722 100644
--- a/docs/development/manual_tests.md
+++ b/docs/development/manual_tests.md
@@ -1,3 +1,4 @@
+
 # Manual Tests
 
 Here is a list of manual tests we can run with the expect output commands
@@ -1133,3 +1134,18 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --
 ## Expect: BAM input shows up in FastQC -> mapping results.
 nextflow run main.nf -profile test,docker --outdir ./results -w work/ --convert_inputbam --skip_deduplication -resume -ansi-log false -dump-channels
 ```
+
+### MTDNA HAPLOGROUP CLASSIFICATION
+
+```bash
+#### MTDNA HAPLOGROUP CLASSIFICATION with default settings
+## Expect: Directory created 'mtdna_haplogroup//' containing a .txt file for each sample with haplogroup assignments
+## Expect: The haplogroup .txt file contains at minimum columns for rank, name, quality, range, and details of the haplogroup assignment
+nextflow run main.nf -profile docker,test --outdir ./results/mtdna_haplogroup_test --run_genotyping --genotyping_tool ug --genotyping_source raw --run_mtdna_haplogroup_classification -resume
+
+#### MTDNA HAPLOGROUP CLASSIFICATION with specific arguments
+## Expect: Directory created 'mtdna_haplogroup//' containing a .txt file for each sample with haplogroup assignments
+## Expect: The haplogroup assignment may differ based on the classification settings
+nextflow run main.nf -profile docker,test --outdir ./results/mtdna_haplogroup_test --run_mtdna_haplogroup_classification --run_genotyping --genotyping_tool ug --genotyping_source raw --human_mtdna_reference rsrs --human_mtdna_phylotree phylotree-rsrs@1.0 -resume
+```
+
\ No newline at end of file
diff --git a/docs/usage.md b/docs/usage.md
index 0264f9dad..06f5c8f8c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -107,7 +107,7 @@ Only the `reference_name`, and `fasta` columns are mandatory, whereas all other
 
 Files for `fai`, `dict`, `mapper_index` will be generated by the pipeline for you if not specified.
 
-A real-world example could look as follows, where a user-supplied `.dict` file and `circular_target ` and `mitochondrion_header` are not specified:
+A real-world example could look as follows, where a user-supplied `.dict` file and `circular_target` and `mitochondrion_header` are not specified:
 
 ```txt
 reference_name,fasta,fai,dict,mapper_index,circular_target,mitochondrion
@@ -217,7 +217,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
 - `apptainer`
   - A generic configuration profile to be used with [Apptainer](https://apptainer.org/)
 - `wave`
-  - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow ` 24.03.0-edge` or later).
+  - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow `24.03.0-edge` or later).
 - `conda`
   - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.
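The manual-test commands above drive the new behaviour entirely through CLI flags. Expressed as a config file instead (parameter names and defaults are taken from the `nextflow.config` hunk further down in this diff; the values shown are only illustrative):

```groovy
// haplogroup-params.config (hypothetical) -- use with: nextflow run main.nf -profile test,docker -c haplogroup-params.config
params {
    run_genotyping                      = true      // haplogroup classification requires VCFs from genotyping
    genotyping_tool                     = 'ug'
    genotyping_source                   = 'raw'
    run_mtdna_haplogroup_classification = true      // pipeline default: false
    human_mtdna_reference               = 'rsrs'    // pipeline default: 'rcrs'
    human_mtdna_phylotree               = null      // e.g. 'phylotree-rsrs@17.1' to pin a tree explicitly
}
```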
diff --git a/modules.json b/modules.json
index 293de7bc4..fc0bffa03 100644
--- a/modules.json
+++ b/modules.json
@@ -180,6 +180,11 @@
                 "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208",
                 "installed_by": ["modules"]
             },
+            "haplogrep3/classify": {
+                "branch": "master",
+                "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
+                "installed_by": ["modules"]
+            },
             "kraken2/kraken2": {
                 "branch": "master",
                 "git_sha": "653218e79ffa76fde20319e9062f8b8da5cf7555",
diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf
index 033f4154a..b20ad3c0c 100644
--- a/modules/nf-core/fastqc/main.nf
+++ b/modules/nf-core/fastqc/main.nf
@@ -41,7 +41,7 @@ process FASTQC {
     fastqc \\
         ${args} \\
         --threads ${task.cpus} \\
-        --memory ${fastqc_memory} \\
+        --memory ${fastqc_memory.toInteger()} \\
         ${renamed_files}

     cat <<-END_VERSIONS > versions.yml
diff --git a/modules/nf-core/haplogrep3/classify/environment.yml b/modules/nf-core/haplogrep3/classify/environment.yml
new file mode 100644
index 000000000..c219ac43c
--- /dev/null
+++ b/modules/nf-core/haplogrep3/classify/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::haplogrep3=3.2.2
diff --git a/modules/nf-core/haplogrep3/classify/main.nf b/modules/nf-core/haplogrep3/classify/main.nf
new file mode 100644
index 000000000..056b0635f
--- /dev/null
+++ b/modules/nf-core/haplogrep3/classify/main.nf
@@ -0,0 +1,47 @@
+process HAPLOGREP3_CLASSIFY {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/haplogrep3:3.2.2--hdfd78af_0':
+        'biocontainers/haplogrep3:3.2.2--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(inputfile)
+
+    output:
+    tuple val(meta), path("*.txt"), emit: txt
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    haplogrep3 \\
+        classify \\
+        $args \\
+        --in $inputfile \\
+        --out ${prefix}.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        haplogrep3: \$(echo \$(haplogrep3 2>&1) | (sed '2!d') | (sed 's/Haplogrep 3 //'))
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        haplogrep3: \$(echo \$(haplogrep3 2>&1) | (sed '2!d') | (sed 's/Haplogrep 3 //'))
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/nf-core/haplogrep3/classify/meta.yml b/modules/nf-core/haplogrep3/classify/meta.yml
new file mode 100644
index 000000000..4c5c925d2
--- /dev/null
+++ b/modules/nf-core/haplogrep3/classify/meta.yml
@@ -0,0 +1,45 @@
+name: "haplogrep3_classify"
+description: classification into haplogroups
+keywords:
+  - haplogroups
+  - classify
+  - mtDNA
+tools:
+  - "haplogrep3":
+      description: "A tool for mtDNA haplogroup classification."
+      homepage: "https://github.com/genepi/haplogrep3"
+      documentation: "https://github.com/genepi/haplogrep3"
+      tool_dev_url: "https://github.com/genepi/haplogrep3"
+      licence: ["MIT"]
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - inputfile:
+        type: file
+        description: valid options are hsd, vcf, or fasta files
+        pattern: "*.{vcf,vcf.gz,fasta,hsd}"
+output:
+  - txt:
+      - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false ]
+      - "*.txt":
+          type: file
+          description: text file with classification information
+          pattern: "*.{txt}"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+authors:
+  - "@lucpen"
+maintainers:
+  - "@lucpen"
+  - "@ramprasadn"
diff --git a/modules/nf-core/haplogrep3/classify/tests/main.nf.test b/modules/nf-core/haplogrep3/classify/tests/main.nf.test
new file mode 100644
index 000000000..654124c52
--- /dev/null
+++ b/modules/nf-core/haplogrep3/classify/tests/main.nf.test
@@ -0,0 +1,59 @@
+
+nextflow_process {
+
+    name "Test Process HAPLOGREP3_CLASSIFY"
+    script "../main.nf"
+    process "HAPLOGREP3_CLASSIFY"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "haplogrep3"
+    tag "haplogrep3/classify"
+
+    test("test-haplogrep3-classify") {
+
+        config './nextflow.config'
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA12878_chrM.vcf.gz', checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("test-haplogrep3-classify-stub") {
+
+        options '-stub'
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA12878_chrM.vcf.gz', checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+}
diff --git a/modules/nf-core/haplogrep3/classify/tests/main.nf.test.snap b/modules/nf-core/haplogrep3/classify/tests/main.nf.test.snap
new file mode 100644
index 000000000..d1a276d25
--- /dev/null
+++ b/modules/nf-core/haplogrep3/classify/tests/main.nf.test.snap
@@ -0,0 +1,72 @@
+{
+    "test-haplogrep3-classify-stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,bd62c94d9b52732b89fbd979ded94a60"
+                ],
+                "txt": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,bd62c94d9b52732b89fbd979ded94a60"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.3"
+        },
+        "timestamp": "2025-01-28T15:34:35.106097277"
+    },
+    "test-haplogrep3-classify": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.txt:md5,fb242df629aa6168371d1d742f0fb179"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,bd62c94d9b52732b89fbd979ded94a60"
+                ],
+                "txt": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.txt:md5,fb242df629aa6168371d1d742f0fb179"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,bd62c94d9b52732b89fbd979ded94a60"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.3"
+        },
+        "timestamp": "2025-01-28T15:36:19.954363253"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/haplogrep3/classify/tests/nextflow.config b/modules/nf-core/haplogrep3/classify/tests/nextflow.config
new file mode 100644
index 000000000..7f57266a1
--- /dev/null
+++ b/modules/nf-core/haplogrep3/classify/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: HAPLOGREP3_CLASSIFY {
+        ext.args = '--tree phylotree-rsrs@17.1'
+    }
+}
diff --git a/nextflow.config b/nextflow.config
index 3333c140c..2518b2783 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -249,6 +249,11 @@ params {
     run_sexdeterrmine                = false
     sexdeterrmine_bedfile            = null
 
+    // mtDNA haplogroup classification
+    run_mtdna_haplogroup_classification = false
+    human_mtdna_reference               = 'rcrs'
+    human_mtdna_phylotree               = null
+
     // Genotyping
     run_genotyping                   = false
    genotyping_tool                  = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index c69a1de19..b7cabc058 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -1590,6 +1590,36 @@
             },
             "fa_icon": "fas fa-transgender-alt",
             "help_text": ""
+        },
+        "mtdna_haplogroup_options": {
+            "title": "mtDNA Haplogroup Classification",
+            "type": "object",
+            "description": "Options for classifying mitochondrial haplogroups using Haplogrep3.",
+            "default": "",
+            "fa_icon": "fas fa-dna",
+            "properties": {
+                "run_mtdna_haplogroup_classification": {
+                    "type": "boolean",
+                    "description": "Run Haplogrep3 to determine mitochondrial haplogroups.",
+                    "fa_icon": "fas fa-dna",
+                    "default": false
+                },
+                "human_mtdna_reference": {
+                    "type": "string",
+                    "description": "Specify the human mitochondrial reference sequence used for alignment (e.g., rCRS or RSRS). Determines default phylotree for Haplogrep3.",
+                    "fa_icon": "fas fa-dna",
+                    "enum": ["rcrs", "rsrs"],
+                    "default": "rcrs",
+                    "help_text": "Select 'rcrs' (revised Cambridge Reference Sequence) or 'rsrs' (Reconstructed Sapiens Reference Sequence). This choice influences the default phylotree used by Haplogrep3 if --human_mtdna_phylotree is not explicitly set. Defaults are 'phylotree-fu-rcrs@1.2' for rCRS and 'phylotree-rsrs@17.1' for RSRS."
+                },
+                "human_mtdna_phylotree": {
+                    "type": ["string", "null"],
+                    "description": "Specify the exact Phylotree version for Haplogrep3 to use.",
+                    "fa_icon": "fas fa-project-diagram",
+                    "default": null,
+                    "help_text": "Overrides the default phylotree selection based on --human_mtdna_reference. Provide the specific tree name (e.g., 'phylotree-fu-rcrs@1.2'). Available trees can be listed with 'haplogrep3 list trees'."
+                }
+            }
+        }
         }
     },
     "allOf": [
@@ -1646,6 +1676,9 @@
         },
         {
             "$ref": "#/$defs/human_sex_determination"
+        },
+        {
+            "$ref": "#/$defs/mtdna_haplogroup_options"
+        }
     ]
 }
diff --git a/subworkflows/local/classify_mtdna_haplogroup.nf b/subworkflows/local/classify_mtdna_haplogroup.nf
new file mode 100644
index 000000000..63c5ad54d
--- /dev/null
+++ b/subworkflows/local/classify_mtdna_haplogroup.nf
@@ -0,0 +1,28 @@
+// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :)
+// https://github.com/nf-core/modules/tree/master/subworkflows
+// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace:
+// https://nf-co.re/join
+// TODO nf-core: A subworkflow SHOULD import at least two modules
+
+include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main'
+include { HAPLOGREP3_CLASSIFY      } from '../../modules/nf-core/haplogrep3/classify/main'
+
+workflow CLASSIFY_MTDNA_HAPLOGROUP {
+
+    take:
+    ch_mtdna_vcf
+
+    main:
+    ch_versions    = Channel.empty()
+    ch_haplogroups = Channel.empty()
+
+    ch_input_haplogrep3 = ch_mtdna_vcf
+
+    HAPLOGREP3_CLASSIFY(ch_input_haplogrep3)
+    ch_haplogroups = HAPLOGREP3_CLASSIFY.out.txt
+    ch_versions    = ch_versions.mix(HAPLOGREP3_CLASSIFY.out.versions)
+
+    emit:
+    haplogroups = ch_haplogroups
+    versions    = ch_versions
+}
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 148f262c1..94afa89b6 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -32,6 +32,7 @@ include { METAGENOMICS } from '../subwork
 include { ESTIMATE_CONTAMINATION                        } from '../subworkflows/local/estimate_contamination'
 include { CALCULATE_DAMAGE                              } from '../subworkflows/local/calculate_damage'
 include { RUN_SEXDETERRMINE                             } from '../subworkflows/local/run_sex_determination'
+include { CLASSIFY_MTDNA_HAPLOGROUP                     } from '../subworkflows/local/classify_mtdna_haplogroup'
 include { MERGE_LIBRARIES                               } from '../subworkflows/local/merge_libraries'
 include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING } from '../subworkflows/local/merge_libraries'
 include { GENOTYPE                                      } from '../subworkflows/local/genotype'
@@ -560,6 +561,35 @@ workflow EAGER {
         ch_multiqc_files = ch_multiqc_files.mix(GENOTYPE.out.mqc.collect { it[1] }.ifEmpty([]))
     }
 
+    //
+    // SUBWORKFLOW: Run mtDNA Haplogroup Classification
+    //
+
+    if (params.run_mtdna_haplogroup_classification) {
+        if (!params.run_genotyping) {
+            error "Cannot run mtDNA haplogroup classification (--run_mtdna_haplogroup_classification) without running genotyping (--run_genotyping). VCF files are required as input."
+        }
+
+        ch_mito_header_for_filter = REFERENCE_INDEXING.out.mitochondrion_header
+            .map { meta, header -> [ meta.id, header ] }
+
+        ch_mtdna_haplogroup_input = GENOTYPE.out.vcf
+            .map { meta, vcf, tbi ->
+                def reference_id = meta.reference
+                [ reference_id, meta, vcf ]
+            }
+            .join(ch_mito_header_for_filter)
+            .filter { ref_id, meta, vcf, mito_header ->
+                vcf.name.contains(meta.id)
+            }
+            .map { ref_id, meta, vcf, mito_header ->
+                [ meta, vcf ]
+            }
+
+        CLASSIFY_MTDNA_HAPLOGROUP(ch_mtdna_haplogroup_input)
+        ch_versions = ch_versions.mix(CLASSIFY_MTDNA_HAPLOGROUP.out.versions)
+    }
+
     //
     // Collate and save software versions
     //
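The channel wiring in the `workflows/eager.nf` hunk above (key by reference, join to the mitochondrion header, filter, strip the key again) is the least obvious part of the change. A self-contained toy sketch of the same pattern, with hypothetical file names and meta maps standing in for `GENOTYPE.out.vcf` and `REFERENCE_INDEXING.out.mitochondrion_header`:

```groovy
// toy.nf (hypothetical) -- run with: nextflow run toy.nf
workflow {
    // Stand-in for GENOTYPE.out.vcf: [ meta, vcf, tbi ]
    ch_vcf = Channel.of(
        [ [ id:'sample1', reference:'hs37d5' ], file('sample1.unifiedgenotyper.vcf.gz'), file('sample1.unifiedgenotyper.vcf.gz.tbi') ]
    )
    // Stand-in for REFERENCE_INDEXING.out.mitochondrion_header: [ meta, header ]
    ch_mito_header = Channel.of( [ [ id:'hs37d5' ], file('mt_header.txt') ] )

    ch_mito_header_for_filter = ch_mito_header
        .map { meta, header -> [ meta.id, header ] }                      // key by reference name

    ch_vcf
        .map { meta, vcf, tbi -> [ meta.reference, meta, vcf ] }          // re-key the VCFs the same way
        .join( ch_mito_header_for_filter )                                // pair each VCF with its reference's MT header
        .filter { ref_id, meta, vcf, mito_header -> vcf.name.contains(meta.id) }
        .map { ref_id, meta, vcf, mito_header -> [ meta, vcf ] }          // drop key and header before HAPLOGREP3_CLASSIFY
        .view()
}
```

As in the hunk above, the joined mitochondrion header is only carried along to pair VCFs by reference; the filter itself only checks that the VCF file name contains the sample id.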