From b2b0394326c4139ee3ff5badb572697cfc261c48 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Mon, 24 Mar 2025 15:44:20 +0000 Subject: [PATCH 01/18] added error catching for multiple udg treatment in single lib (causes name coll. + not correct wet-lab-wise) --- .../local/utils_nfcore_eager_pipeline/main.nf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index a92c8f89c..2058e6ff9 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -143,6 +143,20 @@ workflow PIPELINE_INITIALISATION { [ meta, singlestrand ] } + // - No libraries with multiple UDG treatments (UDG treatment done as part of library generation) + ch_samplesheet_test = ch_samplesheet + .map { + meta, r1, r2, bam -> + [ meta.subMap('library_id'), meta.subMap('damage_treatment') ] + } + .groupTuple() + .map { meta, damage_treatment -> + if ( damage_treatment.toList().unique().size() > 1 ) { + exit 1, "[nf-core] ERROR: Validation of 'input' file failed. Library IDs can only have a single UDG treatment across lanes." + } + [ meta, damage_treatment ] + } + emit: samplesheet_fastqs = ch_samplesheet_fastqs samplesheet_bams = ch_samplesheet_bams @@ -235,6 +249,7 @@ def validateInputParameters() { if ( params.run_metagenomics && params.preprocessing_skippairmerging && params.metagenomics_profiling_tool == 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } + if ( params.genotyping_source == 'raw' && params.run_trim_bam || params.run_pmd_filtering || params.run_mapdamage_rescaling ) { log.warn("[nf-core/eager] WARNING: --genotyping_source is set to 'raw' AND damage correction carried out (rescaling, filtering, or trimming). The output of these tools will NOT be utilized for genotyping!") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } @@ -245,6 +260,8 @@ def validateInputParameters() { // // Validate channels from input samplesheet // +// FROM NF-CORE TEMPLATE: THIS IS NOT NECESSARY +// TODO: remove prior to 3.0 def validateInputSamplesheet(input) { def (metas, fastqs) = input[1..2] From 378754297dad9ddb7603b8ff909bc73577481e7d Mon Sep 17 00:00:00 2001 From: Ian Light Date: Mon, 24 Mar 2025 15:44:56 +0000 Subject: [PATCH 02/18] adjusted default behavior for bam saving to prevent overwriting (with same file, but still) --- conf/modules.config | 7 +++++-- nextflow.config | 2 +- nextflow_schema.json | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index daf87aee2..a023b818c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -695,7 +695,7 @@ process { ] } - withName: ".*DEDUPLICATE:BUILD_INTERVALS" { + withName: ".*DEDUPLICATE:BUILD_INTERVALS" { publishDir = [ enabled: false ] @@ -1530,7 +1530,7 @@ process { ] } - withName: ".*MERGE_LIBRARIES:SAMTOOLS_MERGE_LIBRARIES" { + withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_MERGE_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" } publishDir = [ @@ -1543,6 +1543,7 @@ process { ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ [ + enabled: {params.genotyping_source == "raw" ? false : true}, // data path: { "${params.outdir}/final_bams/${params.genotyping_source}/data/" }, mode: params.publish_dir_mode, @@ -1557,6 +1558,7 @@ process { ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ [ + enabled: {params.genotyping_source == "raw" ? false : true}, // data path: { "${params.outdir}/final_bams/${params.genotyping_source}/data/" }, mode: params.publish_dir_mode, @@ -1570,6 +1572,7 @@ process { ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ [ + enabled: {params.genotyping_source == "raw" ? false : true}, // stats path: { "${params.outdir}/final_bams/${params.genotyping_source}/stats/" }, mode: params.publish_dir_mode, diff --git a/nextflow.config b/nextflow.config index 7f841067f..cc64bae86 100644 --- a/nextflow.config +++ b/nextflow.config @@ -240,7 +240,7 @@ params { // Genotyping run_genotyping = false genotyping_tool = null - genotyping_source = null + genotyping_source = 'raw' skip_bcftools_stats = false genotyping_reference_ploidy = 2 genotyping_pileupcaller_min_base_quality = 30 diff --git a/nextflow_schema.json b/nextflow_schema.json index fecd8c6ad..12dfc693c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1066,6 +1066,7 @@ }, "genotyping_source": { "type": "string", + "default": "raw", "description": "Specify which input BAM to use for genotyping.", "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", "fa_icon": "fas fa-faucet", From 34dd28c1feb35b81b46d2291302440c540de1a45 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Mon, 24 Mar 2025 15:48:34 +0000 Subject: [PATCH 03/18] unfinished config for testing for name collisions --- conf/test_namecollision.config | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 conf/test_namecollision.config diff --git a/conf/test_namecollision.config b/conf/test_namecollision.config new file mode 100644 index 000000000..e8d041cf0 --- /dev/null +++ b/conf/test_namecollision.config @@ -0,0 +1,50 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/eager -profile test_namecollision, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test name collisions profile' + config_profile_description = 'Minimal test dataset to check for name collisions' + + // Input data + input = params.pipelines_testdata_base_path + 'eager/testdata/Human/human_design_bam_eager3.tsv' + + // Genome references + fasta = params.pipelines_testdata_base_path + 'eager/reference/Human/hs37d5_chr21-MT.fa.gz' + + + // Preprocessing + sequencing_qc_tool = 'falco' + preprocessing_tool = 'fastp' + convert_inputbam = true + + // Mapping + mapping_tool = 'bwaaln' + + // Metagenomics + run_metagenomics = true + metagenomics_complexity_tool = 'prinseq' + metagenomics_profiling_tool = 'kraken2' + metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/kraken/eager_test.tar.gz' + metagenomics_run_postprocessing = true + + // Genotyping + genotyping_tool = 'hc' + +} From ee05ad1cbf124ad2087364765a434051173abd02 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Mon, 24 Mar 2025 16:48:21 +0000 Subject: [PATCH 04/18] partial implementation --- conf/modules.config | 99 ++++++++++++------------- subworkflows/local/manipulate_damage.nf | 36 ++++++--- workflows/eager.nf | 8 +- 3 files changed, 78 insertions(+), 65 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a023b818c..f12aea22d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1259,6 +1259,55 @@ process { ] } + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_MERGE_LIBRARIES" { + // TODO: add parameter from run_trim_bam/run_pmd_filtering/run_mapdamage_rescaling --> replace genotpying source for final bams + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}_unsorted" } + publishDir = [ + enabled: false + ] + } + + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_SORT_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}" } + publishDir = [ + [ + // data + path: { "${params.outdir}/final_bams/${meta.damage_manipulation}/data/" }, + mode: params.publish_dir_mode, + pattern: '*.bam' + ] + ] + } + + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_INDEX_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.args = { params.fasta_largeref ? "-c" : "" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}" } + publishDir = [ + [ + // data + path: { "${params.outdir}/final_bams/${meta.damage_manipulation}/data/" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}' + ] + ] + } + + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}" } + publishDir = [ + [ + // stats + path: { "${params.outdir}/final_bams/${meta.damage_manipulation}/stats/" }, + mode: params.publish_dir_mode, + pattern: '*.flagstat' + ] + ] + } + // // METAGENOMIC SCREENING // @@ -1530,56 +1579,6 @@ process { ] } - withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_MERGE_LIBRARIES" { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" } - publishDir = [ - enabled: false - ] - } - - withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_SORT_MERGED_LIBRARIES" { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } - publishDir = [ - [ - enabled: {params.genotyping_source == "raw" ? false : true}, - // data - path: { "${params.outdir}/final_bams/${params.genotyping_source}/data/" }, - mode: params.publish_dir_mode, - pattern: '*.bam' - ] - ] - } - - withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_INDEX_MERGED_LIBRARIES" { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = { params.fasta_largeref ? "-c" : "" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } - publishDir = [ - [ - enabled: {params.genotyping_source == "raw" ? false : true}, - // data - path: { "${params.outdir}/final_bams/${params.genotyping_source}/data/" }, - mode: params.publish_dir_mode, - pattern: '*.{bai,csi}' - ] - ] - } - - withName: ".*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES" { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } - publishDir = [ - [ - enabled: {params.genotyping_source == "raw" ? false : true}, - // stats - path: { "${params.outdir}/final_bams/${params.genotyping_source}/stats/" }, - mode: params.publish_dir_mode, - pattern: '*.flagstat' - ] - ] - } // // GENOTYPING diff --git a/subworkflows/local/manipulate_damage.nf b/subworkflows/local/manipulate_damage.nf index bc8d96cba..583daa84d 100644 --- a/subworkflows/local/manipulate_damage.nf +++ b/subworkflows/local/manipulate_damage.nf @@ -12,6 +12,7 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DAMAGE_RESCALED } from '../../ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DAMAGE_FILTERED } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DAMAGE_TRIMMED } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED } from '../../modules/nf-core/samtools/flagstat/main' +include { MERGE_LIBRARIES as MERGE_LIBRARIES_DAMAGE_MANIPULATION } from '../../subworkflows/local/merge_libraries' // TODO: Add required channels and channel manipulations for reference-dependent bed masking before pmdtools. Requires multi-ref support before implementation. workflow MANIPULATE_DAMAGE { @@ -21,11 +22,14 @@ workflow MANIPULATE_DAMAGE { ch_pmd_masking // [ [ meta ], masked_fasta, bed_for_masking ] main: - ch_versions = Channel.empty() - ch_rescaled_bams = Channel.empty() - ch_pmd_filtered_bams = Channel.empty() - ch_trimmed_bams = Channel.empty() - ch_pmd_filtered_flagstat = Channel.empty() // Only run flagstat on pmd filtered bam, since rescaling and trimming does not change the number of reads + ch_versions = Channel.empty() + ch_rescaled_bams = Channel.empty() + ch_pmd_filtered_bams = Channel.empty() + ch_trimmed_bams = Channel.empty() + ch_pmd_filtered_flagstat = Channel.empty() // Only run flagstat on pmd filtered bam, since rescaling and trimming does not change the number of reads + ch_merged_damage_manipulated_bams = Channel.empty() + ch_multiqc_files = Channel.empty() + // Ensure correct reference is associated with each bam_bai pair ch_refs = ch_fasta @@ -169,10 +173,22 @@ workflow MANIPULATE_DAMAGE { ch_trimmed_bams = BAMUTIL_TRIMBAM.out.bam.join( ch_trimmed_index ) } + // SUBWORKFLOW: merge libraries for saving and potentially genotyping + MERGE_LIBRARIES_DAMAGE_MANIPULATION(ch_bams_for_library_merge) + ch_versions = ch_versions.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.versions) + ch_merged_damage_manipulated_bams = MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.bam_bai + ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.mqc.collect { it[1] }.ifEmpty([])) + + // MAYBE solution here, just modifying it, removing from eager.nf + ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams + + emit: - rescaled = ch_rescaled_bams // [ meta, bam, bai ] - filtered = ch_pmd_filtered_bams // [ meta, bam, bai ] - trimmed = ch_trimmed_bams // [ meta, bam, bai ] - flagstat = ch_pmd_filtered_flagstat // [ meta, flagstat ] - versions = ch_versions + rescaled = ch_rescaled_bams // [ meta, bam, bai ] + filtered = ch_pmd_filtered_bams // [ meta, bam, bai ] + trimmed = ch_trimmed_bams // [ meta, bam, bai ] + flagstat = ch_pmd_filtered_flagstat // [ meta, flagstat ] + merged_bams = ch_merged_damage_manipulated_bams // [ meta, bam, bai ] + mqc = ch_multiqc_files + versions = ch_versions } diff --git a/workflows/eager.nf b/workflows/eager.nf index dd98a7908..1492dbca6 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -528,13 +528,11 @@ workflow EAGER { MANIPULATE_DAMAGE(ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking) ch_multiqc_files = ch_multiqc_files.mix(MANIPULATE_DAMAGE.out.flagstat.collect { it[1] }.ifEmpty([])) ch_versions = ch_versions.mix(MANIPULATE_DAMAGE.out.versions) - ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams +// TODO: figure out splitting into genotyping source (merging already done within manipulate damage, so just getting the trimmed/rescaled/filtered --> ch_bams_for_genotyping) + ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams - // SUBWORKFLOW: merge libraries for genotyping - MERGE_LIBRARIES_GENOTYPING(ch_bams_for_library_merge) - ch_versions = ch_versions.mix(MERGE_LIBRARIES_GENOTYPING.out.versions) - ch_bams_for_genotyping = MERGE_LIBRARIES_GENOTYPING.out.bam_bai ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_GENOTYPING.out.mqc.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.mqc.collect { it[1] }.ifEmpty([])) } else { ch_bams_for_genotyping = ch_merged_dedup_bams From 8b522331a5d8a7f87674fc022c4d8e9a7a8cf82c Mon Sep 17 00:00:00 2001 From: Ian Light Date: Tue, 25 Mar 2025 14:28:29 +0000 Subject: [PATCH 05/18] mild rewrite of DAMAGEMANIPULATION --> GENOTYPING for increased control and consistent saving of files. --- conf/modules.config | 6 ++-- nextflow.config | 3 +- nextflow_schema.json | 13 +++++++-- subworkflows/local/genotype.nf | 2 +- subworkflows/local/manipulate_damage.nf | 39 +++++++++++++------------ subworkflows/local/merge_libraries.nf | 4 +-- workflows/eager.nf | 29 +++++++++++------- 7 files changed, 58 insertions(+), 38 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f12aea22d..9f3f1cd73 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1173,7 +1173,7 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_RESCALED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.damage_manipulation}|${meta.sample_id}_${meta.library_id}" } ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ @@ -1205,7 +1205,7 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_FILTERED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.damage_manipulation}|${meta.sample_id}_${meta.library_id}" } ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ @@ -1247,7 +1247,7 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_TRIMMED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.damage_manipulation}|${meta.sample_id}_${meta.library_id}" } ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ diff --git a/nextflow.config b/nextflow.config index cc64bae86..0668c5cfc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -241,7 +241,8 @@ params { run_genotyping = false genotyping_tool = null genotyping_source = 'raw' - skip_bcftools_stats = false + genotyping_use_unmerged_libraries = false + genotyping_skip_bcftools_stats = false genotyping_reference_ploidy = 2 genotyping_pileupcaller_min_base_quality = 30 genotyping_pileupcaller_min_map_quality = 30 diff --git a/nextflow_schema.json b/nextflow_schema.json index 12dfc693c..11b3bbf23 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1068,9 +1068,16 @@ "type": "string", "default": "raw", "description": "Specify which input BAM to use for genotyping.", - "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", + "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (no damage manipulation - per-sample merged libraries, see Note below); 'pmd' (per-sample merged pmdtools output); 'trimmed' (for base-clipped BAMs); 'pmd-trimmed' (Base-clipped-PMD-filtered BAMs); 'rescaled' (mapDamage2 rescaling).\nNote: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", "fa_icon": "fas fa-faucet", - "enum": ["raw", "pmd", "trimmed", "rescaled"] + "enum": ["raw", "pmd", "trimmed", "pmd_trimmed", "rescaled"] + }, + "genotyping_use_unmerged_libraries": { + "type": "boolean", + "fa_icon": "fas fa-merge ", + "default": false, + "description": "Use unmerged mapped libraries for genotyping.", + "help_text": "Rather than merging all mapped libraries into a single file per sample, data for each library will be genotyped separately. /nThis may be advisable when data for a single sample has both UDG and non-UDG treated libraries, or when damage rescaling is done and different libraries can have distinct damage models." }, "genotyping_tool": { "type": "string", @@ -1079,7 +1086,7 @@ "help_text": "Specify which genotyper to use. Current options are: pileupCaller, ANGSD, GATK UnifiedGenotyper (v3.5), GATK HaplotypeCaller (v4) or FreeBayes.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK v3.5 it is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", "description": "Specify which genotyper to use." }, - "skip_bcftools_stats": { + "genotyping_skip_bcftools_stats": { "type": "boolean", "fa_icon": "fas fa-forward", "description": "Specify to skip generation of VCF-based variant calling statistics with bcftools.", diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index a674bf85f..d0898ecdb 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -418,7 +418,7 @@ workflow GENOTYPE { } // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes - if ( !params.skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || params.genotyping_tool == 'freebayes' ) ) { + if ( !params.genotyping_skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || params.genotyping_tool == 'freebayes' ) ) { ch_bcftools_input= ch_genotypes_vcf .map { addNewMetaFromAttributes( it, "reference" , "reference" , false ) diff --git a/subworkflows/local/manipulate_damage.nf b/subworkflows/local/manipulate_damage.nf index 583daa84d..3e9966644 100644 --- a/subworkflows/local/manipulate_damage.nf +++ b/subworkflows/local/manipulate_damage.nf @@ -26,8 +26,8 @@ workflow MANIPULATE_DAMAGE { ch_rescaled_bams = Channel.empty() ch_pmd_filtered_bams = Channel.empty() ch_trimmed_bams = Channel.empty() - ch_pmd_filtered_flagstat = Channel.empty() // Only run flagstat on pmd filtered bam, since rescaling and trimming does not change the number of reads ch_merged_damage_manipulated_bams = Channel.empty() + ch_pmd_filtered_flagstat = Channel.empty() // Only run flagstat on pmd filtered bam, since rescaling and trimming does not change the number of reads ch_multiqc_files = Channel.empty() @@ -62,7 +62,7 @@ workflow MANIPULATE_DAMAGE { ch_mapdamage_input = ch_mapdamage_prep.no_skip .multiMap { ignore_me, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam ] + bam: [ meta + [ 'damage_manipulation' : 'rescaled' ], bam ] fasta: fasta } @@ -73,7 +73,8 @@ workflow MANIPULATE_DAMAGE { ch_versions = ch_versions.mix( SAMTOOLS_INDEX_DAMAGE_RESCALED.out.versions.first() ) ch_rescaled_index = params.fasta_largeref ? SAMTOOLS_INDEX_DAMAGE_RESCALED.out.csi : SAMTOOLS_INDEX_DAMAGE_RESCALED.out.bai - // TODO When adding library-level data merging pre-genotyping, make sure that rescaled bams are not merged in any way as the underlying damage model could differ between libraries + // It may be inappropriate to merge different rescaled libraries (or rescaled and non-rescaled libs) for genotyping/downstream analysis + // Distinct libraries may have distinct damage models! ch_rescaled_bams = MAPDAMAGE2.out.rescaled.join(ch_rescaled_index) .mix(ch_skip_rescale) // Should these be mixed actually, or excluded? Might not make sense to take rescaled and non-rescaled bams togetehr for anything downstream. } @@ -122,7 +123,7 @@ workflow MANIPULATE_DAMAGE { .combine( ch_pmd_fastas, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta] fasta ] .multiMap { combine_meta, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam, bai ] + bam: [ meta + [ 'damage_manipulation' : 'pmd' ] , bam, bai ] fasta: fasta } @@ -137,21 +138,24 @@ workflow MANIPULATE_DAMAGE { SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED( ch_pmd_filtered_bams ) ch_pmd_filtered_flagstat = SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.flagstat + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.flagstat ) ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.versions.first() ) } if ( params.run_trim_bam ) { - if ( params.run_pmd_filtering ) { + // only run trimming on pmd reads if used downstream in genotyping + // must be explicilty asked for, since other tools not combined like this (eg mapdamage + trimming) + if ( params.genotyping_source == 'pmd_trimmed' ) { ch_to_trim = ch_pmd_filtered_bams .map{ meta, bam, bai -> - [ meta, bam ] + [ meta + ['damage_manipulation' : 'pmd_trimmed'], bam ] } } else { ch_to_trim = ch_bam_bai .map { meta, bam, bai -> - [ meta, bam ] + [ meta + ['damage_manipulation' : 'trimmed'], bam ] } } @@ -173,22 +177,21 @@ workflow MANIPULATE_DAMAGE { ch_trimmed_bams = BAMUTIL_TRIMBAM.out.bam.join( ch_trimmed_index ) } - // SUBWORKFLOW: merge libraries for saving and potentially genotyping - MERGE_LIBRARIES_DAMAGE_MANIPULATION(ch_bams_for_library_merge) + // SUBWORKFLOW: merge libraries for saving (in final bams) and (potentially) genotyping + + ch_for_merging = ch_rescaled_bams.mix(ch_pmd_filtered_bams).mix(ch_trimmed_bams) + + MERGE_LIBRARIES_DAMAGE_MANIPULATION(ch_for_merging) ch_versions = ch_versions.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.versions) ch_merged_damage_manipulated_bams = MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.bam_bai ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.mqc.collect { it[1] }.ifEmpty([])) - // MAYBE solution here, just modifying it, removing from eager.nf - ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams - - emit: - rescaled = ch_rescaled_bams // [ meta, bam, bai ] - filtered = ch_pmd_filtered_bams // [ meta, bam, bai ] - trimmed = ch_trimmed_bams // [ meta, bam, bai ] - flagstat = ch_pmd_filtered_flagstat // [ meta, flagstat ] - merged_bams = ch_merged_damage_manipulated_bams // [ meta, bam, bai ] + rescaled = ch_rescaled_bams // [ meta, bam, bai ] + filtered = ch_pmd_filtered_bams // [ meta, bam, bai ] + trimmed = ch_trimmed_bams // [ meta, bam, bai ] + merged_bams = ch_merged_damage_manipulated_bams // [ meta, bam, bai ] // per-tool + flagstat = ch_pmd_filtered_flagstat // [ meta, flagstat ] mqc = ch_multiqc_files versions = ch_versions } diff --git a/subworkflows/local/merge_libraries.nf b/subworkflows/local/merge_libraries.nf index d568b99ef..ebda7c858 100644 --- a/subworkflows/local/merge_libraries.nf +++ b/subworkflows/local/merge_libraries.nf @@ -18,9 +18,9 @@ workflow MERGE_LIBRARIES { ch_multiqc_files = Channel.empty() ch_library_merge_input = ch_bam_bai - .map { addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference"], ["id", "sample_id", "strandedness", "reference"], false ) } + .map { addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference", "damage_manipulation"], ["id", "sample_id", "strandedness", "reference", "damage_manipulation"], false ) } .groupTuple(by: 0) - // Discrad library-level metas, and bais. Add single_end: true to all metas (no SE/PE distinction from here on) + // Discrad library-level metas, and bais. Add single_end: true to all metas (no SE/PE distinction from here on), preserve damage maniupulation tool info for downstream output sorting .map { meta, lib_metas, bam, bai -> [ meta + [ 'single_end': true ], bam ] diff --git a/workflows/eager.nf b/workflows/eager.nf index 1492dbca6..d4d4215b2 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -32,7 +32,6 @@ include { ESTIMATE_CONTAMINATION } from '../subwork include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' include { RUN_SEXDETERRMINE } from '../subworkflows/local/run_sex_determination' include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries' -include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING } from '../subworkflows/local/merge_libraries' include { GENOTYPE } from '../subworkflows/local/genotype' /* @@ -231,6 +230,7 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix(FILTER_BAM.out.mqc.collect { it[1] }.ifEmpty([])) } else { + // TODO: more intuitive name for this?, since here we don't have filtered reads :P ch_bamfiltered_for_deduplication = MAP.out.bam .join(MAP.out.bai) .mix(ch_bams_from_input) @@ -254,6 +254,7 @@ workflow EAGER { ch_versions = ch_versions.mix(DEDUPLICATE.out.versions) } else { + // TODO: more intuitive name for this, since here we don't have deduplicated reads :P ch_dedupped_bams = ch_reads_for_deduplication ch_dedupped_flagstat = Channel.empty() } @@ -526,16 +527,8 @@ workflow EAGER { if (params.run_mapdamage_rescaling || params.run_pmd_filtering || params.run_trim_bam) { MANIPULATE_DAMAGE(ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking) - ch_multiqc_files = ch_multiqc_files.mix(MANIPULATE_DAMAGE.out.flagstat.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MANIPULATE_DAMAGE.out.mqc.collect { it[1] }.ifEmpty([])) ch_versions = ch_versions.mix(MANIPULATE_DAMAGE.out.versions) -// TODO: figure out splitting into genotyping source (merging already done within manipulate damage, so just getting the trimmed/rescaled/filtered --> ch_bams_for_genotyping) - ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams - - ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_GENOTYPING.out.mqc.collect { it[1] }.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.mqc.collect { it[1] }.ifEmpty([])) - } - else { - ch_bams_for_genotyping = ch_merged_dedup_bams } // @@ -543,9 +536,25 @@ workflow EAGER { // if (params.run_genotyping) { + + if ( params.genotyping_use_unmerged_libraries ) { + // Get UNMERGED data from either initial mapping (post deduplication/filtering, if done), or post-damage manipulation after deduplication/filtering (if done) -- Note: the .out channels include libraries that are not damage manipulated (eg UDG-Full)) + ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : params.genotyping_source == 'pmd_trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + } + else { + // Genotyping done on MERGED data, regardless of UDG-treatment vs not; damage manipulation per-library! + // Select input for genotyping: Default of raw is merged libraries post-deduplication; otherwise, merged libraries post damage manipulated + ch_bams_for_genotyping = params.genotyping_source != 'raw' ? MANIPULATE_DAMAGE.out.merged_bams.filter{ it[0]['damage_manipulation'] == params.genotyping_source } : ch_merged_dedup_bams + } + + // ch_bams_for_genotyping.view() + ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference.map { meta, fasta, fai, dict, mapindex -> [meta, fasta, fai, dict] } + + ch_reference_for_genotyping.view() + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, From e8e812bf636c103430b9ef9d13b61b13f3f46e58 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Tue, 25 Mar 2025 14:29:49 +0000 Subject: [PATCH 06/18] update validation, genotyping parameter consistency --- docs/development/manual_tests.md | 8 ++--- .../local/utils_nfcore_eager_pipeline/main.nf | 29 ++++++++++++------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 2bec03850..a1843b20f 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -991,7 +991,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Gatk UG on trimmed reads. Skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. No bcftools_stats file per VCF. ## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --run_trim_bam \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ @@ -1030,7 +1030,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Attempt to run Gatk HC on trimmed reads, without activating trimming. ## Expect: FAILURE. Cannot set genotyping source to ;trimmed' without trimming. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` @@ -1039,7 +1039,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Gatk HC on trimmed reads, with different out mode and emit confidence. Skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination. ## Checked .command.sh for correct args. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' --run_trim_bam -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' --run_trim_bam -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` @@ -1063,7 +1063,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Freebayes on trimmed reads. Different options, and skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination. ## Checked .command.sh for correct args. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'trimmed' -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --run_trim_bam \ --genotyping_freebayes_skip_coverage 10 \ --genotyping_freebayes_min_alternate_count 2 \ diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 2058e6ff9..1b0f9b3e2 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -228,9 +228,25 @@ def validateInputParameters() { if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } if ( params.bamfiltering_generatefastq && params.run_bamfiltering ) { exit 1, ("[nf-core/eager] ERROR: --bamfiltering_generatefastq will NOT generate a fastq file unless BAM filtering is turned on with `--run_bamfiltering`") } + + // damage manipulation if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } - if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } - if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } + if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is turned on with `--run_pmd_filtering`.") } + if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is turned on with `--run_mapdamage_rescaling`.") } + if ( params.genotyping_source == 'pmd_trimmed' && ! params.run_pmd_filtering || ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd_trimmed' unless PMD-filtering is turned on with `--run_pmd_filtering` and BAM trimming is turned on with `--run_trim_bam`.") } + + // genotyping + if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } + if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } + if ( params.genotyping_source == 'raw' && ( params.run_trim_bam || params.run_pmd_filtering || params.run_mapdamage_rescaling ) ) { log.warn("[nf-core/eager] WARNING: --genotyping_source is set to 'raw' AND damage correction carried out (rescaling, filtering, or trimming). The output of these tools will NOT be utilized for genotyping!") } + if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } + if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } + if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } + if ( ( params.genotyping_source == 'rescaled' || params.genotyping_source 'pmd_timmed' || params.genotyping_source == 'pmd' ) && ! params.genotyping_use_unmerged_libraries ) { log.warn("[nf-core/eager] WARNING: Combining multiple libraries with rescaled damage for genotyping may be inappropriate!") } + if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } + if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } + + // metagenomics if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { if (params.metagenomics_prinseq_dustscore == 0.5) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'dust' mode but provided an entropy score. Please specify a dust filter threshold using the --metagenomics_prinseq_dustscore flag") } } @@ -247,14 +263,6 @@ def validateInputParameters() { ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! This may be inappropriate for metaphlan, which does not utilize paired-end information!") } if ( params.run_metagenomics && params.preprocessing_skippairmerging && params.metagenomics_profiling_tool == 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } - if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } - if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } - if ( params.genotyping_source == 'raw' && params.run_trim_bam || params.run_pmd_filtering || params.run_mapdamage_rescaling ) { log.warn("[nf-core/eager] WARNING: --genotyping_source is set to 'raw' AND damage correction carried out (rescaling, filtering, or trimming). The output of these tools will NOT be utilized for genotyping!") } - if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } - if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } - if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } - if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } - if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } } // @@ -414,7 +422,6 @@ def addNewMetaFromAttributes( ArrayList row, Object source_attributes, Object ta } else { // Option D: Not both the same type or acceptable types. Error. throw new IllegalArgumentException("Error: target_attributes and source_attributes must be of same type (both Strings or both Lists).") - } def new_row = [ meta2 ] + row From 021aea47e3cca1f66a0e870bc8a2ace58c08b8fb Mon Sep 17 00:00:00 2001 From: Ian Light Date: Tue, 25 Mar 2025 14:35:46 +0000 Subject: [PATCH 07/18] removal of partial implementation of name collision config --- conf/test_namecollision.config | 50 ---------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 conf/test_namecollision.config diff --git a/conf/test_namecollision.config b/conf/test_namecollision.config deleted file mode 100644 index e8d041cf0..000000000 --- a/conf/test_namecollision.config +++ /dev/null @@ -1,50 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - - Use as follows: - nextflow run nf-core/eager -profile test_namecollision, --outdir - ----------------------------------------------------------------------------------------- -*/ - -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - -params { - config_profile_name = 'Test name collisions profile' - config_profile_description = 'Minimal test dataset to check for name collisions' - - // Input data - input = params.pipelines_testdata_base_path + 'eager/testdata/Human/human_design_bam_eager3.tsv' - - // Genome references - fasta = params.pipelines_testdata_base_path + 'eager/reference/Human/hs37d5_chr21-MT.fa.gz' - - - // Preprocessing - sequencing_qc_tool = 'falco' - preprocessing_tool = 'fastp' - convert_inputbam = true - - // Mapping - mapping_tool = 'bwaaln' - - // Metagenomics - run_metagenomics = true - metagenomics_complexity_tool = 'prinseq' - metagenomics_profiling_tool = 'kraken2' - metagenomics_profiling_database = params.pipelines_testdata_base_path + 'eager/databases/kraken/eager_test.tar.gz' - metagenomics_run_postprocessing = true - - // Genotyping - genotyping_tool = 'hc' - -} From 07d0a2dc1d86b6c0e39dbf033d6a0bf46361ef39 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Tue, 25 Mar 2025 14:57:14 +0000 Subject: [PATCH 08/18] typo in parameter check --- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 1b0f9b3e2..4228241c9 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -242,7 +242,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } - if ( ( params.genotyping_source == 'rescaled' || params.genotyping_source 'pmd_timmed' || params.genotyping_source == 'pmd' ) && ! params.genotyping_use_unmerged_libraries ) { log.warn("[nf-core/eager] WARNING: Combining multiple libraries with rescaled damage for genotyping may be inappropriate!") } + if ( ( params.genotyping_source == 'rescaled' || params.genotyping_source == 'pmd_timmed' || params.genotyping_source == 'pmd' ) && ! params.genotyping_use_unmerged_libraries ) { log.warn("[nf-core/eager] WARNING: Combining multiple libraries with rescaled damage for genotyping may be inappropriate!") } if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } From b7ac6ed975876c6b7e22868db6e902d536f42f90 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Wed, 26 Mar 2025 10:35:39 +0000 Subject: [PATCH 09/18] fixed erroneous param combo check --- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 4228241c9..dfd667810 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -233,7 +233,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is turned on with `--run_pmd_filtering`.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is turned on with `--run_mapdamage_rescaling`.") } - if ( params.genotyping_source == 'pmd_trimmed' && ! params.run_pmd_filtering || ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd_trimmed' unless PMD-filtering is turned on with `--run_pmd_filtering` and BAM trimming is turned on with `--run_trim_bam`.") } + if ( params.genotyping_source == 'pmd_trimmed' && ! ( params.run_pmd_filtering && params.run_trim_bam ) ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd_trimmed' unless PMD-filtering is turned on with `--run_pmd_filtering` and BAM trimming is turned on with `--run_trim_bam`.") } // genotyping if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } From 6eb67c47d38657a671b9ccbc9d365ab241fddcdc Mon Sep 17 00:00:00 2001 From: Ian Light Date: Wed, 26 Mar 2025 14:13:43 +0000 Subject: [PATCH 10/18] update genotyping libraries splitting for process names and saving --- conf/modules.config | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 9f3f1cd73..1c8eff35f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -515,6 +515,7 @@ process { ] } +// TODO: add readgroup split for using unmerged libs for genotyping withName: 'BWA_SAMSE|BWA_SAMPE' { tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { @@ -537,6 +538,7 @@ process { ] } +// TODO: add readgroup split for using unmerged libs for genotyping withName: BWA_MEM { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { @@ -552,6 +554,7 @@ process { ] } +// TODO: add readgroup split for using unmerged libs for genotyping withName: BOWTIE2_ALIGN { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { @@ -1260,7 +1263,6 @@ process { } withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_MERGE_LIBRARIES" { - // TODO: add parameter from run_trim_bam/run_pmd_filtering/run_mapdamage_rescaling --> replace genotpying source for final bams tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}_unsorted" } publishDir = [ @@ -1639,22 +1641,22 @@ process { } withName: GATK_REALIGNERTARGETCREATOR { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.reference}_realigntarget" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}_realigntarget" : "${sample_id}_${meta.reference}_realigntarget" } publishDir = [ enabled: false ] } withName: GATK_INDELREALIGNER { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.reference}_realigned" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}_realigned" : "${sample_id}_${meta.reference}_realigned" } publishDir = [ [ // data @@ -1667,16 +1669,17 @@ process { } withName: GATK_UNIFIEDGENOTYPER { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = {[ - "--sample_ploidy ${meta2.ploidy}", + // ploidy set at 2 for non-specified ploidy in input fasta sheet for consistency with gatk default + meta2.ploidy ? "--sample_ploidy ${meta2.ploidy}" : "--sample_ploidy 2", "-stand_call_conf ${params.genotyping_gatk_call_conf}", "-dcov ${params.genotyping_gatk_ug_downsample}", "--output_mode ${params.genotyping_gatk_ug_out_mode}", "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1688,9 +1691,9 @@ process { } withName: BCFTOOLS_INDEX_UG { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = "--tbi" //tbi indices for consistency with GATK HC - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1702,15 +1705,15 @@ process { } withName: GATK4_HAPLOTYPECALLER { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = {[ // Option names have changed from underscore_separated to hyphen-separated in GATK4 - "--sample-ploidy ${meta2.ploidy}", + meta2.ploidy ? "--sample-ploidy ${meta2.ploidy}" : "--sample-ploidy 2", "-stand-call-conf ${params.genotyping_gatk_call_conf}", "--output-mode ${params.genotyping_gatk_hc_out_mode}", "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", ].join(' ').trim() } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1722,13 +1725,13 @@ process { } withName: FREEBAYES { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = {[ - "-p ${ref_meta.ploidy}", + ref_meta.ploidy ? "--ploidy ${ref_meta.ploidy}" : "--ploidy 2", "-C ${params.genotyping_freebayes_min_alternate_count}", params.genotyping_freebayes_skip_coverage != 0 ? "-g ${params.genotyping_freebayes_skip_coverage}" : "", ].join(' ').trim() } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1740,9 +1743,9 @@ process { } withName: BCFTOOLS_INDEX_FREEBAYES { - tag = { "${meta.reference}|${meta.sample_id}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = "--tbi" //tbi indices for consistency with GATK HC - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1754,8 +1757,8 @@ process { } withName: BCFTOOLS_STATS_GENOTYPING { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } publishDir = [ [ // stats From 71f4510b14bd3489b98c6cf6285adddb8334341c Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Mar 2025 11:00:01 +0000 Subject: [PATCH 11/18] working RG removal branching for mpileupcaller --- subworkflows/local/genotype.nf | 173 +++++++++++------- .../local/utils_nfcore_eager_pipeline/main.nf | 2 +- workflows/eager.nf | 5 +- 3 files changed, 113 insertions(+), 67 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index d0898ecdb..123f74d05 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -2,6 +2,7 @@ // Genotype the input data using the requested genotyper. // +include { PICARD_ADDORREPLACEREADGROUPS } from '../../modules/nf-core/picard/addorreplacereadgroups/main' include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../../modules/nf-core/samtools/mpileup/main' include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' include { SEQUENCETOOLS_PILEUPCALLER } from '../../modules/nf-core/sequencetools/pileupcaller/main' @@ -35,6 +36,13 @@ workflow GENOTYPE { if ( params.genotyping_tool == 'pileupcaller' ) { + // TODO: if running per-library, necessary to update readgroups + // reassign readgroups to be unique per library (eg rewrite any SM field from sample to SAMPLEID_LIBID) + // some tools (eg pileupcaller) break when multiple .bams input have the same SM field for read groups + // PICARD_ADDORREPLACEREADGROUPS(ch_bams_for_genotyping) + // ch_bams_for_genotyping = PICARD_ADDORREPLACEREADGROUPS.out + + // Compile together all reference based files ch_refs_prep = ch_fasta_plus // Because aux files are optional, the channel can be [[],[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. @@ -61,85 +69,118 @@ workflow GENOTYPE { } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, bed, snp ] // Prepare collect bams for mpileup - ch_mpileup_inputs_bams = ch_bam_bai + // Recreate Read Group headers if no genotyping input is library-by-library + if ( params.genotyping_use_unmerged_libraries ) { + ch_input_for_rewriting_readgroups = ch_bam_bai .map { - addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) + addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) + .multiMap { + ignore_me, combo_meta, bams, bais, ref_meta, fasta, fai, dict, bed, snp -> + bams: [ combo_meta, bams ] + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] } - .groupTuple() - .map { - combo_meta, metas, bams, bais -> - def ids = metas.collect { meta -> meta.sample_id } - [ combo_meta + [sample_id: ids], bams ] // Drop bais - } // Collect all IDs into a list in meta.sample_id. Useful when running pileupCaller later - // Combine prepped bams and references - ch_mpileup_inputs = ch_mpileup_inputs_bams - .map { - addNewMetaFromAttributes( it, "reference", "reference" , false ) - } - .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) - // do not run if no bed file is provided - .filter { it[7] != []} - .multiMap { - ignore_me, combo_meta, bams, ref_meta, fasta, fai, dict, bed, snp -> - def bedfile = bed != "" ? bed : [] - bams: [ combo_meta, bams, bedfile ] - fasta: [ fasta ] - } + PICARD_ADDORREPLACEREADGROUPS(ch_input_for_rewriting_readgroups.bams, ch_input_for_rewriting_readgroups.fasta, ch_input_for_rewriting_readgroups.fai) - SAMTOOLS_MPILEUP_PILEUPCALLER( - ch_mpileup_inputs.bams, - ch_mpileup_inputs.fasta, - ) - ch_versions = ch_versions.mix( SAMTOOLS_MPILEUP_PILEUPCALLER.out.versions.first() ) - - ch_pileupcaller_input = SAMTOOLS_MPILEUP_PILEUPCALLER.out.mpileup + ch_mpileup_inputs_bams = PICARD_ADDORREPLACEREADGROUPS.out.bam .map { - addNewMetaFromAttributes( it, "reference", "reference" , false ) - } - .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) - .multiMap { - ignore_me, meta, mpileup, ref_meta, fasta, fai, dict, bed, snp -> - // def snpfile = snp != "" ? snp : [] - mpileup: [ meta, mpileup ] - snpfile: snp + addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) } - - // Run PileupCaller - SEQUENCETOOLS_PILEUPCALLER( - ch_pileupcaller_input.mpileup, - ch_pileupcaller_input.snpfile, - [] - ) - ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) - - // Merge/rename genotyping datasets - ch_final_genotypes = SEQUENCETOOLS_PILEUPCALLER.out.eigenstrat + .groupTuple() .map { - addNewMetaFromAttributes( it, "reference" , "reference" , false ) + combo_meta, metas, bams -> + def ids = metas.collect { meta -> meta.library_id } + [ combo_meta + [sample_id: ids], bams ] + } // Collect all LIBRARY IDs into a list in meta.sample_id. Useful when running pileupCaller later + // distinct from running merged libraries as single sample -- libraries must be unique + } + else { + ch_mpileup_inputs_bams = ch_bam_bai + .map { + addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) } .groupTuple() .map { - combo_meta, metas, geno, snp, ind -> - [ combo_meta, geno, snp, ind ] - } + combo_meta, metas, bams, bais -> + def ids = metas.collect { meta -> meta.sample_id } + [ combo_meta + [sample_id: ids], bams ] // Drop bais + } // Collect all IDs into a list in meta.sample_id. Useful when running pileupCaller later + } + + ch_mpileup_inputs_bams.view() - COLLECT_GENOTYPES( ch_final_genotypes ) - // Add genotyper info to the meta - ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected + // Combine prepped bams and references + ch_mpileup_inputs = ch_mpileup_inputs_bams .map { - meta, geno, snp, ind -> - [ meta + [ genotyper: "pileupcaller" ], geno , snp, ind ] + addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) + // do not run if no bed file is provided + .filter { it[7] != []} + .multiMap { + ignore_me, combo_meta, bams, ref_meta, fasta, fai, dict, bed, snp -> + def bedfile = bed != "" ? bed : [] + bams: [ combo_meta, bams, bedfile ] + fasta: [ fasta ] } - ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) - // Calculate coverage stats for collected eigenstrat dataset - EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE( - ch_pileupcaller_genotypes - ) - ch_eigenstrat_coverage_stats = EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.tsv - ch_versions = ch_versions.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.json ) + + SAMTOOLS_MPILEUP_PILEUPCALLER( + ch_mpileup_inputs.bams, + ch_mpileup_inputs.fasta, + ) + ch_versions = ch_versions.mix( SAMTOOLS_MPILEUP_PILEUPCALLER.out.versions.first() ) + + ch_pileupcaller_input = SAMTOOLS_MPILEUP_PILEUPCALLER.out.mpileup + .map { + addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) + .multiMap { + ignore_me, meta, mpileup, ref_meta, fasta, fai, dict, bed, snp -> + // def snpfile = snp != "" ? snp : [] + mpileup: [ meta, mpileup ] + snpfile: snp + } + + // Run PileupCaller + SEQUENCETOOLS_PILEUPCALLER( + ch_pileupcaller_input.mpileup, + ch_pileupcaller_input.snpfile, + [] + ) + ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) + + // Merge/rename genotyping datasets + ch_final_genotypes = SEQUENCETOOLS_PILEUPCALLER.out.eigenstrat + .map { + addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .groupTuple() + .map { + combo_meta, metas, geno, snp, ind -> + [ combo_meta, geno, snp, ind ] + } + + COLLECT_GENOTYPES( ch_final_genotypes ) + // Add genotyper info to the meta + ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected + .map { + meta, geno, snp, ind -> + [ meta + [ genotyper: "pileupcaller" ], geno , snp, ind ] + } + ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) + + // Calculate coverage stats for collected eigenstrat dataset + EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE( + ch_pileupcaller_genotypes + ) + ch_eigenstrat_coverage_stats = EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.tsv + ch_versions = ch_versions.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.json ) } if ( params.genotyping_tool == 'ug' ) { @@ -401,6 +442,8 @@ workflow GENOTYPE { fasta: [ ref_meta, fasta ] } + ch_input_for_angsd.bam.view() + ANGSD_GL( ch_input_for_angsd.bam, ch_input_for_angsd.fasta, diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index dfd667810..208ad13a8 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -243,7 +243,7 @@ def validateInputParameters() { if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } if ( ( params.genotyping_source == 'rescaled' || params.genotyping_source == 'pmd_timmed' || params.genotyping_source == 'pmd' ) && ! params.genotyping_use_unmerged_libraries ) { log.warn("[nf-core/eager] WARNING: Combining multiple libraries with rescaled damage for genotyping may be inappropriate!") } - if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } + // if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } // metagenomics diff --git a/workflows/eager.nf b/workflows/eager.nf index d4d4215b2..203e8d0ee 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -540,6 +540,9 @@ workflow EAGER { if ( params.genotyping_use_unmerged_libraries ) { // Get UNMERGED data from either initial mapping (post deduplication/filtering, if done), or post-damage manipulation after deduplication/filtering (if done) -- Note: the .out channels include libraries that are not damage manipulated (eg UDG-Full)) ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : params.genotyping_source == 'pmd_trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + + // ch_bams_for_genotyping.view() + } else { // Genotyping done on MERGED data, regardless of UDG-treatment vs not; damage manipulation per-library! @@ -553,7 +556,7 @@ workflow EAGER { [meta, fasta, fai, dict] } - ch_reference_for_genotyping.view() + // ch_reference_for_genotyping.view() GENOTYPE( ch_bams_for_genotyping, From 426e4cddc6ec93a47151d93b18a41f3b666fb6eb Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 28 Mar 2025 12:05:35 +0000 Subject: [PATCH 12/18] module import for addreadgroups --- conf/modules.config | 17 ++++ modules.json | 5 + .../addorreplacereadgroups/environment.yml | 7 ++ .../picard/addorreplacereadgroups/main.nf | 65 +++++++++++++ .../picard/addorreplacereadgroups/meta.yml | 93 +++++++++++++++++++ .../addorreplacereadgroups/tests/bam.config | 13 +++ .../addorreplacereadgroups/tests/cram.config | 13 +++ .../addorreplacereadgroups/tests/main.nf.test | 93 +++++++++++++++++++ .../tests/main.nf.test.snap | 74 +++++++++++++++ .../addorreplacereadgroups/tests/tags.yml | 2 + 10 files changed, 382 insertions(+) create mode 100644 modules/nf-core/picard/addorreplacereadgroups/environment.yml create mode 100644 modules/nf-core/picard/addorreplacereadgroups/main.nf create mode 100644 modules/nf-core/picard/addorreplacereadgroups/meta.yml create mode 100644 modules/nf-core/picard/addorreplacereadgroups/tests/bam.config create mode 100644 modules/nf-core/picard/addorreplacereadgroups/tests/cram.config create mode 100644 modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test create mode 100644 modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap create mode 100644 modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml diff --git a/conf/modules.config b/conf/modules.config index 1c8eff35f..cba20d1ec 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1585,6 +1585,23 @@ process { // // GENOTYPING // + withName: PICARD_ADDORREPLACEREADGROUPS { + tag = { "${meta.reference}|${meta.library_id}" } + ext.args ={ + se_pe_string = meta.single_end ? "SE" : "PE" + [ + "--RGID ILLUMINA-${meta.library_id}", + "--RGSM ${meta.library_id}", + "--RGLB ${meta.library_id}", + "--RGPL illumina", + "--RGPU ILLUMINA-${meta.sample_id}_${meta.library_id}-${meta.strandedness}_stranded-${se_pe_string}" + ].join(' ').trim() } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_unique_rgid" } + publishDir = [ + enabled: false + ] + } + withName: SAMTOOLS_MPILEUP_PILEUPCALLER { tag = { "${meta.reference}|${meta.strandedness}" } ext.args = [ diff --git a/modules.json b/modules.json index fcd9e7240..d1ff1f777 100644 --- a/modules.json +++ b/modules.json @@ -225,6 +225,11 @@ "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", "installed_by": ["modules"] }, + "picard/addorreplacereadgroups": { + "branch": "master", + "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", + "installed_by": ["modules"] + }, "picard/createsequencedictionary": { "branch": "master", "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", diff --git a/modules/nf-core/picard/addorreplacereadgroups/environment.yml b/modules/nf-core/picard/addorreplacereadgroups/environment.yml new file mode 100644 index 000000000..8f34e975c --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::picard=3.3.0 diff --git a/modules/nf-core/picard/addorreplacereadgroups/main.nf b/modules/nf-core/picard/addorreplacereadgroups/main.nf new file mode 100644 index 000000000..5a71e085a --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/main.nf @@ -0,0 +1,65 @@ +process PICARD_ADDORREPLACEREADGROUPS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/picard:3.3.0--hdfd78af_0' : + 'biocontainers/picard:3.3.0--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_index) + + output: + tuple val(meta), path("*.bam") , emit: bam, optional: true + tuple val(meta), path("*.bai") , emit: bai, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: "${reads.getExtension()}" + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard AddOrReplaceReadGroups] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + if ("$reads" == "${prefix}.${suffix}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + picard \\ + -Xmx${avail_mem}M \\ + AddOrReplaceReadGroups \\ + $args \\ + $reference \\ + --INPUT ${reads} \\ + --OUTPUT ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(picard AddOrReplaceReadGroups --version 2>&1 | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: "${reads.getExtension()}" + if ("$reads" == "${prefix}.${suffix}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(picard AddOrReplaceReadGroups --version 2>&1 | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/meta.yml b/modules/nf-core/picard/addorreplacereadgroups/meta.yml new file mode 100644 index 000000000..77ce85035 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/meta.yml @@ -0,0 +1,93 @@ +name: picard_addorreplacereadgroups +description: Assigns all the reads in a file to a single new read-group +keywords: + - add + - replace + - read-group + - picard +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037226472-AddOrReplaceReadGroups-Picard- + tool_dev_url: https://github.com/broadinstitute/picard + licence: ["MIT"] + identifier: biotools:picard_tools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Sequence reads file, can be SAM/BAM/CRAM format + pattern: "*.{bam,cram,sam}" + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Reference genome file + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + - - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta_index: + type: file + description: Reference genome index file + pattern: "*.{fai,fasta.fai,fa.fai,fasta.gz.fai,fa.gz.fai}" +output: + - bam: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: Output BAM file + pattern: "*.{bam}" + - bai: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bai": + type: file + description: An optional BAM index file + pattern: "*.{bai}" + - cram: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.cram": + type: file + description: Output CRAM file + pattern: "*.{cram}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sateeshperi" + - "@mjcipriano" + - "@hseabolt" + - "@cmatKhan" + - "@muffato" +maintainers: + - "@sateeshperi" + - "@mjcipriano" + - "@hseabolt" + - "@cmatKhan" + - "@muffato" diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/bam.config b/modules/nf-core/picard/addorreplacereadgroups/tests/bam.config new file mode 100644 index 000000000..3f37c2fd9 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/bam.config @@ -0,0 +1,13 @@ +process { + withName: 'PICARD_ADDORREPLACEREADGROUPS'{ + ext.prefix = { "${meta.id}.replaced"} + ext.args = {[ + "--CREATE_INDEX", + "-LB ${meta.id}", + "-PL ILLUMINA", + "-PU bc1", + "-SM ${meta.id}" + ].join(' ').trim()} + } + +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/cram.config b/modules/nf-core/picard/addorreplacereadgroups/tests/cram.config new file mode 100644 index 000000000..966c14d79 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/cram.config @@ -0,0 +1,13 @@ +process { + withName: 'PICARD_ADDORREPLACEREADGROUPS'{ + ext.prefix = { "${meta.id}.replaced"} + ext.args = {[ + "-LB ${meta.id}", + "-PL ILLUMINA", + "-PU bc1", + "-SM ${meta.id}" + ].join(' ').trim()} + ext.suffix = { "cram" } + } + +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test new file mode 100644 index 000000000..9c029354f --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test @@ -0,0 +1,93 @@ + +nextflow_process { + + name "Test Process PICARD_ADDORREPLACEREADGROUPS" + script "../main.nf" + process "PICARD_ADDORREPLACEREADGROUPS" + + tag "modules" + tag "modules_nfcore" + tag "picard" + tag "picard/addorreplacereadgroups" + + test("sarscov2 - bam") { + config "./bam.config" + + when { + process { + """ + input[0] = [ [:], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) ] + input[1] = [ [:], [] ] + input[2] = [ [:], [] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.bai[0][1]).name, + process.out.versions + ).match() + }, + ) + } + + } + + test("homo_sapiens - cram") { + config "./cram.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), ] + ] + input[1] = [ [:], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + input[2] = [ [:], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.cram[0][1]).name, + process.out.versions + ).match() + }, + ) + } + + } + + test("sarscov2 - bam - stub") { + config "./bam.config" + options "-stub" + + when { + process { + """ + input[0] = [ [:], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) ] + input[1] = [ [:], [] ] + input[2] = [ [:], [] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap new file mode 100644 index 000000000..3ae6b9171 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap @@ -0,0 +1,74 @@ +{ + "homo_sapiens - cram": { + "content": [ + "test.replaced.cram", + [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:13:23.115450366" + }, + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + + }, + "null.replaced.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ], + "bai": [ + + ], + "bam": [ + [ + { + + }, + "null.replaced.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram": [ + + ], + "versions": [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:13:47.23694937" + }, + "sarscov2 - bam": { + "content": [ + "null.replaced.bam", + "null.replaced.bai", + [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:12:53.083677142" + } +} \ No newline at end of file diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml b/modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml new file mode 100644 index 000000000..733010d62 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml @@ -0,0 +1,2 @@ +picard/addorreplacereadgroups: + - "modules/nf-core/picard/addorreplacereadgroups/**" From 199dcd5a631ea2d8d34fbd5210eb18d05e06dff1 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 4 Apr 2025 08:53:15 +0000 Subject: [PATCH 13/18] adjusted creation of angsd input channel for unmerged libs --- subworkflows/local/genotype.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 123f74d05..2d86e60a3 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -409,7 +409,8 @@ workflow GENOTYPE { .map { combo_meta, metas, bams, bais -> def new_map = [:] - def ids = metas.collect { meta -> meta.sample_id } + // ids will either be sampleID or libraryID depending on desired input for genotyping (default is use meta.sample_id, for merged sample data) + def ids = params.genotyping_use_unmerged_libraries ? metas.collect { meta -> meta.library_id } : metas.collect { meta -> meta.sample_id } def strandedness = metas.collect { meta -> meta.strandedness } def single_ends = metas.collect { meta -> meta.single_end } def reference = combo_meta.reference From c1b41cf37b0e21e18026ac87bda9efe9a6f61ab5 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 4 Apr 2025 09:53:37 +0000 Subject: [PATCH 14/18] cleaning of development notes, views --- subworkflows/local/genotype.nf | 11 ----------- workflows/eager.nf | 8 +------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 2d86e60a3..c45e1474d 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -36,13 +36,6 @@ workflow GENOTYPE { if ( params.genotyping_tool == 'pileupcaller' ) { - // TODO: if running per-library, necessary to update readgroups - // reassign readgroups to be unique per library (eg rewrite any SM field from sample to SAMPLEID_LIBID) - // some tools (eg pileupcaller) break when multiple .bams input have the same SM field for read groups - // PICARD_ADDORREPLACEREADGROUPS(ch_bams_for_genotyping) - // ch_bams_for_genotyping = PICARD_ADDORREPLACEREADGROUPS.out - - // Compile together all reference based files ch_refs_prep = ch_fasta_plus // Because aux files are optional, the channel can be [[],[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. @@ -110,8 +103,6 @@ workflow GENOTYPE { } // Collect all IDs into a list in meta.sample_id. Useful when running pileupCaller later } - ch_mpileup_inputs_bams.view() - // Combine prepped bams and references ch_mpileup_inputs = ch_mpileup_inputs_bams .map { @@ -443,8 +434,6 @@ workflow GENOTYPE { fasta: [ ref_meta, fasta ] } - ch_input_for_angsd.bam.view() - ANGSD_GL( ch_input_for_angsd.bam, ch_input_for_angsd.fasta, diff --git a/workflows/eager.nf b/workflows/eager.nf index 203e8d0ee..9bb1bd444 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -540,9 +540,6 @@ workflow EAGER { if ( params.genotyping_use_unmerged_libraries ) { // Get UNMERGED data from either initial mapping (post deduplication/filtering, if done), or post-damage manipulation after deduplication/filtering (if done) -- Note: the .out channels include libraries that are not damage manipulated (eg UDG-Full)) ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : params.genotyping_source == 'pmd_trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams - - // ch_bams_for_genotyping.view() - } else { // Genotyping done on MERGED data, regardless of UDG-treatment vs not; damage manipulation per-library! @@ -550,14 +547,11 @@ workflow EAGER { ch_bams_for_genotyping = params.genotyping_source != 'raw' ? MANIPULATE_DAMAGE.out.merged_bams.filter{ it[0]['damage_manipulation'] == params.genotyping_source } : ch_merged_dedup_bams } - // ch_bams_for_genotyping.view() - + // consistent ref indexing regardless of merged/unmerged data ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference.map { meta, fasta, fai, dict, mapindex -> [meta, fasta, fai, dict] } - // ch_reference_for_genotyping.view() - GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, From 63ba76fcd24ed30eb814338ce7a6d595e9d05e6a Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 4 Apr 2025 09:53:52 +0000 Subject: [PATCH 15/18] fixed typo in file output naming --- conf/modules.config | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index cba20d1ec..5f77fb2ac 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1662,7 +1662,7 @@ process { ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}_realigntarget" : "${sample_id}_${meta.reference}_realigntarget" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}_realigntarget" : "${meta.sample_id}_${meta.reference}_realigntarget" } publishDir = [ enabled: false ] @@ -1673,7 +1673,7 @@ process { ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}_realigned" : "${sample_id}_${meta.reference}_realigned" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}_realigned" : "${meta.sample_id}_${meta.reference}_realigned" } publishDir = [ [ // data @@ -1696,7 +1696,7 @@ process { "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() } - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1710,7 +1710,7 @@ process { withName: BCFTOOLS_INDEX_UG { tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = "--tbi" //tbi indices for consistency with GATK HC - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1730,7 +1730,7 @@ process { "--output-mode ${params.genotyping_gatk_hc_out_mode}", "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", ].join(' ').trim() } - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1748,7 +1748,7 @@ process { "-C ${params.genotyping_freebayes_min_alternate_count}", params.genotyping_freebayes_skip_coverage != 0 ? "-g ${params.genotyping_freebayes_skip_coverage}" : "", ].join(' ').trim() } - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1762,7 +1762,7 @@ process { withName: BCFTOOLS_INDEX_FREEBAYES { tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = "--tbi" //tbi indices for consistency with GATK HC - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ // data @@ -1775,7 +1775,7 @@ process { withName: BCFTOOLS_STATS_GENOTYPING { tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } - ext.prefix = { params.genotyping_use_unmerged_libraries ? "${sample_id}_${meta.library_id}_${meta.reference}" : "${sample_id}_${meta.reference}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ // stats From 59d095fa88d3ffcbc148494f40ec607880a570dd Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 25 Apr 2025 09:02:18 +0000 Subject: [PATCH 16/18] implemented changes from PR review --- assets/schema_fasta.json | 1 + conf/modules.config | 24 +++++++++++++----------- subworkflows/local/genotype.nf | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/assets/schema_fasta.json b/assets/schema_fasta.json index cbdd4639f..c67f53829 100644 --- a/assets/schema_fasta.json +++ b/assets/schema_fasta.json @@ -124,6 +124,7 @@ "genotyping_reference_ploidy": { "type": "integer", "meta": ["genotyping_ploidy"], + "default": 2, "errorMessage": "Organism ploidy for GATK or FreeBayes must be provided as integers." }, "genotyping_gatk_dbsnp": { diff --git a/conf/modules.config b/conf/modules.config index 0d9e602dc..a5cb5f8f2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1203,7 +1203,7 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_RESCALED { - tag = { "${meta.reference}|${meta.damage_manipulation}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}|${meta.damage_manipulation}" } ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ @@ -1235,7 +1235,7 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_FILTERED { - tag = { "${meta.reference}|${meta.damage_manipulation}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}|${meta.damage_manipulation}" } ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ @@ -1277,7 +1277,7 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_TRIMMED { - tag = { "${meta.reference}|${meta.damage_manipulation}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}|${meta.damage_manipulation}" } ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ @@ -1291,7 +1291,7 @@ process { withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_MERGE_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}_unsorted" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}_unsorted" } publishDir = [ enabled: false ] @@ -1299,7 +1299,7 @@ process { withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_SORT_MERGED_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}" } publishDir = [ [ // data @@ -1313,7 +1313,7 @@ process { withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_INDEX_MERGED_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } ext.args = { params.fasta_largeref ? "-c" : "" } - ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}" } publishDir = [ [ // data @@ -1326,7 +1326,7 @@ process { withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES" { tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}_${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}" } publishDir = [ [ // stats @@ -1715,8 +1715,8 @@ process { withName: GATK_UNIFIEDGENOTYPER { tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = {[ - // ploidy set at 2 for non-specified ploidy in input fasta sheet for consistency with gatk default - meta2.ploidy ? "--sample_ploidy ${meta2.ploidy}" : "--sample_ploidy 2", + // ploidy set at 2 for non-specified ploidy by schema_fasta.json (checks input fasta sheet) for consistency with gatk default + "--sample_ploidy ${meta2.ploidy}", "-stand_call_conf ${params.genotyping_gatk_call_conf}", "-dcov ${params.genotyping_gatk_ug_downsample}", "--output_mode ${params.genotyping_gatk_ug_out_mode}", @@ -1752,7 +1752,8 @@ process { tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = {[ // Option names have changed from underscore_separated to hyphen-separated in GATK4 - meta2.ploidy ? "--sample-ploidy ${meta2.ploidy}" : "--sample-ploidy 2", + // ploidy set at 2 for non-specified ploidy by schema_fasta.json (checks input fasta sheet) for consistency with gatk default + "--sample-ploidy ${meta2.ploidy}", "-stand-call-conf ${params.genotyping_gatk_call_conf}", "--output-mode ${params.genotyping_gatk_hc_out_mode}", "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", @@ -1771,7 +1772,8 @@ process { withName: FREEBAYES { tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } ext.args = {[ - ref_meta.ploidy ? "--ploidy ${ref_meta.ploidy}" : "--ploidy 2", + // ploidy set at 2 for non-specified ploidy by schema_fasta.json (checks input fasta sheet) for consistency with freebayes default + "--ploidy ${ref_meta.ploidy}", "-C ${params.genotyping_freebayes_min_alternate_count}", params.genotyping_freebayes_skip_coverage != 0 ? "-g ${params.genotyping_freebayes_skip_coverage}" : "", ].join(' ').trim() } diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index c45e1474d..6bb6f9c71 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -405,7 +405,7 @@ workflow GENOTYPE { def strandedness = metas.collect { meta -> meta.strandedness } def single_ends = metas.collect { meta -> meta.single_end } def reference = combo_meta.reference - new_meta = [ sample_id: ids, strandedness: strandedness, single_end: single_ends, reference: reference ] + def new_meta = [ sample_id: ids, strandedness: strandedness, single_end: single_ends, reference: reference ] [ combo_meta, new_meta, bams, bais ] // Drop bais } // Collect all IDs into a list in meta.sample_id. From 3216e3b5ca30a98676b990f2595d7faaede0ba6e Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 25 Apr 2025 09:52:45 +0000 Subject: [PATCH 17/18] added explanation of behavior when declaring both command line and input sheet ploidy --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 11b3bbf23..0b508fd0d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1096,7 +1096,7 @@ "type": "integer", "default": 2, "description": "Specify the ploidy of the reference organism.", - "help_text": "Specify the desired ploidy value of your reference organism for genotyping with GATK or FreeBayes. E.g. if you want to allow heterozygous calls this value should be >= 2.\n\n> Modifies GATK UnifiedGenotyper parameter: `--sample_ploidy`\n> Modifies GATK HaplotypeCaller parameter: `--sample-ploidy`\n> Modifies FreeBayes parameter: `-p`", + "help_text": "Specify the desired ploidy value of your reference organism for genotyping with GATK or FreeBayes. E.g. if you want to allow heterozygous calls this value should be >= 2.\n\n> Modifies GATK UnifiedGenotyper parameter: `--sample_ploidy`\n> Modifies GATK HaplotypeCaller parameter: `--sample-ploidy`\n> Modifies FreeBayes parameter: `-p`.\n\n Note: Note: ploidy in the input fasta list take priority", "fa_icon": "fas fa-pastafarianism" }, "genotyping_pileupcaller_min_base_quality": { From 6cb1598c94f4b54cd2f4386087c35e1db7df3404 Mon Sep 17 00:00:00 2001 From: Ian Light Date: Fri, 16 May 2025 09:33:14 +0000 Subject: [PATCH 18/18] update usage --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 0264f9dad..80f474adb 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -102,7 +102,7 @@ Only the `reference_name`, and `fasta` columns are mandatory, whereas all other | pmdtools_bed_for_masking | No | Optional path to SNP capture BED file to mask the reference for PMDtools | | sexdeterrmine_snp_bed | No | Optional path to SNP capture bed files for genetic sex estimation with SexDetERRmine | | bedtools_feature_file | No | Optional path to feature file for coverage calculation with bedtools | -| genotyping_reference_ploidy | No | Optional integer to specify organism ploidy for genotyping with GATK or FreeBayes | +| genotyping_reference_ploidy | No | Optional integer to specify organism ploidy for genotyping with GATK or FreeBayes (by default set to 2) | | genotyping_gatk_dbsnp | No | Optional path to SNP annotation file for genotyping with GATK | Files for `fai`, `dict`, `mapper_index` will be generated by the pipeline for you if not specified.