diff --git a/assets/schema_fasta.json b/assets/schema_fasta.json index cbdd4639f..c67f53829 100644 --- a/assets/schema_fasta.json +++ b/assets/schema_fasta.json @@ -124,6 +124,7 @@ "genotyping_reference_ploidy": { "type": "integer", "meta": ["genotyping_ploidy"], + "default": 2, "errorMessage": "Organism ploidy for GATK or FreeBayes must be provided as integers." }, "genotyping_gatk_dbsnp": { diff --git a/conf/modules.config b/conf/modules.config index 5c4ba94be..daf546419 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -512,6 +512,7 @@ process { ] } +// TODO: add readgroup split for using unmerged libs for genotyping withName: 'BWA_SAMSE|BWA_SAMPE' { tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { @@ -535,6 +536,7 @@ process { ] } +// TODO: add readgroup split for using unmerged libs for genotyping withName: BWA_MEM { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { @@ -551,6 +553,7 @@ process { ] } +// TODO: add readgroup split for using unmerged libs for genotyping withName: BOWTIE2_ALIGN { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.args = { @@ -1171,8 +1174,8 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_RESCALED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.args = { params.fasta_largeref ? "-c" : "" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}|${meta.damage_manipulation}" } + ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ path: { "${params.outdir}/damage_manipulation/mapdamage2/data/" }, @@ -1203,8 +1206,8 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_FILTERED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.args = { params.fasta_largeref ? "-c" : "" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}|${meta.damage_manipulation}" } + ext.args = { params.fasta_largeref ? 
"-c" : "" } publishDir = [ [ path: { "${params.outdir}/damage_manipulation/pmdtools/data/" }, @@ -1242,8 +1245,8 @@ process { } withName: SAMTOOLS_INDEX_DAMAGE_TRIMMED { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.args = { params.fasta_largeref ? "-c" : "" } + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}|${meta.damage_manipulation}" } + ext.args = { params.fasta_largeref ? "-c" : "" } publishDir = [ [ path: { "${params.outdir}/damage_manipulation/bamutils_trimbam/data/" }, @@ -1253,6 +1256,54 @@ process { ] } + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_MERGE_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}_unsorted" } + publishDir = [ + enabled: false + ] + } + + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_SORT_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}" } + publishDir = [ + [ + // data + path: { "${params.outdir}/final_bams/${meta.damage_manipulation}/data/" }, + mode: params.publish_dir_mode, + pattern: '*.bam' + ] + ] + } + + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_INDEX_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.args = { params.fasta_largeref ? 
"-c" : "" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}" } + publishDir = [ + [ + // data + path: { "${params.outdir}/final_bams/${meta.damage_manipulation}/data/" }, + mode: params.publish_dir_mode, + pattern: '*.{bai,csi}' + ] + ] + } + + withName: ".*MERGE_LIBRARIES_DAMAGE_MANIPULATION:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES" { + tag = { "${meta.reference}|${meta.sample_id}|${meta.damage_manipulation}" } + ext.prefix = { "${meta.sample_id}_${meta.damage_manipulation}_${meta.reference}" } + publishDir = [ + [ + // stats + path: { "${params.outdir}/final_bams/${meta.damage_manipulation}/stats/" }, + mode: params.publish_dir_mode, + pattern: '*.flagstat' + ] + ] + } + // // METAGENOMIC SCREENING // @@ -1502,54 +1553,27 @@ process { ] } - withName: '.*MERGE_LIBRARIES:SAMTOOLS_MERGE_LIBRARIES' { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}_unsorted" } - publishDir = [ - enabled: false - ] - } - - withName: '.*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_SORT_MERGED_LIBRARIES' { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } - publishDir = [ - [ - path: { "${params.outdir}/final_bams/${params.genotyping_source}/data/" }, - mode: params.publish_dir_mode, - pattern: '*.bam', - ] - ] - } - withName: '.*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_INDEX_MERGED_LIBRARIES' { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = { params.fasta_largeref ? "-c" : "" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } - publishDir = [ + // + // GENOTYPING + // + withName: PICARD_ADDORREPLACEREADGROUPS { + tag = { "${meta.reference}|${meta.library_id}" } + ext.args ={ + se_pe_string = meta.single_end ? 
"SE" : "PE" [ - path: { "${params.outdir}/final_bams/${params.genotyping_source}/data/" }, - mode: params.publish_dir_mode, - pattern: '*.{bai,csi}', - ] - ] - } - - withName: '.*MERGE_LIBRARIES_GENOTYPING:SAMTOOLS_FLAGSTAT_MERGED_LIBRARIES' { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + "--RGID ILLUMINA-${meta.library_id}", + "--RGSM ${meta.library_id}", + "--RGLB ${meta.library_id}", + "--RGPL illumina", + "--RGPU ILLUMINA-${meta.sample_id}_${meta.library_id}-${meta.strandedness}_stranded-${se_pe_string}" + ].join(' ').trim() } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_unique_rgid" } publishDir = [ - [ - path: { "${params.outdir}/final_bams/${params.genotyping_source}/stats/" }, - mode: params.publish_dir_mode, - pattern: '*.flagstat', - ] + enabled: false ] } - // - // GENOTYPING - // withName: SAMTOOLS_MPILEUP_PILEUPCALLER { tag = { "${meta.reference}|${meta.strandedness}" } ext.args = [ @@ -1607,22 +1631,22 @@ process { } withName: GATK_REALIGNERTARGETCREATOR { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = [ - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "" + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = [ + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.reference}_realigntarget" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? 
"${meta.sample_id}_${meta.library_id}_${meta.reference}_realigntarget" : "${meta.sample_id}_${meta.reference}_realigntarget" } publishDir = [ enabled: false ] } withName: GATK_INDELREALIGNER { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = [ - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "" + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = [ + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.reference}_realigned" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}_realigned" : "${meta.sample_id}_${meta.reference}_realigned" } publishDir = [ [ path: { "${params.outdir}/genotyping/gatk_ug/data/" }, @@ -1634,18 +1658,17 @@ process { } withName: GATK_UNIFIEDGENOTYPER { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = { - [ - meta2.ploidy ? "--sample_ploidy ${meta2.ploidy}" : "--sample_ploidy 2", - "-stand_call_conf ${params.genotyping_gatk_call_conf}", - "-dcov ${params.genotyping_gatk_ug_downsample}", - "--output_mode ${params.genotyping_gatk_ug_out_mode}", - "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", - ].join(' ').trim() - } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? 
"${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = {[ + // ploidy set at 2 for non-specified ploidy by schema_fasta.json (checks input fasta sheet) for consistency with gatk default + "--sample_ploidy ${meta2.ploidy}", + "-stand_call_conf ${params.genotyping_gatk_call_conf}", + "-dcov ${params.genotyping_gatk_ug_downsample}", + "--output_mode ${params.genotyping_gatk_ug_out_mode}", + "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. + ].join(' ').trim() } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ path: { "${params.outdir}/genotyping/gatk_ug/data/" }, @@ -1656,9 +1679,9 @@ process { } withName: BCFTOOLS_INDEX_UG { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = "--tbi" - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = "--tbi" //tbi indices for consistency with GATK HC + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ path: { "${params.outdir}/genotyping/gatk_ug/data/" }, @@ -1669,16 +1692,16 @@ process { } withName: GATK4_HAPLOTYPECALLER { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = { - [ - meta2.ploidy ? 
"--sample-ploidy ${meta2.ploidy}" : "--sample-ploidy 2", - "-stand-call-conf ${params.genotyping_gatk_call_conf}", - "--output-mode ${params.genotyping_gatk_hc_out_mode}", - "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", - ].join(' ').trim() - } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = {[ + // Option names have changed from underscore_separated to hyphen-separated in GATK4 + // ploidy set at 2 for non-specified ploidy by schema_fasta.json (checks input fasta sheet) for consistency with gatk default + "--sample-ploidy ${meta2.ploidy}", + "-stand-call-conf ${params.genotyping_gatk_call_conf}", + "--output-mode ${params.genotyping_gatk_hc_out_mode}", + "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", + ].join(' ').trim() } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ path: { "${params.outdir}/genotyping/gatk_hc/data/" }, @@ -1689,15 +1712,14 @@ process { } withName: FREEBAYES { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = { - [ - ref_meta.ploidy ? "-p ${ref_meta.ploidy}" : '', - "-C ${params.genotyping_freebayes_min_alternate_count}", - params.genotyping_freebayes_skip_coverage != 0 ? "-g ${params.genotyping_freebayes_skip_coverage}" : "", - ].join(' ').trim() - } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? 
"${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = {[ + // ploidy set at 2 for non-specified ploidy by schema_fasta.json (checks input fasta sheet) for consistency with freebayes default + "--ploidy ${ref_meta.ploidy}", + "-C ${params.genotyping_freebayes_min_alternate_count}", + params.genotyping_freebayes_skip_coverage != 0 ? "-g ${params.genotyping_freebayes_skip_coverage}" : "", + ].join(' ').trim() } + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ path: { "${params.outdir}/genotyping/freebayes/data/" }, @@ -1708,9 +1730,9 @@ process { } withName: BCFTOOLS_INDEX_FREEBAYES { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = "--tbi" - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.args = "--tbi" //tbi indices for consistency with GATK HC + ext.prefix = { params.genotyping_use_unmerged_libraries ? "${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ path: { "${params.outdir}/genotyping/freebayes/data/" }, @@ -1721,8 +1743,8 @@ process { } withName: BCFTOOLS_STATS_GENOTYPING { - tag = { "${meta.reference}|${meta.sample_id}" } - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + tag = { params.genotyping_use_unmerged_libraries ? "${meta.reference}|${meta.sample_id}|${meta.library_id}" : "${meta.reference}|${meta.sample_id}" } + ext.prefix = { params.genotyping_use_unmerged_libraries ? 
"${meta.sample_id}_${meta.library_id}_${meta.reference}" : "${meta.sample_id}_${meta.reference}" } publishDir = [ [ path: { "${params.outdir}/genotyping/bcftools/stats/" }, diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 2bec03850..a1843b20f 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -991,7 +991,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Gatk UG on trimmed reads. Skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. No bcftools_stats file per VCF. ## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --run_trim_bam \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ @@ -1030,7 +1030,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Attempt to run Gatk HC on trimmed reads, without activating trimming. ## Expect: FAILURE. Cannot set genotyping source to ;trimmed' without trimming. 
-nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` @@ -1039,7 +1039,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Gatk HC on trimmed reads, with different out mode and emit confidence. Skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination. ## Checked .command.sh for correct args. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' --run_trim_bam -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' --run_trim_bam -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` @@ -1063,7 +1063,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Freebayes on trimmed reads. Different options, and skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination. ## Checked .command.sh for correct args. 
-nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'trimmed' -ansi-log false -dump-channels --genotyping_skip_bcftools_stats \ --run_trim_bam \ --genotyping_freebayes_skip_coverage 10 \ --genotyping_freebayes_min_alternate_count 2 \ diff --git a/docs/usage.md b/docs/usage.md index 0264f9dad..80f474adb 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -102,7 +102,7 @@ Only the `reference_name`, and `fasta` columns are mandatory, whereas all other | pmdtools_bed_for_masking | No | Optional path to SNP capture BED file to mask the reference for PMDtools | | sexdeterrmine_snp_bed | No | Optional path to SNP capture bed files for genetic sex estimation with SexDetERRmine | | bedtools_feature_file | No | Optional path to feature file for coverage calculation with bedtools | -| genotyping_reference_ploidy | No | Optional integer to specify organism ploidy for genotyping with GATK or FreeBayes | +| genotyping_reference_ploidy | No | Optional integer to specify organism ploidy for genotyping with GATK or FreeBayes (by default set to 2) | | genotyping_gatk_dbsnp | No | Optional path to SNP annotation file for genotyping with GATK | Files for `fai`, `dict`, `mapper_index` will be generated by the pipeline for you if not specified. 
diff --git a/modules.json b/modules.json index 293de7bc4..5f9b96d0f 100644 --- a/modules.json +++ b/modules.json @@ -235,6 +235,11 @@ "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", "installed_by": ["modules"] }, + "picard/addorreplacereadgroups": { + "branch": "master", + "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", + "installed_by": ["modules"] + }, "picard/createsequencedictionary": { "branch": "master", "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", diff --git a/modules/nf-core/picard/addorreplacereadgroups/environment.yml b/modules/nf-core/picard/addorreplacereadgroups/environment.yml new file mode 100644 index 000000000..8f34e975c --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::picard=3.3.0 diff --git a/modules/nf-core/picard/addorreplacereadgroups/main.nf b/modules/nf-core/picard/addorreplacereadgroups/main.nf new file mode 100644 index 000000000..5a71e085a --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/main.nf @@ -0,0 +1,65 @@ +process PICARD_ADDORREPLACEREADGROUPS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/picard:3.3.0--hdfd78af_0' : + 'biocontainers/picard:3.3.0--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_index) + + output: + tuple val(meta), path("*.bam") , emit: bam, optional: true + tuple val(meta), path("*.bai") , emit: bai, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: "${reads.getExtension()}" + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard AddOrReplaceReadGroups] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + if ("$reads" == "${prefix}.${suffix}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + """ + picard \\ + -Xmx${avail_mem}M \\ + AddOrReplaceReadGroups \\ + $args \\ + $reference \\ + --INPUT ${reads} \\ + --OUTPUT ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(picard AddOrReplaceReadGroups --version 2>&1 | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: "${reads.getExtension()}" + if ("$reads" == "${prefix}.${suffix}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(picard AddOrReplaceReadGroups --version 2>&1 | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/meta.yml b/modules/nf-core/picard/addorreplacereadgroups/meta.yml new file mode 100644 index 000000000..77ce85035 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/meta.yml @@ -0,0 +1,93 @@ +name: picard_addorreplacereadgroups +description: Assigns all the reads in a file to a single new read-group +keywords: + - add + - replace + - read-group + - picard +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037226472-AddOrReplaceReadGroups-Picard- + tool_dev_url: https://github.com/broadinstitute/picard + licence: ["MIT"] + identifier: biotools:picard_tools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Sequence reads file, can be SAM/BAM/CRAM format + pattern: "*.{bam,cram,sam}" + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Reference genome file + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + - - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta_index: + type: file + description: Reference genome index file + pattern: "*.{fai,fasta.fai,fa.fai,fasta.gz.fai,fa.gz.fai}" +output: + - bam: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.bam": + type: file + description: Output BAM file + pattern: "*.{bam}" + - bai: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bai": + type: file + description: An optional BAM index file + pattern: "*.{bai}" + - cram: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.cram": + type: file + description: Output CRAM file + pattern: "*.{cram}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sateeshperi" + - "@mjcipriano" + - "@hseabolt" + - "@cmatKhan" + - "@muffato" +maintainers: + - "@sateeshperi" + - "@mjcipriano" + - "@hseabolt" + - "@cmatKhan" + - "@muffato" diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/bam.config b/modules/nf-core/picard/addorreplacereadgroups/tests/bam.config new file mode 100644 index 000000000..3f37c2fd9 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/bam.config @@ -0,0 +1,13 @@ +process { + withName: 'PICARD_ADDORREPLACEREADGROUPS'{ + ext.prefix = { "${meta.id}.replaced"} + ext.args = {[ + "--CREATE_INDEX", + "-LB ${meta.id}", + "-PL ILLUMINA", + "-PU bc1", + "-SM ${meta.id}" + ].join(' ').trim()} + } + +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/cram.config b/modules/nf-core/picard/addorreplacereadgroups/tests/cram.config new file mode 100644 index 000000000..966c14d79 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/cram.config @@ -0,0 +1,13 @@ +process { + withName: 'PICARD_ADDORREPLACEREADGROUPS'{ + ext.prefix = { "${meta.id}.replaced"} + ext.args = {[ + "-LB ${meta.id}", + "-PL ILLUMINA", + "-PU bc1", + "-SM ${meta.id}" + ].join(' ').trim()} + ext.suffix = { "cram" } + } + +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test 
b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test new file mode 100644 index 000000000..9c029354f --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test @@ -0,0 +1,93 @@ + +nextflow_process { + + name "Test Process PICARD_ADDORREPLACEREADGROUPS" + script "../main.nf" + process "PICARD_ADDORREPLACEREADGROUPS" + + tag "modules" + tag "modules_nfcore" + tag "picard" + tag "picard/addorreplacereadgroups" + + test("sarscov2 - bam") { + config "./bam.config" + + when { + process { + """ + input[0] = [ [:], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) ] + input[1] = [ [:], [] ] + input[2] = [ [:], [] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.bai[0][1]).name, + process.out.versions + ).match() + }, + ) + } + + } + + test("homo_sapiens - cram") { + config "./cram.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), ] + ] + input[1] = [ [:], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + input[2] = [ [:], file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.cram[0][1]).name, + process.out.versions + ).match() + }, + ) + } + + } + + test("sarscov2 - bam - stub") { + config "./bam.config" + options "-stub" + + when { + process { + """ + input[0] = [ [:], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) ] + input[1] = [ [:], [] ] + input[2] = [ [:], [] ] + """ + } + } + + then { + assertAll( 
+ { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap new file mode 100644 index 000000000..3ae6b9171 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/main.nf.test.snap @@ -0,0 +1,74 @@ +{ + "homo_sapiens - cram": { + "content": [ + "test.replaced.cram", + [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:13:23.115450366" + }, + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + + }, + "null.replaced.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ], + "bai": [ + + ], + "bam": [ + [ + { + + }, + "null.replaced.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram": [ + + ], + "versions": [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:13:47.23694937" + }, + "sarscov2 - bam": { + "content": [ + "null.replaced.bam", + "null.replaced.bai", + [ + "versions.yml:md5,e82588e0fecbeafdff36e3b3001b3bca" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-18T10:12:53.083677142" + } +} \ No newline at end of file diff --git a/modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml b/modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml new file mode 100644 index 000000000..733010d62 --- /dev/null +++ b/modules/nf-core/picard/addorreplacereadgroups/tests/tags.yml @@ -0,0 +1,2 @@ +picard/addorreplacereadgroups: + - "modules/nf-core/picard/addorreplacereadgroups/**" diff --git a/nextflow.config b/nextflow.config index 3333c140c..4c418f3c4 100644 --- a/nextflow.config +++ 
b/nextflow.config @@ -252,8 +252,9 @@ params { // Genotyping run_genotyping = false genotyping_tool = null - genotyping_source = null - skip_bcftools_stats = false + genotyping_source = 'raw' + genotyping_use_unmerged_libraries = false + genotyping_skip_bcftools_stats = false genotyping_reference_ploidy = 2 genotyping_pileupcaller_min_base_quality = 30 genotyping_pileupcaller_min_map_quality = 30 diff --git a/nextflow_schema.json b/nextflow_schema.json index c69a1de19..25235c2fe 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1148,10 +1148,18 @@ }, "genotyping_source": { "type": "string", + "default": "raw", "description": "Specify which input BAM to use for genotyping.", - "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", + "help_text": "Specify which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. 
Options are: 'raw' (no damage manipulation - per-sample merged libraries, see Note below); 'pmd' (per-sample merged pmdtools output); 'trimmed' (for base-clipped BAMs); 'pmd_trimmed' (Base-clipped-PMD-filtered BAMs); 'rescaled' (mapDamage2 rescaling).\nNote: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if BAM filtering has been performed), or the deduplicated reads (if deduplication was performed).", "fa_icon": "fas fa-faucet", - "enum": ["raw", "pmd", "trimmed", "rescaled"] + "enum": ["raw", "pmd", "trimmed", "pmd_trimmed", "rescaled"] + }, + "genotyping_use_unmerged_libraries": { + "type": "boolean", + "fa_icon": "fas fa-merge", + "default": false, + "description": "Use unmerged mapped libraries for genotyping.", + "help_text": "Rather than merging all mapped libraries into a single file per sample, data for each library will be genotyped separately. \nThis may be advisable when data for a single sample has both UDG and non-UDG treated libraries, or when damage rescaling is done and different libraries can have distinct damage models." }, "genotyping_tool": { "type": "string", @@ -1160,7 +1168,7 @@ "help_text": "Specify which genotyper to use. Current options are: pileupCaller, ANGSD, GATK UnifiedGenotyper (v3.5), GATK HaplotypeCaller (v4) or FreeBayes.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK v3.5 it is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", "description": "Specify which genotyper to use."
}, - "skip_bcftools_stats": { + "genotyping_skip_bcftools_stats": { "type": "boolean", "fa_icon": "fas fa-forward", "description": "Specify to skip generation of VCF-based variant calling statistics with bcftools.", @@ -1170,7 +1178,7 @@ "type": "integer", "default": 2, "description": "Specify the ploidy of the reference organism.", - "help_text": "Specify the desired ploidy value of your reference organism for genotyping with GATK or FreeBayes. E.g. if you want to allow heterozygous calls this value should be >= 2.\n\n> Modifies GATK UnifiedGenotyper parameter: `--sample_ploidy`\n> Modifies GATK HaplotypeCaller parameter: `--sample_ploidy`\n> Modifies FreeBayes parameter: `-p`", + "help_text": "Specify the desired ploidy value of your reference organism for genotyping with GATK or FreeBayes. E.g. if you want to allow heterozygous calls this value should be >= 2.\n\n> Modifies GATK UnifiedGenotyper parameter: `--sample_ploidy`\n> Modifies GATK HaplotypeCaller parameter: `--sample-ploidy`\n> Modifies FreeBayes parameter: `-p`.\n\n Note: Note: ploidy in the input fasta list take priority", "fa_icon": "fas fa-pastafarianism" }, "genotyping_pileupcaller_min_base_quality": { diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index a2b5f7525..347d1d136 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -2,6 +2,7 @@ // Genotype the input data using the requested genotyper. 
// +include { PICARD_ADDORREPLACEREADGROUPS } from '../../modules/nf-core/picard/addorreplacereadgroups/main' include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../../modules/nf-core/samtools/mpileup/main' include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' include { SEQUENCETOOLS_PILEUPCALLER } from '../../modules/nf-core/sequencetools/pileupcaller/main' @@ -61,85 +62,116 @@ workflow GENOTYPE { } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, bed, snp ] // Prepare collect bams for mpileup - ch_mpileup_inputs_bams = ch_bam_bai + // Recreate Read Group headers if no genotyping input is library-by-library + if ( params.genotyping_use_unmerged_libraries ) { + ch_input_for_rewriting_readgroups = ch_bam_bai .map { - addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) + addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) + .multiMap { + ignore_me, combo_meta, bams, bais, ref_meta, fasta, fai, dict, bed, snp -> + bams: [ combo_meta, bams ] + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] } - .groupTuple() - .map { - combo_meta, metas, bams, bais -> - def ids = metas.collect { meta -> meta.sample_id } - [ combo_meta + [sample_id: ids], bams ] // Drop bais - } // Collect all IDs into a list in meta.sample_id. Useful when running pileupCaller later - - // Combine prepped bams and references - ch_mpileup_inputs = ch_mpileup_inputs_bams - .map { - addNewMetaFromAttributes( it, "reference", "reference" , false ) - } - .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) - // do not run if no bed file is provided - .filter { it[7] != []} - .multiMap { - ignore_me, combo_meta, bams, ref_meta, fasta, fai, dict, bed, snp -> - def bedfile = bed != "" ? 
bed : [] - bams: [ combo_meta, bams, bedfile ] - fasta: [ fasta ] - } - SAMTOOLS_MPILEUP_PILEUPCALLER( - ch_mpileup_inputs.bams, - ch_mpileup_inputs.fasta, - ) - ch_versions = ch_versions.mix( SAMTOOLS_MPILEUP_PILEUPCALLER.out.versions.first() ) + PICARD_ADDORREPLACEREADGROUPS(ch_input_for_rewriting_readgroups.bams, ch_input_for_rewriting_readgroups.fasta, ch_input_for_rewriting_readgroups.fai) - ch_pileupcaller_input = SAMTOOLS_MPILEUP_PILEUPCALLER.out.mpileup + ch_mpileup_inputs_bams = PICARD_ADDORREPLACEREADGROUPS.out.bam .map { - addNewMetaFromAttributes( it, "reference", "reference" , false ) + addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) } - .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) - .multiMap { - ignore_me, meta, mpileup, ref_meta, fasta, fai, dict, bed, snp -> - // def snpfile = snp != "" ? snp : [] - mpileup: [ meta, mpileup ] - snpfile: snp - } - - // Run PileupCaller - SEQUENCETOOLS_PILEUPCALLER( - ch_pileupcaller_input.mpileup, - ch_pileupcaller_input.snpfile, - [] - ) - ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) - - // Merge/rename genotyping datasets - ch_final_genotypes = SEQUENCETOOLS_PILEUPCALLER.out.eigenstrat + .groupTuple() + .map { + combo_meta, metas, bams -> + def ids = metas.collect { meta -> meta.library_id } + [ combo_meta + [sample_id: ids], bams ] + } // Collect all LIBRARY IDs into a list in meta.sample_id. 
Useful when running pileupCaller later + // distinct from running merged libraries as single sample -- libraries must be unique + } + else { + ch_mpileup_inputs_bams = ch_bam_bai .map { - addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) } .groupTuple() .map { - combo_meta, metas, geno, snp, ind -> - [ combo_meta, geno, snp, ind ] - } + combo_meta, metas, bams, bais -> + def ids = metas.collect { meta -> meta.sample_id } + [ combo_meta + [sample_id: ids], bams ] // Drop bais + } // Collect all IDs into a list in meta.sample_id. Useful when running pileupCaller later + } - COLLECT_GENOTYPES( ch_final_genotypes ) - // Add genotyper info to the meta - ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected + // Combine prepped bams and references + ch_mpileup_inputs = ch_mpileup_inputs_bams .map { - meta, geno, snp, ind -> - [ meta + [ genotyper: "pileupcaller" ], geno , snp, ind ] + addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) + // do not run if no bed file is provided + .filter { it[7] != []} + .multiMap { + ignore_me, combo_meta, bams, ref_meta, fasta, fai, dict, bed, snp -> + def bedfile = bed != "" ? 
bed : [] + bams: [ combo_meta, bams, bedfile ] + fasta: [ fasta ] } - ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) - // Calculate coverage stats for collected eigenstrat dataset - EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE( - ch_pileupcaller_genotypes - ) - ch_eigenstrat_coverage_stats = EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.tsv - ch_versions = ch_versions.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.versions.first() ) - ch_multiqc_files = ch_multiqc_files.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.json ) + + SAMTOOLS_MPILEUP_PILEUPCALLER( + ch_mpileup_inputs.bams, + ch_mpileup_inputs.fasta, + ) + ch_versions = ch_versions.mix( SAMTOOLS_MPILEUP_PILEUPCALLER.out.versions.first() ) + + ch_pileupcaller_input = SAMTOOLS_MPILEUP_PILEUPCALLER.out.mpileup + .map { + addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) + .multiMap { + ignore_me, meta, mpileup, ref_meta, fasta, fai, dict, bed, snp -> + // def snpfile = snp != "" ? 
snp : [] + mpileup: [ meta, mpileup ] + snpfile: snp + } + + // Run PileupCaller + SEQUENCETOOLS_PILEUPCALLER( + ch_pileupcaller_input.mpileup, + ch_pileupcaller_input.snpfile, + [] + ) + ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) + + // Merge/rename genotyping datasets + ch_final_genotypes = SEQUENCETOOLS_PILEUPCALLER.out.eigenstrat + .map { + addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .groupTuple() + .map { + combo_meta, metas, geno, snp, ind -> + [ combo_meta, geno, snp, ind ] + } + + COLLECT_GENOTYPES( ch_final_genotypes ) + // Add genotyper info to the meta + ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected + .map { + meta, geno, snp, ind -> + [ meta + [ genotyper: "pileupcaller" ], geno , snp, ind ] + } + ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) + + // Calculate coverage stats for collected eigenstrat dataset + EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE( + ch_pileupcaller_genotypes + ) + ch_eigenstrat_coverage_stats = EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.tsv + ch_versions = ch_versions.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.json ) } if ( params.genotyping_tool == 'ug' ) { @@ -368,11 +400,12 @@ workflow GENOTYPE { .map { combo_meta, metas, bams, bais -> def new_map = [:] - def ids = metas.collect { meta -> meta.sample_id } + // ids will either be sampleID or libraryID depending on desired input for genotyping (default is use meta.sample_id, for merged sample data) + def ids = params.genotyping_use_unmerged_libraries ? 
metas.collect { meta -> meta.library_id } : metas.collect { meta -> meta.sample_id } def strandedness = metas.collect { meta -> meta.strandedness } def single_ends = metas.collect { meta -> meta.single_end } def reference = combo_meta.reference - new_meta = [ sample_id: ids, strandedness: strandedness, single_end: single_ends, reference: reference ] + def new_meta = [ sample_id: ids, strandedness: strandedness, single_end: single_ends, reference: reference ] [ combo_meta, new_meta, bams, bais ] // Drop bais } // Collect all IDs into a list in meta.sample_id. @@ -418,7 +451,7 @@ workflow GENOTYPE { } // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes - if ( !params.skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || params.genotyping_tool == 'freebayes' ) ) { + if ( !params.genotyping_skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || params.genotyping_tool == 'freebayes' ) ) { ch_bcftools_input= ch_genotypes_vcf .map { addNewMetaFromAttributes( it, "reference" , "reference" , false ) diff --git a/subworkflows/local/manipulate_damage.nf b/subworkflows/local/manipulate_damage.nf index bc8d96cba..3e9966644 100644 --- a/subworkflows/local/manipulate_damage.nf +++ b/subworkflows/local/manipulate_damage.nf @@ -12,6 +12,7 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DAMAGE_RESCALED } from '../../ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DAMAGE_FILTERED } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_DAMAGE_TRIMMED } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED } from '../../modules/nf-core/samtools/flagstat/main' +include { MERGE_LIBRARIES as MERGE_LIBRARIES_DAMAGE_MANIPULATION } from '../../subworkflows/local/merge_libraries' // TODO: Add required channels and channel manipulations for reference-dependent bed masking before pmdtools. 
Requires multi-ref support before implementation. workflow MANIPULATE_DAMAGE { @@ -21,11 +22,14 @@ workflow MANIPULATE_DAMAGE { ch_pmd_masking // [ [ meta ], masked_fasta, bed_for_masking ] main: - ch_versions = Channel.empty() - ch_rescaled_bams = Channel.empty() - ch_pmd_filtered_bams = Channel.empty() - ch_trimmed_bams = Channel.empty() - ch_pmd_filtered_flagstat = Channel.empty() // Only run flagstat on pmd filtered bam, since rescaling and trimming does not change the number of reads + ch_versions = Channel.empty() + ch_rescaled_bams = Channel.empty() + ch_pmd_filtered_bams = Channel.empty() + ch_trimmed_bams = Channel.empty() + ch_merged_damage_manipulated_bams = Channel.empty() + ch_pmd_filtered_flagstat = Channel.empty() // Only run flagstat on pmd filtered bam, since rescaling and trimming does not change the number of reads + ch_multiqc_files = Channel.empty() + // Ensure correct reference is associated with each bam_bai pair ch_refs = ch_fasta @@ -58,7 +62,7 @@ workflow MANIPULATE_DAMAGE { ch_mapdamage_input = ch_mapdamage_prep.no_skip .multiMap { ignore_me, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam ] + bam: [ meta + [ 'damage_manipulation' : 'rescaled' ], bam ] fasta: fasta } @@ -69,7 +73,8 @@ workflow MANIPULATE_DAMAGE { ch_versions = ch_versions.mix( SAMTOOLS_INDEX_DAMAGE_RESCALED.out.versions.first() ) ch_rescaled_index = params.fasta_largeref ? SAMTOOLS_INDEX_DAMAGE_RESCALED.out.csi : SAMTOOLS_INDEX_DAMAGE_RESCALED.out.bai - // TODO When adding library-level data merging pre-genotyping, make sure that rescaled bams are not merged in any way as the underlying damage model could differ between libraries + // It may be inappropriate to merge different rescaled libraries (or rescaled and non-rescaled libs) for genotyping/downstream analysis + // Distinct libraries may have distinct damage models! ch_rescaled_bams = MAPDAMAGE2.out.rescaled.join(ch_rescaled_index) .mix(ch_skip_rescale) // Should these be mixed actually, or excluded? 
Might not make sense to take rescaled and non-rescaled bams togetehr for anything downstream. } @@ -118,7 +123,7 @@ workflow MANIPULATE_DAMAGE { .combine( ch_pmd_fastas, by: 0 ) // [ [combine_meta], [meta], bam, bai, [ref_meta] fasta ] .multiMap { combine_meta, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam, bai ] + bam: [ meta + [ 'damage_manipulation' : 'pmd' ] , bam, bai ] fasta: fasta } @@ -133,21 +138,24 @@ workflow MANIPULATE_DAMAGE { SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED( ch_pmd_filtered_bams ) ch_pmd_filtered_flagstat = SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.flagstat + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.flagstat ) ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.versions.first() ) } if ( params.run_trim_bam ) { - if ( params.run_pmd_filtering ) { + // only run trimming on pmd reads if used downstream in genotyping + // must be explicilty asked for, since other tools not combined like this (eg mapdamage + trimming) + if ( params.genotyping_source == 'pmd_trimmed' ) { ch_to_trim = ch_pmd_filtered_bams .map{ meta, bam, bai -> - [ meta, bam ] + [ meta + ['damage_manipulation' : 'pmd_trimmed'], bam ] } } else { ch_to_trim = ch_bam_bai .map { meta, bam, bai -> - [ meta, bam ] + [ meta + ['damage_manipulation' : 'trimmed'], bam ] } } @@ -169,10 +177,21 @@ workflow MANIPULATE_DAMAGE { ch_trimmed_bams = BAMUTIL_TRIMBAM.out.bam.join( ch_trimmed_index ) } + // SUBWORKFLOW: merge libraries for saving (in final bams) and (potentially) genotyping + + ch_for_merging = ch_rescaled_bams.mix(ch_pmd_filtered_bams).mix(ch_trimmed_bams) + + MERGE_LIBRARIES_DAMAGE_MANIPULATION(ch_for_merging) + ch_versions = ch_versions.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.versions) + ch_merged_damage_manipulated_bams = MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.bam_bai + ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_DAMAGE_MANIPULATION.out.mqc.collect { it[1] }.ifEmpty([])) + emit: - rescaled = ch_rescaled_bams // [ 
meta, bam, bai ] - filtered = ch_pmd_filtered_bams // [ meta, bam, bai ] - trimmed = ch_trimmed_bams // [ meta, bam, bai ] - flagstat = ch_pmd_filtered_flagstat // [ meta, flagstat ] - versions = ch_versions + rescaled = ch_rescaled_bams // [ meta, bam, bai ] + filtered = ch_pmd_filtered_bams // [ meta, bam, bai ] + trimmed = ch_trimmed_bams // [ meta, bam, bai ] + merged_bams = ch_merged_damage_manipulated_bams // [ meta, bam, bai ] // per-tool + flagstat = ch_pmd_filtered_flagstat // [ meta, flagstat ] + mqc = ch_multiqc_files + versions = ch_versions } diff --git a/subworkflows/local/merge_libraries.nf b/subworkflows/local/merge_libraries.nf index d568b99ef..ebda7c858 100644 --- a/subworkflows/local/merge_libraries.nf +++ b/subworkflows/local/merge_libraries.nf @@ -18,9 +18,9 @@ workflow MERGE_LIBRARIES { ch_multiqc_files = Channel.empty() ch_library_merge_input = ch_bam_bai - .map { addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference"], ["id", "sample_id", "strandedness", "reference"], false ) } + .map { addNewMetaFromAttributes( it, ["id", "sample_id", "strandedness", "reference", "damage_manipulation"], ["id", "sample_id", "strandedness", "reference", "damage_manipulation"], false ) } .groupTuple(by: 0) - // Discrad library-level metas, and bais. Add single_end: true to all metas (no SE/PE distinction from here on) + // Discard library-level metas, and bais.
Add single_end: true to all metas (no SE/PE distinction from here on), preserve damage manipulation tool info for downstream output sorting .map { meta, lib_metas, bam, bai -> [ meta + [ 'single_end': true ], bam ] diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index 945f16d24..dca7c0a34 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -143,6 +143,20 @@ workflow PIPELINE_INITIALISATION { [ meta, singlestrand ] } + // - No libraries with multiple UDG treatments (UDG treatment done as part of library generation) + ch_samplesheet_test = ch_samplesheet + .map { + meta, r1, r2, bam -> + [ meta.subMap('library_id'), meta.subMap('damage_treatment') ] + } + .groupTuple() + .map { meta, damage_treatment -> + if ( damage_treatment.toList().unique().size() > 1 ) { + exit 1, "[nf-core/eager] ERROR: Validation of 'input' file failed. Library IDs can only have a single UDG treatment across lanes." + } + [ meta, damage_treatment ] + } + emit: samplesheet_fastqs = ch_samplesheet_fastqs samplesheet_bams = ch_samplesheet_bams @@ -214,9 +228,25 @@ def validateInputParameters() { if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads without singletons. If you want to use Dedup, please provide --preprocessing_excludeunmerged. For all other cases, please set --deduplication_tool to 'markduplicates'."} if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. 
Pick one or the other functionality.") } if ( params.bamfiltering_generatefastq && params.run_bamfiltering ) { exit 1, ("[nf-core/eager] ERROR: --bamfiltering_generatefastq will NOT generate a fastq file unless BAM filtering is turned on with `--run_bamfiltering`") } + + // damage manipulation if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } - if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } - if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } + if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is turned on with `--run_pmd_filtering`.") } + if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is turned on with `--run_mapdamage_rescaling`.") } + if ( params.genotyping_source == 'pmd_trimmed' && ! ( params.run_pmd_filtering && params.run_trim_bam ) ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd_trimmed' unless PMD-filtering is turned on with `--run_pmd_filtering` and BAM trimming is turned on with `--run_trim_bam`.") } + + // genotyping + if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } + if ( params.run_genotyping && ! 
params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } + if ( params.genotyping_source == 'raw' && ( params.run_trim_bam || params.run_pmd_filtering || params.run_mapdamage_rescaling ) ) { log.warn("[nf-core/eager] WARNING: --genotyping_source is set to 'raw' AND damage correction carried out (rescaling, filtering, or trimming). The output of these tools will NOT be utilized for genotyping!") } + if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } + if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } + if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } + if ( ( params.genotyping_source == 'rescaled' || params.genotyping_source == 'pmd_trimmed' || params.genotyping_source == 'pmd' ) && ! params.genotyping_use_unmerged_libraries ) { log.warn("[nf-core/eager] WARNING: Combining multiple damage-manipulated libraries for genotyping may be inappropriate!") } + // if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && !
(params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } + if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } + + // metagenomics if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { if (params.metagenomics_prinseq_dustscore == 0.5) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'dust' mode but provided an entropy score. Please specify a dust filter threshold using the --metagenomics_prinseq_dustscore flag") } } @@ -233,18 +263,13 @@ def validateInputParameters() { ){ exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked MALT with postprocessing but didnt provided required input files. Please provide the --metagenomics_maltextract_taxonlist and --metagenomics_maltextract_ncbidir flags") } if ( params.run_metagenomics && params.preprocessing_skippairmerging ) { log.warn("[nf-core/eager] WARNING: --preprocessing_skippairmerging selected in combination for metagenomics! All singletons from paired end samples will be discarded prior to input for metagenomics screening! This may be inappropriate for metaphlan, which does not utilize paired-end information!") } if ( params.run_metagenomics && params.preprocessing_skippairmerging && params.metagenomics_profiling_tool == 'malt' ) { exit 1, ("[nf-core/eager] ERROR: --preprocessing_skippairmerging selected in combination with MALT for metagenomics! MALT cannot accept separated read pair information, please remove --preprocessing_skippairmerging parameter.") } - if ( params.run_genotyping && ! 
params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } - if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } - if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } - if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } - if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } - if ( params.fasta && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } - if ( params.fasta && params.mapping_tool == "circularmapper" && !params.fasta_circular_target ) { exit 1, ("[nf-core/eager] ERROR: Mapping with circularmapper requires --fasta_circular_target to be provided.") } } // // Validate channels from input samplesheet // +// FROM NF-CORE TEMPLATE: THIS IS NOT NECESSARY +// TODO: remove prior to 3.0 def validateInputSamplesheet(input) { def (metas, fastqs) = input[1..2] @@ -397,7 +422,6 @@ def addNewMetaFromAttributes( ArrayList row, Object source_attributes, Object ta } else { // Option D: Not both the same type or acceptable types. Error. 
throw new IllegalArgumentException("Error: target_attributes and source_attributes must be of same type (both Strings or both Lists).") - } def new_row = [ meta2 ] + row diff --git a/workflows/eager.nf b/workflows/eager.nf index 148f262c1..a9122f58a 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -33,7 +33,6 @@ include { ESTIMATE_CONTAMINATION } from '../subwork include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' include { RUN_SEXDETERRMINE } from '../subworkflows/local/run_sex_determination' include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries' -include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING } from '../subworkflows/local/merge_libraries' include { GENOTYPE } from '../subworkflows/local/genotype' /* @@ -231,6 +230,7 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix(FILTER_BAM.out.mqc.collect { it[1] }.ifEmpty([])) } else { + // TODO: more intuitive name for this?, since here we don't have filtered reads :P ch_bamfiltered_for_deduplication = MAP.out.bam .join(MAP.out.bai) .mix(ch_bams_from_input_lanemerged) @@ -254,6 +254,7 @@ workflow EAGER { ch_versions = ch_versions.mix(DEDUPLICATE.out.versions) } else { + // TODO: more intuitive name for this, since here we don't have deduplicated reads :P ch_dedupped_bams = ch_reads_for_deduplication ch_dedupped_flagstat = Channel.empty() } @@ -527,18 +528,8 @@ workflow EAGER { if (params.run_mapdamage_rescaling || params.run_pmd_filtering || params.run_trim_bam) { MANIPULATE_DAMAGE(ch_dedupped_bams, ch_fasta_for_deduplication.fasta, REFERENCE_INDEXING.out.pmd_masking) - ch_multiqc_files = ch_multiqc_files.mix(MANIPULATE_DAMAGE.out.flagstat.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MANIPULATE_DAMAGE.out.mqc.collect { it[1] }.ifEmpty([])) ch_versions = ch_versions.mix(MANIPULATE_DAMAGE.out.versions) - ch_bams_for_library_merge = params.genotyping_source == 'rescaled' ? 
MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_merged_dedup_bams - - // SUBWORKFLOW: merge libraries for genotyping - MERGE_LIBRARIES_GENOTYPING(ch_bams_for_library_merge) - ch_versions = ch_versions.mix(MERGE_LIBRARIES_GENOTYPING.out.versions) - ch_bams_for_genotyping = MERGE_LIBRARIES_GENOTYPING.out.bam_bai - ch_multiqc_files = ch_multiqc_files.mix(MERGE_LIBRARIES_GENOTYPING.out.mqc.collect { it[1] }.ifEmpty([])) - } - else { - ch_bams_for_genotyping = ch_merged_dedup_bams } // @@ -546,9 +537,22 @@ workflow EAGER { // if (params.run_genotyping) { + + if ( params.genotyping_use_unmerged_libraries ) { + // Get UNMERGED data from either initial mapping (post deduplication/filtering, if done), or post-damage manipulation after deduplication/filtering (if done) -- Note: the .out channels include libraries that are not damage manipulated (eg UDG-Full)) + ch_bams_for_genotyping = params.genotyping_source == 'rescaled' ? MANIPULATE_DAMAGE.out.rescaled : params.genotyping_source == 'pmd' ? MANIPULATE_DAMAGE.out.filtered : params.genotyping_source == 'trimmed' ? MANIPULATE_DAMAGE.out.trimmed : params.genotyping_source == 'pmd_trimmed' ? MANIPULATE_DAMAGE.out.trimmed : ch_dedupped_bams + } + else { + // Genotyping done on MERGED data, regardless of UDG-treatment vs not; damage manipulation per-library! + // Select input for genotyping: Default of raw is merged libraries post-deduplication; otherwise, merged libraries post damage manipulated + ch_bams_for_genotyping = params.genotyping_source != 'raw' ? 
MANIPULATE_DAMAGE.out.merged_bams.filter{ it[0]['damage_manipulation'] == params.genotyping_source } : ch_merged_dedup_bams + } + + // consistent ref indexing regardless of merged/unmerged data ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference.map { meta, fasta, fai, dict, mapindex -> [meta, fasta, fai, dict] } + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping,