diff --git a/conf/modules.config b/conf/modules.config index 94a2a32..3062cb4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,6 +10,8 @@ ---------------------------------------------------------------------------------------- */ +nextflow.enable.moduleBinaries = true + process { publishDir = [ @@ -30,4 +32,8 @@ process { withName: SEQKIT_STATS { ext.args = ' ' // turn off --all default argument } + + withName: DSHBIO_FASTATOPARQUET { + ext.args = '--alphabet protein' + } } diff --git a/modules/local/dshbio/fastatoparquet/environment.yml b/modules/local/dshbio/fastatoparquet/environment.yml new file mode 100644 index 0000000..4549248 --- /dev/null +++ b/modules/local/dshbio/fastatoparquet/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::dsh-bio=3.0 diff --git a/modules/local/dshbio/fastatoparquet/main.nf b/modules/local/dshbio/fastatoparquet/main.nf new file mode 100644 index 0000000..dae0556 --- /dev/null +++ b/modules/local/dshbio/fastatoparquet/main.nf @@ -0,0 +1,47 @@ +process DSHBIO_FASTATOPARQUET { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/dsh-bio:3.0--hdfd78af_0' : + 'biocontainers/dsh-bio:3.0--hdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.sequences.parquet"), emit: parquet + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + dsh-bio \\ + fasta-to-parquet3 \\ + $args \\ + -i $fasta \\ + -o ${prefix}.sequences.parquet + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dshbio: \$(dsh-bio --version 2>&1 | grep -o 'dsh-bio-tools .*' | cut -f2 -d ' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix}.sequences.parquet + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dshbio: \$(dsh-bio --version 2>&1 | grep -o 'dsh-bio-tools .*' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/local/dshbio/fastatoparquet/meta.yml b/modules/local/dshbio/fastatoparquet/meta.yml new file mode 100644 index 0000000..b0961b7 --- /dev/null +++ b/modules/local/dshbio/fastatoparquet/meta.yml @@ -0,0 +1,52 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "dshbio_fastatoparquet" +description: Convert DNA or protein sequences in FASTA format to Parquet format +keywords: + - fasta + - parquet + - sequence +tools: + - dshbio: + description: | + Reads, features, variants, assemblies, alignments, genomic range trees, pangenome + graphs, and a bunch of random command line tools for bioinformatics. LGPL version 3 + or later. + homepage: https://github.com/heuermh/dishevelled-bio + documentation: https://github.com/heuermh/dishevelled-bio + doi: "10.5281/zenodo.15027131" + licence: ["LGPL-3.0-or-later"] + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: DNA or protein sequences in compressed FASTA format + pattern: "*.fasta.{gz,zst,bgz,bgzf,bzip2}" + +output: + - parquet: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.sequences.parquet": + type: directory + description: | + Directory of DNA or protein sequences in Parquet format with zstd compression + pattern: "*.sequences.parquet" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@heuermh" +maintainers: + - "@heuermh" diff --git a/modules/local/duckdb/aminoacidhistogram/environment.yml b/modules/local/duckdb/aminoacidhistogram/environment.yml new file mode 100644 index 0000000..4aed600 --- /dev/null +++ b/modules/local/duckdb/aminoacidhistogram/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + +dependencies: + - conda-forge::duckdb-cli=1.0.0 diff --git a/modules/local/duckdb/aminoacidhistogram/main.nf b/modules/local/duckdb/aminoacidhistogram/main.nf new file mode 100644 index 0000000..935dfe4 --- /dev/null +++ b/modules/local/duckdb/aminoacidhistogram/main.nf @@ -0,0 +1,34 @@ +process DUCKDB_AMINOACIDHISTOGRAM { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container 'community.wave.seqera.io/library/duckdb-cli:1.1.3--c5d9961e3b49178e' + //container 'community.wave.seqera.io/library/duckdb-cli_pip_duckdb-extension-parquet:48635535d267c0b5' + //container 'community.wave.seqera.io/library/duckdb-cli_pip_duckdb-extension-parquet:be97269b25d3a5b6' + //container 'community.wave.seqera.io/library/pip_duckdb-extension-parquet_duckdb:8326cfa0a50bf9c9' + + input: + tuple val(meta), path(parquet) + + output: + tuple val(meta), path("*.histogram.tsv"), emit: 
histogram + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def amino_acid_properties = file("${moduleDir}/assets/amino_acid_properties.tsv") + //template 'amino_acid_histogram.py' + def sql = "INSTALL parquet; LOAD parquet; COPY (WITH p AS (SELECT * FROM read_parquet('${parquet}/*.parquet')), s AS (SELECT unnest(string_to_array(sequence, '')) AS aa FROM p), h AS (SELECT unnest(map_entries(histogram(aa))) AS kv FROM s), e AS (SELECT * from read_csv_auto('${amino_acid_properties}')) SELECT '${prefix}' AS id, h.kv['value'] AS count, e.* FROM h JOIN e ON h.kv['key'] = e.one_letter_symbol) TO '${prefix}.histogram.tsv' (HEADER, DELIMITER '\t')" + """ + duckdb :memory: "$sql" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + duckdb: \$( duckdb --version | cut -f 1 -d " " ) + END_VERSIONS + """ +} diff --git a/modules/local/duckdb/aminoacidhistogram/resources/usr/bin/create_amino_acid_properties.sh b/modules/local/duckdb/aminoacidhistogram/resources/usr/bin/create_amino_acid_properties.sh new file mode 100755 index 0000000..8360d50 --- /dev/null +++ b/modules/local/duckdb/aminoacidhistogram/resources/usr/bin/create_amino_acid_properties.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Writes the table below (header row plus one row per amino acid) to amino_acid_properties.tsv in the current directory. +# See https://en.wikipedia.org/wiki/Amino_acid + +cat <<END_PROPERTIES > amino_acid_properties.tsv +amino_acid three_letter_symbol one_letter_symbol class chemical_polarity net_charge hydropathy_index molecular_mass abundance_in_proteins standard_genetic_coding hydrophobic aromatic aliphatic small hydrophilic positively_charged negatively_charged +Alanine Ala A Aliphatic Nonpolar Neutral 1.8 89.094 8.76 GCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE +Arginine Arg R Fixed cation Basic polar Positive -4.5 174.203 5.78 MGR, CGY FALSE FALSE FALSE FALSE TRUE TRUE FALSE +Asparagine Asn N Amide Polar Neutral -3.5 132.119 3.93 AAY FALSE FALSE FALSE FALSE TRUE FALSE FALSE +Aspartate Asp D Anion Brønsted base 
Negative -3.5 133.104 5.49 GAY FALSE FALSE FALSE FALSE TRUE FALSE TRUE +Cysteine Cys C Thiol Brønsted acid Neutral 2.5 121.154 1.38 UGY FALSE FALSE FALSE FALSE FALSE FALSE FALSE +Glutamine Gln Q Amide Polar Neutral -3.5 146.146 3.9 CAR FALSE FALSE FALSE FALSE TRUE FALSE FALSE +Glutamate Glu E Anion Brønsted base Negative -3.5 147.131 6.32 GAR FALSE FALSE FALSE FALSE TRUE FALSE TRUE +Glycine Gly G Aliphatic Nonpolar Neutral -0.4 75.067 7.03 GGN FALSE FALSE FALSE TRUE FALSE FALSE FALSE +Histidine His H Cationic Brønsted acid and base Positive, 10% Neutral, 90% -3.2 155.156 2.26 CAY FALSE TRUE FALSE FALSE TRUE TRUE FALSE +Isoleucine Ile I Aliphatic Nonpolar Neutral 4.5 131.175 5.49 AUH TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Leucine Leu L Aliphatic Nonpolar Neutral 3.8 131.175 9.68 YUR, CUY TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Lysine Lys K Cation Brønsted acid Positive -3.9 146.189 5.19 AAR FALSE FALSE FALSE FALSE TRUE TRUE FALSE +Methionine Met M Thioether Nonpolar Neutral 1.9 149.208 2.32 AUG TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Phenylalanine Phe F Aromatic Nonpolar Neutral 2.8 165.192 3.87 UUY TRUE TRUE FALSE FALSE FALSE FALSE FALSE +Proline Pro P Cyclic Nonpolar Neutral -1.6 115.132 5.02 CCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE +Serine Ser S Hydroxylic Polar Neutral -0.8 105.093 7.14 UCN, AGY FALSE FALSE FALSE TRUE TRUE FALSE FALSE +Threonine Thr T Hydroxylic Polar Neutral -0.7 119.119 5.53 ACN FALSE FALSE FALSE FALSE TRUE FALSE FALSE +Tryptophan Trp W Aromatic Nonpolar Neutral -0.9 204.228 1.25 UGG TRUE TRUE FALSE FALSE FALSE FALSE FALSE +Tyrosine Tyr Y Aromatic Brønsted acid Neutral -1.3 181.191 2.91 UAY TRUE TRUE FALSE FALSE FALSE FALSE FALSE +Valine Val V Aliphatic Nonpolar Neutral 4.2 117.148 6.73 GUN TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Selenocysteine Sec U 168.064 +Pyrrolysine Pyl O 255.313 +Any/unknown Xaa X +Asparagine or aspartate Asx B +Glutamine or glutamate Glx Z +Leucine or isoleucine Xle J +END_PROPERTIES diff --git 
a/modules/local/eider/aminoacidhistogram/assets/amino_acid_properties.tsv b/modules/local/eider/aminoacidhistogram/assets/amino_acid_properties.tsv new file mode 100644 index 0000000..13b888b --- /dev/null +++ b/modules/local/eider/aminoacidhistogram/assets/amino_acid_properties.tsv @@ -0,0 +1,27 @@ +amino_acid three_letter_symbol one_letter_symbol class chemical_polarity net_charge hydropathy_index molecular_mass abundance_in_proteins standard_genetic_coding hydrophobic aromatic aliphatic small hydrophilic positively_charged negatively_charged +Alanine Ala A Aliphatic Nonpolar Neutral 1.8 89.094 8.76 GCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE +Arginine Arg R Fixed cation Basic polar Positive -4.5 174.203 5.78 MGR, CGY FALSE FALSE FALSE FALSE TRUE TRUE FALSE +Asparagine Asn N Amide Polar Neutral -3.5 132.119 3.93 AAY FALSE FALSE FALSE FALSE TRUE FALSE FALSE +Aspartate Asp D Anion Brønsted base Negative -3.5 133.104 5.49 GAY FALSE FALSE FALSE FALSE TRUE FALSE TRUE +Cysteine Cys C Thiol Brønsted acid Neutral 2.5 121.154 1.38 UGY FALSE FALSE FALSE FALSE FALSE FALSE FALSE +Glutamine Gln Q Amide Polar Neutral -3.5 146.146 3.9 CAR FALSE FALSE FALSE FALSE TRUE FALSE FALSE +Glutamate Glu E Anion Brønsted base Negative -3.5 147.131 6.32 GAR FALSE FALSE FALSE FALSE TRUE FALSE TRUE +Glycine Gly G Aliphatic Nonpolar Neutral -0.4 75.067 7.03 GGN FALSE FALSE FALSE TRUE FALSE FALSE FALSE +Histidine His H Cationic Brønsted acid and base Positive, 10% Neutral, 90% -3.2 155.156 2.26 CAY FALSE TRUE FALSE FALSE TRUE TRUE FALSE +Isoleucine Ile I Aliphatic Nonpolar Neutral 4.5 131.175 5.49 AUH TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Leucine Leu L Aliphatic Nonpolar Neutral 3.8 131.175 9.68 YUR, CUY TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Lysine Lys K Cation Brønsted acid Positive -3.9 146.189 5.19 AAR FALSE FALSE FALSE FALSE TRUE TRUE FALSE +Methionine Met M Thioether Nonpolar Neutral 1.9 149.208 2.32 AUG TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Phenylalanine Phe F Aromatic Nonpolar 
Neutral 2.8 165.192 3.87 UUY TRUE TRUE FALSE FALSE FALSE FALSE FALSE +Proline Pro P Cyclic Nonpolar Neutral -1.6 115.132 5.02 CCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE +Serine Ser S Hydroxylic Polar Neutral -0.8 105.093 7.14 UCN, AGY FALSE FALSE FALSE TRUE TRUE FALSE FALSE +Threonine Thr T Hydroxylic Polar Neutral -0.7 119.119 5.53 ACN FALSE FALSE FALSE FALSE TRUE FALSE FALSE +Tryptophan Trp W Aromatic Nonpolar Neutral -0.9 204.228 1.25 UGG TRUE TRUE FALSE FALSE FALSE FALSE FALSE +Tyrosine Tyr Y Aromatic Brønsted acid Neutral -1.3 181.191 2.91 UAY TRUE TRUE FALSE FALSE FALSE FALSE FALSE +Valine Val V Aliphatic Nonpolar Neutral 4.2 117.148 6.73 GUN TRUE FALSE TRUE FALSE FALSE FALSE FALSE +Selenocysteine Sec U 168.064 +Pyrrolysine Pyl O 255.313 +Any/unknown Xaa X +Asparagine or aspartate Asx B +Glutamine or glutamate Glx Z +Leucine or isoleucine Xle J diff --git a/modules/local/eider/aminoacidhistogram/assets/query_template.sql b/modules/local/eider/aminoacidhistogram/assets/query_template.sql new file mode 100644 index 0000000..6b3c936 --- /dev/null +++ b/modules/local/eider/aminoacidhistogram/assets/query_template.sql @@ -0,0 +1,41 @@ +COPY ( + WITH p AS ( + SELECT * FROM read_parquet('${parquet}/*.parquet') + ), + s AS ( + SELECT unnest(string_to_array(sequence, '')) AS aa FROM p + ), + h AS ( + SELECT unnest(map_entries(histogram(aa))) AS kv FROM s + ), + e AS ( + SELECT * from read_csv_auto('${amino_acid_properties}') + ) + SELECT + '${prefix}' AS id, + h.kv['value'] AS count, + e.amino_acid, + e.one_letter_symbol, + e.three_letter_symbol, + e.class, + e.chemical_polarity, + e.net_charge, + e.hydropathy_index, + e.molecular_mass, + e.abundance_in_proteins, + e.standard_genetic_coding, + e.hydrophobic, + e.aromatic, + e.aliphatic, + e.small, + e.hydrophilic, + e.positively_charged, + e.negatively_charged + FROM + h + JOIN + e + ON + h.kv['key'] = e.one_letter_symbol +) +TO '${prefix}.histogram.tsv' (HEADER, DELIMITER '\t') diff --git 
a/modules/local/eider/aminoacidhistogram/environment.yml b/modules/local/eider/aminoacidhistogram/environment.yml new file mode 100644 index 0000000..737e4f8 --- /dev/null +++ b/modules/local/eider/aminoacidhistogram/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::eider=0.1 \ No newline at end of file diff --git a/modules/local/eider/aminoacidhistogram/main.nf b/modules/local/eider/aminoacidhistogram/main.nf new file mode 100644 index 0000000..e440adb --- /dev/null +++ b/modules/local/eider/aminoacidhistogram/main.nf @@ -0,0 +1,42 @@ +process EIDER_AMINOACIDHISTOGRAM { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/eider:0.1--hdfd78af_0' : + 'biocontainers/eider:0.1--hdfd78af_0' }" + + input: + tuple val(meta), path(parquet) + + output: + tuple val(meta), path("*.histogram.tsv"), emit: histogram + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def amino_acid_properties = file("${moduleDir}/assets/amino_acid_properties.tsv") + def query_template = file("${moduleDir}/assets/query_template.sql") + // query_template.sql references ${parquet}, ${amino_acid_properties} and ${prefix}; + // each one is supplied to eider below via --parameters. + """ + eider \\ + $args \\ + --verbose \\ + --skip-history \\ + --parameters prefix=${prefix} \\ + --parameters parquet=${parquet} \\ + --parameters amino_acid_properties=${amino_acid_properties} \\ + --query-path ${query_template} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eider: \$(eider --version 2>&1 | grep -o 'eider .*' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/local/eider/aminoacidhistogram/meta.yml b/modules/local/eider/aminoacidhistogram/meta.yml new file mode 100644 index
0000000..8de1936 --- /dev/null +++ b/modules/local/eider/aminoacidhistogram/meta.yml @@ -0,0 +1,47 @@ +name: eider_aminoacidhistogram +description: Build amino acid histogram from protein sequences +keywords: + - duckdb + - parquet + - sql + - query +tools: + - eider: + description: | + Command line bioinformatics tools for DuckDB. + homepage: https://github.com/heuermh/eider + documentation: https://github.com/heuermh/eider + licence: ["Apache-2.0"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - parquet: + type: directory + description: | + Directory of protein sequences in Parquet format + pattern: "*.parquet" +output: + - histogram: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.histogram.tsv": + type: file + description: | + Amino acid histogram in tab-separated values (TSV) text format + pattern: "*.histogram.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" +maintainers: + - "@heuermh" diff --git a/modules/local/eider/aminoacidhistogram/tests/assets/test.parquet/part-0-1.parquet b/modules/local/eider/aminoacidhistogram/tests/assets/test.parquet/part-0-1.parquet new file mode 100644 index 0000000..0a3e0be Binary files /dev/null and b/modules/local/eider/aminoacidhistogram/tests/assets/test.parquet/part-0-1.parquet differ diff --git a/modules/local/eider/aminoacidhistogram/tests/main.nf.test b/modules/local/eider/aminoacidhistogram/tests/main.nf.test new file mode 100644 index 0000000..9da76c1 --- /dev/null +++ b/modules/local/eider/aminoacidhistogram/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process EIDER_AMINOACIDHISTOGRAM" + script "../main.nf" + process "EIDER_AMINOACIDHISTOGRAM" + + tag "modules" + tag "modules_nfcore" + tag "eider" + tag "eider/aminoacidhistogram" 
+ + test("test-eider-aminoacidhistogram") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file('${moduleDir}/tests/assets/test.parquet', checkIfExists: true) + ] + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/workflows/proteinannotator.nf b/workflows/proteinannotator.nf index f7e4b65..8f73970 100644 --- a/workflows/proteinannotator.nf +++ b/workflows/proteinannotator.nf @@ -3,6 +3,8 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { DSHBIO_FASTATOPARQUET } from '../modules/local/dshbio/fastatoparquet/main' +include { EIDER_AMINOACIDHISTOGRAM } from '../modules/local/eider/aminoacidhistogram/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { SEQKIT_STATS } from '../modules/nf-core/seqkit/stats/main' include { paramsSummaryMap } from 'plugin/nf-schema' @@ -34,6 +36,14 @@ workflow PROTEINANNOTATOR { SEQKIT_STATS(ch_samplesheet) ch_versions = ch_versions.mix(SEQKIT_STATS.out.versions) + // todo: move this to stats on input fasta subworkflow + DSHBIO_FASTATOPARQUET(ch_samplesheet) + ch_versions = ch_versions.mix(DSHBIO_FASTATOPARQUET.out.versions) + + // todo: move this to stats on input fasta subworkflow + EIDER_AMINOACIDHISTOGRAM(DSHBIO_FASTATOPARQUET.out.parquet) + ch_versions = ch_versions.mix(EIDER_AMINOACIDHISTOGRAM.out.versions) + // // Collate and save software versions //