Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
----------------------------------------------------------------------------------------
*/

// Feature flag turning on Nextflow module binaries for this pipeline.
// NOTE(review): the PR author was unsure whether modules.config is the right
// config file for this setting -- confirm before merging.
nextflow.enable.moduleBinaries = true
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this is the best config file for this?


process {

publishDir = [
Expand All @@ -30,4 +32,8 @@ process {
withName: SEQKIT_STATS {
ext.args = ' ' // turn off --all default argument
}

// Read by the module as task.ext.args and inserted into its
// `dsh-bio fasta-to-parquet3` command line.
withName: DSHBIO_FASTATOPARQUET {
ext.args = '--alphabet protein'
}
}
7 changes: 7 additions & 0 deletions modules/local/dshbio/fastatoparquet/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment used by the DSHBIO_FASTATOPARQUET process.
channels:
  - conda-forge
  - bioconda
dependencies:
  # Keep this pin in step with the dsh-bio container tag in main.nf.
  - "bioconda::dsh-bio=3.0"
47 changes: 47 additions & 0 deletions modules/local/dshbio/fastatoparquet/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Converts a FASTA file into a directory of Parquet files with
// `dsh-bio fasta-to-parquet3`.
//
// Inputs:  meta map and the FASTA file to convert.
// Outputs: parquet  -- [meta, <prefix>.sequences.parquet]
//          versions -- versions.yml recording the dsh-bio version
//
// task.ext.args   extra command line arguments for fasta-to-parquet3
// task.ext.prefix output name prefix (defaults to meta.id)
process DSHBIO_FASTATOPARQUET {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/dsh-bio:3.0--hdfd78af_0' :
        'biocontainers/dsh-bio:3.0--hdfd78af_0' }"

    input:
    tuple val(meta), path(fasta)

    output:
    tuple val(meta), path("*.sequences.parquet"), emit: parquet
    path "versions.yml"                         , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    dsh-bio \\
        fasta-to-parquet3 \\
        $args \\
        -i $fasta \\
        -o ${prefix}.sequences.parquet

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        dshbio: \$(dsh-bio --version 2>&1 | grep -o 'dsh-bio-tools .*' | cut -f2 -d ' ')
    END_VERSIONS
    """

    stub:
    // No `args` here: the stub never runs the conversion, so the variable
    // would be declared but unused.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    # The real output is a directory, so the stub creates one as well.
    mkdir -p ${prefix}.sequences.parquet

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        dshbio: \$(dsh-bio --version 2>&1 | grep -o 'dsh-bio-tools .*' | cut -f2 -d ' ')
    END_VERSIONS
    """
}
52 changes: 52 additions & 0 deletions modules/local/dshbio/fastatoparquet/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "dshbio_fastatoparquet"
description: Convert DNA or protein sequences in FASTA format to Parquet format
keywords:
  - fasta
  - parquet
  - sequence
tools:
  - dshbio:
      description: |
        Reads, features, variants, assemblies, alignments, genomic range trees, pangenome
        graphs, and a bunch of random command line tools for bioinformatics. LGPL version 3
        or later.
      homepage: https://github.com/heuermh/dishevelled-bio
      documentation: https://github.com/heuermh/dishevelled-bio
      doi: "10.5281/zenodo.15027131"
      licence: ["LGPL-3.0-or-later"]
      identifier: ""

input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. [ id:'test', single_end:false ]
    - fasta:
        type: file
        description: DNA or protein sequences in compressed FASTA format
        # Brace alternation in glob patterns is comma-separated, not pipe-separated.
        pattern: "*.fasta.{gz,zst,bgz,bgzf,bzip2}"

output:
  - parquet:
      - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. [ id:'test', single_end:false ]
      - "*.sequences.parquet":
          type: directory
          description: |
            Directory of DNA or protein sequences in Parquet format with zstd compression
          pattern: "*.sequences.parquet"
  - versions:
      - versions.yml:
          type: file
          description: File containing software versions
          pattern: "versions.yml"

authors:
  - "@heuermh"
maintainers:
  - "@heuermh"
7 changes: 7 additions & 0 deletions modules/local/duckdb/aminoacidhistogram/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment used by the DUCKDB_AMINOACIDHISTOGRAM process.
channels:
  - conda-forge

dependencies:
  # Pinned to the same version as the duckdb-cli container tag in main.nf
  # (1.1.3), so conda and container runs use the same DuckDB release.
  - "conda-forge::duckdb-cli=1.1.3"
34 changes: 34 additions & 0 deletions modules/local/duckdb/aminoacidhistogram/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Builds an amino acid histogram from a directory of Parquet sequence files
// using the DuckDB command line client.
//
// Inputs:  meta map and the Parquet directory (read as <parquet>/*.parquet).
// Outputs: histogram -- [meta, <prefix>.histogram.tsv]
//          versions  -- versions.yml recording the duckdb version
//
// task.ext.prefix output name prefix (defaults to meta.id)
process DUCKDB_AMINOACIDHISTOGRAM {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container 'community.wave.seqera.io/library/duckdb-cli:1.1.3--c5d9961e3b49178e'
    // NOTE(review): commented-out alternative images kept from the original;
    // remove them once the container choice is settled.
    //container 'community.wave.seqera.io/library/duckdb-cli_pip_duckdb-extension-parquet:48635535d267c0b5'
    //container 'community.wave.seqera.io/library/duckdb-cli_pip_duckdb-extension-parquet:be97269b25d3a5b6'
    //container 'community.wave.seqera.io/library/pip_duckdb-extension-parquet_duckdb:8326cfa0a50bf9c9'

    input:
    tuple val(meta), path(parquet)

    output:
    tuple val(meta), path("*.histogram.tsv"), emit: histogram
    path "versions.yml"                     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // NOTE(review): this moduleDir path is not a declared `path` input, so it is
    // not staged into the task work directory; confirm it resolves when running
    // inside a container (e.g. -profile docker).
    def amino_acid_properties = file("${moduleDir}/assets/amino_acid_properties.tsv")
    // Single statement list handed to duckdb as one argument:
    //   p: all rows from the Parquet files
    //   s: one row per character of each sequence
    //   h: per-character counts as key/value entries
    //   e: the amino acid properties table
    // The join keeps only characters present in the properties table.
    def sql = "INSTALL parquet; LOAD parquet; COPY (WITH p AS (SELECT * FROM read_parquet('${parquet}/*.parquet')), s AS (SELECT unnest(string_to_array(sequence, '')) AS aa FROM p), h AS (SELECT unnest(map_entries(histogram(aa))) AS kv FROM s), e AS (SELECT * from read_csv_auto('${amino_acid_properties}')) SELECT '${prefix}' AS id, h.kv['value'] AS count, e.* FROM h JOIN e ON h.kv['key'] = e.one_letter_symbol) TO '${prefix}.histogram.tsv' (HEADER, DELIMITER '\t')"
    """
    duckdb :memory: "$sql"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        duckdb: \$( duckdb --version | cut -f 1 -d " " )
    END_VERSIONS
    """

    stub:
    // Stub added for parity with DSHBIO_FASTATOPARQUET so that -stub-run does
    // not execute the real query.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}.histogram.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        duckdb: \$( duckdb --version | cut -f 1 -d " " )
    END_VERSIONS
    """
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

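#
# Writes the amino acid properties table to amino_acid_properties.tsv
# in the current working directory.
#
# Source: https://en.wikipedia.org/wiki/Amino_acid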

cat <<END_PROPERTIES > amino_acid_properties.tsv
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤢

amino_acid three_letter_symbol one_letter_symbol class chemical_polarity net_charge hydropathy_index molecular_mass abundance_in_proteins standard_genetic_coding hydrophobic aromatic aliphatic small hydrophilic positively_charged negatively_charged
Alanine Ala A Aliphatic Nonpolar Neutral 1.8 89.094 8.76 GCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Arginine Arg R Fixed cation Basic polar Positive −4.5 174.203 5.78 MGR, CGY FALSE FALSE FALSE FALSE TRUE TRUE FALSE
Asparagine Asn N Amide Polar Neutral −3.5 132.119 3.93 AAY FALSE FALSE FALSE FALSE TRUE FALSE FALSE
Aspartate Asp D Anion Brønsted base Negative −3.5 133.104 5.49 GAY FALSE FALSE FALSE FALSE TRUE FALSE TRUE
Cysteine Cys C Thiol Brønsted acid Neutral 2.5 121.154 1.38 UGY FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Glutamine Gln Q Amide Polar Neutral −3.5 146.146 3.9 CAR FALSE FALSE FALSE FALSE TRUE FALSE FALSE
Glutamate Glu E Anion Brønsted base Negative −3.5 147.131 6.32 GAR FALSE FALSE FALSE FALSE TRUE FALSE TRUE
Glycine Gly G Aliphatic Nonpolar Neutral −0.4 75.067 7.03 GGN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Histidine His H Cationic Brønsted acid and base Positive, 10% Neutral, 90% −3.2 155.156 2.26 CAY FALSE TRUE FALSE FALSE TRUE TRUE FALSE
Isoleucine Ile I Aliphatic Nonpolar Neutral 4.5 131.175 5.49 AUH TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Leucine Leu L Aliphatic Nonpolar Neutral 3.8 131.175 9.68 YUR, CUY TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Lysine Lys K Cation Brønsted acid Positive −3.9 146.189 5.19 AAR FALSE FALSE FALSE FALSE TRUE TRUE FALSE
Methionine Met M Thioether Nonpolar Neutral 1.9 149.208 2.32 AUG TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Phenylalanine Phe F Aromatic Nonpolar Neutral 2.8 165.192 3.87 UUY TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Proline Pro P Cyclic Nonpolar Neutral −1.6 115.132 5.02 CCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Serine Ser S Hydroxylic Polar Neutral −0.8 105.093 7.14 UCN, AGY FALSE FALSE FALSE TRUE TRUE FALSE FALSE
Threonine Thr T Hydroxylic Polar Neutral −0.7 119.119 5.53 ACN FALSE FALSE FALSE FALSE TRUE FALSE FALSE
Tryptophan Trp W Aromatic Nonpolar Neutral −0.9 204.228 1.25 UGG TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Tyrosine Tyr Y Aromatic Brønsted acid Neutral −1.3 181.191 2.91 UAY TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Valine Val V Aliphatic Nonpolar Neutral 4.2 117.148 6.73 GUN TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Selenocysteine Sec U 168.064
Pyrrolysine Pyl O 255.313
Any/unknown Xaa X
Asparagine or aspartate Asx B
Glutamine or glutamate Glx Z
Leucine or isoleucine Xle J
END_PROPERTIES
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
amino_acid three_letter_symbol one_letter_symbol class chemical_polarity net_charge hydropathy_index molecular_mass abundance_in_proteins standard_genetic_coding hydrophobic aromatic aliphatic small hydrophilic positively_charged negatively_charged
Alanine Ala A Aliphatic Nonpolar Neutral 1.8 89.094 8.76 GCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Arginine Arg R Fixed cation Basic polar Positive −4.5 174.203 5.78 MGR, CGY FALSE FALSE FALSE FALSE TRUE TRUE FALSE
Asparagine Asn N Amide Polar Neutral −3.5 132.119 3.93 AAY FALSE FALSE FALSE FALSE TRUE FALSE FALSE
Aspartate Asp D Anion Brønsted base Negative −3.5 133.104 5.49 GAY FALSE FALSE FALSE FALSE TRUE FALSE TRUE
Cysteine Cys C Thiol Brønsted acid Neutral 2.5 121.154 1.38 UGY FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Glutamine Gln Q Amide Polar Neutral −3.5 146.146 3.9 CAR FALSE FALSE FALSE FALSE TRUE FALSE FALSE
Glutamate Glu E Anion Brønsted base Negative −3.5 147.131 6.32 GAR FALSE FALSE FALSE FALSE TRUE FALSE TRUE
Glycine Gly G Aliphatic Nonpolar Neutral −0.4 75.067 7.03 GGN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Histidine His H Cationic Brønsted acid and base Positive, 10% Neutral, 90% −3.2 155.156 2.26 CAY FALSE TRUE FALSE FALSE TRUE TRUE FALSE
Isoleucine Ile I Aliphatic Nonpolar Neutral 4.5 131.175 5.49 AUH TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Leucine Leu L Aliphatic Nonpolar Neutral 3.8 131.175 9.68 YUR, CUY TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Lysine Lys K Cation Brønsted acid Positive −3.9 146.189 5.19 AAR FALSE FALSE FALSE FALSE TRUE TRUE FALSE
Methionine Met M Thioether Nonpolar Neutral 1.9 149.208 2.32 AUG TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Phenylalanine Phe F Aromatic Nonpolar Neutral 2.8 165.192 3.87 UUY TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Proline Pro P Cyclic Nonpolar Neutral −1.6 115.132 5.02 CCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
Serine Ser S Hydroxylic Polar Neutral −0.8 105.093 7.14 UCN, AGY FALSE FALSE FALSE TRUE TRUE FALSE FALSE
Threonine Thr T Hydroxylic Polar Neutral −0.7 119.119 5.53 ACN FALSE FALSE FALSE FALSE TRUE FALSE FALSE
Tryptophan Trp W Aromatic Nonpolar Neutral −0.9 204.228 1.25 UGG TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Tyrosine Tyr Y Aromatic Brønsted acid Neutral −1.3 181.191 2.91 UAY TRUE TRUE FALSE FALSE FALSE FALSE FALSE
Valine Val V Aliphatic Nonpolar Neutral 4.2 117.148 6.73 GUN TRUE FALSE TRUE FALSE FALSE FALSE FALSE
Selenocysteine Sec U 168.064
Pyrrolysine Pyl O 255.313
Any/unknown Xaa X
Asparagine or aspartate Asx B
Glutamine or glutamate Glx Z
Leucine or isoleucine Xle J
41 changes: 41 additions & 0 deletions modules/local/eider/aminoacidhistogram/assets/query_template.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
 * Amino acid histogram query template.
 *
 * Placeholders filled in by the caller: parquet, amino_acid_properties, prefix.
 * Writes a tab-separated file with a header row, one row per sequence
 * character that has a matching one_letter_symbol in the properties table.
 */
COPY (
    WITH
        /* every row of the Parquet input */
        seqs AS (
            SELECT * FROM read_parquet('${parquet}/*.parquet')
        ),
        /* one row per character of each sequence */
        residues AS (
            SELECT unnest(string_to_array(sequence, '')) AS aa FROM seqs
        ),
        /* per-character counts, one key/value entry per row */
        tally AS (
            SELECT unnest(map_entries(histogram(aa))) AS kv FROM residues
        ),
        /* amino acid properties lookup table */
        props AS (
            SELECT * from read_csv_auto('${amino_acid_properties}')
        )
    SELECT
        '${prefix}' AS id,
        tally.kv['value'] AS count,
        props.amino_acid,
        props.one_letter_symbol,
        props.three_letter_symbol,
        props.class,
        props.chemical_polarity,
        props.net_charge,
        props.hydropathy_index,
        props.molecular_mass,
        props.abundance_in_proteins,
        props.standard_genetic_coding,
        props.hydrophobic,
        props.aromatic,
        props.aliphatic,
        props.small,
        props.hydrophilic,
        props.positively_charged,
        props.negatively_charged
    FROM tally
    JOIN props ON tally.kv['key'] = props.one_letter_symbol
)
TO '${prefix}.histogram.tsv' (HEADER, DELIMITER '\t')
7 changes: 7 additions & 0 deletions modules/local/eider/aminoacidhistogram/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment used by the EIDER_AMINOACIDHISTOGRAM process.
channels:
  - conda-forge
  - bioconda
dependencies:
  # Keep this pin in step with the eider container tag in main.nf.
  - "bioconda::eider=0.1"
39 changes: 39 additions & 0 deletions modules/local/eider/aminoacidhistogram/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Builds an amino acid histogram from a directory of Parquet sequence files by
// running assets/query_template.sql through eider.
//
// Inputs:  meta map and the Parquet directory (the query template reads
//          <parquet>/*.parquet).
// Outputs: histogram -- [meta, <prefix>.histogram.tsv]
//          versions  -- versions.yml recording the eider version
//
// task.ext.args   extra command line arguments for eider
// task.ext.prefix output name prefix (defaults to meta.id)
process EIDER_AMINOACIDHISTOGRAM {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/eider:0.1--hdfd78af_0' :
        'biocontainers/eider:0.1--hdfd78af_0' }"

    input:
    tuple val(meta), path(parquet)

    output:
    tuple val(meta), path("*.histogram.tsv"), emit: histogram
    path "versions.yml"                     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // NOTE(review): per the PR author, these two moduleDir assets do not stage
    // when running with -profile docker. They are not declared `path` inputs,
    // so only their absolute host paths reach the command line. Still to be
    // resolved.
    def amino_acid_properties = file("${moduleDir}/assets/amino_acid_properties.tsv")
    def query_template = file("${moduleDir}/assets/query_template.sql")
    """
    eider \\
        $args \\
        --verbose \\
        --skip-history \\
        --parameters parquet=${parquet} \\
        --parameters prefix=${prefix} \\
        --parameters amino_acid_properties=${amino_acid_properties} \\
        --query-path ${query_template}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        eider: \$(eider --version 2>&1 | grep -o 'eider .*' | cut -f2 -d ' ')
    END_VERSIONS
    """

    stub:
    // Stub added for parity with DSHBIO_FASTATOPARQUET so that -stub-run does
    // not execute the real query.
    def prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}.histogram.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        eider: \$(eider --version 2>&1 | grep -o 'eider .*' | cut -f2 -d ' ')
    END_VERSIONS
    """
}
47 changes: 47 additions & 0 deletions modules/local/eider/aminoacidhistogram/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "eider_aminoacidhistogram"
description: Build amino acid histogram from protein sequences
keywords:
  - duckdb
  - parquet
  - sql
  - query
tools:
  - eider:
      description: |
        Command line bioinformatics tools for DuckDB.
      homepage: https://github.com/heuermh/eider
      documentation: https://github.com/heuermh/eider
      licence: ["Apache-2.0"]
      identifier: ""
input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. [ id:'test', single_end:false ]
    - parquet:
        type: directory
        description: |
          Directory of protein sequences in Parquet format
        pattern: "*.parquet"
output:
  - histogram:
      - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. [ id:'test', single_end:false ]
      # Entry name and pattern match the process output glob in main.nf.
      - "*.histogram.tsv":
          type: file
          description: |
            Amino acid histogram in tab separated values (TSV) text format
          pattern: "*.histogram.tsv"
  - versions:
      - versions.yml:
          type: file
          description: File containing software versions
          pattern: "versions.yml"
authors:
  - "@heuermh"
maintainers:
  - "@heuermh"
Binary file not shown.
Loading
Loading