diff --git a/CHANGELOG.md b/CHANGELOG.md index e7a814d..31a635d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Initial release of nf-core/proteinannotator, created with the [nf-core](https://nf-co.re/) template. +### Credits + +Special thanks to the following for their contributions to the release: + +- [Tien Ly](https://github.com/tntly) + ### `Added` - [[PR #13](https://github.com/nf-core/proteinannotator/pull/13)] Add nf-core seqkit/stats module +- [[PR #17](https://github.com/nf-core/proteinannotator/pull/17)] Add UniFIRE module ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 843f5d3..8a6248d 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -14,6 +14,11 @@ > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +- [UniFIRE](https://gitlab.ebi.ac.uk/uniprot-public/unifire) + +> UniFIRE (The UniProt Functional annotation Inference Rule Engine) is an engine to execute rules in the UniProt Rule Markup Language (URML) format. It can be used to execute the UniProt annotation rules (UniRule and ARBA). +> License: Apache License 2.0 + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/README.md b/README.md index 7405207..96bd58d 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,10 @@ -1. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) + +1. Functional Annotation + 1. Perform UniProt's official annotation pipeline ([`UniFIRE`](https://gitlab.ebi.ac.uk/uniprot-public/unifire)) +1. 
Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/sampleesheet.csv b/conf/sampleesheet.csv new file mode 100644 index 0000000..0c3e4d1 --- /dev/null +++ b/conf/sampleesheet.csv @@ -0,0 +1,2 @@ +id,fasta +snap25_isoforms,https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/kmerseek/reference/snap25_isoforms_human_P60880.fasta diff --git a/conf/test.config b/conf/test.config index 9de8668..43859a0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,5 +27,5 @@ params { // TODO nf-core: Give any required params for the test so that command line flags are not needed // From: https://github.com/nf-core/proteinfold/blob/1.1.1/conf/test.config // Example: https://github.com/nf-core/test-datasets/blob/proteinfold/testdata/samplesheet/v1.2/samplesheet.csv - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.2/samplesheet.csv' + input = params.pipelines_testdata_base_path + 'conf/samplesheet.csv' } diff --git a/conf/test_full.config b/conf/test_full.config index ae76d99..fd9bf05 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,8 +17,6 @@ params { // Input data for full size test // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + input = params.pipelines_testdata_base_path + 'proteinannotator/samplesheet/snap25-isoforms.csv' - // Genome references - genome = 'R64-1-1' } diff --git a/docs/output.md b/docs/output.md index 3f5219a..3dba6eb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,10 +12,28 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- [Functional Annotation](#functional-annotation) + - [UniFIRE](#unifire) - Run UniProt's official UniFIRE workflow for protein function prediction - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [SeqKit stats](#seqkit_stats) - Simple statistics for protein FASTA files - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +### Functional Annotation + +#### UniFIRE + +
+Output files + +- `unifire/` + - `predictions_arba.out`: a file containing predictions from the ARBA method. + - `predictions_unirule.out`: a file containing predictions from the UniRule method. + - `predictions_unirule-pirsr.out`: a file containing predictions from the PIRSR method. + +
+ +[UniFIRE](https://gitlab.ebi.ac.uk/uniprot-public/unifire) (The UniProt Functional annotation Inference Rule Engine) is an engine to execute rules in the UniProt Rule Markup Language (URML) format. It can be used to execute the UniProt annotation rules (UniRule and ARBA). + ### MultiQC
diff --git a/modules/local/unifire/main.nf b/modules/local/unifire/main.nf new file mode 100644 index 0000000..d4f1de4 --- /dev/null +++ b/modules/local/unifire/main.nf @@ -0,0 +1,57 @@ +process UNIFIRE { + tag "$meta.id" + label 'process_large' + + container "dockerhub.ebi.ac.uk/uniprot-public/unifire:2025.1" // TODO: Update once Bioconda is available + containerOptions { + if (workflow.containerEngine in ['singularity', 'apptainer']) { + return "--bind unifire:/volume" + } else { + return "-v ./unifire:/volume" + } + } + + input: + tuple val(meta), path(faa, stageAs: "unifire/proteins.fasta") + + output: + tuple val(meta), path("unifire/predictions_arba.out") , emit: arba + tuple val(meta), path("unifire/predictions_unirule.out") , emit: unirule + tuple val(meta), path("unifire/predictions_unirule-pirsr.out"), emit: pirsr + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2025.1' + // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + # This tool needs a specific folder to be mounted to work. 
+ # Run UniFIRE workflow + /opt/scripts/bin/unifire-workflow.sh + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + UniFIRE: ${VERSION} + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2025.1' + """ + mkdir -p unifire + touch unifire/predictions_arba.out + touch unifire/predictions_unirule.out + touch unifire/predictions_unirule-pirsr.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + UniFIRE: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/local/unifire/meta.yml b/modules/local/unifire/meta.yml new file mode 100644 index 0000000..d6fe210 --- /dev/null +++ b/modules/local/unifire/meta.yml @@ -0,0 +1,65 @@ +name: "unifire" +description: Runs the UniFIRE workflow for protein function prediction +keywords: + - uniprot + - unifire + - protein function prediction + - functional annotation + - proteomics +tools: + - "unifire": + description: "UniFIRE: the UniProt Functional annotation Inference Rule Engine" + homepage: "https://gitlab.ebi.ac.uk/uniprot-public/unifire" + documentation: "https://gitlab.ebi.ac.uk/uniprot-public/unifire/-/blob/master/README.md?ref_type=heads" + tool_dev_url: "https://gitlab.ebi.ac.uk/uniprot-public/unifire" + licence: ["Apache-2.0"] +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - faa: + type: file + description: A protein sequence file in FASTA format + pattern: "*.{faa,faa.gz,fasta,fasta.gz,fas,fas.gz,fa,fa.gz}" +output: + - arba: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "unifire/predictions_arba.out": + type: file + description: Predictions from the Arba method + pattern: "unifire/predictions_arba.out" + - unirule: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "unifire/predictions_unirule.out": + type: file + description: Predictions from the Unirule method + pattern: "unifire/predictions_unirule.out" + - pirsr: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "unifire/predictions_unirule-pirsr.out": + type: file + description: Predictions from the Pirsr method + pattern: "unifire/predictions_unirule-pirsr.out" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@tntly" +maintainers: + - "@tntly" diff --git a/modules/local/unifire/tests/main.nf.test b/modules/local/unifire/tests/main.nf.test new file mode 100644 index 0000000..e6fbfb9 --- /dev/null +++ b/modules/local/unifire/tests/main.nf.test @@ -0,0 +1,75 @@ +nextflow_process { + + name "Test Process UNIFIRE" + script "../main.nf" + process "UNIFIRE" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "unifire" + tag "seqkit" + tag "seqkit/head" + + setup { + run("SEQKIT_HEAD") { + script "modules/nf-core/seqkit/head/main.nf" + process { + """ + input[0] = Channel.from([ + [ id:'test1' ], // meta + [file(params.modules_testdata_base_path + 'kmerseek/reference/snap25_isoforms_human_P60880.fasta', checkIfExists: true)], // fastas + 1 // seq_count + ]) + """ + } + } + } + + test("human - fasta") { + + when { + process { + """ + input[0] = SEQKIT_HEAD.out.subset + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.arba).match("arba") }, + { assert snapshot(process.out.unirule).match("unirule") }, + { assert snapshot(process.out.pirsr).match("pirsr") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("human - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = SEQKIT_HEAD.out.subset + """ + } + } + + then { + assertAll( + { assert process.success }, 
+ { assert snapshot(process.out.arba).match("arba_stub") }, + { assert snapshot(process.out.unirule).match("unirule_stub") }, + { assert snapshot(process.out.pirsr).match("pirsr_stub") }, + { assert snapshot(process.out.versions).match("versions_stub") } + ) + } + + } + +} diff --git a/modules/local/unifire/tests/main.nf.test.snap b/modules/local/unifire/tests/main.nf.test.snap new file mode 100644 index 0000000..7692cf9 --- /dev/null +++ b/modules/local/unifire/tests/main.nf.test.snap @@ -0,0 +1,128 @@ +{ + "arba_stub": { + "content": [ + [ + [ + { + "id": "test1" + }, + "predictions_arba.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:33:52.55728" + }, + "versions_stub": { + "content": [ + [ + "versions.yml:md5,9300b1be29e89c68039f4497407a3f47" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:33:52.641584" + }, + "pirsr": { + "content": [ + [ + [ + { + "id": "test1" + }, + "predictions_unirule-pirsr.out:md5,4b58d027e735a319fc881213fb7907fd" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:09:44.375307" + }, + "unirule_stub": { + "content": [ + [ + [ + { + "id": "test1" + }, + "predictions_unirule.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:33:52.602665" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,9300b1be29e89c68039f4497407a3f47" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:09:44.388711" + }, + "unirule": { + "content": [ + [ + [ + { + "id": "test1" + }, + "predictions_unirule.out:md5,4b58d027e735a319fc881213fb7907fd" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:09:44.36028" + }, + "pirsr_stub": { + "content": [ + [ + [ + { 
+ "id": "test1" + }, + "predictions_unirule-pirsr.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:33:52.614641" + }, + "arba": { + "content": [ + [ + [ + { + "id": "test1" + }, + "predictions_arba.out:md5,1e1a7b042c1e31726f0a50671bcdf50d" + ] + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-25T21:09:44.249684" + } +} diff --git a/modules/local/unifire/tests/nextflow.config b/modules/local/unifire/tests/nextflow.config new file mode 100644 index 0000000..4d20402 --- /dev/null +++ b/modules/local/unifire/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: UNIFIRE { + memory = 8.GB + } +} diff --git a/subworkflows/local/functional_annotation/main.nf b/subworkflows/local/functional_annotation/main.nf index af1134b..98d6907 100644 --- a/subworkflows/local/functional_annotation/main.nf +++ b/subworkflows/local/functional_annotation/main.nf @@ -1,3 +1,6 @@ +include { UNIFIRE } from '../../../modules/local/unifire/main' + + workflow FUNCTIONAL_ANNOTATION { take: @@ -7,11 +10,16 @@ workflow FUNCTIONAL_ANNOTATION { ch_versions = Channel.empty() + UNIFIRE ( ch_fasta ) + + ch_versions = ch_versions.mix( UNIFIRE.out.versions ) + // TODO nf-core: substitute modules here for the modules of your subworkflow emit: - // TODO nf-core: edit emitted channels - - versions = ch_versions // channel: [ versions.yml ] + unifire_arba = UNIFIRE.out.arba + unifire_unirule = UNIFIRE.out.unirule + unifire_pirsr = UNIFIRE.out.pirsr + versions = ch_versions // channel: [ versions.yml ] }