Add workflows/annotation/celltypist (#1083)

dorien-er · web-flow · commit 1332f7fc4b41 · 2025-10-30T09:48:29.000+01:00
* add lognormalization to celltypist component

* update changelog

* update descriptions component

* update changelog

* update changelog

* update changelog

* update changelog

* undo test changes

* wip

* create celltypist workflow

* parallelize
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,8 @@
 
 * `annotate/celltypist`: Enable CUDA acceleration for CellTypist annotation (PR #1091).
 
+* `workflows/annotation/celltypist`: Performs lognormalization (target count of 10000) followed by cell type annotation using CellTypist (PR #1083).
+
 ## EXPERIMENTAL
 
 * `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044).
diff --git a/src/annotate/celltypist/config.vsh.yaml b/src/annotate/celltypist/config.vsh.yaml
@@ -26,7 +26,7 @@ argument_groups:
         required: false
       - name: "--input_layer"
         type: string
-        description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used. 
+        description: The layer in the input data containing counts that are lognormalized to 10000, if .X is not to be used.
       - name: "--input_var_gene_names"
         type: string
         required: false
@@ -50,7 +50,7 @@ argument_groups:
         required: false
       - name: "--reference_layer"
         type: string
-        description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
+        description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
         required: false
       - name: "--reference_obs_target"
         type: string
diff --git a/src/annotate/celltypist/script.py b/src/annotate/celltypist/script.py
@@ -3,7 +3,6 @@
 import mudata as mu
 import anndata as ad
 import pandas as pd
-import numpy as np
 from torch.cuda import is_available as cuda_is_available
 
 ## VIASH START
@@ -20,7 +19,6 @@
     "input_reference_gene_overlap": 100,
     "reference_obs_target": "cell_ontology_class",
     "reference_var_input": None,
-    "check_expression": False,
     "feature_selection": True,
     "majority_voting": True,
     "output_compression": "gzip",
@@ -47,12 +45,6 @@
 logger.info("GPU enabled? %s", use_gpu)
 
 
-def check_celltypist_format(indata):
-    if np.abs(np.expm1(indata[0]).sum() - 10000) > 1:
-        return False
-    return True
-
-
 def main(par):
     if (not par["model"] and not par["reference"]) or (
         par["model"] and par["reference"]
diff --git a/src/workflows/annotation/celltypist/config.vsh.yaml b/src/workflows/annotation/celltypist/config.vsh.yaml
@@ -0,0 +1,154 @@
+name: "celltypist"
+namespace: "workflows/annotation"
+scope: "public"
+description: "Cell type annotation workflow that log-normalizes the raw counts layer and then annotates cell types with CellTypist."
+info:
+  name: "CellTypist annotation"
+  test_dependencies:
+    - name: celltypist_test
+      namespace: test_workflows/annotation
+authors:
+  - __merge__: /src/authors/dorien_roosen.yaml
+    roles: [ author, maintainer ]
+  - __merge__: /src/authors/weiwei_schultz.yaml
+    roles: [ contributor ]
+
+argument_groups:
+  - name: Inputs
+    description: Input dataset (query) arguments
+    arguments:
+      - name: "--input"
+        alternatives: [-i]
+        type: file
+        description: The input (query) data to be labeled. Should be a .h5mu file.
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        description: Which modality to process.
+        type: string
+        default: "rna"
+        required: false
+      - name: "--input_layer"
+        type: string
+        description: The layer in the input data containing raw counts, if .X is not to be used. 
+      - name: "--input_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the input data containing gene names; when --input_var_gene_names is not provided, the var index will be used.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+  
+  - name: Reference
+    description: Arguments related to the reference dataset.
+    arguments:
+      - name: "--reference"
+        type: file
+        description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
+        example: reference.h5mu
+        direction: input
+        required: false
+      - name: "--reference_layer"
+        type: string
+        description: The layer in the reference data containing raw counts, if .X is not to be used.
+        required: false
+      - name: "--reference_obs_target"
+        type: string
+        description: The name of the adata obs column in the reference data containing cell type annotations.
+        default: "cell_ontology_class"
+      - name: "--reference_var_gene_names"
+        type: string
+        required: false
+        description: |
+          The name of the adata var column in the reference data containing gene names; when --reference_var_gene_names is not provided, the var index will be used.
+      - name: "--reference_var_input"
+        type: string
+        required: false
+        description: |
+          .var column containing highly variable genes. By default, do not subset genes.
+
+  - name: Model arguments
+    description: Model arguments.
+    arguments:
+      - name: "--model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+      - name: "--feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: false
+      - name: "--majority_voting"
+        type: boolean
+        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
+        default: false
+      - name: "--C"
+        type: double
+        description: "Inverse of regularization strength in logistic regression."
+        default: 1.0
+      - name: "--max_iter"
+        type: integer
+        description: "Maximum number of iterations before reaching the minimum of the cost function."
+        default: 1000
+      - name: "--use_SGD"
+        type: boolean_true
+        description: "Whether to use the stochastic gradient descent algorithm."
+      - name: "--min_prop"
+        type: double
+        description: |
+          For the dominant cell type within a subcluster, the minimum proportion of cells required to
+          support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
+          Subclusters that fail to pass this proportion threshold will be assigned 'Heterogeneous'.
+        default: 0
+
+  - name: Outputs
+    description: Output arguments.
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_obs_predictions"
+        type: string
+        default: celltypist_pred
+        required: false
+        description: |
+          In which `.obs` slots to store the predicted information.
+      - name: "--output_obs_probability"
+        type: string
+        default: celltypist_probability
+        required: false
+        description: |
+          In which `.obs` slots to store the probability of the predictions.
+    __merge__: [., /src/base/h5_compression_argument.yaml]
+
+dependencies:
+  - name: transform/normalize_total
+  - name: transform/log1p
+  - name: transform/delete_layer
+  - name: annotate/celltypist
+    alias: celltypist_component
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+test_resources:
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+  - path: /resources_test/annotation_test_data/demo_2000_cells.h5mu
+
+runners:
+  - type: nextflow
diff --git a/src/workflows/annotation/celltypist/integration_test.sh b/src/workflows/annotation/celltypist/integration_test.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+nextflow \
+  run . \
+  -main-script src/workflows/annotation/celltypist/test.nf \
+  -entry test_wf \
+  -profile docker,no_publish \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/integration_tests.config \
+
+nextflow \
+  run . \
+  -main-script src/workflows/annotation/celltypist/test.nf \
+  -entry test_wf_2 \
+  -profile docker,no_publish \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/integration_tests.config \
diff --git a/src/workflows/annotation/celltypist/main.nf b/src/workflows/annotation/celltypist/main.nf
@@ -0,0 +1,132 @@
+workflow run_wf {
+  take:
+    input_ch
+
+  main:
+    
+    query_ch = input_ch
+      // Log normalize query dataset to target sum of 10000
+      | normalize_total.run(
+        fromState: { id, state -> [
+          "input": state.input,
+          "modality": state.modality,
+          "input_layer": state.input_layer,
+        ]},
+        args: [
+          "output_layer": "normalized_10k",
+          "target_sum": "10000",
+        ],
+        toState: [
+          "input": "output",
+        ]
+      )
+      | log1p.run( 
+        fromState: { id, state -> [
+          "input": state.input,
+          "modality": state.modality
+        ]},
+        args: [
+          "input_layer": "normalized_10k",
+          "output_layer": "log_normalized_10k",
+        ],
+        toState: [
+          "input": "output"
+        ]
+      )
+      | delete_layer.run(
+        fromState: { id, state -> [
+          "input": state.input,
+          "modality": state.modality
+        ]},
+        args: [
+          "layer": "normalized_10k"
+        ],
+        toState: [
+          "input": "output"
+        ]
+      )
+      | view {"After query normalization: $it"}
+
+    ref_ch = input_ch
+      // Log normalize reference dataset to target sum of 10000
+      | normalize_total.run(
+        key: "normalize_total_reference",
+        runIf: { id, state ->
+          state.reference
+        },
+        fromState: { id, state -> [
+          "input": state.reference,
+          "modality": state.modality,
+          "input_layer": state.reference_layer,
+        ]},
+        args: [
+          "output_layer": "normalized_10k",
+          "target_sum": "10000",
+        ],
+        toState: [
+          "reference": "output",
+        ]
+      )
+      | log1p.run( 
+        key: "log1p_reference",
+        runIf: { id, state ->
+          state.reference
+        },
+        fromState: { id, state -> [
+          "input": state.reference,
+          "modality": state.modality
+        ]},
+        args: [
+          "input_layer": "normalized_10k",
+          "output_layer": "log_normalized_10k",
+        ],
+        toState: [
+          "reference": "output"
+        ]
+      )
+      | view {"After reference normalization: $it"}
+
+
+    output_ch = query_ch.join(ref_ch, failOnMismatch: true, failOnDuplicate: true)
+        | view {"After channel mixing: $it"}
+        // Merge the reference from the reference branch (normalized when provided) into the query state
+        | map {id, query_state, ref_state -> 
+          def newState = query_state + ["reference": ref_state.reference]
+          [id, newState]
+        }        
+        // Annotate the query with CellTypist, using the log-normalized layers created above
+        | celltypist_component.run(
+          fromState: { id, state -> [
+            "input": state.input,
+            "modality": state.modality,
+            "input_var_gene_names": state.input_var_gene_names,
+            "input_reference_gene_overlap": state.input_reference_gene_overlap,
+            "reference": state.reference,
+            "reference_obs_target": state.reference_obs_target,
+            "reference_var_gene_names": state.reference_var_gene_names,
+            "reference_var_input": state.reference_var_input,
+            "model": state.model,
+            "feature_selection": state.feature_selection,
+            "majority_voting": state.majority_voting,
+            "C": state.C,
+            "max_iter": state.max_iter,
+            "use_SGD": state.use_SGD,
+            "min_prop": state.min_prop,
+            "output": state.output,
+            "output_obs_predictions": state.output_obs_predictions,
+            "output_obs_probability": state.output_obs_probability
+          ]},
+          args: [
+            "input_layer": "log_normalized_10k",
+            "reference_layer": "log_normalized_10k"
+          ],
+          toState: [
+            "output": "output"
+          ]
+        )
+        | view {"After annotation: $it"}
+        | setState(["output"])
+
+  emit:
+    output_ch
+}
diff --git a/src/workflows/annotation/celltypist/nextflow.config b/src/workflows/annotation/celltypist/nextflow.config
@@ -0,0 +1,10 @@
+manifest {
+  nextflowVersion = '!>=20.12.1-edge'
+}
+
+params {
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
+}
+
+// include common settings
+includeConfig("${params.rootDir}/src/workflows/utils/labels.config")
diff --git a/src/workflows/annotation/celltypist/test.nf b/src/workflows/annotation/celltypist/test.nf
diff --git a/src/workflows/test_workflows/annotation/celltypist/config.vsh.yaml b/src/workflows/test_workflows/annotation/celltypist/config.vsh.yaml
diff --git a/src/workflows/test_workflows/annotation/celltypist/script.py b/src/workflows/test_workflows/annotation/celltypist/script.py