Skip to content
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

* `annotate/celltypist`: Enable CUDA acceleration for CellTypist annotation (PR #1091).

* `workflows/annotation/celltypist`: Performs lognormalization (target count of 10000) followed by cell type annotation using CellTypist (PR #1083).

## EXPERIMENTAL

* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044).
Expand Down
4 changes: 2 additions & 2 deletions src/annotate/celltypist/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ argument_groups:
required: false
- name: "--input_layer"
type: string
description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.
description: The layer in the input data containing counts that are lognormalized to 10000, if .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
Expand All @@ -50,7 +50,7 @@ argument_groups:
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
required: false
- name: "--reference_obs_target"
type: string
Expand Down
8 changes: 0 additions & 8 deletions src/annotate/celltypist/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import mudata as mu
import anndata as ad
import pandas as pd
import numpy as np
from torch.cuda import is_available as cuda_is_available

## VIASH START
Expand All @@ -20,7 +19,6 @@
"input_reference_gene_overlap": 100,
"reference_obs_target": "cell_ontology_class",
"reference_var_input": None,
"check_expression": False,
"feature_selection": True,
"majority_voting": True,
"output_compression": "gzip",
Expand All @@ -47,12 +45,6 @@
logger.info("GPU enabled? %s", use_gpu)


def check_celltypist_format(indata, target_sum=10000, tolerance=1):
    """Check whether ``indata`` looks log1p-normalized to ``target_sum``.

    Only the first row (``indata[0]``) is inspected: the ``log1p`` transform is
    undone with ``np.expm1`` and the recovered values are summed. The data
    passes when that sum lies within ``tolerance`` of ``target_sum``.

    Args:
        indata: Expression matrix supporting ``indata[0]`` row access
            (presumably observations x genes -- confirm against callers).
        target_sum: Expected per-row total before the log1p transform.
        tolerance: Maximum allowed absolute deviation from ``target_sum``.

    Returns:
        bool: True when the first row matches the expected normalization,
        False otherwise (including when the recovered total is NaN).
    """
    first_row_total = np.expm1(indata[0]).sum()
    # Compare with `<=` rather than negating `>`: every comparison against NaN
    # is False, so a NaN total now fails the check instead of slipping through.
    return bool(np.abs(first_row_total - target_sum) <= tolerance)


def main(par):
if (not par["model"] and not par["reference"]) or (
par["model"] and par["reference"]
Expand Down
154 changes: 154 additions & 0 deletions src/workflows/annotation/celltypist/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
name: "celltypist"
namespace: "workflows/annotation"
scope: "public"
description: "Cell type annotation workflow by performing lognormalization of the raw counts layer followed by cell type annotation with CellTypist."
info:
name: "CellTypist annotation"
test_dependencies:
- name: celltypist_test
namespace: test_workflows/annotation
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]

argument_groups:
- name: Inputs
description: Input dataset (query) arguments
arguments:
- name: "--input"
alternatives: [-i]
type: file
description: The input (query) data to be labeled. Should be a .h5mu file.
direction: input
required: true
example: input.h5mu
- name: "--modality"
description: Which modality to process.
type: string
default: "rna"
required: false
- name: "--input_layer"
type: string
description: The layer in the input data containing raw counts, if .X is not to be used.
- name: "--input_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
- name: "--input_reference_gene_overlap"
type: integer
default: 100
min: 1
description: |
The minimum number of genes present in both the reference and query datasets.

- name: Reference
description: Arguments related to the reference dataset.
arguments:
- name: "--reference"
type: file
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
example: reference.h5mu
direction: input
required: false
- name: "--reference_layer"
type: string
description: The layer in the reference data containing raw counts, if .X is not to be used.
required: false
- name: "--reference_obs_target"
type: string
description: The name of the adata obs column in the reference data containing cell type annotations.
default: "cell_ontology_class"
- name: "--reference_var_gene_names"
type: string
required: false
description: |
The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.
- name: "--reference_var_input"
type: string
required: false
description: |
.var column containing highly variable genes. By default, do not subset genes.

- name: Model arguments
description: Model arguments.
arguments:
- name: "--model"
type: file
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
required: false
example: pretrained_model.pkl
- name: "--feature_selection"
type: boolean
description: "Whether to perform feature selection."
default: false
- name: "--majority_voting"
type: boolean
description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
default: false
- name: "--C"
type: double
description: "Inverse of regularization strength in logistic regression."
default: 1.0
- name: "--max_iter"
type: integer
description: "Maximum number of iterations before reaching the minimum of the cost function."
default: 1000
- name: "--use_SGD"
type: boolean_true
description: "Whether to use the stochastic gradient descent algorithm."
- name: "--min_prop"
type: double
description: |
For the dominant cell type within a subcluster, the minimum proportion of cells required to
support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
Subclusters that fail to pass this proportion threshold will be assigned 'Heterogeneous'.
default: 0

- name: Outputs
description: Output arguments.
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obs_predictions"
type: string
default: celltypist_pred
required: false
description: |
In which `.obs` slots to store the predicted information.
- name: "--output_obs_probability"
type: string
default: celltypist_probability
required: false
description: |
In which `.obs` slots to store the probability of the predictions.
__merge__: [., /src/base/h5_compression_argument.yaml]

dependencies:
- name: transform/normalize_total
- name: transform/log1p
- name: transform/delete_layer
- name: annotate/celltypist
alias: celltypist_component

resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf

test_resources:
- type: nextflow_script
path: test.nf
entrypoint: test_wf
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
- path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
- path: /resources_test/annotation_test_data/demo_2000_cells.h5mu

runners:
- type: nextflow
23 changes: 23 additions & 0 deletions src/workflows/annotation/celltypist/integration_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

# Integration test for the CellTypist annotation workflow: runs both test
# entrypoints (test_wf and test_wf_2) of test.nf through Nextflow.

# Abort on the first failing command (or failing pipeline stage) so that a
# failure of the first nextflow run is not masked by the exit status of the
# second one.
set -eo pipefail

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the commands below are run from the root of the repository
cd "$REPO_ROOT"

nextflow \
  run . \
  -main-script src/workflows/annotation/celltypist/test.nf \
  -entry test_wf \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config

nextflow \
  run . \
  -main-script src/workflows/annotation/celltypist/test.nf \
  -entry test_wf_2 \
  -profile docker,no_publish \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
132 changes: 132 additions & 0 deletions src/workflows/annotation/celltypist/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// CellTypist annotation workflow.
// Normalizes the query dataset (and, when one is provided, the reference dataset)
// to a target sum of 10000, applies log1p, and then runs the CellTypist component
// on the resulting "log_normalized_10k" layer.
workflow run_wf {
take:
input_ch

main:

query_ch = input_ch
// Log normalize query dataset to target sum of 10000
| normalize_total.run(
fromState: { id, state -> [
"input": state.input,
"modality": state.modality,
"input_layer": state.input_layer,
]},
args: [
"output_layer": "normalized_10k",
"target_sum": "10000",
],
toState: [
"input": "output",
]
)
// log1p-transform the normalized layer into "log_normalized_10k"
| log1p.run(
fromState: { id, state -> [
"input": state.input,
"modality": state.modality
]},
args: [
"input_layer": "normalized_10k",
"output_layer": "log_normalized_10k",
],
toState: [
"input": "output"
]
)
// Remove the intermediate "normalized_10k" layer from the query dataset
| delete_layer.run(
fromState: { id, state -> [
"input": state.input,
"modality": state.modality
]},
args: [
"layer": "normalized_10k"
],
toState: [
"input": "output"
]
)
| view {"After query normalization: $it"}

ref_ch = input_ch
// Log normalize reference dataset to target sum of 10000
// Both reference steps only run when state.reference is set (runIf) and use
// their own keys, as the same components are already used for the query above.
// NOTE(review): unlike the query branch, the intermediate "normalized_10k"
// layer is not deleted from the reference -- confirm this is intended.
| normalize_total.run(
key: "normalize_total_reference",
runIf: { id, state ->
state.reference
},
fromState: { id, state -> [
"input": state.reference,
"modality": state.modality,
"input_layer": state.reference_layer,
]},
args: [
"output_layer": "normalized_10k",
"target_sum": "10000",
],
toState: [
"reference": "output",
]
)
| log1p.run(
key: "log1p_reference",
runIf: { id, state ->
state.reference
},
fromState: { id, state -> [
"input": state.reference,
"modality": state.modality
]},
args: [
"input_layer": "normalized_10k",
"output_layer": "log_normalized_10k",
],
toState: [
"reference": "output"
]
)
| view {"After reference normalization: $it"}


// Pair the query and reference states again (join presumably matches on the
// first tuple element, the id); unmatched or duplicated ids raise an error.
output_ch = query_ch.join(ref_ch, failOnMismatch: true, failOnDuplicate: true)
| view {"After channel mixing: $it"}
// Keep the query state and only take over the "reference" entry from the
// reference branch
| map {id, query_state, ref_state ->
def newState = query_state + ["reference": ref_state.reference]
[id, newState]
}
// Run CellTypist annotation on the log-normalized layers created above
// NOTE(review): the workflow config merges h5_compression_argument.yaml, but no
// compression argument is forwarded to the component here -- TODO confirm.
| celltypist_component.run(
fromState: { id, state -> [
"input": state.input,
"modality": state.modality,
"input_var_gene_names": state.input_var_gene_names,
"input_reference_gene_overlap": state.input_reference_gene_overlap,
"reference": state.reference,
"reference_obs_target": state.reference_obs_target,
"reference_var_gene_names": state.reference_var_gene_names,
"reference_var_input": state.reference_var_input,
"model": state.model,
"feature_selection": state.feature_selection,
"majority_voting": state.majority_voting,
"C": state.C,
"max_iter": state.max_iter,
"use_SGD": state.use_SGD,
"min_prop": state.min_prop,
"output": state.output,
"output_obs_predictions": state.output_obs_predictions,
"output_obs_probability": state.output_obs_probability
]},
args: [
"input_layer": "log_normalized_10k",
"reference_layer": "log_normalized_10k"
],
toState: [
"output": "output"
]
)
| view {"After annotation: $it"}
// Presumably reduces the state to the "output" entry only (setState is a
// helper defined outside this file -- confirm)
| setState(["output"])

emit:
output_ch
}
10 changes: 10 additions & 0 deletions src/workflows/annotation/celltypist/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Nextflow configuration for the CellTypist annotation workflow.
manifest {
// Required Nextflow version; the leading '!' presumably makes Nextflow abort
// (rather than only warn) when the running version does not satisfy it -- confirm
// against the Nextflow manifest documentation.
nextflowVersion = '!>=20.12.1-edge'
}

params {
// Absolute, normalized path four directory levels above projectDir
// (presumably the repository root, given where this file lives -- confirm).
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
}

// include common settings
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")
Loading
Loading