Skip to content

Commit 1332f7f

Browse files
authored
Add workflows/annotation/celltypist (#1083)
* add lognormalization to celltypist component * update changelog * update descriptions component * update changelog * update changelog * update changelog * update changelog * undo test changes * wip * create celltypist workflow * parallelize
1 parent 05a13ba commit 1332f7f

File tree

10 files changed

+483
-10
lines changed

10 files changed

+483
-10
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
* `annotate/celltypist`: Enable CUDA acceleration for CellTypist annotation (PR #1091).
1212

13+
* `workflows/annotation/celltypist`: Performs lognormalization (target count of 10000) followed by cell type annotation using CellTypist (PR #1083).
14+
1315
## EXPERIMENTAL
1416

1517
* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044).

src/annotate/celltypist/config.vsh.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ argument_groups:
2626
required: false
2727
- name: "--input_layer"
2828
type: string
29-
description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.
29+
description: The layer in the input data containing counts that are lognormalized to 10000, if .X is not to be used.
3030
- name: "--input_var_gene_names"
3131
type: string
3232
required: false
@@ -50,7 +50,7 @@ argument_groups:
5050
required: false
5151
- name: "--reference_layer"
5252
type: string
53-
description: The layer in the reference data to be used for cell type annotation if .X is not to be used. Data are expected to be processed in the same way as the --input query dataset.
53+
description: The layer in the reference data containing counts that are lognormalized to 10000, if .X is not to be used.
5454
required: false
5555
- name: "--reference_obs_target"
5656
type: string

src/annotate/celltypist/script.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import mudata as mu
44
import anndata as ad
55
import pandas as pd
6-
import numpy as np
76
from torch.cuda import is_available as cuda_is_available
87

98
## VIASH START
@@ -20,7 +19,6 @@
2019
"input_reference_gene_overlap": 100,
2120
"reference_obs_target": "cell_ontology_class",
2221
"reference_var_input": None,
23-
"check_expression": False,
2422
"feature_selection": True,
2523
"majority_voting": True,
2624
"output_compression": "gzip",
@@ -47,12 +45,6 @@
4745
logger.info("GPU enabled? %s", use_gpu)
4846

4947

50-
def check_celltypist_format(indata):
51-
if np.abs(np.expm1(indata[0]).sum() - 10000) > 1:
52-
return False
53-
return True
54-
55-
5648
def main(par):
5749
if (not par["model"] and not par["reference"]) or (
5850
par["model"] and par["reference"]
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
name: "celltypist"
2+
namespace: "workflows/annotation"
3+
scope: "public"
4+
description: "Cell type annotation workflow by performing lognormalization of the raw counts layer followed by cell type annotation with CellTypist."
5+
info:
6+
name: "CellTypist annotation"
7+
test_dependencies:
8+
- name: celltypist_test
9+
namespace: test_workflows/annotation
10+
authors:
11+
- __merge__: /src/authors/dorien_roosen.yaml
12+
roles: [ author, maintainer ]
13+
- __merge__: /src/authors/weiwei_schultz.yaml
14+
roles: [ contributor ]
15+
16+
argument_groups:
17+
- name: Inputs
18+
description: Input dataset (query) arguments
19+
arguments:
20+
- name: "--input"
21+
alternatives: [-i]
22+
type: file
23+
description: The input (query) data to be labeled. Should be a .h5mu file.
24+
direction: input
25+
required: true
26+
example: input.h5mu
27+
- name: "--modality"
28+
description: Which modality to process.
29+
type: string
30+
default: "rna"
31+
required: false
32+
- name: "--input_layer"
33+
type: string
34+
description: The layer in the input data containing raw counts, if .X is not to be used.
35+
- name: "--input_var_gene_names"
36+
type: string
37+
required: false
38+
description: |
39+
The name of the adata var column in the input data containing gene names; when not provided, the var index will be used.
40+
- name: "--input_reference_gene_overlap"
41+
type: integer
42+
default: 100
43+
min: 1
44+
description: |
45+
The minimum number of genes present in both the reference and query datasets.
46+
47+
- name: Reference
48+
description: Arguments related to the reference dataset.
49+
arguments:
50+
- name: "--reference"
51+
type: file
52+
description: "The reference data to train the CellTypist classifiers on. Only required if a pre-trained --model is not provided."
53+
example: reference.h5mu
54+
direction: input
55+
required: false
56+
- name: "--reference_layer"
57+
type: string
58+
description: The layer in the reference data containing raw counts, if .X is not to be used.
59+
required: false
60+
- name: "--reference_obs_target"
61+
type: string
62+
description: The name of the adata obs column in the reference data containing cell type annotations.
63+
default: "cell_ontology_class"
64+
- name: "--reference_var_gene_names"
65+
type: string
66+
required: false
67+
description: |
68+
The name of the adata var column in the reference data containing gene names; when not provided, the var index will be used.
69+
- name: "--reference_var_input"
70+
type: string
71+
required: false
72+
description: |
73+
.var column containing highly variable genes. By default, do not subset genes.
74+
75+
- name: Model arguments
76+
description: Model arguments.
77+
arguments:
78+
- name: "--model"
79+
type: file
80+
description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
81+
required: false
82+
example: pretrained_model.pkl
83+
- name: "--feature_selection"
84+
type: boolean
85+
description: "Whether to perform feature selection."
86+
default: false
87+
- name: "--majority_voting"
88+
type: boolean
89+
description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
90+
default: false
91+
- name: "--C"
92+
type: double
93+
description: "Inverse of regularization strength in logistic regression."
94+
default: 1.0
95+
- name: "--max_iter"
96+
type: integer
97+
description: "Maximum number of iterations before reaching the minimum of the cost function."
98+
default: 1000
99+
- name: "--use_SGD"
100+
type: boolean_true
101+
description: "Whether to use the stochastic gradient descent algorithm."
102+
- name: "--min_prop"
103+
type: double
104+
description: |
105+
For the dominant cell type within a subcluster, the minimum proportion of cells required to
106+
support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
107+
Subclusters that fail to pass this proportion threshold will be assigned 'Heterogeneous'.
108+
default: 0
109+
110+
- name: Outputs
111+
description: Output arguments.
112+
arguments:
113+
- name: "--output"
114+
type: file
115+
description: Output h5mu file.
116+
direction: output
117+
example: output.h5mu
118+
- name: "--output_obs_predictions"
119+
type: string
120+
default: celltypist_pred
121+
required: false
122+
description: |
123+
In which `.obs` slots to store the predicted information.
124+
- name: "--output_obs_probability"
125+
type: string
126+
default: celltypist_probability
127+
required: false
128+
description: |
129+
In which `.obs` slots to store the probability of the predictions.
130+
__merge__: [., /src/base/h5_compression_argument.yaml]
131+
132+
dependencies:
133+
- name: transform/normalize_total
134+
- name: transform/log1p
135+
- name: transform/delete_layer
136+
- name: annotate/celltypist
137+
alias: celltypist_component
138+
139+
resources:
140+
- type: nextflow_script
141+
path: main.nf
142+
entrypoint: run_wf
143+
144+
test_resources:
145+
- type: nextflow_script
146+
path: test.nf
147+
entrypoint: test_wf
148+
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
149+
- path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
150+
- path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
151+
- path: /resources_test/annotation_test_data/demo_2000_cells.h5mu
152+
153+
runners:
154+
- type: nextflow
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
2+
3+
# get the root of the directory
4+
REPO_ROOT=$(git rev-parse --show-toplevel)
5+
6+
# ensure that the command below is run from the root of the repository
7+
cd "$REPO_ROOT"
8+
9+
nextflow \
10+
run . \
11+
-main-script src/workflows/annotation/celltypist/test.nf \
12+
-entry test_wf \
13+
-profile docker,no_publish \
14+
-c src/workflows/utils/labels_ci.config \
15+
-c src/workflows/utils/integration_tests.config \
16+
17+
nextflow \
18+
run . \
19+
-main-script src/workflows/annotation/celltypist/test.nf \
20+
-entry test_wf_2 \
21+
-profile docker,no_publish \
22+
-c src/workflows/utils/labels_ci.config \
23+
-c src/workflows/utils/integration_tests.config \
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
workflow run_wf {
2+
take:
3+
input_ch
4+
5+
main:
6+
7+
query_ch = input_ch
8+
// Log normalize query dataset to target sum of 10000
9+
| normalize_total.run(
10+
fromState: { id, state -> [
11+
"input": state.input,
12+
"modality": state.modality,
13+
"input_layer": state.input_layer,
14+
]},
15+
args: [
16+
"output_layer": "normalized_10k",
17+
"target_sum": "10000",
18+
],
19+
toState: [
20+
"input": "output",
21+
]
22+
)
23+
| log1p.run(
24+
fromState: { id, state -> [
25+
"input": state.input,
26+
"modality": state.modality
27+
]},
28+
args: [
29+
"input_layer": "normalized_10k",
30+
"output_layer": "log_normalized_10k",
31+
],
32+
toState: [
33+
"input": "output"
34+
]
35+
)
36+
| delete_layer.run(
37+
fromState: { id, state -> [
38+
"input": state.input,
39+
"modality": state.modality
40+
]},
41+
args: [
42+
"layer": "normalized_10k"
43+
],
44+
toState: [
45+
"input": "output"
46+
]
47+
)
48+
| view {"After query normalization: $it"}
49+
50+
ref_ch = input_ch
51+
// Log normalize reference dataset to target sum of 10000
52+
| normalize_total.run(
53+
key: "normalize_total_reference",
54+
runIf: { id, state ->
55+
state.reference
56+
},
57+
fromState: { id, state -> [
58+
"input": state.reference,
59+
"modality": state.modality,
60+
"input_layer": state.reference_layer,
61+
]},
62+
args: [
63+
"output_layer": "normalized_10k",
64+
"target_sum": "10000",
65+
],
66+
toState: [
67+
"reference": "output",
68+
]
69+
)
70+
| log1p.run(
71+
key: "log1p_reference",
72+
runIf: { id, state ->
73+
state.reference
74+
},
75+
fromState: { id, state -> [
76+
"input": state.reference,
77+
"modality": state.modality
78+
]},
79+
args: [
80+
"input_layer": "normalized_10k",
81+
"output_layer": "log_normalized_10k",
82+
],
83+
toState: [
84+
"reference": "output"
85+
]
86+
)
87+
| view {"After reference normalization: $it"}
88+
89+
90+
output_ch = query_ch.join(ref_ch, failOnMismatch: true, failOnDuplicate: true)
91+
| view {"After channel mixing: $it"}
92+
// Merge the normalized reference from the reference branch into the query state
93+
| map {id, query_state, ref_state ->
94+
def newState = query_state + ["reference": ref_state.reference]
95+
[id, newState]
96+
}
97+
// Annotate cell types with CellTypist using the log-normalized query and reference layers
98+
| celltypist_component.run(
99+
fromState: { id, state -> [
100+
"input": state.input,
101+
"modality": state.modality,
102+
"input_var_gene_names": state.input_var_gene_names,
103+
"input_reference_gene_overlap": state.input_reference_gene_overlap,
104+
"reference": state.reference,
105+
"reference_obs_target": state.reference_obs_target,
106+
"reference_var_gene_names": state.reference_var_gene_names,
107+
"reference_var_input": state.reference_var_input,
108+
"model": state.model,
109+
"feature_selection": state.feature_selection,
110+
"majority_voting": state.majority_voting,
111+
"C": state.C,
112+
"max_iter": state.max_iter,
113+
"use_SGD": state.use_SGD,
114+
"min_prop": state.min_prop,
115+
"output": state.output,
116+
"output_obs_predictions": state.output_obs_predictions,
117+
"output_obs_probability": state.output_obs_probability
118+
]},
119+
args: [
120+
"input_layer": "log_normalized_10k",
121+
"reference_layer": "log_normalized_10k"
122+
],
123+
toState: [
124+
"output": "output"
125+
]
126+
)
127+
| view {"After annotation: $it"}
128+
| setState(["output"])
129+
130+
emit:
131+
output_ch
132+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
manifest {
2+
nextflowVersion = '!>=20.12.1-edge'
3+
}
4+
5+
params {
6+
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
7+
}
8+
9+
// include common settings
10+
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")

0 commit comments

Comments
 (0)