openpipelines-bio · dorien-er · Feb 11, 2025 · Feb 12, 2025 · Feb 18, 2025 · Feb 18, 2025
diff --git a/_viash.yaml b/_viash.yaml
@@ -14,10 +14,15 @@ repositories:
   - name: openpipeline
     repo: openpipelines-bio/openpipeline
     type: github
-    tag: main_build
+    tag: 2.0.0
 
 info:
   test_resources:
     - type: s3
       path: s3://openpipelines-bio/openpipeline_incubator/resources_test
       dest: resources_test
+
+config_mods: |
+  .resources += {path: '/src/utils/labels.config', dest: 'nextflow_labels.config'}
+  .runners[.type == 'nextflow'].directives.tag := '$id'
+  .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'
diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
@@ -33,24 +33,29 @@ argument_groups:
         required: false
         description: |
           The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used.
-
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
+
   - name: Reference data arguments
     description: Dataset to be used as a reference for label transfer and to train annotation algorithms on.
     arguments:
       - name: "--reference"
         type: file
         required: false
+        example: reference.h5mu
         description: |
-          The reference dataset to be used as a reference mapper and to train annotation algorithms on.
-        # example: https://zenodo.org/records/7587774/files/TS_Lung_filtered.h5ad
+          The reference dataset in .h5mu format to be used as a reference mapper and to train annotation algorithms on.
       - name: "--reference_layer_raw_counts"
         type: string
         description: "The layer in the reference dataset containing the raw counts, if .X is not to be used."
         required: false
       - name: "--reference_layer_lognormalized_counts"
         type: string
-        default: log_normalized
-        description: "The layer in the reference dataset containing the log-normalized counts."
+        description: "The layer in the reference dataset containing the log-normalized counts, if .X is not to be used."
       - name: "--reference_var_gene_names"
         type: string
         required: false
@@ -66,15 +71,20 @@ argument_groups:
         example: cell_type
         required: false
         description: The `.obs` key of the target labels to tranfer.
+      - name: "--reference_var_input"
+        type: string
+        required: false
+        description: |
+          .var column containing highly variable genes. By default, do not subset genes.
 
   - name: Annotation methods
     description: The available annotation methods to annotate the query dataset(s) with.
     arguments:
       - name: "--annotation_methods"
         type: string
         multiple: true
-        default: scgpt_annotation
-        choices: [harmony_knn, scgpt_annotation]
+        required: true
+        choices: [celltypist, harmony_knn, scgpt_annotation, scvi_knn]
         example: harmony_knn
 
   - name: "Pre-processing options: RNA filtering"
@@ -112,30 +122,13 @@ argument_groups:
   - name: "Pre-processing options: Highly variable features detection"
     description: Pre-processing options for detecting highly variable features
     arguments:
-      - name: "--top_n_vars"
+      - name: "--n_hvg"
         type: integer
         description: |
-          Number of top vars to be used to calculate cumulative proportions.
-          If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
-          cumulative proportion to the 20th and 50th most expressed vars.
-        multiple: true
-        multiple_sep: ','
-        required: false
-        default: [50, 100, 200, 500]
-      - name: "--highly_variable_features_var_output"
-        alternatives: ["--filter_with_hvg_var_output"]
-        required: false
-        type: string
-        default: "filter_with_hvg"
-        description: In which .var slot to store a boolean array corresponding to the highly variable genes.
-      - name: "--highly_variable_features_obs_batch_key"
-        alternatives: ["--filter_with_hvg_obs_batch_key"]
-        type: string
-        default: "sample_id"
-        required: false
-        description: |
-          If specified, highly-variable genes are selected within each batch separately and merged. This simple 
-          process avoids the selection of batch-specific genes and acts as a lightweight batch correction method.
+          Number of highly-variable features to keep.
+          Only relevant if HVGs need to be calculated across query and reference datasets (e.g. for --annotation_methods scvi_knn and harmony_knn).
+          For reference mapping-based methods, the HVGs specified in --reference_var_input will be used.
+        default: 2000
 
   - name: "Pre-processing options: Mitochondrial Gene Detection"
     description: Pre-processing options for detecting mitochondrial genes
@@ -176,7 +169,7 @@ argument_groups:
         example: "ercc,highly_variable"
 
   - name: Harmony integration options
-    description: Specifications for harmony integration. Only relevant for annotation method 'harmony_knn'.
+    description: Specifications for harmony integration.
     arguments:
       - name: "--harmony_theta"
         type: double
@@ -188,6 +181,47 @@ argument_groups:
         example: [0, 1, 2]
         multiple: true
 
+  - name: SCVI integration options
+    description: Specifications for SCVI integration.
+    arguments:
+      - name: "--scvi_early_stopping"
+        required: false
+        type: boolean
+        description: "Whether to perform early stopping with respect to the validation set."
+      - name: "--scvi_early_stopping_monitor"
+        choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"]
+        default: "elbo_validation"
+        type: string
+        description: "Metric logged during validation set epoch."
+      - name: "--scvi_early_stopping_patience"
+        type: integer
+        min: 1
+        default: 45
+        description: "Number of validation epochs with no improvement after which training will be stopped."
+      - name: "--scvi_early_stopping_min_delta"
+        min: 0
+        type: double
+        default: 0.0
+        description: "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement."
+      - name: "--scvi_max_epochs"
+        type: integer
+        description: "Number of passes through the dataset. Defaults to (20000 / number of cells) * 400 or 400, whichever is smaller."
+        required: false
+      - name: "--scvi_reduce_lr_on_plateau"
+        description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus."
+        type: boolean
+        default: true
+      - name: "--scvi_lr_factor"
+        description: "Factor to reduce learning rate."
+        type: double
+        default: 0.6
+        min: 0
+      - name: "--scvi_lr_patience"
+        description: "Number of epochs with no improvement after which learning rate will be reduced."
+        type: double  # NOTE(review): an epoch count declared as double; confirm this is intentional (e.g. mirrors the wrapped component's type)
+        default: 30
+        min: 0
+
   - name: scGPT reference model
     description: scGPT model input, required for scGPT annotation methods
     arguments:
@@ -263,6 +297,45 @@ argument_groups:
         description: |
           Seed for random number generation used for binning. If not set, no seed is used.
 
+  - name: CellTypist reference model
+    description: The CellTypist reference model to use for annotation. If not provided, the reference dataset will be used for model training.
+    arguments:
+      - name: "--celltypist_model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+
+  - name: CellTypist annotation options
+    description: Specifications for CellTypist annotation.
+    arguments:
+      - name: "--celltypist_feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: false
+      - name: "--celltypist_majority_voting"
+        type: boolean
+        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
+        default: false
+      - name: "--celltypist_C"
+        type: double
+        description: "Inverse of regularization strength in logistic regression."
+        default: 1.0
+      - name: "--celltypist_max_iter"
+        type: integer
+        description: "Maximum number of iterations before reaching the minimum of the cost function."
+        default: 1000
+      - name: "--celltypist_use_SGD"
+        type: boolean_true
+        description: "Whether to use the stochastic gradient descent algorithm."
+      - name: "--celltypist_min_prop"
+        type: double
+        description: |
+          For the dominant cell type within a subcluster, the minimum proportion of cells required to
+          support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
+          Subcluster that fails to pass this proportion threshold will be assigned 'Heterogeneous'.
+        default: 0
+
   - name: Outputs
     description: The output file to write the annotated dataset to.
     arguments:
@@ -275,27 +348,43 @@ argument_groups:
       example: output.h5mu
 
 dependencies:
-  - name: metadata/add_id
-    repository: op
-  - name: dataflow/split_h5mu
-    repository: op
   - name: workflows/multiomics/process_samples
     alias: process_samples_workflow
-    repository: op
+    repository: openpipeline
   - name: workflows/annotation/scgpt_annotation
-    alias: scgpt_annotation_workflow
-    repository: op
+    repository: op-main
+  - name: annotate/celltypist
+    repository: op-main
+    alias: celltypist_annotation
+  - name: workflows/annotation/harmony_knn
+    repository: op-main
+    alias: harmony_knn_annotation
+  - name: workflows/annotation/scvi_knn
+    repository: op-scvi
+    alias: scvi_knn_annotation
 
 repositories:
-  - name: op
+  - name: op-main
+    type: github
+    repo: openpipelines-bio/openpipeline
+    tag: main_build  # NOTE(review): replaces the pinned 2.0.0 tag; looks like a moving build rather than a fixed release, confirm intended
+  - name: op-scvi
     type: github
     repo: openpipelines-bio/openpipeline
-    tag: 2.0.0
+    tag: scvi-knn-annotation_build  # NOTE(review): presumably a feature-branch build; confirm it is re-pinned to a release before merging
 
 resources:
   - type: nextflow_script
     path: main.nf
     entrypoint: run_wf
 
+test_resources:
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+
 runners:
   - type: nextflow
diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -entry test_wf \
+  -resume \
+  -profile docker,no_publish \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config
+
+nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -profile docker,no_publish \
+  -resume \
+  -entry test_wf_2 \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config
+
+  nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -profile docker,no_publish \
+  -resume \
+  -entry test_wf_3 \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config
+
+  nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -profile docker,no_publish \
+  -resume \
+  -entry test_wf_4 \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config