From 69a25876fe26cccbe0d583a779e1f1b80c7108dd Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 11 Feb 2025 18:47:26 +0100
Subject: [PATCH 01/21] add celltypist

---
 src/atlas_service/config.vsh.yaml |  54 ++++++++++++++-
 src/atlas_service/main.nf         | 108 ++++++++++++++++++------------
 src/atlas_service/test.sh         |   6 +-
 3 files changed, 123 insertions(+), 45 deletions(-)

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index c02e9ff..0100105 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -33,6 +33,12 @@ argument_groups:
         required: false
         description: |
           The name of the adata var column containing gene names; when no gene_name_layer is provided, the var index will be used.
+      - name: "--input_reference_gene_overlap"
+        type: integer
+        default: 100
+        min: 1
+        description: | 
+          The minimum number of genes present in both the reference and query datasets.
   
   - name: Reference data arguments
     description: Dataset to be used as a reference for label transfer and to train annotation algorithms on.
@@ -74,7 +80,7 @@ argument_groups:
         type: string
         multiple: true
         default: scgpt_annotation
-        choices: [harmony_knn, scgpt_annotation]
+        choices: [celltypist, harmony_knn, scgpt_annotation]
         example: harmony_knn
         
   - name: "Pre-processing options: RNA filtering"
@@ -263,6 +269,45 @@ argument_groups:
         description: |
           Seed for random number generation used for binning. If not set, no seed is used.
 
+  - name: CellTypist reference model
+    description: The CellTypist reference model to use for annotation. If not provided, the reference dataset will be used for model training.
+    arguments:
+      - name: "--celltypist_model"
+        type: file
+        description: "Pretrained model in pkl format. If not provided, the model will be trained on the reference data and --reference should be provided."
+        required: false
+        example: pretrained_model.pkl
+
+  - name: CellTypist annotation options
+    description: Specifications for CellTypist annotation.
+    arguments:
+      - name: "--celltypist_feature_selection"
+        type: boolean
+        description: "Whether to perform feature selection."
+        default: false
+      - name: "--celltypist_majority_voting"
+        type: boolean
+        description: "Whether to refine the predicted labels by running the majority voting classifier after over-clustering."
+        default: false
+      - name: "--celltypist_C"
+        type: double
+        description: "Inverse of regularization strength in logistic regression."
+        default: 1.0
+      - name: "--celltypist_max_iter"
+        type: integer
+        description: "Maximum number of iterations before reaching the minimum of the cost function."
+        default: 1000
+      - name: "--celltypist_use_SGD"
+        type: boolean_true
+        description: "Whether to use the stochastic gradient descent algorithm."
+      - name: "--celltypist_min_prop"
+        type: double
+        description: |
+          For the dominant cell type within a subcluster, the minimum proportion of cells required to
+          support naming of the subcluster by this cell type. Ignored if majority_voting is set to False.
+          Subclusters that fail to pass this proportion threshold will be assigned 'Heterogeneous'.
+        default: 0
+
   - name: Outputs
     description: The output file to write the annotated dataset to.
     arguments:
@@ -285,12 +330,19 @@ dependencies:
   - name: workflows/annotation/scgpt_annotation
     alias: scgpt_annotation_workflow
     repository: op
+  - name: annotate/celltypist
+    repository: op
 
 repositories:
   - name: op
     type: github
     repo: openpipelines-bio/openpipeline
     tag: 2.0.0
+  - name: op-harmony
+    type: github
+    repo: openpipelines-bio/openpipeline
+    tag: harmony_knn_annoation_workflow_build
+
 
 resources:
   - type: nextflow_script
diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index 2079645..f6af038 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -4,14 +4,10 @@ workflow run_wf {
 
   main:
     output_ch = input_ch
-    | map {id, state ->
-      def new_state = state + ["query_processed": state.output]
+    | map { id, state ->
+      def new_state = state + [ "query_processed": state.output, "_meta": ["join_id": id] ]
       [id, new_state]
-      }
-    // | map{ id, state -> 
-    //     def new_state = state + ["_meta": ["join_id": id]]
-    //     [id, new_state]
-    //   }
+    }
     | process_samples_workflow.run(
       fromState: {id, state ->
         def newState = [
@@ -32,46 +28,76 @@ workflow run_wf {
           "mitochondrial_gene_regex": state.mitochondrial_gene_regex,
           "var_qc_metrics": state.var_qc_metrics,
           "top_n_vars": state.top_n_vars,
-          ]
+        ]  
       },
       args: [
         "pca_overwrite": "true",
         "add_id_obs_output": "sample_id"
       ],
       toState: ["query_processed": "output"], 
-      )
-      | view {"After processing query: $it"}
-      | scgpt_annotation_workflow.run(
-        runIf: { id, state -> state.annotation_methods.contains("scgpt_annotation") },
-        fromState: { id, state ->
-          [ 
-            "id": id,
-            "input": state.query_processed,
-            "modality": state.modality,
-            "input_layer": state.input_layer,
-            "input_var_gene_names": state.input_var_gene_names,
-            "model": state.scgpt_model,
-            "model_config": state.scgpt_model_config,
-            "model_vocab": state.scgpt_model_vocab,
-            "finetuned_checkpoints_key": state.scgpt_finetuned_checkpoints_key,
-            "label_mapper_key": state.scgpt_label_mapper_key,
-            "pad_token": state.scgpt_pad_token,
-            "pad_value": state.scgpt_pad_value,
-            "n_hvg": state.scgpt_n_hvg,
-            "dsbn": state.scgpt_dsbn,
-            "batch_size": state.scgpt_batch_size,
-            "n_input_bins": state.scgpt_n_input_bins,
-            "seed": state.scgpt_seed
-          ]
-        },
-        args: [
-          "input_obs_batch_label": "sample_id",
-          "output_obs_predictions": "scgpt_pred",
-          "output_obs_probability": "scgpt_proba"
-        ],
-        toState: [ "query_processed": "output" ]
-        )
-      | setState(["output": "query_processed", "_meta": "_meta"])
+    )
+
+    | view {"After processing query: $it"}
+
+    | scgpt_annotation_workflow.run(
+      runIf: { id, state -> state.annotation_methods.contains("scgpt_annotation") },
+      fromState: { id, state ->
+        [ 
+          "id": id,
+          "input": state.query_processed,
+          "modality": state.modality,
+          "input_layer": state.input_layer,
+          "input_var_gene_names": state.input_var_gene_names,
+          "model": state.scgpt_model,
+          "model_config": state.scgpt_model_config,
+          "model_vocab": state.scgpt_model_vocab,
+          "finetuned_checkpoints_key": state.scgpt_finetuned_checkpoints_key,
+          "label_mapper_key": state.scgpt_label_mapper_key,
+          "pad_token": state.scgpt_pad_token,
+          "pad_value": state.scgpt_pad_value,
+          "n_hvg": state.scgpt_n_hvg,
+          "dsbn": state.scgpt_dsbn,
+          "batch_size": state.scgpt_batch_size,
+          "n_input_bins": state.scgpt_n_input_bins,
+          "seed": state.scgpt_seed
+        ]
+      },
+      args: [
+        "input_obs_batch_label": "sample_id",
+        "output_obs_predictions": "scgpt_pred",
+        "output_obs_probability": "scgpt_proba"
+      ],
+      toState: [ "query_processed": "output" ]
+    )
+
+    | view {"After scgpt: $it"}
+
+    // | celltypist.run(
+    //   runIf: { id, state -> state.annotation_methods.contains("celltypist") && state.celltypist_model },
+    //   fromState: [ 
+    //     "input": "query_processed",
+    //     "modality": "modality",
+    //     "input_layer": "input_layer",
+    //     "input_var_gene_names": "input_var_gene_names",
+    //     "input_reference_gene_overlap": "input_reference_gene_overlap",
+    //     "model": "celltypist_model",
+    //     "majority_voting": "celltypist_majority_voting"
+    //   ],
+    //   args: [
+    //     "output_obs_predictions": "celltypist_pred",
+    //     "output_obs_probability": "celltypist_proba"
+    //   ],
+    //   toState: [ "query_processed": "output" ]
+    // )
+
+    // | view {"After celltypist: $it"}
+    | map {id, state ->
+      def new_state = state + ["output": state.query_processed]
+      [id, new_state]
+    }
+    | view {"After mapping: $it"}
+    | setState(["output", "_meta"])
+    | view {"After setstate: $it"}
 
   emit:
     output_ch
diff --git a/src/atlas_service/test.sh b/src/atlas_service/test.sh
index 924be40..0f16f83 100755
--- a/src/atlas_service/test.sh
+++ b/src/atlas_service/test.sh
@@ -3,17 +3,17 @@ id: run
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
+celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
 annotation_methods: scgpt_annotation
 scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
 scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
 scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
-publish_dir: output
 HERE
 
 nextflow run . \
 -main-script target/nextflow/atlas_service/main.nf \
 -params-file params.yaml \
 -resume \
--profile docker \
+-profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
\ No newline at end of file
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config

From 58f9e27a92ae9a3ea00ce7a503b9a65e501825ad Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Wed, 12 Feb 2025 11:29:47 +0100
Subject: [PATCH 02/21] update package name

---
 _viash.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_viash.yaml b/_viash.yaml
index 8c6dcd5..6a7d20e 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -3,7 +3,7 @@ viash_version: 0.9.1
 source: src
 target: target
 
-name: incubator
+name: openpipeline_incubator
 organization: openpipelines-bio
 
 links:

From 83cb586ce4466bd0942b2e071eb60cf3afe18d33 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 18 Feb 2025 19:02:49 +0100
Subject: [PATCH 03/21] add harmony

---
 src/atlas_service/config.vsh.yaml |  30 +++++--
 src/atlas_service/main.nf         | 125 ++++++++++++++++++++++++------
 src/atlas_service/test.sh         |  56 ++++++++++++-
 src/atlas_service/test_params.sh  |  53 +++++++++++++
 4 files changed, 233 insertions(+), 31 deletions(-)
 create mode 100755 src/atlas_service/test_params.sh

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index 0100105..447dfad 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -39,6 +39,9 @@ argument_groups:
         min: 1
         description: | 
           The minimum number of genes present in both the reference and query datasets.
+      - name: "--overwrite_existing_key"
+        type: boolean_true
+        description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.
   
   - name: Reference data arguments
     description: Dataset to be used as a reference for label transfer and to train annotation algorithms on.
@@ -46,17 +49,16 @@ argument_groups:
       - name: "--reference"
         type: file
         required: false
+        example: reference.h5mu
         description: |
-          The reference dataset to be used as a reference mapper and to train annotation algorithms on.
-        # example: https://zenodo.org/records/7587774/files/TS_Lung_filtered.h5ad
+          The reference dataset in .h5mu format to be used as a reference mapper and to train annotation algorithms on.
       - name: "--reference_layer_raw_counts"
         type: string
         description: "The layer in the reference dataset containing the raw counts, if .X is not to be used."
         required: false
       - name: "--reference_layer_lognormalized_counts"
         type: string
-        default: log_normalized
-        description: "The layer in the reference dataset containing the log-normalized counts."
+        description: "The layer in the reference dataset containing the log-normalized counts, if .X is not to be used."
       - name: "--reference_var_gene_names"
         type: string
         required: false
@@ -72,6 +74,11 @@ argument_groups:
         example: cell_type
         required: false
         description: The `.obs` key of the target labels to tranfer.
+      - name: "--reference_var_input"
+        type: string
+        required: false
+        description: |
+          .var column containing highly variable genes. By default, do not subset genes.
 
   - name: Annotation methods
     description: The available annotation methods to annotate the query dataset(s) with.
@@ -328,21 +335,28 @@ dependencies:
     alias: process_samples_workflow
     repository: op
   - name: workflows/annotation/scgpt_annotation
-    alias: scgpt_annotation_workflow
     repository: op
   - name: annotate/celltypist
-    repository: op
+    repository: op-celltypist
+    alias: celltypist_annotation
+  - name: workflows/annotation/harmony_knn
+    repository: op-harmony
+    alias: harmony_knn_annotation
+
 
 repositories:
   - name: op
     type: github
     repo: openpipelines-bio/openpipeline
     tag: 2.0.0
+  - name: op-celltypist
+    type: github
+    repo: openpipelines-bio/openpipeline
+    tag: celltypist-layer-passing_build
   - name: op-harmony
     type: github
     repo: openpipelines-bio/openpipeline
-    tag: harmony_knn_annoation_workflow_build
-
+    tag: harmony-knn-annoation-workflow_build
 
 resources:
   - type: nextflow_script
diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index f6af038..c0f29a3 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -8,6 +8,28 @@ workflow run_wf {
       def new_state = state + [ "query_processed": state.output, "_meta": ["join_id": id] ]
       [id, new_state]
     }
+    // Enforce annotation method-specific required arguments
+    | map { id, state ->
+      def new_state = [:]
+      // Check scGPT arguments
+      if (state.annotation_methods.contains("scgpt_annotation") && 
+        (!state.scgpt_model || !state.scgpt_model_config || !state.scgp_model_vocab)) {
+        throw new RuntimeException("Using scgpt_annotation requires --scgpt_model, --scgpt_model_config and --scgp_model_vocab parameters.")
+        }
+      // Check CellTypist arguments
+      if (state.annotation_methods.contains("celltypist") && 
+        (!state.celltypist_model && !state.reference)) {
+        throw new RuntimeException("Celltypist was selected as an annotation method. Either --celltypist_model or --reference must be provided.")
+        }
+      if (state.annotation_methods.contains("celltypist") && state.celltypist_model && state.reference )  {
+        System.err.println(
+          "Warning: --celltypist_model is set and a --reference was provided. \
+          The pre-trained Celltypist model will be used for annotation, the reference will be ignored."
+          )
+        }
+
+      [id, state + new_state]
+    }
     | process_samples_workflow.run(
       fromState: {id, state ->
         def newState = [
@@ -37,9 +59,7 @@ workflow run_wf {
       toState: ["query_processed": "output"], 
     )
 
-    | view {"After processing query: $it"}
-
-    | scgpt_annotation_workflow.run(
+    | scgpt_annotation.run(
       runIf: { id, state -> state.annotation_methods.contains("scgpt_annotation") },
       fromState: { id, state ->
         [ 
@@ -70,27 +90,88 @@ workflow run_wf {
       toState: [ "query_processed": "output" ]
     )
 
-    | view {"After scgpt: $it"}
+    | celltypist_annotation.run(
+      runIf: { id, state -> state.annotation_methods.contains("celltypist") && state.celltypist_model },
+      fromState: [ 
+        "input": "query_processed",
+        "modality": "modality",
+        "input_var_gene_names": "input_var_gene_names",
+        "input_reference_gene_overlap": "input_reference_gene_overlap",
+        "model": "celltypist_model",
+        "majority_voting": "celltypist_majority_voting"
+      ],
+      args: [
+        // log normalized counts are expected for celltypist
+        "input_layer": "log_normalized",
+        "output_obs_predictions": "celltypist_pred",
+        "output_obs_probability": "celltypist_proba"
+      ],
+      toState: [ "query_processed": "output" ]
+    )
+
+    | celltypist.run(
+      runIf: { id, state -> state.annotation_methods.contains("celltypist") && !state.celltypist_model },
+      fromState: [
+        "input": "query_processed",
+        "modality": "modality",
+        "input_var_gene_names": "input_var_gene_names",
+        "input_reference_gene_overlap": "input_reference_gene_overlap",
+        "reference": "reference",
+        "reference_layer": "reference_layer_lognormalized_counts",
+        "reference_obs_target": "reference_obs_label",
+        "reference_var_gene_names": "reference_var_gene_names",
+        "reference_obs_batch": "reference_obs_batch",
+        "reference_var_input": "reference_var_input",
+        "feature_selection": "celltypist_feature_selection",
+        "C": "celltypist_C",
+        "max_iter": "celltypist_max_iter",
+        "use_SGD": "celltypist_use_SGD",
+        "min_prop": "celltypist_min_prop",
+        "majority_voting": "celltypist_majority_voting"
+      ],
+      args: [
+        // log normalized counts are expected for celltypist
+        "input_layer": "log_normalized",
+        "check_expression": "true",
+        "output_obs_predictions": "celltypist_pred",
+        "output_obs_probability": "celltypist_proba"
+      ],
+      toState: [ "query_processed": "output" ]
+    )
 
-    // | celltypist.run(
-    //   runIf: { id, state -> state.annotation_methods.contains("celltypist") && state.celltypist_model },
-    //   fromState: [ 
-    //     "input": "query_processed",
-    //     "modality": "modality",
-    //     "input_layer": "input_layer",
-    //     "input_var_gene_names": "input_var_gene_names",
-    //     "input_reference_gene_overlap": "input_reference_gene_overlap",
-    //     "model": "celltypist_model",
-    //     "majority_voting": "celltypist_majority_voting"
-    //   ],
-    //   args: [
-    //     "output_obs_predictions": "celltypist_pred",
-    //     "output_obs_probability": "celltypist_proba"
-    //   ],
-    //   toState: [ "query_processed": "output" ]
-    // )
+    | harmony_knn_annotation.run(
+      runIf: { id, state -> state.annotation_methods.contains("harmony_knn") },
+      fromState: { id, state ->
+        [ 
+          "id": id,
+          "input": state.query_processed,
+          "modality": state.modality,
+          "input_layer": state.input_layer,
+          "input_var_gene_names": state.input_var_gene_names,
+          "input_reference_gene_overlap": state.input_reference_gene_overlap,
+          "overwrite_existing_key": state.overwrite_existing_key,
+          "reference": state.reference,
+          "reference_layer": state.reference_layer_raw_counts,
+          "reference_obs_target": state.reference_obs_label,
+          "reference_var_gene_names": state.reference_var_gene_names,
+          "reference_obs_batch_label": state.reference_obs_batch,
+          "harmony_theta": state.harmony_theta,
+          // disable arguments for pca/leiden/knn for now
+          // "pca_num_components": state.pca_num_components,
+          // "leiden_resolution": state.leiden_resolution,
+          // "knn_weights": state.knn_weights,
+          // "knn_n_neighbors": state.knn_n_neighbors
+        ]
+      },
+      args: [
+        "input_obs_batch_label": "sample_id",
+        "output_obs_predictions": "harmony_knn_pred",
+        "output_obs_probability": "harmony_knn_proba",
+        "output_obsm_integrated": "X_integrated_harmony",
+      ],
+      toState: [ "query_processed": "output" ]
+    )
 
-    // | view {"After celltypist: $it"}
     | map {id, state ->
       def new_state = state + ["output": state.query_processed]
       [id, new_state]
diff --git a/src/atlas_service/test.sh b/src/atlas_service/test.sh
index 0f16f83..1e66345 100755
--- a/src/atlas_service/test.sh
+++ b/src/atlas_service/test.sh
@@ -4,7 +4,7 @@ input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/p
 modality: rna
 input_var_gene_names: gene_symbol
 celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
-annotation_methods: scgpt_annotation
+annotation_methods: scgpt_annotation;celltypist
 scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
 scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
 scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
@@ -17,3 +17,57 @@ nextflow run . \
 -profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
 -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+
+# Test required arguments scGPT
+cat > params.yaml << HERE
+id: run
+input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+annotation_methods: scgpt_annotation;celltypist
+scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
+scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+
+# Test required arguments CellTypist
+cat > params.yaml << HERE
+id: run
+input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+annotation_methods: scgpt_annotation;celltypist
+celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+
+cat > params.yaml << HERE
+id: run
+input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+annotation_methods: scgpt_annotation;celltypist
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
\ No newline at end of file
diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh
new file mode 100755
index 0000000..531b868
--- /dev/null
+++ b/src/atlas_service/test_params.sh
@@ -0,0 +1,53 @@
+# Test required arguments scGPT
+cat > params.yaml << HERE
+id: scgpt_no_params
+input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+annotation_methods: scgpt_annotation
+scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
+scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+
+# Test required arguments CellTypist
+cat > params.yaml << HERE
+id: celltypist_overlapping_params
+input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+annotation_methods: celltypist
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+
+cat > params.yaml << HERE
+id: celltypist_no_params
+input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+annotation_methods: scgpt,celltypist
+celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config

From 56102b88912afe1e84789c74b2cb5f966f78e446 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 18 Feb 2025 20:31:57 +0100
Subject: [PATCH 04/21] validate harmony_knn reference, fix scGPT vocab check, rename test script

---
 src/atlas_service/config.vsh.yaml             |  1 -
 src/atlas_service/main.nf                     |  9 +++-
 .../{test.sh => test_execution.sh}            | 46 ++++++-------------
 src/atlas_service/test_params.sh              | 13 +++---
 4 files changed, 27 insertions(+), 42 deletions(-)
 rename src/atlas_service/{test.sh => test_execution.sh} (62%)

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index 447dfad..0570fc5 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -343,7 +343,6 @@ dependencies:
     repository: op-harmony
     alias: harmony_knn_annotation
 
-
 repositories:
   - name: op
     type: github
diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index c0f29a3..6778924 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -9,11 +9,12 @@ workflow run_wf {
       [id, new_state]
     }
     // Enforce annotation method-specific required arguments
+    | niceView()
     | map { id, state ->
       def new_state = [:]
       // Check scGPT arguments
       if (state.annotation_methods.contains("scgpt_annotation") && 
-        (!state.scgpt_model || !state.scgpt_model_config || !state.scgp_model_vocab)) {
-        throw new RuntimeException("Using scgpt_annotation requires --scgpt_model, --scgpt_model_config and --scgp_model_vocab parameters.")
+        (!state.scgpt_model || !state.scgpt_model_config || !state.scgpt_model_vocab)) {
+        throw new RuntimeException("Using scgpt_annotation requires --scgpt_model, --scgpt_model_config and --scgpt_model_vocab parameters.")
         }
       // Check CellTypist arguments
@@ -27,6 +28,10 @@ workflow run_wf {
           The pre-trained Celltypist model will be used for annotation, the reference will be ignored."
           )
         }
+      // Check Harmony KNN arguments
+      if (state.annotation_methods.contains("harmony_knn") && !state.reference) {
+        throw new RuntimeException("Harmony KNN was selected as an annotation method. A --reference dataset must be provided.")
+        }
 
       [id, state + new_state]
     }
@@ -109,7 +114,7 @@ workflow run_wf {
       toState: [ "query_processed": "output" ]
     )
 
-    | celltypist.run(
+    | celltypist_annotation.run(
       runIf: { id, state -> state.annotation_methods.contains("celltypist") && !state.celltypist_model },
       fromState: [
         "input": "query_processed",
diff --git a/src/atlas_service/test.sh b/src/atlas_service/test_execution.sh
similarity index 62%
rename from src/atlas_service/test.sh
rename to src/atlas_service/test_execution.sh
index 1e66345..7a0620f 100755
--- a/src/atlas_service/test.sh
+++ b/src/atlas_service/test_execution.sh
@@ -1,13 +1,13 @@
 cat > params.yaml << HERE
-id: run
+id: harmony
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
-input_var_gene_names: gene_symbol
+reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+reference_var_gene_names: ensemblid
+reference_obs_batch: donor_assay
+reference_obs_label: cell_type
 celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
-annotation_methods: scgpt_annotation;celltypist
-scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
-scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
-scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
+annotation_methods: harmony_knn
 HERE
 
 nextflow run . \
@@ -18,15 +18,13 @@ nextflow run . \
 -c target/nextflow/atlas_service/nextflow.config \
 -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
 
-# Test required arguments scGPT
 cat > params.yaml << HERE
-id: run
+id: celltypist
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
-annotation_methods: scgpt_annotation;celltypist
-scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
-scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
+annotation_methods: celltypist
+publish_dir: output
 HERE
 
 nextflow run . \
@@ -37,15 +35,15 @@ nextflow run . \
 -c target/nextflow/atlas_service/nextflow.config \
 -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
 
-# Test required arguments CellTypist
 cat > params.yaml << HERE
-id: run
+id: scgpt
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
+annotation_methods: scgpt_annotation
 input_var_gene_names: gene_symbol
-reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
-annotation_methods: scgpt_annotation;celltypist
-celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
+scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
+scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
 HERE
 
 nextflow run . \
@@ -55,19 +53,3 @@ nextflow run . \
 -profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
 -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
-
-cat > params.yaml << HERE
-id: run
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: scgpt_annotation;celltypist
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
\ No newline at end of file
diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh
index 531b868..4bbcc7f 100755
--- a/src/atlas_service/test_params.sh
+++ b/src/atlas_service/test_params.sh
@@ -1,6 +1,6 @@
 # Test required arguments scGPT
 cat > params.yaml << HERE
-id: scgpt_no_params
+id: scgpt
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
@@ -19,11 +19,11 @@ nextflow run . \
 
 # Test required arguments CellTypist
 cat > params.yaml << HERE
-id: celltypist_overlapping_params
+id: celltypist
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
-annotation_methods: celltypist
+annotation_methods: scgpt,celltypist
 HERE
 
 nextflow run . \
@@ -34,14 +34,13 @@ nextflow run . \
 -c target/nextflow/atlas_service/nextflow.config \
 -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
 
+# Test required arguments CellTypist
 cat > params.yaml << HERE
-id: celltypist_no_params
+id: celltypist
 input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
-reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
-annotation_methods: scgpt,celltypist
-celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+annotation_methods: harmony_knn
 HERE
 
 nextflow run . \

From f855d33627774cd693734e97383770d5a7f60599 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Fri, 21 Feb 2025 17:06:45 +0000
Subject: [PATCH 05/21] add scvi-knn annotation workflow

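---

Notes:
    Review: in main.nf the added `scvi_knn_annotation.run(...)` step takes over
    the context lines of the former harmony block. It is therefore still gated
    by `runIf: ... contains("harmony_knn")` and still writes to
    `harmony_knn_pred`, `harmony_knn_proba` and `X_integrated_harmony`.
    As written, selecting only `scvi_knn` does not trigger the step, and when
    `harmony_knn` is selected both steps target the same output fields.
    `--scvi_early_stopping_monitor` is declared in config.vsh.yaml but is not
    among the `scvi_*` keys this patch adds to the step's `fromState`.
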
 src/atlas_service/config.vsh.yaml   | 86 +++++++++++++++++++----------
 src/atlas_service/main.nf           | 53 ++++++++++++++----
 src/atlas_service/test_execution.sh | 75 ++++++++++++-------------
 3 files changed, 136 insertions(+), 78 deletions(-)

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index 0570fc5..5de5bdc 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -39,10 +39,7 @@ argument_groups:
         min: 1
         description: | 
           The minimum number of genes present in both the reference and query datasets.
-      - name: "--overwrite_existing_key"
-        type: boolean_true
-        description: If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process.
-  
+
   - name: Reference data arguments
     description: Dataset to be used as a reference for label transfer and to train annotation algorithms on.
     arguments:
@@ -86,8 +83,8 @@ argument_groups:
       - name: "--annotation_methods"
         type: string
         multiple: true
-        default: scgpt_annotation
-        choices: [celltypist, harmony_knn, scgpt_annotation]
+        required: true
+        choices: [celltypist, harmony_knn, scgpt_annotation, scvi_knn]
         example: harmony_knn
         
   - name: "Pre-processing options: RNA filtering"
@@ -125,30 +122,13 @@ argument_groups:
   - name: "Pre-processing options: Highly variable features detection"
     description: Pre-processing options for detecting highly variable features
     arguments:
-      - name: "--top_n_vars"
+      - name: "--n_hvg"
         type: integer
         description: |
-          Number of top vars to be used to calculate cumulative proportions.
-          If not specified, proportions are not calculated. `--top_n_vars 20,50` finds
-          cumulative proportion to the 20th and 50th most expressed vars.
-        multiple: true
-        multiple_sep: ','
-        required: false
-        default: [50, 100, 200, 500]
-      - name: "--highly_variable_features_var_output"
-        alternatives: ["--filter_with_hvg_var_output"]
-        required: false
-        type: string
-        default: "filter_with_hvg"
-        description: In which .var slot to store a boolean array corresponding to the highly variable genes.
-      - name: "--highly_variable_features_obs_batch_key"
-        alternatives: ["--filter_with_hvg_obs_batch_key"]
-        type: string
-        default: "sample_id"
-        required: false
-        description: |
-          If specified, highly-variable genes are selected within each batch separately and merged. This simple 
-          process avoids the selection of batch-specific genes and acts as a lightweight batch correction method.
+          Number of highly-variable features to keep. 
+          Only relevant if HVG need to be calculated across query and reference datasets (e.g. for --annotation_methods scvi_knn and harmony_knn). 
+          For reference mapping-based methods, the HVG's specified in --reference_var_input will be used.
+        default: 2000
   
   - name: "Pre-processing options: Mitochondrial Gene Detection"
     description: Pre-processing options for detecting mitochondrial genes
@@ -189,7 +169,7 @@ argument_groups:
         example: "ercc,highly_variable"
 
   - name: Harmony integration options
-    description: Specifications for harmony integration. Only relevant for annotation method 'harmony_knn'.
+    description: Specifications for harmony integration.
     arguments:
       - name: "--harmony_theta"
         type: double
@@ -201,6 +181,47 @@ argument_groups:
         example: [0, 1, 2]
         multiple: true
 
+  - name: SCVI integration options
+    description: Specifications for SCVI integration.
+    arguments:
+      - name: "--scvi_early_stopping"
+        required: false
+        type: boolean
+        description: "Whether to perform early stopping with respect to the validation set."
+      - name: "--scvi_early_stopping_monitor"
+        choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"]
+        default: "elbo_validation"
+        type: string
+        description: "Metric logged during validation set epoch."
+      - name: "--scvi_early_stopping_patience"
+        type: integer
+        min: 1
+        default: 45
+        description: "Number of validation epochs with no improvement after which training will be stopped."
+      - name: "--scvi_early_stopping_min_delta"
+        min: 0
+        type: double
+        default: 0.0
+        description: "Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement."
+      - name: "--scvi_max_epochs"
+        type: integer
+        description: "Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400; whichever is smallest."
+        required: false
+      - name: "--scvi_reduce_lr_on_plateau"
+        description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus."
+        type: boolean
+        default: True
+      - name: "--scvi_lr_factor"
+        description: "Factor to reduce learning rate."
+        type: double
+        default: 0.6
+        min: 0
+      - name: "--scvi_lr_patience"
+        description: "Number of epochs with no improvement after which learning rate will be reduced."
+        type: double
+        default: 30
+        min: 0
+
   - name: scGPT reference model
     description: scGPT model input, required for scGPT annotation methods
     arguments:
@@ -342,6 +363,9 @@ dependencies:
   - name: workflows/annotation/harmony_knn
     repository: op-harmony
     alias: harmony_knn_annotation
+  - name: workflows/annotation/scvi_knn
+    repository: op-scvi
+    alias: scvi_knn_annotation
 
 repositories:
   - name: op
@@ -356,6 +380,10 @@ repositories:
     type: github
     repo: openpipelines-bio/openpipeline
     tag: harmony-knn-annoation-workflow_build
+  - name: op-scvi
+    type: github
+    repo: openpipelines-bio/openpipeline
+    tag: scvi-knn-annotation_build
 
 resources:
   - type: nextflow_script
diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index 6778924..270d80c 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -9,7 +9,6 @@ workflow run_wf {
       [id, new_state]
     }
     // Enforce annotation method-specific required arguments
-    | niceView()
     | map { id, state ->
       def new_state = [:]
       // Check scGPT arguments
@@ -48,13 +47,10 @@ workflow run_wf {
           "rna_min_cells_per_gene": state.rna_min_cells_per_gene,
           "rna_min_fraction_mito": state.rna_min_fraction_mito,
           "rna_max_fraction_mito": state.rna_max_fraction_mito,
-          "highly_variable_features_var_output": state.highly_variable_features_var_output,
-          "highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key,
           "var_name_mitochondrial_genes": state.var_name_mitochondrial_genes,
           "var_gene_names": state.input_var_gene_names,
           "mitochondrial_gene_regex": state.mitochondrial_gene_regex,
-          "var_qc_metrics": state.var_qc_metrics,
-          "top_n_vars": state.top_n_vars,
+          "var_qc_metrics": state.var_qc_metrics
         ]  
       },
       args: [
@@ -145,6 +141,35 @@ workflow run_wf {
     )
 
     | harmony_knn_annotation.run(
+      runIf: { id, state -> state.annotation_methods.contains("harmony_knn") },
+      fromState: { id, state ->
+        [ 
+          "id": id,
+          "input": state.query_processed,
+          "modality": state.modality,
+          "input_var_gene_names": state.input_var_gene_names,
+          "input_reference_gene_overlap": state.input_reference_gene_overlap,
+          "reference": state.reference,
+          "reference_layer": state.reference_layer_lognormalized_counts,
+          "reference_obs_target": state.reference_obs_label,
+          "reference_var_gene_names": state.reference_var_gene_names,
+          "reference_obs_batch_label": state.reference_obs_batch,
+          "n_hvg": state.n_hvg,
+          "harmony_theta": state.harmony_theta,
+        ]
+      },
+      args: [
+        "input_layer": "log_normalized",
+        "input_obs_batch_label": "sample_id",
+        "output_obs_predictions": "harmony_knn_pred",
+        "output_obs_probability": "harmony_knn_proba",
+        "output_obsm_integrated": "X_integrated_harmony",
+        "overwrite_existing_key": "true"
+      ],
+      toState: [ "query_processed": "output" ]
+    )
+
+    | scvi_knn_annotation.run(
       runIf: { id, state -> state.annotation_methods.contains("harmony_knn") },
       fromState: { id, state ->
         [ 
@@ -154,25 +179,29 @@ workflow run_wf {
           "input_layer": state.input_layer,
           "input_var_gene_names": state.input_var_gene_names,
           "input_reference_gene_overlap": state.input_reference_gene_overlap,
-          "overwrite_existing_key": state.overwrite_existing_key,
           "reference": state.reference,
           "reference_layer": state.reference_layer_raw_counts,
+          "reference_layer_lognormalized": state.reference_layer_lognormalized_counts,
           "reference_obs_target": state.reference_obs_label,
           "reference_var_gene_names": state.reference_var_gene_names,
           "reference_obs_batch_label": state.reference_obs_batch,
-          "harmony_theta": state.harmony_theta,
-          // disable arguments for pca/leiden/knn for now
-          // "pca_num_components": state.pca_num_components,
-          // "leiden_resolution": state.leiden_resolution,
-          // "knn_weights": state.knn_weights,
-          // "knn_n_neighbors": state.knn_n_neighbors
+          "n_hvg": state.n_hvg,
+          "scvi_early_stopping": state.scvi_early_stopping,
+          "scvi_early_stopping_patience": state.scvi_early_stopping_patience,
+          "scvi_early_stopping_min_delta": state.scvi_early_stopping_min_delta,
+          "scvi_max_epochs": state.scvi_max_epochs,
+          "scvi_reduce_lr_on_plateau": state.scvi_reduce_lr_on_plateau,
+          "scvi_lr_factor": state.scvi_lr_factor,
+          "scvi_lr_patience": state.scvi_lr_patience
         ]
       },
       args: [
+        "input_layer_lognormalized": "log_normalized",
         "input_obs_batch_label": "sample_id",
         "output_obs_predictions": "harmony_knn_pred",
         "output_obs_probability": "harmony_knn_proba",
         "output_obsm_integrated": "X_integrated_harmony",
+        "overwrite_existing_key": "true"
       ],
       toState: [ "query_processed": "output" ]
     )
diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh
index 7a0620f..45e3a06 100755
--- a/src/atlas_service/test_execution.sh
+++ b/src/atlas_service/test_execution.sh
@@ -1,13 +1,14 @@
 cat > params.yaml << HERE
 id: harmony
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
-reference: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+reference_layer_lognormalized_counts: log_normalized
 reference_var_gene_names: ensemblid
 reference_obs_batch: donor_assay
 reference_obs_label: cell_type
-celltypist_model: /Users/dorienroosen/code/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
-annotation_methods: harmony_knn
+celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+annotation_methods: harmony_knn;scvi_knn
 HERE
 
 nextflow run . \
@@ -16,40 +17,40 @@ nextflow run . \
 -resume \
 -profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
 
-cat > params.yaml << HERE
-id: celltypist
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: celltypist
-publish_dir: output
-HERE
+# cat > params.yaml << HERE
+# id: celltypist
+# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+# modality: rna
+# input_var_gene_names: gene_symbol
+# annotation_methods: celltypist
+# publish_dir: output
+# HERE
 
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+# nextflow run . \
+# -main-script target/nextflow/atlas_service/main.nf \
+# -params-file params.yaml \
+# -resume \
+# -profile docker,no_publish \
+# -c target/nextflow/atlas_service/nextflow.config \
+# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
 
-cat > params.yaml << HERE
-id: scgpt
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-annotation_methods: scgpt_annotation
-input_var_gene_names: gene_symbol
-scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
-scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
-scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
-HERE
+# cat > params.yaml << HERE
+# id: scgpt
+# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+# modality: rna
+# annotation_methods: scgpt_annotation
+# input_var_gene_names: gene_symbol
+# scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
+# scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
+# scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
+# HERE
 
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+# nextflow run . \
+# -main-script target/nextflow/atlas_service/main.nf \
+# -params-file params.yaml \
+# -resume \
+# -profile docker,no_publish \
+# -c target/nextflow/atlas_service/nextflow.config \
+# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config

From 90db0253eda30e5fe98b4b732c13bc11a61d02d1 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Wed, 26 Feb 2025 10:19:17 +0000
Subject: [PATCH 06/21] update repos

---
 src/atlas_service/config.vsh.yaml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index 5de5bdc..f18b5ec 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -358,10 +358,10 @@ dependencies:
   - name: workflows/annotation/scgpt_annotation
     repository: op
   - name: annotate/celltypist
-    repository: op-celltypist
+    repository: op-main
     alias: celltypist_annotation
   - name: workflows/annotation/harmony_knn
-    repository: op-harmony
+    repository: op-main
     alias: harmony_knn_annotation
   - name: workflows/annotation/scvi_knn
     repository: op-scvi
@@ -372,14 +372,10 @@ repositories:
     type: github
     repo: openpipelines-bio/openpipeline
     tag: 2.0.0
-  - name: op-celltypist
+  - name: op-main
     type: github
     repo: openpipelines-bio/openpipeline
-    tag: celltypist-layer-passing_build
-  - name: op-harmony
-    type: github
-    repo: openpipelines-bio/openpipeline
-    tag: harmony-knn-annoation-workflow_build
+    tag: main_build
   - name: op-scvi
     type: github
     repo: openpipelines-bio/openpipeline

From a49cd41eb0c81e28bc56e56dba02745e3f9de47e Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Wed, 26 Feb 2025 15:47:47 +0000
Subject: [PATCH 07/21] update celltypist params

---
 src/atlas_service/main.nf           |  5 +--
 src/atlas_service/test_execution.sh |  7 ++--
 src/atlas_service/test_params.sh    | 60 +++++++++++++++++++++++------
 3 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index 270d80c..5671714 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -28,8 +28,8 @@ workflow run_wf {
           )
         }
       // Check Harmony KNN arguments
-            if (state.annotation_methods.contains("harmony_knn") && !state.reference ) {
-        throw new RuntimeException("Harmony KNN was selected as an annotation method. A --reference dataset must be provided.")
+            if ((state.annotation_methods.contains("harmony_knn") || state.annotation_methods.contains("scvi_knn"))  && !state.reference ) {
+        throw new RuntimeException("When `harmony_knn` or `scvi_knn` are selected as an annotation method, a --reference dataset must be provided.")
         }
 
       [id, state + new_state]
@@ -133,7 +133,6 @@ workflow run_wf {
       args: [
         // log normalized counts are expected for celltypist
         "input_layer": "log_normalized",
-        "check_expression": "true",
         "output_obs_predictions": "celltypist_pred",
         "output_obs_probability": "celltypist_proba"
       ],
diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh
index 45e3a06..96bb15f 100755
--- a/src/atlas_service/test_execution.sh
+++ b/src/atlas_service/test_execution.sh
@@ -7,21 +7,22 @@ reference_layer_lognormalized_counts: log_normalized
 reference_var_gene_names: ensemblid
 reference_obs_batch: donor_assay
 reference_obs_label: cell_type
-celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
-annotation_methods: harmony_knn;scvi_knn
+annotation_methods: harmony_knn;scvi_knn;celltypist
+publish_dir: aaas_test
 HERE
 
 nextflow run . \
 -main-script target/nextflow/atlas_service/main.nf \
 -params-file params.yaml \
 -resume \
--profile docker,no_publish \
+-profile docker \
 -c target/nextflow/atlas_service/nextflow.config \
 -c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
 
 # cat > params.yaml << HERE
 # id: celltypist
 # input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+# celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
 # modality: rna
 # input_var_gene_names: gene_symbol
 # annotation_methods: celltypist
diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh
index 4bbcc7f..53986e8 100755
--- a/src/atlas_service/test_params.sh
+++ b/src/atlas_service/test_params.sh
@@ -1,12 +1,12 @@
 # Test required arguments scGPT
 cat > params.yaml << HERE
 id: scgpt
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
 annotation_methods: scgpt_annotation
-scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
-scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
+scgpt_model_config: /home/dorienroosen/openpipeline/resources_test/scgpt/source/args.json
+scgpt_model_vocab: /home/dorienroosen/openpipeline/resources_test/scgpt/source/vocab.json
 HERE
 
 nextflow run . \
@@ -15,15 +15,15 @@ nextflow run . \
 -resume \
 -profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
 
 # Test required arguments CellTypist
 cat > params.yaml << HERE
-id: celltypist
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+id: celltypist_1
+input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
-annotation_methods: scgpt,celltypist
+annotation_methods: celltypist
 HERE
 
 nextflow run . \
@@ -32,12 +32,31 @@ nextflow run . \
 -resume \
 -profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
 
-# Test required arguments CellTypist
 cat > params.yaml << HERE
-id: celltypist
-input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+id: celltypist_2
+input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+annotation_methods: celltypist
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
+
+
+# Test required arguments Harmony
+cat > params.yaml << HERE
+id: harmony
+input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
 input_var_gene_names: gene_symbol
 annotation_methods: harmony_knn
@@ -49,4 +68,21 @@ nextflow run . \
 -resume \
 -profile docker,no_publish \
 -c target/nextflow/atlas_service/nextflow.config \
--c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
+-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
+
+# Test required arguments SCVI
+cat > params.yaml << HERE
+id: scvi
+input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+modality: rna
+input_var_gene_names: gene_symbol
+annotation_methods: scvi_knn
+HERE
+
+nextflow run . \
+-main-script target/nextflow/atlas_service/main.nf \
+-params-file params.yaml \
+-resume \
+-profile docker,no_publish \
+-c target/nextflow/atlas_service/nextflow.config \
+-c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
\ No newline at end of file

From 37cd27a66131dfc3a9aa89f3ae8b764f1cddc581 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Mon, 3 Mar 2025 11:28:53 +0100
Subject: [PATCH 08/21] update celltypist params

---
 src/atlas_service/main.nf | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index 6778924..c4fdcd8 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -29,9 +29,8 @@ workflow run_wf {
           )
         }
       // Check Harmony KNN arguments
-            if (state.annotation_methods.contains("harmony_knn") && !state.reference ) {
-        throw new RuntimeException("Harmony KNN was selected as an annotation method. A --reference dataset must be provided.")
-        }
+      if ((state.annotation_methods.contains("harmony_knn") || state.annotation_methods.contains("scvi_knn"))  && !state.reference ) {
+        throw new RuntimeException("When `harmony_knn` or `scvi_knn` are selected as an annotation method, a --reference dataset must be provided.")
 
       [id, state + new_state]
     }
@@ -137,7 +136,6 @@ workflow run_wf {
       args: [
         // log normalized counts are expected for celltypist
         "input_layer": "log_normalized",
-        "check_expression": "true",
         "output_obs_predictions": "celltypist_pred",
         "output_obs_probability": "celltypist_proba"
       ],

From 028e0d1735aedfc8acdcfbd2827a31538622f530 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 4 Mar 2025 15:05:26 +0000
Subject: [PATCH 09/21] tests wip

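---

Notes:
    Review: test.nf feeds a single event (`simple_execution_test`) into
    `atlas_service`, but the final `toSortedList` check asserts two events with
    IDs `no_leiden_resolutions_test` and `simple_execution_test`. Assuming one
    output event per input, that assertion cannot hold until the expected
    count and ID list are aligned with the input channel.
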
 src/atlas_service/config.vsh.yaml     |  8 +++++
 src/atlas_service/integration_test.sh | 17 ++++++++++
 src/atlas_service/nextflow.config     |  4 +--
 src/atlas_service/test.nf             | 48 +++++++++++++++++++++++++++
 src/atlas_service/test_execution.sh   |  6 ++--
 5 files changed, 78 insertions(+), 5 deletions(-)
 create mode 100755 src/atlas_service/integration_test.sh
 create mode 100644 src/atlas_service/test.nf

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index f18b5ec..47c6de9 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -386,5 +386,13 @@ resources:
     path: main.nf
     entrypoint: run_wf
 
+test_resources:
+  - type: nextflow_script
+    path: test.nf
+    entrypoint: test_wf
+  - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+  - path: /resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+  - path: /resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
+
 runners:
   - type: nextflow
\ No newline at end of file
diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh
new file mode 100755
index 0000000..ff15861
--- /dev/null
+++ b/src/atlas_service/integration_test.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -entry test_wf \
+  -profile docker,no_publish \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config
diff --git a/src/atlas_service/nextflow.config b/src/atlas_service/nextflow.config
index 8108bc2..71b491a 100644
--- a/src/atlas_service/nextflow.config
+++ b/src/atlas_service/nextflow.config
@@ -3,8 +3,8 @@ manifest {
 }
 
 params {
-  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
+  rootDir = java.nio.file.Paths.get("$projectDir/../../").toAbsolutePath().normalize().toString()
 }
 
 // include common settings
-includeConfig("${params.rootDir}/src/workflows/utils/labels.config")
\ No newline at end of file
+includeConfig("${params.rootDir}/src/labels.config")
\ No newline at end of file
diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf
new file mode 100644
index 0000000..a97bbee
--- /dev/null
+++ b/src/atlas_service/test.nf
@@ -0,0 +1,48 @@
+nextflow.enable.dsl=2
+
+include { atlas_service } from params.rootDir + "/target/nextflow/atlas_service/main.nf"
+params.resources_test = params.rootDir + "/resources_test"
+
+workflow test_wf {
+  // allow changing the resources_test dir
+  resources_test = file(params.resources_test)
+
+  output_ch = Channel.fromList(
+    [
+      [
+        id: "simple_execution_test",
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"),
+        reference_var_gene_names: "ensemblid",
+        reference_layer_lognormalized_counts: "log_normalized",
+        reference_obs_batch: "donor_assay",
+        reference_obs_label: "cell_type",
+        annotation_methods: "celltypist"
+      ]
+    ])
+    | view {"State at start: $it"}
+    | map{ state -> [state.id, state] }
+    | atlas_service 
+    | view {"After AaaS: $it"}
+    | view { output ->
+      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
+
+      // check id
+      def id = output[0]
+      assert id.endsWith("_test") : "Output ID should be same as input ID"
+
+      // check output
+      def state = output[1]
+      assert state instanceof Map : "State should be a map. Found: ${state}"
+      assert state.containsKey("output") : "Output should contain key 'output'."
+      assert state.output.isFile() : "'output' should be a file."
+      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
+    
+    "Output: $output"
+    }
+    | toSortedList({a, b -> a[0] <=> b[0]})
+    | map { output_list ->
+      assert output_list.size() == 2 : "output channel should contain 2 events"
+      assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"]
+    }
+    }
\ No newline at end of file
diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh
index 96bb15f..2463e37 100755
--- a/src/atlas_service/test_execution.sh
+++ b/src/atlas_service/test_execution.sh
@@ -1,8 +1,8 @@
 cat > params.yaml << HERE
 id: harmony
-input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
+input: resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
 modality: rna
-reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
+reference: resources_test/annotation_test_data/TS_Blood_filtered.h5mu
 reference_layer_lognormalized_counts: log_normalized
 reference_var_gene_names: ensemblid
 reference_obs_batch: donor_assay
@@ -17,7 +17,7 @@ nextflow run . \
 -resume \
 -profile docker \
 -c target/nextflow/atlas_service/nextflow.config \
--c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
+-c src/utils/labels_ci.config
 
 # cat > params.yaml << HERE
 # id: celltypist

From dcaaaa14926ef751cc5ce04921a0162b67b4af6e Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 4 Mar 2025 15:26:58 +0000
Subject: [PATCH 10/21] tests wip

---
 src/atlas_service/main.nf          |  1 +
 src/utils/integration_tests.config | 36 ++++++++++++++++++++++++++++++
 src/utils/labels_ci.config         | 36 ++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)
 create mode 100644 src/utils/integration_tests.config
 create mode 100644 src/utils/labels_ci.config

diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index 6b89c57..06c9760 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -30,6 +30,7 @@ workflow run_wf {
       // Check Harmony KNN arguments
       if ((state.annotation_methods.contains("harmony_knn") || state.annotation_methods.contains("scvi_knn"))  && !state.reference ) {
         throw new RuntimeException("When `harmony_knn` or `scvi_knn` are selected as an annotation method, a --reference dataset must be provided.")
+      }
 
       [id, state + new_state]
     }
diff --git a/src/utils/integration_tests.config b/src/utils/integration_tests.config
new file mode 100644
index 0000000..59d5b09
--- /dev/null
+++ b/src/utils/integration_tests.config
@@ -0,0 +1,36 @@
+profiles {
+
+  // detect tempdir: first non-empty of NXF_TEMP, VIASH_TEMP, TEMPDIR, TMPDIR, else '/tmp'
+  tempDir = java.nio.file.Paths.get(
+    System.getenv('NXF_TEMP') ?:
+      System.getenv('VIASH_TEMP') ?: 
+      System.getenv('TEMPDIR') ?: 
+      System.getenv('TMPDIR') ?: 
+      '/tmp'
+  ).toAbsolutePath()
+
+  mount_temp {
+    docker.temp            = tempDir
+    podman.temp            = tempDir
+    charliecloud.temp      = tempDir
+  }
+
+  no_publish {
+    process {
+      withName: '.*' {
+        publishDir = [
+          enabled: false
+        ]
+      }
+    }
+  }
+
+  docker {
+    docker.enabled         = true
+    // docker.userEmulation   = true
+    singularity.enabled    = false
+    podman.enabled         = false
+    shifter.enabled        = false
+    charliecloud.enabled   = false
+  }
+}
\ No newline at end of file
diff --git a/src/utils/labels_ci.config b/src/utils/labels_ci.config
new file mode 100644
index 0000000..dd2e23e
--- /dev/null
+++ b/src/utils/labels_ci.config
@@ -0,0 +1,36 @@
+process {
+  withLabel: lowmem { memory = 13.Gb }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midmem { memory = 13.Gb }
+  withLabel: midcpu { cpus = 4 }
+  withLabel: highmem { memory = 13.Gb }
+  withLabel: highcpu { cpus = 4 }
+  withLabel: veryhighmem { memory = 13.Gb }
+  // Nextflow apparently can't handle an empty selector body, i.e.
+  // withLabel: lowdisk {}
+  // so every disk label below gets a dummy directive as a placeholder
+  withLabel: lowdisk {
+    dummyDirective = "dummyValue"
+  }
+  withLabel: middisk {
+    dummyDirective = "dummyValue"
+  }
+  withLabel: highdisk {
+    dummyDirective = "dummyValue"
+  }
+  withLabel: veryhighdisk {
+    dummyDirective = "dummyValue"
+  }
+}
+
+env.NUMBA_CACHE_DIR = '/tmp'
+
+trace {
+    enabled = true
+    overwrite = true
+}
+dag {
+  overwrite = true
+}
+
+process.maxForks = 1

From 7f357550529c59adb0f76ccc6025c68490a140ff Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 4 Mar 2025 17:57:31 +0000
Subject: [PATCH 11/21] tests wip

---
 src/atlas_service/test.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf
index a97bbee..b96a612 100644
--- a/src/atlas_service/test.nf
+++ b/src/atlas_service/test.nf
@@ -29,7 +29,7 @@ workflow test_wf {
 
       // check id
       def id = output[0]
-      assert id.endsWith("_test") : "Output ID should be same as input ID"
+      assert id == "merged" : "Output ID should be `merged`"
 
       // check output
       def state = output[1]

From 61763f0ccba35aa2d762d97c94788e163c02981f Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Wed, 5 Mar 2025 16:06:32 +0000
Subject: [PATCH 12/21] tests wip

---
 src/atlas_service/integration_test.sh | 11 ++++
 src/atlas_service/test.nf             | 79 +++++++++++++++++++++++++--
 src/atlas_service/test.yaml           | 63 +++++++++++++++++++++
 3 files changed, 147 insertions(+), 6 deletions(-)
 create mode 100644 src/atlas_service/test.yaml

diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh
index ff15861..546a62b 100755
--- a/src/atlas_service/integration_test.sh
+++ b/src/atlas_service/integration_test.sh
@@ -12,6 +12,17 @@ nextflow \
   run . \
   -main-script src/atlas_service/test.nf \
   -entry test_wf \
+  -resume \
   -profile docker,no_publish \
   -c src/utils/labels_ci.config \
   -c src/utils/integration_tests.config
+
+nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -profile docker, no_publish \
+  -resume \
+  -entry test_wf_2 \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config \
+  --publish_dir test_2
\ No newline at end of file
diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf
index b96a612..6c01251 100644
--- a/src/atlas_service/test.nf
+++ b/src/atlas_service/test.nf
@@ -39,10 +39,77 @@ workflow test_wf {
       assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
     
     "Output: $output"
+  }
+}
+
+workflow test_wf_2 {
+  // allow changing the resources_test dir
+  resources_test = file(params.resources_test)
+
+  output_ch = Channel.fromList(
+    [
+      [
+        id: "pbmc",
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        var_name_mitochondrial_genes: 'mitochondrial',
+        rna_min_counts: 2,
+        prot_min_counts: 3,
+        add_id_to_obs: true,
+        add_id_make_observation_keys_unique: true,
+        add_id_obs_output: "sample_id",
+        reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"),
+        reference_var_gene_names: "ensemblid",
+        reference_layer_lognormalized_counts: "log_normalized",
+        reference_obs_batch: "donor_assay",
+        reference_obs_label: "cell_type",
+        annotation_methods: "celltypist"
+      ],
+      [
+        id: "pbmc_with_more_params",
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        rna_min_counts: 2,
+        rna_max_counts: 1000000,
+        rna_min_genes_per_cell: 1,
+        rna_max_genes_per_cell: 1000000,
+        rna_min_cells_per_gene: 1,
+        rna_min_fraction_mito: 0.0,
+        rna_max_fraction_mito: 1.0,
+        prot_min_counts: 3,
+        prot_max_counts: 1000000,
+        prot_min_proteins_per_cell: 1,
+        prot_max_proteins_per_cell: 1000000,
+        prot_min_cells_per_protein: 1,
+        var_name_mitochondrial_genes: 'mitochondrial',
+        obs_name_mitochondrial_fraction: 'fraction_mitochondrial',
+        add_id_to_obs: true,
+        add_id_make_observation_keys_unique: true,
+        add_id_obs_output: "sample_id",
+        reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"),
+        reference_var_gene_names: "ensemblid",
+        reference_layer_lognormalized_counts: "log_normalized",
+        reference_obs_batch: "donor_assay",
+        reference_obs_label: "cell_type",
+        annotation_methods: "celltypist"
+      ]
+    ])
+    | view {"State at start: $it"}
+    | map { state -> [state.id, state] }
+    | atlas_service 
+    | view {"After AaaS: $it"}
+    | view { output ->
+      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
+
+      // check id
+      def id = output[0]
+      assert id == "merged" : "Output ID should be `merged`"
+
+      // check output
+      def state = output[1]
+      assert state instanceof Map : "State should be a map. Found: ${state}"
+      assert state.containsKey("output") : "Output should contain key 'output'."
+      assert state.output.isFile() : "'output' should be a file."
+      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
+    
+      "Output: $output"
     }
-    | toSortedList({a, b -> a[0] <=> b[0]})
-    | map { output_list ->
-      assert output_list.size() == 2 : "output channel should contain 2 events"
-      assert output_list.collect{it[0]} == ["no_leiden_resolutions_test", "simple_execution_test"]
-    }
-    }
\ No newline at end of file
+  }
diff --git a/src/atlas_service/test.yaml b/src/atlas_service/test.yaml
new file mode 100644
index 0000000..2314410
--- /dev/null
+++ b/src/atlas_service/test.yaml
@@ -0,0 +1,63 @@
+output: $id.$key.output.html
+var_gene_names: gene_symbol
+var_name_mitochondrial_genes: mitochondrial
+var_name_ribosomal_genes: ribosomal
+publish_dir: s3://itx-del-data-pipelines/jmajerci/qc_ingestion_reports
+param_list:
+  - id: various_cart_sample_1
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_1.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_2
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_2.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_3
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_3.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_4
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_4.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_5
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_5.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_6
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_6.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_7
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_7.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_8
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_8.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_9
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_9.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_10
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_10.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_11
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_11.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_12
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_12.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_13
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_13.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_14
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_14.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_15
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_15.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_16
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_16.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_17
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_17.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_18
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_18.from_cellranger_multi_to_h5mu.output_0.h5mu
+  - id: various_cart_sample_19
+    input: >-
+      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_19.from_cellranger_multi_to_h5mu.output_0.h5mu
\ No newline at end of file

From 6a7ccd93c7b0b783918f0dc488b883c8b77b5a94 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Wed, 5 Mar 2025 16:30:54 +0000
Subject: [PATCH 13/21] update gitignore and test resources pointer

---
 .gitignore  | 3 +++
 _viash.yaml | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/.gitignore b/.gitignore
index 2198d08..3ad2825 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,6 @@ Thumbs.db
 work
 .nextflow*
 target
+
+# viash related
+resources_test
diff --git a/_viash.yaml b/_viash.yaml
index 6a7d20e..17094d7 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -15,3 +15,9 @@ repositories:
     repo: openpipelines-bio/openpipeline
     type: github
     tag: main_build
+
+info:
+  test_resources:
+    - type: s3
+      path: s3://openpipelines-data
+      dest: resources_test
\ No newline at end of file

From a22d1ea49bb96ab9b255e7dfc188ab28dd7d9600 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Wed, 5 Mar 2025 16:59:10 +0000
Subject: [PATCH 14/21] update annotation columns

---
 src/atlas_service/integration_test.sh |  2 +-
 src/atlas_service/main.nf             |  6 +-
 src/atlas_service/test.nf             |  2 +-
 src/atlas_service/test.yaml           | 63 -------------------
 src/atlas_service/test_execution.sh   | 57 -----------------
 src/atlas_service/test_params.sh      | 88 ---------------------------
 6 files changed, 5 insertions(+), 213 deletions(-)
 delete mode 100644 src/atlas_service/test.yaml
 delete mode 100755 src/atlas_service/test_execution.sh
 delete mode 100755 src/atlas_service/test_params.sh

diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh
index 546a62b..066f6c5 100755
--- a/src/atlas_service/integration_test.sh
+++ b/src/atlas_service/integration_test.sh
@@ -20,7 +20,7 @@ nextflow \
 nextflow \
   run . \
   -main-script src/atlas_service/test.nf \
-  -profile docker, no_publish \
+  -profile docker,no_publish \
   -resume \
   -entry test_wf_2 \
   -c src/utils/labels_ci.config \
diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index 06c9760..b85c3d4 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -197,9 +197,9 @@ workflow run_wf {
       args: [
         "input_layer_lognormalized": "log_normalized",
         "input_obs_batch_label": "sample_id",
-        "output_obs_predictions": "harmony_knn_pred",
-        "output_obs_probability": "harmony_knn_proba",
-        "output_obsm_integrated": "X_integrated_harmony",
+        "output_obs_predictions": "scvi_knn_pred",
+        "output_obs_probability": "scvi_knn_proba",
+        "output_obsm_integrated": "X_integrated_scvi",
         "overwrite_existing_key": "true"
       ],
       toState: [ "query_processed": "output" ]
diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf
index 6c01251..e549bd3 100644
--- a/src/atlas_service/test.nf
+++ b/src/atlas_service/test.nf
@@ -17,7 +17,7 @@ workflow test_wf {
         reference_layer_lognormalized_counts: "log_normalized",
         reference_obs_batch: "donor_assay",
         reference_obs_label: "cell_type",
-        annotation_methods: "celltypist"
+        annotation_methods: "celltypist;scvi_knn;harmony_knn"
       ]
     ])
     | view {"State at start: $it"}
diff --git a/src/atlas_service/test.yaml b/src/atlas_service/test.yaml
deleted file mode 100644
index 2314410..0000000
--- a/src/atlas_service/test.yaml
+++ /dev/null
@@ -1,63 +0,0 @@
-output: $id.$key.output.html
-var_gene_names: gene_symbol
-var_name_mitochondrial_genes: mitochondrial
-var_name_ribosomal_genes: ribosomal
-publish_dir: s3://itx-del-data-pipelines/jmajerci/qc_ingestion_reports
-param_list:
-  - id: various_cart_sample_1
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_1.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_2
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_2.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_3
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_3.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_4
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_4.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_5
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_5.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_6
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_6.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_7
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_7.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_8
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_8.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_9
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_9.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_10
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_10.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_11
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_11.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_12
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_12.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_13
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_13.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_14
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_14.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_15
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_15.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_16
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_16.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_17
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_17.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_18
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_18.from_cellranger_multi_to_h5mu.output_0.h5mu
-  - id: various_cart_sample_19
-    input: >-
-      s3://itx-del-data-pipelines/tsztanka/various_cart_jan_2025/various_cart_sample_19.from_cellranger_multi_to_h5mu.output_0.h5mu
\ No newline at end of file
diff --git a/src/atlas_service/test_execution.sh b/src/atlas_service/test_execution.sh
deleted file mode 100755
index 2463e37..0000000
--- a/src/atlas_service/test_execution.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-cat > params.yaml << HERE
-id: harmony
-input: resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-reference: resources_test/annotation_test_data/TS_Blood_filtered.h5mu
-reference_layer_lognormalized_counts: log_normalized
-reference_var_gene_names: ensemblid
-reference_obs_batch: donor_assay
-reference_obs_label: cell_type
-annotation_methods: harmony_knn;scvi_knn;celltypist
-publish_dir: aaas_test
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker \
--c target/nextflow/atlas_service/nextflow.config \
--c src/utils/labels_ci.config
-
-# cat > params.yaml << HERE
-# id: celltypist
-# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-# celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
-# modality: rna
-# input_var_gene_names: gene_symbol
-# annotation_methods: celltypist
-# publish_dir: output
-# HERE
-
-# nextflow run . \
-# -main-script target/nextflow/atlas_service/main.nf \
-# -params-file params.yaml \
-# -resume \
-# -profile docker,no_publish \
-# -c target/nextflow/atlas_service/nextflow.config \
-# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
-
-# cat > params.yaml << HERE
-# id: scgpt
-# input: /Users/dorienroosen/code/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-# modality: rna
-# annotation_methods: scgpt_annotation
-# input_var_gene_names: gene_symbol
-# scgpt_model: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/finetuned_model/best_model.pt
-# scgpt_model_config: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/args.json
-# scgpt_model_vocab: /Users/dorienroosen/code/openpipeline/resources_test/scgpt/source/vocab.json
-# HERE
-
-# nextflow run . \
-# -main-script target/nextflow/atlas_service/main.nf \
-# -params-file params.yaml \
-# -resume \
-# -profile docker,no_publish \
-# -c target/nextflow/atlas_service/nextflow.config \
-# -c /Users/dorienroosen/code/openpipeline/src/workflows/utils/labels_ci.config
diff --git a/src/atlas_service/test_params.sh b/src/atlas_service/test_params.sh
deleted file mode 100755
index 53986e8..0000000
--- a/src/atlas_service/test_params.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-# Test required arguments scGPT
-cat > params.yaml << HERE
-id: scgpt
-input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: scgpt_annotation
-scgpt_model_config: /home/dorienroosen/openpipeline/resources_test/scgpt/source/args.json
-scgpt_model_vocab: /home/dorienroosen/openpipeline/resources_test/scgpt/source/vocab.json
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
-
-# Test required arguments CellTypist
-cat > params.yaml << HERE
-id: celltypist_1
-input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: celltypist
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
-
-cat > params.yaml << HERE
-id: celltypist_2
-input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-celltypist_model: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/celltypist_model_Immune_All_Low.pkl
-reference: /home/dorienroosen/openpipeline/resources_test/annotation_test_data/TS_Blood_filtered.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: celltypist
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
-
-
-# Test required arguments Harmony
-cat > params.yaml << HERE
-id: harmony
-input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: harmony_knn
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
-
-# Test required arguments SCVI
-cat > params.yaml << HERE
-id: scvi
-input: /home/dorienroosen/openpipeline/resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu
-modality: rna
-input_var_gene_names: gene_symbol
-annotation_methods: scvi_knn
-HERE
-
-nextflow run . \
--main-script target/nextflow/atlas_service/main.nf \
--params-file params.yaml \
--resume \
--profile docker,no_publish \
--c target/nextflow/atlas_service/nextflow.config \
--c /home/dorienroosen/openpipeline/src/workflows/utils/labels_ci.config
\ No newline at end of file

From fcd1ead28673b706c0130c5a2ec9b474d108bd88 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Thu, 6 Mar 2025 18:25:18 +0000
Subject: [PATCH 15/21] finalize tests

---
 src/atlas_service/integration_test.sh | 21 +++++++-
 src/atlas_service/test.nf             | 76 +++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/src/atlas_service/integration_test.sh b/src/atlas_service/integration_test.sh
index 066f6c5..827f55d 100755
--- a/src/atlas_service/integration_test.sh
+++ b/src/atlas_service/integration_test.sh
@@ -24,5 +24,22 @@ nextflow \
   -resume \
   -entry test_wf_2 \
   -c src/utils/labels_ci.config \
-  -c src/utils/integration_tests.config \
-  --publish_dir test_2
\ No newline at end of file
+  -c src/utils/integration_tests.config
+
+  nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -profile docker,no_publish \
+  -resume \
+  -entry test_wf_3 \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config
+
+  nextflow \
+  run . \
+  -main-script src/atlas_service/test.nf \
+  -profile docker,no_publish \
+  -resume \
+  -entry test_wf_4 \
+  -c src/utils/labels_ci.config \
+  -c src/utils/integration_tests.config
\ No newline at end of file
diff --git a/src/atlas_service/test.nf b/src/atlas_service/test.nf
index e549bd3..ae151a4 100644
--- a/src/atlas_service/test.nf
+++ b/src/atlas_service/test.nf
@@ -43,6 +43,7 @@ workflow test_wf {
 }
 
 workflow test_wf_2 {
+
   // allow changing the resources_test dir
   resources_test = file(params.resources_test)
 
@@ -113,3 +114,79 @@ workflow test_wf_2 {
       "Output: $output"
     }
   }
+
+// CellTypist annotation with a pre-trained model file; no reference dataset is passed.
+workflow test_wf_3 {
+  // allow changing the resources_test dir
+  resources_test = file(params.resources_test)
+
+  output_ch = Channel.fromList(
+    [
+      [
+        id: "celltypist_model",
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        celltypist_model: resources_test.resolve("annotation_test_data/celltypist_model_Immune_All_Low.pkl"),
+        annotation_methods: "celltypist",
+        input_var_gene_names: "gene_symbol"
+      ]
+    ])
+    | view {"State at start: $it"}
+    | map{ state -> [state.id, state] }
+    | atlas_service 
+    | view {"After AaaS: $it"}
+    | view { output ->
+      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
+
+      // check id
+      def id = output[0]
+      assert id == "merged" : "Output ID should be `merged`"
+
+      // check output
+      def state = output[1]
+      assert state instanceof Map : "State should be a map. Found: ${state}"
+      assert state.containsKey("output") : "Output should contain key 'output'."
+      assert state.output.isFile() : "'output' should be a file."
+      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
+    
+    "Output: $output"
+  }
+}
+
+// scGPT annotation using the model, model config and vocab files from the test resources.
+workflow test_wf_4 {
+  // allow changing the resources_test dir
+  resources_test = file(params.resources_test)
+
+  output_ch = Channel.fromList(
+    [
+      [
+        id: "scgpt",
+        input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
+        input_var_gene_names: "gene_symbol",
+        scgpt_model: resources_test.resolve("scgpt/finetuned_model/best_model.pt"),
+        scgpt_model_config: resources_test.resolve("scgpt/source/args.json"),
+        scgpt_model_vocab: resources_test.resolve("scgpt/source/vocab.json"),
+        annotation_methods: "scgpt_annotation"
+      ]
+    ])
+    | view {"State at start: $it"}
+    | map{ state -> [state.id, state] }
+    | atlas_service 
+    | view {"After AaaS: $it"}
+    | view { output ->
+      assert output.size() == 2 : "Outputs should contain two elements; [id, state]"
+
+      // check id
+      def id = output[0]
+      assert id == "merged" : "Output ID should be `merged`"
+
+      // check output
+      def state = output[1]
+      assert state instanceof Map : "State should be a map. Found: ${state}"
+      assert state.containsKey("output") : "Output should contain key 'output'."
+      assert state.output.isFile() : "'output' should be a file."
+      assert state.output.toString().endsWith(".h5mu") : "Output file should end with '.h5mu'. Found: ${state.output}"
+    
+    "Output: $output"
+  }
+}

From 57f851a712b6ebf02a885d2598b29770993a07c9 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Tue, 11 Mar 2025 17:51:22 +0000
Subject: [PATCH 16/21] update scgpt annotation

---
 src/atlas_service/config.vsh.yaml | 2 +-
 src/atlas_service/main.nf         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index 47c6de9..e361553 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -356,7 +356,7 @@ dependencies:
     alias: process_samples_workflow
     repository: op
   - name: workflows/annotation/scgpt_annotation
-    repository: op
+    repository: op-main
   - name: annotate/celltypist
     repository: op-main
     alias: celltypist_annotation
diff --git a/src/atlas_service/main.nf b/src/atlas_service/main.nf
index b85c3d4..9289ee7 100644
--- a/src/atlas_service/main.nf
+++ b/src/atlas_service/main.nf
@@ -67,7 +67,6 @@ workflow run_wf {
           "id": id,
           "input": state.query_processed,
           "modality": state.modality,
-          "input_layer": state.input_layer,
           "input_var_gene_names": state.input_var_gene_names,
           "model": state.scgpt_model,
           "model_config": state.scgpt_model_config,
@@ -84,6 +83,7 @@ workflow run_wf {
         ]
       },
       args: [
+        "input_layer": "log_normalized",
         "input_obs_batch_label": "sample_id",
         "output_obs_predictions": "scgpt_pred",
         "output_obs_probability": "scgpt_proba"

From f3e3ad6b87fb8af29a8a80b5840cfbff24e2680d Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Thu, 20 Mar 2025 09:40:57 +0100
Subject: [PATCH 17/21] add viash config mods

---
 _viash.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/_viash.yaml b/_viash.yaml
index 56e5e0c..927260c 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -21,3 +21,8 @@ info:
     - type: s3
       path: s3://openpipelines-bio/openpipeline_incubator/resources_test
       dest: resources_test
+
+config_mods: |
+  .resources += {path: '/src/utils/labels.config', dest: 'nextflow_labels.config'}
+  .runners[.type == 'nextflow'].directives.tag := '$id'
+  .runners[.type == 'nextflow'].config.script := 'includeConfig("nextflow_labels.config")'

From 3e47eddf9b2978780455f7dc1820bea62ed8c21b Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Thu, 20 Mar 2025 09:41:13 +0100
Subject: [PATCH 18/21] add viash config mods

---
 src/labels.config | 68 -----------------------------------------------
 1 file changed, 68 deletions(-)
 delete mode 100644 src/labels.config

diff --git a/src/labels.config b/src/labels.config
deleted file mode 100644
index 541aaad..0000000
--- a/src/labels.config
+++ /dev/null
@@ -1,68 +0,0 @@
-process {
-  // Default resources for components that hardly do any processing
-  memory = { 2.GB * task.attempt }
-  cpus = 1
-
-  // Retry for exit codes that have something to do with memory issues
-  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
-  maxRetries = 3
-  maxMemory = null
-
-  // CPU resources
-  withLabel: singlecpu { cpus = 1 }
-  withLabel: lowcpu { cpus = 4 }
-  withLabel: midcpu { cpus = 10 }
-  withLabel: highcpu { cpus = 20 }
-  
-  // Memory resources
-  withLabel: lowmem { memory = { get_memory( 50.GB * task.attempt ) } }
-  withLabel: midmem { memory = { get_memory( 50.GB * task.attempt ) } }
-  withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }
-  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
-
-  // Disk space
-  // Nextflow apparently can't handle empty directives, i.e.
-  // withLabel: lowdisk {}
-  // so for that reason we have to add a dummy directive
-  withLabel: lowdisk {
-    dummyDirective = "dummyValue"
-  }
-  withLabel: middisk {
-    dummyDirective = "dummyValue"
-  }
-  withLabel: highdisk {
-    dummyDirective = "dummyValue"
-  }
-  withLabel: veryhighdisk {
-    dummyDirective = "dummyValue"
-  }
-  // NOTE: The above labels intentionally do not have an effect by default.
-  // The user should set the disk space requirements by adding the following
-  // to the compute environment:
-  //
-  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
-  // withLabel: middisk { disk = { 100.GB * task.attempt } }
-  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
-  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
-}
-
-def get_memory(to_compare) {
-  if (!process.containsKey("maxMemory") || !process.maxMemory) {
-    return to_compare
-  }
-
-  try {
-    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
-      return process.maxMemory
-    }
-    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) {
-      return max_memory as nextflow.util.MemoryUnit
-    }
-    else {
-      return to_compare
-    }  
-  } catch (all) {
-        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
-        System.exit(1)
-  }
-}

From 210c3918321536a85bc32b960cc78b219016ce4f Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Thu, 20 Mar 2025 09:41:58 +0100
Subject: [PATCH 19/21] add viash config mods

---
 src/atlas_service/nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atlas_service/nextflow.config b/src/atlas_service/nextflow.config
index 71b491a..41a4570 100644
--- a/src/atlas_service/nextflow.config
+++ b/src/atlas_service/nextflow.config
@@ -7,4 +7,4 @@ params {
 }
 
 // include common settings
-includeConfig("${params.rootDir}/src/labels.config")
\ No newline at end of file
+includeConfig("${params.rootDir}/src/utils/labels.config")
\ No newline at end of file

From a887eb7c956ba72a99a1fecda63cf7713d400da0 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Thu, 20 Mar 2025 09:46:06 +0100
Subject: [PATCH 20/21] update labels

---
 src/utils/labels.config | 68 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 src/utils/labels.config

diff --git a/src/utils/labels.config b/src/utils/labels.config
new file mode 100644
index 0000000..541aaad
--- /dev/null
+++ b/src/utils/labels.config
@@ -0,0 +1,68 @@
+process {
+  // Default resources for components that hardly do any processing
+  memory = { 2.GB * task.attempt }  // request grows linearly with the attempt number
+  cpus = 1
+
+  // Retry only for exit codes 137-140 (memory-related); any other exit status terminates the run
+  errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+  maxRetries = 3
+  maxMemory = null  // null (or unset) means get_memory() applies no upper bound
+
+  // CPU resources
+  withLabel: singlecpu { cpus = 1 }
+  withLabel: lowcpu { cpus = 4 }
+  withLabel: midcpu { cpus = 10 }
+  withLabel: highcpu { cpus = 20 }
+  
+  // Memory resources: the base amount is multiplied by task.attempt and passed through get_memory()
+  withLabel: lowmem { memory = { get_memory( 50.GB * task.attempt ) } }
+  withLabel: midmem { memory = { get_memory( 50.GB * task.attempt ) } }
+  withLabel: highmem { memory = { get_memory( 50.GB * task.attempt ) } }  // NOTE(review): lowmem/midmem/highmem share the same 50.GB base - confirm intended
+  withLabel: veryhighmem { memory = { get_memory( 75.GB * task.attempt ) } }
+
+  // Disk space
+  // Nextflow apparently can't handle empty directives, i.e.
+  // withLabel: lowdisk {}
+  // so for that reason we have to add a dummy directive
+  withLabel: lowdisk {
+    dummyDirective = "dummyValue"
+  }
+  withLabel: middisk {
+    dummyDirective = "dummyValue"
+  }
+  withLabel: highdisk {
+    dummyDirective = "dummyValue"
+  }
+  withLabel: veryhighdisk {
+    dummyDirective = "dummyValue"
+  }
+  // NOTE: The above labels intentionally do not have an effect by default.
+  // The user should set the disk space requirements by adding the following
+  // to the compute environment:
+  //
+  // withLabel: lowdisk { disk = { 20.GB * task.attempt } }
+  // withLabel: middisk { disk = { 100.GB * task.attempt } }
+  // withLabel: highdisk { disk = { 200.GB * task.attempt } }
+  // withLabel: veryhighdisk { disk = { 500.GB * task.attempt } }
+}
+
+def get_memory(to_compare) { // Clamp a requested memory amount to process.maxMemory, if a cap is configured.
+  if (!process.containsKey("maxMemory") || !process.maxMemory) {
+    return to_compare // no cap configured: pass the request through unchanged
+  }
+  // A cap is configured: never return more than process.maxMemory.
+  try {
+    if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) {
+      return process.maxMemory // attempt number equals maxRetries: request the full cap
+    }
+    else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) > 0) {
+      return process.maxMemory as nextflow.util.MemoryUnit // request exceeds the cap: use the cap
+    }
+    else {
+      return to_compare
+    }
+  } catch (all) {
+        println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!"
+        System.exit(1)
+  }
+}

From cfa3a45191107157648e9e9e1b0d4c688dbca281 Mon Sep 17 00:00:00 2001
From: dorien-er <roosen.dorien@gmail.com>
Date: Thu, 20 Mar 2025 09:59:20 +0100
Subject: [PATCH 21/21] update _viash and dependencies

---
 _viash.yaml                       |  2 +-
 src/atlas_service/config.vsh.yaml | 10 +---------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/_viash.yaml b/_viash.yaml
index 927260c..51ace0d 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -14,7 +14,7 @@ repositories:
   - name: openpipeline
     repo: openpipelines-bio/openpipeline
     type: github
-    tag: main_build
+    tag: 2.0.0
 
 info:
   test_resources:
diff --git a/src/atlas_service/config.vsh.yaml b/src/atlas_service/config.vsh.yaml
index e361553..70137dd 100644
--- a/src/atlas_service/config.vsh.yaml
+++ b/src/atlas_service/config.vsh.yaml
@@ -348,13 +348,9 @@ argument_groups:
       example: output.h5mu
 
 dependencies:
-  - name: metadata/add_id
-    repository: op
-  - name: dataflow/split_h5mu
-    repository: op
   - name: workflows/multiomics/process_samples
     alias: process_samples_workflow
-    repository: op
+    repository: openpipeline
   - name: workflows/annotation/scgpt_annotation
     repository: op-main
   - name: annotate/celltypist
@@ -368,10 +364,6 @@ dependencies:
     alias: scvi_knn_annotation
 
 repositories:
-  - name: op
-    type: github
-    repo: openpipelines-bio/openpipeline
-    tag: 2.0.0
   - name: op-main
     type: github
     repo: openpipelines-bio/openpipeline