openpipelines-bio · jakubmajercik · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025
diff --git a/_viash.yaml b/_viash.yaml
@@ -14,7 +14,7 @@ repositories:
   - name: openpipeline
     repo: openpipelines-bio/openpipeline
     type: github
-    tag: main_build
+    tag: 2.0.0
 
 info:
   test_resources:

diff --git a/resources_test_scripts/qc_sample_data.sh b/resources_test_scripts/qc_sample_data.sh
@@ -5,7 +5,7 @@ OUT_DIR=resources_test/qc_sample_data
 [ ! -d "$OUT_DIR" ] && mkdir -p "$OUT_DIR"
 
 # fetch/create h5mu from somewhere
-cat > /tmp/params.yaml <<EOF
+cat > /tmp/params_create_h5mu.yaml <<EOF
 param_list:
   - id: sample_one
     input_id: sample_one
@@ -24,18 +24,60 @@ nextflow run openpipelines-bio/openpipeline \
   -r 2.0.0 \
   -main-script target/nextflow/metadata/add_id/main.nf \
   -profile docker \
-  -params-file /tmp/params.yaml \
+  -params-file /tmp/params_create_h5mu.yaml \
+  -resume
+
+cat > /tmp/params_subset.yaml <<EOF
+param_list:
+  - id: sample_one
+    input: resources_test/qc_sample_data/sample_one.qc.h5mu
+  - id: sample_two
+    input: resources_test/qc_sample_data/sample_two.qc.h5mu
+output: '\$id.qc.h5mu'
+number_of_observations: 10000
+output_compression: gzip
+publish_dir: "$OUT_DIR"
+EOF
+
+# subset h5mus
+nextflow run openpipelines-bio/openpipeline \
+  -latest \
+  -r 2.0.0 \
+  -main-script target/nextflow/filter/subset_h5mu/main.nf \
+  -profile docker \
+  -params-file /tmp/params_subset.yaml \
+  -resume
+
+# generate cellbender out for testing
+cat > /tmp/params_cellbender.yaml <<EOF
+param_list:
+  - id: sample_one
+    input: resources_test/qc_sample_data/sample_one.qc.h5mu
+  - id: sample_two
+    input: resources_test/qc_sample_data/sample_two.qc.h5mu
+output: '\$id.qc.cellbender.h5mu'
+epochs: 5
+output_compression: gzip
+publish_dir: "$OUT_DIR"
+EOF
+
+nextflow run openpipelines-bio/openpipeline \
+  -latest \
+  -r 2.0.0 \
+  -main-script target/nextflow/correction/cellbender_remove_background/main.nf \
+  -profile docker \
+  -params-file /tmp/params_cellbender.yaml \
   -resume
 
 # generate json for testing
 viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
-  --input "$OUT_DIR"//sample_one.qc.h5mu \
-  --input "$OUT_DIR"/sample_two.qc.h5mu \
+  --input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
+  --input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
   --output "$OUT_DIR"/dataset.json
 
 # copy to s3
 aws s3 sync \
   --profile di \
   resources_test/qc_sample_data \
   s3://openpipelines-bio/openpipeline_incubator/resources_test/qc_sample_data \
-  --delete --dryrun
+  --delete
diff --git a/src/ingestion_qc/generate_html/config.vsh.yaml b/src/ingestion_qc/generate_html/config.vsh.yaml
@@ -45,7 +45,7 @@ engines:
       - type: docker
         run: |
           npm install -g pnpm@latest-10 \
-          && cd /opt && git clone https://github.com/openpipelines-bio/incubator_ingestion_qc.git \
+          && cd /opt && git clone -b ambient-rna https://github.com/openpipelines-bio/incubator_ingestion_qc.git \
           && cd incubator_ingestion_qc && pnpm install
 runners:
   - type: executable

diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml
@@ -59,7 +59,18 @@ argument_groups:
         default: "ribosomal"
         description: |
           In which .var slot to store a boolean array corresponding the ribosomal genes.
-
+  - name: Cellbender options
+    arguments:
+      - name: "--run_cellbender"
+        type: boolean
+        required: false
+        description: Whether to run cellbender or not.
+        default: false
+      - name: "--cellbender_epochs"
+        type: integer
+        required: false
+        description: Number of epochs to train cellbender. 
+        default: 150
   - name: Outputs
     arguments:
       - name: --output
@@ -68,17 +79,23 @@ argument_groups:
         direction: output
         description: The output HTML report
         example: path/to/file.html
+
 resources:
   - type: nextflow_script
     entrypoint: run_wf
     path: main.nf
+
 dependencies:
   - name: metadata/add_id
     repository: openpipeline
   - name: workflows/qc/qc
     alias: qc_wf
     repository: openpipeline
+  - name: correction/cellbender_remove_background
+    alias: cellbender
+    repository: openpipeline
   - name: ingestion_qc/h5mu_to_qc_json
   - name: ingestion_qc/generate_html
+
 runners:
   - type: nextflow
diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf
@@ -8,14 +8,25 @@ workflow run_wf {
       [id, state + [_meta: [join_id: id]]]
     }
 
+    // run cellbender
+    | cellbender.run(
+      runIf: {id, state -> state.run_cellbender},
+      fromState: [
+        id: "id",
+        input: "input",
+        epochs: "cellbender_epochs",
+      ],
+      toState: ["output"]
+    )
+
     // run qc on each sample
     | qc_wf.run(
       fromState: [
-        "id",
-        "input",
-        "var_gene_names",
-        "var_name_mitochondrial_genes",
-        "var_name_ribosomal_genes"
+        id: "id",
+        input: "output",
+        var_gene_names: "var_gene_names",
+        var_name_mitochondrial_genes: "var_name_mitochondrial_genes",
+        var_name_ribosomal_genes: "var_name_ribosomal_genes"
       ],
       toState: ["output"]
     )

diff --git a/src/ingestion_qc/generate_report/test.sh b/src/ingestion_qc/generate_report/test.sh
@@ -4,17 +4,19 @@ viash ns build --setup cb --parallel
 
 cat > /tmp/params.yaml <<EOF
 param_list:
-  - input: resources_test/sample_data/sample_1.qc.output.h5mu
+  - input: resources_test/qc_sample_data/sample_one.qc.h5mu
     id: sample_one
-  - input: resources_test/sample_data/sample_2.qc.output.h5mu
+  - input: resources_test/qc_sample_data/sample_two.qc.h5mu
     id: sample_two
+cellbender_epochs: 5
+run_cellbender: true
 output_qc_json: output_qc.json
 output_html: output_report.html
 EOF
 
+
 nextflow run . \
   -main-script target/nextflow/ingestion_qc/generate_report/main.nf \
   -params-file /tmp/params.yaml \
   -profile docker \
-  --publish_dir test_results \
-  --resume
+  --publish_dir test_results
diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml
@@ -51,18 +51,25 @@ argument_groups:
         multiple: true
         description: The keys in the h5mu .obs to include in the output JSON
         default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
+      - name: --cellbender_obs_keys
+        type: string
+        multiple: true
+        description: The cellbender keys in the h5mu .obs to include in the output JSON
+        default: ["cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size",
+                  "cellbender_droplet_efficiency"]
       - name: --cellranger_metrics_uns_key
         type: string
         description: The key in the h5mu file .uns that contains the cellranger metrics
         default: metrics_cellranger
 resources:
   - type: python_script
     path: script.py
-test_resources:
-  - type: python_script
-    path: test.py
-  - type: file
-    path: /resources_test
+  - path: /src/utils/setup_logger.py
+# test_resources:
+#   - type: python_script
+#     path: test.py
+#   - type: file
+#     path: /resources_test
 engines:
   - type: docker
     image: python:3.12-slim

diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py
@@ -3,11 +3,12 @@
 from pathlib import Path
 import anndata as ad
 import h5py
+import sys
 
 ## VIASH START
 # inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu"))
 # output = "data/sample-data.json"
-inputs = list(Path("resources_test/qc_sample_data").glob("*.h5mu"))
+inputs = list(Path("resources_test/qc_sample_data").glob("*.qc.cellbender.h5mu"))
 output = "tmp.json"
 par = {
     "input": sorted([str(x) for x in inputs]),
@@ -21,14 +22,23 @@
         "num_nonzero_vars",
         "fraction_mitochondrial",
         "fraction_ribosomal",
-        "pct_of_counts_in_top_50_vars",
     ],
+    "cellbender_obs_keys": [
+        "cellbender_background_fraction",
+        "cellbender_cell_probability",
+        "cellbender_cell_size",
+        "cellbender_droplet_efficiency",
+    ],        
     "cellranger_metrics_uns_key": "metrics_cellranger",
 }
 i = 0
 mudata_file = par["input"][i]
 ## VIASH END
 
+sys.path.append(meta["resources_dir"])
+from setup_logger import setup_logger
+
+logger = setup_logger()
 
 def transform_df(df):
     """Transform a DataFrame into the annotation object format."""
@@ -93,6 +103,11 @@ def main(par):
         missing_keys = [key for key in par["obs_keys"] if key not in mod_obs.columns]
         if missing_keys:
             raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}")
+
+        if par["cellbender_obs_keys"]:
+            missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns]
+            if missing_cellbender_keys:
+                logger.info(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbender first to include these metrics.")
 
         sample_id = (
             mod_obs[par["sample_id_key"]].tolist()
@@ -104,9 +119,10 @@ def main(par):
             {
                 "sample_id": pd.Categorical(sample_id),
                 **{key: mod_obs[key] for key in par["obs_keys"]},
+                **{key: mod_obs[key] for key in par["cellbender_obs_keys"] if par["cellbender_obs_keys"]},
             }
         )
-
+        
         sample_summary_stats = pd.DataFrame(
             {
                 "sample_id": pd.Categorical([sample_id[0]]),
@@ -146,7 +162,7 @@ def main(par):
                 metrics[col] = pd.to_numeric(metrics[col], errors="coerce")
             metrics["sample_id"] = [sample_id[0]]
             metrics_cellranger_dfs.append(metrics)
-
+        
         cell_stats_dfs.append(cell_rna_stats)
         sample_stats_dfs.append(sample_summary_stats)
 

diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py
@@ -16,8 +16,8 @@ def test_simple_execution(run_component, tmp_path):
 
     run_component(
         [
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
             "--output", output_json_path,
         ]
     )
@@ -29,8 +29,10 @@ def test_simple_execution(run_component, tmp_path):
 
     assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}
 
-    column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
-    assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
+    column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
+    assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal",
+                                 "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size",
+                                 "cellbender_droplet_efficiency"]
 
     for key in output_json_dict.keys():
         assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"}
@@ -43,8 +45,8 @@ def test_set_filters(run_component, tmp_path):
 
     run_component(
         [
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
             "--output", output_json_path,
             "--sample_id_key", "sample_id",
             "--min_total_counts", "10",
@@ -62,7 +64,8 @@ def test_set_filters(run_component, tmp_path):
     assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}
 
     column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
-    assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"]
+    assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "cellbender_background_fraction",
+                            "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"]
 
     for key in output_json_dict.keys():
         assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"}

diff --git a/src/utils/setup_logger.py b/src/utils/setup_logger.py
@@ -0,0 +1,12 @@
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger