openpipelines-bio · jakubmajercik · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025
diff --git a/resources_test_scripts/qc_sample_data.sh b/resources_test_scripts/qc_sample_data.sh
@@ -5,7 +5,7 @@ OUT_DIR=resources_test/qc_sample_data
 [ ! -d "$OUT_DIR" ] && mkdir -p "$OUT_DIR"
 
 # fetch/create h5mu from somewhere
-cat > /tmp/params.yaml <<EOF
+cat > /tmp/params_create_h5mu.yaml <<EOF
 param_list:
   - id: sample_one
     input_id: sample_one
@@ -24,13 +24,55 @@ nextflow run openpipelines-bio/openpipeline \
   -r 2.0.0 \
   -main-script target/nextflow/metadata/add_id/main.nf \
   -profile docker \
-  -params-file /tmp/params.yaml \
+  -params-file /tmp/params_create_h5mu.yaml \
+  -resume
+
+cat > /tmp/params_subset.yaml <<EOF
+param_list:
+  - id: sample_one
+    input: resources_test/qc_sample_data/sample_one.qc.h5mu
+  - id: sample_two
+    input: resources_test/qc_sample_data/sample_two.qc.h5mu
+output: '\$id.qc.h5mu'
+number_of_observations: 10000
+output_compression: gzip
+publish_dir: "$OUT_DIR"
+EOF
+
+# subset h5mus
+nextflow run openpipelines-bio/openpipeline \
+  -latest \
+  -r 2.0.0 \
+  -main-script target/nextflow/filter/subset_h5mu/main.nf \
+  -profile docker \
+  -params-file /tmp/params_subset.yaml \
+  -resume
+
+# generate cellbender out for testing
+cat > /tmp/params_cellbender.yaml <<EOF
+param_list:
+  - id: sample_one
+    input: resources_test/qc_sample_data/sample_one.qc.h5mu
+  - id: sample_two
+    input: resources_test/qc_sample_data/sample_two.qc.h5mu
+output: '\$id.qc.cellbender.h5mu'
+epochs: 5
+output_compression: gzip
+publish_dir: "$OUT_DIR"
+EOF
+
+nextflow run openpipelines-bio/openpipeline \
+  -latest \
+  -r 2.0.0 \
+  -main-script target/nextflow/correction/cellbender_remove_background/main.nf \
+  -profile docker \
+  -params-file /tmp/params_cellbender.yaml \
   -resume
 
 # generate json for testing
 viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
-  --input "$OUT_DIR"//sample_one.qc.h5mu \
-  --input "$OUT_DIR"/sample_two.qc.h5mu \
+  --input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
+  --input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
   --output "$OUT_DIR"/dataset.json
 
 # copy to s3

diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml
@@ -59,6 +59,17 @@ argument_groups:
         default: "ribosomal"
         description: |
           In which .var slot to store a boolean array corresponding the ribosomal genes.
+      - name: "--obs_cell_probability"
+        type: string
+        required: false
+        default: "cellbender_cell_probability"
+        description: |
+          In which .obs slot to store the cell probability.
+      - name: "--cellbender_epochs"
+        type: integer
+        required: false
+        description: Number of epochs to train cellbender. 
+        default: 150
 
   - name: Outputs
     arguments:
@@ -78,6 +89,9 @@ dependencies:
   - name: workflows/qc/qc
     alias: qc_wf
     repository: openpipeline
+  - name: correction/cellbender_remove_background
+    alias: cellbender
+    repository: openpipeline
   - name: ingestion_qc/h5mu_to_qc_json
   - name: ingestion_qc/generate_html
 runners:

diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf
@@ -8,14 +8,25 @@ workflow run_wf {
       [id, state + [_meta: [join_id: id]]]
     }
 
+    // run cellbender
+    | cellbender.run(
+      fromState: [
+        id: "id",
+        input: "input",
+        obs_cell_probability: "obs_cell_probability",
+        epochs: "cellbender_epochs",
+      ],
+      toState: ["output"]
+    )
+
     // run qc on each sample
     | qc_wf.run(
       fromState: [
-        "id",
-        "input",
-        "var_gene_names",
-        "var_name_mitochondrial_genes",
-        "var_name_ribosomal_genes"
+        id: "id",
+        input: "output",
+        var_gene_names: "var_gene_names",
+        var_name_mitochondrial_genes: "var_name_mitochondrial_genes",
+        var_name_ribosomal_genes: "var_name_ribosomal_genes"
       ],
       toState: ["output"]
     )

diff --git a/src/ingestion_qc/generate_report/test.sh b/src/ingestion_qc/generate_report/test.sh
@@ -4,17 +4,17 @@ viash ns build --setup cb --parallel
 
 cat > /tmp/params.yaml <<EOF
 param_list:
-  - input: resources_test/sample_data/sample_1.qc.output.h5mu
+  - input: resources_test/qc_sample_data/sample_one.qc.h5mu
     id: sample_one
-  - input: resources_test/sample_data/sample_2.qc.output.h5mu
+  - input: resources_test/qc_sample_data/sample_two.qc.h5mu
     id: sample_two
 output_qc_json: output_qc.json
 output_html: output_report.html
 EOF
 
+
 nextflow run . \
   -main-script target/nextflow/ingestion_qc/generate_report/main.nf \
   -params-file /tmp/params.yaml \
   -profile docker \
   --publish_dir test_results \
-  --resume
diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml
@@ -51,6 +51,12 @@ argument_groups:
         multiple: true
         description: The keys in the h5mu .obs to include in the output JSON
         default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
+      - name: --cellbender_obs_keys
+        type: string
+        multiple: true
+        description: The cellbender keys in the h5mu .obs to include in the output JSON
+        default: ["cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size",
+                  "cellbender_droplet_efficiency"]
       - name: --cellranger_metrics_uns_key
         type: string
         description: The key in the h5mu file .uns that contains the cellranger metrics

diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py
@@ -7,7 +7,7 @@
 ## VIASH START
 # inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu"))
 # output = "data/sample-data.json"
-inputs = list(Path("resources_test/qc_sample_data").glob("*.h5mu"))
+inputs = list(Path("resources_test/qc_sample_data").glob("*.qc.cellbender.h5mu"))
 output = "tmp.json"
 par = {
     "input": sorted([str(x) for x in inputs]),
@@ -21,8 +21,13 @@
         "num_nonzero_vars",
         "fraction_mitochondrial",
         "fraction_ribosomal",
-        "pct_of_counts_in_top_50_vars",
     ],
+    "cellbender_obs_keys": [
+        "cellbender_background_fraction",
+        "cellbender_cell_probability",
+        "cellbender_cell_size",
+        "cellbender_droplet_efficiency",
+    ],        
     "cellranger_metrics_uns_key": "metrics_cellranger",
 }
 i = 0
@@ -62,6 +67,7 @@ def transform_df(df):
 
 def main(par):
     cell_stats_dfs = []
+    cellbender_cell_stats_dfs = []
     sample_stats_dfs = []
     metrics_cellranger_dfs = []
 
@@ -93,6 +99,7 @@ def main(par):
         missing_keys = [key for key in par["obs_keys"] if key not in mod_obs.columns]
         if missing_keys:
             raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}")
+
 
         sample_id = (
             mod_obs[par["sample_id_key"]].tolist()
@@ -106,7 +113,7 @@ def main(par):
                 **{key: mod_obs[key] for key in par["obs_keys"]},
             }
         )
-
+        
         sample_summary_stats = pd.DataFrame(
             {
                 "sample_id": pd.Categorical([sample_id[0]]),
@@ -147,18 +154,36 @@ def main(par):
             metrics["sample_id"] = [sample_id[0]]
             metrics_cellranger_dfs.append(metrics)
 
+        if par["cellbender_obs_keys"]:
+            missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns]
+            if missing_cellbender_keys:
+                raise ValueError(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbenbder first.")
+
+            cellbender_rna_stats = pd.DataFrame(
+                {
+                    "sample_id": pd.Categorical(sample_id),
+                    **{key: mod_obs[key] for key in par["cellbender_obs_keys"]},
+                }
+            )
+
+        else:
+            cellbender_rna_stats = pd.DataFrame()
+
         cell_stats_dfs.append(cell_rna_stats)
+        cellbender_cell_stats_dfs.append(cellbender_rna_stats)
         sample_stats_dfs.append(sample_summary_stats)
 
     combined_cell_stats = pd.concat(cell_stats_dfs, ignore_index=True)
+    combined_cellbender_stats = pd.concat(cellbender_cell_stats_dfs, ignore_index=True)
     combined_sample_stats = pd.concat(sample_stats_dfs, ignore_index=True)
     combined_metrics_cellranger = pd.concat(metrics_cellranger_dfs, ignore_index=True)
 
-    for df in [combined_cell_stats, combined_sample_stats, combined_metrics_cellranger]:
+    for df in [combined_cell_stats, combined_cellbender_stats, combined_sample_stats, combined_metrics_cellranger]:
         df["sample_id"] = pd.Categorical(df["sample_id"])
 
     output = {
         "cell_rna_stats": transform_df(combined_cell_stats),
+        "cellbender_rna_stats": transform_df(combined_cellbender_stats),
         "sample_summary_stats": transform_df(combined_sample_stats),
         "metrics_cellranger_stats": transform_df(combined_metrics_cellranger),
     }

diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py
@@ -16,8 +16,8 @@ def test_simple_execution(run_component, tmp_path):
 
     run_component(
         [
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
             "--output", output_json_path,
         ]
     )
@@ -27,10 +27,13 @@ def test_simple_execution(run_component, tmp_path):
     with open(output_json_path, "r") as f:
         output_json_dict = json.load(f)
 
-    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}
+    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"}
 
-    column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
-    assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
+    column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
+    assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
+
+    column_names_cellbender = [col["name"] for col in output_json_dict["cellbender_rna_stats"]["columns"]]
+    assert column_names_cellbender == ["sample_id", "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"]
 
     for key in output_json_dict.keys():
         assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"}
@@ -43,8 +46,8 @@ def test_set_filters(run_component, tmp_path):
 
     run_component(
         [
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
-            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
+            "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
             "--output", output_json_path,
             "--sample_id_key", "sample_id",
             "--min_total_counts", "10",
@@ -59,7 +62,7 @@ def test_set_filters(run_component, tmp_path):
     with open(output_json_path, "r") as f:
         output_json_dict = json.load(f)
 
-    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}
+    assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"}
 
     column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
     assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"]