From f7e1ff6bd6c59cfab1a3edd57f3577717b9ea3eb Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Mon, 10 Mar 2025 10:15:43 +0100 Subject: [PATCH 01/19] add params, update test data --- resources_test_scripts/qc_sample_data.sh | 50 +++++++++++++++++-- .../generate_report/config.vsh.yaml | 14 ++++++ src/ingestion_qc/generate_report/main.nf | 21 ++++++-- src/ingestion_qc/generate_report/test.sh | 6 +-- .../h5mu_to_qc_json/config.vsh.yaml | 4 +- src/ingestion_qc/h5mu_to_qc_json/test.py | 4 +- 6 files changed, 84 insertions(+), 15 deletions(-) diff --git a/resources_test_scripts/qc_sample_data.sh b/resources_test_scripts/qc_sample_data.sh index d88c436..993a6e1 100755 --- a/resources_test_scripts/qc_sample_data.sh +++ b/resources_test_scripts/qc_sample_data.sh @@ -5,7 +5,7 @@ OUT_DIR=resources_test/qc_sample_data [ ! -d "$OUT_DIR" ] && mkdir -p "$OUT_DIR" # fetch/create h5mu from somewhere -cat > /tmp/params.yaml <<EOF +cat > /tmp/params_create_h5mu.yaml <<EOF param_list: - id: sample_one input_id: sample_one @@ -24,13 +24,55 @@ nextflow run openpipelines-bio/openpipeline \ -r 2.0.0 \ -main-script target/nextflow/metadata/add_id/main.nf \ -profile docker \ - -params-file /tmp/params.yaml \ + -params-file /tmp/params_create_h5mu.yaml \ + -resume + +cat > /tmp/params_subset.yaml <<EOF +param_list: + - id: sample_one + input: resources_test/qc_sample_data/sample_one.qc.h5mu + - id: sample_two + input: resources_test/qc_sample_data/sample_two.qc.h5mu +output: '\$id.qc.h5mu' +number_of_observations: 10000 +output_compression: gzip +publish_dir: "$OUT_DIR" +EOF + +# subset h5mus +nextflow run openpipelines-bio/openpipeline \ + -latest \ + -r 2.0.0 \ + -main-script target/nextflow/filter/subset_h5mu/main.nf \ + -profile docker \ + -params-file /tmp/params_subset.yaml \ + -resume + +# generate cellbender out for testing +cat > /tmp/params_cellbender.yaml <<EOF +param_list: + - id: sample_one + input: resources_test/qc_sample_data/sample_one.qc.h5mu + - id: sample_two + input: resources_test/qc_sample_data/sample_two.qc.h5mu +output: '\$id.qc.cellbender.h5mu' +epochs: 5 +output_compression: gzip +publish_dir: "$OUT_DIR" +EOF + +nextflow run openpipelines-bio/openpipeline \ + -latest \ + -r 2.0.0 \ + -main-script target/nextflow/correction/cellbender_remove_background/main.nf \ + -profile docker \ + -params-file /tmp/params_cellbender.yaml \ -resume # generate json for testing viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \ - --input "$OUT_DIR"//sample_one.qc.h5mu \ - --input "$OUT_DIR"/sample_two.qc.h5mu \ + --input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \ + --input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \ --output "$OUT_DIR"/dataset.json # copy to s3 diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml index 23db8a6..57878ae 100644 --- a/src/ingestion_qc/generate_report/config.vsh.yaml +++ b/src/ingestion_qc/generate_report/config.vsh.yaml @@ -59,6 +59,17 @@ argument_groups: default: "ribosomal" description: | In which .var slot to store a boolean array corresponding the ribosomal genes. + - mame: "--obs_cell_probability" + type: string + required: false + default: "cellbender_cell_probability" + description: | + In which .obs slot to store the cell probability. + - name: "--cellbender_epochs" + type: integer + required: false + description: Number of epochs to train cellbender. + default: 150 - name: Outputs arguments: @@ -78,6 +89,9 @@ dependencies: - name: workflows/qc/qc alias: qc_wf repository: openpipeline + - name: correction/cellbender_remove_background + alias: cellbender + repository: openpipeline - name: ingestion_qc/h5mu_to_qc_json - name: ingestion_qc/generate_html runners: diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf index e8c7964..f6260c8 100644 --- a/src/ingestion_qc/generate_report/main.nf +++ b/src/ingestion_qc/generate_report/main.nf @@ -8,14 +8,25 @@ workflow run_wf { [id, state + [_meta: [join_id: id]]] } + // run cellbender + | cellbender.run( + fromState: [ + id: "id", + input: "input", + obs_cell_probability: "obs_cell_probability", + epochs: "cellbender_epochs", + ], + toState: ["output"] + ) + // run qc on each sample | qc_wf.run( fromState: [ - "id", - "input", - "var_gene_names", - "var_name_mitochondrial_genes", - "var_name_ribosomal_genes" + id: "id", + input: "output", + var_gene_names: "var_gene_names", + var_name_mitochondrial_genes: "var_name_mitochondrial_genes", + var_name_ribosomal_genes: "var_name_ribosomal_genes" ], toState: ["output"] ) diff --git a/src/ingestion_qc/generate_report/test.sh b/src/ingestion_qc/generate_report/test.sh index e0e81aa..f681159 100755 --- a/src/ingestion_qc/generate_report/test.sh +++ b/src/ingestion_qc/generate_report/test.sh @@ -4,17 +4,17 @@ viash ns build --setup cb --parallel cat > /tmp/params.yaml <<EOF param_list: - - input: resources_test/sample_data/sample_1.qc.output.h5mu + - input: resources_test/qc_sample_data/sample_one.qc.h5mu id: sample_one - - input: resources_test/sample_data/sample_2.qc.output.h5mu + - input: resources_test/qc_sample_data/sample_two.qc.h5mu id: sample_two output_qc_json: output_qc.json output_html: output_report.html EOF + nextflow run . \ -main-script target/nextflow/ingestion_qc/generate_report/main.nf \ -params-file /tmp/params.yaml \ -profile docker \ --publish_dir test_results \ - --resume diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml index 5aa68fd..6e01e2e 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml +++ b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml @@ -50,7 +50,9 @@ argument_groups: type: string multiple: true description: The keys in the h5mu .obs to include in the output JSON - default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] + default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", + "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", + "cellbender_droplet_efficiency"] - name: --cellranger_metrics_uns_key type: string description: The key in the h5mu file .uns that contains the cellranger metrics diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py index f5bddd4..a655f74 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/test.py +++ b/src/ingestion_qc/h5mu_to_qc_json/test.py @@ -16,8 +16,8 @@ def test_simple_execution(run_component, tmp_path): run_component( [ - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu", - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu", "--output", output_json_path, ] ) From 2adbb699da6c70fcd02a68a888d204da14193fcc Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Mon, 10 Mar 2025 10:20:46 +0100 Subject: [PATCH 02/19] add executable runner --- src/ingestion_qc/generate_report/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml index 57878ae..48bb970 100644 --- a/src/ingestion_qc/generate_report/config.vsh.yaml +++ b/src/ingestion_qc/generate_report/config.vsh.yaml @@ -95,4 +95,5 @@ dependencies: - name: ingestion_qc/h5mu_to_qc_json - name: ingestion_qc/generate_html runners: + - type: executable - type: nextflow From dc9f4cee833cc7021059a84b4ebfd62c282f0329 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Mon, 10 Mar 2025 10:40:24 +0100 Subject: [PATCH 03/19] fix typo --- src/ingestion_qc/generate_report/config.vsh.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml index 48bb970..eab9da2 100644 --- a/src/ingestion_qc/generate_report/config.vsh.yaml +++ b/src/ingestion_qc/generate_report/config.vsh.yaml @@ -59,7 +59,7 @@ argument_groups: default: "ribosomal" description: | In which .var slot to store a boolean array corresponding the ribosomal genes. - - mame: "--obs_cell_probability" + - name: "--obs_cell_probability" type: string required: false default: "cellbender_cell_probability" @@ -95,5 +95,4 @@ dependencies: - name: ingestion_qc/h5mu_to_qc_json - name: ingestion_qc/generate_html runners: - - type: executable - type: nextflow From a44993c4df07781a9e91d2c43c3ec3b3fb97a487 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Mon, 10 Mar 2025 11:24:05 +0100 Subject: [PATCH 04/19] update unit tests --- src/ingestion_qc/h5mu_to_qc_json/test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py index a655f74..955749d 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/test.py +++ b/src/ingestion_qc/h5mu_to_qc_json/test.py @@ -30,7 +30,9 @@ def test_simple_execution(run_component, tmp_path): assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] - assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] + assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", + "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", + "cellbender_droplet_efficiency"] for key in output_json_dict.keys(): assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"} @@ -43,8 +45,8 @@ def test_set_filters(run_component, tmp_path): run_component( [ - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu", - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu", "--output", output_json_path, "--sample_id_key", "sample_id", "--min_total_counts", "10", From 95f884fd06bbbf5197684b7b03f424f557077763 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Thu, 13 Mar 2025 15:29:12 +0100 Subject: [PATCH 05/19] updated data parsing --- .../h5mu_to_qc_json/config.vsh.yaml | 8 +++-- src/ingestion_qc/h5mu_to_qc_json/script.py | 33 ++++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml index 6e01e2e..825f0d8 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml +++ b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml @@ -50,8 +50,12 @@ argument_groups: type: string multiple: true description: The keys in the h5mu .obs to include in the output JSON - default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", - "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", + default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] + - name: --cellbender_obs_keys + type: string + multiple: true + description: The cellbender keys in the h5mu .obs to include in the output JSON + default: ["cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"] - name: --cellranger_metrics_uns_key type: string diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py index 34210c3..7161c5d 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/script.py +++ b/src/ingestion_qc/h5mu_to_qc_json/script.py @@ -7,7 +7,7 @@ ## VIASH START # inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu")) # output = "data/sample-data.json" -inputs = list(Path("resources_test/qc_sample_data").glob("*.h5mu")) +inputs = list(Path("resources_test/qc_sample_data").glob("*.qc.cellbender.h5mu")) output = "tmp.json" par = { "input": sorted([str(x) for x in inputs]), @@ -21,8 +21,13 @@ "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", - "pct_of_counts_in_top_50_vars", ], + "cellbender_obs_keys": [ + "cellbender_background_fraction", + "cellbender_cell_probability", + "cellbender_cell_size", + "cellbender_droplet_efficiency", + ], "cellranger_metrics_uns_key": "metrics_cellranger", } i = 0 @@ -62,6 +67,7 @@ def transform_df(df): def main(par): cell_stats_dfs = [] + cellbender_cell_stats_dfs = [] sample_stats_dfs = [] metrics_cellranger_dfs = [] @@ -93,6 +99,7 @@ def main(par): missing_keys = [key for key in par["obs_keys"] if key not in mod_obs.columns] if missing_keys: raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}") + sample_id = ( mod_obs[par["sample_id_key"]].tolist() @@ -106,7 +113,7 @@ def main(par): **{key: mod_obs[key] for key in par["obs_keys"]}, } ) - + sample_summary_stats = pd.DataFrame( { "sample_id": pd.Categorical([sample_id[0]]), @@ -147,18 +154,36 @@ def main(par): metrics["sample_id"] = [sample_id[0]] metrics_cellranger_dfs.append(metrics) + if par["cellbender_obs_keys"]: + missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns] + if missing_cellbender_keys: + raise ValueError(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbenbder first.") + + cellbender_rna_stats = pd.DataFrame( + { + "sample_id": pd.Categorical(sample_id), + **{key: mod_obs[key] for key in par["cellbender_obs_keys"]}, + } + ) + + else: + cellbender_rna_stats = pd.DataFrame() + cell_stats_dfs.append(cell_rna_stats) + cellbender_cell_stats_dfs.append(cellbender_rna_stats) sample_stats_dfs.append(sample_summary_stats) combined_cell_stats = pd.concat(cell_stats_dfs, ignore_index=True) + combined_cellbender_stats = pd.concat(cellbender_cell_stats_dfs, ignore_index=True) combined_sample_stats = pd.concat(sample_stats_dfs, ignore_index=True) combined_metrics_cellranger = pd.concat(metrics_cellranger_dfs, ignore_index=True) - for df in [combined_cell_stats, combined_sample_stats, combined_metrics_cellranger]: + for df in [combined_cell_stats, combined_cellbender_stats, combined_sample_stats, combined_metrics_cellranger]: df["sample_id"] = pd.Categorical(df["sample_id"]) output = { "cell_rna_stats": transform_df(combined_cell_stats), + "cellbender_rna_stats": transform_df(combined_cellbender_stats), "sample_summary_stats": transform_df(combined_sample_stats), "metrics_cellranger_stats": transform_df(combined_metrics_cellranger), } From 844968f144c55bfcbd56e664cc21ebfe343125de Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Thu, 13 Mar 2025 16:20:57 +0100 Subject: [PATCH 06/19] update tests --- src/ingestion_qc/h5mu_to_qc_json/test.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py index 955749d..eb84ceb 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/test.py +++ b/src/ingestion_qc/h5mu_to_qc_json/test.py @@ -27,12 +27,13 @@ def test_simple_execution(run_component, tmp_path): with open(output_json_path, "r") as f: output_json_dict = json.load(f) - assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} + assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"} - column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] - assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", - "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", - "cellbender_droplet_efficiency"] + column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] + assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] + + column_names_cellbender = [col["name"] for col in output_json_dict["cellbender_rna_stats"]["columns"]] + assert column_names_cellbender == ["sample_id", "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"] for key in output_json_dict.keys(): assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"} @@ -61,7 +62,7 @@ def test_set_filters(run_component, tmp_path): with open(output_json_path, "r") as f: output_json_dict = json.load(f) - assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} + assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"} column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"] From c570a5482819197e4000d1924d0236c8d913a4d9 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Tue, 18 Mar 2025 17:31:01 +0100 Subject: [PATCH 07/19] PR updates --- _viash.yaml | 2 +- src/ingestion_qc/generate_report/config.vsh.yaml | 9 ++++++++- src/ingestion_qc/generate_report/main.nf | 1 + src/ingestion_qc/generate_report/test.sh | 4 +++- src/ingestion_qc/h5mu_to_qc_json/script.py | 2 +- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 56e5e0c..251e5f3 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -14,7 +14,7 @@ repositories: - name: openpipeline repo: openpipelines-bio/openpipeline type: github - tag: main_build + tag: 2.0.0 info: test_resources: diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml index eab9da2..a4149bd 100644 --- a/src/ingestion_qc/generate_report/config.vsh.yaml +++ b/src/ingestion_qc/generate_report/config.vsh.yaml @@ -70,7 +70,11 @@ argument_groups: required: false description: Number of epochs to train cellbender. default: 150 - + - name: "--run_cellbender" + type: boolean + required: false + description: Whether to run cellbender or not. + default: false - name: Outputs arguments: - name: --output @@ -79,10 +83,12 @@ argument_groups: direction: output description: The output HTML report example: path/to/file.html + resources: - type: nextflow_script entrypoint: run_wf path: main.nf + dependencies: - name: metadata/add_id repository: openpipeline @@ -94,5 +100,6 @@ dependencies: repository: openpipeline - name: ingestion_qc/h5mu_to_qc_json - name: ingestion_qc/generate_html + runners: - type: nextflow diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf index f6260c8..1b16d26 100644 --- a/src/ingestion_qc/generate_report/main.nf +++ b/src/ingestion_qc/generate_report/main.nf @@ -10,6 +10,7 @@ workflow run_wf { // run cellbender | cellbender.run( + runIf: {id, state -> state.run_cellbender}, fromState: [ id: "id", input: "input", diff --git a/src/ingestion_qc/generate_report/test.sh b/src/ingestion_qc/generate_report/test.sh index f681159..08f77c0 100755 --- a/src/ingestion_qc/generate_report/test.sh +++ b/src/ingestion_qc/generate_report/test.sh @@ -8,6 +8,8 @@ param_list: id: sample_one - input: resources_test/qc_sample_data/sample_two.qc.h5mu id: sample_two +cellbender_epochs: 5 +run_cellbender: true output_qc_json: output_qc.json output_html: output_report.html EOF @@ -17,4 +19,4 @@ nextflow run . \ -main-script target/nextflow/ingestion_qc/generate_report/main.nf \ -params-file /tmp/params.yaml \ -profile docker \ - --publish_dir test_results \ + --publish_dir test_results diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py index 7161c5d..65874f2 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/script.py +++ b/src/ingestion_qc/h5mu_to_qc_json/script.py @@ -157,7 +157,7 @@ def main(par): if par["cellbender_obs_keys"]: missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns] if missing_cellbender_keys: - raise ValueError(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbenbder first.") + raise ValueError(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbender first.") cellbender_rna_stats = pd.DataFrame( { From 329bc5cdbd41b04a43986e16f124590c559a676d Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Wed, 19 Mar 2025 13:41:52 +0100 Subject: [PATCH 08/19] update optional cellbender step --- .../generate_html/config.vsh.yaml | 2 +- .../generate_report/config.vsh.yaml | 16 ++++------ src/ingestion_qc/generate_report/main.nf | 1 - .../h5mu_to_qc_json/config.vsh.yaml | 1 + src/ingestion_qc/h5mu_to_qc_json/script.py | 31 +++++++------------ src/ingestion_qc/h5mu_to_qc_json/test.py | 11 +++---- src/utils/setup_logger.py | 12 +++++++ 7 files changed, 36 insertions(+), 38 deletions(-) create mode 100644 src/utils/setup_logger.py diff --git a/src/ingestion_qc/generate_html/config.vsh.yaml b/src/ingestion_qc/generate_html/config.vsh.yaml index fa79d60..d86d57b 100644 --- a/src/ingestion_qc/generate_html/config.vsh.yaml +++ b/src/ingestion_qc/generate_html/config.vsh.yaml @@ -45,7 +45,7 @@ engines: - type: docker run: | npm install -g pnpm@latest-10 \ - && cd /opt && git clone https://github.com/openpipelines-bio/incubator_ingestion_qc.git \ + && cd /opt && git clone -b ambient-rna https://github.com/openpipelines-bio/incubator_ingestion_qc.git \ && cd incubator_ingestion_qc && pnpm install runners: - type: executable diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml index a4149bd..1c3e810 100644 --- a/src/ingestion_qc/generate_report/config.vsh.yaml +++ b/src/ingestion_qc/generate_report/config.vsh.yaml @@ -59,22 +59,18 @@ argument_groups: default: "ribosomal" description: | In which .var slot to store a boolean array corresponding the ribosomal genes. - - name: "--obs_cell_probability" - type: string + - name: Cellbender options + arguments: + - name: "--run_cellbender" + type: boolean required: false - default: "cellbender_cell_probability" - description: | - In which .obs slot to store the cell probability. + description: Whether to run cellbender or not. + default: false - name: "--cellbender_epochs" type: integer required: false description: Number of epochs to train cellbender. default: 150 - - name: "--run_cellbender" - type: boolean - required: false - description: Whether to run cellbender or not. - default: false - name: Outputs arguments: - name: --output diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf index 1b16d26..213caa7 100644 --- a/src/ingestion_qc/generate_report/main.nf +++ b/src/ingestion_qc/generate_report/main.nf @@ -14,7 +14,6 @@ workflow run_wf { fromState: [ id: "id", input: "input", - obs_cell_probability: "obs_cell_probability", epochs: "cellbender_epochs", ], toState: ["output"] diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml index 825f0d8..7e49651 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml +++ b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml @@ -64,6 +64,7 @@ argument_groups: resources: - type: python_script path: script.py + - path: /src/utils/setup_logger.py test_resources: - type: python_script path: test.py diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py index 65874f2..5992bb9 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/script.py +++ b/src/ingestion_qc/h5mu_to_qc_json/script.py @@ -3,6 +3,7 @@ from pathlib import Path import anndata as ad import h5py +import sys ## VIASH START # inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu")) @@ -34,6 +35,10 @@ mudata_file = par["input"][i] ## VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() def transform_df(df): """Transform a DataFrame into the annotation object format.""" @@ -67,7 +72,6 @@ def transform_df(df): def main(par): cell_stats_dfs = [] - cellbender_cell_stats_dfs = [] sample_stats_dfs = [] metrics_cellranger_dfs = [] @@ -100,6 +104,10 @@ def main(par): if missing_keys: raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}") + if par["cellbender_obs_keys"]: + missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns] + if missing_cellbender_keys: + logger.info(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbender first to include these metrics.") sample_id = ( mod_obs[par["sample_id_key"]].tolist() @@ -111,6 +119,7 @@ def main(par): { "sample_id": pd.Categorical(sample_id), **{key: mod_obs[key] for key in par["obs_keys"]}, + **{key: mod_obs[key] for key in par["cellbender_obs_keys"] if par["cellbender_obs_keys"]}, } ) @@ -153,37 +162,19 @@ def main(par): metrics[col] = pd.to_numeric(metrics[col], errors="coerce") metrics["sample_id"] = [sample_id[0]] metrics_cellranger_dfs.append(metrics) - - if par["cellbender_obs_keys"]: - missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns] - if missing_cellbender_keys: - raise ValueError(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbender first.") - - cellbender_rna_stats = pd.DataFrame( - { - "sample_id": pd.Categorical(sample_id), - **{key: mod_obs[key] for key in par["cellbender_obs_keys"]}, - } - ) - - else: - cellbender_rna_stats = pd.DataFrame() cell_stats_dfs.append(cell_rna_stats) - cellbender_cell_stats_dfs.append(cellbender_rna_stats) sample_stats_dfs.append(sample_summary_stats) combined_cell_stats = pd.concat(cell_stats_dfs, ignore_index=True) - combined_cellbender_stats = pd.concat(cellbender_cell_stats_dfs, ignore_index=True) combined_sample_stats = pd.concat(sample_stats_dfs, ignore_index=True) combined_metrics_cellranger = pd.concat(metrics_cellranger_dfs, ignore_index=True) - for df in [combined_cell_stats, combined_cellbender_stats, combined_sample_stats, combined_metrics_cellranger]: + for df in [combined_cell_stats, combined_sample_stats, combined_metrics_cellranger]: df["sample_id"] = pd.Categorical(df["sample_id"]) output = { "cell_rna_stats": transform_df(combined_cell_stats), - "cellbender_rna_stats": transform_df(combined_cellbender_stats), "sample_summary_stats": transform_df(combined_sample_stats), "metrics_cellranger_stats": transform_df(combined_metrics_cellranger), } diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py index eb84ceb..c9349ca 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/test.py +++ b/src/ingestion_qc/h5mu_to_qc_json/test.py @@ -27,13 +27,12 @@ def test_simple_execution(run_component, tmp_path): with open(output_json_path, "r") as f: output_json_dict = json.load(f) - assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"} + assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] - assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] - - column_names_cellbender = [col["name"] for col in output_json_dict["cellbender_rna_stats"]["columns"]] - assert column_names_cellbender == ["sample_id", "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"] + assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", + "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", + "cellbender_droplet_efficiency"] for key in output_json_dict.keys(): assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"} @@ -62,7 +61,7 @@ def test_set_filters(run_component, tmp_path): with open(output_json_path, "r") as f: output_json_dict = json.load(f) - assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"} + assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"] diff --git a/src/utils/setup_logger.py b/src/utils/setup_logger.py new file mode 100644 index 0000000..3ca1cdb --- /dev/null +++ b/src/utils/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger From a6be8db3dd9d36f09fecde92f6dea5a12c742ed6 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Wed, 19 Mar 2025 13:48:38 +0100 Subject: [PATCH 09/19] update unit test --- src/ingestion_qc/h5mu_to_qc_json/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py index c9349ca..908f763 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/test.py +++ b/src/ingestion_qc/h5mu_to_qc_json/test.py @@ -64,7 +64,8 @@ def test_set_filters(run_component, tmp_path): assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] - assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"] + assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "cellbender_background_fraction", + "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"] for key in output_json_dict.keys(): assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"} From d4034fa43b601d3365400d5f4bac2f1e2e289dad Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Wed, 19 Mar 2025 14:09:48 +0100 Subject: [PATCH 10/19] remove dryrun --- resources_test_scripts/qc_sample_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources_test_scripts/qc_sample_data.sh b/resources_test_scripts/qc_sample_data.sh index 993a6e1..59b6b79 100755 --- a/resources_test_scripts/qc_sample_data.sh +++ b/resources_test_scripts/qc_sample_data.sh @@ -80,4 +80,4 @@ aws s3 sync \ --profile di \ resources_test/qc_sample_data \ s3://openpipelines-bio/openpipeline_incubator/resources_test/qc_sample_data \ - --delete --dryrun + --delete From 955af4ee4e1bc1094db62a4597d2d8819a8cd314 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Wed, 19 Mar 2025 14:17:18 +0100 Subject: [PATCH 11/19] ignore tests conversion component --- src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml index 7e49651..f1f1f1b 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml +++ b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml @@ -65,11 +65,11 @@ resources: - type: python_script path: script.py - path: /src/utils/setup_logger.py -test_resources: - - type: python_script - path: test.py - - type: file - path: /resources_test +# test_resources: +# - type: python_script +# path: test.py +# - type: file +# path: /resources_test engines: - type: docker image: python:3.12-slim From d6307a59da05e4215617a284980b27f4de078702 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Tue, 25 Mar 2025 10:58:13 +0100 Subject: [PATCH 12/19] add direwctives --- src/ingestion_qc/generate_html/config.vsh.yaml | 2 ++ src/ingestion_qc/generate_report/main.nf | 12 ++++++------ src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml | 2 ++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/ingestion_qc/generate_html/config.vsh.yaml b/src/ingestion_qc/generate_html/config.vsh.yaml index d86d57b..2a744bf 100644 --- a/src/ingestion_qc/generate_html/config.vsh.yaml +++ b/src/ingestion_qc/generate_html/config.vsh.yaml @@ -50,3 +50,5 @@ engines: runners: - type: executable - type: nextflow + directives: + label: [lowmem, lowdisk] diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf index 213caa7..952d98e 100644 --- a/src/ingestion_qc/generate_report/main.nf +++ b/src/ingestion_qc/generate_report/main.nf @@ -8,6 +8,12 @@ workflow run_wf { [id, state + [_meta: [join_id: id]]] } + // add sample ids to each state + | add_id.run( + fromState: [input_id: "id", input: "output"], + toState: ["output"] + ) + // run cellbender | cellbender.run( runIf: {id, state -> state.run_cellbender}, @@ -31,12 +37,6 @@ workflow run_wf { toState: ["output"] ) - // add sample ids to each state - | add_id.run( - fromState: [input_id: "id", input: "output"], - toState: ["output"] - ) - // combine files into one state | joinStates { ids, states -> def newId = "combined" diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml index f1f1f1b..e9f9bc8 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml +++ b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml @@ -92,3 +92,5 @@ engines: runners: - type: executable - type: nextflow + directives: + label: [midmem, middisk] From c16be2c1b492d031f190798281b1f77c1c5a64d7 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Tue, 25 Mar 2025 11:12:33 +0100 Subject: [PATCH 13/19] move add_id --- src/ingestion_qc/generate_report/main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf index 952d98e..213caa7 100644 --- a/src/ingestion_qc/generate_report/main.nf +++ b/src/ingestion_qc/generate_report/main.nf @@ -8,12 +8,6 @@ workflow run_wf { [id, state + [_meta: [join_id: id]]] } - // add sample ids to each state - | add_id.run( - fromState: [input_id: "id", input: "output"], - toState: ["output"] - ) - // run cellbender | cellbender.run( runIf: {id, state -> state.run_cellbender}, @@ -37,6 +31,12 @@ workflow run_wf { toState: ["output"] ) + // add sample ids to each state + | add_id.run( + fromState: [input_id: "id", input: "output"], + toState: ["output"] + ) + // combine files into one state | joinStates { ids, states -> def newId = "combined" From a67a150c025c99209e2d10bfd6cf034b8c9cfad2 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Tue, 25 Mar 2025 17:54:10 +0100 Subject: [PATCH 14/19] get op components from main --- _viash.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 251e5f3..947519a 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -14,7 +14,7 @@ repositories: - name: openpipeline repo: openpipelines-bio/openpipeline type: github - tag: 2.0.0 + tag: main info: test_resources: From 87d934f9774ade2e83b87e97a52286fd17e20811 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Tue, 25 Mar 2025 18:25:39 +0100 Subject: [PATCH 15/19] remove tag --- _viash.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 947519a..926a45f 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -14,7 +14,7 @@ repositories: - name: openpipeline repo: openpipelines-bio/openpipeline type: github - tag: main + # tag: main info: test_resources: From e031b2ff25104467b65d583976227239af362154 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Tue, 25 Mar 2025 19:13:36 +0100 Subject: [PATCH 16/19] op main_build tag --- _viash.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 926a45f..56e5e0c 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -14,7 +14,7 @@ repositories: - name: openpipeline repo: openpipelines-bio/openpipeline type: github - # tag: main + tag: main_build info: test_resources: From 9fc517c3a5557d8cc39762c04a348c25553cc2ee Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Wed, 26 Mar 2025 10:32:42 +0100 Subject: [PATCH 17/19] trigger container rebuild --- src/ingestion_qc/generate_html/config.vsh.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ingestion_qc/generate_html/config.vsh.yaml b/src/ingestion_qc/generate_html/config.vsh.yaml index 2a744bf..c290f69 100644 --- a/src/ingestion_qc/generate_html/config.vsh.yaml +++ b/src/ingestion_qc/generate_html/config.vsh.yaml @@ -46,7 +46,8 @@ engines: run: | npm install -g pnpm@latest-10 \ && cd /opt && git clone -b ambient-rna https://github.com/openpipelines-bio/incubator_ingestion_qc.git \ - && cd incubator_ingestion_qc && pnpm install + && cd incubator_ingestion_qc && pnpm install \ + && true runners: - type: executable - type: nextflow From 155413334d1fbecb3a5e6eb975029624eb4e2d91 Mon Sep 17 00:00:00 2001 From: jakubmajercik <jakub.majercik@gmail.com> Date: Wed, 26 Mar 2025 22:53:39 +0100 Subject: [PATCH 18/19] remove from docker setup --- src/ingestion_qc/generate_html/config.vsh.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ingestion_qc/generate_html/config.vsh.yaml b/src/ingestion_qc/generate_html/config.vsh.yaml index c290f69..2a744bf 100644 --- a/src/ingestion_qc/generate_html/config.vsh.yaml +++ b/src/ingestion_qc/generate_html/config.vsh.yaml @@ -46,8 +46,7 @@ engines: run: | npm install -g pnpm@latest-10 \ && cd /opt && git clone -b ambient-rna https://github.com/openpipelines-bio/incubator_ingestion_qc.git \ - && cd incubator_ingestion_qc && pnpm install \ - && true + && cd incubator_ingestion_qc && pnpm install runners: - type: executable - type: nextflow From aa59d28d37a313a50b126125e2fb7e909e82d222 Mon Sep 17 00:00:00 2001 From: Dorien <41797896+dorien-er@users.noreply.github.com> Date: Fri, 28 Mar 2025 14:35:22 +0100 Subject: [PATCH 19/19] Update src/ingestion_qc/h5mu_to_qc_json/script.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/ingestion_qc/h5mu_to_qc_json/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py index 5992bb9..3580070 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/script.py +++ b/src/ingestion_qc/h5mu_to_qc_json/script.py @@ -119,7 +119,7 @@ def main(par): { "sample_id": pd.Categorical(sample_id), **{key: mod_obs[key] for key in par["obs_keys"]}, - **{key: mod_obs[key] for key in par["cellbender_obs_keys"] if par["cellbender_obs_keys"]}, + **{key: mod_obs[key] for key in par["cellbender_obs_keys"] if key in mod_obs.columns}, } )