diff --git a/resources_test_scripts/qc_sample_data.sh b/resources_test_scripts/qc_sample_data.sh index d88c436..59b6b79 100755 --- a/resources_test_scripts/qc_sample_data.sh +++ b/resources_test_scripts/qc_sample_data.sh @@ -5,7 +5,7 @@ OUT_DIR=resources_test/qc_sample_data [ ! -d "$OUT_DIR" ] && mkdir -p "$OUT_DIR" # fetch/create h5mu from somewhere -cat > /tmp/params.yaml <<EOF +cat > /tmp/params_create_h5mu.yaml <<EOF param_list: - id: sample_one input_id: sample_one @@ -24,13 +24,55 @@ nextflow run openpipelines-bio/openpipeline \ -r 2.0.0 \ -main-script target/nextflow/metadata/add_id/main.nf \ -profile docker \ - -params-file /tmp/params.yaml \ + -params-file /tmp/params_create_h5mu.yaml \ + -resume + +cat > /tmp/params_subset.yaml <<EOF +param_list: + - id: sample_one + input: resources_test/qc_sample_data/sample_one.qc.h5mu + - id: sample_two + input: resources_test/qc_sample_data/sample_two.qc.h5mu +output: '\$id.qc.h5mu' +number_of_observations: 10000 +output_compression: gzip +publish_dir: "$OUT_DIR" +EOF + +# subset h5mus +nextflow run openpipelines-bio/openpipeline \ + -latest \ + -r 2.0.0 \ + -main-script target/nextflow/filter/subset_h5mu/main.nf \ + -profile docker \ + -params-file /tmp/params_subset.yaml \ + -resume + +# generate cellbender out for testing +cat > /tmp/params_cellbender.yaml <<EOF +param_list: + - id: sample_one + input: resources_test/qc_sample_data/sample_one.qc.h5mu + - id: sample_two + input: resources_test/qc_sample_data/sample_two.qc.h5mu +output: '\$id.qc.cellbender.h5mu' +epochs: 5 +output_compression: gzip +publish_dir: "$OUT_DIR" +EOF + +nextflow run openpipelines-bio/openpipeline \ + -latest \ + -r 2.0.0 \ + -main-script target/nextflow/correction/cellbender_remove_background/main.nf \ + -profile docker \ + -params-file /tmp/params_cellbender.yaml \ -resume # generate json for testing viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \ - --input "$OUT_DIR"//sample_one.qc.h5mu \ - --input "$OUT_DIR"/sample_two.qc.h5mu \ + --input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \ + --input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \ --output "$OUT_DIR"/dataset.json # copy to s3 @@ -38,4 +80,4 @@ aws s3 sync \ --profile di \ resources_test/qc_sample_data \ s3://openpipelines-bio/openpipeline_incubator/resources_test/qc_sample_data \ - --delete --dryrun + --delete diff --git a/src/ingestion_qc/generate_html/config.vsh.yaml b/src/ingestion_qc/generate_html/config.vsh.yaml index fa79d60..2a744bf 100644 --- a/src/ingestion_qc/generate_html/config.vsh.yaml +++ b/src/ingestion_qc/generate_html/config.vsh.yaml @@ -45,8 +45,10 @@ engines: - type: docker run: | npm install -g pnpm@latest-10 \ - && cd /opt && git clone https://github.com/openpipelines-bio/incubator_ingestion_qc.git \ + && cd /opt && git clone -b ambient-rna https://github.com/openpipelines-bio/incubator_ingestion_qc.git \ && cd incubator_ingestion_qc && pnpm install runners: - type: executable - type: nextflow + directives: + label: [lowmem, lowdisk] diff --git a/src/ingestion_qc/generate_report/config.vsh.yaml b/src/ingestion_qc/generate_report/config.vsh.yaml index 23db8a6..1c3e810 100644 --- a/src/ingestion_qc/generate_report/config.vsh.yaml +++ b/src/ingestion_qc/generate_report/config.vsh.yaml @@ -59,7 +59,18 @@ argument_groups: default: "ribosomal" description: | In which .var slot to store a boolean array corresponding the ribosomal genes. - + - name: Cellbender options + arguments: + - name: "--run_cellbender" + type: boolean + required: false + description: Whether to run cellbender or not. + default: false + - name: "--cellbender_epochs" + type: integer + required: false + description: Number of epochs to train cellbender. + default: 150 - name: Outputs arguments: - name: --output @@ -68,17 +79,23 @@ argument_groups: direction: output description: The output HTML report example: path/to/file.html + resources: - type: nextflow_script entrypoint: run_wf path: main.nf + dependencies: - name: metadata/add_id repository: openpipeline - name: workflows/qc/qc alias: qc_wf repository: openpipeline + - name: correction/cellbender_remove_background + alias: cellbender + repository: openpipeline - name: ingestion_qc/h5mu_to_qc_json - name: ingestion_qc/generate_html + runners: - type: nextflow diff --git a/src/ingestion_qc/generate_report/main.nf b/src/ingestion_qc/generate_report/main.nf index e8c7964..213caa7 100644 --- a/src/ingestion_qc/generate_report/main.nf +++ b/src/ingestion_qc/generate_report/main.nf @@ -8,14 +8,25 @@ workflow run_wf { [id, state + [_meta: [join_id: id]]] } + // run cellbender + | cellbender.run( + runIf: {id, state -> state.run_cellbender}, + fromState: [ + id: "id", + input: "input", + epochs: "cellbender_epochs", + ], + toState: ["output"] + ) + // run qc on each sample | qc_wf.run( fromState: [ - "id", - "input", - "var_gene_names", - "var_name_mitochondrial_genes", - "var_name_ribosomal_genes" + id: "id", + input: "output", + var_gene_names: "var_gene_names", + var_name_mitochondrial_genes: "var_name_mitochondrial_genes", + var_name_ribosomal_genes: "var_name_ribosomal_genes" ], toState: ["output"] ) diff --git a/src/ingestion_qc/generate_report/test.sh b/src/ingestion_qc/generate_report/test.sh index e0e81aa..08f77c0 100755 --- a/src/ingestion_qc/generate_report/test.sh +++ b/src/ingestion_qc/generate_report/test.sh @@ -4,17 +4,19 @@ viash ns build --setup cb --parallel cat > /tmp/params.yaml <<EOF param_list: - - input: resources_test/sample_data/sample_1.qc.output.h5mu + - input: resources_test/qc_sample_data/sample_one.qc.h5mu id: sample_one - - input: resources_test/sample_data/sample_2.qc.output.h5mu + - input: resources_test/qc_sample_data/sample_two.qc.h5mu id: sample_two +cellbender_epochs: 5 +run_cellbender: true output_qc_json: output_qc.json output_html: output_report.html EOF + nextflow run . \ -main-script target/nextflow/ingestion_qc/generate_report/main.nf \ -params-file /tmp/params.yaml \ -profile docker \ - --publish_dir test_results \ - --resume + --publish_dir test_results diff --git a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml index 5aa68fd..e9f9bc8 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml +++ b/src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml @@ -51,6 +51,12 @@ argument_groups: multiple: true description: The keys in the h5mu .obs to include in the output JSON default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] + - name: --cellbender_obs_keys + type: string + multiple: true + description: The cellbender keys in the h5mu .obs to include in the output JSON + default: ["cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", + "cellbender_droplet_efficiency"] - name: --cellranger_metrics_uns_key type: string description: The key in the h5mu file .uns that contains the cellranger metrics @@ -58,11 +64,12 @@ argument_groups: resources: - type: python_script path: script.py -test_resources: - - type: python_script - path: test.py - - type: file - path: /resources_test + - path: /src/utils/setup_logger.py +# test_resources: +# - type: python_script +# path: test.py +# - type: file +# path: /resources_test engines: - type: docker image: python:3.12-slim @@ -85,3 +92,5 @@ engines: runners: - type: executable - type: nextflow + directives: + label: [midmem, middisk] diff --git a/src/ingestion_qc/h5mu_to_qc_json/script.py b/src/ingestion_qc/h5mu_to_qc_json/script.py index 34210c3..3580070 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/script.py +++ b/src/ingestion_qc/h5mu_to_qc_json/script.py @@ -3,11 +3,12 @@ from pathlib import Path import anndata as ad import h5py +import sys ## VIASH START # inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu")) # output = "data/sample-data.json" -inputs = list(Path("resources_test/qc_sample_data").glob("*.h5mu")) +inputs = list(Path("resources_test/qc_sample_data").glob("*.qc.cellbender.h5mu")) output = "tmp.json" par = { "input": sorted([str(x) for x in inputs]), @@ -21,14 +22,23 @@ "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", - "pct_of_counts_in_top_50_vars", ], + "cellbender_obs_keys": [ + "cellbender_background_fraction", + "cellbender_cell_probability", + "cellbender_cell_size", + "cellbender_droplet_efficiency", + ], "cellranger_metrics_uns_key": "metrics_cellranger", } i = 0 mudata_file = par["input"][i] ## VIASH END +sys.path.append(meta["resources_dir"]) +from setup_logger import setup_logger + +logger = setup_logger() def transform_df(df): """Transform a DataFrame into the annotation object format.""" @@ -93,6 +103,11 @@ def main(par): missing_keys = [key for key in par["obs_keys"] if key not in mod_obs.columns] if missing_keys: raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}") + + if par["cellbender_obs_keys"]: + missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns] + if missing_cellbender_keys: + logger.info(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbender first to include these metrics.") sample_id = ( mod_obs[par["sample_id_key"]].tolist() @@ -104,9 +119,10 @@ def main(par): { "sample_id": pd.Categorical(sample_id), **{key: mod_obs[key] for key in par["obs_keys"]}, + **{key: mod_obs[key] for key in par["cellbender_obs_keys"] if key in mod_obs.columns}, } ) - + sample_summary_stats = pd.DataFrame( { "sample_id": pd.Categorical([sample_id[0]]), @@ -146,7 +162,7 @@ def main(par): metrics[col] = pd.to_numeric(metrics[col], errors="coerce") metrics["sample_id"] = [sample_id[0]] metrics_cellranger_dfs.append(metrics) - + cell_stats_dfs.append(cell_rna_stats) sample_stats_dfs.append(sample_summary_stats) diff --git a/src/ingestion_qc/h5mu_to_qc_json/test.py b/src/ingestion_qc/h5mu_to_qc_json/test.py index f5bddd4..908f763 100644 --- a/src/ingestion_qc/h5mu_to_qc_json/test.py +++ b/src/ingestion_qc/h5mu_to_qc_json/test.py @@ -16,8 +16,8 @@ def test_simple_execution(run_component, tmp_path): run_component( [ - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu", - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu", "--output", output_json_path, ] ) @@ -29,8 +29,10 @@ def test_simple_execution(run_component, tmp_path): assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} - column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] - assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"] + column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] + assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal", + "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", + "cellbender_droplet_efficiency"] for key in output_json_dict.keys(): assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"} @@ -43,8 +45,8 @@ def test_set_filters(run_component, tmp_path): run_component( [ - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu", - "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu", + "--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu", "--output", output_json_path, "--sample_id_key", "sample_id", "--min_total_counts", "10", @@ -62,7 +64,8 @@ def test_set_filters(run_component, tmp_path): assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"} column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]] - assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"] + assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "cellbender_background_fraction", + "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"] for key in output_json_dict.keys(): assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"} diff --git a/src/utils/setup_logger.py b/src/utils/setup_logger.py new file mode 100644 index 0000000..3ca1cdb --- /dev/null +++ b/src/utils/setup_logger.py @@ -0,0 +1,12 @@ +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger