Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cellbender step #5

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion _viash.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ repositories:
- name: openpipeline
repo: openpipelines-bio/openpipeline
type: github
tag: main_build
tag: 2.0.0

info:
test_resources:
Expand Down
52 changes: 47 additions & 5 deletions resources_test_scripts/qc_sample_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ OUT_DIR=resources_test/qc_sample_data
[ ! -d "$OUT_DIR" ] && mkdir -p "$OUT_DIR"

# fetch/create h5mu from somewhere
cat > /tmp/params.yaml <<EOF
cat > /tmp/params_create_h5mu.yaml <<EOF
param_list:
- id: sample_one
input_id: sample_one
Expand All @@ -24,18 +24,60 @@ nextflow run openpipelines-bio/openpipeline \
-r 2.0.0 \
-main-script target/nextflow/metadata/add_id/main.nf \
-profile docker \
-params-file /tmp/params.yaml \
-params-file /tmp/params_create_h5mu.yaml \
-resume

cat > /tmp/params_subset.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.h5mu'
number_of_observations: 10000
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF

# subset h5mus
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.0.0 \
-main-script target/nextflow/filter/subset_h5mu/main.nf \
-profile docker \
-params-file /tmp/params_subset.yaml \
-resume

# generate cellbender out for testing
cat > /tmp/params_cellbender.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.cellbender.h5mu'
epochs: 5
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF

nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.0.0 \
-main-script target/nextflow/correction/cellbender_remove_background/main.nf \
-profile docker \
-params-file /tmp/params_cellbender.yaml \
-resume

# generate json for testing
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
--input "$OUT_DIR"//sample_one.qc.h5mu \
--input "$OUT_DIR"/sample_two.qc.h5mu \
--input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
--input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
--output "$OUT_DIR"/dataset.json

# copy to s3
aws s3 sync \
--profile di \
resources_test/qc_sample_data \
s3://openpipelines-bio/openpipeline_incubator/resources_test/qc_sample_data \
--delete --dryrun
--delete
2 changes: 1 addition & 1 deletion src/ingestion_qc/generate_html/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ engines:
- type: docker
run: |
npm install -g pnpm@latest-10 \
&& cd /opt && git clone https://github.com/openpipelines-bio/incubator_ingestion_qc.git \
&& cd /opt && git clone -b ambient-rna https://github.com/openpipelines-bio/incubator_ingestion_qc.git \
&& cd incubator_ingestion_qc && pnpm install
runners:
- type: executable
Expand Down
19 changes: 18 additions & 1 deletion src/ingestion_qc/generate_report/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,18 @@ argument_groups:
default: "ribosomal"
description: |
In which .var slot to store a boolean array corresponding to the ribosomal genes.

- name: Cellbender options
arguments:
- name: "--run_cellbender"
type: boolean
required: false
description: Whether to run cellbender or not.
default: false
- name: "--cellbender_epochs"
type: integer
required: false
description: Number of epochs to train cellbender.
default: 150
- name: Outputs
arguments:
- name: --output
Expand All @@ -68,17 +79,23 @@ argument_groups:
direction: output
description: The output HTML report
example: path/to/file.html

resources:
- type: nextflow_script
entrypoint: run_wf
path: main.nf

dependencies:
- name: metadata/add_id
repository: openpipeline
- name: workflows/qc/qc
alias: qc_wf
repository: openpipeline
- name: correction/cellbender_remove_background
alias: cellbender
repository: openpipeline
- name: ingestion_qc/h5mu_to_qc_json
- name: ingestion_qc/generate_html

runners:
- type: nextflow
21 changes: 16 additions & 5 deletions src/ingestion_qc/generate_report/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,25 @@ workflow run_wf {
[id, state + [_meta: [join_id: id]]]
}

// run cellbender
| cellbender.run(
runIf: {id, state -> state.run_cellbender},
fromState: [
id: "id",
input: "input",
epochs: "cellbender_epochs",
],
toState: ["output"]
)

// run qc on each sample
| qc_wf.run(
fromState: [
"id",
"input",
"var_gene_names",
"var_name_mitochondrial_genes",
"var_name_ribosomal_genes"
id: "id",
input: "output",
var_gene_names: "var_gene_names",
var_name_mitochondrial_genes: "var_name_mitochondrial_genes",
var_name_ribosomal_genes: "var_name_ribosomal_genes"
],
toState: ["output"]
)
Expand Down
10 changes: 6 additions & 4 deletions src/ingestion_qc/generate_report/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,19 @@ viash ns build --setup cb --parallel

cat > /tmp/params.yaml <<EOF
param_list:
- input: resources_test/sample_data/sample_1.qc.output.h5mu
- input: resources_test/qc_sample_data/sample_one.qc.h5mu
id: sample_one
- input: resources_test/sample_data/sample_2.qc.output.h5mu
- input: resources_test/qc_sample_data/sample_two.qc.h5mu
id: sample_two
cellbender_epochs: 5
run_cellbender: true
output_qc_json: output_qc.json
output_html: output_report.html
EOF


nextflow run . \
-main-script target/nextflow/ingestion_qc/generate_report/main.nf \
-params-file /tmp/params.yaml \
-profile docker \
--publish_dir test_results \
--resume
--publish_dir test_results
17 changes: 12 additions & 5 deletions src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,25 @@ argument_groups:
multiple: true
description: The keys in the h5mu .obs to include in the output JSON
default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
- name: --cellbender_obs_keys
type: string
multiple: true
description: The cellbender keys in the h5mu .obs to include in the output JSON
default: ["cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size",
"cellbender_droplet_efficiency"]
- name: --cellranger_metrics_uns_key
type: string
description: The key in the h5mu file .uns that contains the cellranger metrics
default: metrics_cellranger
resources:
- type: python_script
path: script.py
test_resources:
- type: python_script
path: test.py
- type: file
path: /resources_test
- path: /src/utils/setup_logger.py
# test_resources:
# - type: python_script
# path: test.py
# - type: file
# path: /resources_test
engines:
- type: docker
image: python:3.12-slim
Expand Down
24 changes: 20 additions & 4 deletions src/ingestion_qc/h5mu_to_qc_json/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from pathlib import Path
import anndata as ad
import h5py
import sys

## VIASH START
# inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu"))
# output = "data/sample-data.json"
inputs = list(Path("resources_test/qc_sample_data").glob("*.h5mu"))
inputs = list(Path("resources_test/qc_sample_data").glob("*.qc.cellbender.h5mu"))
output = "tmp.json"
par = {
"input": sorted([str(x) for x in inputs]),
Expand All @@ -21,14 +22,23 @@
"num_nonzero_vars",
"fraction_mitochondrial",
"fraction_ribosomal",
"pct_of_counts_in_top_50_vars",
],
"cellbender_obs_keys": [
"cellbender_background_fraction",
"cellbender_cell_probability",
"cellbender_cell_size",
"cellbender_droplet_efficiency",
],
"cellranger_metrics_uns_key": "metrics_cellranger",
}
i = 0
mudata_file = par["input"][i]
## VIASH END

sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger

logger = setup_logger()

def transform_df(df):
"""Transform a DataFrame into the annotation object format."""
Expand Down Expand Up @@ -93,6 +103,11 @@ def main(par):
missing_keys = [key for key in par["obs_keys"] if key not in mod_obs.columns]
if missing_keys:
raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}")

if par["cellbender_obs_keys"]:
missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns]
if missing_cellbender_keys:
logger.info(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbender first to include these metrics.")

sample_id = (
mod_obs[par["sample_id_key"]].tolist()
Expand All @@ -104,9 +119,10 @@ def main(par):
{
"sample_id": pd.Categorical(sample_id),
**{key: mod_obs[key] for key in par["obs_keys"]},
**{key: mod_obs[key] for key in par["cellbender_obs_keys"] if par["cellbender_obs_keys"]},
}
)

sample_summary_stats = pd.DataFrame(
{
"sample_id": pd.Categorical([sample_id[0]]),
Expand Down Expand Up @@ -146,7 +162,7 @@ def main(par):
metrics[col] = pd.to_numeric(metrics[col], errors="coerce")
metrics["sample_id"] = [sample_id[0]]
metrics_cellranger_dfs.append(metrics)

cell_stats_dfs.append(cell_rna_stats)
sample_stats_dfs.append(sample_summary_stats)

Expand Down
17 changes: 10 additions & 7 deletions src/ingestion_qc/h5mu_to_qc_json/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def test_simple_execution(run_component, tmp_path):

run_component(
[
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
"--output", output_json_path,
]
)
Expand All @@ -29,8 +29,10 @@ def test_simple_execution(run_component, tmp_path):

assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}

column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal",
"cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size",
"cellbender_droplet_efficiency"]

for key in output_json_dict.keys():
assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"}
Expand All @@ -43,8 +45,8 @@ def test_set_filters(run_component, tmp_path):

run_component(
[
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
"--output", output_json_path,
"--sample_id_key", "sample_id",
"--min_total_counts", "10",
Expand All @@ -62,7 +64,8 @@ def test_set_filters(run_component, tmp_path):
assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}

column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"]
assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "cellbender_background_fraction",
"cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"]

for key in output_json_dict.keys():
assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"}
Expand Down
12 changes: 12 additions & 0 deletions src/utils/setup_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def setup_logger():
    """Configure the root logger to write INFO-level records to stdout.

    The root logger's level is set to ``logging.INFO`` and a
    ``StreamHandler`` bound to the current ``sys.stdout`` is attached, using
    the format ``"%(asctime)s %(levelname)-8s %(message)s"``.

    The function is idempotent: if the root logger already has a stream
    handler writing to the current stdout, no further handler is added, so
    repeated calls do not cause every record to be printed several times.

    Returns:
        logging.Logger: the configured root logger.
    """
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Handlers on the root logger accumulate for the life of the process;
    # attaching a second stdout handler would duplicate every log line.
    has_stdout_handler = any(
        isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is stdout
        for handler in logger.handlers
    )
    if not has_stdout_handler:
        console_handler = logging.StreamHandler(stdout)
        log_formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger