Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cellbender step #5

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
50 changes: 46 additions & 4 deletions resources_test_scripts/qc_sample_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ OUT_DIR=resources_test/qc_sample_data
[ ! -d "$OUT_DIR" ] && mkdir -p "$OUT_DIR"

# fetch/create h5mu from somewhere
cat > /tmp/params.yaml <<EOF
cat > /tmp/params_create_h5mu.yaml <<EOF
param_list:
- id: sample_one
input_id: sample_one
Expand All @@ -24,13 +24,55 @@ nextflow run openpipelines-bio/openpipeline \
-r 2.0.0 \
-main-script target/nextflow/metadata/add_id/main.nf \
-profile docker \
-params-file /tmp/params.yaml \
-params-file /tmp/params_create_h5mu.yaml \
-resume

cat > /tmp/params_subset.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.h5mu'
number_of_observations: 10000
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF

# subset h5mus
nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.0.0 \
-main-script target/nextflow/filter/subset_h5mu/main.nf \
-profile docker \
-params-file /tmp/params_subset.yaml \
-resume

# generate cellbender out for testing
cat > /tmp/params_cellbender.yaml <<EOF
param_list:
- id: sample_one
input: resources_test/qc_sample_data/sample_one.qc.h5mu
- id: sample_two
input: resources_test/qc_sample_data/sample_two.qc.h5mu
output: '\$id.qc.cellbender.h5mu'
epochs: 5
output_compression: gzip
publish_dir: "$OUT_DIR"
EOF

nextflow run openpipelines-bio/openpipeline \
-latest \
-r 2.0.0 \
-main-script target/nextflow/correction/cellbender_remove_background/main.nf \
-profile docker \
-params-file /tmp/params_cellbender.yaml \
-resume

# generate json for testing
viash run src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml --engine docker -- \
--input "$OUT_DIR"//sample_one.qc.h5mu \
--input "$OUT_DIR"/sample_two.qc.h5mu \
--input "$OUT_DIR"/sample_one.qc.cellbender.h5mu \
--input "$OUT_DIR"/sample_two.qc.cellbender.h5mu \
--output "$OUT_DIR"/dataset.json

# copy to s3
Expand Down
14 changes: 14 additions & 0 deletions src/ingestion_qc/generate_report/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@ argument_groups:
default: "ribosomal"
description: |
In which .var slot to store a boolean array corresponding the ribosomal genes.
- name: "--obs_cell_probability"
type: string
required: false
default: "cellbender_cell_probability"
description: |
In which .obs slot to store the cell probability.
- name: "--cellbender_epochs"
type: integer
required: false
description: Number of epochs to train cellbender.
default: 150

- name: Outputs
arguments:
Expand All @@ -78,6 +89,9 @@ dependencies:
- name: workflows/qc/qc
alias: qc_wf
repository: openpipeline
- name: correction/cellbender_remove_background
alias: cellbender
repository: openpipeline
- name: ingestion_qc/h5mu_to_qc_json
- name: ingestion_qc/generate_html
runners:
Expand Down
21 changes: 16 additions & 5 deletions src/ingestion_qc/generate_report/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,25 @@ workflow run_wf {
[id, state + [_meta: [join_id: id]]]
}

// run cellbender
| cellbender.run(
fromState: [
id: "id",
input: "input",
obs_cell_probability: "obs_cell_probability",
epochs: "cellbender_epochs",
],
toState: ["output"]
)

// run qc on each sample
| qc_wf.run(
fromState: [
"id",
"input",
"var_gene_names",
"var_name_mitochondrial_genes",
"var_name_ribosomal_genes"
id: "id",
input: "output",
var_gene_names: "var_gene_names",
var_name_mitochondrial_genes: "var_name_mitochondrial_genes",
var_name_ribosomal_genes: "var_name_ribosomal_genes"
],
toState: ["output"]
)
Expand Down
6 changes: 3 additions & 3 deletions src/ingestion_qc/generate_report/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@ viash ns build --setup cb --parallel

cat > /tmp/params.yaml <<EOF
param_list:
- input: resources_test/sample_data/sample_1.qc.output.h5mu
- input: resources_test/qc_sample_data/sample_one.qc.h5mu
id: sample_one
- input: resources_test/sample_data/sample_2.qc.output.h5mu
- input: resources_test/qc_sample_data/sample_two.qc.h5mu
id: sample_two
output_qc_json: output_qc.json
output_html: output_report.html
EOF


nextflow run . \
-main-script target/nextflow/ingestion_qc/generate_report/main.nf \
-params-file /tmp/params.yaml \
-profile docker \
--publish_dir test_results \
--resume
6 changes: 6 additions & 0 deletions src/ingestion_qc/h5mu_to_qc_json/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ argument_groups:
multiple: true
description: The keys in the h5mu .obs to include in the output JSON
default: ["total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
- name: --cellbender_obs_keys
type: string
multiple: true
description: The cellbender keys in the h5mu .obs to include in the output JSON
default: ["cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size",
"cellbender_droplet_efficiency"]
- name: --cellranger_metrics_uns_key
type: string
description: The key in the h5mu file .uns that contains the cellranger metrics
Expand Down
33 changes: 29 additions & 4 deletions src/ingestion_qc/h5mu_to_qc_json/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
## VIASH START
# inputs = list(Path("data/sample_data/sample_data").glob("*.h5mu"))
# output = "data/sample-data.json"
inputs = list(Path("resources_test/qc_sample_data").glob("*.h5mu"))
inputs = list(Path("resources_test/qc_sample_data").glob("*.qc.cellbender.h5mu"))
output = "tmp.json"
par = {
"input": sorted([str(x) for x in inputs]),
Expand All @@ -21,8 +21,13 @@
"num_nonzero_vars",
"fraction_mitochondrial",
"fraction_ribosomal",
"pct_of_counts_in_top_50_vars",
],
"cellbender_obs_keys": [
"cellbender_background_fraction",
"cellbender_cell_probability",
"cellbender_cell_size",
"cellbender_droplet_efficiency",
],
"cellranger_metrics_uns_key": "metrics_cellranger",
}
i = 0
Expand Down Expand Up @@ -62,6 +67,7 @@ def transform_df(df):

def main(par):
cell_stats_dfs = []
cellbender_cell_stats_dfs = []
sample_stats_dfs = []
metrics_cellranger_dfs = []

Expand Down Expand Up @@ -93,6 +99,7 @@ def main(par):
missing_keys = [key for key in par["obs_keys"] if key not in mod_obs.columns]
if missing_keys:
raise ValueError(f"Missing keys in obs: {', '.join(missing_keys)}")


sample_id = (
mod_obs[par["sample_id_key"]].tolist()
Expand All @@ -106,7 +113,7 @@ def main(par):
**{key: mod_obs[key] for key in par["obs_keys"]},
}
)

sample_summary_stats = pd.DataFrame(
{
"sample_id": pd.Categorical([sample_id[0]]),
Expand Down Expand Up @@ -147,18 +154,36 @@ def main(par):
metrics["sample_id"] = [sample_id[0]]
metrics_cellranger_dfs.append(metrics)

if par["cellbender_obs_keys"]:
missing_cellbender_keys = [key for key in par["cellbender_obs_keys"] if key not in mod_obs.columns]
if missing_cellbender_keys:
raise ValueError(f"Missing keys in obs: {', '.join(missing_cellbender_keys)}. Run cellbenbder first.")

cellbender_rna_stats = pd.DataFrame(
{
"sample_id": pd.Categorical(sample_id),
**{key: mod_obs[key] for key in par["cellbender_obs_keys"]},
}
)

else:
cellbender_rna_stats = pd.DataFrame()

cell_stats_dfs.append(cell_rna_stats)
cellbender_cell_stats_dfs.append(cellbender_rna_stats)
sample_stats_dfs.append(sample_summary_stats)

combined_cell_stats = pd.concat(cell_stats_dfs, ignore_index=True)
combined_cellbender_stats = pd.concat(cellbender_cell_stats_dfs, ignore_index=True)
combined_sample_stats = pd.concat(sample_stats_dfs, ignore_index=True)
combined_metrics_cellranger = pd.concat(metrics_cellranger_dfs, ignore_index=True)

for df in [combined_cell_stats, combined_sample_stats, combined_metrics_cellranger]:
for df in [combined_cell_stats, combined_cellbender_stats, combined_sample_stats, combined_metrics_cellranger]:
df["sample_id"] = pd.Categorical(df["sample_id"])

output = {
"cell_rna_stats": transform_df(combined_cell_stats),
"cellbender_rna_stats": transform_df(combined_cellbender_stats),
"sample_summary_stats": transform_df(combined_sample_stats),
"metrics_cellranger_stats": transform_df(combined_metrics_cellranger),
}
Expand Down
19 changes: 11 additions & 8 deletions src/ingestion_qc/h5mu_to_qc_json/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def test_simple_execution(run_component, tmp_path):

run_component(
[
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
"--output", output_json_path,
]
)
Expand All @@ -27,10 +27,13 @@ def test_simple_execution(run_component, tmp_path):
with open(output_json_path, "r") as f:
output_json_dict = json.load(f)

assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}
assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"}

column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
assert column_names == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]
column_names_cell = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
assert column_names_cell == ["sample_id", "total_counts", "num_nonzero_vars", "fraction_mitochondrial", "fraction_ribosomal"]

column_names_cellbender = [col["name"] for col in output_json_dict["cellbender_rna_stats"]["columns"]]
assert column_names_cellbender == ["sample_id", "cellbender_background_fraction", "cellbender_cell_probability", "cellbender_cell_size", "cellbender_droplet_efficiency"]

for key in output_json_dict.keys():
assert output_json_dict[key].keys() == {"num_rows", "num_cols", "columns"}
Expand All @@ -43,8 +46,8 @@ def test_set_filters(run_component, tmp_path):

run_component(
[
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_one.qc.cellbender.h5mu",
"--input", meta["resources_dir"] + "/resources_test/qc_sample_data/sample_two.qc.cellbender.h5mu",
"--output", output_json_path,
"--sample_id_key", "sample_id",
"--min_total_counts", "10",
Expand All @@ -59,7 +62,7 @@ def test_set_filters(run_component, tmp_path):
with open(output_json_path, "r") as f:
output_json_dict = json.load(f)

assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "metrics_cellranger_stats"}
assert output_json_dict.keys() == {"cell_rna_stats", "sample_summary_stats", "cellbender_rna_stats", "metrics_cellranger_stats"}

column_names = [col["name"] for col in output_json_dict["cell_rna_stats"]["columns"]]
assert column_names == ["sample_id", "total_counts", "num_nonzero_vars"]
Expand Down