diff --git a/.gitignore b/.gitignore index 25cf7cdc..d953d783 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ *.egg-info __pycache__/ .ipynb_checkpoints -results/ +/results/ +/temp/ .snakemake logs/* diff --git a/common b/common index 67ef9612..a1409176 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 67ef9612fce1bbf22e07971e55a9858e8dd2dfa5 +Subproject commit a1409176da317c8a7c9c65d1488bcf3b5afee3d6 diff --git a/scripts/create_test_resources.sh b/scripts/create_test_resources.sh index 01ce45ae..0b113f3d 100644 --- a/scripts/create_test_resources.sh +++ b/scripts/create_test_resources.sh @@ -9,34 +9,34 @@ cd "$REPO_ROOT" set -e DATASET_ID="10x_xenium/2023_10x_mouse_brain_xenium" -RAW_OUT="resources/tmp_datasets_raw/$DATASET_ID" -RESOURCES_OUT="resources/datasets/10x_xenium/$DATASET_ID" +TMP_DIR="temp/datasets/$DATASET_ID" +OUT_DIR="resources_test/common/2023_10x_mouse_brain_xenium" # https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip # https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_2/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip # https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_3/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip -rep1="$RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs" -rep2="$RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs" -rep3="$RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs" +rep1="$TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs" +rep2="$TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs" +rep3="$TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs" if [ ! 
-d "$rep1" ]; then wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip \ - -O $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip - unzip $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip -d $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_1 + -O $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip + unzip $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip -d $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1 fi if [ ! -d "$rep2" ]; then wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_2/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip \ - -O $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip - unzip $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip -d $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_2 + -O $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip + unzip $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip -d $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2 fi if [ ! 
-d "$rep3" ]; then wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_3/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip \ - -O $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip - unzip $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip -d $RAW_OUT/Xenium_V1_FF_Mouse_Brain_MultiSection_3 + -O $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip + unzip $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip -d $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3 fi # convert to zarr and concatenate @@ -47,7 +47,7 @@ viash run src/data_loaders/download_10x_xenium/config.vsh.yaml -- \ --replicate_id rep1 \ --replicate_id rep2 \ --replicate_id rep3 \ - --output $RAW_OUT/full_dataset.zarr \ + --output $TMP_DIR/full_dataset.zarr \ --dataset_id "$DATASET_ID" \ --dataset_name "Xenium V1 Fresh Frozen Mouse Brain" \ --dataset_url "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" \ @@ -57,8 +57,8 @@ viash run src/data_loaders/download_10x_xenium/config.vsh.yaml -- \ # crop the region viash run src/data_processors/crop_region/config.vsh.yaml -- \ - --input $RAW_OUT/full_dataset.zarr \ - --output $RESOURCES_OUT/dataset.zarr \ + --input "$TMP_DIR/full_dataset.zarr" \ + --output "$OUT_DIR/dataset.zarr" \ --replicate_id "rep1" \ --min_x 10000 \ --max_x 12000 \ @@ -74,3 +74,8 @@ viash run src/data_processors/crop_region/config.vsh.yaml -- \ --max_x 12000 \ --min_y 10000 \ --max_y 12000 + +aws s3 sync \ + "resources_test/common/2023_10x_mouse_brain_xenium" \ + "s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium" \ + --delete --dryrun diff --git a/src/data_processors/crop_region/config.vsh.yaml b/src/data_processors/crop_region/config.vsh.yaml index 70bb93ad..5828b054 100644 --- a/src/data_processors/crop_region/config.vsh.yaml +++ b/src/data_processors/crop_region/config.vsh.yaml @@ -10,6 +10,7 @@ arguments: name: --output required: true description: The output file 
to write the cropped data to. + direction: output - type: string name: --replicate_id required: false diff --git a/src/data_processors/crop_region/script.py b/src/data_processors/crop_region/script.py index e7496aed..74979371 100644 --- a/src/data_processors/crop_region/script.py +++ b/src/data_processors/crop_region/script.py @@ -2,28 +2,33 @@ ## VIASH START par = { - "input": "resources/datasets/10x_xenium/10x_fresh_frozen_mouse_brain_replicates/dataset.zarr", - "output": "output.zarr", - "replicate": ["rep1"], - "min_x": [10000], - "max_x": [12000], - "min_y": [10000], - "max_y": [12000] + "input": "temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium/full_dataset.zarr", + "output": "resources_test/common/2023_10x_mouse_brain_xenium/dataset.zarr", + "replicate_id": ["rep1", "rep2", "rep3"], + "min_x": [10000, 10000, 10000], + "max_x": [12000, 12000, 12000], + "min_y": [10000, 10000, 10000], + "max_y": [12000, 12000, 12000], } ## VIASH END sdata = sd.read_zarr(par["input"]) -for i, replicate_id in enumerate(par["replicate"]): +sdata_out = [] + +for i, replicate_id in enumerate(par["replicate_id"]): min_x = par["min_x"][i] max_x = par["max_x"][i] min_y = par["min_y"][i] max_y = par["max_y"][i] - sdata = sdata.query.bounding_box( + sdata_query = sdata.query.bounding_box( axes=["x", "y"], min_coordinate=[min_x, min_y], max_coordinate=[max_x, max_y], target_coordinate_system=f"{replicate_id}_global", ) + sdata_out.append(sdata_query) + +sdata_output = sd.concatenate(sdata_out) -sdata.write_zarr(par["output"]) +sdata_output.write(par["output"]) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py deleted file mode 100644 index 350d5564..00000000 --- a/src/data_processors/process_dataset/script.py +++ /dev/null @@ -1,86 +0,0 @@ -import sys -import random -import numpy as np -import anndata as ad -import openproblems as op - -## VIASH START -par = { - 'input': 'resources_test/common/pancreas/dataset.h5ad', - 'method': 
'batch', - 'seed': None, - 'obs_batch': 'batch', - 'obs_label': 'cell_type', - 'output_train': 'train.h5ad', - 'output_test': 'test.h5ad', - 'output_solution': 'solution.h5ad' -} -meta = { - 'resources_dir': 'target/executable/data_processors/process_dataset', - 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' -} -## VIASH END - -# import helper functions -sys.path.append(meta['resources_dir']) -from subset_h5ad_by_format import subset_h5ad_by_format - -config = op.project.read_viash_config(meta["config"]) - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - -print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) - -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] - -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} - -print(">> Creating train data", flush=True) -output_train = subset_h5ad_by_format( - adata[[not x for x in is_test]], - config, - "output_train", - slot_mapping -) - -print(">> Creating test data", flush=True) -output_test = subset_h5ad_by_format( - adata[is_test], - config, - "output_test", - slot_mapping -) - -print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format( - adata[is_test], - config, - "output_solution", - slot_mapping -) - -print(">> 
Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"])