From 72fd831153cf8db03d67286e7786272f90e7f67a Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 26 Aug 2024 17:49:53 +0200 Subject: [PATCH] First work in creating a first dataset loader Co-authored-by: LouisK92 --- .gitignore | 3 +- README.md | 462 +++++++++++++----- src/api/comp_control_method.yaml | 37 -- src/api/comp_data_loader.yaml | 23 + src/api/comp_data_processor.yaml | 31 -- src/api/comp_method.yaml | 28 -- src/api/comp_metric.yaml | 28 -- src/api/file_common_dataset.yaml | 72 --- src/api/file_prediction.yaml | 26 - src/api/file_raw_data.yaml | 185 +++++++ src/api/file_score.yaml | 31 -- src/api/file_solution.yaml | 73 --- src/api/file_test_h5ad.yaml | 45 -- src/api/file_train_h5ad.yaml | 49 -- .../download_10x_xenium/config.vsh.yaml | 32 ++ .../download_10x_xenium/script.py | 60 +++ .../process_dataset/config.vsh.yaml | 34 -- 17 files changed, 649 insertions(+), 570 deletions(-) delete mode 100644 src/api/comp_control_method.yaml create mode 100644 src/api/comp_data_loader.yaml delete mode 100644 src/api/comp_data_processor.yaml delete mode 100644 src/api/comp_method.yaml delete mode 100644 src/api/comp_metric.yaml delete mode 100644 src/api/file_common_dataset.yaml delete mode 100644 src/api/file_prediction.yaml create mode 100644 src/api/file_raw_data.yaml delete mode 100644 src/api/file_score.yaml delete mode 100644 src/api/file_solution.yaml delete mode 100644 src/api/file_test_h5ad.yaml delete mode 100644 src/api/file_train_h5ad.yaml create mode 100644 src/data_loaders/download_10x_xenium/config.vsh.yaml create mode 100644 src/data_loaders/download_10x_xenium/script.py delete mode 100644 src/data_processors/process_dataset/config.vsh.yaml diff --git a/.gitignore b/.gitignore index d86eb196..add10c06 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ singularity_container/ target .nextflow -.resources_test \ No newline at end of file +resources_test +resources \ No newline at end of file diff --git a/README.md b/README.md index 1f7d70b5..bdb8be30 100644 --- a/README.md +++ b/README.md @@ -1,115 +1,347 @@ -# txsim-pipeline -Pipeline using txsim to compare single cell and spatial transcriptomics data - - -# Installation -In a clean `conda` environment, run in the terminal: - -```git clone https://github.com/theislab/txsim-pipeline.git``` - -In your `conda` environment, use the `conda-forge` channel to install: - -- `mamba` -- `snakemake` -- `anndata` - -Note: You can also use the `bioconda` channel to install snakemake - -## Dependencies - -This pipeline uses [snakemake](https://snakemake.readthedocs.io/en/stable/index.html) which will generate the conda environments needed for each job. However, the `txsim` package is still in developement at this time. In order to properly use the pipeline, follow these steps: -- Clone the [txsim repo](https://github.com/theislab/txsim.git) (Note: do not clone inside the `txsim-pipeline` folder) -- In each folder in `envs`, change the file path `"/mnt/c/..."` to the path of the cloned `txsim` repo - - -# Running the pipeline - -## Setup -Before you can run your pipeline, make sure to edit the config file - -Most important is to change all the paths to your desired `test` data and results folder. - -Once this is done, try running: - -```snakemake -n``` - -for a dry run to ensure everything was installed correctly. Since this only calculates the dependencies and does not actually run any jobs, you can usually run this commnad on a submit node for a cluster if so desired. 
- -## Adding data -To add or rename a dataset, simply follow the convention of `test` for the folder, image, spots, and scRNAseq data paths. The `segmented_image` path is optional if one has manually segmented the nuclei (i.e. using ImageJ). - -## Changing the workflow -All supported workflows are shown in the `full_config.yaml` file in `configs`. The names of the batches are arbitrary. The names of the method groups (such as `segmentation`) are also arbitrary but must match the names in the `workflow` key. For each method group, group-wide parameters should be put in the key `[GROUP]_params`. The names of the methods are not arbitrary and are case sensitive. In general, the format of new parameters should follow that of the `full_config.yaml`. - -All parameters should be entered as a list, even if it is 1 value. If a method takes a dictionary as a parameter, the config should have a list of dictionaries. This can also be done using the "-" character for a multi-line list - -Note: the `custom` method in segmentation will use the `segmented_image` key in `DATA_SCENARIOS` - -## Details on combinations - -The ConfigParser will generate all possible combinations of methods and parameters within a given batch. Doing a `snakemake` dry-run (using the `-n` flag) will show all final files to be generated. A dictionary of parameters for a given method and id number will be generated in the results folder, named `param_dict.csv`. This should be saved if needed, since it will be overwritten with each snakemake run. - -## Running in the terminal -The pipeline can be run using the `snakemake` command. For example: - -```snakemake --cores all --use-conda``` - -will run the pipeline using all provided cores and create the conda environments for each job. It is a good idea to add the `-n` flag for a dry run to get a preview of what will actually be run. - -## Cluster -To use this pipeline on a cluster it is recommended to use a cluster profile. An example for a SLURM cluster is in `configs/cluster/config.yaml`. To use this profile with `sbatch` one could use the following in a `.sh` file: - -``` -#!/bin/bash - -#SBATCH -o logs/snakemake_output_%j.txt -#SBATCH -e logs/snakemake_output_%j.txt -#SBATCH -J tx_snakemake -#SBATCH -c 1 -#SBATCH --mem=4G -#SBATCH -t 3-00:00:00 -#SBATCH --nice=10000 -#SBATCH -p cpu_p - -mkdir logs -mkdir ./logs/$SLURM_JOB_ID - -conda activate tx - -export TMPDIR= -export SINGULARITY_TMPDIR= -export APPTAINER_TMPDIR= -export MAIN_SMK_JOB_ID=$SLURM_JOB_ID - -snakemake --profile configs/cluster/ -``` - -Here, the `snakemake` command will be run on a compute node so it does not terminate if the user closes the submit node. The snakemake job will then in turn submit more jobs to the cluster using the arguments specified in the cluster profile. In this case, the `snakemake` command itself will use the sbatch arguments, i.e. a time limit of 3 days. However, the sbatch arguments for each job will be specified by the `Snakefile` and the cluster `config.yaml` (i.e. the job `watershed` will have a time limit of 12 hours). - -Tip for cluster users: if the snakemake job unexpectedly terminates, make sure to unlock the working directory before re-running the command. - -# Input and Intermediate files - -The pipeline will generate many intermediate files in the `RESULTS` folder specified in the config. 
Files will follow this naming convention:
-`[type]_[method1]-[id]_[method2]-[id].[ext]`
-
-- `type`: the kind of data in the file, such as "segments" or "counts"
-- `method2`: the method used to generate that file, such as `cellpose`. This will be the last method listed
-- `method1`: these are the previous methods used to generate that data. For example, an `assignments.csv` file will probably have come from a `segments.tif` file and its associated methods
-- `id`: this is the specific id code for a method that details its parameters. This information will be in the `param_dict.csv` made upon running the `Snakefile`
-
-## Format of input files
-
-- `image`: The DAPI stain of the tissue as a single channel `tif`
-- `sc_data`: The scRNA-seq data as an AnnData `.h5ad` file with the cell type in the key `adata.obs['celltype']`
-- `molecules`: The spatial transcriptomics data as a `.csv` with 4 columns in this order: 1) Gene name, 2) X-coord, 3) Y-coord, 4) Cell type. The x and y coordinates should be in pixel coordinates of the DAPI image.
-- `segmented_image`: Used with the `custom` segmentation key- should be a segmented image with background = 0. If the cells are binary they should be seperated (unique values will be assigned during the `segmentation` step).
-
-## Format of intermediate files
-
-- `segments.tif`: a `.tif` representing segmented cells as a matrix. Each value in the matrix corresponds to a unique cell with 0 being background(Note: does not always start from 1). This is an overlay of the original DAPI stain.
-- `areas.csv`: the area of each cell from the `segments.tif` in pixels
-- `assignments.csv`: similar to `molecules.csv` but now with a column for the id of the cell each molecule is assigned to
-- `celltypes.csv`: the type for each cell given an assignment algorithm- not generated by every assignment method
-- `counts.h5ad`: The normalized AnnData count matrix for the spatial data. Raw counts stored in `adata.layers['raw_counts']`. Cell area stored in `adata.obs['area']`, cell type stored in `adata.obs['celltype']` and `adata.obs['prior_celltype']`
-- `metrics.csv`: Selected metrics calculated for a given `counts.h5ad`
+# Template
+
+
+
+A one sentence summary of purpose and methodology. Used for creating
+overview tables.
+
+Repository:
+[openproblems-bio/task_preprocessing_imagingbased_st](https://github.com/openproblems-bio/task_preprocessing_imagingbased_st)
+
+## Description
+
+Provide a clear and concise description of your task, detailing the
+specific problem it aims to solve. Outline the input data types, the
+expected output, and any assumptions or constraints. Be sure to explain
+any terminology or concepts that are essential for understanding the
+task.
+
+Explain the motivation behind your proposed task. Describe the
+biological or computational problem you aim to address and why it’s
+important. Discuss the current state of research in this area and any
+gaps or challenges that your task could help address. This section
+should convince readers of the significance and relevance of your task.
+ +## Authors & contributors + +| name | roles | +|:---------|:-------------------| +| John Doe | author, maintainer | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common Dataset") + comp_data_processor[/"Data processor"/] + file_solution("Solution") + file_test_h5ad("Test data") + file_train_h5ad("Training data") + comp_control_method[/"Control Method"/] + comp_metric[/"Metric"/] + comp_method[/"Method"/] + file_prediction("Predicted data") + file_score("Score") + file_common_dataset---comp_data_processor + comp_data_processor-->file_solution + comp_data_processor-->file_test_h5ad + comp_data_processor-->file_train_h5ad + file_solution---comp_control_method + file_solution---comp_metric + file_test_h5ad---comp_control_method + file_test_h5ad---comp_method + file_train_h5ad---comp_control_method + file_train_h5ad---comp_method + comp_control_method-->file_prediction + comp_metric-->file_score + comp_method-->file_prediction + file_prediction---comp_metric +``` + +## File format: Common Dataset + +A subset of the common dataset. + +Example file: `resources_test/common/pancreas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_type"]` | `string` | Cell type information. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
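+
+As a quick illustration (not part of the formal spec), a file in this
+format could be opened with `anndata` and checked against the table
+above:
+
+``` python
+import anndata as ad
+
+adata = ad.read_h5ad("resources_test/common/pancreas/dataset.h5ad")
+
+# required expression layers
+assert {"counts", "normalized"} <= set(adata.layers)
+# required cell and feature annotations
+assert {"cell_type", "batch"} <= set(adata.obs.columns)
+assert {"hvg", "hvg_score"} <= set(adata.var.columns)
+assert "X_pca" in adata.obsm
+# required dataset-level metadata
+assert {"dataset_id", "normalization_id"} <= set(adata.uns)
+```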
+ +## Component type: Data processor + +A data processor. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A subset of the common dataset. | +| `--output_train` | `file` | (*Output*) The training data in h5ad format. | +| `--output_test` | `file` | (*Output*) The subset of molecules used for the test dataset. | +| `--output_solution` | `file` | (*Output*) The solution for the test data. | + +
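+
+A processor script typically follows the viash conventions used
+elsewhere in this repo (a `par` dict filled in between
+`## VIASH START`/`## VIASH END`). The random split below is only a
+hedged sketch; the real processor may split by batch or another scheme:
+
+``` python
+import anndata as ad
+import numpy as np
+
+## VIASH START
+par = {
+    "input": "resources_test/common/pancreas/dataset.h5ad",
+    "output_train": "train.h5ad",
+    "output_test": "test.h5ad",
+    "output_solution": "solution.h5ad",
+}
+## VIASH END
+
+adata = ad.read_h5ad(par["input"])
+
+# hold out ~20% of the cells for testing (illustrative seed and fraction)
+rng = np.random.default_rng(0)
+is_test = rng.random(adata.n_obs) < 0.2
+
+def with_label(sub):
+    # keep the required obs slots, renaming cell_type to label
+    sub.obs = sub.obs[["cell_type", "batch"]].rename(columns={"cell_type": "label"})
+    return sub
+
+solution = with_label(adata[is_test].copy())
+train = with_label(adata[~is_test].copy())
+train.uns = {k: adata.uns[k] for k in ["dataset_id", "normalization_id"]}
+
+test = solution.copy()
+test.obs = test.obs[["batch"]]  # the test file must not leak labels
+test.uns = {k: adata.uns[k] for k in ["dataset_id", "normalization_id"]}
+
+train.write_h5ad(par["output_train"])
+test.write_h5ad(par["output_test"])
+solution.write_h5ad(par["output_solution"])
+```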
+ +## File format: Solution + +The solution for the test data + +Example file: `resources_test/task_template/pancreas/solution.h5ad` + +Format: + +
+ + AnnData object + obs: 'label', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["label"]` | `string` | Ground truth cell type labels. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
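+
+Because metrics line this file up against a method's prediction, a
+hedged sanity check that solution and test data describe the same cells
+can catch processing bugs early:
+
+``` python
+import anndata as ad
+
+solution = ad.read_h5ad("resources_test/task_template/pancreas/solution.h5ad")
+test = ad.read_h5ad("resources_test/task_template/pancreas/test.h5ad")
+
+# same cells and features, in the same order
+assert (solution.obs_names == test.obs_names).all()
+assert (solution.var_names == test.var_names).all()
+# ground-truth labels live only in the solution file
+assert "label" in solution.obs and "label" not in test.obs
+```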
+ +## File format: Test data + +The subset of molecules used for the test dataset + +Example file: `resources_test/task_template/pancreas/test.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
+ +## File format: Training data + +The training data in h5ad format + +Example file: `resources_test/task_template/pancreas/train.h5ad` + +Format: + +
+ + AnnData object + obs: 'label', 'batch' + var: 'hvg', 'hvg_score' + obsm: 'X_pca' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["label"]` | `string` | Ground truth cell type labels. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | + +
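+
+Training, test and solution data are all carved out of the same common
+dataset; a small sketch (using the example files above) of how the
+splits are expected to relate:
+
+``` python
+import anndata as ad
+
+train = ad.read_h5ad("resources_test/task_template/pancreas/train.h5ad")
+test = ad.read_h5ad("resources_test/task_template/pancreas/test.h5ad")
+
+# the splits partition the cells: no observation appears in both
+assert not set(train.obs_names) & set(test.obs_names)
+# both splits share the same feature space
+assert (train.var_names == test.var_names).all()
+# only the training split exposes ground-truth labels to methods
+assert "label" in train.obs and "label" not in test.obs
+```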
+ +## Component type: Control Method + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_train` | `file` | The training data in h5ad format. | +| `--input_test` | `file` | The subset of molecules used for the test dataset. | +| `--input_solution` | `file` | The solution for the test data. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | + +
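+
+For example, a positive control can simply copy the ground-truth labels
+from the solution into a prediction file; a minimal sketch (the
+component name `true_labels` is hypothetical):
+
+``` python
+import anndata as ad
+
+## VIASH START
+par = {
+    "input_train": "train.h5ad",
+    "input_test": "test.h5ad",
+    "input_solution": "solution.h5ad",
+    "output": "prediction.h5ad",
+}
+meta = {"name": "true_labels"}
+## VIASH END
+
+solution = ad.read_h5ad(par["input_solution"])
+
+# an obs-only AnnData keeps the cell index without any expression data
+prediction = ad.AnnData(obs=solution.obs[[]])
+prediction.obs["label_pred"] = solution.obs["label"].values
+prediction.uns["dataset_id"] = solution.uns["dataset_id"]
+prediction.uns["normalization_id"] = solution.uns["normalization_id"]
+prediction.uns["method_id"] = meta["name"]
+prediction.write_h5ad(par["output"])
+```
+
+A negative control could instead shuffle those labels to establish a
+floor for the metrics.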
+ +## Component type: Metric + +A task template metric. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_solution` | `file` | The solution for the test data. | +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | + +
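+
+As a sketch, a metric script could score label predictions by accuracy
+(an assumed example metric, not one prescribed by this template) and
+emit the score file described below:
+
+``` python
+import anndata as ad
+import numpy as np
+
+## VIASH START
+par = {
+    "input_solution": "solution.h5ad",
+    "input_prediction": "prediction.h5ad",
+    "output": "score.h5ad",
+}
+## VIASH END
+
+solution = ad.read_h5ad(par["input_solution"])
+prediction = ad.read_h5ad(par["input_prediction"])
+
+accuracy = float(
+    np.mean(solution.obs["label"].values == prediction.obs["label_pred"].values)
+)
+
+score = ad.AnnData(uns={
+    "dataset_id": solution.uns["dataset_id"],
+    "normalization_id": solution.uns["normalization_id"],
+    "method_id": prediction.uns["method_id"],
+    "metric_ids": ["accuracy"],
+    "metric_values": [accuracy],
+})
+score.write_h5ad(par["output"])
+```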
+ +## Component type: Method + +A method. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_train` | `file` | The training data in h5ad format. | +| `--input_test` | `file` | The subset of molecules used for the test dataset. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | + +
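+
+A deliberately trivial baseline, shown only to illustrate the
+interface (the component name `majority_vote` is hypothetical):
+predict the most frequent training label for every test cell.
+
+``` python
+import anndata as ad
+
+## VIASH START
+par = {
+    "input_train": "train.h5ad",
+    "input_test": "test.h5ad",
+    "output": "prediction.h5ad",
+}
+meta = {"name": "majority_vote"}
+## VIASH END
+
+train = ad.read_h5ad(par["input_train"])
+test = ad.read_h5ad(par["input_test"])
+
+# the single most common label in the training data
+majority = train.obs["label"].value_counts().idxmax()
+
+prediction = ad.AnnData(obs=test.obs[[]])
+prediction.obs["label_pred"] = majority
+prediction.uns["dataset_id"] = test.uns["dataset_id"]
+prediction.uns["normalization_id"] = test.uns["normalization_id"]
+prediction.uns["method_id"] = meta["name"]
+prediction.write_h5ad(par["output"])
+```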
+ +## File format: Predicted data + +A predicted dataset as output by a method. + +Example file: `resources_test/task_template/pancreas/prediction.h5ad` + +Format: + +
+ + AnnData object + obs: 'label_pred' + uns: 'dataset_id', 'normalization_id', 'method_id' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:--------------------------|:---------|:-------------------------------------| +| `obs["label_pred"]` | `string` | Predicted labels for the test cells. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +File indicating the score of a metric. + +Example file: `resources/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
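+
+Since `metric_ids` and `metric_values` are parallel arrays, reading the
+scores back is a simple zip; a hedged snippet using the example file
+above:
+
+``` python
+import anndata as ad
+
+score = ad.read_h5ad("resources/score.h5ad")
+for metric_id, value in zip(score.uns["metric_ids"], score.uns["metric_values"]):
+    print(f"{metric_id}: {value:.4f}")
+```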
+ diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml deleted file mode 100644 index 4d767d8e..00000000 --- a/src/api/comp_control_method.yaml +++ /dev/null @@ -1,37 +0,0 @@ -namespace: control_methods -info: - type: control_method - type_info: - label: Control Method - summary: Quality control methods for verifying the pipeline. - description: | - This folder contains control components for the task. - These components have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. -arguments: - - name: --input_train - __merge__: file_train_h5ad.yaml - required: true - direction: input - - name: --input_test - __merge__: file_test_h5ad.yaml - required: true - direction: input - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: --output - __merge__: file_prediction.yaml - required: true - direction: output -test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - - type: python_script - path: /common/component_tests/check_config.py - - path: /resources_test/task_template/pancreas - dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_data_loader.yaml b/src/api/comp_data_loader.yaml new file mode 100644 index 00000000..b22196bc --- /dev/null +++ b/src/api/comp_data_loader.yaml @@ -0,0 +1,23 @@ + +info: + type: data_processor + type_info: + label: Data loader + summary: A data loader component which downloads data from a resource and stores it as a zarr file. + description: | + TODO: fill in +arguments: + - name: "--dataset_organism" + type: string + direction: input + required: true + - name: "--output" + __merge__: file_raw_data.yaml + direction: output + required: true +# test_resources: +# - path: /resources_test/common/pancreas +# dest: resources_test/common/pancreas +# - type: python_script +# path: /common/component_tests/run_and_check_output.py + \ No newline at end of file diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml deleted file mode 100644 index 184bc548..00000000 --- a/src/api/comp_data_processor.yaml +++ /dev/null @@ -1,31 +0,0 @@ -namespace: "data_processors" -info: - type: data_processor - type_info: - label: Data processor - summary: A data processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. -arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_train" - __merge__: file_train_h5ad.yaml - direction: output - required: true - - name: "--output_test" - __merge__: file_test_h5ad.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true -test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /common/component_tests/run_and_check_output.py - diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml deleted file mode 100644 index d7be9578..00000000 --- a/src/api/comp_method.yaml +++ /dev/null @@ -1,28 +0,0 @@ -namespace: "methods" -info: - type: method - type_info: - label: Method - summary: A method. - description: | - A method to predict the task effects. 
-arguments: - - name: --input_train - __merge__: file_train_h5ad.yaml - required: true - direction: input - - name: "--input_test" - __merge__: file_test_h5ad.yaml - direction: input - required: true - - name: --output - __merge__: file_prediction.yaml - required: true - direction: output -test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - - type: python_script - path: /common/component_tests/check_config.py - - path: /resources_test/task_template/pancreas - dest: resources_test/task_template/pancreas \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml deleted file mode 100644 index e3295da0..00000000 --- a/src/api/comp_metric.yaml +++ /dev/null @@ -1,28 +0,0 @@ -namespace: "metrics" -info: - type: metric - type_info: - label: Metric - summary: A task template metric. - description: | - A metric for evaluating method predictions. -arguments: - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--input_prediction" - __merge__: file_prediction.yaml - direction: input - required: true - - name: "--output" - __merge__: file_score.yaml - direction: output - required: true -test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - - type: python_script - path: /common/component_tests/check_config.py - - path: /resources_test/task_template/pancreas - dest: resources_test/task_template/pancreas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml deleted file mode 100644 index 0927ea0a..00000000 --- a/src/api/file_common_dataset.yaml +++ /dev/null @@ -1,72 +0,0 @@ -type: file -example: "resources_test/common/pancreas/dataset.h5ad" -label: "Common Dataset" -summary: A subset of the common dataset. -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Cell type information - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. 
-        required: false
-      - type: string
-        name: normalization_id
-        description: "Which normalization was used"
-        required: true
diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml
deleted file mode 100644
index 4a6dc328..00000000
--- a/src/api/file_prediction.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-#TODO: Change to the required and/or optional fields of the anndata
-type: file
-example: "resources_test/task_template/pancreas/prediction.h5ad"
-label: "Predicted data"
-summary: A predicted dataset as output by a method.
-info:
-  format:
-    type: h5ad
-    obs:
-      - type: string
-        name: label_pred
-        description: Predicted labels for the test cells.
-        required: true
-    uns:
-      - type: string
-        name: dataset_id
-        description: "A unique identifier for the dataset"
-        required: true
-      - type: string
-        name: normalization_id
-        description: "Which normalization was used"
-        required: true
-      - type: string
-        name: method_id
-        description: "A unique identifier for the method"
-        required: true
\ No newline at end of file
diff --git a/src/api/file_raw_data.yaml b/src/api/file_raw_data.yaml
new file mode 100644
index 00000000..414a8aad
--- /dev/null
+++ b/src/api/file_raw_data.yaml
@@ -0,0 +1,185 @@
+#SpatialData object, with associated Zarr store:
+#├── Images
+#│   ├── '{rep}_he_image': DataTree[cyx] (3, 45087, 11580), (3, 22543, 5790), (3, 11271, 2895), (3, 5635, 1447), (3, 2817, 723)
+#│   ├── '{rep}_{fluorescence_img}': DataTree[cyx] (2, 17098, 51187), (2, 8549, 25593), (2, 4274, 12796), (2, 2137, 6398), (2, 1068, 3199)
+#│   └── '{rep}_{fluorescence_img}_3D': DataTree[czyx] (2, 8, 17098, 51187), (2, 8, 8549, 25593), (2, 8, 4274, 12796), (2, 8, 2137, 6398), (2, 8, 1068, 3199)
+#├── Labels
+#│   ├── '{rep}_{segm1}': DataTree[yxz] (17098, 51187, 3), (8549, 25593, 3), (4274, 12796, 3), (2137, 6398, 3), (1068, 3199, 3)
+#│   ├── '{rep}_{segm2}': DataTree[yxz] (17098, 51187, 3), (8549, 25593, 3), (4274, 12796, 3), (2137, 6398, 3), (1068, 3199, 3)
+#│   └── '{rep}_expert_segm_{patch}': DataTree[yxz] (17098, 51187, 3), (8549, 25593, 3), (4274, 12796, 3), (2137, 6398, 3), (1068, 3199, 3)
+#├── Points
+#│   └── '{rep}_transcripts': DataFrame with shape: (, 11) (3D points)
+#├── Shapes
+#│   ├── '{rep}_{segm1}_boundaries': GeoDataFrame shape: (162254, 1) (2D shapes - 3D supported??)
+#│   └── '{rep}_{segm2}_boundaries': GeoDataFrame shape: (162254, 1) (2D shapes)
+#└── Tables
+#    ├── '{rep}_{segm1}': AnnData (162254, 377)
+#    ├── '{rep}_{segm2}': AnnData (162254, 377)
+#    ├── '{reference1}': AnnData (n_obs_ref1, ?)
+#    └── '{reference2}': AnnData (n_obs_ref2, ?)
+#with coordinate systems:
+#    ▸ '{rep}_global', with elements:
+#        ....
+
+type: file
+example: "resources_test/task_preprocessing_imagingbased_st/raw_dataset.zarr"
+label: "Raw Dataset"
+summary: An unprocessed spatial imaging dataset stored as a zarr file.
+info: + format: + type: spatialdata_zarr + variables: + - name: rep + type: string + description: The replicate identifier + required: true + - name: reference + type: str + description: Name of the reference dissociated dataset + required: true + - name: segm + type: string + description: Custom segmentation identifier + required: false + - name: patch + type: string + description: Expert segmentation image patch identifier + required: false + + images: + - type: DataTree[cyx] + dtype: int/float #TODO + name: "{rep}_image" + description: The raw image data + required: true + - type: DataTree[czyx] + dtype: int/float #TODO + name: "{rep}_image_3D" + description: The raw 3D image data + required: false + - type: + dtype: + name: "{rep}_he_image" + description: H&E image data + required: false + labels: + - type: DataTree[yx] + dtype: int + name: "{rep}_{segm}" + description: Custom segmentation of the data + required: false + - type: DataTree[zyx] + dtype: int + name: "{rep}_{segm}_3D" + description: Custom segmentation of the 3D data + required: false + - type: DataTree[yx] + dtype: int + name: "{rep}_expert_segm_{patch}" + description: Expert segmentation of a patch of the data + required: false + - type: DataTree[zyx] + dtype: int + name: "{rep}_expert_segm_{patch}_3D" + description: Expert segmentation of a 3D patch of the data + required: false + points: + - type: DataFrame + dtype: str & float + name: "{rep}_transcripts" + description: Point cloud data of transcripts + required: true + shapes: + - type: GeoDataFrame + dtype: ??? + name: "{rep}_{segm}_boundaries" + description: Cell polygons referring to "{rep}_{segm}" + required: false + tables: + - type: anndata + dtype: ??? + name: "metadata" + description: Metadata of spatial dataset + required: true + - type: anndata + dtype: ??? + name: "{reference}" + description: Map to define the reference cells to compare to for each rep + required: true + - type: anndata + dtype: ??? + name: "{rep}_{segm}_table" + description: Count data referring to "{rep}_{segm}" + required: false + coordinate_systems: + - type: ??? + dtype: ??? + name: "{rep}_global" + description: Coordinate system of the replicate + required: true + + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. 
+ required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml deleted file mode 100644 index f6022a83..00000000 --- a/src/api/file_score.yaml +++ /dev/null @@ -1,31 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: resources/score.h5ad -label: Score -summary: "File indicating the score of a metric." -info: - format: - type: h5ad - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - required: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." - multiple: true - required: true \ No newline at end of file diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml deleted file mode 100644 index 81e168e9..00000000 --- a/src/api/file_solution.yaml +++ /dev/null @@ -1,73 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/pancreas/solution.h5ad" -label: "Solution" -summary: "The solution for the test data" -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. 
- required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml deleted file mode 100644 index 6ee21ac5..00000000 --- a/src/api/file_test_h5ad.yaml +++ /dev/null @@ -1,45 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/pancreas/test.h5ad" -label: "Test data" -summary: The subset of molecules used for the test dataset -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true \ No newline at end of file diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_train_h5ad.yaml deleted file mode 100644 index 7d2b51d5..00000000 --- a/src/api/file_train_h5ad.yaml +++ /dev/null @@ -1,49 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/pancreas/train.h5ad" -label: "Training data" -summary: "The training data in h5ad format" -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
-        required: true
-    uns:
-      - type: string
-        name: dataset_id
-        description: "A unique identifier for the dataset"
-        required: true
-      - type: string
-        name: normalization_id
-        description: "Which normalization was used"
-        required: true
\ No newline at end of file
diff --git a/src/data_loaders/download_10x_xenium/config.vsh.yaml b/src/data_loaders/download_10x_xenium/config.vsh.yaml
new file mode 100644
index 00000000..d64c89c7
--- /dev/null
+++ b/src/data_loaders/download_10x_xenium/config.vsh.yaml
@@ -0,0 +1,32 @@
+__merge__: ../../api/comp_data_loader.yaml
+name: download_10x_xenium
+
+arguments:
+  - type: file
+    name: --input
+    required: true
+    description: A 10x Xenium output directory (unzipped)
+    multiple: true
+  - type: string
+    name: --replicate_name
+    required: true
+    description: The replicate identifier
+    multiple: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - spatialdata-io
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highmem, midcpu, midtime]
\ No newline at end of file
diff --git a/src/data_loaders/download_10x_xenium/script.py b/src/data_loaders/download_10x_xenium/script.py
new file mode 100644
index 00000000..72822754
--- /dev/null
+++ b/src/data_loaders/download_10x_xenium/script.py
@@ -0,0 +1,60 @@
+# https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard
+
+
+from spatialdata_io import xenium
+from pathlib import Path
+import shutil
+
+## VIASH START
+# https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip
+# https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_2/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip
+# https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_3/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip
+par = {
+    "input": [
+        "resources/datasets_raw/10x_fresh_frozen_mouse_brain_replicates/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs"
+    ],
+    "replicate_name": [
+        "rep1"
+    ],
+    "output": "resources/datasets/10x_xenium/10x_fresh_frozen_mouse_brain_replicates/dataset.zarr"
+}
+## VIASH END
+
+# --input is declared with `multiple: true`, so par["input"] is a list of
+# paths; this first version of the loader only parses the first replicate.
+input = Path(par["input"][0]).resolve()
+output = Path(par["output"]).resolve()
+
+print("parsing the data... ", end="", flush=True)
+sdata = xenium(
+    path=str(input),
+    n_jobs=8,
+    cell_boundaries=True,
+    nucleus_boundaries=True,
+    morphology_focus=True,
+    cells_as_circles=True,
+)
+
+print("writing the data... 
", end="", flush=True) +if output.exists(): + shutil.rmtree(output) +sdata.write(output) + + +# SpatialData object +# ├── Images +# │ ├── 'morphology_focus': DataTree[cyx] (1, 33131, 48358), (1, 16565, 24179), (1, 8282, 12089), (1, 4141, 6044), (1, 2070, 3022) +# │ └── 'morphology_mip': DataTree[cyx] (1, 33131, 48358), (1, 16565, 24179), (1, 8282, 12089), (1, 4141, 6044), (1, 2070, 3022) +# ├── Labels +# │ ├── 'cell_labels': DataTree[yx] (33131, 48358), (16565, 24179), (8282, 12089), (4141, 6044), (2070, 3022) +# │ └── 'nucleus_labels': DataTree[yx] (33131, 48358), (16565, 24179), (8282, 12089), (4141, 6044), (2070, 3022) +# ├── Points +# │ └── 'transcripts': DataFrame with shape: (, 8) (3D points) +# ├── Shapes +# │ ├── 'cell_boundaries': GeoDataFrame shape: (162033, 1) (2D shapes) +# │ ├── 'cell_circles': GeoDataFrame shape: (162033, 2) (2D shapes) +# │ └── 'nucleus_boundaries': GeoDataFrame shape: (162033, 1) (2D shapes) +# └── Tables +# └── 'table': AnnData (162033, 248) +# with coordinate systems: +# ▸ 'global', with elements: +# morphology_focus (Images), morphology_mip (Images), cell_labels (Labels), nucleus_labels (Labels), transcripts (Points), cell_boundaries (Shapes), cell_circles (Shapes), nucleus_boundaries (Shapes) + diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml deleted file mode 100644 index a9977208..00000000 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,34 +0,0 @@ -__merge__: ../../api/comp_data_processor.yaml -name: process_dataset -arguments: - - name: "--method" - type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--obs_batch" - type: "string" - description: "Which .obs slot to use as batch covariate." - default: "batch" - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 -resources: - - type: python_script - path: script.py - - path: /common/helper_functions/subset_h5ad_by_format.py - -engines: - - type: docker - image: openproblems/base_python:1.0.0 - -runners: - - type: executable - - type: nextflow - directives: - label: [highmem, midcpu, midtime] \ No newline at end of file