diff --git a/.github/workflows/spatial_deconvolution-ci.yml b/.github/workflows/spatial_deconvolution_cell2location-ci.yml
similarity index 90%
rename from .github/workflows/spatial_deconvolution-ci.yml
rename to .github/workflows/spatial_deconvolution_cell2location-ci.yml
index b22e40da..b43215bb 100644
--- a/.github/workflows/spatial_deconvolution-ci.yml
+++ b/.github/workflows/spatial_deconvolution_cell2location-ci.yml
@@ -1,4 +1,4 @@
-name: Run tutorials (spatial deconvolution)
+name: Run tutorials (spatial deconvolution cell2location)
on:
push:
@@ -12,7 +12,7 @@ env:
debug: 'true'
jobs:
- spatial_deconvolution:
+ spatial_deconvolution_cell2location:
runs-on: ubuntu-latest
strategy:
fail-fast: false
@@ -60,7 +60,9 @@ jobs:
cd deconvolution/data
curl -L -o Human_Heart_reference.h5mu https://figshare.com/ndownloader/files/44969677
cd spatial_data
- curl -L -o Human_Heart.h5mu https://figshare.com/ndownloader/files/44969488
+ curl -L -o Human_Heart.zarr.zip https://figshare.com/ndownloader/files/51667673
+ unzip Human_Heart.zarr.zip
+ rm Human_Heart.zarr.zip
# Note: we run the following to test that the commands works
diff --git a/.github/workflows/spatial_deconvolution_tangram-ci.yml b/.github/workflows/spatial_deconvolution_tangram-ci.yml
new file mode 100644
index 00000000..c06d8b75
--- /dev/null
+++ b/.github/workflows/spatial_deconvolution_tangram-ci.yml
@@ -0,0 +1,98 @@
+name: Run tutorials (spatial deconvolution tangram)
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+env:
+ debug: 'true'
+
+jobs:
+ spatial_deconvolution_tangram:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ os: ["ubuntu-latest"] # , "macos-latest", "windows-latest"
+ python-version: ["3.10"]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree
+
+ - uses: conda-incubator/setup-miniconda@v3
+ with:
+ miniforge-version: latest
+ auto-activate-base: true
+ auto-update-conda: true
+ channels: conda-forge
+ channel-priority: strict
+ activate-environment: pipeline_env
+ environment-file: pipeline_env.yaml
+
+ - name: Install Panpipes
+ shell: bash -el {0}
+ run: |
+ pip install .[spatial]
+ conda list
+
+ - name: Conda info
+ if: env.debug == 'true'
+ shell: bash -el {0}
+ run: conda info
+
+ - name: Conda list
+ if: env.debug == 'true'
+ shell: pwsh
+ run: conda list
+
+
+ - name: Preparing the data
+ run: |
+ mkdir deconvolution_tangram deconvolution_tangram/data deconvolution_tangram/data/spatial_data
+ cd deconvolution_tangram/data
+ curl -L -o Human_Heart_reference.h5mu https://figshare.com/ndownloader/files/44969677
+ cd spatial_data
+ curl -L -o Human_Heart.zarr.zip https://figshare.com/ndownloader/files/51667673
+ unzip Human_Heart.zarr.zip
+ rm Human_Heart.zarr.zip
+
+
+ # Note: we run the following to test that the commands works
+ - name: Preparing the configuration file
+ shell: bash -el {0}
+ run: |
+ cd deconvolution_tangram
+ panpipes deconvolution_spatial config
+
+ - name: Edit the submission file
+ run: |
+ cd deconvolution_tangram
+ curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/deconvolution_tangram/pipeline.yml
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree deconvolution_tangram
+
+ - name: Review pipeline tasks
+ shell: bash -el {0}
+ run: |
+ cd deconvolution_tangram
+ panpipes deconvolution_spatial show full --local
+
+ - name: Run pipeline tasks
+ shell: bash -el {0}
+ run: |
+ cd deconvolution_tangram
+ panpipes deconvolution_spatial make full --local
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree deconvolution_tangram
diff --git a/.github/workflows/spatial_ingestion_merfish-ci.yml b/.github/workflows/spatial_ingestion_merfish-ci.yml
index 89812404..e0b8ae06 100644
--- a/.github/workflows/spatial_ingestion_merfish-ci.yml
+++ b/.github/workflows/spatial_ingestion_merfish-ci.yml
@@ -58,11 +58,11 @@ jobs:
run: |
mkdir spatial spatial/ingestion_merfish spatial/ingestion_merfish/data
cd spatial/ingestion_merfish/data
- curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv https://figshare.com/ndownloader/files/45028624
- curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv https://figshare.com/ndownloader/files/45028621
+ curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/45028624
+ curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/45028621
mkdir images
cd images
- curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_images_micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/45028645
+ curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/45028645
# Note: we run the following to test that the commands works
@@ -75,12 +75,12 @@ jobs:
- name: Preparing the submission file
run: |
cd spatial/ingestion_merfish
- curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_merfish_data/sample_file_qc_spatial.txt
+ curl -o sample_file_qc_merfish.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt
- name: Preparing the yaml file
run: |
cd spatial/ingestion_merfish
- curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_merfish_data/pipeline.yml
+ curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merfish_data/pipeline.yml
- name: File tree
if: env.debug == 'true'
diff --git a/.github/workflows/spatial_ingestion_merscope-ci.yml b/.github/workflows/spatial_ingestion_merscope-ci.yml
new file mode 100644
index 00000000..3a2da28a
--- /dev/null
+++ b/.github/workflows/spatial_ingestion_merscope-ci.yml
@@ -0,0 +1,105 @@
+name: Run tutorials (spatial ingest merscope)
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+env:
+ debug: 'true'
+
+jobs:
+ spatial_ingest_merscope:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ os: ["ubuntu-latest"] # , "macos-latest", "windows-latest"
+ python-version: ["3.10"]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree
+
+ - uses: conda-incubator/setup-miniconda@v3
+ with:
+ miniforge-version: latest
+ auto-activate-base: true
+ auto-update-conda: true
+ channels: conda-forge
+ channel-priority: strict
+ activate-environment: pipeline_env
+ environment-file: pipeline_env.yaml
+
+ - name: Install Panpipes
+ shell: bash -el {0}
+ run: |
+ pip install .[spatial]
+ conda list
+
+ - name: Conda info
+ if: env.debug == 'true'
+ shell: bash -el {0}
+ run: conda info
+
+ - name: Conda list
+ if: env.debug == 'true'
+ shell: pwsh
+ run: conda list
+
+
+ - name: Preparing the data
+ run: |
+ mkdir spatial spatial/ingestion_merscope spatial/ingestion_merscope/data
+ cd spatial/ingestion_merscope/data
+ curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/50899455
+ curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/50899452
+ curl -L -o cellpose_micron_space.parquet https://figshare.com/ndownloader/files/50899458
+ # curl -L -o detected_transcripts.csv https://figshare.com/ndownloader/files/50899476
+ mkdir images
+ cd images
+ curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/50899449
+
+
+ # Note: we run the following to test that the commands works
+ - name: Preparing the configuration file
+ shell: bash -el {0}
+ run: |
+ cd spatial/ingestion_merscope
+ panpipes qc_spatial config
+
+ - name: Preparing the submission file
+ run: |
+ cd spatial/ingestion_merscope
+ curl -o sample_file_qc_merscope.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt
+
+ - name: Preparing the yaml file
+ run: |
+ cd spatial/ingestion_merscope
+ curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merscope_data/pipeline.yml
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree spatial/ingestion_merscope
+
+ - name: Review pipeline tasks
+ shell: bash -el {0}
+ run: |
+ cd spatial/ingestion_merscope
+ panpipes qc_spatial show full --local
+
+ - name: Run pipeline tasks
+ shell: bash -el {0}
+ run: |
+ cd spatial/ingestion_merscope
+ panpipes qc_spatial make full --local
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree spatial/ingestion_merscope
diff --git a/.github/workflows/spatial_ingestion_visium-ci.yml b/.github/workflows/spatial_ingestion_visium-ci.yml
index 8fb57e05..77da20f2 100644
--- a/.github/workflows/spatial_ingestion_visium-ci.yml
+++ b/.github/workflows/spatial_ingestion_visium-ci.yml
@@ -78,12 +78,11 @@ jobs:
- name: Preparing the submission file
run: |
cd spatial/ingestion
- curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_visium_data/sample_file_qc_spatial.txt
-
+ curl -o sample_file_qc_visium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt
- name: Preparing the yaml file
run: |
cd spatial/ingestion
- curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_visium_data/pipeline.yml
+ curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/pipeline.yml
- name: File tree
if: env.debug == 'true'
diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml
new file mode 100644
index 00000000..366beb3d
--- /dev/null
+++ b/.github/workflows/spatial_ingestion_xenium.yml
@@ -0,0 +1,109 @@
+name: Run tutorials (spatial ingest xenium)
+
+on:
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+env:
+ debug: 'true'
+
+jobs:
+ spatial_ingest_xenium:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ os: ["ubuntu-latest"] # , "macos-latest", "windows-latest"
+ python-version: ["3.10"]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree
+
+ - uses: conda-incubator/setup-miniconda@v3
+ with:
+ miniforge-version: latest
+ auto-activate-base: true
+ auto-update-conda: true
+ channels: conda-forge
+ channel-priority: strict
+ activate-environment: pipeline_env
+ environment-file: pipeline_env.yaml
+
+ - name: Install Panpipes
+ shell: bash -el {0}
+ run: |
+ pip install .[spatial]
+ conda list
+
+ - name: Conda info
+ if: env.debug == 'true'
+ shell: bash -el {0}
+ run: conda info
+
+ - name: Conda list
+ if: env.debug == 'true'
+ shell: pwsh
+ run: conda list
+
+
+ - name: Preparing the data
+ run: |
+ mkdir spatial spatial/ingestion_xenium spatial/ingestion_xenium/data
+ cd spatial/ingestion_xenium/data
+ curl -L -o experiment.xenium https://figshare.com/ndownloader/files/51244265
+ curl -L -o nucleus_boundaries.parquet https://figshare.com/ndownloader/files/51244286
+ curl -L -o cell_boundaries.parquet https://figshare.com/ndownloader/files/51244244
+ curl -L -o transcripts.parquet https://figshare.com/ndownloader/files/51244283
+ curl -L -o cell_feature_matrix.h5 https://figshare.com/ndownloader/files/51244247
+ curl -L -o cells.parquet https://figshare.com/ndownloader/files/51244259
+ curl -L -o morphology_mip.ome.tif https://figshare.com/ndownloader/files/51244415
+ curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244262
+ mkdir morphology_focus
+ cd morphology_focus
+ curl -L -o morphology_focus_0000.ome.tif https://figshare.com/ndownloader/files/51244277
+
+
+ # Note: we run the following to test that the commands works
+ - name: Preparing the configuration file
+ shell: bash -el {0}
+ run: |
+ cd spatial/ingestion_xenium
+ panpipes qc_spatial config
+
+ - name: Preparing the submission file
+ run: |
+ cd spatial/ingestion_xenium
+ curl -o sample_file_qc_xenium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt
+
+ - name: Preparing the yaml file
+ run: |
+ cd spatial/ingestion_xenium
+ curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_xenium_data/pipeline.yml
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree spatial/ingestion_xenium
+
+ - name: Review pipeline tasks
+ shell: bash -el {0}
+ run: |
+ cd spatial/ingestion_xenium
+ panpipes qc_spatial show full --local
+
+ - name: Run pipeline tasks
+ shell: bash -el {0}
+ run: |
+ cd spatial/ingestion_xenium
+ panpipes qc_spatial make full --local
+
+ - name: File tree
+ if: env.debug == 'true'
+ run: tree spatial/ingestion_xenium
diff --git a/.github/workflows/spatial_preprocess-ci.yml b/.github/workflows/spatial_preprocess-ci.yml
index f9a6123f..ce322ee6 100644
--- a/.github/workflows/spatial_preprocess-ci.yml
+++ b/.github/workflows/spatial_preprocess-ci.yml
@@ -53,14 +53,18 @@ jobs:
shell: pwsh
run: conda list
-
- name: Preparing the data
run: |
mkdir spatial spatial/preprocess spatial/preprocess/data
cd spatial/preprocess/data
- curl -L -o V1_Human_Heart_unfilt.h5mu https://figshare.com/ndownloader/files/45031048
- curl -L -o V1_Human_Lymph_Node_unfilt.h5mu https://figshare.com/ndownloader/files/45031051
+ curl -L -o V1_Human_Heart_unfilt.zarr.zip https://figshare.com/ndownloader/files/52236521
+ unzip V1_Human_Heart_unfilt.zarr.zip
+ rm V1_Human_Heart_unfilt.zarr.zip
+ curl -L -o V1_Human_Lymph_Node_unfilt.zarr.zip https://figshare.com/ndownloader/files/52236575
+ unzip V1_Human_Lymph_Node_unfilt.zarr.zip
+ rm V1_Human_Lymph_Node_unfilt.zarr.zip
+
# Note: we run the following to test that the commands works
- name: Preparing the configuration file
@@ -78,7 +82,7 @@ jobs:
run: |
cd spatial/preprocess
sed -i 's+../ingestion/qc.data/+./data/+g' pipeline.yml
-
+
- name: File tree
if: env.debug == 'true'
run: tree spatial/preprocess
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ac8d32d..4baadcb7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,24 @@
## [latest]
+### added
+- moved from MuData to SpatialData
+- xenium ingestion & ingest_xenium GitHub action
+- `export_gene_by_spot` for Cell2Location
+- separate sample submission files for the different spatial technologies
+- separate GitHub actions for Cell2Location & Tangram
+- separate GitHub actions for MERSCOPE & MERFISH
+
+
+### fixed
+
+
+### dependencies
+- pinned "spatialdata==0.2.6", "spatialdata-io==0.1.6", "dask==2024.12.1" as temporary fix
+
+
+## v1.1.0
+
### added
### fixed
diff --git a/docs/usage/setup_for_spatial_workflows.md b/docs/usage/setup_for_spatial_workflows.md
index 248c33b9..456d519f 100644
--- a/docs/usage/setup_for_spatial_workflows.md
+++ b/docs/usage/setup_for_spatial_workflows.md
@@ -1,28 +1,67 @@
Sample submission file for the ingestion of spatial data
===========================
-The spatial transcriptomics ingestion workflow requires a sample submission file that specifies the location of the input files. The sample submission file is a tab-separated file with one row per sample. Panpipes currently supports the ingestion of `Visium` and `Vizgen` data.
+The spatial transcriptomics ingestion workflow requires a sample submission file that specifies the location of the input files. The sample submission file is a tab-separated file with one row per sample. Panpipes currently supports the ingestion of `Visium`, `Vizgen`, and `Xenium` data. The data of different technologies needs to be ingested separately with different sample submission files.
-The 6 columns of the sample submission file are:
+
+The minimum required (non-optional) columns for each submission file are
**sample id**: Unique sample ID.
-**spatial_path**: The root directory containing the data files. Please note, that the folder structure of the root directory needs to be structured as expected by the [squidpy.read.visium](https://squidpy.readthedocs.io/en/stable/api/squidpy.read.visium.html) (for `Visium` data) or [squidpy.read.vizgen](https://squidpy.readthedocs.io/en/stable/api/squidpy.read.vizgen.html) (for `Vizgen` data) functions.
+**spatial_path**: The root directory containing the data files. Please note, that the folder structure of the root directory needs to be structured as expected by the [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) (for `Visium` data), [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) (for `Vizgen` data), or [spatialdata_io.xenium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.xenium.html) (for `Xenium` data) functions.
+
+**spatial_filetype**: Either "vizgen", "visium", or "xenium".
+
+
+## Visium
+
+The 7 columns of the Visium sample submission file are:
+
+sample_id | spatial_path | spatial_filetype | visium_feature_bc_matrix | visium_fullres_image_file | visium_tissue_positions_file | visium_scalefactors_file
+----------|----------|------------|-----------|----------|-------------|-------------
+
+The following 4 columns are **optional**:
+
+**visium_feature_bc_matrix**: Name of the counts file. Corresponds to the `counts_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html)
+
+**visium_fullres_image_file**: Path to the full-resolution image. Corresponds to the `fullres_image_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html)
+
+**visium_tissue_positions_file**: Path to the tissue positions file. Corresponds to the `tissue_positions_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html)
+
+**visium_scalefactors_file**: Path to the scalefactors file. Corresponds to the `scalefactors_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html)
+
+##### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt)
+
+
+## Vizgen
+
+The 6 columns of the Vizgen sample submission file are:
+
+sample_id | spatial_path | spatial_filetype | vpt_cell_by_gene | vpt_cell_metadata | vpt_cell_boundaries
+----------|----------|------------|----------|-------------|-------------
+
+The following 3 columns are **optional**:
+
+**vpt_cell_by_gene**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html)
+
+**vpt_cell_metadata**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html)
+
+**vpt_cell_boundaries**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html)
+
+
+##### Example submission files [MERFISH](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt) [MERSCOPE](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt)
+
+## Xenium
+
+The 3 columns of the Xenium sample submission file are:
+
+sample_id | spatial_path | spatial_filetype |
+----------|----------|------------
-**spatial_filetype**: Either "vizgen" or "visium".
+##### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt)
-**spatial_counts**: The count matrix file. Usually `filtered_feature_bc_matrix.h5` or `raw_feature_bc_matrix.h5` for a `Visium` dataset. For `Vizgen` inputs, this file typically ends with `_cell_by_gene.csv.`
-**spatial_metadata**: The metadata csv-file for `Vizgen` data. Leave empty for `Visium` data.
-**spatial_transformation**: The transformation csv-file for `Vizgen` data. This column is **optional** for `Vizgen` data. Leave empty for `Visium` data.
-**Note, that the columns, `sample_id`, `spatial_path`, `spatial_filetype`, and `spatial_counts` are required for both, `Visium` and `Vizgen` data. The `spatial_metadata`(required) and `spatial_transformation`(optional) columns are `Vizgen`-specific and should be left empty for `Visium` data.**
-### Example submission file
-| sample_id | spatial_path | spatial_filetype | spatial_counts | spatial_metadata | spatial_transformation |
-| --------- |--------------|------------------|-----------------------------------------|------------------------------------------|--------------------|
-| V1_Human_Heart |./data_visium/V1_Human_Heart |visium |V1_Human_Heart_filtered_feature_bc_matrix.h5 |
-| V1_Human_Lymph_Node |./data_visium/V1_Human_Lymph_Node| visium | V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5 |
-Mouse_Brain | ./data_vizgen | vizgen | cell_by_gene_S1R1.csv | cell_metadata_S1R1.csv | images_micron_to_mosaic_pixel_transform.csv
diff --git a/docs/workflows/clustering_spatial.md b/docs/workflows/clustering_spatial.md
index 3c63b62e..99ae8b49 100644
--- a/docs/workflows/clustering_spatial.md
+++ b/docs/workflows/clustering_spatial.md
@@ -1,6 +1,6 @@
# Clustering spatial data
The `clustering` workflow accepts both cell suspension datasets and spatial transcriptomics data as input that have been ingested with the `qc_spatial` workflow and optionally filtered with the `spatial_preprocess` workflow.
-The workflow expects a **single `MuData` object** with the spatial data saved in `mdata.mod["spatial"]`.
+The workflow expects a **single `SpatialData` object**.
Set `spatial: True` in the configuration file and customize the spatial modality clustering parameters exactly as you would for a single cell experiment. For more information check the [clustering workflow](./clustering.md)
diff --git a/docs/workflows/deconvolute_spatial.md b/docs/workflows/deconvolute_spatial.md
index e1233dba..d03f56b9 100644
--- a/docs/workflows/deconvolute_spatial.md
+++ b/docs/workflows/deconvolute_spatial.md
@@ -1,6 +1,6 @@
# Deconvoluting spatial data
-With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `MuData` object for each slide is expected, with the spatial data saved in `mdata.mod["spatial"]`. The spatial slides are deconvoluted using the same reference. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input.
+With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `SpatialData` object for each slide is expected. The spatial slides are deconvoluted using the same reference. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input.
The workflow provides the possibility to run deconvolution using `Cell2Location` and `Tangram`.
@@ -19,8 +19,9 @@ For the reference and each spatial slide the following steps are run. **Note, th
- Regression/reference model is fitted and a plot of the training history as well as QC plots are saved in the `./figures/Cell2Location` directory. Additionally, a csv-file `Cell2Loc_inf_anver.csv` with the estimated expression of every gene in every cell type is saved in `./cell2location.output`.
- (Optional) Reference model is saved in `./cell2location.output`
- Spatial mapping model is fitted. Training history and QC plots are saved in the `./figures/Cell2Location` directory. Plot of the spatial embedding coloured by `q05_cell_abundance_w_sf` is also saved in `./figures/Cell2Location`.
+- (Optional) A gene by spot matrix for each cell type is saved to a layer in the table of the `SpatialData` object
- (Optional) Spatial mapping model is saved in `./cell2location.output`
-- `MuData` objects of the spatial slide and the reference are saved in `./cell2location.output`. The `MuData` object of the spatial slide contains the estimated cell type abundances.
+- The `SpatialData` object of the spatial slide and the `MuData` object of the reference are saved in `./cell2location.output`. The `SpatialData` object of the spatial slide contains the estimated cell type abundances.
### Tangram
@@ -34,7 +35,7 @@ For the reference and each spatial slide the following steps are run. **Note, th
- Data is preprocessed with [tangram.pp_adatas](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.mapping_utils.pp_adatas.html)
- Tangram model is fitted with [tangram.mapping_utils.map_cells_to_space](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.mapping_utils.map_cells_to_space.html) and annotations are transfered from single-cell data onto space with [tangram.project_cell_annotations](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.utils.project_cell_annotations.html)
- Plot of the spatial embedding coloured by `tangram_ct_pred` is saved in `./figures/Tangram`
-- `MuData` objects of the spatial slide and the reference are saved in `./tangram.output`. The `MuData` object of the spatial slide contains the deconvolution predictions.
+- The `SpatialData` object of the spatial slide and the `MuData` object of the reference are saved in `./tangram.output`. The `SpatialData` object of the spatial slide contains the deconvolution predictions.
diff --git a/docs/workflows/ingest_spatial.md b/docs/workflows/ingest_spatial.md
index a9992d6d..f6ded09a 100644
--- a/docs/workflows/ingest_spatial.md
+++ b/docs/workflows/ingest_spatial.md
@@ -1,19 +1,19 @@
# Ingesting spatial data
-Similar to the cell suspension workflow, `spatial_qc` ingests `Vizgen` and/or `Visium` data and saves the data into `MuData` objects.
-A primary difference to the cell suspension `ingestion` workflow is that we are not concatenating the input data into a single matrix, but keeping the samples as separate `MuData` objects, each with a `spatial` layer. This ensures that the processing does not introduce any technical batch effect when tissue slides are very different in cell composition. In a future release, we will use [SpatialData](https://spatialdata.scverse.org/en/latest/tutorials/notebooks/notebooks.html) as a data format and framework to process multi-slides experiments.
+The `spatial_qc` workflow ingests `Vizgen`, `Visium`, or `Xenium` data and saves the data into `SpatialData` objects.
+A primary difference to the cell suspension `ingestion` workflow is that we are not concatenating the input data into a single matrix, but keeping the samples as separate `SpatialData` objects. This ensures that the processing does not introduce any technical batch effect when tissue slides are very different in cell composition.
## Steps
-- Data is ingested into `MuData` objects with the modality `spatial`. The workflow generates one MuData per dataset.
- - Raw `MuData` objects are saved into `./tmp`
+- Data is ingested into `SpatialData` objects. The workflow generates one `SpatialData` per dataset.
+ - `SpatialData` objects of the raw data are saved into `./tmp` as `zarr` files
- QC metrics are computed using `scanpy` functionalities:
- Basic QC metrics are computed using `sc.pp.calculate_qc_metrics`
- (Optional) Compute cell-cycle scores using `sc.tl.score_genes_cell_cycle`. For that, the [default gene list](../../panpipes/resources/cell_cycle_genes.tsv) can be used or a path to a tsv file can be specified.
- (Optional) Custom genes actions. [Default gene list](../../panpipes/resources/qc_genelist_1.0.csv) can be used or a path to a csv file can be specified.
- Calculate proportions of gene groups, e.g. mitochondrial genes
- Score genes using `sc.tl.score_genes`
- - `MuData` objects with calculated QC metrics are saved in `qc.data`
+ - `SpatialData` objects with calculated QC metrics are saved in `qc.data`
- Metadata (`.obs`) is saved into the current directory as tsv files
- Specified QC metrics are plotted in violin and spatial embedding plots
- For `Vizgen` data, additional histograms are plotted
diff --git a/docs/workflows/preprocess_spatial.md b/docs/workflows/preprocess_spatial.md
index 925bd534..3bfae011 100644
--- a/docs/workflows/preprocess_spatial.md
+++ b/docs/workflows/preprocess_spatial.md
@@ -1,17 +1,17 @@
# Preprocessing spatial data
-The `preprocess_spatial` workflow filters the data and preprocesses the data by normalization, HVG selection, and PCA computation. Multiple `MuData` objects of the same assay (`Visium` or `Vizgen`), each with a `spatial` modality, can be filtered and preprocessed in one run.
+The `preprocess_spatial` workflow filters the data and preprocesses the data by normalization, HVG selection, and PCA computation. Multiple `SpatialData` objects of the same assay (`Visium`, `Vizgen`, or `Xenium`) can be filtered and preprocessed in one run.
## Steps
-If multiple `MuData` objects are provided, the following steps are run for each **with the same parameter setting.**
+If multiple `SpatialData` objects are provided, the following steps are run for each **with the same parameter setting.**
-- `MuData` object is filtered by the specified thresholds in the pipeline.yml. Note, that the filtering step is **optional**. You can avoid filtering by setting the `run` parameter in the pipeline.yml under `filtering` to `False`.
+- `SpatialData` object is filtered by the specified thresholds in the pipeline.yml. Note, that the filtering step is **optional**. You can avoid filtering by setting the `run` parameter in the pipeline.yml under `filtering` to `False`.
- Post-filter plotting is performed (only when data was filtered, i.e. `run: True`). Specified metrics in the pipeline.yml are plotted in violin and spatial embedding plots. Plots are saved into the `./figures/spatial` directory.
- Data is normalized and HVGs are selected.
Before normalization, raw counts are saved into `.layers["raw_counts"]`, if not present already. Normalized counts are saved into `.X` and `.layers["lognorm"]` or `.layers["norm_pearson_resid"]`, depending on the chosen normalization. HVGs are saved into `.var["highly_variable"]`.
- PCA is computed and plotted. PCA plots are also saved into the `./figures/spatial` directory.
-- Final `MuData` object is saved into the `./filtered.data` directory
+- Final `SpatialData` object is saved into the `./filtered.data` directory as a `zarr` file
## Steps to run
diff --git a/docs/yaml_docs/spatial_deconvolution.md b/docs/yaml_docs/spatial_deconvolution.md
index 29a4e93a..a1debae4 100644
--- a/docs/yaml_docs/spatial_deconvolution.md
+++ b/docs/yaml_docs/spatial_deconvolution.md
@@ -33,7 +33,7 @@ Specified by the following three parameters:
- threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks.
- For each thread, there must be enough memory to load your mudata and do computationally light tasks.
+ For each thread, there must be enough memory to load your SpatialData and do computationally light tasks.
- threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks.
@@ -46,12 +46,12 @@ Specified by the following three parameters:
## 1. Input Options
-With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `MuData` object for each slide is expected, with the spatial data saved in `mdata.mod["spatial"]`. The spatial slides are deconvoluted **using the same reference**. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. Please note, that the same parameter setting is used for each slide.
For the **spatial** input, the workflow, therefore, reads in **all `.h5mu` objects of a directory** (see below). **The spatial and single-cell data thus need to be saved in different folders.**
+With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `SpatialData` object for each slide is expected. The spatial slides are deconvoluted **using the same reference**. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. Please note, that the same parameter setting is used for each slide.
For the **spatial** input, the workflow, therefore, reads in **all `.zarr` objects of a directory** (see below).
input
- spatial `String`, Mandatory parameter
- Path to folder containing one or multiple `MuDatas` of spatial data. The pipeline is reading in all `MuData` files in that folder and assuming that they are `MuDatas` of spatial slides.
+ Path to folder containing one or multiple `SpatialDatas` of spatial data. The pipeline is reading in all `SpatialData` files in that folder.
- singlecell `String`, Mandatory parameter
Path to the MuData **file** (not folder) of the reference single-cell data.
@@ -151,6 +151,8 @@ You can specify whether both models (spatial and reference) should be saved with
save_models, Default: False
Whether to save the reference & spatial mapping models.
+export_gene_by_spot, Default: False
+ Whether to save a gene by spot matrix for each cell type in a layer.
## 3. Tangram Options
diff --git a/docs/yaml_docs/spatial_preprocess.md b/docs/yaml_docs/spatial_preprocess.md
index 28270c99..2d0db62c 100644
--- a/docs/yaml_docs/spatial_preprocess.md
+++ b/docs/yaml_docs/spatial_preprocess.md
@@ -35,7 +35,7 @@ Specified by the following three parameters:
- threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks.
- For each thread, there must be enough memory to load your mudata and do computationally light tasks.
+ For each thread, there must be enough memory to load your SpatialData and do computationally light tasks.
- threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks.
@@ -48,14 +48,14 @@ Specified by the following three parameters:
## 1. Input Options
-With the preprocess_spatial workflow, one or multiple `MuData` objects can be preprocessed in one run. The workflow **reads in all `.h5mu` objects of a directory**. The `MuData` objects in the directory need to be of the same assay (vizgen or visium). The workflow then runs the preprocessing of each `MuData` object separately with the same parameters that are specified in the yaml file.
+With the preprocess_spatial workflow, one or multiple `SpatialData` objects can be preprocessed in one run. The workflow **reads in all `.zarr` objects of a directory**. The `SpatialData` objects in the directory need to be of the same assay (Vizgen, Visium, or Xenium). The workflow then runs the preprocessing of each `SpatialData` object separately with the same parameters that are specified in the yaml file.
input_dir `String`, Mandatory parameter
- Path to the folder containing all input `h5mu` files.
+ Path to the folder containing all input `zarr` files.
assay [`'visium'`, `'vizgen'`], Default: `'visium'`
- Spatial transcriptomics assay of the `h5mu` files in `input_dir`.
+ Spatial transcriptomics assay of the `zarr` files in `input_dir`.
@@ -70,7 +70,7 @@ With the preprocess_spatial workflow, one or multiple `MuData` objects can be pr
-With the parameters below you can specify thresholds for filtering. The filtering is fully customisable to any columns in `.obs` or `.var`. You are not restricted by the columns given as default. When specifying a column name, please make sure it exactly matches the column name in the h5mu object.
Please slso make sure, that the specified metrics are present in all `h5mu` objects of the `input_dir`, i.e. the `MuData` objects for that the preprocessing is run.
+With the parameters below you can specify thresholds for filtering. The filtering is fully customisable to any columns in `.obs` or `.var`. You are not restricted by the columns given as default. When specifying a column name, please make sure it exactly matches the column name in the table of the `SpatialData` object.
Please also make sure, that the specified metrics are present in all `SpatialData` objects of the `input_dir`, i.e. the `SpatialData` objects for that the preprocessing is run.
---
diff --git a/docs/yaml_docs/spatial_qc.md b/docs/yaml_docs/spatial_qc.md
index 5a742ab0..baeb2850 100644
--- a/docs/yaml_docs/spatial_qc.md
+++ b/docs/yaml_docs/spatial_qc.md
@@ -32,11 +32,11 @@ Computing resources to use, specifically the number of threads used for parallel
Specified by the following three parameters:
- threads_high `Integer`, Default: 1
Number of threads used for high intensity computing tasks.
- For each thread, there must be enough memory to load all your input files at once and create the MuData object.
+ For each thread, there must be enough memory to load all your input files at once and create the SpatialData object.
- threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks.
- For each thread, there must be enough memory to load your mudata and do computationally light tasks.
+ For each thread, there must be enough memory to load your SpatialData and do computationally light tasks.
- threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks.
diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py
index 77d74d2b..b5d635ab 100644
--- a/panpipes/funcs/io.py
+++ b/panpipes/funcs/io.py
@@ -156,36 +156,68 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True):
spatial_filetype = None
else:
spatial_path = caf["spatial_path"][nn]
+ if caf['spatial_filetype'][nn]=="xenium":
+ spatial_filetype = caf['spatial_filetype'][nn]
+ visium_feature_bc_matrix = None
+ visium_fullres_image_file = None
+ visium_tissue_positions_file = None
+ visium_scalefactors_file = None
+ vpt_cell_by_gene = None
+ vpt_cell_metadata = None
+ vpt_cell_boundaries = None
if caf['spatial_filetype'][nn]=="vizgen":
+ visium_feature_bc_matrix = None
+ visium_fullres_image_file = None
+ visium_tissue_positions_file = None
+ visium_scalefactors_file = None
spatial_filetype = caf['spatial_filetype'][nn]
- #path, counts and metadata are mandatory
- if pd.notna(caf["spatial_counts"][nn]):
- spatial_counts= caf["spatial_counts"][nn]
- else:
- spatial_counts = None
- if pd.notna(caf["spatial_metadata"][nn]):
- spatial_metadata = caf["spatial_metadata"][nn]
- else:
- spatial_metadata = None
- #transformation is optional
- if pd.notna(caf["spatial_transformation"][nn]):
- spatial_transformation = caf["spatial_transformation"][nn]
- else:
- spatial_transformation = None
+ vpt_cell_by_gene = None
+ vpt_cell_metadata = None
+ vpt_cell_boundaries = None
+ if "vpt_cell_by_gene" in caf.columns:
+ if pd.notna(caf['vpt_cell_by_gene'][nn]):
+ vpt_cell_by_gene = caf['vpt_cell_by_gene'][nn]
+ if "vpt_cell_metadata" in caf.columns:
+ if pd.notna(caf['vpt_cell_metadata'][nn]):
+ vpt_cell_metadata = caf['vpt_cell_metadata'][nn]
+ if "vpt_cell_boundaries" in caf.columns:
+ if pd.notna(caf['vpt_cell_boundaries'][nn]):
+ vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn]
elif caf['spatial_filetype'][nn]=="visium":
- spatial_metadata= None
- spatial_transformation = None
+ visium_feature_bc_matrix = None
+ visium_fullres_image_file = None
+ visium_tissue_positions_file = None
+ visium_scalefactors_file = None
+ vpt_cell_by_gene = None
+ vpt_cell_metadata = None
+ vpt_cell_boundaries = None
spatial_filetype = caf['spatial_filetype'][nn]
- if pd.notna(caf["spatial_counts"][nn]):
- spatial_counts= caf["spatial_counts"][nn]
- else:
- spatial_counts = None
+ #counts file
+ if "visium_feature_bc_matrix" in caf.columns:
+ if pd.notna(caf["visium_feature_bc_matrix"][nn]):
+ visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn]
+ # fullres image
+ if "visium_fullres_image_file" in caf.columns:
+ if pd.notna(caf["visium_fullres_image_file"][nn]):
+ visium_fullres_image_file= caf["visium_fullres_image_file"][nn]
+ # tissue position
+ if "visium_tissue_positions_file" in caf.columns:
+ if pd.notna(caf["visium_tissue_positions_file"][nn]):
+ visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn]
+ # scalefactor
+ if "visium_scalefactors_file" in caf.columns:
+ if pd.notna(caf["visium_scalefactors_file"][nn]):
+ visium_scalefactors_file= caf["visium_scalefactors_file"][nn]
else:
spatial_path= None
spatial_filetype = None
- spatial_counts = None
- spatial_metadata = None
- spatial_transformation = None
+ visium_feature_bc_matrix = None
+ visium_fullres_image_file = None
+ visium_tissue_positions_file = None
+ visium_scalefactors_file = None
+ vpt_cell_by_gene = None
+ vpt_cell_metadata = None
+ vpt_cell_boundaries = None
if 'barcode_mtd_path' in caf.columns:
cell_mtd_path = caf['barcode_mtd_path'][nn] #not yielding this right now!
@@ -194,12 +226,14 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True):
# create the output file
outfile = "./tmp/" + caf['sample_id'][nn]
if load_raw:
- outfile = outfile + "_raw.h5mu"
+ outfile = outfile + "_raw.zarr"
else:
- outfile = outfile + ".h5mu"
+ outfile = outfile + ".zarr"
sample_id = caf['sample_id'][nn]
- yield spatial_path, outfile, \
- sample_id, spatial_filetype, spatial_counts, spatial_metadata, spatial_transformation
+
+ yield spatial_path, outfile, sample_id, spatial_filetype, \
+ visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file, \
+ vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries
def read_anndata(
diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial.py b/panpipes/panpipes/pipeline_deconvolution_spatial.py
index 41e60dde..eb970434 100644
--- a/panpipes/panpipes/pipeline_deconvolution_spatial.py
+++ b/panpipes/panpipes/pipeline_deconvolution_spatial.py
@@ -30,13 +30,12 @@ def get_logger():
def gen_filter_jobs():
- input_paths_spatial=glob.glob(os.path.join(PARAMS["input_spatial"],"*.h5mu"))
+ input_paths_spatial=glob.glob(os.path.join(PARAMS["input_spatial"],"*.zarr"))
input_singlecell = PARAMS["input_singlecell"]
for input_spatial in input_paths_spatial:
sample_prefix = os.path.basename(input_spatial)
- sample_prefix = sample_prefix.replace(".h5mu","")
- outfile_spatial = "cell2location.output/" + sample_prefix + "/Cell2Loc_spatial_output.h5mu"
- yield input_spatial, outfile_spatial, sample_prefix, input_singlecell
+ sample_prefix = sample_prefix.replace(".zarr","")
+ yield input_spatial, sample_prefix, input_singlecell
@mkdir("logs")
@@ -45,7 +44,7 @@ def gen_filter_jobs():
@mkdir("figures/Cell2Location")
@mkdir("cell2location.output")
@files(gen_filter_jobs)
-def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singlecell):
+def run_cell2location(input_spatial, sample_prefix, input_singlecell):
figdir = "./figures/Cell2Location/" + sample_prefix
output_dir = "./cell2location.output/" + sample_prefix
@@ -103,6 +102,8 @@ def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singl
if PARAMS['Cell2Location_save_models'] is not None:
cmd += " --save_models %(Cell2Location_save_models)s"
+ if PARAMS['Cell2Location_export_gene_by_spot'] is not None:
+ cmd += " --export_gene_by_spot %(Cell2Location_export_gene_by_spot)s"
cmd += " > logs/%(log_file)s "
job_kwargs["job_threads"] = PARAMS['resources_threads_low']
@@ -116,7 +117,7 @@ def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singl
@mkdir("figures/Tangram")
@mkdir("tangram.output")
@files(gen_filter_jobs)
-def run_tangram(input_spatial, outfile_spatial, sample_prefix, input_singlecell):
+def run_tangram(input_spatial, sample_prefix, input_singlecell):
figdir = "./figures/Tangram/" + sample_prefix
output_dir = "./tangram.output/" + sample_prefix
diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml b/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml
index 66d524f0..b9d6d430 100644
--- a/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml
+++ b/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml
@@ -89,6 +89,7 @@ Cell2Location:
# -------------------------------
save_models: False # Default False; whether to save the reference and spatial mapping models
+ export_gene_by_spot: False # Default False; whether to save a gene by spot matrix for each cell type in a layer
diff --git a/panpipes/panpipes/pipeline_preprocess_spatial.py b/panpipes/panpipes/pipeline_preprocess_spatial.py
index b4bf3f71..dbed8c9c 100644
--- a/panpipes/panpipes/pipeline_preprocess_spatial.py
+++ b/panpipes/panpipes/pipeline_preprocess_spatial.py
@@ -39,7 +39,7 @@ def gen_filter_jobs():
input_dir = "../qc.data"
if not os.path.exists(input_dir):
sys.exit("can't find input data")
- input_paths=glob.glob(os.path.join(input_dir,"*unfilt.h5mu"))
+ input_paths=glob.glob(os.path.join(input_dir,"*unfilt.zarr"))
for infile_path in input_paths:
file_name = os.path.basename(infile_path)
outfile = file_name.replace("unfilt","filtered")
@@ -52,24 +52,24 @@ def gen_filter_jobs():
@mkdir("tables")
@mkdir("filtered.data")
@files(gen_filter_jobs)
-def filter_mudata(infile_path,outfile):
+def filter_spatialdata(infile_path,outfile):
print('processing file = %s' % str(infile_path))
log_file = os.path.basename(outfile)
- log_file= "1_filtering."+log_file.replace("filtered.h5mu","") + ".log"
+ log_file= "1_filtering."+log_file.replace("filtered.zarr","") + ".log"
filter_dict = dictionary_stripper(PARAMS['filtering'])
cmd = """
python %(py_path)s/run_filter_spatial.py
- --input_mudata %(infile_path)s
- --output_mudata %(outfile)s
+ --input_spatialdata %(infile_path)s
+ --output_spatialdata %(outfile)s
--filter_dict "%(filter_dict)s"
"""
if PARAMS['filtering_keep_barcodes'] is not None:
cmd += " --keep_barcodes %(filtering_keep_barcodes)s"
cmd += " > logs/%(log_file)s "
job_kwargs["job_threads"] = PARAMS['resources_threads_low']
- log_msg = f"TASK: 'filter_mudata'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/{log_file}' FOR MORE INFORMATION."
+ log_msg = f"TASK: 'filter_spatialdata'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/{log_file}' FOR MORE INFORMATION."
get_logger().info(log_msg)
P.run(cmd, **job_kwargs)
@@ -84,8 +84,8 @@ def run_plotqc_query(pqc_dict):
@active_if(run_plotqc_query(PARAMS['plotqc']))
@active_if(PARAMS['filtering_run'])
-@transform(filter_mudata,
- regex("./filtered.data/(.*)_filtered.h5(.*)"),
+@transform(filter_spatialdata,
+ regex("./filtered.data/(.*)_filtered.zarr"),
r"./logs/2_postfilterplot.\1.log")
def postfilterplot_spatial(filt_file,log_file):
print(filt_file)
@@ -93,7 +93,7 @@ def postfilterplot_spatial(filt_file,log_file):
spatial_filetype = PARAMS["assay"]
cmd = """
python %(py_path)s/plot_qc_spatial.py
- --input_mudata %(filt_file)s
+ --input_spatialdata %(filt_file)s
--spatial_filetype %(spatial_filetype)s
--figdir ./figures/spatial
"""
@@ -108,8 +108,8 @@ def postfilterplot_spatial(filt_file,log_file):
P.run(cmd, **job_kwargs)
-@transform(filter_mudata,
- regex("./filtered.data/(.*)_filtered.h5(.*)"),
+@transform(filter_spatialdata,
+ regex("./filtered.data/(.*)_filtered.zarr"),
r"./logs/3_preprocess.\1.log")
def spatial_preprocess(filt_file,log_file):
if os.path.exists("figures/spatial") is False:
@@ -119,8 +119,8 @@ def spatial_preprocess(filt_file,log_file):
write_output = os.path.join("./tmp/",os.path.basename(filt_file))
cmd = """
python %(py_path)s/run_preprocess_spatial.py
- --input_mudata %(filt_file)s
- --output_mudata %(write_output)s
+ --input_spatialdata %(filt_file)s
+ --output_spatialdata %(write_output)s
--figdir ./figures/spatial
"""
if PARAMS['spatial_norm_hvg_flavour'] is not None:
@@ -154,7 +154,7 @@ def spatial_preprocess(filt_file,log_file):
get_logger().info(log_msg)
P.run(cmd, **job_kwargs)
-@follows(filter_mudata, postfilterplot_spatial, spatial_preprocess)
+@follows(filter_spatialdata, postfilterplot_spatial, spatial_preprocess)
@originate("cleanup_done.txt")
def cleanup(file):
# remove any ctmp fails
diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py
index 538cf67c..6342e8f7 100644
--- a/panpipes/panpipes/pipeline_qc_spatial.py
+++ b/panpipes/panpipes/pipeline_qc_spatial.py
@@ -56,7 +56,7 @@ def set_up_dirs(log_file):
pass
# -----------------------------------------------------------------------------------------------
-## Creating h5mu from filtered data files
+## Creating spatialData from filtered data files
# -----------------------------------------------------------------------------------------------
@@ -73,12 +73,9 @@ def gen_load_spatial_anndata_jobs():
@follows(mkdir("logs"))
@follows(mkdir("tmp"))
@files(gen_load_spatial_anndata_jobs)
-def load_mudatas(spatial_path, outfile,
- sample_id,
- spatial_filetype,
- spatial_counts,
- spatial_metadata,
- spatial_transformation):
+def load_spatialdatas(spatial_path, outfile,
+ sample_id, spatial_filetype, visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file,
+ vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries):
path_dict = {'spatial':spatial_path}
@@ -86,45 +83,56 @@ def load_mudatas(spatial_path, outfile,
print('sample_id = %s' % str(sample_id))
print('outfile = %s' % str(outfile))
print('spatial_filetype = %s' % str(spatial_filetype))
- print('spatial_counts = %s' % str(spatial_counts))
+
+ if spatial_filetype == "visium":
+ print('visium_feature_bc_matrix = %s' % str(visium_feature_bc_matrix))
+ print('visium_fullres_image_file= %s' % str(visium_fullres_image_file))
+ print('visium_tissue_positions_file= %s' % str(visium_tissue_positions_file))
+ print('visium_scalefactors_file= %s' % str(visium_scalefactors_file))
if spatial_filetype == "vizgen":
- print('spatial_metadata = %s' % str(spatial_metadata))
- print('spatial_transformation = %s' % str(spatial_transformation))
- else:
- print("visium")
+ print('vpt_cell_by_gene = %s' % str(vpt_cell_by_gene))
+ print('vpt_cell_metadata= %s' % str(vpt_cell_metadata))
+ print('vpt_cell_boundaries= %s' % str(vpt_cell_boundaries))
modality_dict = {k:True if path_dict[k] is not None else False for k,v in {'spatial': True}.items() }
print(modality_dict)
assays[outfile] = spatial_filetype
cmd = """
- python %(py_path)s/make_mudataspatial_from_csv.py
+ python %(py_path)s/make_spatialData_from_csv.py
--mode_dictionary "%(modality_dict)s"
--sample_id %(sample_id)s
--output_file %(outfile)s
--spatial_filetype %(spatial_filetype)s
--spatial_infile %(spatial_path)s
- --spatial_counts %(spatial_counts)s
"""
+ if spatial_filetype == "visium":
+ cmd += """
+ --visium_feature_bc_matrix %(visium_feature_bc_matrix)s
+ --scalefactors_file %(visium_scalefactors_file)s
+ --fullres_image_file %(visium_fullres_image_file)s
+ --tissue_positions_file %(visium_tissue_positions_file)s
+ """
if spatial_filetype == "vizgen":
cmd += """
- --spatial_metadata %(spatial_metadata)s
- --spatial_transformation %(spatial_transformation)s
+ --vpt_cell_by_gene %(vpt_cell_by_gene)s
+ --vpt_cell_metadata %(vpt_cell_metadata)s
+ --vpt_cell_boundaries %(vpt_cell_boundaries)s
"""
- cmd += " > logs/1_make_mudatas_%(sample_id)s.log"
+ cmd += " > logs/1_make_spatialdatas_%(sample_id)s.log"
job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
- log_msg = f"TASK: 'load_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/1_make_mudatas_{sample_id}.log' FOR MORE INFORMATION."
+ log_msg = f"TASK: 'load_spatialdatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/1_make_spatialdatas_{sample_id}.log' FOR MORE INFORMATION."
get_logger().info(log_msg)
P.run(cmd, **job_kwargs)
-@follows(load_mudatas)
+@follows(load_spatialdatas)
@follows(mkdir("qc.data"))
@follows(mkdir("./figures"))
-@transform(load_mudatas,
- regex("./tmp/(.*)_raw.h5(.*)"),
+@transform(load_spatialdatas,
+ regex("./tmp/(.*)_raw.zarr"),
r"./logs/2_spatialQC_\1.log")
def spatialQC(infile,log_file):
spatial_filetype = assays[infile]
@@ -171,8 +179,8 @@ def run_plotqc_query(pqc_dict):
@follows(spatialQC)
@follows(mkdir("./figures/spatial"))
@active_if(run_plotqc_query(PARAMS['plotqc']))
-@transform(load_mudatas,
- regex("./tmp/(.*)_raw.h5(.*)"),
+@transform(load_spatialdatas,
+ regex("./tmp/(.*)_raw.zarr"),
r"./logs/3_qcplot.\1.log")
def plotQC_spatial(unfilt_file,log_file):
spatial_filetype = assays[unfilt_file]
@@ -180,7 +188,7 @@ def plotQC_spatial(unfilt_file,log_file):
unfilt_file = unfilt_file.replace("tmp", "qc.data")
cmd = """
python %(py_path)s/plot_qc_spatial.py
- --input_mudata %(unfilt_file)s
+ --input_spatialdata %(unfilt_file)s
--spatial_filetype %(spatial_filetype)s
--figdir ./figures/spatial
"""
diff --git a/panpipes/python_scripts/collate_mdata.py b/panpipes/python_scripts/collate_mdata.py
index cf500bcc..3134accc 100644
--- a/panpipes/python_scripts/collate_mdata.py
+++ b/panpipes/python_scripts/collate_mdata.py
@@ -34,8 +34,19 @@
L.info("Running with params: %s", args)
-L.info("Reading in MuData from '%s'" % args.input_mudata)
-mdata = mu.read(args.input_mudata)
+#L.info("Reading in MuData from '%s'" % args.input_mudata)
+#mdata = mu.read(args.input_mudata)
+L.info("Reading in data from '%s'" % args.input_mudata)
+if ".zarr" in args.input_mudata:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.input_mudata)
+ mdata = sd.read_zarr(args.input_mudata)
+else:
+ L.info("Reading in MuData from '%s'" % args.input_mudata)
+ mdata = mu.read(args.input_mudata)
+
+
+
L.info("Reading in cluster information")
cf = pd.read_csv(args.clusters_files_csv)
@@ -55,46 +66,78 @@
# add in the clusters
+if isinstance(mdata, MuData):
+ L.info("Adding cluster information to MuData")
+ for i in range(cf.shape[0]):
+ cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0)
+ cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category')
+ cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]})
+
+ if cf['mod'][i] != "multimodal":
+ mdata[cf['mod'][i]].obs = mdata[cf['mod'][i]].obs.merge(cf_df, left_index=True, right_index=True)
+ else:
+ mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True)
+elif isinstance(mdata, sd.SpatialData):
+ L.info("Adding cluster information to SpatialData")
+ for i in range(cf.shape[0]):
+ cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0)
+ cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category')
+ cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]})
+ mdata["table"].obs = mdata["table"].obs.merge(cf_df, left_index=True, right_index=True)
-L.info("Adding cluster information to MuData")
-for i in range(cf.shape[0]):
- cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0)
- cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category')
- cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]})
-
- if cf['mod'][i] != "multimodal":
- mdata[cf['mod'][i]].obs = mdata[cf['mod'][i]].obs.merge(cf_df, left_index=True, right_index=True)
- else:
- mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True)
L.info("Adding UMAP coordinates to MuData")
uf = pd.read_csv(args.umap_files_csv)
-for i in range(uf.shape[0]):
- uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0)
- mod = uf['mod'][i]
- new_key = uf['new_key'][i]
- if uf['mod'][i] != "multimodal":
- if all(mdata[mod].obs_names == uf_df.index):
- mdata[mod].obsm[new_key] = uf_df.to_numpy()
+if isinstance(mdata, MuData):
+ for i in range(uf.shape[0]):
+ uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0)
+ mod = uf['mod'][i]
+ new_key = uf['new_key'][i]
+ if uf['mod'][i] != "multimodal":
+ if all(mdata[mod].obs_names == uf_df.index):
+ mdata[mod].obsm[new_key] = uf_df.to_numpy()
+ else:
+ L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
else:
- L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
- else:
- # check the observations are the same
- if set(mdata.obs_names).difference(uf_df.index) == set():
- # put the observations in the same order
- uf_df = uf_df.loc[mdata.obs_names,:]
- mdata.obsm[new_key] = uf_df.to_numpy()
+ # check the observations are the same
+ if set(mdata.obs_names).difference(uf_df.index) == set():
+ # put the observations in the same order
+ uf_df = uf_df.loc[mdata.obs_names,:]
+ mdata.obsm[new_key] = uf_df.to_numpy()
+ else:
+ L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
+elif isinstance(mdata, sd.SpatialData):
+ for i in range(uf.shape[0]):
+ uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0)
+ mod = uf['mod'][i]
+ new_key = uf['new_key'][i]
+ if uf['mod'][i] != "multimodal":
+ if all(mdata["table"].obs_names == uf_df.index):
+ mdata["table"].obsm[new_key] = uf_df.to_numpy()
+ else:
+ L.warn("Cannot integrate %s into adata as obs_names mismatch" % uf.iloc[i,:] )
else:
- L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] )
-
-
-L.info("Saving updated MuData to '%s'" % args.output_mudata)
-mdata.write(args.output_mudata)
-
-output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata)
-L.info("Saving metadata to '%s'" % output_csv)
-mdata.obs.to_csv(output_csv, sep='\t')
+ # check the observations are the same
+ if set(mdata["table"].obs_names).difference(uf_df.index) == set():
+ # put the observations in the same order
+ uf_df = uf_df.loc[mdata["table"].obs_names,:]
+ mdata["table"].obsm[new_key] = uf_df.to_numpy()
+ else:
+ L.warning("Cannot integrate %s into adata as obs_names mismatch" % uf.iloc[i,:] )
+
+if isinstance(mdata, MuData):
+ L.info("Saving updated MuData to '%s'" % args.output_mudata)
+ mdata.write(args.output_mudata)
+ output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata)
+ L.info("Saving metadata to '%s'" % output_csv)
+ mdata.obs.to_csv(output_csv, sep='\t')
+elif isinstance(mdata, sd.SpatialData):
+ L.info("Saving updated SpatialData to '%s'" % args.output_mudata)
+ mdata.write(args.output_mudata)
+ output_csv = re.sub(".zarr", "_cell_metdata.tsv", args.output_mudata)
+ L.info("Saving metadata to '%s'" % output_csv)
+ mdata.obs.to_csv(output_csv, sep='\t')
L.info("Done")
diff --git a/panpipes/python_scripts/make_mudataspatial_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py
similarity index 52%
rename from panpipes/python_scripts/make_mudataspatial_from_csv.py
rename to panpipes/python_scripts/make_spatialData_from_csv.py
index 06453411..d9d8a02e 100644
--- a/panpipes/python_scripts/make_mudataspatial_from_csv.py
+++ b/panpipes/python_scripts/make_spatialData_from_csv.py
@@ -1,21 +1,20 @@
import argparse
import yaml
# import scanpy as sc
-import pandas as pd
+#import pandas as pd
# import numpy as np
# from scipy.sparse import csr_matrix
-import muon as mu
-import warnings
-from muon._atac.tools import add_peak_annotation, locate_fragments
-import squidpy as sq
-from mudata import MuData
+#import muon as mu
+#import warnings
+#from muon._atac.tools import add_peak_annotation, locate_fragments
+#import squidpy as sq
+import spatialdata_io as sd_io
+#from mudata import MuData
import os
+from pathlib import Path
"""
-this script copies the make_adata_from_csv.py that creates
-ONE MUDATA PER SAMPLE, with in each ONE LAYER per modality
-for cell-suspension, saves them to temp.
-concatenation of the mudatas saved in tmp happens
-in the concat_anndata.py script
+This script is an adjustment of the make_adata_from_csv.py. It creates
+ONE SPATIALDATA PER SAMPLE and saves them to temp.
"""
import sys
@@ -49,13 +48,25 @@
parser.add_argument('--spatial_filetype',
default=None,
help='')
-parser.add_argument('--spatial_counts',
+parser.add_argument('--visium_feature_bc_matrix',
default=None,
help='')
-parser.add_argument('--spatial_metadata',
+parser.add_argument('--scalefactors_file',
default=None,
help='')
-parser.add_argument('--spatial_transformation',
+parser.add_argument('--fullres_image_file',
+ default=None,
+ help='')
+parser.add_argument('--tissue_positions_file',
+ default=None,
+ help='')
+parser.add_argument('--vpt_cell_by_gene',
+ default=None,
+ help='')
+parser.add_argument('--vpt_cell_metadata',
+ default=None,
+ help='')
+parser.add_argument('--vpt_cell_boundaries',
default=None,
help='')
@@ -64,21 +75,27 @@
L.info("Running with params: %s", args)
# unimodal mu (check if all the modalities)
-if isinstance(args.mode_dictionary, dict):
- mode_dictionary = args.mode_dictionary
-else:
- mode_dictionary = read_yaml(args.mode_dictionary)
+#if isinstance(args.mode_dictionary, dict):
+# mode_dictionary = args.mode_dictionary
+#else:
+# mode_dictionary = read_yaml(args.mode_dictionary)
#{'spatialT': True}
-permf = [key for key, value in mode_dictionary.items() if value == True]
+#permf = [key for key, value in mode_dictionary.items() if value == True]
all_files = {
- "spatial":[args.spatial_infile, #path, mandatory for squidpy
+ "spatial":[args.spatial_infile, #path
args.spatial_filetype, #needed for the load_adata_in function to call one of vizgen,visium
- args.spatial_counts, #name of the counts file, mandatory for squidpy
- args.spatial_metadata, #name of the metadata file, mandatory for squidpy
- args.spatial_transformation]}
+ args.visium_feature_bc_matrix, #name of the counts file, mandatory for squidpy
+ args.fullres_image_file, # visium
+ args.tissue_positions_file, #visium
+ args.scalefactors_file,
+ args.vpt_cell_by_gene,
+ args.vpt_cell_metadata,
+ args.vpt_cell_boundaries ]} # visium
+# args.spatial_metadata, #name of the metadata file, mandatory for squidpy
+# args.spatial_transformation]}
#subset to the modalities we want from permf (in this case only spatial)
-all_files = {nm: x for (nm, x) in all_files.items() if nm in permf}
+#all_files = {nm: x for (nm, x) in all_files.items() if nm in permf}
#[check_filetype(x[0], x[1]) for x in all_files.values()]
# read the spatial data with one of the functions inside
@@ -117,24 +134,32 @@ def check_dir_transform(infile_path, transform_file):
if args.spatial_filetype=="vizgen":
L.info("Reading in Vizgen data with squidpy.read.vizgen() into AnnData from directory " + args.spatial_infile)
- adata = sq.read.vizgen(path = args.spatial_infile, #path, mandatory for squidpy
- counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy
- meta_file = args.spatial_metadata, #name of the metadata file, mandatory for squidpy
- transformation_file=args.spatial_transformation,
- library_id = str(args.sample_id)) #this also has kwargs for read_10x_h5 but keep simple
- adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns = adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns.astype(str)
+ # check that all vpt parameters are not None
+ if "None" not in (args.vpt_cell_by_gene, args.vpt_cell_metadata, args.vpt_cell_boundaries):
+ vpt_outputs = {'cell_by_gene': Path(args.vpt_cell_by_gene) ,
+ 'cell_metadata': Path(args.vpt_cell_metadata) ,
+ 'cell_boundaries': Path(args.vpt_cell_boundaries)}
+ sdata = sd_io.merscope(path = args.spatial_infile, vpt_outputs=vpt_outputs)
+ else:
+ sdata = sd_io.merscope(path = args.spatial_infile)
+
elif args.spatial_filetype =="visium":
L.info("Reading in Visium data with squidpy.read.visium() into AnnData from directory " + args.spatial_infile)
- adata = sq.read.visium(path = args.spatial_infile, #path, mandatory for squidpy
- counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy
- library_id = str(args.sample_id)
- ) #this also has kwargs for read_10x_h5 but keep simple
+ sdata = sd_io.visium(path=args.spatial_infile,
+ dataset_id=str(args.sample_id),
+ counts_file=args.visium_feature_bc_matrix,
+ fullres_image_file=args.fullres_image_file,
+ tissue_positions_file=args.tissue_positions_file,
+ scalefactors_file=args.scalefactors_file)
+
+elif args.spatial_filetype =="xenium":
+ sdata = sd_io.xenium(path = args.spatial_infile)
-L.info("Resulting AnnData is:")
-L.info(adata)
-L.info("Creating MuData with .mod['spatial']")
+L.info("Resulting SpatialData is:")
+L.info(sdata)
+#L.info("Creating MuData with .mod['spatial']")
-mdata = MuData({"spatial": adata})
+#mdata = MuData({"spatial": adata})
#---------------
@@ -143,25 +168,25 @@ def check_dir_transform(infile_path, transform_file):
L.info("Making var names unique")
#make var names unique
-for mm in mdata.mod.keys():
- mdata[mm].var_names_make_unique()
+#for mm in mdata.mod.keys():
+sdata["table"].var_names_make_unique()
L.info("Adding sample_id '%s'to MuData.obs and MuData.mod['spatial'].obs" % args.sample_id)
-mdata.obs['sample_id'] = str(args.sample_id)
+sdata["table"].obs['sample_id'] = str(args.sample_id)
# copy the sample_id to each modality
-for mm in mdata.mod.keys():
+#for mm in mdata.mod.keys():
# mdata[mm].obs['sample_id'] = mdata.obs['sample_id']
- mdata[mm].obs['sample_id'] = mdata.obs.loc[mdata[mm].obs_names,:]['sample_id']
+sdata["table"].obs['sample_id'] = sdata["table"].obs.loc[sdata["table"].obs_names,:]['sample_id']
-mdata.update()
+#mdata.update()
-L.info("Resulting MuData is:")
-L.info(mdata)
+L.info("Resulting SpatialData is:")
+L.info(sdata)
-L.info("Saving MuData to '%s'" % args.output_file)
-L.debug(mdata)
-mdata.write(args.output_file)
+L.info("Saving SpatialData to '%s'" % args.output_file)
+L.debug(sdata)
+sdata.write(args.output_file)
L.info("Done")
diff --git a/panpipes/python_scripts/plot_cluster_umaps.py b/panpipes/python_scripts/plot_cluster_umaps.py
index 39e73b19..18b804c3 100644
--- a/panpipes/python_scripts/plot_cluster_umaps.py
+++ b/panpipes/python_scripts/plot_cluster_umaps.py
@@ -90,9 +90,13 @@ def plot_spatial(adata,figdir):
fig.savefig(os.path.join(figdir, ok + "_clusters.png"))
-
-L.info("Reading in MuData from '%s'" % args.infile)
-mdata = read(args.infile)
+if ".zarr" in args.infile:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.infile)
+ data = sd.read_zarr(args.infile)
+else:
+ L.info("Reading in MuData from '%s'" % args.infile)
+ data = read(args.infile)
mods = args.modalities.split(',')
# detemin initial figure directory based on object type
@@ -102,21 +106,27 @@ def plot_spatial(adata,figdir):
if os.path.exists("multimodal/figures") is False:
os.makedirs("multimodal/figures")
L.info("Plotting multimodal figures")
- main(mdata, figdir="multimodal/figures")
+ main(data, figdir="multimodal/figures")
# we also need to plot per modality
-if type(mdata) is MuData:
- for mod in mdata.mod.keys():
+if type(data) is MuData:
+ for mod in data.mod.keys():
if mod in mods:
L.info("Plotting for modality: %s" % mod)
figdir = os.path.join(mod, "figures")
if os.path.exists(figdir) is False:
os.makedirs(figdir)
if mod == "spatial": # added separate function for spatial
- plot_spatial(mdata[mod], figdir)
+ plot_spatial(data[mod], figdir)
else:
- main(mdata[mod], figdir)
+ main(data[mod], figdir)
+elif isinstance(data, sd.SpatialData):
+ L.info("Plotting for modality: spatial")
+ figdir = os.path.join("spatial", "figures")
+ if os.path.exists(figdir) is False:
+ os.makedirs(figdir)
+ plot_spatial(data["table"], figdir)
diff --git a/panpipes/python_scripts/plot_qc_spatial.py b/panpipes/python_scripts/plot_qc_spatial.py
index 6434b3b4..205bf74a 100644
--- a/panpipes/python_scripts/plot_qc_spatial.py
+++ b/panpipes/python_scripts/plot_qc_spatial.py
@@ -13,6 +13,7 @@
import sys
import logging
import re
+import spatialdata as sd
L = logging.getLogger()
L.setLevel(logging.INFO)
log_handler = logging.StreamHandler(sys.stdout)
@@ -27,8 +28,8 @@
parser = argparse.ArgumentParser()
-parser.add_argument("--input_mudata",
- default="mudata_unfilt.h5mu",
+parser.add_argument("--input_spatialdata",
+ default="spatialdata_unfilt.h5mu",
help="")
parser.add_argument("--figdir",
default="./figures/",
@@ -57,15 +58,16 @@
sc.settings.figdir = figdir
sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5))
-L.info("Reading in MuData from '%s'" % args.input_mudata)
-mdata = mu.read(args.input_mudata)
-spatial = mdata.mod['spatial']
+L.info("Reading in SpatialData from '%s'" % args.input_spatialdata)
+sdata = sd.read_zarr(args.input_spatialdata)
+#mdata = mu.read(args.input_spatialdata)
+#spatial = mdata.mod['spatial']
-input_data = os.path.basename(args.input_mudata)
-pattern = r"_filtered.h5(.*)"
+input_data = os.path.basename(args.input_spatialdata)
+pattern = r"_filtered.zarr"
match = re.search(pattern, input_data)
if match is None:
- match = re.search(r"_unfilt.h5(.*)", input_data)
+ match = re.search(r"_unfilt.zarr", input_data)
sprefix = input_data[:match.start()]
# convert string to list of strings
@@ -74,15 +76,16 @@
# check if metrics in adata.obs or adata.var
-qc_metrics = [metric if metric in spatial.obs.columns or metric in spatial.var.columns else L.warning("Variable '%s' not found in adata.var or adata.obs, will not be plotted" % metric) for metric in qc_metrics]
+qc_metrics = [metric if metric in
+ sdata["table"].obs.columns or metric in sdata["table"].var.columns else L.warning("Variable '%s' not found in adata.var or adata.obs, will not be plotted" % metric) for metric in qc_metrics]
qc_metrics = [metric for metric in qc_metrics if metric is not None]
# check that group_vars are in adata.obs
-group_var = [group if group in spatial.obs.columns else L.warning("group_var '%s' not found in adata.obs, will be ignored" % group) for group in group_var]
+group_var = [group if group in sdata["table"].obs.columns else L.warning("group_var '%s' not found in adata.obs, will be ignored" % group) for group in group_var]
group_var = [group for group in group_var if group is not None]
# make sure that it's saved as categorical
for group in group_var:
- spatial.obs[group] = spatial.obs[group].astype("category")
+ sdata["table"].obs[group] = sdata["table"].obs[group].astype("category")
if group_var == []:
group_var = None
@@ -93,34 +96,34 @@
for metric in qc_metrics:
# check if in adata.obs:
- if metric in spatial.obs.columns:
+ if metric in sdata["table"].obs.columns:
# check that it's a numeric column, so that it can be plotted:
- if metric not in spatial.obs._get_numeric_data().columns:
+ if metric not in sdata["table"].obs._get_numeric_data().columns:
L.warning("Variable '%s' not numerical in adata.obs, will not be plotted" % metric)
else:
L.info("Creating violin plot for '%s' of .obs" % metric)
if group_var is None:
- sc.pl.violin(spatial, keys = metric, xlabel = metric+ " in .obs",
+ sc.pl.violin(sdata["table"], keys = metric, xlabel = metric+ " in .obs",
save = "_obs_" + metric+ "_" + "."+sprefix + ".png", show = False)
else: #plot violin for each group
for group in group_var:
- sc.pl.violin(spatial, keys = metric,groupby = group, xlabel = group + ", "+ metric+ " in .obs",
+ sc.pl.violin(sdata["table"], keys = metric,groupby = group, xlabel = group + ", "+ metric+ " in .obs",
save = "_obs_" + metric+ "_" + group+ "."+sprefix +".png", show = False)
#plot spatial
L.info("Creating spatial embedding plot for '%s' of .obs" % metric)
- sc.pl.embedding(spatial,basis="spatial", color = metric, save = "_spatial_" + metric + "."+sprefix +".png", show = False)
+ sc.pl.embedding(sdata["table"],basis="spatial", color = metric, save = "_spatial_" + metric + "."+sprefix +".png", show = False)
#check if in adata.var:
- if metric in spatial.var.columns:
+ if metric in sdata["table"].var.columns:
- if metric not in spatial.var._get_numeric_data().columns:
+ if metric not in sdata["table"].var._get_numeric_data().columns:
L.warning("Variable '%s' not numerical in adata.var, will not be plotted" % metric)
else:
# plot violins
L.info("Creating violin plot for '%s' of .var" % metric)
ax = sns.violinplot(
- data=spatial.var[[metric]],
+ data=sdata["table"].var[[metric]],
orient='vertical',
)
ax.set(xlabel=metric+ " in .var" )
@@ -135,28 +138,28 @@
axs[0].set_title("Total transcripts per cell")
sns.histplot(
- spatial.obs["total_counts"],
+ sdata["table"].obs["total_counts"],
kde=False,
ax=axs[0],
)
axs[1].set_title("Unique transcripts per cell")
sns.histplot(
- spatial.obs["n_genes_by_counts"],
+ sdata["table"].obs["n_genes_by_counts"],
kde=False,
ax=axs[1],
)
axs[2].set_title("Transcripts per FOV")
sns.histplot(
- spatial.obs.groupby('fov')[['total_counts']].sum(),
+ sdata["table"].obs.groupby('fov')[['total_counts']].sum(),
kde=False,
ax=axs[2],
)
axs[3].set_title("Volume of segmented cells")
sns.histplot(
- spatial.obs["volume"],
+ sdata["table"].obs["volume"],
kde=False,
ax=axs[3],
)
diff --git a/panpipes/python_scripts/plot_scanpy_markers.py b/panpipes/python_scripts/plot_scanpy_markers.py
index 09fd4455..c7073b6c 100644
--- a/panpipes/python_scripts/plot_scanpy_markers.py
+++ b/panpipes/python_scripts/plot_scanpy_markers.py
@@ -115,17 +115,23 @@ def do_plots(adata, mod, group_col, mf, n=10, layer=None):
# read data
-L.info("Reading in MuData from '%s'" % args.infile)
-mdata = mu.read(args.infile)
-
-if type(mdata) is AnnData:
- adata = mdata
- # main function only does rank_gene_groups on X, so
-elif type(mdata) is mu.MuData and args.modality is not None:
- adata = mdata[args.modality]
-else:
- L.error("If the input is a MuData object, a modality needs to be specified")
- sys.exit('If the input is a MuData object, a modality needs to be specified')
+if args.modality != "spatial":
+ L.info("Reading in MuData from '%s'" % args.infile)
+ mdata = mu.read(args.infile)
+
+ if type(mdata) is AnnData:
+ adata = mdata
+ # main function only does rank_gene_groups on X, so
+ elif type(mdata) is mu.MuData and args.modality is not None:
+ adata = mdata[args.modality]
+ else:
+ L.error("If the input is a MuData object, a modality needs to be specified")
+ sys.exit('If the input is a MuData object, a modality needs to be specified')
+else:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.infile)
+ adata = sd.read_zarr(args.infile)["table"]
+
L.info("Loading marker information from '%s'" % args.marker_file)
mf = pd.read_csv(args.marker_file, sep='\t' )
diff --git a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py
index ad675080..ba1af40a 100644
--- a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py
+++ b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py
@@ -4,6 +4,7 @@
import logging
import scanpy as sc
from muon import MuData, read
+
from panpipes.funcs.scmethods import run_neighbors_method_choice
from panpipes.funcs.io import read_yaml
from panpipes.funcs.scmethods import lsi
@@ -37,53 +38,80 @@
sc.settings.n_jobs = int(args.n_threads)
# read data
-L.info("Reading in MuData from '%s'" % args.infile)
-mdata = read(args.infile)
+if ".zarr" in args.infile:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.infile)
+ sdata = sd.read_zarr(args.infile)
+else:
+ L.info("Reading in MuData from '%s'" % args.infile)
+ mdata = read(args.infile)
for mod in neighbor_dict.keys():
- if mod in mdata.mod.keys():
+ if mod != "spatial":
+ if mod in mdata.mod.keys():
+ if neighbor_dict[mod]['use_existing']:
+ L.info('Using existing neighbors graph for %s' % mod)
+ pass
+ else:
+ L.info("Computing new neighbors for modality %s on %s" % (mod, neighbor_dict[mod]['dim_red']))
+ if type(mdata) is MuData:
+ adata=mdata[mod]
+ if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()):
+ L.info("X_pca not found, computing it using default parameters")
+ sc.tl.pca(adata)
+ if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None):
+ dimrem = int(neighbor_dict[mod]['dim_remove'])
+ adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:]
+ adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:]
+ if mod == "atac":
+ if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()):
+ L.info("X_lsi not found, computing it using default parameters")
+ lsi(adata=adata, num_components=50)
+ if neighbor_dict[mod]['dim_remove'] is not None:
+ L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove'])
+ dimrem = int(neighbor_dict[mod]['dim_remove'])
+ adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:]
+ adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:]
+ adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:]
+
+ # run command
+ opts = dict(method=neighbor_dict[mod]['method'],
+ n_neighbors=int(neighbor_dict[mod]['k']),
+ n_pcs=int(neighbor_dict[mod]['n_dim_red']),
+ metric=neighbor_dict[mod]['metric'],
+ nthreads=args.n_threads,
+ use_rep=neighbor_dict[mod]['dim_red'])
+
+
+ run_neighbors_method_choice(adata,**opts)
+ mdata.mod[mod] = adata
+ mdata.update()
+ else:
if neighbor_dict[mod]['use_existing']:
L.info('Using existing neighbors graph for %s' % mod)
pass
else:
L.info("Computing new neighbors for modality %s on %s" % (mod, neighbor_dict[mod]['dim_red']))
- if type(mdata) is MuData:
- adata=mdata[mod]
- if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()):
+ if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in sdata["table"].obsm.keys()):
L.info("X_pca not found, computing it using default parameters")
- sc.tl.pca(adata)
- if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None):
- dimrem = int(neighbor_dict[mod]['dim_remove'])
- adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:]
- adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:]
- if mod == "atac":
- if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()):
- L.info("X_lsi not found, computing it using default parameters")
- lsi(adata=adata, num_components=50)
- if neighbor_dict[mod]['dim_remove'] is not None:
- L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove'])
- dimrem = int(neighbor_dict[mod]['dim_remove'])
- adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:]
- adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:]
- adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:]
-
- # run command
+ sc.tl.pca(sdata["table"])
opts = dict(method=neighbor_dict[mod]['method'],
n_neighbors=int(neighbor_dict[mod]['k']),
n_pcs=int(neighbor_dict[mod]['n_dim_red']),
metric=neighbor_dict[mod]['metric'],
nthreads=args.n_threads,
use_rep=neighbor_dict[mod]['dim_red'])
+ # run command
+ run_neighbors_method_choice(sdata["table"],**opts)
- run_neighbors_method_choice(adata,**opts)
- mdata.mod[mod] = adata
- mdata.update()
-
-
+if ".zarr" in args.infile:
+ L.info("Saving updated SpatialData to '%s'" % args.outfile)
+ sdata.write(args.outfile)
+else:
+ L.info("Saving updated MuData to '%s'" % args.outfile)
+ mdata.write(args.outfile)
-L.info("Saving updated MuData to '%s'" % args.outfile)
-mdata.write(args.outfile)
-L.info("Done")
\ No newline at end of file
+L.info("Done")
diff --git a/panpipes/python_scripts/run_cell2location.py b/panpipes/python_scripts/run_cell2location.py
index 3bd74d6c..9be32a1d 100644
--- a/panpipes/python_scripts/run_cell2location.py
+++ b/panpipes/python_scripts/run_cell2location.py
@@ -7,6 +7,7 @@
import cell2location as c2l
import scanpy as sc
import pandas as pd
+import spatialdata as sd
import muon as mu
import os
@@ -20,6 +21,7 @@
from panpipes.funcs.scmethods import cell2loc_filter_genes
+
L = logging.getLogger()
L.setLevel(logging.INFO)
log_handler = logging.StreamHandler(sys.stdout)
@@ -47,6 +49,9 @@
parser.add_argument("--save_models",
default=False,
help="whether to save the reference & spatial mapping models")
+parser.add_argument("--export_gene_by_spot",
+ default=False,
+ help="whether to save a gene by spot matrix for each cell type in a layer")
# parameters for feature selection:
@@ -146,6 +151,11 @@
save_models = False
else:
save_models = True
+
+if (args.export_gene_by_spot is False) or (args.export_gene_by_spot == "False"):
+ export_gene_by_spot = False
+else:
+ export_gene_by_spot = True
if (args.remove_mt is True) or (args.remove_mt == "True"):
remove_mt = True
@@ -197,9 +207,10 @@
#1. read in the data
#spatial:
-L.info("Reading in spatial MuData from '%s'" % args.input_spatial)
-mdata_spatial = mu.read(args.input_spatial)
-adata_st = mdata_spatial.mod['spatial']
+L.info("Reading in SpatialData from '%s'" % args.input_spatial)
+sdata_st = sd.read_zarr(args.input_spatial)
+#mdata_spatial = mu.read(args.input_spatial)
+#adata_st = mdata_spatial.mod['spatial']
#single-cell:
L.info("Reading in reference MuData from '%s'" % args.input_singlecell)
mdata_singlecell = mu.read(args.input_singlecell)
@@ -218,11 +229,11 @@
reduced_gene_set.columns = ["HVGs"]
L.info("Subsetting data on gene list")
adata_sc.var["selected_gene"] = adata_sc.var.index.isin(reduced_gene_set["HVGs"])
- adata_st.var["selected_gene"] = adata_st.var.index.isin(reduced_gene_set["HVGs"])
+ sdata_st["table"].var["selected_gene"] = sdata_st["table"].var.index.isin(reduced_gene_set["HVGs"])
adata_sc = adata_sc[:, adata_sc.var["selected_gene"]]
- adata_st = adata_st[:, adata_st.var["selected_gene"]]
+ sdata_st["table"] = sdata_st["table"][:, sdata_st["table"].var["selected_gene"]]
# check whether all genes are present in both, spatial & reference
- if set(adata_st.var.index) != set(adata_sc.var.index):
+ if set(sdata_st["table"].var.index) != set(adata_sc.var.index):
L.error(
"Not all genes of the gene list %s are present in the reference as well as in the ST data. Please provide a gene list where all genes are present in both, reference and ST.", args.gene_list)
sys.exit(
@@ -231,14 +242,14 @@
else: # perform feature selection according to cell2loc
if remove_mt is True:
L.info("Removing MT genes")
- adata_st.var["MT_gene"] = [gene.startswith("MT-") for gene in adata_st.var.index]
- adata_st.obsm["MT"] = adata_st[:, adata_st.var["MT_gene"].values].X.toarray()
- adata_st = adata_st[:, ~adata_st.var["MT_gene"].values]
+ sdata_st["table"].var["MT_gene"] = [gene.startswith("MT-") for gene in sdata_st["table"].var.index]
+ sdata_st["table"].obsm["MT"] = sdata_st["table"][:, sdata_st["table"].var["MT_gene"].values].X.toarray()
+ sdata_st["table"] = sdata_st["table"][:, ~sdata_st["table"].var["MT_gene"].values]
# intersect vars of reference and spatial
L.info("Intersecting vars of reference and spatial ")
- shared_features = [feature for feature in adata_st.var_names if feature in adata_sc.var_names]
+ shared_features = [feature for feature in sdata_st["table"].var_names if feature in adata_sc.var_names]
adata_sc = adata_sc[:, shared_features]
- adata_st = adata_st[:, shared_features]
+ sdata_st["table"] = sdata_st["table"][:, shared_features]
# select features
L.info("Selecting features using 'cell2location.utils.filtering.filter_genes() function'")
selected = cell2loc_filter_genes(adata_sc, figdir + "/gene_filter.png", cell_count_cutoff=float(args.cell_count_cutoff),
@@ -246,7 +257,7 @@
nonz_mean_cutoff=float(args.nonz_mean_cutoff))
L.info("Subsetting data on selected features")
adata_sc = adata_sc[:, selected]
- adata_st = adata_st[:, selected]
+ sdata_st["table"] = sdata_st["table"][:, selected]
@@ -280,7 +291,7 @@
L.info("Plotting QC plots")
cell2loc_plot_QC_reference(model_ref, figdir + "/QC_reference_reconstruction_accuracy.png", figdir + "/QC_reference_expression signatures_vs_avg_expression.png")
-# save model and update mudata
+# save model
if adata_sc.var.index.names[0] in adata_sc.var.columns:
adata_sc.var.index.names = [None]
mdata_singlecell.mod["rna"] = adata_sc
@@ -293,7 +304,7 @@
# 4. Fit mapping model
L.info("Setting up AnnData for the spatial model")
-c2l.models.Cell2location.setup_anndata(adata=adata_st,
+c2l.models.Cell2location.setup_anndata(adata=sdata_st["table"],
labels_key = args.labels_key_st,
layer= args.layer_st,
batch_key= args.batch_key_st,
@@ -301,7 +312,7 @@
continuous_covariate_keys = continuous_covariate_keys_st)
-model_spatial = c2l.models.Cell2location(adata = adata_st, cell_state_df=inf_aver,
+model_spatial = c2l.models.Cell2location(adata = sdata_st["table"], cell_state_df=inf_aver,
N_cells_per_location=float(args.N_cells_per_location),
detection_alpha=float(args.detection_alpha))
L.info("Training the spatial model")
@@ -312,32 +323,40 @@
cell2loc_plot_history(model_spatial, figdir + "/ELBO_spatial_model.png")
#extract posterior
L.info("Extracting the posterior of the spatial model")
-adata_st = model_spatial.export_posterior(adata_st)
+sdata_st["table"] = model_spatial.export_posterior(sdata_st["table"])
#plot QC
L.info("Plotting QC plots")
cell2loc_plot_QC_reconstr(model_spatial, figdir + "/QC_spatial_reconstruction_accuracy.png")
+# export a gene by spot matrix for each cell type
+if export_gene_by_spot:
+ # Compute expected expression per cell type
+ expected_dict = model_spatial.module.model.compute_expected_per_cell_type(model_spatial.samples["post_sample_q05"], model_spatial.adata_manager)
+ # Add to anndata layers
+ for i, n in enumerate(model_spatial.factor_names_):
+ sdata_st["table"].layers[n] = expected_dict['mu'][i]
+
#plot output
L.info("Plotting spatial embedding plot coloured by 'q05_cell_abundance_w_sf'")
-adata_st.obs[adata_st.uns["mod"]["factor_names"]] = adata_st.obsm["q05_cell_abundance_w_sf"]
-sc.pl.spatial(adata_st,color=adata_st.uns["mod"]["factor_names"], show = False, save = "_Cell2Loc_q05_cell_abundance_w_sf.png")
+sdata_st["table"].obs[sdata_st["table"].uns["mod"]["factor_names"]] = sdata_st["table"].obsm["q05_cell_abundance_w_sf"]
+sc.pl.spatial(sdata_st["table"],color=sdata_st["table"].uns["mod"]["factor_names"], show = False, save = "_Cell2Loc_q05_cell_abundance_w_sf.png")
-# save model and update mudata
-if adata_st.var.index.names[0] in adata_st.var.columns:
- adata_st.var.index.names = [None]
-mdata_spatial.mod["spatial"] = adata_st
-mdata_spatial.update()
+# save model
+if sdata_st["table"].var.index.names[0] in sdata_st["table"].var.columns:
+ sdata_st["table"].var.index.names = [None]
+#mdata_spatial.mod["spatial"] = adata_st
+#mdata_spatial.update()
if save_models is True:
L.info("Saving spatial model to '%s'" % output_dir)
model_spatial.save(output_dir+"/Spatial_mapping_model", overwrite=True)
#6. save mudatas
-L.info("Saving MuDatas to '%s'" % output_dir)
+L.info("Saving SpatialDatas to '%s'" % output_dir)
mdata_singlecell.write(output_dir+"/Cell2Loc_screference_output.h5mu")
-mdata_spatial.write(output_dir+"/Cell2Loc_spatial_output.h5mu")
+sdata_st.write(output_dir+"/Cell2Loc_spatial_output.zarr")
L.info("Done")
diff --git a/panpipes/python_scripts/run_clustering.py b/panpipes/python_scripts/run_clustering.py
index fcd2e5c5..ee183c90 100644
--- a/panpipes/python_scripts/run_clustering.py
+++ b/panpipes/python_scripts/run_clustering.py
@@ -34,13 +34,20 @@
# read data
L.info("Reading in data from '%s'" % args.infile)
-mdata = mu.read(args.infile)
-if type(mdata) is AnnData:
- adata = mdata
-elif args.modality is not None:
- adata = mdata[args.modality]
-else:
- adata = mdata
+if ".zarr" in args.infile:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.infile)
+ sdata = sd.read_zarr(args.infile)
+ adata = sdata["table"]
+else:
+ mdata = mu.read(args.infile)
+ if type(mdata) is AnnData:
+ adata = mdata
+ elif args.modality is not None:
+ adata = mdata[args.modality]
+ else:
+ adata = mdata
+
uns_key=args.neighbors_key
# check sc.pp.neihgbours has been run
diff --git a/panpipes/python_scripts/run_filter_spatial.py b/panpipes/python_scripts/run_filter_spatial.py
index 980f7070..c3a05c91 100644
--- a/panpipes/python_scripts/run_filter_spatial.py
+++ b/panpipes/python_scripts/run_filter_spatial.py
@@ -4,6 +4,7 @@
import re
import muon as mu
from anndata import AnnData
+import spatialdata as sd
import os
# import scpipelines.funcs as scp
from panpipes.funcs.processing import intersect_obs_by_mod, remove_unused_categories
@@ -41,10 +42,10 @@ def test_matching_df_ignore_cat(new_df, old_df):
# parse arguments
parser = argparse.ArgumentParser()
-parser.add_argument('--input_mudata',
+parser.add_argument('--input_spatialdata',
default='gut_minus1_amp.h5ad',
help='')
-parser.add_argument('--output_mudata',
+parser.add_argument('--output_spatialdata',
default='',
help='')
parser.add_argument('--filter_dict',
@@ -52,7 +53,7 @@ def test_matching_df_ignore_cat(new_df, old_df):
help='this is pull')
# cross modalities args
parser.add_argument('--keep_barcodes', default=None,
- help='1 column list of barcodes to keep, note that they should match the mudata input, this filtering happens first')
+ help='1 column list of barcodes to keep, note that they should match the spatialdata input, this filtering happens first')
# load options
@@ -72,93 +73,89 @@ def test_matching_df_ignore_cat(new_df, old_df):
filter_dict = dictionary_stripper(filter_dict)
L.info("Filter dictionary:\n %s" %filter_dict)
-# load mudata
+# load spatialdata
-L.info("Reading in MuData from '%s'" % args.input_mudata)
+L.info("Reading in SpatialData from '%s'" % args.input_spatialdata)
+sdata = sd.read_zarr(args.input_spatialdata)
+#mdata = mu.read(args.input_spatialdata)
-mdata = mu.read(args.input_mudata)
+#if isinstance(mdata, AnnData):
+# raise TypeError("Input '%s' should be of spatialdata format, not Anndata" % args.input_spatialdata)
-if isinstance(mdata, AnnData):
- raise TypeError("Input '%s' should be of MuData format, not Anndata" % args.input_mudata)
+orig_obs = sdata["table"].obs.copy()
-orig_obs = mdata.obs.copy()
-
-L.info("Before filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features")
+L.info("Before filtering: "+ str(sdata["table"].n_obs) + " cells and " + str(sdata["table"].n_vars) + " features")
# filter based on provided barcodes -----
if args.keep_barcodes is not None:
- L.info("Filtering MuData by keep_barcodes file")
+ L.info("Filtering SpatialData by keep_barcodes file")
keep_bc = pd.read_csv(args.keep_barcodes,header=None)
- mdata = mdata[mdata.obs_names.isin(keep_bc[0]),:].copy()
- remove_unused_categories(mdata.obs)
- mdata.update()
- L.info("Remaining cells: %d" % mdata.n_obs)
+ sdata["table"] = sdata["table"][sdata["table"].obs_names.isin(keep_bc[0]),:].copy()
+ remove_unused_categories(sdata["table"].obs)
+ #mdata.update()
+ L.info("Remaining cells: %d" % sdata["table"].n_obs)
# filter more than
if filter_dict['run']:
- # this will go through the modalities one at a time,
- # then the categories max, min and bool
- for mod in mdata.mod.keys():
- L.info(mod)
- if mod in filter_dict.keys():
- for marg in filter_dict[mod].keys():
- if marg == "obs":
- if "max" in filter_dict[mod][marg].keys():
- for col, n in filter_dict[mod][marg]['max'].items():
- L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % (mod, col, n))
- mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x <= n)
- L.info("Remaining cells: %d" % mdata[mod].n_obs)
- if "min" in filter_dict[mod][marg].keys():
- for col, n in filter_dict[mod][marg]['min'].items():
- L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % (mod, col, n))
- mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x >= n)
- L.info("Remaining cells: %d" % mdata[mod].n_obs)
- if "bool" in filter_dict[mod][marg].keys():
- for col, n in filter_dict[mod][marg]['bool'].items():
- L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % (mod, col, n))
- mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x == n)
- L.info("Remaining cells: %d" % mdata[mod].n_obs)
- if marg == "var":
- if "max" in filter_dict[mod][marg].keys():
- for col, n in filter_dict[mod][marg]['max'].items():
- L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % (mod, col, n))
- mu.pp.filter_var(mdata.mod[mod], col, lambda x: x <= n)
- L.info("Remaining features: %d" % mdata[mod].n_vars)
-
- if "min" in filter_dict[mod][marg].keys():
- for col, n in filter_dict[mod][marg]['min'].items():
- L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % (mod, col, n))
- mu.pp.filter_var(mdata.mod[mod], col, lambda x: x >= n)
- L.info("Remaining features: %d" % mdata[mod].n_vars)
-
- if "bool" in filter_dict[mod][marg].keys():
- for col, n in filter_dict[mod][marg]['bool'].items():
- L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % (mod, col, n))
- mu.pp.filter_var(mdata.mod[mod], col, lambda x: x == n)
- L.info("Remaining features: %d" % mdata[mod].n_vars)
-
-
-mdata.update()
-
-L.info("After filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features")
-
-remove_unused_categories(mdata.obs)
+ for marg in filter_dict["spatial"].keys():
+ if marg == "obs":
+ if "max" in filter_dict["spatial"][marg].keys():
+ for col, n in filter_dict["spatial"][marg]['max'].items():
+ L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % ("spatial", col, n))
+ mu.pp.filter_obs(sdata["table"], col, lambda x: x <= n)
+ L.info("Remaining cells: %d" % sdata["table"].n_obs)
+ if "min" in filter_dict["spatial"][marg].keys():
+ for col, n in filter_dict["spatial"][marg]['min'].items():
+ L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % ("spatial", col, n))
+ mu.pp.filter_obs(sdata["table"], col, lambda x: x >= n)
+ L.info("Remaining cells: %d" % sdata["table"].n_obs)
+ if "bool" in filter_dict["spatial"][marg].keys():
+ for col, n in filter_dict["spatial"][marg]['bool'].items():
+ L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % ("spatial", col, n))
+ mu.pp.filter_obs(sdata["table"], col, lambda x: x == n)
+ L.info("Remaining cells: %d" % sdata["table"].n_obs)
+ if marg == "var":
+ if "max" in filter_dict["spatial"][marg].keys():
+ for col, n in filter_dict["spatial"][marg]['max'].items():
+ L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % ("spatial", col, n))
+ mu.pp.filter_var(sdata["table"], col, lambda x: x <= n)
+ L.info("Remaining features: %d" % sdata["table"].n_vars)
+
+ if "min" in filter_dict["spatial"][marg].keys():
+ for col, n in filter_dict["spatial"][marg]['min'].items():
+ L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % ("spatial", col, n))
+ mu.pp.filter_var(sdata["table"], col, lambda x: x >= n)
+ L.info("Remaining features: %d" % sdata["table"].n_vars)
+
+ if "bool" in filter_dict["spatial"][marg].keys():
+ for col, n in filter_dict["spatial"][marg]['bool'].items():
+ L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % ("spatial", col, n))
+ mu.pp.filter_var(sdata["table"], col, lambda x: x == n)
+ L.info("Remaining features: %d" % sdata["table"].n_vars)
+
+
+
+#mdata.update()
+
+L.info("After filtering: "+ str(sdata["table"].n_obs) + " cells and " + str(sdata["table"].n_vars) + " features")
+
+remove_unused_categories(sdata["table"].obs)
# run quick test before saving out.
-assert test_matching_df_ignore_cat(mdata.obs, orig_obs)
+assert test_matching_df_ignore_cat(sdata["table"].obs, orig_obs)
# write out obs
-output_prefix = re.sub(".h5mu", "", os.path.basename(args.output_mudata))
+output_prefix = re.sub(".zarr", "", os.path.basename(args.output_spatialdata))
L.info("Saving updated obs in a metadata tsv file to './tables/" + output_prefix + "_filtered_cell_metadata.tsv'")
-write_obs(mdata, output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv")
+write_obs(sdata["table"], output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv")
# write out the per sample_id cell numbers
cell_counts_dict={}
-for mm in mdata.mod.keys():
- cell_counts_dict[mm] = mdata[mm].obs.sample_id.value_counts().to_frame('n_cells')
+#for mm in mdata.mod.keys():
+cell_counts_dict["spatial"] = sdata["table"].obs.sample_id.value_counts().to_frame('n_cells')
cell_counts = pd.concat(cell_counts_dict).reset_index().rename(
columns={"level_0": "modality", "level_1": "sample_id"})
@@ -167,10 +164,10 @@ def test_matching_df_ignore_cat(new_df, old_df):
L.info("Saving cell counts in a metadata csv file to './tables/" + output_prefix + "_cell_counts.csv'")
cell_counts.to_csv("tables/" + output_prefix + "_cell_counts.csv", index=None)
-mdata.update()
+#mdata.update()
-L.info("Saving updated MuData to '%s'" % args.output_mudata)
-mdata.write(args.output_mudata)
+L.info("Saving updated SpatialData to '%s'" % args.output_spatialdata)
+sdata.write(args.output_spatialdata)
L.info("Done")
diff --git a/panpipes/python_scripts/run_find_markers_multi.py b/panpipes/python_scripts/run_find_markers_multi.py
index ba1422d7..0ae1d067 100644
--- a/panpipes/python_scripts/run_find_markers_multi.py
+++ b/panpipes/python_scripts/run_find_markers_multi.py
@@ -201,19 +201,22 @@ def main(adata,
L.info("Running with params: %s", args)
# read data
-L.info("Reading in MuData from '%s'" % args.infile)
-mdata = read(args.infile)
-
-
-if type(mdata) is AnnData:
- adata = mdata
- # main function only does rank_gene_groups on X, so
-elif type(mdata) is MuData and args.modality is not None:
- adata = mdata[args.modality]
-else:
- L.error("If the input is a MuData object, a modality needs to be specified")
- sys.exit('If the input is a MuData object, a modality needs to be specified')
-
+if args.modality != "spatial":
+ L.info("Reading in MuData from '%s'" % args.infile)
+ mdata = read(args.infile)
+ if type(mdata) is AnnData:
+ adata = mdata
+ # main function only does rank_gene_groups on X, so
+ elif type(mdata) is MuData and args.modality is not None:
+ adata = mdata[args.modality]
+ else:
+ L.error("If the input is a MuData object, a modality needs to be specified")
+ sys.exit('If the input is a MuData object, a modality needs to be specified')
+else:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.infile)
+ adata = sd.read_zarr(args.infile)["table"]
+
main(adata,
mod=args.modality,
diff --git a/panpipes/python_scripts/run_preprocess_spatial.py b/panpipes/python_scripts/run_preprocess_spatial.py
index bdf28cf6..250057ec 100644
--- a/panpipes/python_scripts/run_preprocess_spatial.py
+++ b/panpipes/python_scripts/run_preprocess_spatial.py
@@ -8,6 +8,7 @@
import scanpy as sc
import muon as mu
import scanpy.experimental as sce
+import spatialdata as sd
import os
import argparse
@@ -31,11 +32,11 @@
parser = argparse.ArgumentParser()
-parser.add_argument("--input_mudata",
- default="mudata_unfilt.h5mu",
+parser.add_argument("--input_spatialdata",
+ default="spatialdata_unfilt.h5mu",
help="")
-parser.add_argument("--output_mudata",
- default="mudata_unfilt.h5mu",
+parser.add_argument("--output_spatialdata",
+ default="spatialdata_unfilt.h5mu",
help="")
parser.add_argument("--figdir",
default="./figures/",
@@ -88,12 +89,13 @@
sc.settings.figdir = figdir
sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5))
-L.info("Reading in MuData from '%s'" % args.input_mudata)
-mdata = mu.read(args.input_mudata)
-spatial = mdata.mod['spatial']
+L.info("Reading in SpatialData from '%s'" % args.input_spatialdata)
+sdata = sd.read_zarr(args.input_spatialdata)
+#mdata = mu.read(args.input_spatialdata)
+#spatial = mdata.mod['spatial']
-input_data = os.path.basename(args.input_mudata)
-pattern = r"_filtered.h5(.*)"
+input_data = os.path.basename(args.input_spatialdata)
+pattern = r"_filtered.zarr"
match = re.search(pattern, input_data)
sprefix = input_data[:match.start()]
@@ -101,12 +103,12 @@
# check if raw data is available
#maybe layer of raw data as parameter
L.info("Checking if raw data is available")
-if X_is_raw(spatial):
+if X_is_raw(sdata["table"]):
L.info("Saving raw counts from .X to .layers['raw_counts']")
- spatial.layers['raw_counts'] = spatial.X.copy()
-elif "raw_counts" in spatial.layers :
+ sdata["table"].layers['raw_counts'] = sdata["table"].X.copy()
+elif "raw_counts" in sdata["table"].layers :
L.info(".layers['raw_counts'] already exists and copying it to .X")
- spatial.X = spatial.layers['raw_counts'].copy()
+ sdata["table"].X = sdata["table"].layers['raw_counts'].copy()
else:
L.error("X is not raw data and 'raw_counts' layer not found")
sys.exit("X is not raw data and 'raw_counts' layer not found")
@@ -116,24 +118,24 @@
if args.norm_hvg_flavour == "squidpy":
if args.squidpy_hvg_flavour == "seurat_v3":
L.info("Running HVG selection with flavor seurat_v3")
- sc.pp.highly_variable_genes(spatial, flavor="seurat_v3", n_top_genes=int(args.n_top_genes), subset=args.filter_by_hvg,
+ sc.pp.highly_variable_genes(sdata["table"], flavor="seurat_v3", n_top_genes=int(args.n_top_genes), subset=args.filter_by_hvg,
batch_key=args.hvg_batch_key)
L.info("Log-normalizing data")
- sc.pp.normalize_total(spatial)
- sc.pp.log1p(spatial)
+ sc.pp.normalize_total(sdata["table"])
+ sc.pp.log1p(sdata["table"])
else:
L.info("Log-normalizing data")
- sc.pp.normalize_total(spatial)
- sc.pp.log1p(spatial)
+ sc.pp.normalize_total(sdata["table"])
+ sc.pp.log1p(sdata["table"])
L.info("Running HVG selection with flavor %s" % args.squidpy_hvg_flavour)
- sc.pp.highly_variable_genes(spatial, flavor=args.squidpy_hvg_flavour,
+ sc.pp.highly_variable_genes(sdata["table"], flavor=args.squidpy_hvg_flavour,
min_mean=float(args.min_mean),
max_mean=float(args.max_mean),
min_disp=float(args.min_disp), subset=args.filter_by_hvg, batch_key=args.hvg_batch_key)
L.info("Saving log-normalized counts to .layers['lognorm']")
- spatial.layers["lognorm"] = spatial.X.copy()
+ sdata["table"].layers["lognorm"] = sdata["table"].X.copy()
# plot HVGs:
- sc.pl.highly_variable_genes(spatial, show=False, save="_genes_highlyvar" + "."+ sprefix+ ".png")
+ sc.pl.highly_variable_genes(sdata["table"], show=False, save="_genes_highlyvar" + "."+ sprefix+ ".png")
elif args.norm_hvg_flavour == "seurat":
if args.clip is None:
@@ -145,35 +147,35 @@
else:
clip = float(args.clip)
L.info("Running Pearson Residuals HVG selection")
- sce.pp.highly_variable_genes(spatial, theta=float(args.theta), clip=clip, n_top_genes=int(args.n_top_genes),
+ sce.pp.highly_variable_genes(sdata["table"], theta=float(args.theta), clip=clip, n_top_genes=int(args.n_top_genes),
batch_key=args.hvg_batch_key, flavor='pearson_residuals',
layer="raw_counts", subset=args.filter_by_hvg)
L.info("Running Pearson Residuals normalization")
- sce.pp.normalize_pearson_residuals(spatial, theta=float(args.theta), clip=clip, layer="raw_counts")
+ sce.pp.normalize_pearson_residuals(sdata["table"], theta=float(args.theta), clip=clip, layer="raw_counts")
L.info("Saving log-normalized counts to .layers['norm_pearson_resid']")
- spatial.layers["norm_pearson_resid"] = spatial.X.copy()
+ sdata["table"].layers["norm_pearson_resid"] = sdata["table"].X.copy()
else:
# error or warning?
L.warning("No normalization and HVG selection was performed! To perform, please specify the 'norm_hvg_flavour' as either 'squidpy' or 'seurat'")
-if "highly_variable" in spatial.var:
- L.info("You have %s Highly Variable Features", np.sum(spatial.var.highly_variable))
+if "highly_variable" in sdata["table"].var:
+ L.info("You have %s Highly Variable Features", np.sum(sdata["table"].var.highly_variable))
#PCA
L.info("Running PCA")
-sc.pp.pca(spatial, n_comps=int(args.n_pcs), svd_solver='arpack', random_state=0)
+sc.pp.pca(sdata["table"], n_comps=int(args.n_pcs), svd_solver='arpack', random_state=0)
L.info("Plotting PCA")
-sc.pl.pca(spatial, save = "_vars" + "."+ sprefix+".png")
-sc.pl.pca_variance_ratio(spatial, log=True, n_pcs=int(args.n_pcs), save= "."+ sprefix+".png")
+sc.pl.pca(sdata["table"], save = "_vars" + "."+ sprefix+".png")
+sc.pl.pca_variance_ratio(sdata["table"], log=True, n_pcs=int(args.n_pcs), save= "."+ sprefix+".png")
-mdata.update()
-L.info("Saving updated MuData to '%s'" % args.output_mudata)
-mdata.write(args.output_mudata)
+#mdata.update()
+L.info("Saving updated SpatialData to '%s'" % args.output_spatialdata)
+sdata.write(args.output_spatialdata)
L.info("Done")
diff --git a/panpipes/python_scripts/run_scanpyQC_spatial.py b/panpipes/python_scripts/run_scanpyQC_spatial.py
index 3e3059c6..57c35343 100644
--- a/panpipes/python_scripts/run_scanpyQC_spatial.py
+++ b/panpipes/python_scripts/run_scanpyQC_spatial.py
@@ -18,6 +18,7 @@
import argparse
import scanpy as sc
import muon as mu
+import spatialdata as sd
from panpipes.funcs.io import write_obs
@@ -64,14 +65,15 @@
sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5))
-L.info("Reading in MuData from '%s'" % args.input_anndata)
+L.info("Reading in SpatialData from '%s'" % args.input_anndata)
-mdata = mu.read(args.input_anndata)
-spatial = mdata['spatial']
+#mdata = mu.read(args.input_anndata)
+sdata = sd.read_zarr(args.input_anndata)
+#spatial = mdata['spatial']
L.info("Spatial data is:")
-print(spatial)
-L.info("With sample id '%s'" % spatial.obs["sample_id"].unique()[0])
+print(sdata)
+L.info("With sample id '%s'" % sdata["table"].obs["sample_id"].unique()[0])
qc_vars = []
@@ -94,7 +96,7 @@
for kk in calc_proportions:
xname= kk
gene_list = cat_dic[kk]
- spatial.var[xname] = [x in gene_list for x in spatial.var_names]
+ sdata["table"].var[xname] = [x in gene_list for x in sdata["table"].var_names]
qc_vars.append(xname)
# Score genes
@@ -105,7 +107,7 @@
L.info("Computing gene scores for '%s'" % kk)
xname= kk
gene_list = cat_dic[kk]
- sc.tl.score_genes(spatial, gene_list ,
+ sc.tl.score_genes(sdata["table"], gene_list ,
ctrl_size=min(len(gene_list), 50),
gene_pool=None,
n_bins=25,
@@ -127,11 +129,11 @@
qc_info = " and calculating proportions for '%s'" % qc_vars
L.info("Calculating QC metrics with scanpy.pp.calculate_qc_metrics()" + qc_info)
percent_top = [50, 100, 200, 500] #default
-percent_top = [x for x in percent_top if x <= spatial.n_vars]
-sc.pp.calculate_qc_metrics(spatial, qc_vars=qc_vars, percent_top=percent_top, inplace=True)
+percent_top = [x for x in percent_top if x <= sdata["table"].n_vars]
+sc.pp.calculate_qc_metrics(sdata["table"], qc_vars=qc_vars, percent_top=percent_top, inplace=True)
-if (args.spatial_filetype == "vizgen") and ("blank_genes" in spatial.obsm):
- spatial.obsm["blank_genes"].to_numpy().sum() / spatial.var["total_counts"].sum() * 100
+if (args.spatial_filetype == "vizgen") and ("blank_genes" in sdata["table"].obsm):
+ sdata["table"].obsm["blank_genes"].to_numpy().sum() / sdata["table"].var["total_counts"].sum() * 100
# Calculate cc scores
if args.ccgenes is not None:
@@ -144,7 +146,7 @@
sgenes = ccgenes[ccgenes["cc_phase"] == "s"]["gene_name"].tolist()
g2mgenes = ccgenes[ccgenes["cc_phase"] == "g2m"]["gene_name"].tolist()
L.info("Calculating cell cycle scores")
- sc.tl.score_genes_cell_cycle(spatial, s_genes=sgenes, g2m_genes=g2mgenes)
+ sc.tl.score_genes_cell_cycle(sdata["table"], s_genes=sgenes, g2m_genes=g2mgenes)
else:
L.error("The path of the cell cycle genes tsv file '%s' could not be found" % args.ccgenes)
sys.exit("The path of the cell cycle genes tsv file '%s' could not be found" % args.ccgenes)
@@ -153,15 +155,15 @@
#TODO: we now need to update the mdata object to pick the calc proportion outputs made on
# spatial = mdata['spatial']
-mdata.update()
+#mdata.update()
single_id = os.path.basename(str(args.input_anndata))
single_id = single_id.replace("_raw.h5mu","")
L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv")
-write_obs(mdata, output_prefix=single_id, output_suffix="_cell_metadata.tsv")
-L.info("Saving updated MuData to '%s'" % args.outfile)
-mdata.write(args.outfile)
+write_obs(sdata["table"], output_prefix=single_id, output_suffix="_cell_metadata.tsv")
+L.info("Saving updated SpatialData to '%s'" % args.outfile)
+sdata.write(args.outfile)
L.info("Done")
diff --git a/panpipes/python_scripts/run_tangram.py b/panpipes/python_scripts/run_tangram.py
index 6b2cc6a2..6f545771 100644
--- a/panpipes/python_scripts/run_tangram.py
+++ b/panpipes/python_scripts/run_tangram.py
@@ -9,6 +9,7 @@
import scanpy as sc
import tangram as tg
import muon as mu
+import spatialdata as sd
import os
import argparse
@@ -100,11 +101,12 @@
#1. read in the data
#spatial:
-L.info("Reading in spatial MuData from '%s'" % args.input_spatial)
-mdata_spatial = mu.read(args.input_spatial)
-adata_st = mdata_spatial.mod['spatial']
+L.info("Reading in SpatialData from '%s'" % args.input_spatial)
+sdata_st = sd.read_zarr(args.input_spatial)
+#mdata_spatial = mu.read(args.input_spatial)
+#adata_st = mdata_spatial.mod['spatial']
#single-cell:
-L.info("Reading in reference MuData from '%s'" % args.input_singlecell)
+L.info("Reading in reference SpatialData from '%s'" % args.input_singlecell)
mdata_singlecell = mu.read(args.input_singlecell)
adata_sc = mdata_singlecell.mod['rna']
@@ -131,33 +133,33 @@
# "Preprocess" anndatas
L.info("Preprocessing AnnDatas")
-tg.pp_adatas(adata_sc=adata_sc, adata_sp=adata_st, genes=markers)
+tg.pp_adatas(adata_sc=adata_sc, adata_sp=sdata_st["table"], genes=markers)
# 3. Run tangram
L.info("Training model")
adata_results = tg.mapping_utils.map_cells_to_space(
- adata_sc=adata_sc, adata_sp=adata_st, num_epochs=int(args.num_epochs), device=args.device, **args.kwargs
+ adata_sc=adata_sc, adata_sp=sdata_st["table"], num_epochs=int(args.num_epochs), device=args.device, **args.kwargs
)
# 3. Extract and plot results
L.info("Extracting annotations")
-tg.project_cell_annotations(adata_results, adata_st, annotation=args.labels_key_model)
+tg.project_cell_annotations(adata_results, sdata_st["table"], annotation=args.labels_key_model)
L.info("Plotting spatial embedding plot coloured by 'tangram_ct_pred'")
annotation_list = list(pd.unique(adata_sc.obs[args.labels_key_model]))
-df = adata_st.obsm["tangram_ct_pred"][annotation_list]
-tg.construct_obs_plot(df, adata_st, perc=0.05)
-if "spatial" in adata_st.uns:
- sc.pl.spatial(adata_st, color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png")
+df = sdata_st["table"].obsm["tangram_ct_pred"][annotation_list]
+tg.construct_obs_plot(df, sdata_st["table"], perc=0.05)
+if "spatial" in sdata_st["table"].uns:
+ sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png")
else:
- sc.pl.spatial(adata_st, color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5)
+ sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5)
mdata_singlecell_results = mu.MuData({"rna": adata_sc})
-mdata_spatial_results = mu.MuData({"spatial": adata_st})
+#mdata_spatial_results = mu.MuData({"spatial": adata_st})
-L.info("Saving MuDatas to '%s'" % output_dir)
+L.info("Saving SpatialData and MuData to '%s'" % output_dir)
mdata_singlecell_results.write(output_dir+"/Tangram_screference_output.h5mu")
-mdata_spatial_results.write(output_dir+"/Tangram_spatial_output.h5mu")
+sdata_st.write(output_dir+"/Tangram_spatial_output.zarr")
L.info("Done")
diff --git a/panpipes/python_scripts/run_umap.py b/panpipes/python_scripts/run_umap.py
index b70c19f3..112e9d6c 100644
--- a/panpipes/python_scripts/run_umap.py
+++ b/panpipes/python_scripts/run_umap.py
@@ -10,6 +10,7 @@
import muon as mu
from anndata import AnnData
+
import sys
import logging
L = logging.getLogger()
@@ -40,13 +41,19 @@
# read data
L.info("Reading in data from '%s'" % args.infile)
-mdata = mu.read(args.infile)
-if type(mdata) is AnnData:
- adata = mdata
-elif args.modality is not None:
- adata = mdata[args.modality]
-else:
- adata = mdata
+if ".zarr" in args.infile:
+ import spatialdata as sd
+ L.info("Reading in SpatialData from '%s'" % args.infile)
+ sdata = sd.read_zarr(args.infile)
+ adata = sdata["table"]
+else:
+ mdata = mu.read(args.infile)
+ if type(mdata) is AnnData:
+ adata = mdata
+ elif args.modality is not None:
+ adata = mdata[args.modality]
+ else:
+ adata = mdata
# set seed
diff --git a/pyproject.toml b/pyproject.toml
index 2dc39d84..02285df5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,7 +68,10 @@ spatial = [
"scipy==1.12.0",
"squidpy",
"cell2location",
- "tangram-sc"
+ "tangram-sc",
+ "spatialdata==0.2.6",
+ "spatialdata-io==0.1.6",
+ "dask==2024.12.1"
]
refmap_old = [