diff --git a/.github/workflows/spatial_deconvolution-ci.yml b/.github/workflows/spatial_deconvolution_cell2location-ci.yml similarity index 90% rename from .github/workflows/spatial_deconvolution-ci.yml rename to .github/workflows/spatial_deconvolution_cell2location-ci.yml index b22e40da..b43215bb 100644 --- a/.github/workflows/spatial_deconvolution-ci.yml +++ b/.github/workflows/spatial_deconvolution_cell2location-ci.yml @@ -1,4 +1,4 @@ -name: Run tutorials (spatial deconvolution) +name: Run tutorials (spatial deconvolution cell2location) on: push: @@ -12,7 +12,7 @@ env: debug: 'true' jobs: - spatial_deconvolution: + spatial_deconvolution_cell2location: runs-on: ubuntu-latest strategy: fail-fast: false @@ -60,7 +60,9 @@ jobs: cd deconvolution/data curl -L -o Human_Heart_reference.h5mu https://figshare.com/ndownloader/files/44969677 cd spatial_data - curl -L -o Human_Heart.h5mu https://figshare.com/ndownloader/files/44969488 + curl -L -o Human_Heart.zarr.zip https://figshare.com/ndownloader/files/51667673 + unzip Human_Heart.zarr.zip + rm Human_Heart.zarr.zip # Note: we run the following to test that the commands works diff --git a/.github/workflows/spatial_deconvolution_tangram-ci.yml b/.github/workflows/spatial_deconvolution_tangram-ci.yml new file mode 100644 index 00000000..c06d8b75 --- /dev/null +++ b/.github/workflows/spatial_deconvolution_tangram-ci.yml @@ -0,0 +1,98 @@ +name: Run tutorials (spatial deconvolution tangram) + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + debug: 'true' + +jobs: + spatial_deconvolution_tangram: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] # , "macos-latest", "windows-latest" + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: File tree + if: env.debug == 'true' + run: tree + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + auto-activate-base: true + auto-update-conda: true + channels: conda-forge + channel-priority: strict + activate-environment: pipeline_env + environment-file: pipeline_env.yaml + + - name: Install Panpipes + shell: bash -el {0} + run: | + pip install .[spatial] + conda list + + - name: Conda info + if: env.debug == 'true' + shell: bash -el {0} + run: conda info + + - name: Conda list + if: env.debug == 'true' + shell: pwsh + run: conda list + + + - name: Preparing the data + run: | + mkdir deconvolution_tangram deconvolution_tangram/data deconvolution_tangram/data/spatial_data + cd deconvolution_tangram/data + curl -L -o Human_Heart_reference.h5mu https://figshare.com/ndownloader/files/44969677 + cd spatial_data + curl -L -o Human_Heart.zarr.zip https://figshare.com/ndownloader/files/51667673 + unzip Human_Heart.zarr.zip + rm Human_Heart.zarr.zip + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + cd deconvolution_tangram + panpipes deconvolution_spatial config + + - name: Edit the submission file + run: | + cd deconvolution_tangram + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/deconvolution_tangram/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree deconvolution_tangram + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd deconvolution_tangram + panpipes deconvolution_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd deconvolution_tangram + panpipes deconvolution_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree deconvolution_tangram diff --git a/.github/workflows/spatial_ingestion_merfish-ci.yml b/.github/workflows/spatial_ingestion_merfish-ci.yml index 89812404..e0b8ae06 100644 --- a/.github/workflows/spatial_ingestion_merfish-ci.yml +++ b/.github/workflows/spatial_ingestion_merfish-ci.yml @@ -58,11 +58,11 @@ jobs: run: | mkdir spatial spatial/ingestion_merfish spatial/ingestion_merfish/data cd spatial/ingestion_merfish/data - curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv https://figshare.com/ndownloader/files/45028624 - curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv https://figshare.com/ndownloader/files/45028621 + curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/45028624 + curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/45028621 mkdir images cd images - curl -L -o datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_images_micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/45028645 + curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/45028645 # Note: we run the following to test that the commands works @@ -75,12 +75,12 @@ jobs: - name: Preparing the submission file run: | cd spatial/ingestion_merfish - curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_merfish_data/sample_file_qc_spatial.txt + curl -o sample_file_qc_merfish.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt - name: Preparing the yaml file run: | cd spatial/ingestion_merfish - curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_merfish_data/pipeline.yml + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merfish_data/pipeline.yml - name: File tree if: env.debug == 'true' diff --git a/.github/workflows/spatial_ingestion_merscope-ci.yml b/.github/workflows/spatial_ingestion_merscope-ci.yml new file mode 100644 index 00000000..3a2da28a --- /dev/null +++ b/.github/workflows/spatial_ingestion_merscope-ci.yml @@ -0,0 +1,105 @@ +name: Run tutorials (spatial ingest merscope) + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + debug: 'true' + +jobs: + spatial_ingest_merscope: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] # , "macos-latest", "windows-latest" + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: File tree + if: env.debug == 'true' + run: tree + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + auto-activate-base: true + auto-update-conda: true + channels: conda-forge + channel-priority: strict + activate-environment: pipeline_env + environment-file: pipeline_env.yaml + + - name: Install Panpipes + shell: bash -el {0} + run: | + pip install .[spatial] + conda list + + - name: Conda info + if: env.debug == 'true' + shell: bash -el {0} + run: conda info + + - name: Conda list + if: env.debug == 'true' + shell: pwsh + run: conda list + + + - name: Preparing the data + run: | + mkdir spatial spatial/ingestion_merscope spatial/ingestion_merscope/data + cd spatial/ingestion_merscope/data + curl -L -o cell_by_gene.csv https://figshare.com/ndownloader/files/50899455 + curl -L -o cell_metadata.csv https://figshare.com/ndownloader/files/50899452 + curl -L -o cellpose_micron_space.parquet https://figshare.com/ndownloader/files/50899458 + # curl -L -o detected_transcripts.csv https://figshare.com/ndownloader/files/50899476 + mkdir images + cd images + curl -L -o micron_to_mosaic_pixel_transform.csv https://figshare.com/ndownloader/files/50899449 + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + cd spatial/ingestion_merscope + panpipes qc_spatial config + + - name: Preparing the submission file + run: | + cd spatial/ingestion_merscope + curl -o sample_file_qc_merscope.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt + + - name: Preparing the yaml file + run: | + cd spatial/ingestion_merscope + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_merscope_data/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_merscope + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_merscope + panpipes qc_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_merscope + panpipes qc_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_merscope diff --git a/.github/workflows/spatial_ingestion_visium-ci.yml b/.github/workflows/spatial_ingestion_visium-ci.yml index 8fb57e05..77da20f2 100644 --- a/.github/workflows/spatial_ingestion_visium-ci.yml +++ b/.github/workflows/spatial_ingestion_visium-ci.yml @@ -78,12 +78,11 @@ jobs: - name: Preparing the submission file run: | cd spatial/ingestion - curl -o sample_file_qc_spatial.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_visium_data/sample_file_qc_spatial.txt - + curl -o sample_file_qc_visium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt - name: Preparing the yaml file run: | cd spatial/ingestion - curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/main/docs/ingesting_visium_data/pipeline.yml + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_visium_data/pipeline.yml - name: File tree if: env.debug == 'true' diff --git a/.github/workflows/spatial_ingestion_xenium.yml b/.github/workflows/spatial_ingestion_xenium.yml new file mode 100644 index 00000000..366beb3d --- /dev/null +++ b/.github/workflows/spatial_ingestion_xenium.yml @@ -0,0 +1,109 @@ +name: Run tutorials (spatial ingest xenium) + +on: + push: + branches: + - main + pull_request: + branches: + - main + +env: + debug: 'true' + +jobs: + spatial_ingest_xenium: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] # , "macos-latest", "windows-latest" + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: File tree + if: env.debug == 'true' + run: tree + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-version: latest + auto-activate-base: true + auto-update-conda: true + channels: conda-forge + channel-priority: strict + activate-environment: pipeline_env + environment-file: pipeline_env.yaml + + - name: Install Panpipes + shell: bash -el {0} + run: | + pip install .[spatial] + conda list + + - name: Conda info + if: env.debug == 'true' + shell: bash -el {0} + run: conda info + + - name: Conda list + if: env.debug == 'true' + shell: pwsh + run: conda list + + + - name: Preparing the data + run: | + mkdir spatial spatial/ingestion_xenium spatial/ingestion_xenium/data + cd spatial/ingestion_xenium/data + curl -L -o experiment.xenium https://figshare.com/ndownloader/files/51244265 + curl -L -o nucleus_boundaries.parquet https://figshare.com/ndownloader/files/51244286 + curl -L -o cell_boundaries.parquet https://figshare.com/ndownloader/files/51244244 + curl -L -o transcripts.parquet https://figshare.com/ndownloader/files/51244283 + curl -L -o cell_feature_matrix.h5 https://figshare.com/ndownloader/files/51244247 + curl -L -o cells.parquet https://figshare.com/ndownloader/files/51244259 + curl -L -o morphology_mip.ome.tif https://figshare.com/ndownloader/files/51244415 + curl -L -o cells.zarr.zip https://figshare.com/ndownloader/files/51244262 + mkdir morphology_focus + cd morphology_focus + curl -L -o morphology_focus_0000.ome.tif https://figshare.com/ndownloader/files/51244277 + + + # Note: we run the following to test that the commands works + - name: Preparing the configuration file + shell: bash -el {0} + run: | + cd spatial/ingestion_xenium + panpipes qc_spatial config + + - name: Preparing the submission file + run: | + cd spatial/ingestion_xenium + curl -o sample_file_qc_xenium.txt https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt + + - name: Preparing the yaml file + run: | + cd spatial/ingestion_xenium + curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes-tutorials/sarah_spatialData/docs/ingesting_xenium_data/pipeline.yml + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_xenium + + - name: Review pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_xenium + panpipes qc_spatial show full --local + + - name: Run pipeline tasks + shell: bash -el {0} + run: | + cd spatial/ingestion_xenium + panpipes qc_spatial make full --local + + - name: File tree + if: env.debug == 'true' + run: tree spatial/ingestion_xenium diff --git a/.github/workflows/spatial_preprocess-ci.yml b/.github/workflows/spatial_preprocess-ci.yml index f9a6123f..ce322ee6 100644 --- a/.github/workflows/spatial_preprocess-ci.yml +++ b/.github/workflows/spatial_preprocess-ci.yml @@ -53,14 +53,18 @@ jobs: shell: pwsh run: conda list - - name: Preparing the data run: | mkdir spatial spatial/preprocess spatial/preprocess/data cd spatial/preprocess/data - curl -L -o V1_Human_Heart_unfilt.h5mu https://figshare.com/ndownloader/files/45031048 - curl -L -o V1_Human_Lymph_Node_unfilt.h5mu https://figshare.com/ndownloader/files/45031051 + curl -L -o V1_Human_Heart_unfilt.zarr.zip https://figshare.com/ndownloader/files/52236521 + unzip V1_Human_Heart_unfilt.zarr.zip + rm V1_Human_Heart_unfilt.zarr.zip + curl -L -o V1_Human_Lymph_Node_unfilt.zarr.zip https://figshare.com/ndownloader/files/52236575 + unzip V1_Human_Lymph_Node_unfilt.zarr.zip + rm V1_Human_Lymph_Node_unfilt.zarr.zip + # Note: we run the following to test that the commands works - name: Preparing the configuration file @@ -78,7 +82,7 @@ jobs: run: | cd spatial/preprocess sed -i 's+../ingestion/qc.data/+./data/+g' pipeline.yml - + - name: File tree if: env.debug == 'true' run: tree spatial/preprocess diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ac8d32d..4baadcb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ ## [latest] +### added +- moved from MuData to SpatialData +- xenium ingestion & ingest_xenium GitHub action +- `export_gene_by_spot` for Cell2Location +- separate sample submission files for the different spatial technologies +- separate GitHub actions for Cell2Location & Tangram +- separate GitHub actions for MERSCOPE & MERFISH + + +### fixed + + +### dependencies +- pinned "spatialdata==0.2.6", "spatialdata-io==0.1.6", "dask==2024.12.1" as temporary fix + + +## v1.1.0 + ### added ### fixed diff --git a/docs/usage/setup_for_spatial_workflows.md b/docs/usage/setup_for_spatial_workflows.md index 248c33b9..456d519f 100644 --- a/docs/usage/setup_for_spatial_workflows.md +++ b/docs/usage/setup_for_spatial_workflows.md @@ -1,28 +1,67 @@ Sample submission file for the ingestion of spatial data =========================== -The spatial transcriptomics ingestion workflow requires a sample submission file that specifies the location of the input files. The sample submission file is a tab-separated file with one row per sample. Panpipes currently supports the ingestion of `Visium` and `Vizgen` data. +The spatial transcriptomics ingestion workflow requires a sample submission file that specifies the location of the input files. The sample submission file is a tab-separated file with one row per sample. Panpipes currently supports the ingestion of `Visium`, `Vizgen`, and `Xenium` data. The data of different technologies needs to be ingested separately with different sample submission files. -The 6 columns of the sample submission file are: + +The minimum required (non-optional) columns for each submission file are **sample id**: Unique sample ID. -**spatial_path**: The root directory containing the data files. Please note, that the folder structure of the root directory needs to be structured as expected by the [squidpy.read.visium](https://squidpy.readthedocs.io/en/stable/api/squidpy.read.visium.html) (for `Visium` data) or [squidpy.read.vizgen](https://squidpy.readthedocs.io/en/stable/api/squidpy.read.vizgen.html) (for `Vizgen` data) functions. +**spatial_path**: The root directory containing the data files. Please note, that the folder structure of the root directory needs to be structured as expected by the [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) (for `Visium` data), [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) (for `Vizgen` data), or [spatialdata_io.xenium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.xenium.html) (for `Xenium` data) functions. + +**spatial_filetype**: Either "vizgen", "visium", or "xenium". + + +## Visium + +The 7 columns of the Visium sample submission file are: + +sample_id | spatial_path | spatial_filetype | visium_feature_bc_matrix | visium_fullres_image_file | visium_tissue_positions_file | visium_scalefactors_file +----------|----------|------------|-----------|----------|-------------|------------- + +The following 4 columns are **optional**: + +**visium_feature_bc_matrix**: Name of the counts file. Corresponds to the `counts_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +**visium_fullres_image_file**: Path to the full-resolution image. Corresponds to the `fullres_image_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +**visium_tissue_positions_file**: Path to the tissue positions file. Corresponds to the `tissue_positions_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +**visium_scalefactors_file**: Path to the scalefactors file. Corresponds to the `scalefactors_file` parameter of [spatialdata_io.visium](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.visium.html) + +##### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_visium_data/sample_file_qc_visium.txt) + + +## Vizgen + +The 6 columns of the Vizgen sample submission file are: + +sample_id | spatial_path | spatial_filetype | vpt_cell_by_gene | vpt_cell_metadata | vpt_cell_boundaries +----------|----------|------------|----------|-------------|------------- + +The following 3 columns are **optional**: + +**vpt_cell_by_gene**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) + +**vpt_cell_metadata**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) + +**vpt_cell_boundaries**: The file name of the output of the vizgen-postprocessing-tool. See [spatialdata_io.merscope](https://spatialdata.scverse.org/projects/io/en/latest/generated/spatialdata_io.merscope.html) + + +##### Example submission files [MERFISH](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merfish_data/sample_file_qc_merfish.txt) [MERSCOPE](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_merscope_data/sample_file_qc_merscope.txt) + +## Xenium + +The 3 columns of the Xenium sample submission file are: + +sample_id | spatial_path | spatial_filetype | +----------|----------|------------ -**spatial_filetype**: Either "vizgen" or "visium". +##### [Example submission file](https://github.com/DendrouLab/panpipes-tutorials/blob/sarah_spatialData/docs/ingesting_xenium_data/sample_file_qc_xenium.txt) -**spatial_counts**: The count matrix file. Usually `filtered_feature_bc_matrix.h5` or `raw_feature_bc_matrix.h5` for a `Visium` dataset. For `Vizgen` inputs, this file typically ends with `_cell_by_gene.csv.` -**spatial_metadata**: The metadata csv-file for `Vizgen` data. Leave empty for `Visium` data. -**spatial_transformation**: The transformation csv-file for `Vizgen` data. This column is **optional** for `Vizgen` data. Leave empty for `Visium` data. -**Note, that the columns, `sample_id`, `spatial_path`, `spatial_filetype`, and `spatial_counts` are required for both, `Visium` and `Vizgen` data. The `spatial_metadata`(required) and `spatial_transformation`(optional) columns are `Vizgen`-specific and should be left empty for `Visium` data.** -### Example submission file -| sample_id | spatial_path | spatial_filetype | spatial_counts | spatial_metadata | spatial_transformation | -| --------- |--------------|------------------|-----------------------------------------|------------------------------------------|--------------------| -| V1_Human_Heart |./data_visium/V1_Human_Heart |visium |V1_Human_Heart_filtered_feature_bc_matrix.h5 | -| V1_Human_Lymph_Node |./data_visium/V1_Human_Lymph_Node| visium | V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5 | -Mouse_Brain | ./data_vizgen | vizgen | cell_by_gene_S1R1.csv | cell_metadata_S1R1.csv | images_micron_to_mosaic_pixel_transform.csv diff --git a/docs/workflows/clustering_spatial.md b/docs/workflows/clustering_spatial.md index 3c63b62e..99ae8b49 100644 --- a/docs/workflows/clustering_spatial.md +++ b/docs/workflows/clustering_spatial.md @@ -1,6 +1,6 @@ # Clustering spatial data The `clustering` workflow accepts both cell suspension datasets and spatial transcriptomics data as input that have been ingested with the `qc_spatial` workflow and optionally filtered with the `spatial_preprocess` workflow. -The workflow expects a **single `MuData` object** with the spatial data saved in `mdata.mod["spatial"]`. +The workflow expects a **single `SpatialData` object**. Set `spatial: True` in the configuration file and customize the spatial modality clustering parameters exactly as you would for a single cell experiment. For more information check the [clustering workflow](./clustering.md) diff --git a/docs/workflows/deconvolute_spatial.md b/docs/workflows/deconvolute_spatial.md index e1233dba..d03f56b9 100644 --- a/docs/workflows/deconvolute_spatial.md +++ b/docs/workflows/deconvolute_spatial.md @@ -1,6 +1,6 @@ # Deconvoluting spatial data -With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `MuData` object for each slide is expected, with the spatial data saved in `mdata.mod["spatial"]`. The spatial slides are deconvoluted using the same reference. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. +With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `SpatialData` object for each slide is expected. The spatial slides are deconvoluted using the same reference. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. The workflow provides the possibility to run deconvolution using `Cell2Location` and `Tangram`. @@ -19,8 +19,9 @@ For the reference and each spatial slide the following steps are run. **Note, th - Regression/reference model is fitted and a plot of the training history as well as QC plots are saved in the `./figures/Cell2Location` directory. Additionally, a csv-file `Cell2Loc_inf_anver.csv` with the estimated expression of every gene in every cell type is saved in `./cell2location.output`. - (Optional) Reference model is saved in `./cell2location.output` - Spatial mapping model is fitted. Training history and QC plots are saved in the `./figures/Cell2Location` directory. Plot of the spatial embedding coloured by `q05_cell_abundance_w_sf` is also saved in `./figures/Cell2Location`. +- (Optional) A gene by spot matrix for each cell type is saved to a layer in the table of the `SpatialData` object - (Optional) Spatial mapping model is saved in `./cell2location.output` -- `MuData` objects of the spatial slide and the reference are saved in `./cell2location.output`. The `MuData` object of the spatial slide contains the estimated cell type abundances. +- The `SpatialData` object of the spatial slide and the `MuData` object of the reference are saved in `./cell2location.output`. The `SpatialData` object of the spatial slide contains the estimated cell type abundances. ### Tangram @@ -34,7 +35,7 @@ For the reference and each spatial slide the following steps are run. **Note, th - Data is preprocessed with [tangram.pp_adatas](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.mapping_utils.pp_adatas.html) - Tangram model is fitted with [tangram.mapping_utils.map_cells_to_space](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.mapping_utils.map_cells_to_space.html) and annotations are transfered from single-cell data onto space with [tangram.project_cell_annotations](https://tangram-sc.readthedocs.io/en/latest/classes/tangram.utils.project_cell_annotations.html) - Plot of the spatial embedding coloured by `tangram_ct_pred` is saved in `./figures/Tangram` -- `MuData` objects of the spatial slide and the reference are saved in `./tangram.output`. The `MuData` object of the spatial slide contains the deconvolution predictions. +- The `SpatialData` object of the spatial slide and the `MuData` object of the reference are saved in `./tangram.output`. The `SpatialData` object of the spatial slide contains the deconvolution predictions. diff --git a/docs/workflows/ingest_spatial.md b/docs/workflows/ingest_spatial.md index a9992d6d..f6ded09a 100644 --- a/docs/workflows/ingest_spatial.md +++ b/docs/workflows/ingest_spatial.md @@ -1,19 +1,19 @@ # Ingesting spatial data -Similar to the cell suspension workflow, `spatial_qc` ingests `Vizgen` and/or `Visium` data and saves the data into `MuData` objects. -A primary difference to the cell suspension `ingestion` workflow is that we are not concatenating the input data into a single matrix, but keeping the samples as separate `MuData` objects, each with a `spatial` layer. This ensures that the processing does not introduce any technical batch effect when tissue slides are very different in cell composition. In a future release, we will use [SpatialData](https://spatialdata.scverse.org/en/latest/tutorials/notebooks/notebooks.html) as a data format and framework to process multi-slides experiments. +The `spatial_qc` workflow ingests `Vizgen`, `Visium`, or `Xenium` data and saves the data into `SpatialData` objects. +A primary difference to the cell suspension `ingestion` workflow is that we are not concatenating the input data into a single matrix, but keeping the samples as separate `SpatialData` objects. This ensures that the processing does not introduce any technical batch effect when tissue slides are very different in cell composition. ## Steps -- Data is ingested into `MuData` objects with the modality `spatial`. The workflow generates one MuData per dataset. - - Raw `MuData` objects are saved into `./tmp` +- Data is ingested into `SpatialData` objects. The workflow generates one `SpatialData` per dataset. + - `SpatialData` objects of the raw data are saved into `./tmp` as `zarr` files - QC metrics are computed using `scanpy` functionalities: - Basic QC metrics are computed using `sc.pp.calculate_qc_metrics` - (Optional) Compute cell-cycle scores using `sc.tl.score_genes_cell_cycle`. For that, the [default gene list](../../panpipes/resources/cell_cycle_genes.tsv) can be used or a path to a tsv file can be specified. - (Optional) Custom genes actions. [Default gene list](../../panpipes/resources/qc_genelist_1.0.csv) can be used or a path to a csv file can be specified. - Calculate proportions of gene groups, e.g. mitochondrial genes - Score genes using `sc.tl.score_genes` - - `MuData` objects with calculated QC metrics are saved in `qc.data` + - `SpatialData` objects with calculated QC metrics are saved in `qc.data` - Metadata (`.obs`) is saved into the current directory as tsv files - Specified QC metrics are plotted in violin and spatial embedding plots - For `Vizgen` data, additional histograms are plotted diff --git a/docs/workflows/preprocess_spatial.md b/docs/workflows/preprocess_spatial.md index 925bd534..3bfae011 100644 --- a/docs/workflows/preprocess_spatial.md +++ b/docs/workflows/preprocess_spatial.md @@ -1,17 +1,17 @@ # Preprocessing spatial data -The `preprocess_spatial` workflow filters the data and preprocesses the data by normalization, HVG selection, and PCA computation. Multiple `MuData` objects of the same assay (`Visium` or `Vizgen`), each with a `spatial` modality, can be filtered and preprocessed in one run. +The `preprocess_spatial` workflow filters the data and preprocesses the data by normalization, HVG selection, and PCA computation. Multiple `SpatialData` objects of the same assay (`Visium`, `Vizgen`, or `Xenium`) can be filtered and preprocessed in one run. ## Steps -If multiple `MuData` objects are provided, the following steps are run for each **with the same parameter setting.** +If multiple `SpatialData` objects are provided, the following steps are run for each **with the same parameter setting.** -- `MuData` object is filtered by the specified thresholds in the pipeline.yml. Note, that the filtering step is **optional**. You can avoid filtering by setting the `run` parameter in the pipeline.yml under `filtering` to `False`. +- `SpatialData` object is filtered by the specified thresholds in the pipeline.yml. Note, that the filtering step is **optional**. You can avoid filtering by setting the `run` parameter in the pipeline.yml under `filtering` to `False`. - Post-filter plotting is performed (only when data was filtered, i.e. `run: True`). Specified metrics in the pipeline.yml are plotted in violin and spatial embedding plots. Plots are saved into the `./figures/spatial` directory. - Data is normalized and HVGs are selected. Before normalization, raw counts are saved into `.layers["raw_counts"]`, if not present already. Normalized counts are saved into `.X` and `.layers["lognorm"]` or `.layers["norm_pearson_resid"]`, depending on the chosen normalization. HVGs are saved into `.var["highly_variable"]`. - PCA is computed and plotted. PCA plots are also saved into the `./figures/spatial` directory. -- Final `MuData` object is saved into the `./filtered.data` directory +- Final `SpatialData` object is saved into the `./filtered.data` directory as a `zarr` file ## Steps to run diff --git a/docs/yaml_docs/spatial_deconvolution.md b/docs/yaml_docs/spatial_deconvolution.md index 29a4e93a..a1debae4 100644 --- a/docs/yaml_docs/spatial_deconvolution.md +++ b/docs/yaml_docs/spatial_deconvolution.md @@ -33,7 +33,7 @@ Specified by the following three parameters: - threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks. - For each thread, there must be enough memory to load your mudata and do computationally light tasks. + For each thread, there must be enough memory to load your SpatialData and do computationally light tasks. - threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks. @@ -46,12 +46,12 @@ Specified by the following three parameters: ## 1. Input Options -With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `MuData` object for each slide is expected, with the spatial data saved in `mdata.mod["spatial"]`. The spatial slides are deconvoluted **using the same reference**. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. Please note, that the same parameter setting is used for each slide.
For the **spatial** input, the workflow, therefore, reads in **all `.h5mu` objects of a directory** (see below). **The spatial and single-cell data thus need to be saved in different folders.** +With the `deconvolution_spatial` workflow, one or multiple spatial slides can be deconvoluted in one run. For that, a `SpatialData` object for each slide is expected. The spatial slides are deconvoluted **using the same reference**. For the reference, one `MuData` with the gene expression data saved in `mdata.mod["rna"]` is expected as input. Please note, that the same parameter setting is used for each slide.
For the **spatial** input, the workflow, therefore, reads in **all `.zarr` objects of a directory** (see below).
input
- spatial `String`, Mandatory parameter
- Path to folder containing one or multiple `MuDatas` of spatial data. The pipeline is reading in all `MuData` files in that folder and assuming that they are `MuDatas` of spatial slides. + Path to folder containing one or multiple `SpatialDatas` of spatial data. The pipeline is reading in all `SpatialData` files in that folder. - singlecell `String`, Mandatory parameter
Path to the MuData **file** (not folder) of the reference single-cell data. @@ -151,6 +151,8 @@ You can specify whether both models (spatial and reference) should be saved with save_models, Default: False
Whether to save the reference & spatial mapping models. +export_gene_by_spot, Default: False
+ Whether to save a gene by spot matrix for each cell type in a layer. ## 3. Tangram Options diff --git a/docs/yaml_docs/spatial_preprocess.md b/docs/yaml_docs/spatial_preprocess.md index 28270c99..2d0db62c 100644 --- a/docs/yaml_docs/spatial_preprocess.md +++ b/docs/yaml_docs/spatial_preprocess.md @@ -35,7 +35,7 @@ Specified by the following three parameters: - threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks. - For each thread, there must be enough memory to load your mudata and do computationally light tasks. + For each thread, there must be enough memory to load your SpatialData and do computationally light tasks. - threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks. @@ -48,14 +48,14 @@ Specified by the following three parameters: ## 1. Input Options -With the preprocess_spatial workflow, one or multiple `MuData` objects can be preprocessed in one run. The workflow **reads in all `.h5mu` objects of a directory**. The `MuData` objects in the directory need to be of the same assay (vizgen or visium). The workflow then runs the preprocessing of each `MuData` object separately with the same parameters that are specified in the yaml file. +With the preprocess_spatial workflow, one or multiple `SpatialData` objects can be preprocessed in one run. The workflow **reads in all `.zarr` objects of a directory**. The `SpatialData` objects in the directory need to be of the same assay (Vizgen, Visium, or Xenium). The workflow then runs the preprocessing of each `SpatialData` object separately with the same parameters that are specified in the yaml file.
input_dir `String`, Mandatory parameter
- Path to the folder containing all input `h5mu` files. + Path to the folder containing all input `zarr` files. assay [`'visium'`, `'vizgen'`], Default: `'visium'`
- Spatial transcriptomics assay of the `h5mu` files in `input_dir`. + Spatial transcriptomics assay of the `zarr` files in `input_dir`. @@ -70,7 +70,7 @@ With the preprocess_spatial workflow, one or multiple `MuData` objects can be pr
-With the parameters below you can specify thresholds for filtering. The filtering is fully customisable to any columns in `.obs` or `.var`. You are not restricted by the columns given as default. When specifying a column name, please make sure it exactly matches the column name in the h5mu object.
Please slso make sure, that the specified metrics are present in all `h5mu` objects of the `input_dir`, i.e. the `MuData` objects for that the preprocessing is run. +With the parameters below you can specify thresholds for filtering. The filtering is fully customisable to any columns in `.obs` or `.var`. You are not restricted by the columns given as default. When specifying a column name, please make sure it exactly matches the column name in the table of the `SpatialData` object.
Please also make sure, that the specified metrics are present in all `SpatialData` objects of the `input_dir`, i.e. the `SpatialData` objects for that the preprocessing is run. --- diff --git a/docs/yaml_docs/spatial_qc.md b/docs/yaml_docs/spatial_qc.md index 5a742ab0..baeb2850 100644 --- a/docs/yaml_docs/spatial_qc.md +++ b/docs/yaml_docs/spatial_qc.md @@ -32,11 +32,11 @@ Computing resources to use, specifically the number of threads used for parallel Specified by the following three parameters: - threads_high `Integer`, Default: 1
Number of threads used for high intensity computing tasks. - For each thread, there must be enough memory to load all your input files at once and create the MuData object. + For each thread, there must be enough memory to load all your input files at once and create the SpatialData object. - threads_medium `Integer`, Default: 1
Number of threads used for medium intensity computing tasks. - For each thread, there must be enough memory to load your mudata and do computationally light tasks. + For each thread, there must be enough memory to load your SpatialData and do computationally light tasks. - threads_low `Integer`, Default: 1
Number of threads used for low intensity computing tasks. diff --git a/panpipes/funcs/io.py b/panpipes/funcs/io.py index 77d74d2b..b5d635ab 100644 --- a/panpipes/funcs/io.py +++ b/panpipes/funcs/io.py @@ -156,36 +156,68 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): spatial_filetype = None else: spatial_path = caf["spatial_path"][nn] + if caf['spatial_filetype'][nn]=="xenium": + spatial_filetype = caf['spatial_filetype'][nn] + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None if caf['spatial_filetype'][nn]=="vizgen": + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None spatial_filetype = caf['spatial_filetype'][nn] - #path, counts and metadata are mandatory - if pd.notna(caf["spatial_counts"][nn]): - spatial_counts= caf["spatial_counts"][nn] - else: - spatial_counts = None - if pd.notna(caf["spatial_metadata"][nn]): - spatial_metadata = caf["spatial_metadata"][nn] - else: - spatial_metadata = None - #transformation is optional - if pd.notna(caf["spatial_transformation"][nn]): - spatial_transformation = caf["spatial_transformation"][nn] - else: - spatial_transformation = None + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None + if "vpt_cell_by_gene" in caf.columns: + if pd.notna(caf['vpt_cell_by_gene'][nn]): + vpt_cell_by_gene = caf['vpt_cell_by_gene'][nn] + if "vpt_cell_metadata" in caf.columns: + if pd.notna(caf['vpt_cell_metadata'][nn]): + vpt_cell_metadata = caf['vpt_cell_metadata'][nn] + if "vpt_cell_boundaries" in caf.columns: + if pd.notna(caf['vpt_cell_boundaries'][nn]): + vpt_cell_boundaries = caf['vpt_cell_boundaries'][nn] elif caf['spatial_filetype'][nn]=="visium": - spatial_metadata= None - spatial_transformation = None + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None spatial_filetype = caf['spatial_filetype'][nn] - if pd.notna(caf["spatial_counts"][nn]): - spatial_counts= caf["spatial_counts"][nn] - else: - spatial_counts = None + #counts file + if "visium_feature_bc_matrix" in caf.columns: + if pd.notna(caf["visium_feature_bc_matrix"][nn]): + visium_feature_bc_matrix= caf["visium_feature_bc_matrix"][nn] + # fullres image + if "visium_fullres_image_file" in caf.columns: + if pd.notna(caf["visium_fullres_image_file"][nn]): + visium_fullres_image_file= caf["visium_fullres_image_file"][nn] + # tissue position + if "visium_tissue_positions_file" in caf.columns: + if pd.notna(caf["visium_tissue_positions_file"][nn]): + visium_tissue_positions_file= caf["visium_tissue_positions_file"][nn] + # scalefactor + if "visium_scalefactors_file" in caf.columns: + if pd.notna(caf["visium_scalefactors_file"][nn]): + visium_scalefactors_file= caf["visium_scalefactors_file"][nn] else: spatial_path= None spatial_filetype = None - spatial_counts = None - spatial_metadata = None - spatial_transformation = None + visium_feature_bc_matrix = None + visium_fullres_image_file = None + visium_tissue_positions_file = None + visium_scalefactors_file = None + vpt_cell_by_gene = None + vpt_cell_metadata = None + vpt_cell_boundaries = None if 'barcode_mtd_path' in caf.columns: cell_mtd_path = caf['barcode_mtd_path'][nn] #not yielding this right now! @@ -194,12 +226,14 @@ def gen_load_spatial_jobs(caf, mode_dictionary = {}, load_raw=True): # create the output file outfile = "./tmp/" + caf['sample_id'][nn] if load_raw: - outfile = outfile + "_raw.h5mu" + outfile = outfile + "_raw.zarr" else: - outfile = outfile + ".h5mu" + outfile = outfile + ".zarr" sample_id = caf['sample_id'][nn] - yield spatial_path, outfile, \ - sample_id, spatial_filetype, spatial_counts, spatial_metadata, spatial_transformation + + yield spatial_path, outfile, sample_id, spatial_filetype, \ + visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file, \ + vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries def read_anndata( diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial.py b/panpipes/panpipes/pipeline_deconvolution_spatial.py index 41e60dde..eb970434 100644 --- a/panpipes/panpipes/pipeline_deconvolution_spatial.py +++ b/panpipes/panpipes/pipeline_deconvolution_spatial.py @@ -30,13 +30,12 @@ def get_logger(): def gen_filter_jobs(): - input_paths_spatial=glob.glob(os.path.join(PARAMS["input_spatial"],"*.h5mu")) + input_paths_spatial=glob.glob(os.path.join(PARAMS["input_spatial"],"*.zarr")) input_singlecell = PARAMS["input_singlecell"] for input_spatial in input_paths_spatial: sample_prefix = os.path.basename(input_spatial) - sample_prefix = sample_prefix.replace(".h5mu","") - outfile_spatial = "cell2location.output/" + sample_prefix + "/Cell2Loc_spatial_output.h5mu" - yield input_spatial, outfile_spatial, sample_prefix, input_singlecell + sample_prefix = sample_prefix.replace(".zarr","") + yield input_spatial, sample_prefix, input_singlecell @mkdir("logs") @@ -45,7 +44,7 @@ def gen_filter_jobs(): @mkdir("figures/Cell2Location") @mkdir("cell2location.output") @files(gen_filter_jobs) -def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singlecell): +def run_cell2location(input_spatial, sample_prefix, input_singlecell): figdir = "./figures/Cell2Location/" + sample_prefix output_dir = "./cell2location.output/" + sample_prefix @@ -103,6 +102,8 @@ def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singl if PARAMS['Cell2Location_save_models'] is not None: cmd += " --save_models %(Cell2Location_save_models)s" + if PARAMS['Cell2Location_export_gene_by_spot'] is not None: + cmd += " --export_gene_by_spot %(Cell2Location_export_gene_by_spot)s" cmd += " > logs/%(log_file)s " job_kwargs["job_threads"] = PARAMS['resources_threads_low'] @@ -116,7 +117,7 @@ def run_cell2location(input_spatial, outfile_spatial, sample_prefix, input_singl @mkdir("figures/Tangram") @mkdir("tangram.output") @files(gen_filter_jobs) -def run_tangram(input_spatial, outfile_spatial, sample_prefix, input_singlecell): +def run_tangram(input_spatial, sample_prefix, input_singlecell): figdir = "./figures/Tangram/" + sample_prefix output_dir = "./tangram.output/" + sample_prefix diff --git a/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml b/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml index 66d524f0..b9d6d430 100644 --- a/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml +++ b/panpipes/panpipes/pipeline_deconvolution_spatial/pipeline.yml @@ -89,6 +89,7 @@ Cell2Location: # ------------------------------- save_models: False # Default False; whether to save the reference and spatial mapping models + export_gene_by_spot: False # Default False; whether to save a gene by spot matrix for each cell type in a layer diff --git a/panpipes/panpipes/pipeline_preprocess_spatial.py b/panpipes/panpipes/pipeline_preprocess_spatial.py index b4bf3f71..dbed8c9c 100644 --- a/panpipes/panpipes/pipeline_preprocess_spatial.py +++ b/panpipes/panpipes/pipeline_preprocess_spatial.py @@ -39,7 +39,7 @@ def gen_filter_jobs(): input_dir = "../qc.data" if not os.path.exists(input_dir): sys.exit("can't find input data") - input_paths=glob.glob(os.path.join(input_dir,"*unfilt.h5mu")) + input_paths=glob.glob(os.path.join(input_dir,"*unfilt.zarr")) for infile_path in input_paths: file_name = os.path.basename(infile_path) outfile = file_name.replace("unfilt","filtered") @@ -52,24 +52,24 @@ def gen_filter_jobs(): @mkdir("tables") @mkdir("filtered.data") @files(gen_filter_jobs) -def filter_mudata(infile_path,outfile): +def filter_spatialdata(infile_path,outfile): print('processing file = %s' % str(infile_path)) log_file = os.path.basename(outfile) - log_file= "1_filtering."+log_file.replace("filtered.h5mu","") + ".log" + log_file= "1_filtering."+log_file.replace("filtered.zarr","") + ".log" filter_dict = dictionary_stripper(PARAMS['filtering']) cmd = """ python %(py_path)s/run_filter_spatial.py - --input_mudata %(infile_path)s - --output_mudata %(outfile)s + --input_spatialdata %(infile_path)s + --output_spatialdata %(outfile)s --filter_dict "%(filter_dict)s" """ if PARAMS['filtering_keep_barcodes'] is not None: cmd += " --keep_barcodes %(filtering_keep_barcodes)s" cmd += " > logs/%(log_file)s " job_kwargs["job_threads"] = PARAMS['resources_threads_low'] - log_msg = f"TASK: 'filter_mudata'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/{log_file}' FOR MORE INFORMATION." + log_msg = f"TASK: 'filter_spatialdata'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/{log_file}' FOR MORE INFORMATION." get_logger().info(log_msg) P.run(cmd, **job_kwargs) @@ -84,8 +84,8 @@ def run_plotqc_query(pqc_dict): @active_if(run_plotqc_query(PARAMS['plotqc'])) @active_if(PARAMS['filtering_run']) -@transform(filter_mudata, - regex("./filtered.data/(.*)_filtered.h5(.*)"), +@transform(filter_spatialdata, + regex("./filtered.data/(.*)_filtered.zarr"), r"./logs/2_postfilterplot.\1.log") def postfilterplot_spatial(filt_file,log_file): print(filt_file) @@ -93,7 +93,7 @@ def postfilterplot_spatial(filt_file,log_file): spatial_filetype = PARAMS["assay"] cmd = """ python %(py_path)s/plot_qc_spatial.py - --input_mudata %(filt_file)s + --input_spatialdata %(filt_file)s --spatial_filetype %(spatial_filetype)s --figdir ./figures/spatial """ @@ -108,8 +108,8 @@ def postfilterplot_spatial(filt_file,log_file): P.run(cmd, **job_kwargs) -@transform(filter_mudata, - regex("./filtered.data/(.*)_filtered.h5(.*)"), +@transform(filter_spatialdata, + regex("./filtered.data/(.*)_filtered.zarr"), r"./logs/3_preprocess.\1.log") def spatial_preprocess(filt_file,log_file): if os.path.exists("figures/spatial") is False: @@ -119,8 +119,8 @@ def spatial_preprocess(filt_file,log_file): write_output = os.path.join("./tmp/",os.path.basename(filt_file)) cmd = """ python %(py_path)s/run_preprocess_spatial.py - --input_mudata %(filt_file)s - --output_mudata %(write_output)s + --input_spatialdata %(filt_file)s + --output_spatialdata %(write_output)s --figdir ./figures/spatial """ if PARAMS['spatial_norm_hvg_flavour'] is not None: @@ -154,7 +154,7 @@ def spatial_preprocess(filt_file,log_file): get_logger().info(log_msg) P.run(cmd, **job_kwargs) -@follows(filter_mudata, postfilterplot_spatial, spatial_preprocess) +@follows(filter_spatialdata, postfilterplot_spatial, spatial_preprocess) @originate("cleanup_done.txt") def cleanup(file): # remove any ctmp fails diff --git a/panpipes/panpipes/pipeline_qc_spatial.py b/panpipes/panpipes/pipeline_qc_spatial.py index 538cf67c..6342e8f7 100644 --- a/panpipes/panpipes/pipeline_qc_spatial.py +++ b/panpipes/panpipes/pipeline_qc_spatial.py @@ -56,7 +56,7 @@ def set_up_dirs(log_file): pass # ----------------------------------------------------------------------------------------------- -## Creating h5mu from filtered data files +## Creating spatialData from filtered data files # ----------------------------------------------------------------------------------------------- @@ -73,12 +73,9 @@ def gen_load_spatial_anndata_jobs(): @follows(mkdir("logs")) @follows(mkdir("tmp")) @files(gen_load_spatial_anndata_jobs) -def load_mudatas(spatial_path, outfile, - sample_id, - spatial_filetype, - spatial_counts, - spatial_metadata, - spatial_transformation): +def load_spatialdatas(spatial_path, outfile, + sample_id, spatial_filetype, visium_feature_bc_matrix, visium_fullres_image_file, visium_tissue_positions_file, visium_scalefactors_file, + vpt_cell_by_gene, vpt_cell_metadata, vpt_cell_boundaries): path_dict = {'spatial':spatial_path} @@ -86,45 +83,56 @@ def load_mudatas(spatial_path, outfile, print('sample_id = %s' % str(sample_id)) print('outfile = %s' % str(outfile)) print('spatial_filetype = %s' % str(spatial_filetype)) - print('spatial_counts = %s' % str(spatial_counts)) + + if spatial_filetype == "visium": + print('visium_feature_bc_matrix = %s' % str(visium_feature_bc_matrix)) + print('visium_fullres_image_file= %s' % str(visium_fullres_image_file)) + print('visium_tissue_positions_file= %s' % str(visium_tissue_positions_file)) + print('visium_scalefactors_file= %s' % str(visium_scalefactors_file)) if spatial_filetype == "vizgen": - print('spatial_metadata = %s' % str(spatial_metadata)) - print('spatial_transformation = %s' % str(spatial_transformation)) - else: - print("visium") + print('vpt_cell_by_gene = %s' % str(vpt_cell_by_gene)) + print('vpt_cell_metadata= %s' % str(vpt_cell_metadata)) + print('vpt_cell_boundaries= %s' % str(vpt_cell_boundaries)) modality_dict = {k:True if path_dict[k] is not None else False for k,v in {'spatial': True}.items() } print(modality_dict) assays[outfile] = spatial_filetype cmd = """ - python %(py_path)s/make_mudataspatial_from_csv.py + python %(py_path)s/make_spatialData_from_csv.py --mode_dictionary "%(modality_dict)s" --sample_id %(sample_id)s --output_file %(outfile)s --spatial_filetype %(spatial_filetype)s --spatial_infile %(spatial_path)s - --spatial_counts %(spatial_counts)s """ + if spatial_filetype == "visium": + cmd += """ + --visium_feature_bc_matrix %(visium_feature_bc_matrix)s + --scalefactors_file %(visium_scalefactors_file)s + --fullres_image_file %(visium_fullres_image_file)s + --tissue_positions_file %(visium_tissue_positions_file)s + """ if spatial_filetype == "vizgen": cmd += """ - --spatial_metadata %(spatial_metadata)s - --spatial_transformation %(spatial_transformation)s + --vpt_cell_by_gene %(vpt_cell_by_gene)s + --vpt_cell_metadata %(vpt_cell_metadata)s + --vpt_cell_boundaries %(vpt_cell_boundaries)s """ - cmd += " > logs/1_make_mudatas_%(sample_id)s.log" + cmd += " > logs/1_make_spatialdatas_%(sample_id)s.log" job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] - log_msg = f"TASK: 'load_mudatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/1_make_mudatas_{sample_id}.log' FOR MORE INFORMATION." + log_msg = f"TASK: 'load_spatialdatas'" + f" IN CASE OF ERROR, PLEASE REFER TO : 'logs/1_make_spatialdatas_{sample_id}.log' FOR MORE INFORMATION." get_logger().info(log_msg) P.run(cmd, **job_kwargs) -@follows(load_mudatas) +@follows(load_spatialdatas) @follows(mkdir("qc.data")) @follows(mkdir("./figures")) -@transform(load_mudatas, - regex("./tmp/(.*)_raw.h5(.*)"), +@transform(load_spatialdatas, + regex("./tmp/(.*)_raw.zarr"), r"./logs/2_spatialQC_\1.log") def spatialQC(infile,log_file): spatial_filetype = assays[infile] @@ -171,8 +179,8 @@ def run_plotqc_query(pqc_dict): @follows(spatialQC) @follows(mkdir("./figures/spatial")) @active_if(run_plotqc_query(PARAMS['plotqc'])) -@transform(load_mudatas, - regex("./tmp/(.*)_raw.h5(.*)"), +@transform(load_spatialdatas, + regex("./tmp/(.*)_raw.zarr"), r"./logs/3_qcplot.\1.log") def plotQC_spatial(unfilt_file,log_file): spatial_filetype = assays[unfilt_file] @@ -180,7 +188,7 @@ def plotQC_spatial(unfilt_file,log_file): unfilt_file = unfilt_file.replace("tmp", "qc.data") cmd = """ python %(py_path)s/plot_qc_spatial.py - --input_mudata %(unfilt_file)s + --input_spatialdata %(unfilt_file)s --spatial_filetype %(spatial_filetype)s --figdir ./figures/spatial """ diff --git a/panpipes/python_scripts/collate_mdata.py b/panpipes/python_scripts/collate_mdata.py index cf500bcc..3134accc 100644 --- a/panpipes/python_scripts/collate_mdata.py +++ b/panpipes/python_scripts/collate_mdata.py @@ -34,8 +34,19 @@ L.info("Running with params: %s", args) -L.info("Reading in MuData from '%s'" % args.input_mudata) -mdata = mu.read(args.input_mudata) +#L.info("Reading in MuData from '%s'" % args.input_mudata) +#mdata = mu.read(args.input_mudata) +L.info("Reading in data from '%s'" % args.input_mudata) +if ".zarr" in args.input_mudata: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.input_mudata) + mdata = sd.read_zarr(args.input_mudata) +else: + L.info("Reading in MuData from '%s'" % args.input_mudata) + mdata = mu.read(args.input_mudata) + + + L.info("Reading in cluster information") cf = pd.read_csv(args.clusters_files_csv) @@ -55,46 +66,78 @@ # add in the clusters +if isinstance(mdata, MuData): + L.info("Adding cluster information to MuData") + for i in range(cf.shape[0]): + cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) + cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category') + cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]}) + + if cf['mod'][i] != "multimodal": + mdata[cf['mod'][i]].obs = mdata[cf['mod'][i]].obs.merge(cf_df, left_index=True, right_index=True) + else: + mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True) +elif isinstance(mdata, sd.SpatialData): + L.info("Adding cluster information to SpatialData") + for i in range(cf.shape[0]): + cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) + cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category') + cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]}) + mdata["table"].obs = mdata["table"].obs.merge(cf_df, left_index=True, right_index=True) -L.info("Adding cluster information to MuData") -for i in range(cf.shape[0]): - cf_df = pd.read_csv(cf['fpath'][i], sep='\t', index_col=0) - cf_df['clusters'] = cf_df['clusters'].astype('str').astype('category') - cf_df = cf_df.rename(columns={"clusters":cf['new_key'][i]}) - - if cf['mod'][i] != "multimodal": - mdata[cf['mod'][i]].obs = mdata[cf['mod'][i]].obs.merge(cf_df, left_index=True, right_index=True) - else: - mdata.obs = mdata.obs.merge(cf_df, left_index=True, right_index=True) L.info("Adding UMAP coordinates to MuData") uf = pd.read_csv(args.umap_files_csv) -for i in range(uf.shape[0]): - uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) - mod = uf['mod'][i] - new_key = uf['new_key'][i] - if uf['mod'][i] != "multimodal": - if all(mdata[mod].obs_names == uf_df.index): - mdata[mod].obsm[new_key] = uf_df.to_numpy() +if isinstance(mdata, MuData): + for i in range(uf.shape[0]): + uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) + mod = uf['mod'][i] + new_key = uf['new_key'][i] + if uf['mod'][i] != "multimodal": + if all(mdata[mod].obs_names == uf_df.index): + mdata[mod].obsm[new_key] = uf_df.to_numpy() + else: + L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) else: - L.warn("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) - else: - # check the observations are the same - if set(mdata.obs_names).difference(uf_df.index) == set(): - # put the observations in the same order - uf_df = uf_df.loc[mdata.obs_names,:] - mdata.obsm[new_key] = uf_df.to_numpy() + # check the observations are the same + if set(mdata.obs_names).difference(uf_df.index) == set(): + # put the observations in the same order + uf_df = uf_df.loc[mdata.obs_names,:] + mdata.obsm[new_key] = uf_df.to_numpy() + else: + L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) +elif isinstance(mdata, sd.SpatialData): + for i in range(uf.shape[0]): + uf_df = pd.read_csv(uf['fpath'][i], sep='\t', index_col=0) + mod = uf['mod'][i] + new_key = uf['new_key'][i] + if uf['mod'][i] != "multimodal": + if all(mdata["table"].obs_names == uf_df.index): + mdata["table"].obsm[new_key] = uf_df.to_numpy() + else: + L.warn("Cannot integrate %s into adata as obs_names mismatch" % uf.iloc[i,:] ) else: - L.warning("Cannot integrate %s into mdata as obs_names mismatch" % uf.iloc[i,:] ) - - -L.info("Saving updated MuData to '%s'" % args.output_mudata) -mdata.write(args.output_mudata) - -output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata) -L.info("Saving metadata to '%s'" % output_csv) -mdata.obs.to_csv(output_csv, sep='\t') + # check the observations are the same + if set(mdata["table"].obs_names).difference(uf_df.index) == set(): + # put the observations in the same order + uf_df = uf_df.loc[mdata["table"].obs_names,:] + mdata["table"].obsm[new_key] = uf_df.to_numpy() + else: + L.warning("Cannot integrate %s into adata as obs_names mismatch" % uf.iloc[i,:] ) + +if isinstance(mdata, MuData): + L.info("Saving updated MuData to '%s'" % args.output_mudata) + mdata.write(args.output_mudata) + output_csv = re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata) + L.info("Saving metadata to '%s'" % output_csv) + mdata.obs.to_csv(output_csv, sep='\t') +elif isinstance(mdata, sd.SpatialData): + L.info("Saving updated SpatialData to '%s'" % args.output_mudata) + mdata.write(args.output_mudata) + output_csv = re.sub(".zarr", "_cell_metdata.tsv", args.output_mudata) + L.info("Saving metadata to '%s'" % output_csv) + mdata.obs.to_csv(output_csv, sep='\t') L.info("Done") diff --git a/panpipes/python_scripts/make_mudataspatial_from_csv.py b/panpipes/python_scripts/make_spatialData_from_csv.py similarity index 52% rename from panpipes/python_scripts/make_mudataspatial_from_csv.py rename to panpipes/python_scripts/make_spatialData_from_csv.py index 06453411..d9d8a02e 100644 --- a/panpipes/python_scripts/make_mudataspatial_from_csv.py +++ b/panpipes/python_scripts/make_spatialData_from_csv.py @@ -1,21 +1,20 @@ import argparse import yaml # import scanpy as sc -import pandas as pd +#import pandas as pd # import numpy as np # from scipy.sparse import csr_matrix -import muon as mu -import warnings -from muon._atac.tools import add_peak_annotation, locate_fragments -import squidpy as sq -from mudata import MuData +#import muon as mu +#import warnings +#from muon._atac.tools import add_peak_annotation, locate_fragments +#import squidpy as sq +import spatialdata_io as sd_io +#from mudata import MuData import os +from pathlib import Path """ -this script copies the make_adata_from_csv.py that creates -ONE MUDATA PER SAMPLE, with in each ONE LAYER per modality -for cell-suspension, saves them to temp. -concatenation of the mudatas saved in tmp happens -in the concat_anndata.py script +This script is an adjustment of the make_adata_from_csv.py. It creates +ONE SPATIALDATA PER SAMPLE and saves them to temp. """ import sys @@ -49,13 +48,25 @@ parser.add_argument('--spatial_filetype', default=None, help='') -parser.add_argument('--spatial_counts', +parser.add_argument('--visium_feature_bc_matrix', default=None, help='') -parser.add_argument('--spatial_metadata', +parser.add_argument('--scalefactors_file', default=None, help='') -parser.add_argument('--spatial_transformation', +parser.add_argument('--fullres_image_file', + default=None, + help='') +parser.add_argument('--tissue_positions_file', + default=None, + help='') +parser.add_argument('--vpt_cell_by_gene', + default=None, + help='') +parser.add_argument('--vpt_cell_metadata', + default=None, + help='') +parser.add_argument('--vpt_cell_boundaries', default=None, help='') @@ -64,21 +75,27 @@ L.info("Running with params: %s", args) # unimodal mu (check if all the modalities) -if isinstance(args.mode_dictionary, dict): - mode_dictionary = args.mode_dictionary -else: - mode_dictionary = read_yaml(args.mode_dictionary) +#if isinstance(args.mode_dictionary, dict): +# mode_dictionary = args.mode_dictionary +#else: +# mode_dictionary = read_yaml(args.mode_dictionary) #{'spatialT': True} -permf = [key for key, value in mode_dictionary.items() if value == True] +#permf = [key for key, value in mode_dictionary.items() if value == True] all_files = { - "spatial":[args.spatial_infile, #path, mandatory for squidpy + "spatial":[args.spatial_infile, #path args.spatial_filetype, #needed for the load_adata_in function to call one of vizgen,visium - args.spatial_counts, #name of the counts file, mandatory for squidpy - args.spatial_metadata, #name of the metadata file, mandatory for squidpy - args.spatial_transformation]} + args.visium_feature_bc_matrix, #name of the counts file, mandatory for squidpy + args.fullres_image_file, # visium + args.tissue_positions_file, #visium + args.scalefactors_file, + args.vpt_cell_by_gene, + args.vpt_cell_metadata, + args.vpt_cell_boundaries ]} # visium +# args.spatial_metadata, #name of the metadata file, mandatory for squidpy +# args.spatial_transformation]} #subset to the modalities we want from permf (in this case only spatial) -all_files = {nm: x for (nm, x) in all_files.items() if nm in permf} +#all_files = {nm: x for (nm, x) in all_files.items() if nm in permf} #[check_filetype(x[0], x[1]) for x in all_files.values()] # read the spatial data with one of the functions inside @@ -117,24 +134,32 @@ def check_dir_transform(infile_path, transform_file): if args.spatial_filetype=="vizgen": L.info("Reading in Vizgen data with squidpy.read.vizgen() into AnnData from directory " + args.spatial_infile) - adata = sq.read.vizgen(path = args.spatial_infile, #path, mandatory for squidpy - counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy - meta_file = args.spatial_metadata, #name of the metadata file, mandatory for squidpy - transformation_file=args.spatial_transformation, - library_id = str(args.sample_id)) #this also has kwargs for read_10x_h5 but keep simple - adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns = adata.uns["spatial"][str(args.sample_id)]["scalefactors"]["transformation_matrix"].columns.astype(str) + # check that all vpt parameters are not None + if "None" not in (args.vpt_cell_by_gene, args.vpt_cell_metadata, args.vpt_cell_boundaries): + vpt_outputs = {'cell_by_gene': Path(args.vpt_cell_by_gene) , + 'cell_metadata': Path(args.vpt_cell_metadata) , + 'cell_boundaries': Path(args.vpt_cell_boundaries)} + sdata = sd_io.merscope(path = args.spatial_infile, vpt_outputs=vpt_outputs) + else: + sdata = sd_io.merscope(path = args.spatial_infile) + elif args.spatial_filetype =="visium": L.info("Reading in Visium data with squidpy.read.visium() into AnnData from directory " + args.spatial_infile) - adata = sq.read.visium(path = args.spatial_infile, #path, mandatory for squidpy - counts_file=args.spatial_counts, #name of the counts file, mandatory for squidpy - library_id = str(args.sample_id) - ) #this also has kwargs for read_10x_h5 but keep simple + sdata = sd_io.visium(path=args.spatial_infile, + dataset_id=str(args.sample_id), + counts_file=args.visium_feature_bc_matrix, + fullres_image_file=args.fullres_image_file, + tissue_positions_file=args.tissue_positions_file, + scalefactors_file=args.scalefactors_file) + +elif args.spatial_filetype =="xenium": + sdata = sd_io.xenium(path = args.spatial_infile) -L.info("Resulting AnnData is:") -L.info(adata) -L.info("Creating MuData with .mod['spatial']") +L.info("Resulting SpatialData is:") +L.info(sdata) +#L.info("Creating MuData with .mod['spatial']") -mdata = MuData({"spatial": adata}) +#mdata = MuData({"spatial": adata}) #--------------- @@ -143,25 +168,25 @@ def check_dir_transform(infile_path, transform_file): L.info("Making var names unique") #make var names unique -for mm in mdata.mod.keys(): - mdata[mm].var_names_make_unique() +#for mm in mdata.mod.keys(): +sdata["table"].var_names_make_unique() L.info("Adding sample_id '%s'to MuData.obs and MuData.mod['spatial'].obs" % args.sample_id) -mdata.obs['sample_id'] = str(args.sample_id) +sdata["table"].obs['sample_id'] = str(args.sample_id) # copy the sample_id to each modality -for mm in mdata.mod.keys(): +#for mm in mdata.mod.keys(): # mdata[mm].obs['sample_id'] = mdata.obs['sample_id'] - mdata[mm].obs['sample_id'] = mdata.obs.loc[mdata[mm].obs_names,:]['sample_id'] +sdata["table"].obs['sample_id'] = sdata["table"].obs.loc[sdata["table"].obs_names,:]['sample_id'] -mdata.update() +#mdata.update() -L.info("Resulting MuData is:") -L.info(mdata) +L.info("Resulting SpatialData is:") +L.info(sdata) -L.info("Saving MuData to '%s'" % args.output_file) -L.debug(mdata) -mdata.write(args.output_file) +L.info("Saving SpatialData to '%s'" % args.output_file) +L.debug(sdata) +sdata.write(args.output_file) L.info("Done") diff --git a/panpipes/python_scripts/plot_cluster_umaps.py b/panpipes/python_scripts/plot_cluster_umaps.py index 39e73b19..18b804c3 100644 --- a/panpipes/python_scripts/plot_cluster_umaps.py +++ b/panpipes/python_scripts/plot_cluster_umaps.py @@ -90,9 +90,13 @@ def plot_spatial(adata,figdir): fig.savefig(os.path.join(figdir, ok + "_clusters.png")) - -L.info("Reading in MuData from '%s'" % args.infile) -mdata = read(args.infile) +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + data = sd.read_zarr(args.infile) +else: + L.info("Reading in MuData from '%s'" % args.infile) + data = read(args.infile) mods = args.modalities.split(',') # detemin initial figure directory based on object type @@ -102,21 +106,27 @@ def plot_spatial(adata,figdir): if os.path.exists("multimodal/figures") is False: os.makedirs("multimodal/figures") L.info("Plotting multimodal figures") - main(mdata, figdir="multimodal/figures") + main(data, figdir="multimodal/figures") # we also need to plot per modality -if type(mdata) is MuData: - for mod in mdata.mod.keys(): +if type(data) is MuData: + for mod in data.mod.keys(): if mod in mods: L.info("Plotting for modality: %s" % mod) figdir = os.path.join(mod, "figures") if os.path.exists(figdir) is False: os.makedirs(figdir) if mod == "spatial": # added separate function for spatial - plot_spatial(mdata[mod], figdir) + plot_spatial(data[mod], figdir) else: - main(mdata[mod], figdir) + main(data[mod], figdir) +elif isinstance(data, sd.SpatialData): + L.info("Plotting for modality: spatial") + figdir = os.path.join("spatial", "figures") + if os.path.exists(figdir) is False: + os.makedirs(figdir) + plot_spatial(data["table"], figdir) diff --git a/panpipes/python_scripts/plot_qc_spatial.py b/panpipes/python_scripts/plot_qc_spatial.py index 6434b3b4..205bf74a 100644 --- a/panpipes/python_scripts/plot_qc_spatial.py +++ b/panpipes/python_scripts/plot_qc_spatial.py @@ -13,6 +13,7 @@ import sys import logging import re +import spatialdata as sd L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) @@ -27,8 +28,8 @@ parser = argparse.ArgumentParser() -parser.add_argument("--input_mudata", - default="mudata_unfilt.h5mu", +parser.add_argument("--input_spatialdata", + default="spatialdata_unfilt.h5mu", help="") parser.add_argument("--figdir", default="./figures/", @@ -57,15 +58,16 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in MuData from '%s'" % args.input_mudata) -mdata = mu.read(args.input_mudata) -spatial = mdata.mod['spatial'] +L.info("Reading in SpatialData from '%s'" % args.input_spatialdata) +sdata = sd.read_zarr(args.input_spatialdata) +#mdata = mu.read(args.input_spatialdata) +#spatial = mdata.mod['spatial'] -input_data = os.path.basename(args.input_mudata) -pattern = r"_filtered.h5(.*)" +input_data = os.path.basename(args.input_spatialdata) +pattern = r"_filtered.zarr" match = re.search(pattern, input_data) if match is None: - match = re.search(r"_unfilt.h5(.*)", input_data) + match = re.search(r"_unfilt.zarr", input_data) sprefix = input_data[:match.start()] # convert string to list of strings @@ -74,15 +76,16 @@ # check if metrics in adata.obs or adata.var -qc_metrics = [metric if metric in spatial.obs.columns or metric in spatial.var.columns else L.warning("Variable '%s' not found in adata.var or adata.obs, will not be plotted" % metric) for metric in qc_metrics] +qc_metrics = [metric if metric in + sdata["table"].obs.columns or metric in sdata["table"].var.columns else L.warning("Variable '%s' not found in adata.var or adata.obs, will not be plotted" % metric) for metric in qc_metrics] qc_metrics = [metric for metric in qc_metrics if metric is not None] # check that group_vars are in adata.obs -group_var = [group if group in spatial.obs.columns else L.warning("group_var '%s' not found in adata.obs, will be ignored" % group) for group in group_var] +group_var = [group if group in sdata["table"].obs.columns else L.warning("group_var '%s' not found in adata.obs, will be ignored" % group) for group in group_var] group_var = [group for group in group_var if group is not None] # make sure that it's saved as categorical for group in group_var: - spatial.obs[group] = spatial.obs[group].astype("category") + sdata["table"].obs[group] = sdata["table"].obs[group].astype("category") if group_var == []: group_var = None @@ -93,34 +96,34 @@ for metric in qc_metrics: # check if in adata.obs: - if metric in spatial.obs.columns: + if metric in sdata["table"].obs.columns: # check that it's a numeric column, so that it can be plotted: - if metric not in spatial.obs._get_numeric_data().columns: + if metric not in sdata["table"].obs._get_numeric_data().columns: L.warning("Variable '%s' not numerical in adata.obs, will not be plotted" % metric) else: L.info("Creating violin plot for '%s' of .obs" % metric) if group_var is None: - sc.pl.violin(spatial, keys = metric, xlabel = metric+ " in .obs", + sc.pl.violin(sdata["table"], keys = metric, xlabel = metric+ " in .obs", save = "_obs_" + metric+ "_" + "."+sprefix + ".png", show = False) else: #plot violin for each group for group in group_var: - sc.pl.violin(spatial, keys = metric,groupby = group, xlabel = group + ", "+ metric+ " in .obs", + sc.pl.violin(sdata["table"], keys = metric,groupby = group, xlabel = group + ", "+ metric+ " in .obs", save = "_obs_" + metric+ "_" + group+ "."+sprefix +".png", show = False) #plot spatial L.info("Creating spatial embedding plot for '%s' of .obs" % metric) - sc.pl.embedding(spatial,basis="spatial", color = metric, save = "_spatial_" + metric + "."+sprefix +".png", show = False) + sc.pl.embedding(sdata["table"],basis="spatial", color = metric, save = "_spatial_" + metric + "."+sprefix +".png", show = False) #check if in adata.var: - if metric in spatial.var.columns: + if metric in sdata["table"].var.columns: - if metric not in spatial.var._get_numeric_data().columns: + if metric not in sdata["table"].var._get_numeric_data().columns: L.warning("Variable '%s' not numerical in adata.var, will not be plotted" % metric) else: # plot violins L.info("Creating violin plot for '%s' of .var" % metric) ax = sns.violinplot( - data=spatial.var[[metric]], + data=sdata["table"].var[[metric]], orient='vertical', ) ax.set(xlabel=metric+ " in .var" ) @@ -135,28 +138,28 @@ axs[0].set_title("Total transcripts per cell") sns.histplot( - spatial.obs["total_counts"], + sdata["table"].obs["total_counts"], kde=False, ax=axs[0], ) axs[1].set_title("Unique transcripts per cell") sns.histplot( - spatial.obs["n_genes_by_counts"], + sdata["table"].obs["n_genes_by_counts"], kde=False, ax=axs[1], ) axs[2].set_title("Transcripts per FOV") sns.histplot( - spatial.obs.groupby('fov')[['total_counts']].sum(), + sdata["table"].obs.groupby('fov')[['total_counts']].sum(), kde=False, ax=axs[2], ) axs[3].set_title("Volume of segmented cells") sns.histplot( - spatial.obs["volume"], + sdata["table"].obs["volume"], kde=False, ax=axs[3], ) diff --git a/panpipes/python_scripts/plot_scanpy_markers.py b/panpipes/python_scripts/plot_scanpy_markers.py index 09fd4455..c7073b6c 100644 --- a/panpipes/python_scripts/plot_scanpy_markers.py +++ b/panpipes/python_scripts/plot_scanpy_markers.py @@ -115,17 +115,23 @@ def do_plots(adata, mod, group_col, mf, n=10, layer=None): # read data -L.info("Reading in MuData from '%s'" % args.infile) -mdata = mu.read(args.infile) - -if type(mdata) is AnnData: - adata = mdata - # main function only does rank_gene_groups on X, so -elif type(mdata) is mu.MuData and args.modality is not None: - adata = mdata[args.modality] -else: - L.error("If the input is a MuData object, a modality needs to be specified") - sys.exit('If the input is a MuData object, a modality needs to be specified') +if args.modality != "spatial": + L.info("Reading in MuData from '%s'" % args.infile) + mdata = mu.read(args.infile) + + if type(mdata) is AnnData: + adata = mdata + # main function only does rank_gene_groups on X, so + elif type(mdata) is mu.MuData and args.modality is not None: + adata = mdata[args.modality] + else: + L.error("If the input is a MuData object, a modality needs to be specified") + sys.exit('If the input is a MuData object, a modality needs to be specified') +else: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + adata = sd.read_zarr(args.infile)["table"] + L.info("Loading marker information from '%s'" % args.marker_file) mf = pd.read_csv(args.marker_file, sep='\t' ) diff --git a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py index ad675080..ba1af40a 100644 --- a/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py +++ b/panpipes/python_scripts/rerun_find_neighbors_for_clustering.py @@ -4,6 +4,7 @@ import logging import scanpy as sc from muon import MuData, read + from panpipes.funcs.scmethods import run_neighbors_method_choice from panpipes.funcs.io import read_yaml from panpipes.funcs.scmethods import lsi @@ -37,53 +38,80 @@ sc.settings.n_jobs = int(args.n_threads) # read data -L.info("Reading in MuData from '%s'" % args.infile) -mdata = read(args.infile) +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + sdata = sd.read_zarr(args.infile) +else: + L.info("Reading in MuData from '%s'" % args.infile) + mdata = read(args.infile) for mod in neighbor_dict.keys(): - if mod in mdata.mod.keys(): + if mod != "spatial": + if mod in mdata.mod.keys(): + if neighbor_dict[mod]['use_existing']: + L.info('Using existing neighbors graph for %s' % mod) + pass + else: + L.info("Computing new neighbors for modality %s on %s" % (mod, neighbor_dict[mod]['dim_red'])) + if type(mdata) is MuData: + adata=mdata[mod] + if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()): + L.info("X_pca not found, computing it using default parameters") + sc.tl.pca(adata) + if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None): + dimrem = int(neighbor_dict[mod]['dim_remove']) + adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:] + adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:] + if mod == "atac": + if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()): + L.info("X_lsi not found, computing it using default parameters") + lsi(adata=adata, num_components=50) + if neighbor_dict[mod]['dim_remove'] is not None: + L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove']) + dimrem = int(neighbor_dict[mod]['dim_remove']) + adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:] + adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:] + adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:] + + # run command + opts = dict(method=neighbor_dict[mod]['method'], + n_neighbors=int(neighbor_dict[mod]['k']), + n_pcs=int(neighbor_dict[mod]['n_dim_red']), + metric=neighbor_dict[mod]['metric'], + nthreads=args.n_threads, + use_rep=neighbor_dict[mod]['dim_red']) + + + run_neighbors_method_choice(adata,**opts) + mdata.mod[mod] = adata + mdata.update() + else: if neighbor_dict[mod]['use_existing']: L.info('Using existing neighbors graph for %s' % mod) pass else: L.info("Computing new neighbors for modality %s on %s" % (mod, neighbor_dict[mod]['dim_red'])) - if type(mdata) is MuData: - adata=mdata[mod] - if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in adata.obsm.keys()): + if (neighbor_dict[mod]['dim_red'] == "X_pca") and ("X_pca" not in sdata["table"].obsm.keys()): L.info("X_pca not found, computing it using default parameters") - sc.tl.pca(adata) - if (mod == "atac") and (neighbor_dict[mod]['dim_remove'] is not None): - dimrem = int(neighbor_dict[mod]['dim_remove']) - adata.obsm['X_pca'] = adata.obsm['X_pca'][:, dimrem:] - adata.varm["PCs"] = adata.varm["PCs"][:, dimrem:] - if mod == "atac": - if (neighbor_dict[mod]['dim_red'] == "X_lsi") and ("X_lsi" not in adata.obsm.keys()): - L.info("X_lsi not found, computing it using default parameters") - lsi(adata=adata, num_components=50) - if neighbor_dict[mod]['dim_remove'] is not None: - L.info("Removing dimension %s from X_lsi" % neighbor_dict[mod]['dim_remove']) - dimrem = int(neighbor_dict[mod]['dim_remove']) - adata.obsm['X_lsi'] = adata.obsm['X_lsi'][:, dimrem:] - adata.varm["LSI"] = adata.varm["LSI"][:, dimrem:] - adata.uns["lsi"]["stdev"] = adata.uns["lsi"]["stdev"][dimrem:] - - # run command + sc.tl.pca(sdata["table"]) opts = dict(method=neighbor_dict[mod]['method'], n_neighbors=int(neighbor_dict[mod]['k']), n_pcs=int(neighbor_dict[mod]['n_dim_red']), metric=neighbor_dict[mod]['metric'], nthreads=args.n_threads, use_rep=neighbor_dict[mod]['dim_red']) + # run command + run_neighbors_method_choice(sdata["table"],**opts) - run_neighbors_method_choice(adata,**opts) - mdata.mod[mod] = adata - mdata.update() - - +if ".zarr" in args.infile: + L.info("Saving updated SpatialData to '%s'" % args.outfile) + sdata.write(args.outfile) +else: + L.info("Saving updated MuData to '%s'" % args.outfile) + mdata.write(args.outfile) -L.info("Saving updated MuData to '%s'" % args.outfile) -mdata.write(args.outfile) -L.info("Done") \ No newline at end of file +L.info("Done") diff --git a/panpipes/python_scripts/run_cell2location.py b/panpipes/python_scripts/run_cell2location.py index 3bd74d6c..9be32a1d 100644 --- a/panpipes/python_scripts/run_cell2location.py +++ b/panpipes/python_scripts/run_cell2location.py @@ -7,6 +7,7 @@ import cell2location as c2l import scanpy as sc import pandas as pd +import spatialdata as sd import muon as mu import os @@ -20,6 +21,7 @@ from panpipes.funcs.scmethods import cell2loc_filter_genes + L = logging.getLogger() L.setLevel(logging.INFO) log_handler = logging.StreamHandler(sys.stdout) @@ -47,6 +49,9 @@ parser.add_argument("--save_models", default=False, help="whether to save the reference & spatial mapping models") +parser.add_argument("--export_gene_by_spot", + default=False, + help="whether to save a gene by spot matrix for each cell type in a layer") # parameters for feature selection: @@ -146,6 +151,11 @@ save_models = False else: save_models = True + +if (args.export_gene_by_spot is False) or (args.export_gene_by_spot == "False"): + export_gene_by_spot = False +else: + export_gene_by_spot = True if (args.remove_mt is True) or (args.remove_mt == "True"): remove_mt = True @@ -197,9 +207,10 @@ #1. read in the data #spatial: -L.info("Reading in spatial MuData from '%s'" % args.input_spatial) -mdata_spatial = mu.read(args.input_spatial) -adata_st = mdata_spatial.mod['spatial'] +L.info("Reading in SpatialData from '%s'" % args.input_spatial) +sdata_st = sd.read_zarr(args.input_spatial) +#mdata_spatial = mu.read(args.input_spatial) +#adata_st = mdata_spatial.mod['spatial'] #single-cell: L.info("Reading in reference MuData from '%s'" % args.input_singlecell) mdata_singlecell = mu.read(args.input_singlecell) @@ -218,11 +229,11 @@ reduced_gene_set.columns = ["HVGs"] L.info("Subsetting data on gene list") adata_sc.var["selected_gene"] = adata_sc.var.index.isin(reduced_gene_set["HVGs"]) - adata_st.var["selected_gene"] = adata_st.var.index.isin(reduced_gene_set["HVGs"]) + sdata_st["table"].var["selected_gene"] = sdata_st["table"].var.index.isin(reduced_gene_set["HVGs"]) adata_sc = adata_sc[:, adata_sc.var["selected_gene"]] - adata_st = adata_st[:, adata_st.var["selected_gene"]] + sdata_st["table"] = sdata_st["table"][:, sdata_st["table"].var["selected_gene"]] # check whether all genes are present in both, spatial & reference - if set(adata_st.var.index) != set(adata_sc.var.index): + if set(sdata_st["table"].var.index) != set(adata_sc.var.index): L.error( "Not all genes of the gene list %s are present in the reference as well as in the ST data. Please provide a gene list where all genes are present in both, reference and ST.", args.gene_list) sys.exit( @@ -231,14 +242,14 @@ else: # perform feature selection according to cell2loc if remove_mt is True: L.info("Removing MT genes") - adata_st.var["MT_gene"] = [gene.startswith("MT-") for gene in adata_st.var.index] - adata_st.obsm["MT"] = adata_st[:, adata_st.var["MT_gene"].values].X.toarray() - adata_st = adata_st[:, ~adata_st.var["MT_gene"].values] + sdata_st["table"].var["MT_gene"] = [gene.startswith("MT-") for gene in sdata_st["table"].var.index] + sdata_st["table"].obsm["MT"] = sdata_st["table"][:, sdata_st["table"].var["MT_gene"].values].X.toarray() + sdata_st["table"] = sdata_st["table"][:, ~sdata_st["table"].var["MT_gene"].values] # intersect vars of reference and spatial L.info("Intersecting vars of reference and spatial ") - shared_features = [feature for feature in adata_st.var_names if feature in adata_sc.var_names] + shared_features = [feature for feature in sdata_st["table"].var_names if feature in adata_sc.var_names] adata_sc = adata_sc[:, shared_features] - adata_st = adata_st[:, shared_features] + sdata_st["table"] = sdata_st["table"][:, shared_features] # select features L.info("Selecting features using 'cell2location.utils.filtering.filter_genes() function'") selected = cell2loc_filter_genes(adata_sc, figdir + "/gene_filter.png", cell_count_cutoff=float(args.cell_count_cutoff), @@ -246,7 +257,7 @@ nonz_mean_cutoff=float(args.nonz_mean_cutoff)) L.info("Subsetting data on selected features") adata_sc = adata_sc[:, selected] - adata_st = adata_st[:, selected] + sdata_st["table"] = sdata_st["table"][:, selected] @@ -280,7 +291,7 @@ L.info("Plotting QC plots") cell2loc_plot_QC_reference(model_ref, figdir + "/QC_reference_reconstruction_accuracy.png", figdir + "/QC_reference_expression signatures_vs_avg_expression.png") -# save model and update mudata +# save model if adata_sc.var.index.names[0] in adata_sc.var.columns: adata_sc.var.index.names = [None] mdata_singlecell.mod["rna"] = adata_sc @@ -293,7 +304,7 @@ # 4. Fit mapping model L.info("Setting up AnnData for the spatial model") -c2l.models.Cell2location.setup_anndata(adata=adata_st, +c2l.models.Cell2location.setup_anndata(adata=sdata_st["table"], labels_key = args.labels_key_st, layer= args.layer_st, batch_key= args.batch_key_st, @@ -301,7 +312,7 @@ continuous_covariate_keys = continuous_covariate_keys_st) -model_spatial = c2l.models.Cell2location(adata = adata_st, cell_state_df=inf_aver, +model_spatial = c2l.models.Cell2location(adata = sdata_st["table"], cell_state_df=inf_aver, N_cells_per_location=float(args.N_cells_per_location), detection_alpha=float(args.detection_alpha)) L.info("Training the spatial model") @@ -312,32 +323,40 @@ cell2loc_plot_history(model_spatial, figdir + "/ELBO_spatial_model.png") #extract posterior L.info("Extracting the posterior of the spatial model") -adata_st = model_spatial.export_posterior(adata_st) +sdata_st["table"] = model_spatial.export_posterior(sdata_st["table"]) #plot QC L.info("Plotting QC plots") cell2loc_plot_QC_reconstr(model_spatial, figdir + "/QC_spatial_reconstruction_accuracy.png") +# export a gene by spot matrix for each cell type +if export_gene_by_spot: + # Compute expected expression per cell type + expected_dict = model_spatial.module.model.compute_expected_per_cell_type(model_spatial.samples["post_sample_q05"], model_spatial.adata_manager) + # Add to anndata layers + for i, n in enumerate(model_spatial.factor_names_): + sdata_st["table"].layers[n] = expected_dict['mu'][i] + #plot output L.info("Plotting spatial embedding plot coloured by 'q05_cell_abundance_w_sf'") -adata_st.obs[adata_st.uns["mod"]["factor_names"]] = adata_st.obsm["q05_cell_abundance_w_sf"] -sc.pl.spatial(adata_st,color=adata_st.uns["mod"]["factor_names"], show = False, save = "_Cell2Loc_q05_cell_abundance_w_sf.png") +sdata_st["table"].obs[sdata_st["table"].uns["mod"]["factor_names"]] = sdata_st["table"].obsm["q05_cell_abundance_w_sf"] +sc.pl.spatial(sdata_st["table"],color=sdata_st["table"].uns["mod"]["factor_names"], show = False, save = "_Cell2Loc_q05_cell_abundance_w_sf.png") -# save model and update mudata -if adata_st.var.index.names[0] in adata_st.var.columns: - adata_st.var.index.names = [None] -mdata_spatial.mod["spatial"] = adata_st -mdata_spatial.update() +# save model +if sdata_st["table"].var.index.names[0] in sdata_st["table"].var.columns: + sdata_st["table"].var.index.names = [None] +#mdata_spatial.mod["spatial"] = adata_st +#mdata_spatial.update() if save_models is True: L.info("Saving spatial model to '%s'" % output_dir) model_spatial.save(output_dir+"/Spatial_mapping_model", overwrite=True) #6. save mudatas -L.info("Saving MuDatas to '%s'" % output_dir) +L.info("Saving SpatialDatas to '%s'" % output_dir) mdata_singlecell.write(output_dir+"/Cell2Loc_screference_output.h5mu") -mdata_spatial.write(output_dir+"/Cell2Loc_spatial_output.h5mu") +sdata_st.write(output_dir+"/Cell2Loc_spatial_output.zarr") L.info("Done") diff --git a/panpipes/python_scripts/run_clustering.py b/panpipes/python_scripts/run_clustering.py index fcd2e5c5..ee183c90 100644 --- a/panpipes/python_scripts/run_clustering.py +++ b/panpipes/python_scripts/run_clustering.py @@ -34,13 +34,20 @@ # read data L.info("Reading in data from '%s'" % args.infile) -mdata = mu.read(args.infile) -if type(mdata) is AnnData: - adata = mdata -elif args.modality is not None: - adata = mdata[args.modality] -else: - adata = mdata +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + sdata = sd.read_zarr(args.infile) + adata = sdata["table"] +else: + mdata = mu.read(args.infile) + if type(mdata) is AnnData: + adata = mdata + elif args.modality is not None: + adata = mdata[args.modality] + else: + adata = mdata + uns_key=args.neighbors_key # check sc.pp.neihgbours has been run diff --git a/panpipes/python_scripts/run_filter_spatial.py b/panpipes/python_scripts/run_filter_spatial.py index 980f7070..c3a05c91 100644 --- a/panpipes/python_scripts/run_filter_spatial.py +++ b/panpipes/python_scripts/run_filter_spatial.py @@ -4,6 +4,7 @@ import re import muon as mu from anndata import AnnData +import spatialdata as sd import os # import scpipelines.funcs as scp from panpipes.funcs.processing import intersect_obs_by_mod, remove_unused_categories @@ -41,10 +42,10 @@ def test_matching_df_ignore_cat(new_df, old_df): # parse arguments parser = argparse.ArgumentParser() -parser.add_argument('--input_mudata', +parser.add_argument('--input_spatialdata', default='gut_minus1_amp.h5ad', help='') -parser.add_argument('--output_mudata', +parser.add_argument('--output_spatialdata', default='', help='') parser.add_argument('--filter_dict', @@ -52,7 +53,7 @@ def test_matching_df_ignore_cat(new_df, old_df): help='this is pull') # cross modalities args parser.add_argument('--keep_barcodes', default=None, - help='1 column list of barcodes to keep, note that they should match the mudata input, this filtering happens first') + help='1 column list of barcodes to keep, note that they should match the spatialdata input, this filtering happens first') # load options @@ -72,93 +73,89 @@ def test_matching_df_ignore_cat(new_df, old_df): filter_dict = dictionary_stripper(filter_dict) L.info("Filter dictionary:\n %s" %filter_dict) -# load mudata +# load spatialdata -L.info("Reading in MuData from '%s'" % args.input_mudata) +L.info("Reading in SpatialData from '%s'" % args.input_spatialdata) +sdata = sd.read_zarr(args.input_spatialdata) +#mdata = mu.read(args.input_spatialdata) -mdata = mu.read(args.input_mudata) +#if isinstance(mdata, AnnData): +# raise TypeError("Input '%s' should be of spatialdata format, not Anndata" % args.input_spatialdata) -if isinstance(mdata, AnnData): - raise TypeError("Input '%s' should be of MuData format, not Anndata" % args.input_mudata) +orig_obs = sdata["table"].obs.copy() -orig_obs = mdata.obs.copy() - -L.info("Before filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features") +L.info("Before filtering: "+ str(sdata["table"].n_obs) + " cells and " + str(sdata["table"].n_vars) + " features") # filter based on provided barcodes ----- if args.keep_barcodes is not None: - L.info("Filtering MuData by keep_barcodes file") + L.info("Filtering SpatialData by keep_barcodes file") keep_bc = pd.read_csv(args.keep_barcodes,header=None) - mdata = mdata[mdata.obs_names.isin(keep_bc[0]),:].copy() - remove_unused_categories(mdata.obs) - mdata.update() - L.info("Remaining cells: %d" % mdata.n_obs) + sdata["table"] = sdata["table"][sdata["table"].obs_names.isin(keep_bc[0]),:].copy() + remove_unused_categories(sdata["table"].obs) + #mdata.update() + L.info("Remaining cells: %d" % sdata["table"].n_obs) # filter more than if filter_dict['run']: - # this will go through the modalities one at a time, - # then the categories max, min and bool - for mod in mdata.mod.keys(): - L.info(mod) - if mod in filter_dict.keys(): - for marg in filter_dict[mod].keys(): - if marg == "obs": - if "max" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['max'].items(): - L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % (mod, col, n)) - mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x <= n) - L.info("Remaining cells: %d" % mdata[mod].n_obs) - if "min" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['min'].items(): - L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % (mod, col, n)) - mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x >= n) - L.info("Remaining cells: %d" % mdata[mod].n_obs) - if "bool" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['bool'].items(): - L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % (mod, col, n)) - mu.pp.filter_obs(mdata.mod[mod], col, lambda x: x == n) - L.info("Remaining cells: %d" % mdata[mod].n_obs) - if marg == "var": - if "max" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['max'].items(): - L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % (mod, col, n)) - mu.pp.filter_var(mdata.mod[mod], col, lambda x: x <= n) - L.info("Remaining features: %d" % mdata[mod].n_vars) - - if "min" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['min'].items(): - L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % (mod, col, n)) - mu.pp.filter_var(mdata.mod[mod], col, lambda x: x >= n) - L.info("Remaining features: %d" % mdata[mod].n_vars) - - if "bool" in filter_dict[mod][marg].keys(): - for col, n in filter_dict[mod][marg]['bool'].items(): - L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % (mod, col, n)) - mu.pp.filter_var(mdata.mod[mod], col, lambda x: x == n) - L.info("Remaining features: %d" % mdata[mod].n_vars) - - -mdata.update() - -L.info("After filtering: "+ str(mdata.n_obs) + " cells and " + str(mdata.n_vars) + " features") - -remove_unused_categories(mdata.obs) + for marg in filter_dict["spatial"].keys(): + if marg == "obs": + if "max" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['max'].items(): + L.info("Filtering cells of modality '%s' by '%s' in .obs to less than %s" % ("spatial", col, n)) + mu.pp.filter_obs(sdata["table"], col, lambda x: x <= n) + L.info("Remaining cells: %d" % sdata["table"].n_obs) + if "min" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['min'].items(): + L.info("Filtering cells of modality '%s' by '%s' in .obs to more than %s" % ("spatial", col, n)) + mu.pp.filter_obs(sdata["table"], col, lambda x: x >= n) + L.info("Remaining cells: %d" % sdata["table"].n_obs) + if "bool" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['bool'].items(): + L.info("Filtering cells of modality '%s' by '%s' in .obs marked %s" % ("spatial", col, n)) + mu.pp.filter_obs(sdata["table"], col, lambda x: x == n) + L.info("Remaining cells: %d" % sdata["table"].n_obs) + if marg == "var": + if "max" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['max'].items(): + L.info("Filtering features of modality '%s' by '%s' in .var to less than %s" % ("spatial", col, n)) + mu.pp.filter_var(sdata["table"], col, lambda x: x <= n) + L.info("Remaining features: %d" % sdata["table"].n_vars) + + if "min" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['min'].items(): + L.info("Filtering features of modality '%s' by '%s' in .var to more than %s" % ("spatial", col, n)) + mu.pp.filter_var(sdata["table"], col, lambda x: x >= n) + L.info("Remaining features: %d" % sdata["table"].n_vars) + + if "bool" in filter_dict["spatial"][marg].keys(): + for col, n in filter_dict["spatial"][marg]['bool'].items(): + L.info("Filtering features of modality '%s' by '%s' in .var marked %s" % ("spatial", col, n)) + mu.pp.filter_var(sdata["table"], col, lambda x: x == n) + L.info("Remaining features: %d" % sdata["table"].n_vars) + + + +#mdata.update() + +L.info("After filtering: "+ str(sdata["table"].n_obs) + " cells and " + str(sdata["table"].n_vars) + " features") + +remove_unused_categories(sdata["table"].obs) # run quick test before saving out. -assert test_matching_df_ignore_cat(mdata.obs, orig_obs) +assert test_matching_df_ignore_cat(sdata["table"].obs, orig_obs) # write out obs -output_prefix = re.sub(".h5mu", "", os.path.basename(args.output_mudata)) +output_prefix = re.sub(".zarr", "", os.path.basename(args.output_spatialdata)) L.info("Saving updated obs in a metadata tsv file to './tables/" + output_prefix + "_filtered_cell_metadata.tsv'") -write_obs(mdata, output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv") +write_obs(sdata["table"], output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv") # write out the per sample_id cell numbers cell_counts_dict={} -for mm in mdata.mod.keys(): - cell_counts_dict[mm] = mdata[mm].obs.sample_id.value_counts().to_frame('n_cells') +#for mm in mdata.mod.keys(): +cell_counts_dict["spatial"] = sdata["table"].obs.sample_id.value_counts().to_frame('n_cells') cell_counts = pd.concat(cell_counts_dict).reset_index().rename( columns={"level_0": "modality", "level_1": "sample_id"}) @@ -167,10 +164,10 @@ def test_matching_df_ignore_cat(new_df, old_df): L.info("Saving cell counts in a metadata csv file to './tables/" + output_prefix + "_cell_counts.csv'") cell_counts.to_csv("tables/" + output_prefix + "_cell_counts.csv", index=None) -mdata.update() +#mdata.update() -L.info("Saving updated MuData to '%s'" % args.output_mudata) -mdata.write(args.output_mudata) +L.info("Saving updated SpatialData to '%s'" % args.output_spatialdata) +sdata.write(args.output_spatialdata) L.info("Done") diff --git a/panpipes/python_scripts/run_find_markers_multi.py b/panpipes/python_scripts/run_find_markers_multi.py index ba1422d7..0ae1d067 100644 --- a/panpipes/python_scripts/run_find_markers_multi.py +++ b/panpipes/python_scripts/run_find_markers_multi.py @@ -201,19 +201,22 @@ def main(adata, L.info("Running with params: %s", args) # read data -L.info("Reading in MuData from '%s'" % args.infile) -mdata = read(args.infile) - - -if type(mdata) is AnnData: - adata = mdata - # main function only does rank_gene_groups on X, so -elif type(mdata) is MuData and args.modality is not None: - adata = mdata[args.modality] -else: - L.error("If the input is a MuData object, a modality needs to be specified") - sys.exit('If the input is a MuData object, a modality needs to be specified') - +if args.modality != "spatial": + L.info("Reading in MuData from '%s'" % args.infile) + mdata = read(args.infile) + if type(mdata) is AnnData: + adata = mdata + # main function only does rank_gene_groups on X, so + elif type(mdata) is MuData and args.modality is not None: + adata = mdata[args.modality] + else: + L.error("If the input is a MuData object, a modality needs to be specified") + sys.exit('If the input is a MuData object, a modality needs to be specified') +else: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + adata = sd.read_zarr(args.infile)["table"] + main(adata, mod=args.modality, diff --git a/panpipes/python_scripts/run_preprocess_spatial.py b/panpipes/python_scripts/run_preprocess_spatial.py index bdf28cf6..250057ec 100644 --- a/panpipes/python_scripts/run_preprocess_spatial.py +++ b/panpipes/python_scripts/run_preprocess_spatial.py @@ -8,6 +8,7 @@ import scanpy as sc import muon as mu import scanpy.experimental as sce +import spatialdata as sd import os import argparse @@ -31,11 +32,11 @@ parser = argparse.ArgumentParser() -parser.add_argument("--input_mudata", - default="mudata_unfilt.h5mu", +parser.add_argument("--input_spatialdata", + default="spatialdata_unfilt.h5mu", help="") -parser.add_argument("--output_mudata", - default="mudata_unfilt.h5mu", +parser.add_argument("--output_spatialdata", + default="spatialdata_unfilt.h5mu", help="") parser.add_argument("--figdir", default="./figures/", @@ -88,12 +89,13 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in MuData from '%s'" % args.input_mudata) -mdata = mu.read(args.input_mudata) -spatial = mdata.mod['spatial'] +L.info("Reading in SpatialData from '%s'" % args.input_spatialdata) +sdata = sd.read_zarr(args.input_spatialdata) +#mdata = mu.read(args.input_spatialdata) +#spatial = mdata.mod['spatial'] -input_data = os.path.basename(args.input_mudata) -pattern = r"_filtered.h5(.*)" +input_data = os.path.basename(args.input_spatialdata) +pattern = r"_filtered.zarr" match = re.search(pattern, input_data) sprefix = input_data[:match.start()] @@ -101,12 +103,12 @@ # check if raw data is available #maybe layer of raw data as parameter L.info("Checking if raw data is available") -if X_is_raw(spatial): +if X_is_raw(sdata["table"]): L.info("Saving raw counts from .X to .layers['raw_counts']") - spatial.layers['raw_counts'] = spatial.X.copy() -elif "raw_counts" in spatial.layers : + sdata["table"].layers['raw_counts'] = sdata["table"].X.copy() +elif "raw_counts" in sdata["table"].layers : L.info(".layers['raw_counts'] already exists and copying it to .X") - spatial.X = spatial.layers['raw_counts'].copy() + sdata["table"].X = sdata["table"].layers['raw_counts'].copy() else: L.error("X is not raw data and 'raw_counts' layer not found") sys.exit("X is not raw data and 'raw_counts' layer not found") @@ -116,24 +118,24 @@ if args.norm_hvg_flavour == "squidpy": if args.squidpy_hvg_flavour == "seurat_v3": L.info("Running HVG selection with flavor seurat_v3") - sc.pp.highly_variable_genes(spatial, flavor="seurat_v3", n_top_genes=int(args.n_top_genes), subset=args.filter_by_hvg, + sc.pp.highly_variable_genes(sdata["table"], flavor="seurat_v3", n_top_genes=int(args.n_top_genes), subset=args.filter_by_hvg, batch_key=args.hvg_batch_key) L.info("Log-normalizing data") - sc.pp.normalize_total(spatial) - sc.pp.log1p(spatial) + sc.pp.normalize_total(sdata["table"]) + sc.pp.log1p(sdata["table"]) else: L.info("Log-normalizing data") - sc.pp.normalize_total(spatial) - sc.pp.log1p(spatial) + sc.pp.normalize_total(sdata["table"]) + sc.pp.log1p(sdata["table"]) L.info("Running HVG selection with flavor %s" % args.squidpy_hvg_flavour) - sc.pp.highly_variable_genes(spatial, flavor=args.squidpy_hvg_flavour, + sc.pp.highly_variable_genes(sdata["table"], flavor=args.squidpy_hvg_flavour, min_mean=float(args.min_mean), max_mean=float(args.max_mean), min_disp=float(args.min_disp), subset=args.filter_by_hvg, batch_key=args.hvg_batch_key) L.info("Saving log-normalized counts to .layers['lognorm']") - spatial.layers["lognorm"] = spatial.X.copy() + sdata["table"].layers["lognorm"] = sdata["table"].X.copy() # plot HVGs: - sc.pl.highly_variable_genes(spatial, show=False, save="_genes_highlyvar" + "."+ sprefix+ ".png") + sc.pl.highly_variable_genes(sdata["table"], show=False, save="_genes_highlyvar" + "."+ sprefix+ ".png") elif args.norm_hvg_flavour == "seurat": if args.clip is None: @@ -145,35 +147,35 @@ else: clip = float(args.clip) L.info("Running Pearson Residuals HVG selection") - sce.pp.highly_variable_genes(spatial, theta=float(args.theta), clip=clip, n_top_genes=int(args.n_top_genes), + sce.pp.highly_variable_genes(sdata["table"], theta=float(args.theta), clip=clip, n_top_genes=int(args.n_top_genes), batch_key=args.hvg_batch_key, flavor='pearson_residuals', layer="raw_counts", subset=args.filter_by_hvg) L.info("Running Pearson Residuals normalization") - sce.pp.normalize_pearson_residuals(spatial, theta=float(args.theta), clip=clip, layer="raw_counts") + sce.pp.normalize_pearson_residuals(sdata["table"], theta=float(args.theta), clip=clip, layer="raw_counts") L.info("Saving log-normalized counts to .layers['norm_pearson_resid']") - spatial.layers["norm_pearson_resid"] = spatial.X.copy() + sdata["table"].layers["norm_pearson_resid"] = sdata["table"].X.copy() else: # error or warning? L.warning("No normalization and HVG selection was performed! To perform, please specify the 'norm_hvg_flavour' as either 'squidpy' or 'seurat'") -if "highly_variable" in spatial.var: - L.info("You have %s Highly Variable Features", np.sum(spatial.var.highly_variable)) +if "highly_variable" in sdata["table"].var: + L.info("You have %s Highly Variable Features", np.sum(sdata["table"].var.highly_variable)) #PCA L.info("Running PCA") -sc.pp.pca(spatial, n_comps=int(args.n_pcs), svd_solver='arpack', random_state=0) +sc.pp.pca(sdata["table"], n_comps=int(args.n_pcs), svd_solver='arpack', random_state=0) L.info("Plotting PCA") -sc.pl.pca(spatial, save = "_vars" + "."+ sprefix+".png") -sc.pl.pca_variance_ratio(spatial, log=True, n_pcs=int(args.n_pcs), save= "."+ sprefix+".png") +sc.pl.pca(sdata["table"], save = "_vars" + "."+ sprefix+".png") +sc.pl.pca_variance_ratio(sdata["table"], log=True, n_pcs=int(args.n_pcs), save= "."+ sprefix+".png") -mdata.update() -L.info("Saving updated MuData to '%s'" % args.output_mudata) -mdata.write(args.output_mudata) +#mdata.update() +L.info("Saving updated SpatialData to '%s'" % args.output_spatialdata) +sdata.write(args.output_spatialdata) L.info("Done") diff --git a/panpipes/python_scripts/run_scanpyQC_spatial.py b/panpipes/python_scripts/run_scanpyQC_spatial.py index 3e3059c6..57c35343 100644 --- a/panpipes/python_scripts/run_scanpyQC_spatial.py +++ b/panpipes/python_scripts/run_scanpyQC_spatial.py @@ -18,6 +18,7 @@ import argparse import scanpy as sc import muon as mu +import spatialdata as sd from panpipes.funcs.io import write_obs @@ -64,14 +65,15 @@ sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) -L.info("Reading in MuData from '%s'" % args.input_anndata) +L.info("Reading in SpatialData from '%s'" % args.input_anndata) -mdata = mu.read(args.input_anndata) -spatial = mdata['spatial'] +#mdata = mu.read(args.input_anndata) +sdata = sd.read_zarr(args.input_anndata) +#spatial = mdata['spatial'] L.info("Spatial data is:") -print(spatial) -L.info("With sample id '%s'" % spatial.obs["sample_id"].unique()[0]) +print(sdata) +L.info("With sample id '%s'" % sdata["table"].obs["sample_id"].unique()[0]) qc_vars = [] @@ -94,7 +96,7 @@ for kk in calc_proportions: xname= kk gene_list = cat_dic[kk] - spatial.var[xname] = [x in gene_list for x in spatial.var_names] + sdata["table"].var[xname] = [x in gene_list for x in sdata["table"].var_names] qc_vars.append(xname) # Score genes @@ -105,7 +107,7 @@ L.info("Computing gene scores for '%s'" % kk) xname= kk gene_list = cat_dic[kk] - sc.tl.score_genes(spatial, gene_list , + sc.tl.score_genes(sdata["table"], gene_list , ctrl_size=min(len(gene_list), 50), gene_pool=None, n_bins=25, @@ -127,11 +129,11 @@ qc_info = " and calculating proportions for '%s'" % qc_vars L.info("Calculating QC metrics with scanpy.pp.calculate_qc_metrics()" + qc_info) percent_top = [50, 100, 200, 500] #default -percent_top = [x for x in percent_top if x <= spatial.n_vars] -sc.pp.calculate_qc_metrics(spatial, qc_vars=qc_vars, percent_top=percent_top, inplace=True) +percent_top = [x for x in percent_top if x <= sdata["table"].n_vars] +sc.pp.calculate_qc_metrics(sdata["table"], qc_vars=qc_vars, percent_top=percent_top, inplace=True) -if (args.spatial_filetype == "vizgen") and ("blank_genes" in spatial.obsm): - spatial.obsm["blank_genes"].to_numpy().sum() / spatial.var["total_counts"].sum() * 100 +if (args.spatial_filetype == "vizgen") and ("blank_genes" in sdata["table"].obsm): + sdata["table"].obsm["blank_genes"].to_numpy().sum() / sdata["table"].var["total_counts"].sum() * 100 # Calculate cc scores if args.ccgenes is not None: @@ -144,7 +146,7 @@ sgenes = ccgenes[ccgenes["cc_phase"] == "s"]["gene_name"].tolist() g2mgenes = ccgenes[ccgenes["cc_phase"] == "g2m"]["gene_name"].tolist() L.info("Calculating cell cycle scores") - sc.tl.score_genes_cell_cycle(spatial, s_genes=sgenes, g2m_genes=g2mgenes) + sc.tl.score_genes_cell_cycle(sdata["table"], s_genes=sgenes, g2m_genes=g2mgenes) else: L.error("The path of the cell cycle genes tsv file '%s' could not be found" % args.ccgenes) sys.exit("The path of the cell cycle genes tsv file '%s' could not be found" % args.ccgenes) @@ -153,15 +155,15 @@ #TODO: we now need to update the mdata object to pick the calc proportion outputs made on # spatial = mdata['spatial'] -mdata.update() +#mdata.update() single_id = os.path.basename(str(args.input_anndata)) single_id = single_id.replace("_raw.h5mu","") L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv") -write_obs(mdata, output_prefix=single_id, output_suffix="_cell_metadata.tsv") -L.info("Saving updated MuData to '%s'" % args.outfile) -mdata.write(args.outfile) +write_obs(sdata["table"], output_prefix=single_id, output_suffix="_cell_metadata.tsv") +L.info("Saving updated SpatialData to '%s'" % args.outfile) +sdata.write(args.outfile) L.info("Done") diff --git a/panpipes/python_scripts/run_tangram.py b/panpipes/python_scripts/run_tangram.py index 6b2cc6a2..6f545771 100644 --- a/panpipes/python_scripts/run_tangram.py +++ b/panpipes/python_scripts/run_tangram.py @@ -9,6 +9,7 @@ import scanpy as sc import tangram as tg import muon as mu +import spatialdata as sd import os import argparse @@ -100,11 +101,12 @@ #1. read in the data #spatial: -L.info("Reading in spatial MuData from '%s'" % args.input_spatial) -mdata_spatial = mu.read(args.input_spatial) -adata_st = mdata_spatial.mod['spatial'] +L.info("Reading in SpatialData from '%s'" % args.input_spatial) +sdata_st = sd.read_zarr(args.input_spatial) +#mdata_spatial = mu.read(args.input_spatial) +#adata_st = mdata_spatial.mod['spatial'] #single-cell: -L.info("Reading in reference MuData from '%s'" % args.input_singlecell) +L.info("Reading in reference SpatialData from '%s'" % args.input_singlecell) mdata_singlecell = mu.read(args.input_singlecell) adata_sc = mdata_singlecell.mod['rna'] @@ -131,33 +133,33 @@ # "Preprocess" anndatas L.info("Preprocessing AnnDatas") -tg.pp_adatas(adata_sc=adata_sc, adata_sp=adata_st, genes=markers) +tg.pp_adatas(adata_sc=adata_sc, adata_sp=sdata_st["table"], genes=markers) # 3. Run tangram L.info("Training model") adata_results = tg.mapping_utils.map_cells_to_space( - adata_sc=adata_sc, adata_sp=adata_st, num_epochs=int(args.num_epochs), device=args.device, **args.kwargs + adata_sc=adata_sc, adata_sp=sdata_st["table"], num_epochs=int(args.num_epochs), device=args.device, **args.kwargs ) # 3. Extract and plot results L.info("Extracting annotations") -tg.project_cell_annotations(adata_results, adata_st, annotation=args.labels_key_model) +tg.project_cell_annotations(adata_results, sdata_st["table"], annotation=args.labels_key_model) L.info("Plotting spatial embedding plot coloured by 'tangram_ct_pred'") annotation_list = list(pd.unique(adata_sc.obs[args.labels_key_model])) -df = adata_st.obsm["tangram_ct_pred"][annotation_list] -tg.construct_obs_plot(df, adata_st, perc=0.05) -if "spatial" in adata_st.uns: - sc.pl.spatial(adata_st, color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png") +df = sdata_st["table"].obsm["tangram_ct_pred"][annotation_list] +tg.construct_obs_plot(df, sdata_st["table"], perc=0.05) +if "spatial" in sdata_st["table"].uns: + sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png") else: - sc.pl.spatial(adata_st, color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5) + sc.pl.spatial(sdata_st["table"], color=annotation_list, cmap="viridis", show=False, frameon=False, ncols=3, save = "_tangram_ct_pred.png",spot_size=0.5) mdata_singlecell_results = mu.MuData({"rna": adata_sc}) -mdata_spatial_results = mu.MuData({"spatial": adata_st}) +#mdata_spatial_results = mu.MuData({"spatial": adata_st}) -L.info("Saving MuDatas to '%s'" % output_dir) +L.info("Saving SpatialData and MuData to '%s'" % output_dir) mdata_singlecell_results.write(output_dir+"/Tangram_screference_output.h5mu") -mdata_spatial_results.write(output_dir+"/Tangram_spatial_output.h5mu") +sdata_st.write(output_dir+"/Tangram_spatial_output.zarr") L.info("Done") diff --git a/panpipes/python_scripts/run_umap.py b/panpipes/python_scripts/run_umap.py index b70c19f3..112e9d6c 100644 --- a/panpipes/python_scripts/run_umap.py +++ b/panpipes/python_scripts/run_umap.py @@ -10,6 +10,7 @@ import muon as mu from anndata import AnnData + import sys import logging L = logging.getLogger() @@ -40,13 +41,19 @@ # read data L.info("Reading in data from '%s'" % args.infile) -mdata = mu.read(args.infile) -if type(mdata) is AnnData: - adata = mdata -elif args.modality is not None: - adata = mdata[args.modality] -else: - adata = mdata +if ".zarr" in args.infile: + import spatialdata as sd + L.info("Reading in SpatialData from '%s'" % args.infile) + sdata = sd.read_zarr(args.infile) + adata = sdata["table"] +else: + mdata = mu.read(args.infile) + if type(mdata) is AnnData: + adata = mdata + elif args.modality is not None: + adata = mdata[args.modality] + else: + adata = mdata # set seed diff --git a/pyproject.toml b/pyproject.toml index 2dc39d84..02285df5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,10 @@ spatial = [ "scipy==1.12.0", "squidpy", "cell2location", - "tangram-sc" + "tangram-sc", + "spatialdata==0.2.6", + "spatialdata-io==0.1.6", + "dask==2024.12.1" ] refmap_old = [