diff --git a/pyproject.toml b/pyproject.toml
index e5edc4d..34619c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
 license = "BSD-3-Clause"
 license-files = ["LICEN[CS]E*"]
 dependencies = [
-    "cap-anndata>=0.5.0",
+    "cap-anndata>=0.5.2",
     "numpy>=2.0.2",
     "pandas>=2.2.3",
     "scipy>=1.13.1",
diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py
index c77e7a1..b766819 100644
--- a/src/cap_upload_validator/errors.py
+++ b/src/cap_upload_validator/errors.py
@@ -113,3 +113,23 @@ class AnnDataNonStandardVarError(CapException):
     If there are other species you wish to upload to CAP, please contact support@celltype.info
     and we will work to accommodate your request.
     """
+
+
+class CSCMatrixInX(CapException):
+    name = "CSCMatrixInX"
+
+    def __init__(self, locations: list[str]):
+        """
+        Raised when X and/or raw.X is not stored as a dense or CSR matrix.
+
+        locations: list of matrix locations, e.g. ['X'], ['raw.X'], or ['X', 'raw.X']
+        """
+        super().__init__()
+        self.locations = locations
+        loc_str = " and ".join(locations)
+        # Every non-dense, non-CSR layout is reported through this error,
+        # so the message must not claim that the matrix is always CSC.
+        self.message = (
+            f"An unsupported matrix format (e.g. CSC) is found in {loc_str}. "
+            "Gene expression matrix must be stored in CSR or dense format!"
+        )
diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py
index a3eb219..b425158 100644
--- a/src/cap_upload_validator/upload_validator.py
+++ b/src/cap_upload_validator/upload_validator.py
@@ -3,7 +3,7 @@
 from scipy.sparse import issparse
 from cap_anndata import CapAnnData, read_h5ad
 import logging
-from h5py import Dataset
+from h5py import Dataset, Group
 
 from .gene_mapping import (
     GeneMap,
@@ -22,6 +22,7 @@
     AnnDataNonStandardVarError,
     BadAnnDataFile,
     AnnDataNoneInGeneralMetadata,
+    CSCMatrixInX,
 )
 
 from typing import Optional
@@ -68,6 +69,7 @@ def validate(self, report_success: bool = True) -> None:
             if cap_adata.raw is not None:
                 cap_adata.raw.read_var(columns=[])
 
+            self._validate_x_and_raw_x_formats(cap_adata)
             self._check_X(cap_adata)
             self._check_obsm(cap_adata)
             self._check_obs(cap_adata)
@@ -287,3 +289,51 @@ def _remove_gene_version(ensemble_ids: pd.Index) -> pd.Index:
         clean_index = ensemble_ids.to_series().apply(lambda x: x.split(".")[0])
         clean_index = pd.Index(clean_index)
         return clean_index
+
+    def _is_csc(self, group_or_dataset) -> bool:
+        """
+        Returns True if HDF5 object represents a CSC sparse matrix.
+        """
+        if not isinstance(group_or_dataset, Group):
+            return False
+
+        encoding = group_or_dataset.attrs.get("encoding-type", None)
+        return encoding == "csc_matrix"
+
+    def _is_csr(self, group_or_dataset) -> bool:
+        """
+        Returns True if HDF5 object represents a CSR sparse matrix.
+        """
+        if not isinstance(group_or_dataset, Group):
+            return False
+        return group_or_dataset.attrs.get("encoding-type", None) == "csr_matrix"
+
+    def _is_dense(self, group_or_dataset) -> bool:
+        """
+        Returns True if HDF5 object is a plain dataset (dense matrix).
+        """
+        return isinstance(group_or_dataset, Dataset)
+
+    def _validate_x_and_raw_x_formats(self, cap_adata: CapAnnData) -> None:
+        """
+        Validate that X and raw.X (if exists) are dense or CSR.
+
+        Any other storage layout (CSC in particular) is reported.
+        Raises CSCMatrixInX listing every offending location.
+        """
+        f = cap_adata.file
+
+        # (name shown in the error, HDF5 object) for every matrix to check
+        matrices = [("X", f["X"])]
+        if "raw" in f and "X" in f["raw"]:
+            matrices.append(("raw.X", f["raw/X"]))
+
+        # CSC is neither dense nor CSR, so a single check covers it
+        locations = [
+            name
+            for name, matrix in matrices
+            if not (self._is_dense(matrix) or self._is_csr(matrix))
+        ]
+
+        if locations:
+            raise CSCMatrixInX(locations=locations)
diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py
index 2725427..3d9c104 100644
--- a/test/test_upload_validator.py
+++ b/test/test_upload_validator.py
@@ -1,7 +1,12 @@
 import pytest
 import numpy as np
 import pandas as pd
+
 import anndata as ad
+from packaging import version
+if version.parse(ad.__version__) >= version.parse("0.11.0"):
+    ad.settings.allow_write_nullable_strings = True
+
 import scipy.sparse as sp
 from pathlib import Path
 import tempfile
@@ -27,6 +32,7 @@
     AnnDataNonStandardVarError,
     CapMultiException,
     AnnDataNoneInGeneralMetadata,
+    CSCMatrixInX,
 )
 
 TMP_DIR = Path(tempfile.mkdtemp())
@@ -262,3 +268,78 @@ def test_ontology_id_instead_general_metadata(names_provided, with_none):
     with context:
         adata.read_obs(GENERAL_METADATA)
         v._check_obs(adata)
+
+
+def write_adata_with_matrix(path, X, raw_X=None):
+    adata = ad.AnnData(X=X)
+
+    if raw_X is not None:
+        adata.raw = ad.AnnData(X=raw_X)
+
+    adata.write_h5ad(path)
+
+
+def test_csc_in_x_raises(tmp_path):
+    p = tmp_path / "test.h5ad"
+
+    X = sp.csc_matrix(np.eye(5))  # CSC
+    write_adata_with_matrix(p, X=X)
+
+    v = UploadValidator(p)
+
+    with pytest.raises(CSCMatrixInX) as e:
+        with read_h5ad(p, edit=False) as cap_adata:
+            v._validate_x_and_raw_x_formats(cap_adata)
+
+    assert e.value.locations == ["X"]
+    assert "X" in e.value.message
+
+
+def test_csc_in_raw_x_raises(tmp_path):
+    p = tmp_path / "test.h5ad"
+
+    X = sp.csr_matrix(np.eye(5))  # valid CSR
+    raw_X = sp.csc_matrix(np.eye(5))  # invalid CSC
+
+    write_adata_with_matrix(p, X=X, raw_X=raw_X)
+
+    v = UploadValidator(p)
+
+    with pytest.raises(CSCMatrixInX) as e:
+        with read_h5ad(p, edit=False) as cap_adata:
+            v._validate_x_and_raw_x_formats(cap_adata)
+
+    assert e.value.locations == ["raw.X"]
+    assert "raw.X" in e.value.message
+
+
+def test_csc_in_both_raises(tmp_path):
+    p = tmp_path / "test.h5ad"
+
+    X = sp.csc_matrix(np.eye(5))
+    raw_X = sp.csc_matrix(np.eye(5))
+
+    write_adata_with_matrix(p, X=X, raw_X=raw_X)
+
+    v = UploadValidator(p)
+
+    with pytest.raises(CSCMatrixInX) as e:
+        with read_h5ad(p, edit=False) as cap_adata:
+            v._validate_x_and_raw_x_formats(cap_adata)
+
+    assert e.value.locations == ["X", "raw.X"]
+    assert "X and raw.X" in e.value.message
+
+
+def test_dense_and_csr_pass(tmp_path):
+    p = tmp_path / "test.h5ad"
+
+    X = np.random.rand(5, 5)  # valid dense
+    raw_X = sp.csr_matrix(np.eye(5))  # valid CSR
+
+    write_adata_with_matrix(p, X=X, raw_X=raw_X)
+
+    v = UploadValidator(p)
+
+    with read_h5ad(p, edit=False) as cap_adata:
+        v._validate_x_and_raw_x_formats(cap_adata)  # should not raise
diff --git a/uv.lock b/uv.lock
index 6b3bef6..68b347a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.9"
 resolution-markers = [
     "python_full_version >= '3.12'",
@@ -65,7 +65,7 @@ wheels = [
 
 [[package]]
 name = "cap-anndata"
-version = "0.5.0"
+version = "0.5.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anndata", version = "0.10.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
@@ -74,14 +74,14 @@ dependencies = [
     { name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "pandas" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/12/b8/78a21f27e2eaa8c28a6855b2b6509ee3765139fede84042147f553e8a190/cap_anndata-0.5.0.tar.gz", hash = "sha256:8b288c1c948e068979eede4aa87b4375e91ba68ead36598a810e8e15077f580c", size = 16563, upload-time = "2025-05-30T13:06:23.545Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/6f/41/f0ca38348b2f5be0c5ceaa524a672327dafa10fa0aaaf9cc4ae56fcd851e/cap_anndata-0.5.2.tar.gz", hash = "sha256:dff2d52fd9255eebd3c47982d0eb40036e8ff868585a5e688532d3c7cf2b9c79", size = 16584, upload-time = "2026-02-02T10:31:20.717Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f0/0d/823f21cafd2445301fe24a6524a4d71f0597047853d759add77d3b05914a/cap_anndata-0.5.0-py3-none-any.whl", hash = "sha256:2e634dcf13c4eecbacfacd76088dd149ce2eaa6a69cecda9032e1d29d2aa067f", size = 10551, upload-time = "2025-05-30T13:06:22.443Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/3f/5f6f3b061fbd6b0af505fa5069f800560a1e656c6ac99aea24ea34e27627/cap_anndata-0.5.2-py3-none-any.whl", hash = "sha256:e5da884cc3a884c40c64dc105da44ec1969e60b6e5a2fd8b8444b229bb1180b7", size = 10584, upload-time = "2026-02-02T10:31:19.183Z" },
 ]
 
 [[package]]
 name = "cap-upload-validator"
-version = "1.3.1"
+version = "1.5.1"
 source = { editable = "." }
 dependencies = [
     { name = "cap-anndata" },
@@ -94,7 +94,7 @@ dependencies = [
 
 [package.metadata]
 requires-dist = [
-    { name = "cap-anndata", specifier = ">=0.5.0" },
+    { name = "cap-anndata", specifier = ">=0.5.2" },
     { name = "numpy", specifier = ">=2.0.2" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "scipy", specifier = ">=1.13.1" },