Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ classifiers = [
license = "BSD-3-Clause"
license-files = ["LICEN[CS]E*"]
dependencies = [
"cap-anndata>=0.5.0",
"cap-anndata>=0.5.2",
"numpy>=2.0.2",
"pandas>=2.2.3",
"scipy>=1.13.1",
Expand Down
15 changes: 15 additions & 0 deletions src/cap_upload_validator/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,18 @@ class AnnDataNonStandardVarError(CapException):
If there are other species you wish to upload to CAP, please contact
support@celltype.info and we will work to accommodate your request.
"""

class CSCMatrixInX(CapException):
    """Error carrying the matrix locations flagged by the X / raw.X format check.

    Attributes are set in ``__init__``:
      * ``locations`` - the list passed in by the caller, stored as-is.
      * ``message``   - human-readable text naming every flagged location,
                        joined with " and ".
    """

    name = "CSCMatrixInX"

    def __init__(self, locations: list[str]):
        """Build the error message from the flagged locations.

        Args:
            locations: list of matrix locations, e.g. ['X'], ['raw.X'],
                or ['X', 'raw.X'].
        """
        super().__init__()
        self.locations = locations

        # Two pieces: where the problem is, then what format is required.
        joined_locations = " and ".join(locations)
        where = f"The CSC matrix is found in {joined_locations}. "
        requirement = "Gene expression matrix must be stored in CSR or dense format!"
        self.message = where + requirement
49 changes: 48 additions & 1 deletion src/cap_upload_validator/upload_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from scipy.sparse import issparse
from cap_anndata import CapAnnData, read_h5ad
import logging
from h5py import Dataset
from h5py import Dataset, Group

from .gene_mapping import (
GeneMap,
Expand All @@ -22,6 +22,7 @@
AnnDataNonStandardVarError,
BadAnnDataFile,
AnnDataNoneInGeneralMetadata,
CSCMatrixInX,
)
from typing import Optional

Expand Down Expand Up @@ -68,6 +69,7 @@ def validate(self, report_success: bool = True) -> None:
if cap_adata.raw is not None:
cap_adata.raw.read_var(columns=[])

self._validate_x_and_raw_x_formats(cap_adata)
self._check_X(cap_adata)
self._check_obsm(cap_adata)
self._check_obs(cap_adata)
Expand Down Expand Up @@ -287,3 +289,48 @@ def _remove_gene_version(ensemble_ids: pd.Index) -> pd.Index:
clean_index = ensemble_ids.to_series().apply(lambda x: x.split(".")[0])
clean_index = pd.Index(clean_index)
return clean_index

def _is_csc(self, group_or_dataset) -> bool:
    """Tell whether an HDF5 object is tagged as a CSC sparse matrix.

    Only an h5py ``Group`` can qualify; it must carry an ``encoding-type``
    attribute equal to ``"csc_matrix"``. Anything that is not a ``Group``
    yields ``False``.
    """
    return (
        isinstance(group_or_dataset, Group)
        and group_or_dataset.attrs.get("encoding-type", None) == "csc_matrix"
    )

def _is_csr(self, group_or_dataset) -> bool:
    """Tell whether an HDF5 object is tagged as a CSR sparse matrix.

    True only for an h5py ``Group`` whose ``encoding-type`` attribute
    equals ``"csr_matrix"``; non-``Group`` objects yield ``False``.
    """
    if isinstance(group_or_dataset, Group):
        encoding = group_or_dataset.attrs.get("encoding-type", None)
        return encoding == "csr_matrix"
    return False

def _is_dense(self, group_or_dataset) -> bool:
    """Tell whether an HDF5 object is an h5py ``Dataset``.

    The validator treats a plain ``Dataset`` node as a dense matrix.
    """
    stored_as_dataset = isinstance(group_or_dataset, Dataset)
    return stored_as_dataset

def _validate_x_and_raw_x_formats(self, cap_adata: CapAnnData) -> None:
    """Check the on-disk format of X and, when present, raw.X.

    Each matrix node must be either dense (an h5py Dataset) or a CSR-tagged
    group. A node that is CSC-tagged, or that is neither dense nor CSR, is
    recorded under its label ("X" or "raw.X").

    Args:
        cap_adata: opened file wrapper; its ``file`` attribute is indexed
            like an HDF5 file.

    Raises:
        CSCMatrixInX: if at least one of the inspected matrices was
            recorded, with ``locations`` listing them in X, raw.X order.
    """
    h5_file = cap_adata.file

    # X is looked up unconditionally; raw.X only when both levels exist.
    candidates = [("X", h5_file["X"])]
    if "raw" in h5_file and "X" in h5_file["raw"]:
        candidates.append(("raw.X", h5_file["raw/X"]))

    offending = [
        label
        for label, node in candidates
        if self._is_csc(node)
        or not (self._is_dense(node) or self._is_csr(node))
    ]

    if offending:
        raise CSCMatrixInX(locations=offending)
78 changes: 78 additions & 0 deletions test/test_upload_validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import pytest
import numpy as np
import pandas as pd

import anndata as ad
from packaging import version
if version.parse(ad.__version__) >= version.parse("0.11.0"):
ad.settings.allow_write_nullable_strings = True

import scipy.sparse as sp
from pathlib import Path
import tempfile
Expand All @@ -27,6 +32,7 @@
AnnDataNonStandardVarError,
CapMultiException,
AnnDataNoneInGeneralMetadata,
CSCMatrixInX,
)

TMP_DIR = Path(tempfile.mkdtemp())
Expand Down Expand Up @@ -262,3 +268,75 @@ def test_ontology_id_instead_general_metadata(names_provided, with_none):
with context:
adata.read_obs(GENERAL_METADATA)
v._check_obs(adata)


def write_adata_with_matrix(path, X, raw_X=None):
    """Write an AnnData built from ``X`` to ``path`` as h5ad.

    When ``raw_X`` is given, a second AnnData built from it is assigned to
    ``.raw`` before writing.
    """
    container = ad.AnnData(X=X)

    has_raw = raw_X is not None
    if has_raw:
        container.raw = ad.AnnData(X=raw_X)

    container.write_h5ad(path)


def test_csc_in_x_raises(tmp_path):
    """A CSC-encoded X makes the format check raise CSCMatrixInX naming X."""
    h5ad_path = tmp_path / "test.h5ad"

    # X stored as CSC, no raw layer.
    write_adata_with_matrix(h5ad_path, X=sp.csc_matrix(np.eye(5)))

    validator = UploadValidator(h5ad_path)

    with pytest.raises(CSCMatrixInX) as exc_info:
        with read_h5ad(h5ad_path, edit=False) as cap_adata:
            validator._validate_x_and_raw_x_formats(cap_adata)

    message = exc_info.value.message
    assert "X" in message


def test_csc_in_raw_x_raises(tmp_path):
    """A CSR X with a CSC raw.X makes the check raise and mention raw.X."""
    h5ad_path = tmp_path / "test.h5ad"

    valid_csr = sp.csr_matrix(np.eye(5))
    invalid_csc = sp.csc_matrix(np.eye(5))
    write_adata_with_matrix(h5ad_path, X=valid_csr, raw_X=invalid_csc)

    validator = UploadValidator(h5ad_path)

    with pytest.raises(CSCMatrixInX) as exc_info:
        with read_h5ad(h5ad_path, edit=False) as cap_adata:
            validator._validate_x_and_raw_x_formats(cap_adata)

    message = exc_info.value.message
    assert "raw.X" in message


def test_csc_in_both_raises(tmp_path):
    """CSC in both X and raw.X yields a message listing "X and raw.X"."""
    h5ad_path = tmp_path / "test.h5ad"

    # Both layers are CSC, so both locations should be reported together.
    write_adata_with_matrix(
        h5ad_path,
        X=sp.csc_matrix(np.eye(5)),
        raw_X=sp.csc_matrix(np.eye(5)),
    )

    validator = UploadValidator(h5ad_path)

    with pytest.raises(CSCMatrixInX) as exc_info:
        with read_h5ad(h5ad_path, edit=False) as cap_adata:
            validator._validate_x_and_raw_x_formats(cap_adata)

    message = exc_info.value.message
    assert "X and raw.X" in message


def test_dense_and_csr_pass(tmp_path):
    """A dense X with a CSR raw.X passes the format check without raising."""
    h5ad_path = tmp_path / "test.h5ad"

    dense_x = np.random.rand(5, 5)
    csr_raw_x = sp.csr_matrix(np.eye(5))
    write_adata_with_matrix(h5ad_path, X=dense_x, raw_X=csr_raw_x)

    validator = UploadValidator(h5ad_path)

    # Completing the call without an exception is the assertion.
    with read_h5ad(h5ad_path, edit=False) as cap_adata:
        validator._validate_x_and_raw_x_formats(cap_adata)
12 changes: 6 additions & 6 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.