From bf187814bffd4492743c828c4a5fe0a1fca3ced1 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Fri, 17 Oct 2025 00:55:00 +0200 Subject: [PATCH 1/5] configure the input and output of the codec --- zarr_sparse/codec/codec.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/zarr_sparse/codec/codec.py b/zarr_sparse/codec/codec.py index b6ab650..9409c3c 100644 --- a/zarr_sparse/codec/codec.py +++ b/zarr_sparse/codec/codec.py @@ -9,11 +9,12 @@ from zarr.codecs import BytesCodec, ZstdCodec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import Buffer, NDBuffer +from zarr.core.buffer.cpu import Buffer as CPUBuffer from zarr.core.common import JSON, parse_named_configuration from zarr.core.dtype.npy.int import Int64 from zarr.registry import get_pipeline_class, register_codec -from zarr_sparse.buffer import sparse_buffer_prototype +from zarr_sparse.buffer import SparseNDBuffer, sparse_buffer_prototype from zarr_sparse.codec import metadata from zarr_sparse.combine import first_value from zarr_sparse.comparison import compare_fill_value @@ -104,6 +105,9 @@ async def decode_metadata_table(table_data: Buffer) -> dict[str, Any]: class SparseArrayCodec(ArrayBytesCodec): + codec_input = SparseNDBuffer + codec_output = CPUBuffer + def __init__(self): self.array_codecs = (BytesCodec(), ZstdCodec()) self.table_codecs = (BytesCodec(),) From 385f060d42ea4869672d563114f61ce2b8bd7226 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Fri, 17 Oct 2025 00:55:19 +0200 Subject: [PATCH 2/5] add dask and xarray to the dev env --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d8a77e4..b0d9454 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ test = [ [dependency-groups] dev = [ "black>=25.1.0", + "dask>=2025.10.0", "hatch>=1.14.1", "hypothesis>=6.138.0", "ipdb>=0.13.13", @@ -34,6 +35,7 @@ dev = [ "pyinstrument>=5.1.1", "pytest>=8.3.5", "pytest-xdist>=3.6.1", + "xarray>=2025.10.1", ] [tool.hatch] From 66fc5ec4eef34520285b887dc759fd6b0240eb17 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Fri, 17 Oct 2025 00:55:39 +0200 Subject: [PATCH 3/5] fix a bunch of problems triggered by xarray --- zarr_sparse/buffer.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/zarr_sparse/buffer.py b/zarr_sparse/buffer.py index 8216716..e013037 100644 --- a/zarr_sparse/buffer.py +++ b/zarr_sparse/buffer.py @@ -10,7 +10,7 @@ from zarr.registry import register_ndbuffer from zarr_sparse.chunk_grid import ChunkGrid -from zarr_sparse.combine import combine_nd +from zarr_sparse.combine import combine_nd, first_value from zarr_sparse.slices import slice_size from zarr_sparse.utils import as_decorator @@ -21,6 +21,16 @@ def sparse_equal(a, b, equal_nan: bool) -> bool: equal_nan = equal_nan if a.dtype.kind not in ("U", "S", "T", "O", "V") else False + if isinstance(a, ChunkGrid): + if len(a.data) == 1: + a = next(iter(a.data.values())) + else: + raise RuntimeError("comparing multi-chunk grid") + if isinstance(b, ChunkGrid): + if len(b.data) == 1: + b = next(iter(b.data.values())) + else: + raise RuntimeError("comparing multi-chunk grid") if b.ndim == 0: if not np.array_equal( @@ -104,7 +114,9 @@ def __getitem__(self, key: Any) -> Self: def __setitem__(self, key: Any, value: Any) -> None: if isinstance(value, NDBuffer): - value = value._data + if len(value._data.data) != 1: + raise RuntimeError("setting a non-one-sized buffer is not allowed") + value = first_value(value._data.data) slice_sizes = tuple( slice_size(slice_, size) for slice_, size in zip(key, self._data.shape) From 6970c3a2ba49ec8072a6db7316f4fc2a967fd339 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Fri, 17 Oct 2025 09:16:38 +0200 Subject: [PATCH 4/5] use the new zarr PR for codec support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a4f6205..ba61c04 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ This does make reading specific parts (e.g. the coordinates) in a single request ```sh pip install \ - "zarr @ git+https://github.com/keewis/zarr-python.git@zarr-sparse-patch" \ + "zarr @ git+https://github.com/keewis/zarr-python.git@array-registry" \ "zarr-sparse @ git+https://github.com/keewis/zarr-sparse.git@main" ``` From c851414a1322dd4558e013506e302226729ef256 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Fri, 17 Oct 2025 09:17:16 +0200 Subject: [PATCH 5/5] add useful links --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index ba61c04..5badefb 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,11 @@ Unlike [binsparse-python](https://github.com/ivirshup/binsparse-python), the dif This does make reading specific parts (e.g. the coordinates) in a single request a bit harder, but having a single logical array map to a on-disk zarr array does have its advantages. +Useful links: + +- zarr-python PR: https://github.com/zarr-developers/zarr-python/pull/3529 +- sparse indexing adapter: https://github.com/keewis/sparse-indexing-adapter + ## Installation `zarr-sparse` currently requires a special version of zarr. To install it, use: