from __future__ import annotations

+ import math
import os
from collections.abc import Mapping
from typing import (
...
from numcodecs.abc import Codec
from pydantic import AfterValidator, model_validator
from pydantic.functional_validators import BeforeValidator
+ from zarr.abc.store import Store
+ from zarr.core.sync_group import get_node
from zarr.errors import ContainsArrayError, ContainsGroupError
- from zarr.storage import BaseStore, contains_array, contains_group, init_group
- from zarr.util import guess_chunks

from pydantic_zarr.core import (
    IncEx,
...
TItem = TypeVar("TItem", bound=Union["GroupSpec", "ArraySpec"])


+ def _contains_array(store: Store, path: str) -> bool:
+     try:
+         return isinstance(get_node(store, path, zarr_format=2), zarr.Array)
+     except FileNotFoundError:
+         return False
+
+
+ def _contains_group(store: Store, path: str) -> bool:
+     try:
+         return isinstance(get_node(store, path, zarr_format=2), zarr.Group)
+     except FileNotFoundError:
+         return False
+
+
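These helpers stand in for the `contains_array` / `contains_group` functions that zarr-python 3 no longer ships in `zarr.storage`; both now report `False` for a missing path instead of raising. A minimal sketch of the intended behaviour, assuming an in-memory `zarr.storage.MemoryStore` and that the module above is importable as `pydantic_zarr.v2` (the import path is illustrative):

```python
import zarr
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import _contains_array, _contains_group  # hypothetical import path

store = MemoryStore()
zarr.create(store=store, path="foo", shape=(10,), dtype="uint8", zarr_format=2)

assert _contains_array(store, "foo")      # a v2 array node exists at "foo"
assert not _contains_array(store, "bar")  # a missing path reports False instead of raising
assert not _contains_group(store, "foo")  # an array node is not a group
```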
def stringify_dtype(value: npt.DTypeLike) -> str:
    """
    Convert a `numpy.dtype` object into a `str`.
@@ -318,14 +330,14 @@ def from_zarr(cls, array: zarr.Array) -> Self:
            fill_value=array.dtype.type(array.fill_value).tolist(),
            order=array.order,
            filters=array.filters,
-             dimension_separator=array._dimension_separator,
-             compressor=array.compressor,
+             dimension_separator=array.metadata.dimension_separator,
+             compressor=array.compressors[0].get_config() if array.compressors else None,
            attributes=array.attrs.asdict(),
        )

    def to_zarr(
        self,
-         store: BaseStore,
+         store: Store,
        path: str,
        *,
        overwrite: bool = False,
@@ -337,14 +349,15 @@ def to_zarr(
        Parameters
        ----------
-         store : instance of zarr.BaseStore
+         store : instance of zarr.abc.store.Store
            The storage backend that will manifest the array.
        path : str
            The location of the array inside the store.
        overwrite : bool, default = False
            Whether to overwrite existing objects in storage to create the Zarr array.
        **kwargs : Any
            Additional keyword arguments are passed to `zarr.create`.
+
        Returns
        -------
        zarr.Array
@@ -356,24 +369,20 @@ def to_zarr(
            spec_dict["compressor"] = numcodecs.get_codec(spec_dict["compressor"])
        if self.filters is not None:
            spec_dict["filters"] = [numcodecs.get_codec(f) for f in spec_dict["filters"]]
-         if contains_array(store, path):
-             extant_array = zarr.open_array(store, path=path, mode="r")
+         if _contains_array(store, path):
+             extant_array = zarr.open_array(store, path=path, mode="r", zarr_format=2)

            if not self.like(extant_array):
                if not overwrite:
-                     msg = (
-                         f"An array already exists at path {path}. "
-                         "That array is structurally dissimilar to the array you are trying to "
-                         "store. Call to_zarr with overwrite=True to overwrite that array."
-                     )
-                     raise ContainsArrayError(msg)
+                     raise ContainsArrayError(store, path)
            else:
                if not overwrite:
                    # extant_array is read-only, so we make a new array handle that
                    # takes **kwargs
                    return zarr.open_array(
-                         store=extant_array.store, path=extant_array.path, **kwargs
+                         store=extant_array.store, path=extant_array.path, zarr_format=2, **kwargs
                    )
+         spec_dict["zarr_format"] = spec_dict.pop("zarr_version", 2)
        result = zarr.create(store=store, path=path, overwrite=overwrite, **spec_dict, **kwargs)
        result.attrs.put(attrs)
        return result
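For orientation, the ported method should still support the usual round trip through the public `ArraySpec` API; a minimal sketch against zarr-python 3, assuming `zarr.storage.MemoryStore` and the illustrative `pydantic_zarr.v2` import path:

```python
import numpy as np
import zarr
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import ArraySpec  # hypothetical import path

store = MemoryStore()
spec = ArraySpec.from_array(np.arange(12, dtype="int32"))

arr = spec.to_zarr(store, "data")       # creates a Zarr v2 array and writes its attributes
arr[:] = np.arange(12, dtype="int32")   # the returned handle is an ordinary zarr.Array

roundtrip = ArraySpec.from_zarr(arr)
assert roundtrip.shape == spec.shape and roundtrip.dtype == spec.dtype
```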
@@ -519,13 +528,14 @@ def from_zarr(cls, group: zarr.Group, *, depth: int = -1) -> Self:
        result = cls(attributes=attributes, members=members)
        return result

-     def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwargs):
+     def to_zarr(self, store: Store, path: str, *, overwrite: bool = False, **kwargs):
        """
-         Serialize this `GroupSpec` to a Zarr group at a specific path in a `zarr.BaseStore`.
+         Serialize this `GroupSpec` to a Zarr group at a specific path in a `zarr.abc.store.Store`.
        This operation will create metadata documents in the store.
+
        Parameters
        ----------
-         store : zarr.BaseStore
+         store : zarr.abc.store.Store
            The storage backend that will manifest the group and its contents.
        path : str
            The location of the group inside the store.
@@ -542,7 +552,7 @@ def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwa
        """
        spec_dict = self.model_dump(exclude={"members": True})
        attrs = spec_dict.pop("attributes")
-         if contains_group(store, path):
+         if _contains_group(store, path):
            extant_group = zarr.group(store, path=path)
            if not self.like(extant_group):
                if not overwrite:
@@ -558,14 +568,14 @@ def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwa
                    # then just return the extant group
                    return extant_group

-         elif contains_array(store, path) and not overwrite:
-             msg = (
-                 f"An array already exists at path {path}. "
-                 "Call to_zarr with overwrite=True to overwrite the array."
-             )
-             raise ContainsArrayError(msg)
+         elif _contains_array(store, path) and not overwrite:
+             raise ContainsArrayError(store, path)
        else:
-             init_group(store=store, overwrite=overwrite, path=path)
+             zarr.create_group(store=store, overwrite=overwrite, path=path, zarr_format=2)

        result = zarr.group(store=store, path=path, overwrite=overwrite)
        result.attrs.put(attrs)
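The group path follows the same pattern; a short sketch, again assuming `MemoryStore` and the illustrative `pydantic_zarr.v2` import path, with an arbitrary member named "data":

```python
import numpy as np
import zarr
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import ArraySpec, GroupSpec  # hypothetical import path

spec = GroupSpec(
    attributes={"description": "example"},
    members={"data": ArraySpec.from_array(np.zeros((4, 4), dtype="float32"))},
)

store = MemoryStore()
group = spec.to_zarr(store, "root")  # creates the group, then each member beneath it
assert isinstance(group, zarr.Group)
assert GroupSpec.from_zarr(group).attributes == {"description": "example"}
```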
@@ -746,7 +756,7 @@ def from_zarr(element: zarr.Array | zarr.Group, depth: int = -1) -> ArraySpec |
@overload
def to_zarr(
    spec: ArraySpec,
-     store: BaseStore,
+     store: Store,
    path: str,
    *,
    overwrite: bool = False,
@@ -757,7 +767,7 @@ def to_zarr(
@overload
def to_zarr(
    spec: GroupSpec,
-     store: BaseStore,
+     store: Store,
    path: str,
    *,
    overwrite: bool = False,
@@ -767,7 +777,7 @@

def to_zarr(
    spec: ArraySpec | GroupSpec,
-     store: BaseStore,
+     store: Store,
    path: str,
    *,
    overwrite: bool = False,
@@ -781,7 +791,7 @@ def to_zarr(
    ----------
    spec : ArraySpec | GroupSpec
        The `GroupSpec` or `ArraySpec` that will be serialized to storage.
-     store : zarr.BaseStore
+     store : zarr.abc.store.Store
        The storage backend that will manifest the Zarr group or array modeled by `spec`.
    path : str
        The location of the Zarr group or array inside the store.
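The module-level `to_zarr` dispatches to the `to_zarr` method of whichever spec it receives, so the overloads above let the return type track the spec type. A brief sketch under the same assumptions as the earlier examples:

```python
import numpy as np
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import ArraySpec, GroupSpec, to_zarr  # hypothetical import path

store = MemoryStore()

arr = to_zarr(ArraySpec.from_array(np.ones((3,))), store, "a")   # returns a zarr.Array
grp = to_zarr(GroupSpec(attributes={}, members={}), store, "g")  # returns a zarr.Group
```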
@@ -985,7 +995,7 @@ def auto_chunks(data: Any) -> tuple[int, ...]:
        return data.chunksize
    if hasattr(data, "chunks"):
        return data.chunks
-     return guess_chunks(data.shape, np.dtype(data.dtype).itemsize)
+     return _guess_chunks(data.shape, np.dtype(data.dtype).itemsize)


def auto_attributes(data: Any) -> Mapping[str, Any]:
@@ -1045,3 +1055,55 @@ def auto_dimension_separator(data: Any) -> Literal["/", "."]:
    if hasattr(data, "dimension_separator"):
        return data.dimension_separator
    return "/"
+
+
+ def _guess_chunks(shape: tuple[int, ...], typesize: int) -> tuple[int, ...]:
+     """
+     Vendored from zarr-python v2.
+
+     Guess an appropriate chunk layout for an array, given its shape and
+     the size of each element in bytes. Will allocate chunks only as large
+     as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of
+     each axis, slightly favoring bigger values for the last index.
+     Undocumented and subject to change without warning.
+     """
+
+     CHUNK_BASE = 256 * 1024  # Multiplier by which chunks are adjusted
+     CHUNK_MIN = 128 * 1024  # Soft lower limit (128k)
+     CHUNK_MAX = 64 * 1024 * 1024  # Hard upper limit
+
+     ndims = len(shape)
+     # require chunks to have non-zero length for all dimensions
+     chunks = np.maximum(np.array(shape, dtype="=f8"), 1)
+
+     # Determine the optimal chunk size in bytes using a PyTables expression.
+     # This is kept as a float.
+     dset_size = np.prod(chunks) * typesize
+     target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))
+
+     if target_size > CHUNK_MAX:
+         target_size = CHUNK_MAX
+     elif target_size < CHUNK_MIN:
+         target_size = CHUNK_MIN
+
+     idx = 0
+     while True:
+         # Repeatedly loop over the axes, dividing them by 2. Stop when:
+         # 1a. We're smaller than the target chunk size, OR
+         # 1b. We're within 50% of the target chunk size, AND
+         # 2. The chunk is smaller than the maximum chunk size
+
+         chunk_bytes = np.prod(chunks) * typesize
+
+         if (
+             chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5
+         ) and chunk_bytes < CHUNK_MAX:
+             break
+
+         if np.prod(chunks) == 1:
+             break  # Element size larger than CHUNK_MAX
+
+         chunks[idx % ndims] = math.ceil(chunks[idx % ndims] / 2.0)
+         idx += 1
+
+     return tuple(int(x) for x in chunks)
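For a sense of what the vendored heuristic produces, a small sketch (the exact chunk shape depends on the float arithmetic above, so only rough properties are asserted; the import path is illustrative):

```python
from pydantic_zarr.v2 import _guess_chunks  # hypothetical import path

# A 10000 x 10000 float64 array is ~800 MB, so the loop halves alternate axes
# until one chunk falls within ~50% of the ~2 MB target (and under the 64 MiB cap).
chunks = _guess_chunks((10_000, 10_000), typesize=8)
assert all(1 <= c <= 10_000 for c in chunks)

# Zero-length axes are clamped so every chunk edge is at least 1.
assert all(c >= 1 for c in _guess_chunks((0, 5), typesize=4))
```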