diff --git a/README.md b/README.md index 4773a5c..4af1c49 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,6 @@ [Pydantic](https://docs.pydantic.dev/latest/) models for [Zarr](https://zarr.readthedocs.io/en/stable/index.html). -## ⚠️ Disclaimer ⚠️ -This project is under flux -- I want to add [zarr version 3](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html) support to this project, but the [reference python implementation](https://github.com/zarr-developers/zarr-python) doesn't support version 3 yet. As the ecosystem evolves things will break so be advised! - ## Installation `pip install -U pydantic-zarr` @@ -56,5 +53,7 @@ print(spec.model_dump()) } """ ``` + ## History + This project was developed at [HHMI / Janelia Research Campus](https://www.janelia.org/). It was originally written by Davis Bennett to solve problems he encountered while working on the [Cellmap Project team](https://www.janelia.org/project-team/cellmap/members). In December of 2024 this project was migrated from the [`janelia-cellmap`](https://github.com/janelia-cellmap) github organization to [`zarr-developers`](https://github.com/zarr-developers) organization. diff --git a/docs/index.md b/docs/index.md index 0f47891..1ca5773 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,15 +8,17 @@ Static typing and runtime validation for Zarr hierarchies. `pydantic-zarr` expresses data stored in the [Zarr](https://zarr.readthedocs.io/en/stable/) format with [Pydantic](https://docs.pydantic.dev/1.10/). Specifically, `pydantic-zarr` encodes Zarr groups and arrays as [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/). These models are useful for formalizing the structure of Zarr hierarchies, type-checking Zarr hierarchies, and runtime validation for Zarr-based data. - ```python import zarr + from pydantic_zarr.v2 import GroupSpec # create a Zarr group -group = zarr.group(path='foo') +group = zarr.group(path='foo', zarr_format=2) # put an array inside the group -array = zarr.create(store = group.store, path='foo/bar', shape=10, dtype='uint8') +array = zarr.create( + store=group.store, path='foo/bar', shape=10, dtype='uint8', zarr_format=2 +) array.attrs.put({'metadata': 'hello'}) # create a pydantic model to model the Zarr group @@ -37,13 +39,7 @@ print(spec.model_dump()) 'order': 'C', 'filters': None, 'dimension_separator': '.', - 'compressor': { - 'id': 'blosc', - 'cname': 'lz4', - 'clevel': 5, - 'shuffle': 1, - 'blocksize': 0, - }, + 'compressor': {'id': 'zstd', 'level': 0, 'checksum': False}, } }, } @@ -56,11 +52,11 @@ More examples can be found in the [usage guide](usage_zarr_v2.md). `pip install -U pydantic-zarr` - ### Limitations #### No array data operations -This library only provides tools to represent the *layout* of Zarr groups and arrays, and the structure of their attributes. `pydantic-zarr` performs no type checking or runtime validation of the multidimensional array data contained *inside* Zarr arrays, and `pydantic-zarr` does not contain any tools for efficiently reading or writing Zarr arrays. + +This library only provides tools to represent the _layout_ of Zarr groups and arrays, and the structure of their attributes. `pydantic-zarr` performs no type checking or runtime validation of the multidimensional array data contained _inside_ Zarr arrays, and `pydantic-zarr` does not contain any tools for efficiently reading or writing Zarr arrays. #### Supported Zarr versions @@ -84,7 +80,7 @@ In `pydantic-zarr`, Zarr groups are modeled by the `GroupSpec` class, which is a Zarr arrays are represented by the `ArraySpec` class, which has a similar `attributes` field, as well as fields for all the Zarr array properties (`dtype`, `shape`, `chunks`, etc). -`GroupSpec` and `ArraySpec` are both [generic models](https://docs.pydantic.dev/1.10/usage/models/#generic-models). `GroupSpec` takes two type parameters, the first specializing the type of `GroupSpec.attributes`, and the second specializing the type of the *values* of `GroupSpec.members` (the keys of `GroupSpec.members` are always strings). `ArraySpec` only takes one type parameter, which specializes the type of `ArraySpec.attributes`. +`GroupSpec` and `ArraySpec` are both [generic models](https://docs.pydantic.dev/1.10/usage/models/#generic-models). `GroupSpec` takes two type parameters, the first specializing the type of `GroupSpec.attributes`, and the second specializing the type of the _values_ of `GroupSpec.members` (the keys of `GroupSpec.members` are always strings). `ArraySpec` only takes one type parameter, which specializes the type of `ArraySpec.attributes`. Examples using this generic typing functionality can be found in the [usage guide](usage_zarr_v2.md#using-generic-types). @@ -100,4 +96,4 @@ To handle these cases, `pydantic-zarr` allows the `members` attribute of a `Grou ## Standardization -The Zarr specifications do not define a model of the Zarr hierarchy. `pydantic-zarr` is an implementation of a particular model that can be found formalized in this [specification document](https://github.com/d-v-b/zeps/blob/zom/draft/ZEP0006.md), which has been proposed for inclusion in the Zarr specifications. You can find the discussion of that proposal in [this pull request](https://github.com/zarr-developers/zeps/pull/46). \ No newline at end of file +The Zarr specifications do not define a model of the Zarr hierarchy. `pydantic-zarr` is an implementation of a particular model that can be found formalized in this [specification document](https://github.com/d-v-b/zeps/blob/zom/draft/ZEP0006.md), which has been proposed for inclusion in the Zarr specifications. You can find the discussion of that proposal in [this pull request](https://github.com/zarr-developers/zeps/pull/46). diff --git a/docs/usage_zarr_v2.md b/docs/usage_zarr_v2.md index 1b284b6..da3ccd7 100644 --- a/docs/usage_zarr_v2.md +++ b/docs/usage_zarr_v2.md @@ -6,26 +6,27 @@ The `GroupSpec` and `ArraySpec` classes represent Zarr v2 groups and arrays, respectively. To create an instance of a `GroupSpec` or `ArraySpec` from an existing Zarr group or array, pass the Zarr group / array to the `.from_zarr` method defined on the `GroupSpec` / `ArraySpec` classes. This will result in a `pydantic-zarr` model of the Zarr object. -> By default `GroupSpec.from_zarr(zarr_group)` will traverse the entire hierarchy under `zarr_group`. This can be extremely slow if used on an extensive Zarr group on high latency storage. To limit the depth of traversal to a specific depth, use the `depth` keyword argument, e.g. `GroupSpec.from_zarr(zarr_group, depth=1)` +> By default `GroupSpec.from_zarr(zarr_group)` will traverse the entire hierarchy under `zarr_group`. This can be extremely slow if used on an extensive Zarr group on high latency storage. To limit the depth of traversal to a specific depth, use the `depth` keyword argument, e.g. `GroupSpec.from_zarr(zarr_group, depth=1)` -Note that `from_zarr` will *not* read the data inside an array. +Note that `from_zarr` will _not_ read the data inside an array. ### Writing To write a hierarchy to some zarr-compatible storage backend, `GroupSpec` and `ArraySpec` have `to_zarr` methods that take a Zarr store and a path and return a Zarr array or group created in the store at the given path. -Note that `to_zarr` will *not* write any array data. You have to do this separately. +Note that `to_zarr` will _not_ write any array data. You have to do this separately. ```python -from zarr import group -from zarr.creation import create -from zarr.storage import MemoryStore +from zarr import create, group + from pydantic_zarr.v2 import GroupSpec # create an in-memory Zarr group + array with attributes -grp = group(path='foo') +grp = group(path='foo', zarr_format=2) grp.attrs.put({'group_metadata': 10}) -arr = create(path='foo/bar', store=grp.store, shape=(10,), compressor=None) +arr = create( + path='foo/bar', store=grp.store, shape=(10,), compressor=None, zarr_format=2 +) arr.attrs.put({'array_metadata': True}) spec = GroupSpec.from_zarr(grp) @@ -63,15 +64,9 @@ spec_dict2['members']['bar']['shape'] = (100,) # serialize the spec to the store group2 = GroupSpec(**spec_dict2).to_zarr(grp.store, path='foo2') -print(group2) -#> - print(dict(group2.attrs)) #> {'a': 100, 'b': 'metadata'} -print(group2['bar']) -#> - print(dict(group2['bar'].attrs)) #> {'array_metadata': True} ``` @@ -81,9 +76,10 @@ print(dict(group2['bar'].attrs)) The `ArraySpec` class has a `from_array` static method that takes an array-like object and returns an `ArraySpec` with `shape` and `dtype` fields matching those of the array-like object. ```python -from pydantic_zarr.v2 import ArraySpec import numpy as np +from pydantic_zarr.v2 import ArraySpec + print(ArraySpec.from_array(np.arange(10)).model_dump()) """ { @@ -100,6 +96,7 @@ print(ArraySpec.from_array(np.arange(10)).model_dump()) } """ ``` + ### Flattening and unflattening Zarr hierarchies In the previous section we built a model of a Zarr hierarchy by defining `GroupSpec` and `ArraySpec` @@ -117,15 +114,16 @@ methods to convert to / from these dictionaries. This example demonstrates how to create a `GroupSpec` from a `dict` representation of a Zarr hierarchy. ```python -from pydantic_zarr.v2 import GroupSpec, ArraySpec +from pydantic_zarr.v2 import ArraySpec, GroupSpec + # other than the key representing the root path "", # the keys must be valid paths in the Zarr storage hierarchy # note that the `members` attribute is `None` for the `GroupSpec` instances in this `dict`. tree = { "": GroupSpec(members=None, attributes={"root": True}), "/a": GroupSpec(members=None, attributes={"root": False}), - "/a/b": ArraySpec(shape=(10,10), dtype="uint8", chunks=(1,1)) - } + "/a/b": ArraySpec(shape=(10, 10), dtype="uint8", chunks=(1, 1)), +} print(GroupSpec.from_flat(tree).model_dump()) """ @@ -162,12 +160,13 @@ This is similar to the example above, except that we are working in reverse -- w flat `dict` from the `GroupSpec` object. ```python -from pydantic_zarr.v2 import GroupSpec, ArraySpec +from pydantic_zarr.v2 import ArraySpec, GroupSpec + # other than the key representing the root path "", # the keys must be valid paths in the Zarr storage hierarchy # note that the `members` attribute is `None` for the `GroupSpec` instances in this `dict`. -a_b = ArraySpec(shape=(10,10), dtype="uint8", chunks=(1,1)) +a_b = ArraySpec(shape=(10, 10), dtype="uint8", chunks=(1, 1)) a = GroupSpec(members={'b': a_b}, attributes={"root": False}) root = GroupSpec(members={'a': a}, attributes={"root": True}) @@ -193,12 +192,14 @@ print(root.to_flat()) ``` #### Implicit groups + `zarr-python` supports creating Zarr arrays or groups deep in the hierarchy without explicitly creating the intermediate groups first. `from_flat` models this behavior. For example, `{'/a/b/c': ArraySpec(...)}` implicitly defines the existence of a groups named `a` and `b` (which is contained in `a`). `from_flat` will create the expected `GroupSpec` object from such `dict` instances. ```python -from pydantic_zarr.v2 import GroupSpec, ArraySpec +from pydantic_zarr.v2 import ArraySpec, GroupSpec + tree = {'/a/b/c': ArraySpec(shape=(1,), dtype='uint8', chunks=(1,))} print(GroupSpec.from_flat(tree).model_dump()) """ @@ -244,8 +245,11 @@ The `like` method works by converting both input models to `dict` via `pydantic. The `like` method takes keyword arguments `include` and `exclude`, which determine the attributes included or excluded from the model comparison. So it's possible to use `like` to check if two `ArraySpec` instances have the same `shape`, `dtype` and `chunks` by calling `array_a.like(array_b, include={'shape', 'dtype', 'chunks'})`. This is useful if you don't care about the compressor or filters and just want to ensure that you can safely write an in-memory array to a Zarr array, which depends just on the two arrays having matching `shape`, `dtype`, and `chunks` attributes. ```python -from pydantic_zarr.v2 import ArraySpec, GroupSpec import zarr +import zarr.storage + +from pydantic_zarr.v2 import ArraySpec, GroupSpec + arr_a = ArraySpec(shape=(1,), dtype='uint8', chunks=(1,)) # make an array with a different shape arr_b = ArraySpec(shape=(2,), dtype='uint8', chunks=(1,)) @@ -259,7 +263,7 @@ print(arr_a.like(arr_b, exclude={'shape'})) #> True # `ArraySpec.like` will convert a zarr.Array to ArraySpec -store = zarr.MemoryStore() +store = zarr.storage.MemoryStore() # This is a zarr.Array arr_a_stored = arr_a.to_zarr(store, path='arr_a') @@ -302,25 +306,28 @@ This example shows how to specialize `GroupSpec` and `ArraySpec` with type param ```python import sys -from pydantic_zarr.v2 import GroupSpec, ArraySpec, TItem, TAttr + from pydantic import ValidationError -from typing import Any + +from pydantic_zarr.v2 import ArraySpec, GroupSpec, TAttr, TItem if sys.version_info < (3, 12): from typing_extensions import TypedDict else: from typing import TypedDict + # a Pydantic BaseModel would also work here class GroupAttrs(TypedDict): a: int b: int + # a Zarr group with attributes consistent with GroupAttrs SpecificAttrsGroup = GroupSpec[GroupAttrs, TItem] try: - SpecificAttrsGroup(attributes={'a' : 10, 'b': 'foo'}) + SpecificAttrsGroup(attributes={'a': 10, 'b': 'foo'}) except ValidationError as exc: print(exc) """ @@ -350,11 +357,11 @@ except ValidationError as exc: """ # this passes validation -items = {'foo': ArraySpec(attributes={}, - shape=(1,), - dtype='uint8', - chunks=(1,), - compressor=None)} +items = { + 'foo': ArraySpec( + attributes={}, shape=(1,), dtype='uint8', chunks=(1,), compressor=None + ) +} print(ArraysOnlyGroup(attributes={}, members=items).model_dump()) """ { diff --git a/docs/usage_zarr_v3.md b/docs/usage_zarr_v3.md index 57fc073..63f6ee8 100644 --- a/docs/usage_zarr_v3.md +++ b/docs/usage_zarr_v3.md @@ -12,7 +12,8 @@ the backend for that doesn't exist. ## Defining Zarr v3 hierarchies ```python -from pydantic_zarr.v3 import GroupSpec, ArraySpec, NamedConfig +from pydantic_zarr.v3 import ArraySpec, GroupSpec, NamedConfig + array_attributes = {"baz": [1, 2, 3]} group_attributes = {"foo": 42, "bar": False} @@ -21,12 +22,8 @@ array_spec = ArraySpec( shape=[1000, 1000], dimension_names=["rows", "columns"], data_type="uint8", - chunk_grid=NamedConfig( - name="regular", configuration={"chunk_shape": [1000, 100]} - ), - chunk_key_encoding=NamedConfig( - name="default", configuration={"separator": "/"} - ), + chunk_grid=NamedConfig(name="regular", configuration={"chunk_shape": [1000, 100]}), + chunk_key_encoding=NamedConfig(name="default", configuration={"separator": "/"}), codecs=[NamedConfig(name="GZip", configuration={"level": 1})], fill_value=0, ) diff --git a/pyproject.toml b/pyproject.toml index 008135f..748bd17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,9 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", ] dependencies = [ - "zarr<3", - "pydantic>2.0.0" - ] + "zarr@git+https://github.com/zarr-developers/zarr-python.git@main", + "pydantic>2.0.0", +] [project.urls] Documentation = "https://zarr.dev/pydantic-zarr/" @@ -52,6 +52,9 @@ docs = [ version.source = "vcs" build.hooks.vcs.version-file = "src/pydantic_zarr/_version.py" +[tool.hatch.metadata] +allow-direct-references = true + [tool.hatch.envs.test] features = ["test"] @@ -194,7 +197,9 @@ addopts = [ "--durations=10", "-ra", "--strict-config", "--strict-markers", ] filterwarnings = [ - "error" + "error", + # https://github.com/zarr-developers/zarr-python/issues/2948 + "ignore:The `order` keyword argument has no effect for Zarr format 3 arrays:RuntimeWarning", ] [tool.repo-review] diff --git a/src/pydantic_zarr/core.py b/src/pydantic_zarr/core.py index abb832a..207d64f 100644 --- a/src/pydantic_zarr/core.py +++ b/src/pydantic_zarr/core.py @@ -7,6 +7,8 @@ TypeAlias, ) +import numpy as np +import numpy.typing as npt from pydantic import BaseModel, ConfigDict IncEx: TypeAlias = set[int] | set[str] | dict[int, Any] | dict[str, Any] | None @@ -18,6 +20,23 @@ class StrictBase(BaseModel): model_config = ConfigDict(frozen=True, extra="forbid") +def stringify_dtype(value: npt.DTypeLike) -> str: + """ + Convert a `numpy.dtype` object into a `str`. + + Parameters + --------- + value: `npt.DTypeLike` + Some object that can be coerced to a numpy dtype + + Returns + ------- + + A numpy dtype string representation of `value`. + """ + return np.dtype(value).str + + def ensure_member_name(data: Any) -> str: """ If the input is a string, then ensure that it is a valid diff --git a/src/pydantic_zarr/v2.py b/src/pydantic_zarr/v2.py index 786d1f6..d167537 100644 --- a/src/pydantic_zarr/v2.py +++ b/src/pydantic_zarr/v2.py @@ -1,5 +1,6 @@ from __future__ import annotations +import math import os from collections.abc import Mapping from typing import ( @@ -19,38 +20,30 @@ import numpy.typing as npt import zarr from numcodecs.abc import Codec -from pydantic import AfterValidator, model_validator +from pydantic import AfterValidator, field_validator, model_validator from pydantic.functional_validators import BeforeValidator +from zarr.abc.store import Store +from zarr.core.sync_group import get_node from zarr.errors import ContainsArrayError, ContainsGroupError -from zarr.storage import BaseStore, contains_array, contains_group, init_group -from zarr.util import guess_chunks - -from pydantic_zarr.core import ( - IncEx, - StrictBase, - ensure_key_no_path, - model_like, -) + +from pydantic_zarr.core import IncEx, StrictBase, ensure_key_no_path, model_like, stringify_dtype TAttr = TypeVar("TAttr", bound=Mapping[str, Any]) TItem = TypeVar("TItem", bound=Union["GroupSpec", "ArraySpec"]) -def stringify_dtype(value: npt.DTypeLike) -> str: - """ - Convert a `numpy.dtype` object into a `str`. - - Parameters - --------- - value: `npt.DTypeLike` - Some object that can be coerced to a numpy dtype +def _contains_array(store: Store, path: str) -> bool: + try: + return isinstance(get_node(store, path, zarr_format=2), zarr.Array) + except FileNotFoundError: + return False - Returns - ------- - A numpy dtype string representation of `value`. - """ - return np.dtype(value).str +def _contains_group(store: Store, path: str) -> bool: + try: + return isinstance(get_node(store, path, zarr_format=2), zarr.Group) + except FileNotFoundError: + return False DtypeStr = Annotated[str, BeforeValidator(stringify_dtype)] @@ -165,6 +158,14 @@ class ArraySpec(NodeSpec, Generic[TAttr]): ] = "/" compressor: CodecDict | None = None + @field_validator("filters", mode="after") + @classmethod + def validate_filters(cls, value: list[CodecDict] | None) -> list[CodecDict] | None: + # Make sure filters is never an empty list + if value == []: + return None + return value + @model_validator(mode="after") def check_ndim(self): """ @@ -318,14 +319,14 @@ def from_zarr(cls, array: zarr.Array) -> Self: fill_value=array.dtype.type(array.fill_value).tolist(), order=array.order, filters=array.filters, - dimension_separator=array._dimension_separator, - compressor=array.compressor, + dimension_separator=array.metadata.dimension_separator, + compressor=array.compressors[0].get_config() if len(array.compressors) else None, attributes=array.attrs.asdict(), ) def to_zarr( self, - store: BaseStore, + store: Store, path: str, *, overwrite: bool = False, @@ -337,7 +338,7 @@ def to_zarr( Parameters ---------- - store : instance of zarr.BaseStore + store : instance of zarr.abc.store.Store The storage backend that will manifest the array. path : str The location of the array inside the store. @@ -345,6 +346,7 @@ def to_zarr( Whether to overwrite existing objects in storage to create the Zarr array. **kwargs : Any Additional keyword arguments are passed to `zarr.create`. + Returns ------- zarr.Array @@ -356,24 +358,20 @@ def to_zarr( spec_dict["compressor"] = numcodecs.get_codec(spec_dict["compressor"]) if self.filters is not None: spec_dict["filters"] = [numcodecs.get_codec(f) for f in spec_dict["filters"]] - if contains_array(store, path): - extant_array = zarr.open_array(store, path=path, mode="r") + if _contains_array(store, path): + extant_array = zarr.open_array(store, path=path, mode="r", zarr_format=2) if not self.like(extant_array): if not overwrite: - msg = ( - f"An array already exists at path {path}. " - "That array is structurally dissimilar to the array you are trying to " - "store. Call to_zarr with overwrite=True to overwrite that array." - ) - raise ContainsArrayError(msg) + raise ContainsArrayError(store, path) else: if not overwrite: # extant_array is read-only, so we make a new array handle that # takes **kwargs return zarr.open_array( - store=extant_array.store, path=extant_array.path, **kwargs + store=extant_array.store, path=extant_array.path, zarr_format=2, **kwargs ) + spec_dict["zarr_format"] = spec_dict.pop("zarr_version", 2) result = zarr.create(store=store, path=path, overwrite=overwrite, **spec_dict, **kwargs) result.attrs.put(attrs) return result @@ -500,7 +498,8 @@ def from_zarr(cls, group: zarr.Group, *, depth: int = -1) -> Self: if depth == 0: return cls(attributes=attributes, members=None) new_depth = max(depth - 1, -1) - for name, item in group.items(): + for name in group: + item = group[name] if isinstance(item, zarr.Array): # convert to dict before the final typed GroupSpec construction item_out = ArraySpec.from_zarr(item).model_dump() @@ -519,13 +518,14 @@ def from_zarr(cls, group: zarr.Group, *, depth: int = -1) -> Self: result = cls(attributes=attributes, members=members) return result - def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwargs): + def to_zarr(self, store: Store, path: str, *, overwrite: bool = False, **kwargs): """ - Serialize this `GroupSpec` to a Zarr group at a specific path in a `zarr.BaseStore`. + Serialize this `GroupSpec` to a Zarr group at a specific path in a `zarr.abc.store.Store`. This operation will create metadata documents in the store. + Parameters ---------- - store : zarr.BaseStore + store : zarr.abc.store.Store The storage backend that will manifest the group and its contents. path : str The location of the group inside the store. @@ -542,8 +542,8 @@ def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwa """ spec_dict = self.model_dump(exclude={"members": True}) attrs = spec_dict.pop("attributes") - if contains_group(store, path): - extant_group = zarr.group(store, path=path) + if _contains_group(store, path): + extant_group = zarr.group(store, path=path, zarr_format=2) if not self.like(extant_group): if not overwrite: msg = ( @@ -558,16 +558,16 @@ def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwa # then just return the extant group return extant_group - elif contains_array(store, path) and not overwrite: + elif _contains_array(store, path) and not overwrite: msg = ( f"An array already exists at path {path}. " "Call to_zarr with overwrite=True to overwrite the array." ) raise ContainsArrayError(msg) else: - init_group(store=store, overwrite=overwrite, path=path) + zarr.create_group(store=store, overwrite=overwrite, path=path, zarr_format=2) - result = zarr.group(store=store, path=path, overwrite=overwrite) + result = zarr.group(store=store, path=path, overwrite=overwrite, zarr_format=2) result.attrs.put(attrs) # consider raising an exception if a partial GroupSpec is provided if self.members is not None: @@ -746,7 +746,7 @@ def from_zarr(element: zarr.Array | zarr.Group, depth: int = -1) -> ArraySpec | @overload def to_zarr( spec: ArraySpec, - store: BaseStore, + store: Store, path: str, *, overwrite: bool = False, @@ -757,7 +757,7 @@ def to_zarr( @overload def to_zarr( spec: GroupSpec, - store: BaseStore, + store: Store, path: str, *, overwrite: bool = False, @@ -767,7 +767,7 @@ def to_zarr( def to_zarr( spec: ArraySpec | GroupSpec, - store: BaseStore, + store: Store, path: str, *, overwrite: bool = False, @@ -781,7 +781,7 @@ def to_zarr( ---------- spec : ArraySpec | GroupSpec The `GroupSpec` or `ArraySpec` that will be serialized to storage. - store : zarr.BaseStore + store : zarr.abc.store.BaseStore The storage backend that will manifest the Zarr group or array modeled by `spec`. path : str The location of the Zarr group or array inside the store. @@ -985,7 +985,7 @@ def auto_chunks(data: Any) -> tuple[int, ...]: return data.chunksize if hasattr(data, "chunks"): return data.chunks - return guess_chunks(data.shape, np.dtype(data.dtype).itemsize) + return _guess_chunks(data.shape, np.dtype(data.dtype).itemsize) def auto_attributes(data: Any) -> Mapping[str, Any]: @@ -1045,3 +1045,55 @@ def auto_dimension_separator(data: Any) -> Literal["/", "."]: if hasattr(data, "dimension_separator"): return data.dimension_separator return "/" + + +def _guess_chunks(shape: tuple[int, ...], typesize: int) -> tuple[int, ...]: + """ + Vendored from zarr-python v2. + + Guess an appropriate chunk layout for an array, given its shape and + the size of each element in bytes. Will allocate chunks only as large + as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of + each axis, slightly favoring bigger values for the last index. + Undocumented and subject to change without warning. + """ + + CHUNK_BASE = 256 * 1024 # Multiplier by which chunks are adjusted + CHUNK_MIN = 128 * 1024 # Soft lower limit (128k) + CHUNK_MAX = 64 * 1024 * 1024 # Hard upper limit + + ndims = len(shape) + # require chunks to have non-zero length for all dimensions + chunks = np.maximum(np.array(shape, dtype="=f8"), 1) + + # Determine the optimal chunk size in bytes using a PyTables expression. + # This is kept as a float. + dset_size = np.prod(chunks) * typesize + target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024))) + + if target_size > CHUNK_MAX: + target_size = CHUNK_MAX + elif target_size < CHUNK_MIN: + target_size = CHUNK_MIN + + idx = 0 + while True: + # Repeatedly loop over the axes, dividing them by 2. Stop when: + # 1a. We're smaller than the target chunk size, OR + # 1b. We're within 50% of the target chunk size, AND + # 2. The chunk is smaller than the maximum chunk size + + chunk_bytes = np.prod(chunks) * typesize + + if ( + chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5 + ) and chunk_bytes < CHUNK_MAX: + break + + if np.prod(chunks) == 1: + break # Element size larger than CHUNK_MAX + + chunks[idx % ndims] = math.ceil(chunks[idx % ndims] / 2.0) + idx += 1 + + return tuple(int(x) for x in chunks) diff --git a/src/pydantic_zarr/v3.py b/src/pydantic_zarr/v3.py index 60ee80e..361e5c8 100644 --- a/src/pydantic_zarr/v3.py +++ b/src/pydantic_zarr/v3.py @@ -1,43 +1,49 @@ from __future__ import annotations -from collections.abc import Mapping, Sequence from typing import ( + TYPE_CHECKING, + Annotated, Any, Generic, Literal, + Self, TypeVar, Union, cast, overload, ) +import numpy as np import numpy.typing as npt -import zarr -from zarr.storage import BaseStore +from pydantic import BeforeValidator from pydantic_zarr.core import StrictBase -from pydantic_zarr.v2 import DtypeStr +from pydantic_zarr.v2 import stringify_dtype + +if TYPE_CHECKING: + from collections.abc import Sequence + + import zarr + from zarr.abc.store import Store TAttr = TypeVar("TAttr", bound=dict[str, Any]) TItem = TypeVar("TItem", bound=Union["GroupSpec", "ArraySpec"]) NodeType = Literal["group", "array"] +BoolFillValue = bool +IntFillValue = int # todo: introduce a type that represents hexadecimal representations of floats -FillValue = Union[ - Literal["Infinity", "-Infinity", "NaN"], - bool, - int, - float, - str, - tuple[float, float], - tuple[int, ...], -] +FloatFillValue = Literal["Infinity", "-Infinity", "NaN"] | float +ComplexFillValue = tuple[FloatFillValue, FloatFillValue] +RawFillValue = tuple[int, ...] + +FillValue = BoolFillValue | IntFillValue | FloatFillValue | ComplexFillValue | RawFillValue class NamedConfig(StrictBase): name: str - configuration: Mapping[str, Any] | None + configuration: dict[str, Any] | None class RegularChunkingConfig(StrictBase): @@ -50,12 +56,12 @@ class RegularChunking(NamedConfig): class DefaultChunkKeyEncodingConfig(StrictBase): - separator: Literal[".", "/"] + separator: Literal[".", "/"] = "/" class DefaultChunkKeyEncoding(NamedConfig): - name: Literal["default"] - configuration: DefaultChunkKeyEncodingConfig | None + name: Literal["default"] = "default" + configuration: DefaultChunkKeyEncodingConfig | None = DefaultChunkKeyEncodingConfig() class NodeSpec(StrictBase): @@ -72,6 +78,9 @@ class NodeSpec(StrictBase): zarr_format: Literal[3] = 3 +DtypeStr = Annotated[str, BeforeValidator(stringify_dtype)] + + class ArraySpec(NodeSpec, Generic[TAttr]): """ A model of a Zarr Version 3 Array. @@ -104,17 +113,28 @@ class ArraySpec(NodeSpec, Generic[TAttr]): node_type: Literal["array"] = "array" attributes: TAttr = cast(TAttr, {}) - shape: Sequence[int] + shape: tuple[int, ...] data_type: DtypeStr chunk_grid: NamedConfig # todo: validate this against shape chunk_key_encoding: NamedConfig # todo: validate this against shape fill_value: FillValue # todo: validate this against the data type - codecs: Sequence[NamedConfig] - storage_transformers: Sequence[NamedConfig] | None = None - dimension_names: Sequence[str] | None # todo: validate this against shape + codecs: tuple[NamedConfig, ...] + storage_transformers: tuple[NamedConfig, ...] + dimension_names: tuple[str | None, ...] # todo: validate this against shape @classmethod - def from_array(cls, array: npt.NDArray[Any], **kwargs): + def from_array( + cls, + array: npt.NDArray[Any], + *, + attributes: Literal["auto"] | TAttr = "auto", + chunk_grid: Literal["auto"] | NamedConfig = "auto", + chunk_key_encoding: Literal["auto"] | NamedConfig = "auto", + fill_value: Literal["auto"] | FillValue = "auto", + codecs: Literal["auto"] | Sequence[NamedConfig] = "auto", + storage_transformers: Literal["auto"] | Sequence[NamedConfig] = "auto", + dimension_names: Literal["auto"] | Sequence[str | None] = "auto", + ) -> Self: """ Create an ArraySpec from a numpy array-like object. @@ -131,15 +151,51 @@ def from_array(cls, array: npt.NDArray[Any], **kwargs): An instance of ArraySpec with properties derived from the provided array. """ - default_chunks = RegularChunking( - configuration=RegularChunkingConfig(chunk_shape=list(array.shape)) - ) + if attributes == "auto": + attributes_actual = cast(TAttr, auto_attributes(array)) + else: + attributes_actual = attributes + + if chunk_grid == "auto": + chunk_grid_actual = auto_chunk_grid(array) + else: + chunk_grid_actual = chunk_grid + + if chunk_key_encoding == "auto": + chunk_key_actual = DefaultChunkKeyEncoding() + else: + chunk_key_actual = chunk_key_encoding + + if fill_value == "auto": + fill_value_actual = auto_fill_value(array) + else: + fill_value_actual = fill_value + + if codecs == "auto": + codecs_actual = auto_codecs(array) + else: + codecs_actual = codecs + + if storage_transformers == "auto": + storage_transformers_actual = auto_storage_transformers(array) + else: + storage_transformers_actual = storage_transformers + + if dimension_names == "auto": + dimension_names_actual = auto_dimension_names(array) + else: + dimension_names_actual = dimension_names + return cls( shape=array.shape, data_type=str(array.dtype), - chunk_grid=kwargs.pop("chunks", default_chunks), - attributes=kwargs.pop("attributes", {}), - **kwargs, + chunk_grid=chunk_grid_actual, + attributes=attributes_actual, + chunk_key_encoding=chunk_key_actual, + fill_value=fill_value_actual, + codecs=codecs_actual, + storage_transformers=storage_transformers_actual, + dimension_names=dimension_names_actual, ) @classmethod @@ -159,13 +215,13 @@ def from_zarr(cls, zarray: zarr.Array): """ raise NotImplementedError - def to_zarr(self, store: BaseStore, path: str, overwrite: bool = False) -> zarr.Array: + def to_zarr(self, store: Store, path: str, overwrite: bool = False) -> zarr.Array: """ Serialize an ArraySpec to a zarr array at a specific path in a zarr store. Parameters ---------- - store : instance of zarr.BaseStore + store : instance of zarr.abc.store.Store The storage backend that will manifest the array. path : str The location of the array inside the store. @@ -222,13 +278,13 @@ def from_zarr(cls, group: zarr.Group) -> GroupSpec[TAttr, TItem]: raise NotImplementedError - def to_zarr(self, store: BaseStore, path: str, overwrite: bool = False): + def to_zarr(self, store: Store, path: str, overwrite: bool = False): """ Serialize a GroupSpec to a zarr group at a specific path in a zarr store. Parameters ---------- - store : instance of zarr.BaseStore + store : instance of zarr.abc.store.Store The storage backend that will manifest the group and its contents. path : str The location of the group inside the store. @@ -273,7 +329,7 @@ def from_zarr(element: zarr.Array | zarr.Group) -> ArraySpec | GroupSpec: @overload def to_zarr( spec: ArraySpec, - store: BaseStore, + store: Store, path: str, overwrite: bool = False, ) -> zarr.Array: ... @@ -282,7 +338,7 @@ def to_zarr( @overload def to_zarr( spec: GroupSpec, - store: BaseStore, + store: Store, path: str, overwrite: bool = False, ) -> zarr.Group: ... @@ -290,7 +346,7 @@ def to_zarr( def to_zarr( spec: ArraySpec | GroupSpec, - store: BaseStore, + store: Store, path: str, overwrite: bool = False, ) -> zarr.Array | zarr.Group: @@ -302,7 +358,7 @@ def to_zarr( ---------- spec : GroupSpec or ArraySpec The GroupSpec or ArraySpec that will be serialized to storage. - store : instance of zarr.BaseStore + store : instance of zarr.abc.store.Store The storage backend that will manifest the group or array. path : str The location of the group or array inside the store. @@ -325,3 +381,53 @@ def to_zarr( raise ValueError(msg) return result + + +def auto_attributes(array: Any) -> TAttr: + if hasattr(array, "attributes"): + return array.attributes + return cast(TAttr, {}) + + +def auto_chunk_grid(array: Any) -> NamedConfig: + if hasattr(array, "chunk_shape"): + return array.chunk_shape + elif hasattr(array, "shape"): + return RegularChunking(configuration=RegularChunkingConfig(chunk_shape=list(array.shape))) + raise ValueError("Cannot get chunk grid from object without .shape attribute") + + +def auto_fill_value(array: Any) -> FillValue: + if hasattr(array, "fill_value"): + return array.fill_value + elif hasattr(array, "dtype"): + kind = np.dtype(array.dtype).kind + if kind == "?": + return False + elif kind in ["i", "u"]: + return 0 + elif kind in ["f"]: + return "NaN" + elif kind in ["c"]: + return ("NaN", "NaN") + else: + raise ValueError(f"Cannot determine default fill value for data type {kind}") + raise ValueError("Cannot determine default data type for object without shape attribute.") + + +def auto_codecs(array: Any) -> Sequence[NamedConfig]: + if hasattr(array, "codecs"): + return array.codecs + return [] + + +def auto_storage_transformers(array: Any) -> list: + if hasattr(array, "storage_transformers"): + return array.storage_transformers + return [] + + +def auto_dimension_names(array: Any) -> list[str | None]: + if hasattr(array, "dimension_names"): + return array.dimension_names + return [None] * np.asanyarray(array, copy=False).ndim diff --git a/tests/test_pydantic_zarr/test_v2.py b/tests/test_pydantic_zarr/test_v2.py index 708334f..b0ef2b7 100644 --- a/tests/test_pydantic_zarr/test_v2.py +++ b/tests/test_pydantic_zarr/test_v2.py @@ -4,9 +4,20 @@ from __future__ import annotations +from typing import TYPE_CHECKING, Any + +import pytest +import zarr +import zarr.storage +from pydantic import ValidationError +from zarr.errors import ContainsArrayError, ContainsGroupError + +if TYPE_CHECKING: + from typing import Literal + import sys from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: from numcodecs.abc import Codec @@ -14,11 +25,8 @@ import numcodecs import numpy as np import numpy.typing as npt -import pytest import zarr from numcodecs import GZip -from pydantic import ValidationError -from zarr.errors import ContainsArrayError, ContainsGroupError from pydantic_zarr.v2 import ( ArraySpec, @@ -85,7 +93,7 @@ def test_array_spec( compressor: Codec | None, filters: tuple[str, ...] | None, ) -> None: - store = zarr.MemoryStore() + store = zarr.storage.MemoryStore() _filters: list[Codec] | None if filters is not None: _filters = [] @@ -107,56 +115,57 @@ def test_array_spec( dimension_separator=dimension_separator, compressor=compressor, filters=_filters, + zarr_format=2, ) attributes = {"foo": [100, 200, 300], "bar": "hello"} array.attrs.put(attributes) spec = ArraySpec.from_zarr(array) - assert spec.zarr_format == array._version + assert spec.zarr_format == array.metadata.zarr_format assert spec.dtype == array.dtype assert spec.attributes == array.attrs assert spec.chunks == array.chunks - assert spec.dimension_separator == array._dimension_separator + assert spec.dimension_separator == array.metadata.dimension_separator assert spec.shape == array.shape assert spec.fill_value == array.fill_value # this is a sign that nullability is being misused in zarr-python # the correct approach would be to use an empty list to express "no filters". - if array.filters is not None: + if len(array.filters): assert spec.filters == [f.get_config() for f in array.filters] else: - assert spec.filters == array.filters + assert spec.filters is None - if array.compressor is not None: - assert spec.compressor == array.compressor.get_config() + if len(array.compressors): + assert spec.compressor == array.compressors[0].get_config() else: - assert spec.compressor == array.compressor + assert spec.compressor is None assert spec.order == array.order array2 = spec.to_zarr(store, "foo2") - assert spec.zarr_format == array2._version + assert spec.zarr_format == array2.metadata.zarr_format assert spec.dtype == array2.dtype assert spec.attributes == array2.attrs assert spec.chunks == array2.chunks - if array2.compressor is not None: - assert spec.compressor == array2.compressor.get_config() + if len(array2.compressors): + assert spec.compressor == array2.compressors[0].get_config() else: - assert spec.compressor == array2.compressor + assert spec.compressor is None - if array2.filters is not None: + if len(array2.filters): assert spec.filters == [f.get_config() for f in array2.filters] else: - assert spec.filters == array2.filters + assert spec.filters is None - assert spec.dimension_separator == array2._dimension_separator + assert spec.dimension_separator == array2.metadata.dimension_separator assert spec.shape == array2.shape assert spec.fill_value == array2.fill_value # test serialization - store = zarr.MemoryStore() + store = zarr.storage.MemoryStore() stored = spec.to_zarr(store, path="foo") assert ArraySpec.from_zarr(stored) == spec @@ -174,9 +183,19 @@ def test_array_spec( # test that mode and write_empty_chunks get passed through assert spec_2.to_zarr(store, path="foo", mode="a").read_only is False - assert spec_2.to_zarr(store, path="foo", mode="r").read_only is True - assert spec_2.to_zarr(store, path="foo", write_empty_chunks=False)._write_empty_chunks is False - assert spec_2.to_zarr(store, path="foo", write_empty_chunks=True)._write_empty_chunks is True + # TODO: uncomment line below when https://github.com/zarr-developers/zarr-python/issues/2949 is fixed + # assert spec_2.to_zarr(store, path="foo", mode="r").read_only is True + # TODO: work out if there's a way to get the status of "write_empty_chunks" from an array + """ + assert ( + spec_2.to_zarr(store, path="foo", config={"write_empty_chunks": False})._write_empty_chunks + is False + ) + assert ( + spec_2.to_zarr(store, path="foo", config={"write_empty_chunks": True})._write_empty_chunks + is True + ) + """ @dataclass @@ -276,7 +295,7 @@ def test_array_spec_from_array( if filters in auto_options: assert spec.filters == auto_filters(array) else: - assert spec.filters == filters + assert spec.filters is None if dimension_separator in auto_options: assert spec.dimension_separator == auto_dimension_separator(array) @@ -293,9 +312,7 @@ def test_array_spec_from_array( @pytest.mark.parametrize("dtype", ["bool", "uint8", np.dtype("uint8"), "float64"]) @pytest.mark.parametrize("dimension_separator", [".", "/"]) @pytest.mark.parametrize("compressor", [numcodecs.LZMA().get_config(), numcodecs.GZip()]) -@pytest.mark.parametrize( - "filters", [None, ("delta",), ("scale_offset",), ("delta", "scale_offset")] -) +@pytest.mark.parametrize("filters", [(), ("delta",), ("scale_offset",), ("delta", "scale_offset")]) def test_serialize_deserialize_groupspec( chunks: tuple[int, ...], memory_order: ArrayMemoryOrder, @@ -328,7 +345,7 @@ class SubGroupAttrs(TypedDict): class ArrayAttrs(TypedDict): scale: list[float] - store = zarr.MemoryStore() + store = zarr.storage.MemoryStore() spec = GroupSpec[RootAttrs, ArraySpec | SubGroup]( attributes=RootAttrs(foo=10, bar=[0, 1, 2]), @@ -367,7 +384,7 @@ class ArrayAttrs(TypedDict): assert observed == spec # assert that we get the same group twice - assert to_zarr(spec, store, "/group_a") == group + assert to_zarr(spec, store, "/group_a", overwrite=True) == group # check that we can't call to_zarr targeting the original group with a different spec spec_2 = spec.model_copy(update={"attributes": RootAttrs(foo=99, bar=[0, 1, 2])}) @@ -375,9 +392,10 @@ class ArrayAttrs(TypedDict): _ = to_zarr(spec_2, store, "/group_a") # check that we can't call to_zarr with the original spec if the group has changed - group.attrs.put({"foo": 100}) + group.attrs["foo"] = 100 with pytest.raises(ContainsGroupError): _ = to_zarr(spec, store, "/group_a") + group.attrs["foo"] = 10 # materialize again with overwrite group2 = to_zarr(spec, store, "/group_a", overwrite=True) @@ -423,7 +441,7 @@ class ArrayAttrsB(TypedDict): GroupA = GroupSpec[GroupAttrsA, ArrayA] GroupB = GroupSpec[GroupAttrsB, ArrayB] - store = zarr.MemoryStore + store = zarr.storage.MemoryStore specA = GroupA( attributes=GroupAttrsA(group_a=True), @@ -456,7 +474,7 @@ class ArrayAttrsB(TypedDict): members={}, ) - store = zarr.MemoryStore() + store = zarr.storage.MemoryStore() groupAMat = specA.to_zarr(store, path="group_a") groupBMat = specB.to_zarr(store, path="group_b") @@ -564,6 +582,15 @@ def test_array_like() -> None: assert a.like(c, include={"dtype"}) +def test_array_like_with_zarr() -> None: + arr = ArraySpec(shape=(1,), dtype="uint8", chunks=(1,)) + store = zarr.storage.MemoryStore() + arr_stored = arr.to_zarr(store, path="arr") + print(arr) + print(ArraySpec.from_zarr(arr_stored)) + assert arr.like(arr_stored) + + # todo: parametrize def test_group_like() -> None: tree = { @@ -589,7 +616,7 @@ def test_from_zarr_depth() -> None: "/1/2/2": ArraySpec.from_array(np.arange(20), attributes={"level": 3, "type": "array"}), } - store = zarr.MemoryStore() + store = zarr.storage.MemoryStore() group_out = GroupSpec.from_flat(tree).to_zarr(store, path="test") group_in_0 = GroupSpec.from_zarr(group_out, depth=0) assert group_in_0 == tree[""] diff --git a/tests/test_pydantic_zarr/test_v3.py b/tests/test_pydantic_zarr/test_v3.py index 8efef53..0ad0980 100644 --- a/tests/test_pydantic_zarr/test_v3.py +++ b/tests/test_pydantic_zarr/test_v3.py @@ -1,4 +1,14 @@ -from pydantic_zarr.v3 import ArraySpec, GroupSpec, NamedConfig +import numpy as np + +from pydantic_zarr.v3 import ( + ArraySpec, + DefaultChunkKeyEncoding, + DefaultChunkKeyEncodingConfig, + GroupSpec, + NamedConfig, + RegularChunking, + RegularChunkingConfig, +) def test_serialize_deserialize() -> None: @@ -15,6 +25,28 @@ def test_serialize_deserialize() -> None: chunk_key_encoding=NamedConfig(name="default", configuration={"separator": "/"}), codecs=[NamedConfig(name="GZip", configuration={"level": 1})], fill_value="NaN", + storage_transformers=[], ) GroupSpec(attributes=group_attributes, members={"array": array_spec}) + + +def test_from_array() -> None: + array_spec = ArraySpec.from_array(np.arange(10)) + assert array_spec == ArraySpec( + zarr_format=3, + node_type="array", + attributes={}, + shape=(10,), + data_type="