diff --git a/src/zarr/v2/__init__.py b/src/zarr/v2/__init__.py new file mode 100644 index 0000000000..6cecb40af8 --- /dev/null +++ b/src/zarr/v2/__init__.py @@ -0,0 +1,70 @@ +# flake8: noqa +from zarr.codecs import * +from zarr.convenience import ( + consolidate_metadata, + copy, + copy_all, + copy_store, + load, + open, + open_consolidated, + save, + save_array, + save_group, + tree, +) +from zarr.core import Array +from zarr.creation import ( + array, + create, + empty, + empty_like, + full, + full_like, + ones, + ones_like, + open_array, + open_like, + zeros, + zeros_like, +) +from zarr.errors import CopyError, MetadataError +from zarr.hierarchy import Group, group, open_group +from zarr.n5 import N5Store, N5FSStore +from zarr._storage.store import v3_api_available +from zarr.storage import ( + ABSStore, + DBMStore, + DictStore, + DirectoryStore, + KVStore, + LMDBStore, + LRUStoreCache, + MemoryStore, + MongoDBStore, + NestedDirectoryStore, + RedisStore, + SQLiteStore, + TempStore, + ZipStore, +) +from zarr.sync import ProcessSynchronizer, ThreadSynchronizer +from zarr.version import version as __version__ + +# in case setuptools scm screw up and find version to be 0.0.0 +assert not __version__.startswith("0.0.0") + +if v3_api_available: + from zarr._storage.v3 import ( + ABSStoreV3, + DBMStoreV3, + KVStoreV3, + DirectoryStoreV3, + LMDBStoreV3, + LRUStoreCacheV3, + MemoryStoreV3, + MongoDBStoreV3, + RedisStoreV3, + SQLiteStoreV3, + ZipStoreV3, + ) diff --git a/src/zarr/v2/_storage/__init__.py b/src/zarr/v2/_storage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/v2/_storage/absstore.py b/src/zarr/v2/_storage/absstore.py new file mode 100644 index 0000000000..4fbb4078eb --- /dev/null +++ b/src/zarr/v2/_storage/absstore.py @@ -0,0 +1,293 @@ +"""This module contains storage classes related to Azure Blob Storage (ABS)""" + +from typing import Optional +import warnings + +from numcodecs.compat import ensure_bytes +from zarr.util import normalize_storage_path +from zarr._storage.store import ( + _get_metadata_suffix, + data_root, + meta_root, + Store, + StoreV3, + V3_DEPRECATION_MESSAGE, +) +from zarr.types import DIMENSION_SEPARATOR + +__doctest_requires__ = { + ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], +} + + +class ABSStore(Store): + """Storage class using Azure Blob Storage (ABS). + + Parameters + ---------- + container : string + The name of the ABS container to use. + + .. deprecated:: + Use ``client`` instead. + + prefix : string + Location of the "directory" to use as the root of the storage hierarchy + within the container. + + account_name : string + The Azure blob storage account name. + + .. deprecated:: 2.8.3 + Use ``client`` instead. + + account_key : string + The Azure blob storage account access key. + + .. deprecated:: 2.8.3 + Use ``client`` instead. + + blob_service_kwargs : dictionary + Extra arguments to be passed into the azure blob client, for e.g. when + using the emulator, pass in blob_service_kwargs={'is_emulated': True}. + + .. deprecated:: 2.8.3 + Use ``client`` instead. + + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + client : azure.storage.blob.ContainerClient, optional + And ``azure.storage.blob.ContainerClient`` to connect with. See + `here `_ # noqa + for more. + + .. versionadded:: 2.8.3 + + Notes + ----- + In order to use this store, you must install the Microsoft Azure Storage SDK for Python, + ``azure-storage-blob>=12.5.0``. 
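+
+    A minimal usage sketch (illustrative only; the connection string, container
+    name and prefix below are placeholders, not part of this API)::
+
+        from azure.storage.blob import ContainerClient
+        import zarr
+
+        client = ContainerClient.from_connection_string(
+            "<connection-string>", container_name="my-container"
+        )
+        store = zarr.ABSStore(client=client, prefix="example/root")
+        root = zarr.group(store=store)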
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        container=None,
+        prefix="",
+        account_name=None,
+        account_key=None,
+        blob_service_kwargs=None,
+        dimension_separator: Optional[DIMENSION_SEPARATOR] = None,
+        client=None,
+    ):
+        warnings.warn(
+            V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__),
+            FutureWarning,
+            stacklevel=3,
+        )
+
+        self._dimension_separator = dimension_separator
+        self.prefix = normalize_storage_path(prefix)
+        if client is None:
+            # deprecated option, try to construct the client for them
+            msg = (
+                "Providing 'container', 'account_name', 'account_key', and 'blob_service_kwargs' "
+                "is deprecated. Provide an instance of 'azure.storage.blob.ContainerClient' via "
+                "'client' instead."
+            )
+            warnings.warn(msg, FutureWarning, stacklevel=2)
+            from azure.storage.blob import ContainerClient
+
+            blob_service_kwargs = blob_service_kwargs or {}
+            client = ContainerClient(
+                f"https://{account_name}.blob.core.windows.net/",
+                container,
+                credential=account_key,
+                **blob_service_kwargs,
+            )
+
+        self.client = client
+        self._container = container
+        self._account_name = account_name
+        self._account_key = account_key
+
+    @staticmethod
+    def _warn_deprecated(property_):
+        msg = (
+            "The {} property is deprecated and will be removed in a future "
+            "version. Get the property from 'ABSStore.client' instead."
+        )
+        warnings.warn(msg.format(property_), FutureWarning, stacklevel=3)
+
+    @property
+    def container(self):
+        self._warn_deprecated("container")
+        return self._container
+
+    @property
+    def account_name(self):
+        self._warn_deprecated("account_name")
+        return self._account_name
+
+    @property
+    def account_key(self):
+        self._warn_deprecated("account_key")
+        return self._account_key
+
+    def _append_path_to_prefix(self, path):
+        if self.prefix == "":
+            return normalize_storage_path(path)
+        else:
+            return "/".join([self.prefix, normalize_storage_path(path)])
+
+    @staticmethod
+    def _strip_prefix_from_path(path, prefix):
+        # normalized things will not have any leading or trailing slashes
+        path_norm = normalize_storage_path(path)
+        prefix_norm = normalize_storage_path(prefix)
+        if prefix:
+            return path_norm[(len(prefix_norm) + 1) :]
+        else:
+            return path_norm
+
+    def __getitem__(self, key):
+        from azure.core.exceptions import ResourceNotFoundError
+
+        blob_name = self._append_path_to_prefix(key)
+        try:
+            return self.client.download_blob(blob_name).readall()
+        except ResourceNotFoundError as e:
+            raise KeyError(f"Blob {blob_name} not found") from e
+
+    def __setitem__(self, key, value):
+        value = ensure_bytes(value)
+        blob_name = self._append_path_to_prefix(key)
+        self.client.upload_blob(blob_name, value, overwrite=True)
+
+    def __delitem__(self, key):
+        from azure.core.exceptions import ResourceNotFoundError
+
+        try:
+            self.client.delete_blob(self._append_path_to_prefix(key))
+        except ResourceNotFoundError as e:
+            raise KeyError(f"Blob {key} not found") from e
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, ABSStore)
+            and self.client == other.client
+            and self.prefix == other.prefix
+        )
+
+    def keys(self):
+        return list(self.__iter__())
+
+    def __iter__(self):
+        if self.prefix:
+            list_blobs_prefix = self.prefix + "/"
+        else:
+            list_blobs_prefix = None
+        for blob in self.client.list_blobs(list_blobs_prefix):
+            yield self._strip_prefix_from_path(blob.name, self.prefix)
+
+    def __len__(self):
+        return len(self.keys())
+
+    def __contains__(self, key):
+        blob_name = self._append_path_to_prefix(key)
+        return self.client.get_blob_client(blob_name).exists()
+
+    def 
listdir(self, path=None): + dir_path = normalize_storage_path(self._append_path_to_prefix(path)) + if dir_path: + dir_path += "/" + items = [ + self._strip_prefix_from_path(blob.name, dir_path) + for blob in self.client.walk_blobs(name_starts_with=dir_path, delimiter="/") + ] + return items + + def rmdir(self, path=None): + dir_path = normalize_storage_path(self._append_path_to_prefix(path)) + if dir_path: + dir_path += "/" + for blob in self.client.list_blobs(name_starts_with=dir_path): + self.client.delete_blob(blob) + + def getsize(self, path=None): + store_path = normalize_storage_path(path) + fs_path = self._append_path_to_prefix(store_path) + if fs_path: + blob_client = self.client.get_blob_client(fs_path) + else: + blob_client = None + + if blob_client and blob_client.exists(): + return blob_client.get_blob_properties().size + else: + size = 0 + if fs_path == "": + fs_path = None + elif not fs_path.endswith("/"): + fs_path += "/" + for blob in self.client.walk_blobs(name_starts_with=fs_path, delimiter="/"): + blob_client = self.client.get_blob_client(blob.name) + if blob_client.exists(): + size += blob_client.get_blob_properties().size + return size + + def clear(self): + self.rmdir() + + +class ABSStoreV3(ABSStore, StoreV3): + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, ABSStoreV3) + and self.client == other.client + and self.prefix == other.prefix + ) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def rmdir(self, path=None): + if not path: + # Currently allowing clear to delete everything as in v2 + + # If we disallow an empty path then we will need to modify + # TestABSStoreV3 to have the create_store method use a prefix. + ABSStore.rmdir(self, "") + return + + meta_dir = meta_root + path + meta_dir = meta_dir.rstrip("/") + ABSStore.rmdir(self, meta_dir) + + # remove data folder + data_dir = data_root + path + data_dir = data_dir.rstrip("/") + ABSStore.rmdir(self, data_dir) + + # remove metadata files + sfx = _get_metadata_suffix(self) + array_meta_file = meta_dir + ".array" + sfx + if array_meta_file in self: + del self[array_meta_file] + group_meta_file = meta_dir + ".group" + sfx + if group_meta_file in self: + del self[group_meta_file] + + # TODO: adapt the v2 getsize method to work for v3 + # For now, calling the generic keys-based _getsize + def getsize(self, path=None): + from zarr.storage import _getsize # avoid circular import + + return _getsize(self, path) + + +ABSStoreV3.__doc__ = ABSStore.__doc__ diff --git a/src/zarr/v2/_storage/store.py b/src/zarr/v2/_storage/store.py new file mode 100644 index 0000000000..dba29d13c0 --- /dev/null +++ b/src/zarr/v2/_storage/store.py @@ -0,0 +1,715 @@ +import abc +import os +import warnings +from collections import defaultdict +from collections.abc import MutableMapping +from copy import copy +from string import ascii_letters, digits +from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union + +from zarr.meta import Metadata2, Metadata3 +from zarr.util import normalize_storage_path +from zarr.context import Context +from zarr.types import ZARR_VERSION + +# v2 store keys +array_meta_key = ".zarray" +group_meta_key = ".zgroup" +attrs_key = ".zattrs" + +# v3 paths +meta_root = "meta/root/" +data_root = "data/root/" + +DEFAULT_ZARR_VERSION: ZARR_VERSION = 2 + +v3_api_available = os.environ.get("ZARR_V3_EXPERIMENTAL_API", "0").lower() not in ["0", "false"] +_has_warned_about_v3 = False # to avoid printing 
the warning multiple times

+V3_DEPRECATION_MESSAGE = (
+    "The {store} is deprecated and will be removed in Zarr-Python version 3, see "
+    "https://github.com/zarr-developers/zarr-python/issues/1274 for more information."
+)
+
+
+def assert_zarr_v3_api_available():
+    # we issue a warning about the experimental v3 implementation when it is first used
+    global _has_warned_about_v3
+    if v3_api_available and not _has_warned_about_v3:
+        warnings.warn(
+            "The experimental Zarr V3 implementation in this version of Zarr-Python is not "
+            "in alignment with the final V3 specification. This version will be removed in "
+            "Zarr-Python 3 in favor of a spec compliant version.",
+            FutureWarning,
+            stacklevel=1,
+        )
+        _has_warned_about_v3 = True
+    if not v3_api_available:
+        raise NotImplementedError(
+            "# V3 reading and writing is experimental! To enable support, set:\n"
+            "ZARR_V3_EXPERIMENTAL_API=1"
+        )  # pragma: no cover
+
+
+class BaseStore(MutableMapping):
+    """Abstract base class for store implementations.
+
+    This is a thin wrapper over MutableMapping that provides methods to check
+    whether a store is readable, writeable, erasable, and/or listable.
+
+    Stores cannot simply be mutable mappings, as they have a couple of additional
+    requirements that would break the Liskov substitution principle (stores only
+    allow strings as keys, while mutable mappings are more generic).
+
+    Having no-op base methods also simplifies store usage: callers do not need
+    to check for the presence of attributes and methods such as `close()`.
+
+    Stores can be used as context managers to make sure they are closed on exit.
+
+    .. versionadded:: 2.11.0
+
+    """
+
+    _readable = True
+    _writeable = True
+    _erasable = True
+    _listable = True
+    _store_version = 2
+    _metadata_class = Metadata2
+
+    def is_readable(self):
+        return self._readable
+
+    def is_writeable(self):
+        return self._writeable
+
+    def is_listable(self):
+        return self._listable
+
+    def is_erasable(self):
+        return self._erasable
+
+    def __enter__(self):
+        if not hasattr(self, "_open_count"):
+            self._open_count = 0
+        self._open_count += 1
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._open_count -= 1
+        if self._open_count == 0:
+            self.close()
+
+    def close(self) -> None:
+        """Do nothing by default"""
+        pass
+
+    def rename(self, src_path: str, dst_path: str) -> None:
+        if not self.is_erasable():
+            raise NotImplementedError(
+                f'{type(self)} is not erasable, cannot call "rename"'
+            )  # pragma: no cover
+        _rename_from_keys(self, src_path, dst_path)
+
+    @staticmethod
+    def _ensure_store(store: Any):
+        """
+        We want to make sure internally that zarr stores are always a class
+        with a specific interface derived from ``BaseStore``, which is slightly
+        different from ``MutableMapping``.
+
+        We'll do this conversion in a few places automatically.
+        """
+        from zarr.storage import KVStore  # avoid circular import
+
+        if isinstance(store, BaseStore):
+            if not store._store_version == 2:
+                raise ValueError(
+                    f"cannot initialize a v2 store with a v{store._store_version} store"
+                )
+            return store
+        elif isinstance(store, MutableMapping):
+            return KVStore(store)
+        else:
+            for attr in [
+                "keys",
+                "values",
+                "get",
+                "__setitem__",
+                "__getitem__",
+                "__delitem__",
+                "__contains__",
+            ]:
+                if not hasattr(store, attr):
+                    break
+            else:
+                return KVStore(store)
+
+        raise ValueError(
+            "Starting with Zarr 2.11.0, stores must be subclasses of "
+            "BaseStore. If your store exposes the MutableMapping interface, "
+            f"wrap it in zarr.storage.KVStore. 
Got {store}" + ) + + def getitems( + self, keys: Sequence[str], *, contexts: Mapping[str, Context] + ) -> Mapping[str, Any]: + """Retrieve data from multiple keys. + + Parameters + ---------- + keys : Iterable[str] + The keys to retrieve + contexts: Mapping[str, Context] + A mapping of keys to their context. Each context is a mapping of store + specific information. E.g. a context could be a dict telling the store + the preferred output array type: `{"meta_array": cupy.empty(())}` + + Returns + ------- + Mapping + A collection mapping the input keys to their results. + + Notes + ----- + This default implementation uses __getitem__() to read each key sequentially and + ignores contexts. Overwrite this method to implement concurrent reads of multiple + keys and/or to utilize the contexts. + """ + return {k: self[k] for k in keys if k in self} + + +class Store(BaseStore): + """Abstract store class used by implementations following the Zarr v2 spec. + + Adds public `listdir`, `rename`, and `rmdir` methods on top of BaseStore. + + .. added: 2.11.0 + + """ + + def listdir(self, path: str = "") -> List[str]: + path = normalize_storage_path(path) + return _listdir_from_keys(self, path) + + def rmdir(self, path: str = "") -> None: + if not self.is_erasable(): + raise NotImplementedError( + f'{type(self)} is not erasable, cannot call "rmdir"' + ) # pragma: no cover + path = normalize_storage_path(path) + _rmdir_from_keys(self, path) + + +class StoreV3(BaseStore): + _store_version = 3 + _metadata_class = Metadata3 + _valid_key_characters = set(ascii_letters + digits + "/.-_") + + def _valid_key(self, key: str) -> bool: + """ + Verify that a key conforms to the specification. + + A key is any string containing only character in the range a-z, A-Z, + 0-9, or in the set /.-_ it will return True if that's the case, False + otherwise. + """ + if not isinstance(key, str) or not key.isascii(): + return False + if set(key) - self._valid_key_characters: + return False + return True + + def _validate_key(self, key: str): + """ + Verify that a key conforms to the v3 specification. + + A key is any string containing only character in the range a-z, A-Z, + 0-9, or in the set /.-_ it will return True if that's the case, False + otherwise. + + In spec v3, keys can only start with the prefix meta/, data/ or be + exactly zarr.json and should not end with /. This should not be exposed + to the user, and is a store implementation detail, so this method will + raise a ValueError in that case. + """ + if not self._valid_key(key): + raise ValueError( + f"Keys must be ascii strings and may only contain the " + f"characters {''.join(sorted(self._valid_key_characters))}" + ) + + if ( + not key.startswith(("data/", "meta/")) + and key != "zarr.json" + # TODO: Possibly allow key == ".zmetadata" too if we write a + # consolidated metadata spec corresponding to this? + ): + raise ValueError(f"key starts with unexpected value: `{key}`") + + if key.endswith("/"): + raise ValueError("keys may not end in /") + + def list_prefix(self, prefix): + if prefix.startswith("/"): + raise ValueError("prefix must not begin with /") + # TODO: force prefix to end with /? 
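+        # Illustrative example (comment only): if the store holds the keys
+        # "meta/root/foo.array.json" and "data/root/foo/c0/0", then
+        # list_prefix("meta/root/") returns ["meta/root/foo.array.json"].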
+ return [k for k in self.list() if k.startswith(prefix)] + + def erase(self, key): + self.__delitem__(key) + + def erase_prefix(self, prefix): + assert prefix.endswith("/") + + if prefix == "/": + all_keys = self.list() + else: + all_keys = self.list_prefix(prefix) + for key in all_keys: + self.erase(key) + + def list_dir(self, prefix): + """ + TODO: carefully test this with trailing/leading slashes + """ + if prefix: # allow prefix = "" ? + assert prefix.endswith("/") + + all_keys = self.list_prefix(prefix) + len_prefix = len(prefix) + keys = [] + prefixes = [] + for k in all_keys: + trail = k[len_prefix:] + if "/" not in trail: + keys.append(prefix + trail) + else: + prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/") + return keys, list(set(prefixes)) + + def list(self): + return list(self.keys()) + + def __contains__(self, key): + return key in self.list() + + @abc.abstractmethod + def __setitem__(self, key, value): + """Set a value.""" + + @abc.abstractmethod + def __getitem__(self, key): + """Get a value.""" + + @abc.abstractmethod + def rmdir(self, path=None): + """Remove a data path and all its subkeys and related metadata. + Expects a path without the data or meta root prefix.""" + + @property + def supports_efficient_get_partial_values(self): + return False + + def get_partial_values( + self, key_ranges: Sequence[Tuple[str, Tuple[int, Optional[int]]]] + ) -> List[Union[bytes, memoryview, bytearray]]: + """Get multiple partial values. + key_ranges can be an iterable of key, range pairs, + where a range specifies two integers range_start and range_length + as a tuple, (range_start, range_length). + range_length may be None to indicate to read until the end. + range_start may be negative to start reading range_start bytes + from the end of the file. + A key may occur multiple times with different ranges. + Inserts None for missing keys into the returned list.""" + results: List[Union[bytes, memoryview, bytearray]] = [None] * len(key_ranges) # type: ignore[list-item] # noqa: E501 + indexed_ranges_by_key: Dict[str, List[Tuple[int, Tuple[int, Optional[int]]]]] = defaultdict( + list + ) + for i, (key, range_) in enumerate(key_ranges): + indexed_ranges_by_key[key].append((i, range_)) + for key, indexed_ranges in indexed_ranges_by_key.items(): + try: + value = self[key] + except KeyError: # pragma: no cover + continue + for i, (range_from, range_length) in indexed_ranges: + if range_length is None: + results[i] = value[range_from:] + else: + results[i] = value[range_from : range_from + range_length] + return results + + def supports_efficient_set_partial_values(self): + return False + + def set_partial_values(self, key_start_values): + """Set multiple partial values. + key_start_values can be an iterable of key, start and value triplets + as tuples, (key, start, value), where start defines the offset in bytes. + A key may occur multiple times with different starts and non-overlapping values. + Also, start may only be beyond the current value if other values fill the gap. 
+ start may be negative to start writing start bytes from the current + end of the file, ending the file with the new value.""" + unique_keys = set(next(zip(*key_start_values))) + values = {} + for key in unique_keys: + old_value = self.get(key) + values[key] = None if old_value is None else bytearray(old_value) + for key, start, value in key_start_values: + if values[key] is None: + assert start == 0 + values[key] = value + else: + if start > len(values[key]): # pragma: no cover + raise ValueError( + f"Cannot set value at start {start}, " + + f"since it is beyond the data at key {key}, " + + f"having length {len(values[key])}." + ) + if start < 0: + values[key][start:] = value + else: + values[key][start : start + len(value)] = value + for key, value in values.items(): + self[key] = value + + def clear(self): + """Remove all items from store.""" + self.erase_prefix("/") + + def __eq__(self, other): + return NotImplemented + + @staticmethod + def _ensure_store(store): + """ + We want to make sure internally that zarr stores are always a class + with a specific interface derived from ``Store``, which is slightly + different than ``MutableMapping``. + + We'll do this conversion in a few places automatically + """ + from zarr._storage.v3 import KVStoreV3 # avoid circular import + + if store is None: + return None + elif isinstance(store, StoreV3): + return store + elif isinstance(store, Store): + raise ValueError(f"cannot initialize a v3 store with a v{store._store_version} store") + elif isinstance(store, MutableMapping): + return KVStoreV3(store) + else: + for attr in [ + "keys", + "values", + "get", + "__setitem__", + "__getitem__", + "__delitem__", + "__contains__", + ]: + if not hasattr(store, attr): + break + else: + return KVStoreV3(store) + + raise ValueError( + "v3 stores must be subclasses of StoreV3, " + "if your store exposes the MutableMapping interface wrap it in " + f"Zarr.storage.KVStoreV3. Got {store}" + ) + + +class StorageTransformer(MutableMapping, abc.ABC): + """Base class for storage transformers. The methods simply pass on the data as-is + and should be overwritten by sub-classes.""" + + _store_version = 3 + _metadata_class = Metadata3 + + def __init__(self, _type) -> None: + if _type not in self.valid_types: # pragma: no cover + raise ValueError( + f"Storage transformer cannot be initialized with type {_type}, " + + f"must be one of {list(self.valid_types)}." + ) + self.type = _type + self._inner_store = None + + def _copy_for_array(self, array, inner_store): + transformer_copy = copy(self) + transformer_copy._inner_store = inner_store + return transformer_copy + + @abc.abstractproperty + def extension_uri(self): + pass # pragma: no cover + + @abc.abstractproperty + def valid_types(self): + pass # pragma: no cover + + def get_config(self): + """Return a dictionary holding configuration parameters for this + storage transformer. All values must be compatible with JSON encoding.""" + # Override in sub-class if need special encoding of config values. + # By default, assume all non-private members are configuration + # parameters except for type . 
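+        # For example, the ShardingStorageTransformer defined in
+        # _storage/v3_storage_transformers.py keeps its configuration on
+        # ``self.chunks_per_shard``, so a transformer created with
+        # chunks_per_shard=(2, 2) reports {"chunks_per_shard": (2, 2)} here,
+        # while ``self.type`` and any _private attributes are excluded.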
+ return {k: v for k, v in self.__dict__.items() if not k.startswith("_") and k != "type"} + + @classmethod + def from_config(cls, _type, config): + """Instantiate storage transformer from a configuration object.""" + # override in sub-class if need special decoding of config values + + # by default, assume constructor accepts configuration parameters as + # keyword arguments without any special decoding + return cls(_type, **config) + + @property + def inner_store(self) -> Union["StorageTransformer", StoreV3]: + assert ( + self._inner_store is not None + ), "inner_store is not initialized, first get a copy via _copy_for_array." + return self._inner_store + + # The following implementations are usually fine to keep as-is: + + def __eq__(self, other): + return ( + type(self) is type(other) + and self._inner_store == other._inner_store + and self.get_config() == other.get_config() + ) + + def erase(self, key): + self.__delitem__(key) + + def list(self): + return list(self.keys()) + + def list_dir(self, prefix): + return StoreV3.list_dir(self, prefix) + + def is_readable(self): + return self.inner_store.is_readable() + + def is_writeable(self): + return self.inner_store.is_writeable() + + def is_listable(self): + return self.inner_store.is_listable() + + def is_erasable(self): + return self.inner_store.is_erasable() + + def clear(self): + return self.inner_store.clear() + + def __enter__(self): + return self.inner_store.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self.inner_store.__exit__(exc_type, exc_value, traceback) + + def close(self) -> None: + return self.inner_store.close() + + # The following implementations might need to be re-implemented + # by subclasses implementing storage transformers: + + def rename(self, src_path: str, dst_path: str) -> None: + return self.inner_store.rename(src_path, dst_path) + + def list_prefix(self, prefix): + return self.inner_store.list_prefix(prefix) + + def erase_prefix(self, prefix): + return self.inner_store.erase_prefix(prefix) + + def rmdir(self, path=None): + return self.inner_store.rmdir(path) + + def __contains__(self, key): + return self.inner_store.__contains__(key) + + def __setitem__(self, key, value): + return self.inner_store.__setitem__(key, value) + + def __getitem__(self, key): + return self.inner_store.__getitem__(key) + + def __delitem__(self, key): + return self.inner_store.__delitem__(key) + + def __iter__(self): + return self.inner_store.__iter__() + + def __len__(self): + return self.inner_store.__len__() + + @property + def supports_efficient_get_partial_values(self): + return self.inner_store.supports_efficient_get_partial_values + + def get_partial_values(self, key_ranges): + return self.inner_store.get_partial_values(key_ranges) + + def supports_efficient_set_partial_values(self): + return self.inner_store.supports_efficient_set_partial_values() + + def set_partial_values(self, key_start_values): + return self.inner_store.set_partial_values(key_start_values) + + +# allow MutableMapping for backwards compatibility +StoreLike = Union[BaseStore, MutableMapping] + + +def _path_to_prefix(path: Optional[str]) -> str: + # assume path already normalized + if path: + prefix = path + "/" + else: + prefix = "" + return prefix + + +def _get_hierarchy_metadata(store: StoreV3) -> Mapping[str, Any]: + version = getattr(store, "_store_version", 2) + if version < 3: + raise ValueError("zarr.json hierarchy metadata not stored for " f"zarr v{version} stores") + if "zarr.json" not in store: + raise 
ValueError("zarr.json metadata not found in store") + return store._metadata_class.decode_hierarchy_metadata(store["zarr.json"]) + + +def _get_metadata_suffix(store: StoreV3) -> str: + if "zarr.json" in store: + return _get_hierarchy_metadata(store)["metadata_key_suffix"] + return ".json" + + +def _rename_metadata_v3(store: StoreV3, src_path: str, dst_path: str) -> bool: + """Rename source or group metadata file associated with src_path.""" + any_renamed = False + sfx = _get_metadata_suffix(store) + src_path = src_path.rstrip("/") + dst_path = dst_path.rstrip("/") + _src_array_json = meta_root + src_path + ".array" + sfx + if _src_array_json in store: + new_key = meta_root + dst_path + ".array" + sfx + store[new_key] = store.pop(_src_array_json) + any_renamed = True + _src_group_json = meta_root + src_path + ".group" + sfx + if _src_group_json in store: + new_key = meta_root + dst_path + ".group" + sfx + store[new_key] = store.pop(_src_group_json) + any_renamed = True + return any_renamed + + +def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None: + # assume path already normalized + src_prefix = _path_to_prefix(src_path) + dst_prefix = _path_to_prefix(dst_path) + version = getattr(store, "_store_version", 2) + if version == 2: + for key in list(store.keys()): + if key.startswith(src_prefix): + new_key = dst_prefix + key.lstrip(src_prefix) + store[new_key] = store.pop(key) + else: + any_renamed = False + for root_prefix in [meta_root, data_root]: + _src_prefix = root_prefix + src_prefix + _dst_prefix = root_prefix + dst_prefix + for key in store.list_prefix(_src_prefix): # type: ignore + new_key = _dst_prefix + key[len(_src_prefix) :] + store[new_key] = store.pop(key) + any_renamed = True + any_meta_renamed = _rename_metadata_v3(store, src_path, dst_path) # type: ignore + any_renamed = any_meta_renamed or any_renamed + + if not any_renamed: + raise ValueError(f"no item {src_path} found to rename") + + +def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: + # assume path already normalized + prefix = _path_to_prefix(path) + for key in list(store.keys()): + if key.startswith(prefix): + del store[key] + + +def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None: + meta_dir = meta_root + path + meta_dir = meta_dir.rstrip("/") + _rmdir_from_keys(store, meta_dir) + + # remove data folder + data_dir = data_root + path + data_dir = data_dir.rstrip("/") + _rmdir_from_keys(store, data_dir) + + # remove metadata files + sfx = _get_metadata_suffix(store) + array_meta_file = meta_dir + ".array" + sfx + if array_meta_file in store: + store.erase(array_meta_file) + group_meta_file = meta_dir + ".group" + sfx + if group_meta_file in store: + store.erase(group_meta_file) + + +def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str]: + # assume path already normalized + prefix = _path_to_prefix(path) + children = set() + for key in list(store.keys()): + if key.startswith(prefix) and len(key) > len(prefix): + suffix = key[len(prefix) :] + child = suffix.split("/")[0] + children.add(child) + return sorted(children) + + +def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + sfx = _get_metadata_suffix(store) # type: ignore + if prefix: + key = meta_root + prefix.rstrip("/") + ".array" + sfx + else: + key = meta_root[:-1] + ".array" + sfx + else: + key = prefix + array_meta_key + return key + + +def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: + if 
getattr(store, "_store_version", 2) == 3: + sfx = _get_metadata_suffix(store) # type: ignore + if prefix: + key = meta_root + prefix.rstrip("/") + ".group" + sfx + else: + key = meta_root[:-1] + ".group" + sfx + else: + key = prefix + group_meta_key + return key + + +def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + # for v3, attributes are stored in the array metadata + sfx = _get_metadata_suffix(store) # type: ignore + if prefix: + key = meta_root + prefix.rstrip("/") + ".array" + sfx + else: + key = meta_root[:-1] + ".array" + sfx + else: + key = prefix + attrs_key + return key diff --git a/src/zarr/v2/_storage/v3.py b/src/zarr/v2/_storage/v3.py new file mode 100644 index 0000000000..334788585f --- /dev/null +++ b/src/zarr/v2/_storage/v3.py @@ -0,0 +1,628 @@ +import os +import shutil +from collections import OrderedDict +from collections.abc import MutableMapping +from threading import Lock +from typing import Union, Dict, Any, Optional + +from zarr.errors import ( + MetadataError, + ReadOnlyError, +) +from zarr.util import buffer_size, json_loads, normalize_storage_path +from zarr.types import DIMENSION_SEPARATOR + +from zarr._storage.absstore import ABSStoreV3 # noqa: F401 +from zarr._storage.store import ( # noqa: F401 + _get_hierarchy_metadata, + _get_metadata_suffix, + _listdir_from_keys, + _rename_from_keys, + _rename_metadata_v3, + _rmdir_from_keys, + _rmdir_from_keys_v3, + _path_to_prefix, + _prefix_to_array_key, + _prefix_to_group_key, + array_meta_key, + attrs_key, + data_root, + group_meta_key, + meta_root, + BaseStore, + Store, + StoreV3, +) +from zarr.storage import ( + DBMStore, + ConsolidatedMetadataStore, + DirectoryStore, + FSStore, + KVStore, + LMDBStore, + LRUStoreCache, + MemoryStore, + MongoDBStore, + RedisStore, + SQLiteStore, + ZipStore, + _getsize, +) + +__doctest_requires__ = { + ("RedisStore", "RedisStore.*"): ["redis"], + ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], + ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], +} + + +try: + # noinspection PyUnresolvedReferences + from zarr.codecs import Blosc + + default_compressor = Blosc() +except ImportError: # pragma: no cover + from zarr.codecs import Zlib + + default_compressor = Zlib() + + +Path = Union[str, bytes, None] +# allow MutableMapping for backwards compatibility +StoreLike = Union[BaseStore, MutableMapping] + + +class RmdirV3: + """Mixin class that can be used to ensure override of any existing v2 rmdir class.""" + + def rmdir(self, path: str = "") -> None: + path = normalize_storage_path(path) + _rmdir_from_keys_v3(self, path) # type: ignore + + +class KVStoreV3(RmdirV3, KVStore, StoreV3): + def list(self): + return list(self._mutable_mapping.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def __eq__(self, other): + return isinstance(other, KVStoreV3) and self._mutable_mapping == other._mutable_mapping + + +KVStoreV3.__doc__ = KVStore.__doc__ + + +def _get_files_and_dirs_from_path(store, path): + path = normalize_storage_path(path) + + files = [] + # add array metadata file if present + array_key = _prefix_to_array_key(store, path) + if array_key in store: + files.append(os.path.join(store.path, array_key)) + + # add group metadata file if present + group_key = _prefix_to_group_key(store, path) + if group_key in store: + files.append(os.path.join(store.path, group_key)) + + dirs = [] + # add array and group folders if present + for d in [data_root + path, meta_root + path]: + 
dir_path = os.path.join(store.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + return files, dirs + + +class FSStoreV3(FSStore, StoreV3): + # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) + _META_KEYS = () + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "/" + + def list(self): + return list(self.keys()) + + def _normalize_key(self, key): + key = normalize_storage_path(key).lstrip("/") + return key.lower() if self.normalize_keys else key + + def getsize(self, path=None): + size = 0 + if path is None or path == "": + # size of both the data and meta subdirs + dirs = [] + for d in ["data/root", "meta/root"]: + dir_path = os.path.join(self.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + elif path in self: + # access individual element by full path + return buffer_size(self[path]) + else: + files, dirs = _get_files_and_dirs_from_path(self, path) + for file in files: + size += os.path.getsize(file) + for d in dirs: + size += self.fs.du(d, total=True, maxdepth=None) + return size + + def setitems(self, values): + if self.mode == "r": + raise ReadOnlyError() + values = {self._normalize_key(key): val for key, val in values.items()} + + # initialize the /data/root/... folder corresponding to the array! + # Note: zarr.tests.test_core_v3.TestArrayWithFSStoreV3PartialRead fails + # without this explicit creation of directories + subdirectories = set(os.path.dirname(v) for v in values.keys()) + for subdirectory in subdirectories: + data_dir = os.path.join(self.path, subdirectory) + if not self.fs.exists(data_dir): + self.fs.mkdir(data_dir) + + self.map.setitems(values) + + def rmdir(self, path=None): + if self.mode == "r": + raise ReadOnlyError() + if path: + for base in [meta_root, data_root]: + store_path = self.dir_path(base + path) + if self.fs.isdir(store_path): + self.fs.rm(store_path, recursive=True) + + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip("/") + array_meta_file = meta_dir + ".array" + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + ".group" + sfx + self.pop(group_meta_file, None) + else: + store_path = self.dir_path(path) + if self.fs.isdir(store_path): + self.fs.rm(store_path, recursive=True) + + @property + def supports_efficient_get_partial_values(self): + return True + + def get_partial_values(self, key_ranges): + """Get multiple partial values. + key_ranges can be an iterable of key, range pairs, + where a range specifies two integers range_start and range_length + as a tuple, (range_start, range_length). + range_length may be None to indicate to read until the end. + range_start may be negative to start reading range_start bytes + from the end of the file. + A key may occur multiple times with different ranges. 
+ Inserts None for missing keys into the returned list.""" + results = [] + for key, (range_start, range_length) in key_ranges: + key = self._normalize_key(key) + path = self.dir_path(key) + try: + if range_start is None or range_length is None: + end = None + else: + end = range_start + range_length + result = self.fs.cat_file(path, start=range_start, end=end) + except self.map.missing_exceptions: + result = None + results.append(result) + return results + + +class MemoryStoreV3(MemoryStore, StoreV3): + def __init__( + self, root=None, cls=dict, dimension_separator: Optional[DIMENSION_SEPARATOR] = None + ): + if root is None: + self.root = cls() + else: + self.root = root + self.cls = cls + self.write_mutex = Lock() + self._dimension_separator = dimension_separator # TODO: modify for v3? + + def __eq__(self, other): + return ( + isinstance(other, MemoryStoreV3) and self.root == other.root and self.cls == other.cls + ) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def list(self): + return list(self.keys()) + + def getsize(self, path: Path = None): + return _getsize(self, path) + + def rename(self, src_path: Path, dst_path: Path): + src_path = normalize_storage_path(src_path) + dst_path = normalize_storage_path(dst_path) + + any_renamed = False + for base in [meta_root, data_root]: + if self.list_prefix(base + src_path): + src_parent, src_key = self._get_parent(base + src_path) + dst_parent, dst_key = self._require_parent(base + dst_path) + + if src_key in src_parent: + dst_parent[dst_key] = src_parent.pop(src_key) + + if base == meta_root: + # check for and move corresponding metadata + sfx = _get_metadata_suffix(self) + src_meta = src_key + ".array" + sfx + if src_meta in src_parent: + dst_meta = dst_key + ".array" + sfx + dst_parent[dst_meta] = src_parent.pop(src_meta) + src_meta = src_key + ".group" + sfx + if src_meta in src_parent: + dst_meta = dst_key + ".group" + sfx + dst_parent[dst_meta] = src_parent.pop(src_meta) + any_renamed = True + any_renamed = _rename_metadata_v3(self, src_path, dst_path) or any_renamed + if not any_renamed: + raise ValueError(f"no item {src_path} found to rename") + + def rmdir(self, path: Path = None): + path = normalize_storage_path(path) + if path: + for base in [meta_root, data_root]: + try: + parent, key = self._get_parent(base + path) + value = parent[key] + except KeyError: + continue + else: + if isinstance(value, self.cls): + del parent[key] + + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip("/") + array_meta_file = meta_dir + ".array" + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + ".group" + sfx + self.pop(group_meta_file, None) + else: + # clear out root + self.root = self.cls() + + +MemoryStoreV3.__doc__ = MemoryStore.__doc__ + + +class DirectoryStoreV3(DirectoryStore, StoreV3): + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return isinstance(other, DirectoryStoreV3) and self.path == other.path + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def getsize(self, path: Path = None): + return _getsize(self, path) + + def rename(self, src_path, dst_path, metadata_key_suffix=".json"): + store_src_path = normalize_storage_path(src_path) + store_dst_path = normalize_storage_path(dst_path) + + dir_path = self.path + any_existed = False + for root_prefix in ["meta", "data"]: + src_path = os.path.join(dir_path, root_prefix, 
"root", store_src_path) + if os.path.exists(src_path): + any_existed = True + dst_path = os.path.join(dir_path, root_prefix, "root", store_dst_path) + os.renames(src_path, dst_path) + + for suffix in [".array" + metadata_key_suffix, ".group" + metadata_key_suffix]: + src_meta = os.path.join(dir_path, "meta", "root", store_src_path + suffix) + if os.path.exists(src_meta): + any_existed = True + dst_meta = os.path.join(dir_path, "meta", "root", store_dst_path + suffix) + dst_dir = os.path.dirname(dst_meta) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + os.rename(src_meta, dst_meta) + if not any_existed: + raise FileNotFoundError("nothing found at src_path") + + def rmdir(self, path=None): + store_path = normalize_storage_path(path) + dir_path = self.path + if store_path: + for base in [meta_root, data_root]: + dir_path = os.path.join(dir_path, base + store_path) + if os.path.isdir(dir_path): + shutil.rmtree(dir_path) + + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip("/") + array_meta_file = meta_dir + ".array" + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + ".group" + sfx + self.pop(group_meta_file, None) + + elif os.path.isdir(dir_path): + shutil.rmtree(dir_path) + + +DirectoryStoreV3.__doc__ = DirectoryStore.__doc__ + + +class ZipStoreV3(ZipStore, StoreV3): + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, ZipStore) + and self.path == other.path + and self.compression == other.compression + and self.allowZip64 == other.allowZip64 + ) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def getsize(self, path=None): + path = normalize_storage_path(path) + with self.mutex: + children = self.list_prefix(data_root + path) + children += self.list_prefix(meta_root + path) + print(f"path={path}, children={children}") + if children: + size = 0 + for name in children: + info = self.zf.getinfo(name) + size += info.compress_size + return size + elif path in self: + info = self.zf.getinfo(path) + return info.compress_size + else: + return 0 + + +ZipStoreV3.__doc__ = ZipStore.__doc__ + + +class RedisStoreV3(RmdirV3, RedisStore, StoreV3): + def list(self): + return list(self.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + +RedisStoreV3.__doc__ = RedisStore.__doc__ + + +class MongoDBStoreV3(RmdirV3, MongoDBStore, StoreV3): + def list(self): + return list(self.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + +MongoDBStoreV3.__doc__ = MongoDBStore.__doc__ + + +class DBMStoreV3(RmdirV3, DBMStore, StoreV3): + def list(self): + return list(self.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + +DBMStoreV3.__doc__ = DBMStore.__doc__ + + +class LMDBStoreV3(RmdirV3, LMDBStore, StoreV3): + def list(self): + return list(self.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + +LMDBStoreV3.__doc__ = LMDBStore.__doc__ + + +class SQLiteStoreV3(SQLiteStore, StoreV3): + def list(self): + return list(self.keys()) + + def getsize(self, path=None): + # TODO: why does the query below not work in this case? 
+ # For now fall back to the default _getsize implementation + # size = 0 + # for _path in [data_root + path, meta_root + path]: + # c = self.cursor.execute( + # ''' + # SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + # WHERE k LIKE (? || "%") AND + # 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + # ''', + # (_path, _path) + # ) + # for item_size, in c: + # size += item_size + # return size + + # fallback to default implementation for now + return _getsize(self, path) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + for base in [meta_root, data_root]: + with self.lock: + self.cursor.execute("DELETE FROM zarr WHERE k LIKE (? || '/%')", (base + path,)) + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip("/") + array_meta_file = meta_dir + ".array" + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + ".group" + sfx + self.pop(group_meta_file, None) + else: + self.clear() + + +SQLiteStoreV3.__doc__ = SQLiteStore.__doc__ + + +class LRUStoreCacheV3(RmdirV3, LRUStoreCache, StoreV3): + def __init__(self, store, max_size: int): + self._store = StoreV3._ensure_store(store) + self._max_size = max_size + self._current_size = 0 + self._keys_cache = None + self._contains_cache = {} + self._listdir_cache: Dict[Path, Any] = dict() + self._values_cache: Dict[Path, Any] = OrderedDict() + self._mutex = Lock() + self.hits = self.misses = 0 + + def list(self): + return list(self.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + +LRUStoreCacheV3.__doc__ = LRUStoreCache.__doc__ + + +class ConsolidatedMetadataStoreV3(ConsolidatedMetadataStore, StoreV3): + """A layer over other storage, where the metadata has been consolidated into + a single key. + + The purpose of this class, is to be able to get all of the metadata for + a given array in a single read operation from the underlying storage. + See :func:`zarr.convenience.consolidate_metadata` for how to create this + single metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the array metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + :func:`zarr.convenience.consolidate_metadata` should be called again and the class + re-invoked. The use case is for write once, read many times. + + .. note:: This is an experimental feature. + + Parameters + ---------- + store: Store + Containing the zarr array. + metadata_key: str + The target in the store where all of the metadata are stored. We + assume JSON encoding. 
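+
+    A rough usage sketch (illustrative; assumes ``store`` already contains
+    consolidated metadata under the default ``metadata_key``)::
+
+        meta_store = ConsolidatedMetadataStoreV3(store)
+        # metadata reads hit the in-memory copy; chunk reads still go to store
+        grp = zarr.open_group(store=meta_store, chunk_store=store, mode="r")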
+ + See Also + -------- + zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated + + """ + + def __init__(self, store: StoreLike, metadata_key=meta_root + "consolidated/.zmetadata"): + self.store = StoreV3._ensure_store(store) + + # retrieve consolidated metadata + meta = json_loads(self.store[metadata_key]) + + # check format of consolidated metadata + consolidated_format = meta.get("zarr_consolidated_format", None) + if consolidated_format != 1: + raise MetadataError( + f"unsupported zarr consolidated metadata format: {consolidated_format}" + ) + + # decode metadata + self.meta_store: Store = KVStoreV3(meta["metadata"]) + + def rmdir(self, key): + raise ReadOnlyError() + + +def _normalize_store_arg_v3(store: Any, storage_options=None, mode="r") -> BaseStore: + # default to v2 store for backward compatibility + zarr_version = getattr(store, "_store_version", 3) + if zarr_version != 3: + raise ValueError("store must be a version 3 store") + if store is None: + store = KVStoreV3(dict()) + # add default zarr.json metadata + store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) + return store + if isinstance(store, os.PathLike): + store = os.fspath(store) + if FSStore._fsspec_installed(): + import fsspec + + if isinstance(store, fsspec.FSMap): + return FSStoreV3( + store.root, + fs=store.fs, + mode=mode, + check=store.check, + create=store.create, + missing_exceptions=store.missing_exceptions, + **(storage_options or {}), + ) + if isinstance(store, str): + if "://" in store or "::" in store: + store = FSStoreV3(store, mode=mode, **(storage_options or {})) + elif storage_options: + raise ValueError("storage_options passed with non-fsspec path") + elif store.endswith(".zip"): + store = ZipStoreV3(store, mode=mode) + elif store.endswith(".n5"): + raise NotImplementedError("N5Store not yet implemented for V3") + # return N5StoreV3(store) + else: + store = DirectoryStoreV3(store) + else: + store = StoreV3._ensure_store(store) + + if "zarr.json" not in store: + # add default zarr.json metadata + store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) + return store diff --git a/src/zarr/v2/_storage/v3_storage_transformers.py b/src/zarr/v2/_storage/v3_storage_transformers.py new file mode 100644 index 0000000000..00467d44f9 --- /dev/null +++ b/src/zarr/v2/_storage/v3_storage_transformers.py @@ -0,0 +1,367 @@ +import functools +import itertools +import os +from typing import NamedTuple, Tuple, Optional, Union, Iterator + +from numcodecs.compat import ensure_bytes +import numpy as np + +from zarr._storage.store import StorageTransformer, StoreV3, _rmdir_from_keys_v3 +from zarr.util import normalize_storage_path +from zarr.types import DIMENSION_SEPARATOR + + +MAX_UINT_64 = 2**64 - 1 + + +v3_sharding_available = os.environ.get("ZARR_V3_SHARDING", "0").lower() not in ["0", "false"] + + +def assert_zarr_v3_sharding_available(): + if not v3_sharding_available: + raise NotImplementedError( + "Using V3 sharding is experimental and not yet finalized! 
To enable support, set:\n" + "ZARR_V3_SHARDING=1" + ) # pragma: no cover + + +class _ShardIndex(NamedTuple): + store: "ShardingStorageTransformer" + # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) + offsets_and_lengths: np.ndarray + + def __localize_chunk__(self, chunk: Tuple[int, ...]) -> Tuple[int, ...]: + return tuple( + chunk_i % shard_i for chunk_i, shard_i in zip(chunk, self.store.chunks_per_shard) + ) + + def is_all_empty(self) -> bool: + return np.array_equiv(self.offsets_and_lengths, MAX_UINT_64) + + def get_chunk_slice(self, chunk: Tuple[int, ...]) -> Optional[slice]: + localized_chunk = self.__localize_chunk__(chunk) + chunk_start, chunk_len = self.offsets_and_lengths[localized_chunk] + if (chunk_start, chunk_len) == (MAX_UINT_64, MAX_UINT_64): + return None + else: + return slice(int(chunk_start), int(chunk_start + chunk_len)) + + def set_chunk_slice(self, chunk: Tuple[int, ...], chunk_slice: Optional[slice]) -> None: + localized_chunk = self.__localize_chunk__(chunk) + if chunk_slice is None: + self.offsets_and_lengths[localized_chunk] = (MAX_UINT_64, MAX_UINT_64) + else: + self.offsets_and_lengths[localized_chunk] = ( + chunk_slice.start, + chunk_slice.stop - chunk_slice.start, + ) + + def to_bytes(self) -> bytes: + return self.offsets_and_lengths.tobytes(order="C") + + @classmethod + def from_bytes( + cls, buffer: Union[bytes, bytearray], store: "ShardingStorageTransformer" + ) -> "_ShardIndex": + try: + return cls( + store=store, + offsets_and_lengths=np.frombuffer(bytearray(buffer), dtype=" None: + assert_zarr_v3_sharding_available() + super().__init__(_type) + if isinstance(chunks_per_shard, int): + chunks_per_shard = (chunks_per_shard,) + else: + chunks_per_shard = tuple(int(i) for i in chunks_per_shard) + if chunks_per_shard == (): + chunks_per_shard = (1,) + self.chunks_per_shard = chunks_per_shard + self._num_chunks_per_shard = functools.reduce(lambda x, y: x * y, chunks_per_shard, 1) + self._dimension_separator = None + self._data_key_prefix = None + + def _copy_for_array(self, array, inner_store): + transformer_copy = super()._copy_for_array(array, inner_store) + transformer_copy._dimension_separator = array._dimension_separator + transformer_copy._data_key_prefix = array._data_key_prefix + if len(array._shape) > len(self.chunks_per_shard): + # The array shape might be longer when initialized with subdtypes. + # subdtypes dimensions come last, therefore padding chunks_per_shard + # with ones, effectively disabling sharding on the unlisted dimensions. + transformer_copy.chunks_per_shard += (1,) * ( + len(array._shape) - len(self.chunks_per_shard) + ) + return transformer_copy + + @property + def dimension_separator(self) -> DIMENSION_SEPARATOR: + assert ( + self._dimension_separator is not None + ), "dimension_separator is not initialized, first get a copy via _copy_for_array." + return self._dimension_separator + + def _is_data_key(self, key: str) -> bool: + assert ( + self._data_key_prefix is not None + ), "data_key_prefix is not initialized, first get a copy via _copy_for_array." 
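+        # Illustrative example: with a data_key_prefix of "data/root/foo/", the
+        # chunk key "data/root/foo/c0/0" is treated as a data key (and may be
+        # sharded), while "meta/root/foo.array.json" is passed through unchanged
+        # to the inner store.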
+ return key.startswith(self._data_key_prefix) + + def _key_to_shard(self, chunk_key: str) -> Tuple[str, Tuple[int, ...]]: + prefix, _, chunk_string = chunk_key.rpartition("c") + chunk_subkeys = ( + tuple(map(int, chunk_string.split(self.dimension_separator))) if chunk_string else (0,) + ) + shard_key_tuple = ( + subkey // shard_i for subkey, shard_i in zip(chunk_subkeys, self.chunks_per_shard) + ) + shard_key = prefix + "c" + self.dimension_separator.join(map(str, shard_key_tuple)) + return shard_key, chunk_subkeys + + def _get_index_from_store(self, shard_key: str) -> _ShardIndex: + # At the end of each shard 2*64bit per chunk for offset and length define the index: + index_bytes = self.inner_store.get_partial_values( + [(shard_key, (-16 * self._num_chunks_per_shard, None))] + )[0] + if index_bytes is None: + raise KeyError(shard_key) + return _ShardIndex.from_bytes( + index_bytes, + self, + ) + + def _get_index_from_buffer(self, buffer: Union[bytes, bytearray]) -> _ShardIndex: + # At the end of each shard 2*64bit per chunk for offset and length define the index: + return _ShardIndex.from_bytes(buffer[-16 * self._num_chunks_per_shard :], self) + + def _get_chunks_in_shard(self, shard_key: str) -> Iterator[Tuple[int, ...]]: + _, _, chunk_string = shard_key.rpartition("c") + shard_key_tuple = ( + tuple(map(int, chunk_string.split(self.dimension_separator))) if chunk_string else (0,) + ) + for chunk_offset in itertools.product(*(range(i) for i in self.chunks_per_shard)): + yield tuple( + shard_key_i * shards_i + offset_i + for shard_key_i, offset_i, shards_i in zip( + shard_key_tuple, chunk_offset, self.chunks_per_shard + ) + ) + + def __getitem__(self, key): + if self._is_data_key(key): + if self.supports_efficient_get_partial_values: + # Use the partial implementation, which fetches the index separately + value = self.get_partial_values([(key, (0, None))])[0] + if value is None: + raise KeyError(key) + else: + return value + shard_key, chunk_subkey = self._key_to_shard(key) + try: + full_shard_value = self.inner_store[shard_key] + except KeyError as e: + raise KeyError(key) from e + index = self._get_index_from_buffer(full_shard_value) + chunk_slice = index.get_chunk_slice(chunk_subkey) + if chunk_slice is not None: + return full_shard_value[chunk_slice] + else: + raise KeyError(key) + else: + return self.inner_store.__getitem__(key) + + def __setitem__(self, key, value): + value = ensure_bytes(value) + if self._is_data_key(key): + shard_key, chunk_subkey = self._key_to_shard(key) + chunks_to_read = set(self._get_chunks_in_shard(shard_key)) + chunks_to_read.remove(chunk_subkey) + new_content = {chunk_subkey: value} + try: + if self.supports_efficient_get_partial_values: + index = self._get_index_from_store(shard_key) + full_shard_value = None + else: + full_shard_value = self.inner_store[shard_key] + index = self._get_index_from_buffer(full_shard_value) + except KeyError: + index = _ShardIndex.create_empty(self) + else: + chunk_slices = [ + (chunk_to_read, index.get_chunk_slice(chunk_to_read)) + for chunk_to_read in chunks_to_read + ] + valid_chunk_slices = [ + (chunk_to_read, chunk_slice) + for chunk_to_read, chunk_slice in chunk_slices + if chunk_slice is not None + ] + # use get_partial_values if less than half of the available chunks must be read: + # (This can be changed when set_partial_values can be used efficiently.) 
+ use_partial_get = ( + self.supports_efficient_get_partial_values + and len(valid_chunk_slices) < len(chunk_slices) / 2 + ) + + if use_partial_get: + chunk_values = self.inner_store.get_partial_values( + [ + ( + shard_key, + ( + chunk_slice.start, + chunk_slice.stop - chunk_slice.start, + ), + ) + for _, chunk_slice in valid_chunk_slices + ] + ) + for chunk_value, (chunk_to_read, _) in zip(chunk_values, valid_chunk_slices): + new_content[chunk_to_read] = chunk_value + else: + if full_shard_value is None: + full_shard_value = self.inner_store[shard_key] + for chunk_to_read, chunk_slice in valid_chunk_slices: + if chunk_slice is not None: + new_content[chunk_to_read] = full_shard_value[chunk_slice] + + shard_content = b"" + for chunk_subkey, chunk_content in new_content.items(): + chunk_slice = slice(len(shard_content), len(shard_content) + len(chunk_content)) + index.set_chunk_slice(chunk_subkey, chunk_slice) + shard_content += chunk_content + # Appending the index at the end of the shard: + shard_content += index.to_bytes() + self.inner_store[shard_key] = shard_content + else: # pragma: no cover + self.inner_store[key] = value + + def __delitem__(self, key): + if self._is_data_key(key): + shard_key, chunk_subkey = self._key_to_shard(key) + try: + index = self._get_index_from_store(shard_key) + except KeyError as e: + raise KeyError(key) from e + + index.set_chunk_slice(chunk_subkey, None) + + if index.is_all_empty(): + del self.inner_store[shard_key] + else: + index_bytes = index.to_bytes() + self.inner_store.set_partial_values([(shard_key, -len(index_bytes), index_bytes)]) + else: # pragma: no cover + del self.inner_store[key] + + def _shard_key_to_original_keys(self, key: str) -> Iterator[str]: + if self._is_data_key(key): + index = self._get_index_from_store(key) + prefix, _, _ = key.rpartition("c") + for chunk_tuple in self._get_chunks_in_shard(key): + if index.get_chunk_slice(chunk_tuple) is not None: + yield prefix + "c" + self.dimension_separator.join(map(str, chunk_tuple)) + else: + yield key + + def __iter__(self) -> Iterator[str]: + for key in self.inner_store: + yield from self._shard_key_to_original_keys(key) + + def __len__(self): + return sum(1 for _ in self.keys()) + + def get_partial_values(self, key_ranges): + if self.supports_efficient_get_partial_values: + transformed_key_ranges = [] + cached_indices = {} + none_indices = [] + for i, (key, range_) in enumerate(key_ranges): + if self._is_data_key(key): + shard_key, chunk_subkey = self._key_to_shard(key) + try: + index = cached_indices[shard_key] + except KeyError: + try: + index = self._get_index_from_store(shard_key) + except KeyError: + none_indices.append(i) + continue + cached_indices[shard_key] = index + chunk_slice = index.get_chunk_slice(chunk_subkey) + if chunk_slice is None: + none_indices.append(i) + continue + range_start, range_length = range_ + if range_length is None: + range_length = chunk_slice.stop - chunk_slice.start + transformed_key_ranges.append( + (shard_key, (range_start + chunk_slice.start, range_length)) + ) + else: # pragma: no cover + transformed_key_ranges.append((key, range_)) + values = self.inner_store.get_partial_values(transformed_key_ranges) + for i in none_indices: + values.insert(i, None) + return values + else: + return StoreV3.get_partial_values(self, key_ranges) + + def supports_efficient_set_partial_values(self): + return False + + def set_partial_values(self, key_start_values): + # This does not yet implement efficient set_partial_values + StoreV3.set_partial_values(self, 
key_start_values) + + def rename(self, src_path: str, dst_path: str) -> None: + StoreV3.rename(self, src_path, dst_path) # type: ignore[arg-type] + + def list_prefix(self, prefix): + return StoreV3.list_prefix(self, prefix) + + def erase_prefix(self, prefix): + if self._is_data_key(prefix): + StoreV3.erase_prefix(self, prefix) + else: + self.inner_store.erase_prefix(prefix) + + def rmdir(self, path=None): + path = normalize_storage_path(path) + _rmdir_from_keys_v3(self, path) + + def __contains__(self, key): + if self._is_data_key(key): + shard_key, chunk_subkeys = self._key_to_shard(key) + try: + index = self._get_index_from_store(shard_key) + except KeyError: + return False + chunk_slice = index.get_chunk_slice(chunk_subkeys) + return chunk_slice is not None + else: + return self._inner_store.__contains__(key) diff --git a/src/zarr/v2/attrs.py b/src/zarr/v2/attrs.py new file mode 100644 index 0000000000..af9a5f1d30 --- /dev/null +++ b/src/zarr/v2/attrs.py @@ -0,0 +1,201 @@ +import warnings +from collections.abc import MutableMapping + +from zarr._storage.store import Store, StoreV3 +from zarr.util import json_dumps + + +class Attributes(MutableMapping): + """Class providing access to user attributes on an array or group. Should not be + instantiated directly, will be available via the `.attrs` property of an array or + group. + + Parameters + ---------- + store : MutableMapping + The store in which to store the attributes. + key : str, optional + The key under which the attributes will be stored. + read_only : bool, optional + If True, attributes cannot be modified. + cache : bool, optional + If True (default), attributes will be cached locally. + synchronizer : Synchronizer + Only necessary if attributes may be modified from multiple threads or processes. 
+ + """ + + def __init__( + self, store, key=".zattrs", read_only=False, cache=True, synchronizer=None, cached_dict=None + ): + self._version = getattr(store, "_store_version", 2) + _Store = Store if self._version == 2 else StoreV3 + self.store = _Store._ensure_store(store) + self.key = key + self.read_only = read_only + self.cache = cache + self._cached_asdict = cached_dict if cache else None + self.synchronizer = synchronizer + + def _get_nosync(self): + try: + data = self.store[self.key] + except KeyError: + d = dict() + if self._version > 2: + d["attributes"] = {} + else: + d = self.store._metadata_class.parse_metadata(data) + return d + + def asdict(self): + """Retrieve all attributes as a dictionary.""" + if self.cache and self._cached_asdict is not None: + return self._cached_asdict + d = self._get_nosync() + if self._version == 3: + d = d["attributes"] + if self.cache: + self._cached_asdict = d + return d + + def refresh(self): + """Refresh cached attributes from the store.""" + if self.cache: + if self._version == 2: + self._cached_asdict = self._get_nosync() + else: + self._cached_asdict = self._get_nosync()["attributes"] + + def __contains__(self, x): + return x in self.asdict() + + def __getitem__(self, item): + return self.asdict()[item] + + def _write_op(self, f, *args, **kwargs): + # guard condition + if self.read_only: + raise PermissionError("attributes are read-only") + + # synchronization + if self.synchronizer is None: + return f(*args, **kwargs) + else: + with self.synchronizer[self.key]: + return f(*args, **kwargs) + + def __setitem__(self, item, value): + self._write_op(self._setitem_nosync, item, value) + + def _setitem_nosync(self, item, value): + # load existing data + d = self._get_nosync() + + # set key value + if self._version == 2: + d[item] = value + else: + d["attributes"][item] = value + + # _put modified data + self._put_nosync(d) + + def __delitem__(self, item): + self._write_op(self._delitem_nosync, item) + + def _delitem_nosync(self, key): + # load existing data + d = self._get_nosync() + + # delete key value + if self._version == 2: + del d[key] + else: + del d["attributes"][key] + + # _put modified data + self._put_nosync(d) + + def put(self, d): + """Overwrite all attributes with the key/value pairs in the provided dictionary + `d` in a single operation.""" + if self._version == 2: + self._write_op(self._put_nosync, d) + else: + self._write_op(self._put_nosync, dict(attributes=d)) + + def _put_nosync(self, d): + d_to_check = d if self._version == 2 else d["attributes"] + if not all(isinstance(item, str) for item in d_to_check): + # TODO: Raise an error for non-string keys + # raise TypeError("attribute keys must be strings") + warnings.warn( + "only attribute keys of type 'string' will be allowed in the future", + DeprecationWarning, + stacklevel=2, + ) + + try: + d_to_check = {str(k): v for k, v in d_to_check.items()} + except TypeError as ex: # pragma: no cover + raise TypeError("attribute keys can not be stringified") from ex + + if self._version == 2: + d = d_to_check + else: + d["attributes"] = d_to_check + + if self._version == 2: + self.store[self.key] = json_dumps(d) + if self.cache: + self._cached_asdict = d + else: + try: + meta_unparsed = self.store[self.key] + # Cannot write the attributes directly to JSON, but have to + # store it within the pre-existing attributes key of the v3 + # metadata. + + # Note: this changes the store.counter result in test_caching_on! 
+ + meta = self.store._metadata_class.parse_metadata(meta_unparsed) + if "attributes" in meta and "filters" in meta["attributes"]: + # need to preserve any existing "filters" attribute + d["attributes"]["filters"] = meta["attributes"]["filters"] + meta["attributes"] = d["attributes"] + except KeyError: + meta = d + self.store[self.key] = json_dumps(meta) + if self.cache: + self._cached_asdict = d["attributes"] + + # noinspection PyMethodOverriding + def update(self, *args, **kwargs): + """Update the values of several attributes in a single operation.""" + self._write_op(self._update_nosync, *args, **kwargs) + + def _update_nosync(self, *args, **kwargs): + # load existing data + d = self._get_nosync() + + # update + if self._version == 2: + d.update(*args, **kwargs) + else: + d["attributes"].update(*args, **kwargs) + + # _put modified data + self._put_nosync(d) + + def keys(self): + return self.asdict().keys() + + def __iter__(self): + return iter(self.asdict()) + + def __len__(self): + return len(self.asdict()) + + def _ipython_key_completions_(self): + return sorted(self) diff --git a/src/zarr/v2/codecs.py b/src/zarr/v2/codecs.py new file mode 100644 index 0000000000..6fd5e20401 --- /dev/null +++ b/src/zarr/v2/codecs.py @@ -0,0 +1,4 @@ +# flake8: noqa +from numcodecs import * +from numcodecs import get_codec, Blosc, Pickle, Zlib, Zstd, Delta, AsType, BZ2 +from numcodecs.registry import codec_registry diff --git a/src/zarr/v2/context.py b/src/zarr/v2/context.py new file mode 100644 index 0000000000..3dd7dda4ac --- /dev/null +++ b/src/zarr/v2/context.py @@ -0,0 +1,19 @@ +from typing import TypedDict + +from numcodecs.compat import NDArrayLike + + +class Context(TypedDict, total=False): + """A context for component specific information + + All keys are optional. Any component reading the context must provide + a default implementation in the case a key cannot be found. + + Items + ----- + meta_array : array-like, optional + An array-like instance to use for determining the preferred output + array type. 
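+        For example, a store may inspect ``context["meta_array"]`` to decide
+        which array type (e.g. NumPy or CuPy) to allocate for the data it
+        returns.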
+ """ + + meta_array: NDArrayLike diff --git a/src/zarr/v2/convenience.py b/src/zarr/v2/convenience.py new file mode 100644 index 0000000000..a3cd702c9d --- /dev/null +++ b/src/zarr/v2/convenience.py @@ -0,0 +1,1366 @@ +"""Convenience functions for storing and loading data.""" + +import itertools +import os +import re +from collections.abc import Mapping, MutableMapping + +from zarr._storage.store import data_root, meta_root, assert_zarr_v3_api_available +from zarr.core import Array +from zarr.creation import array as _create_array +from zarr.creation import open_array +from zarr.errors import CopyError, PathNotFoundError +from zarr.hierarchy import Group +from zarr.hierarchy import group as _create_group +from zarr.hierarchy import open_group +from zarr.meta import json_dumps, json_loads +from zarr.storage import ( + _get_metadata_suffix, + contains_array, + contains_group, + normalize_store_arg, + BaseStore, + ConsolidatedMetadataStore, +) +from zarr._storage.v3 import ConsolidatedMetadataStoreV3 +from zarr.util import TreeViewer, buffer_size, normalize_storage_path + +from typing import Union + +StoreLike = Union[BaseStore, MutableMapping, str, None] + +_builtin_open = open # builtin open is later shadowed by a local open function + +__doctest_requires__ = {("*"): ["numpy>=2.2"]} + + +def _check_and_update_path(store: BaseStore, path): + if getattr(store, "_store_version", 2) > 2 and not path: + raise ValueError("path must be provided for v3 stores") + return normalize_storage_path(path) + + +# noinspection PyShadowingBuiltins +def open(store: StoreLike = None, mode: str = "a", *, zarr_version=None, path=None, **kwargs): + """Convenience function to open a group or array using file-mode-like semantics. + + Parameters + ---------- + store : Store or string, optional + Store or path to directory in file system or name of zip file. + mode : {'r', 'r+', 'a', 'w', 'w-'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist); 'a' means read/write (create if doesn't + exist); 'w' means create (overwrite if exists); 'w-' means create + (fail if exists). + zarr_version : {2, 3, None}, optional + The zarr protocol version to use. The default value of None will attempt + to infer the version from `store` if possible, otherwise it will fall + back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + + path : str or None, optional + The path within the store to open. + **kwargs + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. + + Returns + ------- + z : :class:`zarr.core.Array` or :class:`zarr.hierarchy.Group` + Array or group, depending on what exists in the given store. 
+ + See Also + -------- + zarr.creation.open_array, zarr.hierarchy.open_group + + Examples + -------- + + Storing data in a directory 'data/example.zarr' on the local file system:: + + >>> import zarr + >>> store = 'data/example.zarr' + >>> zw = zarr.open(store, mode='w', shape=100, dtype='i4') # open new array + >>> zw + + >>> za = zarr.open(store, mode='a') # open existing array for reading and writing + >>> za + + >>> zr = zarr.open(store, mode='r') # open existing array read-only + >>> zr + + >>> gw = zarr.open(store, mode='w') # open new group, overwriting previous data + >>> gw + + >>> ga = zarr.open(store, mode='a') # open existing group for reading and writing + >>> ga + + >>> gr = zarr.open(store, mode='r') # open existing group read-only + >>> gr + + + """ + + # handle polymorphic store arg + # we pass storage options explicitly, since normalize_store_arg might construct + # a store if the input is a fsspec-compatible URL + _store: BaseStore = normalize_store_arg( + store, + storage_options=kwargs.pop("storage_options", {}), + mode=mode, + zarr_version=zarr_version, + ) + # path = _check_and_update_path(_store, path) + path = normalize_storage_path(path) + kwargs["path"] = path + + if mode in {"w", "w-", "x"}: + if "shape" in kwargs: + return open_array(_store, mode=mode, **kwargs) + else: + return open_group(_store, mode=mode, **kwargs) + + elif mode == "a": + if "shape" in kwargs or contains_array(_store, path): + return open_array(_store, mode=mode, **kwargs) + else: + return open_group(_store, mode=mode, **kwargs) + + else: + if contains_array(_store, path): + return open_array(_store, mode=mode, **kwargs) + elif contains_group(_store, path): + return open_group(_store, mode=mode, **kwargs) + else: + raise PathNotFoundError(path) + + +def _might_close(path): + return isinstance(path, (str, os.PathLike)) + + +def save_array(store: StoreLike, arr, *, zarr_version=None, path=None, **kwargs): + """Convenience function to save a NumPy array to the local file system, following a + similar API to the NumPy save() function. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + arr : ndarray + NumPy array with data to save. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when saving. The default value of None + will attempt to infer the version from `store` if possible, otherwise + it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + + path : str or None, optional + The path within the store where the array will be saved. + kwargs + Passed through to :func:`create`, e.g., compressor. 
+ + Examples + -------- + Save an array to a directory on the file system (uses a :class:`DirectoryStore`):: + + >>> import zarr + >>> import numpy as np + >>> arr = np.arange(10000) + >>> zarr.save_array('data/example.zarr', arr) + >>> zarr.load('data/example.zarr') + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + + Save an array to a single file (uses a :class:`ZipStore`):: + + >>> zarr.save_array('data/example.zip', arr) + >>> zarr.load('data/example.zip') + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + + """ + may_need_closing = _might_close(store) + _store: BaseStore = normalize_store_arg(store, mode="w", zarr_version=zarr_version) + path = _check_and_update_path(_store, path) + try: + _create_array( + arr, store=_store, overwrite=True, zarr_version=zarr_version, path=path, **kwargs + ) + finally: + if may_need_closing: + # needed to ensure zip file records are written + _store.close() + + +def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): + """Convenience function to save several NumPy arrays to the local file system, following a + similar API to the NumPy savez()/savez_compressed() functions. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + args : ndarray + NumPy arrays with data to save. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when saving. The default value of None + will attempt to infer the version from `store` if possible, otherwise + it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + + path : str or None, optional + Path within the store where the group will be saved. + kwargs + NumPy arrays with data to save. + + Examples + -------- + Save several arrays to a directory on the file system (uses a + :class:`DirectoryStore`): + + >>> import zarr + >>> import numpy as np + >>> a1 = np.arange(10000) + >>> a2 = np.arange(10000, 0, -1) + >>> zarr.save_group('data/example.zarr', a1, a2) + >>> loader = zarr.load('data/example.zarr') + >>> loader + + >>> loader['arr_0'] + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + >>> loader['arr_1'] + array([10000, 9999, 9998, ..., 3, 2, 1], shape=(10000,)) + + Save several arrays using named keyword arguments:: + + >>> zarr.save_group('data/example.zarr', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zarr') + >>> loader + + >>> loader['foo'] + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + >>> loader['bar'] + array([10000, 9999, 9998, ..., 3, 2, 1], shape=(10000,)) + + Store several arrays in a single zip file (uses a :class:`ZipStore`):: + + >>> zarr.save_group('data/example.zip', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zip') + >>> loader + + >>> loader['foo'] + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + >>> loader['bar'] + array([10000, 9999, 9998, ..., 3, 2, 1], shape=(10000,)) + + Notes + ----- + Default compression options will be used. 
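+    Compression cannot be overridden here, since all keyword arguments are
+    interpreted as arrays to save. For per-array control over compression or
+    chunking, create the group and datasets explicitly instead.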
+ + """ + if len(args) == 0 and len(kwargs) == 0: + raise ValueError("at least one array must be provided") + # handle polymorphic store arg + may_need_closing = _might_close(store) + _store: BaseStore = normalize_store_arg(store, mode="w", zarr_version=zarr_version) + path = _check_and_update_path(_store, path) + try: + grp = _create_group(_store, path=path, overwrite=True, zarr_version=zarr_version) + for i, arr in enumerate(args): + k = f"arr_{i}" + grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) + for k, arr in kwargs.items(): + grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) + finally: + if may_need_closing: + # needed to ensure zip file records are written + _store.close() + + +def save(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): + """Convenience function to save an array or group of arrays to the local file system. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + args : ndarray + NumPy arrays with data to save. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when saving. The default value of None + will attempt to infer the version from `store` if possible, otherwise + it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + + path : str or None, optional + The path within the group where the arrays will be saved. + kwargs + NumPy arrays with data to save. + + Examples + -------- + Save an array to a directory on the file system (uses a :class:`DirectoryStore`):: + + >>> import zarr + >>> import numpy as np + >>> arr = np.arange(10000) + >>> zarr.save('data/example.zarr', arr) + >>> zarr.load('data/example.zarr') + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + + Save an array to a Zip file (uses a :class:`ZipStore`):: + + >>> zarr.save('data/example.zip', arr) + >>> zarr.load('data/example.zip') + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + + Save several arrays to a directory on the file system (uses a + :class:`DirectoryStore` and stores arrays in a group):: + + >>> import zarr + >>> import numpy as np + >>> a1 = np.arange(10000) + >>> a2 = np.arange(10000, 0, -1) + >>> zarr.save('data/example.zarr', a1, a2) + >>> loader = zarr.load('data/example.zarr') + >>> loader + + >>> loader['arr_0'] + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + >>> loader['arr_1'] + array([10000, 9999, 9998, ..., 3, 2, 1], shape=(10000,)) + + Save several arrays using named keyword arguments:: + + >>> zarr.save('data/example.zarr', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zarr') + >>> loader + + >>> loader['foo'] + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + >>> loader['bar'] + array([10000, 9999, 9998, ..., 3, 2, 1], shape=(10000,)) + + Store several arrays in a single zip file (uses a :class:`ZipStore`):: + + >>> zarr.save('data/example.zip', foo=a1, bar=a2) + >>> loader = zarr.load('data/example.zip') + >>> loader + + >>> loader['foo'] + array([ 0, 1, 2, ..., 9997, 9998, 9999], shape=(10000,)) + >>> loader['bar'] + array([10000, 9999, 9998, ..., 3, 2, 1], shape=(10000,)) + + See Also + -------- + save_array, save_group + + """ + if len(args) == 0 and len(kwargs) == 0: + raise ValueError("at least one array must be provided") + if len(args) == 1 and 
len(kwargs) == 0: + save_array(store, args[0], zarr_version=zarr_version, path=path) + else: + save_group(store, *args, zarr_version=zarr_version, path=path, **kwargs) + + +class LazyLoader(Mapping): + def __init__(self, grp): + self.grp = grp + self.cache = dict() + + def __getitem__(self, item): + try: + return self.cache[item] + except KeyError: + arr = self.grp[item][...] + self.cache[item] = arr + return arr + + def __len__(self): + return len(self.grp) + + def __iter__(self): + return iter(self.grp) + + def __contains__(self, item): + return item in self.grp + + def __repr__(self): + r = ">> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g3.create_group('baz') + >>> g5 = g3.create_group('qux') + >>> d1 = g5.create_dataset('baz', shape=100, chunks=10) + >>> g1.tree() + / + ├── bar + │ ├── baz + │ └── qux + │ └── baz (100,) float64 + └── foo + >>> import h5py + >>> h5f = h5py.File('data/example.h5', mode='w') + >>> zarr.copy_all(g1, h5f) + (5, 0, 800) + >>> zarr.tree(h5f) + / + ├── bar + │ ├── baz + │ └── qux + │ └── baz (100,) float64 + └── foo + + See Also + -------- + zarr.hierarchy.Group.tree + + Notes + ----- + Please note that this is an experimental feature. The behaviour of this + function is still evolving and the default output and/or parameters may change + in future versions. + + """ + + return TreeViewer(grp, expand=expand, level=level) + + +class _LogWriter: + def __init__(self, log): + self.log_func = None + self.log_file = None + self.needs_closing = False + if log is None: + # don't do any logging + pass + elif callable(log): + self.log_func = log + elif isinstance(log, str): + self.log_file = _builtin_open(log, mode="w") + self.needs_closing = True + elif hasattr(log, "write"): + self.log_file = log + else: + raise TypeError( + f"log must be a callable function, file path or file-like object, found {log!r}" + ) + + def __enter__(self): + return self + + def __exit__(self, *args): + if self.log_file is not None and self.needs_closing: + self.log_file.close() + + def __call__(self, *args, **kwargs): + if self.log_file is not None: + kwargs["file"] = self.log_file + print(*args, **kwargs) + if hasattr(self.log_file, "flush"): + # get immediate feedback + self.log_file.flush() + elif self.log_func is not None: + self.log_func(*args, **kwargs) + + +def _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied): + # log a final message with a summary of what happened + if dry_run: + message = "dry run: " + else: + message = "all done: " + message += f"{n_copied:,} copied, {n_skipped:,} skipped" + if not dry_run: + message += f", {n_bytes_copied:,} bytes copied" + log(message) + + +def copy_store( + source, + dest, + source_path="", + dest_path="", + excludes=None, + includes=None, + flags=0, + if_exists="raise", + dry_run=False, + log=None, +): + """Copy data directly from the `source` store to the `dest` store. Use this + function when you want to copy a group or array in the most efficient way, + preserving all configuration and attributes. This function is more efficient + than the copy() or copy_all() functions because it avoids de-compressing and + re-compressing data, rather the compressed chunk data for each array are + copied directly between stores. + + Parameters + ---------- + source : Mapping + Store to copy data from. + dest : MutableMapping + Store to copy data into. + source_path : str, optional + Only copy data from under this path in the source store. 
+ dest_path : str, optional + Copy data into this path in the destination store. + excludes : sequence of str, optional + One or more regular expressions which will be matched against keys in + the source store. Any matching key will not be copied. + includes : sequence of str, optional + One or more regular expressions which will be matched against keys in + the source store and will override any excludes also matching. + flags : int, optional + Regular expression flags used for matching excludes and includes. + if_exists : {'raise', 'replace', 'skip'}, optional + How to handle keys that already exist in the destination store. If + 'raise' then a CopyError is raised on the first key already present + in the destination store. If 'replace' then any data will be replaced in + the destination. If 'skip' then any existing keys will not be copied. + dry_run : bool, optional + If True, don't actually copy anything, just log what would have + happened. + log : callable, file path or file-like object, optional + If provided, will be used to log progress information. + + Returns + ------- + n_copied : int + Number of items copied. + n_skipped : int + Number of items skipped. + n_bytes_copied : int + Number of bytes of data that were actually copied. + + Examples + -------- + + >>> import zarr + >>> store1 = zarr.DirectoryStore('data/example.zarr') + >>> root = zarr.group(store1, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.create_group('bar') + >>> baz = bar.create_dataset('baz', shape=100, chunks=50, dtype='i8') + >>> import numpy as np + >>> baz[:] = np.arange(100) + >>> root.tree() + / + └── foo + └── bar + └── baz (100,) int64 + >>> from sys import stdout + >>> store2 = zarr.ZipStore('data/example.zip', mode='w') + >>> zarr.copy_store(store1, store2, log=stdout) + copy .zgroup + copy foo/.zgroup + copy foo/bar/.zgroup + copy foo/bar/baz/.zarray + copy foo/bar/baz/0 + copy foo/bar/baz/1 + all done: 6 copied, 0 skipped, 566 bytes copied + (6, 0, 566) + >>> new_root = zarr.group(store2) + >>> new_root.tree() + / + └── foo + └── bar + └── baz (100,) int64 + >>> new_root['foo/bar/baz'][:] + array([ 0, 1, 2, ..., 97, 98, 99]) + >>> store2.close() # zip stores need to be closed + + Notes + ----- + Please note that this is an experimental feature. The behaviour of this + function is still evolving and the default behaviour and/or parameters may change + in future versions. 
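+
+    The source and destination stores must share the same Zarr protocol
+    version; copying between a v2 and a v3 store raises a ``ValueError``.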
+ + """ + + # normalize paths + source_path = normalize_storage_path(source_path) + dest_path = normalize_storage_path(dest_path) + if source_path: + source_path = source_path + "/" + if dest_path: + dest_path = dest_path + "/" + + # normalize excludes and includes + if excludes is None: + excludes = [] + elif isinstance(excludes, str): + excludes = [excludes] + if includes is None: + includes = [] + elif isinstance(includes, str): + includes = [includes] + excludes = [re.compile(e, flags) for e in excludes] + includes = [re.compile(i, flags) for i in includes] + + # check if_exists parameter + valid_if_exists = ["raise", "replace", "skip"] + if if_exists not in valid_if_exists: + raise ValueError(f"if_exists must be one of {valid_if_exists!r}; found {if_exists!r}") + + # setup counting variables + n_copied = n_skipped = n_bytes_copied = 0 + + source_store_version = getattr(source, "_store_version", 2) + dest_store_version = getattr(dest, "_store_version", 2) + if source_store_version != dest_store_version: + raise ValueError("zarr stores must share the same protocol version") + + if source_store_version > 2: + nchar_root = len(meta_root) + # code below assumes len(meta_root) === len(data_root) + assert len(data_root) == nchar_root + + # setup logging + with _LogWriter(log) as log: + # iterate over source keys + for source_key in sorted(source.keys()): + # filter to keys under source path + if source_store_version == 2: + if not source_key.startswith(source_path): + continue + elif source_store_version == 3: + # skip 'meta/root/' or 'data/root/' at start of source_key + if not source_key[nchar_root:].startswith(source_path): + continue + + # process excludes and includes + exclude = False + for prog in excludes: + if prog.search(source_key): + exclude = True + break + if exclude: + for prog in includes: + if prog.search(source_key): + exclude = False + break + if exclude: + continue + + # map key to destination path + if source_store_version == 2: + key_suffix = source_key[len(source_path) :] + dest_key = dest_path + key_suffix + elif source_store_version == 3: + # nchar_root is length of 'meta/root/' or 'data/root/' + key_suffix = source_key[nchar_root + len(source_path) :] + dest_key = source_key[:nchar_root] + dest_path + key_suffix + + # create a descriptive label for this operation + descr = source_key + if dest_key != source_key: + descr = descr + " -> " + dest_key + + # decide what to do + do_copy = True + if if_exists != "replace": + if dest_key in dest: + if if_exists == "raise": + raise CopyError(f"key {dest_key!r} exists in destination") + elif if_exists == "skip": + do_copy = False + + # take action + if do_copy: + log(f"copy {descr}") + if not dry_run: + data = source[source_key] + n_bytes_copied += buffer_size(data) + dest[dest_key] = data + n_copied += 1 + else: + log(f"skip {descr}") + n_skipped += 1 + + # log a final message with a summary of what happened + _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) + + return n_copied, n_skipped, n_bytes_copied + + +def _check_dest_is_group(dest): + if not hasattr(dest, "create_dataset"): + raise ValueError(f"dest must be a group, got {dest!r}") + + +def copy( + source, + dest, + name=None, + shallow=False, + without_attrs=False, + log=None, + if_exists="raise", + dry_run=False, + **create_kws, +): + """Copy the `source` array or group into the `dest` group. + + Parameters + ---------- + source : group or array/dataset + A zarr group or array, or an h5py group or dataset. + dest : group + A zarr or h5py group. 
+ name : str, optional + Name to copy the object to. + shallow : bool, optional + If True, only copy immediate children of `source`. + without_attrs : bool, optional + Do not copy user attributes. + log : callable, file path or file-like object, optional + If provided, will be used to log progress information. + if_exists : {'raise', 'replace', 'skip', 'skip_initialized'}, optional + How to handle arrays that already exist in the destination group. If + 'raise' then a CopyError is raised on the first array already present + in the destination group. If 'replace' then any array will be + replaced in the destination. If 'skip' then any existing arrays will + not be copied. If 'skip_initialized' then any existing arrays with + all chunks initialized will not be copied (not available when copying to + h5py). + dry_run : bool, optional + If True, don't actually copy anything, just log what would have + happened. + **create_kws + Passed through to the create_dataset method when copying an array/dataset. + + Returns + ------- + n_copied : int + Number of items copied. + n_skipped : int + Number of items skipped. + n_bytes_copied : int + Number of bytes of data that were actually copied. + + Examples + -------- + Here's an example of copying a group named 'foo' from an HDF5 file to a + Zarr group:: + + >>> import h5py + >>> import zarr + >>> import numpy as np + >>> source = h5py.File('data/example.h5', mode='w') + >>> foo = source.create_group('foo') + >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) + >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) + >>> zarr.tree(source) + / + ├── foo + │ └── bar + │ └── baz (100,) int64 + └── spam (100,) int64 + >>> dest = zarr.group() + >>> from sys import stdout + >>> zarr.copy(source['foo'], dest, log=stdout) + copy /foo + copy /foo/bar + copy /foo/bar/baz (100,) int64 + all done: 3 copied, 0 skipped, 800 bytes copied + (3, 0, 800) + >>> dest.tree() # N.B., no spam + / + └── foo + └── bar + └── baz (100,) int64 + >>> source.close() + + The ``if_exists`` parameter provides options for how to handle pre-existing data in + the destination. Here are some examples of these options, also using + ``dry_run=True`` to find out what would happen without actually copying anything:: + + >>> source = zarr.group() + >>> dest = zarr.group() + >>> baz = source.create_dataset('foo/bar/baz', data=np.arange(100)) + >>> spam = source.create_dataset('foo/spam', data=np.arange(1000)) + >>> existing_spam = dest.create_dataset('foo/spam', data=np.arange(1000)) + >>> from sys import stdout + >>> try: + ... zarr.copy(source['foo'], dest, log=stdout, dry_run=True) + ... except zarr.CopyError as e: + ... print(e) + ... + copy /foo + copy /foo/bar + copy /foo/bar/baz (100,) int64 + an object 'spam' already exists in destination '/foo' + >>> zarr.copy(source['foo'], dest, log=stdout, if_exists='replace', dry_run=True) + copy /foo + copy /foo/bar + copy /foo/bar/baz (100,) int64 + copy /foo/spam (1000,) int64 + dry run: 4 copied, 0 skipped + (4, 0, 0) + >>> zarr.copy(source['foo'], dest, log=stdout, if_exists='skip', dry_run=True) + copy /foo + copy /foo/bar + copy /foo/bar/baz (100,) int64 + skip /foo/spam (1000,) int64 + dry run: 3 copied, 1 skipped + (3, 1, 0) + + Notes + ----- + Please note that this is an experimental feature. The behaviour of this + function is still evolving and the default behaviour and/or parameters may change + in future versions. 
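+
+    The 'skip_initialized' option depends on ``nchunks_initialized`` and is
+    therefore only supported for zarr destinations; requesting it when copying
+    to h5py raises a ``ValueError``.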
+ + """ + + # value checks + _check_dest_is_group(dest) + + # setup logging + with _LogWriter(log) as log: + # do the copying + n_copied, n_skipped, n_bytes_copied = _copy( + log, + source, + dest, + name=name, + root=True, + shallow=shallow, + without_attrs=without_attrs, + if_exists=if_exists, + dry_run=dry_run, + **create_kws, + ) + + # log a final message with a summary of what happened + _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) + + return n_copied, n_skipped, n_bytes_copied + + +def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_run, **create_kws): + # N.B., if this is a dry run, dest may be None + + # setup counting variables + n_copied = n_skipped = n_bytes_copied = 0 + + # are we copying to/from h5py? + source_h5py = source.__module__.startswith("h5py.") + dest_h5py = dest is not None and dest.__module__.startswith("h5py.") + + # check if_exists parameter + valid_if_exists = ["raise", "replace", "skip", "skip_initialized"] + if if_exists not in valid_if_exists: + raise ValueError(f"if_exists must be one of {valid_if_exists!r}; found {if_exists!r}") + if dest_h5py and if_exists == "skip_initialized": + raise ValueError(f"{if_exists!r} can only be used when copying to zarr") + + # determine name to copy to + if name is None: + name = source.name.split("/")[-1] + if not name: + # this can happen if source is the root group + raise TypeError( + "source has no name, please provide the `name` " + "parameter to indicate a name to copy to" + ) + + if hasattr(source, "shape"): + # copy a dataset/array + + # check if already exists, decide what to do + do_copy = True + exists = dest is not None and name in dest + if exists: + if if_exists == "raise": + raise CopyError(f"an object {name!r} already exists in destination {dest.name!r}") + elif if_exists == "skip": + do_copy = False + elif if_exists == "skip_initialized": + ds = dest[name] + if ds.nchunks_initialized == ds.nchunks: + do_copy = False + + # take action + if do_copy: + # log a message about what we're going to do + log(f"copy {source.name} {source.shape} {source.dtype}") + + if not dry_run: + # clear the way + if exists: + del dest[name] + + # setup creation keyword arguments + kws = create_kws.copy() + + # setup chunks option, preserve by default + kws.setdefault("chunks", source.chunks) + + # setup compression options + if source_h5py: + if dest_h5py: + # h5py -> h5py; preserve compression options by default + kws.setdefault("compression", source.compression) + kws.setdefault("compression_opts", source.compression_opts) + kws.setdefault("shuffle", source.shuffle) + kws.setdefault("fletcher32", source.fletcher32) + kws.setdefault("fillvalue", source.fillvalue) + else: + # h5py -> zarr; use zarr default compression options + kws.setdefault("fill_value", source.fillvalue) + else: + if dest_h5py: + # zarr -> h5py; use some vaguely sensible defaults + kws.setdefault("chunks", True) + kws.setdefault("compression", "gzip") + kws.setdefault("compression_opts", 1) + kws.setdefault("shuffle", False) + kws.setdefault("fillvalue", source.fill_value) + else: + # zarr -> zarr; preserve compression options by default + kws.setdefault("compressor", source.compressor) + kws.setdefault("filters", source.filters) + kws.setdefault("order", source.order) + kws.setdefault("fill_value", source.fill_value) + + # create new dataset in destination + ds = dest.create_dataset(name, shape=source.shape, dtype=source.dtype, **kws) + + # copy data - N.B., go chunk by chunk to avoid loading + # everything 
into memory + shape = ds.shape + chunks = ds.chunks + chunk_offsets = [range(0, s, c) for s, c in zip(shape, chunks)] + for offset in itertools.product(*chunk_offsets): + sel = tuple(slice(o, min(s, o + c)) for o, s, c in zip(offset, shape, chunks)) + ds[sel] = source[sel] + n_bytes_copied += ds.size * ds.dtype.itemsize + + # copy attributes + if not without_attrs: + if dest_h5py and "filters" in source.attrs: + # No filters key in v3 metadata so it was stored in the + # attributes instead. We cannot copy this key to + # HDF5 attrs, though! + source_attrs = source.attrs.asdict().copy() + source_attrs.pop("filters", None) + else: + source_attrs = source.attrs + ds.attrs.update(source_attrs) + + n_copied += 1 + + else: + log(f"skip {source.name} {source.shape} {source.dtype}") + n_skipped += 1 + + elif root or not shallow: + # copy a group + + # check if an array is in the way + do_copy = True + exists_array = dest is not None and name in dest and hasattr(dest[name], "shape") + if exists_array: + if if_exists == "raise": + raise CopyError(f"an array {name!r} already exists in destination {dest.name!r}") + elif if_exists == "skip": + do_copy = False + + # take action + if do_copy: + # log action + log(f"copy {source.name}") + + if not dry_run: + # clear the way + if exists_array: + del dest[name] + + # require group in destination + grp = dest.require_group(name) + + # copy attributes + if not without_attrs: + grp.attrs.update(source.attrs) + + else: + # setup for dry run without creating any groups in the + # destination + if dest is not None: + grp = dest.get(name, None) + else: + grp = None + + # recurse + for k in source.keys(): + c, s, b = _copy( + log, + source[k], + grp, + name=k, + root=False, + shallow=shallow, + without_attrs=without_attrs, + if_exists=if_exists, + dry_run=dry_run, + **create_kws, + ) + n_copied += c + n_skipped += s + n_bytes_copied += b + + n_copied += 1 + + else: + log(f"skip {source.name}") + n_skipped += 1 + + return n_copied, n_skipped, n_bytes_copied + + +def copy_all( + source, + dest, + shallow=False, + without_attrs=False, + log=None, + if_exists="raise", + dry_run=False, + **create_kws, +): + """Copy all children of the `source` group into the `dest` group. + + Parameters + ---------- + source : group or array/dataset + A zarr group or array, or an h5py group or dataset. + dest : group + A zarr or h5py group. + shallow : bool, optional + If True, only copy immediate children of `source`. + without_attrs : bool, optional + Do not copy user attributes. + log : callable, file path or file-like object, optional + If provided, will be used to log progress information. + if_exists : {'raise', 'replace', 'skip', 'skip_initialized'}, optional + How to handle arrays that already exist in the destination group. If + 'raise' then a CopyError is raised on the first array already present + in the destination group. If 'replace' then any array will be + replaced in the destination. If 'skip' then any existing arrays will + not be copied. If 'skip_initialized' then any existing arrays with + all chunks initialized will not be copied (not available when copying to + h5py). + dry_run : bool, optional + If True, don't actually copy anything, just log what would have + happened. + **create_kws + Passed through to the create_dataset method when copying an + array/dataset. + + Returns + ------- + n_copied : int + Number of items copied. + n_skipped : int + Number of items skipped. + n_bytes_copied : int + Number of bytes of data that were actually copied. 
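+
+    Raises
+    ------
+    ValueError
+        If `dest` is not a group (it must provide a ``create_dataset`` method).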
+ + Examples + -------- + >>> import h5py + >>> import zarr + >>> import numpy as np + >>> source = h5py.File('data/example.h5', mode='w') + >>> foo = source.create_group('foo') + >>> baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) + >>> spam = source.create_dataset('spam', data=np.arange(100, 200), chunks=(30,)) + >>> zarr.tree(source) + / + ├── foo + │ └── bar + │ └── baz (100,) int64 + └── spam (100,) int64 + >>> dest = zarr.group() + >>> import sys + >>> zarr.copy_all(source, dest, log=sys.stdout) + copy /foo + copy /foo/bar + copy /foo/bar/baz (100,) int64 + copy /spam (100,) int64 + all done: 4 copied, 0 skipped, 1,600 bytes copied + (4, 0, 1600) + >>> dest.tree() + / + ├── foo + │ └── bar + │ └── baz (100,) int64 + └── spam (100,) int64 + >>> source.close() + + Notes + ----- + Please note that this is an experimental feature. The behaviour of this + function is still evolving and the default behaviour and/or parameters may change + in future versions. + + """ + + # value checks + _check_dest_is_group(dest) + + # setup counting variables + n_copied = n_skipped = n_bytes_copied = 0 + + zarr_version = getattr(source, "_version", 2) + + # setup logging + with _LogWriter(log) as log: + for k in source.keys(): + c, s, b = _copy( + log, + source[k], + dest, + name=k, + root=False, + shallow=shallow, + without_attrs=without_attrs, + if_exists=if_exists, + dry_run=dry_run, + **create_kws, + ) + n_copied += c + n_skipped += s + n_bytes_copied += b + if zarr_version == 2: + dest.attrs.update(**source.attrs) + + # log a final message with a summary of what happened + _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) + + return n_copied, n_skipped, n_bytes_copied + + +def consolidate_metadata(store: BaseStore, metadata_key=".zmetadata", *, path=""): + """ + Consolidate all metadata for groups and arrays within the given store + into a single resource and put it under the given key. + + This produces a single object in the backend store, containing all the + metadata read from all the zarr-related keys that can be found. After + metadata have been consolidated, use :func:`open_consolidated` to open + the root group in optimised, read-only mode, using the consolidated + metadata to reduce the number of read operations on the backend store. + + Note, that if the metadata in the store is changed after this + consolidation, then the metadata read by :func:`open_consolidated` + would be incorrect unless this function is called again. + + .. note:: This is an experimental feature. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to put the consolidated metadata under. + path : str or None + Path corresponding to the group that is being consolidated. Not required + for zarr v2 stores. + + Returns + ------- + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the new consolidated metadata. 
+ + See Also + -------- + open_consolidated + + """ + store = normalize_store_arg(store, mode="w") + + version = store._store_version + + if version == 2: + + def is_zarr_key(key): + return key.endswith(".zarray") or key.endswith(".zgroup") or key.endswith(".zattrs") + + else: + assert_zarr_v3_api_available() + + sfx = _get_metadata_suffix(store) # type: ignore + + def is_zarr_key(key): + return ( + key.endswith(".array" + sfx) or key.endswith(".group" + sfx) or key == "zarr.json" + ) + + # cannot create a group without a path in v3 + # so create /meta/root/consolidated group to store the metadata + if "consolidated" not in store: + _create_group(store, path="consolidated") + if not metadata_key.startswith("meta/root/"): + metadata_key = "meta/root/consolidated/" + metadata_key + # path = 'consolidated' + + out = { + "zarr_consolidated_format": 1, + "metadata": {key: json_loads(store[key]) for key in store if is_zarr_key(key)}, + } + store[metadata_key] = json_dumps(out) + return open_consolidated(store, metadata_key=metadata_key, path=path) + + +def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", **kwargs): + """Open group using metadata previously consolidated into a single key. + + This is an optimised method for opening a Zarr group, where instead of + traversing the group/array hierarchy by accessing the metadata keys at + each level, a single key contains all of the metadata for everything. + For remote data sources where the overhead of accessing a key is large + compared to the time to read data. + + The group accessed must have already had its metadata consolidated into a + single key using the function :func:`consolidate_metadata`. + + This optimised method only works in modes which do not change the + metadata, although the data may still be written/updated. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to read the consolidated metadata from. The default (.zmetadata) + corresponds to the default used by :func:`consolidate_metadata`. + mode : {'r', 'r+'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist) although only writes to data are allowed, + changes to metadata including creation of new arrays or group + are not allowed. + **kwargs + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. + + Returns + ------- + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the consolidated metadata. 
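+
+    Raises
+    ------
+    ValueError
+        If `mode` is anything other than 'r' or 'r+'.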
+ + See Also + -------- + consolidate_metadata + + """ + + # normalize parameters + zarr_version = kwargs.get("zarr_version") + store = normalize_store_arg( + store, storage_options=kwargs.get("storage_options"), mode=mode, zarr_version=zarr_version + ) + if mode not in {"r", "r+"}: + raise ValueError(f"invalid mode, expected either 'r' or 'r+'; found {mode!r}") + + path = kwargs.pop("path", None) + if store._store_version == 2: + ConsolidatedStoreClass = ConsolidatedMetadataStore + else: + assert_zarr_v3_api_available() + ConsolidatedStoreClass = ConsolidatedMetadataStoreV3 + # default is to store within 'consolidated' group on v3 + if not metadata_key.startswith("meta/root/"): + metadata_key = "meta/root/consolidated/" + metadata_key + + # setup metadata store + meta_store = ConsolidatedStoreClass(store, metadata_key=metadata_key) + + # pass through + chunk_store = kwargs.pop("chunk_store", None) or store + return open(store=meta_store, chunk_store=chunk_store, mode=mode, path=path, **kwargs) diff --git a/src/zarr/v2/core.py b/src/zarr/v2/core.py new file mode 100644 index 0000000000..0bbea83816 --- /dev/null +++ b/src/zarr/v2/core.py @@ -0,0 +1,2960 @@ +import binascii +import hashlib +import itertools +import math +import operator +import re +from functools import reduce +from typing import Any +import warnings + +import numpy as np +from numcodecs.compat import ensure_bytes + +from zarr._storage.store import _prefix_to_attrs_key, assert_zarr_v3_api_available +from zarr.attrs import Attributes +from zarr.codecs import AsType, get_codec +from zarr.context import Context +from zarr.errors import ArrayNotFoundError, ReadOnlyError, ArrayIndexError +from zarr.indexing import ( + BasicIndexer, + CoordinateIndexer, + MaskIndexer, + OIndex, + OrthogonalIndexer, + VIndex, + BlockIndex, + BlockIndexer, + PartialChunkIterator, + check_fields, + check_no_multi_fields, + ensure_tuple, + err_too_many_indices, + is_contiguous_selection, + is_pure_fancy_indexing, + is_pure_orthogonal_indexing, + is_scalar, + pop_fields, +) +from zarr.storage import ( + _get_hierarchy_metadata, + _prefix_to_array_key, + KVStore, + getsize, + listdir, + normalize_store_arg, +) +from zarr.util import ( + ConstantMap, + all_equal, + InfoReporter, + check_array_shape, + human_readable_size, + is_total_slice, + nolock, + normalize_chunks, + normalize_resize_args, + normalize_shape, + normalize_storage_path, + PartialReadBuffer, + UncompressedPartialReadBufferV3, + ensure_ndarray_like, +) + +__all__ = ["Array"] +__doctest_requires__ = {("*"): ["numpy>=2.2"]} + + +# noinspection PyUnresolvedReferences +class Array: + """Instantiate an array from an initialized store. + + Parameters + ---------- + store : MutableMapping + Array store, already initialized. + path : string, optional + Storage path. + read_only : bool, optional + True if array should be protected against modification. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + synchronizer : object, optional + Array synchronizer. + cache_metadata : bool, optional + If True (default), array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. 
If False, user attributes are reloaded from the store prior + to all attribute read operations. + write_empty_chunks : bool, optional + If True, all chunks will be stored regardless of their contents. If + False (default), each chunk is compared to the array's fill value prior + to storing. If a chunk is uniformly equal to the fill value, then that + chunk is not be stored, and the store entry for that chunk's key is + deleted. This setting enables sparser storage, as only chunks with + non-fill-value data are stored, at the expense of overhead associated + with checking the data of each chunk. + + .. versionadded:: 2.11 + + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. versionadded:: 2.13 + """ + + def __init__( + self, + store: Any, # BaseStore not strictly required due to normalize_store_arg + path=None, + read_only=False, + chunk_store=None, + synchronizer=None, + cache_metadata=True, + cache_attrs=True, + partial_decompress=None, + write_empty_chunks=True, + zarr_version=None, + meta_array=None, + ): + # N.B., expect at this point store is fully initialized with all + # configuration metadata fully specified and normalized + + store = normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = store._store_version + + if zarr_version != 2: + assert_zarr_v3_api_available() + + if chunk_store is not None: + chunk_store = normalize_store_arg(chunk_store, zarr_version=zarr_version) + + self._store = store + self._chunk_store = chunk_store + self._transformed_chunk_store = None + self._path = normalize_storage_path(path) + if self._path: + self._key_prefix = self._path + "/" + else: + self._key_prefix = "" + self._read_only = bool(read_only) + self._synchronizer = synchronizer + self._cache_metadata = cache_metadata + self._is_view = False + if partial_decompress is not None: + warnings.warn( + "Support for partial decompression is no longer supported in numcodecs. 
" + "Support for partial decompression will be removed in a future version of zarr-python v2.", + DeprecationWarning, + stacklevel=1, + ) + self._partial_decompress = partial_decompress + self._write_empty_chunks = write_empty_chunks + if meta_array is not None: + self._meta_array = np.empty_like(meta_array, shape=()) + else: + self._meta_array = np.empty(()) + self._version = zarr_version + if self._version == 3: + self._data_key_prefix = "data/root/" + self._key_prefix + self._data_path = "data/root/" + self._path + self._hierarchy_metadata = _get_hierarchy_metadata(store=self._store) + self._metadata_key_suffix = self._hierarchy_metadata["metadata_key_suffix"] + + # initialize metadata + self._load_metadata() + + # initialize attributes + akey = _prefix_to_attrs_key(self._store, self._key_prefix) + self._attrs = Attributes( + store, + key=akey, + read_only=read_only, + synchronizer=synchronizer, + cache=cache_attrs, + cached_dict=self._meta["attributes"] if self._version == 3 else None, + ) + + # initialize info reporter + + # initialize indexing helpers + self._oindex = OIndex(self) + self._vindex = VIndex(self) + self._blocks = BlockIndex(self) + + def _load_metadata(self): + """(Re)load metadata from store.""" + if self._synchronizer is None: + self._load_metadata_nosync() + else: + mkey = _prefix_to_array_key(self._store, self._key_prefix) + with self._synchronizer[mkey]: + self._load_metadata_nosync() + + def _load_metadata_nosync(self): + try: + mkey = _prefix_to_array_key(self._store, self._key_prefix) + meta_bytes = self._store[mkey] + except KeyError as e: + raise ArrayNotFoundError(self._path) from e + else: + # decode and store metadata as instance members + meta = self._store._metadata_class.decode_array_metadata(meta_bytes) + self._meta = meta + self._shape = meta["shape"] + self._fill_value = meta["fill_value"] + dimension_separator = meta.get("dimension_separator", None) + if self._version == 2: + self._chunks = meta["chunks"] + self._dtype = meta["dtype"] + self._order = meta["order"] + if dimension_separator is None: + try: + dimension_separator = self._store._dimension_separator + except (AttributeError, KeyError): + pass + + # Fallback for any stores which do not choose a default + if dimension_separator is None: + dimension_separator = "." + else: + self._chunks = meta["chunk_grid"]["chunk_shape"] + self._dtype = meta["data_type"] + self._order = meta["chunk_memory_layout"] + chunk_separator = meta["chunk_grid"]["separator"] + if dimension_separator is None: + dimension_separator = meta.get("dimension_separator", chunk_separator) + + self._dimension_separator = dimension_separator + + # setup compressor + compressor = meta.get("compressor", None) + if compressor is None: + self._compressor = None + elif self._version == 2: + self._compressor = get_codec(compressor) + else: + self._compressor = compressor + + # setup filters + if self._version == 2: + filters = meta.get("filters", []) + else: + # TODO: storing filters under attributes for now since the v3 + # array metadata does not have a 'filters' attribute. 
+ filters = meta["attributes"].get("filters", []) + if filters: + filters = [get_codec(config) for config in filters] + self._filters = filters + + if self._version == 3: + storage_transformers = meta.get("storage_transformers", []) + if storage_transformers: + transformed_store = self._chunk_store or self._store + for storage_transformer in storage_transformers[::-1]: + transformed_store = storage_transformer._copy_for_array( + self, transformed_store + ) + self._transformed_chunk_store = transformed_store + + def _refresh_metadata(self): + if not self._cache_metadata: + self._load_metadata() + + def _refresh_metadata_nosync(self): + if not self._cache_metadata and not self._is_view: + self._load_metadata_nosync() + + def _flush_metadata_nosync(self): + if self._is_view: + raise PermissionError("operation not permitted for views") + + if self._compressor: + compressor_config = self._compressor.get_config() + else: + compressor_config = None + if self._filters: + filters_config = [f.get_config() for f in self._filters] + else: + filters_config = None + _compressor = compressor_config if self._version == 2 else self._compressor + meta = dict( + shape=self._shape, + compressor=_compressor, + fill_value=self._fill_value, + filters=filters_config, + ) + if getattr(self._store, "_store_version", 2) == 2: + meta.update( + dict( + chunks=self._chunks, + dtype=self._dtype, + order=self._order, + dimension_separator=self._dimension_separator, + ) + ) + else: + meta.update( + dict( + chunk_grid=dict( + type="regular", + chunk_shape=self._chunks, + separator=self._dimension_separator, + ), + data_type=self._dtype, + chunk_memory_layout=self._order, + attributes=self.attrs.asdict(), + ) + ) + mkey = _prefix_to_array_key(self._store, self._key_prefix) + self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta) + + @property + def store(self): + """A MutableMapping providing the underlying storage for the array.""" + return self._store + + @property + def path(self): + """Storage path.""" + return self._path + + @property + def name(self): + """Array name following h5py convention.""" + if self.path: + # follow h5py convention: add leading slash + name = self.path + if name[0] != "/": + name = "/" + name + return name + return None + + @property + def basename(self): + """Final component of name.""" + if self.name is not None: + return self.name.split("/")[-1] + return None + + @property + def read_only(self): + """A boolean, True if modification operations are not permitted.""" + return self._read_only + + @read_only.setter + def read_only(self, value): + self._read_only = bool(value) + + @property + def chunk_store(self): + """A MutableMapping providing the underlying storage for array chunks.""" + if self._transformed_chunk_store is not None: + return self._transformed_chunk_store + elif self._chunk_store is not None: + return self._chunk_store + else: + return self._store + + @property + def shape(self): + """A tuple of integers describing the length of each dimension of + the array.""" + # N.B., shape may change if array is resized, hence need to refresh + # metadata + self._refresh_metadata() + return self._shape + + @shape.setter + def shape(self, value): + self.resize(value) + + @property + def chunks(self): + """A tuple of integers describing the length of each dimension of a + chunk of the array.""" + return self._chunks + + @property + def dtype(self): + """The NumPy data type.""" + return self._dtype + + @property + def compressor(self): + """Primary compression codec.""" + 
return self._compressor + + @property + def fill_value(self): + """A value used for uninitialized portions of the array.""" + return self._fill_value + + @fill_value.setter + def fill_value(self, new): + self._fill_value = new + self._flush_metadata_nosync() + + @property + def order(self): + """A string indicating the order in which bytes are arranged within + chunks of the array.""" + return self._order + + @property + def filters(self): + """One or more codecs used to transform data prior to compression.""" + return self._filters + + @property + def synchronizer(self): + """Object used to synchronize write access to the array.""" + return self._synchronizer + + @property + def attrs(self): + """A MutableMapping containing user-defined attributes. Note that + attribute values must be JSON serializable.""" + return self._attrs + + @property + def ndim(self): + """Number of dimensions.""" + return len(self._shape) + + @property + def _size(self): + return reduce(operator.mul, self._shape, 1) + + @property + def size(self): + """The total number of elements in the array.""" + # N.B., this property depends on shape, and shape may change if array + # is resized, hence need to refresh metadata + self._refresh_metadata() + return self._size + + @property + def itemsize(self): + """The size in bytes of each item in the array.""" + return self.dtype.itemsize + + @property + def _nbytes(self): + return self._size * self.itemsize + + @property + def nbytes(self): + """The total number of bytes that would be required to store the + array without compression.""" + # N.B., this property depends on shape, and shape may change if array + # is resized, hence need to refresh metadata + self._refresh_metadata() + return self._nbytes + + @property + def nbytes_stored(self): + """The total number of stored bytes of data for the array. This + includes storage required for configuration metadata and user + attributes.""" + m = getsize(self._store, self._path) + if self._chunk_store is None: + return m + else: + n = getsize(self._chunk_store, self._path) + if m < 0 or n < 0: + return -1 + else: + return m + n + + @property + def _cdata_shape(self): + if self._shape == (): + return (1,) + else: + return tuple(math.ceil(s / c) for s, c in zip(self._shape, self._chunks)) + + @property + def cdata_shape(self): + """A tuple of integers describing the number of chunks along each + dimension of the array.""" + self._refresh_metadata() + return self._cdata_shape + + @property + def _nchunks(self): + return reduce(operator.mul, self._cdata_shape, 1) + + @property + def nchunks(self): + """Total number of chunks.""" + self._refresh_metadata() + return self._nchunks + + @property + def nchunks_initialized(self): + """The number of chunks that have been initialized with some data.""" + + # count chunk keys + if self._version == 3: + # # key pattern for chunk keys + # prog = re.compile(r'\.'.join([r'c\d+'] * min(1, self.ndim))) + # # get chunk keys, excluding the prefix + # members = self.chunk_store.list_prefix(self._data_path) + # members = [k.split(self._data_key_prefix)[1] for k in members] + # # count the chunk keys + # return sum(1 for k in members if prog.match(k)) + + # key pattern for chunk keys + prog = re.compile(self._data_key_prefix + r"c\d+") # TODO: ndim == 0 case? 
+ # get chunk keys, excluding the prefix + members = self.chunk_store.list_prefix(self._data_path) + # count the chunk keys + return sum(1 for k in members if prog.match(k)) + else: + # key pattern for chunk keys + prog = re.compile(r"\.".join([r"\d+"] * min(1, self.ndim))) + + # count chunk keys + return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) + + # backwards compatibility + initialized = nchunks_initialized + + @property + def is_view(self): + """A boolean, True if this array is a view on another array.""" + return self._is_view + + @property + def oindex(self): + """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and + :func:`set_orthogonal_selection` for documentation and examples.""" + return self._oindex + + @property + def vindex(self): + """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, + :func:`set_coordinate_selection`, :func:`get_mask_selection` and + :func:`set_mask_selection` for documentation and examples.""" + return self._vindex + + @property + def blocks(self): + """Shortcut for blocked chunked indexing, see :func:`get_block_selection` and + :func:`set_block_selection` for documentation and examples.""" + return self._blocks + + @property + def write_empty_chunks(self) -> bool: + """A Boolean, True if chunks composed of the array's fill value + will be stored. If False, such chunks will not be stored. + """ + return self._write_empty_chunks + + @property + def meta_array(self): + """An array-like instance to use for determining arrays to create and return + to users. + """ + return self._meta_array + + def __eq__(self, other): + return ( + isinstance(other, Array) + and self.store == other.store + and self.read_only == other.read_only + and self.path == other.path + and not self._is_view + # N.B., no need to compare other properties, should be covered by + # store comparison + ) + + def __array__(self, *args, **kwargs): + return np.array(self[...], *args, **kwargs) + + def islice(self, start=None, end=None): + """ + Yield a generator for iterating over the entire or parts of the + array. Uses a cache so chunks only have to be decompressed once. + + Parameters + ---------- + start : int, optional + Start index for the generator to start at. Defaults to 0. + end : int, optional + End index for the generator to stop at. Defaults to self.shape[0]. + + Yields + ------ + out : generator + A generator that can be used to iterate over the requested region + the array. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100)) + + Iterate over part of the array: + >>> for value in z.islice(25, 30): value; + np.int64(25) + np.int64(26) + np.int64(27) + np.int64(28) + np.int64(29) + """ + + if len(self.shape) == 0: + # Same error as numpy + raise TypeError("iteration over a 0-d array") + if start is None: + start = 0 + if end is None or end > self.shape[0]: + end = self.shape[0] + + if not isinstance(start, int) or start < 0: + raise ValueError("start must be a nonnegative integer") + + if not isinstance(end, int) or end < 0: + raise ValueError("end must be a nonnegative integer") + + # Avoid repeatedly decompressing chunks by iterating over the chunks + # in the first dimension. 
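# Illustration of the caching below: with chunks=(10,) and islice(25, 30),
# j == 25 does not fall on a chunk boundary, so the "elif chunk is None" branch
# loads rows 20:30 once; rows 26-29 are then served from that cached chunk
# without decoding it again.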
+ chunk_size = self.chunks[0] + chunk = None + for j in range(start, end): + if j % chunk_size == 0: + chunk = self[j : j + chunk_size] + # init chunk if we start offset of chunk borders + elif chunk is None: + chunk_start = j - j % chunk_size + chunk_end = chunk_start + chunk_size + chunk = self[chunk_start:chunk_end] + yield chunk[j % chunk_size] + + def __iter__(self): + return self.islice() + + def __len__(self): + if self.shape: + return self.shape[0] + else: + # 0-dimensional array, same error message as numpy + raise TypeError("len() of unsized object") + + def __getitem__(self, selection): + """Retrieve data for an item or region of the array. + + Parameters + ---------- + selection : tuple + An integer index or slice or tuple of int/slice objects specifying the + requested item or region for each dimension of the array. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested region. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100)) + + Retrieve a single item:: + + >>> z[5] + np.int64(5) + + Retrieve a region via slicing:: + + >>> z[:5] + array([0, 1, 2, 3, 4]) + >>> z[-5:] + array([95, 96, 97, 98, 99]) + >>> z[5:10] + array([5, 6, 7, 8, 9]) + >>> z[5:10:2] + array([5, 7, 9]) + >>> z[::2] + array([ 0, 2, 4, ..., 94, 96, 98]) + + Load the entire array into memory:: + + >>> z[...] + array([ 0, 1, 2, ..., 97, 98, 99]) + + Setup a 2-dimensional array:: + + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + + Retrieve an item:: + + >>> z[2, 2] + np.int64(22) + + Retrieve a region via slicing:: + + >>> z[1:3, 1:3] + array([[11, 12], + [21, 22]]) + >>> z[1:3, :] + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) + >>> z[:, 1:3] + array([[ 1, 2], + [11, 12], + [21, 22], + [31, 32], + [41, 42], + [51, 52], + [61, 62], + [71, 72], + [81, 82], + [91, 92]]) + >>> z[0:5:2, 0:5:2] + array([[ 0, 2, 4], + [20, 22, 24], + [40, 42, 44]]) + >>> z[::2, ::2] + array([[ 0, 2, 4, 6, 8], + [20, 22, 24, 26, 28], + [40, 42, 44, 46, 48], + [60, 62, 64, 66, 68], + [80, 82, 84, 86, 88]]) + + Load the entire array into memory:: + + >>> z[...] + array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], + [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], + [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], + [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], + [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) + + For arrays with a structured dtype, specific fields can be retrieved, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z['foo'] + array([b'aaa', b'bbb', b'ccc'], + dtype='|S3') + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + Currently the implementation for __getitem__ is provided by + :func:`vindex` if the indexing is pure fancy indexing (ie a + broadcast-compatible tuple of integer array indices), or by + :func:`set_basic_selection` otherwise. 
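A minimal sketch of the dispatch described above, assuming zarr 2.x as in this diff (both assertions hold whichever concrete selection method the dispatch picks):

    import numpy as np
    import zarr

    z = zarr.array(np.arange(100).reshape(10, 10))

    # a broadcast-compatible tuple of integer arrays goes through vindex,
    # i.e. coordinate (inner) selection
    fancy = ([1, 4], [1, 4])
    assert np.array_equal(z[fancy], z.get_coordinate_selection(fancy))

    # ints and slices go through basic selection
    basic = (slice(1, 3), slice(1, 3))
    assert np.array_equal(z[basic], z.get_basic_selection(basic))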
+ + Effectively, this means that the following indexing modes are supported: + + - integer indexing + - slice indexing + - mixed slice and integer indexing + - boolean indexing + - fancy indexing (vectorized list of integers) + + For specific indexing options including outer indexing, see the + methods listed under See Also. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __setitem__ + + """ + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, self.ndim): + result = self.vindex[selection] + elif is_pure_orthogonal_indexing(pure_selection, self.ndim): + result = self.get_orthogonal_selection(pure_selection, fields=fields) + else: + result = self.get_basic_selection(pure_selection, fields=fields) + return result + + def get_basic_selection(self, selection=Ellipsis, out=None, fields=None): + """Retrieve data for an item or region of the array. + + Parameters + ---------- + selection : tuple + A tuple specifying the requested item or region for each dimension of the + array. May be any combination of int and/or slice for multidimensional arrays. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested region. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100)) + + Retrieve a single item:: + + >>> z.get_basic_selection(5) + np.int64(5) + + Retrieve a region via slicing:: + + >>> z.get_basic_selection(slice(5)) + array([0, 1, 2, 3, 4]) + >>> z.get_basic_selection(slice(-5, None)) + array([95, 96, 97, 98, 99]) + >>> z.get_basic_selection(slice(5, 10)) + array([5, 6, 7, 8, 9]) + >>> z.get_basic_selection(slice(5, 10, 2)) + array([5, 7, 9]) + >>> z.get_basic_selection(slice(None, None, 2)) + array([ 0, 2, 4, ..., 94, 96, 98]) + + Setup a 2-dimensional array:: + + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + + Retrieve an item:: + + >>> z.get_basic_selection((2, 2)) + np.int64(22) + + Retrieve a region via slicing:: + + >>> z.get_basic_selection((slice(1, 3), slice(1, 3))) + array([[11, 12], + [21, 22]]) + >>> z.get_basic_selection((slice(1, 3), slice(None))) + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) + >>> z.get_basic_selection((slice(None), slice(1, 3))) + array([[ 1, 2], + [11, 12], + [21, 22], + [31, 32], + [41, 42], + [51, 52], + [61, 62], + [71, 72], + [81, 82], + [91, 92]]) + >>> z.get_basic_selection((slice(0, 5, 2), slice(0, 5, 2))) + array([[ 0, 2, 4], + [20, 22, 24], + [40, 42, 44]]) + >>> z.get_basic_selection((slice(None, None, 2), slice(None, None, 2))) + array([[ 0, 2, 4, 6, 8], + [20, 22, 24, 26, 28], + [40, 42, 44, 46, 48], + [60, 62, 64, 66, 68], + [80, 82, 84, 86, 88]]) + + For arrays with a structured dtype, specific fields can be retrieved, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... 
dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z.get_basic_selection(slice(2), fields='foo') + array([b'aaa', b'bbb'], + dtype='|S3') + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + Currently this method provides the implementation for accessing data via the + square bracket notation (__getitem__). See :func:`__getitem__` for examples + using the alternative notation. + + See Also + -------- + set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # check args + check_fields(fields, self._dtype) + + # handle zero-dimensional arrays + if self._shape == (): + return self._get_basic_selection_zd(selection=selection, out=out, fields=fields) + else: + return self._get_basic_selection_nd(selection=selection, out=out, fields=fields) + + def _get_basic_selection_zd(self, selection, out=None, fields=None): + # special case basic selection for zero-dimensional array + + # check selection is valid + selection = ensure_tuple(selection) + if selection not in ((), (Ellipsis,)): + err_too_many_indices(selection, ()) + + try: + # obtain encoded data for chunk + ckey = self._chunk_key((0,)) + cdata = self.chunk_store[ckey] + + except KeyError: + # chunk not initialized + chunk = np.zeros_like(self._meta_array, shape=(), dtype=self._dtype) + if self._fill_value is not None: + chunk.fill(self._fill_value) + + else: + chunk = self._decode_chunk(cdata) + + # handle fields + if fields: + chunk = chunk[fields] + + # handle selection of the scalar value via empty tuple + if out is None: + out = chunk[selection] + else: + out[selection] = chunk[selection] + + return out + + def _get_basic_selection_nd(self, selection, out=None, fields=None): + # implementation of basic selection for array with at least one dimension + + # setup indexer + indexer = BasicIndexer(selection, self) + + return self._get_selection(indexer=indexer, out=out, fields=fields) + + def get_orthogonal_selection(self, selection, out=None, fields=None): + """Retrieve data by making a selection for each dimension of the array. For + example, if an array has 2 dimensions, allows selecting specific rows and/or + columns. The selection for each dimension can be either an integer (indexing a + single item), a slice, an array of integers, or a Boolean array where True + values indicate a selection. + + Parameters + ---------- + selection : tuple + A selection for each dimension of the array. May be any combination of int, + slice, integer array or Boolean array. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. 
+ + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + + Retrieve rows and columns via any combination of int, slice, integer array and/or + Boolean array:: + + >>> z.get_orthogonal_selection(([1, 4], slice(None))) + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) + >>> z.get_orthogonal_selection((slice(None), [1, 4])) + array([[ 1, 4], + [11, 14], + [21, 24], + [31, 34], + [41, 44], + [51, 54], + [61, 64], + [71, 74], + [81, 84], + [91, 94]]) + >>> z.get_orthogonal_selection(([1, 4], [1, 4])) + array([[11, 14], + [41, 44]]) + >>> sel = np.zeros(z.shape[0], dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.get_orthogonal_selection((sel, sel)) + array([[11, 14], + [41, 44]]) + + For convenience, the orthogonal selection functionality is also available via the + `oindex` property, e.g.:: + + >>> z.oindex[[1, 4], :] + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) + >>> z.oindex[:, [1, 4]] + array([[ 1, 4], + [11, 14], + [21, 24], + [31, 34], + [41, 44], + [51, 54], + [61, 64], + [71, 74], + [81, 84], + [91, 94]]) + >>> z.oindex[[1, 4], [1, 4]] + array([[11, 14], + [41, 44]]) + >>> sel = np.zeros(z.shape[0], dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.oindex[sel, sel] + array([[11, 14], + [41, 44]]) + + Notes + ----- + Orthogonal indexing is also known as outer indexing. + + Slices with step > 1 are supported, but slices with negative step are not. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # check args + check_fields(fields, self._dtype) + + # setup indexer + indexer = OrthogonalIndexer(selection, self) + + return self._get_selection(indexer=indexer, out=out, fields=fields) + + def get_coordinate_selection(self, selection, out=None, fields=None): + """Retrieve a selection of individual items, by providing the indices + (coordinates) for each selected item. + + Parameters + ---------- + selection : tuple + An integer (coordinate) array for each dimension of the array. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + + Retrieve items by specifying their coordinates:: + + >>> z.get_coordinate_selection(([1, 4], [1, 4])) + array([11, 44]) + + For convenience, the coordinate selection functionality is also available via the + `vindex` property, e.g.:: + + >>> z.vindex[[1, 4], [1, 4]] + array([11, 44]) + + Notes + ----- + Coordinate indexing is also known as point selection, and is a form of vectorized + or inner indexing. + + Slices are not supported. Coordinate arrays must be provided for all dimensions + of the array. 
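A small sketch of these rules, reusing the 10x10 example array from above and the broadcasting behaviour described in the next paragraph:

    import numpy as np
    import zarr

    z = zarr.array(np.arange(100).reshape(10, 10))

    # coordinates must be supplied for every dimension of the array
    assert np.array_equal(z.get_coordinate_selection(([1, 4], [1, 4])), [11, 44])

    # multidimensional coordinate arrays are broadcast against each other and
    # the result takes the broadcast shape
    rows = np.array([[1], [4]])   # shape (2, 1)
    cols = np.array([[1, 4]])     # shape (1, 2)
    out = z.get_coordinate_selection((rows, cols))
    assert out.shape == (2, 2)
    assert np.array_equal(out, [[11, 14], [41, 44]])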
+ + Coordinate arrays may be multidimensional, in which case the output array will + also be multidimensional. Coordinate arrays are broadcast against each other + before being applied. The shape of the output will be the same as the shape of + each coordinate array after broadcasting. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # check args + check_fields(fields, self._dtype) + + # setup indexer + indexer = CoordinateIndexer(selection, self) + + # handle output - need to flatten + if out is not None: + out = out.reshape(-1) + + out = self._get_selection(indexer=indexer, out=out, fields=fields) + + # restore shape + out = out.reshape(indexer.sel_shape) + + return out + + def get_block_selection(self, selection, out=None, fields=None): + """Retrieve a selection of individual chunk blocks, by providing the indices + (coordinates) for each chunk block. + + Parameters + ---------- + selection : tuple + An integer (coordinate) or slice for each dimension of the array. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10), chunks=(3, 3)) + + Retrieve items by specifying their block coordinates:: + + >>> z.get_block_selection((1, slice(None))) + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + + Which is equivalent to:: + + >>> z[3:6, :] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + + For convenience, the block selection functionality is also available via the + `blocks` property, e.g.:: + + >>> z.blocks[1] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + + Notes + ----- + Block indexing is a convenience indexing method to work on individual chunks + with chunk index slicing. It has the same concept as Dask's `Array.blocks` + indexing. + + Slices are supported. However, only with a step size of one. + + Block index arrays may be multidimensional to index multidimensional arrays. 
+ For example:: + + >>> z.blocks[0, 1:3] + array([[ 3, 4, 5, 6, 7, 8], + [13, 14, 15, 16, 17, 18], + [23, 24, 25, 26, 27, 28]]) + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + if not self._cache_metadata: + self._load_metadata() + + # check args + check_fields(fields, self._dtype) + + # setup indexer + indexer = BlockIndexer(selection, self) + + return self._get_selection(indexer=indexer, out=out, fields=fields) + + def get_mask_selection(self, selection, out=None, fields=None): + """Retrieve a selection of individual items, by providing a Boolean array of the + same shape as the array against which the selection is being made, where True + values indicate a selected item. + + Parameters + ---------- + selection : ndarray, bool + A Boolean array of the same shape as the array against which the selection is + being made. + out : ndarray, optional + If given, load the selected data directly into this array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + + Returns + ------- + out : ndarray + A NumPy array containing the data for the requested selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.array(np.arange(100).reshape(10, 10)) + + Retrieve items by specifying a mask:: + + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1, 1] = True + >>> sel[4, 4] = True + >>> z.get_mask_selection(sel) + array([11, 44]) + + For convenience, the mask selection functionality is also available via the + `vindex` property, e.g.:: + + >>> z.vindex[sel] + array([11, 44]) + + Notes + ----- + Mask indexing is a form of vectorized or inner indexing, and is equivalent to + coordinate indexing. Internally the mask array is converted to coordinate + arrays by calling `np.nonzero`. + + See Also + -------- + get_basic_selection, set_basic_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + """ + + # refresh metadata + if not self._cache_metadata: + self._load_metadata() + + # check args + check_fields(fields, self._dtype) + + # setup indexer + indexer = MaskIndexer(selection, self) + + return self._get_selection(indexer=indexer, out=out, fields=fields) + + def _get_selection(self, indexer, out=None, fields=None): + # We iterate over all chunks which overlap the selection and thus contain data + # that needs to be extracted. Each chunk is processed in turn, extracting the + # necessary data and storing into the correct location in the output array. + + # N.B., it is an important optimisation that we only visit chunks which overlap + # the selection. This minimises the number of iterations in the main for loop. 
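# Illustration: for a (10, 10) array with (5, 5) chunks, the selection
# [2:7, :5] overlaps only chunks (0, 0) and (1, 0), so the indexer yields
# exactly two (chunk_coords, chunk_selection, out_selection) triples and the
# other two chunks are never read.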
+ + # check fields are sensible + out_dtype = check_fields(fields, self._dtype) + + # determine output shape + out_shape = indexer.shape + + # setup output array + if out is None: + out = np.empty_like( + self._meta_array, shape=out_shape, dtype=out_dtype, order=self._order + ) + else: + check_array_shape("out", out, out_shape) + + # iterate over chunks + + if math.prod(out_shape) > 0: + # allow storage to get multiple items at once + lchunk_coords, lchunk_selection, lout_selection = zip(*indexer) + self._chunk_getitems( + lchunk_coords, + lchunk_selection, + out, + lout_selection, + drop_axes=indexer.drop_axes, + fields=fields, + ) + if out.shape: + return out + else: + return out[()] + + def __setitem__(self, selection, value): + """Modify data for an item or region of the array. + + Parameters + ---------- + selection : tuple + An integer index or slice or tuple of int/slice specifying the requested + region for each dimension of the array. + value : scalar or array-like + Value to be stored into the array. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros(100, dtype=int) + + Set all array elements to the same scalar value:: + + >>> z[...] = 42 + >>> z[...] + array([42, 42, 42, ..., 42, 42, 42]) + + Set a portion of the array:: + + >>> z[:10] = np.arange(10) + >>> z[-10:] = np.arange(10)[::-1] + >>> z[...] + array([ 0, 1, 2, ..., 2, 1, 0]) + + Setup a 2-dimensional array:: + + >>> z = zarr.zeros((5, 5), dtype=int) + + Set all array elements to the same scalar value:: + + >>> z[...] = 42 + + Set a portion of the array:: + + >>> z[0, :] = np.arange(z.shape[1]) + >>> z[:, 0] = np.arange(z.shape[0]) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 1, 42, 42, 42, 42], + [ 2, 42, 42, 42, 42], + [ 3, 42, 42, 42, 42], + [ 4, 42, 42, 42, 42]]) + + For arrays with a structured dtype, specific fields can be modified, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z['foo'] = b'zzz' + >>> z[...] + array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'zzz', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')]) + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + Currently the implementation for __setitem__ is provided by + :func:`vindex` if the indexing is pure fancy indexing (ie a + broadcast-compatible tuple of integer array indices), or by + :func:`set_basic_selection` otherwise. + + Effectively, this means that the following indexing modes are supported: + + - integer indexing + - slice indexing + - mixed slice and integer indexing + - boolean indexing + - fancy indexing (vectorized list of integers) + + For specific indexing options including outer indexing, see the + methods listed under See Also.
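Assignment also interacts with the `write_empty_chunks` setting described earlier: when it is False, a chunk left holding only the fill value is deleted from the store instead of being written. A minimal sketch, assuming a zarr 2.x release that accepts `write_empty_chunks` at creation time:

    import zarr

    # assumption: write_empty_chunks is accepted by zarr.zeros/create (zarr >= 2.11)
    z = zarr.zeros(100, chunks=10, dtype=int, write_empty_chunks=False)
    z[:10] = 7
    assert z.nchunks_initialized == 1
    z[:10] = 0                      # back to the fill value -> chunk is removed
    assert z.nchunks_initialized == 0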
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__ + + """ + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, self.ndim): + self.vindex[selection] = value + elif is_pure_orthogonal_indexing(pure_selection, self.ndim): + self.set_orthogonal_selection(pure_selection, value, fields=fields) + else: + self.set_basic_selection(pure_selection, value, fields=fields) + + def set_basic_selection(self, selection, value, fields=None): + """Modify data for an item or region of the array. + + Parameters + ---------- + selection : tuple + An integer index or slice or tuple of int/slice specifying the requested + region for each dimension of the array. + value : scalar or array-like + Value to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.zeros(100, dtype=int) + + Set all array elements to the same scalar value:: + + >>> z.set_basic_selection(..., 42) + >>> z[...] + array([42, 42, 42, ..., 42, 42, 42]) + + Set a portion of the array:: + + >>> z.set_basic_selection(slice(10), np.arange(10)) + >>> z.set_basic_selection(slice(-10, None), np.arange(10)[::-1]) + >>> z[...] + array([ 0, 1, 2, ..., 2, 1, 0]) + + Setup a 2-dimensional array:: + + >>> z = zarr.zeros((5, 5), dtype=int) + + Set all array elements to the same scalar value:: + + >>> z.set_basic_selection(..., 42) + + Set a portion of the array:: + + >>> z.set_basic_selection((0, slice(None)), np.arange(z.shape[1])) + >>> z.set_basic_selection((slice(None), 0), np.arange(z.shape[0])) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 1, 42, 42, 42, 42], + [ 2, 42, 42, 42, 42], + [ 3, 42, 42, 42, 42], + [ 4, 42, 42, 42, 42]]) + + For arrays with a structured dtype, the `fields` parameter can be used to set + data for a specific field, e.g.:: + + >>> a = np.array([(b'aaa', 1, 4.2), + ... (b'bbb', 2, 8.4), + ... (b'ccc', 3, 12.6)], + ... dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + >>> z = zarr.array(a) + >>> z.set_basic_selection(slice(0, 2), b'zzz', fields='foo') + >>> z[:] + array([(b'zzz', 1, 4.2), (b'zzz', 2, 8.4), (b'ccc', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', '>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((5, 5), dtype=int) + + Set data for a selection of rows:: + + >>> z.set_orthogonal_selection(([1, 4], slice(None)), 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [1, 1, 1, 1, 1]]) + + Set data for a selection of columns:: + + >>> z.set_orthogonal_selection((slice(None), [1, 4]), 2) + >>> z[...] + array([[0, 2, 0, 0, 2], + [1, 2, 1, 1, 2], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 2, 1, 1, 2]]) + + Set data for a selection of rows and columns:: + + >>> z.set_orthogonal_selection(([1, 4], [1, 4]), 3) + >>> z[...] + array([[0, 2, 0, 0, 2], + [1, 3, 1, 1, 3], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 3, 1, 1, 3]]) + + For convenience, this functionality is also available via the `oindex` property. + E.g.:: + + >>> z.oindex[[1, 4], [1, 4]] = 4 + >>> z[...] 
+ array([[0, 2, 0, 0, 2], + [1, 4, 1, 1, 4], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 4, 1, 1, 4]]) + + Notes + ----- + Orthogonal indexing is also known as outer indexing. + + Slices with step > 1 are supported, but slices with negative step are not. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + # guard conditions + if self._read_only: + raise ReadOnlyError() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = OrthogonalIndexer(selection, self) + + self._set_selection(indexer, value, fields=fields) + + def set_coordinate_selection(self, selection, value, fields=None): + """Modify a selection of individual items, by providing the indices (coordinates) + for each item to be modified. + + Parameters + ---------- + selection : tuple + An integer (coordinate) array for each dimension of the array. + value : scalar or array-like + Value to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((5, 5), dtype=int) + + Set data for a selection of items:: + + >>> z.set_coordinate_selection(([1, 4], [1, 4]), 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1]]) + + For convenience, this functionality is also available via the `vindex` property. + E.g.:: + + >>> z.vindex[[1, 4], [1, 4]] = 2 + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 2]]) + + Notes + ----- + Coordinate indexing is also known as point selection, and is a form of vectorized + or inner indexing. + + Slices are not supported. Coordinate arrays must be provided for all dimensions + of the array. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + # guard conditions + if self._read_only: + raise ReadOnlyError() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = CoordinateIndexer(selection, self) + + # handle value - need ndarray-like flatten value + if not is_scalar(value, self._dtype): + try: + value = ensure_ndarray_like(value) + except TypeError: + # Handle types like `list` or `tuple` + value = np.array(value, like=self._meta_array) + if hasattr(value, "shape") and len(value.shape) > 1: + value = value.reshape(-1) + + self._set_selection(indexer, value, fields=fields) + + def set_block_selection(self, selection, value, fields=None): + """Modify a selection of individual blocks, by providing the chunk indices + (coordinates) for each block to be modified. + + Parameters + ---------- + selection : tuple + An integer (coordinate) or slice for each dimension of the array. + value : scalar or array-like + Value to be stored into the array. 
+ fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Set up a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((6, 6), dtype=int, chunks=2) + + Set data for a selection of items:: + + >>> z.set_block_selection((1, 0), 1) + >>> z[...] + array([[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]) + + For convenience, this functionality is also available via the `blocks` property. + E.g.:: + + >>> z.blocks[2, 1] = 4 + >>> z[...] + array([[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 4, 4, 0, 0], + [0, 0, 4, 4, 0, 0]]) + + >>> z.blocks[:, 2] = 7 + >>> z[...] + array([[0, 0, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [0, 0, 4, 4, 7, 7], + [0, 0, 4, 4, 7, 7]]) + + Notes + ----- + Block indexing is a convenience indexing method to work on individual chunks + with chunk index slicing. It has the same concept as Dask's `Array.blocks` + indexing. + + Slices are supported. However, only with a step size of one. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + # guard conditions + if self._read_only: + raise ReadOnlyError() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = BlockIndexer(selection, self) + + self._set_selection(indexer, value, fields=fields) + + def set_mask_selection(self, selection, value, fields=None): + """Modify a selection of individual items, by providing a Boolean array of the + same shape as the array against which the selection is being made, where True + values indicate a selected item. + + Parameters + ---------- + selection : ndarray, bool + A Boolean array of the same shape as the array against which the selection is + being made. + value : scalar or array-like + Value to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> z = zarr.zeros((5, 5), dtype=int) + + Set data for a selection of items:: + + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1, 1] = True + >>> sel[4, 4] = True + >>> z.set_mask_selection(sel, 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1]]) + + For convenience, this functionality is also available via the `vindex` property. + E.g.:: + + >>> z.vindex[sel] = 2 + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 2]]) + + Notes + ----- + Mask indexing is a form of vectorized or inner indexing, and is equivalent to + coordinate indexing. Internally the mask array is converted to coordinate + arrays by calling `np.nonzero`. 
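Because the mask is reduced to coordinate arrays with `np.nonzero`, a mask write and the equivalent coordinate write touch exactly the same cells. A small sketch, reusing the 5x5 example above:

    import numpy as np
    import zarr

    z = zarr.zeros((5, 5), dtype=int)
    sel = np.zeros_like(z, dtype=bool)
    sel[1, 1] = True
    sel[4, 4] = True
    z.set_mask_selection(sel, 1)

    z2 = zarr.zeros((5, 5), dtype=int)
    z2.set_coordinate_selection(np.nonzero(sel), 1)   # same effect
    assert np.array_equal(z[...], z2[...])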
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + # guard conditions + if self._read_only: + raise ReadOnlyError() + + # refresh metadata + if not self._cache_metadata: + self._load_metadata_nosync() + + # setup indexer + indexer = MaskIndexer(selection, self) + + self._set_selection(indexer, value, fields=fields) + + def _set_basic_selection_zd(self, selection, value, fields=None): + # special case __setitem__ for zero-dimensional array + + # check selection is valid + selection = ensure_tuple(selection) + if selection not in ((), (Ellipsis,)): + err_too_many_indices(selection, self._shape) + + # check fields + check_fields(fields, self._dtype) + fields = check_no_multi_fields(fields) + + # obtain key for chunk + ckey = self._chunk_key((0,)) + + # setup chunk + try: + # obtain compressed data for chunk + cdata = self.chunk_store[ckey] + + except KeyError: + # chunk not initialized + chunk = np.zeros_like(self._meta_array, shape=(), dtype=self._dtype) + if self._fill_value is not None: + chunk.fill(self._fill_value) + + else: + # decode chunk + chunk = self._decode_chunk(cdata).copy() + + # set value + if fields: + chunk[fields][selection] = value + else: + chunk[selection] = value + + # remove chunk if write_empty_chunks is false and it only contains the fill value + if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk): + try: + del self.chunk_store[ckey] + return + except Exception: # pragma: no cover + # deleting failed, fallback to overwriting + pass + else: + # encode and store + cdata = self._encode_chunk(chunk) + self.chunk_store[ckey] = cdata + + def _set_basic_selection_nd(self, selection, value, fields=None): + # implementation of __setitem__ for array with at least one dimension + + # setup indexer + indexer = BasicIndexer(selection, self) + + self._set_selection(indexer, value, fields=fields) + + def _set_selection(self, indexer, value, fields=None): + # We iterate over all chunks which overlap the selection and thus contain data + # that needs to be replaced. Each chunk is processed in turn, extracting the + # necessary data from the value array and storing into the chunk array. + + # N.B., it is an important optimisation that we only visit chunks which overlap + # the selection. This minimises the number of iterations in the main for loop. 
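# Note on the two write paths below: a store that implements `setitems`
# (for example an fsspec-backed store) and is used without a synchronizer on
# an array with no zero-length dimension takes the batched branch, encoding
# every touched chunk and writing them in a single `chunk_store.setitems`
# call; otherwise chunks are written one at a time via `_chunk_setitem`.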
+ + # check fields are sensible + check_fields(fields, self._dtype) + fields = check_no_multi_fields(fields) + + # determine indices of chunks overlapping the selection + sel_shape = indexer.shape + + # check value shape + if sel_shape == (): + # setting a single item + pass + elif is_scalar(value, self._dtype): + # setting a scalar value + pass + else: + if not hasattr(value, "shape"): + value = np.asanyarray(value, like=self._meta_array) + check_array_shape("value", value, sel_shape) + + # iterate over chunks in range + if ( + not hasattr(self.chunk_store, "setitems") + or self._synchronizer is not None + or any(map(lambda x: x == 0, self.shape)) + ): + # iterative approach + for chunk_coords, chunk_selection, out_selection in indexer: + # extract data to store + if sel_shape == (): + chunk_value = value + elif is_scalar(value, self._dtype): + chunk_value = value + else: + chunk_value = value[out_selection] + # handle missing singleton dimensions + if indexer.drop_axes: + item = [slice(None)] * self.ndim + for a in indexer.drop_axes: + item[a] = np.newaxis + item = tuple(item) + chunk_value = chunk_value[item] + + # put data + self._chunk_setitem(chunk_coords, chunk_selection, chunk_value, fields=fields) + else: + lchunk_coords, lchunk_selection, lout_selection = zip(*indexer) + chunk_values = [] + for out_selection in lout_selection: + if sel_shape == (): + chunk_values.append(value) + elif is_scalar(value, self._dtype): + chunk_values.append(value) + else: + cv = value[out_selection] + # handle missing singleton dimensions + if indexer.drop_axes: # pragma: no cover + item = [slice(None)] * self.ndim + for a in indexer.drop_axes: + item[a] = np.newaxis + item = tuple(item) + cv = chunk_value[item] + chunk_values.append(cv) + + self._chunk_setitems(lchunk_coords, lchunk_selection, chunk_values, fields=fields) + + def _process_chunk( + self, + out, + cdata, + chunk_selection, + drop_axes, + out_is_ndarray, + fields, + out_selection, + partial_read_decode=False, + ): + """Take binary data from storage and fill output array""" + if ( + out_is_ndarray + and not fields + and is_contiguous_selection(out_selection) + and is_total_slice(chunk_selection, self._chunks) + and not self._filters + and self._dtype != object + ): + # For 0D arrays out_selection = () and out[out_selection] is a scalar + # Avoid that + dest = out[out_selection] if out_selection else out + # Assume that array-like objects that doesn't have a + # `writeable` flag is writable. 
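# Example of when this fast path applies: reading a whole, C-ordered chunk
# into a freshly allocated contiguous output array, with no filters, no
# fields and a non-object dtype; the chunk is then decompressed (or copied)
# straight into `dest` and the general decode/select path below is skipped.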
+ dest_is_writable = getattr(dest, "writeable", True) + write_direct = dest_is_writable and ( + (self._order == "C" and dest.flags.c_contiguous) + or (self._order == "F" and dest.flags.f_contiguous) + ) + + if write_direct: + # optimization: we want the whole chunk, and the destination is + # contiguous, so we can decompress directly from the chunk + # into the destination array + if self._compressor: + if isinstance(cdata, PartialReadBuffer): + cdata = cdata.read_full() + self._compressor.decode(cdata, dest) + else: + if isinstance(cdata, UncompressedPartialReadBufferV3): + cdata = cdata.read_full() + chunk = ensure_ndarray_like(cdata).view(self._dtype) + # dest.shape is not self._chunks when a dimensions is squeezed out + # For example, assume self._chunks = (5, 5, 1) + # and the selection is [:, :, 0] + # Then out_selection is (slice(5), slice(5)) + # See https://github.com/zarr-developers/zarr-python/issues/1931 + chunk = chunk.reshape(dest.shape, order=self._order) + np.copyto(dest, chunk) + return + + # decode chunk + try: + if partial_read_decode: + cdata.prepare_chunk() + # size of chunk + tmp = np.empty_like(self._meta_array, shape=self._chunks, dtype=self.dtype) + index_selection = PartialChunkIterator(chunk_selection, self.chunks) + for start, nitems, partial_out_selection in index_selection: + expected_shape = [ + ( + len(range(*partial_out_selection[i].indices(self.chunks[0] + 1))) + if i < len(partial_out_selection) + else dim + ) + for i, dim in enumerate(self.chunks) + ] + if isinstance(cdata, UncompressedPartialReadBufferV3): + chunk_partial = self._decode_chunk( + cdata.read_part(start, nitems), + start=start, + nitems=nitems, + expected_shape=expected_shape, + ) + else: + cdata.read_part(start, nitems) + chunk_partial = self._decode_chunk( + cdata.buff, + start=start, + nitems=nitems, + expected_shape=expected_shape, + ) + tmp[partial_out_selection] = chunk_partial + out[out_selection] = tmp[chunk_selection] + return + except ArrayIndexError: + cdata = cdata.read_full() + chunk = self._decode_chunk(cdata) + + # select data from chunk + if fields: + chunk = chunk[fields] + tmp = chunk[chunk_selection] + if drop_axes: + tmp = np.squeeze(tmp, axis=drop_axes) + + # store selected data in output + out[out_selection] = tmp + + def _chunk_getitems( + self, lchunk_coords, lchunk_selection, out, lout_selection, drop_axes=None, fields=None + ): + """Obtain part or whole of chunks. + + Parameters + ---------- + chunk_coords : list of tuple of ints + Indices of the chunks. + chunk_selection : list of selections + Location of region within the chunks to extract. + out : ndarray + Array to store result in. + out_selection : list of selections + Location of regions within output array to store results in. + drop_axes : tuple of ints + Axes to squeeze out of the chunk. 
+ fields + TODO + """ + + out_is_ndarray = True + try: + out = ensure_ndarray_like(out) + except TypeError: # pragma: no cover + out_is_ndarray = False + + # Keys to retrieve + ckeys = [self._chunk_key(ch) for ch in lchunk_coords] + + # Check if we can do a partial read + if ( + self._partial_decompress + and self._compressor + and self._compressor.codec_id == "blosc" + and hasattr(self._compressor, "decode_partial") + and not fields + and self.dtype != object + and hasattr(self.chunk_store, "getitems") + ): + partial_read_decode = True + cdatas = { + ckey: PartialReadBuffer(ckey, self.chunk_store) + for ckey in ckeys + if ckey in self.chunk_store + } + elif ( + self._partial_decompress + and not self._compressor + and not fields + and self.dtype != object + and hasattr(self.chunk_store, "get_partial_values") + and self.chunk_store.supports_efficient_get_partial_values + ): + partial_read_decode = True + cdatas = { + ckey: UncompressedPartialReadBufferV3( + ckey, self.chunk_store, itemsize=self.itemsize + ) + for ckey in ckeys + if ckey in self.chunk_store + } + elif hasattr(self.chunk_store, "get_partial_values"): + partial_read_decode = False + values = self.chunk_store.get_partial_values([(ckey, (0, None)) for ckey in ckeys]) + cdatas = {key: value for key, value in zip(ckeys, values) if value is not None} + else: + partial_read_decode = False + contexts = {} + if not isinstance(self._meta_array, np.ndarray): + contexts = ConstantMap(ckeys, constant=Context(meta_array=self._meta_array)) + cdatas = self.chunk_store.getitems(ckeys, contexts=contexts) + + for ckey, chunk_select, out_select in zip(ckeys, lchunk_selection, lout_selection): + if ckey in cdatas: + self._process_chunk( + out, + cdatas[ckey], + chunk_select, + drop_axes, + out_is_ndarray, + fields, + out_select, + partial_read_decode=partial_read_decode, + ) + else: + # check exception type + if self._fill_value is not None: + if fields: + fill_value = self._fill_value[fields] + else: + fill_value = self._fill_value + out[out_select] = fill_value + + def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): + ckeys = map(self._chunk_key, lchunk_coords) + cdatas = { + key: self._process_for_setitem(key, sel, val, fields=fields) + for key, sel, val in zip(ckeys, lchunk_selection, values) + } + to_store = {} + if not self.write_empty_chunks: + empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} + self._chunk_delitems(empty_chunks.keys()) + nonempty_keys = cdatas.keys() - empty_chunks.keys() + to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} + else: + to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} + self.chunk_store.setitems(to_store) + + def _chunk_delitems(self, ckeys): + if hasattr(self.store, "delitems"): + self.store.delitems(ckeys) + else: # pragma: no cover + # exempting this branch from coverage as there are no extant stores + # that will trigger this condition, but it's possible that they + # will be developed in the future. + tuple(map(self._chunk_delitem, ckeys)) + + def _chunk_delitem(self, ckey): + """ + Attempt to delete the value associated with ckey. + """ + try: + del self.chunk_store[ckey] + except KeyError: + pass + + def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): + """Replace part or whole of a chunk. + + Parameters + ---------- + chunk_coords : tuple of ints + Indices of the chunk. + chunk_selection : tuple of slices + Location of region within the chunk. 
+ value : scalar or ndarray + Value to set. + + """ + + if self._synchronizer is None: + # no synchronization + lock = nolock + else: + # synchronize on the chunk + ckey = self._chunk_key(chunk_coords) + lock = self._synchronizer[ckey] + + with lock: + self._chunk_setitem_nosync(chunk_coords, chunk_selection, value, fields=fields) + + def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): + ckey = self._chunk_key(chunk_coords) + cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) + + # attempt to delete chunk if it only contains the fill value + if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): + self._chunk_delitem(ckey) + else: + self.chunk_store[ckey] = self._encode_chunk(cdata) + + def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): + if is_total_slice(chunk_selection, self._chunks) and not fields: + # totally replace chunk + + # optimization: we are completely replacing the chunk, so no need + # to access the existing chunk data + + if is_scalar(value, self._dtype): + # setup array filled with value + chunk = np.empty_like( + self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order + ) + chunk.fill(value) + + else: + # ensure array is contiguous + chunk = value.astype(self._dtype, order=self._order, copy=False) + + else: + # partially replace the contents of this chunk + + try: + # obtain compressed data for chunk + cdata = self.chunk_store[ckey] + + except KeyError: + # chunk not initialized + if self._fill_value is not None: + chunk = np.empty_like( + self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order + ) + chunk.fill(self._fill_value) + elif self._dtype == object: + chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) + else: + # N.B., use zeros here so any region beyond the array has consistent + # and compressible data + chunk = np.zeros_like( + self._meta_array, shape=self._chunks, dtype=self._dtype, order=self._order + ) + + else: + # decode chunk + chunk = self._decode_chunk(cdata) + if not chunk.flags.writeable: + chunk = chunk.copy(order="K") + + # modify + if fields: + # N.B., currently multi-field assignment is not supported in numpy, so + # this only works for a single field + chunk[fields][chunk_selection] = value + else: + chunk[chunk_selection] = value + + return chunk + + def _chunk_key(self, chunk_coords): + if self._version == 3: + # _chunk_key() corresponds to data_key(P, i, j, ...) example in the spec + # where P = self._key_prefix, i, j, ... = chunk_coords + # e.g. 
c0/2/3 for 3d array with chunk index (0, 2, 3) + # https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/core/v3.0.html#regular-grids + return ( + "data/root/" + + self._key_prefix + + "c" + + self._dimension_separator.join(map(str, chunk_coords)) + ) + else: + return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) + + def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): + # decompress + if self._compressor: + # only decode requested items + if ( + all(x is not None for x in [start, nitems]) and self._compressor.codec_id == "blosc" + ) and hasattr(self._compressor, "decode_partial"): + chunk = self._compressor.decode_partial(cdata, start, nitems) + else: + chunk = self._compressor.decode(cdata) + else: + chunk = cdata + + # apply filters + if self._filters: + for f in reversed(self._filters): + chunk = f.decode(chunk) + + # view as numpy array with correct dtype + chunk = ensure_ndarray_like(chunk) + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if self._dtype != object: + chunk = chunk.view(self._dtype) + elif chunk.dtype != object: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. + raise RuntimeError("cannot read object array without object codec") + + # ensure correct chunk shape + chunk = chunk.reshape(-1, order="A") + chunk = chunk.reshape(expected_shape or self._chunks, order=self._order) + + return chunk + + def _encode_chunk(self, chunk): + # apply filters + if self._filters: + for f in self._filters: + chunk = f.encode(chunk) + + # check object encoding + if ensure_ndarray_like(chunk).dtype == object: + raise RuntimeError("cannot write object array without object codec") + + # compress + if self._compressor: + cdata = self._compressor.encode(chunk) + else: + cdata = chunk + + # ensure in-memory data is immutable and easy to compare + if isinstance(self.chunk_store, KVStore) or isinstance(self._chunk_store, KVStore): + cdata = ensure_bytes(cdata) + + return cdata + + def __repr__(self): + t = type(self) + r = f"<{t.__module__}.{t.__name__}" + if self.name: + r += f" {self.name!r}" + r += f" {str(self.shape)}" + r += f" {self.dtype}" + if self._read_only: + r += " read-only" + r += ">" + return r + + @property + def info(self): + """Report some diagnostic information about the array. + + Examples + -------- + >>> import zarr + >>> z = zarr.zeros(1000000, chunks=100000, dtype='i4') + >>> z.info + Type : zarr.core.Array + Data type : int32 + Shape : (1000000,) + Chunk shape : (100000,) + Order : C + Read-only : False + Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + Store type : zarr.storage.KVStore + No. bytes : 4000000 (3.8M) + No. 
bytes stored : 320 + Storage ratio : 12500.0 + Chunks initialized : 0/10 + + """ + return InfoReporter(self) + + def info_items(self): + return self._synchronized_op(self._info_items_nosync) + + def _info_items_nosync(self): + def typestr(o): + return f"{type(o).__module__}.{type(o).__name__}" + + def bytestr(n): + if n > 2**10: + return f"{n} ({human_readable_size(n)})" + else: + return str(n) + + items = [] + + # basic info + if self.name is not None: + items += [("Name", self.name)] + items += [ + ("Type", typestr(self)), + ("Data type", str(self.dtype)), + ("Shape", str(self.shape)), + ("Chunk shape", str(self.chunks)), + ("Order", self.order), + ("Read-only", str(self.read_only)), + ] + + # filters + if self.filters: + for i, f in enumerate(self.filters): + items += [(f"Filter [{i}]", repr(f))] + + # compressor + items += [("Compressor", repr(self.compressor))] + + # synchronizer + if self._synchronizer is not None: + items += [("Synchronizer type", typestr(self._synchronizer))] + + # storage info + nbytes = self.nbytes + nbytes_stored = self.nbytes_stored + items += [("Store type", typestr(self._store))] + if self._chunk_store is not None: + items += [("Chunk store type", typestr(self._chunk_store))] + items += [("No. bytes", bytestr(nbytes))] + if nbytes_stored > 0: + items += [ + ("No. bytes stored", bytestr(nbytes_stored)), + ("Storage ratio", f"{nbytes / nbytes_stored:.1f}"), + ] + items += [("Chunks initialized", f"{self.nchunks_initialized}/{self.nchunks}")] + + return items + + def digest(self, hashname="sha1"): + """ + Compute a checksum for the data. Default uses sha1 for speed. + + Examples + -------- + >>> import binascii + >>> import zarr + >>> z = zarr.empty(shape=(10000, 10000), chunks=(1000, 1000)) + >>> binascii.hexlify(z.digest()) + b'041f90bc7a571452af4f850a8ca2c6cddfa8a1ac' + >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) + >>> binascii.hexlify(z.digest()) + b'7162d416d26a68063b66ed1f30e0a866e4abed60' + >>> z = zarr.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000)) + >>> binascii.hexlify(z.digest()) + b'cb387af37410ae5a3222e893cf3373e4e4f22816' + """ + + h = hashlib.new(hashname) + + for i in itertools.product(*[range(s) for s in self.cdata_shape]): + h.update(self.chunk_store.get(self._chunk_key(i), b"")) + + mkey = _prefix_to_array_key(self._store, self._key_prefix) + h.update(self.store.get(mkey, b"")) + + h.update(self.store.get(self.attrs.key, b"")) + + checksum = h.digest() + + return checksum + + def hexdigest(self, hashname="sha1"): + """ + Compute a checksum for the data. Default uses sha1 for speed. + + Examples + -------- + >>> import zarr + >>> z = zarr.empty(shape=(10000, 10000), chunks=(1000, 1000)) + >>> z.hexdigest() + '041f90bc7a571452af4f850a8ca2c6cddfa8a1ac' + >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) + >>> z.hexdigest() + '7162d416d26a68063b66ed1f30e0a866e4abed60' + >>> z = zarr.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000)) + >>> z.hexdigest() + 'cb387af37410ae5a3222e893cf3373e4e4f22816' + """ + + checksum = binascii.hexlify(self.digest(hashname=hashname)) + + # This is a bytes object on Python 3 and we want a str. 
+ if not isinstance(checksum, str): + checksum = checksum.decode("utf8") + + return checksum + + def __getstate__(self): + return { + "store": self._store, + "path": self._path, + "read_only": self._read_only, + "chunk_store": self._chunk_store, + "synchronizer": self._synchronizer, + "cache_metadata": self._cache_metadata, + "cache_attrs": self._attrs.cache, + "partial_decompress": self._partial_decompress, + "write_empty_chunks": self._write_empty_chunks, + "zarr_version": self._version, + "meta_array": self._meta_array, + } + + def __setstate__(self, state): + self.__init__(**state) + + def _synchronized_op(self, f, *args, **kwargs): + if self._synchronizer is None: + # no synchronization + lock = nolock + + else: + # synchronize on the array + mkey = _prefix_to_array_key(self._store, self._key_prefix) + lock = self._synchronizer[mkey] + + with lock: + self._refresh_metadata_nosync() + result = f(*args, **kwargs) + + return result + + def _write_op(self, f, *args, **kwargs): + # guard condition + if self._read_only: + raise ReadOnlyError() + + return self._synchronized_op(f, *args, **kwargs) + + def resize(self, *args): + """Change the shape of the array by growing or shrinking one or more + dimensions. + + Examples + -------- + >>> import zarr + >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000)) + >>> z.shape + (10000, 10000) + >>> z.resize(20000, 10000) + >>> z.shape + (20000, 10000) + >>> z.resize(30000, 1000) + >>> z.shape + (30000, 1000) + + Notes + ----- + When resizing an array, the data are not rearranged in any way. + + If one or more dimensions are shrunk, any chunks falling outside the + new array shape will be deleted from the underlying store. + However, it is noteworthy that the chunks partially falling inside the new array + (i.e. boundary chunks) will remain intact, and therefore, + the data falling outside the new array but inside the boundary chunks + would be restored by a subsequent resize operation that grows the array size. + + """ + + return self._write_op(self._resize_nosync, *args) + + def _resize_nosync(self, *args): + # normalize new shape argument + old_shape = self._shape + new_shape = normalize_resize_args(old_shape, *args) + old_cdata_shape = self._cdata_shape + + # update metadata + self._shape = new_shape + self._flush_metadata_nosync() + + # determine the new number and arrangement of chunks + chunks = self._chunks + new_cdata_shape = tuple(math.ceil(s / c) for s, c in zip(new_shape, chunks)) + + # remove any chunks not within range + # The idea is that, along each dimension, + # only find and remove the chunk slices that exist in 'old' but not 'new' data. + # Note that a mutable list ('old_cdata_shape_working_list') is introduced here + # to dynamically adjust the number of chunks along the already-processed dimensions + # in order to avoid duplicate chunk removal. + chunk_store = self.chunk_store + old_cdata_shape_working_list = list(old_cdata_shape) + for idx_cdata, (val_old_cdata, val_new_cdata) in enumerate( + zip(old_cdata_shape_working_list, new_cdata_shape) + ): + for cidx in itertools.product( + *[ + range(n_new, n_old) if (idx == idx_cdata) else range(n_old) + for idx, (n_old, n_new) in enumerate( + zip(old_cdata_shape_working_list, new_cdata_shape) + ) + ] + ): + key = self._chunk_key(cidx) + try: + del chunk_store[key] + except KeyError: + # chunk not initialized + pass + old_cdata_shape_working_list[idx_cdata] = min(val_old_cdata, val_new_cdata) + + def append(self, data, axis=0): + """Append `data` to `axis`. 
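# Illustrative sketch (not from the patch itself) of the resize semantics described in
# the Notes above: chunks that fall entirely outside the new shape are deleted, while
# boundary chunks are kept, so their data can reappear after a later grow. Assumes only
# the public v2 API; the shapes and chunk sizes are arbitrary examples.
import numpy as np
import zarr

z = zarr.array(np.arange(10000).reshape(100, 100), chunks=(30, 30))
assert z.nchunks == 16                  # 4 x 4 chunk grid

z.resize(50, 50)                        # shrink: chunks fully outside (50, 50) are deleted
assert z.shape == (50, 50)

z.resize(100, 100)                      # grow back to the original shape
# (55, 55) lies in a boundary chunk that straddled the old edge, so its value survives;
# (90, 90) lies in a chunk that was deleted and now reads as the fill value.
print(z[55, 55], z[90, 90])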
+ + Parameters + ---------- + data : array-like + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + new_shape : tuple + + Notes + ----- + The size of all dimensions other than `axis` must match between this + array and `data`. + + Examples + -------- + >>> import numpy as np + >>> import zarr + >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) + >>> z = zarr.array(a, chunks=(1000, 100)) + >>> z.shape + (10000, 1000) + >>> z.append(a) + (20000, 1000) + >>> z.append(np.vstack([a, a]), axis=1) + (20000, 2000) + >>> z.shape + (20000, 2000) + + """ + return self._write_op(self._append_nosync, data, axis=axis) + + def _append_nosync(self, data, axis=0): + # ensure data is array-like + if not hasattr(data, "shape"): + data = np.asanyarray(data, like=self._meta_array) + + # ensure shapes are compatible for non-append dimensions + self_shape_preserved = tuple(s for i, s in enumerate(self._shape) if i != axis) + data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) + if self_shape_preserved != data_shape_preserved: + raise ValueError( + "shape of data to append is not compatible with the array; " + "all dimensions must match except for the dimension being " + "appended" + ) + + # remember old shape + old_shape = self._shape + + # determine new shape + new_shape = tuple( + self._shape[i] if i != axis else self._shape[i] + data.shape[i] + for i in range(len(self._shape)) + ) + + # resize + self._resize_nosync(new_shape) + + # store data + # noinspection PyTypeChecker + append_selection = tuple( + slice(None) if i != axis else slice(old_shape[i], new_shape[i]) + for i in range(len(self._shape)) + ) + self[append_selection] = data + + return new_shape + + def view( + self, + shape=None, + chunks=None, + dtype=None, + fill_value=None, + filters=None, + read_only=None, + synchronizer=None, + ): + """Return an array sharing the same data. + + Parameters + ---------- + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. + dtype : string or dtype, optional + NumPy dtype. + fill_value : object + Default value to use for uninitialized portions of the array. + filters : sequence, optional + Sequence of filters to use to encode chunk data prior to + compression. + read_only : bool, optional + True if array should be protected against modification. + synchronizer : object, optional + Array synchronizer. + + Notes + ----- + WARNING: This is an experimental feature and should be used with care. + There are plenty of ways to generate errors and/or cause data + corruption. + + Examples + -------- + + Bypass filters: + + >>> import zarr + >>> import numpy as np + >>> np.random.seed(42) + >>> labels = ['female', 'male'] + >>> data = np.random.choice(labels, size=10000) + >>> filters = [zarr.Categorize(labels=labels, + ... dtype=data.dtype, + ... 
astype='u1')] + >>> a = zarr.array(data, chunks=1000, filters=filters) + >>> a[:] + array(['female', 'male', 'female', ..., 'male', 'male', 'female'], + shape=(10000,), dtype='>> v = a.view(dtype='u1', filters=[]) + >>> v.is_view + True + >>> v[:] + array([1, 2, 1, ..., 2, 2, 1], shape=(10000,), dtype=uint8) + + Views can be used to modify data: + + >>> x = v[:] + >>> x.sort() + >>> v[:] = x + >>> v[:] + array([1, 1, 1, ..., 2, 2, 2], shape=(10000,), dtype=uint8) + >>> a[:] + array(['female', 'female', 'female', ..., 'male', 'male', 'male'], + shape=(10000,), dtype='>> data = np.random.randint(0, 2, size=10000, dtype='u1') + >>> a = zarr.array(data, chunks=1000) + >>> a[:] + array([0, 0, 1, ..., 1, 0, 0], shape=(10000,), dtype=uint8) + >>> v = a.view(dtype=bool) + >>> v[:] + array([False, False, True, ..., True, False, False], shape=(10000,)) + >>> np.all(a[:].view(dtype=bool) == v[:]) + np.True_ + + An array can be viewed with a dtype with a different item size, however + some care is needed to adjust the shape and chunk shape so that chunk + data is interpreted correctly: + + >>> data = np.arange(10000, dtype='u2') + >>> a = zarr.array(data, chunks=1000) + >>> a[:10] + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint16) + >>> v = a.view(dtype='u1', shape=20000, chunks=2000) + >>> v[:10] + array([0, 0, 1, 0, 2, 0, 3, 0, 4, 0], dtype=uint8) + >>> np.all(a[:].view('u1') == v[:]) + np.True_ + + Change fill value for uninitialized chunks: + + >>> a = zarr.full(10000, chunks=1000, fill_value=-1, dtype='i1') + >>> a[:] + array([-1, -1, -1, ..., -1, -1, -1], shape=(10000,), dtype=int8) + >>> v = a.view(fill_value=42) + >>> v[:] + array([42, 42, 42, ..., 42, 42, 42], shape=(10000,), dtype=int8) + + Note that resizing or appending to views is not permitted: + + >>> a = zarr.empty(10000) + >>> v = a.view() + >>> try: + ... v.resize(20000) + ... except PermissionError as e: + ... print(e) + operation not permitted for views + + """ + + store = self._store + chunk_store = self._chunk_store + path = self._path + if read_only is None: + read_only = self._read_only + if synchronizer is None: + synchronizer = self._synchronizer + a = Array( + store=store, + path=path, + chunk_store=chunk_store, + read_only=read_only, + synchronizer=synchronizer, + cache_metadata=True, + zarr_version=self._version, + ) + a._is_view = True + + # allow override of some properties + if dtype is None: + dtype = self._dtype + else: + dtype = np.dtype(dtype) + a._dtype = dtype + if shape is None: + shape = self._shape + else: + shape = normalize_shape(shape) + a._shape = shape + if chunks is not None: + chunks = normalize_chunks(chunks, shape, dtype.itemsize) + a._chunks = chunks + if fill_value is not None: + a._fill_value = fill_value + if filters is not None: + a._filters = filters + + return a + + def astype(self, dtype): + """Returns a view that does on the fly type conversion of the underlying data. + + Parameters + ---------- + dtype : string or dtype + NumPy dtype. + + Notes + ----- + This method returns a new Array object which is a view on the same + underlying chunk data. Modifying any data via the view is currently + not permitted and will result in an error. This is an experimental + feature and its behavior is subject to change in the future. 
+ + See Also + -------- + Array.view + + Examples + -------- + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(100, dtype=np.uint8) + >>> a = zarr.array(data, chunks=10) + >>> a[:] + array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99], dtype=uint8) + >>> v = a.astype(np.float32) + >>> v.is_view + True + >>> v[:] + array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., + 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., + 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., + 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., + 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., + 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., + 60., 61., 62., 63., 64., 65., 66., 67., 68., 69., + 70., 71., 72., 73., 74., 75., 76., 77., 78., 79., + 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., + 90., 91., 92., 93., 94., 95., 96., 97., 98., 99.], + dtype=float32) + """ + + dtype = np.dtype(dtype) + + filters = [] + if self._filters: + filters.extend(self._filters) + filters.insert(0, AsType(encode_dtype=self._dtype, decode_dtype=dtype)) + + return self.view(filters=filters, dtype=dtype, read_only=True) diff --git a/src/zarr/v2/creation.py b/src/zarr/v2/creation.py new file mode 100644 index 0000000000..e54fb408f8 --- /dev/null +++ b/src/zarr/v2/creation.py @@ -0,0 +1,751 @@ +from collections.abc import MutableMapping +from typing import Optional, Tuple, Union, Sequence +from warnings import warn + +import numpy as np +import numpy.typing as npt +from numcodecs.abc import Codec +from numcodecs.registry import codec_registry + +from zarr._storage.store import DEFAULT_ZARR_VERSION +from zarr.core import Array +from zarr.errors import ( + ArrayNotFoundError, + ContainsArrayError, + ContainsGroupError, +) +from zarr.storage import ( + contains_array, + contains_group, + default_compressor, + init_array, + normalize_storage_path, + normalize_store_arg, +) +from zarr._storage.store import StorageTransformer +from zarr.sync import Synchronizer +from zarr.types import ZARR_VERSION, DIMENSION_SEPARATOR, MEMORY_ORDER, MetaArray, PathLike +from zarr.util import normalize_dimension_separator + + +def create( + shape: Union[int, Tuple[int, ...]], + chunks: Union[int, Tuple[int, ...], bool] = True, + dtype: Optional[npt.DTypeLike] = None, + compressor="default", + fill_value: Optional[int] = 0, + order: MEMORY_ORDER = "C", + store: Optional[Union[str, MutableMapping]] = None, + synchronizer: Optional[Synchronizer] = None, + overwrite: bool = False, + path: Optional[PathLike] = None, + chunk_store: Optional[MutableMapping] = None, + filters: Optional[Sequence[Codec]] = None, + cache_metadata: bool = True, + cache_attrs: bool = True, + read_only: bool = False, + object_codec: Optional[Codec] = None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + write_empty_chunks: bool = True, + *, + zarr_version: Optional[ZARR_VERSION] = None, + meta_array: Optional[MetaArray] = None, + storage_transformers: Sequence[StorageTransformer] = (), + **kwargs, +): + """Create an array. + + Parameters + ---------- + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. If True, will be guessed from `shape` and `dtype`. 
If + False, will be set to `shape`, i.e., single chunk for the whole array. + If an int, the chunk size in each dimension will be given by the value + of `chunks`. Default is True. + dtype : string or dtype, optional + NumPy dtype. + compressor : Codec, optional + Primary compressor. + fill_value : object + Default value to use for uninitialized portions of the array. + order : {'C', 'F'}, optional + Memory layout to be used within each chunk. + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + synchronizer : object, optional + Array synchronizer. + overwrite : bool, optional + If True, delete all pre-existing data in `store` at `path` before + creating the array. + path : string, optional + Path under which array is stored. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + filters : sequence of Codecs, optional + Sequence of filters to use to encode chunk data prior to compression. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + read_only : bool, optional + True if array should be protected against modification. + object_codec : Codec, optional + A codec to encode object arrays, only needed if dtype=object. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + .. versionadded:: 2.8 + + write_empty_chunks : bool, optional + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill value + prior to storing. If a chunk is uniformly equal to the fill value, then + that chunk is not be stored, and the store entry for that chunk's key + is deleted. This setting enables sparser storage, as only chunks with + non-fill-value data are stored, at the expense of overhead associated + with checking the data of each chunk. + + .. versionadded:: 2.11 + + storage_transformers : sequence of StorageTransformers, optional + Setting storage transformers, changes the storage structure and behaviour + of data coming from the underlying store. The transformers are applied in the + order of the given sequence. Supplying an empty sequence is the same as omitting + the argument or setting it to None. May only be set when using zarr_version 3. + + .. versionadded:: 2.13 + + zarr_version : {None, 2, 3}, optional + The zarr protocol version of the created array. If None, it will be + inferred from ``store`` or ``chunk_store`` if they are provided, + otherwise defaulting to 2. + + .. versionadded:: 2.12 + + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. 
versionadded:: 2.13 + + Returns + ------- + z : zarr.core.Array + + Examples + -------- + + Create an array with default settings:: + + >>> import zarr + >>> z = zarr.create((10000, 10000), chunks=(1000, 1000)) + >>> z + + + Create an array with different some different configuration options:: + + >>> from numcodecs import Blosc + >>> compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.BITSHUFFLE) + >>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype='i1', order='F', + ... compressor=compressor) + >>> z + + + To create an array with object dtype requires a filter that can handle Python object + encoding, e.g., `MsgPack` or `Pickle` from `numcodecs`:: + + >>> from numcodecs import MsgPack + >>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype=object, + ... object_codec=MsgPack()) + >>> z + + + Example with some filters, and also storing chunks separately from metadata:: + + >>> from numcodecs import Quantize, Adler32 + >>> store, chunk_store = dict(), dict() + >>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype='f8', + ... filters=[Quantize(digits=2, dtype='f8'), Adler32()], + ... store=store, chunk_store=chunk_store) + >>> z + + + """ + if zarr_version is None and store is None: + zarr_version = getattr(chunk_store, "_store_version", DEFAULT_ZARR_VERSION) + + # handle polymorphic store arg + store = normalize_store_arg(store, zarr_version=zarr_version, mode="w") + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + + # API compatibility with h5py + compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs) + + # optional array metadata + if dimension_separator is None: + dimension_separator = getattr(store, "_dimension_separator", None) + else: + store_separator = getattr(store, "_dimension_separator", None) + if store_separator not in (None, dimension_separator): + raise ValueError( + f"Specified dimension_separator: {dimension_separator}" + f"conflicts with store's separator: " + f"{store_separator}" + ) + dimension_separator = normalize_dimension_separator(dimension_separator) + + if zarr_version > 2 and path is None: + path = "/" + + # initialize array metadata + init_array( + store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + order=order, + overwrite=overwrite, + path=path, + chunk_store=chunk_store, + filters=filters, + object_codec=object_codec, + dimension_separator=dimension_separator, + storage_transformers=storage_transformers, + ) + + # instantiate array + z = Array( + store, + path=path, + chunk_store=chunk_store, + synchronizer=synchronizer, + cache_metadata=cache_metadata, + cache_attrs=cache_attrs, + read_only=read_only, + write_empty_chunks=write_empty_chunks, + meta_array=meta_array, + ) + + return z + + +def _kwargs_compat(compressor, fill_value, kwargs): + # to be compatible with h5py, as well as backwards-compatible with Zarr + # 1.x, accept 'compression' and 'compression_opts' keyword arguments + + if compressor != "default": + # 'compressor' overrides 'compression' + if "compression" in kwargs: + warn( + "'compression' keyword argument overridden by 'compressor'", + stacklevel=3, + ) + del kwargs["compression"] + if "compression_opts" in kwargs: + warn( + "'compression_opts' keyword argument overridden by 'compressor'", + stacklevel=3, + ) + del kwargs["compression_opts"] + + elif "compression" in kwargs: + compression = kwargs.pop("compression") + compression_opts = kwargs.pop("compression_opts", None) + + if compression is None or compression 
== "none": + compressor = None + + elif compression == "default": + compressor = default_compressor + + elif isinstance(compression, str): + codec_cls = codec_registry[compression] + + # handle compression_opts + if isinstance(compression_opts, dict): + compressor = codec_cls(**compression_opts) + elif isinstance(compression_opts, (list, tuple)): + compressor = codec_cls(*compression_opts) + elif compression_opts is None: + compressor = codec_cls() + else: + # assume single argument, e.g., int + compressor = codec_cls(compression_opts) + + # be lenient here if user gives compressor as 'compression' + elif hasattr(compression, "get_config"): + compressor = compression + + else: + raise ValueError(f"bad value for compression: {compression!r}") + + # handle 'fillvalue' + if "fillvalue" in kwargs: + # to be compatible with h5py, accept 'fillvalue' instead of + # 'fill_value' + fill_value = kwargs.pop("fillvalue") + + # ignore other keyword arguments + for k in kwargs: + warn(f"ignoring keyword argument {k!r}", stacklevel=2) + + return compressor, fill_value + + +def empty(shape, **kwargs): + """Create an empty array. + + For parameter definitions see :func:`zarr.creation.create`. + + Notes + ----- + The contents of an empty Zarr array are not defined. On attempting to + retrieve data from an empty Zarr array, any values may be returned, + and these are not guaranteed to be stable from one access to the next. + + """ + return create(shape=shape, fill_value=None, **kwargs) + + +def zeros(shape, **kwargs): + """Create an array, with zero being used as the default value for + uninitialized portions of the array. + + For parameter definitions see :func:`zarr.creation.create`. + + Examples + -------- + >>> import zarr + >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000)) + >>> z + + >>> z[:2, :2] + array([[0., 0.], + [0., 0.]]) + + """ + + return create(shape=shape, fill_value=0, **kwargs) + + +def ones(shape, **kwargs): + """Create an array, with one being used as the default value for + uninitialized portions of the array. + + For parameter definitions see :func:`zarr.creation.create`. + + Examples + -------- + >>> import zarr + >>> z = zarr.ones((10000, 10000), chunks=(1000, 1000)) + >>> z + + >>> z[:2, :2] + array([[1., 1.], + [1., 1.]]) + + """ + + return create(shape=shape, fill_value=1, **kwargs) + + +def full(shape, fill_value, **kwargs): + """Create an array, with `fill_value` being used as the default value for + uninitialized portions of the array. + + For parameter definitions see :func:`zarr.creation.create`. + + Examples + -------- + >>> import zarr + >>> z = zarr.full((10000, 10000), chunks=(1000, 1000), fill_value=42) + >>> z + + >>> z[:2, :2] + array([[42., 42.], + [42., 42.]]) + + """ + + return create(shape=shape, fill_value=fill_value, **kwargs) + + +def _get_shape_chunks(a): + shape = None + chunks = None + + if hasattr(a, "shape") and isinstance(a.shape, tuple): + shape = a.shape + + if hasattr(a, "chunks") and isinstance(a.chunks, tuple) and (len(a.chunks) == len(a.shape)): + chunks = a.chunks + + elif hasattr(a, "chunklen"): + # bcolz carray + chunks = (a.chunklen,) + a.shape[1:] + + return shape, chunks + + +def array(data, **kwargs): + """Create an array filled with `data`. + + The `data` argument should be a NumPy array or array-like object. For + other parameter definitions see :func:`zarr.creation.create`. 
+ + Examples + -------- + >>> import numpy as np + >>> import zarr + >>> a = np.arange(100000000).reshape(10000, 10000) + >>> z = zarr.array(a, chunks=(1000, 1000)) + >>> z + + + """ + + # ensure data is array-like + if not hasattr(data, "shape") or not hasattr(data, "dtype"): + data = np.asanyarray(data) + + # setup dtype + kw_dtype = kwargs.get("dtype") + if kw_dtype is None: + kwargs["dtype"] = data.dtype + else: + kwargs["dtype"] = kw_dtype + + # setup shape and chunks + data_shape, data_chunks = _get_shape_chunks(data) + kwargs["shape"] = data_shape + kw_chunks = kwargs.get("chunks") + if kw_chunks is None: + kwargs["chunks"] = data_chunks + else: + kwargs["chunks"] = kw_chunks + + # pop read-only to apply after storing the data + read_only = kwargs.pop("read_only", False) + + # instantiate array + z = create(**kwargs) + + # fill with data + z[...] = data + + # set read_only property afterwards + z.read_only = read_only + + return z + + +def open_array( + store=None, + mode="a", + shape=None, + chunks=True, + dtype=None, + compressor="default", + fill_value=0, + order="C", + synchronizer=None, + filters=None, + cache_metadata=True, + cache_attrs=True, + path=None, + object_codec=None, + chunk_store=None, + storage_options=None, + partial_decompress=None, + write_empty_chunks=True, + *, + zarr_version=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + meta_array=None, + **kwargs, +): + """Open an array using file-mode-like semantics. + + Parameters + ---------- + store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. + mode : {'r', 'r+', 'a', 'w', 'w-'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist); 'a' means read/write (create if doesn't + exist); 'w' means create (overwrite if exists); 'w-' means create + (fail if exists). + shape : int or tuple of ints, optional + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. If True, will be guessed from `shape` and `dtype`. If + False, will be set to `shape`, i.e., single chunk for the whole array. + If an int, the chunk size in each dimension will be given by the value + of `chunks`. Default is True. + dtype : string or dtype, optional + NumPy dtype. + compressor : Codec, optional + Primary compressor. + fill_value : object, optional + Default value to use for uninitialized portions of the array. + order : {'C', 'F'}, optional + Memory layout to be used within each chunk. + synchronizer : object, optional + Array synchronizer. + filters : sequence, optional + Sequence of filters to use to encode chunk data prior to compression. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + path : string, optional + Array path within store. + object_codec : Codec, optional + A codec to encode object arrays, only needed if dtype=object. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
+ storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + write_empty_chunks : bool, optional + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill value + prior to storing. If a chunk is uniformly equal to the fill value, then + that chunk is not be stored, and the store entry for that chunk's key + is deleted. This setting enables sparser storage, as only chunks with + non-fill-value data are stored, at the expense of overhead associated + with checking the data of each chunk. + + .. versionadded:: 2.11 + + zarr_version : {None, 2, 3}, optional + The zarr protocol version of the array to be opened. If None, it will + be inferred from ``store`` or ``chunk_store`` if they are provided, + otherwise defaulting to 2. + dimension_separator : {None, '.', '/'}, optional + Can be used to specify whether the array is in a flat ('.') or nested + ('/') format. If None, the appropriate value will be read from `store` + when present. Otherwise, defaults to '.' when ``zarr_version == 2`` + and `/` otherwise. + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. versionadded:: 2.15 + + Returns + ------- + z : zarr.core.Array + + Examples + -------- + >>> import numpy as np + >>> import zarr + >>> z1 = zarr.open_array('data/example.zarr', mode='w', shape=(10000, 10000), + ... chunks=(1000, 1000), fill_value=0) + >>> z1[:] = np.arange(100000000).reshape(10000, 10000) + >>> z1 + + >>> z2 = zarr.open_array('data/example.zarr', mode='r') + >>> z2 + + >>> np.all(z1[:] == z2[:]) + np.True_ + + Notes + ----- + There is no need to close an array. Data are automatically flushed to the + file system. + + """ + + # use same mode semantics as h5py + # r : read only, must exist + # r+ : read/write, must exist + # w : create, delete if exists + # w- or x : create, fail if exists + # a : read/write if exists, create otherwise (default) + + if zarr_version is None and store is None: + zarr_version = getattr(chunk_store, "_store_version", DEFAULT_ZARR_VERSION) + + # handle polymorphic store arg + store = normalize_store_arg( + store, storage_options=storage_options, mode=mode, zarr_version=zarr_version + ) + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + if chunk_store is not None: + chunk_store = normalize_store_arg( + chunk_store, storage_options=storage_options, mode=mode, zarr_version=zarr_version + ) + + # respect the dimension separator specified in a store, if present + if dimension_separator is None: + if hasattr(store, "_dimension_separator"): + dimension_separator = store._dimension_separator + else: + dimension_separator = "." if zarr_version == 2 else "/" + + if zarr_version == 3 and path is None: + path = "array" # TODO: raise ValueError instead? 
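# Illustrative sketch (not from the patch itself) of the file-mode semantics documented
# for open_array() above; the store path 'data/modes.zarr' is only an example.
import numpy as np
import zarr

z1 = zarr.open_array("data/modes.zarr", mode="w", shape=(100, 100), chunks=(10, 10), dtype="i4")
z1[:] = np.arange(10000).reshape(100, 100)

z2 = zarr.open_array("data/modes.zarr", mode="r")       # read only, must already exist
assert z2.read_only
assert (z2[:5, :5] == z1[:5, :5]).all()

try:
    zarr.open_array("data/modes.zarr", mode="w-")        # 'w-' / 'x': fail if it already exists
except zarr.errors.ContainsArrayError as e:
    print(e)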
+ + path = normalize_storage_path(path) + + # API compatibility with h5py + compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs) + + # ensure fill_value of correct type + if fill_value is not None: + fill_value = np.array(fill_value, dtype=dtype)[()] + + # ensure store is initialized + + if mode in ["r", "r+"]: + if not contains_array(store, path=path): + if contains_group(store, path=path): + raise ContainsGroupError(path) + raise ArrayNotFoundError(path) + + elif mode == "w": + init_array( + store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + overwrite=True, + path=path, + object_codec=object_codec, + chunk_store=chunk_store, + dimension_separator=dimension_separator, + ) + + elif mode == "a": + if not contains_array(store, path=path): + if contains_group(store, path=path): + raise ContainsGroupError(path) + init_array( + store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + path=path, + object_codec=object_codec, + chunk_store=chunk_store, + dimension_separator=dimension_separator, + ) + + elif mode in ["w-", "x"]: + if contains_group(store, path=path): + raise ContainsGroupError(path) + elif contains_array(store, path=path): + raise ContainsArrayError(path) + else: + init_array( + store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + path=path, + object_codec=object_codec, + chunk_store=chunk_store, + dimension_separator=dimension_separator, + ) + + # determine read only status + read_only = mode == "r" + + # instantiate array + z = Array( + store, + read_only=read_only, + synchronizer=synchronizer, + cache_metadata=cache_metadata, + cache_attrs=cache_attrs, + path=path, + chunk_store=chunk_store, + write_empty_chunks=write_empty_chunks, + meta_array=meta_array, + ) + + return z + + +def _like_args(a, kwargs): + shape, chunks = _get_shape_chunks(a) + if shape is not None: + kwargs.setdefault("shape", shape) + if chunks is not None: + kwargs.setdefault("chunks", chunks) + + if hasattr(a, "dtype"): + kwargs.setdefault("dtype", a.dtype) + + if isinstance(a, Array): + kwargs.setdefault("compressor", a.compressor) + kwargs.setdefault("order", a.order) + kwargs.setdefault("filters", a.filters) + kwargs.setdefault("zarr_version", a._version) + else: + kwargs.setdefault("compressor", "default") + kwargs.setdefault("order", "C") + + +def empty_like(a, **kwargs): + """Create an empty array like `a`.""" + _like_args(a, kwargs) + return empty(**kwargs) + + +def zeros_like(a, **kwargs): + """Create an array of zeros like `a`.""" + _like_args(a, kwargs) + return zeros(**kwargs) + + +def ones_like(a, **kwargs): + """Create an array of ones like `a`.""" + _like_args(a, kwargs) + return ones(**kwargs) + + +def full_like(a, **kwargs): + """Create a filled array like `a`.""" + _like_args(a, kwargs) + if isinstance(a, Array): + kwargs.setdefault("fill_value", a.fill_value) + return full(**kwargs) + + +def open_like(a, path, **kwargs): + """Open a persistent array like `a`.""" + _like_args(a, kwargs) + if isinstance(a, Array): + kwargs.setdefault("fill_value", a.fill_value) + return open_array(path, **kwargs) diff --git a/src/zarr/v2/errors.py b/src/zarr/v2/errors.py new file mode 100644 index 0000000000..85789fbcbf --- /dev/null +++ b/src/zarr/v2/errors.py @@ -0,0 +1,78 @@ +class MetadataError(Exception): + pass + + +class 
CopyError(RuntimeError):
+    pass
+
+
+class _BaseZarrError(ValueError):
+    _msg = ""
+
+    def __init__(self, *args):
+        super().__init__(self._msg.format(*args))
+
+
+class ArrayIndexError(IndexError):
+    pass
+
+
+class _BaseZarrIndexError(IndexError):
+    _msg = ""
+
+    def __init__(self, *args):
+        super().__init__(self._msg.format(*args))
+
+
+class ContainsGroupError(_BaseZarrError):
+    _msg = "path {0!r} contains a group"
+
+
+class ContainsArrayError(_BaseZarrError):
+    _msg = "path {0!r} contains an array"
+
+
+class ArrayNotFoundError(_BaseZarrError):
+    _msg = "array not found at path {0!r}"
+
+
+class GroupNotFoundError(_BaseZarrError):
+    _msg = "group not found at path {0!r}"
+
+
+class PathNotFoundError(_BaseZarrError):
+    _msg = "nothing found at path {0!r}"
+
+
+class BadCompressorError(_BaseZarrError):
+    _msg = "bad compressor; expected Codec object, found {0!r}"
+
+
+class FSPathExistNotDir(GroupNotFoundError):
+    _msg = "path exists but is not a directory: {0!r}"
+
+
+class ReadOnlyError(PermissionError):
+    def __init__(self):
+        super().__init__("object is read-only")
+
+
+class BoundsCheckError(_BaseZarrIndexError):
+    _msg = "index out of bounds for dimension with length {0}"
+
+
+class NegativeStepError(IndexError):
+    def __init__(self):
+        super().__init__("only slices with step >= 1 are supported")
+
+
+def err_too_many_indices(selection, shape):
+    raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}")
+
+
+class VindexInvalidSelectionError(_BaseZarrIndexError):
+    _msg = (
+        "unsupported selection type for vectorized indexing; only "
+        "coordinate selection (tuple of integer arrays) and mask selection "
+        "(single Boolean array) are supported; got {0!r}"
+    )
diff --git a/src/zarr/v2/hierarchy.py b/src/zarr/v2/hierarchy.py
new file mode 100644
index 0000000000..8894a5ed57
--- /dev/null
+++ b/src/zarr/v2/hierarchy.py
@@ -0,0 +1,1609 @@
+from collections.abc import MutableMapping
+from itertools import islice
+
+import numpy as np
+
+from zarr._storage.store import (
+    _get_metadata_suffix,
+    data_root,
+    meta_root,
+    DEFAULT_ZARR_VERSION,
+    assert_zarr_v3_api_available,
+)
+from zarr.attrs import Attributes
+from zarr.core import Array
+from zarr.creation import (
+    array,
+    create,
+    empty,
+    empty_like,
+    full,
+    full_like,
+    ones,
+    ones_like,
+    zeros,
+    zeros_like,
+)
+from zarr.errors import (
+    ContainsArrayError,
+    ContainsGroupError,
+    ArrayNotFoundError,
+    GroupNotFoundError,
+    ReadOnlyError,
+)
+from zarr.storage import (
+    _get_hierarchy_metadata,
+    _prefix_to_group_key,
+    BaseStore,
+    MemoryStore,
+    attrs_key,
+    contains_array,
+    contains_group,
+    group_meta_key,
+    init_group,
+    listdir,
+    normalize_store_arg,
+    rename,
+    rmdir,
+)
+from zarr._storage.v3 import MemoryStoreV3
+from zarr.util import (
+    InfoReporter,
+    TreeViewer,
+    is_valid_python_name,
+    nolock,
+    normalize_shape,
+    normalize_storage_path,
+)
+
+
+class Group(MutableMapping):
+    """Instantiate a group from an initialized store.
+
+    Parameters
+    ----------
+    store : MutableMapping
+        Group store, already initialized.
+        If the Group is used in a context manager, and the store has a ``close`` method,
+        it will be called on exit.
+    path : string, optional
+        Group path.
+    read_only : bool, optional
+        True if group should be protected against modification.
+    chunk_store : MutableMapping, optional
+        Separate storage for chunks. If not provided, `store` will be used
+        for storage of both chunks and metadata.
+ cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + synchronizer : object, optional + Array synchronizer. + + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. versionadded:: 2.13 + + Attributes + ---------- + store + path + name + read_only + chunk_store + synchronizer + attrs + info + meta_array + + Methods + ------- + __len__ + __iter__ + __contains__ + __getitem__ + __enter__ + __exit__ + group_keys + groups + array_keys + arrays + visit + visitkeys + visitvalues + visititems + tree + create_group + require_group + create_groups + require_groups + create_dataset + require_dataset + create + empty + zeros + ones + full + array + empty_like + zeros_like + ones_like + full_like + info + move + + """ + + def __init__( + self, + store, + path=None, + read_only=False, + chunk_store=None, + cache_attrs=True, + synchronizer=None, + zarr_version=None, + *, + meta_array=None, + ): + store: BaseStore = _normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + + if zarr_version != 2: + assert_zarr_v3_api_available() + + if chunk_store is not None: + chunk_store: BaseStore = _normalize_store_arg(chunk_store, zarr_version=zarr_version) + self._store = store + self._chunk_store = chunk_store + self._path = normalize_storage_path(path) + if self._path: + self._key_prefix = self._path + "/" + else: + self._key_prefix = "" + self._read_only = read_only + self._synchronizer = synchronizer + if meta_array is not None: + self._meta_array = np.empty_like(meta_array, shape=()) + else: + self._meta_array = np.empty(()) + self._version = zarr_version + if self._version == 3: + self._data_key_prefix = data_root + self._key_prefix + self._data_path = data_root + self._path + self._hierarchy_metadata = _get_hierarchy_metadata(store=self._store) + self._metadata_key_suffix = _get_metadata_suffix(store=self._store) + + # guard conditions + if contains_array(store, path=self._path): + raise ContainsArrayError(path) + + # initialize metadata + mkey = None + try: + mkey = _prefix_to_group_key(self._store, self._key_prefix) + assert not mkey.endswith("root/.group") + meta_bytes = store[mkey] + except KeyError as e: + if self._version == 2: + raise GroupNotFoundError(path) from e + else: + implicit_prefix = meta_root + self._key_prefix + if self._store.list_prefix(implicit_prefix): + # implicit group does not have any metadata + self._meta = None + else: + raise GroupNotFoundError(path) from e + else: + self._meta = self._store._metadata_class.decode_group_metadata(meta_bytes) + + # setup attributes + if self._version == 2: + akey = self._key_prefix + attrs_key + else: + # Note: mkey doesn't actually exist for implicit groups, but the + # object can still be created. 
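# Illustrative sketch (not from the patch itself) of the initialization contract enforced
# here: Group() requires group metadata to already exist in the store and raises
# GroupNotFoundError otherwise; init_group() writes that metadata. Only names imported at
# the top of this module are used, assuming the flat zarr.* import paths used throughout
# this patch.
from zarr.errors import GroupNotFoundError
from zarr.hierarchy import Group
from zarr.storage import MemoryStore, init_group

store = MemoryStore()
try:
    Group(store)                        # no group metadata in the store yet
except GroupNotFoundError as e:
    print(e)

init_group(store)                       # write the group metadata at the root
g = Group(store)
print(g)                                # <zarr.hierarchy.Group '/'>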
+ akey = mkey + self._attrs = Attributes( + store, + key=akey, + read_only=read_only, + cache=cache_attrs, + synchronizer=synchronizer, + cached_dict=self._meta["attributes"] if self._version == 3 and self._meta else None, + ) + + # setup info + + @property + def store(self): + """A MutableMapping providing the underlying storage for the group.""" + return self._store + + @property + def path(self): + """Storage path.""" + return self._path + + @property + def name(self): + """Group name following h5py convention.""" + if self._path: + # follow h5py convention: add leading slash + name = self._path + if name[0] != "/": + name = "/" + name + return name + return "/" + + @property + def basename(self): + """Final component of name.""" + return self.name.split("/")[-1] + + @property + def read_only(self): + """A boolean, True if modification operations are not permitted.""" + return self._read_only + + @property + def chunk_store(self): + """A MutableMapping providing the underlying storage for array chunks.""" + if self._chunk_store is None: + return self._store + else: + return self._chunk_store + + @property + def synchronizer(self): + """Object used to synchronize write access to groups and arrays.""" + return self._synchronizer + + @property + def attrs(self): + """A MutableMapping containing user-defined attributes. Note that + attribute values must be JSON serializable.""" + return self._attrs + + @property + def info(self): + """Return diagnostic information about the group.""" + return InfoReporter(self) + + @property + def meta_array(self): + """An array-like instance to use for determining arrays to create and return + to users. + """ + return self._meta_array + + def __eq__(self, other): + return ( + isinstance(other, Group) + and self._store == other.store + and self._read_only == other.read_only + and self._path == other.path + # N.B., no need to compare attributes, should be covered by + # store comparison + ) + + def __iter__(self): + """Return an iterator over group member names. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> for name in g1: + ... 
print(name) + bar + baz + foo + quux + + """ + if getattr(self._store, "_store_version", 2) == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_array(self._store, path) or contains_group(self._store, path): + yield key + else: + # TODO: Should this iterate over data folders and/or metadata + # folders and/or metadata files + + dir_path = meta_root + self._key_prefix + name_start = len(dir_path) + keys, prefixes = self._store.list_dir(dir_path) + + # yield any groups or arrays + sfx = self._metadata_key_suffix + for key in keys: + len_suffix = len(".group") + len(sfx) # same for .array + if key.endswith((".group" + sfx, ".array" + sfx)): + yield key[name_start:-len_suffix] + + # also yield any implicit groups + for prefix in prefixes: + prefix = prefix.rstrip("/") + # only implicit if there is no .group.sfx file + if prefix + ".group" + sfx not in self._store: + yield prefix[name_start:] + + # Note: omit data/root/ to avoid duplicate listings + # any group in data/root/ must has an entry in meta/root/ + + def __len__(self): + """Number of members.""" + return sum(1 for _ in self) + + def __repr__(self): + t = type(self) + r = f"<{t.__module__}.{t.__name__}" + if self.name: + r += f" {self.name!r}" + if self._read_only: + r += " read-only" + r += ">" + return r + + def __enter__(self): + """Return the Group for use as a context manager.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Call the close method of the underlying Store.""" + self.store.close() + + def info_items(self): + def typestr(o): + return f"{type(o).__module__}.{type(o).__name__}" + + items = [] + + # basic info + if self.name is not None: + items += [("Name", self.name)] + items += [ + ("Type", typestr(self)), + ("Read-only", str(self.read_only)), + ] + + # synchronizer + if self._synchronizer is not None: + items += [("Synchronizer type", typestr(self._synchronizer))] + + # storage info + items += [("Store type", typestr(self._store))] + if self._chunk_store is not None: + items += [("Chunk store type", typestr(self._chunk_store))] + + # members + items += [("No. members", len(self))] + array_keys = sorted(self.array_keys()) + group_keys = sorted(self.group_keys()) + items += [("No. arrays", len(array_keys))] + items += [("No. groups", len(group_keys))] + if array_keys: + items += [("Arrays", ", ".join(array_keys))] + if group_keys: + items += [("Groups", ", ".join(group_keys))] + + return items + + def __getstate__(self): + return { + "store": self._store, + "path": self._path, + "read_only": self._read_only, + "chunk_store": self._chunk_store, + "cache_attrs": self._attrs.cache, + "synchronizer": self._synchronizer, + "zarr_version": self._version, + "meta_array": self._meta_array, + } + + def __setstate__(self, state): + self.__init__(**state) + + def _item_path(self, item): + absolute = isinstance(item, str) and item and item[0] == "/" + path = normalize_storage_path(item) + if not absolute and self._path: + path = self._key_prefix + path + return path + + def __contains__(self, item): + """Test for group membership. 
+ + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> d1 = g1.create_dataset('bar', shape=100, chunks=10) + >>> 'foo' in g1 + True + >>> 'bar' in g1 + True + >>> 'baz' in g1 + False + + """ + path = self._item_path(item) + return contains_array(self._store, path) or contains_group( + self._store, path, explicit_only=False + ) + + def __getitem__(self, item): + """Obtain a group member. + + Parameters + ---------- + item : string + Member name or path. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> d1 = g1.create_dataset('foo/bar/baz', shape=100, chunks=10) + >>> g1['foo'] + + >>> g1['foo/bar'] + + >>> g1['foo/bar/baz'] + + + """ + path = self._item_path(item) + try: + return Array( + self._store, + read_only=self._read_only, + path=path, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer, + cache_attrs=self.attrs.cache, + zarr_version=self._version, + meta_array=self._meta_array, + ) + except ArrayNotFoundError: + pass + + try: + return Group( + self._store, + read_only=self._read_only, + path=path, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + meta_array=self._meta_array, + ) + except GroupNotFoundError: + pass + + if self._version == 3: + implicit_group = meta_root + path + "/" + # non-empty folder in the metadata path implies an implicit group + if self._store.list_prefix(implicit_group): + return Group( + self._store, + read_only=self._read_only, + path=path, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + meta_array=self._meta_array, + ) + else: + raise KeyError(item) + else: + raise KeyError(item) + + def __setitem__(self, item, value): + self.array(item, value, overwrite=True) + + def __delitem__(self, item): + return self._write_op(self._delitem_nosync, item) + + def _delitem_nosync(self, item): + path = self._item_path(item) + if contains_array(self._store, path) or contains_group( + self._store, path, explicit_only=False + ): + rmdir(self._store, path) + else: + raise KeyError(item) + + def __getattr__(self, item): + # https://github.com/jupyter/notebook/issues/2014 + # Save a possibly expensive lookup (for e.g. against cloud stores) + # Note: The _ipython_display_ method is required to display the right info as a side-effect. + # It is simpler to pretend it doesn't exist. + if item in ["_ipython_canary_method_should_not_exist_", "_ipython_display_"]: + raise AttributeError + + # allow access to group members via dot notation + try: + return self.__getitem__(item) + except KeyError as e: + raise AttributeError from e + + def __dir__(self): + # noinspection PyUnresolvedReferences + base = super().__dir__() + keys = sorted(set(base + list(self))) + keys = [k for k in keys if is_valid_python_name(k)] + return keys + + def _ipython_key_completions_(self): + return sorted(self) + + def group_keys(self): + """Return an iterator over member names for groups only. 
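# Illustrative sketch (not from the patch itself) of the member-access behaviour
# implemented above: __getitem__ resolves arrays first and then groups, __getattr__
# allows dot access to the same members, and __delitem__ removes the underlying keys.
import zarr

g = zarr.group()
g.create_group("foo")
g.create_dataset("bar", shape=(100,), chunks=(10,))

assert "foo" in g and "bar" in g
print(type(g["foo"]))                   # zarr.hierarchy.Group
print(type(g["bar"]))                   # zarr.core.Array
print(g.foo)                            # dot notation goes through __getattr__ -> __getitem__

del g["foo"]                            # __delitem__ -> rmdir on the store
assert "foo" not in g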
+ + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> sorted(g1.group_keys()) + ['bar', 'foo'] + + """ + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path): + yield key + else: + dir_name = meta_root + self._path + group_sfx = ".group" + self._metadata_key_suffix + # The fact that we call sorted means this can't be a streaming generator. + # The keys are already in memory. + all_keys = sorted(listdir(self._store, dir_name)) + for key in all_keys: + if key.endswith(group_sfx): + key = key[: -len(group_sfx)] + if key in all_keys: + # otherwise we will double count this group + continue + path = self._key_prefix + key + if path.endswith(".array" + self._metadata_key_suffix): + # skip array keys + continue + if contains_group(self._store, path, explicit_only=False): + yield key + + def groups(self): + """Return an iterator over (name, value) pairs for groups only. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> for n, v in g1.groups(): + ... print(n, type(v)) + bar + foo + + """ + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path, explicit_only=False): + yield key, Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + ) + + else: + for key in self.group_keys(): + path = self._key_prefix + key + yield key, Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + ) + + def array_keys(self, recurse=False): + """Return an iterator over member names for arrays only. + + Parameters + ---------- + recurse : recurse, optional + Option to return member names for all arrays, even from groups + below the current one. If False, only member names for arrays in + the current group will be returned. Default value is False. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> sorted(g1.array_keys()) + ['baz', 'quux'] + + """ + return self._array_iter(keys_only=True, method="array_keys", recurse=recurse) + + def arrays(self, recurse=False): + """Return an iterator over (name, value) pairs for arrays only. + + Parameters + ---------- + recurse : recurse, optional + Option to return (name, value) pairs for all arrays, even from groups + below the current one. If False, only (name, value) pairs for arrays in + the current group will be returned. Default value is False. 
+ + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> for n, v in g1.arrays(): + ... print(n, type(v)) + baz + quux + + """ + return self._array_iter(keys_only=False, method="arrays", recurse=recurse) + + def _array_iter(self, keys_only, method, recurse): + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_array(self._store, path): + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + elif recurse and contains_group(self._store, path): + group = self[key] + yield from getattr(group, method)(recurse=recurse) + else: + dir_name = meta_root + self._path + array_sfx = ".array" + self._metadata_key_suffix + group_sfx = ".group" + self._metadata_key_suffix + + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(array_sfx): + key = key[: -len(array_sfx)] + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + + path = self._key_prefix + key + assert not path.startswith("meta/") + if key.endswith(group_sfx): + # skip group metadata keys + continue + elif recurse and contains_group(self._store, path): + group = self[key] + yield from getattr(group, method)(recurse=recurse) + + def visitvalues(self, func): + """Run ``func`` on each object. + + Note: If ``func`` returns ``None`` (or doesn't return), + iteration continues. However, if ``func`` returns + anything else, it ceases and returns that value. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g3.create_group('baz') + >>> g5 = g3.create_group('quux') + >>> def print_visitor(obj): + ... print(obj) + >>> g1.visitvalues(print_visitor) + + + + + >>> g3.visitvalues(print_visitor) + + + + """ + + def _visit(obj): + yield obj + keys = sorted(getattr(obj, "keys", lambda: [])()) + for k in keys: + yield from _visit(obj[k]) + + for each_obj in islice(_visit(self), 1, None): + value = func(each_obj) + if value is not None: + return value + + def visit(self, func): + """Run ``func`` on each object's path. + + Note: If ``func`` returns ``None`` (or doesn't return), + iteration continues. However, if ``func`` returns + anything else, it ceases and returns that value. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g3.create_group('baz') + >>> g5 = g3.create_group('quux') + >>> def print_visitor(name): + ... print(name) + >>> g1.visit(print_visitor) + bar + bar/baz + bar/quux + foo + >>> g3.visit(print_visitor) + baz + quux + + Search for members matching some name query can be implemented using + ``visit`` that is, ``find`` and ``findall``. Consider the following + tree:: + + / + ├── aaa + │ └── bbb + │ └── ccc + │ └── aaa + ├── bar + └── foo + + It is created as follows: + + >>> root = zarr.group() + >>> foo = root.create_group("foo") + >>> bar = root.create_group("bar") + >>> root.create_group("aaa").create_group("bbb").create_group("ccc").create_group("aaa") + + + For ``find``, the first path that matches a given pattern (for example + "aaa") is returned. Note that a non-None value is returned in the visit + function to stop further iteration. 
+ + >>> import re + >>> pattern = re.compile("aaa") + >>> found = None + >>> def find(path): + ... global found + ... if pattern.search(path) is not None: + ... found = path + ... return True + ... + >>> root.visit(find) + True + >>> print(found) + aaa + + For ``findall``, all the results are gathered into a list + + >>> pattern = re.compile("aaa") + >>> found = [] + >>> def findall(path): + ... if pattern.search(path) is not None: + ... found.append(path) + ... + >>> root.visit(findall) + >>> print(found) + ['aaa', 'aaa/bbb', 'aaa/bbb/ccc', 'aaa/bbb/ccc/aaa'] + + To match only on the last part of the path, use a greedy regex to filter + out the prefix: + + >>> prefix_pattern = re.compile(r".*/") + >>> pattern = re.compile("aaa") + >>> found = [] + >>> def findall(path): + ... match = prefix_pattern.match(path) + ... if match is None: + ... name = path + ... else: + ... _, end = match.span() + ... name = path[end:] + ... if pattern.search(name) is not None: + ... found.append(path) + ... return None + ... + >>> root.visit(findall) + >>> print(found) + ['aaa', 'aaa/bbb/ccc/aaa'] + """ + + base_len = len(self.name) + return self.visitvalues(lambda o: func(o.name[base_len:].lstrip("/"))) + + def visitkeys(self, func): + """An alias for :py:meth:`~Group.visit`.""" + + return self.visit(func) + + def visititems(self, func): + """Run ``func`` on each object's path and the object itself. + + Note: If ``func`` returns ``None`` (or doesn't return), + iteration continues. However, if ``func`` returns + anything else, it ceases and returns that value. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g3.create_group('baz') + >>> g5 = g3.create_group('quux') + >>> def print_visitor(name, obj): + ... print((name, obj)) + >>> g1.visititems(print_visitor) + ('bar', ) + ('bar/baz', ) + ('bar/quux', ) + ('foo', ) + >>> g3.visititems(print_visitor) + ('baz', ) + ('quux', ) + + """ + + base_len = len(self.name) + return self.visitvalues(lambda o: func(o.name[base_len:].lstrip("/"), o)) + + def tree(self, expand=False, level=None): + """Provide a ``print``-able display of the hierarchy. + + Parameters + ---------- + expand : bool, optional + Only relevant for HTML representation. If True, tree will be fully expanded. + level : int, optional + Maximum depth to descend into hierarchy. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g3.create_group('baz') + >>> g5 = g3.create_group('quux') + >>> d1 = g5.create_dataset('baz', shape=100, chunks=10) + >>> g1.tree() + / + ├── bar + │ ├── baz + │ └── quux + │ └── baz (100,) float64 + └── foo + >>> g1.tree(level=2) + / + ├── bar + │ ├── baz + │ └── quux + └── foo + >>> g3.tree() + bar + ├── baz + └── quux + └── baz (100,) float64 + + Notes + ----- + Please note that this is an experimental feature. The behaviour of this + function is still evolving and the default output and/or parameters may change + in future versions. + + """ + + return TreeViewer(self, expand=expand, level=level) + + def _write_op(self, f, *args, **kwargs): + # guard condition + if self._read_only: + raise ReadOnlyError() + + if self._synchronizer is None: + # no synchronization + lock = nolock + else: + # synchronize on the root group + lock = self._synchronizer[group_meta_key] + + with lock: + return f(*args, **kwargs) + + def create_group(self, name, overwrite=False): + """Create a sub-group. 
+ + Parameters + ---------- + name : string + Group name. + overwrite : bool, optional + If True, overwrite any existing array with the given name. + + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g1.create_group('baz/quux') + + """ + + return self._write_op(self._create_group_nosync, name, overwrite=overwrite) + + def _create_group_nosync(self, name, overwrite=False): + path = self._item_path(name) + + # create terminal group + init_group(self._store, path=path, chunk_store=self._chunk_store, overwrite=overwrite) + + return Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + ) + + def create_groups(self, *names, **kwargs): + """Convenience method to create multiple groups in a single call.""" + return tuple(self.create_group(name, **kwargs) for name in names) + + def require_group(self, name, overwrite=False): + """Obtain a sub-group, creating one if it doesn't exist. + + Parameters + ---------- + name : string + Group name. + overwrite : bool, optional + Overwrite any existing array with given `name` if present. + + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.require_group('foo') + >>> g3 = g1.require_group('foo') + >>> g2 == g3 + True + + """ + + return self._write_op(self._require_group_nosync, name, overwrite=overwrite) + + def _require_group_nosync(self, name, overwrite=False): + path = self._item_path(name) + + # create terminal group if necessary + if not contains_group(self._store, path): + init_group( + store=self._store, path=path, chunk_store=self._chunk_store, overwrite=overwrite + ) + + return Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version, + ) + + def require_groups(self, *names): + """Convenience method to require multiple groups in a single call.""" + return tuple(self.require_group(name) for name in names) + + # noinspection PyIncorrectDocstring + def create_dataset(self, name, **kwargs): + """Create an array. + + Arrays are known as "datasets" in HDF5 terminology. For compatibility + with h5py, Zarr groups also implement the require_dataset() method. + + Parameters + ---------- + name : string + Array name. + data : array-like, optional + Initial data. + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. If not provided, will be guessed from `shape` and + `dtype`. + dtype : string or dtype, optional + NumPy dtype. + compressor : Codec, optional + Primary compressor. + fill_value : object + Default value to use for uninitialized portions of the array. + order : {'C', 'F'}, optional + Memory layout to be used within each chunk. + synchronizer : zarr.sync.ArraySynchronizer, optional + Array synchronizer. + filters : sequence of Codecs, optional + Sequence of filters to use to encode chunk data prior to + compression. + overwrite : bool, optional + If True, replace any existing array or group with the given name. + cache_metadata : bool, optional + If True, array configuration metadata will be cached for the + lifetime of the object. 
If False, array metadata will be reloaded + prior to all data access and modification operations (may incur + overhead depending on storage and data access pattern). + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + Returns + ------- + a : zarr.core.Array + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> d1 = g1.create_dataset('foo', shape=(10000, 10000), + ... chunks=(1000, 1000)) + >>> d1 + + >>> d2 = g1.create_dataset('bar/baz/qux', shape=(100, 100, 100), + ... chunks=(100, 10, 10)) + >>> d2 + + + """ + assert "mode" not in kwargs + + return self._write_op(self._create_dataset_nosync, name, **kwargs) + + def _create_dataset_nosync(self, name, data=None, **kwargs): + assert "mode" not in kwargs + path = self._item_path(name) + + # determine synchronizer + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + + # create array + if data is None: + a = create(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + else: + a = array(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + return a + + def require_dataset(self, name, shape, dtype=None, exact=False, **kwargs): + """Obtain an array, creating if it doesn't exist. + + Arrays are known as "datasets" in HDF5 terminology. For compatibility + with h5py, Zarr groups also implement the create_dataset() method. + + Other `kwargs` are as per :func:`zarr.hierarchy.Group.create_dataset`. + + Parameters + ---------- + name : string + Array name. + shape : int or tuple of ints + Array shape. + dtype : string or dtype, optional + NumPy dtype. + exact : bool, optional + If True, require `dtype` to match exactly. If false, require + `dtype` can be cast from array dtype. + + """ + + return self._write_op( + self._require_dataset_nosync, name, shape=shape, dtype=dtype, exact=exact, **kwargs + ) + + def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, **kwargs): + path = self._item_path(name) + + if contains_array(self._store, path): + # array already exists at path, validate that it is the right shape and type + + synchronizer = kwargs.get("synchronizer", self._synchronizer) + cache_metadata = kwargs.get("cache_metadata", True) + cache_attrs = kwargs.get("cache_attrs", self.attrs.cache) + a = Array( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=synchronizer, + cache_metadata=cache_metadata, + cache_attrs=cache_attrs, + meta_array=self._meta_array, + ) + shape = normalize_shape(shape) + if shape != a.shape: + raise TypeError( + f"shape do not match existing array; expected {a.shape}, got {shape}" + ) + dtype = np.dtype(dtype) + if exact: + if dtype != a.dtype: + raise TypeError(f"dtypes do not match exactly; expected {a.dtype}, got {dtype}") + else: + if not np.can_cast(dtype, a.dtype): + raise TypeError(f"dtypes ({dtype}, {a.dtype}) cannot be safely cast") + return a + + else: + return self._create_dataset_nosync(name, shape=shape, dtype=dtype, **kwargs) + + def create(self, name, **kwargs): + """Create an array. 
Keyword arguments as per + :func:`zarr.creation.create`.""" + return self._write_op(self._create_nosync, name, **kwargs) + + def _create_nosync(self, name, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return create(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + def empty(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.empty`.""" + return self._write_op(self._empty_nosync, name, **kwargs) + + def _empty_nosync(self, name, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return empty(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + def zeros(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.zeros`.""" + return self._write_op(self._zeros_nosync, name, **kwargs) + + def _zeros_nosync(self, name, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return zeros(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + def ones(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.ones`.""" + return self._write_op(self._ones_nosync, name, **kwargs) + + def _ones_nosync(self, name, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return ones(store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + def full(self, name, fill_value, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.full`.""" + return self._write_op(self._full_nosync, name, fill_value, **kwargs) + + def _full_nosync(self, name, fill_value, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return full( + store=self._store, + path=path, + chunk_store=self._chunk_store, + fill_value=fill_value, + **kwargs, + ) + + def array(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.array`.""" + return self._write_op(self._array_nosync, name, data, **kwargs) + + def _array_nosync(self, name, data, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return array(data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs) + + def empty_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.empty_like`.""" + return self._write_op(self._empty_like_nosync, name, data, **kwargs) + + def _empty_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return empty_like( + data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs + ) + + def zeros_like(self, name, data, **kwargs): + """Create an array. 
Keyword arguments as per + :func:`zarr.creation.zeros_like`.""" + return self._write_op(self._zeros_like_nosync, name, data, **kwargs) + + def _zeros_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return zeros_like( + data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs + ) + + def ones_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.ones_like`.""" + return self._write_op(self._ones_like_nosync, name, data, **kwargs) + + def _ones_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return ones_like( + data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs + ) + + def full_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.full_like`.""" + return self._write_op(self._full_like_nosync, name, data, **kwargs) + + def _full_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + kwargs.setdefault("synchronizer", self._synchronizer) + kwargs.setdefault("cache_attrs", self.attrs.cache) + return full_like( + data, store=self._store, path=path, chunk_store=self._chunk_store, **kwargs + ) + + def _move_nosync(self, path, new_path): + rename(self._store, path, new_path) + if self._chunk_store is not None: + rename(self._chunk_store, path, new_path) + + def move(self, source, dest): + """Move contents from one path to another relative to the Group. + + Parameters + ---------- + source : string + Name or path to a Zarr object to move. + dest : string + New name or path of the Zarr object. + """ + + source = self._item_path(source) + dest = self._item_path(dest) + + # Check that source exists. + if not ( + contains_array(self._store, source) + or contains_group(self._store, source, explicit_only=False) + ): + raise ValueError('The source, "%s", does not exist.' % source) + if contains_array(self._store, dest) or contains_group( + self._store, dest, explicit_only=False + ): + raise ValueError('The dest, "%s", already exists.' % dest) + + # Ensure groups needed for `dest` exist. + if "/" in dest: + self.require_group("/" + dest.rsplit("/", 1)[0]) + + self._write_op(self._move_nosync, source, dest) + + # Override ipython repr methods, GH1716 + # https://ipython.readthedocs.io/en/stable/config/integrating.html#custom-methods + # " If the methods don’t exist, the standard repr() is used. If a method exists and + # returns None, it is treated the same as if it does not exist." 
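+    # Descriptive note (assumption based on the IPython docs quoted above): with every
+    # _repr_*_ hook below returning None, IPython falls back to the standard __repr__
+    # for Group instances instead of attempting any rich display output.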
+ def _repr_html_(self): + return None + + def _repr_latex_(self): + return None + + def _repr_mimebundle_(self, **kwargs): + return None + + def _repr_svg_(self): + return None + + def _repr_png_(self): + return None + + def _repr_jpeg_(self): + return None + + def _repr_markdown_(self): + return None + + def _repr_javascript_(self): + return None + + def _repr_pdf_(self): + return None + + def _repr_json_(self): + return None + + +def _normalize_store_arg(store, *, storage_options=None, mode="r", zarr_version=None): + if zarr_version is None: + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + + if zarr_version != 2: + assert_zarr_v3_api_available() + + if store is None: + return MemoryStore() if zarr_version == 2 else MemoryStoreV3() + return normalize_store_arg( + store, storage_options=storage_options, mode=mode, zarr_version=zarr_version + ) + + +def group( + store=None, + overwrite=False, + chunk_store=None, + cache_attrs=True, + synchronizer=None, + path=None, + *, + zarr_version=None, + meta_array=None, +): + """Create a group. + + Parameters + ---------- + store : MutableMapping or string, optional + Store or path to directory in file system. + overwrite : bool, optional + If True, delete any pre-existing data in `store` at `path` before + creating the group. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + synchronizer : object, optional + Array synchronizer. + path : string, optional + Group path within store. + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. versionadded:: 2.16.1 + + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + Create a group in memory:: + + >>> import zarr + >>> g = zarr.group() + >>> g + + + Create a group with a different store:: + + >>> store = zarr.DirectoryStore('data/example.zarr') + >>> g = zarr.group(store=store, overwrite=True) + >>> g + + + """ + + # handle polymorphic store arg + store = _normalize_store_arg(store, zarr_version=zarr_version, mode="w") + if zarr_version is None: + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + + if zarr_version != 2: + assert_zarr_v3_api_available() + + path = normalize_storage_path(path) + + requires_init = None + if zarr_version == 2: + requires_init = overwrite or not contains_group(store) + elif zarr_version == 3: + requires_init = overwrite or not contains_group(store, path) + + if requires_init: + init_group(store, overwrite=overwrite, chunk_store=chunk_store, path=path) + + return Group( + store, + read_only=False, + chunk_store=chunk_store, + cache_attrs=cache_attrs, + synchronizer=synchronizer, + path=path, + zarr_version=zarr_version, + meta_array=meta_array, + ) + + +def open_group( + store=None, + mode="a", + cache_attrs=True, + synchronizer=None, + path=None, + chunk_store=None, + storage_options=None, + *, + zarr_version=None, + meta_array=None, +): + """Open a group using file-mode-like semantics. + + Parameters + ---------- + store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
+ mode : {'r', 'r+', 'a', 'w', 'w-'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist); 'a' means read/write (create if doesn't + exist); 'w' means create (overwrite if exists); 'w-' means create + (fail if exists). + cache_attrs : bool, optional + If True (default), user attributes will be cached for attribute read + operations. If False, user attributes are reloaded from the store prior + to all attribute read operations. + synchronizer : object, optional + Array synchronizer. + path : string, optional + Group path within store. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + meta_array : array-like, optional + An array instance to use for determining arrays to create and return + to users. Use `numpy.empty(())` by default. + + .. versionadded:: 2.13 + + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + >>> import zarr + >>> root = zarr.open_group('data/example.zarr', mode='w') + >>> foo = root.create_group('foo') + >>> bar = root.create_group('bar') + >>> root + + >>> root2 = zarr.open_group('data/example.zarr', mode='a') + >>> root2 + + >>> root == root2 + True + + """ + + # handle polymorphic store arg + store = _normalize_store_arg( + store, storage_options=storage_options, mode=mode, zarr_version=zarr_version + ) + if zarr_version is None: + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + + if zarr_version != 2: + assert_zarr_v3_api_available() + + if chunk_store is not None: + chunk_store = _normalize_store_arg( + chunk_store, storage_options=storage_options, mode=mode, zarr_version=zarr_version + ) + if getattr(chunk_store, "_store_version", DEFAULT_ZARR_VERSION) != zarr_version: + raise ValueError("zarr_version of store and chunk_store must match") # pragma: no cover + + path = normalize_storage_path(path) + + # ensure store is initialized + + if mode in ["r", "r+"]: + if not contains_group(store, path=path): + if contains_array(store, path=path): + raise ContainsArrayError(path) + raise GroupNotFoundError(path) + + elif mode == "w": + init_group(store, overwrite=True, path=path, chunk_store=chunk_store) + + elif mode == "a": + if not contains_group(store, path=path): + if contains_array(store, path=path): + raise ContainsArrayError(path) + init_group(store, path=path, chunk_store=chunk_store) + + elif mode in ["w-", "x"]: + if contains_array(store, path=path): + raise ContainsArrayError(path) + elif contains_group(store, path=path): + raise ContainsGroupError(path) + else: + init_group(store, path=path, chunk_store=chunk_store) + + # determine read only status + read_only = mode == "r" + + return Group( + store, + read_only=read_only, + cache_attrs=cache_attrs, + synchronizer=synchronizer, + path=path, + chunk_store=chunk_store, + zarr_version=zarr_version, + meta_array=meta_array, + ) diff --git a/src/zarr/v2/indexing.py b/src/zarr/v2/indexing.py new file mode 100644 index 0000000000..35c1e813b1 --- /dev/null +++ b/src/zarr/v2/indexing.py @@ -0,0 +1,1079 @@ +import collections +import itertools +import math +import numbers + +import numpy as np + + +from zarr.errors import ( + ArrayIndexError, + NegativeStepError, + err_too_many_indices, + VindexInvalidSelectionError, + BoundsCheckError, +) + + +def is_integer(x): + """True if x is an integer (both pure Python or NumPy). 
+ + Note that Python's bool is considered an integer too. + """ + return isinstance(x, numbers.Integral) + + +def is_integer_list(x): + """True if x is a list of integers. + + This function assumes ie *does not check* that all elements of the list + have the same type. Mixed type lists will result in other errors that will + bubble up anyway. + """ + return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) + + +def is_integer_array(x, ndim=None): + t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_bool_array(x, ndim=None): + t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_scalar(value, dtype): + if np.isscalar(value): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + if dtype.kind == "O" and not isinstance(value, np.ndarray): + return True + return False + + +def is_pure_fancy_indexing(selection, ndim): + """Check whether a selection contains only scalars or integer array-likes. + + Parameters + ---------- + selection : tuple, slice, or scalar + A valid selection value for indexing into arrays. + + Returns + ------- + is_pure : bool + True if the selection is a pure fancy indexing expression (ie not mixed + with boolean or slices). + """ + if ndim == 1: + if is_integer_list(selection) or is_integer_array(selection): + return True + # if not, we go through the normal path below, because a 1-tuple + # of integers is also allowed. + no_slicing = ( + isinstance(selection, tuple) + and len(selection) == ndim + and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) + ) + return ( + no_slicing + and all( + is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) + for elem in selection + ) + and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) + ) + + +def is_pure_orthogonal_indexing(selection, ndim): + if not ndim: + return False + + # Case 1: Selection is a single iterable of integers + if is_integer_list(selection) or is_integer_array(selection, ndim=1): + return True + + # Case two: selection contains either zero or one integer iterables. + # All other selection elements are slices or integers + return ( + isinstance(selection, tuple) + and len(selection) == ndim + and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 + and all( + is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, (int, slice)) + for elem in selection + ) + ) + + +def normalize_integer_selection(dim_sel, dim_len): + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise BoundsCheckError(dim_len) + + return dim_sel + + +ChunkDimProjection = collections.namedtuple( + "ChunkDimProjection", ("dim_chunk_ix", "dim_chunk_sel", "dim_out_sel") +) +"""A mapping from chunk to output array for a single dimension. + +Parameters +---------- +dim_chunk_ix + Index of chunk. +dim_chunk_sel + Selection of items from chunk array. +dim_out_sel + Selection of items in target (output) array. 
+ +""" + + +class IntDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # normalize + dim_sel = normalize_integer_selection(dim_sel, dim_len) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = 1 + + def __iter__(self): + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def ceildiv(a, b): + return math.ceil(a / b) + + +class SliceDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # normalize + self.start, self.stop, self.step = dim_sel.indices(dim_len) + if self.step < 1: + raise NegativeStepError() + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nitems = max(0, ceildiv((self.stop - self.start), self.step)) + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + + def __iter__(self): + # figure out the range of chunks we need to visit + dim_chunk_ix_from = self.start // self.dim_chunk_len + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) + + # iterate over chunks in range + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) + + # determine chunk length, accounting for trailing chunk + dim_chunk_len = dim_limit - dim_offset + + if self.start < dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + remainder = (dim_offset - self.start) % self.step + if remainder: + dim_chunk_sel_start += self.step - remainder + # compute number of previous items, provides offset into output array + dim_out_offset = ceildiv((dim_offset - self.start), self.step) + + else: + # selection starts within current chunk + dim_chunk_sel_start = self.start - dim_offset + dim_out_offset = 0 + + if self.stop > dim_limit: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + # If there are no elements on the selection within this chunk, then skip + if dim_chunk_nitems == 0: + continue + + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def check_selection_length(selection, shape): + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + +def replace_ellipsis(selection, shape): + selection = ensure_tuple(selection) + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace 
ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + check_selection_length(selection, shape) + + return selection + + +def replace_lists(selection): + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection + ) + + +def ensure_tuple(v): + if not isinstance(v, tuple): + v = (v,) + return v + + +ChunkProjection = collections.namedtuple( + "ChunkProjection", ("chunk_coords", "chunk_selection", "out_selection") +) +"""A mapping of items from chunk to output array. Can be used to extract items from the +chunk array for loading into an output array. Can also be used to extract items from a +value array for setting/updating in a chunk array. + +Parameters +---------- +chunk_coords + Indices of chunk. +chunk_selection + Selection of items from chunk array. +out_selection + Selection of items in target (output) array. + +""" + + +def is_slice(s): + return isinstance(s, slice) + + +def is_contiguous_slice(s): + return is_slice(s) and (s.step is None or s.step == 1) + + +def is_positive_slice(s): + return is_slice(s) and (s.step is None or s.step >= 1) + + +def is_contiguous_selection(selection): + selection = ensure_tuple(selection) + return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) + + +def is_basic_selection(selection): + selection = ensure_tuple(selection) + return all(is_integer(s) or is_positive_slice(s) for s in selection) + + +# noinspection PyProtectedMember +class BasicIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_slice(dim_sel): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + f"unsupported selection item for basic indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class BoolArrayDimIndexer: + def __init__(self, dim_sel, dim_len, dim_chunk_len): + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError( + "Boolean arrays in an orthogonal selection must " "be 1-dimensional only" + ) + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + f"Boolean array has the wrong length for dimension; " + f"expected {dim_len}, got { dim_sel.shape[0]}" + ) + + # store attributes + self.dim_sel = dim_sel + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + 
self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.zeros(self.nchunks, dtype="i8") + for dim_chunk_ix in range(self.nchunks): + dim_offset = dim_chunk_ix * self.dim_chunk_len + self.chunk_nitems[dim_chunk_ix] = np.count_nonzero( + self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + ) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + self.nitems = self.chunk_nitems_cumsum[-1] + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + def __iter__(self): + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class Order: + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a): + diff = np.diff(a) + diff_positive = diff >= 0 + n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + +def wraparound_indices(x, dim_len): + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x, dim_len): + if np.any(x < 0) or np.any(x >= dim_len): + raise BoundsCheckError(dim_len) + + +class IntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + def __init__( + self, + dim_sel, + dim_len, + dim_chunk_len, + wraparound=True, + boundscheck=True, + order=Order.UNKNOWN, + ): + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError( + "integer arrays in an orthogonal selection must be " "1-dimensional only" + ) + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + self.nitems = len(dim_sel) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + self.order = order + + if self.order == Order.INCREASING: + self.dim_sel = dim_sel + self.dim_out_sel = None + elif self.order == Order.DECREASING: + self.dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + else: + # sort indices to group by chunk + self.dim_out_sel = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_out_sel) + + # precompute number of selected items for each chunk + self.chunk_nitems = 
np.bincount(dim_sel_chunk, minlength=self.nchunks) + + # find chunks that we need to visit + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + # compute offsets into the output array + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + def __iter__(self): + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def slice_to_range(s: slice, l: int): # noqa: E741 + return range(*s.indices(l)) + + +def ix_(selection, shape): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ + but with support for slices and single ints.""" + + # normalisation + selection = replace_ellipsis(selection, shape) + + # replace slice and int as these are not supported by numpy.ix_ + selection = [ + ( + slice_to_range(dim_sel, dim_len) + if isinstance(dim_sel, slice) + else [dim_sel] if is_integer(dim_sel) else dim_sel + ) + for dim_sel, dim_len in zip(selection, shape) + ] + + # now get numpy to convert to a coordinate selection + selection = np.ix_(*selection) + + return selection + + +def oindex(a, selection): + """Implementation of orthogonal indexing with slices and ints.""" + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + result = a[selection] + if drop_axes: + result = result.squeeze(axis=drop_axes) + return result + + +def oindex_set(a, selection, value): + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + if not np.isscalar(value) and drop_axes: + value = np.asanyarray(value) + value_selection = [slice(None)] * len(a.shape) + for i in drop_axes: + value_selection[i] = np.newaxis + value_selection = tuple(value_selection) + value = value[value_selection] + a[selection] = value + + +# noinspection PyProtectedMember +class OrthogonalIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_integer_array(dim_sel): + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_bool_array(dim_sel): + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + f"unsupported selection item for orthogonal indexing; " + f"expected integer, slice, integer array or Boolean " + f"array, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.array = array + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.is_advanced = not 
is_basic_selection(selection) + if self.is_advanced: + self.drop_axes = tuple( + i + for i, dim_indexer in enumerate(self.dim_indexers) + if isinstance(dim_indexer, IntDimIndexer) + ) + else: + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # N.B., numpy doesn't support orthogonal indexing directly as yet, + # so need to work around via np.ix_. Also np.ix_ does not support a + # mixture of arrays and slices or integers, so need to convert slices + # and integers into ranges. + chunk_selection = ix_(chunk_selection, self.array._chunks) + + # special case for non-monotonic indices + if not is_basic_selection(out_selection): + out_selection = ix_(out_selection, self.shape) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class OIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_orthogonal_selection(selection, fields=fields) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_orthogonal_selection(selection, value, fields=fields) + + +# noinspection PyProtectedMember +class BlockIndexer: + def __init__(self, selection, array): + # handle ellipsis + selection = replace_ellipsis(selection, array._shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_size in zip(selection, array._shape, array._chunks): + dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) + + if is_integer(dim_sel): + if dim_sel < 0: + dim_sel = dim_numchunks + dim_sel + + start = dim_sel * dim_chunk_size + stop = start + dim_chunk_size + slice_ = slice(start, stop) + + elif is_slice(dim_sel): + start = dim_sel.start if dim_sel.start is not None else 0 + stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks + + if dim_sel.step not in {1, None}: + raise IndexError( + f"unsupported selection item for block indexing; " + f"expected integer or slice with step=1, got {type(dim_sel)!r}" + ) + + # Can't reuse wraparound_indices because it expects a numpy array + # We have integers here. 
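+                # normalize negative block indices against the number of chunks along
+                # this dimension before converting them to element offsets below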
+ if start < 0: + start = dim_numchunks + start + if stop < 0: + stop = dim_numchunks + stop + + start = start * dim_chunk_size + stop = stop * dim_chunk_size + slice_ = slice(start, stop) + + else: + raise IndexError( + f"unsupported selection item for block indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexers.append(dim_indexer) + + if start >= dim_len or start < 0: + raise BoundsCheckError(dim_len) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers) + self.drop_axes = None + + def __iter__(self): + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +class BlockIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_block_selection(selection, fields=fields) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_block_selection(selection, value, fields=fields) + + +# noinspection PyProtectedMember +def is_coordinate_selection(selection, array): + return (len(selection) == len(array._shape)) and all( + is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection + ) + + +# noinspection PyProtectedMember +def is_mask_selection(selection, array): + return ( + len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == array._shape + ) + + +# noinspection PyProtectedMember +class CoordinateIndexer: + def __init__(self, selection, array): + # some initial normalization + selection = ensure_tuple(selection) + selection = tuple([i] if is_integer(i) else i for i in selection) + selection = replace_lists(selection) + + # validation + if not is_coordinate_selection(selection, array): + raise IndexError( + f"invalid coordinate selection; expected one integer " + f"(coordinate) array per dimension of the target array, " + f"got {selection!r}" + ) + + # handle wraparound, boundscheck + for dim_sel, dim_len in zip(selection, array.shape): + # handle wraparound + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + boundscheck_indices(dim_sel, dim_len) + + # compute chunk index for each point in the selection + chunks_multi_index = tuple( + dim_sel // dim_chunk_len for (dim_sel, dim_chunk_len) in zip(selection, array._chunks) + ) + + # broadcast selection - this will raise error if array dimensions don't match + selection = np.broadcast_arrays(*selection) + chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) + + # remember shape of selection, because we will flatten indices for processing + self.sel_shape = selection[0].shape if selection[0].shape else (1,) + + # flatten selection + selection = [dim_sel.reshape(-1) for dim_sel in selection] + chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] + + # ravel chunk indices + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=array._cdata_shape) + + # group points by chunk + if 
np.any(np.diff(chunks_raveled_indices) < 0): + # optimisation, only sort if needed + sel_sort = np.argsort(chunks_raveled_indices) + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + else: + sel_sort = None + + # store attributes + self.selection = selection + self.sel_sort = sel_sort + self.shape = selection[0].shape if selection[0].shape else (1,) + self.drop_axes = None + self.array = array + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=array.nchunks) + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + # locate the chunks we need to process + self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] + + # unravel chunk indices + self.chunk_mixs = np.unravel_index(self.chunk_rixs, array._cdata_shape) + + def __iter__(self): + # iterate over chunks + for i, chunk_rix in enumerate(self.chunk_rixs): + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + if self.sel_sort is None: + out_selection = slice(start, stop) + else: + out_selection = self.sel_sort[start:stop] + + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.array._chunks) + ) + chunk_selection = tuple( + dim_sel[start:stop] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets) + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +# noinspection PyProtectedMember +class MaskIndexer(CoordinateIndexer): + def __init__(self, selection, array): + # some initial normalization + selection = ensure_tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_mask_selection(selection, array): + raise IndexError( + f"invalid mask selection; expected one Boolean (mask)" + f"array with the same shape as the target array, got {selection!r}" + ) + + # convert to indices + selection = np.nonzero(selection[0]) + + # delegate the rest to superclass + super().__init__(selection, array) + + +class VIndex: + def __init__(self, array): + self.array = array + + def __getitem__(self, selection): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + return self.array.get_coordinate_selection(selection, fields=fields) + elif is_mask_selection(selection, self.array): + return self.array.get_mask_selection(selection, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + def __setitem__(self, selection, value): + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array): + self.array.set_coordinate_selection(selection, value, fields=fields) + elif is_mask_selection(selection, self.array): + self.array.set_mask_selection(selection, value, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + +def check_fields(fields, dtype): + # early out + if fields is None: + return dtype + # check type + if not isinstance(fields, (str, list, tuple)): + raise IndexError( + f"'fields' argument must be a string or list of strings; found " f"{type(fields)!r}" + ) + if fields: + if dtype.names is None: + raise IndexError("invalid 'fields' argument, array does not have any fields") + try: + if isinstance(fields, str): + # single 
field selection + out_dtype = dtype[fields] + else: + # multiple field selection + out_dtype = np.dtype([(f, dtype[f]) for f in fields]) + except KeyError as e: + raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") from e + else: + return out_dtype + else: + return dtype + + +def check_no_multi_fields(fields): + if isinstance(fields, list): + if len(fields) == 1: + return fields[0] + elif len(fields) > 1: + raise IndexError("multiple fields are not supported for this operation") + return fields + + +def pop_fields(selection): + if isinstance(selection, str): + # single field selection + fields = selection + selection = () + elif not isinstance(selection, tuple): + # single selection item, no fields + fields = None + # leave selection as-is + else: + # multiple items, split fields from selection items + fields = [f for f in selection if isinstance(f, str)] + fields = fields[0] if len(fields) == 1 else fields + selection = tuple(s for s in selection if not isinstance(s, str)) + selection = selection[0] if len(selection) == 1 else selection + return fields, selection + + +def make_slice_selection(selection): + ls = [] + for dim_selection in selection: + if is_integer(dim_selection): + ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) + elif isinstance(dim_selection, np.ndarray): + if len(dim_selection) == 1: + ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) + else: + raise ArrayIndexError() + else: + ls.append(dim_selection) + return ls + + +class PartialChunkIterator: + """Iterator to retrieve the specific coordinates of requested data + from within a compressed chunk. + + Parameters + ---------- + selection : tuple + tuple of slice objects to take from the chunk + arr_shape : shape of chunk to select data from + + Attributes + ----------- + arr_shape + selection + + Returns + ------- + Tuple with 3 elements: + + start: int + elements offset in the chunk to read from + nitems: int + number of elements to read in the chunk from start + partial_out_selection: list of slices + indices of a temporary empty array of size `Array._chunks` to assign + the decompressed data to after the partial read. + + Notes + ----- + An array is flattened when compressed with blosc, so this iterator takes + the wanted selection of an array and determines the wanted coordinates + of the flattened, compressed data to be read and then decompressed. The + decompressed data is then placed in a temporary empty array of size + `Array._chunks` at the indices yielded as partial_out_selection. + Once all the slices yielded by this iterator have been read, decompressed + and written to the temporary array, the wanted slice of the chunk can be + indexed from the temporary array and written to the out_selection slice + of the out array. 
+ + """ + + def __init__(self, selection, arr_shape): + selection = make_slice_selection(selection) + self.arr_shape = arr_shape + + # number of selection dimensions can't be greater than the number of chunk dimensions + if len(selection) > len(self.arr_shape): + raise ValueError( + "Selection has more dimensions then the array:\n" + f"selection dimensions = {len(selection)}\n" + f"array dimensions = {len(self.arr_shape)}" + ) + + # any selection can not be out of the range of the chunk + selection_shape = np.empty(self.arr_shape)[tuple(selection)].shape + if any( + selection_dim < 0 or selection_dim > arr_dim + for selection_dim, arr_dim in zip(selection_shape, self.arr_shape) + ): + raise IndexError( + "a selection index is out of range for the dimension" + ) # pragma: no cover + + for i, dim_size in enumerate(self.arr_shape[::-1]): + index = len(self.arr_shape) - (i + 1) + if index <= len(selection) - 1: + slice_size = selection_shape[index] + if slice_size == dim_size and index > 0: + selection.pop() + else: + break + + chunk_loc_slices = [] + last_dim_slice = None if selection[-1].step > 1 else selection.pop() + for arr_shape_i, sl in zip(arr_shape, selection): + dim_chunk_loc_slices = [] + assert isinstance(sl, slice) + for x in slice_to_range(sl, arr_shape_i): + dim_chunk_loc_slices.append(slice(x, x + 1, 1)) + chunk_loc_slices.append(dim_chunk_loc_slices) + if last_dim_slice: + chunk_loc_slices.append([last_dim_slice]) + self.chunk_loc_slices = list(itertools.product(*chunk_loc_slices)) + + def __iter__(self): + chunk1 = self.chunk_loc_slices[0] + nitems = (chunk1[-1].stop - chunk1[-1].start) * np.prod( + self.arr_shape[len(chunk1) :], dtype=int + ) + for partial_out_selection in self.chunk_loc_slices: + start = 0 + for i, sl in enumerate(partial_out_selection): + start += sl.start * np.prod(self.arr_shape[i + 1 :], dtype=int) + yield start, nitems, partial_out_selection diff --git a/src/zarr/v2/meta.py b/src/zarr/v2/meta.py new file mode 100644 index 0000000000..44a2b7ebec --- /dev/null +++ b/src/zarr/v2/meta.py @@ -0,0 +1,580 @@ +import base64 +import itertools +from collections.abc import Mapping + +import numcodecs +import numpy as np +from numcodecs.abc import Codec + +from zarr.errors import MetadataError +from zarr.util import json_dumps, json_loads + +from typing import cast, Union, Any, List, Mapping as MappingType, Optional, TYPE_CHECKING + +if TYPE_CHECKING: # pragma: no cover + from zarr._storage.store import StorageTransformer + + +ZARR_FORMAT = 2 +ZARR_FORMAT_v3 = 3 + +# FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} + +_default_entry_point_metadata_v3 = { + "zarr_format": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_encoding": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_key_suffix": ".json", + "extensions": [], +} + +_v3_core_types = set("".join(d) for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8"))) +_v3_core_types = {"bool", "i1", "u1"} | _v3_core_types + +# The set of complex types allowed ({"c8", ">c16"}) +_v3_complex_types = set(f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("8", "16"))) + +# All dtype.str values corresponding to datetime64 and timedelta64 +# see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units +_date_units = ["Y", "M", "W", "D"] +_time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +_v3_datetime_types = set( + f"{end}{kind}8[{unit}]" + for end, unit, kind in itertools.product("<>", _date_units + _time_units, ("m", 
"M")) +) + + +def get_extended_dtype_info(dtype) -> dict: + if dtype.str in _v3_complex_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/complex-dtypes/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str == "|O": + return dict( + extension="TODO: object array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|S"): + return dict( + extension="TODO: bytestring array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("U"): + return dict( + extension="TODO: unicode array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|V"): + return dict( + extension="TODO: structured array protocol URL", # noqa + type=dtype.descr, + fallback=None, + ) + elif dtype.str in _v3_datetime_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/latest/extensions/data-types/datetime/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + else: + raise ValueError(f"Unsupported dtype: {dtype}") + + +class Metadata2: + ZARR_FORMAT = ZARR_FORMAT + + @classmethod + def parse_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: + # Here we allow that a store may return an already-parsed metadata object, + # or a string of JSON that we will parse here. We allow for an already-parsed + # object to accommodate a consolidated metadata store, where all the metadata for + # all groups and arrays will already have been parsed from JSON. + + if isinstance(s, Mapping): + # assume metadata has already been parsed into a mapping object + meta = s + + else: + # assume metadata needs to be parsed as JSON + meta = json_loads(s) + + return meta + + @classmethod + def decode_array_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + + # check metadata format + zarr_format = meta.get("zarr_format", None) + if zarr_format != cls.ZARR_FORMAT: + raise MetadataError(f"unsupported zarr format: {zarr_format}") + + # extract array metadata fields + try: + dtype = cls.decode_dtype(meta["dtype"]) + if dtype.hasobject: + import numcodecs + + object_codec = numcodecs.get_codec(meta["filters"][0]) + else: + object_codec = None + + dimension_separator = meta.get("dimension_separator", None) + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) + meta = dict( + zarr_format=meta["zarr_format"], + shape=tuple(meta["shape"]), + chunks=tuple(meta["chunks"]), + dtype=dtype, + compressor=meta["compressor"], + fill_value=fill_value, + order=meta["order"], + filters=meta["filters"], + ) + if dimension_separator: + meta["dimension_separator"] = dimension_separator + except Exception as e: + raise MetadataError("error decoding metadata") from e + else: + return meta + + @classmethod + def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: + dtype = meta["dtype"] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype + + dimension_separator = meta.get("dimension_separator") + if dtype.hasobject: + import numcodecs + + object_codec = numcodecs.get_codec(meta["filters"][0]) + else: + object_codec = None + + meta = dict( + zarr_format=cls.ZARR_FORMAT, + shape=meta["shape"] + sdshape, + chunks=meta["chunks"], + dtype=cls.encode_dtype(dtype), + compressor=meta["compressor"], + fill_value=cls.encode_fill_value(meta["fill_value"], dtype, object_codec), + order=meta["order"], + filters=meta["filters"], + ) 
+ if dimension_separator: + meta["dimension_separator"] = dimension_separator + + return json_dumps(meta) + + @classmethod + def encode_dtype(cls, d: np.dtype): + if d.fields is None: + return d.str + else: + return d.descr + + @classmethod + def _decode_dtype_descr(cls, d) -> List[Any]: + # need to convert list of lists to list of tuples + if isinstance(d, list): + # recurse to handle nested structures + d = [(k[0], cls._decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d] + return d + + @classmethod + def decode_dtype(cls, d) -> np.dtype: + d = cls._decode_dtype_descr(d) + return np.dtype(d) + + @classmethod + def decode_group_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + + # check metadata format version + zarr_format = meta.get("zarr_format", None) + if zarr_format != cls.ZARR_FORMAT: + raise MetadataError(f"unsupported zarr format: {zarr_format}") + + meta = dict(zarr_format=zarr_format) + return meta + + # N.B., keep `meta` parameter as a placeholder for future + # noinspection PyUnusedLocal + @classmethod + def encode_group_metadata(cls, meta=None) -> bytes: + meta = dict(zarr_format=cls.ZARR_FORMAT) + return json_dumps(meta) + + @classmethod + def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + # early out + if v is None: + return v + if dtype.kind == "V" and dtype.hasobject: + if object_codec is None: + raise ValueError("missing object_codec for object array") + v = base64.standard_b64decode(v) + v = object_codec.decode(v) + v = np.array(v, dtype=dtype)[()] + return v + if dtype.kind == "f": + if v == "NaN": + return np.nan + elif v == "Infinity": + return np.inf + elif v == "-Infinity": + return -np.inf + else: + return np.array(v, dtype=dtype)[()] + elif dtype.kind in "c": + v = ( + cls.decode_fill_value(v[0], dtype.type().real.dtype), + cls.decode_fill_value(v[1], dtype.type().imag.dtype), + ) + v = v[0] + 1j * v[1] + return np.array(v, dtype=dtype)[()] + elif dtype.kind == "S": + # noinspection PyBroadException + try: + v = base64.standard_b64decode(v) + except Exception: + # be lenient, allow for other values that may have been used before base64 + # encoding and may work as fill values, e.g., the number 0 + pass + v = np.array(v, dtype=dtype)[()] + return v + elif dtype.kind == "V": + v = base64.standard_b64decode(v) + v = np.array(v, dtype=dtype.str).view(dtype)[()] + return v + elif dtype.kind == "U": + # leave as-is + return v + else: + return np.array(v, dtype=dtype)[()] + + @classmethod + def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + # early out + if v is None: + return v + if dtype.kind == "V" and dtype.hasobject: + if object_codec is None: + raise ValueError("missing object_codec for object array") + v = object_codec.encode(v) + v = str(base64.standard_b64encode(v), "ascii") + return v + if dtype.kind == "f": + if np.isnan(v): + return "NaN" + elif np.isposinf(v): + return "Infinity" + elif np.isneginf(v): + return "-Infinity" + else: + return float(v) + elif dtype.kind in "ui": + return int(v) + elif dtype.kind == "b": + return bool(v) + elif dtype.kind in "c": + c = cast(np.complex128, np.dtype(complex).type()) + v = ( + cls.encode_fill_value(v.real, c.real.dtype, object_codec), + cls.encode_fill_value(v.imag, c.imag.dtype, object_codec), + ) + return v + elif dtype.kind in "SV": + v = str(base64.standard_b64encode(v), "ascii") + return v + elif dtype.kind == "U": + return v + elif dtype.kind in "mM": + return int(v.view("i8")) 
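+        # Illustration: datetime64/timedelta64 fill values are stored as the integer count
+        # of the dtype's unit, e.g. np.datetime64("1970-01-02", "D") encodes to 1;
+        # decode_fill_value above recovers the value via np.array(v, dtype=dtype)[()].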
+ else: + return v + + +class Metadata3(Metadata2): + ZARR_FORMAT = ZARR_FORMAT_v3 + + @classmethod + def decode_dtype(cls, d, validate=True): + if isinstance(d, dict): + # extract the type from the extension info + try: + d = d["type"] + except KeyError as e: + raise KeyError("Extended dtype info must provide a key named 'type'.") from e + d = cls._decode_dtype_descr(d) + dtype = np.dtype(d) + if validate: + if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): + # it is a core dtype of the v3 spec + pass + else: + # will raise if this is not a recognized extended dtype + get_extended_dtype_info(dtype) + return dtype + + @classmethod + def encode_dtype(cls, d): + s = d.str + if s == "|b1": + return "bool" + elif s == "|u1": + return "u1" + elif s == "|i1": + return "i1" + elif s in _v3_core_types: + return Metadata2.encode_dtype(d) + else: + # Check if this dtype corresponds to a supported extension to + # the v3 protocol. + return get_extended_dtype_info(np.dtype(d)) + + @classmethod + def decode_group_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + # 1 / 0 + # # check metadata format version + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != cls.ZARR_FORMAT: + # raise MetadataError(f"unsupported zarr format: {zarr_format}") + + assert "attributes" in meta + # meta = dict(attributes=meta['attributes']) + return meta + + # return json.loads(s) + + @classmethod + def encode_group_metadata(cls, meta=None) -> bytes: + # The ZARR_FORMAT should not be in the group metadata, but in the + # entry point metadata instead + # meta = dict(zarr_format=cls.ZARR_FORMAT) + if meta is None: + meta = {"attributes": {}} + meta = dict(attributes=meta.get("attributes", {})) + return json_dumps(meta) + + @classmethod + def encode_hierarchy_metadata(cls, meta=None) -> bytes: + if meta is None: + meta = _default_entry_point_metadata_v3 + elif set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metadata. meta={meta}") + return json_dumps(meta) + + @classmethod + def decode_hierarchy_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + # check metadata format + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": + # raise MetadataError(f"unsupported zarr format: {zarr_format}") + if set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metadata. 
meta={meta}") + return meta + + @classmethod + def _encode_codec_metadata(cls, codec: Codec) -> Optional[Mapping]: + if codec is None: + return None + + # only support gzip for now + config = codec.get_config() + del config["id"] + uri = "https://purl.org/zarr/spec/codec/" + if isinstance(codec, numcodecs.GZip): + uri = uri + "gzip/1.0" + elif isinstance(codec, numcodecs.Zlib): + uri = uri + "zlib/1.0" + elif isinstance(codec, numcodecs.Blosc): + uri = uri + "blosc/1.0" + elif isinstance(codec, numcodecs.BZ2): + uri = uri + "bz2/1.0" + elif isinstance(codec, numcodecs.LZ4): + uri = uri + "lz4/1.0" + elif isinstance(codec, numcodecs.LZMA): + uri = uri + "lzma/1.0" + elif isinstance(codec, numcodecs.Zstd): + uri = uri + "zstd/1.0" + meta = { + "codec": uri, + "configuration": config, + } + return meta + + @classmethod + def _decode_codec_metadata(cls, meta: Optional[Mapping]) -> Optional[Codec]: + if meta is None: + return None + + uri = "https://purl.org/zarr/spec/codec/" + conf = meta["configuration"] + if meta["codec"].startswith(uri + "gzip/"): + conf["id"] = "gzip" + elif meta["codec"].startswith(uri + "zlib/"): + conf["id"] = "zlib" + elif meta["codec"].startswith(uri + "blosc/"): + conf["id"] = "blosc" + elif meta["codec"].startswith(uri + "bz2/"): + conf["id"] = "bz2" + elif meta["codec"].startswith(uri + "lz4/"): + conf["id"] = "lz4" + elif meta["codec"].startswith(uri + "lzma/"): + conf["id"] = "lzma" + elif meta["codec"].startswith(uri + "zstd/"): + conf["id"] = "zstd" + else: + raise NotImplementedError + + codec = numcodecs.get_codec(conf) + + return codec + + @classmethod + def _encode_storage_transformer_metadata( + cls, storage_transformer: "StorageTransformer" + ) -> Optional[Mapping]: + return { + "extension": storage_transformer.extension_uri, + "type": storage_transformer.type, + "configuration": storage_transformer.get_config(), + } + + @classmethod + def _decode_storage_transformer_metadata(cls, meta: Mapping) -> "StorageTransformer": + from zarr.tests.test_storage_v3 import DummyStorageTransfomer + from zarr._storage.v3_storage_transformers import ShardingStorageTransformer + + # This might be changed to a proper registry in the future + KNOWN_STORAGE_TRANSFORMERS = [DummyStorageTransfomer, ShardingStorageTransformer] + + conf = meta.get("configuration", {}) + extension_uri = meta["extension"] + transformer_type = meta["type"] + + for StorageTransformerCls in KNOWN_STORAGE_TRANSFORMERS: + if StorageTransformerCls.extension_uri == extension_uri: + break + else: # pragma: no cover + raise NotImplementedError + + return StorageTransformerCls.from_config(transformer_type, conf) + + @classmethod + def decode_array_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + + # extract array metadata fields + try: + dtype = cls.decode_dtype(meta["data_type"]) + if dtype.hasobject: + import numcodecs + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) + else: + object_codec = None + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) + # TODO: remove dimension_separator? 
+ + compressor = cls._decode_codec_metadata(meta.get("compressor", None)) + storage_transformers = meta.get("storage_transformers", ()) + storage_transformers = [ + cls._decode_storage_transformer_metadata(i) for i in storage_transformers + ] + extensions = meta.get("extensions", []) + meta = dict( + shape=tuple(meta["shape"]), + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=dtype, + fill_value=fill_value, + chunk_memory_layout=meta["chunk_memory_layout"], + attributes=meta["attributes"], + extensions=extensions, + ) + # compressor field should be absent when there is no compression + if compressor: + meta["compressor"] = compressor + if storage_transformers: + meta["storage_transformers"] = storage_transformers + + except Exception as e: + raise MetadataError(f"error decoding metadata: {e}") from e + else: + return meta + + @classmethod + def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: + dtype = meta["data_type"] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype + dimension_separator = meta.get("dimension_separator") + if dtype.hasobject: + import numcodecs + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) + else: + object_codec = None + + compressor = cls._encode_codec_metadata(meta.get("compressor", None)) + storage_transformers = meta.get("storage_transformers", ()) + storage_transformers = [ + cls._encode_storage_transformer_metadata(i) for i in storage_transformers + ] + extensions = meta.get("extensions", []) + meta = dict( + shape=meta["shape"] + sdshape, + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=cls.encode_dtype(dtype), + fill_value=encode_fill_value(meta["fill_value"], dtype, object_codec), + chunk_memory_layout=meta["chunk_memory_layout"], + attributes=meta.get("attributes", {}), + extensions=extensions, + ) + if compressor: + meta["compressor"] = compressor + if dimension_separator: + meta["dimension_separator"] = dimension_separator + if storage_transformers: + meta["storage_transformers"] = storage_transformers + return json_dumps(meta) + + +parse_metadata = Metadata2.parse_metadata +decode_array_metadata = Metadata2.decode_array_metadata +encode_array_metadata = Metadata2.encode_array_metadata +encode_dtype = Metadata2.encode_dtype +_decode_dtype_descr = Metadata2._decode_dtype_descr +decode_dtype = Metadata2.decode_dtype +decode_group_metadata = Metadata2.decode_group_metadata +encode_group_metadata = Metadata2.encode_group_metadata +decode_fill_value = Metadata2.decode_fill_value +encode_fill_value = Metadata2.encode_fill_value diff --git a/src/zarr/v2/meta_v1.py b/src/zarr/v2/meta_v1.py new file mode 100644 index 0000000000..714f55f477 --- /dev/null +++ b/src/zarr/v2/meta_v1.py @@ -0,0 +1,64 @@ +import json + +import numpy as np + +from zarr.errors import MetadataError + + +def decode_metadata(b): + s = str(b, "ascii") + meta = json.loads(s) + zarr_format = meta.get("zarr_format", None) + if zarr_format != 1: + raise MetadataError(f"unsupported zarr format: {zarr_format}") + try: + meta = dict( + zarr_format=meta["zarr_format"], + shape=tuple(meta["shape"]), + chunks=tuple(meta["chunks"]), + dtype=decode_dtype(meta["dtype"]), + compression=meta["compression"], + compression_opts=meta["compression_opts"], + fill_value=meta["fill_value"], + 
order=meta["order"], + ) + except Exception as e: + raise MetadataError(f"error decoding metadata: {e}") from e + else: + return meta + + +def encode_metadata(meta): + meta = dict( + zarr_format=1, + shape=meta["shape"], + chunks=meta["chunks"], + dtype=encode_dtype(meta["dtype"]), + compression=meta["compression"], + compression_opts=meta["compression_opts"], + fill_value=meta["fill_value"], + order=meta["order"], + ) + s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) + b = s.encode("ascii") + return b + + +def encode_dtype(d): + if d.fields is None: + return d.str + else: + return d.descr + + +def _decode_dtype_descr(d): + # need to convert list of lists to list of tuples + if isinstance(d, list): + # recurse to handle nested structures + d = [(f, _decode_dtype_descr(v)) for f, v in d] + return d + + +def decode_dtype(d): + d = _decode_dtype_descr(d) + return np.dtype(d) diff --git a/src/zarr/v2/n5.py b/src/zarr/v2/n5.py new file mode 100644 index 0000000000..3bb7093128 --- /dev/null +++ b/src/zarr/v2/n5.py @@ -0,0 +1,932 @@ +"""This module contains a storage class and codec to support the N5 format. +""" + +import os +import struct +import sys +from typing import Any, Dict, Optional, cast +import warnings + +import numpy as np +from numcodecs.abc import Codec +from numcodecs.compat import ndarray_copy +from numcodecs.registry import get_codec, register_codec + +from .meta import ZARR_FORMAT, json_dumps, json_loads +from .storage import FSStore +from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path +from .storage import array_meta_key as zarr_array_meta_key +from .storage import attrs_key as zarr_attrs_key +from .storage import group_meta_key as zarr_group_meta_key + +N5_FORMAT = "2.0.0" + +zarr_to_n5_keys = [ + ("chunks", "blockSize"), + ("dtype", "dataType"), + ("compressor", "compression"), + ("shape", "dimensions"), +] +n5_attrs_key = "attributes.json" +n5_keywords = ["n5", "dataType", "dimensions", "blockSize", "compression"] + + +class N5Store(NestedDirectoryStore): + """Storage class using directories and files on a standard file system, + following the N5 format (https://github.com/saalfeldlab/n5). + + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.N5Store('data/array.n5') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Store a group:: + + >>> store = zarr.N5Store('data/group.n5') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + Notes + ----- + + This is an experimental feature. + + Safe to write in multiple threads or processes. + + .. deprecated:: 2.18.3 + `N5Store` will be removed in Zarr 3.0.0. 
+ """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + warnings.warn( + "The N5Store is deprecated and will be removed in a Zarr-Python version 3, " + "see https://github.com/zarr-developers/zarr-python/issues/1274 and " + "https://github.com/zarr-developers/n5py for more information.", + FutureWarning, + stacklevel=2, + ) + + def __getitem__(self, key: str) -> bytes: + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) + + return json_dumps(value) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + top_level = key == zarr_array_meta_key + value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) + return json_dumps(value) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + value = attrs_to_zarr(self._load_n5_attrs(key_new)) + + if len(value) == 0: + raise KeyError(key_new) + else: + return json_dumps(value) + + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + + else: + key_new = key + + return super().__getitem__(key_new) + + def __setitem__(self, key: str, value: Any): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**group_metadata_to_n5(json_loads(value))) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + top_level = key == zarr_array_meta_key + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + + n5_attrs = self._load_n5_attrs(key_new) + zarr_attrs = json_loads(value) + + for k in n5_keywords: + if k in zarr_attrs: + warnings.warn( + f"Attribute {k} is a reserved N5 keyword", + UserWarning, + stacklevel=2, + ) + + # remove previous user attributes + for k in list(n5_attrs.keys()): + if k not in n5_keywords: + del n5_attrs[k] + + # add new user attributes + n5_attrs.update(**zarr_attrs) + + value = json_dumps(n5_attrs) + + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + + else: + key_new = key + + super().__setitem__(key_new, value) + + def __delitem__(self, key: str): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + else: + key_new = key + + super().__delitem__(key_new) + + def __contains__(self, key): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, n5_attrs_key) + if key_new not in self: + return False + # group if not a dataset (attributes do not contain 'dimensions') + return "dimensions" not in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, n5_attrs_key) + # array if attributes contain 'dimensions' + return "dimensions" in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, n5_attrs_key) + return self._contains_attrs(key_new) + 
+ elif is_chunk_key(key): + key_new = invert_chunk_coords(key) + else: + key_new = key + + return super().__contains__(key_new) + + def __eq__(self, other): + return isinstance(other, N5Store) and self.path == other.path + + def listdir(self, path: Optional[str] = None): + if path is not None: + path = invert_chunk_coords(path) + path = cast(str, path) + # We can't use NestedDirectoryStore's listdir, as it requires + # array_meta_key to be present in array directories, which this store + # doesn't provide. + children = super().listdir(path=path) + + if self._is_array(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(n5_attrs_key) + children.append(zarr_array_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + # special handling of directories containing an array to map + # inverted nested chunk keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and os.path.isdir(entry_path): + for dir_path, _, file_names in os.walk(entry_path): + for file_name in file_names: + file_path = os.path.join(dir_path, file_name) + rel_path = file_path.split(root_path + os.path.sep)[1] + new_child = rel_path.replace(os.path.sep, ".") + new_children.append(invert_chunk_coords(new_child)) + else: + new_children.append(entry) + + return sorted(new_children) + + elif self._is_group(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(n5_attrs_key) + children.append(zarr_group_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + return sorted(children) + + else: + return children + + def _load_n5_attrs(self, path: str) -> Dict[str, Any]: + try: + s = super().__getitem__(path) + return json_loads(s) + except KeyError: + return {} + + def _is_group(self, path: str): + if path is None: + attrs_key = n5_attrs_key + else: + attrs_key = os.path.join(path, n5_attrs_key) + + n5_attrs = self._load_n5_attrs(attrs_key) + return len(n5_attrs) > 0 and "dimensions" not in n5_attrs + + def _is_array(self, path: str): + if path is None: + attrs_key = n5_attrs_key + else: + attrs_key = os.path.join(path, n5_attrs_key) + + return "dimensions" in self._load_n5_attrs(attrs_key) + + def _contains_attrs(self, path: str): + if path is None: + attrs_key = n5_attrs_key + else: + if not path.endswith(n5_attrs_key): + attrs_key = os.path.join(path, n5_attrs_key) + else: + attrs_key = path + + attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) + return len(attrs) > 0 + + +class N5FSStore(FSStore): + """Implementation of the N5 format (https://github.com/saalfeldlab/n5) + using `fsspec`, which allows storage on a variety of filesystems. Based + on `zarr.N5Store`. + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] 
= 42 + + Store a group:: + + >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + Notes + ----- + This is an experimental feature. + Safe to write in multiple threads or processes. + + Be advised that the `_dimension_separator` property of this store + (and arrays it creates) is ".", but chunks saved by this store will + in fact be "/" separated, as proscribed by the N5 format. + + This is counter-intuitive (to say the least), but not arbitrary. + Chunks in N5 format are stored with reversed dimension order + relative to Zarr chunks: a chunk of a 3D Zarr array would be stored + on a file system as `/0/1/2`, but in N5 the same chunk would be + stored as `/2/1/0`. Therefore, stores targeting N5 must intercept + chunk keys and flip the order of the dimensions before writing to + storage, and this procedure requires chunk keys with "." separated + dimensions, hence the Zarr arrays targeting N5 have the deceptive + "." dimension separator. + + .. deprecated:: 2.18.3 + `N5FSStore` will be removed in Zarr 3.0.0. + """ + + _array_meta_key = "attributes.json" + _group_meta_key = "attributes.json" + _attrs_key = "attributes.json" + + def __init__(self, *args, **kwargs): + warnings.warn( + "The N5FSStore is deprecated and will be removed in a Zarr-Python version 3, " + "see https://github.com/zarr-developers/zarr-python/issues/1274 and " + "https://github.com/zarr-developers/n5py for more information.", + FutureWarning, + stacklevel=2, + ) + if "dimension_separator" in kwargs: + kwargs.pop("dimension_separator") + warnings.warn( + "Keyword argument `dimension_separator` will be ignored", + stacklevel=2, + ) + dimension_separator = "." 
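+        # The separator is pinned to "." so that _normalize_key/_swap_separator below can
+        # rewrite "."-separated chunk keys into the reversed "/"-separated layout N5 uses
+        # on disk, e.g. "foo/1.2.3" is written to "foo/3/2/1" (see the class docstring).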
+ super().__init__(*args, dimension_separator=dimension_separator, **kwargs) + + @staticmethod + def _swap_separator(key: str): + segments = list(key.split("/")) + if segments: + last_segment = segments[-1] + if _prog_ckey.match(last_segment): + coords = list(last_segment.split(".")) + last_segment = "/".join(coords[::-1]) + segments = segments[:-1] + [last_segment] + key = "/".join(segments) + return key + + def _normalize_key(self, key: str): + if is_chunk_key(key): + key = invert_chunk_coords(key) + + key = normalize_storage_path(key).lstrip("/") + if key: + *bits, end = key.split("/") + + if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): + end = end.replace(".", "/") + key = "/".join(bits + [end]) + return key.lower() if self.normalize_keys else key + + def __getitem__(self, key: str) -> bytes: + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) + + return json_dumps(value) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + top_level = key == zarr_array_meta_key + value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) + return json_dumps(value) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + value = attrs_to_zarr(self._load_n5_attrs(key_new)) + + if len(value) == 0: + raise KeyError(key_new) + else: + return json_dumps(value) + + elif is_chunk_key(key): + key_new = self._swap_separator(key) + + else: + key_new = key + + return super().__getitem__(key_new) + + def __setitem__(self, key: str, value: Any): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**group_metadata_to_n5(json_loads(value))) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + top_level = key == zarr_array_meta_key + n5_attrs = self._load_n5_attrs(key_new) + n5_attrs.update(**array_metadata_to_n5(json_loads(value), top_level=top_level)) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + + n5_attrs = self._load_n5_attrs(key_new) + zarr_attrs = json_loads(value) + + for k in n5_keywords: + if k in zarr_attrs.keys(): + warnings.warn( + f"Attribute {k} is a reserved N5 keyword", + UserWarning, + stacklevel=2, + ) + + # replace previous user attributes + for k in list(n5_attrs.keys()): + if k not in n5_keywords: + del n5_attrs[k] + + # add new user attributes + n5_attrs.update(**zarr_attrs) + + value = json_dumps(n5_attrs) + + elif is_chunk_key(key): + key_new = self._swap_separator(key) + + else: + key_new = key + + super().__setitem__(key_new, value) + + def __delitem__(self, key: str): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + elif is_chunk_key(key): + key_new = self._swap_separator(key) + else: + key_new = key + super().__delitem__(key_new) + + def __contains__(self, key: Any): + if key.endswith(zarr_group_meta_key): + key_new = key.replace(zarr_group_meta_key, self._group_meta_key) + if 
key_new not in self: + return False + # group if not a dataset (attributes do not contain 'dimensions') + return "dimensions" not in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_array_meta_key): + key_new = key.replace(zarr_array_meta_key, self._array_meta_key) + # array if attributes contain 'dimensions' + return "dimensions" in self._load_n5_attrs(key_new) + + elif key.endswith(zarr_attrs_key): + key_new = key.replace(zarr_attrs_key, self._attrs_key) + return self._contains_attrs(key_new) + + elif is_chunk_key(key): + key_new = self._swap_separator(key) + + else: + key_new = key + return super().__contains__(key_new) + + def __eq__(self, other: Any): + return isinstance(other, N5FSStore) and self.path == other.path + + def listdir(self, path: Optional[str] = None): + if path is not None: + path = invert_chunk_coords(path) + + # We can't use NestedDirectoryStore's listdir, as it requires + # array_meta_key to be present in array directories, which this store + # doesn't provide. + children = super().listdir(path=path) + if self._is_array(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(self._array_meta_key) + children.append(zarr_array_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + # special handling of directories containing an array to map + # inverted nested chunk keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and self.fs.isdir(entry_path): + for file_name in self.fs.find(entry_path): + file_path = os.path.join(root_path, file_name) + rel_path = file_path.split(root_path)[1] + new_child = rel_path.lstrip("/").replace("/", ".") + new_children.append(invert_chunk_coords(new_child)) + else: + new_children.append(entry) + return sorted(new_children) + + elif self._is_group(path): + # replace n5 attribute file with respective zarr attribute files + children.remove(self._group_meta_key) + children.append(zarr_group_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + return sorted(children) + else: + return children + + def _load_n5_attrs(self, path: str): + try: + s = super().__getitem__(path) + return json_loads(s) + except KeyError: + return {} + + def _is_group(self, path: Optional[str]): + if path is None: + attrs_key = self._attrs_key + else: + attrs_key = os.path.join(path, self._attrs_key) + + n5_attrs = self._load_n5_attrs(attrs_key) + return len(n5_attrs) > 0 and "dimensions" not in n5_attrs + + def _is_array(self, path: Optional[str]): + if path is None: + attrs_key = self._attrs_key + else: + attrs_key = os.path.join(path, self._attrs_key) + + return "dimensions" in self._load_n5_attrs(attrs_key) + + def _contains_attrs(self, path: Optional[str]): + if path is None: + attrs_key = self._attrs_key + else: + if not path.endswith(self._attrs_key): + attrs_key = os.path.join(path, self._attrs_key) + else: + attrs_key = path + + attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) + return len(attrs) > 0 + + +def is_chunk_key(key: str): + rv = False + segments = list(key.split("/")) + if segments: + last_segment = segments[-1] + rv = bool(_prog_ckey.match(last_segment)) + return rv + + +def invert_chunk_coords(key: str): + segments = list(key.split("/")) + if segments: + last_segment = segments[-1] + if _prog_ckey.match(last_segment): + coords = list(last_segment.split(".")) + last_segment = "/".join(coords[::-1]) + segments = 
segments[:-1] + [last_segment] + key = "/".join(segments) + return key + + +def group_metadata_to_n5(group_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Convert group metadata from zarr to N5 format.""" + del group_metadata["zarr_format"] + # TODO: This should only exist at the top-level + group_metadata["n5"] = N5_FORMAT + return group_metadata + + +def group_metadata_to_zarr(group_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Convert group metadata from N5 to zarr format.""" + # This only exists at the top level + group_metadata.pop("n5", None) + group_metadata["zarr_format"] = ZARR_FORMAT + return group_metadata + + +def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dict[str, Any]: + """Convert array metadata from zarr to N5 format. If the `top_level` keyword argument is True, + then the `N5` : N5_FORMAT key : value pair will be inserted into the metadata.""" + + for f, t in zarr_to_n5_keys: + array_metadata[t] = array_metadata.pop(f) + del array_metadata["zarr_format"] + if top_level: + array_metadata["n5"] = N5_FORMAT + try: + dtype = np.dtype(array_metadata["dataType"]) + except TypeError as e: + raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") from e + + array_metadata["dataType"] = dtype.name + array_metadata["dimensions"] = array_metadata["dimensions"][::-1] + array_metadata["blockSize"] = array_metadata["blockSize"][::-1] + + if "fill_value" in array_metadata: + if array_metadata["fill_value"] != 0 and array_metadata["fill_value"] is not None: + raise ValueError( + f"""Received fill_value = {array_metadata['fill_value']}, + but N5 only supports fill_value = 0""" + ) + del array_metadata["fill_value"] + + if "order" in array_metadata: + if array_metadata["order"] != "C": + raise ValueError( + f"Received order = {array_metadata['order']}, but N5 only supports order = C" + ) + del array_metadata["order"] + + if "filters" in array_metadata: + if array_metadata["filters"] != [] and array_metadata["filters"] is not None: + raise ValueError("Received filters, but N5 storage does not support zarr filters") + del array_metadata["filters"] + + assert "compression" in array_metadata + compressor_config = array_metadata["compression"] + compressor_config = compressor_config_to_n5(compressor_config) + array_metadata["compression"] = compressor_config + + if "dimension_separator" in array_metadata: + del array_metadata["dimension_separator"] + + return array_metadata + + +def array_metadata_to_zarr( + array_metadata: Dict[str, Any], top_level: bool = False +) -> Dict[str, Any]: + """Convert array metadata from N5 to zarr format. + If the `top_level` keyword argument is True, then the `N5` key will be removed from metadata""" + for t, f in zarr_to_n5_keys: + array_metadata[t] = array_metadata.pop(f) + if top_level: + array_metadata.pop("n5") + array_metadata["zarr_format"] = ZARR_FORMAT + + array_metadata["shape"] = array_metadata["shape"][::-1] + array_metadata["chunks"] = array_metadata["chunks"][::-1] + array_metadata["fill_value"] = 0 # also if None was requested + array_metadata["order"] = "C" + array_metadata["filters"] = [] + array_metadata["dimension_separator"] = "." 
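+    # Illustration (values hypothetical): an N5 attributes.json with dimensions=[100, 200],
+    # blockSize=[10, 20] and dataType="uint8" comes back as zarr metadata with
+    # shape=[200, 100], chunks=[20, 10], dtype="|u1", order="C", fill_value=0 and,
+    # below, a compressor config wrapped in the "n5_wrapper" codec.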
+ array_metadata["dtype"] = np.dtype(array_metadata["dtype"]).str + + compressor_config = array_metadata["compressor"] + compressor_config = compressor_config_to_zarr(compressor_config) + array_metadata["compressor"] = { + "id": N5ChunkWrapper.codec_id, + "compressor_config": compressor_config, + "dtype": array_metadata["dtype"], + "chunk_shape": array_metadata["chunks"], + } + + return array_metadata + + +def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: + """Get all zarr attributes from an N5 attributes dictionary (i.e., + all non-keyword attributes).""" + + # remove all N5 keywords + for n5_key in n5_keywords: + if n5_key in attrs: + del attrs[n5_key] + + return attrs + + +def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + if compressor_config is None: + return {"type": "raw"} + else: + _compressor_config = compressor_config + + # peel wrapper, if present + if _compressor_config["id"] == N5ChunkWrapper.codec_id: + _compressor_config = _compressor_config["compressor_config"] + + codec_id = _compressor_config["id"] + n5_config = {"type": codec_id} + + if codec_id == "bz2": + n5_config["type"] = "bzip2" + n5_config["blockSize"] = _compressor_config["level"] + + elif codec_id == "blosc": + n5_config["cname"] = _compressor_config["cname"] + n5_config["clevel"] = _compressor_config["clevel"] + n5_config["shuffle"] = _compressor_config["shuffle"] + n5_config["blocksize"] = _compressor_config["blocksize"] + + elif codec_id == "lzma": + # Switch to XZ for N5 if we are using the default XZ format. + # Note: 4 is the default, which is lzma.CHECK_CRC64. + if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: + n5_config["type"] = "xz" + else: + warnings.warn( + "Not all N5 implementations support lzma compression (yet). You " + "might not be able to open the dataset with another N5 library.", + RuntimeWarning, + stacklevel=2, + ) + n5_config["format"] = _compressor_config["format"] + n5_config["check"] = _compressor_config["check"] + n5_config["filters"] = _compressor_config["filters"] + + # The default is lzma.PRESET_DEFAULT, which is 6. 
+ if _compressor_config["preset"]: + n5_config["preset"] = _compressor_config["preset"] + else: + n5_config["preset"] = 6 + + elif codec_id == "zlib": + n5_config["type"] = "gzip" + n5_config["level"] = _compressor_config["level"] + n5_config["useZlib"] = True + + elif codec_id == "gzip": + n5_config["type"] = "gzip" + n5_config["level"] = _compressor_config["level"] + n5_config["useZlib"] = False + + else: + n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) + + return n5_config + + +def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: + codec_id = compressor_config["type"] + zarr_config = {"id": codec_id} + + if codec_id == "bzip2": + zarr_config["id"] = "bz2" + zarr_config["level"] = compressor_config["blockSize"] + + elif codec_id == "blosc": + zarr_config["cname"] = compressor_config["cname"] + zarr_config["clevel"] = compressor_config["clevel"] + zarr_config["shuffle"] = compressor_config["shuffle"] + zarr_config["blocksize"] = compressor_config["blocksize"] + + elif codec_id == "lzma": + zarr_config["format"] = compressor_config["format"] + zarr_config["check"] = compressor_config["check"] + zarr_config["preset"] = compressor_config["preset"] + zarr_config["filters"] = compressor_config["filters"] + + elif codec_id == "xz": + zarr_config["id"] = "lzma" + zarr_config["format"] = 1 # lzma.FORMAT_XZ + zarr_config["check"] = -1 + zarr_config["preset"] = compressor_config["preset"] + zarr_config["filters"] = None + + elif codec_id == "gzip": + if "useZlib" in compressor_config and compressor_config["useZlib"]: + zarr_config["id"] = "zlib" + zarr_config["level"] = compressor_config["level"] + else: + zarr_config["id"] = "gzip" + zarr_config["level"] = compressor_config["level"] + + elif codec_id == "raw": + return None + + else: + zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) + + return zarr_config + + +class N5ChunkWrapper(Codec): + codec_id = "n5_wrapper" + + def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): + self.dtype = np.dtype(dtype) + self.chunk_shape = tuple(chunk_shape) + # is the dtype a little endian format? 
+ self._little_endian = self.dtype.byteorder == "<" or ( + self.dtype.byteorder == "=" and sys.byteorder == "little" + ) + + if compressor: + if compressor_config is not None: + raise ValueError("Only one of compressor_config or compressor should be given.") + compressor_config = compressor.get_config() + + if compressor_config is None and compressor is None or compressor_config["id"] == "raw": + self.compressor_config = None + self._compressor = None + else: + self._compressor = get_codec(compressor_config) + self.compressor_config = self._compressor.get_config() + + def get_config(self): + config = {"id": self.codec_id, "compressor_config": self.compressor_config} + return config + + def encode(self, chunk): + assert chunk.flags.c_contiguous + + header = self._create_header(chunk) + chunk = self._to_big_endian(chunk) + + if self._compressor: + return header + self._compressor.encode(chunk) + else: + return header + chunk.tobytes(order="A") + + def decode(self, chunk, out=None) -> bytes: + len_header, chunk_shape = self._read_header(chunk) + chunk = chunk[len_header:] + + if out is not None: + # out should only be used if we read a complete chunk + assert ( + chunk_shape == self.chunk_shape + ), f"Expected chunk of shape {self.chunk_shape}, found {chunk_shape}" + + if self._compressor: + self._compressor.decode(chunk, out) + else: + ndarray_copy(chunk, out) + + # we can byteswap in-place + if self._little_endian: + out.byteswap(True) + + return out + + else: + if self._compressor: + chunk = self._compressor.decode(chunk) + + # more expensive byteswap + chunk = self._from_big_endian(chunk) + + # read partial chunk + if chunk_shape != self.chunk_shape: + chunk = np.frombuffer(chunk, dtype=self.dtype) + chunk = chunk.reshape(chunk_shape) + complete_chunk = np.zeros(self.chunk_shape, dtype=self.dtype) + target_slices = tuple(slice(0, s) for s in chunk_shape) + complete_chunk[target_slices] = chunk + chunk = complete_chunk + + return chunk + + @staticmethod + def _create_header(chunk): + mode = struct.pack(">H", 0) + num_dims = struct.pack(">H", len(chunk.shape)) + shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) + + return mode + num_dims + shape + + @staticmethod + def _read_header(chunk): + num_dims = struct.unpack(">H", chunk[2:4])[0] + shape = tuple( + struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) + )[::-1] + + len_header = 4 + num_dims * 4 + + return len_header, shape + + def _to_big_endian(self, data): + # assumes data is ndarray + + if self._little_endian: + return data.byteswap() + return data + + def _from_big_endian(self, data): + # assumes data is byte array in big endian + + if not self._little_endian: + return data + + a = np.frombuffer(data, self.dtype.newbyteorder(">")) + return a.astype(self.dtype) + + +register_codec(N5ChunkWrapper, N5ChunkWrapper.codec_id) diff --git a/src/zarr/v2/storage.py b/src/zarr/v2/storage.py new file mode 100644 index 0000000000..f9f6dbe0a6 --- /dev/null +++ b/src/zarr/v2/storage.py @@ -0,0 +1,3079 @@ +"""This module contains storage classes for use with Zarr arrays and groups. + +Note that any object implementing the :class:`MutableMapping` interface from the +:mod:`collections` module in the Python standard library can be used as a Zarr +array store, as long as it accepts string (str) keys and bytes values. 
+ +In addition to the :class:`MutableMapping` interface, store classes may also implement +optional methods `listdir` (list members of a "directory") and `rmdir` (remove all +members of a "directory"). These methods should be implemented if the store class is +aware of the hierarchical organisation of resources within the store and can provide +efficient implementations. If these methods are not available, Zarr will fall back to +slower implementations that work via the :class:`MutableMapping` interface. Store +classes may also optionally implement a `rename` method (rename all members under a given +path) and a `getsize` method (return the size in bytes of a given value). + +""" + +import atexit +import errno +import glob +import multiprocessing +import operator +import os +import re +import shutil +import sys +import tempfile +import warnings +import zipfile +from collections import OrderedDict +from collections.abc import MutableMapping +from functools import lru_cache +from os import scandir +from pickle import PicklingError +from threading import Lock, RLock +from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any +import uuid +import time + +from numcodecs.abc import Codec +from numcodecs.compat import ensure_bytes, ensure_text, ensure_contiguous_ndarray_like +from numcodecs.registry import codec_registry +from zarr.context import Context +from zarr.types import PathLike as Path, DIMENSION_SEPARATOR +from zarr.util import NoLock + +from zarr.errors import ( + MetadataError, + BadCompressorError, + ContainsArrayError, + ContainsGroupError, + FSPathExistNotDir, + ReadOnlyError, +) +from zarr.meta import encode_array_metadata, encode_group_metadata +from zarr.util import ( + buffer_size, + json_loads, + nolock, + normalize_chunks, + normalize_dimension_separator, + normalize_dtype, + normalize_fill_value, + normalize_order, + normalize_shape, + normalize_storage_path, + retry_call, + ensure_contiguous_ndarray_or_bytes, +) + +from zarr._storage.absstore import ABSStore # noqa: F401 +from zarr._storage.store import ( # noqa: F401 + _get_hierarchy_metadata, + _get_metadata_suffix, + _listdir_from_keys, + _rename_from_keys, + _rename_metadata_v3, + _rmdir_from_keys, + _rmdir_from_keys_v3, + _path_to_prefix, + _prefix_to_array_key, + _prefix_to_group_key, + array_meta_key, + attrs_key, + data_root, + group_meta_key, + meta_root, + DEFAULT_ZARR_VERSION, + BaseStore, + Store, + V3_DEPRECATION_MESSAGE, +) + +__doctest_requires__ = { + ("RedisStore", "RedisStore.*"): ["redis"], + ("MongoDBStore", "MongoDBStore.*"): ["pymongo"], + ("LRUStoreCache", "LRUStoreCache.*"): ["s3fs"], +} + + +try: + # noinspection PyUnresolvedReferences + from zarr.codecs import Blosc + + default_compressor = Blosc() +except ImportError: # pragma: no cover + from zarr.codecs import Zlib + + default_compressor = Zlib() + + +# allow MutableMapping for backwards compatibility +StoreLike = Union[BaseStore, MutableMapping] + + +def contains_array(store: StoreLike, path: Path = None) -> bool: + """Return True if the store contains an array at the given logical path.""" + path = normalize_storage_path(path) + prefix = _path_to_prefix(path) + key = _prefix_to_array_key(store, prefix) + return key in store + + +def contains_group(store: StoreLike, path: Path = None, explicit_only=True) -> bool: + """Return True if the store contains a group at the given logical path.""" + path = normalize_storage_path(path) + prefix = _path_to_prefix(path) + key = _prefix_to_group_key(store, prefix) + store_version = 
getattr(store, "_store_version", 2) + if store_version == 2 or explicit_only: + return key in store + else: + if key in store: + return True + # for v3, need to also handle implicit groups + + sfx = _get_metadata_suffix(store) # type: ignore + implicit_prefix = key.replace(".group" + sfx, "") + if not implicit_prefix.endswith("/"): + implicit_prefix += "/" + if store.list_prefix(implicit_prefix): # type: ignore + return True + return False + + +def _normalize_store_arg_v2(store: Any, storage_options=None, mode="r") -> BaseStore: + # default to v2 store for backward compatibility + zarr_version = getattr(store, "_store_version", 2) + if zarr_version != 2: + raise ValueError("store must be a version 2 store") + if store is None: + store = KVStore(dict()) + return store + if isinstance(store, os.PathLike): + store = os.fspath(store) + if FSStore._fsspec_installed(): + import fsspec + + if isinstance(store, fsspec.FSMap): + return FSStore( + store.root, + fs=store.fs, + mode=mode, + check=store.check, + create=store.create, + missing_exceptions=store.missing_exceptions, + **(storage_options or {}), + ) + if isinstance(store, str): + if "://" in store or "::" in store: + return FSStore(store, mode=mode, **(storage_options or {})) + elif storage_options: + raise ValueError("storage_options passed with non-fsspec path") + if store.endswith(".zip"): + return ZipStore(store, mode=mode) + elif store.endswith(".n5"): + from zarr.n5 import N5Store + + return N5Store(store) + else: + return DirectoryStore(store) + else: + store = Store._ensure_store(store) + return store + + +def normalize_store_arg( + store: Any, storage_options=None, mode="r", *, zarr_version=None +) -> BaseStore: + if zarr_version is None: + # default to v2 store for backward compatibility + zarr_version = getattr(store, "_store_version", DEFAULT_ZARR_VERSION) + if zarr_version == 2: + normalize_store = _normalize_store_arg_v2 + elif zarr_version == 3: + from zarr._storage.v3 import _normalize_store_arg_v3 + + normalize_store = _normalize_store_arg_v3 + else: + raise ValueError("zarr_version must be either 2 or 3") + return normalize_store(store, storage_options, mode) + + +def rmdir(store: StoreLike, path: Path = None): + """Remove all items under the given path. If `store` provides a `rmdir` method, + this will be called, otherwise will fall back to implementation via the + `Store` interface.""" + path = normalize_storage_path(path) + store_version = getattr(store, "_store_version", 2) + if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore + # pass through + store.rmdir(path) + else: + # slow version, delete one key at a time + if store_version == 2: + _rmdir_from_keys(store, path) + else: + _rmdir_from_keys_v3(store, path) # type: ignore + + +def rename(store: Store, src_path: Path, dst_path: Path): + """Rename all items under the given path. If `store` provides a `rename` method, + this will be called, otherwise will fall back to implementation via the + `Store` interface.""" + src_path = normalize_storage_path(src_path) + dst_path = normalize_storage_path(dst_path) + if hasattr(store, "rename"): + # pass through + store.rename(src_path, dst_path) + else: + # slow version, delete one key at a time + _rename_from_keys(store, src_path, dst_path) + + +def listdir(store: BaseStore, path: Path = None): + """Obtain a directory listing for the given path. 
If `store` provides a `listdir` + method, this will be called, otherwise will fall back to implementation via the + `MutableMapping` interface.""" + path = normalize_storage_path(path) + if hasattr(store, "listdir"): + # pass through + return store.listdir(path) + else: + # slow version, iterate through all keys + warnings.warn( + f"Store {store} has no `listdir` method. From zarr 2.9 onwards " + "may want to inherit from `Store`.", + stacklevel=2, + ) + return _listdir_from_keys(store, path) + + +def _getsize(store: BaseStore, path: Path = None) -> int: + # compute from size of values + if path and path in store: + v = store[path] + size = buffer_size(v) + else: + path = "" if path is None else normalize_storage_path(path) + size = 0 + store_version = getattr(store, "_store_version", 2) + if store_version == 3: + if path == "": + # have to list the root folders without trailing / in this case + members = store.list_prefix(data_root.rstrip("/")) # type: ignore + members += store.list_prefix(meta_root.rstrip("/")) # type: ignore + else: + members = store.list_prefix(data_root + path) # type: ignore + members += store.list_prefix(meta_root + path) # type: ignore + # also include zarr.json? + # members += ['zarr.json'] + else: + members = listdir(store, path) + prefix = _path_to_prefix(path) + members = [prefix + k for k in members] + for k in members: + try: + v = store[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + +def getsize(store: BaseStore, path: Path = None) -> int: + """Compute size of stored items for a given path. If `store` provides a `getsize` + method, this will be called, otherwise will return -1.""" + if hasattr(store, "getsize"): + # pass through + path = normalize_storage_path(path) + return store.getsize(path) + elif isinstance(store, MutableMapping): + return _getsize(store, path) + else: + return -1 + + +def _require_parent_group( + path: Optional[str], + store: StoreLike, + chunk_store: Optional[StoreLike], + overwrite: bool, +): + # assume path is normalized + if path: + segments = path.split("/") + for i in range(len(segments)): + p = "/".join(segments[:i]) + if contains_array(store, p): + _init_group_metadata(store, path=p, chunk_store=chunk_store, overwrite=overwrite) + elif not contains_group(store, p): + _init_group_metadata(store, path=p, chunk_store=chunk_store) + + +def init_array( + store: StoreLike, + shape: Union[int, Tuple[int, ...]], + chunks: Union[bool, int, Tuple[int, ...]] = True, + dtype=None, + compressor="default", + fill_value=None, + order: str = "C", + overwrite: bool = False, + path: Optional[Path] = None, + chunk_store: Optional[StoreLike] = None, + filters=None, + object_codec=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + storage_transformers=(), +): + """Initialize an array store with the given configuration. Note that this is a low-level + function and there should be no need to call this directly from user code. + + Parameters + ---------- + store : Store + A mapping that supports string keys and bytes-like values. + shape : int or tuple of ints + Array shape. + chunks : bool, int or tuple of ints, optional + Chunk shape. If True, will be guessed from `shape` and `dtype`. If + False, will be set to `shape`, i.e., single chunk for the whole array. + dtype : string or dtype, optional + NumPy dtype. + compressor : Codec, optional + Primary compressor. + fill_value : object + Default value to use for uninitialized portions of the array. 
+    order : {'C', 'F'}, optional
+        Memory layout to be used within each chunk.
+    overwrite : bool, optional
+        If True, erase all data in `store` prior to initialisation.
+    path : string, bytes, optional
+        Path under which array is stored.
+    chunk_store : Store, optional
+        Separate storage for chunks. If not provided, `store` will be used
+        for storage of both chunks and metadata.
+    filters : sequence, optional
+        Sequence of filters to use to encode chunk data prior to compression.
+    object_codec : Codec, optional
+        A codec to encode object arrays, only needed if dtype=object.
+    dimension_separator : {'.', '/'}, optional
+        Separator placed between the dimensions of a chunk.
+
+    Examples
+    --------
+    Initialize an array store::
+
+        >>> from zarr.storage import init_array, KVStore
+        >>> store = KVStore(dict())
+        >>> init_array(store, shape=(10000, 10000), chunks=(1000, 1000))
+        >>> sorted(store.keys())
+        ['.zarray']
+
+    Array metadata is stored as JSON::
+
+        >>> print(store['.zarray'].decode())
+        {
+            "chunks": [
+                1000,
+                1000
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "<f8",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                10000,
+                10000
+            ],
+            "zarr_format": 2
+        }
+
+    Initialize an array using a storage path::
+
+        >>> store = KVStore(dict())
+        >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', path='foo')
+        >>> sorted(store.keys())
+        ['.zgroup', 'foo/.zarray']
+        >>> print(store['foo/.zarray'].decode())
+        {
+            "chunks": [
+                1000000
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "|i1",
+            "fill_value": null,
+            "filters": null,
+            "order": "C",
+            "shape": [
+                100000000
+            ],
+            "zarr_format": 2
+        }
+
+    Notes
+    -----
+    The initialisation process involves normalising all array metadata, encoding
+    as JSON and storing under the '.zarray' key.
+ + """ + + # normalize path + path = normalize_storage_path(path) + + # ensure parent group initialized + store_version = getattr(store, "_store_version", 2) + if store_version < 3: + _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) + + if store_version == 3 and "zarr.json" not in store: + # initialize with default zarr.json entry level metadata + store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore + + if not compressor: + # compatibility with legacy tests using compressor=[] + compressor = None + _init_array_metadata( + store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + order=order, + overwrite=overwrite, + path=path, + chunk_store=chunk_store, + filters=filters, + object_codec=object_codec, + dimension_separator=dimension_separator, + storage_transformers=storage_transformers, + ) + + +def _init_array_metadata( + store: StoreLike, + shape, + chunks=None, + dtype=None, + compressor="default", + fill_value=None, + order="C", + overwrite=False, + path: Optional[str] = None, + chunk_store: Optional[StoreLike] = None, + filters=None, + object_codec=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + storage_transformers=(), +): + store_version = getattr(store, "_store_version", 2) + + path = normalize_storage_path(path) + + # guard conditions + if overwrite: + if store_version == 2: + # attempt to delete any pre-existing array in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = data_root + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if "/" in path: + # path is a subfolder of an existing array, remove that array + parent_path = "/".join(path.split("/")[:-1]) + sfx = _get_metadata_suffix(store) # type: ignore + array_key = meta_root + parent_path + ".array" + sfx + if array_key in store: + store.erase(array_key) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path, explicit_only=False): + raise ContainsGroupError(path) + elif store_version == 3: + if "/" in path: + # cannot create an array within an existing array path + parent_path = "/".join(path.split("/")[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) + + # normalize metadata + dtype, object_codec = normalize_dtype(dtype, object_codec) + shape = normalize_shape(shape) + dtype.shape + dtype = dtype.base + chunks = normalize_chunks(chunks, shape, dtype.itemsize) + order = normalize_order(order) + fill_value = normalize_fill_value(fill_value, dtype) + + # optional array metadata + if dimension_separator is None and store_version == 2: + dimension_separator = getattr(store, "_dimension_separator", None) + dimension_separator = normalize_dimension_separator(dimension_separator) + + # compressor prep + if shape == (): + # no point in compressing a 0-dimensional array, only a single value + compressor = None + elif compressor == "none": + # compatibility + compressor = None 
+ elif compressor == "default": + compressor = default_compressor + + # obtain compressor config + compressor_config = None + if compressor: + if store_version == 2: + try: + compressor_config = compressor.get_config() + except AttributeError as e: + raise BadCompressorError(compressor) from e + elif not isinstance(compressor, Codec): + raise ValueError("expected a numcodecs Codec for compressor") + # TODO: alternatively, could autoconvert str to a Codec + # e.g. 'zlib' -> numcodec.Zlib object + # compressor = numcodecs.get_codec({'id': compressor}) + + # obtain filters config + if filters: + # TODO: filters was removed from the metadata in v3 + # raise error here if store_version > 2? + filters_config = [f.get_config() for f in filters] + else: + filters_config = [] + + # deal with object encoding + if dtype.hasobject: + if object_codec is None: + if not filters: + # there are no filters so we can be sure there is no object codec + raise ValueError("missing object_codec for object array") + else: + # one of the filters may be an object codec, issue a warning rather + # than raise an error to maintain backwards-compatibility + warnings.warn( + "missing object_codec for object array; this will raise a " + "ValueError in version 3.0", + FutureWarning, + stacklevel=2, + ) + else: + filters_config.insert(0, object_codec.get_config()) + elif object_codec is not None: + warnings.warn( + "an object_codec is only needed for object arrays", + stacklevel=2, + ) + + # use null to indicate no filters + if not filters_config: + filters_config = None # type: ignore + + # initialize metadata + # TODO: don't store redundant dimension_separator for v3? + _compressor = compressor_config if store_version == 2 else compressor + meta = dict( + shape=shape, + compressor=_compressor, + fill_value=fill_value, + dimension_separator=dimension_separator, + ) + if store_version < 3: + meta.update(dict(chunks=chunks, dtype=dtype, order=order, filters=filters_config)) + assert not storage_transformers + else: + if dimension_separator is None: + dimension_separator = "/" + if filters_config: + attributes = {"filters": filters_config} + else: + attributes = {} + meta.update( + dict( + chunk_grid=dict(type="regular", chunk_shape=chunks, separator=dimension_separator), + chunk_memory_layout=order, + data_type=dtype, + attributes=attributes, + storage_transformers=storage_transformers, + ) + ) + + key = _prefix_to_array_key(store, _path_to_prefix(path)) + if hasattr(store, "_metadata_class"): + store[key] = store._metadata_class.encode_array_metadata(meta) + else: + store[key] = encode_array_metadata(meta) + + +# backwards compatibility +init_store = init_array + + +def init_group( + store: StoreLike, + overwrite: bool = False, + path: Path = None, + chunk_store: Optional[StoreLike] = None, +): + """Initialize a group store. Note that this is a low-level function and there should be no + need to call this directly from user code. + + Parameters + ---------- + store : Store + A mapping that supports string keys and byte sequence values. + overwrite : bool, optional + If True, erase all data in `store` prior to initialisation. + path : string, optional + Path under which array is stored. + chunk_store : Store, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. 
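A minimal sketch of calling ``init_group`` directly (a usage illustration only; it assumes the ``KVStore``, ``init_group`` and ``contains_group`` names exported from ``zarr.storage``, and relies on the fact that, for a v2 store, parent groups are initialised automatically via ``_require_parent_group``)::

    >>> from zarr.storage import KVStore, init_group, contains_group
    >>> store = KVStore(dict())
    >>> init_group(store, path="foo/bar")
    >>> contains_group(store, "foo/bar")
    True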
+ + """ + + # normalize path + path = normalize_storage_path(path) + + store_version = getattr(store, "_store_version", 2) + if store_version < 3: + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) + + if store_version == 3 and "zarr.json" not in store: + # initialize with default zarr.json entry level metadata + store["zarr.json"] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore + + # initialise metadata + _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store) + + if store_version == 3: + # TODO: Should initializing a v3 group also create a corresponding + # empty folder under data/root/? I think probably not until there + # is actual data written there. + pass + + +def _init_group_metadata( + store: StoreLike, + overwrite: Optional[bool] = False, + path: Optional[str] = None, + chunk_store: Optional[StoreLike] = None, +): + store_version = getattr(store, "_store_version", 2) + path = normalize_storage_path(path) + + # guard conditions + if overwrite: + if store_version == 2: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = data_root + _path_to_prefix(path) + meta_prefix = meta_root + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + store.erase_prefix(meta_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path): + raise ContainsGroupError(path) + elif store_version == 3 and "/" in path: + # cannot create a group overlapping with an existing array name + parent_path = "/".join(path.split("/")[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) + + # initialize metadata + # N.B., currently no metadata properties are needed, however there may + # be in future + if store_version == 3: + meta = {"attributes": {}} # type: ignore + else: + meta = {} + key = _prefix_to_group_key(store, _path_to_prefix(path)) + if hasattr(store, "_metadata_class"): + store[key] = store._metadata_class.encode_group_metadata(meta) + else: + store[key] = encode_group_metadata(meta) + + +def _dict_store_keys(d: Dict, prefix="", cls=dict): + for k in d.keys(): + v = d[k] + if isinstance(v, cls): + yield from _dict_store_keys(v, prefix + k + "/", cls) + else: + yield prefix + k + + +class KVStore(Store): + """ + This provides a default implementation of a store interface around + a mutable mapping, to avoid having to test stores for presence of methods. 
+
+    This, for most methods, should just be a pass-through to the underlying KV
+    store, which is likely to expose a MutableMapping interface.
+    """
+
+    def __init__(self, mutablemapping):
+        self._mutable_mapping = mutablemapping
+
+    def __getitem__(self, key):
+        return self._mutable_mapping[key]
+
+    def __setitem__(self, key, value):
+        self._mutable_mapping[key] = value
+
+    def __delitem__(self, key):
+        del self._mutable_mapping[key]
+
+    def __contains__(self, key):
+        return key in self._mutable_mapping
+
+    def get(self, key, default=None):
+        return self._mutable_mapping.get(key, default)
+
+    def values(self):
+        return self._mutable_mapping.values()
+
+    def __iter__(self):
+        return iter(self._mutable_mapping)
+
+    def __len__(self):
+        return len(self._mutable_mapping)
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__}: \n{self._mutable_mapping!r}\n at {id(self):#x}>"
+
+    def __eq__(self, other):
+        if isinstance(other, KVStore):
+            return self._mutable_mapping == other._mutable_mapping
+        else:
+            return NotImplemented
+
+
+class MemoryStore(Store):
+    """Store class that uses a hierarchy of :class:`KVStore` objects, thus all data
+    will be held in main memory.
+
+    Examples
+    --------
+    This is the default class used when creating a group. E.g.::
+
+        >>> import zarr
+        >>> g = zarr.group()
+        >>> type(g.store)
+        <class 'zarr.storage.MemoryStore'>
+
+    Note that the default class when creating an array is the built-in
+    :class:`KVStore` class, i.e.::
+
+        >>> z = zarr.zeros(100)
+        >>> type(z.store)
+        <class 'zarr.storage.KVStore'>
+
+    Notes
+    -----
+    Safe to write in multiple threads.
+
+    """
+
+    def __init__(self, root=None, cls=dict, dimension_separator=None):
+        if root is None:
+            self.root = cls()
+        else:
+            self.root = root
+        self.cls = cls
+        self.write_mutex = Lock()
+        self._dimension_separator = dimension_separator
+
+    def __getstate__(self):
+        return self.root, self.cls
+
+    def __setstate__(self, state):
+        root, cls = state
+        self.__init__(root=root, cls=cls)
+
+    def _get_parent(self, item: str):
+        parent = self.root
+        # split the item
+        segments = item.split("/")
+        # find the parent container
+        for k in segments[:-1]:
+            parent = parent[k]
+            if not isinstance(parent, self.cls):
+                raise KeyError(item)
+        return parent, segments[-1]
+
+    def _require_parent(self, item):
+        parent = self.root
+        # split the item
+        segments = item.split("/")
+        # require the parent container
+        for k in segments[:-1]:
+            try:
+                parent = parent[k]
+            except KeyError:
+                parent[k] = self.cls()
+                parent = parent[k]
+            else:
+                if not isinstance(parent, self.cls):
+                    raise KeyError(item)
+        return parent, segments[-1]
+
+    def __getitem__(self, item: str):
+        parent, key = self._get_parent(item)
+        try:
+            value = parent[key]
+        except KeyError as e:
+            raise KeyError(item) from e
+        else:
+            if isinstance(value, self.cls):
+                raise KeyError(item)
+            else:
+                return value
+
+    def __setitem__(self, item: str, value):
+        with self.write_mutex:
+            parent, key = self._require_parent(item)
+            value = ensure_bytes(value)
+            parent[key] = value
+
+    def __delitem__(self, item: str):
+        with self.write_mutex:
+            parent, key = self._get_parent(item)
+            try:
+                del parent[key]
+            except KeyError as e:
+                raise KeyError(item) from e
+
+    def __contains__(self, item: str):  # type: ignore[override]
+        try:
+            parent, key = self._get_parent(item)
+            value = parent[key]
+        except KeyError:
+            return False
+        else:
+            return not isinstance(value, self.cls)
+
+    def __eq__(self, other):
+        return isinstance(other, MemoryStore) and self.root == other.root and self.cls == other.cls
+
+    def keys(self):
+        yield from
_dict_store_keys(self.root, cls=self.cls) + + def __iter__(self): + return self.keys() + + def __len__(self) -> int: + return sum(1 for _ in self.keys()) + + def listdir(self, path: Path = None) -> List[str]: + path = normalize_storage_path(path) + if path: + try: + parent, key = self._get_parent(path) + value = parent[key] + except KeyError: + return [] + else: + value = self.root + if isinstance(value, self.cls): + return sorted(value.keys()) + else: + return [] + + def rename(self, src_path: Path, dst_path: Path): + src_path = normalize_storage_path(src_path) + dst_path = normalize_storage_path(dst_path) + + src_parent, src_key = self._get_parent(src_path) + dst_parent, dst_key = self._require_parent(dst_path) + + dst_parent[dst_key] = src_parent.pop(src_key) + + def rmdir(self, path: Path = None): + path = normalize_storage_path(path) + if path: + try: + parent, key = self._get_parent(path) + value = parent[key] + except KeyError: + return + else: + if isinstance(value, self.cls): + del parent[key] + else: + # clear out root + self.root = self.cls() + + def getsize(self, path: Path = None): + path = normalize_storage_path(path) + + # obtain value to return size of + value = None + if path: + try: + parent, key = self._get_parent(path) + value = parent[key] + except KeyError: + pass + else: + value = self.root + + # obtain size of value + if value is None: + return 0 + + elif isinstance(value, self.cls): + # total size for directory + size = 0 + for v in value.values(): + if not isinstance(v, self.cls): + size += buffer_size(v) + return size + + else: + return buffer_size(value) + + def clear(self): + with self.write_mutex: + self.root.clear() + + +class DictStore(MemoryStore): + def __init__(self, *args, **kwargs): + warnings.warn( + "DictStore has been renamed to MemoryStore in 2.4.0 and " + "will be removed in the future. Please use MemoryStore.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(*args, **kwargs) + + +class DirectoryStore(Store): + """Storage class using directories and files on a standard file system. + + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.DirectoryStore('data/array.zarr') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Each chunk of the array is stored as a separate file on the file system, + i.e.:: + + >>> import os + >>> sorted(os.listdir('data/array.zarr')) + ['.zarray', '0.0', '0.1', '1.0', '1.1'] + + Store a group:: + + >>> store = zarr.DirectoryStore('data/group.zarr') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] 
= 42 + + When storing a group, levels in the group hierarchy will correspond to + directories on the file system, i.e.:: + + >>> sorted(os.listdir('data/group.zarr')) + ['.zgroup', 'foo'] + >>> sorted(os.listdir('data/group.zarr/foo')) + ['.zgroup', 'bar'] + >>> sorted(os.listdir('data/group.zarr/foo/bar')) + ['.zarray', '0.0', '0.1', '1.0', '1.1'] + + Notes + ----- + Atomic writes are used, which means that data are first written to a + temporary file, then moved into place when the write is successfully + completed. Files are only held open while they are being read or written and are + closed immediately afterwards, so there is no need to manually close any files. + + Safe to write in multiple threads or processes. + + """ + + def __init__( + self, path, normalize_keys=False, dimension_separator: Optional[DIMENSION_SEPARATOR] = None + ): + # guard conditions + path = os.path.abspath(path) + if os.path.exists(path) and not os.path.isdir(path): + raise FSPathExistNotDir(path) + + self.path = path + self.normalize_keys = normalize_keys + self._dimension_separator = dimension_separator + + def _normalize_key(self, key): + return key.lower() if self.normalize_keys else key + + @staticmethod + def _fromfile(fn): + """Read data from a file + + Parameters + ---------- + fn : str + Filepath to open and read from. + + Notes + ----- + Subclasses should overload this method to specify any custom + file reading logic. + """ + with open(fn, "rb") as f: + return f.read() + + @staticmethod + def _tofile(a, fn): + """Write data to a file + + Parameters + ---------- + a : array-like + Data to write into the file. + fn : str + Filepath to open and write to. + + Notes + ----- + Subclasses should overload this method to specify any custom + file writing logic. + """ + with open(fn, mode="wb") as f: + f.write(a) + + def __getitem__(self, key): + key = self._normalize_key(key) + filepath = os.path.join(self.path, key) + if os.path.isfile(filepath): + return self._fromfile(filepath) + else: + raise KeyError(key) + + def __setitem__(self, key, value): + key = self._normalize_key(key) + + # coerce to flat, contiguous array (ideally without copying) + value = ensure_contiguous_ndarray_like(value) + + # destination path for key + file_path = os.path.join(self.path, key) + + # ensure there is no directory in the way + if os.path.isdir(file_path): + shutil.rmtree(file_path) + + # ensure containing directory exists + dir_path, file_name = os.path.split(file_path) + if os.path.isfile(dir_path): + raise KeyError(key) + if not os.path.exists(dir_path): + try: + os.makedirs(dir_path) + except OSError as e: + if e.errno != errno.EEXIST: + raise KeyError(key) from e + + # write to temporary file + # note we're not using tempfile.NamedTemporaryFile to avoid restrictive file permissions + temp_name = file_name + "." 
+ uuid.uuid4().hex + ".partial" + temp_path = os.path.join(dir_path, temp_name) + try: + self._tofile(value, temp_path) + + # move temporary file into place; + # make several attempts at writing the temporary file to get past + # potential antivirus file locking issues + retry_call(os.replace, (temp_path, file_path), exceptions=(PermissionError,)) + + finally: + # clean up if temp file still exists for whatever reason + if os.path.exists(temp_path): # pragma: no cover + os.remove(temp_path) + + def __delitem__(self, key): + key = self._normalize_key(key) + path = os.path.join(self.path, key) + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + # include support for deleting directories, even though strictly + # speaking these do not exist as keys in the store + shutil.rmtree(path) + else: + raise KeyError(key) + + def __contains__(self, key): + key = self._normalize_key(key) + file_path = os.path.join(self.path, key) + return os.path.isfile(file_path) + + def __eq__(self, other): + return isinstance(other, DirectoryStore) and self.path == other.path + + def keys(self): + if os.path.exists(self.path): + yield from self._keys_fast(self.path) + + @staticmethod + def _keys_fast(path, walker=os.walk): + for dirpath, _, filenames in walker(path): + dirpath = os.path.relpath(dirpath, path) + if dirpath == os.curdir: + for f in filenames: + yield f + else: + dirpath = dirpath.replace("\\", "/") + for f in filenames: + yield "/".join((dirpath, f)) + + def __iter__(self): + return self.keys() + + def __len__(self): + return sum(1 for _ in self.keys()) + + def dir_path(self, path=None): + store_path = normalize_storage_path(path) + dir_path = self.path + if store_path: + dir_path = os.path.join(dir_path, store_path) + return dir_path + + def listdir(self, path=None): + return ( + self._nested_listdir(path) + if self._dimension_separator == "/" + else self._flat_listdir(path) + ) + + def _flat_listdir(self, path=None): + dir_path = self.dir_path(path) + if os.path.isdir(dir_path): + return sorted(os.listdir(dir_path)) + else: + return [] + + def _nested_listdir(self, path=None): + children = self._flat_listdir(path=path) + if array_meta_key in children: + # special handling of directories containing an array to map nested chunk + # keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and os.path.isdir(entry_path): + for dir_path, _, file_names in os.walk(entry_path): + for file_name in file_names: + file_path = os.path.join(dir_path, file_name) + rel_path = file_path.split(root_path + os.path.sep)[1] + new_children.append( + rel_path.replace(os.path.sep, self._dimension_separator or ".") + ) + else: + new_children.append(entry) + return sorted(new_children) + else: + return children + + def rename(self, src_path, dst_path): + store_src_path = normalize_storage_path(src_path) + store_dst_path = normalize_storage_path(dst_path) + + dir_path = self.path + + src_path = os.path.join(dir_path, store_src_path) + dst_path = os.path.join(dir_path, store_dst_path) + + os.renames(src_path, dst_path) + + def rmdir(self, path=None): + store_path = normalize_storage_path(path) + dir_path = self.path + if store_path: + dir_path = os.path.join(dir_path, store_path) + if os.path.isdir(dir_path): + shutil.rmtree(dir_path) + + def getsize(self, path=None): + store_path = normalize_storage_path(path) + fs_path = self.path + if store_path: + fs_path = 
os.path.join(fs_path, store_path) + if os.path.isfile(fs_path): + return os.path.getsize(fs_path) + elif os.path.isdir(fs_path): + size = 0 + for child in scandir(fs_path): + if child.is_file(): + size += child.stat().st_size + return size + else: + return 0 + + def clear(self): + shutil.rmtree(self.path) + + +def atexit_rmtree(path, isdir=os.path.isdir, rmtree=shutil.rmtree): # pragma: no cover + """Ensure directory removal at interpreter exit.""" + if isdir(path): + rmtree(path) + + +# noinspection PyShadowingNames +def atexit_rmglob( + path, + glob=glob.glob, + isdir=os.path.isdir, + isfile=os.path.isfile, + remove=os.remove, + rmtree=shutil.rmtree, +): # pragma: no cover + """Ensure removal of multiple files at interpreter exit.""" + for p in glob(path): + if isfile(p): + remove(p) + elif isdir(p): + rmtree(p) + + +class FSStore(Store): + """Wraps an fsspec.FSMap to give access to arbitrary filesystems + + Requires that ``fsspec`` is installed, as well as any additional + requirements for the protocol chosen. + + Parameters + ---------- + url : str + The destination to map. If no fs is provided, should include protocol + and path, like "s3://bucket/root". If an fs is provided, can be a path + within that filesystem, like "bucket/root" + normalize_keys : bool + key_separator : str + public API for accessing dimension_separator. Never `None` + See dimension_separator for more information. + mode : str + "w" for writable, "r" for read-only + exceptions : list of Exception subclasses + When accessing data, any of these exceptions will be treated + as a missing key + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + fs : fsspec.spec.AbstractFileSystem, optional + An existing filesystem to use for the store. + check : bool, optional + If True, performs a touch at the root location, to check for write access. + Passed to `fsspec.mapping.FSMap` constructor. + create : bool, optional + If True, performs a mkdir at the rool location. + Passed to `fsspec.mapping.FSMap` constructor. + missing_exceptions : sequence of Exceptions, optional + Exceptions classes to associate with missing files. + Passed to `fsspec.mapping.FSMap` constructor. + storage_options : passed to the fsspec implementation. Cannot be used + together with fs. + """ + + _array_meta_key = array_meta_key + _group_meta_key = group_meta_key + _attrs_key = attrs_key + + def __init__( + self, + url, + normalize_keys=False, + key_separator=None, + mode="w", + exceptions=(KeyError, PermissionError, IOError), + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + fs=None, + check=False, + create=False, + missing_exceptions=None, + **storage_options, + ): + if not self._fsspec_installed(): # pragma: no cover + raise ImportError("`fsspec` is required to use zarr's FSStore") + import fsspec + + mapper_options = {"check": check, "create": create} + # https://github.com/zarr-developers/zarr-python/pull/911#discussion_r841926292 + # Some fsspec implementations don't accept missing_exceptions. + # This is a workaround to avoid passing it in the most common scenarios. + # Remove this and add missing_exceptions to mapper_options when fsspec is released. 
+ if missing_exceptions is not None: + mapper_options["missing_exceptions"] = missing_exceptions # pragma: no cover + + if fs is None: + protocol, _ = fsspec.core.split_protocol(url) + # set auto_mkdir to True for local file system + if protocol in (None, "file") and not storage_options.get("auto_mkdir"): + storage_options["auto_mkdir"] = True + self.map = fsspec.get_mapper(url, **{**mapper_options, **storage_options}) + self.fs = self.map.fs # for direct operations + self.path = self.fs._strip_protocol(url) + else: + if storage_options: + raise ValueError("Cannot specify both fs and storage_options") + self.fs = fs + self.path = self.fs._strip_protocol(url) + self.map = self.fs.get_mapper(self.path, **mapper_options) + + self.normalize_keys = normalize_keys + self.mode = mode + self.exceptions = exceptions + # For backwards compatibility. Guaranteed to be non-None + if key_separator is not None: + dimension_separator = key_separator + + self.key_separator = dimension_separator + self._default_key_separator() + + # Pass attributes to array creation + self._dimension_separator = dimension_separator + + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "." + + def _normalize_key(self, key): + key = normalize_storage_path(key).lstrip("/") + if key: + *bits, end = key.split("/") + + if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): + end = end.replace(".", self.key_separator) + key = "/".join(bits + [end]) + + return key.lower() if self.normalize_keys else key + + def getitems( + self, keys: Sequence[str], *, contexts: Mapping[str, Context] + ) -> Mapping[str, Any]: + keys_transformed = {self._normalize_key(key): key for key in keys} + results_transformed = self.map.getitems(list(keys_transformed), on_error="return") + results = {} + for k, v in results_transformed.items(): + if isinstance(v, self.exceptions): + # Cause recognized exceptions to prompt a KeyError in the + # function calling this method + continue + elif isinstance(v, Exception): + # Raise any other exception + raise v + else: + # The function calling this method may not recognize the transformed + # keys, so we send the values returned by self.map.getitems back into + # the original key space. 
+ results[keys_transformed[k]] = v + return results + + def __getitem__(self, key): + key = self._normalize_key(key) + try: + return self.map[key] + except self.exceptions as e: + raise KeyError(key) from e + + def setitems(self, values): + if self.mode == "r": + raise ReadOnlyError() + + # Normalize keys and make sure the values are bytes + values = { + self._normalize_key(key): ensure_contiguous_ndarray_or_bytes(val) + for key, val in values.items() + } + self.map.setitems(values) + + def __setitem__(self, key, value): + if self.mode == "r": + raise ReadOnlyError() + key = self._normalize_key(key) + value = ensure_contiguous_ndarray_or_bytes(value) + path = self.dir_path(key) + try: + if self.fs.isdir(path): + self.fs.rm(path, recursive=True) + self.map[key] = value + self.fs.invalidate_cache(self.fs._parent(path)) + except self.exceptions as e: + raise KeyError(key) from e + + def __delitem__(self, key): + if self.mode == "r": + raise ReadOnlyError() + key = self._normalize_key(key) + path = self.dir_path(key) + if self.fs.isdir(path): + self.fs.rm(path, recursive=True) + else: + del self.map[key] + + def delitems(self, keys): + if self.mode == "r": + raise ReadOnlyError() + # only remove the keys that exist in the store + nkeys = [self._normalize_key(key) for key in keys if key in self] + # rm errors if you pass an empty collection + if len(nkeys) > 0: + self.map.delitems(nkeys) + + def __contains__(self, key): + key = self._normalize_key(key) + return key in self.map + + def __eq__(self, other): + return type(self) is type(other) and self.map == other.map and self.mode == other.mode + + def keys(self): + return iter(self.map) + + def __iter__(self): + return self.keys() + + def __len__(self): + return len(list(self.keys())) + + def dir_path(self, path=None): + store_path = normalize_storage_path(path) + return self.map._key_to_str(store_path) + + def listdir(self, path=None): + dir_path = self.dir_path(path) + try: + children = sorted( + p.rstrip("/").rsplit("/", 1)[-1] for p in self.fs.ls(dir_path, detail=False) + ) + if self.key_separator != "/": + return children + else: + if self._array_meta_key in children: + # special handling of directories containing an array to map nested chunk + # keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and self.fs.isdir(entry_path): + for file_name in self.fs.find(entry_path): + file_path = os.path.join(dir_path, file_name) + rel_path = file_path.split(root_path)[1] + rel_path = rel_path.lstrip("/") + new_children.append(rel_path.replace("/", ".")) + else: + new_children.append(entry) + return sorted(new_children) + else: + return children + except OSError: + return [] + + def rmdir(self, path=None): + if self.mode == "r": + raise ReadOnlyError() + store_path = self.dir_path(path) + if self.fs.isdir(store_path): + self.fs.rm(store_path, recursive=True) + + def getsize(self, path=None): + store_path = self.dir_path(path) + return self.fs.du(store_path, True, True) + + def clear(self): + if self.mode == "r": + raise ReadOnlyError() + self.map.clear() + + @classmethod + @lru_cache(maxsize=None) + def _fsspec_installed(cls): + """Returns true if fsspec is installed""" + import importlib.util + + return importlib.util.find_spec("fsspec") is not None + + +class TempStore(DirectoryStore): + """Directory store using a temporary directory for storage. 
+ + Parameters + ---------- + suffix : string, optional + Suffix for the temporary directory name. + prefix : string, optional + Prefix for the temporary directory name. + dir : string, optional + Path to parent directory in which to create temporary directory. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + """ + + # noinspection PyShadowingBuiltins + def __init__( + self, + suffix="", + prefix="zarr", + dir=None, + normalize_keys=False, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + ): + path = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir) + atexit.register(atexit_rmtree, path) + super().__init__(path, normalize_keys=normalize_keys) + + +_prog_ckey = re.compile(r"^(\d+)(\.\d+)+$") +_prog_number = re.compile(r"^\d+$") + + +class NestedDirectoryStore(DirectoryStore): + """Storage class using directories and files on a standard file system, with + special handling for chunk keys so that chunk files for multidimensional + arrays are stored in a nested directory tree. + + .. deprecated:: 2.18.0 + NestedDirectoryStore will be removed in Zarr-Python 3.0 where controlling + the chunk key encoding will be supported as part of the array metadata. See + `GH1274 `_ + for more information. + + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file system. Default value is False. + dimension_separator : {'/'}, optional + Separator placed between the dimensions of a chunk. + Only supports "/" unlike other implementations. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.NestedDirectoryStore('data/array.zarr') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Each chunk of the array is stored as a separate file on the file system, + note the multiple directory levels used for the chunk files:: + + >>> import os + >>> sorted(os.listdir('data/array.zarr')) + ['.zarray', '0', '1'] + >>> sorted(os.listdir('data/array.zarr/0')) + ['0', '1'] + >>> sorted(os.listdir('data/array.zarr/1')) + ['0', '1'] + + Store a group:: + + >>> store = zarr.NestedDirectoryStore('data/group.zarr') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + When storing a group, levels in the group hierarchy will correspond to + directories on the file system, i.e.:: + + >>> sorted(os.listdir('data/group.zarr')) + ['.zgroup', 'foo'] + >>> sorted(os.listdir('data/group.zarr/foo')) + ['.zgroup', 'bar'] + >>> sorted(os.listdir('data/group.zarr/foo/bar')) + ['.zarray', '0', '1'] + >>> sorted(os.listdir('data/group.zarr/foo/bar/0')) + ['0', '1'] + >>> sorted(os.listdir('data/group.zarr/foo/bar/1')) + ['0', '1'] + + Notes + ----- + The :class:`DirectoryStore` class stores all chunk files for an array + together in a single directory. 
On some file systems, the potentially large + number of files in a single directory can cause performance issues. The + :class:`NestedDirectoryStore` class provides an alternative where chunk + files for multidimensional arrays will be organised into a directory + hierarchy, thus reducing the number of files in any one directory. + + Safe to write in multiple threads or processes. + + """ + + def __init__( + self, path, normalize_keys=False, dimension_separator: Optional[DIMENSION_SEPARATOR] = "/" + ): + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + + super().__init__(path, normalize_keys=normalize_keys) + if dimension_separator is None: + dimension_separator = "/" + elif dimension_separator != "/": + raise ValueError("NestedDirectoryStore only supports '/' as dimension_separator") + self._dimension_separator = dimension_separator + + def __eq__(self, other): + return isinstance(other, NestedDirectoryStore) and self.path == other.path + + +# noinspection PyPep8Naming +class ZipStore(Store): + """Storage class using a Zip file. + + Parameters + ---------- + path : string + Location of file. + compression : integer, optional + Compression method to use when writing to the archive. + allowZip64 : bool, optional + If True (the default) will create ZIP files that use the ZIP64 + extensions when the zipfile is larger than 2 GiB. If False + will raise an exception when the ZIP file would require ZIP64 + extensions. + mode : string, optional + One of 'r' to read an existing file, 'w' to truncate and write a new + file, 'a' to append to an existing file, or 'x' to exclusively create + and write a new file. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.ZipStore('data/array.zip', mode='w') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.ZipStore('data/group.zip', mode='w') + >>> root = zarr.group(store=store) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + >>> store.close() # don't forget to call this when you're done + + After modifying a ZipStore, the ``close()`` method must be called, otherwise + essential data will not be written to the underlying Zip file. The ZipStore + class also supports the context manager protocol, which ensures the ``close()`` + method is called on leaving the context, e.g.:: + + >>> with zarr.ZipStore('data/array.zip', mode='w') as store: + ... z = zarr.zeros((10, 10), chunks=(5, 5), store=store) + ... z[...] = 42 + ... # no need to call store.close() + + Notes + ----- + Each chunk of an array is stored as a separate entry in the Zip file. Note + that Zip files do not provide any way to remove or replace existing entries. + If an attempt is made to replace an entry, then a warning is generated by + the Python standard library about a duplicate Zip file entry. This can be + triggered if you attempt to write data to a Zarr array more than once, + e.g.:: + + >>> store = zarr.ZipStore('data/example.zip', mode='w') + >>> z = zarr.zeros(100, chunks=10, store=store) + >>> # first write OK + ... z[...] = 42 + >>> # second write generates warnings + ... z[...] 
= 42 # doctest: +SKIP + >>> store.close() + + This can also happen in a more subtle situation, where data are written only + once to a Zarr array, but the write operations are not aligned with chunk + boundaries, e.g.:: + + >>> store = zarr.ZipStore('data/example.zip', mode='w') + >>> z = zarr.zeros(100, chunks=10, store=store) + >>> z[5:15] = 42 + >>> # write overlaps chunk previously written, generates warnings + ... z[15:25] = 42 # doctest: +SKIP + + To avoid creating duplicate entries, only write data once, and align writes + with chunk boundaries. This alignment is done automatically if you call + ``z[...] = ...`` or create an array from existing data via :func:`zarr.array`. + + Alternatively, use a :class:`DirectoryStore` when writing the data, then + manually Zip the directory and use the Zip file for subsequent reads. + Take note that the files in the Zip file must be relative to the root of the + Zarr archive. You may find it easier to create such a Zip file with ``7z``, e.g.:: + + 7z a -tzip archive.zarr.zip archive.zarr/. + + Safe to write in multiple threads but not in multiple processes. + + """ + + _erasable = False + + def __init__( + self, + path, + compression=zipfile.ZIP_STORED, + allowZip64=True, + mode="a", + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + ): + # store properties + path = os.path.abspath(path) + self.path = path + self.compression = compression + self.allowZip64 = allowZip64 + self.mode = mode + self._dimension_separator = dimension_separator + + # Current understanding is that zipfile module in stdlib is not thread-safe, + # and so locking is required for both read and write. However, this has not + # been investigated in detail, perhaps no lock is needed if mode='r'. + self.mutex = RLock() + + # open zip file + self.zf = zipfile.ZipFile(path, mode=mode, compression=compression, allowZip64=allowZip64) + + def __getstate__(self): + self.flush() + return self.path, self.compression, self.allowZip64, self.mode + + def __setstate__(self, state): + path, compression, allowZip64, mode = state + # if initially opened with mode 'w' or 'x', re-open in mode 'a' so file doesn't + # get clobbered + if mode in "wx": + mode = "a" + self.__init__(path=path, compression=compression, allowZip64=allowZip64, mode=mode) + + def close(self): + """Closes the underlying zip file, ensuring all records are written.""" + with self.mutex: + self.zf.close() + + def flush(self): + """Closes the underlying zip file, ensuring all records are written, + then re-opens the file for further modifications.""" + if self.mode != "r": + with self.mutex: + self.zf.close() + # N.B., re-open with mode 'a' regardless of initial mode so we don't wipe + # what's been written + self.zf = zipfile.ZipFile( + self.path, mode="a", compression=self.compression, allowZip64=self.allowZip64 + ) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __getitem__(self, key): + with self.mutex: + with self.zf.open(key) as f: # will raise KeyError + return f.read() + + def __setitem__(self, key, value): + if self.mode == "r": + raise ReadOnlyError() + value = ensure_contiguous_ndarray_like(value).view("u1") + with self.mutex: + # writestr(key, value) writes with default permissions from + # zipfile (600) that are too restrictive, build ZipInfo for + # the key to work around limitation + keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime(time.time())[:6]) + keyinfo.compress_type = self.compression + if keyinfo.filename[-1] == os.sep: + 
keyinfo.external_attr = 0o40775 << 16 # drwxrwxr-x + keyinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- + + self.zf.writestr(keyinfo, value) + + def __delitem__(self, key): + raise NotImplementedError + + def __eq__(self, other): + return ( + isinstance(other, ZipStore) + and self.path == other.path + and self.compression == other.compression + and self.allowZip64 == other.allowZip64 + ) + + def keylist(self): + with self.mutex: + return sorted(self.zf.namelist()) + + def keys(self): + yield from self.keylist() + + def __iter__(self): + return self.keys() + + def __len__(self): + return sum(1 for _ in self.keys()) + + def __contains__(self, key): + try: + with self.mutex: + self.zf.getinfo(key) + except KeyError: + return False + else: + return True + + def listdir(self, path=None): + path = normalize_storage_path(path) + return _listdir_from_keys(self, path) + + def getsize(self, path=None): + path = normalize_storage_path(path) + with self.mutex: + children = self.listdir(path) + if children: + size = 0 + for child in children: + if path: + name = path + "/" + child + else: + name = child + try: + info = self.zf.getinfo(name) + except KeyError: + pass + else: + size += info.compress_size + return size + elif path: + try: + info = self.zf.getinfo(path) + return info.compress_size + except KeyError: + return 0 + else: + return 0 + + def clear(self): + if self.mode == "r": + raise ReadOnlyError() + with self.mutex: + self.close() + os.remove(self.path) + self.zf = zipfile.ZipFile( + self.path, mode=self.mode, compression=self.compression, allowZip64=self.allowZip64 + ) + + +def migrate_1to2(store): + """Migrate array metadata in `store` from Zarr format version 1 to + version 2. + + Parameters + ---------- + store : Store + Store to be migrated. + + Notes + ----- + Version 1 did not support hierarchies, so this migration function will + look for a single array in `store` and migrate the array metadata to + version 2. + + """ + + # migrate metadata + from zarr import meta_v1 + + meta = meta_v1.decode_metadata(store["meta"]) + del store["meta"] + + # add empty filters + meta["filters"] = None + + # migration compression metadata + compression = meta["compression"] + if compression is None or compression == "none": + compressor_config = None + else: + compression_opts = meta["compression_opts"] + codec_cls = codec_registry[compression] + if isinstance(compression_opts, dict): + compressor = codec_cls(**compression_opts) + else: + compressor = codec_cls(compression_opts) + compressor_config = compressor.get_config() + meta["compressor"] = compressor_config + del meta["compression"] + del meta["compression_opts"] + + # store migrated metadata + if hasattr(store, "_metadata_class"): + store[array_meta_key] = store._metadata_class.encode_array_metadata(meta) + else: + store[array_meta_key] = encode_array_metadata(meta) + + # migrate user attributes + store[attrs_key] = store["attrs"] + del store["attrs"] + + +# noinspection PyShadowingBuiltins +class DBMStore(Store): + """Storage class using a DBM-style database. + + .. deprecated:: 2.18.0 + DBMStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. + + Parameters + ---------- + path : string + Location of database file. + flag : string, optional + Flags for opening the database file. + mode : int + File mode used if a new file is created. + open : function, optional + Function to open the database file. 
If not provided, :func:`dbm.open` will be
+        used on Python 3, and :func:`anydbm.open` will be used on Python 2.
+    write_lock: bool, optional
+        Use a lock to prevent concurrent writes from multiple threads (True by default).
+    dimension_separator : {'.', '/'}, optional
+        Separator placed between the dimensions of a chunk.
+    **open_kwargs
+        Keyword arguments to pass to the `open` function.
+
+    Examples
+    --------
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.DBMStore('data/array.db')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    Store a group::
+
+        >>> store = zarr.DBMStore('data/group.db')
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    After modifying a DBMStore, the ``close()`` method must be called, otherwise
+    essential data may not be written to the underlying database file. The
+    DBMStore class also supports the context manager protocol, which ensures the
+    ``close()`` method is called on leaving the context, e.g.::
+
+        >>> with zarr.DBMStore('data/array.db') as store:
+        ...     z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        ...     z[...] = 42
+        ...     # no need to call store.close()
+
+    A different database library can be used by passing a different function to
+    the `open` parameter. For example, if the `bsddb3 `_ package is installed, a
+    Berkeley DB database can be used::
+
+        >>> import bsddb3  # doctest: +SKIP
+        >>> store = zarr.DBMStore('data/array.bdb', open=bsddb3.btopen)  # doctest: +SKIP
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)  # doctest: +SKIP
+        >>> z[...] = 42  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+
+    Notes
+    -----
+    Please note that, by default, this class will use the Python standard
+    library `dbm.open` function to open the database file (or `anydbm.open` on
+    Python 2). There are up to three different implementations of DBM-style
+    databases available in any Python installation, and which one is used may
+    vary from one system to another. Database file formats are not compatible
+    between these different implementations. Also, some implementations are
+    more efficient than others. In particular, the "dumb" implementation will be
+    the fall-back on many systems, and has very poor performance for some usage
+    scenarios. If you want to ensure a specific implementation is used, pass the
+    corresponding open function, e.g., `dbm.gnu.open` to use the GNU DBM
+    library.
+
+    Safe to write in multiple threads. May be safe to write in multiple processes,
+    depending on which DBM implementation is being used, although this has not been
+    tested.
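As a concrete illustration of the note above about pinning a particular DBM implementation, a sketch in the same spirit as the ``bsddb3`` example (assuming a platform where the standard-library ``dbm.gnu`` module is available; the path is illustrative)::

    >>> import dbm.gnu  # doctest: +SKIP
    >>> store = zarr.DBMStore('data/array.gdbm', open=dbm.gnu.open)  # doctest: +SKIP
    >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)  # doctest: +SKIP
    >>> z[...] = 42  # doctest: +SKIP
    >>> store.close()  # doctest: +SKIP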
+ + """ + + def __init__( + self, + path, + flag="c", + mode=0o666, + open=None, + write_lock=True, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + **open_kwargs, + ): + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + + if open is None: + import dbm + + open = dbm.open + path = os.path.abspath(path) + # noinspection PyArgumentList + self.db = open(path, flag, mode, **open_kwargs) + self.path = path + self.flag = flag + self.mode = mode + self.open = open + self.write_lock = write_lock + self.write_mutex: Union[Lock, NoLock] + if write_lock: + # This may not be required as some dbm implementations manage their own + # locks, but err on the side of caution. + self.write_mutex = Lock() + else: + self.write_mutex = nolock + self.open_kwargs = open_kwargs + self._dimension_separator = dimension_separator + + def __getstate__(self): + try: + self.flush() # needed for ndbm + except Exception: + # flush may fail if db has already been closed + pass + return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) + + def __setstate__(self, state): + path, flag, mode, open, write_lock, open_kws = state + if flag[0] == "n": + flag = "c" + flag[1:] # don't clobber an existing database + self.__init__(path=path, flag=flag, mode=mode, open=open, write_lock=write_lock, **open_kws) + + def close(self): + """Closes the underlying database file.""" + if hasattr(self.db, "close"): + with self.write_mutex: + self.db.close() + + def flush(self): + """Synchronizes data to the underlying database file.""" + if self.flag[0] != "r": + with self.write_mutex: + if hasattr(self.db, "sync"): + self.db.sync() + else: # pragma: no cover + # we don't cover this branch anymore as ndbm (oracle) is not packaged + # by conda-forge on non-mac OS: + # https://github.com/conda-forge/staged-recipes/issues/4476 + # fall-back, close and re-open, needed for ndbm + flag = self.flag + if flag[0] == "n": + flag = "c" + flag[1:] # don't clobber an existing database + self.db.close() + # noinspection PyArgumentList + self.db = self.open(self.path, flag, self.mode, **self.open_kwargs) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __getitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + return self.db[key] + + def __setitem__(self, key, value): + if isinstance(key, str): + key = key.encode("ascii") + value = ensure_bytes(value) + with self.write_mutex: + self.db[key] = value + + def __delitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + with self.write_mutex: + del self.db[key] + + def __eq__(self, other): + return ( + isinstance(other, DBMStore) + and self.path == other.path + and + # allow flag and mode to differ + self.open == other.open + and self.open_kwargs == other.open_kwargs + ) + + def keys(self): + return (ensure_text(k, "ascii") for k in iter(self.db.keys())) + + def __iter__(self): + return self.keys() + + def __len__(self): + return sum(1 for _ in self.keys()) + + def __contains__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + return key in self.db + + def rmdir(self, path: str = "") -> None: + path = normalize_storage_path(path) + _rmdir_from_keys(self, path) + + +class LMDBStore(Store): + """Storage class using LMDB. Requires the `lmdb `_ + package to be installed. + + .. deprecated:: 2.18.0 + LMDBStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. 
+
+    Parameters
+    ----------
+    path : string
+        Location of database file.
+    buffers : bool, optional
+        If True (default) use support for buffers, which should increase performance by
+        reducing memory copies.
+    dimension_separator : {'.', '/'}, optional
+        Separator placed between the dimensions of a chunk.
+    **kwargs
+        Keyword arguments passed through to the `lmdb.open` function.
+
+    Examples
+    --------
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.LMDBStore('data/array.mdb')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    Store a group::
+
+        >>> store = zarr.LMDBStore('data/group.mdb')
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    After modifying an LMDBStore, the ``close()`` method must be called, otherwise
+    essential data may not be written to the underlying database file. The
+    LMDBStore class also supports the context manager protocol, which ensures the
+    ``close()`` method is called on leaving the context, e.g.::
+
+        >>> with zarr.LMDBStore('data/array.mdb') as store:
+        ...     z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        ...     z[...] = 42
+        ...     # no need to call store.close()
+
+    Notes
+    -----
+    By default writes are not immediately flushed to disk to increase performance. You
+    can ensure data are flushed to disk by calling the ``flush()`` or ``close()`` methods.
+
+    Should be safe to write in multiple threads or processes due to the synchronization
+    support within LMDB, although writing from multiple processes has not been tested.
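A short sketch of the flushing behaviour described in the notes above, where writes are buffered until ``flush()`` or ``close()`` is called (assumes the ``lmdb`` package is installed; the path is illustrative)::

    >>> store = zarr.LMDBStore('data/example.mdb')  # doctest: +SKIP
    >>> store['greeting'] = b'hello'  # doctest: +SKIP
    >>> store.flush()  # doctest: +SKIP
    >>> store.close()  # doctest: +SKIP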
+ + """ + + def __init__( + self, + path, + buffers=True, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + **kwargs, + ): + import lmdb + + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + + # set default memory map size to something larger than the lmdb default, which is + # very likely to be too small for any moderate array (logic copied from zict) + map_size = 2**40 if sys.maxsize >= 2**32 else 2**28 + kwargs.setdefault("map_size", map_size) + + # don't initialize buffers to zero by default, shouldn't be necessary + kwargs.setdefault("meminit", False) + + # decide whether to use the writemap option based on the operating system's + # support for sparse files - writemap requires sparse file support otherwise + # the whole# `map_size` may be reserved up front on disk (logic copied from zict) + writemap = sys.platform.startswith("linux") + kwargs.setdefault("writemap", writemap) + + # decide options for when data are flushed to disk - choose to delay syncing + # data to filesystem, otherwise pay a large performance penalty (zict also does + # this) + kwargs.setdefault("metasync", False) + kwargs.setdefault("sync", False) + kwargs.setdefault("map_async", False) + + # set default option for number of cached transactions + max_spare_txns = multiprocessing.cpu_count() + kwargs.setdefault("max_spare_txns", max_spare_txns) + + # normalize path + path = os.path.abspath(path) + + # open database + self.db = lmdb.open(path, **kwargs) + + # store properties + self.buffers = buffers + self.path = path + self.kwargs = kwargs + self._dimension_separator = dimension_separator + + def __getstate__(self): + try: + self.flush() # just in case + except Exception: + # flush may fail if db has already been closed + pass + return self.path, self.buffers, self.kwargs + + def __setstate__(self, state): + path, buffers, kwargs = state + self.__init__(path=path, buffers=buffers, **kwargs) + + def close(self): + """Closes the underlying database.""" + self.db.close() + + def flush(self): + """Synchronizes data to the file system.""" + self.db.sync() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __getitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + # use the buffers option, should avoid a memory copy + with self.db.begin(buffers=self.buffers) as txn: + value = txn.get(key) + if value is None: + raise KeyError(key) + return value + + def __setitem__(self, key, value): + if isinstance(key, str): + key = key.encode("ascii") + with self.db.begin(write=True, buffers=self.buffers) as txn: + txn.put(key, value) + + def __delitem__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + with self.db.begin(write=True) as txn: + if not txn.delete(key): + raise KeyError(key) + + def __contains__(self, key): + if isinstance(key, str): + key = key.encode("ascii") + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + return cursor.set_key(key) + + def items(self): + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + for k, v in cursor.iternext(keys=True, values=True): + yield ensure_text(k, "ascii"), v + + def keys(self): + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + for k in cursor.iternext(keys=True, values=False): + yield ensure_text(k, "ascii") + + def values(self): + with self.db.begin(buffers=self.buffers) as txn: + with txn.cursor() as cursor: + yield from 
cursor.iternext(keys=False, values=True) + + def __iter__(self): + return self.keys() + + def __len__(self): + return self.db.stat()["entries"] + + +class LRUStoreCache(Store): + """Storage class that implements a least-recently-used (LRU) cache layer over + some other store. Intended primarily for use with stores that can be slow to + access, e.g., remote stores that require network communication to store and + retrieve data. + + Parameters + ---------- + store : Store + The store containing the actual data to be cached. + max_size : int + The maximum size that the cache may grow to, in number of bytes. Provide `None` + if you would like the cache to have unlimited size. + + Examples + -------- + The example below wraps an S3 store with an LRU cache:: + + >>> import s3fs + >>> import zarr + >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) + >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) + >>> cache = zarr.LRUStoreCache(store, max_size=2**28) + >>> root = zarr.group(store=cache) # doctest: +REMOTE_DATA + >>> z = root['foo/bar/baz'] # doctest: +REMOTE_DATA + >>> from timeit import timeit + >>> # first data access is relatively slow, retrieved from store + ... timeit('print(z[:].tobytes())', number=1, globals=globals()) # doctest: +SKIP + b'Hello from the cloud!' + 0.1081731989979744 + >>> # second data access is faster, uses cache + ... timeit('print(z[:].tobytes())', number=1, globals=globals()) # doctest: +SKIP + b'Hello from the cloud!' + 0.0009490990014455747 + + """ + + def __init__(self, store: StoreLike, max_size: int): + self._store: BaseStore = BaseStore._ensure_store(store) + self._max_size = max_size + self._current_size = 0 + self._keys_cache = None + self._contains_cache: Dict[Any, Any] = {} + self._listdir_cache: Dict[Path, Any] = dict() + self._values_cache: Dict[Path, Any] = OrderedDict() + self._mutex = Lock() + self.hits = self.misses = 0 + + def __getstate__(self): + return ( + self._store, + self._max_size, + self._current_size, + self._keys_cache, + self._contains_cache, + self._listdir_cache, + self._values_cache, + self.hits, + self.misses, + ) + + def __setstate__(self, state): + ( + self._store, + self._max_size, + self._current_size, + self._keys_cache, + self._contains_cache, + self._listdir_cache, + self._values_cache, + self.hits, + self.misses, + ) = state + self._mutex = Lock() + + def __len__(self): + return len(self._keys()) + + def __iter__(self): + return self.keys() + + def __contains__(self, key): + with self._mutex: + if key not in self._contains_cache: + self._contains_cache[key] = key in self._store + return self._contains_cache[key] + + def clear(self): + self._store.clear() + self.invalidate() + + def keys(self): + with self._mutex: + return iter(self._keys()) + + def _keys(self): + if self._keys_cache is None: + self._keys_cache = list(self._store.keys()) + return self._keys_cache + + def listdir(self, path: Path = None): + with self._mutex: + try: + return self._listdir_cache[path] + except KeyError: + listing = listdir(self._store, path) + self._listdir_cache[path] = listing + return listing + + def getsize(self, path=None) -> int: + return getsize(self._store, path=path) + + def _pop_value(self): + # remove the first value from the cache, as this will be the least recently + # used value + _, v = self._values_cache.popitem(last=False) + return v + + def _accommodate_value(self, value_size): + if self._max_size is None: + return + # ensure there is enough space in the cache for a new value + 
while self._current_size + value_size > self._max_size: + v = self._pop_value() + self._current_size -= buffer_size(v) + + def _cache_value(self, key: Path, value): + # cache a value + value_size = buffer_size(value) + # check size of the value against max size, as if the value itself exceeds max + # size then we are never going to cache it + if self._max_size is None or value_size <= self._max_size: + self._accommodate_value(value_size) + self._values_cache[key] = value + self._current_size += value_size + + def invalidate(self): + """Completely clear the cache.""" + with self._mutex: + self._values_cache.clear() + self._invalidate_keys() + self._current_size = 0 + + def invalidate_values(self): + """Clear the values cache.""" + with self._mutex: + self._values_cache.clear() + + def invalidate_keys(self): + """Clear the keys cache.""" + with self._mutex: + self._invalidate_keys() + + def _invalidate_keys(self): + self._keys_cache = None + self._contains_cache.clear() + self._listdir_cache.clear() + + def _invalidate_value(self, key): + if key in self._values_cache: + value = self._values_cache.pop(key) + self._current_size -= buffer_size(value) + + def __getitem__(self, key): + try: + # first try to obtain the value from the cache + with self._mutex: + value = self._values_cache[key] + # cache hit if no KeyError is raised + self.hits += 1 + # treat the end as most recently used + self._values_cache.move_to_end(key) + + except KeyError: + # cache miss, retrieve value from the store + value = self._store[key] + with self._mutex: + self.misses += 1 + # need to check if key is not in the cache, as it may have been cached + # while we were retrieving the value from the store + if key not in self._values_cache: + self._cache_value(key, value) + + return value + + def __setitem__(self, key, value): + self._store[key] = value + with self._mutex: + self._invalidate_keys() + self._invalidate_value(key) + self._cache_value(key, value) + + def __delitem__(self, key): + del self._store[key] + with self._mutex: + self._invalidate_keys() + self._invalidate_value(key) + + +class SQLiteStore(Store): + """Storage class using SQLite. + + .. deprecated:: 2.18.0 + SQLiteStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. + + Parameters + ---------- + path : string + Location of database file. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + **kwargs + Keyword arguments passed through to the `sqlite3.connect` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.SQLiteStore('data/array.sqldb') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.SQLiteStore('data/group.sqldb') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] 
= 42 + >>> store.close() # don't forget to call this when you're done + """ + + def __init__(self, path, dimension_separator: Optional[DIMENSION_SEPARATOR] = None, **kwargs): + import sqlite3 + + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + + self._dimension_separator = dimension_separator + + # normalize path + if path != ":memory:": + path = os.path.abspath(path) + + # store properties + self.path = path + self.kwargs = kwargs + + # allow threading if SQLite connections are thread-safe + # + # ref: https://www.sqlite.org/releaselog/3_3_1.html + # ref: https://github.com/python/cpython/issues/71377 + check_same_thread = True + if sqlite3.sqlite_version_info >= (3, 3, 1): + check_same_thread = False + + # keep a lock for serializing mutable operations + self.lock = Lock() + + # open database + self.db = sqlite3.connect( + self.path, + detect_types=0, + isolation_level=None, + check_same_thread=check_same_thread, + **self.kwargs, + ) + + # handle keys as `str`s + self.db.text_factory = str + + # get a cursor to read/write to the database + self.cursor = self.db.cursor() + + # initialize database with our table if missing + with self.lock: + self.cursor.execute("CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)") + + def __getstate__(self): + if self.path == ":memory:": + raise PicklingError("Cannot pickle in-memory SQLite databases") + return self.path, self.kwargs + + def __setstate__(self, state): + path, kwargs = state + self.__init__(path=path, **kwargs) + + def close(self): + """Closes the underlying database.""" + + # close cursor and db objects + self.cursor.close() + self.db.close() + + def __getitem__(self, key): + value = self.cursor.execute("SELECT v FROM zarr WHERE (k = ?)", (key,)) + for (v,) in value: + return v + raise KeyError(key) + + def __setitem__(self, key, value): + self.update({key: value}) + + def __delitem__(self, key): + with self.lock: + self.cursor.execute("DELETE FROM zarr WHERE (k = ?)", (key,)) + if self.cursor.rowcount < 1: + raise KeyError(key) + + def __contains__(self, key): + cs = self.cursor.execute("SELECT COUNT(*) FROM zarr WHERE (k = ?)", (key,)) + for (has,) in cs: + has = bool(has) + return has + + def items(self): + kvs = self.cursor.execute("SELECT k, v FROM zarr") + yield from kvs + + def keys(self): + ks = self.cursor.execute("SELECT k FROM zarr") + for (k,) in ks: + yield k + + def values(self): + vs = self.cursor.execute("SELECT v FROM zarr") + for (v,) in vs: + yield v + + def __iter__(self): + return self.keys() + + def __len__(self): + cs = self.cursor.execute("SELECT COUNT(*) FROM zarr") + for (c,) in cs: + return c + + def update(self, *args, **kwargs): + args += (kwargs,) + + kv_list = [] + for dct in args: + for k, v in dct.items(): + v = ensure_contiguous_ndarray_like(v) + + # Accumulate key-value pairs for storage + kv_list.append((k, v)) + + with self.lock: + self.cursor.executemany("REPLACE INTO zarr VALUES (?, ?)", kv_list) + + def listdir(self, path=None): + path = normalize_storage_path(path) + sep = "_" if path == "" else "/" + keys = self.cursor.execute( + f""" + SELECT DISTINCT SUBSTR(m, 0, INSTR(m, '/')) AS l FROM ( + SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), '/') || '/' AS m + FROM zarr WHERE k LIKE (? 
|| '{sep}%') + ) ORDER BY l ASC + """, + (path, path), + ) + keys = list(map(operator.itemgetter(0), keys)) + return keys + + def getsize(self, path=None): + path = normalize_storage_path(path) + size = self.cursor.execute( + """ + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || '%') AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), '/'), '/') + """, + (path, path), + ) + for (s,) in size: + return s + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + with self.lock: + self.cursor.execute("DELETE FROM zarr WHERE k LIKE (? || '/%')", (path,)) + else: + self.clear() + + def clear(self): + with self.lock: + self.cursor.executescript( + """ + BEGIN TRANSACTION; + DROP TABLE zarr; + CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); + COMMIT TRANSACTION; + """ + ) + + +class MongoDBStore(Store): + """Storage class using MongoDB. + + .. note:: This is an experimental feature. + + .. deprecated:: 2.18.0 + MongoDBStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. + + Requires the `pymongo `_ + package to be installed. + + Parameters + ---------- + database : string + Name of database + collection : string + Name of collection + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + **kwargs + Keyword arguments passed through to the `pymongo.MongoClient` function. + + Notes + ----- + The maximum chunksize in MongoDB documents is 16 MB. + + """ + + _key = "key" + _value = "value" + + def __init__( + self, + database="mongodb_zarr", + collection="zarr_collection", + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + **kwargs, + ): + import pymongo + + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + + self._database = database + self._collection = collection + self._dimension_separator = dimension_separator + self._kwargs = kwargs + + self.client = pymongo.MongoClient(**self._kwargs) + self.db = self.client.get_database(self._database) + self.collection = self.db.get_collection(self._collection) + + def __getitem__(self, key): + doc = self.collection.find_one({self._key: key}) + + if doc is None: + raise KeyError(key) + else: + return doc[self._value] + + def __setitem__(self, key, value): + value = ensure_bytes(value) + self.collection.replace_one( + {self._key: key}, {self._key: key, self._value: value}, upsert=True + ) + + def __delitem__(self, key): + result = self.collection.delete_many({self._key: key}) + if not result.deleted_count == 1: + raise KeyError(key) + + def __iter__(self): + for f in self.collection.find({}): + yield f[self._key] + + def __len__(self): + return self.collection.count_documents({}) + + def __getstate__(self): + return self._database, self._collection, self._kwargs + + def __setstate__(self, state): + database, collection, kwargs = state + self.__init__(database=database, collection=collection, **kwargs) + + def close(self): + """Cleanup client resources and disconnect from MongoDB.""" + self.client.close() + + def clear(self): + """Remove all items from store.""" + self.collection.delete_many({}) + + +class RedisStore(Store): + """Storage class using Redis. + + .. note:: This is an experimental feature. + + .. deprecated:: 2.18.0 + RedisStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. + + Requires the `redis `_ + package to be installed. 
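
    A minimal usage sketch (assumes a Redis server is reachable with the
    default ``redis.Redis`` connection settings; the prefix is illustrative)::

        >>> import zarr
        >>> store = zarr.RedisStore(prefix='demo')  # doctest: +SKIP
        >>> root = zarr.group(store=store, overwrite=True)  # doctest: +SKIP
        >>> z = root.zeros('data', shape=(100,), chunks=(10,))  # doctest: +SKIP
        >>> z[:] = 42  # doctest: +SKIP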
+ + Parameters + ---------- + prefix : string + Name of prefix for Redis keys + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + **kwargs + Keyword arguments passed through to the `redis.Redis` function. + + """ + + def __init__( + self, prefix="zarr", dimension_separator: Optional[DIMENSION_SEPARATOR] = None, **kwargs + ): + import redis + + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + + self._prefix = prefix + self._kwargs = kwargs + self._dimension_separator = dimension_separator + + self.client = redis.Redis(**kwargs) + + def _key(self, key): + return f"{self._prefix}:{key}" + + def __getitem__(self, key): + return self.client[self._key(key)] + + def __setitem__(self, key, value): + value = ensure_bytes(value) + self.client[self._key(key)] = value + + def __delitem__(self, key): + count = self.client.delete(self._key(key)) + if not count: + raise KeyError(key) + + def keylist(self): + offset = len(self._key("")) # length of prefix + return [key[offset:].decode("utf-8") for key in self.client.keys(self._key("*"))] + + def keys(self): + yield from self.keylist() + + def __iter__(self): + yield from self.keys() + + def __len__(self): + return len(self.keylist()) + + def __getstate__(self): + return self._prefix, self._kwargs + + def __setstate__(self, state): + prefix, kwargs = state + self.__init__(prefix=prefix, **kwargs) + + def clear(self): + for key in self.keys(): + del self[key] + + +class ConsolidatedMetadataStore(Store): + """A layer over other storage, where the metadata has been consolidated into + a single key. + + The purpose of this class, is to be able to get all of the metadata for + a given array in a single read operation from the underlying storage. + See :func:`zarr.convenience.consolidate_metadata` for how to create this + single metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the array metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + :func:`zarr.convenience.consolidate_metadata` should be called again and the class + re-invoked. The use case is for write once, read many times. + + .. versionadded:: 2.3 + + .. note:: This is an experimental feature. + + Parameters + ---------- + store: Store + Containing the zarr array. + metadata_key: str + The target in the store where all of the metadata are stored. We + assume JSON encoding. 
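
    A minimal usage sketch (the directory path is illustrative; the
    ``.zmetadata`` key must already have been written, e.g. by
    :func:`zarr.convenience.consolidate_metadata`)::

        >>> import zarr
        >>> store = zarr.DirectoryStore('data/example.zarr')  # doctest: +SKIP
        >>> root = zarr.group(store=store, overwrite=True)  # doctest: +SKIP
        >>> _ = root.zeros('data', shape=(100,), chunks=(10,))  # doctest: +SKIP
        >>> _ = zarr.consolidate_metadata(store)  # writes '.zmetadata'  # doctest: +SKIP
        >>> meta_store = zarr.storage.ConsolidatedMetadataStore(store)  # doctest: +SKIP
        >>> root = zarr.open_consolidated(store)  # metadata served from the single key  # doctest: +SKIP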
+ + See Also + -------- + zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated + + """ + + def __init__(self, store: StoreLike, metadata_key=".zmetadata"): + self.store = Store._ensure_store(store) + + # retrieve consolidated metadata + meta = json_loads(self.store[metadata_key]) + + # check format of consolidated metadata + consolidated_format = meta.get("zarr_consolidated_format", None) + if consolidated_format != 1: + raise MetadataError( + f"unsupported zarr consolidated metadata format: {consolidated_format}" + ) + + # decode metadata + self.meta_store: Store = KVStore(meta["metadata"]) + + def __getitem__(self, key): + return self.meta_store[key] + + def __contains__(self, item): + return item in self.meta_store + + def __iter__(self): + return iter(self.meta_store) + + def __len__(self): + return len(self.meta_store) + + def __delitem__(self, key): + raise ReadOnlyError() + + def __setitem__(self, key, value): + raise ReadOnlyError() + + def getsize(self, path): + return getsize(self.meta_store, path) + + def listdir(self, path): + return listdir(self.meta_store, path) diff --git a/src/zarr/v2/sync.py b/src/zarr/v2/sync.py new file mode 100644 index 0000000000..ba1c5df5b3 --- /dev/null +++ b/src/zarr/v2/sync.py @@ -0,0 +1,57 @@ +import os +from collections import defaultdict +from threading import Lock +from typing import Protocol + + +class Synchronizer(Protocol): + """Base class for synchronizers.""" + + def __getitem__(self, item): + # see subclasses + ... + + +class ThreadSynchronizer(Synchronizer): + """Provides synchronization using thread locks.""" + + def __init__(self): + self.mutex = Lock() + self.locks = defaultdict(Lock) + + def __getitem__(self, item): + with self.mutex: + return self.locks[item] + + def __getstate__(self): + return True + + def __setstate__(self, *args): + # reinitialize from scratch + self.__init__() + + +class ProcessSynchronizer(Synchronizer): + """Provides synchronization using file locks via the + `fasteners `_ + package. + + Parameters + ---------- + path : string + Path to a directory on a file system that is shared by all processes. + N.B., this should be a *different* path to where you store the array. 
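
    A minimal usage sketch (paths are illustrative; the synchronizer directory
    must live on a file system visible to all participating processes)::

        >>> import zarr
        >>> synchronizer = zarr.ProcessSynchronizer('data/example.sync')  # doctest: +SKIP
        >>> z = zarr.open_array('data/example.zarr', mode='a', shape=(100,),
        ...                     chunks=(10,), dtype='i4',
        ...                     synchronizer=synchronizer)  # doctest: +SKIP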
+ + """ + + def __init__(self, path): + self.path = path + + def __getitem__(self, item): + import fasteners + + path = os.path.join(self.path, item) + lock = fasteners.InterProcessLock(path) + return lock + + # pickling and unpickling should be handled automatically diff --git a/src/zarr/v2/tests/__init__.py b/src/zarr/v2/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/v2/tests/conftest.py b/src/zarr/v2/tests/conftest.py new file mode 100644 index 0000000000..aa73b8691e --- /dev/null +++ b/src/zarr/v2/tests/conftest.py @@ -0,0 +1,8 @@ +import pathlib + +import pytest + + +@pytest.fixture(params=[str, pathlib.Path]) +def path_type(request): + return request.param diff --git a/src/zarr/v2/tests/test_attrs.py b/src/zarr/v2/tests/test_attrs.py new file mode 100644 index 0000000000..2d9553971b --- /dev/null +++ b/src/zarr/v2/tests/test_attrs.py @@ -0,0 +1,298 @@ +import json + +import pathlib +import pytest + +import zarr +from zarr._storage.store import meta_root +from zarr.attrs import Attributes +from zarr.storage import KVStore, DirectoryStore +from zarr._storage.v3 import KVStoreV3 +from zarr.tests.util import CountingDict, CountingDictV3 +from zarr.hierarchy import group + + +@pytest.fixture(params=[2, 3]) +def zarr_version(request): + return request.param + + +def _init_store(version): + """Use a plain dict() for v2, but KVStoreV3 otherwise.""" + if version == 2: + return dict() + return KVStoreV3(dict()) + + +class TestAttributes: + def init_attributes(self, store, read_only=False, cache=True, zarr_version=2): + root = ".z" if zarr_version == 2 else meta_root + return Attributes(store, key=root + "attrs", read_only=read_only, cache=cache) + + def test_storage(self, zarr_version): + store = _init_store(zarr_version) + root = ".z" if zarr_version == 2 else meta_root + attrs_key = root + "attrs" + a = Attributes(store=store, key=attrs_key) + assert isinstance(a.store, KVStore) + assert "foo" not in a + assert "bar" not in a + assert dict() == a.asdict() + + a["foo"] = "bar" + a["baz"] = 42 + assert attrs_key in store + assert isinstance(store[attrs_key], bytes) + d = json.loads(str(store[attrs_key], "utf-8")) + if zarr_version == 3: + d = d["attributes"] + assert dict(foo="bar", baz=42) == d + + def test_utf8_encoding(self, zarr_version): + project_root = pathlib.Path(zarr.__file__).resolve().parent.parent + fixdir = project_root / "fixture" + testdir = fixdir / "utf8attrs" + if not testdir.exists(): # pragma: no cover + # store the data - should be one-time operation + testdir.mkdir(parents=True, exist_ok=True) + with (testdir / ".zattrs").open("w", encoding="utf-8") as f: + f.write('{"foo": "た"}') + with (testdir / ".zgroup").open("w", encoding="utf-8") as f: + f.write("""{\n "zarr_format": 2\n}""") + + # fixture data + fixture = group(store=DirectoryStore(str(fixdir))) + assert fixture["utf8attrs"].attrs.asdict() == dict(foo="た") + + def test_get_set_del_contains(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) + assert "foo" not in a + a["foo"] = "bar" + a["baz"] = 42 + assert "foo" in a + assert "baz" in a + assert "bar" == a["foo"] + assert 42 == a["baz"] + del a["foo"] + assert "foo" not in a + with pytest.raises(KeyError): + # noinspection PyStatementEffect + a["foo"] + + def test_update_put(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) + assert "foo" not in a + assert "bar" not in a + assert "baz" 
not in a + + a.update(foo="spam", bar=42, baz=4.2) + assert a["foo"] == "spam" + assert a["bar"] == 42 + assert a["baz"] == 4.2 + + a.put(dict(foo="eggs", bar=84)) + assert a["foo"] == "eggs" + assert a["bar"] == 84 + assert "baz" not in a + + def test_iterators(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) + assert 0 == len(a) + assert set() == set(a) + assert set() == set(a.keys()) + assert set() == set(a.values()) + assert set() == set(a.items()) + + a["foo"] = "bar" + a["baz"] = 42 + + assert 2 == len(a) + assert {"foo", "baz"} == set(a) + assert {"foo", "baz"} == set(a.keys()) + assert {"bar", 42} == set(a.values()) + assert {("foo", "bar"), ("baz", 42)} == set(a.items()) + + def test_read_only(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, read_only=True, zarr_version=zarr_version) + if zarr_version == 2: + store[".zattrs"] = json.dumps(dict(foo="bar", baz=42)).encode("ascii") + else: + store["meta/root/attrs"] = json.dumps(dict(attributes=dict(foo="bar", baz=42))).encode( + "ascii" + ) + assert a["foo"] == "bar" + assert a["baz"] == 42 + with pytest.raises(PermissionError): + a["foo"] = "quux" + with pytest.raises(PermissionError): + del a["foo"] + with pytest.raises(PermissionError): + a.update(foo="quux") + + def test_key_completions(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) + d = a._ipython_key_completions_() + assert "foo" not in d + assert "123" not in d + assert "baz" not in d + assert "asdf;" not in d + a["foo"] = 42 + a["123"] = 4.2 + a["asdf;"] = "ghjkl;" + d = a._ipython_key_completions_() + assert "foo" in d + assert "123" in d + assert "asdf;" in d + assert "baz" not in d + + def test_caching_on(self, zarr_version): + # caching is turned on by default + + # setup store + store = CountingDict() if zarr_version == 2 else CountingDictV3() + attrs_key = ".zattrs" if zarr_version == 2 else "meta/root/attrs" + assert 0 == store.counter["__getitem__", attrs_key] + assert 0 == store.counter["__setitem__", attrs_key] + if zarr_version == 2: + store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") + else: + store[attrs_key] = json.dumps(dict(attributes=dict(foo="xxx", bar=42))).encode("ascii") + assert 0 == store.counter["__getitem__", attrs_key] + assert 1 == store.counter["__setitem__", attrs_key] + + # setup attributes + a = self.init_attributes(store, zarr_version=zarr_version) + + # test __getitem__ causes all attributes to be cached + assert a["foo"] == "xxx" + assert 1 == store.counter["__getitem__", attrs_key] + assert a["bar"] == 42 + assert 1 == store.counter["__getitem__", attrs_key] + assert a["foo"] == "xxx" + assert 1 == store.counter["__getitem__", attrs_key] + + # test __setitem__ updates the cache + a["foo"] = "yyy" + get_cnt = 2 if zarr_version == 2 else 3 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 2 == store.counter["__setitem__", attrs_key] + assert a["foo"] == "yyy" + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 2 == store.counter["__setitem__", attrs_key] + + # test update() updates the cache + a.update(foo="zzz", bar=84) + get_cnt = 3 if zarr_version == 2 else 5 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + assert a["foo"] == "zzz" + assert a["bar"] == 84 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == 
store.counter["__setitem__", attrs_key] + + # test __contains__ uses the cache + assert "foo" in a + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + assert "spam" not in a + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + + # test __delitem__ updates the cache + del a["bar"] + get_cnt = 4 if zarr_version == 2 else 7 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 4 == store.counter["__setitem__", attrs_key] + assert "bar" not in a + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 4 == store.counter["__setitem__", attrs_key] + + # test refresh() + if zarr_version == 2: + store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") + else: + store[attrs_key] = json.dumps(dict(attributes=dict(foo="xxx", bar=42))).encode("ascii") + assert get_cnt == store.counter["__getitem__", attrs_key] + a.refresh() + get_cnt = 5 if zarr_version == 2 else 8 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert a["foo"] == "xxx" + assert get_cnt == store.counter["__getitem__", attrs_key] + assert a["bar"] == 42 + assert get_cnt == store.counter["__getitem__", attrs_key] + + def test_caching_off(self, zarr_version): + # setup store + store = CountingDict() if zarr_version == 2 else CountingDictV3() + attrs_key = ".zattrs" if zarr_version == 2 else "meta/root/attrs" + assert 0 == store.counter["__getitem__", attrs_key] + assert 0 == store.counter["__setitem__", attrs_key] + + if zarr_version == 2: + store[attrs_key] = json.dumps(dict(foo="xxx", bar=42)).encode("ascii") + else: + store[attrs_key] = json.dumps(dict(attributes=dict(foo="xxx", bar=42))).encode("ascii") + assert 0 == store.counter["__getitem__", attrs_key] + assert 1 == store.counter["__setitem__", attrs_key] + + # setup attributes + a = self.init_attributes(store, cache=False, zarr_version=zarr_version) + + # test __getitem__ + assert a["foo"] == "xxx" + assert 1 == store.counter["__getitem__", attrs_key] + assert a["bar"] == 42 + assert 2 == store.counter["__getitem__", attrs_key] + assert a["foo"] == "xxx" + assert 3 == store.counter["__getitem__", attrs_key] + + # test __setitem__ + a["foo"] = "yyy" + get_cnt = 4 if zarr_version == 2 else 5 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 2 == store.counter["__setitem__", attrs_key] + assert a["foo"] == "yyy" + get_cnt = 5 if zarr_version == 2 else 6 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 2 == store.counter["__setitem__", attrs_key] + + # test update() + a.update(foo="zzz", bar=84) + get_cnt = 6 if zarr_version == 2 else 8 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + assert a["foo"] == "zzz" + assert a["bar"] == 84 + get_cnt = 8 if zarr_version == 2 else 10 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + + # test __contains__ + assert "foo" in a + get_cnt = 9 if zarr_version == 2 else 11 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + assert "spam" not in a + get_cnt = 10 if zarr_version == 2 else 12 + assert get_cnt == store.counter["__getitem__", attrs_key] + assert 3 == store.counter["__setitem__", attrs_key] + + def test_wrong_keys(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, 
zarr_version=zarr_version) + + warning_msg = "only attribute keys of type 'string' will be allowed in the future" + + with pytest.warns(DeprecationWarning, match=warning_msg): + a[1] = "foo" + + with pytest.warns(DeprecationWarning, match=warning_msg): + a.put({1: "foo"}) + + with pytest.warns(DeprecationWarning, match=warning_msg): + a.update({1: "foo"}) diff --git a/src/zarr/v2/tests/test_convenience.py b/src/zarr/v2/tests/test_convenience.py new file mode 100644 index 0000000000..7d190adc2c --- /dev/null +++ b/src/zarr/v2/tests/test_convenience.py @@ -0,0 +1,1047 @@ +import atexit +import tempfile +import unittest +from numbers import Integral + +import numpy as np +import pytest +from numcodecs import Adler32, Zlib +from numpy.testing import assert_array_equal + +import zarr +from zarr.convenience import ( + consolidate_metadata, + copy, + copy_store, + load, + open, + open_consolidated, + save, + save_group, + save_array, + copy_all, +) +from zarr.core import Array +from zarr.errors import CopyError +from zarr.hierarchy import Group, group +from zarr.storage import ( + ConsolidatedMetadataStore, + FSStore, + KVStore, + MemoryStore, + atexit_rmtree, + data_root, + meta_root, + getsize, +) +from zarr._storage.store import v3_api_available +from zarr._storage.v3 import ( + ConsolidatedMetadataStoreV3, + DirectoryStoreV3, + FSStoreV3, + KVStoreV3, + MemoryStoreV3, + SQLiteStoreV3, +) +from zarr.tests.util import have_fsspec + +_VERSIONS = (2, 3) if v3_api_available else (2,) + + +def _init_creation_kwargs(zarr_version): + kwargs = {"zarr_version": zarr_version} + if zarr_version == 3: + kwargs["path"] = "dataset" + return kwargs + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_open_array(path_type, zarr_version): + store = tempfile.mkdtemp() + atexit.register(atexit_rmtree, store) + store = path_type(store) + kwargs = _init_creation_kwargs(zarr_version) + + # open array, create if doesn't exist + z = open(store, mode="a", shape=100, **kwargs) + assert isinstance(z, Array) + assert z.shape == (100,) + + # open array, overwrite + z = open(store, mode="w", shape=200, **kwargs) + assert isinstance(z, Array) + assert z.shape == (200,) + + # open array, read-only + z = open(store, mode="r", **kwargs) + assert isinstance(z, Array) + assert z.shape == (200,) + assert z.read_only + + # path not found + with pytest.raises(ValueError): + open("doesnotexist", mode="r") + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_open_group(path_type, zarr_version): + store = tempfile.mkdtemp() + atexit.register(atexit_rmtree, store) + store = path_type(store) + kwargs = _init_creation_kwargs(zarr_version) + + # open group, create if doesn't exist + g = open(store, mode="a", **kwargs) + g.create_group("foo") + assert isinstance(g, Group) + assert "foo" in g + + # open group, overwrite + g = open(store, mode="w", **kwargs) + assert isinstance(g, Group) + assert "foo" not in g + + # open group, read-only + g = open(store, mode="r", **kwargs) + assert isinstance(g, Group) + assert g.read_only + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_save_errors(zarr_version): + with pytest.raises(ValueError): + # no arrays provided + save_group("data/group.zarr", zarr_version=zarr_version) + with pytest.raises(TypeError): + # no array provided + save_array("data/group.zarr", zarr_version=zarr_version) + with pytest.raises(ValueError): + # no arrays provided + save("data/group.zarr", zarr_version=zarr_version) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is 
disabled") +def test_zarr_v3_save_multiple_unnamed(): + x = np.ones(8) + y = np.zeros(8) + store = KVStoreV3(dict()) + # no path provided + save_group(store, x, y, path="dataset", zarr_version=3) + # names become arr_{i} for unnamed *args + assert data_root + "dataset/arr_0/c0" in store + assert data_root + "dataset/arr_1/c0" in store + assert meta_root + "dataset/arr_0.array.json" in store + assert meta_root + "dataset/arr_1.array.json" in store + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +def test_zarr_v3_save_errors(): + x = np.ones(8) + with pytest.raises(ValueError): + # no path provided + save_group("data/group.zr3", x, zarr_version=3) + with pytest.raises(ValueError): + # no path provided + save_array("data/group.zr3", x, zarr_version=3) + with pytest.raises(ValueError): + # no path provided + save("data/group.zr3", x, zarr_version=3) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_lazy_loader(zarr_version): + foo = np.arange(100) + bar = np.arange(100, 0, -1) + store = "data/group.zarr" if zarr_version == 2 else "data/group.zr3" + kwargs = _init_creation_kwargs(zarr_version) + save(store, foo=foo, bar=bar, **kwargs) + loader = load(store, **kwargs) + assert "foo" in loader + assert "bar" in loader + assert "baz" not in loader + assert len(loader) == 2 + assert sorted(loader) == ["bar", "foo"] + assert_array_equal(foo, loader["foo"]) + assert_array_equal(bar, loader["bar"]) + assert "LazyLoader: " in repr(loader) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_load_array(zarr_version): + foo = np.arange(100) + bar = np.arange(100, 0, -1) + store = "data/group.zarr" if zarr_version == 2 else "data/group.zr3" + kwargs = _init_creation_kwargs(zarr_version) + save(store, foo=foo, bar=bar, **kwargs) + + # can also load arrays directly into a numpy array + for array_name in ["foo", "bar"]: + array_path = "dataset/" + array_name if zarr_version == 3 else array_name + array = load(store, path=array_path, zarr_version=zarr_version) + assert isinstance(array, np.ndarray) + if array_name == "foo": + assert_array_equal(foo, array) + else: + assert_array_equal(bar, array) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_tree(zarr_version): + kwargs = _init_creation_kwargs(zarr_version) + g1 = zarr.group(**kwargs) + g1.create_group("foo") + g3 = g1.create_group("bar") + g3.create_group("baz") + g5 = g3.create_group("qux") + g5.create_dataset("baz", shape=100, chunks=10) + assert repr(zarr.tree(g1)) == repr(g1.tree()) + assert str(zarr.tree(g1)) == str(g1.tree()) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("stores_from_path", [False, True]) +@pytest.mark.parametrize( + "with_chunk_store,listable", + [(False, True), (True, True), (False, False)], + ids=["default-listable", "with_chunk_store-listable", "default-unlistable"], +) +def test_consolidate_metadata( + with_chunk_store, zarr_version, listable, monkeypatch, stores_from_path +): + # setup initial data + if stores_from_path: + store = tempfile.mkdtemp() + atexit.register(atexit_rmtree, store) + if with_chunk_store: + chunk_store = tempfile.mkdtemp() + atexit.register(atexit_rmtree, chunk_store) + else: + chunk_store = None + version_kwarg = {"zarr_version": zarr_version} + else: + if zarr_version == 2: + store = MemoryStore() + chunk_store = MemoryStore() if with_chunk_store else None + elif zarr_version == 3: + store = MemoryStoreV3() + chunk_store = MemoryStoreV3() if with_chunk_store else None + version_kwarg = {} + 
path = "dataset" if zarr_version == 3 else None + z = group(store, chunk_store=chunk_store, path=path, **version_kwarg) + + # Reload the actual store implementation in case str + store_to_copy = z.store + + z.create_group("g1") + g2 = z.create_group("g2") + g2.attrs["hello"] = "world" + arr = g2.create_dataset("arr", shape=(20, 20), chunks=(5, 5), dtype="f8") + assert 16 == arr.nchunks + assert 0 == arr.nchunks_initialized + arr.attrs["data"] = 1 + arr[:] = 1.0 + assert 16 == arr.nchunks_initialized + + if stores_from_path: + # get the actual store class for use with consolidate_metadata + store_class = z._store + else: + store_class = store + + if zarr_version == 3: + # error on v3 if path not provided + with pytest.raises(ValueError): + consolidate_metadata(store_class, path=None) + + with pytest.raises(ValueError): + consolidate_metadata(store_class, path="") + + # perform consolidation + out = consolidate_metadata(store_class, path=path) + assert isinstance(out, Group) + assert ["g1", "g2"] == list(out) + if not stores_from_path: + if zarr_version == 2: + assert isinstance(out._store, ConsolidatedMetadataStore) + assert ".zmetadata" in store + meta_keys = [ + ".zgroup", + "g1/.zgroup", + "g2/.zgroup", + "g2/.zattrs", + "g2/arr/.zarray", + "g2/arr/.zattrs", + ] + else: + assert isinstance(out._store, ConsolidatedMetadataStoreV3) + assert "meta/root/consolidated/.zmetadata" in store + meta_keys = [ + "zarr.json", + meta_root + "dataset.group.json", + meta_root + "dataset/g1.group.json", + meta_root + "dataset/g2.group.json", + meta_root + "dataset/g2/arr.array.json", + "meta/root/consolidated.group.json", + ] + for key in meta_keys: + del store[key] + + # https://github.com/zarr-developers/zarr-python/issues/993 + # Make sure we can still open consolidated on an unlistable store: + if not listable: + fs_memory = pytest.importorskip("fsspec.implementations.memory") + monkeypatch.setattr(fs_memory.MemoryFileSystem, "isdir", lambda x, y: False) + monkeypatch.delattr(fs_memory.MemoryFileSystem, "ls") + fs = fs_memory.MemoryFileSystem() + if zarr_version == 2: + store_to_open = FSStore("", fs=fs) + else: + store_to_open = FSStoreV3("", fs=fs) + + # copy original store to new unlistable store + store_to_open.update(store_to_copy) + + else: + store_to_open = store + + # open consolidated + z2 = open_consolidated(store_to_open, chunk_store=chunk_store, path=path, **version_kwarg) + assert ["g1", "g2"] == list(z2) + assert "world" == z2.g2.attrs["hello"] + assert 1 == z2.g2.arr.attrs["data"] + assert (z2.g2.arr[:] == 1.0).all() + assert 16 == z2.g2.arr.nchunks + if listable: + assert 16 == z2.g2.arr.nchunks_initialized + else: + with pytest.raises(NotImplementedError): + _ = z2.g2.arr.nchunks_initialized + + if stores_from_path: + # path string is note a BaseStore subclass so cannot be used to + # initialize a ConsolidatedMetadataStore. 
+ if zarr_version == 2: + with pytest.raises(ValueError): + cmd = ConsolidatedMetadataStore(store) + elif zarr_version == 3: + with pytest.raises(ValueError): + cmd = ConsolidatedMetadataStoreV3(store) + else: + # tests del/write on the store + if zarr_version == 2: + cmd = ConsolidatedMetadataStore(store) + with pytest.raises(PermissionError): + del cmd[".zgroup"] + with pytest.raises(PermissionError): + cmd[".zgroup"] = None + else: + cmd = ConsolidatedMetadataStoreV3(store) + with pytest.raises(PermissionError): + del cmd[meta_root + "dataset.group.json"] + with pytest.raises(PermissionError): + cmd[meta_root + "dataset.group.json"] = None + + # test getsize on the store + assert isinstance(getsize(cmd), Integral) + + # test new metadata are not writeable + with pytest.raises(PermissionError): + z2.create_group("g3") + with pytest.raises(PermissionError): + z2.create_dataset("spam", shape=42, chunks=7, dtype="i4") + with pytest.raises(PermissionError): + del z2["g2"] + + # test consolidated metadata are not writeable + with pytest.raises(PermissionError): + z2.g2.attrs["hello"] = "universe" + with pytest.raises(PermissionError): + z2.g2.arr.attrs["foo"] = "bar" + + # test the data are writeable + z2.g2.arr[:] = 2 + assert (z2.g2.arr[:] == 2).all() + + # test invalid modes + with pytest.raises(ValueError): + open_consolidated(store, chunk_store=chunk_store, mode="a", path=path) + with pytest.raises(ValueError): + open_consolidated(store, chunk_store=chunk_store, mode="w", path=path) + with pytest.raises(ValueError): + open_consolidated(store, chunk_store=chunk_store, mode="w-", path=path) + + # make sure keyword arguments are passed through without error + open_consolidated( + store, + chunk_store=chunk_store, + path=path, + cache_attrs=True, + synchronizer=None, + **version_kwarg, + ) + + +@pytest.mark.parametrize( + "options", + ( + {"dimension_separator": "/"}, + {"dimension_separator": "."}, + {"dimension_separator": None}, + ), +) +def test_save_array_separator(tmpdir, options): + data = np.arange(6).reshape((3, 2)) + url = tmpdir.join("test.zarr") + save_array(url, data, **options) + + +class TestCopyStore(unittest.TestCase): + _version = 2 + + def setUp(self): + source = dict() + source["foo"] = b"xxx" + source["bar/baz"] = b"yyy" + source["bar/qux"] = b"zzz" + self.source = source + + def _get_dest_store(self): + return dict() + + def test_no_paths(self): + source = self.source + dest = self._get_dest_store() + copy_store(source, dest) + assert len(source) == len(dest) + for key in source: + assert source[key] == dest[key] + + def test_source_path(self): + source = self.source + # paths should be normalized + for source_path in "bar", "bar/", "/bar", "/bar/": + dest = self._get_dest_store() + copy_store(source, dest, source_path=source_path) + assert 2 == len(dest) + for key in source: + if key.startswith("bar/"): + dest_key = key.split("bar/")[1] + assert source[key] == dest[dest_key] + else: + assert key not in dest + + def test_dest_path(self): + source = self.source + # paths should be normalized + for dest_path in "new", "new/", "/new", "/new/": + dest = self._get_dest_store() + copy_store(source, dest, dest_path=dest_path) + assert len(source) == len(dest) + for key in source: + if self._version == 3: + dest_key = key[:10] + "new/" + key[10:] + else: + dest_key = "new/" + key + assert source[key] == dest[dest_key] + + def test_source_dest_path(self): + source = self.source + # paths should be normalized + for source_path in "bar", "bar/", "/bar", "/bar/": + for dest_path in 
"new", "new/", "/new", "/new/": + dest = self._get_dest_store() + copy_store(source, dest, source_path=source_path, dest_path=dest_path) + assert 2 == len(dest) + for key in source: + if key.startswith("bar/"): + dest_key = "new/" + key.split("bar/")[1] + assert source[key] == dest[dest_key] + else: + assert key not in dest + assert ("new/" + key) not in dest + + def test_excludes_includes(self): + source = self.source + + # single excludes + dest = self._get_dest_store() + excludes = "f.*" + copy_store(source, dest, excludes=excludes) + assert len(dest) == 2 + + root = "" if self._version == 2 else meta_root + assert root + "foo" not in dest + + # multiple excludes + dest = self._get_dest_store() + excludes = "b.z", ".*x" + copy_store(source, dest, excludes=excludes) + assert len(dest) == 1 + assert root + "foo" in dest + assert root + "bar/baz" not in dest + assert root + "bar/qux" not in dest + + # excludes and includes + dest = self._get_dest_store() + excludes = "b.*" + includes = ".*x" + copy_store(source, dest, excludes=excludes, includes=includes) + assert len(dest) == 2 + assert root + "foo" in dest + assert root + "bar/baz" not in dest + assert root + "bar/qux" in dest + + def test_dry_run(self): + source = self.source + dest = self._get_dest_store() + copy_store(source, dest, dry_run=True) + assert 0 == len(dest) + + def test_if_exists(self): + source = self.source + dest = self._get_dest_store() + root = "" if self._version == 2 else meta_root + dest[root + "bar/baz"] = b"mmm" + + # default ('raise') + with pytest.raises(CopyError): + copy_store(source, dest) + + # explicit 'raise' + with pytest.raises(CopyError): + copy_store(source, dest, if_exists="raise") + + # skip + copy_store(source, dest, if_exists="skip") + assert 3 == len(dest) + assert dest[root + "foo"] == b"xxx" + assert dest[root + "bar/baz"] == b"mmm" + assert dest[root + "bar/qux"] == b"zzz" + + # replace + copy_store(source, dest, if_exists="replace") + assert 3 == len(dest) + assert dest[root + "foo"] == b"xxx" + assert dest[root + "bar/baz"] == b"yyy" + assert dest[root + "bar/qux"] == b"zzz" + + # invalid option + with pytest.raises(ValueError): + copy_store(source, dest, if_exists="foobar") + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestCopyStoreV3(TestCopyStore): + _version = 3 + + def setUp(self): + source = KVStoreV3(dict()) + source["meta/root/foo"] = b"xxx" + source["meta/root/bar/baz"] = b"yyy" + source["meta/root/bar/qux"] = b"zzz" + self.source = source + + def _get_dest_store(self): + return KVStoreV3(dict()) + + def test_mismatched_store_versions(self): + # cannot copy between stores of mixed Zarr versions + dest = KVStore(dict()) + with pytest.raises(ValueError): + copy_store(self.source, dest) + + +def check_copied_array(original, copied, without_attrs=False, expect_props=None): + # setup + source_h5py = original.__module__.startswith("h5py.") + dest_h5py = copied.__module__.startswith("h5py.") + zarr_to_zarr = not (source_h5py or dest_h5py) + h5py_to_h5py = source_h5py and dest_h5py + zarr_to_h5py = not source_h5py and dest_h5py + h5py_to_zarr = source_h5py and not dest_h5py + if expect_props is None: + expect_props = dict() + else: + expect_props = expect_props.copy() + + # common properties in zarr and h5py + for p in "dtype", "shape", "chunks": + expect_props.setdefault(p, getattr(original, p)) + + # zarr-specific properties + if zarr_to_zarr: + for p in "compressor", "filters", "order", "fill_value": + expect_props.setdefault(p, getattr(original, p)) + + 
# h5py-specific properties + if h5py_to_h5py: + for p in ( + "maxshape", + "compression", + "compression_opts", + "shuffle", + "scaleoffset", + "fletcher32", + "fillvalue", + ): + expect_props.setdefault(p, getattr(original, p)) + + # common properties with some name differences + if h5py_to_zarr: + expect_props.setdefault("fill_value", original.fillvalue) + if zarr_to_h5py: + expect_props.setdefault("fillvalue", original.fill_value) + + # compare properties + for k, v in expect_props.items(): + assert v == getattr(copied, k) + + # compare data + assert_array_equal(original[:], copied[:]) + + # compare attrs + if without_attrs: + for k in original.attrs.keys(): + assert k not in copied.attrs + else: + if dest_h5py and "filters" in original.attrs: + # special case in v3 (storing filters metadata under attributes) + # we explicitly do not copy this info over to HDF5 + original_attrs = original.attrs.asdict().copy() + original_attrs.pop("filters") + else: + original_attrs = original.attrs + assert sorted(original_attrs.items()) == sorted(copied.attrs.items()) + + +def check_copied_group(original, copied, without_attrs=False, expect_props=None, shallow=False): + # setup + if expect_props is None: + expect_props = dict() + else: + expect_props = expect_props.copy() + + # compare children + for k, v in original.items(): + if hasattr(v, "shape"): + assert k in copied + check_copied_array(v, copied[k], without_attrs=without_attrs, expect_props=expect_props) + elif shallow: + assert k not in copied + else: + assert k in copied + check_copied_group( + v, + copied[k], + without_attrs=without_attrs, + shallow=shallow, + expect_props=expect_props, + ) + + # compare attrs + if without_attrs: + for k in original.attrs.keys(): + assert k not in copied.attrs + else: + assert sorted(original.attrs.items()) == sorted(copied.attrs.items()) + + +def test_copy_all(): + """ + https://github.com/zarr-developers/zarr-python/issues/269 + + copy_all used to not copy attributes as `.keys()` does not return hidden `.zattrs`. 
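
    The assertions below therefore check that both the group-level and the
    subgroup-level attributes survive the copy.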
+ + """ + original_group = zarr.group(store=MemoryStore(), overwrite=True) + original_group.attrs["info"] = "group attrs" + original_subgroup = original_group.create_group("subgroup") + original_subgroup.attrs["info"] = "sub attrs" + + destination_group = zarr.group(store=MemoryStore(), overwrite=True) + + # copy from memory to directory store + copy_all( + original_group, + destination_group, + dry_run=False, + ) + + assert "subgroup" in destination_group + assert destination_group.attrs["info"] == "group attrs" + assert destination_group.subgroup.attrs["info"] == "sub attrs" + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +def test_copy_all_v3(): + """ + https://github.com/zarr-developers/zarr-python/issues/269 + + copy_all used to not copy attributes as `.keys()` + + """ + original_group = zarr.group(store=MemoryStoreV3(), path="group1", overwrite=True) + original_group.create_group("subgroup") + + destination_group = zarr.group(store=MemoryStoreV3(), path="group2", overwrite=True) + + # copy from memory to directory store + copy_all( + original_group, + destination_group, + dry_run=False, + ) + assert "subgroup" in destination_group + + +class TestCopy: + @pytest.fixture(params=[False, True], ids=["zarr", "hdf5"]) + def source(self, request, tmpdir): + def prep_source(source): + foo = source.create_group("foo") + foo.attrs["experiment"] = "weird science" + baz = foo.create_dataset("bar/baz", data=np.arange(100), chunks=(50,)) + baz.attrs["units"] = "metres" + if request.param: + extra_kws = dict( + compression="gzip", + compression_opts=3, + fillvalue=84, + shuffle=True, + fletcher32=True, + ) + else: + extra_kws = dict(compressor=Zlib(3), order="F", fill_value=42, filters=[Adler32()]) + source.create_dataset( + "spam", + data=np.arange(100, 200).reshape(20, 5), + chunks=(10, 2), + dtype="i2", + **extra_kws, + ) + return source + + if request.param: + h5py = pytest.importorskip("h5py") + fn = tmpdir.join("source.h5") + with h5py.File(str(fn), mode="w") as h5f: + yield prep_source(h5f) + else: + yield prep_source(group()) + + @pytest.fixture(params=[False, True], ids=["zarr", "hdf5"]) + def dest(self, request, tmpdir): + if request.param: + h5py = pytest.importorskip("h5py") + fn = tmpdir.join("dest.h5") + with h5py.File(str(fn), mode="w") as h5f: + yield h5f + else: + yield group() + + def test_copy_array(self, source, dest): + # copy array with default options + copy(source["foo/bar/baz"], dest) + check_copied_array(source["foo/bar/baz"], dest["baz"]) + copy(source["spam"], dest) + check_copied_array(source["spam"], dest["spam"]) + + def test_copy_bad_dest(self, source, dest): + # try to copy to an array, dest must be a group + dest = dest.create_dataset("eggs", shape=(100,)) + with pytest.raises(ValueError): + copy(source["foo/bar/baz"], dest) + + def test_copy_array_name(self, source, dest): + # copy array with name + copy(source["foo/bar/baz"], dest, name="qux") + assert "baz" not in dest + check_copied_array(source["foo/bar/baz"], dest["qux"]) + + def test_copy_array_create_options(self, source, dest): + dest_h5py = dest.__module__.startswith("h5py.") + + # copy array, provide creation options + compressor = Zlib(9) + create_kws = dict(chunks=(10,)) + if dest_h5py: + create_kws.update( + compression="gzip", compression_opts=9, shuffle=True, fletcher32=True, fillvalue=42 + ) + else: + create_kws.update(compressor=compressor, fill_value=42, order="F", filters=[Adler32()]) + copy(source["foo/bar/baz"], dest, without_attrs=True, **create_kws) + 
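        # the copied array should reflect the explicitly requested creation
        # options (compressor, chunks, fill value), not the source array's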
check_copied_array( + source["foo/bar/baz"], dest["baz"], without_attrs=True, expect_props=create_kws + ) + + def test_copy_array_exists_array(self, source, dest): + # copy array, dest array in the way + dest.create_dataset("baz", shape=(10,)) + + # raise + with pytest.raises(CopyError): + # should raise by default + copy(source["foo/bar/baz"], dest) + assert (10,) == dest["baz"].shape + with pytest.raises(CopyError): + copy(source["foo/bar/baz"], dest, if_exists="raise") + assert (10,) == dest["baz"].shape + + # skip + copy(source["foo/bar/baz"], dest, if_exists="skip") + assert (10,) == dest["baz"].shape + + # replace + copy(source["foo/bar/baz"], dest, if_exists="replace") + check_copied_array(source["foo/bar/baz"], dest["baz"]) + + # invalid option + with pytest.raises(ValueError): + copy(source["foo/bar/baz"], dest, if_exists="foobar") + + def test_copy_array_exists_group(self, source, dest): + # copy array, dest group in the way + dest.create_group("baz") + + # raise + with pytest.raises(CopyError): + copy(source["foo/bar/baz"], dest) + assert not hasattr(dest["baz"], "shape") + with pytest.raises(CopyError): + copy(source["foo/bar/baz"], dest, if_exists="raise") + assert not hasattr(dest["baz"], "shape") + + # skip + copy(source["foo/bar/baz"], dest, if_exists="skip") + assert not hasattr(dest["baz"], "shape") + + # replace + copy(source["foo/bar/baz"], dest, if_exists="replace") + check_copied_array(source["foo/bar/baz"], dest["baz"]) + + def test_copy_array_skip_initialized(self, source, dest): + dest_h5py = dest.__module__.startswith("h5py.") + + dest.create_dataset("baz", shape=(100,), chunks=(10,), dtype="i8") + assert not np.all(source["foo/bar/baz"][:] == dest["baz"][:]) + + if dest_h5py: + with pytest.raises(ValueError): + # not available with copy to h5py + copy(source["foo/bar/baz"], dest, if_exists="skip_initialized") + + else: + # copy array, dest array exists but not yet initialized + copy(source["foo/bar/baz"], dest, if_exists="skip_initialized") + check_copied_array(source["foo/bar/baz"], dest["baz"]) + + # copy array, dest array exists and initialized, will be skipped + dest["baz"][:] = np.arange(100, 200) + copy(source["foo/bar/baz"], dest, if_exists="skip_initialized") + assert_array_equal(np.arange(100, 200), dest["baz"][:]) + assert not np.all(source["foo/bar/baz"][:] == dest["baz"][:]) + + def test_copy_group(self, source, dest): + # copy group, default options + copy(source["foo"], dest) + check_copied_group(source["foo"], dest["foo"]) + + def test_copy_group_no_name(self, source, dest): + with pytest.raises(TypeError): + # need a name if copy root + copy(source, dest) + + copy(source, dest, name="root") + check_copied_group(source, dest["root"]) + + def test_copy_group_options(self, source, dest): + # copy group, non-default options + copy(source["foo"], dest, name="qux", without_attrs=True) + assert "foo" not in dest + check_copied_group(source["foo"], dest["qux"], without_attrs=True) + + def test_copy_group_shallow(self, source, dest): + # copy group, shallow + copy(source, dest, name="eggs", shallow=True) + check_copied_group(source, dest["eggs"], shallow=True) + + def test_copy_group_exists_group(self, source, dest): + # copy group, dest groups exist + dest.create_group("foo/bar") + copy(source["foo"], dest) + check_copied_group(source["foo"], dest["foo"]) + + def test_copy_group_exists_array(self, source, dest): + # copy group, dest array in the way + dest.create_dataset("foo/bar", shape=(10,)) + + # raise + with pytest.raises(CopyError): + 
copy(source["foo"], dest) + assert dest["foo/bar"].shape == (10,) + with pytest.raises(CopyError): + copy(source["foo"], dest, if_exists="raise") + assert dest["foo/bar"].shape == (10,) + + # skip + copy(source["foo"], dest, if_exists="skip") + assert dest["foo/bar"].shape == (10,) + + # replace + copy(source["foo"], dest, if_exists="replace") + check_copied_group(source["foo"], dest["foo"]) + + def test_copy_group_dry_run(self, source, dest): + # dry run, empty destination + n_copied, n_skipped, n_bytes_copied = copy( + source["foo"], dest, dry_run=True, return_stats=True + ) + assert 0 == len(dest) + assert 3 == n_copied + assert 0 == n_skipped + assert 0 == n_bytes_copied + + # dry run, array exists in destination + baz = np.arange(100, 200) + dest.create_dataset("foo/bar/baz", data=baz) + assert not np.all(source["foo/bar/baz"][:] == dest["foo/bar/baz"][:]) + assert 1 == len(dest) + + # raise + with pytest.raises(CopyError): + copy(source["foo"], dest, dry_run=True) + assert 1 == len(dest) + + # skip + n_copied, n_skipped, n_bytes_copied = copy( + source["foo"], dest, dry_run=True, if_exists="skip", return_stats=True + ) + assert 1 == len(dest) + assert 2 == n_copied + assert 1 == n_skipped + assert 0 == n_bytes_copied + assert_array_equal(baz, dest["foo/bar/baz"]) + + # replace + n_copied, n_skipped, n_bytes_copied = copy( + source["foo"], dest, dry_run=True, if_exists="replace", return_stats=True + ) + assert 1 == len(dest) + assert 3 == n_copied + assert 0 == n_skipped + assert 0 == n_bytes_copied + assert_array_equal(baz, dest["foo/bar/baz"]) + + def test_logging(self, source, dest, tmpdir): + # callable log + copy(source["foo"], dest, dry_run=True, log=print) + + # file name + fn = str(tmpdir.join("log_name")) + copy(source["foo"], dest, dry_run=True, log=fn) + + # file + with tmpdir.join("log_file").open(mode="w") as f: + copy(source["foo"], dest, dry_run=True, log=f) + + # bad option + with pytest.raises(TypeError): + copy(source["foo"], dest, dry_run=True, log=True) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestCopyV3(TestCopy): + @pytest.fixture(params=["zarr", "hdf5"]) + def source(self, request, tmpdir): + def prep_source(source): + foo = source.create_group("foo") + foo.attrs["experiment"] = "weird science" + baz = foo.create_dataset("bar/baz", data=np.arange(100), chunks=(50,)) + baz.attrs["units"] = "metres" + if request.param == "hdf5": + extra_kws = dict( + compression="gzip", + compression_opts=3, + fillvalue=84, + shuffle=True, + fletcher32=True, + ) + else: + extra_kws = dict(compressor=Zlib(3), order="F", fill_value=42, filters=[Adler32()]) + source.create_dataset( + "spam", + data=np.arange(100, 200).reshape(20, 5), + chunks=(10, 2), + dtype="i2", + **extra_kws, + ) + return source + + if request.param == "hdf5": + h5py = pytest.importorskip("h5py") + fn = tmpdir.join("source.h5") + with h5py.File(str(fn), mode="w") as h5f: + yield prep_source(h5f) + elif request.param == "zarr": + yield prep_source(group(path="group1", zarr_version=3)) + + # Test with various destination StoreV3 types as TestCopyV3 covers rmdir + destinations = ["hdf5", "zarr", "zarr_kvstore", "zarr_directorystore", "zarr_sqlitestore"] + if have_fsspec: + destinations += ["zarr_fsstore"] + + @pytest.fixture(params=destinations) + def dest(self, request, tmpdir): + if request.param == "hdf5": + h5py = pytest.importorskip("h5py") + fn = tmpdir.join("dest.h5") + with h5py.File(str(fn), mode="w") as h5f: + yield h5f + elif request.param == "zarr": + yield 
group(path="group2", zarr_version=3) + elif request.param == "zarr_kvstore": + store = KVStoreV3(dict()) + yield group(store, path="group2", zarr_version=3) + elif request.param == "zarr_fsstore": + fn = tmpdir.join("dest.zr3") + store = FSStoreV3(str(fn), auto_mkdir=True) + yield group(store, path="group2", zarr_version=3) + elif request.param == "zarr_directorystore": + fn = tmpdir.join("dest.zr3") + store = DirectoryStoreV3(str(fn)) + yield group(store, path="group2", zarr_version=3) + elif request.param == "zarr_sqlitestore": + fn = tmpdir.join("dest.db") + store = SQLiteStoreV3(str(fn)) + yield group(store, path="group2", zarr_version=3) + + def test_copy_array_create_options(self, source, dest): + dest_h5py = dest.__module__.startswith("h5py.") + + # copy array, provide creation options + compressor = Zlib(9) + create_kws = dict(chunks=(10,)) + if dest_h5py: + create_kws.update( + compression="gzip", compression_opts=9, shuffle=True, fletcher32=True, fillvalue=42 + ) + else: + # v3 case has no filters argument in zarr create_kws + create_kws.update(compressor=compressor, fill_value=42, order="F") + copy(source["foo/bar/baz"], dest, without_attrs=True, **create_kws) + check_copied_array( + source["foo/bar/baz"], dest["baz"], without_attrs=True, expect_props=create_kws + ) + + def test_copy_group_no_name(self, source, dest): + if source.__module__.startswith("h5py"): + with pytest.raises(TypeError): + copy(source, dest) + else: + # For v3, dest.name will be inferred from source.name + copy(source, dest) + check_copied_group(source, dest[source.name.lstrip("/")]) + + copy(source, dest, name="root") + check_copied_group(source, dest["root"]) diff --git a/src/zarr/v2/tests/test_core.py b/src/zarr/v2/tests/test_core.py new file mode 100644 index 0000000000..a4e5a5e912 --- /dev/null +++ b/src/zarr/v2/tests/test_core.py @@ -0,0 +1,3301 @@ +import atexit +import sys +import pickle +import shutil + +from typing import Any, Literal, Optional, Tuple, Union, Sequence +import unittest +from itertools import zip_longest +from tempfile import mkdtemp +import numpy as np +import packaging.version +import pytest +from numcodecs import ( + BZ2, + JSON, + LZ4, + Blosc, + Categorize, + Delta, + FixedScaleOffset, + GZip, + MsgPack, + Pickle, + VLenArray, + VLenBytes, + VLenUTF8, + Zlib, +) +from numcodecs.abc import Codec +from numcodecs.compat import ensure_bytes, ensure_ndarray +from numcodecs.tests.common import greetings +from numpy.testing import assert_array_almost_equal, assert_array_equal + +import zarr +from zarr._storage.store import ( + BaseStore, + v3_api_available, +) +from .._storage.v3_storage_transformers import ShardingStorageTransformer, v3_sharding_available +from zarr.core import Array +from zarr.errors import ArrayNotFoundError, ContainsGroupError +from zarr.meta import json_loads +from zarr.n5 import N5Store, N5FSStore, n5_keywords +from zarr.storage import ( + ABSStore, + DBMStore, + DirectoryStore, + FSStore, + KVStore, + LMDBStore, + LRUStoreCache, + NestedDirectoryStore, + SQLiteStore, + atexit_rmglob, + atexit_rmtree, + data_root, + init_array, + init_group, + meta_root, + normalize_store_arg, +) +from zarr._storage.v3 import ( + ABSStoreV3, + DBMStoreV3, + DirectoryStoreV3, + FSStoreV3, + KVStoreV3, + LMDBStoreV3, + LRUStoreCacheV3, + RmdirV3, + SQLiteStoreV3, + StoreV3, +) +from zarr.tests.test_storage_v3 import DummyStorageTransfomer +from zarr.util import buffer_size +from zarr.tests.util import ( + abs_container, + have_fsspec, + have_lmdb, + have_sqlite3, + mktemp, + 
skip_test_env_var, +) +from zarr.types import DIMENSION_SEPARATOR + +# noinspection PyMethodMayBeStatic + +pytestmark = [ + pytest.mark.filterwarnings("ignore:Call to deprecated function .* \_cbuffer\_sizes.*"), + pytest.mark.filterwarnings("ignore:Call to deprecated function .* \_cbuffer\_metainfo.*"), +] + + +class TestArray: + version = 2 + root = "" + path = "" + compressor = Zlib(level=1) + filters: Optional[Sequence[Codec]] = None + dimension_separator: Optional[DIMENSION_SEPARATOR] = None + cache_metadata = True + cache_attrs = True + partial_decompress: bool | None = None + write_empty_chunks = True + read_only = False + storage_transformers: Tuple[Any, ...] = () + + def create_store(self) -> BaseStore: + return KVStore(dict()) + + # used by child classes + def create_chunk_store(self) -> Optional[BaseStore]: + return None + + def create_storage_transformers(self, shape: Union[int, Tuple[int, ...]]) -> Tuple[Any, ...]: + return () + + def create_filters(self, dtype: Optional[str]) -> Tuple[Any, ...]: + return () + + def create_array(self, shape: Union[int, Tuple[int, ...]], **kwargs): + store = self.create_store() + chunk_store = self.create_chunk_store() + # keyword arguments for array initialization + init_array_kwargs = { + "path": kwargs.pop("path", self.path), + "compressor": kwargs.pop("compressor", self.compressor), + "chunk_store": chunk_store, + "storage_transformers": self.create_storage_transformers(shape), + "filters": kwargs.pop("filters", self.create_filters(kwargs.get("dtype"))), + } + + # keyword arguments for array instantiation + access_array_kwargs = { + "path": init_array_kwargs["path"], + "read_only": kwargs.pop("read_only", self.read_only), + "chunk_store": chunk_store, + "cache_metadata": kwargs.pop("cache_metadata", self.cache_metadata), + "cache_attrs": kwargs.pop("cache_attrs", self.cache_attrs), + "partial_decompress": kwargs.pop("partial_decompress", self.partial_decompress), + "write_empty_chunks": kwargs.pop("write_empty_chunks", self.write_empty_chunks), + } + + init_array(store, shape, **{**init_array_kwargs, **kwargs}) + + return Array(store, **access_array_kwargs) + + def test_array_init(self): + # normal initialization + store = self.create_store() + init_array(store, shape=100, chunks=10, dtype=" end + assert [] == list(z.islice(6, 5)) + + z.store.close() + + def test_iter(self): + params = ( + ((1,), (1,)), + ((2,), (1,)), + ((1,), (2,)), + ((3,), (3,)), + ((1000,), (100,)), + ((100,), (1000,)), + ((1, 100), (1, 1)), + ((1, 0), (1, 1)), + ((0, 1), (1, 1)), + ((0, 1), (2, 1)), + ((100, 1), (3, 1)), + ((100, 100), (10, 10)), + ((10, 10, 10), (3, 3, 3)), + ) + for shape, chunks in params: + z = self.create_array(shape=shape, chunks=chunks, dtype=int) + a = np.arange(np.prod(shape)).reshape(shape) + z[:] = a + for expect, actual in zip_longest(a, z): + assert_array_equal(expect, actual) + z.store.close() + + def test_islice(self): + params = ( + ((1,), (1,), 0, 1), + ((2,), (1,), 0, 1), + ((1,), (2,), 0, 1), + ((3,), (3,), 1, 2), + ((1000,), (100,), 150, 1050), + ((100,), (1000,), 25, 75), + ((1, 100), (1, 1), 0, 1), + ((100, 1), (3, 1), 56, 100), + ((100, 100), (10, 10), 13, 99), + ((10, 10, 10), (3, 3, 3), 2, 4), + ) + for shape, chunks, start, end in params: + z = self.create_array(shape=shape, chunks=chunks, dtype=int) + a = np.arange(np.prod(shape)).reshape(shape) + z[:] = a + end_array = min(end, a.shape[0]) + for expect, actual in zip_longest(a[start:end_array], z.islice(start, end)): + assert_array_equal(expect, actual) + if hasattr(z.store, 
"close"): + z.store.close() + + def test_compressors(self): + compressors = [None, BZ2(), Blosc(), LZ4(), Zlib(), GZip()] + if LZMA: + compressors.append(LZMA()) + for compressor in compressors: + a = self.create_array(shape=1000, chunks=100, compressor=compressor) + a[0:100] = 1 + assert np.all(a[0:100] == 1) + a[:] = 1 + assert np.all(a[:] == 1) + a.store.close() + + def test_endian(self): + dtype = np.dtype("float32") + a1 = self.create_array(shape=1000, chunks=100, dtype=dtype.newbyteorder("<")) + a1[:] = 1 + x1 = a1[:] + a2 = self.create_array(shape=1000, chunks=100, dtype=dtype.newbyteorder(">")) + a2[:] = 1 + x2 = a2[:] + assert_array_equal(x1, x2) + a1.store.close() + a2.store.close() + + def test_attributes(self): + a = self.create_array(shape=10, chunks=10, dtype="i8") + a.attrs["foo"] = "bar" + assert a.attrs.key in a.store + attrs = json_loads(a.store[a.attrs.key]) + if self.version > 2: + # in v3, attributes are in a sub-dictionary of the metadata + attrs = attrs["attributes"] + assert "foo" in attrs and attrs["foo"] == "bar" + + a.attrs["bar"] = "foo" + assert a.attrs.key in a.store + attrs = json_loads(a.store[a.attrs.key]) + if self.version > 2: + # in v3, attributes are in a sub-dictionary of the metadata + attrs = attrs["attributes"] + assert "foo" in attrs and attrs["foo"] == "bar" + assert "bar" in attrs and attrs["bar"] == "foo" + a.store.close() + + def test_structured_with_object(self): + a = self.create_array( + fill_value=(0.0, None), + shape=10, + chunks=10, + dtype=[("x", float), ("y", object)], + object_codec=Pickle(), + ) + assert tuple(a[0]) == (0.0, None) + + +class TestArrayWithPath(TestArray): + path = "foo/bar" + compressor = Blosc() + + def test_nchunks_initialized(self): + pass + + def expected(self): + return ( + [ + "f710da18d45d38d4aaf2afd7fb822fdd73d02957", + "1437428e69754b1e1a38bd7fc9e43669577620db", + "6c530b6b9d73e108cc5ee7b6be3d552cc994bdbe", + "4c0a76fb1222498e09dcd92f7f9221d6cea8b40e", + "05b0663ffe1785f38d3a459dec17e57a18f254af", + ], + ) + + def test_nbytes_stored(self): + # MemoryStore as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum( + buffer_size(v) for k, v in z.store.items() if k.startswith("foo/bar/") + ) + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + expect_nbytes_stored = sum( + buffer_size(v) for k, v in z.store.items() if k.startswith("foo/bar/") + ) + assert expect_nbytes_stored == z.nbytes_stored + + # mess with store + z.store[z._key_prefix + "foo"] = list(range(10)) + assert -1 == z.nbytes_stored + + +class TestArrayWithChunkStore(TestArray): + compressor = Blosc() + + def create_chunk_store(self): + return KVStore(dict()) + + def expected(self): + return ( + [ + "f710da18d45d38d4aaf2afd7fb822fdd73d02957", + "1437428e69754b1e1a38bd7fc9e43669577620db", + "6c530b6b9d73e108cc5ee7b6be3d552cc994bdbe", + "4c0a76fb1222498e09dcd92f7f9221d6cea8b40e", + "05b0663ffe1785f38d3a459dec17e57a18f254af", + ], + ) + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored += sum(buffer_size(v) for v in z.chunk_store.values()) + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + expect_nbytes_stored += sum(buffer_size(v) for v in z.chunk_store.values()) + assert expect_nbytes_stored == z.nbytes_stored + + # mess with store + z.chunk_store[z._key_prefix + "foo"] = list(range(10)) + assert -1 == 
z.nbytes_stored + + +class TestArrayWithDirectoryStore(TestArray): + def create_store(self): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = DirectoryStore(path) + return store + + def test_nbytes_stored(self): + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + assert expect_nbytes_stored == z.nbytes_stored + + +def test_array_init_from_dict(): + # initialization via non-Store MutableMapping + store = dict() + init_array(store, shape=100, chunks=10, dtype=" Tuple[Any, ...]: + return ( + Delta(dtype=dtype), + FixedScaleOffset(dtype=dtype, scale=1, offset=0), + ) + + def expected(self): + return ( + # zlib + [ + "b80367c5599d47110d42bd8886240c2f46620dba", + "95a7b2471225e73199c9716d21e8d3dd6e5f6f2a", + "7300f1eb130cff5891630038fd99c28ef23d3a01", + "c649ad229bc5720258b934ea958570c2f354c2eb", + "62fc9236d78af18a5ec26c12eea1d33bce52501e", + ], + # zlib-ng + [ + "b80367c5599d47110d42bd8886240c2f46620dba", + "95a7b2471225e73199c9716d21e8d3dd6e5f6f2a", + "7300f1eb130cff5891630038fd99c28ef23d3a01", + "1e053b6ad7dc58de7b1f5dad7fb45851f6b7b3ee", + "62fc9236d78af18a5ec26c12eea1d33bce52501e", + ], + ) + + def test_astype_no_filters(self): + shape = (100,) + dtype = np.dtype(np.int8) + astype = np.dtype(np.float32) + + store = KVStore(dict()) + init_array(store, shape=shape, chunks=10, dtype=dtype) + + data = np.arange(np.prod(shape), dtype=dtype).reshape(shape) + + z1 = Array(store) + z1[...] = data + z2 = z1.astype(astype) + + expected = data.astype(astype) + assert_array_equal(expected, z2) + assert z2.read_only + + def test_astype(self): + shape = (100,) + chunks = (10,) + + dtype = np.dtype(np.int8) + astype = np.dtype(np.float32) + + data = np.arange(np.prod(shape), dtype=dtype).reshape(shape) + + z1 = self.create_array(shape=shape, chunks=chunks, dtype=dtype) + z1[...] 
= data + z2 = z1.astype(astype) + + expected = data.astype(astype) + assert_array_equal(expected, z2) + + def test_array_dtype_shape(self): + # skip this one, cannot do delta on unstructured array + pass + + def test_structured_array(self): + # skip this one, cannot do delta on structured array + pass + + def test_structured_array_subshapes(self): + # skip this one, cannot do delta on structured array + pass + + def test_structured_array_nested(self): + # skip this one, cannot do delta on structured array + pass + + def test_dtypes(self): + # skip this one, delta messes up floats + pass + + def test_object_arrays(self): + # skip this one, cannot use delta with objects + pass + + def test_object_arrays_vlen_text(self): + # skip this one, cannot use delta with objects + pass + + def test_object_arrays_vlen_bytes(self): + # skip this one, cannot use delta with objects + pass + + def test_object_arrays_vlen_array(self): + # skip this one, cannot use delta with objects + pass + + def test_object_arrays_danger(self): + # skip this one, cannot use delta with objects + pass + + def test_structured_array_contain_object(self): + # skip this one, cannot use delta on structured array + pass + + +# custom store, does not support getsize() +class CustomMapping: + def __init__(self): + self.inner = KVStore(dict()) + + def __iter__(self): + return iter(self.keys()) + + def keys(self): + return self.inner.keys() + + def values(self): + return self.inner.values() + + def get(self, item, default=None): + try: + return self.inner[item] + except KeyError: + return default + + def __getitem__(self, item): + return self.inner[item] + + def __setitem__(self, item, value): + self.inner[item] = ensure_bytes(value) + + def __delitem__(self, key): + del self.inner[key] + + def __contains__(self, item): + return item in self.inner + + def close(self): + return self.inner.close() + + +class TestArrayWithCustomMapping(TestArray): + def create_store(self): + return CustomMapping() + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + assert 245 == z.nbytes_stored + z[:] = 42 + # 515 is zlib, 485 is zlib-ng + assert z.nbytes_stored in (515, 485) + + +class TestArrayNoCache(TestArray): + def test_cache_metadata(self): + a1 = self.create_array(shape=100, chunks=10, dtype="i1", cache_metadata=False) + path = None if self.version == 2 else a1.path + a2 = Array(a1.store, path=path, cache_metadata=True) + assert a1.shape == a2.shape + assert a1.size == a2.size + assert a1.nbytes == a2.nbytes + assert a1.nchunks == a2.nchunks + + # a1 is not caching so *will* see updates made via other objects + a2.resize(200) + assert (200,) == a2.shape + assert 200 == a2.size + assert 200 == a2.nbytes + assert 20 == a2.nchunks + assert a1.shape == a2.shape + assert a1.size == a2.size + assert a1.nbytes == a2.nbytes + assert a1.nchunks == a2.nchunks + + a2.append(np.zeros(100)) + assert (300,) == a2.shape + assert 300 == a2.size + assert 300 == a2.nbytes + assert 30 == a2.nchunks + assert a1.shape == a2.shape + assert a1.size == a2.size + assert a1.nbytes == a2.nbytes + assert a1.nchunks == a2.nchunks + + # a2 is caching so *will not* see updates made via other objects + a1.resize(400) + assert (400,) == a1.shape + assert 400 == a1.size + assert 400 == a1.nbytes + assert 40 == a1.nchunks + assert (300,) == a2.shape + assert 300 == a2.size + assert 300 == a2.nbytes + assert 30 == a2.nchunks + + def test_cache_attrs(self): + a1 = self.create_array(shape=100, chunks=10, dtype="i1", cache_attrs=False) + path = None if 
self.version == 2 else "arr1" + a2 = Array(a1.store, path=path, cache_attrs=True) + assert a1.attrs.asdict() == a2.attrs.asdict() + + # a1 is not caching so *will* see updates made via other objects + a2.attrs["foo"] = "xxx" + a2.attrs["bar"] = 42 + assert a1.attrs.asdict() == a2.attrs.asdict() + + # a2 is caching so *will not* see updates made via other objects + a1.attrs["foo"] = "yyy" + assert "yyy" == a1.attrs["foo"] + assert "xxx" == a2.attrs["foo"] + + def test_object_arrays_danger(self): + # skip this one as it only works if metadata are cached + pass + + +class TestArrayWithStoreCache(TestArray): + def create_store(self): + return LRUStoreCache(dict(), max_size=None) + + def test_store_has_bytes_values(self): + # skip as the cache has no control over how the store provides values + pass + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithFSStore(TestArray): + compressor = Blosc() + dimension_separator: Literal[".", "/"] = "." + + def create_store(self): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = self.dimension_separator + store = FSStore( + path, + key_separator=key_separator, + auto_mkdir=True, + check=True, + create=True, + missing_exceptions=None, + ) + return store + + def expected(self): + return ( + [ + "ab753fc81df0878589535ca9bad2816ba88d91bc", + "c16261446f9436b1e9f962e57ce3e8f6074abe8a", + "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", + "6e52f95ac15b164a8e96843a230fcee0e610729b", + "091fa99bc60706095c9ce30b56ce2503e0223f56", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithFSStoreFromFilesystem(TestArray): + compressor = Blosc() + dimension_separator = "." + + def create_store(self): + from fsspec.implementations.local import LocalFileSystem + + fs = LocalFileSystem(auto_mkdir=True) + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = self.dimension_separator + store = FSStore( + path, + fs=fs, + key_separator=key_separator, + check=True, + create=True, + missing_exceptions=None, + ) + return store + + def expected(self): + return ( + [ + "ab753fc81df0878589535ca9bad2816ba88d91bc", + "c16261446f9436b1e9f962e57ce3e8f6074abe8a", + "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", + "6e52f95ac15b164a8e96843a230fcee0e610729b", + "091fa99bc60706095c9ce30b56ce2503e0223f56", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.filterwarnings( + "ignore:.*Support for partial decompression will be removed in a future version.*" +) +class TestArrayWithFSStorePartialRead(TestArray): + compressor = Blosc(blocksize=256) + partial_decompress = True + + def create_store(self): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = FSStore(path) + return store + + def expected(self): + return ( + [ + "dd7577d645c38767cf6f6d1ef8fd64002883a014", + "aa0de9892cf1ed3cda529efbf3233720b84489b7", + "e6191c44cf958576c29c41cef0f55b028a4dbdff", + "88adeeabb819feecccadf50152293dbb42f9107e", + "1426e084427f9920e29c9ec81b663d1005849455", + ], + ) + + def test_non_cont(self): + z = self.create_array(shape=(500, 500, 500), chunks=(50, 50, 50), dtype=" BaseStore: + path = mkdtemp() + atexit.register(shutil.rmtree, path) + return DirectoryStoreV3(path) + + def test_nbytes_stored(self): + # dict as store + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + 
expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + + +@skip_test_env_var("ZARR_TEST_ABS") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithABSStoreV3(TestArrayV3): + def create_store(self) -> ABSStoreV3: + client = abs_container() + store = ABSStoreV3(client=client) + store.rmdir() + return store + + +# TODO: TestArrayWithN5StoreV3 +# class TestArrayWithN5StoreV3(TestArrayWithDirectoryStoreV3): + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithDBMStoreV3(TestArrayV3): + def create_store(self) -> DBMStoreV3: + path = mktemp(suffix=".anydbm") + atexit.register(atexit_rmglob, path + "*") + store = DBMStoreV3(path, flag="n") + return store + + def test_nbytes_stored(self): + pass # not implemented + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.skipif(have_lmdb is False, reason="needs lmdb") +class TestArrayWithLMDBStoreV3(TestArrayV3): + lmdb_buffers = True + + def create_store(self) -> LMDBStoreV3: + path = mktemp(suffix=".lmdb") + atexit.register(atexit_rmtree, path) + store = LMDBStoreV3(path, buffers=self.lmdb_buffers) + return store + + def test_store_has_bytes_values(self): + pass # returns values as memoryviews/buffers instead of bytes + + def test_nbytes_stored(self): + pass # not implemented + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithLMDBStoreV3NoBuffers(TestArrayWithLMDBStoreV3): + lmdb_buffers = False + + def test_nbytes_stored(self): + pass # not implemented + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.skipif(have_sqlite3 is False, reason="needs sqlite3") +class TestArrayWithSQLiteStoreV3(TestArrayV3): + def create_store(self): + path = mktemp(suffix=".db") + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path) + return store + + def test_nbytes_stored(self): + pass # not implemented + + +# skipped adding V3 equivalents for compressors (no change in v3): +# TestArrayWithNoCompressor +# TestArrayWithBZ2Compressor +# TestArrayWithBloscCompressor +# TestArrayWithLZMACompressor + +# skipped test with filters (v3 protocol removed filters) +# TestArrayWithFilters + + +# custom store, does not support getsize() +# Note: this custom mapping doesn't actually have all methods in the +# v3 spec (e.g. erase), but they aren't needed here. 
+ + +class CustomMappingV3(RmdirV3, StoreV3): + def __init__(self): + self.inner = KVStoreV3(dict()) + + def __iter__(self): + return iter(self.keys()) + + def __len__(self): + return len(self.inner) + + def keys(self): + return self.inner.keys() + + def values(self): + return self.inner.values() + + def get(self, item, default=None): + try: + return self.inner[item] + except KeyError: + return default + + def __getitem__(self, item): + return self.inner[item] + + def __setitem__(self, item, value): + self.inner[item] = ensure_bytes(value) + + def __delitem__(self, key): + del self.inner[key] + + def __contains__(self, item): + return item in self.inner + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithCustomMappingV3(TestArrayV3): + def create_store(self): + store = CustomMappingV3() + return store + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + + def test_len(self): + # dict as store + z = self.create_array(shape=1000, chunks=100) + assert len(z._store) == 2 + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayNoCacheV3(TestArrayWithPathV3): + def create_store(self): + store = KVStoreV3(dict()) + return store + + def test_object_arrays_danger(self): + # skip this one as it only works if metadata are cached + pass + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithStoreCacheV3(TestArrayV3): + def create_store(self): + store = LRUStoreCacheV3(dict(), max_size=None) + return store + + def test_store_has_bytes_values(self): + # skip as the cache has no control over how the store provides values + pass + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithFSStoreV3(TestArrayV3): + compressor = Blosc() + + def create_store(self): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = self.dimension_separator + store = FSStoreV3( + path, + key_separator=key_separator, + auto_mkdir=True, + create=True, + check=True, + missing_exceptions=None, + ) + return store + + def expected(self): + return ( + [ + "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", + "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", + "b663857bb89a8ab648390454954a9cdd453aa24b", + "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", + "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithFSStoreV3FromFilesystem(TestArrayWithFSStoreV3): + def create_store(self): + from fsspec.implementations.local import LocalFileSystem + + fs = LocalFileSystem(auto_mkdir=True) + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = self.dimension_separator + store = FSStoreV3( + path, + fs=fs, + key_separator=key_separator, + create=True, + check=True, + missing_exceptions=None, + ) + return store + + def expected(self): + return ( + [ + "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", + "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", + "b663857bb89a8ab648390454954a9cdd453aa24b", + "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", + 
"e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.filterwarnings( + "ignore:.*Support for partial decompression will be removed in a future version.*" +) +class TestArrayWithFSStoreV3PartialRead(TestArrayWithFSStoreV3): + partial_decompress = True + + def expected(self): + return ( + [ + "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", + "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", + "b663857bb89a8ab648390454954a9cdd453aa24b", + "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", + "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") +@pytest.mark.filterwarnings( + "ignore:.*Support for partial decompression will be removed in a future version.*" +) +class TestArrayWithFSStoreV3PartialReadUncompressedSharded(TestArrayWithFSStoreV3): + partial_decompress = True + compressor = None + + def create_storage_transformers(self, shape) -> Tuple[Any]: + num_dims = 1 if isinstance(shape, int) else len(shape) + sharding_transformer = ShardingStorageTransformer( + "indexed", chunks_per_shard=(2,) * num_dims + ) + return (sharding_transformer,) + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + + def test_supports_efficient_get_set_partial_values(self): + z = self.create_array(shape=100, chunks=10) + assert z.chunk_store.supports_efficient_get_partial_values + assert not z.chunk_store.supports_efficient_set_partial_values() + + def expected(self): + return ( + [ + "90109fc2a4e17efbcb447003ea1c08828b91f71e", + "2b73519f7260dba3ddce0d2b70041888856fec6b", + "bca5798be2ed71d444f3045b05432d937682b7dd", + "9ff1084501e28520e577662a6e3073f1116c76a2", + "882a97cad42417f90f111d0cb916a21579650467", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithFSStoreV3Nested(TestArrayWithFSStoreV3): + dimension_separator = "/" + + def expected(self): + return ( + [ + "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", + "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", + "b663857bb89a8ab648390454954a9cdd453aa24b", + "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", + "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", + ], + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithFSStoreV3NestedPartialRead(TestArrayWithFSStoreV3): + dimension_separator = "/" + + def expected(self): + return ( + [ + "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", + "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", + "b663857bb89a8ab648390454954a9cdd453aa24b", + "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", + "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", + ], + ) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithStorageTransformersV3(TestArrayWithChunkStoreV3): + def create_storage_transformers(self, shape) -> Tuple[Any]: + return ( + 
DummyStorageTransfomer("dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT), + ) + + def expected(self): + return ( + [ + "3fb9a4f8233b09ad02067b6b7fc9fd5caa405c7d", + "89c8eb364beb84919fc9153d2c1ed2696274ec18", + "73307055c3aec095dd1232c38d793ef82a06bd97", + "6152c09255a5efa43b1a115546e35affa00c138c", + "2f8802fc391f67f713302e84fad4fd8f1366d6c2", + ], + ) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") +class TestArrayWithShardingStorageTransformerV3(TestArrayV3): + compressor = None + + def create_storage_transformers(self, shape) -> Tuple[Any]: + num_dims = 1 if isinstance(shape, int) else len(shape) + return (ShardingStorageTransformer("indexed", chunks_per_shard=(2,) * num_dims),) + + def test_nbytes_stored(self): + z = self.create_array(shape=1000, chunks=100) + expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + z[:] = 42 + expect_nbytes_stored = sum(buffer_size(v) for k, v in z._store.items() if k != "zarr.json") + assert expect_nbytes_stored == z.nbytes_stored + + # mess with store + z.store[data_root + z._key_prefix + "foo"] = list(range(10)) + assert -1 == z.nbytes_stored + + def test_keys_inner_store(self): + z = self.create_array(shape=1000, chunks=100) + assert z.chunk_store.keys() == z._store.keys() + meta_keys = set(z.store.keys()) + z[:] = 42 + assert len(z.chunk_store.keys() - meta_keys) == 10 + # inner store should have half the data keys, + # since chunks_per_shard is 2: + assert len(z._store.keys() - meta_keys) == 5 + + def test_supports_efficient_get_set_partial_values(self): + z = self.create_array(shape=100, chunks=10) + assert not z.chunk_store.supports_efficient_get_partial_values + assert not z.chunk_store.supports_efficient_set_partial_values() + + def expected(self): + return ( + [ + "90109fc2a4e17efbcb447003ea1c08828b91f71e", + "2b73519f7260dba3ddce0d2b70041888856fec6b", + "bca5798be2ed71d444f3045b05432d937682b7dd", + "9ff1084501e28520e577662a6e3073f1116c76a2", + "882a97cad42417f90f111d0cb916a21579650467", + ], + ) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +def test_array_mismatched_store_versions(): + store_v3 = KVStoreV3(dict()) + store_v2 = KVStore(dict()) + + # separate chunk store + chunk_store_v2 = KVStore(dict()) + chunk_store_v3 = KVStoreV3(dict()) + + init_kwargs = dict(shape=100, chunks=10, dtype="""" + + data = np.arange(25).reshape((5, 5)) + ds = zarr.create( + shape=data.shape, + chunks=(5, 5), + dtype=data.dtype, + compressor=(None), + store=FSStore(url=str(tmpdir), mode="a"), + order="F", + ) + + ds[:] = data + + ds_reopened = zarr.open_array(store=FSStore(url=str(tmpdir), mode="r")) + + written_data = ds_reopened[:] + assert_array_equal(data, written_data) + + +def test_scalar_indexing(): + store = zarr.KVStore({}) + + store["a"] = zarr.create((3,), chunks=(1,), store=store) + store["a"][:] = [1, 2, 3] + + assert store["a"][1] == np.array(2.0) + assert store["a"][(1,)] == np.array(2.0) + + store["a"][slice(1)] = [-1] + assert store["a"][0] == np.array(-1) + + store["a"][0] = -2 + assert store["a"][0] == np.array(-2) + + store["a"][slice(1)] = (-3,) + assert store["a"][0] == np.array(-3) + + +def test_object_array_indexing(): + # regression test for #1874 + from numcodecs import MsgPack + + root = zarr.group() + arr = root.create_dataset( + name="my_dataset", + shape=0, + dtype=object, + object_codec=MsgPack(), + ) 
+ new_items = [ + ["A", 1], + ["B", 2, "hello"], + ] + arr_add = np.empty(len(new_items), dtype=object) + arr_add[:] = new_items + arr.append(arr_add) + + # heterogeneous elements + elem = ["C", 3] + arr[0] = elem + assert arr[0] == elem + + # homogeneous elements + elem = [1, 3] + arr[1] = elem + assert arr[1] == elem + + +@pytest.mark.parametrize("shape", ((1, 1, 1), (5, 5, 1), (1, 5, 5))) +def test_scalar_orthogonal_indexing(shape): + # regression test for https://github.com/zarr-developers/zarr-python/issues/1931 + store = zarr.MemoryStore({}) + data = np.random.randint(0, 255, shape) + arr = zarr.zeros( + shape=shape, chunks=shape[:-1] + (1,), compressor=None, store=store, dtype="u1" + ) + arr[:, :, :] = data + store.close() + + zf = zarr.open(store, "r") + assert_array_equal(zf[0, :, :], data[0, :, :]) + assert_array_equal(zf[:, 0, :], data[:, 0, :]) + assert_array_equal(zf[:, :, 0], data[:, :, 0]) diff --git a/src/zarr/v2/tests/test_creation.py b/src/zarr/v2/tests/test_creation.py new file mode 100644 index 0000000000..3778141356 --- /dev/null +++ b/src/zarr/v2/tests/test_creation.py @@ -0,0 +1,781 @@ +import atexit +import os.path +import shutil +import warnings +import numbers + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from zarr._storage.store import DEFAULT_ZARR_VERSION +from zarr.codecs import Zlib +from zarr.core import Array +from zarr.creation import ( + array, + create, + empty, + empty_like, + full, + full_like, + ones, + ones_like, + open_array, + open_like, + zeros, + zeros_like, +) +from zarr.hierarchy import open_group +from zarr.n5 import N5Store +from zarr.storage import DirectoryStore, KVStore +from zarr._storage.store import v3_api_available +from zarr._storage.v3 import DirectoryStoreV3, KVStoreV3 +from zarr.sync import ThreadSynchronizer +from zarr.tests.test_storage_v3 import DummyStorageTransfomer +from zarr.tests.util import mktemp, have_fsspec + + +_VERSIONS = (None, 2, 3) if v3_api_available else (None, 2) +_VERSIONS2 = (2, 3) if v3_api_available else (2,) + + +# something bcolz-like +class MockBcolzArray: + def __init__(self, data, chunklen): + self.data = data + self.chunklen = chunklen + + def __getattr__(self, item): + return getattr(self.data, item) + + def __getitem__(self, item): + return self.data[item] + + +# something h5py-like +class MockH5pyDataset: + def __init__(self, data, chunks): + self.data = data + self.chunks = chunks + + def __getattr__(self, item): + return getattr(self.data, item) + + def __getitem__(self, item): + return self.data[item] + + +def _init_creation_kwargs(zarr_version, at_root=True): + kwargs = {"zarr_version": zarr_version} + if not at_root: + kwargs["path"] = "array" + return kwargs + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_array(zarr_version, at_root): + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + kwargs = _init_creation_kwargs(zarr_version, at_root) + + # with numpy array + a = np.arange(100) + z = array(a, chunks=10, **kwargs) + assert a.shape == z.shape + assert a.dtype == z.dtype + assert z._store._store_version == expected_zarr_version + assert_array_equal(a, z[:]) + + # with array-like + a = list(range(100)) + z = array(a, chunks=10, **kwargs) + assert (100,) == z.shape + assert np.asarray(a).dtype == z.dtype + assert_array_equal(np.asarray(a), z[:]) + + # with another zarr array + z2 = array(z, **kwargs) + assert z.shape == z2.shape + assert z.chunks 
== z2.chunks + assert z.dtype == z2.dtype + assert_array_equal(z[:], z2[:]) + + # with chunky array-likes + + b = np.arange(1000).reshape(100, 10) + c = MockBcolzArray(b, 10) + z3 = array(c, **kwargs) + assert c.shape == z3.shape + assert (10, 10) == z3.chunks + + b = np.arange(1000).reshape(100, 10) + c = MockH5pyDataset(b, chunks=(10, 2)) + z4 = array(c, **kwargs) + assert c.shape == z4.shape + assert (10, 2) == z4.chunks + + c = MockH5pyDataset(b, chunks=None) + z5 = array(c, **kwargs) + assert c.shape == z5.shape + assert isinstance(z5.chunks, tuple) + + # with dtype=None + a = np.arange(100, dtype="i4") + z = array(a, dtype=None, **kwargs) + assert_array_equal(a[:], z[:]) + assert a.dtype == z.dtype + + # with dtype=something else + a = np.arange(100, dtype="i4") + z = array(a, dtype="i8", **kwargs) + assert_array_equal(a[:], z[:]) + assert np.dtype("i8") == z.dtype + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_empty(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + z = empty(100, chunks=10, **kwargs) + assert (100,) == z.shape + assert (10,) == z.chunks + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_zeros(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + z = zeros(100, chunks=10, **kwargs) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.zeros(100), z[:]) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_ones(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + z = ones(100, chunks=10, **kwargs) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.ones(100), z[:]) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_full(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + z = full(100, chunks=10, fill_value=42, dtype="i4", **kwargs) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42, dtype="i4"), z[:]) + + # nan + z = full(100, chunks=10, fill_value=np.nan, dtype="f8", **kwargs) + assert np.all(np.isnan(z[:])) + + +@pytest.mark.parametrize("zarr_version", [None, 2]) # TODO +def test_full_additional_dtypes(zarr_version): + """Test additional types that aren't part of the base v3 spec.""" + kwargs = _init_creation_kwargs(zarr_version) + # NaT + z = full(100, chunks=10, fill_value="NaT", dtype="M8[s]", **kwargs) + assert np.all(np.isnat(z[:])) + z = full(100, chunks=10, fill_value="NaT", dtype="m8[s]", **kwargs) + assert np.all(np.isnat(z[:])) + + # byte string dtype + v = b"xxx" + z = full(100, chunks=10, fill_value=v, dtype="S3", **kwargs) + assert v == z[0] + a = z[...] + assert z.dtype == a.dtype + assert v == a[0] + assert np.all(a == v) + + # unicode string dtype + v = "xxx" + z = full(100, chunks=10, fill_value=v, dtype="U3", **kwargs) + assert v == z[0] + a = z[...] 
+ assert z.dtype == a.dtype + assert v == a[0] + assert np.all(a == v) + + # bytes fill value / unicode dtype + v = b"xxx" + with pytest.raises(ValueError): + full(100, chunks=10, fill_value=v, dtype="U3") + + +@pytest.mark.parametrize("dimension_separator", [".", "/", None]) +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_open_array(zarr_version, at_root, dimension_separator): + store = "data/array.zarr" + kwargs = _init_creation_kwargs(zarr_version, at_root) + + # mode == 'w' + z = open_array( + store, mode="w", shape=100, chunks=10, dimension_separator=dimension_separator, **kwargs + ) + z[:] = 42 + assert isinstance(z, Array) + if z._store._store_version == 2: + assert isinstance(z.store, DirectoryStore) + else: + assert isinstance(z.store, DirectoryStoreV3) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + + if dimension_separator is None: + assert z._dimension_separator == ("/" if zarr_version == 3 else ".") + else: + assert z._dimension_separator == dimension_separator + + # mode in 'r', 'r+' + group_kwargs = kwargs.copy() + if zarr_version == 3: + group_kwargs["path"] = "group" + open_group("data/group.zarr", mode="w", **group_kwargs) + for mode in "r", "r+": + with pytest.raises(ValueError): + open_array("doesnotexist", mode=mode) + with pytest.raises(ValueError): + open_array("data/group.zarr", mode=mode) + z = open_array(store, mode="r", **kwargs) + assert isinstance(z, Array) + if z._store._store_version == 2: + assert isinstance(z.store, DirectoryStore) + else: + assert isinstance(z.store, DirectoryStoreV3) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + with pytest.raises(PermissionError): + z[:] = 43 + z = open_array(store, mode="r+", **kwargs) + assert isinstance(z, Array) + if z._store._store_version == 2: + assert isinstance(z.store, DirectoryStore) + else: + assert isinstance(z.store, DirectoryStoreV3) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + z[:] = 43 + assert_array_equal(np.full(100, fill_value=43), z[:]) + + # mode == 'a' + shutil.rmtree(store) + z = open_array(store, mode="a", shape=100, chunks=10, **kwargs) + z[:] = 42 + assert isinstance(z, Array) + if z._store._store_version == 2: + assert isinstance(z.store, DirectoryStore) + else: + assert isinstance(z.store, DirectoryStoreV3) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + + expected_error = TypeError if zarr_version == 3 else ValueError + # v3 path does not conflict, but will raise TypeError without shape kwarg + with pytest.raises(expected_error): + # array would end up at data/group.zarr/meta/root/array.array.json + open_array("data/group.zarr", mode="a", **kwargs) + + # mode in 'w-', 'x' + for mode in "w-", "x": + shutil.rmtree(store) + z = open_array(store, mode=mode, shape=100, chunks=10, **kwargs) + z[:] = 42 + assert isinstance(z, Array) + if z._store._store_version == 2: + assert isinstance(z.store, DirectoryStore) + else: + assert isinstance(z.store, DirectoryStoreV3) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + with pytest.raises(ValueError): + open_array(store, mode=mode, **kwargs) + expected_error = TypeError if zarr_version == 3 else ValueError + # v3 path does not conflict, but will raise TypeError without shape kwarg
+ with pytest.raises(expected_error): + open_array("data/group.zarr", mode=mode, **kwargs) + + # with synchronizer + z = open_array(store, synchronizer=ThreadSynchronizer(), **kwargs) + assert isinstance(z, Array) + + # with path + kwargs_no_path = kwargs.copy() + kwargs_no_path.pop("path", None) + z = open_array(store, shape=100, path="foo/bar", mode="w", **kwargs_no_path) + assert isinstance(z, Array) + assert "foo/bar" == z.path + + # with chunk store + meta_store = "data/meta.zarr" + chunk_store = "data/chunks.zarr" + z = open_array(store=meta_store, chunk_store=chunk_store, shape=11, mode="w", **kwargs) + z[:] = 42 + assert os.path.abspath(meta_store) == z.store.path + assert os.path.abspath(chunk_store) == z.chunk_store.path + + +def test_open_array_none(): + # open with both store and zarr_version = None + z = open_array(mode="w", shape=100, chunks=10) + assert isinstance(z, Array) + assert z._version == 2 + + +@pytest.mark.parametrize("dimension_separator", [".", "/", None]) +@pytest.mark.parametrize("zarr_version", _VERSIONS2) +def test_open_array_infer_separator_from_store(zarr_version, dimension_separator): + if zarr_version == 3: + StoreClass = DirectoryStoreV3 + path = "data" + else: + StoreClass = DirectoryStore + path = None + store = StoreClass("data/array.zarr", dimension_separator=dimension_separator) + + # Note: no dimension_separator kwarg to open_array + # we are testing here that it gets inferred from store + z = open_array(store, path=path, mode="w", shape=100, chunks=10) + z[:] = 42 + assert isinstance(z, Array) + if z._store._store_version == 2: + assert isinstance(z.store, DirectoryStore) + else: + assert isinstance(z.store, DirectoryStoreV3) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + + if dimension_separator is None: + assert z._dimension_separator == ("/" if zarr_version == 3 else ".")
+ else: + assert z._dimension_separator == dimension_separator + + +# TODO: N5 support for v3 +@pytest.mark.parametrize("zarr_version", [None, 2]) +def test_open_array_n5(zarr_version): + store = "data/array.zarr" + kwargs = _init_creation_kwargs(zarr_version) + + # for N5 store + store = "data/array.n5" + z = open_array(store, mode="w", shape=100, chunks=10, **kwargs) + z[:] = 42 + assert isinstance(z, Array) + assert isinstance(z.store, N5Store) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + + store = "data/group.n5" + group_kwargs = kwargs.copy() + # if zarr_version == 3: + # group_kwargs['path'] = 'group' + z = open_group(store, mode="w", **group_kwargs) + i = z.create_group("inner") + a = i.zeros("array", shape=100, chunks=10) + a[:] = 42 + + # Edit inner/attributes.json to not include "n5" + with open("data/group.n5/inner/attributes.json", "w") as o: + o.write("{}") + + # Re-open + a = open_group(store, **group_kwargs)["inner"]["array"] + assert isinstance(a, Array) + assert isinstance(z.store, N5Store) + assert (100,) == a.shape + assert (10,) == a.chunks + assert_array_equal(np.full(100, fill_value=42), a[:]) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_open_array_dict_store(zarr_version, at_root): + # dict will become a KVStore + store = dict() + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_store_type = KVStoreV3 if zarr_version == 3 else KVStore + + # mode == 'w' + z = open_array(store, mode="w", shape=100, chunks=10, **kwargs) + z[:] = 42 + assert isinstance(z, Array) + assert isinstance(z.store, expected_store_type) + assert (100,) == z.shape + assert (10,) == z.chunks + assert_array_equal(np.full(100, fill_value=42), z[:]) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_create_in_dict(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_store_type = KVStoreV3 if zarr_version == 3 else KVStore + + for func in [empty, zeros, ones]: + a = func(100, store=dict(), **kwargs) + assert isinstance(a.store, expected_store_type) + + a = full(100, 5, store=dict(), **kwargs) + assert isinstance(a.store, expected_store_type) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_create_writeable_mode(zarr_version, at_root, tmp_path): + # Regression test for https://github.com/zarr-developers/zarr-python/issues/1306 + import fsspec + + kwargs = _init_creation_kwargs(zarr_version, at_root) + store = fsspec.get_mapper(str(tmp_path)) + z = create(100, store=store, **kwargs) + assert z.store.map == store + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_empty_like(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + + # zarr array + z = empty(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) + # zarr_version will be inferred from z, but have to specify a path in v3 + z2 = empty_like(z, path=kwargs.get("path")) + assert z.shape == z2.shape + assert z.chunks == z2.chunks + assert z.dtype == z2.dtype + assert z.compressor.get_config() == z2.compressor.get_config() + assert z.fill_value == z2.fill_value + 
assert z.order == z2.order + assert z._store._store_version == z2._store._store_version == expected_zarr_version + + # numpy array + a = np.empty(100, dtype="f4") + z3 = empty_like(a, **kwargs) + assert a.shape == z3.shape + assert (100,) == z3.chunks + assert a.dtype == z3.dtype + assert z3.fill_value is None + assert z3._store._store_version == expected_zarr_version + + # something slightly silly + a = [0] * 100 + z3 = empty_like(a, shape=200, **kwargs) + assert (200,) == z3.shape + + # other array-likes + b = np.arange(1000).reshape(100, 10) + c = MockBcolzArray(b, 10) + z = empty_like(c, **kwargs) + assert b.shape == z.shape + assert (10, 10) == z.chunks + c = MockH5pyDataset(b, chunks=(10, 2)) + z = empty_like(c, **kwargs) + assert b.shape == z.shape + assert (10, 2) == z.chunks + c = MockH5pyDataset(b, chunks=None) + z = empty_like(c, **kwargs) + assert b.shape == z.shape + assert isinstance(z.chunks, tuple) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_zeros_like(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + + # zarr array + z = zeros(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) + z2 = zeros_like(z, path=kwargs.get("path")) + assert z.shape == z2.shape + assert z.chunks == z2.chunks + assert z.dtype == z2.dtype + assert z.compressor.get_config() == z2.compressor.get_config() + assert z.fill_value == z2.fill_value + assert z.order == z2.order + assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array + a = np.empty(100, dtype="f4") + z3 = zeros_like(a, chunks=10, **kwargs) + assert a.shape == z3.shape + assert (10,) == z3.chunks + assert a.dtype == z3.dtype + assert 0 == z3.fill_value + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_ones_like(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + + # zarr array + z = ones(100, chunks=10, dtype="f4", compressor=Zlib(5), order="F", **kwargs) + z2 = ones_like(z, path=kwargs.get("path")) + assert z.shape == z2.shape + assert z.chunks == z2.chunks + assert z.dtype == z2.dtype + assert z.compressor.get_config() == z2.compressor.get_config() + assert z.fill_value == z2.fill_value + assert z.order == z2.order + assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array + a = np.empty(100, dtype="f4") + z3 = ones_like(a, chunks=10, **kwargs) + assert a.shape == z3.shape + assert (10,) == z3.chunks + assert a.dtype == z3.dtype + assert 1 == z3.fill_value + assert z3._store._store_version == expected_zarr_version + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_full_like(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + + z = full(100, chunks=10, dtype="f4", compressor=Zlib(5), fill_value=42, order="F", **kwargs) + z2 = full_like(z, path=kwargs.get("path")) + assert z.shape == z2.shape + assert z.chunks == z2.chunks + assert z.dtype == z2.dtype + assert z.compressor.get_config() == z2.compressor.get_config() + assert z.fill_value == z2.fill_value + assert z.order == z2.order 
+ assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array + a = np.empty(100, dtype="f4") + z3 = full_like(a, chunks=10, fill_value=42, **kwargs) + assert a.shape == z3.shape + assert (10,) == z3.chunks + assert a.dtype == z3.dtype + assert 42 == z3.fill_value + assert z3._store._store_version == expected_zarr_version + with pytest.raises(TypeError): + # fill_value missing + full_like(a, chunks=10, **kwargs) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_open_like(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + + # zarr array + path = mktemp() + atexit.register(shutil.rmtree, path) + z = full(100, chunks=10, dtype="f4", compressor=Zlib(5), fill_value=42, order="F", **kwargs) + z2 = open_like(z, path) + assert z.shape == z2.shape + assert z.chunks == z2.chunks + assert z.dtype == z2.dtype + assert z.compressor.get_config() == z2.compressor.get_config() + assert z.fill_value == z2.fill_value + assert z.order == z2.order + assert z._store._store_version == z2._store._store_version == expected_zarr_version + # numpy array + path = mktemp() + atexit.register(shutil.rmtree, path) + a = np.empty(100, dtype="f4") + z3 = open_like(a, path, chunks=10, zarr_version=zarr_version) + assert a.shape == z3.shape + assert (10,) == z3.chunks + assert a.dtype == z3.dtype + assert 0 == z3.fill_value + assert z3._store._store_version == expected_zarr_version + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_create(zarr_version, at_root): + kwargs = _init_creation_kwargs(zarr_version, at_root) + expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version + + # defaults + z = create(100, **kwargs) + assert isinstance(z, Array) + assert (100,) == z.shape + assert (100,) == z.chunks # auto-chunks + assert np.dtype(None) == z.dtype + assert "blosc" == z.compressor.codec_id + assert 0 == z.fill_value + assert z._store._store_version == expected_zarr_version + + # all specified + z = create(100, chunks=10, dtype="i4", compressor=Zlib(1), fill_value=42, order="F", **kwargs) + assert isinstance(z, Array) + assert (100,) == z.shape + assert (10,) == z.chunks + assert np.dtype("i4") == z.dtype + assert "zlib" == z.compressor.codec_id + assert 1 == z.compressor.level + assert 42 == z.fill_value + assert "F" == z.order + assert z._store._store_version == expected_zarr_version + + # with synchronizer + synchronizer = ThreadSynchronizer() + z = create(100, chunks=10, synchronizer=synchronizer, **kwargs) + assert isinstance(z, Array) + assert (100,) == z.shape + assert (10,) == z.chunks + assert synchronizer is z.synchronizer + assert z._store._store_version == expected_zarr_version + + # don't allow string as compressor arg + with pytest.raises(ValueError): + create(100, chunks=10, compressor="zlib", **kwargs) + + # h5py compatibility + + z = create(100, compression="zlib", compression_opts=9, **kwargs) + assert "zlib" == z.compressor.codec_id + assert 9 == z.compressor.level + + z = create(100, compression="default", **kwargs) + assert "blosc" == z.compressor.codec_id + + # errors + with pytest.raises(ValueError): + # bad compression argument + create(100, compression=1, **kwargs) + with pytest.raises(ValueError): + # bad fill value + create(100, dtype="i4", fill_value="foo", **kwargs) + + # 
auto chunks + z = create(1000000000, chunks=True, **kwargs) + assert z.chunks[0] < z.shape[0] + z = create(1000000000, chunks=None, **kwargs) # backwards-compatibility + assert z.chunks[0] < z.shape[0] + # no chunks + z = create(1000000000, chunks=False, **kwargs) + assert z.chunks == z.shape + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_compression_args(zarr_version): + kwargs = _init_creation_kwargs(zarr_version) + + with warnings.catch_warnings(): + warnings.simplefilter("default") + z = create(100, compression="zlib", compression_opts=9, **kwargs) + assert isinstance(z, Array) + assert "zlib" == z.compressor.codec_id + assert 9 == z.compressor.level + + # 'compressor' overrides 'compression' + with pytest.warns(UserWarning): + z = create(100, compressor=Zlib(9), compression="bz2", compression_opts=1, **kwargs) + assert isinstance(z, Array) + assert "zlib" == z.compressor.codec_id + assert 9 == z.compressor.level + + # 'compressor' ignores 'compression_opts' + with pytest.warns(UserWarning): + z = create(100, compressor=Zlib(9), compression_opts=1, **kwargs) + assert isinstance(z, Array) + assert "zlib" == z.compressor.codec_id + assert 9 == z.compressor.level + + with pytest.warns(UserWarning): + # 'compressor' overrides 'compression' + create(100, compressor=Zlib(9), compression="bz2", compression_opts=1, **kwargs) + with pytest.warns(UserWarning): + # 'compressor' ignores 'compression_opts' + create(100, compressor=Zlib(9), compression_opts=1, **kwargs) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_create_read_only(zarr_version, at_root): + # https://github.com/alimanfoo/zarr/issues/151 + + kwargs = _init_creation_kwargs(zarr_version, at_root) + + # create an array initially read-only, then enable writing + z = create(100, read_only=True, **kwargs) + assert z.read_only + with pytest.raises(PermissionError): + z[:] = 42 + z.read_only = False + z[:] = 42 + assert np.all(z[...] == 42) + z.read_only = True + with pytest.raises(PermissionError): + z[:] = 0 + + # this is subtly different, but here we want to create an array with data, and then + # have it be read-only + a = np.arange(100) + z = array(a, read_only=True, **kwargs) + assert_array_equal(a, z[...]) + assert z.read_only + with pytest.raises(PermissionError): + z[:] = 42 + + +def test_json_dumps_chunks_numpy_dtype(): + z = zeros((10,), chunks=(np.int64(2),)) + assert np.all(z[...] 
== 0) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.parametrize("at_root", [False, True]) +def test_create_with_storage_transformers(at_root): + kwargs = _init_creation_kwargs(zarr_version=3, at_root=at_root) + transformer = DummyStorageTransfomer( + "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT + ) + z = create(1000000000, chunks=True, storage_transformers=[transformer], **kwargs) + assert isinstance(z.chunk_store, DummyStorageTransfomer) + assert z.chunk_store.test_value == DummyStorageTransfomer.TEST_CONSTANT + + +@pytest.mark.parametrize( + ("init_shape", "init_chunks", "shape", "chunks"), + ( + ((1,), (1,), (1,), (1,)), + ((1.0,), (1.0,), (1,), (1,)), + ((1.0,), False, (1,), (1,)), + ((1.0,), True, (1,), (1,)), + ((1.0,), None, (1,), (1,)), + ), +) +def test_shape_chunk_ints(init_shape, init_chunks, shape, chunks): + g = open_group() + if not isinstance(init_shape[0], numbers.Integral) or not isinstance( + init_chunks[0], numbers.Integral + ): + with pytest.warns(UserWarning): + array = g.create_dataset("ds", shape=init_shape, chunks=init_chunks, dtype=np.uint8) + else: + array = g.create_dataset("ds", shape=init_shape, chunks=init_chunks, dtype=np.uint8) + + assert all( + isinstance(s, int) for s in array.shape + ), f"Expected shape to be all ints but found {array.shape=}." + assert all( + isinstance(c, int) for c in array.chunks + ), f"Expected chunks to be all ints but found {array.chunks=}." + assert array.shape == shape, f"Expected {shape=} but found {array.shape=}." + assert array.chunks == chunks, f"Expected {chunks=} but found {array.chunks=}." diff --git a/src/zarr/v2/tests/test_dim_separator.py b/src/zarr/v2/tests/test_dim_separator.py new file mode 100644 index 0000000000..0a5814e65f --- /dev/null +++ b/src/zarr/v2/tests/test_dim_separator.py @@ -0,0 +1,135 @@ +import pathlib + +import pytest +from numpy.testing import assert_array_equal +from functools import partial + +import zarr +from zarr.core import Array +from zarr.storage import DirectoryStore, NestedDirectoryStore, FSStore +from zarr.tests.util import have_fsspec + + +needs_fsspec = pytest.mark.skipif(not have_fsspec, reason="needs fsspec") + + +@pytest.fixture( + params=( + "static_flat", + "static_flat_legacy", + "static_nested", + "static_nested_legacy", + "directory_nested", + "directory_flat", + "directory_default", + "nesteddirectory_nested", + "nesteddirectory_default", + pytest.param("fs_nested", marks=needs_fsspec), + pytest.param("fs_flat", marks=needs_fsspec), + pytest.param("fs_default", marks=needs_fsspec), + ) +) +def dataset(tmpdir, request): + """ + Generate a variety of different Zarrs using + different store implementations as well as + different dimension_separator arguments. 
+ """ + + loc = tmpdir.join("dim_sep_test.zarr") + which = request.param + kwargs = {} + + if which.startswith("static"): + project_root = pathlib.Path(zarr.__file__).resolve().parent.parent + suffix = which[len("static_") :] + static = project_root / "fixture" / suffix + + if not static.exists(): # pragma: no cover + if "nested" in which: + # No way to reproduce the nested_legacy file via code + generator = NestedDirectoryStore + else: + if "legacy" in suffix: + # No dimension_separator metadata included + generator = DirectoryStore + else: + # Explicit dimension_separator metadata included + generator = partial(DirectoryStore, dimension_separator=".") + + # store the data - should be one-time operation + s = generator(str(static)) + a = zarr.open(store=s, mode="w", shape=(2, 2), dtype=" 2 and g1.store.is_erasable(): + arr_path = g1.path + "/arr1" + sfx = _get_metadata_suffix(g1.store) + array_meta_file = meta_root + arr_path + ".array" + sfx + assert array_meta_file in g1.store + group_meta_file = meta_root + g2.path + ".group" + sfx + assert group_meta_file in g1.store + + # rmdir on the array path should also remove the metadata file + g1.store.rmdir(arr_path) + assert array_meta_file not in g1.store + # rmdir on the group path should also remove its metadata file + g1.store.rmdir(g2.path) + assert group_meta_file not in g1.store + + def _dataset_path(self, group, path): + path = path.rstrip("/") + absolute = path.startswith("/") + if absolute: + dataset_path = path + else: + dataset_path = "/".join([group.path, path]) + dataset_path = dataset_path.lstrip("/") + dataset_name = "/" + dataset_path + return dataset_path, dataset_name + + def test_create_dataset(self): + g = self.create_group() + + # create as immediate child + dpath = "foo" + d1 = g.create_dataset(dpath, shape=1000, chunks=100) + path, name = self._dataset_path(g, dpath) + assert isinstance(d1, Array) + assert (1000,) == d1.shape + assert (100,) == d1.chunks + assert path == d1.path + assert name == d1.name + assert g.store is d1.store + + # create as descendant + dpath = "/a/b/c/" + d2 = g.create_dataset( + dpath, + shape=2000, + chunks=200, + dtype="i1", + compression="zlib", + compression_opts=9, + fill_value=42, + order="F", + ) + path, name = self._dataset_path(g, dpath) + assert isinstance(d2, Array) + assert (2000,) == d2.shape + assert (200,) == d2.chunks + assert np.dtype("i1") == d2.dtype + assert "zlib" == d2.compressor.codec_id + assert 9 == d2.compressor.level + assert 42 == d2.fill_value + assert "F" == d2.order + assert path == d2.path + assert name == d2.name + assert g.store is d2.store + + # create with data + data = np.arange(3000, dtype="u2") + dpath = "bar" + d3 = g.create_dataset(dpath, data=data, chunks=300) + path, name = self._dataset_path(g, dpath) + assert isinstance(d3, Array) + assert (3000,) == d3.shape + assert (300,) == d3.chunks + assert np.dtype("u2") == d3.dtype + assert_array_equal(data, d3[:]) + assert path == d3.path + assert name == d3.name + assert g.store is d3.store + + # compression arguments handling follows... 
+ + # compression_opts as dict + d = g.create_dataset( + "aaa", + shape=1000, + dtype="u1", + compression="blosc", + compression_opts=dict(cname="zstd", clevel=1, shuffle=2), + ) + assert d.compressor.codec_id == "blosc" + assert "zstd" == d.compressor.cname + assert 1 == d.compressor.clevel + assert 2 == d.compressor.shuffle + + # compression_opts as sequence + d = g.create_dataset( + "bbb", shape=1000, dtype="u1", compression="blosc", compression_opts=("zstd", 1, 2) + ) + assert d.compressor.codec_id == "blosc" + assert "zstd" == d.compressor.cname + assert 1 == d.compressor.clevel + assert 2 == d.compressor.shuffle + + # None compression_opts + d = g.create_dataset("ccc", shape=1000, dtype="u1", compression="zlib") + assert d.compressor.codec_id == "zlib" + assert 1 == d.compressor.level + + # None compression + d = g.create_dataset("ddd", shape=1000, dtype="u1", compression=None) + assert d.compressor is None + + # compressor as compression + d = g.create_dataset("eee", shape=1000, dtype="u1", compression=Zlib(1)) + assert d.compressor.codec_id == "zlib" + assert 1 == d.compressor.level + + g.store.close() + + def test_require_dataset(self): + g = self.create_group() + + # create + dpath = "foo" + d1 = g.require_dataset(dpath, shape=1000, chunks=100, dtype="f4") + d1[:] = np.arange(1000) + path, name = self._dataset_path(g, dpath) + assert isinstance(d1, Array) + assert (1000,) == d1.shape + assert (100,) == d1.chunks + assert np.dtype("f4") == d1.dtype + assert path == d1.path + assert name == d1.name + assert g.store is d1.store + assert_array_equal(np.arange(1000), d1[:]) + + # require + d2 = g.require_dataset(dpath, shape=1000, chunks=100, dtype="f4") + assert isinstance(d2, Array) + assert (1000,) == d2.shape + assert (100,) == d2.chunks + assert np.dtype("f4") == d2.dtype + assert path == d2.path + assert name == d2.name + assert g.store is d2.store + assert_array_equal(np.arange(1000), d2[:]) + assert d1 == d2 + + # bad shape - use TypeError for h5py compatibility + with pytest.raises(TypeError): + g.require_dataset("foo", shape=2000, chunks=100, dtype="f4") + + # dtype matching + # can cast + d3 = g.require_dataset("foo", shape=1000, chunks=100, dtype="i2") + assert np.dtype("f4") == d3.dtype + assert d1 == d3 + with pytest.raises(TypeError): + # cannot cast + g.require_dataset("foo", shape=1000, chunks=100, dtype="i4") + with pytest.raises(TypeError): + # can cast but not exact match + g.require_dataset("foo", shape=1000, chunks=100, dtype="i2", exact=True) + + g.store.close() + + def test_create_errors(self): + g = self.create_group() + + # array obstructs group, array + g.create_dataset("foo", shape=100, chunks=10) + with pytest.raises(ValueError): + g.create_group("foo/bar") + with pytest.raises(ValueError): + g.require_group("foo/bar") + with pytest.raises(ValueError): + g.create_dataset("foo/bar", shape=100, chunks=10) + with pytest.raises(ValueError): + g.require_dataset("foo/bar", shape=100, chunks=10) + + # array obstructs group, array + g.create_dataset("a/b", shape=100, chunks=10) + with pytest.raises(ValueError): + g.create_group("a/b") + with pytest.raises(ValueError): + g.require_group("a/b") + with pytest.raises(ValueError): + g.create_dataset("a/b", shape=100, chunks=10) + + # group obstructs array + g.create_group("c/d") + with pytest.raises(ValueError): + g.create_dataset("c", shape=100, chunks=10) + with pytest.raises(ValueError): + g.require_dataset("c", shape=100, chunks=10) + with pytest.raises(ValueError): + g.create_dataset("c/d", shape=100, chunks=10) + 
with pytest.raises(ValueError): + g.require_dataset("c/d", shape=100, chunks=10) + + # h5py compatibility, accept 'fillvalue' + d = g.create_dataset("x", shape=100, chunks=10, fillvalue=42) + assert 42 == d.fill_value + + # h5py compatibility, ignore 'shuffle' + with pytest.warns(UserWarning, match="ignoring keyword argument 'shuffle'"): + g.create_dataset("y", shape=100, chunks=10, shuffle=True) + + # read-only + g = self.create_group(read_only=True) + with pytest.raises(PermissionError): + g.create_group("zzz") + with pytest.raises(PermissionError): + g.require_group("zzz") + with pytest.raises(PermissionError): + g.create_dataset("zzz", shape=100, chunks=10) + with pytest.raises(PermissionError): + g.require_dataset("zzz", shape=100, chunks=10) + + g.store.close() + + def test_create_overwrite(self): + try: + for method_name in "create_dataset", "create", "empty", "zeros", "ones": + g = self.create_group() + getattr(g, method_name)("foo", shape=100, chunks=10) + + # overwrite array with array + d = getattr(g, method_name)("foo", shape=200, chunks=20, overwrite=True) + assert (200,) == d.shape + # overwrite array with group + g2 = g.create_group("foo", overwrite=True) + assert 0 == len(g2) + # overwrite group with array + d = getattr(g, method_name)("foo", shape=300, chunks=30, overwrite=True) + assert (300,) == d.shape + # overwrite array with group + d = getattr(g, method_name)("foo/bar", shape=400, chunks=40, overwrite=True) + assert (400,) == d.shape + assert isinstance(g["foo"], Group) + + g.store.close() + except NotImplementedError: + pass + + def test_getitem_contains_iterators(self): + # setup + g1 = self.create_group() + g2 = g1.create_group("foo/bar") + if g1._version == 2: + d1 = g2.create_dataset("/a/b/c", shape=1000, chunks=100) + else: + # v3: cannot create a dataset at the root by starting with / + # instead, need to create the dataset on g1 directly + d1 = g1.create_dataset("a/b/c", shape=1000, chunks=100) + d1[:] = np.arange(1000) + d2 = g1.create_dataset("foo/baz", shape=3000, chunks=300) + d2[:] = np.arange(3000) + + # test __getitem__ + assert isinstance(g1["foo"], Group) + assert isinstance(g1["foo"]["bar"], Group) + assert isinstance(g1["foo/bar"], Group) + if g1._version == 2: + assert isinstance(g1["/foo/bar/"], Group) + else: + # start or end with / raises KeyError + # TODO: should we allow stripping of these on v3? 
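+ # (the same lookup that succeeds for v2 above is expected to raise here)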
+ with pytest.raises(KeyError): + assert isinstance(g1["/foo/bar/"], Group) + assert isinstance(g1["foo/baz"], Array) + assert g2 == g1["foo/bar"] + assert g1["foo"]["bar"] == g1["foo/bar"] + assert d2 == g1["foo/baz"] + assert_array_equal(d2[:], g1["foo/baz"]) + assert isinstance(g1["a"], Group) + assert isinstance(g1["a"]["b"], Group) + assert isinstance(g1["a/b"], Group) + assert isinstance(g1["a"]["b"]["c"], Array) + assert isinstance(g1["a/b/c"], Array) + assert d1 == g1["a/b/c"] + assert g1["a"]["b"]["c"] == g1["a/b/c"] + assert_array_equal(d1[:], g1["a/b/c"][:]) + + # test __contains__ + assert "foo" in g1 + assert "foo/bar" in g1 + assert "foo/baz" in g1 + assert "bar" in g1["foo"] + assert "a" in g1 + assert "a/b" in g1 + assert "a/b/c" in g1 + assert "baz" not in g1 + assert "a/b/c/d" not in g1 + assert "a/z" not in g1 + assert "quux" not in g1["foo"] + + # test key errors + with pytest.raises(KeyError): + g1["baz"] + with pytest.raises(KeyError): + g1["x/y/z"] + + # test __len__ + assert 2 == len(g1) + assert 2 == len(g1["foo"]) + assert 0 == len(g1["foo/bar"]) + assert 1 == len(g1["a"]) + assert 1 == len(g1["a/b"]) + + # test __iter__, keys() + + if g1._version == 2: + # currently assumes sorted by key + assert ["a", "foo"] == list(g1) + assert ["a", "foo"] == list(g1.keys()) + assert ["bar", "baz"] == list(g1["foo"]) + assert ["bar", "baz"] == list(g1["foo"].keys()) + else: + # v3 is not necessarily sorted by key + assert ["a", "foo"] == sorted(list(g1)) + assert ["a", "foo"] == sorted(list(g1.keys())) + assert ["bar", "baz"] == sorted(list(g1["foo"])) + assert ["bar", "baz"] == sorted(list(g1["foo"].keys())) + assert [] == sorted(g1["foo/bar"]) + assert [] == sorted(g1["foo/bar"].keys()) + + # test items(), values() + # currently assumes sorted by key + + items = list(g1.items()) + values = list(g1.values()) + if g1._version == 3: + # v3 are not automatically sorted by key + items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) + assert "a" == items[0][0] + assert g1["a"] == items[0][1] + assert g1["a"] == values[0] + assert "foo" == items[1][0] + assert g1["foo"] == items[1][1] + assert g1["foo"] == values[1] + + items = list(g1["foo"].items()) + values = list(g1["foo"].values()) + if g1._version == 3: + # v3 are not automatically sorted by key + items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) + assert "bar" == items[0][0] + assert g1["foo"]["bar"] == items[0][1] + assert g1["foo"]["bar"] == values[0] + assert "baz" == items[1][0] + assert g1["foo"]["baz"] == items[1][1] + assert g1["foo"]["baz"] == values[1] + + # test array_keys(), arrays(), group_keys(), groups() + + groups = list(g1.groups()) + arrays = list(g1.arrays()) + if g1._version == 2: + # currently assumes sorted by key + assert ["a", "foo"] == list(g1.group_keys()) + else: + assert ["a", "foo"] == sorted(list(g1.group_keys())) + groups = sorted(groups) + arrays = sorted(arrays) + assert "a" == groups[0][0] + assert g1["a"] == groups[0][1] + assert "foo" == groups[1][0] + assert g1["foo"] == groups[1][1] + assert [] == list(g1.array_keys()) + assert [] == arrays + + assert ["bar"] == list(g1["foo"].group_keys()) + assert ["baz"] == list(g1["foo"].array_keys()) + groups = list(g1["foo"].groups()) + arrays = list(g1["foo"].arrays()) + if g1._version == 3: + groups = sorted(groups) + arrays = sorted(arrays) + assert "bar" == groups[0][0] + assert g1["foo"]["bar"] == groups[0][1] + assert "baz" == arrays[0][0] + assert g1["foo"]["baz"] == arrays[0][1] + + # visitor collection 
tests + items = [] + + def visitor2(obj): + items.append(obj.path) + + # noinspection PyUnusedLocal + def visitor3(name, obj=None): + items.append(name) + + def visitor4(name, obj): + items.append((name, obj)) + + del items[:] + g1.visitvalues(visitor2) + expected_items = [ + "a", + "a/b", + "a/b/c", + "foo", + "foo/bar", + "foo/baz", + ] + if g1._version == 3: + expected_items = [g1.path + "/" + i for i in expected_items] + assert expected_items == items + + del items[:] + g1["foo"].visitvalues(visitor2) + expected_items = [ + "foo/bar", + "foo/baz", + ] + if g1._version == 3: + expected_items = [g1.path + "/" + i for i in expected_items] + assert expected_items == items + + del items[:] + g1.visit(visitor3) + assert [ + "a", + "a/b", + "a/b/c", + "foo", + "foo/bar", + "foo/baz", + ] == items + + del items[:] + g1["foo"].visit(visitor3) + assert [ + "bar", + "baz", + ] == items + + del items[:] + g1.visitkeys(visitor3) + assert [ + "a", + "a/b", + "a/b/c", + "foo", + "foo/bar", + "foo/baz", + ] == items + + del items[:] + g1["foo"].visitkeys(visitor3) + assert [ + "bar", + "baz", + ] == items + + del items[:] + g1.visititems(visitor3) + assert [ + "a", + "a/b", + "a/b/c", + "foo", + "foo/bar", + "foo/baz", + ] == items + + del items[:] + g1["foo"].visititems(visitor3) + assert [ + "bar", + "baz", + ] == items + + del items[:] + g1.visititems(visitor4) + for n, o in items: + assert g1[n] == o + + del items[:] + g1["foo"].visititems(visitor4) + for n, o in items: + assert g1["foo"][n] == o + + # visitor filter tests + # noinspection PyUnusedLocal + def visitor0(val, *args): + name = getattr(val, "path", val) + if name == "a/b/c/d": + return True # pragma: no cover + + # noinspection PyUnusedLocal + def visitor1(val, *args): + name = getattr(val, "path", val) + if name.startswith("group/"): + # strip the group path for v3 + name = name[6:] + if name == "a/b/c": + return True + + assert g1.visit(visitor0) is None + assert g1.visitkeys(visitor0) is None + assert g1.visitvalues(visitor0) is None + assert g1.visititems(visitor0) is None + assert g1.visit(visitor1) is True + assert g1.visitkeys(visitor1) is True + assert g1.visitvalues(visitor1) is True + assert g1.visititems(visitor1) is True + + g1.store.close() + + # regression test for https://github.com/zarr-developers/zarr-python/issues/1228 + def test_double_counting_group_v3(self): + root_group = self.create_group() + group_names = ["foo", "foo-", "foo_"] + for name in group_names: + sub_group = root_group.create_group(name) + sub_group.create("bar", shape=10, dtype="i4") + assert list(root_group.group_keys()) == sorted(group_names) + assert list(root_group.groups()) == [ + (name, root_group[name]) for name in sorted(group_names) + ] + + def test_empty_getitem_contains_iterators(self): + # setup + g = self.create_group() + + # test + assert [] == list(g) + assert [] == list(g.keys()) + assert 0 == len(g) + assert "foo" not in g + + g.store.close() + + def test_iterators_recurse(self): + # setup + g1 = self.create_group() + g2 = g1.create_group("foo/bar") + d1 = g2.create_dataset("/a/b/c", shape=1000, chunks=100) + d1[:] = np.arange(1000) + d2 = g1.create_dataset("foo/baz", shape=3000, chunks=300) + d2[:] = np.arange(3000) + d3 = g2.create_dataset("zab", shape=2000, chunks=200) + d3[:] = np.arange(2000) + + # test recursive array_keys + array_keys = list(g1["foo"].array_keys(recurse=False)) + array_keys_recurse = list(g1["foo"].array_keys(recurse=True)) + assert len(array_keys_recurse) > len(array_keys) + assert sorted(array_keys_recurse) 
== ["baz", "zab"] + + # test recursive arrays + arrays = list(g1["foo"].arrays(recurse=False)) + arrays_recurse = list(g1["foo"].arrays(recurse=True)) + assert len(arrays_recurse) > len(arrays) + assert "zab" == arrays_recurse[0][0] + assert g1["foo"]["bar"]["zab"] == arrays_recurse[0][1] + + g1.store.close() + + def test_getattr(self): + # setup + g1 = self.create_group() + g2 = g1.create_group("foo") + g2.create_dataset("bar", shape=100) + + # test + assert g1["foo"] == g1.foo + assert g2["bar"] == g2.bar + # test that hasattr returns False instead of an exception (issue #88) + assert not hasattr(g1, "unexistingattribute") + + g1.store.close() + + def test_setitem(self): + g = self.create_group() + try: + data = np.arange(100) + g["foo"] = data + assert_array_equal(data, g["foo"]) + data = np.arange(200) + g["foo"] = data + assert_array_equal(data, g["foo"]) + # 0d array + g["foo"] = 42 + assert () == g["foo"].shape + assert 42 == g["foo"][()] + except NotImplementedError: + pass + g.store.close() + + def test_delitem(self): + g = self.create_group() + g.create_group("foo") + g.create_dataset("bar/baz", shape=100, chunks=10) + assert "foo" in g + assert "bar" in g + assert "bar/baz" in g + try: + del g["bar"] + with pytest.raises(KeyError): + del g["xxx"] + except NotImplementedError: + pass + else: + assert "foo" in g + assert "bar" not in g + assert "bar/baz" not in g + g.store.close() + + def test_move(self): + g = self.create_group() + + data = np.arange(100) + g["boo"] = data + + data = np.arange(100) + g["foo"] = data + + g.move("foo", "bar") + assert "foo" not in g + assert "bar" in g + assert_array_equal(data, g["bar"]) + + g.move("bar", "foo/bar") + assert "bar" not in g + assert "foo" in g + assert "foo/bar" in g + assert isinstance(g["foo"], Group) + assert_array_equal(data, g["foo/bar"]) + + g.move("foo", "foo2") + assert "foo" not in g + assert "foo/bar" not in g + assert "foo2" in g + assert "foo2/bar" in g + assert isinstance(g["foo2"], Group) + assert_array_equal(data, g["foo2/bar"]) + + g2 = g["foo2"] + g2.move("bar", "/bar") + assert "foo2" in g + assert "foo2/bar" not in g + if g2._version == 2: + assert "bar" in g + else: + # The `g2.move` call above moved bar to meta/root/bar and + # meta/data/bar. This is outside the `g` group located at + # /meta/root/group, so bar is no longer within `g`. + assert "bar" not in g + assert "meta/root/bar.array.json" in g._store + if g._chunk_store: + assert "data/root/bar/c0" in g._chunk_store + else: + assert "data/root/bar/c0" in g._store + assert isinstance(g["foo2"], Group) + if g2._version == 2: + assert_array_equal(data, g["bar"]) + else: + # TODO: How to access element created outside of group.path in v3? + # One option is to make a Hierarchy class representing the + # root. Currently Group requires specification of `path`, + # but the path of the root would be just '' which is not + # currently allowed. 
+ pass + + with pytest.raises(ValueError): + g2.move("bar", "bar2") + + with pytest.raises(ValueError): + g.move("bar", "boo") + + g.store.close() + + def test_array_creation(self): + grp = self.create_group() + + a = grp.create("a", shape=100, chunks=10) + assert isinstance(a, Array) + b = grp.empty("b", shape=100, chunks=10) + assert isinstance(b, Array) + assert b.fill_value is None + c = grp.zeros("c", shape=100, chunks=10) + assert isinstance(c, Array) + assert 0 == c.fill_value + d = grp.ones("d", shape=100, chunks=10) + assert isinstance(d, Array) + assert 1 == d.fill_value + e = grp.full("e", shape=100, chunks=10, fill_value=42) + assert isinstance(e, Array) + assert 42 == e.fill_value + + f = grp.empty_like("f", a) + assert isinstance(f, Array) + assert f.fill_value is None + g = grp.zeros_like("g", a) + assert isinstance(g, Array) + assert 0 == g.fill_value + h = grp.ones_like("h", a) + assert isinstance(h, Array) + assert 1 == h.fill_value + i = grp.full_like("i", e) + assert isinstance(i, Array) + assert 42 == i.fill_value + + j = grp.array("j", data=np.arange(100), chunks=10) + assert isinstance(j, Array) + assert_array_equal(np.arange(100), j[:]) + + grp.store.close() + + grp = self.create_group(read_only=True) + with pytest.raises(PermissionError): + grp.create("aa", shape=100, chunks=10) + with pytest.raises(PermissionError): + grp.empty("aa", shape=100, chunks=10) + with pytest.raises(PermissionError): + grp.zeros("aa", shape=100, chunks=10) + with pytest.raises(PermissionError): + grp.ones("aa", shape=100, chunks=10) + with pytest.raises(PermissionError): + grp.full("aa", shape=100, chunks=10, fill_value=42) + with pytest.raises(PermissionError): + grp.array("aa", data=np.arange(100), chunks=10) + with pytest.raises(PermissionError): + grp.create("aa", shape=100, chunks=10) + with pytest.raises(PermissionError): + grp.empty_like("aa", a) + with pytest.raises(PermissionError): + grp.zeros_like("aa", a) + with pytest.raises(PermissionError): + grp.ones_like("aa", a) + with pytest.raises(PermissionError): + grp.full_like("aa", a) + + grp.store.close() + + def test_paths(self): + g1 = self.create_group() + g2 = g1.create_group("foo/bar") + + if g1._version == 2: + assert g1 == g1["/"] + assert g1 == g1["//"] + assert g1 == g1["///"] + assert g1 == g2["/"] + assert g1 == g2["//"] + assert g1 == g2["///"] + assert g2 == g1["foo/bar"] + assert g2 == g1["/foo/bar"] + assert g2 == g1["foo/bar/"] + assert g2 == g1["//foo/bar"] + assert g2 == g1["//foo//bar//"] + assert g2 == g1["///foo///bar///"] + assert g2 == g2["/foo/bar"] + else: + # the expected key format gives a match + assert g2 == g1["foo/bar"] + + # TODO: Should presence of a trailing slash raise KeyError? + # The spec says "the final character is not a / character" + # but we currently strip trailing '/' as done for v2. 
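+ # (hence the trailing-slash lookup below currently resolves to the same group)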
+ assert g2 == g1["foo/bar/"] + + # double slash also currently works (spec doesn't mention this + # case, but have kept it for v2 behavior compatibility) + assert g2 == g1["foo//bar"] + + # TODO, root: fix these cases + # v3: leading / implies we are at the root, not within a group, + # so these all raise KeyError + for path in ["/foo/bar", "//foo/bar", "//foo//bar//", "///fooo///bar///"]: + with pytest.raises(KeyError): + g1[path] + + with pytest.raises(ValueError): + g1["."] + with pytest.raises(ValueError): + g1[".."] + with pytest.raises(ValueError): + g1["foo/."] + with pytest.raises(ValueError): + g1["foo/.."] + with pytest.raises(ValueError): + g1["foo/./bar"] + with pytest.raises(ValueError): + g1["foo/../bar"] + + g1.store.close() + + def test_pickle(self): + # setup group + g = self.create_group() + d = g.create_dataset("foo/bar", shape=100, chunks=10) + d[:] = np.arange(100) + path = g.path + name = g.name + n = len(g) + keys = list(g) + + # round-trip through pickle + dump = pickle.dumps(g) + # some stores cannot be opened twice at the same time, need to close + # store before can round-trip through pickle + g.store.close() + g2 = pickle.loads(dump) + + # verify + assert path == g2.path + assert name == g2.name + assert n == len(g2) + assert keys == list(g2) + assert isinstance(g2["foo"], Group) + assert isinstance(g2["foo/bar"], Array) + + g2.store.close() + + def test_context_manager(self): + with self.create_group() as g: + d = g.create_dataset("foo/bar", shape=100, chunks=10) + d[:] = np.arange(100) + + +@pytest.mark.parametrize("chunk_dict", [False, True]) +def test_group_init_from_dict(chunk_dict): + if chunk_dict: + store, chunk_store = dict(), dict() + else: + store, chunk_store = dict(), None + init_group(store, path=None, chunk_store=chunk_store) + g = Group(store, path=None, read_only=False, chunk_store=chunk_store) + assert store is not g.store + assert isinstance(g.store, KVStore) + if chunk_store is None: + assert g.store is g.chunk_store + else: + assert chunk_store is not g.chunk_store + + +# noinspection PyStatementEffect +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3(TestGroup, unittest.TestCase): + @staticmethod + def create_store(): + # can be overridden in sub-classes + return KVStoreV3(dict()), None + + def create_group( + self, store=None, path="group", read_only=False, chunk_store=None, synchronizer=None + ): + # can be overridden in sub-classes + if store is None: + store, chunk_store = self.create_store() + init_group(store, path=path, chunk_store=chunk_store) + g = Group( + store, + path=path, + read_only=read_only, + chunk_store=chunk_store, + synchronizer=synchronizer, + ) + return g + + def test_group_init_1(self): + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store) + assert store is g.store + if chunk_store is None: + assert store is g.chunk_store + else: + assert chunk_store is g.chunk_store + assert not g.read_only + # different path/name in v3 case + assert "group" == g.path + assert "/group" == g.name + assert "group" == g.basename + + assert isinstance(g.attrs, Attributes) + g.attrs["foo"] = "bar" + assert g.attrs["foo"] == "bar" + + assert isinstance(g.info, InfoReporter) + assert isinstance(repr(g.info), str) + assert isinstance(g.info._repr_html_(), str) + store.close() + + def test_group_init_errors_2(self): + store, chunk_store = self.create_store() + path = "tmp" + init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) + # 
array blocks group + with pytest.raises(ValueError): + Group(store, path=path, chunk_store=chunk_store) + store.close() + + +class TestGroupWithMemoryStore(TestGroup): + @staticmethod + def create_store(): + return MemoryStore(), None + + +# noinspection PyStatementEffect +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithMemoryStore(TestGroupWithMemoryStore, TestGroupV3): + @staticmethod + def create_store(): + return MemoryStoreV3(), None + + +class TestGroupWithDirectoryStore(TestGroup): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStore(path) + return store, None + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithDirectoryStore(TestGroupWithDirectoryStore, TestGroupV3): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path) + return store, None + + +@skip_test_env_var("ZARR_TEST_ABS") +class TestGroupWithABSStore(TestGroup): + @staticmethod + def create_store(): + container_client = abs_container() + store = ABSStore(client=container_client) + store.rmdir() + return store, None + + @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") + def test_pickle(self): + # internal attribute on ContainerClient isn't serializable for py36 and earlier + super().test_pickle() + + +@skip_test_env_var("ZARR_TEST_ABS") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithABSStore(TestGroupV3): + @staticmethod + def create_store(): + container_client = abs_container() + store = ABSStoreV3(client=container_client) + store.rmdir() + return store, None + + @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") + def test_pickle(self): + # internal attribute on ContainerClient isn't serializable for py36 and earlier + super().test_pickle() + + +class TestGroupWithNestedDirectoryStore(TestGroup): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = NestedDirectoryStore(path) + return store, None + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestGroupWithFSStore(TestGroup): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStore(path) + return store, None + + def test_round_trip_nd(self): + data = np.arange(1000).reshape(10, 10, 10) + name = "raw" + + store, _ = self.create_store() + f = open_group(store, mode="w") + f.create_dataset(name, data=data, chunks=(5, 5, 5), compressor=None) + assert name in f + h = open_group(store, mode="r") + np.testing.assert_array_equal(h[name][:], data) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithFSStore(TestGroupWithFSStore, TestGroupV3): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStoreV3(path) + return store, None + + def test_round_trip_nd(self): + data = np.arange(1000).reshape(10, 10, 10) + name = "raw" + + store, _ = self.create_store() + f = open_group(store, path="group", mode="w") + f.create_dataset(name, data=data, chunks=(5, 5, 5), compressor=None) + h = open_group(store, path="group", mode="r") + np.testing.assert_array_equal(h[name][:], data) + + f = open_group(store, 
path="group2", mode="w") + + data_size = data.nbytes + group_meta_size = buffer_size(store[meta_root + "group.group.json"]) + group2_meta_size = buffer_size(store[meta_root + "group2.group.json"]) + array_meta_size = buffer_size(store[meta_root + "group/raw.array.json"]) + assert store.getsize() == data_size + group_meta_size + group2_meta_size + array_meta_size + # added case with path to complete coverage + assert store.getsize("group") == data_size + group_meta_size + array_meta_size + assert store.getsize("group2") == group2_meta_size + assert store.getsize("group/raw") == data_size + array_meta_size + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestGroupWithNestedFSStore(TestGroupWithFSStore): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStore(path, key_separator="/", auto_mkdir=True) + return store, None + + def test_inconsistent_dimension_separator(self): + data = np.arange(1000).reshape(10, 10, 10) + name = "raw" + + store, _ = self.create_store() + f = open_group(store, mode="w") + + # cannot specify dimension_separator that conflicts with the store + with pytest.raises(ValueError): + f.create_dataset( + name, data=data, chunks=(5, 5, 5), compressor=None, dimension_separator="." + ) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithNestedFSStore(TestGroupV3WithFSStore): + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStoreV3(path, key_separator="/", auto_mkdir=True) + return store, None + + def test_inconsistent_dimension_separator(self): + data = np.arange(1000).reshape(10, 10, 10) + name = "raw" + + store, _ = self.create_store() + f = open_group(store, path="group", mode="w") + + # cannot specify dimension_separator that conflicts with the store + with pytest.raises(ValueError): + f.create_dataset( + name, data=data, chunks=(5, 5, 5), compressor=None, dimension_separator="." + ) + + +class TestGroupWithZipStore(TestGroup): + @staticmethod + def create_store(): + path = mktemp(suffix=".zip") + atexit.register(os.remove, path) + store = ZipStore(path) + return store, None + + def test_context_manager(self): + with self.create_group() as g: + store = g.store + d = g.create_dataset("foo/bar", shape=100, chunks=10) + d[:] = np.arange(100) + + # Check that exiting the context manager closes the store, + # and therefore the underlying ZipFile. + with pytest.raises(ValueError): + store.zf.extractall() + + def test_move(self): + # zip store is not erasable (can so far only append to a zip + # so we can't test for move. 
+ pass + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithZipStore(TestGroupWithZipStore, TestGroupV3): + @staticmethod + def create_store(): + path = mktemp(suffix=".zip") + atexit.register(os.remove, path) + store = ZipStoreV3(path) + return store, None + + +class TestGroupWithDBMStore(TestGroup): + @staticmethod + def create_store(): + path = mktemp(suffix=".anydbm") + atexit.register(atexit_rmglob, path + "*") + store = DBMStore(path, flag="n") + return store, None + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithDBMStore(TestGroupWithDBMStore, TestGroupV3): + @staticmethod + def create_store(): + path = mktemp(suffix=".anydbm") + atexit.register(atexit_rmglob, path + "*") + store = DBMStoreV3(path, flag="n") + return store, None + + +class TestGroupWithLMDBStore(TestGroup): + @staticmethod + def create_store(): + pytest.importorskip("lmdb") + path = mktemp(suffix=".lmdb") + atexit.register(atexit_rmtree, path) + store = LMDBStore(path) + return store, None + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithLMDBStore(TestGroupWithLMDBStore, TestGroupV3): + @staticmethod + def create_store(): + pytest.importorskip("lmdb") + path = mktemp(suffix=".lmdb") + atexit.register(atexit_rmtree, path) + store = LMDBStoreV3(path) + return store, None + + +class TestGroupWithSQLiteStore(TestGroup): + def create_store(self): + pytest.importorskip("sqlite3") + path = mktemp(suffix=".db") + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + return store, None + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithSQLiteStore(TestGroupWithSQLiteStore, TestGroupV3): + def create_store(self): + pytest.importorskip("sqlite3") + path = mktemp(suffix=".db") + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path) + return store, None + + +class TestGroupWithChunkStore(TestGroup): + @staticmethod + def create_store(): + return KVStore(dict()), KVStore(dict()) + + def test_chunk_store(self): + # setup + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store) + + # check attributes + assert store is g.store + assert chunk_store is g.chunk_store + + # create array + a = g.zeros("foo", shape=100, chunks=10) + assert store is a.store + assert chunk_store is a.chunk_store + a[:] = np.arange(100) + assert_array_equal(np.arange(100), a[:]) + + # check store keys + expect = sorted([group_meta_key, "foo/" + array_meta_key]) + actual = sorted(store.keys()) + assert expect == actual + expect = ["foo/" + str(i) for i in range(10)] + actual = sorted(chunk_store.keys()) + assert expect == actual + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithChunkStore(TestGroupWithChunkStore, TestGroupV3): + @staticmethod + def create_store(): + return KVStoreV3(dict()), KVStoreV3(dict()) + + def test_chunk_store(self): + # setup + store, chunk_store = self.create_store() + path = "group1" + g = self.create_group(store, path=path, chunk_store=chunk_store) + + # check attributes + assert store is g.store + assert chunk_store is g.chunk_store + + # create array + a = g.zeros("foo", shape=100, chunks=10) + assert store is a.store + assert chunk_store is a.chunk_store + a[:] = np.arange(100) + assert_array_equal(np.arange(100), a[:]) + + # check store keys + group_key = meta_root + path + ".group.json" + array_key = meta_root + path + "/foo" + ".array.json" + 
expect = sorted([group_key, array_key, "zarr.json"]) + actual = sorted(store.keys()) + assert expect == actual + expect = [data_root + path + "/foo/c" + str(i) for i in range(10)] + expect += ["zarr.json"] + actual = sorted(chunk_store.keys()) + assert expect == actual + + +class TestGroupWithStoreCache(TestGroup): + @staticmethod + def create_store(): + store = LRUStoreCache(dict(), max_size=None) + return store, None + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestGroupV3WithStoreCache(TestGroupWithStoreCache, TestGroupV3): + @staticmethod + def create_store(): + store = LRUStoreCacheV3(dict(), max_size=None) + return store, None + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_group(zarr_version): + # test the group() convenience function + + # basic usage + if zarr_version == 2: + g = group() + assert "" == g.path + assert "/" == g.name + else: + g = group(path="group1", zarr_version=zarr_version) + assert "group1" == g.path + assert "/group1" == g.name + assert isinstance(g, Group) + + # usage with custom store + if zarr_version == 2: + store = KVStore(dict()) + path = None + else: + store = KVStoreV3(dict()) + path = "foo" + g = group(store=store, path=path) + assert isinstance(g, Group) + assert store is g.store + + # overwrite behaviour + if zarr_version == 2: + store = KVStore(dict()) + path = None + else: + store = KVStoreV3(dict()) + path = "foo" + init_array(store, path=path, shape=100, chunks=10) + with pytest.raises(ValueError): + group(store, path=path) + g = group(store, path=path, overwrite=True) + assert isinstance(g, Group) + assert store is g.store + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_group_writeable_mode(zarr_version, tmp_path): + # Regression test for https://github.com/zarr-developers/zarr-python/issues/1353 + import fsspec + + store = fsspec.get_mapper(str(tmp_path)) + zg = group(store=store) + assert zg.store.map == store + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_open_group(zarr_version): + # test the open_group() convenience function + + store = "data/group.zarr" + + expected_store_type = DirectoryStore if zarr_version == 2 else DirectoryStoreV3 + + # mode == 'w' + path = None if zarr_version == 2 else "group1" + g = open_group(store, path=path, mode="w", zarr_version=zarr_version) + assert isinstance(g, Group) + assert isinstance(g.store, expected_store_type) + assert 0 == len(g) + g.create_groups("foo", "bar") + assert 2 == len(g) + + # mode in 'r', 'r+' + open_array("data/array.zarr", shape=100, chunks=10, mode="w") + for mode in "r", "r+": + with pytest.raises(ValueError): + open_group("doesnotexist", mode=mode) + with pytest.raises(ValueError): + open_group("data/array.zarr", mode=mode) + g = open_group(store, mode="r") + assert isinstance(g, Group) + assert 2 == len(g) + with pytest.raises(PermissionError): + g.create_group("baz") + g = open_group(store, mode="r+") + assert isinstance(g, Group) + assert 2 == len(g) + g.create_groups("baz", "quux") + assert 4 == len(g) + + # mode == 'a' + shutil.rmtree(store) + g = open_group(store, path=path, mode="a", zarr_version=zarr_version) + assert isinstance(g, Group) + assert isinstance(g.store, expected_store_type) + assert 0 == len(g) + g.create_groups("foo", "bar") + assert 2 == len(g) + if zarr_version == 2: + with pytest.raises(ValueError): + open_group("data/array.zarr", mode="a", zarr_version=zarr_version) + else: + # TODO, root: should 
this raise an error? + open_group("data/array.zarr", mode="a", zarr_version=zarr_version) + + # mode in 'w-', 'x' + for mode in "w-", "x": + shutil.rmtree(store) + g = open_group(store, path=path, mode=mode, zarr_version=zarr_version) + assert isinstance(g, Group) + assert isinstance(g.store, expected_store_type) + assert 0 == len(g) + g.create_groups("foo", "bar") + assert 2 == len(g) + with pytest.raises(ValueError): + open_group(store, path=path, mode=mode, zarr_version=zarr_version) + if zarr_version == 2: + with pytest.raises(ValueError): + open_group("data/array.zarr", mode=mode) + + # open with path + g = open_group(store, path="foo/bar", zarr_version=zarr_version) + assert isinstance(g, Group) + assert "foo/bar" == g.path + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_group_completions(zarr_version): + path = None if zarr_version == 2 else "group1" + g = group(path=path, zarr_version=zarr_version) + d = dir(g) + assert "foo" not in d + assert "bar" not in d + assert "baz" not in d + assert "qux" not in d + assert "xxx" not in d + assert "yyy" not in d + assert "zzz" not in d + assert "123" not in d + assert "456" not in d + g.create_groups("foo", "bar", "baz/qux", "123") + g.zeros("xxx", shape=100) + g.zeros("yyy", shape=100) + g.zeros("zzz", shape=100) + g.zeros("456", shape=100) + d = dir(g) + assert "foo" in d + assert "bar" in d + assert "baz" in d + assert "qux" not in d + assert "xxx" in d + assert "yyy" in d + assert "zzz" in d + assert "123" not in d # not valid identifier + assert "456" not in d # not valid identifier + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_group_key_completions(zarr_version): + path = None if zarr_version == 2 else "group1" + g = group(path=path, zarr_version=zarr_version) + d = dir(g) + # noinspection PyProtectedMember + k = g._ipython_key_completions_() + + # none of these names should be an attribute + assert "foo" not in d + assert "bar" not in d + assert "baz" not in d + assert "qux" not in d + assert "xxx" not in d + assert "yyy" not in d + assert "zzz" not in d + assert "123" not in d + assert "456" not in d + assert "asdf;" not in d + + # none of these names should be an item + assert "foo" not in k + assert "bar" not in k + assert "baz" not in k + assert "qux" not in k + assert "xxx" not in k + assert "yyy" not in k + assert "zzz" not in k + assert "123" not in k + assert "456" not in k + assert "asdf;" not in k + + g.create_groups("foo", "bar", "baz/qux", "123") + g.zeros("xxx", shape=100) + g.zeros("yyy", shape=100) + g.zeros("zzz", shape=100) + g.zeros("456", shape=100) + if zarr_version == 2: + g.zeros("asdf;", shape=100) + else: + # cannot have ; in key name for v3 + with pytest.raises(ValueError): + g.zeros("asdf;", shape=100) + + d = dir(g) + # noinspection PyProtectedMember + k = g._ipython_key_completions_() + + assert "foo" in d + assert "bar" in d + assert "baz" in d + assert "qux" not in d + assert "xxx" in d + assert "yyy" in d + assert "zzz" in d + assert "123" not in d # not valid identifier + assert "456" not in d # not valid identifier + if zarr_version == 2: + assert "asdf;" not in d # not valid identifier + + assert "foo" in k + assert "bar" in k + assert "baz" in k + assert "qux" not in k + assert "xxx" in k + assert "yyy" in k + assert "zzz" in k + assert "123" in k + assert "456" in k + if zarr_version == 2: + assert "asdf;" in k + + +def _check_tree(g, expect_bytes, expect_text): + assert expect_bytes == bytes(g.tree()) + assert expect_text == str(g.tree()) + expect_repr = 
expect_text + assert expect_repr == repr(g.tree()) + if ipytree: + # noinspection PyProtectedMember + widget = g.tree()._repr_mimebundle_() + isinstance(widget, ipytree.Tree) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +@pytest.mark.parametrize("at_root", [False, True]) +def test_tree(zarr_version, at_root): + # setup + path = None if at_root else "group1" + g1 = group(path=path, zarr_version=zarr_version) + g2 = g1.create_group("foo") + g3 = g1.create_group("bar") + g3.create_group("baz") + g5 = g3.create_group("quux") + g5.create_dataset("baz", shape=100, chunks=10) + + tree_path = "/" if at_root else path + # test root group + if zarr_version == 2: + expect_bytes = textwrap.dedent( + f"""\ + {tree_path} + +-- bar + | +-- baz + | +-- quux + | +-- baz (100,) float64 + +-- foo""" + ).encode() + expect_text = textwrap.dedent( + f"""\ + {tree_path} + ├── bar + │ ├── baz + │ └── quux + │ └── baz (100,) float64 + └── foo""" + ) + else: + # Almost the same as for v2, but has a path name and the + # subgroups are not necessarily sorted alphabetically. + expect_bytes = textwrap.dedent( + f"""\ + {tree_path} + +-- foo + +-- bar + +-- baz + +-- quux + +-- baz (100,) float64""" + ).encode() + expect_text = textwrap.dedent( + f"""\ + {tree_path} + ├── foo + └── bar + ├── baz + └── quux + └── baz (100,) float64""" + ) + _check_tree(g1, expect_bytes, expect_text) + + # test different group + expect_bytes = textwrap.dedent( + """\ + foo""" + ).encode() + expect_text = textwrap.dedent( + """\ + foo""" + ) + _check_tree(g2, expect_bytes, expect_text) + + # test different group + expect_bytes = textwrap.dedent( + """\ + bar + +-- baz + +-- quux + +-- baz (100,) float64""" + ).encode() + expect_text = textwrap.dedent( + """\ + bar + ├── baz + └── quux + └── baz (100,) float64""" + ) + _check_tree(g3, expect_bytes, expect_text) + + +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +def test_group_mismatched_store_versions(): + store_v3 = KVStoreV3(dict()) + store_v2 = KVStore(dict()) + + # separate chunk store + chunk_store_v2 = KVStore(dict()) + chunk_store_v3 = KVStoreV3(dict()) + + init_group(store_v2, path="group1", chunk_store=chunk_store_v2) + init_group(store_v3, path="group1", chunk_store=chunk_store_v3) + + g1_v3 = Group(store_v3, path="group1", read_only=True, chunk_store=chunk_store_v3) + assert isinstance(g1_v3._store, KVStoreV3) + g1_v2 = Group(store_v2, path="group1", read_only=True, chunk_store=chunk_store_v2) + assert isinstance(g1_v2._store, KVStore) + + # store and chunk_store must have the same zarr protocol version + with pytest.raises(ValueError): + Group(store_v3, path="group1", read_only=False, chunk_store=chunk_store_v2) + with pytest.raises(ValueError): + Group(store_v2, path="group1", read_only=False, chunk_store=chunk_store_v3) + with pytest.raises(ValueError): + open_group(store_v2, path="group1", chunk_store=chunk_store_v3) + with pytest.raises(ValueError): + open_group(store_v3, path="group1", chunk_store=chunk_store_v2) + + # raises Value if read_only and path is not a pre-existing group + with pytest.raises(ValueError): + Group(store_v3, path="group2", read_only=True, chunk_store=chunk_store_v3) + with pytest.raises(ValueError): + Group(store_v3, path="group2", read_only=True, chunk_store=chunk_store_v3) + + +@pytest.mark.parametrize("zarr_version", _VERSIONS) +def test_open_group_from_paths(zarr_version): + """Verify zarr_version is applied to both the store and chunk_store.""" + store = tempfile.mkdtemp() + chunk_store = tempfile.mkdtemp() + 
atexit.register(atexit_rmtree, store) + atexit.register(atexit_rmtree, chunk_store) + path = "g1" + g = open_group(store, path=path, chunk_store=chunk_store, zarr_version=zarr_version) + assert g._store._store_version == g._chunk_store._store_version == zarr_version diff --git a/src/zarr/v2/tests/test_indexing.py b/src/zarr/v2/tests/test_indexing.py new file mode 100644 index 0000000000..a3afc101c5 --- /dev/null +++ b/src/zarr/v2/tests/test_indexing.py @@ -0,0 +1,1755 @@ +import numpy +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import zarr +from zarr.indexing import ( + make_slice_selection, + normalize_integer_selection, + oindex, + oindex_set, + replace_ellipsis, + PartialChunkIterator, +) + +from zarr.tests.util import CountingDict + + +def test_normalize_integer_selection(): + assert 1 == normalize_integer_selection(1, 100) + assert 99 == normalize_integer_selection(-1, 100) + with pytest.raises(IndexError): + normalize_integer_selection(100, 100) + with pytest.raises(IndexError): + normalize_integer_selection(1000, 100) + with pytest.raises(IndexError): + normalize_integer_selection(-1000, 100) + + +def test_replace_ellipsis(): + # 1D, single item + assert (0,) == replace_ellipsis(0, (100,)) + + # 1D + assert (slice(None),) == replace_ellipsis(Ellipsis, (100,)) + assert (slice(None),) == replace_ellipsis(slice(None), (100,)) + assert (slice(None, 100),) == replace_ellipsis(slice(None, 100), (100,)) + assert (slice(0, None),) == replace_ellipsis(slice(0, None), (100,)) + assert (slice(None),) == replace_ellipsis((slice(None), Ellipsis), (100,)) + assert (slice(None),) == replace_ellipsis((Ellipsis, slice(None)), (100,)) + + # 2D, single item + assert (0, 0) == replace_ellipsis((0, 0), (100, 100)) + assert (-1, 1) == replace_ellipsis((-1, 1), (100, 100)) + + # 2D, single col/row + assert (0, slice(None)) == replace_ellipsis((0, slice(None)), (100, 100)) + assert (0, slice(None)) == replace_ellipsis((0,), (100, 100)) + assert (slice(None), 0) == replace_ellipsis((slice(None), 0), (100, 100)) + + # 2D slice + assert (slice(None), slice(None)) == replace_ellipsis(Ellipsis, (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis(slice(None), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((slice(None), slice(None)), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((Ellipsis, slice(None)), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((slice(None), Ellipsis), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis( + (slice(None), Ellipsis, slice(None)), (100, 100) + ) + assert (slice(None), slice(None)) == replace_ellipsis( + (Ellipsis, slice(None), slice(None)), (100, 100) + ) + assert (slice(None), slice(None)) == replace_ellipsis( + (slice(None), slice(None), Ellipsis), (100, 100) + ) + + +def test_get_basic_selection_0d(): + # setup + a = np.array(42) + z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) + z[...] 
= a + + assert_array_equal(a, z.get_basic_selection(Ellipsis)) + assert_array_equal(a, z[...]) + assert 42 == z.get_basic_selection(()) + assert 42 == z[()] + + # test out param + b = np.zeros_like(a) + z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(a, b) + + # test structured array + value = (b"aaa", 1, 4.2) + a = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) + z[()] = value + assert_array_equal(a, z.get_basic_selection(Ellipsis)) + assert_array_equal(a, z[...]) + assert a[()] == z.get_basic_selection(()) + assert a[()] == z[()] + assert b"aaa" == z.get_basic_selection((), fields="foo") + assert b"aaa" == z["foo"] + assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) + assert a[["foo", "bar"]] == z["foo", "bar"] + # test out param + b = np.zeros_like(a) + z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(a, b) + c = np.zeros_like(a[["foo", "bar"]]) + z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) + assert_array_equal(a[["foo", "bar"]], c) + + +basic_selections_1d = [ + # single value + 42, + -1, + # slices + slice(0, 1050), + slice(50, 150), + slice(0, 2000), + slice(-150, -50), + slice(-2000, 2000), + slice(0, 0), # empty result + slice(-1, 0), # empty result + # total selections + slice(None), + Ellipsis, + (), + (Ellipsis, slice(None)), + # slice with step + slice(None), + slice(None, None), + slice(None, None, 1), + slice(None, None, 10), + slice(None, None, 100), + slice(None, None, 1000), + slice(None, None, 10000), + slice(0, 1050), + slice(0, 1050, 1), + slice(0, 1050, 10), + slice(0, 1050, 100), + slice(0, 1050, 1000), + slice(0, 1050, 10000), + slice(1, 31, 3), + slice(1, 31, 30), + slice(1, 31, 300), + slice(81, 121, 3), + slice(81, 121, 30), + slice(81, 121, 300), + slice(50, 150), + slice(50, 150, 1), + slice(50, 150, 10), +] + + +basic_selections_1d_bad = [ + # only positive step supported + slice(None, None, -1), + slice(None, None, -10), + slice(None, None, -100), + slice(None, None, -1000), + slice(None, None, -10000), + slice(1050, -1, -1), + slice(1050, -1, -10), + slice(1050, -1, -100), + slice(1050, -1, -1000), + slice(1050, -1, -10000), + slice(1050, 0, -1), + slice(1050, 0, -10), + slice(1050, 0, -100), + slice(1050, 0, -1000), + slice(1050, 0, -10000), + slice(150, 50, -1), + slice(150, 50, -10), + slice(31, 1, -3), + slice(121, 81, -3), + slice(-1, 0, -1), + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +def _test_get_basic_selection(a, z, selection): + expect = a[selection] + actual = z.get_basic_selection(selection) + assert_array_equal(expect, actual) + actual = z[selection] + assert_array_equal(expect, actual) + + +# noinspection PyStatementEffect +def test_get_basic_selection_1d(): + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + for selection in basic_selections_1d: + _test_get_basic_selection(a, z, selection) + + for selection in basic_selections_1d_bad: + with pytest.raises(IndexError): + z.get_basic_selection(selection) + with pytest.raises(IndexError): + z[selection] + + with pytest.raises(IndexError): + z.get_basic_selection([1, 0]) + + +basic_selections_2d = [ + # single row + 42, + -1, + (42, slice(None)), + (-1, slice(None)), + # single col + (slice(None), 4), + (slice(None), -1), + # row slices + slice(None), + slice(0, 1000), + slice(250, 350), + slice(0, 2000), + slice(-350, -250), + 
slice(0, 0), # empty result + slice(-1, 0), # empty result + slice(-2000, 0), + slice(-2000, 2000), + # 2D slices + (slice(None), slice(1, 5)), + (slice(250, 350), slice(None)), + (slice(250, 350), slice(1, 5)), + (slice(250, 350), slice(-5, -1)), + (slice(250, 350), slice(-50, 50)), + (slice(250, 350, 10), slice(1, 5)), + (slice(250, 350), slice(1, 5, 2)), + (slice(250, 350, 33), slice(1, 5, 3)), + # total selections + (slice(None), slice(None)), + Ellipsis, + (), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), slice(None)), +] + + +basic_selections_2d_bad = [ + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (2.3, slice(None)), + # only positive step supported + slice(None, None, -1), + (slice(None, None, -1), slice(None)), + (0, 0, 0), + (slice(None), slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_basic_selection_2d(): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + for selection in basic_selections_2d: + _test_get_basic_selection(a, z, selection) + + bad_selections = basic_selections_2d_bad + [ + # integer arrays + [0, 1], + (slice(None), [0, 1]), + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_basic_selection(selection) + # check fallback on fancy indexing + fancy_selection = ([0, 1], [0, 1]) + np.testing.assert_array_equal(z[fancy_selection], [0, 11]) + + +def test_fancy_indexing_fallback_on_get_setitem(): + z = zarr.zeros((20, 20)) + z[[1, 2, 3], [1, 2, 3]] = 1 + np.testing.assert_array_equal( + z[:4, :4], + [ + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + ], + ) + np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) + # test broadcasting + np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) + # test 1D fancy indexing + z2 = zarr.zeros(5) + z2[[1, 2, 3]] = 1 + np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[0, 1, 2], [3, 4, 5]]), + # List first, then slice + (([0, 1], slice(None)), [[0, 1, 2], [3, 4, 5]]), + # List first, then slice + (([0, 1], slice(1, None)), [[1, 2], [4, 5]]), + # Slice first, then list + ((slice(0, 2), [0, 2]), [[0, 2], [3, 5]]), + # Slices only + ((slice(0, 2), slice(0, 2)), [[0, 1], [3, 4]]), + # List with repeated index + (([1, 0, 1], slice(1, None)), [[4, 5], [1, 2], [4, 5]]), + # 1D indexing + (([1, 0, 1]), [[3, 4, 5], [0, 1, 2], [3, 4, 5]]), + ], +) +def test_orthogonal_indexing_fallback_on_getitem_2d(index, expected_result): + """ + Tests the orthogonal indexing fallback on __getitem__ for a 2D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. 
+ """ + # [0, 1, 2], + # [3, 4, 5], + # [6, 7, 8] + a = np.arange(9).reshape(3, 3) + z = zarr.array(a) + + np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[index], expected_result) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]), + # One slice, two integers + ((slice(0, 2), 1, 1), [4, 13]), + # One integer, two slices + ((slice(0, 2), 1, slice(0, 2)), [[3, 4], [12, 13]]), + # Two slices and a list + ((slice(0, 2), [1, 2], slice(0, 2)), [[[3, 4], [6, 7]], [[12, 13], [15, 16]]]), + ], +) +def test_orthogonal_indexing_fallback_on_getitem_3d(index, expected_result): + """ + Tests the orthogonal indexing fallback on __getitem__ for a 3D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # [[[ 0, 1, 2], + # [ 3, 4, 5], + # [ 6, 7, 8]], + + # [[ 9, 10, 11], + # [12, 13, 14], + # [15, 16, 17]], + + # [[18, 19, 20], + # [21, 22, 23], + # [24, 25, 26]]] + a = np.arange(27).reshape(3, 3, 3) + z = zarr.array(a) + + np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[index], expected_result) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[1, 1, 1], [1, 1, 1], [0, 0, 0]]), + # List and slice combined + (([0, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), + # Index repetition is ignored on setitem + (([0, 1, 1, 1, 1, 1, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), + # Slice with step + (([0, 2], slice(None, None, 2)), [[1, 0, 1], [0, 0, 0], [1, 0, 1]]), + ], +) +def test_orthogonal_indexing_fallback_on_setitem_2d(index, expected_result): + """ + Tests the orthogonal indexing fallback on __setitem__ for a 3D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # Slice + fancy index + a = np.zeros((3, 3)) + z = zarr.array(a) + z[index] = 1 + a[index] = 1 + np.testing.assert_array_equal(z, expected_result) + np.testing.assert_array_equal(z, a, err_msg="Indexing disagrees with numpy") + + +def test_fancy_indexing_doesnt_mix_with_implicit_slicing(): + z2 = zarr.zeros((5, 5, 5)) + with pytest.raises(IndexError): + z2[[1, 2, 3], [1, 2, 3]] = 2 + with pytest.raises(IndexError): + np.testing.assert_array_equal(z2[[1, 2, 3], [1, 2, 3]], 0) + with pytest.raises(IndexError): + z2[..., [1, 2, 3]] = 2 + with pytest.raises(IndexError): + np.testing.assert_array_equal(z2[..., [1, 2, 3]], 0) + + +def test_set_basic_selection_0d(): + # setup + v = np.array(42) + a = np.zeros_like(v) + z = zarr.zeros_like(v) + assert_array_equal(a, z) + + # tests + z.set_basic_selection(Ellipsis, v) + assert_array_equal(v, z) + z[...] = 0 + assert_array_equal(a, z) + z[...] = v + assert_array_equal(v, z) + + # test structured array + value = (b"aaa", 1, 4.2) + v = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + a = np.zeros_like(v) + z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) + + # tests + z.set_basic_selection(Ellipsis, v) + assert_array_equal(v, z) + z.set_basic_selection(Ellipsis, a) + assert_array_equal(a, z) + z[...] = v + assert_array_equal(v, z) + z[...] 
= a + assert_array_equal(a, z) + # with fields + z.set_basic_selection(Ellipsis, v["foo"], fields="foo") + assert v["foo"] == z["foo"] + assert a["bar"] == z["bar"] + assert a["baz"] == z["baz"] + z["bar"] = v["bar"] + assert v["foo"] == z["foo"] + assert v["bar"] == z["bar"] + assert a["baz"] == z["baz"] + # multiple field assignment not supported + with pytest.raises(IndexError): + z.set_basic_selection(Ellipsis, v[["foo", "bar"]], fields=["foo", "bar"]) + with pytest.raises(IndexError): + z[..., "foo", "bar"] = v[["foo", "bar"]] + + +def _test_get_orthogonal_selection(a, z, selection): + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_1d_bool(): + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_get_orthogonal_selection(a, z, ix) + + # test errors + with pytest.raises(IndexError): + z.oindex[np.zeros(50, dtype=bool)] # too short + with pytest.raises(IndexError): + z.oindex[np.zeros(2000, dtype=bool)] # too long + with pytest.raises(IndexError): + z.oindex[[[True, False], [False, True]]] # too many dimensions + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_1d_int(): + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + # unordered + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_get_orthogonal_selection(a, z, ix) + # increasing + ix.sort() + _test_get_orthogonal_selection(a, z, ix) + # decreasing + ix = ix[::-1] + _test_get_orthogonal_selection(a, z, ix) + + selections = basic_selections_1d + [ + # test wraparound + [0, 3, 10, -23, -12, -1], + # explicit test not sorted + [3, 105, 23, 127], + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + bad_selections = basic_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + [[2, 4], [6, 8]], # too many dimensions + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_orthogonal_selection(selection) + with pytest.raises(IndexError): + z.oindex[selection] + + +def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (ix0, slice(1, 5, 2)), + (slice(250, 350), ix1), + (slice(250, 350, 10), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_2d(): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + + # mixed int array / bool array + 
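# (orthogonal selection handles each axis independently, so a boolean mask on one axis can be paired with an integer index array on the other, e.g. z.oindex[np.nonzero(ix0)[0], ix1]) +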
selections = ( + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ) + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_get_orthogonal_selection(a, z, selection) + + for selection in basic_selections_2d_bad: + with pytest.raises(IndexError): + z.get_orthogonal_selection(selection) + with pytest.raises(IndexError): + z.oindex[selection] + + +def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2): + selections = [ + # single value + (84, 42, 4), + (-1, -1, -1), + # index all axes with array + (ix0, ix1, ix2), + # mixed indexing with single array / slices + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, slice(15, 25, 5), slice(1, 5, 2)), + (slice(50, 70, 3), ix1, slice(1, 5, 2)), + (slice(50, 70, 3), slice(15, 25, 5), ix2), + # mixed indexing with single array / ints + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + # mixed indexing with single array / slice / int + (ix0, slice(15, 25), 4), + (42, ix1, slice(1, 5)), + (slice(50, 70), 42, ix2), + # mixed indexing with two array / slice + (ix0, ix1, slice(1, 5)), + (slice(50, 70), ix1, ix2), + (ix0, slice(15, 25), ix2), + # mixed indexing with two array / integer + (ix0, ix1, 4), + (42, ix1, ix2), + (ix0, 42, ix2), + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + +def test_get_orthogonal_selection_3d(): + # setup + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() + ix2.sort() + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_edge_cases(): + a = np.arange(6).reshape(1, 2, 3) + z = zarr.create(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype) + z[:] = a + + expect = oindex(a, (0, slice(None), [0, 1, 2])) + actual = z.oindex[0, :, [0, 1, 2]] + assert_array_equal(expect, actual) + + expect = oindex(a, (0, slice(None), [True, True, True])) + actual = z.oindex[0, :, [True, True, True]] + assert_array_equal(expect, actual) + + +def _test_set_orthogonal_selection(v, a, z, selection): + for value in 42, oindex(v, selection), oindex(v, selection).tolist(): + if isinstance(value, list) and value == []: + # skip these cases as cannot preserve all 
dimensions + continue + # setup expectation + a[:] = 0 + oindex_set(a, selection, value) + # long-form API + z[:] = 0 + z.set_orthogonal_selection(selection, value) + assert_array_equal(a, z[:]) + # short-form API + z[:] = 0 + z.oindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_orthogonal_selection_1d(): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + # test with different degrees of sparseness + np.random.seed(42) + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_orthogonal_selection(v, a, z, ix) + + # integer arrays + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_set_orthogonal_selection(v, a, z, ix) + ix.sort() + _test_set_orthogonal_selection(v, a, z, ix) + ix = ix[::-1] + _test_set_orthogonal_selection(v, a, z, ix) + + # basic selections + for selection in basic_selections_1d: + _test_set_orthogonal_selection(v, a, z, selection) + + +def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice or int + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + (ix0, 4), + (42, ix1), + ] + for selection in selections: + _test_set_orthogonal_selection(v, a, z, selection) + + +def test_set_orthogonal_selection_2d(): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_set_orthogonal_selection(v, a, z, selection) + + +def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): + selections = ( + # single value + (84, 42, 4), + (-1, -1, -1), + # index all axes with bool array + (ix0, ix1, ix2), + # mixed indexing with single bool array / slice or int + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + (ix0, slice(15, 25), 4), + (slice(50, 70), ix1, 4), + (slice(50, 70), 42, ix2), + # indexing with two arrays / slice + (ix0, ix1, slice(1, 5)), + # indexing with two arrays / integer + (ix0, ix1, 4), + ) + for selection in selections: + _test_set_orthogonal_selection(v, a, z, selection) + + +def test_set_orthogonal_selection_3d(): + # setup + v = np.arange(100000, dtype=int).reshape(200, 50, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + ix2 = 
np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted increasing + ix0.sort() + ix1.sort() + ix2.sort() + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted decreasing + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_fallback_on_get_setitem(): + z = zarr.zeros((20, 20)) + z[[1, 2, 3], [1, 2, 3]] = 1 + np.testing.assert_array_equal( + z[:4, :4], + [ + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + ], + ) + np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) + # test broadcasting + np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) + # test 1D fancy indexing + z2 = zarr.zeros(5) + z2[[1, 2, 3]] = 1 + np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) + + +def _test_get_coordinate_selection(a, z, selection): + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +coordinate_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + Ellipsis, + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_coordinate_selection_1d(): + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) + _test_get_coordinate_selection(a, z, ix) + ix.sort() + _test_get_coordinate_selection(a, z, ix) + ix = ix[::-1] + _test_get_coordinate_selection(a, z, ix) + + selections = [ + # test single item + 42, + -1, + # test wraparound + [0, 3, 10, -23, -12, -1], + # test out of order + [3, 105, 23, 127], # not monotonically increasing + # test multi-dimensional selection + np.array([[2, 4], [6, 8]]), + ] + for selection in selections: + _test_get_coordinate_selection(a, z, selection) + + # test errors + bad_selections = coordinate_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + z.vindex[selection] + + +def test_get_coordinate_selection_2d(): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # single value + (42, 4), + (-1, -1), + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + (42, 4), + ] + for selection in selections: + _test_get_coordinate_selection(a, z, selection) + + # not monotonically increasing (first dim) + ix0 = [3, 3, 4, 2, 5] + 
ix1 = [1, 3, 5, 7, 9] + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + # not monotonically increasing (second dim) + ix0 = [1, 1, 2, 2, 5] + ix1 = [1, 3, 2, 1, 0] + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + # multi-dimensional selection + ix0 = np.array([[1, 1, 2], [2, 2, 5]]) + ix1 = np.array([[1, 3, 2], [1, 0, 0]]) + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = [1, 2, 3], slice(5, 15) + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis + z.get_coordinate_selection(selection) + + +def _test_set_coordinate_selection(v, a, z, selection): + for value in 42, v[selection], v[selection].tolist(): + # setup expectation + a[:] = 0 + a[selection] = value + # test long-form API + z[:] = 0 + z.set_coordinate_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.vindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_coordinate_selection_1d(): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) + _test_set_coordinate_selection(v, a, z, ix) + + # multi-dimensional selection + ix = np.array([[2, 4], [6, 8]]) + _test_set_coordinate_selection(v, a, z, ix) + + for selection in coordinate_selections_1d_bad: + with pytest.raises(IndexError): + z.set_coordinate_selection(selection, 42) + with pytest.raises(IndexError): + z.vindex[selection] = 42 + + +def test_set_coordinate_selection_2d(): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + + selections = ( + (42, 4), + (-1, -1), + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ) + for selection in selections: + _test_set_coordinate_selection(v, a, z, selection) + + # multi-dimensional selection + ix0 = np.array([[1, 2, 3], [4, 5, 6]]) + ix1 = np.array([[1, 3, 2], [2, 0, 5]]) + _test_set_coordinate_selection(v, a, z, (ix0, ix1)) + + +def _test_get_block_selection(a, z, selection, expected_idx): + expect = a[expected_idx] + actual = z.get_block_selection(selection) + assert_array_equal(expect, actual) + actual = z.blocks[selection] + assert_array_equal(expect, actual) + + +block_selections_1d = [ + # test single item + 0, + 5, + # test wraparound + -1, + -4, + # test slice + slice(5), + slice(None, 3), + slice(5, 6), + slice(-3, -1), + slice(None), # Full slice +] + +block_selections_1d_array_projection = [ + # test single item + slice(100), + slice(500, 600), + # test wraparound + slice(1000, None), + slice(700, 800), + # test slice + slice(500), + slice(None, 300), + slice(500, 600), + slice(800, 1000), + slice(None), +] + +block_selections_1d_bad = [ + # slice not supported + slice(3, 8, 2), + # bad stuff + 2.3, 
+ "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), + [0, 5, 3], +] + + +def test_get_block_selection_1d(): + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + for selection, expected_idx in zip(block_selections_1d, block_selections_1d_array_projection): + _test_get_block_selection(a, z, selection, expected_idx) + + bad_selections = block_selections_1d_bad + [ + z.nchunks + 1, # out of bounds + -(z.nchunks + 1), # out of bounds + ] + + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_block_selection(selection) + with pytest.raises(IndexError): + z.blocks[selection] + + +block_selections_2d = [ + # test single item + (0, 0), + (1, 2), + # test wraparound + (-1, -1), + (-3, -2), + # test slice + (slice(1), slice(2)), + (slice(None, 2), slice(-2, -1)), + (slice(2, 3), slice(-2, None)), + (slice(-3, -1), slice(-3, -2)), + (slice(None), slice(None)), # Full slice +] + +block_selections_2d_array_projection = [ + # test single item + (slice(300), slice(3)), + (slice(300, 600), slice(6, 9)), + # test wraparound + (slice(900, None), slice(9, None)), + (slice(300, 600), slice(6, 9)), + # test slice + (slice(300), slice(6)), + (slice(None, 600), slice(6, 9)), + (slice(600, 900), slice(6, None)), + (slice(300, 900), slice(3, 6)), + (slice(None), slice(None)), # Full slice +] + + +def test_get_block_selection_2d(): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + for selection, expected_idx in zip(block_selections_2d, block_selections_2d_array_projection): + _test_get_block_selection(a, z, selection, expected_idx) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.get_block_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_block_selection(selection) + with pytest.raises(IndexError): # out of bounds + selection = slice(15, 20), slice(None) + z.get_block_selection(selection) + + +def _test_set_block_selection(v: np.ndarray, a: np.ndarray, z: zarr.Array, selection, expected_idx): + for value in 42, v[expected_idx], v[expected_idx].tolist(): + # setup expectation + a[:] = 0 + a[expected_idx] = value + # test long-form API + z[:] = 0 + z.set_block_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.blocks[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_block_selection_1d(): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + for selection, expected_idx in zip(block_selections_1d, block_selections_1d_array_projection): + _test_set_block_selection(v, a, z, selection, expected_idx) + + for selection in block_selections_1d_bad: + with pytest.raises(IndexError): + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): + z.blocks[selection] = 42 + + +def test_set_block_selection_2d(): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + for selection, expected_idx in zip(block_selections_2d, block_selections_2d_array_projection): + _test_set_block_selection(v, a, z, selection, expected_idx) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): + selection = Ellipsis, 
[1, 2, 3] + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): # out of bounds + selection = slice(15, 20), slice(None) + z.set_block_selection(selection, 42) + + +def _test_get_mask_selection(a, z, selection): + expect = a[selection] + actual = z.get_mask_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +mask_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + Ellipsis, + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_mask_selection_1d(): + # setup + a = np.arange(1050, dtype=int) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_get_mask_selection(a, z, ix) + + # test errors + bad_selections = mask_selections_1d_bad + [ + np.zeros(50, dtype=bool), # too short + np.zeros(2000, dtype=bool), # too long + [[True, False], [False, True]], # too many dimensions + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_mask_selection(selection) + with pytest.raises(IndexError): + z.vindex[selection] + + +# noinspection PyStatementEffect +def test_get_mask_selection_2d(): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + _test_get_mask_selection(a, z, ix) + + # test errors + with pytest.raises(IndexError): + z.vindex[np.zeros((1000, 5), dtype=bool)] # too short + with pytest.raises(IndexError): + z.vindex[np.zeros((2000, 10), dtype=bool)] # too long + with pytest.raises(IndexError): + z.vindex[[True, False]] # wrong no. 
dimensions + + +def _test_set_mask_selection(v, a, z, selection): + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z.set_mask_selection(selection, v[selection]) + assert_array_equal(a, z[:]) + z[:] = 0 + z.vindex[selection] = v[selection] + assert_array_equal(a, z[:]) + + +def test_set_mask_selection_1d(): + # setup + v = np.arange(1050, dtype=int) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_mask_selection(v, a, z, ix) + + for selection in mask_selections_1d_bad: + with pytest.raises(IndexError): + z.set_mask_selection(selection, 42) + with pytest.raises(IndexError): + z.vindex[selection] = 42 + + +def test_set_mask_selection_2d(): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + _test_set_mask_selection(v, a, z, ix) + + +def test_get_selection_out(): + # basic selections + a = np.arange(1050) + z = zarr.create(shape=1050, chunks=100, dtype=a.dtype) + z[:] = a + selections = [ + slice(50, 150), + slice(0, 1050), + slice(1, 2), + ] + for selection in selections: + expect = a[selection] + out = zarr.create(shape=expect.shape, chunks=10, dtype=expect.dtype, fill_value=0) + z.get_basic_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + with pytest.raises(TypeError): + z.get_basic_selection(Ellipsis, out=[]) + + # orthogonal selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + # mixed int array / bool array + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ] + for selection in selections: + expect = oindex(a, selection) + # out = zarr.create(shape=expect.shape, chunks=10, dtype=expect.dtype, + # fill_value=0) + out = np.zeros(expect.shape, dtype=expect.dtype) + z.get_orthogonal_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + # coordinate selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) + z[:] = a + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + expect = a[selection] + out = np.zeros(expect.shape, dtype=expect.dtype) + z.get_coordinate_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + +def test_get_selections_with_fields(): + a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] + a = np.array(a, dtype=[("foo", 
"S3"), ("bar", "i4"), ("baz", "f8")]) + z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=None) + z[:] = a + + fields_fixture = [ + "foo", + ["foo"], + ["foo", "bar"], + ["foo", "baz"], + ["bar", "baz"], + ["foo", "bar", "baz"], + ["bar", "foo"], + ["baz", "bar", "foo"], + ] + + for fields in fields_fixture: + # total selection + expect = a[fields] + actual = z.get_basic_selection(Ellipsis, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[fields[0], fields[1]] + assert_array_equal(expect, actual) + if isinstance(fields, str): + actual = z[..., fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[..., fields[0], fields[1]] + assert_array_equal(expect, actual) + + # basic selection with slice + expect = a[fields][0:2] + actual = z.get_basic_selection(slice(0, 2), fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[0:2, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[0:2, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # basic selection with single item + expect = a[fields][1] + actual = z.get_basic_selection(1, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[1, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[1, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # orthogonal selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_orthogonal_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.oindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.oindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # coordinate selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_coordinate_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # mask selection + ix = [True, False, True] + expect = a[fields][ix] + actual = z.get_mask_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # missing/bad fields + with pytest.raises(IndexError): + z.get_basic_selection(Ellipsis, fields=["notafield"]) + with pytest.raises(IndexError): + z.get_basic_selection(Ellipsis, fields=slice(None)) + + +def test_set_selections_with_fields(): + v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] + v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + a = np.empty_like(v) + z = zarr.empty_like(v, chunks=2) + + fields_fixture = [ + "foo", + [], + ["foo"], + ["foo", "bar"], + ["foo", "baz"], + ["bar", "baz"], + ["foo", "bar", "baz"], + ["bar", "foo"], + ["baz", "bar", "foo"], + ] + + for fields in fields_fixture: + # currently multi-field assignment is not supported in numpy, so we won't support + # it either + if isinstance(fields, list) and len(fields) > 1: + 
with pytest.raises(IndexError): + z.set_basic_selection(Ellipsis, v, fields=fields) + with pytest.raises(IndexError): + z.set_orthogonal_selection([0, 2], v, fields=fields) + with pytest.raises(IndexError): + z.set_coordinate_selection([0, 2], v, fields=fields) + with pytest.raises(IndexError): + z.set_mask_selection([True, False, True], v, fields=fields) + + else: + if isinstance(fields, list) and len(fields) == 1: + # work around numpy does not support multi-field assignment even if there + # is only one field + key = fields[0] + elif isinstance(fields, list) and len(fields) == 0: + # work around numpy ambiguity about what is a field selection + key = Ellipsis + else: + key = fields + + # setup expectation + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + assert_array_equal(a, z[:]) + a[key] = v[key] + # total selection + z.set_basic_selection(Ellipsis, v[key], fields=fields) + assert_array_equal(a, z[:]) + + # basic selection with slice + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + a[key][0:2] = v[key][0:2] + z.set_basic_selection(slice(0, 2), v[key][0:2], fields=fields) + assert_array_equal(a, z[:]) + + # orthogonal selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [0, 2] + a[key][ix] = v[key][ix] + z.set_orthogonal_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + # coordinate selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [0, 2] + a[key][ix] = v[key][ix] + z.set_coordinate_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + # mask selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [True, False, True] + a[key][ix] = v[key][ix] + z.set_mask_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + +@pytest.mark.parametrize( + "selection, arr, expected", + [ + ( + (slice(5, 8, 1), slice(2, 4, 1), slice(0, 100, 1)), + np.arange(2, 100_002).reshape((100, 10, 100)), + [ + (5200, 200, (slice(5, 6, 1), slice(2, 4, 1))), + (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))), + (7200, 200, (slice(7, 8, 1), slice(2, 4, 1))), + ], + ), + ( + (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.arange(2, 100_002).reshape((100, 10, 100)), + [ + (5200.0, 5.0, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))), + (5300.0, 5.0, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))), + (6200.0, 5.0, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))), + (6300.0, 5.0, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))), + (7200.0, 5.0, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))), + (7300.0, 5.0, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1))), + ], + ), + ( + (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.asfortranarray(np.arange(2, 100_002).reshape((100, 10, 100))), + [ + (5200.0, 5.0, (slice(5, 6, 1), slice(2, 3, 1), slice(0, 5, 1))), + (5300.0, 5.0, (slice(5, 6, 1), slice(3, 4, 1), slice(0, 5, 1))), + (6200.0, 5.0, (slice(6, 7, 1), slice(2, 3, 1), slice(0, 5, 1))), + (6300.0, 5.0, (slice(6, 7, 1), slice(3, 4, 1), slice(0, 5, 1))), + (7200.0, 5.0, (slice(7, 8, 1), slice(2, 3, 1), slice(0, 5, 1))), + (7300.0, 5.0, (slice(7, 8, 1), slice(3, 4, 1), slice(0, 5, 1))), + ], + ), + ( + (slice(5, 8, 1), slice(2, 4, 1)), + np.arange(2, 100_002).reshape((100, 10, 100)), + [ + (5200, 200, (slice(5, 6, 1), slice(2, 4, 1))), + (6200, 200, (slice(6, 7, 1), slice(2, 4, 1))), + (7200, 200, (slice(7, 8, 1), slice(2, 4, 1))), + ], + ), + ( + (slice(0, 10, 1),), + np.arange(0, 10).reshape(10), + [(0, 10, (slice(0, 10, 1),))], + ), + ((0,), np.arange(0, 100).reshape((10, 10)), [(0, 10, (slice(0, 1, 1),))]), + ( + ( + 0, + 0, + ), + 
np.arange(0, 100).reshape((10, 10)), + [(0, 1, (slice(0, 1, 1), slice(0, 1, 1)))], + ), + ((0,), np.arange(0, 10).reshape(10), [(0, 1, (slice(0, 1, 1),))]), + pytest.param( + (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.arange(2, 100002).reshape((10, 1, 10000)), + None, + marks=[pytest.mark.xfail(reason="slice 2 is out of range")], + ), + pytest.param( + (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)), + np.arange(2, 100_002).reshape((10, 10_000)), + None, + marks=[pytest.mark.xfail(reason="slice 2 is out of range")], + ), + ], +) +def test_PartialChunkIterator(selection, arr, expected): + PCI = PartialChunkIterator(selection, arr.shape) + results = list(PCI) + assert results == expected + + +def test_slice_selection_uints(): + arr = np.arange(24).reshape((4, 6)) + idx = np.uint64(3) + slice_sel = make_slice_selection((idx,)) + assert arr[tuple(slice_sel)].shape == (1, 6) + + +def test_numpy_int_indexing(): + a = np.arange(1050) + z = zarr.create(shape=1050, chunks=100, dtype=a.dtype) + z[:] = a + assert a[42] == z[42] + assert a[numpy.int64(42)] == z[numpy.int64(42)] + + +@pytest.mark.parametrize( + "shape, chunks, ops", + [ + # 1D test cases + ((1070,), (50,), [("__getitem__", (slice(200, 400),))]), + ((1070,), (50,), [("__getitem__", (slice(200, 400, 100),))]), + ( + (1070,), + (50,), + [ + ("__getitem__", (slice(200, 400),)), + ("__setitem__", (slice(200, 400, 100),)), + ], + ), + # 2D test cases + ( + (40, 50), + (5, 8), + [ + ("__getitem__", (slice(6, 37, 13), (slice(4, 10)))), + ("__setitem__", (slice(None), (slice(None)))), + ], + ), + ], +) +def test_accessed_chunks(shape, chunks, ops): + # Test that only the required chunks are accessed during basic selection operations + # shape: array shape + # chunks: chunk size + # ops: list of tuples with (optype, tuple of slices) + # optype = "__getitem__" or "__setitem__", tuple length must match number of dims + import itertools + + # Use a counting dict as the backing store so we can track the items access + store = CountingDict() + z = zarr.create(shape=shape, chunks=chunks, store=store) + + for ii, (optype, slices) in enumerate(ops): + # Resolve the slices into the accessed chunks for each dimension + chunks_per_dim = [ + np.unique(np.arange(N, dtype=int)[sl] // C) for N, C, sl in zip(shape, chunks, slices) + ] + + # Combine and generate the cartesian product to determine the chunks keys that + # will be accessed + chunks_accessed = ( + ".".join([str(ci) for ci in comb]) for comb in itertools.product(*chunks_per_dim) + ) + counts_before = store.counter.copy() + + # Perform the operation + if optype == "__getitem__": + z[slices] + else: + z[slices] = ii + + # Get the change in counts + delta_counts = store.counter - counts_before + + # Check that the access counts for the operation have increased by one for all + # the chunks we expect to be included + for ci in chunks_accessed: + assert delta_counts.pop((optype, ci)) == 1 + + # If the chunk was partially written to it will also have been read once. 
We + # don't determine if the chunk was actually partial here, just that the + # counts are consistent that this might have happened + if optype == "__setitem__": + assert ("__getitem__", ci) not in delta_counts or delta_counts.pop( + ("__getitem__", ci) + ) == 1 + # Check that no other chunks were accessed + assert len(delta_counts) == 0 diff --git a/src/zarr/v2/tests/test_info.py b/src/zarr/v2/tests/test_info.py new file mode 100644 index 0000000000..96eae999f4 --- /dev/null +++ b/src/zarr/v2/tests/test_info.py @@ -0,0 +1,66 @@ +import numcodecs +import pytest + +import zarr +from zarr.util import InfoReporter + + +@pytest.mark.parametrize("array_size", [10, 15000]) +def test_info(array_size): + # setup + g = zarr.group(store=dict(), chunk_store=dict(), synchronizer=zarr.ThreadSynchronizer()) + g.create_group("foo") + z = g.zeros("bar", shape=array_size, filters=[numcodecs.Adler32()]) + + # test group info + items = g.info_items() + keys = sorted([k for k, _ in items]) + expected_keys = sorted( + [ + "Type", + "Read-only", + "Synchronizer type", + "Store type", + "Chunk store type", + "No. members", + "No. arrays", + "No. groups", + "Arrays", + "Groups", + "Name", + ] + ) + assert expected_keys == keys + + # can also get a string representation of info via the info attribute + assert isinstance(g.info, InfoReporter) + assert "Type" in repr(g.info) + + # test array info + items = z.info_items() + keys = sorted([k for k, _ in items]) + expected_keys = sorted( + [ + "Type", + "Data type", + "Shape", + "Chunk shape", + "Order", + "Read-only", + "Filter [0]", + "Compressor", + "Synchronizer type", + "Store type", + "Chunk store type", + "No. bytes", + "No. bytes stored", + "Storage ratio", + "Chunks initialized", + "Name", + ] + ) + assert expected_keys == keys + + # can also get a string representation of info via the info attribute + assert isinstance(z.info, InfoReporter) + assert "Type" in repr(z.info) diff --git a/src/zarr/v2/tests/test_meta.py b/src/zarr/v2/tests/test_meta.py new file mode 100644 index 0000000000..57ab9a0781 --- /dev/null +++ b/src/zarr/v2/tests/test_meta.py @@ -0,0 +1,640 @@ +import base64 +import copy +import json + +import numpy as np +import pytest + +from zarr.codecs import Blosc, Delta, Pickle, Zlib, Zstd +from zarr.errors import MetadataError +from zarr.meta import ( + ZARR_FORMAT, + decode_array_metadata, + decode_dtype, + decode_group_metadata, + encode_array_metadata, + encode_dtype, + encode_fill_value, + decode_fill_value, + get_extended_dtype_info, + _v3_complex_types, + _v3_datetime_types, + _default_entry_point_metadata_v3, + Metadata3, +) +from zarr.util import normalize_dtype, normalize_fill_value + + +def assert_json_equal(expect, actual): + if isinstance(actual, bytes): + actual = str(actual, "ascii") + ej = json.loads(expect) + aj = json.loads(actual) + assert ej == aj + + +def test_encode_decode_array_1(): + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype("U4", " CuPyCPUCompressor: + if compressor: + compressor = getattr(zarr.codecs, compressor)() + return CuPyCPUCompressor(compressor) + + +def init_store(tmp_path, store_type) -> Optional[Store]: + if store_type is DirectoryStore: + return store_type(str(tmp_path / "store")) + if store_type is MemoryStore: + return MemoryStore() + return None + + +def ensure_module(module): + if isinstance(module, str): + return pytest.importorskip(module) + return module + + +param_module_and_compressor = [ + (MyArray, None), + ("cupy", init_compressor(None)), + ("cupy", init_compressor("Zlib")), + 
("cupy", init_compressor("Blosc")), +] + + +@pytest.mark.parametrize("module, compressor", param_module_and_compressor) +@pytest.mark.parametrize("store_type", [None, DirectoryStore, MemoryStore, ZipStore]) +def test_array(tmp_path, module, compressor, store_type): + xp = ensure_module(module) + + store = init_store(tmp_path / "from_cupy_array", store_type) + a = xp.arange(100) + z = array(a, chunks=10, compressor=compressor, store=store, meta_array=xp.empty(())) + assert a.shape == z.shape + assert a.dtype == z.dtype + assert isinstance(a, type(z[:])) + assert isinstance(z.meta_array, type(xp.empty(()))) + xp.testing.assert_array_equal(a, z[:]) + + # with array-like + store = init_store(tmp_path / "from_list", store_type) + a = list(range(100)) + z = array(a, chunks=10, compressor=compressor, store=store, meta_array=xp.empty(())) + assert (100,) == z.shape + assert np.asarray(a).dtype == z.dtype + xp.testing.assert_array_equal(a, z[:]) + + # with another zarr array + store = init_store(tmp_path / "from_another_store", store_type) + z2 = array(z, compressor=compressor, store=store, meta_array=xp.empty(())) + assert z.shape == z2.shape + assert z.chunks == z2.chunks + assert z.dtype == z2.dtype + xp.testing.assert_array_equal(z[:], z2[:]) + + store = init_store(tmp_path / "open_array", store_type) + a = xp.arange(100) + z = open_array( + store, + shape=a.shape, + dtype=a.dtype, + chunks=10, + compressor=compressor, + meta_array=xp.empty(()), + ) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert isinstance(a, type(z[:])) + assert isinstance(z.meta_array, type(xp.empty(()))) + xp.testing.assert_array_equal(a, z[:]) + + +@pytest.mark.parametrize("module, compressor", param_module_and_compressor) +def test_empty(module, compressor): + xp = ensure_module(module) + z = empty( + 100, + chunks=10, + compressor=compressor, + meta_array=xp.empty(()), + ) + assert (100,) == z.shape + assert (10,) == z.chunks + + +@pytest.mark.parametrize("module, compressor", param_module_and_compressor) +def test_zeros(module, compressor): + xp = ensure_module(module) + z = zeros( + 100, + chunks=10, + compressor=compressor, + meta_array=xp.empty(()), + ) + assert (100,) == z.shape + assert (10,) == z.chunks + xp.testing.assert_array_equal(np.zeros(100), z[:]) + + +@pytest.mark.parametrize("module, compressor", param_module_and_compressor) +def test_ones(module, compressor): + xp = ensure_module(module) + z = ones( + 100, + chunks=10, + compressor=compressor, + meta_array=xp.empty(()), + ) + assert (100,) == z.shape + assert (10,) == z.chunks + xp.testing.assert_array_equal(np.ones(100), z[:]) + + +@pytest.mark.parametrize("module, compressor", param_module_and_compressor) +def test_full(module, compressor): + xp = ensure_module(module) + z = full( + 100, + chunks=10, + fill_value=42, + dtype="i4", + compressor=compressor, + meta_array=xp.empty(()), + ) + assert (100,) == z.shape + assert (10,) == z.chunks + xp.testing.assert_array_equal(np.full(100, fill_value=42, dtype="i4"), z[:]) + + # nan + z = full( + 100, + chunks=10, + fill_value=np.nan, + dtype="f8", + compressor=compressor, + meta_array=xp.empty(()), + ) + assert np.all(np.isnan(z[:])) + + +@pytest.mark.parametrize("group_create_function", [group, open_group]) +@pytest.mark.parametrize("module, compressor", param_module_and_compressor) +@pytest.mark.parametrize("store_type", [None, DirectoryStore, MemoryStore, ZipStore]) +def test_group(tmp_path, group_create_function, module, compressor, store_type): + xp = ensure_module(module) + 
store = init_store(tmp_path, store_type) + g = group_create_function(store, meta_array=xp.empty(())) + g.ones("data", shape=(10, 11), dtype=int, compressor=compressor) + a = g["data"] + assert a.shape == (10, 11) + assert a.dtype == int + assert isinstance(a, Array) + assert isinstance(a[:], type(xp.empty(()))) + assert (a[:] == 1).all() + assert isinstance(g.meta_array, type(xp.empty(()))) diff --git a/src/zarr/v2/tests/test_n5.py b/src/zarr/v2/tests/test_n5.py new file mode 100644 index 0000000000..2602aa06c1 --- /dev/null +++ b/src/zarr/v2/tests/test_n5.py @@ -0,0 +1,53 @@ +import pytest + +from zarr.n5 import N5ChunkWrapper, N5FSStore +from zarr.creation import create +from zarr.storage import atexit_rmtree +from numcodecs import GZip +import numpy as np +from typing import Tuple +import json +import atexit + +from zarr.tests.util import have_fsspec + + +def test_make_n5_chunk_wrapper(): + dtype = "uint8" + chunk_shape = (10,) + codec = GZip() + # ValueError when specifying both compressor and compressor_config + with pytest.raises(ValueError): + N5ChunkWrapper( + dtype, chunk_shape=chunk_shape, compressor_config=codec.get_config(), compressor=codec + ) + + wrapper_a = N5ChunkWrapper(dtype, chunk_shape=chunk_shape, compressor_config=codec.get_config()) + wrapper_b = N5ChunkWrapper(dtype, chunk_shape=chunk_shape, compressor=codec) + assert wrapper_a == wrapper_b + + +@pytest.mark.parametrize("chunk_shape", ((2,), (4, 4), (8, 8, 8))) +def test_partial_chunk_decode(chunk_shape: Tuple[int, ...]): + # Test that the N5Chunk wrapper can handle fractional chunks that + # may be generated by other N5 implementations + dtype = "uint8" + codec = GZip() + codec_wrapped = N5ChunkWrapper(dtype, chunk_shape=chunk_shape, compressor=codec) + subslices = tuple(slice(0, cs // 2) for cs in chunk_shape) + chunk = np.zeros(chunk_shape, dtype=dtype) + chunk[subslices] = 1 + subchunk = np.ascontiguousarray(chunk[subslices]) + assert np.array_equal(codec_wrapped.decode(codec_wrapped.encode(subchunk)), chunk) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +def test_dtype_decode(): + path = "data/array.n5" + atexit_rmtree(path) + atexit.register(atexit_rmtree, path) + n5_store = N5FSStore(path) + create(100, store=n5_store) + dtype_n5 = json.loads(n5_store[".zarray"])["dtype"] + dtype_zarr = json.loads(create(100).store[".zarray"])["dtype"] + assert dtype_n5 == dtype_zarr diff --git a/src/zarr/v2/tests/test_storage.py b/src/zarr/v2/tests/test_storage.py new file mode 100644 index 0000000000..d72718d77a --- /dev/null +++ b/src/zarr/v2/tests/test_storage.py @@ -0,0 +1,2623 @@ +import array +import atexit +import json +import os +import pathlib +import sys +import pickle +import shutil +import tempfile +from contextlib import contextmanager +from pickle import PicklingError +from zipfile import ZipFile + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from numcodecs.compat import ensure_bytes + +import zarr +from zarr._storage.store import _get_hierarchy_metadata +from zarr.codecs import BZ2, AsType, Blosc, Zlib +from zarr.context import Context +from zarr.convenience import consolidate_metadata +from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataError +from zarr.hierarchy import group +from zarr.meta import ZARR_FORMAT, decode_array_metadata +from zarr.n5 import N5Store, N5FSStore, N5_FORMAT, n5_attrs_key +from zarr.storage import ( + ABSStore, + ConsolidatedMetadataStore, + DBMStore, + DictStore, + 
DirectoryStore, + KVStore, + LMDBStore, + LRUStoreCache, + MemoryStore, + MongoDBStore, + NestedDirectoryStore, + RedisStore, + SQLiteStore, + Store, + TempStore, + ZipStore, + array_meta_key, + atexit_rmglob, + atexit_rmtree, + attrs_key, + data_root, + default_compressor, + getsize, + group_meta_key, + init_array, + init_group, + migrate_1to2, + meta_root, + normalize_store_arg, +) +from zarr.storage import FSStore, rename, listdir +from zarr._storage.v3 import KVStoreV3 +from zarr.tests.util import CountingDict, have_fsspec, skip_test_env_var, abs_container, mktemp +from zarr.util import ConstantMap, json_dumps + + +@contextmanager +def does_not_raise(): + yield + + +@pytest.fixture( + params=[ + (None, "."), + (".", "."), + ("/", "/"), + ] +) +def dimension_separator_fixture(request): + return request.param + + +def skip_if_nested_chunks(**kwargs): + if kwargs.get("dimension_separator") == "/": + pytest.skip("nested chunks are unsupported") + + +def test_kvstore_repr(): + repr(KVStore(dict())) + + +def test_ensure_store(): + class InvalidStore: + pass + + with pytest.raises(ValueError): + Store._ensure_store(InvalidStore()) + + # cannot initialize with a store from a different Zarr version + with pytest.raises(ValueError): + Store._ensure_store(KVStoreV3(dict())) + + # cannot initialize without a store + with pytest.raises(ValueError): + Store._ensure_store(None) + + +def test_capabilities(): + s = KVStore(dict()) + assert s.is_readable() + assert s.is_listable() + assert s.is_erasable() + assert s.is_writeable() + + +def test_getsize_non_implemented(): + assert getsize(object()) == -1 + + +def test_kvstore_eq(): + assert KVStore(dict()) != dict() + + +def test_coverage_rename(): + store = dict() + store["a"] = 1 + rename(store, "a", "b") + + +def test_deprecated_listdir_nosotre(): + store = dict() + with pytest.warns(UserWarning, match="has no `listdir`"): + listdir(store) + + +class StoreTests: + """Abstract store tests.""" + + version = 2 + root = "" + + def create_store(self, **kwargs): # pragma: no cover + # implement in sub-class + raise NotImplementedError + + def test_context_manager(self): + with self.create_store(): + pass + + def test_get_set_del_contains(self): + store = self.create_store() + + # test __contains__, __getitem__, __setitem__ + key = self.root + "foo" + assert key not in store + with pytest.raises(KeyError): + # noinspection PyStatementEffect + store[key] + store[key] = b"bar" + assert key in store + assert b"bar" == ensure_bytes(store[key]) + + # test __delitem__ (optional) + try: + del store[key] + except NotImplementedError: + pass + else: + assert key not in store + with pytest.raises(KeyError): + # noinspection PyStatementEffect + store[key] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + del store[key] + + store.close() + + def test_set_invalid_content(self): + store = self.create_store() + + with pytest.raises(TypeError): + store[self.root + "baz"] = list(range(5)) + + store.close() + + def test_clear(self): + store = self.create_store() + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"qux" + assert len(store) == 2 + store.clear() + assert len(store) == 0 + assert self.root + "foo" not in store + assert self.root + "baz" not in store + + store.close() + + def test_pop(self): + store = self.create_store() + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"qux" + assert len(store) == 2 + v = store.pop(self.root + "foo") + assert ensure_bytes(v) == b"bar" + assert len(store) == 1 + v = 
store.pop(self.root + "baz") + assert ensure_bytes(v) == b"qux" + assert len(store) == 0 + with pytest.raises(KeyError): + store.pop(self.root + "xxx") + v = store.pop(self.root + "xxx", b"default") + assert v == b"default" + v = store.pop(self.root + "xxx", b"") + assert v == b"" + v = store.pop(self.root + "xxx", None) + assert v is None + + store.close() + + def test_popitem(self): + store = self.create_store() + store[self.root + "foo"] = b"bar" + k, v = store.popitem() + assert k == self.root + "foo" + assert ensure_bytes(v) == b"bar" + assert len(store) == 0 + with pytest.raises(KeyError): + store.popitem() + + store.close() + + def test_writeable_values(self): + store = self.create_store() + + # __setitem__ should accept any value that implements buffer interface + store[self.root + "foo1"] = b"bar" + store[self.root + "foo2"] = bytearray(b"bar") + store[self.root + "foo3"] = array.array("B", b"bar") + store[self.root + "foo4"] = np.frombuffer(b"bar", dtype="u1") + + store.close() + + def test_update(self): + store = self.create_store() + assert self.root + "foo" not in store + assert self.root + "baz" not in store + + if self.version == 2: + store.update(foo=b"bar", baz=b"quux") + else: + kv = {self.root + "foo": b"bar", self.root + "baz": b"quux"} + store.update(kv) + + assert b"bar" == ensure_bytes(store[self.root + "foo"]) + assert b"quux" == ensure_bytes(store[self.root + "baz"]) + + store.close() + + def test_iterators(self): + store = self.create_store() + + # test iterator methods on empty store + assert 0 == len(store) + assert set() == set(store) + assert set() == set(store.keys()) + assert set() == set(store.values()) + assert set() == set(store.items()) + + # setup some values + store[self.root + "a"] = b"aaa" + store[self.root + "b"] = b"bbb" + store[self.root + "c/d"] = b"ddd" + store[self.root + "c/e/f"] = b"fff" + + # test iterators on store with data + assert 4 == len(store) + expected = set(self.root + k for k in ["a", "b", "c/d", "c/e/f"]) + assert expected == set(store) + assert expected == set(store.keys()) + assert {b"aaa", b"bbb", b"ddd", b"fff"} == set(map(ensure_bytes, store.values())) + assert { + (self.root + "a", b"aaa"), + (self.root + "b", b"bbb"), + (self.root + "c/d", b"ddd"), + (self.root + "c/e/f", b"fff"), + } == set(map(lambda kv: (kv[0], ensure_bytes(kv[1])), store.items())) + + store.close() + + def test_pickle(self): + # setup store + store = self.create_store() + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"quux" + n = len(store) + keys = sorted(store.keys()) + + # round-trip through pickle + dump = pickle.dumps(store) + # some stores cannot be opened twice at the same time, need to close + # store before can round-trip through pickle + store.close() + # check can still pickle after close + assert dump == pickle.dumps(store) + store2 = pickle.loads(dump) + + # verify + assert n == len(store2) + assert keys == sorted(store2.keys()) + assert b"bar" == ensure_bytes(store2[self.root + "foo"]) + assert b"quux" == ensure_bytes(store2[self.root + "baz"]) + + store2.close() + + def test_getsize(self): + store = self.create_store() + if isinstance(store, dict) or hasattr(store, "getsize"): + assert 0 == getsize(store) + store["foo"] = b"x" + assert 1 == getsize(store) + assert 1 == getsize(store, "foo") + store["bar"] = b"yy" + assert 3 == getsize(store) + assert 2 == getsize(store, "bar") + store["baz"] = bytearray(b"zzz") + assert 6 == getsize(store) + assert 3 == getsize(store, "baz") + store["quux"] = array.array("B", b"zzzz") + 
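# running total: foo, bar, baz and quux occupy 1, 2, 3 and 4 bytes respectively +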
assert 10 == getsize(store) + assert 4 == getsize(store, "quux") + store["spong"] = np.frombuffer(b"zzzzz", dtype="u1") + assert 15 == getsize(store) + assert 5 == getsize(store, "spong") + + store.close() + + # noinspection PyStatementEffect + def test_hierarchy(self): + # setup + store = self.create_store() + store[self.root + "a"] = b"aaa" + store[self.root + "b"] = b"bbb" + store[self.root + "c/d"] = b"ddd" + store[self.root + "c/e/f"] = b"fff" + store[self.root + "c/e/g"] = b"ggg" + + # check keys + assert self.root + "a" in store + assert self.root + "b" in store + assert self.root + "c/d" in store + assert self.root + "c/e/f" in store + assert self.root + "c/e/g" in store + assert self.root + "c" not in store + assert self.root + "c/" not in store + assert self.root + "c/e" not in store + assert self.root + "c/e/" not in store + assert self.root + "c/d/x" not in store + + # check __getitem__ + with pytest.raises(KeyError): + store[self.root + "c"] + with pytest.raises(KeyError): + store[self.root + "c/e"] + with pytest.raises(KeyError): + store[self.root + "c/d/x"] + + # test getsize (optional) + if hasattr(store, "getsize"): + # TODO: proper behavior of getsize? + # v3 returns size of all nested arrays, not just the + # size of the arrays in the current folder. + if self.version == 2: + assert 6 == store.getsize() + else: + assert 15 == store.getsize() + assert 3 == store.getsize("a") + assert 3 == store.getsize("b") + if self.version == 2: + assert 3 == store.getsize("c") + else: + assert 9 == store.getsize("c") + assert 3 == store.getsize("c/d") + assert 6 == store.getsize("c/e") + assert 3 == store.getsize("c/e/f") + assert 3 == store.getsize("c/e/g") + # non-existent paths + assert 0 == store.getsize("x") + assert 0 == store.getsize("a/x") + assert 0 == store.getsize("c/x") + assert 0 == store.getsize("c/x/y") + assert 0 == store.getsize("c/d/y") + assert 0 == store.getsize("c/d/y/z") + + # access item via full path + assert 3 == store.getsize(self.root + "a") + + # test listdir (optional) + if hasattr(store, "listdir"): + assert {"a", "b", "c"} == set(store.listdir(self.root)) + assert {"d", "e"} == set(store.listdir(self.root + "c")) + assert {"f", "g"} == set(store.listdir(self.root + "c/e")) + # no exception raised if path does not exist or is leaf + assert [] == store.listdir(self.root + "x") + assert [] == store.listdir(self.root + "a/x") + assert [] == store.listdir(self.root + "c/x") + assert [] == store.listdir(self.root + "c/x/y") + assert [] == store.listdir(self.root + "c/d/y") + assert [] == store.listdir(self.root + "c/d/y/z") + assert [] == store.listdir(self.root + "c/e/f") + + # test rename (optional) + if store.is_erasable(): + store.rename("c/e", "c/e2") + assert self.root + "c/d" in store + assert self.root + "c/e" not in store + assert self.root + "c/e/f" not in store + assert self.root + "c/e/g" not in store + assert self.root + "c/e2" not in store + assert self.root + "c/e2/f" in store + assert self.root + "c/e2/g" in store + store.rename("c/e2", "c/e") + assert self.root + "c/d" in store + assert self.root + "c/e2" not in store + assert self.root + "c/e2/f" not in store + assert self.root + "c/e2/g" not in store + assert self.root + "c/e" not in store + assert self.root + "c/e/f" in store + assert self.root + "c/e/g" in store + store.rename("c", "c1/c2/c3") + assert self.root + "a" in store + assert self.root + "c" not in store + assert self.root + "c/d" not in store + assert self.root + "c/e" not in store + assert self.root + "c/e/f" not in store + 
assert self.root + "c/e/g" not in store + assert self.root + "c1" not in store + assert self.root + "c1/c2" not in store + assert self.root + "c1/c2/c3" not in store + assert self.root + "c1/c2/c3/d" in store + assert self.root + "c1/c2/c3/e" not in store + assert self.root + "c1/c2/c3/e/f" in store + assert self.root + "c1/c2/c3/e/g" in store + store.rename("c1/c2/c3", "c") + assert self.root + "c" not in store + assert self.root + "c/d" in store + assert self.root + "c/e" not in store + assert self.root + "c/e/f" in store + assert self.root + "c/e/g" in store + assert self.root + "c1" not in store + assert self.root + "c1/c2" not in store + assert self.root + "c1/c2/c3" not in store + assert self.root + "c1/c2/c3/d" not in store + assert self.root + "c1/c2/c3/e" not in store + assert self.root + "c1/c2/c3/e/f" not in store + assert self.root + "c1/c2/c3/e/g" not in store + + # test rmdir (optional) + store.rmdir("c/e") + assert self.root + "c/d" in store + assert self.root + "c/e/f" not in store + assert self.root + "c/e/g" not in store + store.rmdir("c") + assert self.root + "c/d" not in store + store.rmdir() + assert self.root + "a" not in store + assert self.root + "b" not in store + store[self.root + "a"] = b"aaa" + store[self.root + "c/d"] = b"ddd" + store[self.root + "c/e/f"] = b"fff" + # no exceptions raised if path does not exist or is leaf + store.rmdir("x") + store.rmdir("a/x") + store.rmdir("c/x") + store.rmdir("c/x/y") + store.rmdir("c/d/y") + store.rmdir("c/d/y/z") + store.rmdir("c/e/f") + assert self.root + "a" in store + assert self.root + "c/d" in store + assert self.root + "c/e/f" in store + + store.close() + + def test_init_array(self, dimension_separator_fixture): + pass_dim_sep, want_dim_sep = dimension_separator_fixture + + store = self.create_store(dimension_separator=pass_dim_sep) + init_array(store, shape=1000, chunks=100) + + # check metadata + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + assert default_compressor.get_config() == meta["compressor"] + assert meta["fill_value"] is None + # Missing MUST be assumed to be "." 
+ assert meta.get("dimension_separator", ".") is want_dim_sep + + store.close() + + def test_init_array_overwrite(self): + self._test_init_array_overwrite("F") + + def test_init_array_overwrite_path(self): + self._test_init_array_overwrite_path("F") + + def test_init_array_overwrite_chunk_store(self): + self._test_init_array_overwrite_chunk_store("F") + + def test_init_group_overwrite(self): + self._test_init_group_overwrite("F") + + def test_init_group_overwrite_path(self): + self._test_init_group_overwrite_path("F") + + def test_init_group_overwrite_chunk_store(self): + self._test_init_group_overwrite_chunk_store("F") + + def _test_init_array_overwrite(self, order): + # setup + store = self.create_store() + if self.version == 2: + path = None + mkey = array_meta_key + meta = dict( + shape=(2000,), + chunks=(200,), + dtype=np.dtype("u1"), + compressor=Zlib(1).get_config(), + fill_value=0, + order=order, + filters=None, + ) + else: + path = "arr1" # no default, have to specify for v3 + mkey = meta_root + path + ".array.json" + meta = dict( + shape=(2000,), + chunk_grid=dict(type="regular", chunk_shape=(200,), separator=("/")), + data_type=np.dtype("u1"), + compressor=Zlib(1), + fill_value=0, + chunk_memory_layout=order, + filters=None, + ) + store[mkey] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite (default) + with pytest.raises(ContainsArrayError): + init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype="i4", overwrite=True, path=path) + except NotImplementedError: + pass + else: + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + assert (100,) == meta["chunks"] + assert np.dtype("i4") == meta["dtype"] + else: + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype("i4") == meta["data_type"] + assert (1000,) == meta["shape"] + + store.close() + + def test_init_array_path(self): + path = "foo/bar" + store = self.create_store() + init_array(store, shape=1000, chunks=100, path=path) + + # check metadata + if self.version == 2: + mkey = path + "/" + array_meta_key + else: + mkey = meta_root + path + ".array.json" + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + assert default_compressor.get_config() == meta["compressor"] + else: + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype(None) == meta["data_type"] + assert default_compressor == meta["compressor"] + assert (1000,) == meta["shape"] + assert meta["fill_value"] is None + + store.close() + + def _test_init_array_overwrite_path(self, order): + # setup + path = "foo/bar" + store = self.create_store() + if self.version == 2: + mkey = path + "/" + array_meta_key + meta = dict( + shape=(2000,), + chunks=(200,), + dtype=np.dtype("u1"), + compressor=Zlib(1).get_config(), + fill_value=0, + order=order, + filters=None, + ) + else: + mkey = meta_root + path + ".array.json" + meta = dict( + shape=(2000,), + chunk_grid=dict(type="regular", chunk_shape=(200,), separator=("/")), + data_type=np.dtype("u1"), + compressor=Zlib(1), + fill_value=0, + chunk_memory_layout=order, + filters=None, + ) + store[mkey] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite + with pytest.raises(ContainsArrayError): + 
init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype="i4", path=path, overwrite=True) + except NotImplementedError: + pass + else: + if self.version == 2: + assert group_meta_key in store + assert array_meta_key not in store + assert mkey in store + # should have been overwritten + meta = store._metadata_class.decode_array_metadata(store[mkey]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + assert (100,) == meta["chunks"] + assert np.dtype("i4") == meta["dtype"] + else: + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype("i4") == meta["data_type"] + assert (1000,) == meta["shape"] + + store.close() + + def test_init_array_overwrite_group(self): + # setup + path = "foo/bar" + store = self.create_store() + if self.version == 2: + array_key = path + "/" + array_meta_key + group_key = path + "/" + group_meta_key + else: + array_key = meta_root + path + ".array.json" + group_key = meta_root + path + ".group.json" + store[group_key] = store._metadata_class.encode_group_metadata() + + # don't overwrite + with pytest.raises(ContainsGroupError): + init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype="i4", path=path, overwrite=True) + except NotImplementedError: + pass + else: + assert group_key not in store + assert array_key in store + meta = store._metadata_class.decode_array_metadata(store[array_key]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + assert (100,) == meta["chunks"] + assert np.dtype("i4") == meta["dtype"] + else: + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype("i4") == meta["data_type"] + assert (1000,) == meta["shape"] + + store.close() + + def _test_init_array_overwrite_chunk_store(self, order): + # setup + store = self.create_store() + chunk_store = self.create_store() + + if self.version == 2: + path = None + data_path = "" + mkey = array_meta_key + meta = dict( + shape=(2000,), + chunks=(200,), + dtype=np.dtype("u1"), + compressor=None, + fill_value=0, + filters=None, + order=order, + ) + else: + path = "arr1" + data_path = data_root + "arr1/" + mkey = meta_root + path + ".array.json" + meta = dict( + shape=(2000,), + chunk_grid=dict(type="regular", chunk_shape=(200,), separator=("/")), + data_type=np.dtype("u1"), + compressor=None, + fill_value=0, + filters=None, + chunk_memory_layout=order, + ) + + store[mkey] = store._metadata_class.encode_array_metadata(meta) + + chunk_store[data_path + "0"] = b"aaa" + chunk_store[data_path + "1"] = b"bbb" + + # don't overwrite (default) + with pytest.raises(ContainsArrayError): + init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) + + # do overwrite + try: + init_array( + store, + path=path, + shape=1000, + chunks=100, + dtype="i4", + overwrite=True, + chunk_store=chunk_store, + ) + except NotImplementedError: + pass + else: + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + assert (100,) == meta["chunks"] + assert np.dtype("i4") == meta["dtype"] + else: + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype("i4") == meta["data_type"] + assert (1000,) == meta["shape"] + assert data_path + "0" not in chunk_store + assert data_path + "1" not in chunk_store + + store.close() + chunk_store.close() + + def test_init_array_compat(self): + store = 
self.create_store() + if self.version == 2: + path = None + mkey = array_meta_key + else: + path = "arr1" + mkey = meta_root + path + ".array.json" + init_array(store, path=path, shape=1000, chunks=100, compressor="none") + meta = store._metadata_class.decode_array_metadata(store[mkey]) + if self.version == 2: + assert meta["compressor"] is None + else: + assert "compressor" not in meta + store.close() + + def test_init_group(self): + store = self.create_store() + if self.version == 2: + path = None + mkey = group_meta_key + else: + path = "foo" + mkey = meta_root + path + ".group.json" + init_group(store, path=path) + + # check metadata + assert mkey in store + meta = store._metadata_class.decode_group_metadata(store[mkey]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + else: + assert meta == {"attributes": {}} + + store.close() + + def _test_init_group_overwrite(self, order): + if self.version == 3: + pytest.skip("In v3 array and group names cannot overlap") + # setup + store = self.create_store() + store[array_meta_key] = store._metadata_class.encode_array_metadata( + dict( + shape=(2000,), + chunks=(200,), + dtype=np.dtype("u1"), + compressor=None, + fill_value=0, + order=order, + filters=None, + ) + ) + + # don't overwrite array (default) + with pytest.raises(ContainsArrayError): + init_group(store) + + # do overwrite + try: + init_group(store, overwrite=True) + except NotImplementedError: + pass + else: + assert array_meta_key not in store + assert group_meta_key in store + meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + + # don't overwrite group + with pytest.raises(ValueError): + init_group(store) + + store.close() + + def _test_init_group_overwrite_path(self, order): + # setup + path = "foo/bar" + store = self.create_store() + if self.version == 2: + meta = dict( + shape=(2000,), + chunks=(200,), + dtype=np.dtype("u1"), + compressor=None, + fill_value=0, + order=order, + filters=None, + ) + array_key = path + "/" + array_meta_key + group_key = path + "/" + group_meta_key + else: + meta = dict( + shape=(2000,), + chunk_grid=dict(type="regular", chunk_shape=(200,), separator=("/")), + data_type=np.dtype("u1"), + compressor=None, + fill_value=0, + filters=None, + chunk_memory_layout=order, + ) + array_key = meta_root + path + ".array.json" + group_key = meta_root + path + ".group.json" + store[array_key] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite + with pytest.raises(ValueError): + init_group(store, path=path) + + # do overwrite + try: + init_group(store, overwrite=True, path=path) + except NotImplementedError: + pass + else: + if self.version == 2: + assert array_meta_key not in store + assert group_meta_key in store + assert array_key not in store + assert group_key in store + # should have been overwritten + meta = store._metadata_class.decode_group_metadata(store[group_key]) + if self.version == 2: + assert ZARR_FORMAT == meta["zarr_format"] + else: + assert meta == {"attributes": {}} + + store.close() + + def _test_init_group_overwrite_chunk_store(self, order): + if self.version == 3: + pytest.skip("In v3 array and group names cannot overlap") + # setup + store = self.create_store() + chunk_store = self.create_store() + store[array_meta_key] = store._metadata_class.encode_array_metadata( + dict( + shape=(2000,), + chunks=(200,), + dtype=np.dtype("u1"), + compressor=None, + fill_value=0, + filters=None, + order=order, + ) + ) + chunk_store["foo"] = b"bar" 
+ chunk_store["baz"] = b"quux" + + # don't overwrite array (default) + with pytest.raises(ValueError): + init_group(store, chunk_store=chunk_store) + + # do overwrite + try: + init_group(store, overwrite=True, chunk_store=chunk_store) + except NotImplementedError: + pass + else: + assert array_meta_key not in store + assert group_meta_key in store + meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert "foo" not in chunk_store + assert "baz" not in chunk_store + + # don't overwrite group + with pytest.raises(ValueError): + init_group(store) + + store.close() + chunk_store.close() + + +class TestMappingStore(StoreTests): + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return KVStore(dict()) + + def test_set_invalid_content(self): + # Generic mappings support non-buffer types + pass + + +def setdel_hierarchy_checks(store, root=""): + # these tests are for stores that are aware of hierarchy levels; this + # behaviour is not strictly required by Zarr but these tests are included + # to define behaviour of MemoryStore and DirectoryStore classes + + # check __setitem__ and __delitem__ blocked by leaf + + store[root + "a/b"] = b"aaa" + with pytest.raises(KeyError): + store[root + "a/b/c"] = b"xxx" + with pytest.raises(KeyError): + del store[root + "a/b/c"] + + store[root + "d"] = b"ddd" + with pytest.raises(KeyError): + store[root + "d/e/f"] = b"xxx" + with pytest.raises(KeyError): + del store[root + "d/e/f"] + + # test __setitem__ overwrite level + store[root + "x/y/z"] = b"xxx" + store[root + "x/y"] = b"yyy" + assert b"yyy" == ensure_bytes(store[root + "x/y"]) + assert root + "x/y/z" not in store + store[root + "x"] = b"zzz" + assert b"zzz" == ensure_bytes(store[root + "x"]) + assert root + "x/y" not in store + + # test __delitem__ overwrite level + store[root + "r/s/t"] = b"xxx" + del store[root + "r/s"] + assert root + "r/s/t" not in store + store[root + "r/s"] = b"xxx" + del store[root + "r"] + assert root + "r/s" not in store + + +class TestMemoryStore(StoreTests): + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return MemoryStore(**kwargs) + + def test_store_contains_bytes(self): + store = self.create_store() + store[self.root + "foo"] = np.array([97, 98, 99, 100, 101], dtype=np.uint8) + assert store[self.root + "foo"] == b"abcde" + + def test_setdel(self): + store = self.create_store() + setdel_hierarchy_checks(store, self.root) + + +class TestDictStore(StoreTests): + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + + with pytest.warns(DeprecationWarning): + return DictStore(**kwargs) + + def test_deprecated(self): + store = self.create_store() + assert isinstance(store, MemoryStore) + + def test_pickle(self): + with pytest.warns(DeprecationWarning): + # pickle.load() will also trigger deprecation warning + super().test_pickle() + + +class TestDirectoryStore(StoreTests): + def create_store(self, normalize_keys=False, dimension_separator=".", **kwargs): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStore( + path, normalize_keys=normalize_keys, dimension_separator=dimension_separator, **kwargs + ) + return store + + def test_filesystem_path(self): + # test behaviour with path that does not exist + path = "data/store" + if os.path.exists(path): + shutil.rmtree(path) + store = DirectoryStore(path) + # should only be created on demand + assert not os.path.exists(path) + store["foo"] = b"bar" + assert 
os.path.isdir(path) + + # check correct permissions + # regression test for https://github.com/zarr-developers/zarr-python/issues/325 + stat = os.stat(path) + mode = stat.st_mode & 0o666 + umask = os.umask(0) + os.umask(umask) + assert mode == (0o666 & ~umask) + + # test behaviour with file path + with tempfile.NamedTemporaryFile() as f: + with pytest.raises(ValueError): + DirectoryStore(f.name) + + def test_init_pathlib(self): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + DirectoryStore(pathlib.Path(path)) + + def test_pickle_ext(self): + store = self.create_store() + store2 = pickle.loads(pickle.dumps(store)) + + # check path is preserved + assert store.path == store2.path + + # check point to same underlying directory + assert self.root + "xxx" not in store + store2[self.root + "xxx"] = b"yyy" + assert b"yyy" == ensure_bytes(store[self.root + "xxx"]) + + def test_setdel(self): + store = self.create_store() + setdel_hierarchy_checks(store, self.root) + + def test_normalize_keys(self): + store = self.create_store(normalize_keys=True) + store[self.root + "FOO"] = b"bar" + assert self.root + "FOO" in store + assert self.root + "foo" in store + + def test_listing_keys_slash(self): + def mock_walker_slash(_path): + yield from [ + # trailing slash in first key + ("root_with_slash/", ["d1", "g1"], [".zgroup"]), + ("root_with_slash/d1", [], [".zarray"]), + ("root_with_slash/g1", [], [".zgroup"]), + ] + + res = set(DirectoryStore._keys_fast("root_with_slash/", walker=mock_walker_slash)) + assert res == {".zgroup", "g1/.zgroup", "d1/.zarray"} + + def test_listing_keys_no_slash(self): + def mock_walker_no_slash(_path): + yield from [ + # no trailing slash in first key + ("root_with_no_slash", ["d1", "g1"], [".zgroup"]), + ("root_with_no_slash/d1", [], [".zarray"]), + ("root_with_no_slash/g1", [], [".zgroup"]), + ] + + res = set(DirectoryStore._keys_fast("root_with_no_slash", mock_walker_no_slash)) + assert res == {".zgroup", "g1/.zgroup", "d1/.zarray"} + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStore(StoreTests): + @pytest.fixture + def memory_store(self): + store = FSStore("memory://") + yield store + store.fs.store.clear() + + def create_store(self, normalize_keys=False, dimension_separator=".", path=None, **kwargs): + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = FSStore( + path, normalize_keys=normalize_keys, dimension_separator=dimension_separator, **kwargs + ) + return store + + def test_init_array(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100) + + # check metadata + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + assert meta["dimension_separator"] == "." 
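# Illustrative sketch (not part of the test suite; assumes fsspec is
# installed and uses throwaway temporary directories): the dimension
# separator checked in test_init_array above, and normalized by
# test_dimension_separator below, also determines the on-disk chunk key
# layout an FSStore produces.
import atexit
import shutil
import tempfile

import zarr
from zarr.storage import FSStore

for sep in (".", "/"):
    path = tempfile.mkdtemp()
    atexit.register(shutil.rmtree, path, ignore_errors=True)
    store = FSStore(path, dimension_separator=sep, auto_mkdir=True)
    z = zarr.zeros((4, 4), chunks=(2, 2), store=store)
    z[:] = 1  # write all chunks
    # with sep == "." chunk keys look like "0.0"; with sep == "/" they are
    # nested ("0/0"), matching what NestedDirectoryStore produces
    print(sep, sorted(k for k in store if not k.startswith(".")))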
+ + def test_dimension_separator(self): + for x in (".", "/"): + store = self.create_store(dimension_separator=x) + norm = store._normalize_key + assert ".zarray" == norm(".zarray") + assert ".zarray" == norm("/.zarray") + assert ".zgroup" == norm("/.zgroup") + assert "group/.zarray" == norm("group/.zarray") + assert "group/.zgroup" == norm("group/.zgroup") + assert "group/.zarray" == norm("/group/.zarray") + assert "group/.zgroup" == norm("/group/.zgroup") + + def test_complex(self): + path1 = tempfile.mkdtemp() + path2 = tempfile.mkdtemp() + store = self.create_store( + path="simplecache::file://" + path1, + simplecache={"same_names": True, "cache_storage": path2}, + ) + assert not store + assert not os.listdir(path1) + assert not os.listdir(path2) + store[self.root + "foo"] = b"hello" + assert "foo" in os.listdir(str(path1) + "/" + self.root) + assert self.root + "foo" in store + assert not os.listdir(str(path2)) + assert store[self.root + "foo"] == b"hello" + assert "foo" in os.listdir(str(path2)) + + def test_deep_ndim(self): + import zarr + + store = self.create_store() + path = None if self.version == 2 else "group1" + foo = zarr.open_group(store=store, path=path) + bar = foo.create_group("bar") + baz = bar.create_dataset("baz", shape=(4, 4, 4), chunks=(2, 2, 2), dtype="i8") + baz[:] = 1 + if self.version == 2: + assert set(store.listdir()) == {".zgroup", "bar"} + else: + assert set(store.listdir()) == {"data", "meta", "zarr.json"} + assert set(store.listdir("meta/root/" + path)) == {"bar", "bar.group.json"} + assert set(store.listdir("data/root/" + path)) == {"bar"} + assert foo["bar"]["baz"][(0, 0, 0)] == 1 + + def test_not_fsspec(self): + import zarr + + path = tempfile.mkdtemp() + with pytest.raises(ValueError, match="storage_options"): + zarr.open_array(path, mode="w", storage_options={"some": "kwargs"}) + with pytest.raises(ValueError, match="storage_options"): + zarr.open_group(path, mode="w", storage_options={"some": "kwargs"}) + zarr.open_array("file://" + path, mode="w", shape=(1,), dtype="f8") + + def test_create(self): + import zarr + + path1 = tempfile.mkdtemp() + path2 = tempfile.mkdtemp() + g = zarr.open_group("file://" + path1, mode="w", storage_options={"auto_mkdir": True}) + a = g.create_dataset("data", shape=(8,)) + a[:4] = [0, 1, 2, 3] + assert "data" in os.listdir(path1) + assert ".zgroup" in os.listdir(path1) + + # consolidated metadata (GH#915) + consolidate_metadata("file://" + path1) + assert ".zmetadata" in os.listdir(path1) + + g = zarr.open_group( + "simplecache::file://" + path1, + mode="r", + storage_options={"cache_storage": path2, "same_names": True}, + ) + assert g.data[:].tolist() == [0, 1, 2, 3, 0, 0, 0, 0] + with pytest.raises(PermissionError): + g.data[:] = 1 + + @pytest.mark.parametrize("mode,allowed", [("r", False), ("r+", True)]) + def test_modify_consolidated(self, mode, allowed): + import zarr + + url = "file://" + tempfile.mkdtemp() + + # create + root = zarr.open_group(url, mode="w") + root.zeros("baz", shape=(10000, 10000), chunks=(1000, 1000), dtype="i4") + zarr.consolidate_metadata(url) + + # reopen and modify + root = zarr.open_consolidated(url, mode=mode) + if allowed: + root["baz"][0, 0] = 7 + + root = zarr.open_consolidated(url, mode="r") + assert root["baz"][0, 0] == 7 + else: + with pytest.raises(zarr.errors.ReadOnlyError): + root["baz"][0, 0] = 7 + + @pytest.mark.parametrize("mode", ["r", "r+"]) + def test_modify_consolidated_metadata_raises(self, mode): + import zarr + + url = "file://" + tempfile.mkdtemp() + + # create + root = 
zarr.open_group(url, mode="w") + root.zeros("baz", shape=(10000, 10000), chunks=(1000, 1000), dtype="i4") + zarr.consolidate_metadata(url) + + # reopen and modify + root = zarr.open_consolidated(url, mode=mode) + with pytest.raises(zarr.errors.ReadOnlyError): + root["baz"].resize(100, 100) + + def test_read_only(self): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = self.create_store(path=path) + store[self.root + "foo"] = b"bar" + + store = self.create_store(path=path, mode="r") + + with pytest.raises(PermissionError): + store[self.root + "foo"] = b"hex" + + with pytest.raises(PermissionError): + del store[self.root + "foo"] + + with pytest.raises(PermissionError): + store.delitems([self.root + "foo"]) + + with pytest.raises(PermissionError): + store.setitems({self.root + "foo": b"baz"}) + + with pytest.raises(PermissionError): + store.clear() + + with pytest.raises(PermissionError): + store.rmdir(self.root + "anydir") + + assert store[self.root + "foo"] == b"bar" + + def test_eq(self): + store1 = self.create_store(path="anypath") + store2 = self.create_store(path="anypath") + assert store1 == store2 + + @pytest.mark.usefixtures("s3") + def test_s3(self): + import zarr + + g = zarr.open_group("s3://test/out.zarr", mode="w", storage_options=self.s3so) + a = g.create_dataset("data", shape=(8,)) + a[:4] = [0, 1, 2, 3] + + g = zarr.open_group("s3://test/out.zarr", mode="r", storage_options=self.s3so) + + assert g.data[:].tolist() == [0, 1, 2, 3, 0, 0, 0, 0] + + # test via convenience + g = zarr.open("s3://test/out.zarr", mode="r", storage_options=self.s3so) + assert g.data[:].tolist() == [0, 1, 2, 3, 0, 0, 0, 0] + + @pytest.mark.usefixtures("s3") + def test_s3_complex(self): + import zarr + + g = zarr.open_group("s3://test/out.zarr", mode="w", storage_options=self.s3so) + expected = np.empty((8, 8, 8), dtype="int64") + expected[:] = -1 + a = g.create_dataset( + "data", shape=(8, 8, 8), fill_value=-1, chunks=(1, 1, 1), overwrite=True + ) + expected[0] = 0 + expected[3] = 3 + expected[6, 6, 6] = 6 + a[6, 6, 6] = 6 + a[:4] = expected[:4] + + b = g.create_dataset( + "data_f", + shape=(8,), + chunks=(1,), + dtype=[("foo", "S3"), ("bar", "i4")], + fill_value=(b"b", 1), + ) + b[:4] = (b"aaa", 2) + g2 = zarr.open_group("s3://test/out.zarr", mode="r", storage_options=self.s3so) + + assert (g2.data[:] == expected).all() + a.chunk_store.fs.invalidate_cache("test/out.zarr/data") + a[:] = 5 + assert (a[:] == 5).all() + + assert g2.data_f["foo"].tolist() == [b"aaa"] * 4 + [b"b"] * 4 + with pytest.raises(PermissionError): + g2.data[:] = 5 + + with pytest.raises(PermissionError): + g2.store.setitems({}) + + with pytest.raises(PermissionError): + # even though overwrite=True, store is read-only, so fails + g2.create_dataset( + "data", shape=(8, 8, 8), fill_value=-1, chunks=(1, 1, 1), overwrite=True + ) + + a = g.create_dataset( + "data", shape=(8, 8, 8), fill_value=-1, chunks=(1, 1, 1), overwrite=True + ) + assert (a[:] == -np.ones((8, 8, 8))).all() + + def test_exceptions(self, memory_store): + fs = memory_store.fs + group = zarr.open(memory_store, mode="w") + x = group.create_dataset("x", data=[1, 2, 3]) + y = group.create_dataset("y", data=1) + fs.store["/x/0"] = None + fs.store["/y/0"] = None + # no exception from FSStore.getitems getting KeyError + assert group.store.getitems(["foo"], contexts={}) == {} + # exception from FSStore.getitems getting AttributeError + with pytest.raises(Exception): # noqa: B017 + group.store.getitems(["x/0"], contexts={}) + # exception from 
FSStore.getitems getting AttributeError + with pytest.raises(Exception): # noqa: B017 + x[...] + # exception from FSStore.__getitem__ getting AttributeError + with pytest.raises(Exception): # noqa: B017 + y[...] + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreWithKeySeparator(StoreTests): + def create_store(self, normalize_keys=False, key_separator=".", **kwargs): + # Since the user is passing key_separator, that will take priority. + skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + return FSStore(path, normalize_keys=normalize_keys, key_separator=key_separator) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreFromFilesystem(StoreTests): + def create_store(self, normalize_keys=False, dimension_separator=".", path=None, **kwargs): + import fsspec + + fs = fsspec.filesystem("file") + + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + with pytest.raises(ValueError): + # can't specify storage_options when passing an + # existing fs object + _ = FSStore(path, fs=fs, auto_mkdir=True) + + store = FSStore( + path, + normalize_keys=normalize_keys, + dimension_separator=dimension_separator, + fs=fs, + **kwargs, + ) + + return store + + +@pytest.fixture() +def s3(request): + # writable local S3 system + import shlex + import subprocess + import time + + if "BOTO_CONFIG" not in os.environ: # pragma: no cover + os.environ["BOTO_CONFIG"] = "/dev/null" + if "AWS_ACCESS_KEY_ID" not in os.environ: # pragma: no cover + os.environ["AWS_ACCESS_KEY_ID"] = "foo" + if "AWS_SECRET_ACCESS_KEY" not in os.environ: # pragma: no cover + os.environ["AWS_SECRET_ACCESS_KEY"] = "bar" + requests = pytest.importorskip("requests") + s3fs = pytest.importorskip("s3fs") + pytest.importorskip("moto") + + port = 5555 + endpoint_uri = f"http://127.0.0.1:{port}/" + proc = subprocess.Popen( + shlex.split(f"moto_server -p {port}"), + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + + timeout = 5 + while timeout > 0: + try: + r = requests.get(endpoint_uri) + if r.ok: + break + except Exception: # pragma: no cover + pass + timeout -= 0.1 # pragma: no cover + time.sleep(0.1) # pragma: no cover + s3so = dict(client_kwargs={"endpoint_url": endpoint_uri}, use_listings_cache=False) + s3 = s3fs.S3FileSystem(anon=False, **s3so) + s3.mkdir("test") + request.cls.s3so = s3so + yield + proc.terminate() + proc.wait() + + +class TestNestedDirectoryStore(TestDirectoryStore): + def create_store(self, normalize_keys=False, **kwargs): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = NestedDirectoryStore(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_init_array(self): + store = self.create_store() + assert store._dimension_separator == "/" + init_array(store, shape=1000, chunks=100) + + # check metadata + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + assert meta["dimension_separator"] == "/" + + def test_chunk_nesting(self): + store = self.create_store() + # any path where last segment looks like a chunk key gets special handling + store[self.root + "0.0"] = b"xxx" + assert b"xxx" == store[self.root + "0.0"] + # assert b'xxx' == store['0/0'] + store[self.root + "foo/10.20.30"] = b"yyy" + assert b"yyy" == 
store[self.root + "foo/10.20.30"] + # assert b'yyy' == store['foo/10/20/30'] + store[self.root + "42"] = b"zzz" + assert b"zzz" == store[self.root + "42"] + + def test_listdir(self): + store = self.create_store() + z = zarr.zeros((10, 10), chunks=(5, 5), store=store) + z[:] = 1 # write to all chunks + for k in store.listdir(): + assert store.get(k) is not None + + +class TestNestedDirectoryStoreNone: + def test_value_error(self): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = NestedDirectoryStore(path, normalize_keys=True, dimension_separator=None) + assert store._dimension_separator == "/" + + +class TestNestedDirectoryStoreWithWrongValue: + def test_value_error(self): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + with pytest.raises(ValueError): + NestedDirectoryStore(path, normalize_keys=True, dimension_separator=".") + + +class TestN5Store(TestNestedDirectoryStore): + def create_store(self, normalize_keys=False): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = N5Store(path, normalize_keys=normalize_keys) + return store + + def test_equal(self): + store_a = self.create_store() + store_b = N5Store(store_a.path) + assert store_a == store_b + + @pytest.mark.parametrize("zarr_meta_key", [".zarray", ".zattrs", ".zgroup"]) + def test_del_zarr_meta_key(self, zarr_meta_key): + store = self.create_store() + store[n5_attrs_key] = json_dumps({"foo": "bar"}) + del store[zarr_meta_key] + assert n5_attrs_key not in store + + def test_chunk_nesting(self): + store = self.create_store() + store["0.0"] = b"xxx" + assert "0.0" in store + assert b"xxx" == store["0.0"] + # assert b'xxx' == store['0/0'] + store["foo/10.20.30"] = b"yyy" + assert "foo/10.20.30" in store + assert b"yyy" == store["foo/10.20.30"] + # N5 reverses axis order + assert b"yyy" == store["foo/30/20/10"] + del store["foo/10.20.30"] + assert "foo/30/20/10" not in store + store["42"] = b"zzz" + assert "42" in store + assert b"zzz" == store["42"] + + def test_init_array(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100) + + # check metadata + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + # N5Store wraps the actual compressor + compressor_config = meta["compressor"]["compressor_config"] + assert default_compressor.get_config() == compressor_config + # N5Store always has a fill value of 0 + assert meta["fill_value"] == 0 + assert meta["dimension_separator"] == "." 
+ # Top-level groups AND arrays should have + # the n5 keyword in metadata + raw_n5_meta = json.loads(store[n5_attrs_key]) + assert raw_n5_meta.get("n5", None) == N5_FORMAT + + def test_init_array_path(self): + path = "foo/bar" + store = self.create_store() + init_array(store, shape=1000, chunks=100, path=path) + + # check metadata + key = path + "/" + array_meta_key + assert key in store + meta = store._metadata_class.decode_array_metadata(store[key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + # N5Store wraps the actual compressor + compressor_config = meta["compressor"]["compressor_config"] + assert default_compressor.get_config() == compressor_config + # N5Store always has a fill value of 0 + assert meta["fill_value"] == 0 + + def test_init_array_compat(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100, compressor="none") + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + # N5Store wraps the actual compressor + compressor_config = meta["compressor"]["compressor_config"] + assert compressor_config is None + + def test_init_array_overwrite(self): + self._test_init_array_overwrite("C") + + def test_init_array_overwrite_path(self): + self._test_init_array_overwrite_path("C") + + def test_init_array_overwrite_chunk_store(self): + self._test_init_array_overwrite_chunk_store("C") + + def test_init_group_overwrite(self): + self._test_init_group_overwrite("C") + + def test_init_group_overwrite_path(self): + self._test_init_group_overwrite_path("C") + + def test_init_group_overwrite_chunk_store(self): + self._test_init_group_overwrite_chunk_store("C") + + def test_init_group(self): + store = self.create_store() + init_group(store) + store[".zattrs"] = json_dumps({"foo": "bar"}) + # check metadata + assert group_meta_key in store + assert group_meta_key in store.listdir() + assert group_meta_key in store.listdir("") + meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + + def test_filters(self): + all_filters, all_errors = zip( + *[ + (None, does_not_raise()), + ([], does_not_raise()), + ([AsType("f4", "f8")], pytest.raises(ValueError)), + ] + ) + for filters, error in zip(all_filters, all_errors): + store = self.create_store() + with error: + init_array(store, shape=1000, chunks=100, filters=filters) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestN5FSStore(TestFSStore): + def create_store(self, normalize_keys=False, path=None, **kwargs): + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = N5FSStore(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_equal(self): + store_a = self.create_store() + store_b = N5FSStore(store_a.path) + assert store_a == store_b + + # This is copied wholesale from the N5Store tests. The same test could + # be run by making TestN5FSStore inherit from both TestFSStore and + # TestN5Store, but a direct copy is arguably more explicit. 
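# Illustrative sketch (not part of the test suite) of the key translation
# that the N5 test_chunk_nesting methods above and below exercise: N5Store
# lays chunks out in nested directories with the axis order reversed
# relative to the flat "10.20.30"-style zarr chunk keys.
import atexit
import shutil
import tempfile

from zarr.n5 import N5Store

path = tempfile.mkdtemp()
atexit.register(shutil.rmtree, path, ignore_errors=True)
store = N5Store(path)
store["foo/10.20.30"] = b"yyy"
assert store["foo/10.20.30"] == b"yyy"  # readable via the zarr-style key...
assert store["foo/30/20/10"] == b"yyy"  # ...and via the reversed, nested N5 key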
+ + @pytest.mark.parametrize("zarr_meta_key", [".zarray", ".zattrs", ".zgroup"]) + def test_del_zarr_meta_key(self, zarr_meta_key): + store = self.create_store() + store[n5_attrs_key] = json_dumps({"foo": "bar"}) + del store[zarr_meta_key] + assert n5_attrs_key not in store + + def test_chunk_nesting(self): + store = self.create_store() + store["0.0"] = b"xxx" + assert "0.0" in store + assert b"xxx" == store["0.0"] + # assert b'xxx' == store['0/0'] + store["foo/10.20.30"] = b"yyy" + assert "foo/10.20.30" in store + assert b"yyy" == store["foo/10.20.30"] + # N5 reverses axis order + assert b"yyy" == store["foo/30/20/10"] + del store["foo/10.20.30"] + assert "foo/30/20/10" not in store + store["42"] = b"zzz" + assert "42" in store + assert b"zzz" == store["42"] + + def test_init_array(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100) + + # check metadata + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + # N5Store wraps the actual compressor + compressor_config = meta["compressor"]["compressor_config"] + assert default_compressor.get_config() == compressor_config + # N5Store always has a fill value of 0 + assert meta["fill_value"] == 0 + assert meta["dimension_separator"] == "." + # Top-level groups AND arrays should have + # the n5 keyword in metadata + raw_n5_meta = json.loads(store[n5_attrs_key]) + assert raw_n5_meta.get("n5", None) == N5_FORMAT + + def test_init_array_path(self): + path = "foo/bar" + store = self.create_store() + init_array(store, shape=1000, chunks=100, path=path) + + # check metadata + key = path + "/" + array_meta_key + assert key in store + meta = store._metadata_class.decode_array_metadata(store[key]) + assert ZARR_FORMAT == meta["zarr_format"] + assert (1000,) == meta["shape"] + assert (100,) == meta["chunks"] + assert np.dtype(None) == meta["dtype"] + # N5Store wraps the actual compressor + compressor_config = meta["compressor"]["compressor_config"] + assert default_compressor.get_config() == compressor_config + # N5Store always has a fill value of 0 + assert meta["fill_value"] == 0 + + def test_init_array_compat(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100, compressor="none") + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + # N5Store wraps the actual compressor + compressor_config = meta["compressor"]["compressor_config"] + assert compressor_config is None + + def test_init_array_overwrite(self): + self._test_init_array_overwrite("C") + + def test_init_array_overwrite_path(self): + self._test_init_array_overwrite_path("C") + + def test_init_array_overwrite_chunk_store(self): + self._test_init_array_overwrite_chunk_store("C") + + def test_init_group_overwrite(self): + self._test_init_group_overwrite("C") + + def test_init_group_overwrite_path(self): + self._test_init_group_overwrite_path("C") + + def test_init_group_overwrite_chunk_store(self): + self._test_init_group_overwrite_chunk_store("C") + + def test_dimension_separator(self): + with pytest.warns(UserWarning, match="dimension_separator"): + self.create_store(dimension_separator="/") + + def test_init_group(self): + store = self.create_store() + init_group(store) + store[".zattrs"] = json_dumps({"foo": "bar"}) + # check metadata + assert group_meta_key in store + assert group_meta_key in store.listdir() + 
assert group_meta_key in store.listdir("") + meta = store._metadata_class.decode_group_metadata(store[group_meta_key]) + assert ZARR_FORMAT == meta["zarr_format"] + + def test_filters(self): + all_filters, all_errors = zip( + *[ + (None, does_not_raise()), + ([], does_not_raise()), + ([AsType("f4", "f8")], pytest.raises(ValueError)), + ] + ) + for filters, error in zip(all_filters, all_errors): + store = self.create_store() + with error: + init_array(store, shape=1000, chunks=100, filters=filters) + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestNestedFSStore(TestNestedDirectoryStore): + def create_store(self, normalize_keys=False, path=None, **kwargs): + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStore( + path, normalize_keys=normalize_keys, dimension_separator="/", auto_mkdir=True, **kwargs + ) + return store + + def test_numbered_groups(self): + import zarr + + # Create an array + store = self.create_store() + group = zarr.group(store=store) + arr = group.create_dataset("0", shape=(10, 10)) + arr[1] = 1 + + # Read it back + store = self.create_store(path=store.path) + zarr.open_group(store.path)["0"] + + +class TestTempStore(StoreTests): + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return TempStore(**kwargs) + + def test_setdel(self): + store = self.create_store() + setdel_hierarchy_checks(store, self.root) + + +class TestZipStore(StoreTests): + ZipStoreClass = ZipStore + + def create_store(self, **kwargs): + path = mktemp(suffix=".zip") + atexit.register(os.remove, path) + store = ZipStore(path, mode="w", **kwargs) + return store + + def test_mode(self): + with self.ZipStoreClass("data/store.zip", mode="w") as store: + store[self.root + "foo"] = b"bar" + store = self.ZipStoreClass("data/store.zip", mode="r") + with pytest.raises(PermissionError): + store[self.root + "foo"] = b"bar" + with pytest.raises(PermissionError): + store.clear() + + def test_flush(self): + store = self.ZipStoreClass("data/store.zip", mode="w") + store[self.root + "foo"] = b"bar" + store.flush() + assert store[self.root + "foo"] == b"bar" + store.close() + + store = self.ZipStoreClass("data/store.zip", mode="r") + store.flush() # no-op + + def test_context_manager(self): + with self.create_store() as store: + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"qux" + assert 2 == len(store) + + def test_pop(self): + # override because not implemented + store = self.create_store() + store[self.root + "foo"] = b"bar" + with pytest.raises(NotImplementedError): + store.pop(self.root + "foo") + + def test_popitem(self): + # override because not implemented + store = self.create_store() + store[self.root + "foo"] = b"bar" + with pytest.raises(NotImplementedError): + store.popitem() + + def test_permissions(self): + store = self.ZipStoreClass("data/store.zip", mode="w") + foo_key = "foo" if self.version == 2 else self.root + "foo" + # TODO: cannot provide key ending in / for v3 + # how to create an empty folder in that case? 
+ baz_key = "baz/" if self.version == 2 else self.root + "baz" + store[foo_key] = b"bar" + store[baz_key] = b"" + + store.flush() + store.close() + z = ZipFile("data/store.zip", "r") + info = z.getinfo(foo_key) + perm = oct(info.external_attr >> 16) + assert perm == "0o644" + info = z.getinfo(baz_key) + perm = oct(info.external_attr >> 16) + # only for posix platforms + if os.name == "posix": + if self.version == 2: + assert perm == "0o40775" + else: + # baz/ on v2, but baz on v3, so not a directory + assert perm == "0o644" + z.close() + + def test_store_and_retrieve_ndarray(self): + store = ZipStore("data/store.zip") + x = np.array([[1, 2], [3, 4]]) + store["foo"] = x + y = np.frombuffer(store["foo"], dtype=x.dtype).reshape(x.shape) + assert np.array_equiv(y, x) + + +class TestDBMStore(StoreTests): + def create_store(self, dimension_separator=None): + path = mktemp(suffix=".anydbm") + atexit.register(atexit_rmglob, path + "*") + # create store using default dbm implementation + store = DBMStore(path, flag="n", dimension_separator=dimension_separator) + return store + + def test_context_manager(self): + with self.create_store() as store: + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"qux" + assert 2 == len(store) + + +class TestDBMStoreDumb(TestDBMStore): + def create_store(self, **kwargs): + path = mktemp(suffix=".dumbdbm") + atexit.register(atexit_rmglob, path + "*") + + import dbm.dumb as dumbdbm + + store = DBMStore(path, flag="n", open=dumbdbm.open, **kwargs) + return store + + +class TestDBMStoreGnu(TestDBMStore): + def create_store(self, **kwargs): + gdbm = pytest.importorskip("dbm.gnu") + path = mktemp(suffix=".gdbm") # pragma: no cover + atexit.register(os.remove, path) # pragma: no cover + store = DBMStore( + path, flag="n", open=gdbm.open, write_lock=False, **kwargs + ) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreNDBM(TestDBMStore): + def create_store(self, **kwargs): + ndbm = pytest.importorskip("dbm.ndbm") + path = mktemp(suffix=".ndbm") # pragma: no cover + atexit.register(atexit_rmglob, path + "*") # pragma: no cover + store = DBMStore(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover + return store # pragma: no cover + + +class TestLMDBStore(StoreTests): + def create_store(self, **kwargs): + pytest.importorskip("lmdb") + path = mktemp(suffix=".lmdb") + atexit.register(atexit_rmtree, path) + buffers = True + store = LMDBStore(path, buffers=buffers, **kwargs) + return store + + def test_context_manager(self): + with self.create_store() as store: + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"qux" + assert 2 == len(store) + + +class TestSQLiteStore(StoreTests): + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + path = mktemp(suffix=".db") + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path, **kwargs) + return store + + def test_underscore_in_name(self): + path = mktemp(suffix=".db") + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + store["a"] = b"aaa" + store["a_b"] = b"aa_bb" + store.rmdir("a") + assert "a_b" in store + + +class TestSQLiteStoreInMemory(TestSQLiteStore): + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + store = SQLiteStore(":memory:", **kwargs) + return store + + def test_pickle(self): + # setup store + store = self.create_store() + store[self.root + "foo"] = b"bar" + store[self.root + "baz"] = b"quux" + + # round-trip through pickle + with pytest.raises(PicklingError): + pickle.dumps(store) + + 
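# Illustrative sketch (not part of the test suite; assumes the sqlite3 module
# is available) of the pickling behaviour the two SQLiteStore test classes
# above exercise: a file-backed store round-trips through pickle once closed,
# while the in-memory variant refuses to pickle because its database lives
# only in the current process.
import os
import pickle
import tempfile
from pickle import PicklingError

from numcodecs.compat import ensure_bytes
from zarr.storage import SQLiteStore

path = os.path.join(tempfile.mkdtemp(), "example.db")
store = SQLiteStore(path)
store["foo"] = b"bar"
store.close()
store2 = pickle.loads(pickle.dumps(store))
assert ensure_bytes(store2["foo"]) == b"bar"
store2.close()

mem_store = SQLiteStore(":memory:")
mem_store["foo"] = b"bar"
try:
    pickle.dumps(mem_store)
except PicklingError:
    pass  # expected, as asserted in TestSQLiteStoreInMemory.test_pickle above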
+@skip_test_env_var("ZARR_TEST_MONGO") +class TestMongoDBStore(StoreTests): + def create_store(self, **kwargs): + pytest.importorskip("pymongo") + store = MongoDBStore( + host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs + ) + # start with an empty store + store.clear() + return store + + +@skip_test_env_var("ZARR_TEST_REDIS") +class TestRedisStore(StoreTests): + def create_store(self, **kwargs): + # TODO: this is the default host for Redis on Travis, + # we probably want to generalize this though + pytest.importorskip("redis") + store = RedisStore(host="localhost", port=6379, **kwargs) + # start with an empty store + store.clear() + return store + + +class TestLRUStoreCache(StoreTests): + CountingClass = CountingDict + LRUStoreClass = LRUStoreCache + + def create_store(self, **kwargs): + # wrapper therefore no dimension_separator argument + skip_if_nested_chunks(**kwargs) + return self.LRUStoreClass(dict(), max_size=2**27) + + def test_cache_values_no_max_size(self): + # setup store + store = self.CountingClass() + foo_key = self.root + "foo" + bar_key = self.root + "bar" + store[foo_key] = b"xxx" + store[bar_key] = b"yyy" + assert 0 == store.counter["__getitem__", foo_key] + assert 1 == store.counter["__setitem__", foo_key] + assert 0 == store.counter["__getitem__", bar_key] + assert 1 == store.counter["__setitem__", bar_key] + + # setup cache + cache = self.LRUStoreClass(store, max_size=None) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first __getitem__, cache miss + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 1 == store.counter["__setitem__", foo_key] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second __getitem__, cache hit + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 1 == store.counter["__setitem__", foo_key] + assert 1 == cache.hits + assert 1 == cache.misses + + # test __setitem__, __getitem__ + cache[foo_key] = b"zzz" + assert 1 == store.counter["__getitem__", foo_key] + assert 2 == store.counter["__setitem__", foo_key] + # should be a cache hit + assert b"zzz" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 2 == store.counter["__setitem__", foo_key] + assert 2 == cache.hits + assert 1 == cache.misses + + # manually invalidate all cached values + cache.invalidate_values() + assert b"zzz" == cache[foo_key] + assert 2 == store.counter["__getitem__", foo_key] + assert 2 == store.counter["__setitem__", foo_key] + cache.invalidate() + assert b"zzz" == cache[foo_key] + assert 3 == store.counter["__getitem__", foo_key] + assert 2 == store.counter["__setitem__", foo_key] + + # test __delitem__ + del cache[foo_key] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + cache[foo_key] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + store[foo_key] + + # verify other keys untouched + assert 0 == store.counter["__getitem__", bar_key] + assert 1 == store.counter["__setitem__", bar_key] + + def test_cache_values_with_max_size(self): + # setup store + store = self.CountingClass() + foo_key = self.root + "foo" + bar_key = self.root + "bar" + store[foo_key] = b"xxx" + store[bar_key] = b"yyy" + assert 0 == store.counter["__getitem__", foo_key] + assert 0 == store.counter["__getitem__", bar_key] + # setup cache - can only hold one item + cache = self.LRUStoreClass(store, max_size=5) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first 'foo' __getitem__, cache 
miss + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second 'foo' __getitem__, cache hit + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 1 == cache.hits + assert 1 == cache.misses + + # test first 'bar' __getitem__, cache miss + assert b"yyy" == cache[bar_key] + assert 1 == store.counter["__getitem__", bar_key] + assert 1 == cache.hits + assert 2 == cache.misses + + # test second 'bar' __getitem__, cache hit + assert b"yyy" == cache[bar_key] + assert 1 == store.counter["__getitem__", bar_key] + assert 2 == cache.hits + assert 2 == cache.misses + + # test 'foo' __getitem__, should have been evicted, cache miss + assert b"xxx" == cache[foo_key] + assert 2 == store.counter["__getitem__", foo_key] + assert 2 == cache.hits + assert 3 == cache.misses + + # test 'bar' __getitem__, should have been evicted, cache miss + assert b"yyy" == cache[bar_key] + assert 2 == store.counter["__getitem__", bar_key] + assert 2 == cache.hits + assert 4 == cache.misses + + # setup store + store = self.CountingClass() + store[foo_key] = b"xxx" + store[bar_key] = b"yyy" + assert 0 == store.counter["__getitem__", foo_key] + assert 0 == store.counter["__getitem__", bar_key] + # setup cache - can hold two items + cache = self.LRUStoreClass(store, max_size=6) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first 'foo' __getitem__, cache miss + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second 'foo' __getitem__, cache hit + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 1 == cache.hits + assert 1 == cache.misses + + # test first 'bar' __getitem__, cache miss + assert b"yyy" == cache[bar_key] + assert 1 == store.counter["__getitem__", bar_key] + assert 1 == cache.hits + assert 2 == cache.misses + + # test second 'bar' __getitem__, cache hit + assert b"yyy" == cache[bar_key] + assert 1 == store.counter["__getitem__", bar_key] + assert 2 == cache.hits + assert 2 == cache.misses + + # test 'foo' __getitem__, should still be cached + assert b"xxx" == cache[foo_key] + assert 1 == store.counter["__getitem__", foo_key] + assert 3 == cache.hits + assert 2 == cache.misses + + # test 'bar' __getitem__, should still be cached + assert b"yyy" == cache[bar_key] + assert 1 == store.counter["__getitem__", bar_key] + assert 4 == cache.hits + assert 2 == cache.misses + + def test_cache_keys(self): + # setup + store = self.CountingClass() + foo_key = self.root + "foo" + bar_key = self.root + "bar" + baz_key = self.root + "baz" + store[foo_key] = b"xxx" + store[bar_key] = b"yyy" + assert 0 == store.counter["__contains__", foo_key] + assert 0 == store.counter["__iter__"] + assert 0 == store.counter["keys"] + cache = self.LRUStoreClass(store, max_size=None) + + # keys should be cached on first call + keys = sorted(cache.keys()) + assert keys == [bar_key, foo_key] + assert 1 == store.counter["keys"] + # keys should now be cached + assert keys == sorted(cache.keys()) + assert 1 == store.counter["keys"] + assert foo_key in cache + assert 1 == store.counter["__contains__", foo_key] + # the next check for `foo_key` is cached + assert foo_key in cache + assert 1 == store.counter["__contains__", foo_key] + assert keys == sorted(cache) + assert 0 == store.counter["__iter__"] + assert 1 == store.counter["keys"] + + # cache should be cleared if 
store is modified - crude but simple for now + cache[baz_key] = b"zzz" + keys = sorted(cache.keys()) + assert keys == [bar_key, baz_key, foo_key] + assert 2 == store.counter["keys"] + # keys should now be cached + assert keys == sorted(cache.keys()) + assert 2 == store.counter["keys"] + + # manually invalidate keys + cache.invalidate_keys() + keys = sorted(cache.keys()) + assert keys == [bar_key, baz_key, foo_key] + assert 3 == store.counter["keys"] + assert 1 == store.counter["__contains__", foo_key] + assert 0 == store.counter["__iter__"] + cache.invalidate_keys() + keys = sorted(cache) + assert keys == [bar_key, baz_key, foo_key] + assert 4 == store.counter["keys"] + assert 1 == store.counter["__contains__", foo_key] + assert 0 == store.counter["__iter__"] + cache.invalidate_keys() + assert foo_key in cache + assert 4 == store.counter["keys"] + assert 2 == store.counter["__contains__", foo_key] + assert 0 == store.counter["__iter__"] + + # check these would get counted if called directly + assert foo_key in store + assert 3 == store.counter["__contains__", foo_key] + assert keys == sorted(store) + assert 1 == store.counter["__iter__"] + + +def test_getsize(): + store = KVStore(dict()) + store["foo"] = b"aaa" + store["bar"] = b"bbbb" + store["baz/quux"] = b"ccccc" + assert 7 == getsize(store) + assert 5 == getsize(store, "baz") + + store = KVStore(dict()) + store["boo"] = None + assert -1 == getsize(store) + + +@pytest.mark.parametrize("dict_store", [False, True]) +def test_migrate_1to2(dict_store): + from zarr import meta_v1 + + # N.B., version 1 did not support hierarchies, so we only have to be + # concerned about migrating a single array at the root of the store + + # setup + store = dict() if dict_store else KVStore(dict()) + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype("f4"), + compression="zlib", + compression_opts=1, + fill_value=None, + order="C", + ) + meta_json = meta_v1.encode_metadata(meta) + store["meta"] = meta_json + store["attrs"] = json.dumps(dict()).encode("ascii") + + # run migration + migrate_1to2(store) + + # check results + assert "meta" not in store + assert array_meta_key in store + assert "attrs" not in store + assert attrs_key in store + meta_migrated = decode_array_metadata(store[array_meta_key]) + assert 2 == meta_migrated["zarr_format"] + + # preserved fields + for f in "shape", "chunks", "dtype", "fill_value", "order": + assert meta[f] == meta_migrated[f] + + # migrate should have added empty filters field + assert meta_migrated["filters"] is None + + # check compression and compression_opts migrated to compressor + assert "compression" not in meta_migrated + assert "compression_opts" not in meta_migrated + assert meta_migrated["compressor"] == Zlib(1).get_config() + + # check dict compression_opts + store = dict() if dict_store else KVStore(dict()) + meta["compression"] = "blosc" + meta["compression_opts"] = dict(cname="lz4", clevel=5, shuffle=1) + meta_json = meta_v1.encode_metadata(meta) + store["meta"] = meta_json + store["attrs"] = json.dumps(dict()).encode("ascii") + migrate_1to2(store) + meta_migrated = decode_array_metadata(store[array_meta_key]) + assert "compression" not in meta_migrated + assert "compression_opts" not in meta_migrated + assert meta_migrated["compressor"] == Blosc(cname="lz4", clevel=5, shuffle=1).get_config() + + # check 'none' compression is migrated to None (null in JSON) + store = dict() if dict_store else KVStore(dict()) + meta["compression"] = "none" + meta_json = meta_v1.encode_metadata(meta) + 
store["meta"] = meta_json + store["attrs"] = json.dumps(dict()).encode("ascii") + migrate_1to2(store) + meta_migrated = decode_array_metadata(store[array_meta_key]) + assert "compression" not in meta_migrated + assert "compression_opts" not in meta_migrated + assert meta_migrated["compressor"] is None + + +def test_format_compatibility(): + # This test is intended to catch any unintended changes that break the ability to + # read data stored with a previous minor version (which should be format-compatible). + + # fixture data + fixture = group(store=DirectoryStore("fixture")) + + # set seed to get consistent random data + np.random.seed(42) + + arrays_chunks = [ + (np.arange(1111, dtype=" 2 else "" + # setup some values + store[prefix + "a"] = b"aaa" + store[prefix + "b"] = b"bbb" + store[prefix + "c/d"] = b"ddd" + store[prefix + "c/e/f"] = b"fff" + + # test iterators on store with data + assert 4 == len(store) + keys = [prefix + "a", prefix + "b", prefix + "c/d", prefix + "c/e/f"] + values = [b"aaa", b"bbb", b"ddd", b"fff"] + items = list(zip(keys, values)) + assert set(keys) == set(store) + assert set(keys) == set(store.keys()) + assert set(values) == set(store.values()) + assert set(items) == set(store.items()) + + def test_getsize(self): + return super().test_getsize() + + def test_hierarchy(self): + return super().test_hierarchy() + + @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") + def test_pickle(self): + # internal attribute on ContainerClient isn't serializable for py36 and earlier + super().test_pickle() + + +class TestConsolidatedMetadataStore: + version = 2 + ConsolidatedMetadataClass = ConsolidatedMetadataStore + + @property + def metadata_key(self): + return ".zmetadata" + + def test_bad_format(self): + # setup store with consolidated metadata + store = dict() + consolidated = { + # bad format version + "zarr_consolidated_format": 0, + } + store[self.metadata_key] = json.dumps(consolidated).encode() + + # check appropriate error is raised + with pytest.raises(MetadataError): + self.ConsolidatedMetadataClass(store) + + def test_bad_store_version(self): + with pytest.raises(ValueError): + self.ConsolidatedMetadataClass(KVStoreV3(dict())) + + def test_read_write(self): + # setup store with consolidated metadata + store = dict() + consolidated = { + "zarr_consolidated_format": 1, + "metadata": { + "foo": "bar", + "baz": 42, + }, + } + store[self.metadata_key] = json.dumps(consolidated).encode() + + # create consolidated store + cs = self.ConsolidatedMetadataClass(store) + + # test __contains__, __getitem__ + for key, value in consolidated["metadata"].items(): + assert key in cs + assert value == cs[key] + + # test __delitem__, __setitem__ + with pytest.raises(PermissionError): + del cs["foo"] + with pytest.raises(PermissionError): + cs["bar"] = 0 + with pytest.raises(PermissionError): + cs["spam"] = "eggs" + + +# standalone test we do not want to run on each store. + + +def test_fill_value_change(): + a = zarr.create((10, 10), dtype=int) + + assert a[0, 0] == 0 + + a.fill_value = 1 + + assert a[0, 0] == 1 + + assert json.loads(a.store[".zarray"])["fill_value"] == 1 + + +def test_get_hierarchy_metadata_v2(): + # v2 stores do not have hierarchy metadata (i.e. 
zarr.json) + with pytest.raises(ValueError): + _get_hierarchy_metadata(KVStore(dict)) + + +def test_normalize_store_arg(tmpdir): + with pytest.raises(ValueError): + normalize_store_arg(dict(), zarr_version=4) + + for ext, Class in [(".zip", ZipStore), (".n5", N5Store)]: + fn = tmpdir.join("store" + ext) + store = normalize_store_arg(str(fn), zarr_version=2, mode="w") + assert isinstance(store, Class) + + if have_fsspec: + import fsspec + + path = tempfile.mkdtemp() + store = normalize_store_arg("file://" + path, zarr_version=2, mode="w") + assert isinstance(store, FSStore) + + store = normalize_store_arg(fsspec.get_mapper("file://" + path)) + assert isinstance(store, FSStore) + + +def test_meta_prefix_6853(): + fixture = pathlib.Path(zarr.__file__).resolve().parent.parent / "fixture" + meta = fixture / "meta" + if not meta.exists(): # pragma: no cover + s = DirectoryStore(str(meta), dimension_separator=".") + a = zarr.open(store=s, mode="w", shape=(2, 2), dtype=" None: + super().__init__(_type) + assert test_value == self.TEST_CONSTANT + self.test_value = test_value + + +def test_ensure_store_v3(): + class InvalidStore: + pass + + with pytest.raises(ValueError): + StoreV3._ensure_store(InvalidStore()) + + # cannot initialize with a store from a different Zarr version + with pytest.raises(ValueError): + StoreV3._ensure_store(KVStore(dict())) + + assert StoreV3._ensure_store(None) is None + + # class with all methods of a MutableMapping will become a KVStoreV3 + assert isinstance(StoreV3._ensure_store(DummyStore), KVStoreV3) + + with pytest.raises(ValueError): + # does not have the methods expected of a MutableMapping + StoreV3._ensure_store(InvalidDummyStore) + + +def test_valid_key(): + store = KVStoreV3(dict) + + # only ascii keys are valid + assert not store._valid_key(5) + assert not store._valid_key(2.8) + + for key in store._valid_key_characters: + assert store._valid_key(key) + + # other characters not in store._valid_key_characters are not allowed + assert not store._valid_key("*") + assert not store._valid_key("~") + assert not store._valid_key("^") + + +def test_validate_key(): + store = KVStoreV3(dict) + + # zarr.json is a valid key + store._validate_key("zarr.json") + # but other keys not starting with meta/ or data/ are not + with pytest.raises(ValueError): + store._validate_key("zar.json") + + # valid ascii keys + for valid in [ + meta_root + "arr1.array.json", + data_root + "arr1.array.json", + meta_root + "subfolder/item_1-0.group.json", + ]: + store._validate_key(valid) + # but otherwise valid keys cannot end in / + with pytest.raises(ValueError): + assert store._validate_key(valid + "/") + + for invalid in [0, "*", "~", "^", "&"]: + with pytest.raises(ValueError): + store._validate_key(invalid) + + +class StoreV3Tests(_StoreTests): + version = 3 + root = meta_root + + def test_getsize(self): + # TODO: determine proper getsize() behavior for v3 + # Currently returns the combined size of entries under + # meta/root/path and data/root/path. + # Any path not under meta/root/ or data/root/ (including zarr.json) + # returns size 0. 
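+    # worked example of the combined accounting below: meta_root + "bar/a" (2 bytes) plus data_root + "bar/a" (3 bytes) gives getsize(store, "bar") == 5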
+ + store = self.create_store() + if isinstance(store, dict) or hasattr(store, "getsize"): + assert 0 == getsize(store, "zarr.json") + store[meta_root + "foo/a"] = b"x" + assert 1 == getsize(store) + assert 1 == getsize(store, "foo") + store[meta_root + "foo/b"] = b"x" + assert 2 == getsize(store, "foo") + assert 1 == getsize(store, "foo/b") + store[meta_root + "bar/a"] = b"yy" + assert 2 == getsize(store, "bar") + store[data_root + "bar/a"] = b"zzz" + assert 5 == getsize(store, "bar") + store[data_root + "baz/a"] = b"zzz" + assert 3 == getsize(store, "baz") + assert 10 == getsize(store) + store[data_root + "quux"] = array.array("B", b"zzzz") + assert 14 == getsize(store) + assert 4 == getsize(store, "quux") + store[data_root + "spong"] = np.frombuffer(b"zzzzz", dtype="u1") + assert 19 == getsize(store) + assert 5 == getsize(store, "spong") + store.close() + + def test_init_array(self, dimension_separator_fixture_v3): + pass_dim_sep, want_dim_sep = dimension_separator_fixture_v3 + + store = self.create_store() + path = "arr1" + transformer = DummyStorageTransfomer( + "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT + ) + init_array( + store, + path=path, + shape=1000, + chunks=100, + dimension_separator=pass_dim_sep, + storage_transformers=[transformer], + ) + + # check metadata + mkey = meta_root + path + ".array.json" + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert (1000,) == meta["shape"] + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype(None) == meta["data_type"] + assert default_compressor == meta["compressor"] + assert meta["fill_value"] is None + # Missing MUST be assumed to be "/" + assert meta["chunk_grid"]["separator"] is want_dim_sep + assert len(meta["storage_transformers"]) == 1 + assert isinstance(meta["storage_transformers"][0], DummyStorageTransfomer) + assert meta["storage_transformers"][0].test_value == DummyStorageTransfomer.TEST_CONSTANT + store.close() + + def test_list_prefix(self): + store = self.create_store() + path = "arr1" + init_array(store, path=path, shape=1000, chunks=100) + + expected = [meta_root + "arr1.array.json", "zarr.json"] + assert sorted(store.list_prefix("")) == expected + + expected = [meta_root + "arr1.array.json"] + assert sorted(store.list_prefix(meta_root.rstrip("/"))) == expected + + # cannot start prefix with '/' + with pytest.raises(ValueError): + store.list_prefix(prefix="/" + meta_root.rstrip("/")) + + def test_equal(self): + store = self.create_store() + assert store == store + + def test_rename_nonexisting(self): + store = self.create_store() + if store.is_erasable(): + with pytest.raises(ValueError): + store.rename("a", "b") + else: + with pytest.raises(NotImplementedError): + store.rename("a", "b") + + def test_get_partial_values(self): + store = self.create_store() + assert store.supports_efficient_get_partial_values in [True, False] + store[data_root + "foo"] = b"abcdefg" + store[data_root + "baz"] = b"z" + assert [b"a"] == store.get_partial_values([(data_root + "foo", (0, 1))]) + assert [ + b"d", + b"b", + b"z", + b"abc", + b"defg", + b"defg", + b"g", + b"ef", + ] == store.get_partial_values( + [ + (data_root + "foo", (3, 1)), + (data_root + "foo", (1, 1)), + (data_root + "baz", (0, 1)), + (data_root + "foo", (0, 3)), + (data_root + "foo", (3, 4)), + (data_root + "foo", (3, None)), + (data_root + "foo", (-1, None)), + (data_root + "foo", (-3, 2)), + ] + ) + + def test_set_partial_values(self): + store = self.create_store() + 
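# the calls below exercise both offset conventions: a non-negative start overwrites len(value) bytes at that offset (growing the value if needed), while a negative start replaces everything from that offset, counted from the end, through to the end of the value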
store.supports_efficient_set_partial_values() + store[data_root + "foo"] = b"abcdefg" + store.set_partial_values([(data_root + "foo", 0, b"hey")]) + assert store[data_root + "foo"] == b"heydefg" + + store.set_partial_values([(data_root + "baz", 0, b"z")]) + assert store[data_root + "baz"] == b"z" + store.set_partial_values( + [ + (data_root + "foo", 1, b"oo"), + (data_root + "baz", 1, b"zzz"), + (data_root + "baz", 4, b"aaaa"), + (data_root + "foo", 6, b"done"), + ] + ) + assert store[data_root + "foo"] == b"hoodefdone" + assert store[data_root + "baz"] == b"zzzzaaaa" + store.set_partial_values( + [ + (data_root + "foo", -2, b"NE"), + (data_root + "baz", -5, b"q"), + ] + ) + assert store[data_root + "foo"] == b"hoodefdoNE" + assert store[data_root + "baz"] == b"zzzq" + + +class TestMappingStoreV3(StoreV3Tests): + def create_store(self, **kwargs): + return KVStoreV3(dict()) + + def test_set_invalid_content(self): + # Generic mappings support non-buffer types + pass + + +class TestMemoryStoreV3(_TestMemoryStore, StoreV3Tests): + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return MemoryStoreV3(**kwargs) + + +class TestDirectoryStoreV3(_TestDirectoryStore, StoreV3Tests): + def create_store(self, normalize_keys=False, **kwargs): + # For v3, don't have to skip if nested. + # skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_rename_nonexisting(self): + store = self.create_store() + with pytest.raises(FileNotFoundError): + store.rename(meta_root + "a", meta_root + "b") + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3(_TestFSStore, StoreV3Tests): + def create_store(self, normalize_keys=False, dimension_separator=".", path=None, **kwargs): + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = FSStoreV3( + path, normalize_keys=normalize_keys, dimension_separator=dimension_separator, **kwargs + ) + return store + + def test_init_array(self): + store = self.create_store() + path = "arr1" + init_array(store, path=path, shape=1000, chunks=100) + + # check metadata + mkey = meta_root + path + ".array.json" + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert (1000,) == meta["shape"] + assert (100,) == meta["chunk_grid"]["chunk_shape"] + assert np.dtype(None) == meta["data_type"] + assert meta["chunk_grid"]["separator"] == "/" + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3WithKeySeparator(StoreV3Tests): + def create_store(self, normalize_keys=False, key_separator=".", **kwargs): + # Since the user is passing key_separator, that will take priority. 
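+        # note that **kwargs (e.g. a dimension_separator) is only consulted by skip_if_nested_chunks below and is not forwarded to FSStoreV3, so the explicit key_separator is what ends up separating chunk keys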
+ skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + return FSStoreV3(path, normalize_keys=normalize_keys, key_separator=key_separator) + + +# TODO: enable once N5StoreV3 has been implemented +# @pytest.mark.skipif(True, reason="N5StoreV3 not yet fully implemented") +# class TestN5StoreV3(_TestN5Store, TestDirectoryStoreV3, StoreV3Tests): + + +class TestZipStoreV3(_TestZipStore, StoreV3Tests): + ZipStoreClass = ZipStoreV3 + + def create_store(self, **kwargs): + path = mktemp(suffix=".zip") + atexit.register(os.remove, path) + store = ZipStoreV3(path, mode="w", **kwargs) + return store + + +class TestDBMStoreV3(_TestDBMStore, StoreV3Tests): + def create_store(self, dimension_separator=None): + path = mktemp(suffix=".anydbm") + atexit.register(atexit_rmglob, path + "*") + # create store using default dbm implementation + store = DBMStoreV3(path, flag="n", dimension_separator=dimension_separator) + return store + + +class TestDBMStoreV3Dumb(_TestDBMStoreDumb, StoreV3Tests): + def create_store(self, **kwargs): + path = mktemp(suffix=".dumbdbm") + atexit.register(atexit_rmglob, path + "*") + + import dbm.dumb as dumbdbm + + store = DBMStoreV3(path, flag="n", open=dumbdbm.open, **kwargs) + return store + + +class TestDBMStoreV3Gnu(_TestDBMStoreGnu, StoreV3Tests): + def create_store(self, **kwargs): + gdbm = pytest.importorskip("dbm.gnu") + path = mktemp(suffix=".gdbm") # pragma: no cover + atexit.register(os.remove, path) # pragma: no cover + store = DBMStoreV3( + path, flag="n", open=gdbm.open, write_lock=False, **kwargs + ) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3NDBM(_TestDBMStoreNDBM, StoreV3Tests): + def create_store(self, **kwargs): + ndbm = pytest.importorskip("dbm.ndbm") + path = mktemp(suffix=".ndbm") # pragma: no cover + atexit.register(atexit_rmglob, path + "*") # pragma: no cover + store = DBMStoreV3(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover + return store # pragma: no cover + + +class TestLMDBStoreV3(_TestLMDBStore, StoreV3Tests): + def create_store(self, **kwargs): + pytest.importorskip("lmdb") + path = mktemp(suffix=".lmdb") + atexit.register(atexit_rmtree, path) + buffers = True + store = LMDBStoreV3(path, buffers=buffers, **kwargs) + return store + + +class TestSQLiteStoreV3(_TestSQLiteStore, StoreV3Tests): + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + path = mktemp(suffix=".db") + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path, **kwargs) + return store + + +class TestSQLiteStoreV3InMemory(_TestSQLiteStoreInMemory, StoreV3Tests): + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + store = SQLiteStoreV3(":memory:", **kwargs) + return store + + +@skip_test_env_var("ZARR_TEST_MONGO") +class TestMongoDBStoreV3(StoreV3Tests): + def create_store(self, **kwargs): + pytest.importorskip("pymongo") + store = MongoDBStoreV3( + host="127.0.0.1", database="zarr_tests", collection="zarr_tests", **kwargs + ) + # start with an empty store + store.clear() + return store + + +@skip_test_env_var("ZARR_TEST_REDIS") +class TestRedisStoreV3(StoreV3Tests): + def create_store(self, **kwargs): + # TODO: this is the default host for Redis on Travis, + # we probably want to generalize this though + pytest.importorskip("redis") + store = RedisStoreV3(host="localhost", port=6379, **kwargs) + # start with an empty store + store.clear() + return store + + +@pytest.mark.skipif(not v3_sharding_available, reason="sharding is disabled") +class 
TestStorageTransformerV3(TestMappingStoreV3): + def create_store(self, **kwargs): + inner_store = super().create_store(**kwargs) + dummy_transformer = DummyStorageTransfomer( + "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT + ) + sharding_transformer = ShardingStorageTransformer( + "indexed", + chunks_per_shard=2, + ) + path = "bla" + init_array( + inner_store, + path=path, + shape=1000, + chunks=100, + dimension_separator=".", + storage_transformers=[dummy_transformer, sharding_transformer], + ) + store = Array(store=inner_store, path=path).chunk_store + store.erase_prefix("data/root/bla/") + store.clear() + return store + + def test_method_forwarding(self): + store = self.create_store() + inner_store = store.inner_store.inner_store + assert store.list() == inner_store.list() + assert store.list_dir(data_root) == inner_store.list_dir(data_root) + + assert store.is_readable() + assert store.is_writeable() + assert store.is_listable() + inner_store._readable = False + inner_store._writeable = False + inner_store._listable = False + assert not store.is_readable() + assert not store.is_writeable() + assert not store.is_listable() + + +class TestLRUStoreCacheV3(_TestLRUStoreCache, StoreV3Tests): + CountingClass = CountingDictV3 + LRUStoreClass = LRUStoreCacheV3 + + +@skip_test_env_var("ZARR_TEST_ABS") +class TestABSStoreV3(_TestABSStore, StoreV3Tests): + ABSStoreClass = ABSStoreV3 + + +def test_normalize_store_arg_v3(tmpdir): + fn = tmpdir.join("store.zip") + store = normalize_store_arg(str(fn), zarr_version=3, mode="w") + assert isinstance(store, ZipStoreV3) + assert "zarr.json" in store + + # can't pass storage_options to non-fsspec store + with pytest.raises(ValueError): + normalize_store_arg(str(fn), zarr_version=3, mode="w", storage_options={"some": "kwargs"}) + + if have_fsspec: + import fsspec + + path = tempfile.mkdtemp() + store = normalize_store_arg("file://" + path, zarr_version=3, mode="w") + assert isinstance(store, FSStoreV3) + assert "zarr.json" in store + + store = normalize_store_arg(fsspec.get_mapper("file://" + path), zarr_version=3) + assert isinstance(store, FSStoreV3) + + # regression for https://github.com/zarr-developers/zarr-python/issues/1382 + # contents of zarr.json are not important for this test + out = {"version": 1, "refs": {"zarr.json": "{...}"}} + store = normalize_store_arg( + "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_version=3 + ) + assert isinstance(store, FSStoreV3) + + fn = tmpdir.join("store.n5") + with pytest.raises(NotImplementedError): + normalize_store_arg(str(fn), zarr_version=3, mode="w") + + # error on zarr_version=3 with a v2 store + with pytest.raises(ValueError): + normalize_store_arg(KVStore(dict()), zarr_version=3, mode="w") + + # error on zarr_version=2 with a v3 store + with pytest.raises(ValueError): + normalize_store_arg(KVStoreV3(dict()), zarr_version=2, mode="w") + + +class TestConsolidatedMetadataStoreV3(_TestConsolidatedMetadataStore): + version = 3 + ConsolidatedMetadataClass = ConsolidatedMetadataStoreV3 + + @property + def metadata_key(self): + return meta_root + "consolidated/.zmetadata" + + def test_bad_store_version(self): + with pytest.raises(ValueError): + self.ConsolidatedMetadataClass(KVStore(dict())) + + +def test_get_hierarchy_metadata(): + store = KVStoreV3({}) + + # error raised if 'jarr.json' is not in the store + with pytest.raises(ValueError): + _get_hierarchy_metadata(store) + + store["zarr.json"] = _default_entry_point_metadata_v3 + assert 
_get_hierarchy_metadata(store) == _default_entry_point_metadata_v3 + + # ValueError if only a subset of keys are present + store["zarr.json"] = {"zarr_format": "https://purl.org/zarr/spec/protocol/core/3.0"} + with pytest.raises(ValueError): + _get_hierarchy_metadata(store) + + # ValueError if any unexpected keys are present + extra_metadata = copy.copy(_default_entry_point_metadata_v3) + extra_metadata["extra_key"] = "value" + store["zarr.json"] = extra_metadata + with pytest.raises(ValueError): + _get_hierarchy_metadata(store) + + +def test_top_level_imports(): + for store_name in [ + "ABSStoreV3", + "DBMStoreV3", + "KVStoreV3", + "DirectoryStoreV3", + "LMDBStoreV3", + "LRUStoreCacheV3", + "MemoryStoreV3", + "MongoDBStoreV3", + "RedisStoreV3", + "SQLiteStoreV3", + "ZipStoreV3", + ]: + if v3_api_available: + assert hasattr(zarr, store_name) # pragma: no cover + else: + assert not hasattr(zarr, store_name) # pragma: no cover + + +def test_assert_zarr_v3_api_available_warns_once(): + import zarr._storage.store + + zarr._storage.store._has_warned_about_v3 = False + warnings.resetwarnings() + with pytest.warns() as record: + assert_zarr_v3_api_available() + assert_zarr_v3_api_available() + assert len(record) == 1 + assert "The experimental Zarr V3 implementation" in str(record[0].message) + + +def _get_public_and_dunder_methods(some_class): + return set( + name + for name, _ in inspect.getmembers(some_class, predicate=inspect.isfunction) + if not name.startswith("_") or name.startswith("__") + ) + + +def test_storage_transformer_interface(): + store_v3_methods = _get_public_and_dunder_methods(StoreV3) + store_v3_methods.discard("__init__") + # Note, getitems() isn't mandatory when get_partial_values() is available + store_v3_methods.discard("getitems") + storage_transformer_methods = _get_public_and_dunder_methods(StorageTransformer) + storage_transformer_methods.discard("__init__") + storage_transformer_methods.discard("get_config") + assert storage_transformer_methods == store_v3_methods diff --git a/src/zarr/v2/tests/test_sync.py b/src/zarr/v2/tests/test_sync.py new file mode 100644 index 0000000000..3d8ef3a9b7 --- /dev/null +++ b/src/zarr/v2/tests/test_sync.py @@ -0,0 +1,321 @@ +import atexit +import shutil +import tempfile +from multiprocessing import Pool as ProcessPool +from multiprocessing import cpu_count +from multiprocessing.pool import ThreadPool +from tempfile import mkdtemp + +import numpy as np +from numpy.testing import assert_array_equal + +from zarr.attrs import Attributes +from zarr.core import Array +from zarr.hierarchy import Group +from zarr.storage import DirectoryStore, KVStore, atexit_rmtree, init_array, init_group, meta_root +from zarr.sync import ProcessSynchronizer, ThreadSynchronizer + +# zarr_version fixture must be imported although not used directly here +from zarr.tests.test_attrs import TestAttributes, zarr_version # noqa +from zarr.tests.test_core import TestArray +from zarr.tests.test_hierarchy import TestGroup + + +class TestAttributesWithThreadSynchronizer(TestAttributes): + def init_attributes(self, store, read_only=False, cache=True, zarr_version=zarr_version): + key = ".zattrs" if zarr_version == 2 else meta_root + "attrs" + synchronizer = ThreadSynchronizer() + return Attributes( + store, synchronizer=synchronizer, key=key, read_only=read_only, cache=cache + ) + + +class TestAttributesProcessSynchronizer(TestAttributes): + def init_attributes(self, store, read_only=False, cache=True, zarr_version=zarr_version): + key = ".zattrs" if zarr_version == 2 
else meta_root + "attrs" + sync_path = mkdtemp() + atexit.register(shutil.rmtree, sync_path) + synchronizer = ProcessSynchronizer(sync_path) + return Attributes( + store, synchronizer=synchronizer, key=key, read_only=read_only, cache=cache + ) + + +def _append(arg): + z, i = arg + import numpy + + x = numpy.empty(1000, dtype="i4") + x[:] = i + shape = z.append(x) + return shape + + +def _set_arange(arg): + z, i = arg + import numpy + + x = numpy.arange(i * 1000, (i * 1000) + 1000, 1) + z[i * 1000 : (i * 1000) + 1000] = x + return i + + +class MixinArraySyncTests: + def test_parallel_setitem(self): + n = 100 + + # setup + arr = self.create_array(shape=n * 1000, chunks=999, dtype="i4") + arr[:] = 0 + pool = self.create_pool() + + # parallel setitem + results = pool.map(_set_arange, zip([arr] * n, range(n)), chunksize=1) + results = sorted(results) + + assert list(range(n)) == results + assert_array_equal(np.arange(n * 1000), arr[:]) + + pool.terminate() + + def test_parallel_append(self): + n = 100 + + # setup + arr = self.create_array(shape=1000, chunks=999, dtype="i4") + arr[:] = 0 + pool = self.create_pool() + + # parallel append + results = pool.map(_append, zip([arr] * n, range(n)), chunksize=1) + results = sorted(results) + + assert [((i + 2) * 1000,) for i in range(n)] == results + assert ((n + 1) * 1000,) == arr.shape + + pool.terminate() + + +class TestArrayWithThreadSynchronizer(TestArray, MixinArraySyncTests): + def create_array(self, read_only=False, **kwargs): + store = KVStore(dict()) + cache_metadata = kwargs.pop("cache_metadata", True) + cache_attrs = kwargs.pop("cache_attrs", True) + write_empty_chunks = kwargs.pop("write_empty_chunks", True) + init_array(store, **kwargs) + return Array( + store, + synchronizer=ThreadSynchronizer(), + read_only=read_only, + cache_metadata=cache_metadata, + cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks, + ) + + # noinspection PyMethodMayBeStatic + def create_pool(self): + pool = ThreadPool(cpu_count()) + return pool + + def test_hexdigest(self): + # Check basic 1-D array + z = self.create_array(shape=(1050,), chunks=100, dtype="" == actual[-8:] + + +def test_tree_get_icon(): + assert tree_get_icon("Array") == tree_array_icon + assert tree_get_icon("Group") == tree_group_icon + with pytest.raises(ValueError): + tree_get_icon("Baz") + + +@mock.patch.dict("sys.modules", {"ipytree": None}) +def test_tree_widget_missing_ipytree(): + pattern = ( + "Run `pip install zarr[jupyter]` or `conda install ipytree`" + "to get the required ipytree dependency for displaying the tree " + "widget. If using jupyterlab<3, you also need to run " + "`jupyter labextension install ipytree`" + ) + with pytest.raises(ImportError, match=re.escape(pattern)): + tree_widget(None, None, None) + + +def test_retry_call(): + class Fixture: + def __init__(self, pass_on=1): + self.c = 0 + self.pass_on = pass_on + + def __call__(self): + self.c += 1 + if self.c != self.pass_on: + raise PermissionError() + + for x in range(1, 11): + # Any number of failures less than 10 will be accepted. + fixture = Fixture(pass_on=x) + retry_call(fixture, exceptions=(PermissionError,), wait=0) + assert fixture.c == x + + def fail(x): + # Failures after 10 will cause an error to be raised. 
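+        # retry_call defaults to 10 attempts and re-raises the last exception once they are exhausted, so any pass_on > 10 surfaces the PermissionError expected below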
+ retry_call(Fixture(pass_on=x), exceptions=(Exception,), wait=0) + + for x in range(11, 15): + pytest.raises(PermissionError, fail, x) + + +def test_flatten(): + assert list( + flatten( + [ + "0", + [ + "1", + [ + "2", + [ + "3", + [ + 4, + ], + ], + ], + ], + ] + ) + ) == ["0", "1", "2", "3", 4] + assert list(flatten("foo")) == ["f", "o", "o"] + assert list(flatten(["foo"])) == ["foo"] + + +def test_all_equal(): + assert all_equal(0, np.zeros((10, 10, 10))) + assert not all_equal(1, np.zeros((10, 10, 10))) + + assert all_equal(1, np.ones((10, 10, 10))) + assert not all_equal(1, 1 + np.ones((10, 10, 10))) + + assert all_equal(np.nan, np.array([np.nan, np.nan])) + assert not all_equal(np.nan, np.array([np.nan, 1.0])) + + assert all_equal({"a": -1}, np.array([{"a": -1}, {"a": -1}], dtype="object")) + assert not all_equal({"a": -1}, np.array([{"a": -1}, {"a": 2}], dtype="object")) + + assert all_equal(np.timedelta64(999, "D"), np.array([999, 999], dtype="timedelta64[D]")) + assert not all_equal(np.timedelta64(999, "D"), np.array([999, 998], dtype="timedelta64[D]")) + + # all_equal(None, *) always returns False + assert not all_equal(None, np.array([None, None])) + assert not all_equal(None, np.array([None, 10])) + + +def test_json_dumps_numpy_dtype(): + assert json_dumps(np.int64(0)) == json_dumps(0) + assert json_dumps(np.float32(0)) == json_dumps(float(0)) + # Check that we raise the error of the superclass for unsupported object + with pytest.raises(TypeError): + json_dumps(Array) + + +def test_constant_map(): + val = object() + m = ConstantMap(keys=[1, 2], constant=val) + assert len(m) == 2 + assert m[1] is val + assert m[2] is val + assert 1 in m + assert 0 not in m + with pytest.raises(KeyError): + m[0] + assert repr(m) == repr({1: val, 2: val}) diff --git a/src/zarr/v2/tests/util.py b/src/zarr/v2/tests/util.py new file mode 100644 index 0000000000..b3c3249cab --- /dev/null +++ b/src/zarr/v2/tests/util.py @@ -0,0 +1,120 @@ +import collections +import os +import tempfile +from typing import Any, Mapping, Sequence +from zarr.context import Context + +from zarr.storage import Store +from zarr._storage.v3 import StoreV3 + +import pytest + + +class CountingDict(Store): + def __init__(self): + self.wrapped = dict() + self.counter = collections.Counter() + + def __len__(self): + self.counter["__len__"] += 1 + return len(self.wrapped) + + def keys(self): + self.counter["keys"] += 1 + return self.wrapped.keys() + + def __iter__(self): + self.counter["__iter__"] += 1 + return iter(self.wrapped) + + def __contains__(self, item): + self.counter["__contains__", item] += 1 + return item in self.wrapped + + def __getitem__(self, item): + self.counter["__getitem__", item] += 1 + return self.wrapped[item] + + def __setitem__(self, key, value): + self.counter["__setitem__", key] += 1 + self.wrapped[key] = value + + def __delitem__(self, key): + self.counter["__delitem__", key] += 1 + del self.wrapped[key] + + def getitems( + self, keys: Sequence[str], *, contexts: Mapping[str, Context] + ) -> Mapping[str, Any]: + for key in keys: + self.counter["__getitem__", key] += 1 + return {k: self.wrapped[k] for k in keys if k in self.wrapped} + + +class CountingDictV3(CountingDict, StoreV3): + pass + + +def skip_test_env_var(name): + """Checks for environment variables indicating whether tests requiring services should be run""" + value = os.environ.get(name, "0") + return pytest.mark.skipif(value == "0", reason="Tests not enabled via environment variable") + + +try: + import fsspec # noqa: F401 + + have_fsspec = 
True +except ImportError: # pragma: no cover + have_fsspec = False + + +try: + import bsddb3 # noqa: F401 + + have_bsddb3 = True +except ImportError: # pragma: no cover + have_bsddb3 = False + + +try: + import lmdb # noqa: F401 + + have_lmdb = True +except ImportError: # pragma: no cover + have_lmdb = False + + +try: + import sqlite3 # noqa: F401 + + have_sqlite3 = True +except ImportError: # pragma: no cover + have_sqlite3 = False + + +def abs_container(): + from azure.core.exceptions import ResourceExistsError + import azure.storage.blob as asb + + URL = "http://127.0.0.1:10000" + ACCOUNT_NAME = "devstoreaccount1" + KEY = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + CONN_STR = ( + f"DefaultEndpointsProtocol=http;AccountName={ACCOUNT_NAME};" + f"AccountKey={KEY};BlobEndpoint={URL}/{ACCOUNT_NAME};" + ) + + blob_service_client = asb.BlobServiceClient.from_connection_string(CONN_STR) + try: + container_client = blob_service_client.create_container("test") + except ResourceExistsError: + container_client = blob_service_client.get_container_client("test") + + return container_client + + +def mktemp(**kwargs): + f = tempfile.NamedTemporaryFile(**kwargs) + f.close() + return f.name diff --git a/src/zarr/v2/types.py b/src/zarr/v2/types.py new file mode 100644 index 0000000000..cc29a350f5 --- /dev/null +++ b/src/zarr/v2/types.py @@ -0,0 +1,14 @@ +from typing import Literal, Protocol, Union + +ZARR_VERSION = Literal[2, 3] +DIMENSION_SEPARATOR = Literal[".", "/"] +MEMORY_ORDER = Literal["C", "F"] + + +PathLike = Union[str, bytes, None] + + +class MetaArray(Protocol): + def __array_function__(self, func, types, args, kwargs): + # To be extended + ... diff --git a/src/zarr/v2/util.py b/src/zarr/v2/util.py new file mode 100644 index 0000000000..2c9b0f616b --- /dev/null +++ b/src/zarr/v2/util.py @@ -0,0 +1,796 @@ +import inspect +import json +import math +import numbers +from textwrap import TextWrapper +import mmap +import time +from typing import ( + Any, + Callable, + Dict, + Iterator, + Mapping, + Optional, + Tuple, + TypeVar, + Union, + Iterable, + cast, +) +import warnings + +import numpy as np +from asciitree import BoxStyle, LeftAligned +from asciitree.traversal import Traversal +from numcodecs.compat import ( + ensure_text, + ensure_ndarray_like, + ensure_bytes, + ensure_contiguous_ndarray_like, +) +from numcodecs.ndarray_like import NDArrayLike +from numcodecs.registry import codec_registry +from numcodecs.blosc import cbuffer_sizes, cbuffer_metainfo +from zarr.types import DIMENSION_SEPARATOR + +KeyType = TypeVar("KeyType") +ValueType = TypeVar("ValueType") + + +def flatten(arg: Iterable) -> Iterable: + for element in arg: + if isinstance(element, Iterable) and not isinstance(element, (str, bytes)): + yield from flatten(element) + else: + yield element + + +# codecs to use for object dtype convenience API +object_codecs = { + str.__name__: "vlen-utf8", + bytes.__name__: "vlen-bytes", + "array": "vlen-array", +} + + +class NumberEncoder(json.JSONEncoder): + def default(self, o): + # See json.JSONEncoder.default docstring for explanation + # This is necessary to encode numpy dtype + if isinstance(o, numbers.Integral): + return int(o) + if isinstance(o, numbers.Real): + return float(o) + return json.JSONEncoder.default(self, o) + + +def json_dumps(o: Any) -> bytes: + """Write JSON in a consistent, human-readable way.""" + return json.dumps( + o, indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "), cls=NumberEncoder + 
).encode("ascii") + + +def json_loads(s: Union[bytes, str]) -> Dict[str, Any]: + """Read JSON in a consistent way.""" + return json.loads(ensure_text(s, "utf-8")) + + +def normalize_shape(shape: Union[int, Tuple[int, ...], None]) -> Tuple[int, ...]: + """Convenience function to normalize the `shape` argument.""" + + if shape is None: + raise TypeError("shape is None") + + # handle 1D convenience form + if isinstance(shape, numbers.Integral): + shape = (int(shape),) + + # normalize + shape = cast(Tuple[int, ...], shape) + if not all(isinstance(s, numbers.Integral) for s in shape): + warnings.warn("shape contains non-integer value(s)", UserWarning, stacklevel=2) + shape = tuple(int(s) for s in shape) + return shape + + +# code to guess chunk shape, adapted from h5py + +CHUNK_BASE = 256 * 1024 # Multiplier by which chunks are adjusted +CHUNK_MIN = 128 * 1024 # Soft lower limit (128k) +CHUNK_MAX = 64 * 1024 * 1024 # Hard upper limit + + +def guess_chunks(shape: Tuple[int, ...], typesize: int) -> Tuple[int, ...]: + """ + Guess an appropriate chunk layout for an array, given its shape and + the size of each element in bytes. Will allocate chunks only as large + as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of + each axis, slightly favoring bigger values for the last index. + Undocumented and subject to change without warning. + """ + + ndims = len(shape) + # require chunks to have non-zero length for all dimensions + chunks = np.maximum(np.array(shape, dtype="=f8"), 1) + + # Determine the optimal chunk size in bytes using a PyTables expression. + # This is kept as a float. + dset_size = np.prod(chunks) * typesize + target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024))) + + if target_size > CHUNK_MAX: + target_size = CHUNK_MAX + elif target_size < CHUNK_MIN: + target_size = CHUNK_MIN + + idx = 0 + while True: + # Repeatedly loop over the axes, dividing them by 2. Stop when: + # 1a. We're smaller than the target chunk size, OR + # 1b. We're within 50% of the target chunk size, AND + # 2. 
The chunk is smaller than the maximum chunk size + + chunk_bytes = np.prod(chunks) * typesize + + if ( + chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5 + ) and chunk_bytes < CHUNK_MAX: + break + + if np.prod(chunks) == 1: + break # Element size larger than CHUNK_MAX + + chunks[idx % ndims] = math.ceil(chunks[idx % ndims] / 2.0) + idx += 1 + + return tuple(int(x) for x in chunks) + + +def normalize_chunks(chunks: Any, shape: Tuple[int, ...], typesize: int) -> Tuple[int, ...]: + """Convenience function to normalize the `chunks` argument for an array + with the given `shape`.""" + + # N.B., expect shape already normalized + + # handle auto-chunking + if chunks is None or chunks is True: + return guess_chunks(shape, typesize) + + # handle no chunking + if chunks is False: + return shape + + # handle 1D convenience form + if isinstance(chunks, numbers.Integral): + chunks = tuple(int(chunks) for _ in shape) + + # handle bad dimensionality + if len(chunks) > len(shape): + raise ValueError("too many dimensions in chunks") + + # handle underspecified chunks + if len(chunks) < len(shape): + # assume chunks across remaining dimensions + chunks += shape[len(chunks) :] + + # handle None or -1 in chunks + if -1 in chunks or None in chunks: + chunks = tuple(s if c == -1 or c is None else int(c) for s, c in zip(shape, chunks)) + + if not all(isinstance(c, numbers.Integral) for c in chunks): + warnings.warn("chunks contains non-integer value(s)", UserWarning, stacklevel=2) + + chunks = tuple(int(c) for c in chunks) + return chunks + + +def normalize_dtype(dtype: Union[str, np.dtype], object_codec) -> Tuple[np.dtype, Any]: + # convenience API for object arrays + if inspect.isclass(dtype): + dtype = dtype.__name__ + if isinstance(dtype, str): + # allow ':' to delimit class from codec arguments + tokens = dtype.split(":") + key = tokens[0] + if key in object_codecs: + dtype = np.dtype(object) + if object_codec is None: + codec_id = object_codecs[key] + if len(tokens) > 1: + args = tokens[1].split(",") + else: + args = [] + try: + object_codec = codec_registry[codec_id](*args) + except KeyError as e: # pragma: no cover + raise ValueError( + f"codec {codec_id!r} for object type {key!r} is not " + f"available; please provide an object_codec manually" + ) from e + return dtype, object_codec + + dtype = np.dtype(dtype) + + # don't allow generic datetime64 or timedelta64, require units to be specified + if dtype == np.dtype("M8") or dtype == np.dtype("m8"): + raise ValueError( + "datetime64 and timedelta64 dtypes with generic units " + 'are not supported, please specify units (e.g., "M8[ns]")' + ) + + return dtype, object_codec + + +# noinspection PyTypeChecker +def is_total_slice(item, shape: Tuple[int]) -> bool: + """Determine whether `item` specifies a complete slice of array with the + given `shape`. 
Used to optimize __setitem__ operations on the Chunk + class.""" + + # N.B., assume shape is normalized + + if item == Ellipsis: + return True + if item == slice(None): + return True + if isinstance(item, slice): + item = (item,) + if isinstance(item, tuple): + return all( + ( + ( + isinstance(it, slice) + and ( + (it == slice(None)) + or ((it.stop - it.start == sh) and (it.step in [1, None])) + ) + ) + # The only scalar edge case, indexing with int 0 along a size-1 dimension + # is identical to a total slice + # https://github.com/zarr-developers/zarr-python/issues/1730 + or (isinstance(it, int) and it == 0 and sh == 1) + ) + for it, sh in zip(item, shape) + ) + else: + raise TypeError(f"expected slice or tuple of slices, found {item!r}") + + +def normalize_resize_args(old_shape, *args): + # normalize new shape argument + if len(args) == 1: + new_shape = args[0] + else: + new_shape = args + if isinstance(new_shape, int): + new_shape = (new_shape,) + else: + new_shape = tuple(new_shape) + if len(new_shape) != len(old_shape): + raise ValueError("new shape must have same number of dimensions") + + # handle None in new_shape + new_shape = tuple(s if n is None else int(n) for s, n in zip(old_shape, new_shape)) + + return new_shape + + +def human_readable_size(size) -> str: + if size < 2**10: + return f"{size}" + elif size < 2**20: + return f"{size / float(2**10):.1f}K" + elif size < 2**30: + return f"{size / float(2**20):.1f}M" + elif size < 2**40: + return f"{size / float(2**30):.1f}G" + elif size < 2**50: + return f"{size / float(2**40):.1f}T" + else: + return f"{size / float(2**50):.1f}P" + + +def normalize_order(order: str) -> str: + order = str(order).upper() + if order not in ["C", "F"]: + raise ValueError(f"order must be either 'C' or 'F', found: {order!r}") + return order + + +def normalize_dimension_separator(sep: Optional[str]) -> Optional[DIMENSION_SEPARATOR]: + if sep in (".", "/", None): + return cast(Optional[DIMENSION_SEPARATOR], sep) + else: + raise ValueError(f"dimension_separator must be either '.' 
or '/', found: {sep!r}") + + +def normalize_fill_value(fill_value, dtype: np.dtype): + if fill_value is None or dtype.hasobject: + # no fill value + pass + elif not isinstance(fill_value, np.void) and fill_value == 0: + # this should be compatible across numpy versions for any array type, including + # structured arrays + fill_value = np.zeros((), dtype=dtype)[()] + + elif dtype.kind == "U": + # special case unicode because of encoding issues on Windows if passed through numpy + # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 + + if not isinstance(fill_value, str): + raise ValueError( + f"fill_value {fill_value!r} is not valid for dtype {dtype}; " + f"must be a unicode string" + ) + + else: + try: + if isinstance(fill_value, bytes) and dtype.kind == "V": + # special case for numpy 1.14 compatibility + fill_value = np.array(fill_value, dtype=dtype.str).view(dtype)[()] + else: + fill_value = np.array(fill_value, dtype=dtype)[()] + + except Exception as e: + # re-raise with our own error message to be helpful + raise ValueError( + f"fill_value {fill_value!r} is not valid for dtype {dtype}; " + f"nested exception: {e}" + ) from e + + return fill_value + + +def normalize_storage_path(path: Union[str, bytes, None]) -> str: + # handle bytes + if isinstance(path, bytes): + path = str(path, "ascii") + + # ensure str + if path is not None and not isinstance(path, str): + path = str(path) + + if path: + # convert backslash to forward slash + path = path.replace("\\", "/") + + # ensure no leading slash + while len(path) > 0 and path[0] == "/": + path = path[1:] + + # ensure no trailing slash + while len(path) > 0 and path[-1] == "/": + path = path[:-1] + + # collapse any repeated slashes + previous_char = None + collapsed = "" + for char in path: + if char == "/" and previous_char == "/": + pass + else: + collapsed += char + previous_char = char + path = collapsed + + # don't allow path segments with just '.' or '..' + segments = path.split("/") + if any(s in {".", ".."} for s in segments): + raise ValueError("path containing '.' or '..' segment not allowed") + + else: + path = "" + + return path + + +def buffer_size(v) -> int: + return ensure_ndarray_like(v).nbytes + + +def info_text_report(items: Dict[Any, Any]) -> str: + keys = [k for k, v in items] + max_key_len = max(len(k) for k in keys) + report = "" + for k, v in items: + wrapper = TextWrapper( + width=80, + initial_indent=k.ljust(max_key_len) + " : ", + subsequent_indent=" " * max_key_len + " : ", + ) + text = wrapper.fill(str(v)) + report += text + "\n" + return report + + +def info_html_report(items) -> str: + report = '<table class="zarr-info">' + report += "<tbody>" + for k, v in items: + report += ( + f"<tr>" + f'<th style="text-align: left">{k}</th>' + f'<td style="text-align: left">{v}</td>' + f"</tr>" + ) + report += "</tbody>" + report += "</table>
" + return report + + +class InfoReporter: + def __init__(self, obj): + self.obj = obj + self.items = self.obj.info_items() + + def __repr__(self): + return info_text_report(self.items) + + def _repr_html_(self): + return info_html_report(self.items) + + +class TreeNode: + def __init__(self, obj, depth=0, level=None): + self.obj = obj + self.depth = depth + self.level = level + + def get_children(self): + if hasattr(self.obj, "values"): + if self.level is None or self.depth < self.level: + depth = self.depth + 1 + return [TreeNode(o, depth=depth, level=self.level) for o in self.obj.values()] + return [] + + def get_text(self): + name = self.obj.name.split("/")[-1] or "/" + if hasattr(self.obj, "shape"): + name += f" {self.obj.shape} {self.obj.dtype}" + return name + + def get_type(self): + return type(self.obj).__name__ + + +class TreeTraversal(Traversal): + def get_children(self, node): + return node.get_children() + + def get_root(self, tree): + return tree + + def get_text(self, node): + return node.get_text() + + +tree_group_icon = "folder" +tree_array_icon = "table" + + +def tree_get_icon(stype: str) -> str: + if stype == "Array": + return tree_array_icon + elif stype == "Group": + return tree_group_icon + else: + raise ValueError(f"Unknown type: {stype}") + + +def tree_widget_sublist(node, root=False, expand=False): + import ipytree + + result = ipytree.Node() + result.icon = tree_get_icon(node.get_type()) + if root or (expand is True) or (isinstance(expand, int) and node.depth < expand): + result.opened = True + else: + result.opened = False + result.name = node.get_text() + result.nodes = [tree_widget_sublist(c, expand=expand) for c in node.get_children()] + result.disabled = True + + return result + + +def tree_widget(group, expand, level): + try: + import ipytree + except ImportError as e: + raise ImportError( + f"{e}: Run `pip install zarr[jupyter]` or `conda install ipytree`" + f"to get the required ipytree dependency for displaying the tree " + f"widget. If using jupyterlab<3, you also need to run " + f"`jupyter labextension install ipytree`" + ) from e + + result = ipytree.Tree() + root = TreeNode(group, level=level) + result.add_node(tree_widget_sublist(root, root=True, expand=expand)) + + return result + + +class TreeViewer: + def __init__(self, group, expand=False, level=None): + self.group = group + self.expand = expand + self.level = level + + self.text_kwargs = dict(horiz_len=2, label_space=1, indent=1) + + self.bytes_kwargs = dict( + UP_AND_RIGHT="+", HORIZONTAL="-", VERTICAL="|", VERTICAL_AND_RIGHT="+" + ) + + self.unicode_kwargs = dict( + UP_AND_RIGHT="\u2514", + HORIZONTAL="\u2500", + VERTICAL="\u2502", + VERTICAL_AND_RIGHT="\u251C", + ) + + def __bytes__(self): + drawer = LeftAligned( + traverse=TreeTraversal(), draw=BoxStyle(gfx=self.bytes_kwargs, **self.text_kwargs) + ) + root = TreeNode(self.group, level=self.level) + result = drawer(root) + + # Unicode characters slip in on Python 3. + # So we need to straighten that out first. 
+ result = result.encode() + + return result + + def __unicode__(self): + drawer = LeftAligned( + traverse=TreeTraversal(), draw=BoxStyle(gfx=self.unicode_kwargs, **self.text_kwargs) + ) + root = TreeNode(self.group, level=self.level) + return drawer(root) + + def __repr__(self): + return self.__unicode__() + + def _repr_mimebundle_(self, **kwargs): + tree = tree_widget(self.group, expand=self.expand, level=self.level) + return tree._repr_mimebundle_(**kwargs) + + +def check_array_shape(param, array, shape): + if not hasattr(array, "shape"): + raise TypeError(f"parameter {param!r}: expected an array-like object, got {type(array)!r}") + if array.shape != shape: + raise ValueError( + f"parameter {param!r}: expected array with shape {shape!r}, got {array.shape!r}" + ) + + +def is_valid_python_name(name): + from keyword import iskeyword + + return name.isidentifier() and not iskeyword(name) + + +class NoLock: + """A lock that doesn't lock.""" + + def __enter__(self): + pass + + def __exit__(self, *args): + pass + + +nolock = NoLock() + + +class PartialReadBuffer: + def __init__(self, store_key, chunk_store): + self.chunk_store = chunk_store + # is it fsstore or an actual fsspec map object + assert hasattr(self.chunk_store, "map") + self.map = self.chunk_store.map + self.fs = self.chunk_store.fs + self.store_key = store_key + self.buff = None + self.nblocks = None + self.start_points = None + self.n_per_block = None + self.start_points_max = None + self.read_blocks = set() + + _key_path = self.map._key_to_str(store_key) + _key_path = _key_path.split("/") + _chunk_path = [self.chunk_store._normalize_key(_key_path[-1])] + _key_path = "/".join(_key_path[:-1] + _chunk_path) + self.key_path = _key_path + + def prepare_chunk(self): + assert self.buff is None + header = self.fs.read_block(self.key_path, 0, 16) + nbytes, self.cbytes, blocksize = cbuffer_sizes(header) + typesize, _shuffle, _memcpyd = cbuffer_metainfo(header) + self.buff = mmap.mmap(-1, self.cbytes) + self.buff[0:16] = header + self.nblocks = nbytes / blocksize + self.nblocks = ( + int(self.nblocks) if self.nblocks == int(self.nblocks) else int(self.nblocks + 1) + ) + if self.nblocks == 1: + self.buff = self.read_full() + return + start_points_buffer = self.fs.read_block(self.key_path, 16, int(self.nblocks * 4)) + self.start_points = np.frombuffer(start_points_buffer, count=self.nblocks, dtype=np.int32) + self.start_points_max = self.start_points.max() + self.buff[16 : (16 + (self.nblocks * 4))] = start_points_buffer + self.n_per_block = blocksize / typesize + + def read_part(self, start, nitems): + assert self.buff is not None + if self.nblocks == 1: + return + start_block = int(start / self.n_per_block) + wanted_decompressed = 0 + while wanted_decompressed < nitems: + if start_block not in self.read_blocks: + start_byte = self.start_points[start_block] + if start_byte == self.start_points_max: + stop_byte = self.cbytes + else: + stop_byte = self.start_points[self.start_points > start_byte].min() + length = stop_byte - start_byte + data_buff = self.fs.read_block(self.key_path, start_byte, length) + self.buff[start_byte:stop_byte] = data_buff + self.read_blocks.add(start_block) + if wanted_decompressed == 0: + wanted_decompressed += ((start_block + 1) * self.n_per_block) - start + else: + wanted_decompressed += self.n_per_block + start_block += 1 + + def read_full(self): + return self.chunk_store[self.store_key] + + +class UncompressedPartialReadBufferV3: + def __init__(self, store_key, chunk_store, itemsize): + assert 
chunk_store.supports_efficient_get_partial_values + self.chunk_store = chunk_store + self.store_key = store_key + self.itemsize = itemsize + + def prepare_chunk(self): + pass + + def read_part(self, start, nitems): + return self.chunk_store.get_partial_values( + [(self.store_key, (start * self.itemsize, nitems * self.itemsize))] + )[0] + + def read_full(self): + return self.chunk_store[self.store_key] + + +def retry_call( + callabl: Callable, + args=None, + kwargs=None, + exceptions: Tuple[Any, ...] = (), + retries: int = 10, + wait: float = 0.1, +) -> Any: + """ + Make several attempts to invoke the callable. If one of the given exceptions + is raised, wait the given period of time and retry up to the given number of + retries. + """ + + if args is None: + args = () + if kwargs is None: + kwargs = {} + + for attempt in range(1, retries + 1): + try: + return callabl(*args, **kwargs) + except exceptions: + if attempt < retries: + time.sleep(wait) + else: + raise + + +def all_equal(value: Any, array: Any): + """ + Test if all the elements of an array are equivalent to a value. + If `value` is None, then this function does not do any comparison and + returns False. + """ + + if value is None: + return False + if not value: + # if `value` is falsey, then just 1 truthy value in `array` + # is sufficient to return False. We assume here that np.any is + # optimized to return on the first truthy value in `array`. + try: + return not np.any(array) + except (TypeError, ValueError): # pragma: no cover + pass + if np.issubdtype(array.dtype, np.object_): + # we have to flatten the result of np.equal to handle outputs like + # [np.array([True,True]), True, True] + return all(flatten(np.equal(value, array, dtype=array.dtype))) + else: + # Numpy errors if you call np.isnan on custom dtypes, so ensure + # we are working with floats before calling isnan + if np.issubdtype(array.dtype, np.floating) and np.isnan(value): + return np.all(np.isnan(array)) + else: + # using == raises warnings from numpy deprecated pattern, but + # using np.equal() raises type errors for structured dtypes... + return np.all(value == array) + + +def ensure_contiguous_ndarray_or_bytes(buf) -> Union[NDArrayLike, bytes]: + """Convenience function to coerce `buf` to ndarray-like array or bytes. + + First check if `buf` can be zero-copy converted to a contiguous array. + If not, `buf` will be copied to a newly allocated `bytes` object. + + Parameters + ---------- + buf : ndarray-like, array-like, or bytes-like + A numpy array like object such as numpy.ndarray, cupy.ndarray, or + any object exporting a buffer interface. + + Returns + ------- + arr : NDArrayLike or bytes + A ndarray-like or bytes object + """ + + try: + return ensure_contiguous_ndarray_like(buf) + except TypeError: + # An error is raised if `buf` couldn't be zero-copy converted + return ensure_bytes(buf) + + +class ConstantMap(Mapping[KeyType, ValueType]): + """A read-only map that maps all keys to the same constant value + + Useful if you want to call `getitems()` with the same context for all keys. + + Parameters + ---------- + keys + The keys of the map. Will be copied to a frozenset if it isn't already. + constant + The constant that all keys are mapping to. 
+ """ + + def __init__(self, keys: Iterable[KeyType], constant: ValueType) -> None: + self._keys = keys if isinstance(keys, frozenset) else frozenset(keys) + self._constant = constant + + def __getitem__(self, key: KeyType) -> ValueType: + if key not in self._keys: + raise KeyError(repr(key)) + return self._constant + + def __iter__(self) -> Iterator[KeyType]: + return iter(self._keys) + + def __len__(self) -> int: + return len(self._keys) + + def __contains__(self, key: object) -> bool: + return key in self._keys + + def __repr__(self) -> str: + return repr({k: v for k, v in self.items()})