From 79560d5b80fed1187376b9f3a710ddc9c94b3af2 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 15 Feb 2025 22:20:15 +0100 Subject: [PATCH 01/17] fix merge --- tests/test_array.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index b81f966e20..c8f3b08676 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -35,7 +35,7 @@ _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, chunks_initialized, - create_array, + create_array, SerializerLike, ) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer @@ -1025,6 +1025,14 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu ZstdCodec(level=3), {"name": "zstd", "configuration": {"level": 3}}, ({"name": "zstd", "configuration": {"level": 3}},), + "zstd", + ], + ) + @pytest.mark.parametrize( + "serializer", + [ + "auto", + "bytes", ], ) @pytest.mark.parametrize( @@ -1059,12 +1067,15 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu ), {"name": "transpose", "configuration": {"order": [0]}}, ({"name": "transpose", "configuration": {"order": [0]}},), + "transpose", ], ) + @pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) async def test_v3_chunk_encoding( store: MemoryStore, compressors: CompressorsLike, + serializer: SerializerLike, filters: FiltersLike, dtype: str, chunks: tuple[int, ...], @@ -1081,10 +1092,11 @@ async def test_v3_chunk_encoding( shards=shards, zarr_format=3, filters=filters, + serializer=serializer, compressors=compressors, ) - filters_expected, _, compressors_expected = _parse_chunk_encoding_v3( - filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype) + filters_expected, serializer_expected, compressors_expected = _parse_chunk_encoding_v3( + filters=filters, compressors=compressors, serializer=serializer, dtype=np.dtype(dtype) ) assert arr.filters == filters_expected assert arr.compressors == compressors_expected @@ -1099,10 +1111,11 @@ async def test_v3_chunk_encoding( numcodecs.Zstd(level=3), (), (numcodecs.Zstd(level=3),), + "zstd" ], ) @pytest.mark.parametrize( - "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] + "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),"gzip")] ) async def test_v2_chunk_encoding( store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str From 76199904a5285e1d71619e4b4ee3033ed26a374e Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 15 Feb 2025 23:27:04 +0100 Subject: [PATCH 02/17] add str arguments for filters, serializer, compressors --- src/zarr/core/array.py | 12 +++++++++--- src/zarr/core/metadata/v2.py | 12 +++++++++--- src/zarr/registry.py | 12 +++++++++--- tests/test_array.py | 22 +++++++++++++++++----- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9c2f8a7260..7ab61a4e7c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3769,23 +3769,25 @@ def _get_default_codecs( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | str | ArrayArrayCodec | numcodecs.abc.Codec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec | Literal["auto"] + | str | None ) CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | str | BytesBytesCodec | numcodecs.abc.Codec] | dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] + | str | None ) -SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] +SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] | str class ShardsConfigParam(TypedDict): @@ -4305,6 +4307,8 @@ def _parse_chunk_encoding_v3( out_array_array: tuple[ArrayArrayCodec, ...] = () elif filters == "auto": out_array_array = default_array_array + elif isinstance(filters, str): + out_array_array = (_parse_array_array_codec(filters),) else: maybe_array_array: Iterable[Codec | dict[str, JSON]] if isinstance(filters, dict | Codec): @@ -4322,6 +4326,8 @@ def _parse_chunk_encoding_v3( out_bytes_bytes: tuple[BytesBytesCodec, ...] = () elif compressors == "auto": out_bytes_bytes = default_bytes_bytes + elif isinstance(compressors, str): + out_bytes_bytes = (_parse_bytes_bytes_codec(compressors),) else: maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] if isinstance(compressors, dict | Codec): diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3d292c81b4..c1e87aa3e0 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -246,20 +246,24 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: if data is None: return data + if isinstance(data, str): + return (numcodecs.get_codec({"id": data}),) if isinstance(data, Iterable): for idx, val in enumerate(data): if isinstance(val, numcodecs.abc.Codec): out.append(val) elif isinstance(val, dict): out.append(numcodecs.get_codec(val)) + elif isinstance(val, str): + out.append(numcodecs.get_codec({"id": val})) else: - msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." + msg = f"For Zarr format 2 arrays, all elements of `filters` must be a numcodecs.abc.Codec or a dict or str representation of numcodecs.abc.Codec. Got {type(val)} at index {idx} instead." raise TypeError(msg) return tuple(out) # take a single codec instance and wrap it in a tuple if isinstance(data, numcodecs.abc.Codec): return (data,) - msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." + msg = f"For Zarr format 2 arrays, all elements of `filters` must be None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) @@ -271,7 +275,9 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None: return data if isinstance(data, dict): return numcodecs.get_codec(data) - msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." + if isinstance(data, str): + return numcodecs.get_codec({"id": data}) + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 704db3f704..72fd6de85c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -166,7 +166,7 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec(data: dict[str, JSON] | str | Codec) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it @@ -174,6 +174,8 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: """ from zarr.abc.codec import BytesBytesCodec + if isinstance(data, str): + data = {"name": data, "configuration": {}} if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, BytesBytesCodec): @@ -186,7 +188,7 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec(data: dict[str, JSON] | str | Codec) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it @@ -194,6 +196,8 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: """ from zarr.abc.codec import ArrayBytesCodec + if isinstance(data, str): + data = {"name": data, "configuration": {}} if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, ArrayBytesCodec): @@ -206,7 +210,7 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec(data: dict[str, JSON] | str | Codec) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it @@ -214,6 +218,8 @@ def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: """ from zarr.abc.codec import ArrayArrayCodec + if isinstance(data, str): + data = {"name": data, "configuration": {}} if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, ArrayArrayCodec): diff --git a/tests/test_array.py b/tests/test_array.py index c8f3b08676..8522bcc7e7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -30,12 +30,13 @@ from zarr.core.array import ( CompressorsLike, FiltersLike, + SerializerLike, _get_default_chunk_encoding_v2, _get_default_chunk_encoding_v3, _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, chunks_initialized, - create_array, SerializerLike, + create_array, ) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer @@ -1026,6 +1027,7 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu {"name": "zstd", "configuration": {"level": 3}}, ({"name": "zstd", "configuration": {"level": 3}},), "zstd", + ("crc32c", "zstd"), ], ) @pytest.mark.parametrize( @@ -1067,10 +1069,9 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu ), {"name": "transpose", "configuration": {"order": [0]}}, ({"name": "transpose", "configuration": {"order": [0]}},), - "transpose", + # is there a filter with no configuration? ], ) - @pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) async def test_v3_chunk_encoding( store: MemoryStore, @@ -1084,6 +1085,9 @@ async def test_v3_chunk_encoding( """ Test various possibilities for the compressors and filters parameter to create_array """ + if serializer == "bytes" and dtype == "str": + serializer = "vlen-utf8" + arr = await create_array( store=store, dtype=dtype, @@ -1111,11 +1115,19 @@ async def test_v3_chunk_encoding( numcodecs.Zstd(level=3), (), (numcodecs.Zstd(level=3),), - "zstd" + "zstd", ], ) @pytest.mark.parametrize( - "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),"gzip")] + "filters", + [ + "auto", + None, + numcodecs.GZip(level=1), + (numcodecs.GZip(level=1)), + "gzip", + ("gzip", "zstd"), + ], ) async def test_v2_chunk_encoding( store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str From daff61abc1d61cbd348fee7b9449f5e2dc8e1349 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 15 Feb 2025 23:27:29 +0100 Subject: [PATCH 03/17] remove duplicate type check --- src/zarr/core/array.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7ab61a4e7c..52871111fa 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4266,9 +4266,6 @@ def _parse_chunk_encoding_v2( elif isinstance(compressor, tuple | list) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." - raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4276,14 +4273,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." - ) - raise TypeError(msg) _filters = parse_filters(filters) return _filters, _compressor From 1a3a5027d18478fa3aa90595677cbec52817b982 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 15 Feb 2025 23:35:58 +0100 Subject: [PATCH 04/17] fix ruff --- src/zarr/core/array.py | 4 +--- tests/test_array.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 52871111fa..fe491a4cc8 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3773,7 +3773,6 @@ def _get_default_codecs( | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec - | Literal["auto"] | str | None ) @@ -3783,11 +3782,10 @@ def _get_default_codecs( | dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec - | Literal["auto"] | str | None ) -SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] | str +SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | str class ShardsConfigParam(TypedDict): diff --git a/tests/test_array.py b/tests/test_array.py index 8522bcc7e7..6df8cf581b 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1114,7 +1114,7 @@ async def test_v3_chunk_encoding( None, numcodecs.Zstd(level=3), (), - (numcodecs.Zstd(level=3),), + (numcodecs.Zstd(level=2),), "zstd", ], ) @@ -1124,7 +1124,7 @@ async def test_v3_chunk_encoding( "auto", None, numcodecs.GZip(level=1), - (numcodecs.GZip(level=1)), + (numcodecs.GZip(level=2)), "gzip", ("gzip", "zstd"), ], From 68ac3299235e8e8344ed8bf2a37162c2e1214400 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 15 Feb 2025 23:42:32 +0100 Subject: [PATCH 05/17] update docstrings --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 2 +- src/zarr/core/group.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index e1f92633cd..48d8733b87 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -792,7 +792,7 @@ def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fe491a4cc8..ea3257e876 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4053,7 +4053,7 @@ async def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1f5d57c0ab..5d08e841ca 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1045,7 +1045,7 @@ async def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` @@ -2280,7 +2280,7 @@ def create_array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` @@ -2678,7 +2678,7 @@ def array( chunk to bytes. For Zarr format 3, a "filter" is a codec that takes an array and returns an array, - and these values must be instances of ``ArrayArrayCodec``, or dict representations + and these values must be instances of ``ArrayArrayCodec``, or dict or string representations of ``ArrayArrayCodec``. If no ``filters`` are provided, a default set of filters will be used. These defaults can be changed by modifying the value of ``array.v3_default_filters`` From 0e227e02dec30ac55330890746cd6b63ec81929a Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sat, 15 Feb 2025 23:48:59 +0100 Subject: [PATCH 06/17] document changes --- changes/2839.feature.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2839.feature.rst diff --git a/changes/2839.feature.rst b/changes/2839.feature.rst new file mode 100644 index 0000000000..5256493563 --- /dev/null +++ b/changes/2839.feature.rst @@ -0,0 +1 @@ +Array creation allows string representation of codecs for ``filters``, ``serializer``, and ``compressors``. \ No newline at end of file From de83f920083a2e85214ad88a7ba0fb63e3c8ee87 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sun, 16 Feb 2025 00:16:55 +0100 Subject: [PATCH 07/17] test_bad_chunk_encoding --- tests/test_array.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_array.py b/tests/test_array.py index 6df8cf581b..cd8320765f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1154,6 +1154,30 @@ async def test_v2_chunk_encoding( assert arr.compressors == compressor_expected assert arr.filters == filters_expected + @staticmethod + async def test_bad_chunk_encoding(store: MemoryStore) -> None: + """ + Test that passing an invalid compressor or filter to create_array raises an error. + """ + bad_compressor = 2 + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(bad_compressor)} instead." + with pytest.raises(ValueError, match=msg): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=2, + compressors=bad_compressor, # type: ignore[arg-type] + ) + with pytest.raises(KeyError): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=3, + filters="bad_filter", + ) + @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) async def test_default_filters_compressors( From 73b32ac66f2851e05c43a7d8b75a7f1b8a683a93 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sun, 16 Feb 2025 00:19:32 +0100 Subject: [PATCH 08/17] remove unused "type: ignore" comment --- tests/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index cd8320765f..4eed4cacdd 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1167,7 +1167,7 @@ async def test_bad_chunk_encoding(store: MemoryStore) -> None: dtype="uint8", shape=(10,), zarr_format=2, - compressors=bad_compressor, # type: ignore[arg-type] + compressors=bad_compressor, ) with pytest.raises(KeyError): await create_array( From 3588a65d352f4b5bb946311131571ff2a16c6eb7 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Sun, 16 Feb 2025 00:21:01 +0100 Subject: [PATCH 09/17] remove comment --- tests/test_array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index 4eed4cacdd..df61d31661 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1069,7 +1069,6 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu ), {"name": "transpose", "configuration": {"order": [0]}}, ({"name": "transpose", "configuration": {"order": [0]}},), - # is there a filter with no configuration? ], ) @pytest.mark.parametrize(("chunks", "shards"), [((6,), None), ((3,), (6,))]) From 74b45bb86059a2da38010b4ee5d57e211e9cd1b4 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Tue, 18 Feb 2025 13:14:53 +0100 Subject: [PATCH 10/17] update test_v3_chunk_encoding --- tests/test_array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_array.py b/tests/test_array.py index df61d31661..b0d83c1399 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1102,6 +1102,7 @@ async def test_v3_chunk_encoding( filters=filters, compressors=compressors, serializer=serializer, dtype=np.dtype(dtype) ) assert arr.filters == filters_expected + assert arr.serializer == serializer_expected assert arr.compressors == compressors_expected @staticmethod From 1937ee58ada3225aee8e6af2892574a1871e7907 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Tue, 18 Feb 2025 13:17:35 +0100 Subject: [PATCH 11/17] update test_invalid_chunk_encoding --- tests/test_array.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index b0d83c1399..3476964f1d 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1155,19 +1155,19 @@ async def test_v2_chunk_encoding( assert arr.filters == filters_expected @staticmethod - async def test_bad_chunk_encoding(store: MemoryStore) -> None: + async def test_invalid_chunk_encoding(store: MemoryStore) -> None: """ Test that passing an invalid compressor or filter to create_array raises an error. """ - bad_compressor = 2 - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(bad_compressor)} instead." + invalid_compressor_type = 2 + msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(invalid_compressor_type)} instead." with pytest.raises(ValueError, match=msg): await create_array( store=store, dtype="uint8", shape=(10,), zarr_format=2, - compressors=bad_compressor, + compressors=invalid_compressor_type, ) with pytest.raises(KeyError): await create_array( @@ -1175,7 +1175,7 @@ async def test_bad_chunk_encoding(store: MemoryStore) -> None: dtype="uint8", shape=(10,), zarr_format=3, - filters="bad_filter", + filters="nonexistent_filter_name", ) @staticmethod From 55f975c9b9876458ba075c930040253343541ae1 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Fri, 11 Apr 2025 15:06:53 +0200 Subject: [PATCH 12/17] test for codec with mandatory config --- tests/test_array.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_array.py b/tests/test_array.py index b9c46b855e..8db372046f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1194,6 +1194,26 @@ async def test_invalid_chunk_encoding(store: MemoryStore) -> None: filters="nonexistent_filter_name", ) + # string representation of a codec is only supported if codec has no required arguments + msg = "Delta.__init__() missing 1 required positional argument: 'dtype'" + with pytest.raises(TypeError, match=re.escape(msg)): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=2, + filters="delta", + ) + msg = "TransposeCodec.__init__() missing 1 required keyword-only argument: 'order'" + with pytest.raises(TypeError, match=re.escape(msg)): + await create_array( + store=store, + dtype="uint8", + shape=(10,), + zarr_format=3, + filters="transpose", + ) + @staticmethod @pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) async def test_default_filters_compressors( From 4519547a178d456c9decc3bc8524a194f6478d9e Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Fri, 16 May 2025 13:25:38 +0200 Subject: [PATCH 13/17] better error msg if codec requires config --- src/zarr/core/metadata/v2.py | 24 ++++++++-- src/zarr/registry.py | 64 +++++++++++++++++--------- tests/test_array.py | 87 ++++++++++++++++++++++++++++++++---- 3 files changed, 142 insertions(+), 33 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index a10cd62e8d..c87b2c8415 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -254,7 +254,16 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: if data is None: return data if isinstance(data, str): - return (numcodecs.get_codec({"id": data}),) + try: + return (numcodecs.get_codec({"id": data}),) + except TypeError as e: + codec_cls = numcodecs.registry.codec_registry.get(data) + msg = ( + f'A string representation for filter "{data}" was provided which specifies codec {codec_cls.__name__}. But that codec ' + f"cannot be specified by a string because it takes a required configuration. Use either the dict " + f"representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e if isinstance(data, Iterable): for idx, val in enumerate(data): if isinstance(val, numcodecs.abc.Codec): @@ -262,7 +271,7 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: elif isinstance(val, dict): out.append(numcodecs.get_codec(val)) elif isinstance(val, str): - out.append(numcodecs.get_codec({"id": val})) + out.append(parse_filters(val)[0]) else: msg = f"For Zarr format 2 arrays, all elements of `filters` must be a numcodecs.abc.Codec or a dict or str representation of numcodecs.abc.Codec. Got {type(val)} at index {idx} instead." raise TypeError(msg) @@ -287,7 +296,16 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None: if isinstance(data, dict): return numcodecs.get_codec(data) if isinstance(data, str): - return numcodecs.get_codec({"id": data}) + try: + return numcodecs.get_codec({"id": data}) + except TypeError as e: + codec_cls = numcodecs.registry.codec_registry.get(data) + msg = ( + f'A string representation for compressor "{data}" was provided which specifies codec {codec_cls.__name__}. But that codec ' + f"cannot be specified by a string because it takes a required configuration. Use either the dict " + f"representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(data)} instead." raise ValueError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 72fd6de85c..dffa4ef5b2 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -175,16 +175,23 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | str | Codec) -> BytesBytesC from zarr.abc.codec import BytesBytesCodec if isinstance(data, str): - data = {"name": data, "configuration": {}} - if isinstance(data, dict): + try: + result = _resolve_codec({"name": data, "configuration": {}}) + except TypeError as e: + codec_cls = get_codec_class(data) + msg = ( + f'A string representation for compressor "{data}" was provided which specifies codec {codec_cls.__name__}. ' + f"But that codec cannot be specified by a string because it takes a required configuration. Use either " + f"the dict representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + elif isinstance(data, dict): result = _resolve_codec(data) - if not isinstance(result, BytesBytesCodec): - msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." - raise TypeError(msg) else: - if not isinstance(data, BytesBytesCodec): - raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") result = data + if not isinstance(result, BytesBytesCodec): + msg = f"Expected a representation of a BytesBytesCodec; got a representation of a {type(result)} instead." + raise TypeError(msg) return result @@ -197,16 +204,23 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | str | Codec) -> ArrayBytesC from zarr.abc.codec import ArrayBytesCodec if isinstance(data, str): - data = {"name": data, "configuration": {}} - if isinstance(data, dict): + try: + result = _resolve_codec({"name": data, "configuration": {}}) + except TypeError as e: + codec_cls = get_codec_class(data) + msg = ( + f'A string representation for serializer "{data}" was provided which specifies codec {codec_cls.__name__}. ' + f"But that codec cannot be specified by a string because it takes a required configuration. Use either " + f"the dict representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + elif isinstance(data, dict): result = _resolve_codec(data) - if not isinstance(result, ArrayBytesCodec): - msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." - raise TypeError(msg) else: - if not isinstance(data, ArrayBytesCodec): - raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") result = data + if not isinstance(result, ArrayBytesCodec): + msg = f"Expected a representation of a ArrayBytesCodec; got a representation of a {type(result)} instead." + raise TypeError(msg) return result @@ -218,17 +232,25 @@ def _parse_array_array_codec(data: dict[str, JSON] | str | Codec) -> ArrayArrayC """ from zarr.abc.codec import ArrayArrayCodec + result: ArrayArrayCodec if isinstance(data, str): - data = {"name": data, "configuration": {}} - if isinstance(data, dict): + try: + result = _resolve_codec({"name": data, "configuration": {}}) + except TypeError as e: + codec_cls = get_codec_class(data) + msg = ( + f'A string representation for filter "{data}" was provided which specifies codec {codec_cls.__name__}. ' + f"But that codec cannot be specified by a string because it takes a required configuration. Use either " + f"the dict representation of {data} codec, or pass in a concrete {codec_cls.__name__} instance instead" + ) + raise TypeError(msg) from e + elif isinstance(data, dict): result = _resolve_codec(data) - if not isinstance(result, ArrayArrayCodec): - msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." - raise TypeError(msg) else: - if not isinstance(data, ArrayArrayCodec): - raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.") result = data + if not isinstance(result, ArrayArrayCodec): + msg = f"Expected a representation of a ArrayArrayCodec; got a representation of a {type(result)} instead." + raise TypeError(msg) return result diff --git a/tests/test_array.py b/tests/test_array.py index 719b101081..742994bf9e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -18,6 +18,7 @@ import zarr.api.asynchronous import zarr.api.synchronous as sync_api from zarr import Array, AsyncArray, Group +from zarr.abc.codec import BytesBytesCodec from zarr.abc.store import Store from zarr.codecs import ( BytesCodec, @@ -49,10 +50,12 @@ from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError +from zarr.registry import register_codec from zarr.storage import LocalStore, MemoryStore, StorePath if TYPE_CHECKING: from zarr.core.array_spec import ArrayConfigLike +from zarr.core.array_spec import ArraySpec from zarr.core.metadata.v2 import ArrayV2Metadata @@ -1327,24 +1330,90 @@ async def test_invalid_chunk_encoding(store: MemoryStore) -> None: filters="nonexistent_filter_name", ) - # string representation of a codec is only supported if codec has no required arguments - msg = "Delta.__init__() missing 1 required positional argument: 'dtype'" - with pytest.raises(TypeError, match=re.escape(msg)): + @staticmethod + @pytest.mark.parametrize( + ("argument_key", "codec"), + [ + ("filters", "bytes"), + ("filters", "gzip"), + ("serializer", "blosc"), + ("compressors", "bytes"), + ], + ) + async def test_chunk_encoding_wrong_type(argument_key, codec, store: MemoryStore) -> None: + """ + Test that passing an invalid codec to create_array raises an error. + """ + msg = "Expected a representation of a .*Codec; got a representation of a instead" + with pytest.raises(TypeError, match=msg): await create_array( store=store, dtype="uint8", shape=(10,), - zarr_format=2, - filters="delta", + zarr_format=3, + **{argument_key: codec}, ) - msg = "TransposeCodec.__init__() missing 1 required keyword-only argument: 'order'" - with pytest.raises(TypeError, match=re.escape(msg)): + + @staticmethod + @pytest.mark.parametrize( + ("argument_key", "codec", "codec_cls_name", "zarr_format"), + [ + ("filters", "delta", "Delta", 2), + ("filters", ("delta",), "Delta", 2), + ("filters", "transpose", "TransposeCodec", 3), + ("filters", ("transpose",), "TransposeCodec", 3), + ("serializer", "sharding_indexed", "ShardingCodec", 3), + ("compressors", "mock_compressor_v3", "MockCompressorRequiresConfig3", 3), + ("compressors", ("mock_compressor_v3",), "MockCompressorRequiresConfig3", 3), + ("compressors", "mock_compressor_v2", "MockCompressorRequiresConfig2", 2), + ], + ) + async def test_chunk_encoding_missing_arguments( + store: MemoryStore, argument_key, codec, codec_cls_name, zarr_format + ) -> None: + codec_key = codec if not isinstance(codec, tuple) else codec[0] + argument_key_single = argument_key.removesuffix("s") + error_msg = ( + f'A string representation for {argument_key_single} "{codec_key}" was provided which specifies codec {codec_cls_name}. But that codec ' + f"cannot be specified by a string because it takes a required configuration. Use either the dict " + f"representation of {codec_key} codec, or pass in a concrete {codec_cls_name} instance instead" + ) + if codec == "mock_compressor_v3": + + class MockCompressorRequiresConfig3(BytesBytesCodec): + def compute_encoded_size( + self, input_byte_length: int, chunk_spec: ArraySpec + ) -> int: + pass + + def __init__(self, *, argument: str) -> None: + super().__init__() + + register_codec("mock_compressor_v3", MockCompressorRequiresConfig3) + elif codec == "mock_compressor_v2": + + class MockCompressorRequiresConfig2(numcodecs.abc.Codec): + def __init__(self, *, argument: str) -> None: + super().__init__() + + def encode(self: Any, buf) -> Any: + pass + + def decode(self, buf: Any, out=None) -> Any: + pass + + numcodecs.register_codec( + MockCompressorRequiresConfig2, + "mock_compressor_v2", + ) + # string representation of a codec is only supported if codec has no required arguments + with pytest.raises(TypeError, match=re.escape(error_msg)): await create_array( store=store, dtype="uint8", shape=(10,), - zarr_format=3, - filters="transpose", + zarr_format=zarr_format, + **{argument_key: codec}, ) @staticmethod From 61c32f87c40849f7c229edfe5460d0c515abb15a Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Tue, 20 May 2025 10:57:55 +0200 Subject: [PATCH 14/17] typing --- src/zarr/core/metadata/v2.py | 4 +++- src/zarr/registry.py | 1 - tests/test_array.py | 11 +++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index c87b2c8415..40a1d02621 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -271,7 +271,9 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: elif isinstance(val, dict): out.append(numcodecs.get_codec(val)) elif isinstance(val, str): - out.append(parse_filters(val)[0]) + filter = parse_filters(val) + if filter is not None: + out.extend(filter) else: msg = f"For Zarr format 2 arrays, all elements of `filters` must be a numcodecs.abc.Codec or a dict or str representation of numcodecs.abc.Codec. Got {type(val)} at index {idx} instead." raise TypeError(msg) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index dffa4ef5b2..3cb78b0f2a 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -232,7 +232,6 @@ def _parse_array_array_codec(data: dict[str, JSON] | str | Codec) -> ArrayArrayC """ from zarr.abc.codec import ArrayArrayCodec - result: ArrayArrayCodec if isinstance(data, str): try: result = _resolve_codec({"name": data, "configuration": {}}) diff --git a/tests/test_array.py b/tests/test_array.py index 742994bf9e..732c9cc881 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -10,6 +10,7 @@ from unittest import mock import numcodecs +import numcodecs.abc import numpy as np import numpy.typing as npt import pytest @@ -1340,7 +1341,9 @@ async def test_invalid_chunk_encoding(store: MemoryStore) -> None: ("compressors", "bytes"), ], ) - async def test_chunk_encoding_wrong_type(argument_key, codec, store: MemoryStore) -> None: + async def test_chunk_encoding_wrong_type( + argument_key: str, codec: str, store: MemoryStore + ) -> None: """ Test that passing an invalid codec to create_array raises an error. """ @@ -1369,7 +1372,11 @@ async def test_chunk_encoding_wrong_type(argument_key, codec, store: MemoryStore ], ) async def test_chunk_encoding_missing_arguments( - store: MemoryStore, argument_key, codec, codec_cls_name, zarr_format + store: MemoryStore, + argument_key: str, + codec: str, + codec_cls_name: str, + zarr_format: ZarrFormat, ) -> None: codec_key = codec if not isinstance(codec, tuple) else codec[0] argument_key_single = argument_key.removesuffix("s") From 9e35f7ce0a807cd853a1339d20ced6e290f03162 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Tue, 20 May 2025 11:13:20 +0200 Subject: [PATCH 15/17] typing --- tests/test_array.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 732c9cc881..e49b8307f6 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1342,7 +1342,7 @@ async def test_invalid_chunk_encoding(store: MemoryStore) -> None: ], ) async def test_chunk_encoding_wrong_type( - argument_key: str, codec: str, store: MemoryStore + argument_key: str, codec: FiltersLike | SerializerLike | CompressorsLike, store: MemoryStore ) -> None: """ Test that passing an invalid codec to create_array raises an error. @@ -1369,12 +1369,13 @@ async def test_chunk_encoding_wrong_type( ("compressors", "mock_compressor_v3", "MockCompressorRequiresConfig3", 3), ("compressors", ("mock_compressor_v3",), "MockCompressorRequiresConfig3", 3), ("compressors", "mock_compressor_v2", "MockCompressorRequiresConfig2", 2), + ("compressors", ("mock_compressor_v2",), "MockCompressorRequiresConfig2", 2), ], ) async def test_chunk_encoding_missing_arguments( store: MemoryStore, argument_key: str, - codec: str, + codec: str | tuple[FiltersLike | SerializerLike | CompressorsLike], codec_cls_name: str, zarr_format: ZarrFormat, ) -> None: @@ -1385,7 +1386,7 @@ async def test_chunk_encoding_missing_arguments( f"cannot be specified by a string because it takes a required configuration. Use either the dict " f"representation of {codec_key} codec, or pass in a concrete {codec_cls_name} instance instead" ) - if codec == "mock_compressor_v3": + if "mock_compressor_v3" in codec: class MockCompressorRequiresConfig3(BytesBytesCodec): def compute_encoded_size( @@ -1397,16 +1398,16 @@ def __init__(self, *, argument: str) -> None: super().__init__() register_codec("mock_compressor_v3", MockCompressorRequiresConfig3) - elif codec == "mock_compressor_v2": + elif "mock_compressor_v2" in codec: class MockCompressorRequiresConfig2(numcodecs.abc.Codec): def __init__(self, *, argument: str) -> None: super().__init__() - def encode(self: Any, buf) -> Any: + def encode(self: Any, buf) -> Any: # type: ignore[no-untyped-def] pass - def decode(self, buf: Any, out=None) -> Any: + def decode(self, buf: Any, out=None) -> Any: # type: ignore[no-untyped-def] pass numcodecs.register_codec( From a3366aa9748b68e1ebcd1170147d127b00d88213 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Tue, 20 May 2025 12:10:53 +0200 Subject: [PATCH 16/17] typing in tests --- tests/test_array.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index e49b8307f6..6fc65515c7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1354,7 +1354,7 @@ async def test_chunk_encoding_wrong_type( dtype="uint8", shape=(10,), zarr_format=3, - **{argument_key: codec}, + **{argument_key: codec}, # type: ignore[arg-type] ) @staticmethod @@ -1375,7 +1375,10 @@ async def test_chunk_encoding_wrong_type( async def test_chunk_encoding_missing_arguments( store: MemoryStore, argument_key: str, - codec: str | tuple[FiltersLike | SerializerLike | CompressorsLike], + codec: FiltersLike + | SerializerLike + | CompressorsLike + | tuple[FiltersLike | SerializerLike | CompressorsLike], codec_cls_name: str, zarr_format: ZarrFormat, ) -> None: @@ -1392,15 +1395,15 @@ class MockCompressorRequiresConfig3(BytesBytesCodec): def compute_encoded_size( self, input_byte_length: int, chunk_spec: ArraySpec ) -> int: - pass + return 0 def __init__(self, *, argument: str) -> None: super().__init__() register_codec("mock_compressor_v3", MockCompressorRequiresConfig3) elif "mock_compressor_v2" in codec: - - class MockCompressorRequiresConfig2(numcodecs.abc.Codec): + # ignore mypy error because numcodecs is not typed + class MockCompressorRequiresConfig2(numcodecs.abc.Codec): # type: ignore[misc] def __init__(self, *, argument: str) -> None: super().__init__() @@ -1421,7 +1424,7 @@ def decode(self, buf: Any, out=None) -> Any: # type: ignore[no-untyped-def] dtype="uint8", shape=(10,), zarr_format=zarr_format, - **{argument_key: codec}, + **{argument_key: codec}, # type: ignore[arg-type] ) @staticmethod From 7a5bc663215838228f9c121cde080d40772dfffa Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Tue, 20 May 2025 12:18:08 +0200 Subject: [PATCH 17/17] typing in tests --- tests/test_array.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 6fc65515c7..b660495dc7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1375,10 +1375,7 @@ async def test_chunk_encoding_wrong_type( async def test_chunk_encoding_missing_arguments( store: MemoryStore, argument_key: str, - codec: FiltersLike - | SerializerLike - | CompressorsLike - | tuple[FiltersLike | SerializerLike | CompressorsLike], + codec: str | tuple[str], codec_cls_name: str, zarr_format: ZarrFormat, ) -> None: @@ -1403,7 +1400,7 @@ def __init__(self, *, argument: str) -> None: register_codec("mock_compressor_v3", MockCompressorRequiresConfig3) elif "mock_compressor_v2" in codec: # ignore mypy error because numcodecs is not typed - class MockCompressorRequiresConfig2(numcodecs.abc.Codec): # type: ignore[misc] + class MockCompressorRequiresConfig2(numcodecs.abc.Codec): # type: ignore[misc] def __init__(self, *, argument: str) -> None: super().__init__()