from __future__ import annotations

+ import math
import os
from collections.abc import Mapping
from typing import (
...
from numcodecs.abc import Codec
from pydantic import AfterValidator, model_validator
from pydantic.functional_validators import BeforeValidator
+ from zarr.abc.store import Store
+ from zarr.core.sync_group import get_node
from zarr.errors import ContainsArrayError, ContainsGroupError
- from zarr.storage import BaseStore, contains_array, contains_group, init_group
- from zarr.util import guess_chunks

from pydantic_zarr.core import (
    IncEx,
...
TItem = TypeVar("TItem", bound=Union["GroupSpec", "ArraySpec"])


+ def _contains_array(store: Store, path: str) -> bool:
+     try:
+         return isinstance(get_node(store, path, zarr_format=2), zarr.Array)
+     except FileNotFoundError:
+         return False
+
+
+ def _contains_group(store: Store, path: str) -> bool:
+     try:
+         return isinstance(get_node(store, path, zarr_format=2), zarr.Group)
+     except FileNotFoundError:
+         return False
+
+
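These helpers stand in for the `contains_array` / `contains_group` functions that zarr-python 3 no longer ships in `zarr.storage`; both now report `False` for a missing path instead of raising. A minimal sketch of the intended behaviour, assuming an in-memory `zarr.storage.MemoryStore` and that the module above is importable as `pydantic_zarr.v2` (the import path is illustrative):

```python
import zarr
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import _contains_array, _contains_group  # hypothetical import path

store = MemoryStore()
zarr.create(store=store, path="foo", shape=(10,), dtype="uint8", zarr_format=2)

assert _contains_array(store, "foo")      # a v2 array node exists at "foo"
assert not _contains_array(store, "bar")  # a missing path reports False instead of raising
assert not _contains_group(store, "foo")  # an array node is not a group
```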
def stringify_dtype(value: npt.DTypeLike) -> str:
    """
    Convert a `numpy.dtype` object into a `str`.
@@ -318,14 +330,14 @@ def from_zarr(cls, array: zarr.Array) -> Self:
            fill_value=array.dtype.type(array.fill_value).tolist(),
            order=array.order,
            filters=array.filters,
-             dimension_separator=array._dimension_separator,
-             compressor=array.compressor,
+             dimension_separator=array.metadata.dimension_separator,
+             compressor=array.compressors[0].get_config() if array.compressors else None,
            attributes=array.attrs.asdict(),
        )

    def to_zarr(
        self,
-         store: BaseStore,
+         store: Store,
        path: str,
        *,
        overwrite: bool = False,
@@ -337,14 +349,15 @@ def to_zarr(
        Parameters
        ----------
-         store : instance of zarr.BaseStore
+         store : instance of zarr.abc.store.Store
            The storage backend that will manifest the array.
        path : str
            The location of the array inside the store.
        overwrite : bool, default = False
            Whether to overwrite existing objects in storage to create the Zarr array.
        **kwargs : Any
            Additional keyword arguments are passed to `zarr.create`.
+
        Returns
        -------
        zarr.Array
@@ -356,24 +369,20 @@ def to_zarr(
            spec_dict["compressor"] = numcodecs.get_codec(spec_dict["compressor"])
        if self.filters is not None:
            spec_dict["filters"] = [numcodecs.get_codec(f) for f in spec_dict["filters"]]
-         if contains_array(store, path):
-             extant_array = zarr.open_array(store, path=path, mode="r")
+         if _contains_array(store, path):
+             extant_array = zarr.open_array(store, path=path, mode="r", zarr_format=2)

            if not self.like(extant_array):
                if not overwrite:
-                     msg = (
-                         f"An array already exists at path {path}. "
-                         "That array is structurally dissimilar to the array you are trying to "
-                         "store. Call to_zarr with overwrite=True to overwrite that array."
-                     )
-                     raise ContainsArrayError(msg)
+                     raise ContainsArrayError(store, path)
            else:
                if not overwrite:
                    # extant_array is read-only, so we make a new array handle that
                    # takes **kwargs
                    return zarr.open_array(
-                         store=extant_array.store, path=extant_array.path, **kwargs
+                         store=extant_array.store, path=extant_array.path, zarr_format=2, **kwargs
                    )
+         spec_dict["zarr_format"] = spec_dict.pop("zarr_version", 2)
        result = zarr.create(store=store, path=path, overwrite=overwrite, **spec_dict, **kwargs)
        result.attrs.put(attrs)
        return result
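For orientation, the ported method should still support the usual round trip through the public `ArraySpec` API; a minimal sketch against zarr-python 3, assuming `zarr.storage.MemoryStore` and the illustrative `pydantic_zarr.v2` import path:

```python
import numpy as np
import zarr
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import ArraySpec  # hypothetical import path

store = MemoryStore()
spec = ArraySpec.from_array(np.arange(12, dtype="int32"))

arr = spec.to_zarr(store, "data")       # creates a Zarr v2 array and writes its attributes
arr[:] = np.arange(12, dtype="int32")   # the returned handle is an ordinary zarr.Array

roundtrip = ArraySpec.from_zarr(arr)
assert roundtrip.shape == spec.shape and roundtrip.dtype == spec.dtype
```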
@@ -519,13 +528,14 @@ def from_zarr(cls, group: zarr.Group, *, depth: int = -1) -> Self:
        result = cls(attributes=attributes, members=members)
        return result

-     def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwargs):
+     def to_zarr(self, store: Store, path: str, *, overwrite: bool = False, **kwargs):
        """
-         Serialize this `GroupSpec` to a Zarr group at a specific path in a `zarr.BaseStore`.
+         Serialize this `GroupSpec` to a Zarr group at a specific path in a `zarr.abc.store.Store`.
        This operation will create metadata documents in the store.
+
        Parameters
        ----------
-         store : zarr.BaseStore
+         store : zarr.abc.store.Store
            The storage backend that will manifest the group and its contents.
        path : str
            The location of the group inside the store.
@@ -542,7 +552,7 @@ def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwa
        """
        spec_dict = self.model_dump(exclude={"members": True})
        attrs = spec_dict.pop("attributes")
-         if contains_group(store, path):
+         if _contains_group(store, path):
            extant_group = zarr.group(store, path=path)
            if not self.like(extant_group):
                if not overwrite:
@@ -558,14 +568,14 @@ def to_zarr(self, store: BaseStore, path: str, *, overwrite: bool = False, **kwa
                    # then just return the extant group
                    return extant_group

-         elif contains_array(store, path) and not overwrite:
-             msg = (
-                 f"An array already exists at path {path}. "
-                 "Call to_zarr with overwrite=True to overwrite the array."
-             )
-             raise ContainsArrayError(msg)
+         elif _contains_array(store, path) and not overwrite:
+             raise ContainsArrayError(store, path)
        else:
-             init_group(store=store, overwrite=overwrite, path=path)
+             zarr.create_group(store=store, overwrite=overwrite, path=path, zarr_format=2)

        result = zarr.group(store=store, path=path, overwrite=overwrite)
        result.attrs.put(attrs)
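The group path follows the same pattern; a short sketch, again assuming `MemoryStore` and the illustrative `pydantic_zarr.v2` import path, with an arbitrary member named "data":

```python
import numpy as np
import zarr
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import ArraySpec, GroupSpec  # hypothetical import path

spec = GroupSpec(
    attributes={"description": "example"},
    members={"data": ArraySpec.from_array(np.zeros((4, 4), dtype="float32"))},
)

store = MemoryStore()
group = spec.to_zarr(store, "root")  # creates the group, then each member beneath it
assert isinstance(group, zarr.Group)
assert GroupSpec.from_zarr(group).attributes == {"description": "example"}
```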
@@ -746,7 +756,7 @@ def from_zarr(element: zarr.Array | zarr.Group, depth: int = -1) -> ArraySpec |
@overload
def to_zarr(
    spec: ArraySpec,
-     store: BaseStore,
+     store: Store,
    path: str,
    *,
    overwrite: bool = False,
@@ -757,7 +767,7 @@ def to_zarr(
@overload
def to_zarr(
    spec: GroupSpec,
-     store: BaseStore,
+     store: Store,
    path: str,
    *,
    overwrite: bool = False,
@@ -767,7 +777,7 @@

def to_zarr(
    spec: ArraySpec | GroupSpec,
-     store: BaseStore,
+     store: Store,
    path: str,
    *,
    overwrite: bool = False,
@@ -781,7 +791,7 @@ def to_zarr(
    ----------
    spec : ArraySpec | GroupSpec
        The `GroupSpec` or `ArraySpec` that will be serialized to storage.
-     store : zarr.BaseStore
+     store : zarr.abc.store.Store
        The storage backend that will manifest the Zarr group or array modeled by `spec`.
    path : str
        The location of the Zarr group or array inside the store.
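The module-level `to_zarr` dispatches to the `to_zarr` method of whichever spec it receives, so the overloads above let the return type track the spec type. A brief sketch under the same assumptions as the earlier examples:

```python
import numpy as np
from zarr.storage import MemoryStore

from pydantic_zarr.v2 import ArraySpec, GroupSpec, to_zarr  # hypothetical import path

store = MemoryStore()

arr = to_zarr(ArraySpec.from_array(np.ones((3,))), store, "a")   # returns a zarr.Array
grp = to_zarr(GroupSpec(attributes={}, members={}), store, "g")  # returns a zarr.Group
```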
@@ -985,7 +995,7 @@ def auto_chunks(data: Any) -> tuple[int, ...]:
        return data.chunksize
    if hasattr(data, "chunks"):
        return data.chunks
-     return guess_chunks(data.shape, np.dtype(data.dtype).itemsize)
+     return _guess_chunks(data.shape, np.dtype(data.dtype).itemsize)


def auto_attributes(data: Any) -> Mapping[str, Any]:
@@ -1045,3 +1055,55 @@ def auto_dimension_separator(data: Any) -> Literal["/", "."]:
    if hasattr(data, "dimension_separator"):
        return data.dimension_separator
    return "/"
+
+
+ def _guess_chunks(shape: tuple[int, ...], typesize: int) -> tuple[int, ...]:
+     """
+     Vendored from zarr-python v2.
+
+     Guess an appropriate chunk layout for an array, given its shape and
+     the size of each element in bytes. Will allocate chunks only as large
+     as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of
+     each axis, slightly favoring bigger values for the last index.
+     Undocumented and subject to change without warning.
+     """
+
+     CHUNK_BASE = 256 * 1024  # Multiplier by which chunks are adjusted
+     CHUNK_MIN = 128 * 1024  # Soft lower limit (128k)
+     CHUNK_MAX = 64 * 1024 * 1024  # Hard upper limit
+
+     ndims = len(shape)
+     # require chunks to have non-zero length for all dimensions
+     chunks = np.maximum(np.array(shape, dtype="=f8"), 1)
+
+     # Determine the optimal chunk size in bytes using a PyTables expression.
+     # This is kept as a float.
+     dset_size = np.prod(chunks) * typesize
+     target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))
+
+     if target_size > CHUNK_MAX:
+         target_size = CHUNK_MAX
+     elif target_size < CHUNK_MIN:
+         target_size = CHUNK_MIN
+
+     idx = 0
+     while True:
+         # Repeatedly loop over the axes, dividing them by 2. Stop when:
+         # 1a. We're smaller than the target chunk size, OR
+         # 1b. We're within 50% of the target chunk size, AND
+         # 2. The chunk is smaller than the maximum chunk size
+
+         chunk_bytes = np.prod(chunks) * typesize
+
+         if (
+             chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5
+         ) and chunk_bytes < CHUNK_MAX:
+             break
+
+         if np.prod(chunks) == 1:
+             break  # Element size larger than CHUNK_MAX
+
+         chunks[idx % ndims] = math.ceil(chunks[idx % ndims] / 2.0)
+         idx += 1
+
+     return tuple(int(x) for x in chunks)
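For a sense of what the vendored heuristic produces, a small sketch (the exact chunk shape depends on the float arithmetic above, so only rough properties are asserted; the import path is illustrative):

```python
from pydantic_zarr.v2 import _guess_chunks  # hypothetical import path

# A 10000 x 10000 float64 array is ~800 MB, so the loop halves alternate axes
# until one chunk falls within ~50% of the ~2 MB target (and under the 64 MiB cap).
chunks = _guess_chunks((10_000, 10_000), typesize=8)
assert all(1 <= c <= 10_000 for c in chunks)

# Zero-length axes are clamped so every chunk edge is at least 1.
assert all(c >= 1 for c in _guess_chunks((0, 5), typesize=4))
```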