Refactoring/fixing zarr-python v3 incompatibilities in xarray datatrees #10020
base: main

Changes from all commits:
0a2a49e
ae80662
379db18
846dc50
ddfd0b5
3f9a8fb
f140658
0e790eb
403afa9
58e8f8e
fd357fa
8b993a1
d4aeeca
3125647
0c7485b
fdeee94
f3e2c66
f9f1043
c118841
ec2086a
@@ -15,7 +15,7 @@
     requires_dask,
     requires_h5netcdf,
     requires_netCDF4,
-    requires_zarr,
+    requires_zarr_v3,
 )

 if TYPE_CHECKING:
@@ -141,14 +141,16 @@ def unaligned_datatree_zarr(tmp_path_factory):
         a        (y) int64 16B ...
         b        (x) float64 16B ...
     """
+    from zarr import consolidate_metadata
+
     filepath = tmp_path_factory.mktemp("data") / "unaligned_simple_datatree.zarr"
     root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
     set1_data = xr.Dataset({"a": 0, "b": 1})
     set2_data = xr.Dataset({"a": ("y", [2, 3]), "b": ("x", [0.1, 0.2])})
     root_data.to_zarr(filepath)
     set1_data.to_zarr(filepath, group="/Group1", mode="a")
     set2_data.to_zarr(filepath, group="/Group2", mode="a")
-    set1_data.to_zarr(filepath, group="/Group1/subgroup1", mode="a")
+    consolidate_metadata(filepath)
Review comment (inline, on the consolidate_metadata(filepath) line): something seems off here. IIUC, the prior behavior consolidated metadata at the root of the store […]

Author reply: @jhamman, I am not sure what is happening in this case, but if we don't add […]. On the other hand, I also removed the nested group […]

     yield filepath
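For readers following the review thread above: the fixture now consolidates metadata once at the store root instead of relying on whatever consolidation `to_zarr` performs per group. A minimal standalone sketch of that pattern under zarr-python v3 (the path and group names here are illustrative, not from the PR):

```python
import xarray as xr
import zarr

# Write a root group plus one child group into the same store, then
# consolidate metadata once at the root so a single read can discover
# the whole hierarchy.
path = "demo_datatree.zarr"  # hypothetical scratch path
xr.Dataset({"a": ("y", [6, 7, 8])}).to_zarr(path, mode="w")
xr.Dataset({"b": ("x", [0.1, 0.2])}).to_zarr(path, group="/Group1", mode="a")
zarr.consolidate_metadata(path)

# Without the consolidate_metadata call above, opening the tree can emit
# the "Failed to open Zarr store with consolidated metadata" RuntimeWarning
# that several tests in this diff filter out.
with xr.open_datatree(path, engine="zarr") as dt:
    print(dt)
```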
@@ -373,15 +375,12 @@ class TestH5NetCDFDatatreeIO(DatatreeIOBase):
     engine: T_DataTreeNetcdfEngine | None = "h5netcdf"


-@pytest.mark.skipif(
-    have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet"
-)
-@requires_zarr
+@requires_zarr_v3
 class TestZarrDatatreeIO:
     engine = "zarr"

     def test_to_zarr(self, tmpdir, simple_datatree):
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt = simple_datatree
         original_dt.to_zarr(filepath)
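The class-level guard flips from "skip under zarr v3" to "require zarr v3". The `requires_zarr_v3` marker is imported from xarray's test helpers (see the first hunk); a hypothetical reimplementation, only to show what such a marker typically looks like (the real definition lives in xarray's test utilities and may differ):

```python
import pytest

# Hypothetical stand-in for xarray.tests' requires_zarr_v3 helper.
try:
    import zarr

    has_zarr_v3 = zarr.__version__.startswith("3")
except ImportError:
    has_zarr_v3 = False

requires_zarr_v3 = pytest.mark.skipif(not has_zarr_v3, reason="requires zarr v3")
```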
@@ -391,16 +390,31 @@ def test_to_zarr(self, tmpdir, simple_datatree):
     def test_zarr_encoding(self, tmpdir, simple_datatree):
         from numcodecs.blosc import Blosc

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt = simple_datatree

-        comp = {"compressor": Blosc(cname="zstd", clevel=3, shuffle=2)}
+        blosc = Blosc(cname="zstd", clevel=3, shuffle="shuffle").get_config()
+        comp = {"compressor": {"name": blosc.pop("id"), "configuration": blosc}}
         enc = {"/set2": {var: comp for var in original_dt["/set2"].dataset.data_vars}}
         original_dt.to_zarr(filepath, encoding=enc)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
-            print(roundtrip_dt["/set2/a"].encoding)
-            assert roundtrip_dt["/set2/a"].encoding["compressor"] == comp["compressor"]
+            retrieved_compressor = roundtrip_dt["/set2/a"].encoding["compressors"][
+                0
+            ]  # Get the BloscCodec object
+            assert (
+                retrieved_compressor.cname.name
+                == comp["compressor"]["configuration"]["cname"]
+            )
+            assert (
+                retrieved_compressor.clevel
+                == comp["compressor"]["configuration"]["clevel"]
+            )
+            assert (
+                retrieved_compressor.shuffle.name
+                == comp["compressor"]["configuration"]["shuffle"]
+            )

         enc["/not/a/group"] = {"foo": "bar"}  # type: ignore[dict-item]
         with pytest.raises(ValueError, match="unexpected encoding group.*"):
@@ -409,9 +423,9 @@ def test_zarr_encoding(self, tmpdir, simple_datatree):
     def test_to_zarr_zip_store(self, tmpdir, simple_datatree):
         from zarr.storage import ZipStore

-        filepath = tmpdir / "test.zarr.zip"
+        filepath = str(tmpdir / "test.zarr.zip")
         original_dt = simple_datatree
-        store = ZipStore(filepath)
+        store = ZipStore(filepath, mode="w")
         original_dt.to_zarr(store)

         with open_datatree(store, engine="zarr") as roundtrip_dt:  # type: ignore[arg-type, unused-ignore]
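Note the explicit `mode="w"` when constructing the `ZipStore`: in zarr-python v3 the store apparently defaults to read-only, so the test must opt into writing. A sketch of the write/read round trip outside pytest (the path is illustrative):

```python
import xarray as xr
from zarr.storage import ZipStore

# Open the zip-backed store explicitly for writing, write, then close so
# the zip central directory is flushed to disk.
store = ZipStore("demo.zarr.zip", mode="w")  # hypothetical path
xr.Dataset({"a": ("x", [1, 2, 3])}).to_zarr(store)
store.close()

# Re-open read-only for verification.
with xr.open_dataset(ZipStore("demo.zarr.zip", mode="r"), engine="zarr") as ds:
    print(ds)
```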
@@ -432,32 +446,29 @@ def test_to_zarr_not_consolidated(self, tmpdir, simple_datatree):
         assert_equal(original_dt, roundtrip_dt)

     def test_to_zarr_default_write_mode(self, tmpdir, simple_datatree):
-        import zarr
-
-        simple_datatree.to_zarr(tmpdir)
+        simple_datatree.to_zarr(str(tmpdir))

         # with default settings, to_zarr should not overwrite an existing dir
-        with pytest.raises(zarr.errors.ContainsGroupError):
-            simple_datatree.to_zarr(tmpdir)
+        with pytest.raises(FileExistsError):
+            simple_datatree.to_zarr(str(tmpdir))

     @requires_dask
     def test_to_zarr_compute_false(self, tmpdir, simple_datatree):
         import dask.array as da

         filepath = tmpdir / "test.zarr"
         original_dt = simple_datatree.chunk()
-        original_dt.to_zarr(filepath, compute=False)
+        original_dt.to_zarr(str(filepath), compute=False)

         for node in original_dt.subtree:
             for name, variable in node.dataset.variables.items():
                 var_dir = filepath / node.path / name
                 var_files = var_dir.listdir()
-                assert var_dir / ".zarray" in var_files
-                assert var_dir / ".zattrs" in var_files
+                assert var_dir / "zarr.json" in var_files
                 if isinstance(variable.data, da.Array):
-                    assert var_dir / "0" not in var_files
+                    assert var_dir / "zarr.json" in var_files
                 else:
-                    assert var_dir / "0" in var_files
+                    assert var_dir / "c" in var_files

     def test_to_zarr_inherited_coords(self, tmpdir):
         original_dt = DataTree.from_dict(
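The rewritten assertions track zarr's on-disk format change: v2 stored per-array `.zarray`/`.zattrs` metadata files and chunks named `0`, `0.0`, …, while v3 keeps a single `zarr.json` per group or array and nests chunk files under a `c/` prefix. A quick way to inspect the layout yourself (the scratch path is hypothetical, and `zarr_format=3` assumes an xarray recent enough to accept that argument):

```python
import pathlib
import xarray as xr

path = pathlib.Path("layout_demo.zarr")  # hypothetical scratch path
xr.Dataset({"a": ("y", [1, 2, 3])}).to_zarr(path, mode="w", zarr_format=3)

# Print every file the store contains, relative to its root.
for p in sorted(path.rglob("*")):
    if p.is_file():
        print(p.relative_to(path))
# Expected v3 layout, roughly:
#   zarr.json      <- group metadata (replaces .zgroup/.zattrs)
#   a/zarr.json    <- array metadata (replaces .zarray/.zattrs)
#   a/c/0          <- chunk data under the "c/" prefix
```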
@@ -466,7 +477,7 @@ def test_to_zarr_inherited_coords(self, tmpdir):
                 "/sub": xr.Dataset({"b": (("x",), [5, 6])}),
             }
         )
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
@@ -476,7 +487,7 @@ def test_to_zarr_inherited_coords(self, tmpdir):
     def test_open_groups_round_trip(self, tmpdir, simple_datatree) -> None:
         """Test `open_groups` opens a zarr store with the `simple_datatree` structure."""
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt = simple_datatree
         original_dt.to_zarr(filepath)
@@ -501,7 +512,7 @@ def test_open_datatree(self, unaligned_datatree_zarr) -> None:

     @requires_dask
     def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None:
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")

         chunks = {"x": 2, "y": 1}
@@ -527,9 +538,8 @@ def test_open_groups(self, unaligned_datatree_zarr) -> None:
         unaligned_dict_of_datasets = open_groups(unaligned_datatree_zarr, engine="zarr")

         assert "/" in unaligned_dict_of_datasets.keys()
-        assert "/Group1" in unaligned_dict_of_datasets.keys()
-        assert "/Group1/subgroup1" in unaligned_dict_of_datasets.keys()
-        assert "/Group2" in unaligned_dict_of_datasets.keys()
+        assert "Group1" in unaligned_dict_of_datasets.keys()
+        assert "Group2" in unaligned_dict_of_datasets.keys()
         # Check that group name returns the correct datasets
         with xr.open_dataset(
             unaligned_datatree_zarr, group="/", engine="zarr"
@@ -538,22 +548,20 @@ def test_open_groups(self, unaligned_datatree_zarr) -> None:
         with xr.open_dataset(
             unaligned_datatree_zarr, group="Group1", engine="zarr"
         ) as expected:
-            assert_identical(unaligned_dict_of_datasets["/Group1"], expected)
-        with xr.open_dataset(
-            unaligned_datatree_zarr, group="/Group1/subgroup1", engine="zarr"
-        ) as expected:
-            assert_identical(unaligned_dict_of_datasets["/Group1/subgroup1"], expected)
+            assert_identical(unaligned_dict_of_datasets["Group1"], expected)
         with xr.open_dataset(
-            unaligned_datatree_zarr, group="/Group2", engine="zarr"
+            unaligned_datatree_zarr, group="Group2", engine="zarr"
         ) as expected:
-            assert_identical(unaligned_dict_of_datasets["/Group2"], expected)
-
+            assert_identical(unaligned_dict_of_datasets["Group2"], expected)
         for ds in unaligned_dict_of_datasets.values():
             ds.close()

+    @pytest.mark.filterwarnings(
+        "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning"
+    )
     def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
         """Test opening a specific group within a Zarr store using `open_datatree`."""
-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         group = "/set2"
         original_dt = simple_datatree
         original_dt.to_zarr(filepath)
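The key changes above ("/Group1" → "Group1") reflect how group paths come back from the zarr v3 code path when walking the hierarchy. A sketch of inspecting those keys against a fixture-style store (the store name is illustrative):

```python
import xarray as xr

# open_groups returns a dict mapping group paths to Datasets; under the
# zarr v3 code path the non-root groups appear without a leading slash.
groups = xr.open_groups("unaligned_simple_datatree.zarr", engine="zarr")
print(list(groups))  # e.g. ["/", "Group1", "Group2"]

# Each entry is a plain Dataset and must be closed by the caller, just
# as the test does.
for ds in groups.values():
    ds.close()
```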
@@ -568,10 +576,7 @@ def test_open_groups_chunks(self, tmpdir) -> None:
         """Test `open_groups` with chunks on a zarr store."""

-        chunks = {"x": 2, "y": 1}
-        filepath = tmpdir / "test.zarr"
-
         chunks = {"x": 2, "y": 1}

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])})
         set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])})
         set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])})
@@ -605,13 +610,16 @@ def test_write_subgroup(self, tmpdir):
         expected_dt = original_dt.copy()
         expected_dt.name = None

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
             assert_equal(original_dt, roundtrip_dt)
             assert_identical(expected_dt, roundtrip_dt)

+    @pytest.mark.filterwarnings(
+        "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning"
+    )
     def test_write_inherited_coords_false(self, tmpdir):
         original_dt = DataTree.from_dict(
             {
@@ -620,7 +628,7 @@ def test_write_inherited_coords_false(self, tmpdir):
             }
         )

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath, write_inherited_coords=False)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
@@ -631,6 +639,9 @@ def test_write_inherited_coords_false(self, tmpdir):
         with open_datatree(filepath, group="child", engine="zarr") as roundtrip_child:
             assert_identical(expected_child, roundtrip_child)

+    @pytest.mark.filterwarnings(
+        "ignore:Failed to open Zarr store with consolidated metadata:RuntimeWarning"
+    )
     def test_write_inherited_coords_true(self, tmpdir):
         original_dt = DataTree.from_dict(
             {
@@ -639,7 +650,7 @@ def test_write_inherited_coords_true(self, tmpdir):
             }
         )

-        filepath = tmpdir / "test.zarr"
+        filepath = str(tmpdir / "test.zarr")
         original_dt.to_zarr(filepath, write_inherited_coords=True)

         with open_datatree(filepath, engine="zarr") as roundtrip_dt:
Review comment: :( we should be providing a better error in Zarr. Do you have an example traceback that raises this?

Author reply: I think we need to provide a better error here too. I realized it yesterday. Not sure why we handle it like that.