diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 42cd1c61..84e71097 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -1,9 +1,9 @@ from __future__ import annotations import pickle -from collections.abc import AsyncGenerator, Iterable +from collections.abc import AsyncGenerator, Iterable, Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypeAlias from urllib.parse import urlparse from zarr.abc.store import ( @@ -13,23 +13,21 @@ Store, SuffixByteRequest, ) -from zarr.core.buffer import Buffer, default_buffer_prototype -from zarr.core.buffer.core import BufferPrototype +from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype +from zarr.core.common import BytesLike -from virtualizarr.manifests.array import ManifestArray from virtualizarr.manifests.group import ManifestGroup from virtualizarr.vendor.zarr.core.metadata import dict_to_buffer if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Iterable, Mapping - from typing import Any - - import xarray as xr from obstore.store import ( ObjectStore, # type: ignore[import-not-found] ) - from zarr.core.buffer import BufferPrototype - from zarr.core.common import BytesLike + + StoreDict: TypeAlias = dict[str, ObjectStore] + + import xarray as xr + __all__ = ["ManifestStore"] @@ -51,25 +49,6 @@ class StoreRequest: """The key within the store to request.""" -async def list_dir_from_manifest_arrays( - arrays: Mapping[str, ManifestArray], prefix: str -) -> AsyncGenerator[str]: - """Create the expected results for Zarr's `store.list_dir()` from an Xarray DataArrray or Dataset - - Parameters - ---------- - arrays : Mapping[str, ManifestArrays] - prefix : str - - Returns - ------- - AsyncIterator[str] - """ - # TODO shouldn't this just accept a ManifestGroup instead? 
- # Start with expected group level metadata - raise NotImplementedError - - def get_zarr_metadata(manifest_group: ManifestGroup, key: str) -> Buffer: """ Generate the expected Zarr V3 metadata from a virtual dataset. @@ -201,6 +180,7 @@ def get_store(self, url: str) -> ObjectStore: ----------- url : str A url to identify the appropriate object_store instance based on the URL scheme and netloc. + Returns: -------- StoreRequest @@ -215,10 +195,10 @@ def get_store(self, url: str) -> ObjectStore: class ManifestStore(Store): """ - A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure. The requests - from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing - multiple :class:`virtualizarr.manifests.ManifestArray`, - allowing for virtually interfacing with underlying data in other file format. + A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure. + + The requests from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing + multiple :class:`virtualizarr.manifests.ManifestArray`, allowing for virtually interfacing with underlying data in other file formats. 
Parameters ---------- diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py index 9704c43c..59163cb5 100644 --- a/virtualizarr/readers/fits.py +++ b/virtualizarr/readers/fits.py @@ -3,15 +3,11 @@ from xarray import Dataset, Index +from virtualizarr.manifests import ManifestStore from virtualizarr.readers.api import ( VirtualBackend, ) -from virtualizarr.translators.kerchunk import ( - extract_group, - virtual_vars_and_metadata_from_kerchunk_refs, -) from virtualizarr.types.kerchunk import KerchunkStoreRefs -from virtualizarr.xarray import construct_fully_virtual_dataset class FITSVirtualBackend(VirtualBackend): @@ -40,25 +36,16 @@ def open_virtual_dataset( # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) - # both group=None and group='' mean to read root group - if group: - refs = extract_group(refs, group) - - # TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly - if loadable_variables or indexes: - raise NotImplementedError( - "Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS" - ) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + manifeststore = ManifestStore.from_kerchunk_refs( refs, + group=group, fs_root=Path.cwd().as_uri(), ) - vds = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, + vds = manifeststore.to_virtual_dataset( + group=group, + loadable_variables=loadable_variables, + indexes=indexes, ) return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py index 786b6881..322b9ea6 100644 --- a/virtualizarr/readers/hdf5.py +++ b/virtualizarr/readers/hdf5.py @@ -3,15 +3,8 @@ from xarray import Dataset, Index +from 
virtualizarr.manifests import ManifestStore from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import ( - extract_group, - virtual_vars_and_metadata_from_kerchunk_refs, -) -from virtualizarr.xarray import ( - construct_fully_virtual_dataset, - construct_virtual_dataset, -) class HDF5VirtualBackend(VirtualBackend): @@ -41,29 +34,16 @@ def open_virtual_dataset( filepath, inline_threshold=0, **reader_options ).translate() - # both group=None and group='' mean to read root group - if group: - refs = extract_group(refs, group) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + manifeststore = ManifestStore.from_kerchunk_refs( refs, + group=group, fs_root=Path.cwd().as_uri(), ) - fully_virtual_dataset = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - vds = construct_virtual_dataset( - fully_virtual_ds=fully_virtual_dataset, - filepath=filepath, + vds = manifeststore.to_virtual_dataset( group=group, loadable_variables=loadable_variables, - reader_options=reader_options, indexes=indexes, - decode_times=decode_times, ) return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 6e2e7819..d078fe9d 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -5,7 +5,7 @@ from xarray import Dataset, Index from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs +from virtualizarr.translators.kerchunk import manifeststore_from_kerchunk_refs from virtualizarr.types.kerchunk import ( KerchunkStoreRefs, ) @@ -72,11 +72,7 @@ def open_virtual_dataset( # is there a better / more performant way to extract this? 
array_refs = {k: lrm[k] for k in lrm.keys()} - full_reference = {"refs": array_refs} - - vds = dataset_from_kerchunk_refs( - KerchunkStoreRefs(full_reference), fs_root=fs_root - ) + refs = KerchunkStoreRefs({"refs": array_refs}) # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': # https://fsspec.github.io/kerchunk/spec.html @@ -84,12 +80,22 @@ def open_virtual_dataset( with fs.open_file() as of: refs = ujson.load(of) - vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root) - else: raise ValueError( "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" ) + manifeststore = manifeststore_from_kerchunk_refs( + refs, + group=group, + fs_root=fs_root, + ) + + vds = manifeststore.to_virtual_dataset( + group=group, + loadable_variables=loadable_variables, + indexes=indexes, + ) + # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. 
drop them from the kerchunk refs dict return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py index c1917ed5..5ff8beb7 100644 --- a/virtualizarr/readers/netcdf3.py +++ b/virtualizarr/readers/netcdf3.py @@ -3,14 +3,8 @@ from xarray import Dataset, Index +from virtualizarr.manifests import ManifestStore from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import ( - virtual_vars_and_metadata_from_kerchunk_refs, -) -from virtualizarr.xarray import ( - construct_fully_virtual_dataset, - construct_virtual_dataset, -) class NetCDF3VirtualBackend(VirtualBackend): @@ -44,25 +38,16 @@ def open_virtual_dataset( "group kwarg passed, but netCDF3 files can't have multiple groups!" ) - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + manifeststore = ManifestStore.from_kerchunk_refs( refs, + group=group, fs_root=Path.cwd().as_uri(), ) - fully_virtual_dataset = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - vds = construct_virtual_dataset( - fully_virtual_ds=fully_virtual_dataset, - filepath=filepath, + vds = manifeststore.to_virtual_dataset( group=group, loadable_variables=loadable_variables, - reader_options=reader_options, indexes=indexes, - decode_times=decode_times, ) return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py index f4dbcfe7..a0401bf7 100644 --- a/virtualizarr/readers/tiff.py +++ b/virtualizarr/readers/tiff.py @@ -4,16 +4,9 @@ from xarray import Dataset, Index +from virtualizarr.manifests import ManifestStore from virtualizarr.readers.api import VirtualBackend -from virtualizarr.translators.kerchunk import ( - extract_group, - virtual_vars_and_metadata_from_kerchunk_refs, -) from virtualizarr.types.kerchunk import KerchunkStoreRefs -from virtualizarr.xarray import ( - construct_fully_virtual_dataset, - 
construct_virtual_dataset, -) class TIFFVirtualBackend(VirtualBackend): @@ -51,29 +44,16 @@ def open_virtual_dataset( # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)}) - # both group=None and group='' mean to read root group - if group: - refs = extract_group(refs, group) - - virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + manifeststore = ManifestStore.from_kerchunk_refs( refs, + group=group, fs_root=Path.cwd().as_uri(), ) - fully_virtual_dataset = construct_fully_virtual_dataset( - virtual_vars=virtual_vars, - coord_names=coord_names, - attrs=attrs, - ) - - vds = construct_virtual_dataset( - fully_virtual_ds=fully_virtual_dataset, - filepath=filepath, + vds = manifeststore.to_virtual_dataset( group=group, loadable_variables=loadable_variables, - reader_options=reader_options, indexes=indexes, - decode_times=decode_times, ) return vds.drop_vars(_drop_vars) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index af61df82..1eb8a6a8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -11,7 +11,7 @@ from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC from virtualizarr import open_virtual_dataset from virtualizarr.backend import VirtualBackend -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests import ChunkManifest, ManifestArray, ManifestStore from virtualizarr.tests import ( has_fastparquet, has_icechunk, @@ -20,9 +20,6 @@ requires_kerchunk, requires_zarr_python, ) -from virtualizarr.translators.kerchunk import ( - dataset_from_kerchunk_refs, -) RoundtripFunction: TypeAlias = Callable[ Concatenate[xr.Dataset | xr.DataTree, Path, ...], xr.Dataset | xr.DataTree @@ -45,8 +42,9 @@ def test_kerchunk_roundtrip_in_memory_no_concat(array_v3_metadata): # Use accessor to write it 
out to kerchunk reference dict ds_refs = vds.virtualize.to_kerchunk(format="dict") - # Use dataset_from_kerchunk_refs to reconstruct the dataset - roundtrip = dataset_from_kerchunk_refs(ds_refs) + # reconstruct the dataset + manifest_store = ManifestStore.from_kerchunk_refs(ds_refs) + roundtrip = manifest_store.to_virtual_dataset(loadable_variables=[]) # Assert equal to original dataset xrt.assert_equal(roundtrip, vds) diff --git a/virtualizarr/translators/kerchunk.py b/virtualizarr/translators/kerchunk.py index 8d9fb4c3..2c26c687 100644 --- a/virtualizarr/translators/kerchunk.py +++ b/virtualizarr/translators/kerchunk.py @@ -1,9 +1,6 @@ -from typing import Any, Mapping, MutableMapping, cast +from typing import cast import numpy as np -from xarray import Dataset -from xarray.core.indexes import Index -from xarray.core.variable import Variable from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata from zarr.core.metadata.v2 import ArrayV2Metadata @@ -11,7 +8,12 @@ from virtualizarr.codecs import ( numcodec_config_to_configurable, ) -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests import ( + ChunkManifest, + ManifestArray, + ManifestGroup, + ManifestStore, +) from virtualizarr.manifests.manifest import ChunkEntry, ChunkKey from virtualizarr.manifests.utils import create_v3_array_metadata from virtualizarr.types.kerchunk import ( @@ -19,7 +21,6 @@ KerchunkStoreRefs, ) from virtualizarr.utils import determine_chunk_grid_shape -from virtualizarr.xarray import separate_coords def to_kerchunk_json(v2_metadata: ArrayV2Metadata) -> str: @@ -93,32 +94,46 @@ def from_kerchunk_refs(decoded_arr_refs_zarray) -> "ArrayV3Metadata": ) -def virtual_vars_and_metadata_from_kerchunk_refs( - vds_refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, +def manifeststore_from_kerchunk_refs( + refs: KerchunkStoreRefs, + group: str | None = None, fs_root: str | None = None, -) -> tuple[Mapping[str, Variable], 
dict[str, Any], list[str]]: +) -> ManifestStore: """ - Parses all useful information from a set kerchunk references (for a single group). + Construct a ManifestStore from a dictionary of kerchunk references. Parameters ---------- - drop_variables - Variables in the file to not bother generating chunk metadata for. - fs_root + refs: dict + The Kerchunk references, as a dictionary. + group: string, optional + The group within the references to open; both None and '' mean the root group (default). + fs_root: string, optional The root of the fsspec filesystem on which these references were generated. Required if any paths are relative in order to turn them into absolute paths (which virtualizarr requires). """ + # TODO teach this method to understand kerchunk json/parquet filepaths too? Then it would be a full "reader" - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables, - fs_root=fs_root, - ) - ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) + # both group=None and group='' mean to read root group + if group: + refs = extract_group(refs, group) - return virtual_vars, ds_attrs, coord_names + arr_names = find_var_names(refs) + # TODO do we need drop_variables here? + + # TODO support iterating over multiple nested groups + marrs = { + arr_name: manifestarray_from_kerchunk_refs(refs, arr_name, fs_root=fs_root) + for arr_name in arr_names + } + + # TODO probably need to parse the group-level attributes more here + attributes = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) + + manifestgroup = ManifestGroup(arrays=marrs, attributes=attributes) + + # TODO what should the obstore store be? 
+ return ManifestStore(group=manifestgroup) def extract_group(vds_refs: KerchunkStoreRefs, group: str) -> KerchunkStoreRefs: @@ -160,79 +175,23 @@ def extract_group(vds_refs: KerchunkStoreRefs, group: str) -> KerchunkStoreRefs: return KerchunkStoreRefs(vds_refs) -def virtual_vars_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, - fs_root: str | None = None, -) -> dict[str, Variable]: - """ - Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. - - Parameters - ---------- - drop_variables: list[str], default is None - Variables in the file to drop before returning. - """ - - var_names = find_var_names(refs) - if drop_variables is None: - drop_variables = [] - var_names_to_keep = [ - var_name for var_name in var_names if var_name not in drop_variables - ] - - vars = { - var_name: variable_from_kerchunk_refs(refs, var_name, fs_root=fs_root) - for var_name in var_names_to_keep - } - return vars - - -def dataset_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] = [], - indexes: MutableMapping[str, Index] | None = None, - fs_root: str | None = None, -) -> Dataset: - """ - Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. - - drop_variables: list[str], default is None - Variables in the file to drop before returning. 
- """ - - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, fs_root=fs_root) - ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None: - indexes = {} - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return vds - - -def variable_from_kerchunk_refs( +def manifestarray_from_kerchunk_refs( refs: KerchunkStoreRefs, var_name: str, fs_root: str | None = None, -) -> Variable: - """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" +) -> ManifestArray: + """Create a single ManifestArray by reading specific keys of a kerchunk references dict.""" arr_refs = extract_array_refs(refs, var_name) + + # TODO probably need to update internals of this to use ArrayV3Metadata more neatly chunk_dict, metadata, zattrs = parse_array_refs(arr_refs) # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs dims = zattrs.pop("_ARRAY_DIMENSIONS") + if chunk_dict: manifest = manifest_from_kerchunk_chunk_dict(chunk_dict, fs_root=fs_root) - varr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) elif len(metadata.shape) != 0: # empty variables don't have physical chunks, but zarray shows that the variable # is at least 1D @@ -242,14 +201,14 @@ def variable_from_kerchunk_refs( metadata.chunks, ) manifest = ChunkManifest(entries={}, shape=shape) - varr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) else: # This means we encountered a scalar variable of dimension 0, # very likely that it actually has no numeric value and its only purpose # is to communicate dataset attributes. 
- varr = metadata.fill_value + marr = metadata.fill_value - return Variable(data=varr, dims=dims, attrs=zattrs) + return marr def manifest_from_kerchunk_chunk_dict(