Skip to content

Refactor kerchunk readers to use manifest store #547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 14 additions & 34 deletions virtualizarr/manifests/store.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from __future__ import annotations

import pickle
from collections.abc import AsyncGenerator, Iterable
from collections.abc import AsyncGenerator, Iterable, Mapping
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, TypeAlias
from urllib.parse import urlparse

from zarr.abc.store import (
Expand All @@ -13,23 +13,21 @@
Store,
SuffixByteRequest,
)
from zarr.core.buffer import Buffer, default_buffer_prototype
from zarr.core.buffer.core import BufferPrototype
from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype
from zarr.core.common import BytesLike

from virtualizarr.manifests.array import ManifestArray
from virtualizarr.manifests.group import ManifestGroup
from virtualizarr.vendor.zarr.core.metadata import dict_to_buffer

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Iterable, Mapping
from typing import Any

import xarray as xr
from obstore.store import (
ObjectStore, # type: ignore[import-not-found]
)
from zarr.core.buffer import BufferPrototype
from zarr.core.common import BytesLike

StoreDict: TypeAlias = dict[str, ObjectStore]

import xarray as xr


__all__ = ["ManifestStore"]

Expand All @@ -51,25 +49,6 @@ class StoreRequest:
"""The key within the store to request."""


async def list_dir_from_manifest_arrays(
arrays: Mapping[str, ManifestArray], prefix: str
) -> AsyncGenerator[str]:
"""Create the expected results for Zarr's `store.list_dir()` from an Xarray DataArrray or Dataset

Parameters
----------
arrays : Mapping[str, ManifestArrays]
prefix : str

Returns
-------
AsyncIterator[str]
"""
# TODO shouldn't this just accept a ManifestGroup instead?
# Start with expected group level metadata
raise NotImplementedError


def get_zarr_metadata(manifest_group: ManifestGroup, key: str) -> Buffer:
"""
Generate the expected Zarr V3 metadata from a virtual dataset.
Expand Down Expand Up @@ -201,6 +180,7 @@ def get_store(self, url: str) -> ObjectStore:
-----------
url : str
A url to identify the appropriate object_store instance based on the URL scheme and netloc.

Returns:
--------
StoreRequest
Expand All @@ -215,10 +195,10 @@ def get_store(self, url: str) -> ObjectStore:

class ManifestStore(Store):
"""
A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure. The requests
from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing
multiple :class:`virtualizarr.manifests.ManifestArray`,
allowing for virtually interfacing with underlying data in other file format.
A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure.

The requests from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing
multiple :class:`virtualizarr.manifests.ManifestArray`, allowing for virtually interfacing with underlying data in other file formats.

Parameters
----------
Expand Down
27 changes: 7 additions & 20 deletions virtualizarr/readers/fits.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@

from xarray import Dataset, Index

from virtualizarr.manifests import ManifestStore
from virtualizarr.readers.api import (
VirtualBackend,
)
from virtualizarr.translators.kerchunk import (
extract_group,
virtual_vars_and_metadata_from_kerchunk_refs,
)
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.xarray import construct_fully_virtual_dataset


class FITSVirtualBackend(VirtualBackend):
Expand Down Expand Up @@ -40,25 +36,16 @@ def open_virtual_dataset(
# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)})

# both group=None and group='' mean to read root group
if group:
refs = extract_group(refs, group)

# TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
if loadable_variables or indexes:
raise NotImplementedError(
"Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
)

virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
manifeststore = ManifestStore.from_kerchunk_refs(
refs,
group=group,
fs_root=Path.cwd().as_uri(),
)

vds = construct_fully_virtual_dataset(
virtual_vars=virtual_vars,
coord_names=coord_names,
attrs=attrs,
vds = manifeststore.to_virtual_dataset(
group=group,
loadable_variables=loadable_variables,
indexes=indexes,
)

return vds.drop_vars(_drop_vars)
28 changes: 4 additions & 24 deletions virtualizarr/readers/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,8 @@

from xarray import Dataset, Index

from virtualizarr.manifests import ManifestStore
from virtualizarr.readers.api import VirtualBackend
from virtualizarr.translators.kerchunk import (
extract_group,
virtual_vars_and_metadata_from_kerchunk_refs,
)
from virtualizarr.xarray import (
construct_fully_virtual_dataset,
construct_virtual_dataset,
)


class HDF5VirtualBackend(VirtualBackend):
Expand Down Expand Up @@ -41,29 +34,16 @@ def open_virtual_dataset(
filepath, inline_threshold=0, **reader_options
).translate()

# both group=None and group='' mean to read root group
if group:
refs = extract_group(refs, group)

virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
manifeststore = ManifestStore.from_kerchunk_refs(
refs,
group=group,
fs_root=Path.cwd().as_uri(),
)

fully_virtual_dataset = construct_fully_virtual_dataset(
virtual_vars=virtual_vars,
coord_names=coord_names,
attrs=attrs,
)

vds = construct_virtual_dataset(
fully_virtual_ds=fully_virtual_dataset,
filepath=filepath,
vds = manifeststore.to_virtual_dataset(
group=group,
loadable_variables=loadable_variables,
reader_options=reader_options,
indexes=indexes,
decode_times=decode_times,
)

return vds.drop_vars(_drop_vars)
22 changes: 14 additions & 8 deletions virtualizarr/readers/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from xarray import Dataset, Index

from virtualizarr.readers.api import VirtualBackend
from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs
from virtualizarr.translators.kerchunk import manifeststore_from_kerchunk_refs
from virtualizarr.types.kerchunk import (
KerchunkStoreRefs,
)
Expand Down Expand Up @@ -72,24 +72,30 @@ def open_virtual_dataset(
# is there a better / more performant way to extract this?
array_refs = {k: lrm[k] for k in lrm.keys()}

full_reference = {"refs": array_refs}

vds = dataset_from_kerchunk_refs(
KerchunkStoreRefs(full_reference), fs_root=fs_root
)
refs = KerchunkStoreRefs({"refs": array_refs})

# JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version':
# https://fsspec.github.io/kerchunk/spec.html
elif fs.read_bytes(9).startswith(b'{"version'):
with fs.open_file() as of:
refs = ujson.load(of)

vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root)

else:
raise ValueError(
"The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. If your Kerchunk generated references are saved in parquet format, make sure the file extension is `.parquet`. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues"
)

manifeststore = manifeststore_from_kerchunk_refs(
refs,
group=group,
fs_root=fs_root,
)

vds = manifeststore.to_virtual_dataset(
group=group,
loadable_variables=loadable_variables,
indexes=indexes,
)

# TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict
return vds.drop_vars(_drop_vars)
23 changes: 4 additions & 19 deletions virtualizarr/readers/netcdf3.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@

from xarray import Dataset, Index

from virtualizarr.manifests import ManifestStore
from virtualizarr.readers.api import VirtualBackend
from virtualizarr.translators.kerchunk import (
virtual_vars_and_metadata_from_kerchunk_refs,
)
from virtualizarr.xarray import (
construct_fully_virtual_dataset,
construct_virtual_dataset,
)


class NetCDF3VirtualBackend(VirtualBackend):
Expand Down Expand Up @@ -44,25 +38,16 @@ def open_virtual_dataset(
"group kwarg passed, but netCDF3 files can't have multiple groups!"
)

virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
manifeststore = ManifestStore.from_kerchunk_refs(
refs,
group=group,
fs_root=Path.cwd().as_uri(),
)

fully_virtual_dataset = construct_fully_virtual_dataset(
virtual_vars=virtual_vars,
coord_names=coord_names,
attrs=attrs,
)

vds = construct_virtual_dataset(
fully_virtual_ds=fully_virtual_dataset,
filepath=filepath,
vds = manifeststore.to_virtual_dataset(
group=group,
loadable_variables=loadable_variables,
reader_options=reader_options,
indexes=indexes,
decode_times=decode_times,
)

return vds.drop_vars(_drop_vars)
28 changes: 4 additions & 24 deletions virtualizarr/readers/tiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,9 @@

from xarray import Dataset, Index

from virtualizarr.manifests import ManifestStore
from virtualizarr.readers.api import VirtualBackend
from virtualizarr.translators.kerchunk import (
extract_group,
virtual_vars_and_metadata_from_kerchunk_refs,
)
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.xarray import (
construct_fully_virtual_dataset,
construct_virtual_dataset,
)


class TIFFVirtualBackend(VirtualBackend):
Expand Down Expand Up @@ -51,29 +44,16 @@ def open_virtual_dataset(
# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)})

# both group=None and group='' mean to read root group
if group:
refs = extract_group(refs, group)

virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
manifeststore = ManifestStore.from_kerchunk_refs(
refs,
group=group,
fs_root=Path.cwd().as_uri(),
)

fully_virtual_dataset = construct_fully_virtual_dataset(
virtual_vars=virtual_vars,
coord_names=coord_names,
attrs=attrs,
)

vds = construct_virtual_dataset(
fully_virtual_ds=fully_virtual_dataset,
filepath=filepath,
vds = manifeststore.to_virtual_dataset(
group=group,
loadable_variables=loadable_variables,
reader_options=reader_options,
indexes=indexes,
decode_times=decode_times,
)

return vds.drop_vars(_drop_vars)
10 changes: 4 additions & 6 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC
from virtualizarr import open_virtual_dataset
from virtualizarr.backend import VirtualBackend
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.manifests import ChunkManifest, ManifestArray, ManifestStore
from virtualizarr.tests import (
has_fastparquet,
has_icechunk,
Expand All @@ -20,9 +20,6 @@
requires_kerchunk,
requires_zarr_python,
)
from virtualizarr.translators.kerchunk import (
dataset_from_kerchunk_refs,
)

RoundtripFunction: TypeAlias = Callable[
Concatenate[xr.Dataset | xr.DataTree, Path, ...], xr.Dataset | xr.DataTree
Expand All @@ -45,8 +42,9 @@ def test_kerchunk_roundtrip_in_memory_no_concat(array_v3_metadata):
# Use accessor to write it out to kerchunk reference dict
ds_refs = vds.virtualize.to_kerchunk(format="dict")

# Use dataset_from_kerchunk_refs to reconstruct the dataset
roundtrip = dataset_from_kerchunk_refs(ds_refs)
# reconstruct the dataset
manifest_store = ManifestStore.from_kerchunk_refs(ds_refs)
roundtrip = manifest_store.to_virtual_dataset(loadable_variables=[])

# Assert equal to original dataset
xrt.assert_equal(roundtrip, vds)
Expand Down
Loading
Loading