silx.io.open: Added basic support for zarr URLs #4350

Draft: wants to merge 3 commits into main
9 changes: 8 additions & 1 deletion src/silx/app/utils/parseutils.py
@@ -31,6 +31,7 @@
from collections.abc import Sequence
import glob
import logging
import urllib.parse
from typing import Any
from collections.abc import Generator, Iterable
from pathlib import Path
@@ -76,7 +77,7 @@ def to_bool(thing: Any, default: bool | None = None) -> bool:
def filenames_to_dataurls(
filenames: Iterable[str | Path],
slices: Sequence[int] = tuple(),
) -> Generator[object]:
) -> Generator["DataUrl" | str]:
"""Expand filenames and HDF5 data path in files input argument"""
# Imports here so they are performed after setting HDF5_USE_FILE_LOCKING and logging level
import silx.io
@@ -87,6 +88,12 @@ def filenames_to_dataurls(
extra_slices = tuple(slices)

for filename in filenames:
if isinstance(filename, str) and urllib.parse.urlparse(
filename
).scheme.startswith("zarr+"):
yield filename
continue

url = DataUrl(filename)

for file_path in sorted(silx.utils.files.expand_filenames([url.file_path()])):
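
For context (not part of the diff): a minimal sketch of the new behaviour, using a hypothetical bucket name. A zarr+ URL is yielded unchanged as a plain string instead of being parsed into a DataUrl, and no filename expansion happens for it.

from silx.app.utils.parseutils import filenames_to_dataurls

# The zarr+ scheme short-circuits DataUrl parsing and glob expansion
urls = list(filenames_to_dataurls(["zarr+s3://bucket/data.zarr"]))
assert urls == ["zarr+s3://bucket/data.zarr"]
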
3 changes: 2 additions & 1 deletion src/silx/app/view/main.py
@@ -32,6 +32,7 @@
import signal
import sys
import traceback
import urllib.parse
from silx.app.utils import parseutils


@@ -178,7 +179,7 @@ def exceptHook(type_, value, trace):
for url in parseutils.filenames_to_dataurls(options.files, options.slices):
# TODO: Would be nice to add a process widget and a cancel button
try:
window.appendFile(url.path())
window.appendFile(url if isinstance(url, str) else url.path())
except OSError as e:
_logger.error(e.args[0])
_logger.debug("Backtrace", exc_info=True)
1 change: 1 addition & 0 deletions src/silx/io/meson.build
@@ -18,6 +18,7 @@ py.install_sources([
'spech5.py',
'url.py',
'utils.py',
'zarrh5.py',
],
subdir: 'silx/io', # Folder relative to site-packages to install to
)
45 changes: 32 additions & 13 deletions src/silx/io/utils.py
@@ -32,6 +32,7 @@
import sys
import time
import logging
import urllib.parse
from collections.abc import Generator

import numpy
@@ -49,6 +50,12 @@
except ImportError as e:
h5pyd = None

try:
from .zarrh5 import ZarrH5
except ImportError as e:
ZarrH5 = None


logger = logging.getLogger(__name__)

NEXUS_HDF5_EXT = [".h5", ".nx5", ".nxs", ".hdf", ".hdf5", ".cxi"]
@@ -689,34 +696,46 @@ def open(filename):  # pylint:disable=redefined-builtin
:raises: IOError if the file can't be loaded or path can't be found
:rtype: h5py-like node
"""
url = DataUrl(filename)
url = urllib.parse.urlparse(filename)
if url.scheme.startswith("zarr+"):
if ZarrH5 is None:
raise IOError(
f"Zarr support is not available, please install zarr, cannot open: {filename}"
)
try:
return ZarrH5(filename)
except Exception as e:
raise IOError(f"Failed to open URL with zarr: {type(e)} {e}")

if url.scheme() in [None, "file", "silx"]:
data_url = DataUrl(filename)
if data_url.scheme() in [None, "file", "silx"]:
# That's a local file
if not url.is_valid():
if not data_url.is_valid():
raise OSError("URL '%s' is not valid" % filename)
h5_file = _open_local_file(url.file_path())
elif url.scheme() in ("http", "https"):
h5_file = _open_local_file(data_url.file_path())
elif data_url.scheme() in ("http", "https"):
return _open_url_with_h5pyd(filename)
else:
raise OSError(f"Unsupported URL scheme {url.scheme}: {filename}")
raise OSError(f"Unsupported URL scheme {data_url.scheme}: {filename}")

if url.data_path() in [None, "/", ""]: # The full file is requested
if url.data_slice():
if data_url.data_path() in [None, "/", ""]: # The full file is requested
if data_url.data_slice():
raise OSError(f"URL '{filename}' containing slicing is not supported")
return h5_file
else:
# Only a child is requested
if url.data_path() not in h5_file:
msg = f"File '{filename}' does not contain path '{url.data_path()}'."
if data_url.data_path() not in h5_file:
msg = f"File '{filename}' does not contain path '{data_url.data_path()}'."
raise OSError(msg)
node = h5_file[url.data_path()]
node = h5_file[data_url.data_path()]

if url.data_slice() is not None:
if data_url.data_slice() is not None:
from . import _sliceh5 # Lazy-import to avoid circular dependency

try:
return _sliceh5.DatasetSlice(node, url.data_slice(), attrs=node.attrs)
return _sliceh5.DatasetSlice(
node, data_url.data_slice(), attrs=node.attrs
)
except ValueError:
raise OSError(
f"URL {filename} contains slicing, but it is not a dataset"
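
For context (not part of the diff): a minimal usage sketch, assuming the zarr package is installed and using a hypothetical local store path (remote stores such as s3:// typically also need fsspec/s3fs).

import silx.io

# Prefix the store URL with "zarr+" so open() routes it to ZarrH5;
# an IOError is raised if zarr is missing or the store cannot be opened.
root = silx.io.open("zarr+file:///tmp/example.zarr")
print(list(root.keys()))  # browse the store with the h5py-like API
root.close()
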
151 changes: 151 additions & 0 deletions src/silx/io/zarrh5.py
@@ -0,0 +1,151 @@
# /*##########################################################################
# Copyright (C) 2025 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ############################################################################*/
"""
Provides a wrapper exposing `Zarr <https://zarr.readthedocs.io/>`_ groups and
arrays through an h5py-like API.

This is a preview feature.
"""
from __future__ import annotations

import logging
import urllib.parse
from typing import Generator
import numpy
from . import commonh5
import zarr


_logger = logging.getLogger(__name__)


def _children(group: zarr.Group) -> Generator[ZarrDataset | ZarrGroup]:
for name, item in group.items():
if isinstance(item, zarr.Group):
yield ZarrGroup(name, item)
elif isinstance(item, zarr.Array):
yield ZarrDataset(name, item)
else:
_logger.warning(f"Cannot map child {name}: Ignored")


class ZarrH5(commonh5.File):
"""Zarr client wrapper"""

def __init__(
self,
name: str,
mode: str | None = None,
attrs: dict | None = None,
) -> None:
assert mode in ("r", None)
if name.startswith("zarr+"):
name = name[5:]
full_url = urllib.parse.urlparse(name)
if full_url.fragment:
raise ValueError("URL fragment is not supported")

base_url = urllib.parse.urlunparse(
(full_url.scheme, full_url.netloc, full_url.path, "", "", "")
)

# quick&dirty storage_options parsing: it would need pydantic model
storage_options = {}
for key, values in urllib.parse.parse_qs(full_url.query).items():
value = values[-1]
if key == "use_ssl":
value = True if value.lower() == "true" else False
storage_options[key] = value
self.__group = zarr.open_group(base_url, storage_options=storage_options)

if attrs is None:
attrs = {}
super().__init__(
base_url.rstrip("/"), mode, attrs={**self.__group.attrs, **attrs}
)

for child in _children(self.__group):
self.add_node(child)

_logger.warning(
"Zarr support is a preview feature: This may change or be removed without notice."
)

def close(self) -> None:
super().close()
self.__group = None


class ZarrGroup(commonh5.LazyLoadableGroup):
"""Zarr Group wrapper"""

def __init__(
self,
name: str,
group: zarr.Group,
parent: ZarrH5 | ZarrGroup | None = None,
attrs: dict | None = None,
) -> None:
super().__init__(name, parent, attrs)
self.__group = group

def _create_child(self) -> None:
for child in _children(self.__group):
self.add_node(child)


class ZarrDataset(commonh5.Dataset):
"""Zarr Array wrapper"""

def __init__(
self,
name: str,
array: zarr.Array,
parent: ZarrH5 | ZarrGroup | None = None,
attrs: dict | None = None,
) -> None:
super().__init__(name, array, parent, attrs)

@property
def shape(self) -> tuple[int, ...]:
return self._get_data().shape

@property
def size(self) -> int:
return self._get_data().size

def __len__(self) -> int:
return len(self._get_data())

def __getitem__(self, item):
return self._get_data()[item]

@property
def value(self) -> numpy.ndarray:
return self._get_data()[()]

@property
def compression(self):
return self._get_data().compressor.codec_id

@property
def chunks(self):
return self._get_data().chunks
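
For context (not part of the new module): a standalone sketch of the query-string handling done in ZarrH5.__init__, with a hypothetical endpoint. Every query parameter is forwarded to zarr as a storage option; only use_ssl is converted to a boolean.

import urllib.parse

url = urllib.parse.urlparse(
    "s3://bucket/data.zarr?endpoint_url=http://localhost:9000&use_ssl=false"
)
storage_options = {}
for key, values in urllib.parse.parse_qs(url.query).items():
    value = values[-1]  # the last occurrence of a repeated key wins
    if key == "use_ssl":
        value = value.lower() == "true"
    storage_options[key] = value
# storage_options == {"endpoint_url": "http://localhost:9000", "use_ssl": False}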