diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 58a98598a5b..0b7ddbcc735 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -22,6 +22,7 @@ NdimSizeLenMixin, attempt_import, emit_user_level_warning, + is_http_url, is_remote_uri, ) from xarray.namedarray.parallelcompat import get_chunked_array_type @@ -143,28 +144,31 @@ def _find_absolute_paths( ['common.py'] """ if isinstance(paths, str): - if is_remote_uri(paths) and kwargs.get("engine") == "zarr": - if TYPE_CHECKING: - import fsspec + if is_remote_uri(paths): + if not is_http_url(paths): + # remote uris that are not http are likely s3 or similar, which support globbing with fsspec + + if TYPE_CHECKING: + import fsspec + else: + fsspec = attempt_import("fsspec") + + fs, _, _ = fsspec.core.get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories + return [fs.get_mapper(path) for path in tmp_paths] else: - fsspec = attempt_import("fsspec") - - fs, _, _ = fsspec.core.get_fs_token_paths( - paths, - mode="rb", - storage_options=kwargs.get("backend_kwargs", {}).get( - "storage_options", {} - ), - expand=False, - ) - tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories - return [fs.get_mapper(path) for path in tmp_paths] - elif is_remote_uri(paths): - raise ValueError( - "cannot do wild-card matching for paths that are remote URLs " - f"unless engine='zarr' is specified. Got paths: {paths}. " - "Instead, supply paths as an explicit list of strings." - ) + raise ValueError( + "cannot do wild-card matching for paths that are remote http URLs. " + f"Got paths: {paths}. " + "Instead, supply paths as an explicit list of strings." + ) else: return sorted(glob(_normalize_path(paths))) elif isinstance(paths, os.PathLike): diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 11f9ee49ca2..aecd91ea41e 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -617,7 +617,7 @@ def close_on_error(f): def is_remote_uri(path: str) -> bool: - """Finds URLs of the form protocol:// or protocol:: + """Matches URLs of the form protocol:// or protocol:: This also matches for http[s]://, which were the only remote URLs supported in <=v0.16.2. @@ -625,6 +625,14 @@ def is_remote_uri(path: str) -> bool: return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) +def is_http_url(path: str) -> bool: + """Matches URLs of the form http[s]:// + + Does not match for + """ + return bool(re.search(r"^https?\://", path)) + + def read_magic_number_from_file(filename_or_obj, count=8) -> bytes: # check byte header to determine file type if isinstance(filename_or_obj, bytes): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cfca5e69048..f6983be38f5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4741,11 +4741,11 @@ def test_open_mfdataset(self) -> None: @requires_fsspec def test_open_mfdataset_no_files(self) -> None: - pytest.importorskip("aiobotocore") - - # glob is attempted as of #4823, but finds no files - with pytest.raises(OSError, match=r"no files"): - open_mfdataset("http://some/remote/uri", engine="zarr") + with pytest.raises( + ValueError, + match=r"cannot do wild-card matching for paths that are remote http URLs", + ): + open_mfdataset("http://some/remote/uri") def test_open_mfdataset_2d(self) -> None: original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 9873b271033..21d14daf3df 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -176,6 +176,15 @@ def test_is_remote_uri(): assert utils.is_remote_uri("https://example.com") assert not utils.is_remote_uri(" http://example.com") assert not utils.is_remote_uri("example.nc") + assert utils.is_remote_uri("s3://bucket/example.nc") + + +def test_is_http_url(): + assert utils.is_http_url("http://example.com") + assert utils.is_http_url("https://example.com") + assert not utils.is_http_url(" http://example.com") + assert not utils.is_http_url("example.nc") + assert not utils.is_http_url("s3://bucket/example.nc") class Test_is_uniform_and_sorted: