diff --git a/.appveyor.yml b/.appveyor.yml index 77886465..0f553b32 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -60,7 +60,7 @@ environment: # All of these are common to all matrix runs ATM, so pre-defined here and to be overloaded if needed DTS: datalad_container APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 + INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 skopeo # system git-annex is way too old, use better one INSTALL_GITANNEX: git-annex -m deb-url --url http://snapshot.debian.org/archive/debian/20210906T204127Z/pool/main/g/git-annex/git-annex_8.20210903-1_amd64.deb CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov @@ -73,8 +73,8 @@ environment: - ID: Ubu # The same but with the oldest supported Python. - - ID: Ubu-3.8 - PY: '3.8' + - ID: Ubu-3.9 + PY: '3.9' # The same but removing busybox first - triggers different code paths in the tests - ID: Ubu-nobusybox diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..7485cb2a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,178 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +datalad-container is a DataLad extension for working with containerized computational environments. It enables tracking, versioning, and execution of containerized workflows within DataLad datasets using Singularity/Apptainer, Docker, and OCI-compliant images. + +## Core Architecture + +### Command Suite Structure + +The extension registers a command suite with DataLad through setuptools entry points (see `setup.cfg`). The main commands are: + +- **containers-add** (`containers_add.py`) - Add/update container images to a dataset +- **containers-list** (`containers_list.py`) - List configured containers +- **containers-remove** (`containers_remove.py`) - Remove containers from configuration +- **containers-run** (`containers_run.py`) - Execute commands within containers + +All commands are registered in `datalad_container/__init__.py` via the `command_suite` tuple. + +### Container Adapters + +The `adapters/` directory contains transport-specific handlers: + +- **docker.py** - Docker Hub images (`dhub://` scheme) +- **oci.py** - OCI-compliant images using Skopeo (`oci:` scheme) + - Saves images as trackable directory structures + - Supports loading images to Docker daemon on-demand + - Uses Skopeo for image manipulation + +Each adapter implements `save()` and `run()` functions for their respective container formats. 
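+
+Both adapters also expose a small command-line interface around these
+functions; for example (mirroring the usage shown in the `oci.py` docstring):
+
+```bash
+# save an image to an OCI-compliant directory, then run a command from it
+python -m datalad_container.adapters.oci save docker://busybox:1.32 bb-1.32/
+python -m datalad_container.adapters.oci run bb-1.32/ sh -c 'busybox | head -1'
+```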
+
+### Container Discovery
+
+`find_container.py` implements the logic for locating containers:
+- Searches current dataset and subdatasets
+- Supports hierarchical container names (e.g., `subds/container-name`)
+- Falls back to path-based and name-based lookups
+- Automatically installs subdatasets if needed to access containers
+
+### Configuration Storage
+
+Container metadata is stored in `.datalad/config` with the pattern:
+```
+datalad.containers.<name>.image = <path to image>
+datalad.containers.<name>.cmdexec = <command format string>
+datalad.containers.<name>.updateurl = <url>
+datalad.containers.<name>.extra-input = <additional input path(s)>
+```
+
+Default container location: `.datalad/environments/<name>/image`
+
+## Development Commands
+
+### Setup Development Environment
+
+```bash
+# Using uv (preferred)
+uv venv
+source .venv/bin/activate
+uv pip install -e .[devel]
+
+# Or traditional method
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -e .[devel]
+```
+
+### Running Tests
+
+```bash
+# Run all tests
+pytest -s -v datalad_container
+
+# Run specific test file
+pytest -s -v datalad_container/tests/test_containers.py
+
+# Run specific test function
+pytest -s -v datalad_container/tests/test_containers.py::test_add_noop
+
+# Run with coverage
+pytest -s -v --cov=datalad_container datalad_container
+
+# Skip slow tests (marked with 'turtle')
+pytest -s -v -m "not turtle" datalad_container
+```
+
+### Code Quality Tools
+
+Pre-commit hooks are configured in `.pre-commit-config.yaml`:
+
+```bash
+# Install pre-commit hooks
+pre-commit install
+
+# Run manually on all files
+pre-commit run --all-files
+
+# Individual tools
+isort datalad_container/  # Sort imports
+codespell  # Spell checking
+```
+
+### Building Documentation
+
+```bash
+cd docs
+make html
+# Output in docs/build/html/
+```
+
+### Important Testing Notes
+
+- Tests use pytest fixtures defined in `datalad_container/conftest.py` and `tests/fixtures/`
+- The project uses `@with_tempfile` and `@with_tree` decorators from DataLad's test utilities
+- Docker tests may require Docker to be running
+- Singularity/Apptainer tests require the container runtime to be installed
+- Some tests are marked with `@pytest.mark.turtle` for slow-running tests
+
+## Key Implementation Details
+
+### URL Scheme Handling
+
+Container sources are identified by URL schemes:
+- `shub://` - Singularity Hub (legacy, uses requests library)
+- `docker://` - Direct Singularity pull from Docker Hub
+- `dhub://` - Docker images stored locally via docker pull/save
+- `oci:` - OCI images stored as directories via Skopeo
+
+The scheme determines both storage format and execution template.
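+
+For example, the same BusyBox image can be added through different transports
+(illustrative invocations; names and tags are placeholders):
+
+```bash
+# Singularity image pulled directly from Docker Hub
+datalad containers-add bb-sing --url docker://busybox:1.32
+
+# Docker image fetched with `docker pull` and saved via `docker save`
+datalad containers-add bb-dhub --url dhub://busybox:1.32
+
+# OCI directory saved with `skopeo copy` (preferred when skopeo is installed)
+datalad containers-add bb-oci --url oci:docker://busybox:1.32
+```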
+ +### Execution Format Strings + +Call format strings support placeholders: +- `{img}` - Path to container image +- `{cmd}` - Command to execute +- `{img_dspath}` - Relative path to dataset containing image +- `{img_dirpath}` - Directory containing the image +- `{python}` - Path to current Python executable + +Example: `singularity exec {img} {cmd}` + +### Git-annex Integration + +- Large container images are managed by git-annex +- For `shub://` URLs, uses DataLad's special remote if available +- The `ensure_datalad_remote()` function (in `utils.py`) initializes the special remote when needed +- For `oci:docker://` images, registry URLs are added to annexed layers for efficient retrieval + +### Path Normalization + +`utils.py` contains `_normalize_image_path()` to handle cross-platform path issues: +- Config historically stored platform-specific paths +- Now standardizes to POSIX paths in config +- Maintains backward compatibility with Windows paths + +## Testing Considerations + +- Mark AI-generated tests with `@pytest.mark.ai_generated` +- Tests should not `chdir()` the entire process; use `cwd` parameter instead +- Use `common_kwargs = {'result_renderer': 'disabled'}` in tests to suppress output +- Many tests use DataLad's `with_tempfile` decorator for temporary test directories + +## Dependencies + +Core dependencies: +- datalad >= 0.18.0 +- requests >= 1.2 (for Singularity Hub communication) + +Container runtimes (at least one required): +- Singularity or Apptainer for Singularity images +- Docker for Docker and OCI image execution +- Skopeo for OCI image manipulation + +## Version Management + +This project uses `versioneer.py` for automatic version management from git tags. Version info is in `datalad_container/_version.py` (auto-generated, excluded from coverage). diff --git a/changelog.d/pr-277.md b/changelog.d/pr-277.md new file mode 100644 index 00000000..935fa469 --- /dev/null +++ b/changelog.d/pr-277.md @@ -0,0 +1,6 @@ +### 🚀 Enhancements and New Features + +- Add skopeo-based adapter for working with OCI images. + [PR #277](https://github.com/datalad/datalad-container/pull/277) (by [@yarikoptic](https://github.com/yarikoptic)) + continued an old/never finalized/closed + [PR #136](https://github.com/datalad/datalad-container/pull/136) (by [@kyleam](https://github.com/kyleam)). 
diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py index 774f53b9..3acb9360 100644 --- a/datalad_container/adapters/docker.py +++ b/datalad_container/adapters/docker.py @@ -19,7 +19,15 @@ import tarfile import tempfile -from datalad.utils import on_windows +import logging + +from datalad_container.adapters.utils import ( + docker_run, + get_docker_image_ids, + log_and_exit, + on_windows, + setup_logger, +) lgr = logging.getLogger("datalad.containers.adapters.docker") @@ -49,7 +57,7 @@ def save(image, path): with tempfile.NamedTemporaryFile() as stream: # Windows can't write to an already opened file stream.close() - sp.check_call(["docker", "save", "-o", stream.name, image]) + sp.run(["docker", "save", "-o", stream.name, image], check=True) with tarfile.open(stream.name, mode="r:") as tar: if not op.exists(path): lgr.debug("Creating new directory at %s", path) @@ -79,12 +87,6 @@ def safe_extract(tar, path=".", members=None, *, numeric_owner=False): lgr.info("Saved %s to %s", image, path) -def _list_images(): - out = sp.check_output( - ["docker", "images", "--all", "--quiet", "--no-trunc"]) - return out.decode().splitlines() - - def get_image(path, repo_tag=None, config=None): """Return the image ID of the image extracted at `path`. """ @@ -130,7 +132,7 @@ def load(path, repo_tag, config): # things, loading the image from the dataset will tag the old neurodebian # image as the latest. image_id = "sha256:" + get_image(path, repo_tag, config) - if image_id not in _list_images(): + if image_id not in get_docker_image_ids(): lgr.debug("Loading %s", image_id) cmd = ["docker", "load"] p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE) @@ -144,7 +146,7 @@ def load(path, repo_tag, config): else: lgr.debug("Image %s is already present", image_id) - if image_id not in _list_images(): + if image_id not in get_docker_image_ids(): raise RuntimeError( "docker image {} was not successfully loaded".format(image_id)) return image_id @@ -159,25 +161,7 @@ def cli_save(namespace): def cli_run(namespace): image_id = load(namespace.path, namespace.repo_tag, namespace.config) - prefix = ["docker", "run", - # FIXME: The -v/-w settings are convenient for testing, but they - # should be configurable. - "-v", "{}:/tmp".format(os.getcwd()), - "-w", "/tmp", - "--rm", - "--interactive"] - if not on_windows: - # Make it possible for the output files to be added to the - # dataset without the user needing to manually adjust the - # permissions. 
-        prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())])
-
-    if sys.stdin.isatty():
-        prefix.append("--tty")
-    prefix.append(image_id)
-    cmd = prefix + namespace.cmd
-    lgr.debug("Running %r", cmd)
-    sp.check_call(cmd)
+    docker_run(image_id, namespace.cmd)
 
 
 def main(args):
@@ -228,20 +212,11 @@ def main(args):
 
     namespace = parser.parse_args(args[1:])
 
-    logging.basicConfig(
-        level=logging.DEBUG if namespace.verbose else logging.INFO,
-        format="%(message)s")
+    setup_logger(logging.DEBUG if namespace.verbose else logging.INFO)
 
     namespace.func(namespace)
 
 
 if __name__ == "__main__":
-    try:
+    with log_and_exit(lgr):
         main(sys.argv)
-    except Exception as exc:
-        lgr.exception("Failed to execute %s", sys.argv)
-        if isinstance(exc, sp.CalledProcessError):
-            excode = exc.returncode
-        else:
-            excode = 1
-        sys.exit(excode)
diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py
new file mode 100644
index 00000000..6885c13a
--- /dev/null
+++ b/datalad_container/adapters/oci.py
@@ -0,0 +1,402 @@
+"""Run a container from the image in a local OCI directory.
+
+This adapter uses Skopeo to save a Docker image (or any source that Skopeo
+supports) to a local directory that's compliant with the "Open Container Image
+Layout Specification" and can be tracked as objects in a DataLad dataset.
+
+The image can then be loaded on the fly for execution. Currently only
+docker-run is supported (i.e. the image is loaded with Skopeo's
+"docker-daemon:" transport), but the plan is to support podman-run (via the
+"containers-storage:" transport) as well.
+
+Examples
+--------
+
+Save BusyBox 1.32 from Docker Hub to the local directory bb-1.32:
+
+    $ python -m datalad_container.adapters.oci \\
+        save docker://busybox:1.32 bb-1.32/
+
+Load the image into the Docker daemon (if necessary) and run a command:
+
+    $ python -m datalad_container.adapters.oci \\
+        run bb-1.32/ sh -c 'busybox | head -1'
+    BusyBox v1.32.0 (2020-10-12 23:47:18 UTC) multi-call binary.
+"""
+# ^TODO: Add note about expected image ID mismatches (e.g., between the docker
+# pulled entry and loaded one)?
+
+from collections import namedtuple
+import json
+import logging
+from pathlib import Path
+import re
+import subprocess as sp
+import sys
+
+from datalad_container.adapters.utils import (
+    docker_run,
+    get_docker_image_ids,
+    log_and_exit,
+    setup_logger,
+)
+
+lgr = logging.getLogger("datalad.containers.adapters.oci")
+
+_IMAGE_SOURCE_KEY = "org.datalad.container.image.source"
+
+
+def _normalize_reference(reference):
+    """Normalize a short repository name to a canonical one.
+
+    Parameters
+    ----------
+    reference : str
+        A Docker reference (e.g., "neurodebian", "library/neurodebian").
+
+    Returns
+    -------
+    A fully-qualified reference (e.g., "docker.io/library/neurodebian")
+
+    Note: This tries to follow containers/image's splitDockerDomain().
+    """
+    parts = reference.split("/", maxsplit=1)
+    if len(parts) == 1 or (not any(c in parts[0] for c in [".", ":"])
+                           and parts[0] != "localhost"):
+        domain, remainder = "docker.io", reference
+    else:
+        domain, remainder = parts
+
+    if domain == "docker.io" and "/" not in remainder:
+        remainder = "library/" + remainder
+    return domain + "/" + remainder
+
+
+Reference = namedtuple("Reference", ["name", "tag", "digest"])
+
+
+def parse_docker_reference(reference, normalize=False, strip_transport=False):
+    """Parse a Docker reference into a name, tag, and digest.
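+
+    For example, "docker.io/library/busybox:1.32" parses into the name
+    "docker.io/library/busybox", the tag "1.32", and no digest.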
+
+    Parameters
+    ----------
+    reference : str
+        A Docker reference (e.g., "busybox" or "library/busybox:latest")
+    normalize : bool, optional
+        Whether to normalize short names like "busybox" to the fully
+        qualified name ("docker.io/library/busybox")
+    strip_transport : bool, optional
+        Remove Skopeo transport value ("docker://" or "docker-daemon:") from
+        the name. Unless this is true, reference should not include a
+        transport.
+
+    Returns
+    -------
+    A Reference namedtuple with .name, .tag, and .digest attributes
+    """
+    if strip_transport:
+        try:
+            reference = reference.split(":", maxsplit=1)[1]
+        except IndexError:
+            raise ValueError("Reference did not have transport: {}"
+                             .format(reference))
+        if reference.startswith("//"):
+            reference = reference[2:]
+
+    parts = reference.split("/")
+    last = parts[-1]
+    if "@" in last:
+        sep = "@"
+    elif ":" in last:
+        sep = ":"
+    else:
+        sep = None
+
+    tag = None
+    digest = None
+    if sep:
+        repo, label = last.split(sep)
+        front = "/".join(parts[:-1] + [repo])
+        if sep == "@":
+            digest = label
+        else:
+            tag = label
+    else:
+        front = "/".join(parts)
+    if normalize:
+        front = _normalize_reference(front)
+    return Reference(front, tag, digest)
+
+
+def _store_annotation(path, key, value):
+    """Store a key-value pair in the image's annotations.
+
+    Parameters
+    ----------
+    path : pathlib.Path
+        Image directory. It must contain only one image.
+    key, value : str
+        Key and value to store in the image's "annotations" field.
+    """
+    index = path / "index.json"
+    index_info = json.loads(index.read_text())
+    annot = index_info["manifests"][0].get("annotations", {})
+
+    annot[key] = value
+    index_info["manifests"][0]["annotations"] = annot
+    with index.open("w") as fh:
+        json.dump(index_info, fh)
+
+
+def _get_annotation(path, key):
+    """Return the value for `key` in an image's annotations.
+
+    Parameters
+    ----------
+    path : pathlib.Path
+        Image directory. It must contain only one image.
+    key : str
+        Key in the image's "annotations" field.
+
+    Returns
+    -------
+    str or None
+    """
+    index = path / "index.json"
+    index_info = json.loads(index.read_text())
+    # Assume one manifest because skopeo-inspect would fail anyway otherwise.
+    return index_info["manifests"][0].get("annotations", {}).get(key)
+
+
+def save(image, path):
+    """Save an image to an OCI-compliant directory.
+
+    Parameters
+    ----------
+    image : str
+        A source image accepted by skopeo-copy
+    path : pathlib.Path
+        Directory to copy the image to
+    """
+    # Refuse to work with a non-empty directory by letting the OSError
+    # through. Multiple images can be saved to an OCI directory, but run()
+    # and get_image_id() don't support a way to pull out a specific one.
+    try:
+        path.rmdir()
+    except FileNotFoundError:
+        pass
+    path.mkdir(parents=True)
+    dest = "oci:" + str(path)
+    tag = parse_docker_reference(image).tag
+    if tag:
+        dest += ":" + tag
+    sp.run(["skopeo", "copy", image, dest], check=True)
+    _store_annotation(path, _IMAGE_SOURCE_KEY, image)
+
+
+def link(ds, path, reference):
+    """Add Docker registry URLs to annexed layer images.
+
+    Parameters
+    ----------
+    ds : Dataset
+    path : pathlib.Path
+        Absolute path to the image directory.
+    reference : str
+        Docker reference (e.g., "busybox:1.32"). This should not include the
+        transport (i.e. "docker://").
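+
+    For example, after saving docker://busybox:1.32 to `path`, calling
+    link(ds, path, "busybox:1.32") registers registry URLs for the annexed
+    layer blobs.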
+ """ + from datalad.downloaders.providers import Providers + from datalad.support.exceptions import CommandError + from datalad_container.utils import ensure_datalad_remote + + res = sp.run(["skopeo", "inspect", "oci:" + str(path)], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + info = json.loads(res.stdout) + + ref = parse_docker_reference(reference, normalize=True) + registry, name = ref.name.split("/", maxsplit=1) + + # Docker Hub has a special endpoint, all others follow the pattern + # https://{registry}/v2/ + if registry == "docker.io": + endpoint = "https://registry-1.docker.io/v2/" + else: + endpoint = f"https://{registry}/v2/" + provider = Providers.from_config_files().get_provider( + endpoint + name, only_nondefault=True) + if not provider: + lgr.warning("Required Datalad provider configuration " + "for Docker registry links not detected. We will enable 'datalad' " + "special remote anyways but datalad might issue warnings later on.") + + layers = {} # path => digest + for layer in info["Layers"]: + algo, digest = layer.split(":") + layer_path = path / "blobs" / algo / digest + layers[layer_path] = layer + + ds_repo = ds.repo + checked_dl_remote = False + for st in ds.status(layers.keys(), annex="basic", result_renderer=None): + if "keyname" in st: + if not checked_dl_remote: + ensure_datalad_remote(ds_repo) + checked_dl_remote = True + path = Path(st["path"]) + url = "{}{}/blobs/{}".format(endpoint, name, layers[path]) + try: + ds_repo.add_url_to_file( + path, url, batch=True, options=['--relaxed']) + except CommandError as exc: + lgr.warning("Registering %s with %s failed: %s", + path, url, exc) + else: + lgr.warning("Skipping non-annexed layer: %s", st["path"]) + + +def get_image_id(path): + """Return a directory's image ID. + + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + + Returns + ------- + An image ID (str) + """ + # Note: This adapter depends on one image per directory. If, outside of + # this adapter interface, multiple images were stored in a directory, this + # will inspect call fails with a reasonable message. + res = sp.run(["skopeo", "inspect", "--raw", "oci:" + str(path)], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + info = json.loads(res.stdout) + return info["config"]["digest"] + + +def load(path): + """Load OCI image from `path`. + + Currently, the only supported load destination is the Docker daemon. + + Parameters + ---------- + path : pathlib.Path + An OCI-compliant directory such as the one generated by `save`. It must + contain only one image. + + Returns + ------- + An image ID (str) + """ + image_id = get_image_id(path) + if image_id not in get_docker_image_ids(): + lgr.debug("Loading %s", image_id) + # The image is copied with a datalad-container/ prefix to reduce the + # chance of collisions with existing names registered with the Docker + # daemon. While we must specify _something_ for the name and tag in + # order to copy it, the particular values don't matter for execution + # purposes; they're chosen to help users identify the container in the + # `docker images` output. 
+        source = _get_annotation(path, _IMAGE_SOURCE_KEY)
+        if source:
+            ref = parse_docker_reference(source, strip_transport=True)
+            name = ref.name
+            if ref.tag:
+                tag = ref.tag
+            elif ref.digest:
+                tag = "source-" + ref.digest.replace(":", "-")[:14]
+            else:
+                tag = "latest"
+        else:
+            name = re.sub("[^a-z0-9-_.]", "", path.name.lower()[:10])
+            tag = image_id.replace(":", "-")[:14]
+
+        lgr.debug("Copying %s to Docker daemon", image_id)
+        sp.run(["skopeo", "copy", "oci:" + str(path),
+                # This load happens right before the command executes. Don't
+                # let the output be confused for the command's output.
+                "--quiet",
+                "docker-daemon:datalad-container/{}:{}".format(name, tag)],
+               check=True)
+    else:
+        lgr.debug("Image %s is already present", image_id)
+    return image_id
+
+
+# Command-line
+
+
+def cli_save(namespace):
+    save(namespace.image, namespace.path)
+
+
+def cli_run(namespace):
+    image_id = load(namespace.path)
+    docker_run(image_id, namespace.cmd)
+
+
+def main(args):
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        prog="python -m datalad_container.adapters.oci",
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true")
+
+    subparsers = parser.add_subparsers(title="subcommands")
+    # Don't continue without a subcommand.
+    subparsers.required = True
+    subparsers.dest = "command"
+
+    parser_save = subparsers.add_parser(
+        "save",
+        help="save an image to a directory")
+    parser_save.add_argument(
+        "image", metavar="NAME",
+        help="image to save")
+    parser_save.add_argument(
+        "path", metavar="PATH", type=Path,
+        help="directory to save image in")
+    parser_save.set_defaults(func=cli_save)
+
+    parser_run = subparsers.add_parser(
+        "run",
+        help="run a command with a directory's image")
+
+    # TODO: Support containers-storage/podman. This would need to be fed
+    # through cli_run() and load(). Also, a way to specify it should probably
+    # be available through containers-add.
+    # parser_run.add_argument(
+    #     "--dest", metavar="TRANSPORT",
+    #     choices=["docker-daemon", "containers-storage"],
+    #     ...)
+    parser_run.add_argument(
+        "path", metavar="PATH", type=Path,
+        help="image directory")
+    parser_run.add_argument(
+        "cmd", metavar="CMD", nargs=argparse.REMAINDER,
+        help="command to execute")
+    parser_run.set_defaults(func=cli_run)
+
+    namespace = parser.parse_args(args[1:])
+
+    setup_logger(logging.DEBUG if namespace.verbose else logging.INFO)
+
+    namespace.func(namespace)
+
+
+if __name__ == "__main__":
+    with log_and_exit(lgr):
+        main(sys.argv)
diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py
new file mode 100644
index 00000000..b6d4a4f5
--- /dev/null
+++ b/datalad_container/adapters/tests/test_oci.py
@@ -0,0 +1,125 @@
+"""Tests of the oci adapter that do not depend on skopeo or docker being
+installed.
+
+See datalad_container.adapters.tests.test_oci_more for tests that do depend on
+this.
+""" + +import json + +import pytest + +from datalad.utils import Path +from datalad_container.adapters import oci +from datalad.tests.utils_pytest import ( + assert_raises, + eq_, + with_tempfile, +) + +# parse_docker_reference + + +def test_parse_docker_reference(): + eq_(oci.parse_docker_reference("neurodebian").name, + "neurodebian") + + +def test_parse_docker_reference_normalize(): + fn = oci.parse_docker_reference + for name in ["neurodebian", + "library/neurodebian", + "docker.io/neurodebian"]: + eq_(fn(name, normalize=True).name, + "docker.io/library/neurodebian") + + eq_(fn("quay.io/skopeo/stable", normalize=True).name, + "quay.io/skopeo/stable") + eq_(fn("ghcr.io/astral-sh/uv", normalize=True).name, + "ghcr.io/astral-sh/uv") + eq_(fn("gcr.io/my-project/my-image", normalize=True).name, + "gcr.io/my-project/my-image") + + +def test_parse_docker_reference_tag(): + fn = oci.parse_docker_reference + eq_(fn("busybox:1.32"), + ("busybox", "1.32", None)) + eq_(fn("busybox:1.32", normalize=True), + ("docker.io/library/busybox", "1.32", None)) + eq_(fn("docker.io/library/busybox:1.32"), + ("docker.io/library/busybox", "1.32", None)) + + +@pytest.mark.ai_generated +def test_parse_docker_reference_alternative_registries(): + """Test parsing references from alternative registries like quay.io and ghcr.io.""" + fn = oci.parse_docker_reference + + # Test quay.io with tag + eq_(fn("quay.io/linuxserver.io/baseimage-alpine:3.18"), + ("quay.io/linuxserver.io/baseimage-alpine", "3.18", None)) + + # Test ghcr.io with tag + eq_(fn("ghcr.io/astral-sh/uv:latest"), + ("ghcr.io/astral-sh/uv", "latest", None)) + + # Test gcr.io with tag + eq_(fn("gcr.io/my-project/my-image:v1.0"), + ("gcr.io/my-project/my-image", "v1.0", None)) + + +def test_parse_docker_reference_digest(): + fn = oci.parse_docker_reference + id_ = "sha256:a9286defaba7b3a519d585ba0e37d0b2cbee74ebfe590960b0b1d6a5e97d1e1d" + eq_(fn("busybox@{}".format(id_)), + ("busybox", None, id_)) + eq_(fn("busybox@{}".format(id_), normalize=True), + ("docker.io/library/busybox", None, id_)) + eq_(fn("docker.io/library/busybox@{}".format(id_)), + ("docker.io/library/busybox", None, id_)) + + +def test_parse_docker_reference_strip_transport(): + fn = oci.parse_docker_reference + eq_(fn("docker://neurodebian", strip_transport=True).name, + "neurodebian") + eq_(fn("docker-daemon:neurodebian", strip_transport=True).name, + "neurodebian") + + +def test_parse_docker_reference_strip_transport_no_transport(): + with assert_raises(ValueError): + oci.parse_docker_reference("neurodebian", strip_transport=True) + + +# _store_annotation and _get_annotation + +# This is the index.json contents of oci: copy of +# docker.io/library/busybox:1.32 +INDEX_VALUE = { + "schemaVersion": 2, + "manifests": [ + {"mediaType": "application/vnd.oci.image.manifest.v1+json", + "digest": "sha256:9f9f95fc6f6b24f0ab756a55b8326e8849ac6a82623bea29fc4c75b99ee166a3", + "size": 347}]} + + +@with_tempfile(mkdir=True) +def test_store_and_get_annotation(path=None): + path = Path(path) + with (path / "index.json").open("w") as fh: + json.dump(INDEX_VALUE, fh) + + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + None) + + oci._store_annotation(path, "org.opencontainers.image.ref.name", "1.32") + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + "1.32") + + oci._store_annotation(path, "another", "foo") + eq_(oci._get_annotation(path, "another"), + "foo") + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + "1.32") diff --git 
a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py new file mode 100644 index 00000000..1538820d --- /dev/null +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -0,0 +1,164 @@ +"""Tests of oci adapter that depend on skopeo or docker being installed. + +See datalad_container.adapters.tests.test_oci for tests that do not depend on +this. +""" +from shutil import which + +from datalad.api import ( + Dataset, + # FIXME: This is needed to register the dataset method, at least when + # running this single test module. Shouldn't that no longer be the case + # after datalad's 4b056a251f (BF/RF: Automagically find and import a + # datasetmethod if not yet bound, 2019-02-10)? + containers_add, +) +from datalad.cmd import ( + StdOutErrCapture, + WitlessRunner, +) +from datalad.support.exceptions import CommandError +from datalad.consts import ( + DATALAD_SPECIAL_REMOTE, + DATALAD_SPECIAL_REMOTES_UUIDS, +) +from datalad_container.adapters import oci +from datalad_container.adapters.utils import get_docker_image_ids +from datalad.tests.utils_pytest import ( + assert_in, + eq_, + integration, + ok_, + skip_if_no_network, + SkipTest, + slow, + with_tempfile, +) +import pytest + +for dep in ["skopeo", "docker"]: + if not which(dep): + raise SkipTest("'{}' not found on path".format(dep)) + + +@skip_if_no_network +@integration +@slow # ~13s +@with_tempfile +def test_oci_add_and_run(path=None): + ds = Dataset(path).create(cfg_proc="text2git") + ds.containers_add(url="oci:docker://busybox:1.30", name="bb") + + image_path = ds.repo.pathobj / ".datalad" / "environments" / "bb" / "image" + image_id = oci.get_image_id(image_path) + existed = image_id in get_docker_image_ids() + + try: + out = WitlessRunner(cwd=ds.path).run( + ["datalad", "containers-run", "-n", "bb", + "sh -c 'busybox | head -1'"], + protocol=StdOutErrCapture) + finally: + if not existed: + WitlessRunner().run(["docker", "rmi", image_id]) + assert_in("BusyBox v1.30", out["stdout"]) + + from datalad.downloaders.providers import Providers + if not Providers.from_config_files().get_provider( + "https://registry-1.docker.io/v2/library", + only_nondefault=True): + # The rest of the test is about Docker Hub registry links, which + # require provider configuration for authentication. + return + + layers = [r["path"] + for r in ds.status(image_path / "blobs", annex="basic", + result_renderer=None) + if "key" in r] + ok_(layers) + + dl_uuid = DATALAD_SPECIAL_REMOTES_UUIDS[DATALAD_SPECIAL_REMOTE] + for where_res in ds.repo.whereis(list(map(str, layers))): + assert_in(dl_uuid, where_res) + + +@pytest.mark.ai_generated +@skip_if_no_network +@integration +@slow +@pytest.mark.parametrize("registry,image_ref,container_name,test_cmd,expected_output", [ + ("docker.io", "busybox:1.30", "busybox", "sh -c 'busybox | head -1'", "BusyBox v1.30"), + ("gcr.io", "gcr.io/google-containers/busybox:latest", "busybox-gcr", "sh -c 'busybox | head -1'", "BusyBox"), + ("quay.io", "quay.io/prometheus/busybox:latest", "busybox-quay", "sh -c 'busybox | head -1'", "BusyBox"), +]) +def test_oci_alternative_registries(tmp_path, registry, image_ref, container_name, + test_cmd, expected_output): + """Test adding and running containers from alternative registries. 
+
+    Also verifies that:
+    - Annexed layer blobs have URLs registered in datalad or web remotes
+    - Files can be dropped and retrieved from remotes (tests the full cycle)
+
+    Parameters
+    ----------
+    registry : str
+        Registry name (for test identification)
+    image_ref : str
+        Full image reference (e.g., "ghcr.io/astral-sh/uv:latest")
+    container_name : str
+        Name to register the container under
+    test_cmd : str
+        Command to run in the container
+    expected_output : str
+        String expected to be in the command output
+    """
+    ds = Dataset(str(tmp_path)).create(cfg_proc="text2git")
+    ds.containers_add(url=f"oci:docker://{image_ref}", name=container_name)
+
+    image_path = ds.repo.pathobj / ".datalad" / "environments" / container_name / "image"
+    image_id = oci.get_image_id(image_path)
+    existed = image_id in get_docker_image_ids()
+
+    try:
+        out = WitlessRunner(cwd=ds.path).run(
+            ["datalad", "containers-run", "-n", container_name, test_cmd],
+            protocol=StdOutErrCapture)
+    finally:
+        if not existed:
+            WitlessRunner().run(["docker", "rmi", image_id])
+
+    assert_in(expected_output, out["stdout"])
+
+    # Check that there are annexed files in the image blobs
+    blobs_path = image_path / "blobs"
+    annexed_blobs = [r["path"]
+                     for r in ds.status(blobs_path, annex="basic",
+                                        result_renderer=None)
+                     if "key" in r]
+    ok_(annexed_blobs, f"Expected to find annexed blobs in {blobs_path}")
+
+    # Verify that all annexed blobs are available from the datalad or web
+    # remote (registering these URLs requires provider configuration):
+    # `git annex find --not --in datalad --and --not --in web` should come
+    # up empty.
+    result = WitlessRunner(cwd=ds.path).run(
+        ["git", "annex", "find", "--not", "--in", "datalad",
+         "--and", "--not", "--in", "web"] + annexed_blobs,
+        protocol=StdOutErrCapture)
+    eq_(result["stdout"].strip(), "",
+        "All annexed blobs should be available from datalad or web remote")
+
+    # Drop all annexed content in the dataset
+    drop_results = ds.drop(".", result_renderer=None, on_failure='ignore')
+    print(f"Drop results for {registry}: {len(drop_results)} items")
+    for r in drop_results[:5]:  # Print first 5 results
+        print(f"  Dropped: {r.get('path', 'N/A')} - status: {r.get('status', 'N/A')}")
+    # Verify that something was actually dropped
+    ok_(drop_results, "Expected to drop some annexed files")
+
+    # Get everything back to verify it can be retrieved from remotes
+    get_results = ds.get(".", result_renderer=None)
+    print(f"Get results for {registry}: {len(get_results)} items")
+    for r in get_results[:5]:  # Print first 5 results
+        print(f"  Retrieved: {r.get('path', 'N/A')} - status: {r.get('status', 'N/A')}")
+    # Verify that files were retrieved
+    ok_(get_results, "Expected to retrieve files from remotes")
diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py
new file mode 100644
index 00000000..27e2895b
--- /dev/null
+++ b/datalad_container/adapters/utils.py
@@ -0,0 +1,79 @@
+"""Utilities used across the adapters
+"""
+
+import contextlib
+import logging
+import os
+import subprocess as sp
+import sys
+
+from datalad.utils import on_windows
+
+lgr = logging.getLogger("datalad.containers.adapters.utils")
+
+
+def setup_logger(level):
+    logger = logging.getLogger("datalad.containers.adapters")
+    logger.setLevel(level)
+    if not logger.hasHandlers():
+        # If this script is executed with the file name rather than the
+        # documented `python -m ...` invocation, we can't rely on DataLad's
+        # handler. Add a minimal one.
+        handler = logging.StreamHandler()
+        handler.setFormatter(logging.Formatter('%(message)s'))
+        logger.addHandler(handler)
+
+
+@contextlib.contextmanager
+def log_and_exit(logger):
+    try:
+        yield
+    except Exception as exc:
+        logger.exception("Failed to execute %s", sys.argv)
+        if isinstance(exc, sp.CalledProcessError):
+            excode = exc.returncode
+            if exc.stderr:
+                sys.stderr.write(exc.stderr)
+        else:
+            excode = 1
+        sys.exit(excode)
+
+
+def get_docker_image_ids():
+    """Return IDs of all known images."""
+    out = sp.run(
+        ["docker", "images", "--all", "--quiet", "--no-trunc"],
+        stdout=sp.PIPE, stderr=sp.PIPE,
+        universal_newlines=True, check=True)
+    return out.stdout.splitlines()
+
+
+def docker_run(image_id, cmd):
+    """Execute `docker run`.
+
+    Parameters
+    ----------
+    image_id : str
+        ID of image to execute
+    cmd : list of str
+        Command to execute
+    """
+    prefix = ["docker", "run",
+              # FIXME: The -v/-w settings are convenient for testing, but they
+              # should be configurable.
+              "-v", "{}:/tmp".format(os.getcwd()),
+              "-w", "/tmp",
+              "--rm",
+              "--interactive"]
+    if not on_windows:
+        # Make it possible for the output files to be added to the
+        # dataset without the user needing to manually adjust the
+        # permissions.
+        prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())])
+
+    if sys.stdin.isatty():
+        prefix.append("--tty")
+    prefix.append(image_id)
+    full_cmd = prefix + cmd
+    lgr.debug("Running %r", full_cmd)
+    sp.run(full_cmd, check=True)
diff --git a/datalad_container/conftest.py b/datalad_container/conftest.py
index f6b40986..dd716f22 100644
--- a/datalad_container/conftest.py
+++ b/datalad_container/conftest.py
@@ -1,3 +1,33 @@
+import os
+import sys
+
+import pytest
+
 from datalad.conftest import setup_package
 
 from .tests.fixtures import *  # noqa: F401, F403  # lgtm [py/polluting-import]
+
+
+@pytest.fixture(scope="session", autouse=True)
+def ensure_sys_executable_in_path():
+    """Ensure sys.executable's directory is first in PATH for the test session.
+
+    This is needed when tests spawn subprocesses that need to import modules
+    from the same Python environment that's running pytest.
+ """ + python_dir = os.path.dirname(sys.executable) + original_path = os.environ.get("PATH", "") + path_dirs = original_path.split(os.pathsep) + + # Check if python_dir is already first in PATH + if path_dirs and path_dirs[0] != python_dir: + # Put python_dir first, removing it from elsewhere if present + filtered_dirs = [d for d in path_dirs if d != python_dir] + new_path = os.pathsep.join([python_dir] + filtered_dirs) + os.environ["PATH"] = new_path + yield + # Restore original PATH + os.environ["PATH"] = original_path + else: + # PATH is already correct + yield diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index ab182224..56ac64e7 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -31,8 +31,9 @@ ) from datalad.support.exceptions import InsufficientArgumentsError from datalad.support.param import Parameter +from datalad.utils import Path -from .utils import get_container_configuration +from .utils import get_container_configuration, ensure_datalad_remote lgr = logging.getLogger("datalad.containers.containers_add") @@ -77,6 +78,8 @@ def _guess_call_fmt(ds, name, url): elif url.startswith('dhub://'): # {python} is replaced with sys.executable on *execute* return '{python} -m datalad_container.adapters.docker run {img} {cmd}' + elif url.startswith('oci:'): + return '{python} -m datalad_container.adapters.oci run {img} {cmd}' def _ensure_datalad_remote(repo): @@ -134,12 +137,19 @@ class ContainersAdd(Interface): 'docker://debian:stable-slim'), a command format string for Singularity-based execution will be auto-configured when [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified. - For Docker-based container execution with the URL scheme 'dhub://', + For the scheme 'oci:', the rest of the URL will be interpreted as the + source argument to a `skopeo copy` call and the image will be saved + as an OCI-compliant directory at location specified by `name`. + Similarly, there is a 'dhub://' scheme, where the rest of the URL will be interpreted as the argument to 'docker pull', the image will be saved to a location specified by `name`, and the call format will be auto-configured to run docker, unless overwritten. The auto-configured call to docker - run mounts the CWD to '/tmp' and sets the working directory to '/tmp'.""", + run mounts the CWD to '/tmp' and sets the working directory to '/tmp'. + However, using + the 'oci:' scheme is recommended if you have skopeo installed. The + call format for the 'oci:' and 'dhub://' schemes will be + auto-guessed if not given.""", metavar="URL", constraints=EnsureStr() | EnsureNone(), ), @@ -291,6 +301,9 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, docker_image, image) runner.run(["docker", "pull", docker_image]) docker.save(docker_image, image) + elif url.startswith("oci:"): + from .adapters import oci + oci.save(url[len("oci:"):], Path(image)) elif url.startswith("docker://"): image_dir, image_basename = op.split(image) if not image_basename: @@ -311,7 +324,7 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, copyfile(url, image) else: if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'): - _ensure_datalad_remote(ds.repo) + ensure_datalad_remote(ds.repo) try: ds.repo.add_url_to_file(image, imgurl) @@ -381,3 +394,7 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, yield r result["status"] = "ok" yield result + + # We need to do this after the image is saved. 
+ if url and url.startswith("oci:docker://"): + oci.link(ds, Path(image), url[len("oci:docker://"):]) diff --git a/datalad_container/tests/test_add.py b/datalad_container/tests/test_utils.py similarity index 81% rename from datalad_container/tests/test_add.py rename to datalad_container/tests/test_utils.py index 6a002174..863b965a 100644 --- a/datalad_container/tests/test_add.py +++ b/datalad_container/tests/test_utils.py @@ -13,10 +13,7 @@ ) from datalad.utils import Path -from datalad_container.containers_add import _ensure_datalad_remote - -# NOTE: At the moment, testing of the containers-add itself happens implicitly -# via use in other tests. +from datalad_container.utils import ensure_datalad_remote @with_tempfile @@ -24,7 +21,7 @@ def test_ensure_datalad_remote_init_and_enable_needed(path=None): ds = Dataset(path).create(force=True) repo = ds.repo assert_false(repo.get_remotes()) - _ensure_datalad_remote(repo) + ensure_datalad_remote(repo) assert_in("datalad", repo.get_remotes()) @@ -40,5 +37,5 @@ def test_ensure_datalad_remote_maybe_enable(path=None, *, autoenable): repo = ds_b.repo if not autoenable: assert_not_in("datalad", repo.get_remotes()) - _ensure_datalad_remote(repo) + ensure_datalad_remote(repo) assert_in("datalad", repo.get_remotes()) diff --git a/datalad_container/utils.py b/datalad_container/utils.py index 27d6b733..f9eed397 100644 --- a/datalad_container/utils.py +++ b/datalad_container/utils.py @@ -13,6 +13,10 @@ from datalad.distribution.dataset import Dataset from datalad.support.external_versions import external_versions +import logging + +lgr = logging.getLogger("datalad.containers.utils") + def get_container_command(): for command in ["apptainer", "singularity"]: @@ -148,3 +152,25 @@ def _normalize_image_path(path: str, ds: Dataset) -> PurePath: assert pathobj is not None # we report in platform-conventions return PurePath(pathobj) + + +def ensure_datalad_remote(repo): + """Initialize and enable datalad special remote if it isn't already.""" + dl_remote = None + for info in repo.get_special_remotes().values(): + if info["externaltype"] == "datalad": + dl_remote = info["name"] + break + + if not dl_remote: + from datalad.consts import DATALAD_SPECIAL_REMOTE + from datalad.customremotes.base import init_datalad_remote + + init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True) + elif repo.is_special_annex_remote(dl_remote, check_if_known=False): + lgr.debug("datalad special remote '%s' is already enabled", + dl_remote) + else: + lgr.debug("datalad special remote '%s' found. Enabling", + dl_remote) + repo.enable_remote(dl_remote)