From b3aee66abff9e3104bdbccd3f71a37e9612e1fab Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 10 Nov 2020 12:22:53 -0500 Subject: [PATCH 01/32] MV: docker adapter: Move some bits to adapters.utils These parts will be useful for the upcoming skopeo adapter as well. --- datalad_container/adapters/docker.py | 31 +++--------------- datalad_container/adapters/utils.py | 49 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 27 deletions(-) create mode 100644 datalad_container/adapters/utils.py diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py index da4be720..9175bdbe 100644 --- a/datalad_container/adapters/docker.py +++ b/datalad_container/adapters/docker.py @@ -20,8 +20,9 @@ import logging -from datalad.utils import ( - on_windows, +from datalad_container.adapters.utils import ( + _list_images, + docker_run, ) lgr = logging.getLogger("datalad.containers.adapters.docker") @@ -61,12 +62,6 @@ def save(image, path): lgr.info("Saved %s to %s", image, path) -def _list_images(): - out = sp.check_output( - ["docker", "images", "--all", "--quiet", "--no-trunc"]) - return out.decode().splitlines() - - def get_image(path): """Return the image ID of the image extracted at `path`. """ @@ -128,25 +123,7 @@ def cli_save(namespace): def cli_run(namespace): image_id = load(namespace.path) - prefix = ["docker", "run", - # FIXME: The -v/-w settings are convenient for testing, but they - # should be configurable. - "-v", "{}:/tmp".format(os.getcwd()), - "-w", "/tmp", - "--rm", - "--interactive"] - if not on_windows: - # Make it possible for the output files to be added to the - # dataset without the user needing to manually adjust the - # permissions. 
- prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())]) - - if sys.stdin.isatty(): - prefix.append("--tty") - prefix.append(image_id) - cmd = prefix + namespace.cmd - lgr.debug("Running %r", cmd) - sp.check_call(cmd) + docker_run(image_id, namespace.cmd) def main(args): diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py new file mode 100644 index 00000000..5562445d --- /dev/null +++ b/datalad_container/adapters/utils.py @@ -0,0 +1,49 @@ +"""Utilities used across the adapters +""" + +import logging +import os +import subprocess as sp +import sys + +from datalad.utils import on_windows + +lgr = logging.getLogger("datalad.containers.adapters.utils") + + +def _list_images(): + """Return IDs of all known images.""" + out = sp.check_output( + ["docker", "images", "--all", "--quiet", "--no-trunc"]) + return out.decode().splitlines() + + +def docker_run(image_id, cmd): + """Execute `docker run`. + + Parameters + ---------- + image_id : str + ID of image to execute + cmd : list of str + Command to execute + """ + prefix = ["docker", "run", + # FIXME: The -v/-w settings are convenient for testing, but they + # should be configurable. + "-v", "{}:/tmp".format(os.getcwd()), + "-w", "/tmp", + "--rm", + "--interactive"] + if not on_windows: + # Make it possible for the output files to be added to the + # dataset without the user needing to manually adjust the + # permissions. + prefix.extend(["-u", "{}:{}".format(os.getuid(), os.getgid())]) + + if sys.stdin.isatty(): + prefix.append("--tty") + prefix.append(image_id) + full_cmd = prefix + cmd + lgr.debug("Running %r", full_cmd) + sp.check_call(full_cmd) From 4274759cb70b2e952173caf2dae05c826f1e5ff9 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 10 Nov 2020 12:29:53 -0500 Subject: [PATCH 02/32] RF: adapters.utils: Rename _list_images to get_docker_image_ids "get" is probably clearer than "list", and tacking on "_ids" makes it clearer what the return value is. 
Also, drop the leading underscore, which is a holdover from the function being in the adapters.docker module. --- datalad_container/adapters/docker.py | 6 +++--- datalad_container/adapters/utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py index 9175bdbe..413841ea 100644 --- a/datalad_container/adapters/docker.py +++ b/datalad_container/adapters/docker.py @@ -21,8 +21,8 @@ import logging from datalad_container.adapters.utils import ( - _list_images, docker_run, + get_docker_image_ids, ) lgr = logging.getLogger("datalad.containers.adapters.docker") @@ -94,7 +94,7 @@ def load(path): # things, loading the image from the dataset will tag the old neurodebian # image as the latest. image_id = "sha256:" + get_image(path) - if image_id not in _list_images(): + if image_id not in get_docker_image_ids(): lgr.debug("Loading %s", image_id) cmd = ["docker", "load"] p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE) @@ -108,7 +108,7 @@ def load(path): else: lgr.debug("Image %s is already present", image_id) - if image_id not in _list_images(): + if image_id not in get_docker_image_ids(): raise RuntimeError( "docker image {} was not successfully loaded".format(image_id)) return image_id diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py index 5562445d..19e0adc8 100644 --- a/datalad_container/adapters/utils.py +++ b/datalad_container/adapters/utils.py @@ -11,7 +11,7 @@ lgr = logging.getLogger("datalad.containers.adapters.utils") -def _list_images(): +def get_docker_image_ids(): """Return IDs of all known images.""" out = sp.check_output( ["docker", "images", "--all", "--quiet", "--no-trunc"]) From 3656d4211e7bbda5a559f5367c702e75ffb7b97f Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 10 Nov 2020 12:31:02 -0500 Subject: [PATCH 03/32] RF: adapters: Prefer subprocess.run over check_* methods The minimum Python version of 
DataLad is new enough that we can assume subprocess.run() is available. It's recommended by the docs, and I like it more, so switch to it. Note that we might want to eventually switch to using WitlessRunner here. The original idea with using the subprocess module directly was that it'd be nice for the docker adapter to be standalone, as nothing in the adapter depended on datalad at the time. That's not the case anymore after the adapters.utils split and the use of datalad.utils within it. (And the upcoming skopeo adapter will make heavier use of datalad for adding URLs to the layers.) --- datalad_container/adapters/docker.py | 2 +- datalad_container/adapters/utils.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py index 413841ea..448a2884 100644 --- a/datalad_container/adapters/docker.py +++ b/datalad_container/adapters/docker.py @@ -51,7 +51,7 @@ def save(image, path): with tempfile.NamedTemporaryFile() as stream: # Windows can't write to an already opened file stream.close() - sp.check_call(["docker", "save", "-o", stream.name, image]) + sp.run(["docker", "save", "-o", stream.name, image], check=True) with tarfile.open(stream.name, mode="r:") as tar: if not op.exists(path): lgr.debug("Creating new directory at %s", path) diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py index 19e0adc8..5ede98b3 100644 --- a/datalad_container/adapters/utils.py +++ b/datalad_container/adapters/utils.py @@ -13,9 +13,11 @@ def get_docker_image_ids(): """Return IDs of all known images.""" - out = sp.check_output( - ["docker", "images", "--all", "--quiet", "--no-trunc"]) - return out.decode().splitlines() + out = sp.run( + ["docker", "images", "--all", "--quiet", "--no-trunc"], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + return out.stdout.splitlines() def docker_run(image_id, cmd): @@ -46,4 +48,4 @@ def 
docker_run(image_id, cmd): prefix.append(image_id) full_cmd = prefix + cmd lgr.debug("Running %r", full_cmd) - sp.check_call(full_cmd) + sp.run(full_cmd, check=True) From b0af4b129a1a81566587f36c785fe9f65245029a Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 10 Nov 2020 13:59:38 -0500 Subject: [PATCH 04/32] MV: adapters: Move logging configuration to utils This logic will get a bit more involved in the next commit, and it will be needed by the skopeo adapter too. --- datalad_container/adapters/docker.py | 5 ++--- datalad_container/adapters/utils.py | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py index 448a2884..1348fefd 100644 --- a/datalad_container/adapters/docker.py +++ b/datalad_container/adapters/docker.py @@ -23,6 +23,7 @@ from datalad_container.adapters.utils import ( docker_run, get_docker_image_ids, + setup_logger, ) lgr = logging.getLogger("datalad.containers.adapters.docker") @@ -166,9 +167,7 @@ def main(args): namespace = parser.parse_args(args[1:]) - logging.basicConfig( - level=logging.DEBUG if namespace.verbose else logging.INFO, - format="%(message)s") + setup_logger(logging.DEBUG if namespace.verbose else logging.INFO) namespace.func(namespace) diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py index 5ede98b3..516be042 100644 --- a/datalad_container/adapters/utils.py +++ b/datalad_container/adapters/utils.py @@ -11,6 +11,12 @@ lgr = logging.getLogger("datalad.containers.adapters.utils") +def setup_logger(level): + logging.basicConfig( + level=level, + format="%(message)s") + + def get_docker_image_ids(): """Return IDs of all known images.""" out = sp.run( From 780247ecb0462da32e442580d9c617ebc4d7b9d3 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 10 Nov 2020 14:02:18 -0500 Subject: [PATCH 05/32] ENH: adapters.utils: Account for datalad logging When the adapter is called from the command line (as 
containers-run does) and datalad gets imported, the level set via the --verbose argument doesn't have an effect and logging happens twice, once through datalad's handler and once through the adapter's. Before 313c4f0f (WIN/Workaround: don't pass gid and uid to docker run call, 2020-11-10), the above was the case when docker.main() was triggered with the documented `python -m datalad_container.adapters ...` invocation, but not when the script path was passed to python. Following that commit, the adapter imports datalad, so datalad's logger is always configured. Adjust setup_logger() to set the log level of loggers under the datalad.containers.adapters namespace so that the adapter's logging level is in effect for command line calls to the adapter. As mentioned above, datalad is now loaded in all cases, so a handler is always configured, but, in case this changes in the future, add a simpler handler if one isn't already configured. --- datalad_container/adapters/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py index 516be042..ae7540f1 100644 --- a/datalad_container/adapters/utils.py +++ b/datalad_container/adapters/utils.py @@ -12,9 +12,15 @@ def setup_logger(level): - logging.basicConfig( - level=level, - format="%(message)s") + logger = logging.getLogger("datalad.containers.adapters") + logger.setLevel(level) + if not logger.hasHandlers(): + # If this script is executed with the file name rather than the + # documented `python -m ...` invocation, we can't rely on DataLad's + # handler. Add a minimal one. 
+        handler = logging.StreamHandler()
+        handler.setFormatter(logging.Formatter('%(message)s'))
+        logger.addHandler(handler)


 def get_docker_image_ids():

From 326ac7b011356a2c6cfdd33c91cd7bc627f8504e Mon Sep 17 00:00:00 2001
From: Kyle Meyer
Date: Tue, 10 Nov 2020 12:53:58 -0500
Subject: [PATCH 06/32] ENH: adapters.utils: Add helper for main() handling

The same handling will be needed in the skopeo adapter. Avoid repeating it.
---
 datalad_container/adapters/docker.py | 10 ++--------
 datalad_container/adapters/utils.py  | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py
index 1348fefd..49ad54c3 100644
--- a/datalad_container/adapters/docker.py
+++ b/datalad_container/adapters/docker.py
@@ -23,6 +23,7 @@
 from datalad_container.adapters.utils import (
     docker_run,
     get_docker_image_ids,
+    log_and_exit,
     setup_logger,
 )
@@ -173,12 +174,5 @@ def main(args):


 if __name__ == "__main__":
-    try:
+    with log_and_exit(lgr):
         main(sys.argv)
-    except Exception as exc:
-        lgr.exception("Failed to execute %s", sys.argv)
-        if isinstance(exc, sp.CalledProcessError):
-            excode = exc.returncode
-        else:
-            excode = 1
-        sys.exit(excode)
diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py
index ae7540f1..e21166d7 100644
--- a/datalad_container/adapters/utils.py
+++ b/datalad_container/adapters/utils.py
@@ -1,6 +1,7 @@
 """Utilities used across the adapters
 """

+import contextlib
 import logging
 import os
 import subprocess as sp
 import sys
@@ -23,6 +24,19 @@ def setup_logger(level):
         logger.addHandler(handler)


+@contextlib.contextmanager
+def log_and_exit(logger):
+    try:
+        yield
+    except Exception as exc:
+        logger.exception("Failed to execute %s", sys.argv)
+        if isinstance(exc, sp.CalledProcessError):
+            excode = exc.returncode
+        else:
+            excode = 1
+        sys.exit(excode)
+
+
 def get_docker_image_ids():
From 
9dbb3c5ef715815f143c32cb9d527377a2a0327a Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 10 Nov 2020 12:57:38 -0500 Subject: [PATCH 07/32] ENH: adapters.utils: Display captured stderr on exit Some of the subprocess calls capture stderr. Show it to the caller on failure. --- datalad_container/adapters/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_container/adapters/utils.py b/datalad_container/adapters/utils.py index e21166d7..27e2895b 100644 --- a/datalad_container/adapters/utils.py +++ b/datalad_container/adapters/utils.py @@ -32,6 +32,8 @@ def log_and_exit(logger): logger.exception("Failed to execute %s", sys.argv) if isinstance(exc, sp.CalledProcessError): excode = exc.returncode + if exc.stderr: + sys.stderr.write(exc.stderr) else: excode = 1 sys.exit(excode) From ac5d9706185f4fdc572c87eb06a058a2650ccbce Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 3 Nov 2020 16:30:32 -0500 Subject: [PATCH 08/32] NF: oci: Add adapter for working with OCI images In order to be able to track Docker containers in a dataset, we introduced the docker-save-based docker adapter in 68a1462 (Add prototype of a Docker adapter, 2018-05-18). It's not clear how much this has been used, but at least conceptually it seems to be viable. One problem, however, is that ideally we'd be able to assign Docker registry URLs to the image files stored in the dataset (particularly the large non-configuration files). There doesn't seem to be a way to do this with the docker-save archives. Another option for storing the image in a dataset is the Open Container Initiative image format. Skopeo can be used to copy images in Docker registries (and some other destinations) to an OCI-compliant directory. When Docker Hub is used as the source, the resulting layers blobs can be re-obtained via GET /v2/NAME/blobs/ID. Using skopeo/OCI also has the advantage of making it easier to execute via podman in the future. Add an initial skopeo-based OCI adapter. 
At this point, it has the same functionality as the docker adapter. --- datalad_container/adapters/oci.py | 184 ++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 datalad_container/adapters/oci.py diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py new file mode 100644 index 00000000..ec056db2 --- /dev/null +++ b/datalad_container/adapters/oci.py @@ -0,0 +1,184 @@ +"""Run a container from the image in a local OCI directory. + +This adapter uses Skopeo to save a Docker image (or any source that Skopeo +supports) to a local directory that's compliant with the "Open Container Image +Layout Specification" and can be tracked as objects in a DataLad dataset. + +This image can then be loaded on-the-fly in order for execution. + +Examples +-------- + +Save BusyBox 1.32 from Docker Hub to the local directory bb_1.32: + + $ python -m datalad_container.adapters.oci \\ + save docker://busybox:1.32 bb-1.32/ + +Load the image into the Docker daemon (if necessary) and run a command: + + $ python -m datalad_container.adapters.oci \\ + run bb_1.32/ sh -c 'busybox | head -1' + BusyBox v1.32.0 (2020-10-12 23:47:18 UTC) multi-call binary. +""" + +import json +import logging +from pathlib import Path +import re +import subprocess as sp +import sys + +from datalad_container.adapters.utils import ( + docker_run, + get_docker_image_ids, + log_and_exit, + setup_logger, +) + +lgr = logging.getLogger("datalad.container.adapters.oci") + + +def save(image, path): + """Save an image to an OCI-compliant directory. + + Parameters + ---------- + image : str + A source image accepted by skopeo-copy + path : pathlib.Path + Directory to copy the image to + """ + # Refuse to work with non-empty directory if it's not empty by letting the + # OSError through. Multiple images can be saved to an OCI directory, but + # run() and get_image_id() don't support a way to pull out a specific one. 
+ try: + path.rmdir() + except FileNotFoundError: + pass + except OSError as exc: + raise OSError(exc) from None + path.mkdir(parents=True) + sp.run(["skopeo", "copy", image, "oci:" + str(path)], + check=True) + + +def get_image_id(path): + """Return a directory's image ID. + + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + + Returns + ------- + An image ID (str) + """ + # Note: This adapter depends on one image per directory. If, outside of + # this adapter interface, multiple images were stored in a directory, this + # will inspect call fails with a reasonable message. + res = sp.run(["skopeo", "inspect", "--raw", "oci:" + str(path)], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + info = json.loads(res.stdout) + return info["config"]["digest"] + + +def load(path): + """Load OCI image from `path`. + + Currently the only supported load destination is the Docker daemon. + + Parameters + ---------- + path : pathlib.Path + An OCI-compliant directory such as the one generated by `save`. It must + contain only one image. + + Returns + ------- + An image ID (str) + """ + image_id = get_image_id(path) + if image_id not in get_docker_image_ids(): + lgr.debug("Loading %s", image_id) + # The image is copied with a datalad-container/ prefix to reduce the + # chance of collisions with existing names registered with the Docker + # daemon. While we must specify _something_ for the name and tag in + # order to copy it, the particular values don't matter for execution + # purposes; they're chosen to help users identify the container in the + # `docker images` output. 
+ name = re.sub("[^a-z0-9-_.]", "", path.name.lower()[:10]) + assert image_id.startswith("sha256:") + tag = image_id.replace(":", "-")[:14] + + lgr.debug("Copying %s to Docker daemon", image_id) + sp.run(["skopeo", "copy", "oci:" + str(path), + "docker-daemon:datalad-container/{}:{}".format(name, tag)], + check=True) + else: + lgr.debug("Image %s is already present", image_id) + return image_id + + +# Command-line + + +def cli_save(namespace): + save(namespace.image, namespace.path) + + +def cli_run(namespace): + image_id = load(namespace.path) + docker_run(image_id, namespace.cmd) + + +def main(args): + import argparse + + parser = argparse.ArgumentParser( + prog="python -m datalad_container.adapters.oci", + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + "-v", "--verbose", + action="store_true") + + subparsers = parser.add_subparsers(title="subcommands") + # Don't continue without a subcommand. + subparsers.required = True + subparsers.dest = "command" + + parser_save = subparsers.add_parser( + "save", + help="save an image to a directory") + parser_save.add_argument( + "image", metavar="NAME", + help="image to save") + parser_save.add_argument( + "path", metavar="PATH", type=Path, + help="directory to save image in") + parser_save.set_defaults(func=cli_save) + + parser_run = subparsers.add_parser( + "run", + help="run a command with a directory's image") + + parser_run.add_argument( + "path", metavar="PATH", type=Path, + help="image directory") + parser_run.add_argument( + "cmd", metavar="CMD", nargs=argparse.REMAINDER, + help="command to execute") + parser_run.set_defaults(func=cli_run) + + namespace = parser.parse_args(args[1:]) + + setup_logger(logging.DEBUG if namespace.verbose else logging.INFO) + + namespace.func(namespace) + + +if __name__ == "__main__": + with log_and_exit(lgr): + main(sys.argv) From c21c48082f93c97fadb50d09234514bcde407e1f Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 3 Nov 
2020 16:30:32 -0500 Subject: [PATCH 09/32] DOC: oci: Add todo comment about image ID mismatch After running `skopeo copy docker://docker.io/... oci:`, we can link up the layer to the Docker registry. However, other digests aren't preserved. One notable mismatch is between the image ID if you run docker pull x versus skopeo copy docker://x oci:x && skopeo copy oci:x docker-daemon:x I haven't really wrapped my head around all the different digests and when they can change. However, skopeo's issue tracker has a good deal of discussion about this, and it looks complicated (e.g., issues 11, 469, 949, 1046, and 1097). The adapter docstring should probably note this, though at this point I'm not sure I could say something coherent. Anyway, add a to-do note... --- datalad_container/adapters/oci.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index ec056db2..868403e7 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -20,6 +20,8 @@ run bb_1.32/ sh -c 'busybox | head -1' BusyBox v1.32.0 (2020-10-12 23:47:18 UTC) multi-call binary. """ +# ^TODO: Add note about expected image ID mismatches (e.g., between the docker +# pulled entry and loaded one)? import json import logging From 0817007746f6aef8cbd743cb17b7894769f11af7 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 3 Nov 2020 16:30:32 -0500 Subject: [PATCH 10/32] DOC: oci: Add comment about alternative destinations I _think_ containers-storage: is what we'd use for podman-run, but I haven't attempted it. 
--- datalad_container/adapters/oci.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index 868403e7..c5732dd4 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -4,7 +4,10 @@ supports) to a local directory that's compliant with the "Open Container Image Layout Specification" and can be tracked as objects in a DataLad dataset. -This image can then be loaded on-the-fly in order for execution. +This image can then be loaded on-the-fly in order for execution. Currently only +docker-run is supported (i.e. the image is loaded with Skopeo's +"docker-daemon:" transport), but the plan is to support podman-run (via the +"containers-storage:" transport) as well. Examples -------- @@ -166,6 +169,13 @@ def main(args): "run", help="run a command with a directory's image") + # TODO: Support containers-storage/podman. This would need to be fed + # through cli_run() and load(). Also, a way to specify it should probably + # be available through containers-add. + # parser_run.add_argument( + # "--dest", metavar="TRANSPORT", + # choices=["docker-daemon", "containers-storage"], + # ...) parser_run.add_argument( "path", metavar="PATH", type=Path, help="image directory") From c91b92eca69820a05a475ba1a6cb432a585fb8c2 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Wed, 4 Nov 2020 22:01:55 -0500 Subject: [PATCH 11/32] ENH: oci: Silence 'skopeo copy' output when loading image Prevent skopeo-copy output from being shown, since it's probably confusing to see output under run's "Command start (output follows)" tag for a command that the user didn't explicitly call. However, for large images, this has the downside that the user might want some signs of life, so this may need to be revisited. 
--- datalad_container/adapters/oci.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index c5732dd4..ae0b7594 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -119,6 +119,9 @@ def load(path): lgr.debug("Copying %s to Docker daemon", image_id) sp.run(["skopeo", "copy", "oci:" + str(path), + # This load happens right before the command executes. Don't + # let the output be confused for the command's output. + "--quiet", "docker-daemon:datalad-container/{}:{}".format(name, tag)], check=True) else: From 104e211db44c7887787a7e6d3dba22113de0bab0 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 5 Nov 2020 18:10:15 -0500 Subject: [PATCH 12/32] ENH: oci: Add utility for parsing Docker reference We'll need this information in order to add a tag to the oci: destination and to make the entry copied to docker-daemon more informative. I've tried to base the rules on containers/image implementation, which is what skopeo uses underneath. --- datalad_container/adapters/oci.py | 83 ++++++++++++++++++++ datalad_container/adapters/tests/test_oci.py | 61 ++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 datalad_container/adapters/tests/test_oci.py diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index ae0b7594..b7bd37ae 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -26,6 +26,7 @@ # ^TODO: Add note about expected image ID mismatches (e.g., between the docker # pulled entry and loaded one)? +from collections import namedtuple import json import logging from pathlib import Path @@ -43,6 +44,88 @@ lgr = logging.getLogger("datalad.container.adapters.oci") +def _normalize_reference(reference): + """Normalize a short repository name to a canonical one. + + Parameters + ---------- + reference : str + A Docker reference (e.g., "neurodebian", "library/neurodebian"). 
+ + Returns + ------- + A fully-qualified reference (e.g., "docker.io/library/neurodebian") + + Note: This tries to follow containers/image's splitDockerDomain(). + """ + parts = reference.split("/", maxsplit=1) + if len(parts) == 1 or (not any(c in parts[0] for c in [".", ":"]) + and parts[0] != "localhost"): + domain, remainder = "docker.io", reference + else: + domain, remainder = parts + + if domain == "docker.io" and "/" not in remainder: + remainder = "library/" + remainder + return domain + "/" + remainder + + +Reference = namedtuple("Reference", ["name", "tag", "digest"]) + + +def parse_docker_reference(reference, normalize=False, strip_transport=False): + """Parse a Docker reference into a name, tag, and digest. + + Parameters + ---------- + reference : str + A Docker reference (e.g., "busybox" or "library/busybox:latest") + normalize : bool, optional + Whether to normalize short names like "busybox" to the fully qualified + name ("docker.io/library/busybox") + strip_transport : bool, optional + Remove Skopeo transport value ("docker://" or "docker-daemon:") from + the name. Unless this is true, reference should not include a + transport. 
+ + Returns + ------- + A Reference namedtuple with .name, .tag, and .digest attributes + """ + if strip_transport: + try: + reference = reference.split(":", maxsplit=1)[1] + except IndexError: + raise ValueError("Reference did not have transport: {}" + .format(reference)) + if reference.startswith("//"): + reference = reference[2:] + + parts = reference.split("/") + last = parts[-1] + if "@" in last: + sep = "@" + elif ":" in last: + sep = ":" + else: + sep = None + + tag = None + digest = None + if sep: + repo, label = last.split(sep) + front = "/".join(parts[:-1] + [repo]) + if sep == "@": + digest = label + else: + tag = label + else: + front, tag = "/".join(parts), None + if normalize: + front = _normalize_reference(front) + return Reference(front, tag, digest) + + def save(image, path): """Save an image to an OCI-compliant directory. diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py new file mode 100644 index 00000000..bfad024d --- /dev/null +++ b/datalad_container/adapters/tests/test_oci.py @@ -0,0 +1,61 @@ +"""Test of oci adapter that do not depend on skopeo or docker being installed. 
+""" + +from datalad_container.adapters import oci +from datalad.tests.utils import ( + assert_raises, + eq_, +) + +# parse_docker_reference + + +def test_parse_docker_reference(): + eq_(oci.parse_docker_reference("neurodebian").name, + "neurodebian") + + +def test_parse_docker_reference_normalize(): + fn = oci.parse_docker_reference + for name in ["neurodebian", + "library/neurodebian", + "docker.io/neurodebian"]: + eq_(fn(name, normalize=True).name, + "docker.io/library/neurodebian") + + eq_(fn("quay.io/skopeo/stable", normalize=True).name, + "quay.io/skopeo/stable") + + +def test_parse_docker_reference_tag(): + fn = oci.parse_docker_reference + eq_(fn("busybox:1.32"), + ("busybox", "1.32", None)) + eq_(fn("busybox:1.32", normalize=True), + ("docker.io/library/busybox", "1.32", None)) + eq_(fn("docker.io/library/busybox:1.32"), + ("docker.io/library/busybox", "1.32", None)) + + +def test_parse_docker_reference_digest(): + fn = oci.parse_docker_reference + id_ = "sha256:a9286defaba7b3a519d585ba0e37d0b2cbee74ebfe590960b0b1d6a5e97d1e1d" + eq_(fn("busybox@{}".format(id_)), + ("busybox", None, id_)) + eq_(fn("busybox@{}".format(id_), normalize=True), + ("docker.io/library/busybox", None, id_)) + eq_(fn("docker.io/library/busybox@{}".format(id_)), + ("docker.io/library/busybox", None, id_)) + + +def test_parse_docker_reference_strip_transport(): + fn = oci.parse_docker_reference + eq_(fn("docker://neurodebian", strip_transport=True).name, + "neurodebian") + eq_(fn("docker-daemon:neurodebian", strip_transport=True).name, + "neurodebian") + + +def test_parse_docker_reference_strip_transport_no_transport(): + with assert_raises(ValueError): + oci.parse_docker_reference("neurodebian", strip_transport=True) From 5445874e87a317149eeac254e9049c61ef465708 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Tue, 3 Nov 2020 16:30:32 -0500 Subject: [PATCH 13/32] ENH: oci: Copy over tag when saving OCI directory An image stored as an OCI directory can have a tag. 
If the source has a tag specified, copy it over to the destination. Note that in upcoming commits will store the full source specification as an image annotation, so we won't rely on this when copying the image to docker-daemon:, but it still seems nice to have (e.g., when looking at the directory with skopeo-inspect). --- datalad_container/adapters/oci.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index b7bd37ae..607fd40d 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -146,8 +146,11 @@ def save(image, path): except OSError as exc: raise OSError(exc) from None path.mkdir(parents=True) - sp.run(["skopeo", "copy", image, "oci:" + str(path)], - check=True) + dest = "oci:" + str(path) + tag = parse_docker_reference(image).tag + if tag: + dest += ":" + tag + sp.run(["skopeo", "copy", image, dest], check=True) def get_image_id(path): From 9e9024ce48633905151ad50f793c7a0e74413ac0 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 5 Nov 2020 18:10:15 -0500 Subject: [PATCH 14/32] ENH: oci: Add utilities for storing and reading annotation field These will be used to store the value of the skopeo-copy source and then retrieve it at load time to make the docker-daemon: entry more informative. --- datalad_container/adapters/oci.py | 40 ++++++++++++++++++++ datalad_container/adapters/tests/test_oci.py | 36 ++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index 607fd40d..50962213 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -126,6 +126,46 @@ def parse_docker_reference(reference, normalize=False, strip_transport=False): return Reference(front, tag, digest) +def _store_annotation(path, key, value): + """Set a value of image's org.datalad.container.image.source annotation. 
+ + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + key, value : str + Key and value to store in the image's "annotations" field. + """ + index = path / "index.json" + index_info = json.loads(index.read_text()) + annot = index_info["manifests"][0].get("annotations", {}) + + annot[key] = value + index_info["manifests"][0]["annotations"] = annot + with index.open("w") as fh: + json.dump(index_info, fh) + + +def _get_annotation(path, key): + """Return value for `key` in an image's annotation. + + Parameters + ---------- + path : pathlib.Path + Image directory. It must contain only one image. + key : str + Key in the image's "annotations" field. + + Returns + ------- + str or None + """ + index = path / "index.json" + index_info = json.loads(index.read_text()) + # Assume one manifest because skopeo-inspect would fail anyway otherwise. + return index_info["manifests"][0].get("annotations", {}).get(key) + + def save(image, path): """Save an image to an OCI-compliant directory. diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py index bfad024d..778ba950 100644 --- a/datalad_container/adapters/tests/test_oci.py +++ b/datalad_container/adapters/tests/test_oci.py @@ -1,10 +1,14 @@ """Test of oci adapter that do not depend on skopeo or docker being installed. 
""" +import json + +from datalad.utils import Path from datalad_container.adapters import oci from datalad.tests.utils import ( assert_raises, eq_, + with_tempfile, ) # parse_docker_reference @@ -59,3 +63,35 @@ def test_parse_docker_reference_strip_transport(): def test_parse_docker_reference_strip_transport_no_transport(): with assert_raises(ValueError): oci.parse_docker_reference("neurodebian", strip_transport=True) + + +# _store_annotation and _get_annotation + +# This is the index.json contents of oci: copy of +# docker.io/library/busybox:1.32 +INDEX_VALUE = { + "schemaVersion": 2, + "manifests": [ + {"mediaType": "application/vnd.oci.image.manifest.v1+json", + "digest": "sha256:9f9f95fc6f6b24f0ab756a55b8326e8849ac6a82623bea29fc4c75b99ee166a3", + "size": 347}]} + + +@with_tempfile(mkdir=True) +def test_store_and_get_annotation(path): + path = Path(path) + with (path / "index.json").open("w") as fh: + json.dump(INDEX_VALUE, fh) + + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + None) + + oci._store_annotation(path, "org.opencontainers.image.ref.name", "1.32") + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + "1.32") + + oci._store_annotation(path, "another", "foo") + eq_(oci._get_annotation(path, "another"), + "foo") + eq_(oci._get_annotation(path, "org.opencontainers.image.ref.name"), + "1.32") From 5dcce3902711f7202dd28ccc0c41243475831fc4 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 5 Nov 2020 18:30:55 -0500 Subject: [PATCH 15/32] ENH: oci: Record source for skopeo-copy in image's annotation The OCI format allows annotations. Add one with the source value (which will be determined by what the caller gives to containers-add) so that we can use this information when copying the information to a docker-daemon: destination. 
--- datalad_container/adapters/oci.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index 50962213..f7849c5c 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -43,6 +43,8 @@ lgr = logging.getLogger("datalad.container.adapters.oci") +_IMAGE_SOURCE_KEY = "org.datalad.container.image.source" + def _normalize_reference(reference): """Normalize a short repository name to a canonical one. @@ -191,6 +193,7 @@ def save(image, path): if tag: dest += ":" + tag sp.run(["skopeo", "copy", image, dest], check=True) + _store_annotation(path, _IMAGE_SOURCE_KEY, image) def get_image_id(path): From 6634736f81c231bf70e74c5b069921dfa00e9cfc Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Wed, 4 Nov 2020 22:02:34 -0500 Subject: [PATCH 16/32] ENH: oci: Register a more informative name with docker-daemon The images copied to the daemon look like this $ docker images REPOSITORY TAG IMAGE ID CREATED SIZE datalad-container/bb sha256-98345e4 98345e418eb7 3 weeks ago 69.2MB That tag isn't useful because it just repeats the image ID. And the name after "datalad-container/" is the name of the directory, so with the default containers-add location it would be an uninformative "image". With the last commit, we store the source specification as an annotation in the OCI directory. Parse it and reuse the original repository name and tag. REPOSITORY TAG IMAGE ID CREATED SIZE datalad-container/debian buster-slim 98345e418eb7 3 weeks ago 69.2MB If the source has a digest instead of the tag, construct the daemon tag from that. 
--- datalad_container/adapters/oci.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index f7849c5c..bd37cfa2 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -242,9 +242,21 @@ def load(path): # order to copy it, the particular values don't matter for execution # purposes; they're chosen to help users identify the container in the # `docker images` output. - name = re.sub("[^a-z0-9-_.]", "", path.name.lower()[:10]) - assert image_id.startswith("sha256:") - tag = image_id.replace(":", "-")[:14] + source = _get_annotation(path, _IMAGE_SOURCE_KEY) + if source: + ref = parse_docker_reference(source, strip_transport=True) + name = ref.name + if ref.tag: + tag = ref.tag + else: + if ref.digest: + tag = "source-" + ref.digest.replace(":", "-")[:14] + else: + tag = "latest" + + else: + name = re.sub("[^a-z0-9-_.]", "", path.name.lower()[:10]) + tag = image_id.replace(":", "-")[:14] lgr.debug("Copying %s to Docker daemon", image_id) sp.run(["skopeo", "copy", "oci:" + str(path), From 9e2a0607dcb6b41a414d62f4ef9ca9900ee61c11 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Wed, 4 Nov 2020 13:29:27 -0500 Subject: [PATCH 17/32] ENH: containers-add: Wire up OCI adapter Add a new oci: scheme. The stacking of the schemes isn't ideal (oci:docker://, oci:docker-daemon:), but it allows for any skopeo transport to be used. Note: I'm not avoiding appending "//" for a conceptual reason (although there might be a valid one), but because I find "oci://docker://" to be ugly. Perhaps the consistency with "shub://" and "dhub://" outweighs that though. 
--- datalad_container/adapters/tests/test_oci.py | 4 ++ .../adapters/tests/test_oci_more.py | 56 +++++++++++++++++++ datalad_container/containers_add.py | 19 +++++-- 3 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 datalad_container/adapters/tests/test_oci_more.py diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py index 778ba950..e2bbb940 100644 --- a/datalad_container/adapters/tests/test_oci.py +++ b/datalad_container/adapters/tests/test_oci.py @@ -1,4 +1,8 @@ """Test of oci adapter that do not depend on skopeo or docker being installed. + + +See datalad_container.adapters.tests.test_oci_more for tests that do depend on +this. """ import json diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py new file mode 100644 index 00000000..69621de4 --- /dev/null +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -0,0 +1,56 @@ +"""Tests of oci adapter that depend on skopeo or docker being installed. + +See datalad_container.adapters.tests.test_oci for tests that do not depend on +this. +""" +from distutils.spawn import find_executable + +from datalad.api import ( + Dataset, + # FIXME: This is needed to register the dataset method, at least when + # running this single test module. Shouldn't that no longer be the case + # after datalad's 4b056a251f (BF/RF: Automagically find and import a + # datasetmethod if not yet bound, 2019-02-10)? 
+ containers_add, +) +from datalad.cmd import ( + StdOutErrCapture, + WitlessRunner, +) +from datalad_container.adapters import oci +from datalad_container.adapters.utils import get_docker_image_ids +from datalad.tests.utils import ( + assert_in, + integration, + skip_if_no_network, + SkipTest, + slow, + with_tempfile, +) + +for dep in ["skopeo", "docker"]: + if not find_executable(dep): + raise SkipTest("'{}' not found on path".format(dep)) + + +@skip_if_no_network +@integration +@slow # ~13s +@with_tempfile +def test_oci_add_and_run(path): + ds = Dataset(path).create(cfg_proc="text2git") + ds.containers_add(url="oci:docker://busybox:1.30", name="bb") + + image_path = ds.repo.pathobj / ".datalad" / "environments" / "bb" / "image" + image_id = oci.get_image_id(image_path) + existed = image_id in get_docker_image_ids() + + try: + out = WitlessRunner(cwd=ds.path).run( + ["datalad", "containers-run", "-n", "bb", + "sh -c 'busybox | head -1'"], + protocol=StdOutErrCapture) + finally: + if not existed: + WitlessRunner().run(["docker", "rmi", image_id]) + assert_in("BusyBox v1.30", out["stdout"]) diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index c403bd31..9657610a 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -21,6 +21,7 @@ from datalad.support.constraints import EnsureNone from datalad.support.exceptions import InsufficientArgumentsError from datalad.interface.results import get_status_dict +from datalad.utils import Path from .definitions import definitions @@ -66,6 +67,8 @@ def _guess_call_fmt(ds, name, url): return 'singularity exec {img} {cmd}' elif url.startswith('dhub://'): return op.basename(sys.executable) + ' -m datalad_container.adapters.docker run {img} {cmd}' + elif url.startswith('oci:'): + return op.basename(sys.executable) + ' -m datalad_container.adapters.oci run {img} {cmd}' def _ensure_datalad_remote(repo): @@ -121,10 +124,15 @@ class ContainersAdd(Interface): 
the URL scheme is one recognized by Singularity, 'shub://' or 'docker://', the command format string will be auto-guessed when [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified. For the - scheme 'dhub://', the rest of the URL will be interpreted as the - argument to 'docker pull', the image will be saved to the location - specified by `name`, and the call format will be auto-guessed if - not given.""", + scheme 'oci:', the rest of the URL will be interpreted as the + source argument to a `skopeo copy` call and the image will be saved + as an OCI-compliant directory at location specified by `name`. + Similarly, there is a 'dhub://' scheme where the rest of the URL + will be interpreted as the argument to 'docker pull', the image + will be saved to the location specified by `name`. However, using + the 'oci:' scheme is recommended if you have skopeo installed. The + call format for the 'oci:' and 'dhub://' schemes will be + auto-guessed if not given.""", metavar="URL", constraints=EnsureStr() | EnsureNone(), ), @@ -263,6 +271,9 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, docker_image, image) runner.run(["docker", "pull", docker_image]) docker.save(docker_image, image) + elif url.startswith("oci:"): + from .adapters import oci + oci.save(url[len("oci:"):], Path(image)) elif url.startswith("docker://"): image_dir, image_basename = op.split(image) if not image_basename: From 230971249acef9c0ad7e8e0c52d9ab7d408ebf99 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Mon, 9 Nov 2020 15:15:33 -0500 Subject: [PATCH 18/32] MV: Add utils module with containers-add's _ensure_datalad_remote The next commit will use this logic in the oci adapter as well, and, it'd be nice (though not strictly necessary) to avoid oci and containers_add importing each other. 
--- datalad_container/containers_add.py | 25 ++----------------- .../tests/{test_add.py => test_utils.py} | 9 +++---- datalad_container/utils.py | 25 +++++++++++++++++++ 3 files changed, 30 insertions(+), 29 deletions(-) rename datalad_container/tests/{test_add.py => test_utils.py} (83%) create mode 100644 datalad_container/utils.py diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index 9657610a..afd589c0 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -24,6 +24,7 @@ from datalad.utils import Path from .definitions import definitions +from .utils import ensure_datalad_remote lgr = logging.getLogger("datalad.containers.containers_add") @@ -71,28 +72,6 @@ def _guess_call_fmt(ds, name, url): return op.basename(sys.executable) + ' -m datalad_container.adapters.oci run {img} {cmd}' -def _ensure_datalad_remote(repo): - """Initialize and enable datalad special remote if it isn't already.""" - dl_remote = None - for info in repo.get_special_remotes().values(): - if info["externaltype"] == "datalad": - dl_remote = info["name"] - break - - if not dl_remote: - from datalad.consts import DATALAD_SPECIAL_REMOTE - from datalad.customremotes.base import init_datalad_remote - - init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True) - elif repo.is_special_annex_remote(dl_remote, check_if_known=False): - lgr.debug("datalad special remote '%s' is already enabled", - dl_remote) - else: - lgr.debug("datalad special remote '%s' found. 
Enabling", - dl_remote) - repo.enable_remote(dl_remote) - - @build_doc # all commands must be derived from Interface class ContainersAdd(Interface): @@ -294,7 +273,7 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, copyfile(url, image) else: if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'): - _ensure_datalad_remote(ds.repo) + ensure_datalad_remote(ds.repo) try: ds.repo.add_url_to_file(image, imgurl) diff --git a/datalad_container/tests/test_add.py b/datalad_container/tests/test_utils.py similarity index 83% rename from datalad_container/tests/test_add.py rename to datalad_container/tests/test_utils.py index a764286b..a5efe41d 100644 --- a/datalad_container/tests/test_add.py +++ b/datalad_container/tests/test_utils.py @@ -8,10 +8,7 @@ from datalad.tests.utils import with_tempfile from datalad.utils import Path -from datalad_container.containers_add import _ensure_datalad_remote - -# NOTE: At the moment, testing of the containers-add itself happens implicitly -# via use in other tests. 
+from datalad_container.utils import ensure_datalad_remote @with_tempfile @@ -19,7 +16,7 @@ def test_ensure_datalad_remote_init_and_enable_needed(path): ds = Dataset(path).create(force=True) repo = ds.repo assert_false(repo.get_remotes()) - _ensure_datalad_remote(repo) + ensure_datalad_remote(repo) assert_in("datalad", repo.get_remotes()) @@ -34,7 +31,7 @@ def check_ensure_datalad_remote_maybe_enable(autoenable, path): repo = ds_b.repo if not autoenable: assert_not_in("datalad", repo.get_remotes()) - _ensure_datalad_remote(repo) + ensure_datalad_remote(repo) assert_in("datalad", repo.get_remotes()) diff --git a/datalad_container/utils.py b/datalad_container/utils.py new file mode 100644 index 00000000..f6bf1175 --- /dev/null +++ b/datalad_container/utils.py @@ -0,0 +1,25 @@ +import logging + +lgr = logging.getLogger("datalad.containers.utils") + + +def ensure_datalad_remote(repo): + """Initialize and enable datalad special remote if it isn't already.""" + dl_remote = None + for info in repo.get_special_remotes().values(): + if info["externaltype"] == "datalad": + dl_remote = info["name"] + break + + if not dl_remote: + from datalad.consts import DATALAD_SPECIAL_REMOTE + from datalad.customremotes.base import init_datalad_remote + + init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True) + elif repo.is_special_annex_remote(dl_remote, check_if_known=False): + lgr.debug("datalad special remote '%s' is already enabled", + dl_remote) + else: + lgr.debug("datalad special remote '%s' found. Enabling", + dl_remote) + repo.enable_remote(dl_remote) From fc7b847abb7903aa2f2df0dc33a7449e1f32183d Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Wed, 4 Nov 2020 14:59:01 -0500 Subject: [PATCH 19/32] ENH: containers-add: Try to link layers in OCI directory TODO: Finalize approach in Datalad for Docker Registry URLs. 
--- datalad_container/adapters/oci.py | 62 +++++++++++++++++++ .../adapters/tests/test_oci_more.py | 23 +++++++ datalad_container/containers_add.py | 4 ++ 3 files changed, 89 insertions(+) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index bd37cfa2..698d2245 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -196,6 +196,68 @@ def save(image, path): _store_annotation(path, _IMAGE_SOURCE_KEY, image) +_ENDPOINTS = {"docker.io": "https://registry-1.docker.io/v2/"} + + +def link(ds, path, reference): + """Add Docker registry URLs to annexed layer images. + + Parameters + ---------- + ds : Dataset + path : pathlib.Path + Absolute path to the image directory. + reference : str + Docker reference (e.g., "busybox:1.32"). This should not include the + transport (i.e. "docker://"). + """ + from datalad.downloaders.providers import Providers + from datalad.support.exceptions import CommandError + from datalad_container.utils import ensure_datalad_remote + + res = sp.run(["skopeo", "inspect", "oci:" + str(path)], + stdout=sp.PIPE, stderr=sp.PIPE, + universal_newlines=True, check=True) + info = json.loads(res.stdout) + + ref = parse_docker_reference(reference, normalize=True) + registry, name = ref.name.split("/", maxsplit=1) + endpoint = _ENDPOINTS.get(registry) + if not endpoint: + lgr.debug("No known endpoint for %s. Skipping linking.", registry) + return + provider = Providers.from_config_files().get_provider( + endpoint + name, only_nondefault=True) + if not provider: + lgr.debug("Required Datalad provider configuration " + "for Docker registry links not detected. 
Skipping linking.") + return + + layers = {} # path => digest + for layer in info["Layers"]: + algo, digest = layer.split(":") + layer_path = path / "blobs" / algo / digest + layers[layer_path] = layer + + ds_repo = ds.repo + checked_dl_remote = False + for st in ds.status(layers.keys(), annex="basic", result_renderer=None): + if "keyname" in st: + if not checked_dl_remote: + ensure_datalad_remote(ds_repo) + checked_dl_remote = True + path = Path(st["path"]) + url = "{}{}/blobs/{}".format(endpoint, name, layers[path]) + try: + ds_repo.add_url_to_file( + path, url, batch=True, options=['--relaxed']) + except CommandError as exc: + lgr.warning("Registering %s with %s failed: %s", + path, url, exc) + else: + lgr.warning("Skipping non-annexed layer: %s", st["path"]) + + def get_image_id(path): """Return a directory's image ID. diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py index 69621de4..3883cce1 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -17,11 +17,16 @@ StdOutErrCapture, WitlessRunner, ) +from datalad.consts import ( + DATALAD_SPECIAL_REMOTE, + DATALAD_SPECIAL_REMOTES_UUIDS, +) from datalad_container.adapters import oci from datalad_container.adapters.utils import get_docker_image_ids from datalad.tests.utils import ( assert_in, integration, + ok_, skip_if_no_network, SkipTest, slow, @@ -54,3 +59,21 @@ def test_oci_add_and_run(path): if not existed: WitlessRunner().run(["docker", "rmi", image_id]) assert_in("BusyBox v1.30", out["stdout"]) + + from datalad.downloaders.providers import Providers + if not Providers.from_config_files().get_provider( + "https://registry-1.docker.io/v2/library", + only_nondefault=True): + # The rest of the test is about Docker Hub registry links, which + # require provider configuration for authentication. 
+ return + + layers = [r["path"] + for r in ds.status(image_path / "blobs", annex="basic", + result_renderer=None) + if "key" in r] + ok_(layers) + + dl_uuid = DATALAD_SPECIAL_REMOTES_UUIDS[DATALAD_SPECIAL_REMOTE] + for where_res in ds.repo.whereis(list(map(str, layers))): + assert_in(dl_uuid, where_res) diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index afd589c0..f64cfb4f 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -319,3 +319,7 @@ def __call__(name, url=None, dataset=None, call_fmt=None, image=None, yield r result["status"] = "ok" yield result + + # We need to do this after the image is saved. + if url and url.startswith("oci:docker://"): + oci.link(ds, Path(image), url[len("oci:docker://"):]) From f13bcdd11c66ccfd5b34f2359edbd454a30b972e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 27 Sep 2025 09:03:06 -0400 Subject: [PATCH 20/32] BF(TST): minimally account for our migration to pytest --- datalad_container/adapters/tests/test_oci.py | 4 ++-- datalad_container/adapters/tests/test_oci_more.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py index e2bbb940..59b5aa9f 100644 --- a/datalad_container/adapters/tests/test_oci.py +++ b/datalad_container/adapters/tests/test_oci.py @@ -9,7 +9,7 @@ from datalad.utils import Path from datalad_container.adapters import oci -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( assert_raises, eq_, with_tempfile, @@ -82,7 +82,7 @@ def test_parse_docker_reference_strip_transport_no_transport(): @with_tempfile(mkdir=True) -def test_store_and_get_annotation(path): +def test_store_and_get_annotation(path=None): path = Path(path) with (path / "index.json").open("w") as fh: json.dump(INDEX_VALUE, fh) diff --git a/datalad_container/adapters/tests/test_oci_more.py 
b/datalad_container/adapters/tests/test_oci_more.py index 3883cce1..4d8dbad9 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -23,7 +23,7 @@ ) from datalad_container.adapters import oci from datalad_container.adapters.utils import get_docker_image_ids -from datalad.tests.utils import ( +from datalad.tests.utils_pytest import ( assert_in, integration, ok_, @@ -42,7 +42,7 @@ @integration @slow # ~13s @with_tempfile -def test_oci_add_and_run(path): +def test_oci_add_and_run(path=None): ds = Dataset(path).create(cfg_proc="text2git") ds.containers_add(url="oci:docker://busybox:1.30", name="bb") From a6bfb559eb4766563eee14c35d29ebc612c98fee Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 27 Sep 2025 09:22:47 -0400 Subject: [PATCH 21/32] chore: appveyor -- progress Ubuntu to 2204 otherwise even singularity does not install --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index ed70c53f..41d98f6f 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -59,7 +59,7 @@ environment: # All of these are common to all matrix runs ATM, so pre-defined here and to be overloaded if needed DTS: datalad_container - APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2004 + APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 INSTALL_SYSPKGS: python3-venv xz-utils jq # system git-annex is way too old, use better one INSTALL_GITANNEX: git-annex -m deb-url --url http://snapshot.debian.org/archive/debian/20210906T204127Z/pool/main/g/git-annex/git-annex_8.20210903-1_amd64.deb From 038a3d1b5534e24def96c2444d4f22867cb0bf4a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 27 Sep 2025 09:31:48 -0400 Subject: [PATCH 22/32] install libffi7 since otherwise git-annex install fails --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 41d98f6f..77886465 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -60,7 
+60,7 @@ environment: # All of these are common to all matrix runs ATM, so pre-defined here and to be overloaded if needed DTS: datalad_container APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - INSTALL_SYSPKGS: python3-venv xz-utils jq + INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 # system git-annex is way too old, use better one INSTALL_GITANNEX: git-annex -m deb-url --url http://snapshot.debian.org/archive/debian/20210906T204127Z/pool/main/g/git-annex/git-annex_8.20210903-1_amd64.deb CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov From ed733e30793321d0a83fb9ecb7bda39d26d98f65 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 27 Sep 2025 09:52:36 -0400 Subject: [PATCH 23/32] [DATALAD RUNCMD] chore: drop use of find_executable (use shutil.which) === Do not change lines below === { "chain": [], "cmd": "sed -i -e 's,from distutils.spawn import find_executable,from shutil import which,g' -e 's,find_executable(,which(,g' datalad_container/adapters/tests/test_oci_more.py", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." } ^^^ Do not change lines above ^^^ --- datalad_container/adapters/tests/test_oci_more.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py index 4d8dbad9..17d470e6 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -3,7 +3,7 @@ See datalad_container.adapters.tests.test_oci for tests that do not depend on this. 
""" -from distutils.spawn import find_executable +from shutil import which from datalad.api import ( Dataset, @@ -34,7 +34,7 @@ ) for dep in ["skopeo", "docker"]: - if not find_executable(dep): + if not which(dep): raise SkipTest("'{}' not found on path".format(dep)) From afc1ea7a40505c33ce1c28cda90627ab430dee63 Mon Sep 17 00:00:00 2001 From: DataLad Bot Date: Sat, 27 Sep 2025 13:59:49 +0000 Subject: [PATCH 24/32] [release-action] Autogenerate changelog snippet for PR 277 --- changelog.d/pr-277.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changelog.d/pr-277.md diff --git a/changelog.d/pr-277.md b/changelog.d/pr-277.md new file mode 100644 index 00000000..935fa469 --- /dev/null +++ b/changelog.d/pr-277.md @@ -0,0 +1,6 @@ +### 🚀 Enhancements and New Features + +- Add skopeo-based adapter for working with OCI images. + [PR #277](https://github.com/datalad/datalad-container/pull/277) (by [@yarikoptic](https://github.com/yarikoptic)) + continued an old/never finalized/closed + [PR #136](https://github.com/datalad/datalad-container/pull/136) (by [@kyleam](https://github.com/kyleam)). From ad1e343bb438b8612fa67b548c80725077955607 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 7 Oct 2025 08:34:35 -0400 Subject: [PATCH 25/32] Just a minor syntax fix spotted --- datalad_container/adapters/oci.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index 698d2245..15c4d6ca 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -283,7 +283,7 @@ def get_image_id(path): def load(path): """Load OCI image from `path`. - Currently the only supported load destination is the Docker daemon. + Currently, the only supported load destination is the Docker daemon. 
Parameters ---------- From f0a5cd3f951c3888aba4e956d616777e3f60b452 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 15 Oct 2025 09:43:18 -0400 Subject: [PATCH 26/32] Add CLAUDE.md for AI assistant guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added comprehensive documentation for Claude Code to work effectively with this codebase, including architecture overview, development commands, and key implementation details. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- CLAUDE.md | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..7485cb2a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,178 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +datalad-container is a DataLad extension for working with containerized computational environments. It enables tracking, versioning, and execution of containerized workflows within DataLad datasets using Singularity/Apptainer, Docker, and OCI-compliant images. + +## Core Architecture + +### Command Suite Structure + +The extension registers a command suite with DataLad through setuptools entry points (see `setup.cfg`). The main commands are: + +- **containers-add** (`containers_add.py`) - Add/update container images to a dataset +- **containers-list** (`containers_list.py`) - List configured containers +- **containers-remove** (`containers_remove.py`) - Remove containers from configuration +- **containers-run** (`containers_run.py`) - Execute commands within containers + +All commands are registered in `datalad_container/__init__.py` via the `command_suite` tuple. 
+ +### Container Adapters + +The `adapters/` directory contains transport-specific handlers: + +- **docker.py** - Docker Hub images (`dhub://` scheme) +- **oci.py** - OCI-compliant images using Skopeo (`oci:` scheme) + - Saves images as trackable directory structures + - Supports loading images to Docker daemon on-demand + - Uses Skopeo for image manipulation + +Each adapter implements `save()` and `run()` functions for their respective container formats. + +### Container Discovery + +`find_container.py` implements the logic for locating containers: +- Searches current dataset and subdatasets +- Supports hierarchical container names (e.g., `subds/container-name`) +- Falls back to path-based and name-based lookups +- Automatically installs subdatasets if needed to access containers + +### Configuration Storage + +Container metadata is stored in `.datalad/config` with the pattern: +``` +datalad.containers..image = +datalad.containers..cmdexec = +datalad.containers..updateurl = +datalad.containers..extra-input = +``` + +Default container location: `.datalad/environments//image` + +## Development Commands + +### Setup Development Environment + +```bash +# Using uv (preferred) +uv venv +source .venv/bin/activate +uv pip install -e .[devel] + +# Or traditional method +python3 -m venv .venv +source .venv/bin/activate +pip install -e .[devel] +``` + +### Running Tests + +```bash +# Run all tests +pytest -s -v datalad_container + +# Run specific test file +pytest -s -v datalad_container/tests/test_containers.py + +# Run specific test function +pytest -s -v datalad_container/tests/test_containers.py::test_add_noop + +# Run with coverage +pytest -s -v --cov=datalad_container datalad_container + +# Skip slow tests (marked with 'turtle') +pytest -s -v -m "not turtle" datalad_container +``` + +### Code Quality Tools + +Pre-commit hooks are configured in `.pre-commit-config.yaml`: + +```bash +# Install pre-commit hooks +pre-commit install + +# Run manually on all files 
+pre-commit run --all-files + +# Individual tools +isort datalad_container/ # Sort imports +codespell # Spell checking +``` + +### Building Documentation + +```bash +cd docs +make html +# Output in docs/build/html/ +``` + +### Important Testing Notes + +- Tests use pytest fixtures defined in `datalad_container/conftest.py` and `tests/fixtures/` +- The project uses `@with_tempfile` and `@with_tree` decorators from DataLad's test utilities +- Docker tests may require Docker to be running +- Singularity/Apptainer tests require the container runtime to be installed +- Some tests are marked with `@pytest.mark.turtle` for slow-running tests + +## Key Implementation Details + +### URL Scheme Handling + +Container sources are identified by URL schemes: +- `shub://` - Singularity Hub (legacy, uses requests library) +- `docker://` - Direct Singularity pull from Docker Hub +- `dhub://` - Docker images stored locally via docker pull/save +- `oci:` - OCI images stored as directories via Skopeo + +The scheme determines both storage format and execution template. 
+ +### Execution Format Strings + +Call format strings support placeholders: +- `{img}` - Path to container image +- `{cmd}` - Command to execute +- `{img_dspath}` - Relative path to dataset containing image +- `{img_dirpath}` - Directory containing the image +- `{python}` - Path to current Python executable + +Example: `singularity exec {img} {cmd}` + +### Git-annex Integration + +- Large container images are managed by git-annex +- For `shub://` URLs, uses DataLad's special remote if available +- The `ensure_datalad_remote()` function (in `utils.py`) initializes the special remote when needed +- For `oci:docker://` images, registry URLs are added to annexed layers for efficient retrieval + +### Path Normalization + +`utils.py` contains `_normalize_image_path()` to handle cross-platform path issues: +- Config historically stored platform-specific paths +- Now standardizes to POSIX paths in config +- Maintains backward compatibility with Windows paths + +## Testing Considerations + +- Mark AI-generated tests with `@pytest.mark.ai_generated` +- Tests should not `chdir()` the entire process; use `cwd` parameter instead +- Use `common_kwargs = {'result_renderer': 'disabled'}` in tests to suppress output +- Many tests use DataLad's `with_tempfile` decorator for temporary test directories + +## Dependencies + +Core dependencies: +- datalad >= 0.18.0 +- requests >= 1.2 (for Singularity Hub communication) + +Container runtimes (at least one required): +- Singularity or Apptainer for Singularity images +- Docker for Docker and OCI image execution +- Skopeo for OCI image manipulation + +## Version Management + +This project uses `versioneer.py` for automatic version management from git tags. Version info is in `datalad_container/_version.py` (auto-generated, excluded from coverage). 
From 33e4927449203404abb8a807e825734883b8b512 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 15 Oct 2025 17:02:25 -0400 Subject: [PATCH 27/32] Add generic registry support to OCI adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extended the OCI adapter to support any container registry without hardcoding endpoints. The link() function now dynamically constructs registry API endpoints using the pattern https://{registry}/v2/, with Docker Hub as the only special case (registry-1.docker.io). This enables automatic support for registries like: - quay.io (Quay.io registry) - gcr.io (Google Container Registry) - ghcr.io (GitHub Container Registry) - Any other V2-compatible registry Changes: - Removed hardcoded _ENDPOINTS dictionary - Added dynamic endpoint construction in link() function - Added unit tests for parsing references from alternative registries - Added integration tests using real images: - ghcr.io/astral-sh/uv:latest for ghcr.io testing - quay.io/linuxserver.io/baseimage-alpine:3.18 for quay.io testing The link() function will add registry URLs to annexed layer images for any registry when proper provider configuration is available, enabling efficient retrieval through git-annex. All new tests are marked with @pytest.mark.ai_generated as per project standards. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- datalad_container/adapters/oci.py | 14 ++--- datalad_container/adapters/tests/test_oci.py | 24 ++++++++ .../adapters/tests/test_oci_more.py | 57 +++++++++++++++++++ 3 files changed, 88 insertions(+), 7 deletions(-) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index 15c4d6ca..8bd9d508 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -196,9 +196,6 @@ def save(image, path): _store_annotation(path, _IMAGE_SOURCE_KEY, image) -_ENDPOINTS = {"docker.io": "https://registry-1.docker.io/v2/"} - - def link(ds, path, reference): """Add Docker registry URLs to annexed layer images. @@ -222,10 +219,13 @@ def link(ds, path, reference): ref = parse_docker_reference(reference, normalize=True) registry, name = ref.name.split("/", maxsplit=1) - endpoint = _ENDPOINTS.get(registry) - if not endpoint: - lgr.debug("No known endpoint for %s. Skipping linking.", registry) - return + + # Docker Hub has a special endpoint, all others follow the pattern + # https://{registry}/v2/ + if registry == "docker.io": + endpoint = "https://registry-1.docker.io/v2/" + else: + endpoint = f"https://{registry}/v2/" provider = Providers.from_config_files().get_provider( endpoint + name, only_nondefault=True) if not provider: diff --git a/datalad_container/adapters/tests/test_oci.py b/datalad_container/adapters/tests/test_oci.py index 59b5aa9f..b6d4a4f5 100644 --- a/datalad_container/adapters/tests/test_oci.py +++ b/datalad_container/adapters/tests/test_oci.py @@ -7,6 +7,8 @@ import json +import pytest + from datalad.utils import Path from datalad_container.adapters import oci from datalad.tests.utils_pytest import ( @@ -33,6 +35,10 @@ def test_parse_docker_reference_normalize(): eq_(fn("quay.io/skopeo/stable", normalize=True).name, "quay.io/skopeo/stable") + eq_(fn("ghcr.io/astral-sh/uv", normalize=True).name, + "ghcr.io/astral-sh/uv") + 
eq_(fn("gcr.io/my-project/my-image", normalize=True).name, + "gcr.io/my-project/my-image") def test_parse_docker_reference_tag(): @@ -45,6 +51,24 @@ def test_parse_docker_reference_tag(): ("docker.io/library/busybox", "1.32", None)) +@pytest.mark.ai_generated +def test_parse_docker_reference_alternative_registries(): + """Test parsing references from alternative registries like quay.io and ghcr.io.""" + fn = oci.parse_docker_reference + + # Test quay.io with tag + eq_(fn("quay.io/linuxserver.io/baseimage-alpine:3.18"), + ("quay.io/linuxserver.io/baseimage-alpine", "3.18", None)) + + # Test ghcr.io with tag + eq_(fn("ghcr.io/astral-sh/uv:latest"), + ("ghcr.io/astral-sh/uv", "latest", None)) + + # Test gcr.io with tag + eq_(fn("gcr.io/my-project/my-image:v1.0"), + ("gcr.io/my-project/my-image", "v1.0", None)) + + def test_parse_docker_reference_digest(): fn = oci.parse_docker_reference id_ = "sha256:a9286defaba7b3a519d585ba0e37d0b2cbee74ebfe590960b0b1d6a5e97d1e1d" diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py index 17d470e6..3b8e6dbe 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -32,6 +32,7 @@ slow, with_tempfile, ) +import pytest for dep in ["skopeo", "docker"]: if not which(dep): @@ -77,3 +78,59 @@ def test_oci_add_and_run(path=None): dl_uuid = DATALAD_SPECIAL_REMOTES_UUIDS[DATALAD_SPECIAL_REMOTE] for where_res in ds.repo.whereis(list(map(str, layers))): assert_in(dl_uuid, where_res) + + +@pytest.mark.ai_generated +@skip_if_no_network +@integration +@slow +@with_tempfile +def test_oci_ghcr_registry(path=None): + """Test adding and running a container from GitHub Container Registry (ghcr.io).""" + ds = Dataset(path).create(cfg_proc="text2git") + ds.containers_add(url="oci:docker://ghcr.io/astral-sh/uv:latest", name="uv") + + image_path = ds.repo.pathobj / ".datalad" / "environments" / "uv" / "image" + image_id = 
oci.get_image_id(image_path) + existed = image_id in get_docker_image_ids() + + try: + out = WitlessRunner(cwd=ds.path).run( + ["datalad", "containers-run", "-n", "uv", "--version"], + protocol=StdOutErrCapture) + finally: + if not existed: + WitlessRunner().run(["docker", "rmi", image_id]) + + # uv --version should output something like "uv X.Y.Z" + assert_in("uv", out["stdout"]) + + +@pytest.mark.ai_generated +@skip_if_no_network +@integration +@slow +@with_tempfile +def test_oci_quay_registry(path=None): + """Test adding and running a container from Quay.io registry.""" + ds = Dataset(path).create(cfg_proc="text2git") + ds.containers_add( + url="oci:docker://quay.io/linuxserver.io/baseimage-alpine:3.18", + name="alpine" + ) + + image_path = ds.repo.pathobj / ".datalad" / "environments" / "alpine" / "image" + image_id = oci.get_image_id(image_path) + existed = image_id in get_docker_image_ids() + + try: + out = WitlessRunner(cwd=ds.path).run( + ["datalad", "containers-run", "-n", "alpine", + "cat /etc/os-release"], + protocol=StdOutErrCapture) + finally: + if not existed: + WitlessRunner().run(["docker", "rmi", image_id]) + + # Should contain Alpine Linux identifier + assert_in("Alpine", out["stdout"]) From 0e3ee4b2f20a4e17c942bf26410e49ac1fd834b3 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 16 Oct 2025 08:15:54 -0400 Subject: [PATCH 28/32] Add docker.io to registry tests and verify annex URL availability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhanced the parametrized registry test to include: 1. Docker Hub (docker.io) with busybox:1.30 for consistency 2. Verification that annexed blobs exist in the OCI image 3. Check that all annexed files have URLs registered in either the datalad or web remote for efficient retrieval The test now verifies that `git annex find --not --in datalad --and --not --in web` returns empty, ensuring all blobs are accessible through git-annex remotes. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../adapters/tests/test_oci_more.py | 85 +++++++++++-------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py index 3b8e6dbe..dd5ae700 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -25,6 +25,7 @@ from datalad_container.adapters.utils import get_docker_image_ids from datalad.tests.utils_pytest import ( assert_in, + eq_, integration, ok_, skip_if_no_network, @@ -84,53 +85,63 @@ def test_oci_add_and_run(path=None): @skip_if_no_network @integration @slow +@pytest.mark.parametrize("registry,image_ref,container_name,test_cmd,expected_output", [ + ("docker.io", "busybox:1.30", "busybox", "sh -c 'busybox | head -1'", "BusyBox v1.30"), + ("ghcr.io", "ghcr.io/astral-sh/uv:latest", "uv", "--version", "uv"), + ("quay.io", "quay.io/linuxserver.io/baseimage-alpine:3.18", "alpine", + "cat /etc/os-release", "Alpine"), +]) @with_tempfile -def test_oci_ghcr_registry(path=None): - """Test adding and running a container from GitHub Container Registry (ghcr.io).""" +def test_oci_alternative_registries(registry, image_ref, container_name, + test_cmd, expected_output, path=None): + """Test adding and running containers from alternative registries. + + Also verifies that annexed layer blobs have URLs registered either in + datalad or web remotes for efficient retrieval. 
+ + Parameters + ---------- + registry : str + Registry name (for test identification) + image_ref : str + Full image reference (e.g., "ghcr.io/astral-sh/uv:latest") + container_name : str + Name to register the container under + test_cmd : str + Command to run in the container + expected_output : str + String expected to be in the command output + """ ds = Dataset(path).create(cfg_proc="text2git") - ds.containers_add(url="oci:docker://ghcr.io/astral-sh/uv:latest", name="uv") + ds.containers_add(url=f"oci:docker://{image_ref}", name=container_name) - image_path = ds.repo.pathobj / ".datalad" / "environments" / "uv" / "image" + image_path = ds.repo.pathobj / ".datalad" / "environments" / container_name / "image" image_id = oci.get_image_id(image_path) existed = image_id in get_docker_image_ids() try: out = WitlessRunner(cwd=ds.path).run( - ["datalad", "containers-run", "-n", "uv", "--version"], + ["datalad", "containers-run", "-n", container_name, test_cmd], protocol=StdOutErrCapture) finally: if not existed: WitlessRunner().run(["docker", "rmi", image_id]) - # uv --version should output something like "uv X.Y.Z" - assert_in("uv", out["stdout"]) - - -@pytest.mark.ai_generated -@skip_if_no_network -@integration -@slow -@with_tempfile -def test_oci_quay_registry(path=None): - """Test adding and running a container from Quay.io registry.""" - ds = Dataset(path).create(cfg_proc="text2git") - ds.containers_add( - url="oci:docker://quay.io/linuxserver.io/baseimage-alpine:3.18", - name="alpine" - ) - - image_path = ds.repo.pathobj / ".datalad" / "environments" / "alpine" / "image" - image_id = oci.get_image_id(image_path) - existed = image_id in get_docker_image_ids() - - try: - out = WitlessRunner(cwd=ds.path).run( - ["datalad", "containers-run", "-n", "alpine", - "cat /etc/os-release"], - protocol=StdOutErrCapture) - finally: - if not existed: - WitlessRunner().run(["docker", "rmi", image_id]) - - # Should contain Alpine Linux identifier - assert_in("Alpine", 
out["stdout"]) + assert_in(expected_output, out["stdout"]) + + # Check that there are annexed files in the image blobs + blobs_path = image_path / "blobs" + annexed_blobs = [r["path"] + for r in ds.status(blobs_path, annex="basic", + result_renderer=None) + if "key" in r] + ok_(annexed_blobs, f"Expected to find annexed blobs in {blobs_path}") + + # Verify all annexed files are available from datalad or web remote + # git annex find --not --in datalad --and --not --in web should be empty + result = WitlessRunner(cwd=ds.path).run( + ["git", "annex", "find", "--not", "--in", "datalad", + "--and", "--not", "--in", "web"] + annexed_blobs, + protocol=StdOutErrCapture) + eq_(result["stdout"].strip(), "", + "All annexed blobs should be available from datalad or web remote") From 4e68a56b353e6051546c32c0f94ac7e448d3d6b6 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 16 Oct 2025 08:22:35 -0400 Subject: [PATCH 29/32] Add drop/get cycle test to verify remote retrieval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhanced the parametrized registry test to verify the complete drop/get cycle for the entire dataset: 1. Drops all annexed content in the dataset 2. Verifies that files were actually dropped (non-empty results) 3. Gets everything back from remotes 4. Verifies that files were retrieved (non-empty results) This ensures that the registered URLs in datalad/web remotes are functional and files can be successfully retrieved from the registry. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../adapters/tests/test_oci_more.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py index dd5ae700..a71cd481 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -96,8 +96,9 @@ def test_oci_alternative_registries(registry, image_ref, container_name, test_cmd, expected_output, path=None): """Test adding and running containers from alternative registries. - Also verifies that annexed layer blobs have URLs registered either in - datalad or web remotes for efficient retrieval. + Also verifies that: + - Annexed layer blobs have URLs registered in datalad or web remotes + - Files can be dropped and retrieved from remotes (tests the full cycle) Parameters ---------- @@ -145,3 +146,14 @@ def test_oci_alternative_registries(registry, image_ref, container_name, protocol=StdOutErrCapture) eq_(result["stdout"].strip(), "", "All annexed blobs should be available from datalad or web remote") + + # Test drop and get cycle to verify files can be retrieved from remotes + # Drop all annexed content in the dataset + drop_results = ds.drop(".", result_renderer=None, on_failure='ignore') + # Verify that something was actually dropped + ok_(drop_results, "Expected to drop some annexed files") + + # Get everything back to verify it can be retrieved from remotes + get_results = ds.get(".", result_renderer=None) + # Verify that files were retrieved + ok_(get_results, "Expected to retrieve files from remotes") From 1124be1c98655923bc8e1f8a32d0ea25834a0f7c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 20 Oct 2025 09:09:37 -0400 Subject: [PATCH 30/32] Add session-wide PATH fixture to ensure sys.executable is first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit This fixture ensures that sys.executable's directory is first in PATH for the duration of tests. This is needed when tests spawn subprocesses that need to import modules from the same Python environment that's running pytest, preventing "No module named X" errors. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- datalad_container/conftest.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/datalad_container/conftest.py b/datalad_container/conftest.py index f6b40986..dd716f22 100644 --- a/datalad_container/conftest.py +++ b/datalad_container/conftest.py @@ -1,3 +1,33 @@ +import os +import sys + +import pytest + from datalad.conftest import setup_package from .tests.fixtures import * # noqa: F401, F403 # lgtm [py/polluting-import] + + +@pytest.fixture(scope="session", autouse=True) +def ensure_sys_executable_in_path(): + """Ensure sys.executable's directory is first in PATH for test duration. + + This is needed when tests spawn subprocesses that need to import modules + from the same Python environment that's running pytest. 
+ """ + python_dir = os.path.dirname(sys.executable) + original_path = os.environ.get("PATH", "") + path_dirs = original_path.split(os.pathsep) + + # Check if python_dir is already first in PATH + if path_dirs and path_dirs[0] != python_dir: + # Put python_dir first, removing it from elsewhere if present + filtered_dirs = [d for d in path_dirs if d != python_dir] + new_path = os.pathsep.join([python_dir] + filtered_dirs) + os.environ["PATH"] = new_path + yield + # Restore original PATH + os.environ["PATH"] = original_path + else: + # PATH is already correct + yield From 11bbf457c8ec85d6c12743ee49a21ee93dc6d78a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 20 Oct 2025 09:09:55 -0400 Subject: [PATCH 31/32] Add comprehensive tests for alternative OCI registries and fix provider handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add parametrized integration test covering docker.io, gcr.io, and quay.io - Test container addition, execution, and annexed blob verification - Add drop/get cycle testing to verify remote retrieval works - Fix link() to create datalad remote even without provider configuration - Issue warning instead of skipping when provider not found - Allows URLs to be registered and files to be retrieved from any registry - Use pytest tmp_path fixture instead of @with_tempfile decorator 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- datalad_container/adapters/oci.py | 6 +++--- .../adapters/tests/test_oci_more.py | 21 ++++++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/datalad_container/adapters/oci.py b/datalad_container/adapters/oci.py index 8bd9d508..6885c13a 100644 --- a/datalad_container/adapters/oci.py +++ b/datalad_container/adapters/oci.py @@ -229,9 +229,9 @@ def link(ds, path, reference): provider = Providers.from_config_files().get_provider( endpoint + name, only_nondefault=True) if not provider: - 
lgr.debug("Required Datalad provider configuration " - "for Docker registry links not detected. Skipping linking.") - return + lgr.warning("Required Datalad provider configuration " + "for Docker registry links not detected. We will enable 'datalad' " + "special remote anyways but datalad might issue warnings later on.") layers = {} # path => digest for layer in info["Layers"]: diff --git a/datalad_container/adapters/tests/test_oci_more.py b/datalad_container/adapters/tests/test_oci_more.py index a71cd481..1538820d 100644 --- a/datalad_container/adapters/tests/test_oci_more.py +++ b/datalad_container/adapters/tests/test_oci_more.py @@ -17,6 +17,7 @@ StdOutErrCapture, WitlessRunner, ) +from datalad.support.exceptions import CommandError from datalad.consts import ( DATALAD_SPECIAL_REMOTE, DATALAD_SPECIAL_REMOTES_UUIDS, @@ -87,13 +88,11 @@ def test_oci_add_and_run(path=None): @slow @pytest.mark.parametrize("registry,image_ref,container_name,test_cmd,expected_output", [ ("docker.io", "busybox:1.30", "busybox", "sh -c 'busybox | head -1'", "BusyBox v1.30"), - ("ghcr.io", "ghcr.io/astral-sh/uv:latest", "uv", "--version", "uv"), - ("quay.io", "quay.io/linuxserver.io/baseimage-alpine:3.18", "alpine", - "cat /etc/os-release", "Alpine"), + ("gcr.io", "gcr.io/google-containers/busybox:latest", "busybox-gcr", "sh -c 'busybox | head -1'", "BusyBox"), + ("quay.io", "quay.io/prometheus/busybox:latest", "busybox-quay", "sh -c 'busybox | head -1'", "BusyBox"), ]) -@with_tempfile -def test_oci_alternative_registries(registry, image_ref, container_name, - test_cmd, expected_output, path=None): +def test_oci_alternative_registries(tmp_path, registry, image_ref, container_name, + test_cmd, expected_output): """Test adding and running containers from alternative registries. 
Also verifies that: @@ -113,7 +112,7 @@ def test_oci_alternative_registries(registry, image_ref, container_name, expected_output : str String expected to be in the command output """ - ds = Dataset(path).create(cfg_proc="text2git") + ds = Dataset(str(tmp_path)).create(cfg_proc="text2git") ds.containers_add(url=f"oci:docker://{image_ref}", name=container_name) image_path = ds.repo.pathobj / ".datalad" / "environments" / container_name / "image" @@ -139,6 +138,7 @@ def test_oci_alternative_registries(registry, image_ref, container_name, ok_(annexed_blobs, f"Expected to find annexed blobs in {blobs_path}") # Verify all annexed files are available from datalad or web remote + # (only check if datalad remote exists, which requires provider configuration) # git annex find --not --in datalad --and --not --in web should be empty result = WitlessRunner(cwd=ds.path).run( ["git", "annex", "find", "--not", "--in", "datalad", @@ -147,13 +147,18 @@ def test_oci_alternative_registries(registry, image_ref, container_name, eq_(result["stdout"].strip(), "", "All annexed blobs should be available from datalad or web remote") - # Test drop and get cycle to verify files can be retrieved from remotes # Drop all annexed content in the dataset drop_results = ds.drop(".", result_renderer=None, on_failure='ignore') + print(f"Drop results for {registry}: {len(drop_results)} items") + for r in drop_results[:5]: # Print first 5 results + print(f" Dropped: {r.get('path', 'N/A')} - status: {r.get('status', 'N/A')}") # Verify that something was actually dropped ok_(drop_results, "Expected to drop some annexed files") # Get everything back to verify it can be retrieved from remotes get_results = ds.get(".", result_renderer=None) + print(f"Get results for {registry}: {len(get_results)} items") + for r in get_results[:5]: # Print first 5 results + print(f" Retrieved: {r.get('path', 'N/A')} - status: {r.get('status', 'N/A')}") # Verify that files were retrieved ok_(get_results, "Expected to retrieve 
files from remotes") From 657383a0cb94eab469377d79044598d7b5bd3394 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 28 Oct 2025 12:57:38 -0400 Subject: [PATCH 32/32] Install skopeo and boost minimal to 3.9 from 3.8 --- .appveyor.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 77886465..0f553b32 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -60,7 +60,7 @@ environment: # All of these are common to all matrix runs ATM, so pre-defined here and to be overloaded if needed DTS: datalad_container APPVEYOR_BUILD_WORKER_IMAGE: Ubuntu2204 - INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 + INSTALL_SYSPKGS: python3-venv xz-utils jq libffi7 skopeo # system git-annex is way too old, use better one INSTALL_GITANNEX: git-annex -m deb-url --url http://snapshot.debian.org/archive/debian/20210906T204127Z/pool/main/g/git-annex/git-annex_8.20210903-1_amd64.deb CODECOV_BINARY: https://uploader.codecov.io/latest/linux/codecov @@ -73,8 +73,8 @@ environment: - ID: Ubu # The same but with the oldest supported Python. - - ID: Ubu-3.8 - PY: '3.8' + - ID: Ubu-3.9 + PY: '3.9' # The same but removing busybox first - triggers different code paths in the tests - ID: Ubu-nobusybox