From b45081f75d2e1509c903882525b87cb0c443dcf8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?= <m.szczepanik@fz-juelich.de>
Date: Wed, 5 Feb 2025 17:48:23 +0100
Subject: [PATCH 1/2] style: clean up extra whitespace

---
 datalad_container/adapters/docker.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py
index 9e02bec..774f53b 100644
--- a/datalad_container/adapters/docker.py
+++ b/datalad_container/adapters/docker.py
@@ -57,24 +57,24 @@ def save(image, path):
             elif os.listdir(path):
                 raise OSError("Directory {} is not empty".format(path))
             def is_within_directory(directory, target):
-                
+
                 abs_directory = os.path.abspath(directory)
                 abs_target = os.path.abspath(target)
-            
+
                 prefix = os.path.commonprefix([abs_directory, abs_target])
-                
+
                 return prefix == abs_directory
-            
+
             def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
-            
+
                 for member in tar.getmembers():
                     member_path = os.path.join(path, member.name)
                     if not is_within_directory(path, member_path):
                         raise Exception("Attempted Path Traversal in Tar File")
-            
-                tar.extractall(path, members, numeric_owner=numeric_owner) 
-                
-            
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+
             safe_extract(tar, path=path)
             lgr.info("Saved %s to %s", image, path)
 

From 3534fbfc4cfe4efdb438032cde934158b8973c9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Szczepanik?= <m.szczepanik@fz-juelich.de>
Date: Wed, 5 Feb 2025 17:48:50 +0100
Subject: [PATCH 2/2] Simulate new docker manifest to ID old images

It appears that while Docker 27 has no problem loading images saved with
older versions, it generates the ID based on the "new style"
(OCI-compliant) manifest that it would save starting with v25, and not
the config file stored in the dataset. This causes DataLad to error out
due to ID mismatch, although the ID is most likely equivalent; see #269

This commit is the first attempt to solve this issue. Since the manifest
is a structured file, an attempt is made to generate a "new" style
manifest based on the contents of the saved image, and derive the ID
from that.

The manifest needs file types, sizes, and checksums. While we could copy
checksums from the previous manifest / config, we do not seem to have
the sizes. To solve that problem, we get both through ls_file_collection
from datalad-next. This is convenient and quick, but introduces a new
dependency.

The generated structure and content are guesswork based on reading the
OCI spec and seeing docker save output from a single container - it sure
works from that container and tries to be applicable more broadly, but
most likely won't cover more complicated cases, or those where I'm not
even sure what behavior to expect (e.g. multi-arch manifest?). Layers
are assumed to always be rootfs_diff (I currently don't know if there
are other types possible).

This commit focuses on reading older images with new Docker, and does
not address reading new images (reading images saved with Docker 26
would still fail, because it already uses the new save format which our
adapter does not expect). So the combinatorics around that will need to
be addressed later.

The new code would only trigger for Docker 27. It introduces one small
regression, where get_image_id raises a NotImplementedError for two
arguments which can be given to the old get_image.
---
 datalad_container/adapters/docker.py        |  15 ++-
 datalad_container/adapters/manifestutils.py | 110 ++++++++++++++++++++
 2 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 datalad_container/adapters/manifestutils.py

diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py
index 774f53b..572e6af 100644
--- a/datalad_container/adapters/docker.py
+++ b/datalad_container/adapters/docker.py
@@ -85,6 +85,12 @@ def _list_images():
     return out.decode().splitlines()
 
 
+def _get_docker_version():
+    """Return the Docker client version string printed by the docker CLI."""
+    cmd = ["docker", "version", "--format", "{{.Client.Version}}"]
+    return sp.run(cmd, capture_output=True, text=True).stdout.rstrip()
+
+
 def get_image(path, repo_tag=None, config=None):
     """Return the image ID of the image extracted at `path`.
     """
@@ -129,7 +135,14 @@ def load(path, repo_tag, config):
     # deleted (e.g., with 'docker image prune --all'). Given all three of these
     # things, loading the image from the dataset will tag the old neurodebian
     # image as the latest.
-    image_id = "sha256:" + get_image(path, repo_tag, config)
+    major_docker_version = int(_get_docker_version().split(".")[0])
+    if major_docker_version >= 27:
+        # delayed import for now because of extra dependency on -next
+        from .manifestutils import get_image_id
+        image_id = get_image_id(path, repo_tag, config)
+    else:
+        image_id = "sha256:" + get_image(path, repo_tag, config)
+
     if image_id not in _list_images():
         lgr.debug("Loading %s", image_id)
         cmd = ["docker", "load"]
diff --git a/datalad_container/adapters/manifestutils.py b/datalad_container/adapters/manifestutils.py
new file mode 100644
index 0000000..861ed48
--- /dev/null
+++ b/datalad_container/adapters/manifestutils.py
@@ -0,0 +1,110 @@
+import hashlib
+import json
+from pathlib import Path
+
+from datalad.api import ls_file_collection
+
+
+def descriptor(record):
+    """Create an OCI-compliant descriptor from a file collection record
+
+    Translates a DataLad ls_file_collection record (keys "item", "size",
+    "hash-sha256") into a minimal OCI content descriptor dict. Media types
+    follow one image saved with Docker v27 and are picked from the file
+    name alone; the gzip extensions are a guess. Compound extensions are
+    matched on the whole name, as Path.suffix only yields the last part.
+    """
+    p = record["item"]
+    if p.suffix == ".json":
+        media_type = "application/vnd.docker.container.image.v1+json"
+    elif p.suffix == ".tar":
+        media_type = "application/vnd.docker.image.rootfs.diff.tar"
+    elif p.name.endswith((".tgz", ".tar.gz", ".tar.gzip")):
+        media_type = "application/vnd.docker.image.rootfs.diff.tar+gzip"
+    else:
+        # unrecognized file type: keep the descriptor but leave type unset
+        media_type = None
+    return {
+        "mediaType": media_type,
+        "digest": f"sha256:{record['hash-sha256']}",
+        "size": record["size"],
+    }
+
+
+def new_manifest(path):
+    """Generate a v2 docker image manifest from an old saved image
+
+    Best-effort construction of a "new style" OCI-compliant image manifest
+    for an image saved with an older (<25) Docker version; such a manifest
+    may be needed to compute the image ID for Docker >=27. The manifest is
+    returned as a compact JSON string (no spaces after separators).
+    """
+    # ls_file_collection provides sizes and hashes of the container files;
+    # not all of them are needed, but hashing the text files too adds
+    # little overhead and keeps this simple
+    listing = ls_file_collection(
+        type="annexworktree",
+        collection=path.absolute(),
+        hash="sha256",
+        result_renderer="disabled"
+    )
+
+    # index records by path relative to the collection for manifest lookups
+    by_relpath = {
+        rec["item"].relative_to(rec["collection"]): rec for rec in listing
+    }
+
+    # the old manifest's first entry names the config and layer files
+    with (path / "manifest.json").open("rb") as fobj:
+        old = json.load(fobj)[0]
+    config_path = Path(old["Config"])
+    layer_paths = [Path(layer) for layer in old["Layers"]]
+
+    # assemble the new-style manifest, layers kept in their original order
+    manifest = {
+        "schemaVersion": 2,
+        "mediaType": "application/vnd.docker.distribution.manifest.v2+json",
+        "config": descriptor(by_relpath[config_path]),
+        "layers": [descriptor(by_relpath[p]) for p in layer_paths],
+    }
+    return json.dumps(manifest, separators=(",", ":"))
+
+
+def get_image_id(path, repo_tag=None, config=None):
+    """Return the ID of an image extracted at path.
+
+    This is a drop-in replacement for get_image which tries to emulate
+    Docker 27 behavior when creating image IDs seemingly based on the
+    hash of the v2 image manifest (even if the image is stored in an
+    older format, in which case we try to create a manifest ourselves).
+    It does not take all the combinatorics into account but can serve as
+    a workaround in at least some cases.
+
+    Returns the ID as "sha256:<hex digest>"; raises NotImplementedError
+    if repo_tag or config is given.
+    """
+    if (repo_tag is not None) or (config is not None):
+        msg = (
+            "Dealing with repo tags or config is not implemented "
+            "for the new style of docker manifests"
+        )
+        raise NotImplementedError(msg)
+
+    path = Path(path)
+    manifest_bytes = path.joinpath("manifest.json").read_bytes()
+    manifest = json.loads(manifest_bytes)
+
+    # an old-style manifest is a JSON list (no .get), a new-style one is
+    # an object carrying schemaVersion >= 2
+    try:
+        is_new_schema = manifest.get("schemaVersion", 1) >= 2
+    except AttributeError:
+        is_new_schema = False
+
+    # hash the new-style manifest itself, or one generated from old data
+    if is_new_schema:
+        digest = hashlib.sha256(manifest_bytes).hexdigest()
+    else:
+        nm = new_manifest(path)
+        digest = hashlib.sha256(nm.encode("utf-8")).hexdigest()
+    return f"sha256:{digest}"