datalad · mslw · Feb 5, 2025 · Feb 5, 2025
diff --git a/datalad_container/adapters/docker.py b/datalad_container/adapters/docker.py
@@ -57,24 +57,24 @@ def save(image, path):
             elif os.listdir(path):
                 raise OSError("Directory {} is not empty".format(path))
             def is_within_directory(directory, target):
-                
+
                 abs_directory = os.path.abspath(directory)
                 abs_target = os.path.abspath(target)
-            
+
                 prefix = os.path.commonprefix([abs_directory, abs_target])
-                
+
                 return prefix == abs_directory
-            
+
             def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
-            
+
                 for member in tar.getmembers():
                     member_path = os.path.join(path, member.name)
                     if not is_within_directory(path, member_path):
                         raise Exception("Attempted Path Traversal in Tar File")
-            
-                tar.extractall(path, members, numeric_owner=numeric_owner) 
-                
-            
+
+                tar.extractall(path, members, numeric_owner=numeric_owner)
+
+
             safe_extract(tar, path=path)
             lgr.info("Saved %s to %s", image, path)
 
@@ -85,6 +85,12 @@ def _list_images():
     return out.decode().splitlines()
 
 
+def _get_docker_version():
+    cmd = ["docker", "version", "--format", "{{.Client.Version}}"]
+    res = sp.run(cmd, capture_output=True, text=True)
+    return res.stdout.rstrip()
+
+
 def get_image(path, repo_tag=None, config=None):
     """Return the image ID of the image extracted at `path`.
     """
@@ -129,7 +135,14 @@ def load(path, repo_tag, config):
     # deleted (e.g., with 'docker image prune --all'). Given all three of these
     # things, loading the image from the dataset will tag the old neurodebian
     # image as the latest.
-    image_id = "sha256:" + get_image(path, repo_tag, config)
+    major_docker_version = int(_get_docker_version().split(".")[0])
+    if major_docker_version >= 27:
+        # delayed import for now because of extra dependency on -next
+        from .manifestutils import get_image_id
+        image_id = get_image_id(path, repo_tag, config)
+    else:
+        image_id = "sha256:" + get_image(path, repo_tag, config)
+
     if image_id not in _list_images():
         lgr.debug("Loading %s", image_id)
         cmd = ["docker", "load"]

diff --git a/datalad_container/adapters/manifestutils.py b/datalad_container/adapters/manifestutils.py
@@ -0,0 +1,110 @@
+import hashlib
+import json
+from pathlib import Path
+
+from datalad.api import ls_file_collection
+
+
+def descriptor(record):
+    """Create an OSI-compliant descriptor from a file collection record
+
+    This translates a DataLad ls_file_collection record into a minimal OCI
+    content descriptor. The media types are based on an example image
+    saved with Docker v27 (n=1 sample size), and they are assigned based on
+    the file extensions alone. The gzipped variant appears in the OCI spec
+    but the file extensions are a complete guess here.
+    """
+    media_type = None
+    p = record["item"]
+    if p.suffix == ".json":
+        media_type = "application/vnd.docker.container.image.v1+json"
+    elif p.suffix == ".tar":
+        media_type = "application/vnd.docker.image.rootfs.diff.tar"
+    elif p.suffix in {".tgz", ".tar.gz", ".tar.gzip"}:
+        media_type = "application/vnd.docker.image.rootfs.diff.tar+gzip"
+
+    d = {
+        "mediaType": media_type,
+        "digest": f"sha256:{record['hash-sha256']}",
+        "size": record["size"],
+    }
+    return d
+
+
+def new_manifest(path):
+    """Create a v2 docker image manifest from an old saved image
+
+    This is a best effort of creating a "new style" OSI-compliant image
+    manifest from an image saved with an older (<25) Docker version.
+    Such manifest may be needed to compute the image ID for Docker >=27.
+
+    """
+    # use ls_file_collection to get sizes and hashes of container files
+    # we do not need all, but hashing the text files adds little overhead
+    # and the convenience probably wins
+    records = ls_file_collection(
+        type="annexworktree",
+        collection=path.absolute(),
+        hash="sha256",
+        result_renderer="disabled"
+    )
+
+    # we only need certain files, in the order they appear in old manifest
+    # convert the above to a path-indexed dict for easier lookups
+    contents = {r["item"].relative_to(r["collection"]): r for r in records}
+
+    # read the old manifest and find out the config and layer paths
+    with path.joinpath("manifest.json").open("rb") as jpath:
+        manifest = json.load(jpath)[0]
+    config_path = Path(manifest["Config"])
+    layer_paths = [Path(layer) for layer in manifest["Layers"]]
+
+    # create the new-style manifest
+    d = {
+        "schemaVersion": 2,
+        "mediaType": "application/vnd.docker.distribution.manifest.v2+json",
+        "config": descriptor(contents[config_path]),
+        "layers": [descriptor(contents[p]) for p in layer_paths],
+    }
+
+    return json.dumps(d, separators=(",", ":"))
+
+
+def get_image_id(path, repo_tag=None, config=None):
+    """Return the ID of an image extracted at path.
+
+    This is a drop-in replacement for get_image which tries to emulate
+    Docker 27 behavior when creating image IDs seemingly based on the
+    hash of the v2 image manifest (even if the image is stored in an
+    older format, in which case we try to create a manifest ourselves).
+    It does not take all the combinatorics ino account but can serve as
+    a workaround in at least some cases.
+
+    """
+    if (repo_tag is not None) or (config is not None):
+        msg = (
+            "Dealing with repo tags or config is not implemented"
+            "for the new style of docker manifests"
+        )
+        raise NotImplementedError(msg)
+
+    if isinstance(path, str):
+        path = Path(path)
+
+    # determine "new" vs "old" schema
+    with path.joinpath("manifest.json").open() as jpath:
+        manifest = json.load(jpath)
+
+    try:
+        isNewSchema = manifest.get("schemaVersion", 1) >= 2
+    except AttributeError:
+        isNewSchema = False
+
+    # get a hash of a new-style manifest, generating one if needed
+    if isNewSchema:
+        shasum = hashlib.sha256(path.read_bytes())
+    else:
+        nm = new_manifest(path)
+        shasum = hashlib.sha256(nm.encode("utf-8")).hexdigest()
+
+    return f"sha256:{shasum}"