Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 23 additions & 10 deletions datalad_container/adapters/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,24 +57,24 @@ def save(image, path):
elif os.listdir(path):
raise OSError("Directory {} is not empty".format(path))
def is_within_directory(directory, target):
    """Report whether `target` is `directory` itself or located beneath it.

    Both paths are made absolute (which also collapses ``..`` components)
    and are then compared component-wise via ``os.path.commonpath``.  A
    character-wise prefix test (``os.path.commonprefix``) is not sufficient
    for this purpose: it would accept a sibling such as ``/srv/image-evil``
    as being inside ``/srv/image``.

    Parameters
    ----------
    directory : str
      Directory that is expected to contain `target`.
    target : str
      Path to test.

    Returns
    -------
    bool
      True if `target` resolves to `directory` or to a path below it.
    """
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)

    try:
        common = os.path.commonpath([abs_directory, abs_target])
    except ValueError:
        # commonpath() refuses paths that cannot share a root (e.g. they
        # live on different drives); such a target is never inside
        return False

    return common == abs_directory

def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
    """Extract `tar` into `path`, refusing archives that escape `path`.

    Every member of the archive is checked with ``is_within_directory``
    before anything is written.  The archive is unpacked exactly once, and
    only after all members have passed that check.

    Parameters
    ----------
    tar : tarfile.TarFile
      Opened archive to extract.
    path : str, optional
      Destination directory.
    members : iterable or None, optional
      Passed through to ``tar.extractall``.
    numeric_owner : bool, optional
      Passed through to ``tar.extractall``.

    Raises
    ------
    Exception
      If the destination of any member lies outside of `path`.
    """
    for member in tar.getmembers():
        member_path = os.path.join(path, member.name)
        if not is_within_directory(path, member_path):
            raise Exception("Attempted Path Traversal in Tar File")

    # a single extraction is enough; repeating it only rewrites the same files
    tar.extractall(path, members, numeric_owner=numeric_owner)


safe_extract(tar, path=path)
lgr.info("Saved %s to %s", image, path)

Expand All @@ -85,6 +85,12 @@ def _list_images():
return out.decode().splitlines()


def _get_docker_version():
    """Return the Docker client version as printed by the ``docker`` CLI.

    Runs ``docker version`` with a format template that selects the client
    version only, and returns the captured standard output with trailing
    whitespace removed.  The exit status of the command is not inspected.
    """
    completed = sp.run(
        ["docker", "version", "--format", "{{.Client.Version}}"],
        capture_output=True,
        text=True,
    )
    reported = completed.stdout
    return reported.rstrip()


def get_image(path, repo_tag=None, config=None):
"""Return the image ID of the image extracted at `path`.
"""
Expand Down Expand Up @@ -129,7 +135,14 @@ def load(path, repo_tag, config):
# deleted (e.g., with 'docker image prune --all'). Given all three of these
# things, loading the image from the dataset will tag the old neurodebian
# image as the latest.
image_id = "sha256:" + get_image(path, repo_tag, config)
major_docker_version = int(_get_docker_version().split(".")[0])
if major_docker_version >= 27:
# delayed import for now because of extra dependency on -next
from .manifestutils import get_image_id
image_id = get_image_id(path, repo_tag, config)
else:
image_id = "sha256:" + get_image(path, repo_tag, config)

if image_id not in _list_images():
lgr.debug("Loading %s", image_id)
cmd = ["docker", "load"]
Expand Down
110 changes: 110 additions & 0 deletions datalad_container/adapters/manifestutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import hashlib
import json
from pathlib import Path

from datalad.api import ls_file_collection


def descriptor(record):
    """Create an OCI-compliant descriptor from a file collection record

    This translates a DataLad ls_file_collection record into a minimal OCI
    content descriptor. The media types are based on an example image
    saved with Docker v27 (n=1 sample size), and they are assigned based on
    the file extensions alone. The gzipped variant appears in the OCI spec
    but the file extensions are a complete guess here.

    Parameters
    ----------
    record : dict
      Must provide ``"item"`` (a path object with ``suffix`` and ``name``
      attributes), ``"hash-sha256"`` and ``"size"``.

    Returns
    -------
    dict
      With keys ``"mediaType"`` (None when the extension is not
      recognized), ``"digest"`` and ``"size"``.
    """
    p = record["item"]
    if p.suffix == ".json":
        media_type = "application/vnd.docker.container.image.v1+json"
    elif p.suffix == ".tar":
        media_type = "application/vnd.docker.image.rootfs.diff.tar"
    elif p.name.endswith((".tgz", ".tar.gz", ".tar.gzip")):
        # match on the file name: `suffix` holds the last extension only
        # (".gz" for "layer.tar.gz"), so compound extensions never equal it
        # NOTE(review): Docker's registered gzip layer type appears to end
        # in ".tar.gzip" rather than ".tar+gzip" -- confirm before relying
        # on this value
        media_type = "application/vnd.docker.image.rootfs.diff.tar+gzip"
    else:
        media_type = None

    return {
        "mediaType": media_type,
        "digest": f"sha256:{record['hash-sha256']}",
        "size": record["size"],
    }


def new_manifest(path):
    """Build a v2 Docker image manifest for an image saved in the old layout

    Best-effort construction of a "new style" OCI-compliant image manifest
    from an image that was saved with an older (<25) Docker version.  Such
    a manifest may be needed to compute the image ID for Docker >=27.

    Parameters
    ----------
    path : pathlib.Path
      Directory holding the extracted image, including ``manifest.json``.

    Returns
    -------
    str
      The manifest serialized as compact JSON (no whitespace around
      separators).
    """
    # ls_file_collection reports size and sha256 for every file of the
    # image; more than strictly needed, but hashing the small text files
    # costs little
    listing = ls_file_collection(
        type="annexworktree",
        collection=path.absolute(),
        hash="sha256",
        result_renderer="disabled"
    )

    # index the records by their path relative to the image directory,
    # which is how the old manifest refers to files
    by_relpath = {}
    for rec in listing:
        by_relpath[rec["item"].relative_to(rec["collection"])] = rec

    # the old manifest names the config file and the layers (in order)
    with path.joinpath("manifest.json").open("rb") as fobj:
        old_manifest = json.load(fobj)[0]
    config_rel = Path(old_manifest["Config"])
    layer_rels = [Path(name) for name in old_manifest["Layers"]]

    # assemble the new-style manifest; key order matters for the output
    manifest = {}
    manifest["schemaVersion"] = 2
    manifest["mediaType"] = "application/vnd.docker.distribution.manifest.v2+json"
    manifest["config"] = descriptor(by_relpath[config_rel])
    manifest["layers"] = [descriptor(by_relpath[rel]) for rel in layer_rels]

    return json.dumps(manifest, separators=(",", ":"))


def get_image_id(path, repo_tag=None, config=None):
    """Return the ID of an image extracted at path.

    This is a drop-in replacement for get_image which tries to emulate
    Docker 27 behavior when creating image IDs seemingly based on the
    hash of the v2 image manifest (even if the image is stored in an
    older format, in which case we try to create a manifest ourselves).
    It does not take all the combinatorics into account but can serve as
    a workaround in at least some cases.

    Parameters
    ----------
    path : str or os.PathLike
      Directory holding the extracted image, including ``manifest.json``.
    repo_tag, config : None
      Accepted for signature compatibility with get_image only.

    Returns
    -------
    str
      ``"sha256:"`` followed by the hex digest of the manifest.

    Raises
    ------
    NotImplementedError
      If `repo_tag` or `config` is not None.
    """
    if (repo_tag is not None) or (config is not None):
        msg = (
            "Dealing with repo tags or config is not implemented "
            "for the new style of docker manifests"
        )
        raise NotImplementedError(msg)

    path = Path(path)

    # keep the raw bytes around: for a new-style manifest the ID is the
    # hash of the file content exactly as stored
    manifest_bytes = path.joinpath("manifest.json").read_bytes()
    manifest = json.loads(manifest_bytes)

    # determine "new" vs "old" schema; an old manifest is a JSON list and
    # carries no schemaVersion at all
    is_new_schema = (
        isinstance(manifest, dict)
        and manifest.get("schemaVersion", 1) >= 2
    )

    # get a hash of a new-style manifest, generating one if needed
    if is_new_schema:
        shasum = hashlib.sha256(manifest_bytes).hexdigest()
    else:
        nm = new_manifest(path)
        shasum = hashlib.sha256(nm.encode("utf-8")).hexdigest()

    return f"sha256:{shasum}"