
Commit e9071f1

Migrate serialize_v0 to new API (as part of serialization layer) (#190)
* Migrate `serialize_v0` to the new API. This is the middle layer of the API design work (#172). We add a manifest abstract class to represent various manifests (#111, #112), as well as ways to serialize a model directory into manifests and ways to verify those manifests. For now, this only does what was formerly known as `serialize_v0`; the v1 and manifest-based versions will come soon. Note: this takes a lot of inspiration from #112, but makes the API work with all the use cases we need to consider right now.
* Clarify some comments.
* Encode name with base64.
* Add another test case.
* Empty commit to retrigger DCO check. See dcoapp/app#211 (comment).

Signed-off-by: Mihai Maruseac <[email protected]>
1 parent 8fc2f24 commit e9071f1
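
To make the intended flow concrete, the following is a minimal sketch of how a manifest produced by this serialization layer could be verified. The verify_digest_manifest helper is illustrative only and not part of this commit; it relies solely on the Serializer.serialize and DigestManifest.digest APIs added here.

# Illustrative sketch, not part of this commit: recompute a model's manifest
# and compare it against a previously produced (e.g. signed) one.
import pathlib

from model_signing.manifest import manifest
from model_signing.serializing import serializing


def verify_digest_manifest(
    model_path: pathlib.Path,
    serializer: serializing.Serializer,
    expected: manifest.DigestManifest,
) -> bool:
    """Returns True if the recomputed digest matches the expected one."""
    recomputed = serializer.serialize(model_path)
    if not isinstance(recomputed, manifest.DigestManifest):
        return False
    # hashing.Digest exposes the raw digest bytes as digest_value.
    return recomputed.digest.digest_value == expected.digest.digest_value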

7 files changed: +520 -3 lines

model_signing/hashing/file.py

+2 -3

@@ -28,7 +28,7 @@
     ```python
     >>> with open("/tmp/file", "w") as f:
     ...     f.write("0123abcd")
-    >>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8)
+    >>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
     >>> digest = hasher.compute()
     >>> digest.digest_hex
     '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
@@ -144,8 +144,7 @@ def __init__(
         Args:
             file: The file to hash. Use `set_file` to reset it.
             content_hasher: A `hashing.HashEngine` instance used to compute the
-                digest of the file. This instance must not be used outside of this
-                instance. However, it may be pre-initialized with a header.
+                digest of the file.
             start: The file offset to start reading from. Must be valid. Reset
                 with `set_shard`.
             end: The file offset to start reading from. Must be stricly greater

model_signing/manifest/__init__.py

+13 (new file)

# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_signing/manifest/manifest.py

+39 (new file)

# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Machinery for representing a serialized representation of an ML model.

Currently, we only support a manifest that wraps around a digest. But, to
support incremental updates and partial signature verification, we need a
manifest that lists files and their digests. That will come in a future
change, soon.
"""

from abc import ABCMeta
from dataclasses import dataclass

from model_signing.hashing import hashing


class Manifest(metaclass=ABCMeta):
    """Generic manifest file to represent a model."""

    pass


@dataclass
class DigestManifest(Manifest):
    """A manifest that is just a hash."""

    digest: hashing.Digest
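
The module docstring above anticipates a manifest that lists individual files and their digests. A possible shape for that future manifest is sketched below; the name FileLevelManifest and its field are purely illustrative and not part of this commit.

# Illustrative sketch only (not in this commit): a manifest mapping each file
# in the model to its own digest, as anticipated by the module docstring.
import pathlib
from dataclasses import dataclass, field

from model_signing.hashing import hashing
from model_signing.manifest import manifest


@dataclass
class FileLevelManifest(manifest.Manifest):
    """Maps every file in the model to its individual digest."""

    files: dict[pathlib.PurePosixPath, hashing.Digest] = field(
        default_factory=dict
    )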

model_signing/serializing/__init__.py

+13 (new file)

# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model_signing/serializing/dfs.py

+120 (new file)

# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model serializers that build a single hash out of a DFS traversal."""

import base64
import pathlib
from typing import Callable

from typing_extensions import override

from model_signing.hashing import file
from model_signing.hashing import hashing
from model_signing.manifest import manifest
from model_signing.serializing import serializing


def _check_file_or_directory(path: pathlib.Path) -> bool:
    """Checks that the given path is either a file or a directory.

    There is no support for sockets, pipes, or any other operating system
    concept abstracted as a file.

    Furthermore, this would return False if the path is a broken symlink, if
    it doesn't exist, or if there are permission errors.
    """
    return path.is_file() or path.is_dir()


def _build_header(*, entry_name: str, entry_type: str) -> bytes:
    """Builds a header to encode a path with the given name and type.

    Args:
        entry_name: The name of the entry to build the header for.
        entry_type: The type of the entry (file or directory).
    """
    encoded_type = entry_type.encode("utf-8")
    # Prevent confusion if the name has a "." inside by encoding it to base64.
    encoded_name = base64.b64encode(entry_name.encode("utf-8"))
    # Note: make sure to end with a ".".
    return b".".join([encoded_type, encoded_name, b""])


class DFSSerializer(serializing.Serializer):
    """Serializer for a model that performs a traversal of the model directory.

    This serializer produces a single hash for the entire model. If the model
    is a file, the hash is the digest of the file. If the model is a directory,
    we perform a depth-first traversal of the directory, hash each individual
    file, and aggregate the hashes together.
    """

    def __init__(
        self,
        file_hasher: file.FileHasher,
        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
    ):
        """Initializes an instance to serialize a model with these hashers.

        Args:
            file_hasher: The hash engine used to hash the individual files.
            merge_hasher_factory: A callable that returns a
                `hashing.StreamingHashEngine` instance used to merge individual
                file digests to compute an aggregate digest.
        """
        self._file_hasher = file_hasher
        self._merge_hasher_factory = merge_hasher_factory

    @override
    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
        # TODO(mihaimaruseac): Add checks to exclude symlinks if desired
        if not _check_file_or_directory(model_path):
            raise ValueError(
                f"Cannot use '{model_path}' as file or directory. It could be a"
                " special file, it could be missing, or there might be a"
                " permission issue."
            )

        if model_path.is_file():
            self._file_hasher.set_file(model_path)
            return manifest.DigestManifest(self._file_hasher.compute())

        return manifest.DigestManifest(self._dfs(model_path))

    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
        # TODO(mihaimaruseac): Add support for excluded files
        children = sorted([x for x in directory.iterdir()])

        hasher = self._merge_hasher_factory()
        for child in children:
            if not _check_file_or_directory(child):
                raise ValueError(
                    f"Cannot use '{child}' as file or directory. It could be a"
                    " special file, it could be missing, or there might be a"
                    " permission issue."
                )

            if child.is_file():
                header = _build_header(entry_name=child.name, entry_type="file")
                hasher.update(header)
                self._file_hasher.set_file(child)
                digest = self._file_hasher.compute()
                hasher.update(digest.digest_value)
            else:
                header = _build_header(entry_name=child.name, entry_type="dir")
                hasher.update(header)
                digest = self._dfs(child)
                hasher.update(digest.digest_value)

        return hasher.compute()
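
A possible way to wire up the serializer, assuming the hashing package provides a concrete file hasher and a SHA256 streaming engine (the names SimpleFileHasher and memory.SHA256 are assumptions, not shown in this diff):

# Usage sketch under assumed names; substitute whatever concrete hashers the
# hashing package actually provides.
import pathlib

from model_signing.hashing import file, memory
from model_signing.serializing import dfs

serializer = dfs.DFSSerializer(
    # The initial path is a placeholder; serialize() calls set_file per file.
    file_hasher=file.SimpleFileHasher(pathlib.Path("unused"), memory.SHA256()),
    merge_hasher_factory=memory.SHA256,
)
digest_manifest = serializer.serialize(pathlib.Path("path/to/model"))
print(digest_manifest.digest.digest_hex)

Because each entry is fed to the merge hasher as a header of the form "<type>.<base64(name)>." followed by its digest, names containing "." cannot be confused with the separators, and two different directory layouts cannot collapse to the same aggregate digest by accident.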
