Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 4cae6b3

Browse files
committedJun 4, 2024··
Migrate serialize_v0 to new API.
This is the middle layer of the API design work (#172). We add a manifest abstract class to represent various manifests (#111 #112) and also ways to serialize a model directory into manifests and ways to verify the manifests. For now, this only does what was formerly known as `serialize_v0`. The v1 and the manifest versions will come soon. Note: This has a lot of inspiration from #112, but makes the API work with all the use cases we need to consider right now. Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>
1 parent 220d5c7 commit 4cae6b3

File tree

6 files changed

+484
-0
lines changed

6 files changed

+484
-0
lines changed
 

‎model_signing/manifest/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

‎model_signing/manifest/manifest.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Machinery for representing a serialized representation of an ML model.
16+
17+
Currently, we only support a manifest that wraps around a digest. But, to
18+
support incremental updates and partial signature verification, we need a
19+
manifest that lists files and their digests. That will come in a future change,
20+
soon.
21+
"""
22+
23+
from abc import ABCMeta
24+
from dataclasses import dataclass
25+
26+
from model_signing.hashing import hashing
27+
28+
29+
class Manifest(metaclass=ABCMeta):
    """Common base type for every manifest that represents a model.

    The class declares no members of its own; concrete manifests derive from
    it so that they can all be handled through a single type.
    """
33+
34+
35+
@dataclass
class DigestManifest(Manifest):
    """Manifest made of a single digest that stands for the whole model."""

    # The digest representing the model.
    digest: hashing.Digest

‎model_signing/serializing/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

‎model_signing/serializing/dfs.py

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Model serializers that build a single hash out of a DFS traversal."""
16+
17+
import pathlib
18+
from typing import Callable
19+
from model_signing.hashing import file
20+
from model_signing.hashing import hashing
21+
from model_signing.manifest import manifest
22+
from model_signing.serializing import serializing
23+
from typing_extensions import override
24+
25+
26+
def _check_file_or_directory(path: pathlib.Path) -> bool:
27+
"""Checks that the given path is either a file or a directory."""
28+
return path.is_file() or path.is_dir()
29+
30+
31+
def _build_header(*, entry_name: str, entry_type: str) -> bytes:
32+
"""Builds a header to encode a path with given name and type.
33+
34+
Args:
35+
entry_name: The name of the entry to build the header for.
36+
entry_type: The type of the entry (file or directory).
37+
"""
38+
encoded_type = entry_type.encode("utf-8")
39+
encoded_name = entry_name.encode("utf-8")
40+
return b".".join([encoded_type, encoded_name, b""])
41+
42+
43+
class DFSSerializer(serializing.Serializer):
    """Serializer for a model that performs a traversal of the model directory.

    This serializer produces a single hash for the entire model. If the model is
    a file, the hash is the digest of the file. If the model is a directory, we
    perform a depth-first traversal of the directory, hash each individual file
    and aggregate the hashes together.
    """

    def __init__(
        self,
        file_hasher: file.FileHasher,
        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
    ):
        """Initializes an instance that serializes a model to a single digest.

        Args:
            file_hasher: The hasher used to hash the individual files. It is
              pointed at each file in turn (via `set_file`) before its digest
              is computed.
            merge_hasher_factory: A callable that returns a
              `hashing.StreamingHashEngine` instance used to merge individual
              file digests to compute an aggregate digest.
        """
        self._file_hasher = file_hasher
        self._merge_hasher_factory = merge_hasher_factory

    @override
    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
        """Serializes the model at `model_path` into a digest manifest.

        Args:
            model_path: Path to the model, either a file or a directory.

        Returns:
            A `manifest.DigestManifest` wrapping the digest of the file, or the
            aggregate digest of the directory traversal.

        Raises:
            ValueError: If `model_path`, or any entry found during the
              traversal, is neither a file nor a directory.
        """
        # TODO(mihaimaruseac): Add checks for symlinks
        if not _check_file_or_directory(model_path):
            raise ValueError(
                f"Must have a file or directory, but '{model_path}' is neither."
            )

        if model_path.is_file():
            self._file_hasher.set_file(model_path)
            return manifest.DigestManifest(self._file_hasher.compute())

        return manifest.DigestManifest(self._dfs(model_path))

    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
        """Computes the aggregate digest of `directory`, recursively.

        Args:
            directory: The directory to traverse.

        Returns:
            The digest computed by a fresh merge hasher that was fed, for each
            child in sorted order, a header (entry type and name) followed by
            the digest of that child.

        Raises:
            ValueError: If an entry is neither a file nor a directory.
        """
        # TODO(mihaimaruseac): Add support for excluded files
        # Sort the entries so that the digest does not depend on the order in
        # which the file system lists them.
        children = sorted(directory.iterdir())

        hasher = self._merge_hasher_factory()
        for child in children:
            if not _check_file_or_directory(child):
                raise ValueError(
                    f"Must have a file or directory, but '{child}' is neither."
                )

            is_file = child.is_file()
            entry_type = "file" if is_file else "dir"
            # The header goes in before the child's digest is computed, to
            # keep the same sequence of hasher calls as before.
            header = _build_header(entry_name=child.name, entry_type=entry_type)
            hasher.update(header)

            if is_file:
                self._file_hasher.set_file(child)
                digest = self._file_hasher.compute()
            else:
                digest = self._dfs(child)
            hasher.update(digest.digest_value)

        return hasher.compute()

‎model_signing/serializing/dfs_test.py

+281
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from model_signing.hashing import file
16+
from model_signing.hashing import memory
17+
from model_signing.serializing import dfs
18+
import pytest
19+
20+
21+
# some constants used throughout testing
22+
_KNOWN_MODEL_TEXT: bytes = b"This is a simple model"
23+
_ANOTHER_MODEL_TEXT: bytes = b"This is another simple model"
24+
25+
26+
# Note: Don't make fixtures with global scope as we are altering the models!
27+
@pytest.fixture
def sample_model_file(tmp_path_factory):
    """Creates a model made of a single file holding `_KNOWN_MODEL_TEXT`."""
    model_path = tmp_path_factory.mktemp("model") / "file"
    model_path.write_bytes(_KNOWN_MODEL_TEXT)
    return model_path
32+
33+
34+
@pytest.fixture
def empty_model_file(tmp_path_factory):
    """Creates a model made of a single file with no contents."""
    model_path = tmp_path_factory.mktemp("model") / "file"
    model_path.write_bytes(b"")
    return model_path
39+
40+
41+
@pytest.fixture
def sample_model_folder(tmp_path_factory):
    """Creates a model directory with 2 subdirectories and 4 root files.

    Each subdirectory `d{i}` holds 3 files named `f{i}{j}`; the root also
    holds files `f0` to `f3`. Returns the root of the model.
    """
    model_root = tmp_path_factory.mktemp("model") / "root"
    model_root.mkdir()

    for dir_index in range(2):
        subdir = model_root / f"d{dir_index}"
        subdir.mkdir()
        for file_index in range(3):
            name = f"f{dir_index}{file_index}"
            (subdir / name).write_text(f"This is file {name} in d{dir_index}.")

    for file_index in range(4):
        name = f"f{file_index}"
        (model_root / name).write_text(f"This is file {name} in root.")

    return model_root
58+
59+
60+
@pytest.fixture
def empty_model_folder(tmp_path_factory):
    """Creates a model that is a directory with no entries."""
    root = tmp_path_factory.mktemp("model") / "root"
    root.mkdir()
    return root
65+
66+
67+
@pytest.fixture
def deep_model_folder(tmp_path_factory):
    """Creates a model with a chain of nested directories `d0/d1/.../d4`.

    The innermost directory holds files `f0` to `f3`. Returns the root of the
    model.
    """
    model_root = tmp_path_factory.mktemp("model") / "root"
    model_root.mkdir()

    deepest = model_root
    for depth in range(5):
        deepest = deepest / f"d{depth}"
        deepest.mkdir()

    for index in range(4):
        (deepest / f"f{index}").write_text(f"This is file f{index}.")

    return model_root
82+
83+
84+
class TestDFSSerializer:
    """Tests for `dfs.DFSSerializer` on file models and directory models.

    Every `FileHasher` is built with an "unused" path: the serializer points
    the hasher at the actual file (via `set_file`) before computing a digest.
    """

    def test_known_file(self, sample_model_file):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(sample_model_file)
        expected = (
            "3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b"
        )
        assert manifest.digest.digest_hex == expected

    def test_file_hash_is_same_as_hash_of_content(self, sample_model_file):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(sample_model_file)
        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
        assert manifest.digest.digest_hex == digest.digest_hex

    def test_file_model_hash_is_same_if_model_is_moved(self, sample_model_file):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(sample_model_file)

        new_name = sample_model_file.with_name("new-file")
        new_file = sample_model_file.rename(new_name)
        new_manifest = serializer.serialize(new_file)

        assert manifest == new_manifest

    def test_file_model_hash_changes_if_content_changes(
        self, sample_model_file
    ):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(sample_model_file)

        sample_model_file.write_bytes(_ANOTHER_MODEL_TEXT)
        new_manifest = serializer.serialize(sample_model_file)

        assert manifest.digest.algorithm == new_manifest.digest.algorithm
        assert manifest.digest.digest_value != new_manifest.digest.digest_value

    def test_directory_model_with_only_known_file(self, sample_model_file):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)

        model = sample_model_file.parent
        manifest = serializer.serialize(model)

        expected = (
            "aa856f565699473579c8d7009bfad8c421e1643b810f0a28d47b9ce1f0b98ccc"
        )
        assert manifest.digest.digest_hex == expected

        # A directory holding one file must not hash like the file itself.
        digest = memory.SHA256(_KNOWN_MODEL_TEXT).compute()
        assert manifest.digest.digest_hex != digest.digest_hex

    def test_known_folder(self, sample_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(sample_model_folder)
        expected = (
            "516de24dd65c9749bbde333545cb997c645e21c510107fa5c06428e0df84099b"
        )
        assert manifest.digest.digest_hex == expected

    def test_folder_model_hash_is_same_if_model_is_moved(
        self, sample_model_folder
    ):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(sample_model_folder)

        new_name = sample_model_folder.with_name("new-root")
        new_model = sample_model_folder.rename(new_name)
        new_manifest = serializer.serialize(new_model)

        assert manifest == new_manifest

    def test_empty_file(self, empty_model_file):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(empty_model_file)
        expected = (
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        )
        assert manifest.digest.digest_hex == expected

    def test_directory_model_with_only_empty_file(self, empty_model_file):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        model = empty_model_file.parent
        manifest = serializer.serialize(model)
        expected = (
            "ca671f6b24ce1b08677759ed050a30eb86a28c18abfa2308c7da9e581a8f7917"
        )
        assert manifest.digest.digest_hex == expected

    def test_empty_folder(self, empty_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(empty_model_folder)
        expected = (
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        )
        assert manifest.digest.digest_hex == expected

    def test_empty_folder_hashes_the_same_as_empty_file(
        self, empty_model_file, empty_model_folder
    ):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        folder_manifest = serializer.serialize(empty_model_folder)
        file_manifest = serializer.serialize(empty_model_file)
        assert (
            folder_manifest.digest.digest_hex == file_manifest.digest.digest_hex
        )

    def test_folder_model_empty_entry(self, sample_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)

        # Alter first directory within the model. Sort the entries so that the
        # pick does not depend on the order in which `iterdir` lists them.
        dirs = sorted(d for d in sample_model_folder.iterdir() if d.is_dir())
        altered_dir = dirs[0]

        new_empty_dir = altered_dir / "empty"
        new_empty_dir.mkdir()
        manifest1 = serializer.serialize(sample_model_folder)

        new_empty_dir.rmdir()

        # Same entry name, but now an empty file instead of an empty directory.
        new_empty_file = altered_dir / "empty"
        new_empty_file.write_text("")
        manifest2 = serializer.serialize(sample_model_folder)

        assert manifest1.digest != manifest2.digest

    def test_folder_model_rename_file(self, sample_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest1 = serializer.serialize(sample_model_folder)

        # Alter first directory within the model
        dirs = sorted(d for d in sample_model_folder.iterdir() if d.is_dir())
        altered_dir = dirs[0]

        # Alter first file in the altered_dir
        files = sorted(f for f in altered_dir.iterdir() if f.is_file())
        file_to_rename = files[0]

        new_name = file_to_rename.with_name("new-file")
        file_to_rename.rename(new_name)

        manifest2 = serializer.serialize(sample_model_folder)
        assert manifest1.digest != manifest2.digest

    def test_folder_model_rename_dir(self, sample_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest1 = serializer.serialize(sample_model_folder)

        # Alter first directory within the model
        dirs = sorted(d for d in sample_model_folder.iterdir() if d.is_dir())
        dir_to_rename = dirs[0]

        new_name = dir_to_rename.with_name("new-dir")
        dir_to_rename.rename(new_name)

        manifest2 = serializer.serialize(sample_model_folder)
        assert manifest1.digest != manifest2.digest

    def test_folder_model_change_file(self, sample_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest1 = serializer.serialize(sample_model_folder)

        # Alter first directory within the model
        dirs = sorted(d for d in sample_model_folder.iterdir() if d.is_dir())
        altered_dir = dirs[0]

        # Alter first file in the altered_dir
        files = sorted(f for f in altered_dir.iterdir() if f.is_file())
        file_to_change = files[0]
        file_to_change.write_bytes(_KNOWN_MODEL_TEXT)

        manifest2 = serializer.serialize(sample_model_folder)
        assert manifest1.digest != manifest2.digest

    def test_deep_folder(self, deep_model_folder):
        file_hasher = file.FileHasher("unused", memory.SHA256())
        serializer = dfs.DFSSerializer(file_hasher, memory.SHA256)
        manifest = serializer.serialize(deep_model_folder)
        expected = (
            "1ae1b8a653dba20787ae8482611761ee7f1223b15fbfbaa1fce5c55751048d62"
        )
        assert manifest.digest.digest_hex == expected
+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Machinery for serializing ML models.
16+
17+
Currently we have only one serializer that performs a DFS traversal of the model
18+
directory, but more serializers are coming soon.
19+
"""
20+
21+
from abc import ABCMeta, abstractmethod
22+
import pathlib
23+
24+
from model_signing.manifest import manifest
25+
26+
27+
class Serializer(metaclass=ABCMeta):
    """Interface for objects that serialize an ML model into a manifest."""

    @abstractmethod
    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
        """Serializes the model given by the `model_path` argument.

        Args:
            model_path: Path to the model to serialize.

        Returns:
            The manifest that represents the serialized model.
        """

0 commit comments

Comments
 (0)
Please sign in to comment.