Skip to content

Commit 5b75d08

Browse files
committed
Migrate serialize_v0 to new API.
This is the middle layer of the API design work (sigstore#172). We add a manifest abstract class to represent various manifests (sigstore#111 sigstore#112) and also ways to serialize a model directory into manifests and ways to verify the manifests. For now, this only does what was formerly known as `serialize_v0`. The v1 and the manifest versions will come soon. Note: This has a lot of inspiration from sigstore#112, but makes the API work with all the usecases we need to consider right now. Signed-off-by: Mihai Maruseac <[email protected]>
1 parent 220d5c7 commit 5b75d08

File tree

7 files changed

+488
-3
lines changed

7 files changed

+488
-3
lines changed

model_signing/hashing/file.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
```python
2929
>>> with open("/tmp/file", "w") as f:
3030
... f.write("0123abcd")
31-
>>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8)
31+
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
3232
>>> digest = hasher.compute()
3333
>>> digest.digest_hex
3434
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
@@ -144,8 +144,7 @@ def __init__(
144144
Args:
145145
file: The file to hash. Use `set_file` to reset it.
146146
content_hasher: A `hashing.HashEngine` instance used to compute the
147-
digest of the file. This instance must not be used outside of this
148-
instance. However, it may be pre-initialized with a header.
147+
digest of the file.
149148
start: The file offset to start reading from. Must be valid. Reset
150149
with `set_shard`.
151150
end: The file offset to stop reading at. Must be strictly greater

model_signing/manifest/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

model_signing/manifest/manifest.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Machinery for representing an ML model in serialized form.
16+
17+
Currently, we only support a manifest that wraps around a digest. But, to
18+
support incremental updates and partial signature verification, we need a
19+
manifest that lists files and their digests. That will come in a future change,
20+
soon.
21+
"""
22+
23+
from abc import ABCMeta
24+
from dataclasses import dataclass
25+
26+
from model_signing.hashing import hashing
27+
28+
29+
class Manifest(metaclass=ABCMeta):
    """Base type for every manifest that can represent a model.

    The class declares no members of its own; it exists so that concrete
    manifest kinds share a common ancestor.
    """
33+
34+
35+
@dataclass
class DigestManifest(Manifest):
    """Manifest made of a single digest and nothing else."""

    # The one digest carried by this manifest.
    digest: hashing.Digest

model_signing/serializing/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

model_signing/serializing/dfs.py

+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Model serializers that build a single hash out of a DFS traversal."""
16+
17+
import pathlib
18+
from typing import Callable
19+
from typing_extensions import override
20+
21+
from model_signing.hashing import file
22+
from model_signing.hashing import hashing
23+
from model_signing.manifest import manifest
24+
from model_signing.serializing import serializing
25+
26+
27+
def _check_file_or_directory(path: pathlib.Path) -> bool:
    """Reports whether `path` points at a file or at a directory.

    Args:
        path: The path to inspect.

    Returns:
        True if `path.is_file()` or `path.is_dir()` holds, False otherwise.
    """
    if path.is_file():
        return True
    return path.is_dir()
30+
31+
32+
def _build_header(*, entry_name: str, entry_type: str) -> bytes:
    """Encodes an entry's type and name into a header byte string.

    The result is the UTF-8 encoded type, a dot, the UTF-8 encoded name and a
    trailing dot, e.g. `b"file.weights.bin."`.

    Args:
        entry_name: The name of the entry to build the header for.
        entry_type: The type of the entry (file or directory).

    Returns:
        The header bytes.
    """
    separator = b"."
    type_bytes = entry_type.encode("utf-8")
    name_bytes = entry_name.encode("utf-8")
    return type_bytes + separator + name_bytes + separator
42+
43+
44+
class DFSSerializer(serializing.Serializer):
    """Serializer for a model that performs a traversal of the model directory.

    This serializer produces a single hash for the entire model. If the model is
    a file, the hash is the digest of the file. If the model is a directory, we
    perform a depth-first traversal of the directory, hash each individual file
    and aggregate the hashes together.
    """

    def __init__(
        self,
        file_hasher: file.FileHasher,
        merge_hasher_factory: Callable[[], hashing.StreamingHashEngine],
    ):
        """Initializes an instance that serializes a model to a single digest.

        Args:
            file_hasher: The hasher used to compute the digest of each
              individual file. It is pointed at every file in turn via
              `set_file`.
            merge_hasher_factory: A callable that returns a
              `hashing.StreamingHashEngine` instance used to merge individual
              file digests to compute an aggregate digest. It is called once
              for every directory that is traversed.
        """
        self._file_hasher = file_hasher
        self._merge_hasher_factory = merge_hasher_factory

    @override
    def serialize(self, model_path: pathlib.Path) -> manifest.Manifest:
        """Serializes the model at `model_path` into a digest manifest.

        Args:
            model_path: Path to the model, either a single file or a directory.

        Returns:
            A `manifest.DigestManifest` wrapping the file digest (for a file)
            or the aggregate digest of the traversal (for a directory).

        Raises:
            ValueError: If `model_path` is neither a file nor a directory.
        """
        # TODO(mihaimaruseac): Add checks for symlinks
        if not _check_file_or_directory(model_path):
            raise ValueError(
                f"Must have a file or directory, but '{model_path}' is neither."
            )

        if model_path.is_file():
            self._file_hasher.set_file(model_path)
            return manifest.DigestManifest(self._file_hasher.compute())

        return manifest.DigestManifest(self._dfs(model_path))

    def _dfs(self, directory: pathlib.Path) -> hashing.Digest:
        """Computes the aggregate digest of `directory`, depth first.

        Children are visited in sorted order. For each child, a header built
        from its type and its name (not its full path) is fed to the merge
        hasher, followed by the child's digest: the file digest for a file, or
        the recursively computed digest for a subdirectory.

        Args:
            directory: The directory to traverse.

        Returns:
            The digest computed by the merge hasher for this directory.

        Raises:
            ValueError: If an entry is neither a file nor a directory.
        """
        # TODO(mihaimaruseac): Add support for excluded files
        children = sorted(directory.iterdir())

        hasher = self._merge_hasher_factory()
        for child in children:
            if not _check_file_or_directory(child):
                raise ValueError(
                    f"Must have a file or directory, but '{child}' is neither."
                )

            is_file = child.is_file()
            # The header goes in before the child's digest is computed, which
            # keeps the original order of operations on the hashers.
            header = _build_header(
                entry_name=child.name,
                entry_type="file" if is_file else "dir",
            )
            hasher.update(header)

            if is_file:
                self._file_hasher.set_file(child)
                digest = self._file_hasher.compute()
            else:
                digest = self._dfs(child)
            hasher.update(digest.digest_value)

        return hasher.compute()

0 commit comments

Comments
 (0)