Skip to content

Commit 220d5c7

Browse files
Add hashing API for model signing (sigstore#188)
* Add trivial .gitignore Signed-off-by: Mihai Maruseac <[email protected]> * Add missing copyright notice Signed-off-by: Mihai Maruseac <[email protected]> * Add signing API Signed-off-by: Mihai Maruseac <[email protected]> * Fix serialize_test import Signed-off-by: Mihai Maruseac <[email protected]> * Remove unused imports Signed-off-by: Mihai Maruseac <[email protected]> * Increase max line length for `flake8` to match Google style Signed-off-by: Mihai Maruseac <[email protected]> * Remove duplicated file Signed-off-by: Mihai Maruseac <[email protected]> * Slightly change API and add a few more tests Signed-off-by: Mihai Maruseac <[email protected]> * Reach 100% test coverage Signed-off-by: Mihai Maruseac <[email protected]> * Fix license stub Signed-off-by: Mihai Maruseac <[email protected]> * Make illegal states unrepresentable. Remove bad design patterns around the ahshing engine concerns. Signed-off-by: Mihai Maruseac <[email protected]> * Use protocols for file hashing Signed-off-by: Mihai Maruseac <[email protected]> * Fix some errors. Will fix rest later Signed-off-by: Mihai Maruseac <[email protected]> * Fix file hashing errors and API Signed-off-by: Mihai Maruseac <[email protected]> * Remove unused import Signed-off-by: Mihai Maruseac <[email protected]> * Fix lint Signed-off-by: Mihai Maruseac <[email protected]> * Remove unused imports Signed-off-by: Mihai Maruseac <[email protected]> * Remove check that is no longer needed Signed-off-by: Mihai Maruseac <[email protected]> --------- Signed-off-by: Mihai Maruseac <[email protected]> Signed-off-by: Mihai Maruseac <[email protected]>
1 parent d52f37b commit 220d5c7

12 files changed

+910
-1
lines changed

.github/workflows/lint.yml

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ jobs:
2020
python-version: "3.11"
2121
- name: flake8 Lint
2222
uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2.3.0
23+
with:
24+
max-line-length: "80"
2325
- name: Detect empty lines at end of file and trailing whitespace
2426
run: |
2527
set -euxo pipefail # No -x here!

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__pycache__/

model_signing/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

model_signing/hashing/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

model_signing/hashing/file.py

+223
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Machinery for computing digests for a single file.
16+
17+
Example usage for `FileHasher`:
18+
```python
19+
>>> with open("/tmp/file", "w") as f:
20+
... f.write("abcd")
21+
>>> hasher = FileHasher("/tmp/file", SHA256())
22+
>>> digest = hasher.compute()
23+
>>> digest.digest_hex
24+
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
25+
```
26+
27+
Example usage for `ShardedFileHasher`, reading only the second part of a file:
28+
```python
29+
>>> with open("/tmp/file", "w") as f:
30+
... f.write("0123abcd")
31+
>>> hasher = ShardedFileHasher("/tmp/file", SHA256(), start=4, end=8)
32+
>>> digest = hasher.compute()
33+
>>> digest.digest_hex
34+
'88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
35+
```
36+
"""
37+
38+
import pathlib
39+
from typing_extensions import override
40+
41+
from model_signing.hashing import hashing
42+
43+
44+
class FileHasher(hashing.HashEngine):
    """Generic file hash engine.

    To compute the hash of a file, we read the file exactly once, including for
    very large files that don't fit in memory. Files are read in chunks and each
    chunk is passed to the `update` method of an inner
    `hashing.StreamingHashEngine` instance. This ensures that the file digest
    will not change even if the chunk size changes. As such, we can dynamically
    determine an optimal value for the chunk argument.

    The `digest_name()` method MUST record all parameters that influence the
    hash output. For example, if a file is split into shards which are hashed
    separately and the final digest value is computed by aggregating these
    hashes, then the shard size must be given in the output string. However, for
    simplicity, predefined names can be used to override the `digest_name()`
    output.
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        chunk_size: int = 8192,
        digest_name_override: str | None = None,
    ):
        """Initializes an instance to hash a file with a specific `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
              compute the digest of the file.
            chunk_size: The amount of file to read at once. Default is 8KB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            digest_name_override: Optional string to allow overriding the
              `digest_name` property to support shorter, standardized names.

        Raises:
            ValueError: If `chunk_size` is negative.
        """
        if chunk_size < 0:
            raise ValueError(
                f"Chunk size must be non-negative, got {chunk_size}."
            )

        self._file = file
        self._content_hasher = content_hasher
        self._chunk_size = chunk_size
        self._digest_name_override = digest_name_override

    def set_file(self, file: pathlib.Path) -> None:
        """Redefines the file to be hashed in `compute`."""
        self._file = file

    @override
    @property
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        return f"file-{self._content_hasher.digest_name}"

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            if self._chunk_size == 0:
                # Special case: attempt to read the whole file in one call.
                self._content_hasher.update(f.read())
            else:
                # Stream the file chunk by chunk so arbitrarily large files
                # are hashed with bounded memory.
                while data := f.read(self._chunk_size):
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        # Re-wrap the digest so its name reflects this engine's parameters
        # (or the caller-provided override), not just the inner hasher's.
        return hashing.Digest(self.digest_name, digest.digest_value)
120+
121+
122+
class ShardedFileHasher(FileHasher):
    """File hash engine that can be invoked in parallel.

    To efficiently support hashing large files, this class provides an ability
    to compute the digest over a shard of the file. It is the responsibility of
    the user to compose the digests of each shard into a single digest for the
    entire file.
    """

    def __init__(
        self,
        file: pathlib.Path,
        content_hasher: hashing.StreamingHashEngine,
        *,
        start: int,
        end: int,
        chunk_size: int = 8192,
        shard_size: int = 1000000,
        digest_name_override: str | None = None,
    ):
        """Initializes an instance to hash a file shard with a `HashEngine`.

        Args:
            file: The file to hash. Use `set_file` to reset it.
            content_hasher: A `hashing.StreamingHashEngine` instance used to
              compute the digest of the file shard. This instance must not be
              used outside of this instance. However, it may be pre-initialized
              with a header.
            start: The file offset to start reading from. Must be
              non-negative. Reset with `set_shard`.
            end: The file offset to stop reading at. Must be strictly greater
              than `start`, and `end - start` must not exceed `shard_size`.
              Reset with `set_shard`.
            chunk_size: The amount of file to read at once. Default is 8KB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            shard_size: The maximum size of a file shard this instance is
              allowed to hash. Default is 1,000,000 bytes. Recorded in
              `digest_name` because it influences how digests compose.
            digest_name_override: Optional string to allow overriding the
              `digest_name` property to support shorter, standardized names.

        Raises:
            ValueError: If `chunk_size` is negative, if `shard_size` is not
              strictly positive, or if `(start, end)` is not a valid shard.
        """
        super().__init__(
            file=file,
            content_hasher=content_hasher,
            chunk_size=chunk_size,
            digest_name_override=digest_name_override,
        )

        # Validate the shard size before `set_shard`, which depends on it.
        if shard_size <= 0:
            raise ValueError(
                f"Shard size must be strictly positive, got {shard_size}."
            )
        self._shard_size = shard_size

        self.set_shard(start=start, end=end)

    def set_shard(self, *, start: int, end: int) -> None:
        """Redefines the file shard to be hashed in `compute`.

        Args:
            start: The file offset to start reading from. Must be
              non-negative.
            end: The file offset to stop reading at. Must be strictly greater
              than `start`, and `end - start` must not exceed the configured
              shard size.

        Raises:
            ValueError: If `(start, end)` does not describe a valid shard.
        """
        if start < 0:
            raise ValueError(
                f"File start offset must be non-negative, got {start}."
            )
        if end <= start:
            raise ValueError(
                "File end offset must be strictly higher than file start"
                f" offset, got {start=}, {end=}."
            )
        read_length = end - start
        if read_length > self._shard_size:
            raise ValueError(
                f"Must not read more than shard_size={self._shard_size}, got"
                f" {read_length}."
            )

        self._start = start
        self._end = end

    @override
    def compute(self) -> hashing.Digest:
        self._content_hasher.reset()

        with open(self._file, "rb") as f:
            f.seek(self._start)
            to_read = self._end - self._start
            if self._chunk_size == 0 or self._chunk_size >= to_read:
                # Read the whole shard in a single call.
                data = f.read(to_read)
                self._content_hasher.update(data)
            else:
                # Stream the shard, never reading past its end. Stop early if
                # the file is shorter than the requested shard.
                while to_read > 0:
                    data = f.read(min(self._chunk_size, to_read))
                    if not data:
                        break
                    to_read -= len(data)
                    self._content_hasher.update(data)

        digest = self._content_hasher.compute()
        return hashing.Digest(self.digest_name, digest.digest_value)

    @override
    @property
    def digest_name(self) -> str:
        if self._digest_name_override is not None:
            return self._digest_name_override
        # The shard size influences how per-shard digests compose into a
        # whole-file digest, so it must be part of the recorded name.
        return f"file-{self._content_hasher.digest_name}-{self._shard_size}"

0 commit comments

Comments
 (0)