From da6c118717a3e780fe32cefe50b59b3fa9bcbbb2 Mon Sep 17 00:00:00 2001
From: Noah Holm <32292420+noppaz@users.noreply.github.com>
Date: Sun, 5 Mar 2023 10:42:57 +0100
Subject: [PATCH] compute seed file hash incrementally

---
 .../unreleased/Under the Hood-20230305-093644.yaml |  6 ++++++
 core/dbt/contracts/files.py                        | 12 ++++++++++++
 core/dbt/parser/read_files.py                      |  3 +--
 3 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 .changes/unreleased/Under the Hood-20230305-093644.yaml

diff --git a/.changes/unreleased/Under the Hood-20230305-093644.yaml b/.changes/unreleased/Under the Hood-20230305-093644.yaml
new file mode 100644
index 00000000000..35f52fac6a3
--- /dev/null
+++ b/.changes/unreleased/Under the Hood-20230305-093644.yaml
@@ -0,0 +1,6 @@
+kind: Under the Hood
+body: Compute seed file hash incrementally
+time: 2023-03-05T09:36:44.023758357Z
+custom:
+  Author: noppaz
+  Issue: "7124"
diff --git a/core/dbt/contracts/files.py b/core/dbt/contracts/files.py
index 9e82247da00..a8e7e7ae753 100644
--- a/core/dbt/contracts/files.py
+++ b/core/dbt/contracts/files.py
@@ -109,6 +109,18 @@ def from_contents(cls, contents: str, name="sha256") -> "FileHash":
         checksum = hashlib.new(name, data).hexdigest()
         return cls(name=name, checksum=checksum)
 
+    @classmethod
+    def from_path(cls, path: str, name="sha256") -> "FileHash":
+        """Create a file hash from the file at given path."""
+        chunk_size = 1 * 1024 * 1024
+        file_hash = hashlib.new(name)
+        with open(path, "rb") as handle:
+            chunk = handle.read(chunk_size)
+            while chunk:
+                file_hash.update(chunk)
+                chunk = handle.read(chunk_size)
+        return cls(name=name, checksum=file_hash.hexdigest())
+
 
 @dataclass
 class RemoteFile(dbtClassMixin):
diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py
index 531e5f39560..8eaf45ce403 100644
--- a/core/dbt/parser/read_files.py
+++ b/core/dbt/parser/read_files.py
@@ -98,8 +98,7 @@ def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
         # We don't want to calculate a hash of this file. Use the path.
         source_file = SourceFile.big_seed(match)
     else:
-        file_contents = load_file_contents(match.absolute_path, strip=False)
-        checksum = FileHash.from_contents(file_contents)
+        checksum = FileHash.from_path(match.absolute_path)
         source_file = SourceFile(path=match, checksum=checksum)
     source_file.contents = ""
     source_file.parse_file_type = ParseFileType.Seed