12 | 12 | """ |
13 | 13 |
14 | 14 | import glob |
| 15 | +import hashlib |
15 | 16 | import json |
| 17 | +import os |
| 18 | +import pickle |
16 | 19 | import random |
17 | 20 | import re |
18 | 21 | import sys |
| 22 | +import tempfile |
19 | 23 | import time |
20 | 24 | from enum import Enum |
21 | 25 | from os.path import basename, dirname, exists |
@@ -106,12 +110,12 @@ def __init__( |
106 | 110 | dataset=None, |
107 | 111 | table=None, |
108 | 112 | billing_project=None, |
109 | | - ignore_content=False, |
| 113 | + use_cache=True, |
110 | 114 | ): |
111 | 115 | """Instantiate DryRun class.""" |
112 | 116 | self.sqlfile = sqlfile |
113 | 117 | self.content = content |
114 | | - self.ignore_content = ignore_content |
| 118 | + self.use_cache = use_cache |
115 | 119 | self.query_parameters = query_parameters |
116 | 120 | self.strip_dml = strip_dml |
117 | 121 | self.use_cloud_function = use_cloud_function |
@@ -227,16 +231,125 @@ def get_sql(self): |
227 | 231 |
228 | 232 | return sql |
229 | 233 |
| 234 | + def _get_cache_key(self, sql): |
| 235 | + """Generate cache key based on SQL content and other parameters.""" |
| 236 | + cache_input = f"{sql}|{self.project}|{self.dataset}|{self.table}" |
| 237 | + return hashlib.sha256(cache_input.encode()).hexdigest() |
| 238 | + |
| 239 | + def _get_cached_result(self, cache_key, ttl_seconds=None): |
| 240 | + """Load cached dry run result from disk.""" |
| 241 | + if ttl_seconds is None: |
| 242 | + ttl_seconds = ConfigLoader.get("dry_run", "cache_ttl_seconds", fallback=900) |
| 243 | + |
| 244 | + cache_dir = os.path.join(tempfile.gettempdir(), "bigquery_etl_dryrun_cache") |
| 245 | + os.makedirs(cache_dir, exist_ok=True) |
| 246 | + cache_file = os.path.join(cache_dir, f"dryrun_{cache_key}.pkl") |
| 247 | + |
| 248 | + if os.path.exists(cache_file): |
| 249 | + # check if cache is expired |
| 250 | + file_age = time.time() - os.path.getmtime(cache_file) |
| 251 | + if file_age > ttl_seconds: |
| 252 | + try: |
| 253 | + os.remove(cache_file) |
| 254 | + except OSError: |
| 255 | + pass |
| 256 | + return None |
| 257 | + |
| 258 | + try: |
| 259 | + with open(cache_file, "rb") as f: |
| 260 | + cached_data = pickle.load(f) |
| 261 | + cache_age = time.time() - os.path.getmtime(cache_file) |
| 262 | + print(f"[DRYRUN CACHE HIT] {self.sqlfile} (age: {cache_age:.0f}s)") |
| 263 | + return cached_data |
| 264 | + except (pickle.PickleError, EOFError, OSError) as e: |
| 265 | + print(f"[DRYRUN CACHE] Failed to load cache: {e}") |
| 266 | + return None |
| 267 | + |
| 268 | + return None |
| 269 | + |
| 270 | + def _save_cached_result(self, cache_key, result): |
| 271 | + """Save dry run result to disk cache.""" |
| 272 | + cache_dir = os.path.join(tempfile.gettempdir(), "bigquery_etl_dryrun_cache") |
| 273 | + os.makedirs(cache_dir, exist_ok=True) |
| 274 | + cache_file = os.path.join(cache_dir, f"dryrun_{cache_key}.pkl") |
| 275 | + |
| 276 | + try: |
| 277 | + with open(cache_file, "wb") as f: |
| 278 | + pickle.dump(result, f) |
| 279 | + |
| 280 | + # save table metadata separately if present |
| 281 | + if ( |
| 282 | + result |
| 283 | + and "tableMetadata" in result |
| 284 | + and self.project |
| 285 | + and self.dataset |
| 286 | + and self.table |
| 287 | + ): |
| 288 | + table_identifier = f"{self.project}.{self.dataset}.{self.table}" |
| 289 | + self._save_cached_table_metadata( |
| 290 | + table_identifier, result["tableMetadata"] |
| 291 | + ) |
| 292 | + except (pickle.PickleError, OSError) as e: |
| 293 | + print(f"[DRYRUN CACHE] Failed to save cache: {e}") |
| 294 | + |
| 295 | + def _get_cached_table_metadata(self, table_identifier, ttl_seconds=None): |
| 296 | + """Load cached table metadata from disk based on table identifier.""" |
| 297 | + if ttl_seconds is None: |
| 298 | + ttl_seconds = ConfigLoader.get("dry_run", "cache_ttl_seconds", fallback=900) |
| 299 | + |
| 300 | + cache_dir = os.path.join(tempfile.gettempdir(), "bigquery_etl_dryrun_cache") |
| 301 | + os.makedirs(cache_dir, exist_ok=True) |
| 302 | + # table identifier as cache key |
| 303 | + table_cache_key = hashlib.sha256(table_identifier.encode()).hexdigest() |
| 304 | + cache_file = os.path.join(cache_dir, f"table_metadata_{table_cache_key}.pkl") |
| 305 | + |
| 306 | + if os.path.exists(cache_file): |
| 307 | + # check if cache is expired |
| 308 | + file_age = time.time() - os.path.getmtime(cache_file) |
| 309 | + |
| 310 | + if file_age > ttl_seconds: |
| 311 | + try: |
| 312 | + os.remove(cache_file) |
| 313 | + except OSError: |
| 314 | + pass |
| 315 | + return None |
| 316 | + |
| 317 | + try: |
| 318 | + with open(cache_file, "rb") as f: |
| 319 | + cached_data = pickle.load(f) |
| 320 | + return cached_data |
| 321 | + except (pickle.PickleError, EOFError, OSError): |
| 322 | + return None |
| 323 | + return None |
| 324 | + |
| 325 | + def _save_cached_table_metadata(self, table_identifier, metadata): |
| 326 | + """Save table metadata to disk cache.""" |
| 327 | + cache_dir = os.path.join(tempfile.gettempdir(), "bigquery_etl_dryrun_cache") |
| 328 | + os.makedirs(cache_dir, exist_ok=True) |
| 329 | + table_cache_key = hashlib.sha256(table_identifier.encode()).hexdigest() |
| 330 | + cache_file = os.path.join(cache_dir, f"table_metadata_{table_cache_key}.pkl") |
| 331 | + |
| 332 | + try: |
| 333 | + with open(cache_file, "wb") as f: |
| 334 | + pickle.dump(metadata, f) |
| 335 | + except (pickle.PickleError, OSError) as e: |
| 336 | + print(f"[TABLE METADATA] Failed to save cache for {table_identifier}: {e}") |
| 337 | + |
230 | 338 | @cached_property |
231 | 339 | def dry_run_result(self): |
232 | 340 | """Dry run the provided SQL file.""" |
233 | | - if self.ignore_content: |
234 | | - sql = None |
| 341 | + if self.content: |
| 342 | + sql = self.content |
235 | 343 | else: |
236 | | - if self.content: |
237 | | - sql = self.content |
238 | | - elif self.content != "": |
239 | | - sql = self.get_sql() |
| 344 | + sql = self.get_sql() |
| 345 | + |
| 346 | + # Check cache first (if caching is enabled) |
| 347 | + if sql is not None and self.use_cache: |
| 348 | + cache_key = self._get_cache_key(sql) |
| 349 | + cached_result = self._get_cached_result(cache_key) |
| 350 | + if cached_result is not None: |
| 351 | + self.dry_run_duration = 0 # Cached result, no actual dry run |
| 352 | + return cached_result |
240 | 353 |
241 | 354 | query_parameters = [] |
242 | 355 | if self.query_parameters: |
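For reference, the on-disk location of a cached dry-run result can be derived outside the class as well; a minimal sketch mirroring `_get_cache_key` and the cache-directory layout above (the helper name and the example identifiers are illustrative, not part of the change):

```python
import hashlib
import os
import tempfile


def dryrun_cache_path(sql, project, dataset, table):
    """Return the path a dry-run result for this query would be cached under."""
    # Same derivation as DryRun._get_cache_key: SHA-256 over the SQL text plus
    # the fully qualified table reference, joined with "|".
    cache_input = f"{sql}|{project}|{dataset}|{table}"
    cache_key = hashlib.sha256(cache_input.encode()).hexdigest()
    cache_dir = os.path.join(tempfile.gettempdir(), "bigquery_etl_dryrun_cache")
    return os.path.join(cache_dir, f"dryrun_{cache_key}.pkl")


print(dryrun_cache_path("SELECT 1", "example-project", "example_dataset", "example_table"))
```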
@@ -356,6 +469,11 @@ def dry_run_result(self): |
356 | 469 | } |
357 | 470 |
358 | 471 | self.dry_run_duration = time.time() - start_time |
| 472 | + |
| 473 | + # Save to cache (if caching is enabled) |
| 474 | + if self.use_cache and sql is not None: |
| 475 | + self._save_cached_result(cache_key, result) |
| 476 | + |
359 | 477 | return result |
360 | 478 |
361 | 479 | except Exception as e: |
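Because expiry is driven entirely by file modification time, the cache can also be discarded by hand to force fresh dry runs; a small sketch, assuming the cache directory name stays as introduced above:

```python
import os
import shutil
import tempfile

# Both dryrun_*.pkl results and table_metadata_*.pkl entries live here.
cache_dir = os.path.join(tempfile.gettempdir(), "bigquery_etl_dryrun_cache")

# Entries older than dry_run.cache_ttl_seconds (default 900s) are evicted
# lazily on the next lookup; removing the directory clears everything now.
if os.path.isdir(cache_dir):
    shutil.rmtree(cache_dir)
```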
@@ -481,6 +599,13 @@ def get_table_schema(self): |
481 | 599 | ): |
482 | 600 | return self.dry_run_result["tableMetadata"]["schema"] |
483 | 601 |
| 602 | + # Check if table metadata is cached (if caching is enabled) |
| 603 | + if self.use_cache and self.project and self.dataset and self.table: |
| 604 | + table_identifier = f"{self.project}.{self.dataset}.{self.table}" |
| 605 | + cached_metadata = self._get_cached_table_metadata(table_identifier) |
| 606 | + if cached_metadata: |
| 607 | + return cached_metadata.get("schema", []) |
| 608 | + |
484 | 609 | return [] |
485 | 610 |
486 | 611 | def get_dataset_labels(self): |
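Taken together, the new flag can be exercised roughly as follows; this is a sketch only, with the import path, SQL file path, and BigQuery credential setup assumed rather than shown in this diff:

```python
# Assumed import path; not shown in this diff.
from bigquery_etl.dryrun import DryRun

path = "sql/example-project/example_dataset/example_table/query.sql"  # illustrative

# First call performs a real dry run and writes the pickled result under
# $TMPDIR/bigquery_etl_dryrun_cache/ (use_cache defaults to True).
first = DryRun(sqlfile=path)
_ = first.dry_run_result

# A second instance with identical SQL and table reference reuses the cached
# result until the TTL expires; dry_run_duration is reported as 0 on a hit.
second = DryRun(sqlfile=path)
_ = second.dry_run_result
print(second.dry_run_duration)

# Opt out of caching to force a fresh dry run every time.
uncached = DryRun(sqlfile=path, use_cache=False)
_ = uncached.dry_run_result
```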