|
1 | 1 | import asyncio |
2 | 2 | import json |
3 | 3 | import logging |
| 4 | +import re |
4 | 5 | from datetime import datetime |
5 | 6 | from pathlib import Path |
6 | 7 | from typing import Any, Dict, Optional |
|
14 | 15 |
|
15 | 16 | logger = logging.getLogger(__name__) |
16 | 17 |
|
17 | | -BLOCKED_PATTERNS = [ |
| 18 | +IGNORED_PATTERNS = [ |
18 | 19 | "google-analytics", |
19 | 20 | "googleads", |
20 | 21 | "google-tag-manager", |
|
28 | 29 | "googletagmanager.com", |
29 | 30 | "amazon.com/1/events/", |
30 | 31 | "amazon-adsystem.com", |
31 | | - "amazon.com/rd/uedata", |
32 | | - "amazon.com/ap/uedata", |
33 | | - "amazon.com/*/uedata", # TODO: handle patterns instead |
| 32 | + "amazon.com/*/uedata", |
34 | 33 | "fls-na.amazon.com", |
35 | 34 | ] |
36 | 35 |
|
def _compile_ignore_patterns(patterns):
    """Pre-process ignore patterns into ``(kind, matcher)`` pairs.

    Patterns containing ``*`` become ``("regex", compiled_pattern)``; all
    other patterns become ``("substring", lowercased_pattern)``.

    Args:
        patterns: Iterable of pattern strings.

    Returns:
        A list of ``(kind, matcher)`` tuples in the same order as *patterns*.
    """
    compiled = []
    for raw in patterns:
        if "*" in raw:
            # "*" stands for one or more characters other than "/", so it
            # matches within a single path segment and never matches an
            # empty string. Everything else in the pattern is literal.
            regex_source = re.escape(raw).replace(r"\*", r"[^/]+")
            compiled.append(("regex", re.compile(regex_source, re.IGNORECASE)))
        else:
            compiled.append(("substring", raw.lower()))
    return compiled


# Built once at import time so URL checks do not recompile patterns.
_compiled_patterns = _compile_ignore_patterns(IGNORED_PATTERNS)
| 44 | + |
| 45 | + |
def should_ignore_url(url: str):
    """Return True if *url* matches any entry in ``_compiled_patterns``.

    The URL is lower-cased once, then tested against each pre-processed
    pattern: "substring" entries use a containment check, "regex" entries
    use ``search``. Returns False when nothing matches.
    """
    candidate = url.lower()

    def _hits(kind, matcher):
        # Entries of an unrecognised kind never match.
        if kind == "substring":
            return matcher in candidate
        if kind == "regex":
            return matcher.search(candidate) is not None
        return False

    return any(_hits(kind, matcher) for kind, matcher in _compiled_patterns)
| 57 | + |
37 | 58 |
|
38 | 59 | class ReplayBundle: |
39 | 60 | """Replay previously captured browsing resources using HAR files.""" |
@@ -71,10 +92,8 @@ def _setup_network_logging(self, context: BrowserContext) -> None: |
71 | 92 | """Set up network event listeners to log requests not found in HAR.""" |
72 | 93 |
|
73 | 94 | async def log_request_failed(request): |
74 | | - url_lower = request.url.lower() |
75 | | - for pattern in BLOCKED_PATTERNS: |
76 | | - if pattern in url_lower: |
77 | | - return |
| 95 | + if should_ignore_url(request.url): |
| 96 | + return |
78 | 97 |
|
79 | 98 | logger.warning( |
80 | 99 | "⚠️ Request FAILED (not in HAR): %s %s [%s]", |
@@ -224,64 +243,58 @@ async def build_context( |
async def handle_routes_manually(self, route, request):
    """Route handler that replays selected POST requests from the HAR data.

    Decision order:

    1. URLs matched by ``should_ignore_url`` are aborted.
    2. Non-POST requests are handed on with ``route.fallback()``.
    3. POST requests whose URL starts with one of the known prefixes are
       matched against HAR entries by method and exact URL only (the POST
       body is deliberately not compared). The first matching entry's
       recorded response is used to fulfill the request.
    4. Any other POST, and any POST without a matching entry, is logged
       and aborted.

    Args:
        route: Route object; ``abort``, ``fallback`` and ``fulfill`` are
            awaited on it.
        request: Request being routed; only ``url`` and ``method`` are read.
    """
    # Local import: only needed for base64-encoded HAR bodies.
    import base64

    # TODO: do we need to obfuscate in a more clever way?
    # - ?? Normalize JSON (remove volatile fields; sort keys) and hash; tolerate multipart boundary changes; ignore known nonce/timestamp params.
    # TODO: what if the request is sent twice, we'll be selecting the first one all the time.

    # TODO: this requires LM postprocessing selection of URL's to match or some dumb way for all POST? or smth
    # Kept as a tuple so it can be passed straight to ``str.startswith``.
    urls_to_ignore_post_data = (
        "https://www.amazon.com/ax/claim",
        "https://www.amazon.com/aaut/verify/ap",
        "https://www.amazon.com/ap/signin",
    )

    if should_ignore_url(request.url):
        await route.abort()
        return

    if request.method != "POST":
        # TODO: should handle DELETE, PUT? everything that is not GET?
        await route.fallback()
        return

    entry = None
    if request.url.startswith(urls_to_ignore_post_data):
        # The HAR data is only loaded when the URL is one we replay manually.
        har_entries = self._load_har_data().get("log", {}).get("entries", [])
        entry = next(
            (
                candidate
                for candidate in har_entries
                if candidate.get("request", {}).get("method") == "POST"
                and candidate.get("request", {}).get("url") == request.url
            ),
            None,
        )

    if not entry:
        # If not found in HAR, abort it
        logger.warning("⚠️ No matching HAR entry found for %s, aborting", request.url)
        await route.abort()
        return

    logger.info(
        "✅ Found matching HAR entry for %s",
        request.url[:100] + "..." if len(request.url) > 100 else request.url,
    )
    response = entry.get("response", {})
    status = response.get("status", 200)
    headers = {h["name"]: h["value"] for h in response.get("headers", [])}
    content = response.get("content", {})
    body = content.get("text")
    if body is not None and content.get("encoding") == "base64":
        # HAR marks binary bodies with encoding="base64"; decode so the
        # original bytes are replayed instead of the base64 text.
        body = base64.b64decode(body)
    # TODO: what about other body types
    await route.fulfill(status=status, headers=headers, body=body)
285 | 298 |
|
286 | 299 | def _storage_state_path(self) -> Optional[Path]: |
287 | 300 | storage_dir = self.bundle_path / "storage" |
|
0 commit comments