Skip to content

Commit 491cd3d

Browse files
committed
Manual route-handler cleanup + pattern matching that allows wildcard entries in the ignored-sites list
1 parent 49f7e1a commit 491cd3d

File tree

1 file changed

+69
-56
lines changed

1 file changed

+69
-56
lines changed

src/environments/launch.py

Lines changed: 69 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import asyncio
22
import json
33
import logging
4+
import re
45
from datetime import datetime
56
from pathlib import Path
67
from typing import Any, Dict, Optional
@@ -14,7 +15,7 @@
1415

1516
logger = logging.getLogger(__name__)
1617

17-
BLOCKED_PATTERNS = [
18+
IGNORED_PATTERNS = [
1819
"google-analytics",
1920
"googleads",
2021
"google-tag-manager",
@@ -28,12 +29,32 @@
2829
"googletagmanager.com",
2930
"amazon.com/1/events/",
3031
"amazon-adsystem.com",
31-
"amazon.com/rd/uedata",
32-
"amazon.com/ap/uedata",
33-
"amazon.com/*/uedata", # TODO: handle patterns instead
32+
"amazon.com/*/uedata",
3433
"fls-na.amazon.com",
3534
]
3635

36+
_compiled_patterns = []
37+
for pattern in IGNORED_PATTERNS:
38+
if "*" in pattern:
39+
# Convert wildcard pattern to regex: * matches any characters except nothing
40+
regex_pattern = re.escape(pattern).replace(r"\*", r"[^/]+")
41+
_compiled_patterns.append(("regex", re.compile(regex_pattern, re.IGNORECASE)))
42+
else:
43+
_compiled_patterns.append(("substring", pattern.lower()))
44+
45+
46+
def should_ignore_url(url: str):
47+
"""Check if URL should be ignored based on IGNORED_PATTERNS (supports wildcards)."""
48+
url_lower = url.lower()
49+
for pattern_type, pattern in _compiled_patterns:
50+
if pattern_type == "substring":
51+
if pattern in url_lower:
52+
return True
53+
elif pattern_type == "regex":
54+
if pattern.search(url_lower):
55+
return True
56+
return False
57+
3758

3859
class ReplayBundle:
3960
"""Replay previously captured browsing resources using HAR files."""
@@ -71,10 +92,8 @@ def _setup_network_logging(self, context: BrowserContext) -> None:
7192
"""Set up network event listeners to log requests not found in HAR."""
7293

7394
async def log_request_failed(request):
74-
url_lower = request.url.lower()
75-
for pattern in BLOCKED_PATTERNS:
76-
if pattern in url_lower:
77-
return
95+
if should_ignore_url(request.url):
96+
return
7897

7998
logger.warning(
8099
"⚠️ Request FAILED (not in HAR): %s %s [%s]",
@@ -224,64 +243,58 @@ async def build_context(
224243
async def handle_routes_manually(self, route, request):
    """Resolve *request* against the HAR bundle instead of default HAR routing.

    - Requests matching IGNORED_PATTERNS (analytics/ads beacons) are aborted.
    - POST requests to the known auth endpoints below are matched against the
      HAR by URL + method only, ignoring the POST body (it carries volatile
      tokens), and the recorded response is replayed.
    - Every other request falls back to Playwright's regular HAR routing.
    """
    # TODO: do we need to obfuscate in a more clever way?
    # - ?? Normalize JSON (remove volatile fields; sort keys) and hash; tolerate multipart boundary changes; ignore known nonce/timestamp params.
    # TODO: what if the request is sent twice, we'll be selecting the first one all the time.

    # TODO: this requires LM postprocessing selection of URL's to match or some dumb way for all POST? or smth
    urls_to_ignore_post_data = {
        "https://www.amazon.com/ax/claim",
        "https://www.amazon.com/aaut/verify/ap",
        "https://www.amazon.com/ap/signin",
    }

    if should_ignore_url(request.url):
        await route.abort()
        return

    if request.method != "POST":
        # TODO: should handle DELETE, PUT? everything that is not GET?
        await route.fallback()
        return

    # NOTE(review): POSTs outside urls_to_ignore_post_data are aborted below
    # without any HAR lookup — confirm they shouldn't route.fallback() instead.
    if any(request.url.startswith(base) for base in urls_to_ignore_post_data):
        # Load the HAR once; the previous version reloaded it per candidate URL.
        har_data = self._load_har_data()
        har_entries = har_data.get("log", {}).get("entries", [])

        entry = next(
            (
                e
                for e in har_entries
                if e.get("request", {}).get("method") == "POST"
                and e.get("request", {}).get("url") == request.url
            ),
            None,
        )
        if entry is not None:
            logger.info(
                "✅ Found matching HAR entry for %s",
                request.url[:100] + "..." if len(request.url) > 100 else request.url,
            )
            response = entry.get("response", {})
            status = response.get("status", 200)
            headers = {h["name"]: h["value"] for h in response.get("headers", [])}
            content = response.get("content", {})
            body = content.get("text")
            # TODO: is body provided properly? what about other body types
            await route.fulfill(status=status, headers=headers, body=body)
            return

    # If not found in HAR, abort it
    logger.warning("⚠️ No matching HAR entry found for %s, aborting", request.url)
    await route.abort()
285298

286299
def _storage_state_path(self) -> Optional[Path]:
287300
storage_dir = self.bundle_path / "storage"

0 commit comments

Comments
 (0)