SandboxEnvironment reusing context configs from launch.py

josancamon19 · josancamon19 · commit 8e01803e46a4 · 2025-10-28T11:36:45.000-07:00
diff --git a/src/browser/browser.py b/src/browser/browser.py
@@ -204,17 +204,6 @@ async def launch_browser(self, task_id: int) -> BrowserContext:
             "--password-store=basic",
         ]
 
-        # TODO: environment.py should be cleaned and reuse more of launch.py
-        # TODO: does the agent launch works?
-        # TODO: does the agent when evaluated works on the environment?
-        # TODO: improve launching and running the environment
-
-        # ====== once this works well ======
-
-        # TODO: collect env with further n steps depth, using replay to bypass auths sections
-        # TODO: eval runs in parallel containers, or ran on kernel, hosting tunneled versions locally while it runs?
-        # TODO: websockets? like e.g. ChatGPT doesn't allow for collecting anything
-
         browser = await self.playwright.chromium.launch(
             channel=preferred_channel,
             headless=False,
diff --git a/src/environments/environment.py b/src/environments/environment.py
@@ -2,7 +2,6 @@
 import http.client
 import json
 import logging
-import os
 import socket
 from pathlib import Path
 from typing import List, Optional
@@ -11,7 +10,7 @@
 from playwright.async_api import BrowserContext, BrowserType, async_playwright
 
 from environments.launch import ReplayBundle
-from config.browser_config import BROWSER_ARGS, CONTEXT_CONFIG
+from config.browser_config import BROWSER_ARGS
 
 
 logger = logging.getLogger(__name__)
@@ -71,41 +70,28 @@ def __init__(
         bundle_path: Path,
         *,
         allow_network_fallback: bool = False,
-        headless: Optional[bool] = None,
-        browser_args: Optional[List[str]] = None,
+        headless: bool = False,
         safe_mode: bool = False,
+        browser_args: Optional[List[str]] = None,
+        include_storage_state: bool = False,
     ) -> None:
         self.bundle = ReplayBundle(bundle_path)
         self.allow_network_fallback = allow_network_fallback
         self.safe_mode = safe_mode
-
-        # TODO: what is this needed for?
-        env_headless = os.environ.get("SANDBOX_HEADLESS")
-        env_safe_mode = os.environ.get("SANDBOX_SAFE_MODE")
-        # TODO: ignore this env vars bullshit
-        # TODO: some variables naming don't elicit anything
-        if env_safe_mode is not None:
-            self.safe_mode = env_safe_mode.lower() in {"1", "true", "yes", "on"}
-
-        # Determine headless setting (allow override even in safe mode)
-        if env_headless is not None:
-            self.headless = env_headless.lower() in {"1", "true", "yes", "on"}
-        else:
-            self.headless = headless if headless is not None else False
+        self.headless = headless
+        self.include_storage_state = include_storage_state
 
         # Set browser args based on mode
         if self.safe_mode:
-            base_args = SAFE_BROWSER_ARGS
+            self.browser_args = SAFE_BROWSER_ARGS
+        elif browser_args is not None:
+            self.browser_args = browser_args
         else:
-            base_args = browser_args if browser_args is not None else BROWSER_ARGS
-
-        if browser_args is not None and self.safe_mode:
-            base_args = browser_args
-
-        self.browser_args = list(base_args) if base_args else []
+            self.browser_args = BROWSER_ARGS
 
         self._playwright = None
         self._browser: Optional[PlaywrightBrowser] = None
+        # TODO: a list of contexts seems like overkill
         self._contexts: list[BrowserContext] = []
         self._ws_endpoint: Optional[str] = None
         self._debug_port: Optional[int] = None
@@ -135,11 +121,7 @@ async def start(self) -> str:
             self.headless,
         )
 
-        launch_kwargs = {
-            "headless": self.headless,
-            "args": launch_args,
-        }
-
+        launch_kwargs = {"headless": self.headless, "args": launch_args}
         self._browser = await browser_type.launch(**launch_kwargs)
         self._browser.on(
             "context",
@@ -148,7 +130,11 @@ async def start(self) -> str:
 
         # Ensure at least one context exists for routing
         if not self._browser.contexts:
-            context = await self._browser.new_context(**CONTEXT_CONFIG)
+            # TODO: or should this use **CONTEXT_CONFIG instead?
+            context_config = self.bundle.get_context_config(
+                include_storage_state=self.include_storage_state
+            )
+            context = await self._browser.new_context(**context_config)
             await self._configure_context(context)
         else:
             for context in list(self._browser.contexts):
@@ -205,35 +191,26 @@ def _fetch() -> Optional[str]:
         raise RuntimeError("Timed out waiting for Chrome debugger endpoint")
 
     async def _configure_context(self, context: BrowserContext) -> None:
+        """Configure a context with HAR replay using the bundle's configuration."""
         if context in self._contexts:
             return
         self._contexts.append(context)
 
-        # Configure HAR-based replay
-        har_path = self.bundle.bundle_path / "recording.har"
-        if har_path.exists():
-            logger.info("[SANDBOX] Using HAR replay from %s", har_path)
-            await context.route_from_har(
-                str(har_path),
-                not_found="fallback" if self.allow_network_fallback else "abort",
-                update=False,
-            )
-        else:
-            raise FileNotFoundError(
-                f"[SANDBOX] HAR file not found at {har_path}. Cannot replay without HAR file."
-            )
+        # Delegate to the bundle's configure_context method to keep this DRY
+        await self.bundle.configure_context(
+            context,
+            allow_network_fallback=self.allow_network_fallback,
+        )
 
     async def close(self) -> None:
-        if self._browser:
-            try:
-                await self._browser.close()
-            except Exception:
-                pass
-        if self._playwright:
-            try:
-                await self._playwright.stop()
-            except Exception:
-                pass
+        try:
+            await self._browser.close()
+        except Exception:
+            pass
+        try:
+            await self._playwright.stop()
+        except Exception:
+            pass
 
         self._browser = None
         self._playwright = None
diff --git a/src/environments/launch.py b/src/environments/launch.py
@@ -8,7 +8,13 @@
 
 from db.step import StepManager
 import typer
-from playwright.async_api import Browser, BrowserContext, Request, async_playwright
+from playwright.async_api import (
+    Browser,
+    BrowserContext,
+    Request,
+    Route,
+    async_playwright,
+)
 
 from environments.replay import TaskStepExecutor
 from config.storage import DATA_DIR
@@ -206,7 +212,21 @@ async def build_context(
         include_storage_state: bool = False,
     ) -> BrowserContext:
         """Build a browser context with HAR-based replay."""
+        context_config = self.get_context_config(
+            include_storage_state=include_storage_state
+        )
+        context = await browser.new_context(**context_config)
+        await self.configure_context(
+            context, allow_network_fallback=allow_network_fallback
+        )
+        return context
+
+    def get_context_config(
+        self, *, include_storage_state: bool = False
+    ) -> Dict[str, Any]:
+        """Prepare context configuration, optionally including storage state."""
         context_config = dict(self.environment.get("context_config") or {})
+
         if include_storage_state:
             storage_state_path = self._storage_state_path()
             if storage_state_path:
@@ -215,7 +235,15 @@ async def build_context(
                 logger.warning("Storage state file not found, using empty state")
                 context_config["storage_state"] = "{}"
 
-        context = await browser.new_context(**context_config)
+        return context_config
+
+    async def configure_context(
+        self,
+        context: BrowserContext,
+        *,
+        allow_network_fallback: bool = False,
+    ) -> None:
+        """Configure an existing browser context with HAR replay and routing."""
         self._setup_har_logging(context)
 
         har_path = self.bundle_path / "recording.har"
@@ -235,9 +263,8 @@ async def build_context(
         await context.route(
             "**/*", lambda route, request: self.handle_routes_manually(route, request)
         )
-        return context
 
-    async def handle_routes_manually(self, route, request):
+    async def handle_routes_manually(self, route: Route, request: Request) -> None:
         # TODO: do we need to obsfucate in a more clever way?
         # - ?? Normalize JSON (remove volatile fields; sort keys) and hash; tolerate multipart boundary changes; ignore known nonce/timestamp params.
         # TODO: what if the request is sent twice, we'll be selecting the first one all the time.
@@ -246,6 +273,7 @@ async def handle_routes_manually(self, route, request):
         # TODO: this requires LM postprocessing selection of URL's to match or some dumb way for all POST? or smth
         # TODO: why when collecting, increasing/decreasing cart stuff fails
         # TODO: some assets in GET are also dynamic?, bunch of js/stylesheets are not found in HAR
+        # TODO: websockets? like e.g. ChatGPT doesn't allow for collecting anything
 
         urls_to_ignore_post_data = {
             "https://www.amazon.com/ax/claim",
diff --git a/src/eval/run/browseruse.py b/src/eval/run/browseruse.py
@@ -239,7 +239,6 @@ async def run_task_with_agent(
     sandbox_bundle: Optional[Path] = None,
     sandbox_allow_network: bool = False,
     sandbox_headless: bool = True,
-    sandbox_safe_mode: bool = False,
 ) -> Dict[str, Any]:
     """Run a single task with the Browser-Use agent and capture all data."""
 
@@ -285,14 +284,9 @@ def capture_accessibility_tree(browser_state, agent_output, step_number):
 
     try:
         sandbox_start_error: Optional[Exception] = None
-        sandbox_modes = []
-        if sandbox_bundle:
-            if sandbox_safe_mode:
-                sandbox_modes = [True]
-            else:
-                sandbox_modes = [False, True]
+        for safe_mode in [False, True]:
+            # First try the anti-detection/performance mode; if that fails, retry in safe mode
 
-        for safe_mode in sandbox_modes:
             logger.info(
                 "Starting sandbox for task %s at %s (safe_mode=%s)",
                 task["task_id"],
@@ -461,7 +455,6 @@ async def process_single_task(
     sandbox_root: Optional[Path],
     sandbox_allow_network: bool,
     sandbox_headless: bool,
-    sandbox_safe_mode: bool,
     semaphore: Optional[asyncio.Semaphore] = None,
 ):
     """Process a single task and write results to individual JSON file"""
@@ -495,7 +488,6 @@ async def process_single_task(
                 sandbox_bundle=sandbox_bundle,
                 sandbox_allow_network=sandbox_allow_network,
                 sandbox_headless=sandbox_headless,
-                sandbox_safe_mode=sandbox_safe_mode,
             )
 
             # Write result to individual JSON file
@@ -534,7 +526,6 @@ async def process_all_tasks(
     sandbox_root: Optional[Path],
     sandbox_allow_network: bool,
     sandbox_headless: bool,
-    sandbox_safe_mode: bool,
 ):
     """Process all tasks and save to individual JSON files, skipping already completed ones"""
     # Cleanup all active Kernel browser sessions before starting
@@ -593,7 +584,6 @@ async def process_all_tasks(
                 sandbox_root=sandbox_root,
                 sandbox_allow_network=sandbox_allow_network,
                 sandbox_headless=sandbox_headless,
-                sandbox_safe_mode=sandbox_safe_mode,
                 semaphore=semaphore,
             )
         )
@@ -620,19 +610,12 @@ async def main(args: argparse.Namespace) -> None:
             )
 
     sandbox_headless = not args.sandbox_headed
-    sandbox_safe_mode = args.sandbox_safe_mode
-    if sandbox_safe_mode and args.sandbox_headed:
-        logger.warning(
-            "Sandbox safe mode forces headless Chromium; ignoring --sandbox-headed"
-        )
-        sandbox_headless = True
 
     results_dir = await process_all_tasks(
         args.model,
         sandbox_root=sandbox_root,
         sandbox_allow_network=args.sandbox_allow_network,
         sandbox_headless=sandbox_headless,
-        sandbox_safe_mode=sandbox_safe_mode,
     )
     print(f"\nAll results saved to: {results_dir}")
 
@@ -658,11 +641,6 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Launch sandbox Chromium with a visible window",
     )
-    parser.add_argument(
-        "--sandbox-safe-mode",
-        action="store_true",
-        help="Use a reduced argument set and headless Chromium for stability",
-    )
     return parser.parse_args()