diff --git a/environments/online_mind2web/.gitignore b/environments/online_mind2web/.gitignore new file mode 100644 index 00000000..1916bd2d --- /dev/null +++ b/environments/online_mind2web/.gitignore @@ -0,0 +1,2 @@ +gcp.json +test.ipynb \ No newline at end of file diff --git a/environments/online_mind2web/Dockerfile b/environments/online_mind2web/Dockerfile new file mode 100644 index 00000000..c550b572 --- /dev/null +++ b/environments/online_mind2web/Dockerfile @@ -0,0 +1,36 @@ +# Use our HUD base browser image with Playwright and uv pre-installed +FROM hudpython/base-browser:latest + +# Create app-specific working directory +WORKDIR /app + +# Copy project files +COPY pyproject.toml ./ +COPY src/ ./src/ + +# Install the package using the existing venv at /opt/venv +# The --python flag tells uv to use this specific Python instead of creating a new venv +RUN uv pip install --python /opt/venv -e . + +# Create directories for logs and data +RUN mkdir -p /app/logs /app/data + +ENV DISPLAY_WIDTH=1448 +ENV DISPLAY_HEIGHT=944 + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 +# Note: Environment variables for browser providers should be set at runtime: +# - BROWSER_PROVIDER: anchorbrowser, steel, browserbase, hyperbrowser, kernel +# - Provider-specific API keys: ANCHOR_API_KEY, STEEL_API_KEY, etc. 
+# - GCP_CREDENTIALS_JSON: For Google Sheets functionality (if needed) + +# Run remote browser with persistent context +CMD ["sh", "-c", "\ + # Start context server in background \ + python3 -m hud_controller.context >&2 & \ + # Wait a bit for context server to start \ + sleep 2 && \ + # Run MCP server in foreground with exec \ + exec python3 -m hud_controller.server \ +"] \ No newline at end of file diff --git a/environments/online_mind2web/README.md b/environments/online_mind2web/README.md new file mode 100644 index 00000000..622f1477 --- /dev/null +++ b/environments/online_mind2web/README.md @@ -0,0 +1,36 @@ +# HUD Online Mind2Web Taskset + +Based on hud remote-browser, this MCP server provides an environment for Online-Mind2Web task execution and evaluation. + +## Running with Docker + +The Docker image supports both production and development modes using the same Dockerfile. + +### Building the Image + +```bash +# Production build (default) +docker build -t hud-om2w:latest . +``` + +### Running the Test Task +```bash +hud eval ./test_task.json +``` + +### Running Whole Online-Mind2Web Dataset From HuggingFace +```bash +hud eval Genteki/Online-Mind2Web --full --max-concurrent=5 +``` + +### Different Evaluation Method + +To choose a different evaluation method, change the `task["evaluate_tool"]["evaluate"]["name"]` value in the task JSON file. The supported evaluation methods are: + +| Evaluation Method | Final Screenshot | Screenshot History | Action History | +|:---|:---:|:---:| :---: | +| `autonomous` | ✔ | ✗ | ✔ | +| `webjudge` | ✔ | ✔ | ✔ | +| `overall_judge`[^1] | - | - | - | + +[^1]: `overall_judge` executes all evaluation methods above and returns the average of their rewards. 
diff --git a/environments/online_mind2web/pyproject.toml b/environments/online_mind2web/pyproject.toml new file mode 100644 index 00000000..ac90e720 --- /dev/null +++ b/environments/online_mind2web/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "hud-om2w" +version = "0.1.0" +description = "HUD Remote Browser Controller with MCP tools for cloud browser providers" +requires-python = ">=3.11,<3.13" +dependencies = [ "hud-python>=0.4.12", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[project.scripts] +hud-om2w = "hud_controller.__main__:main" + +[tool.hud] +image = "hud-om2w:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "src/hud_controller",] diff --git a/environments/online_mind2web/src/hud_controller/__init__.py b/environments/online_mind2web/src/hud_controller/__init__.py new file mode 100644 index 00000000..20a3997d --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/__init__.py @@ -0,0 +1,3 @@ +"""Online Mind2Web Env, From ../remote-browser""" + +__version__ = "0.1.0" diff --git a/environments/online_mind2web/src/hud_controller/context.py b/environments/online_mind2web/src/hud_controller/context.py new file mode 100644 index 00000000..0e45a766 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/context.py @@ -0,0 +1,139 @@ +""" +Context server for remote browser environment that persists state across hot-reloads. 
logger = logging.getLogger(__name__)


class RemoteBrowserContext:
    """Hold remote browser state so it survives server reloads.

    The instance is handed to ``run_context_server`` by the ``__main__`` guard
    at the bottom of this module.  State is exposed through explicit
    getter/setter methods because direct attribute access does not always work
    correctly through the multiprocessing proxy.
    """

    def __init__(self):
        """Initialize an empty, not-yet-started context."""
        self.browser_provider = None
        # Lowercase provider label derived in set_browser_provider().  It is
        # initialised here so the attribute always exists, even before a
        # provider has been set.
        self.provider_name: Optional[str] = None
        self.is_initialized = False
        self.provider_config: Optional[Dict[str, Any]] = None
        self.launch_options: Optional[Dict[str, Any]] = None
        self._startup_complete = False
        self.playwright_tool = None  # Store the playwright tool
        self._telemetry: Optional[Dict[str, Any]] = None  # Store full telemetry data

        logger.info("[RemoteBrowserContext] Created new remote browser context")

    def startup(self):
        """Run the one-time startup step; later calls are no-ops."""
        if self._startup_complete:
            logger.info("[RemoteBrowserContext] Startup already complete, skipping")
            return

        logger.info("[RemoteBrowserContext] Performing one-time startup")
        self._startup_complete = True

    # === Proxy-friendly methods for multiprocessing.Manager ===
    # Note: These are needed because direct attribute access doesn't always
    # work correctly through the multiprocessing proxy

    def get_browser_provider(self):
        """Get the browser provider instance."""
        return self.browser_provider

    def set_browser_provider(self, provider) -> None:
        """Set the browser provider instance and refresh ``provider_name``.

        ``provider_name`` is the provider's class name with ``"Provider"``
        removed, lowercased.  Passing a falsy provider clears the name so it
        never describes a provider that is no longer set.
        """
        self.browser_provider = provider
        if provider:
            self.provider_name = provider.__class__.__name__.replace("Provider", "").lower()
            logger.info("[RemoteBrowserContext] Set browser provider: %s", self.provider_name)
        else:
            self.provider_name = None

    def get_cdp_url(self) -> Optional[str]:
        """Get the CDP URL from telemetry, or ``None`` if no telemetry is stored."""
        return self._telemetry.get("cdp_url") if self._telemetry else None

    def get_is_initialized(self) -> bool:
        """Check if environment is initialized."""
        return self.is_initialized

    def set_initialized(self, value: bool) -> None:
        """Set initialization status."""
        self.is_initialized = value
        logger.info("[RemoteBrowserContext] Initialization status: %s", value)

    def get_provider_config(self) -> Optional[Dict[str, Any]]:
        """Get provider configuration."""
        return self.provider_config

    def set_provider_config(self, config: Dict[str, Any]) -> None:
        """Set provider configuration."""
        self.provider_config = config
        logger.info("[RemoteBrowserContext] Set provider config")

    def get_launch_options(self) -> Optional[Dict[str, Any]]:
        """Get launch options."""
        return self.launch_options

    def set_launch_options(self, options: Dict[str, Any]) -> None:
        """Set launch options."""
        self.launch_options = options
        logger.info("[RemoteBrowserContext] Set launch options")

    def get_playwright_tool(self):
        """Get the playwright tool instance."""
        return self.playwright_tool

    def set_playwright_tool(self, tool) -> None:
        """Set the playwright tool instance."""
        self.playwright_tool = tool
        logger.info("[RemoteBrowserContext] Set playwright tool")

    def set_telemetry(self, telemetry: Dict[str, Any]) -> None:
        """Set the full telemetry data."""
        self._telemetry = telemetry
        # NOTE(review): the whole telemetry dict is logged, including
        # "cdp_url"; confirm that URL never embeds provider credentials.
        logger.info("[RemoteBrowserContext] Set telemetry: %s", telemetry)

    def get_state_summary(self) -> Dict[str, Any]:
        """Get a summary of the current state.

        Note that ``"provider_name"`` here is read from the stored telemetry
        (key ``"provider"``), not from the ``provider_name`` attribute.
        """
        return {
            "is_initialized": self.is_initialized,
            "startup_complete": self._startup_complete,
            "provider_name": self._telemetry.get("provider") if self._telemetry else None,
            "has_cdp_url": self.get_cdp_url() is not None,
            "has_browser_provider": self.browser_provider is not None,
            "has_playwright_tool": self.playwright_tool is not None,
        }

    def get_telemetry(self) -> Dict[str, Any]:
        """Return stored telemetry, or a placeholder record if none is stored."""
        # If we have stored telemetry, return it
        if self._telemetry:
            return self._telemetry

        # Otherwise return basic telemetry data
        return {
            "provider": "unknown",
            "status": "not_initialized",
            "live_url": None,
            "cdp_url": None,
            "instance_id": None,
            "timestamp": datetime.now().isoformat(),
        }


if __name__ == "__main__":
    # Run the context server with RemoteBrowserContext
    context = RemoteBrowserContext()
    context.startup()

    # Log initial state
    logger.info("[Context] Starting remote browser context server")
    logger.info("[Context] Initial state: %s", context.get_state_summary())

    # Run the context server
    asyncio.run(run_context_server(context, "/tmp/hud_remote_browser_ctx.sock"))
# System prompt for the single-screenshot "autonomous" judge.
_AUTONOMOUS_SYSTEM_MSG = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's intent, the agent's action history, the final state of the webpage, and the agent's response to the user, your goal is to decide whether the agent's execution is successful or not.

There are three types of tasks:
1. Information seeking: The user wants to obtain certain information from the webpage, such as the information of a product, reviews, map info, comparison of map routes, etc. The bot's response must contain the information the user wants, or explicitly state that the information is not available. Otherwise, e.g. the bot encounters an exception and respond with the error content, the task is considered a failure. Besides, be careful about the sufficiency of the agent's actions. For example, when asked to list the top-searched items in a shop, the agent should order the items by the number of searches, and then return the top items. If the ordering action is missing, the task is likely to fail.
2. Site navigation: The user wants to navigate to a specific page. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.
3. Content modification: The user wants to modify the content of a webpage or configuration. Carefully examine the bot's action history and the final state of the webpage to determine whether the bot successfully completes the task. No need to consider the bot's response.

*IMPORTANT*
Format your response into two lines as shown below:

Thoughts:
Status: "success" or "failure"
"""


@evaluate.tool("autonomous")
async def autonomous(
    ctx: Context,
    task_description: dict | str,
) -> dict | EvaluationResult:
    """Tool registered as ``autonomous`` on the ``evaluate`` hub.

    Delegates to :func:`autonomous_eval`.  The implementation lives in a
    separately named function so other evaluators can call it directly.
    """
    # Must call autonomous_eval, not this wrapper: calling `autonomous` here
    # would re-enter the tool itself instead of running the evaluation.
    return await autonomous_eval(ctx, task_description)


def _load_action_history(path: str = "/action_history/action_history.txt") -> list[str]:
    """Return the stripped, non-blank lines of the action log ([] if unavailable)."""
    try:
        if not os.path.exists(path):
            logging.warning("No action history file found")
            return []
        with open(path, "r", encoding="utf-8") as f:
            actions = [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        # Best effort: a missing or unreadable log must not abort the evaluation.
        logging.warning(f"Failed to load action history: {e}")
        return []
    logging.info(f"Loaded {len(actions)} actions from history file")
    return actions


def _parse_verdict(result_text: str) -> tuple[str, str, bool]:
    """Split a judge reply into ``(thoughts, status, success)``.

    Falls back to a substring search for "success" when the reply does not
    contain the expected ``Thoughts:`` / ``Status:`` markers.
    """
    try:
        thoughts = result_text.split("Thoughts:")[1].split("Status:")[0].strip()
        status = result_text.split("Status:")[1].strip().strip('"').lower()
        success = status == "success"
    except IndexError:
        thoughts = result_text
        success = "success" in result_text.lower()
        status = "success" if success else "failure"
    return thoughts, status, success


async def autonomous_eval(
    ctx: Context,
    task_description: dict | str,
) -> dict | EvaluationResult:
    """Judge task success from the final screenshot and the recent action history.

    Args:
        ctx: Context passed in by the tool layer (not used by this function).
        task_description: Task dict, or its JSON string; must contain the key
            ``"confirmed_task"``.

    Returns:
        An ``EvaluationResult`` with reward 1.0 on "success" and 0.0 otherwise.
        Every failure (missing API key, no browser page, malformed task, API
        error) is reported as a result with ``isError=True``, not raised.
    """
    logging.info((task_description))
    model = "gpt-4o"
    try:
        # Parsed inside the try so malformed JSON becomes an error result.
        if isinstance(task_description, str):
            task_description = json.loads(task_description)

        # check openai api key
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if openai_api_key is None:
            logging.error("OPENAI_API_KEY environment variable not set")
            return EvaluationResult(
                reward=0.0,
                done=False,
                info={"error": "OPENAI_API_KEY environment variable not set"},
                isError=True,
            )

        persistent_ctx = evaluate.env
        playwright_tool = getattr(persistent_ctx, "playwright_tool", None)
        page = getattr(playwright_tool, "page", None) if playwright_tool else None
        if not page:
            logger.error("No browser page available")
            return EvaluationResult(
                reward=0.0,
                done=False,
                content="No browser page available",
                info={"error": "No browser page available"},
                isError=True,
            )

        action_history = _load_action_history()
        # Only the last 10 actions are shown to the judge.
        last_actions = action_history[-10:]

        logging.info("Taking screenshot for VLM evaluation...")
        screenshot_bytes = await page.screenshot()
        import base64

        screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")

        numbered_actions = "\n".join(
            f"{i}. {action}" for i, action in enumerate(last_actions, start=1)
        )
        prompt = (
            f"User Intent: {task_description['confirmed_task']}\n\n"
            f"Action History:\n{numbered_actions}\n\n"
            "The last snapshot of the web page is shown in the image."
        )

        logging.info(f"Calling {model} for evaluation...")
        import openai

        # Environment variables that might affect the OpenAI client.  Only the
        # names are logged: proxy URLs and similar values can embed credentials.
        proxy_var_names = sorted(
            name for name in os.environ if "proxy" in name.lower() or "http" in name.lower()
        )
        if proxy_var_names:
            logging.info(f"Found proxy-related env vars: {proxy_var_names}")

        try:
            client = openai.OpenAI(api_key=openai_api_key)
        except Exception:
            # logging.exception records the traceback; the outer handler then
            # turns the failure into an error result.
            logging.exception(
                f"Failed to create OpenAI client (OpenAI version: {openai.__version__})"
            )
            raise
        logging.info("OpenAI client created successfully")

        messages = [
            {"role": "system", "content": _AUTONOMOUS_SYSTEM_MSG},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{screenshot_b64}",
                            "detail": "high",
                        },
                    },
                ],
            },
        ]

        # NOTE(review): this is the synchronous client called from a coroutine,
        # so the call is not awaited; confirm blocking here is acceptable.
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.0,
            max_tokens=500,  # Increased for detailed thoughts and status
        )

        # The message content may be None; treat that as an empty reply.
        result_text = (response.choices[0].message.content or "").strip()
        thoughts, status, success = _parse_verdict(result_text)

        logging.info(f"Autonomous evaluation result: {status} (thoughts: {thoughts[:100]}...)")

        return EvaluationResult(
            reward=1.0 if success else 0.0,
            done=True,
            content=f"Status: {status}\nThoughts: {thoughts}",
            info={
                "status": status,
                "thoughts": thoughts,
                "task_description": task_description,
                "actions_count": len(action_history),
            },
        )

    except Exception as e:
        # Tool boundary: report any failure as an error result.
        logging.error(f"VLM evaluation failed: {e}")
        return EvaluationResult(isError=True, info={"Exception": str(e)})
@evaluate.tool("overall_judge")
async def overall_judge(ctx: Context, task_description: dict | str) -> dict | EvaluationResult:
    """Run every evaluation method and combine their outcomes.

    The judges (``autonomous_eval`` then ``webjudge_eval``) are awaited one
    after another.  The combined reward is the mean of the individual rewards,
    ``done`` is true when at least half of the judges reported done, and
    ``isError`` is true when any judge reported an error.

    Args:
        ctx: Context, passed automatically
        task_description: Task description (dict or JSON string)

    Returns:
        EvaluationResult whose ``info`` maps each judge's function name to its
        reward, done flag, error flag and info.
    """
    judges = [autonomous_eval, webjudge_eval]
    judge_count = float(len(judges))

    per_judge = {}
    reward_total = 0.0
    error_total = 0
    done_total = 0.0

    try:
        for judge in judges:
            outcome: EvaluationResult = await judge(ctx, task_description)

            reward_total += outcome.reward
            error_total += outcome.isError
            done_total += int(outcome.done)

            per_judge[judge.__name__] = dict(
                reward=outcome.reward,
                done=outcome.done,
                isError=outcome.isError,
                info=outcome.info,
            )

        mean_reward = reward_total / judge_count
        majority_done = done_total >= judge_count / 2
        any_error = error_total > 0
        return EvaluationResult(
            reward=mean_reward, done=majority_done, info=per_judge, isError=any_error
        )
    except Exception as e:
        logger.error(f"Overall evaluation failed: {e}")
        return EvaluationResult(isError=True, info={"Exception": str(e)})
# System prompt for the key-point extraction step of WebJudge.
_KEY_POINT_SYSTEM_MSG = """You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.

**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.

**Instructions**:
1. Read the task description carefully.
2. Identify and extract **key points** directly stated in the task description.
   - A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
   - Do not infer or add any unstated elements.
   - Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").

**Respond with**:
- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details."""


@evaluate.tool("webjudge")
async def webjudge(ctx: Context, task_description: dict | str):
    """Tool registered as ``webjudge`` on the ``evaluate`` hub; delegates to ``webjudge_eval``."""
    return await webjudge_eval(ctx, task_description)


async def identify_key_point(task_description: dict | str) -> dict:
    """Identify the key points of a task description using gpt-4o.

    Args:
        task_description: The task to analyze: a dict (its ``"confirmed_task"``
            entry is used when present), a JSON string of such a dict, or a
            plain-text task, which is used verbatim.

    Returns:
        On success: ``{"success": True, "key_points", "task_description",
        "model"}``.  On failure: ``{"success": False, "error", ...}``; the
        function does not raise for API or configuration problems.
    """
    if isinstance(task_description, str):
        try:
            task_description = json.loads(task_description)
        except json.JSONDecodeError:
            # Not JSON: keep the raw text, which is used as the task below.
            pass

    # Check OpenAI API key
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if openai_api_key is None:
        logging.error("OPENAI_API_KEY environment variable not set")
        return {"success": False, "error": "OPENAI_API_KEY environment variable not set"}

    # SECURITY: never log the key or any fragment of it.  Only
    # non-identifying facts about it are reported.
    if not openai_api_key.startswith("sk-"):
        logging.error("OPENAI_API_KEY does not start with 'sk-'")

    # Extract task text (computed before the try so the error path can use it)
    task_text = (
        task_description.get("confirmed_task", str(task_description))
        if isinstance(task_description, dict)
        else str(task_description)
    )
    model = "gpt-4o"

    try:
        logging.info("Webjudge evaluation: identify_key_point")

        messages = [
            {"role": "system", "content": _KEY_POINT_SYSTEM_MSG},
            {"role": "user", "content": f"Task: {task_text}"},
        ]

        client = openai.OpenAI(api_key=openai_api_key)
        logging.info(f"Requesting key points from model: {model}")

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.0,
            max_tokens=500,  # Increased for key points list
        )

        # The message content may be None; treat that as an empty reply.
        key_points_text = (response.choices[0].message.content or "").strip()

        logging.info(f"Identified key points: {key_points_text}")

        return {
            "success": True,
            "key_points": key_points_text,
            "task_description": task_text,
            "model": model,
        }

    except Exception as e:
        # Boundary for the API call: report the failure to the caller as data.
        logging.error(f"Key point identification failed ({type(e).__name__}): {e}")
        return {
            "success": False,
            "error": str(e),
            "task_description": task_text,
        }
Carefully examine the images and evaluate whether they contain necessary steps or evidence crucial to task completion: +- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions. +- Do the images show actions, progress indicators, or critical information directly related to completing the task? +- Is this information indispensable for understanding or ensuring task success? +- If the images contain partial but relevant information, consider their usefulness rather than dismissing them outright. +- Consider the progression across multiple images if provided. + +3. Provide your response in the following format: +- **Reasoning**: Explain your thought process and observations. Mention specific elements in the images that indicate necessary steps, evidence, or lack thereof. +- **Score**: Assign a score based on the reasoning, using the following scale: + - **1**: The images do not contain any necessary steps or relevant information. + - **2**: The images contain minimal or ambiguous information, unlikely to be essential. + - **3**: The images include some relevant steps or hints but lack clarity or completeness. + - **4**: The images contain important steps or evidence that are highly relevant but not fully comprehensive. + - **5**: The images clearly display necessary steps or evidence crucial for completing the task. + +Respond with: +1. **Reasoning**: [Your detailed explanation] +2. 
**Score**: [1-5]""" + + prompt = f"""**Task**: {task_text} + +**Key Points for Task Completion**: {key_points} + +The snapshots of the web page progression are shown in the images below.""" + + # Create message content with text and images + message_content = [{"type": "text", "text": prompt}] + + for base64_img in base64_images: + message_content.append( + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64_img}", "detail": "high"}, + } # type: ignore + ) + + messages = [ + {"role": "system", "content": system_msg}, + {"role": "user", "content": message_content}, + ] + + client = openai.OpenAI(api_key=openai_api_key) + + response = client.chat.completions.create( + model="gpt-4o", # GPT-4V + messages=messages, + temperature=0.0, + max_tokens=1000, + ) + + # Parse the response + result_text = response.choices[0].message.content.strip() + + logging.info(f"Image judgment result: {result_text[:200]}...") + + return { + "success": True, + "judgment": result_text, + "task_description": task_text, + "key_points": key_points, + "images_processed": len(base64_images), + "model": "gpt-4o", + } + + except Exception as e: + logging.error(f"Image judgment failed: {e}") + return { + "success": False, + "error": str(e), + "task_description": task_text if "task_text" in locals() else str(task_description), + "images_processed": len(base64_images) if "base64_images" in locals() else 0, + } + + +async def webjudge_eval(ctx: Context, task_description: dict | str, score_threshold: int = 3): + """WebJudge Online Mind2Web evaluation using screenshot history and action history + + Args: + task_description: Task description (dict or JSON string) + score_threshold: Minimum score threshold for image filtering (1-5) + + Returns: + Dict containing evaluation results with success/failure status + """ + if type(task_description) == str: + task_description = json.loads(task_description) + + # Check OpenAI API key + openai_api_key = os.getenv("OPENAI_API_KEY") + if 
openai_api_key is None: + logging.error("OPENAI_API_KEY environment variable not set") + return EvaluationResult( + isError=True, info={"Exception": f"OPENAI_API_KEY environment variable not set"} + ) + + try: + logging.info("Starting WebJudge Online Mind2Web evaluation") + + # Extract task text + task_text = task_description.get("confirmed_task") + + # Get screenshots from /screenshot directory + screenshot_dir = "/screenshot" + screenshot_history = [] + + try: + if os.path.exists(screenshot_dir): + # Get all PNG files sorted by modification time (newest last) + screenshot_files = [] + for file in os.listdir(screenshot_dir): + if file.endswith(".png") and file.startswith("screenshot_"): + filepath = os.path.join(screenshot_dir, file) + mtime = os.path.getmtime(filepath) + screenshot_files.append((mtime, filepath)) + + # Sort by modification time + screenshot_files.sort(key=lambda x: x[0]) + + for _, filepath in screenshot_files[-MAX_IMAGE:]: + try: + with open(filepath, "rb") as f: + image_data = f.read() + screenshot_b64 = base64.b64encode(image_data).decode("utf-8") + screenshot_history.append(screenshot_b64) + except Exception as e: + logging.warning(f"Failed to read screenshot {filepath}: {e}") + + logging.info(f"Loaded {len(screenshot_history)} screenshots from {screenshot_dir}") + else: + logging.warning(f"Screenshot directory {screenshot_dir} does not exist") + + except Exception as e: + logging.error(f"Failed to load screenshots from {screenshot_dir}: {e}") + + if not screenshot_history: + logging.warning("No screenshot history available") + return EvaluationResult( + reward=0.0, + done=True, + content="No screenshot avaliable", + info={"task_description": task_text, "status": "No screenshot avaliable"}, + ) + + # Get action history from file + action_history = [] + try: + action_history_file = "/action_history/action_history.txt" + if os.path.exists(action_history_file): + with open(action_history_file, "r", encoding="utf-8") as f: + action_history = 
[line.strip() for line in f if line.strip()] + logging.info(f"Loaded {len(action_history)} actions from history file") + else: + logging.warning("No action history file found") + except Exception as e: + logging.warning(f"Failed to load action history: {e}") + action_history = [] + + # Get all actions for evaluation + last_actions = action_history if action_history else [] + + logging.info( + f"Found {len(screenshot_history)} screenshots and {len(action_history)} actions" + ) + + # Step 1: Identify key points + logging.info(f"Webjudge step 1: Identify key points") + key_points_result = await identify_key_point(task_description) + if not key_points_result.get("success"): + logger.error(f"Key point identification failed: {key_points_result.get('error')}") + return EvaluationResult( + isError=True, + info={ + "Exception": f"Key point identification failed: {key_points_result.get('error')}" + }, + ) + + key_points = key_points_result["key_points"] + + # Clean up key points formatting + key_points = key_points.replace("\n\n", "\n") + try: + if "**Key Points**:" in key_points: + key_points = key_points.split("**Key Points**:")[1] + elif "Key Points:" in key_points: + key_points = key_points.split("Key Points:")[-1] + key_points = "\n".join(line.lstrip() for line in key_points.splitlines()) + except: + pass + + # Step 2: Judge images using screenshot history + logging.info(f"Webjudge step 2: Judge images using screenshot history") + judge_result = await judge_image( + base64_images=screenshot_history, + task_description=task_description, + key_points=key_points, + ) + + if not judge_result.get("success"): + return EvaluationResult( + isError=True, + info={"Exception": f"Image judgment failed: {judge_result.get('error')}"}, + ) + + # Parse judgment result for score + judgment_text = judge_result["judgment"] + pattern = r"[1-5]" + try: + scores = re.findall(pattern, judgment_text.split("Score")[1]) + main_score = int(scores[0]) if scores else 3 + except: + main_score = 3 # 
Default score if parsing fails + logger.info("Score: ", main_score) + + # Extract reasoning + try: + reasoning = ( + judgment_text.split("**Reasoning**:")[-1].strip().split("**Score**:")[0].strip() + ) + except: + reasoning = "Unable to extract reasoning" + + # Step 3: Final evaluation using GPT-4 + logging.info(f"Webjudge step 3: Final evaluation using GPT-4") + + system_msg = """You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, and analysis of important web pages, your goal is to determine whether the agent has completed the task and achieved all requirements. + +Your response must strictly follow the following evaluation criteria! +*Important Evaluation Criteria*: +1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful. +2: You must carefully check whether these screenshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function). +3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements! +4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. +5: Some tasks require a submission action or a display of results to be considered successful. 
+6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful. +7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements, the task is still considered successful. + +*IMPORTANT* +Format your response into two lines as shown below: + +Thoughts: +Status: "success" or "failure" +""" + + # Prepare final evaluation content + if main_score >= score_threshold: + # Include high-scoring screenshots in final evaluation + final_images = [] + for screenshot_b64 in screenshot_history: # All screenshots + final_images.append( + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{screenshot_b64}", + "detail": "high", + }, + } + ) + + prompt_with_images = f"""User Task: {task_text} + +Key Points: {key_points} + +Action History: +{chr(10).join(f"{i + 1}. {action}" for i, action in enumerate(last_actions))} + +Image Analysis Results: +Score: {main_score}/5 +Reasoning: {reasoning}""" + + content = [{"type": "text", "text": prompt_with_images}] + final_images + else: + # Text-only evaluation if images don't meet threshold + prompt_text_only = f"""User Task: {task_text} + +Key Points: {key_points} + +Action History: +{chr(10).join(f"{i + 1}. 
{action}" for i, action in enumerate(last_actions))} + +Note: Screenshot analysis scored {main_score}/5, below threshold of {score_threshold}.""" + + content = [{"type": "text", "text": prompt_text_only}] + + # Final evaluation + client = openai.OpenAI(api_key=openai_api_key) + + messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": content}] + + response = client.chat.completions.create( + model="gpt-4o", messages=messages, temperature=0.0, max_tokens=500 + ) + + final_result = response.choices[0].message.content.strip() + + # Parse final result + try: + thoughts = final_result.split("Thoughts:")[1].split("Status:")[0].strip() + status = final_result.split("Status:")[1].strip().strip('"').lower() + success = status == "success" + except: + logging.info(f"Warning: Final result parsing failed: {final_result}") + thoughts = final_result + success = "success" in final_result.lower() + status = "success" if success else "failure" + + logging.info(f"WebJudge evaluation result: {status}") + + return EvaluationResult( + reward=1.0 if success else 0.0, + done=True, + content=final_result, + info={"task_description": task_text, "status": status, "thoughts": thoughts}, + isError=False, + ) + + except Exception as e: + logging.error(f"WebJudge evaluation failed: {e}") + return EvaluationResult(isError=True, info={"Exception": str(e)}) diff --git a/environments/online_mind2web/src/hud_controller/problems/__init__.py b/environments/online_mind2web/src/hud_controller/problems/__init__.py new file mode 100644 index 00000000..dc6b56ed --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/problems/__init__.py @@ -0,0 +1,14 @@ +"""Problem definitions for remote browser environment.""" + +from .registry import ProblemRegistry, problem + +# Import problem definitions to trigger registration +from .navigate_and_verify import NavigateAndVerifyProblem +from .form_interaction import FormFillAndSubmitProblem +from .search_interaction import 
GoogleSearchProblem +from .element_interaction import ButtonClickTestProblem + +__all__ = [ + "ProblemRegistry", + "problem", +] diff --git a/environments/online_mind2web/src/hud_controller/problems/element_interaction.py b/environments/online_mind2web/src/hud_controller/problems/element_interaction.py new file mode 100644 index 00000000..a6ccf111 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/problems/element_interaction.py @@ -0,0 +1,41 @@ +"""Element interaction problems for testing UI components.""" + +from ..problems import problem + + +@problem("button_click_test", description="Test button clicking and verification") +class ButtonClickTestProblem: + """Problem that tests clicking buttons and verifying state changes.""" + + def get_setup(self): + """Load a page with interactive elements.""" + html_content = """ + + + + Button Test + + + +

Button Click Test

+ +
@problem("form_fill_and_submit", description="Fill out a form and verify submission")
class FormFillAndSubmitProblem:
    """Form problem: open the httpbin order form and check that its labels are present."""

    def get_setup(self):
        """Return the setup configuration that navigates to the form page."""
        navigation_args = dict(
            url="https://httpbin.org/forms/post",
            wait_for_load_state="domcontentloaded",
        )
        return dict(name="navigate_to_url", arguments=navigation_args)

    def get_evaluation(self):
        """Return the evaluation configuration that looks for the form's labels."""
        expected_labels = ["Customer name:", "Pizza Size", "Submit order"]
        check_args = {"search_terms": expected_labels, "partial_rewarding": True}
        return {"name": "page_contains", "arguments": check_args}
+ def get_setup(self): + """Get the setup configuration for this problem.""" + return { + "name": "navigate_to_url", + "arguments": {"url": "https://example.com", "wait_for_load_state": "networkidle"}, + } + + def get_evaluation(self): + """Get the evaluation configuration for this problem.""" + return { + "name": "page_contains", + "arguments": { + "search_terms": [ + "Example Domain", + "This domain is for use in illustrative examples", + ], + "partial_rewarding": True, + }, + } diff --git a/environments/online_mind2web/src/hud_controller/problems/registry.py b/environments/online_mind2web/src/hud_controller/problems/registry.py new file mode 100644 index 00000000..2c52c758 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/problems/registry.py @@ -0,0 +1,91 @@ +"""Registry system for problem definitions.""" + +from typing import Dict, Type, Any, List +import json +import logging + +logger = logging.getLogger(__name__) + +# Global registry for problem classes +PROBLEM_REGISTRY: Dict[str, Type] = {} + + +def problem(name: str, description: str | None = None): + """Decorator to register a problem class. + + Args: + name: The problem identifier + description: Optional description for the problem + + Example: + @problem("navigate_and_click", description="Navigate to URL and click element") + class NavigateAndClickProblem: + def get_setup(self): + return {"name": "navigate_to_url", "arguments": {"url": "https://example.com"}} + def get_evaluation(self): + return {"name": "element_clicked", "arguments": {"selector": "#submit"}} + """ + + def decorator(cls): + # Store metadata on the class + cls._problem_name = name + cls._problem_description = description + + PROBLEM_REGISTRY[name] = cls + logger.info(f"Registered problem: {name} -> {cls.__name__}") + return cls + + return decorator + + +class ProblemRegistry: + """Registry for problem definitions.""" + + @staticmethod + def create_problem(name: str): + """Create a problem instance by name. 
+ + Args: + name: Problem identifier + + Returns: + Problem instance + """ + if name not in PROBLEM_REGISTRY: + available = list(PROBLEM_REGISTRY.keys()) + raise ValueError(f"Unknown problem: {name}. Available: {available}") + + problem_class = PROBLEM_REGISTRY[name] + return problem_class() + + @staticmethod + def to_json() -> str: + """Convert registry to JSON for MCP resource serving.""" + problems = [] + for name, cls in PROBLEM_REGISTRY.items(): + problems.append( + { + "name": name, + "class": cls.__name__, + "description": getattr(cls, "_problem_description", None), + } + ) + return json.dumps(problems, indent=2) + + @staticmethod + def list_problems() -> List[str]: + """Get list of available problem names.""" + return list(PROBLEM_REGISTRY.keys()) + + @staticmethod + def get_problem_info(name: str) -> dict: + """Get information about a specific problem.""" + if name not in PROBLEM_REGISTRY: + raise ValueError(f"Unknown problem: {name}") + + cls = PROBLEM_REGISTRY[name] + return { + "name": name, + "class": cls.__name__, + "description": getattr(cls, "_problem_description", None), + } diff --git a/environments/online_mind2web/src/hud_controller/problems/search_interaction.py b/environments/online_mind2web/src/hud_controller/problems/search_interaction.py new file mode 100644 index 00000000..8c1634b0 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/problems/search_interaction.py @@ -0,0 +1,19 @@ +"""Search engine interaction problems.""" + +from ..problems import problem + + +@problem("google_search", description="Perform a Google search and verify results") +class GoogleSearchProblem: + """Problem that performs a search and verifies results appear.""" + + def get_setup(self): + """Navigate to Google.""" + return {"name": "navigate_to_url", "arguments": {"url": "https://www.google.com"}} + + def get_evaluation(self): + """Verify Google search page loaded.""" + return { + "name": "page_contains", + "arguments": {"search_terms": ["Google", 
"Search"], "partial_rewarding": True}, + } diff --git a/environments/online_mind2web/src/hud_controller/providers/README.md b/environments/online_mind2web/src/hud_controller/providers/README.md new file mode 100644 index 00000000..db3213a7 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/providers/README.md @@ -0,0 +1,110 @@ +# Remote Browser Providers + +This directory contains implementations for various cloud browser providers that can be used with the HUD Remote Browser environment. + +## Supported Providers + +### 1. **AnchorBrowser** ✅ (Implemented) +- **API Endpoint**: `https://api.anchorbrowser.io/v1/sessions` +- **Features**: + - Residential proxy support + - CAPTCHA solving + - Ad blocking + - Popup blocking +- **API Key**: `ANCHOR_API_KEY` environment variable +- **Documentation**: Internal + +### 2. **BrowserBase** 🚧 (To be implemented) +- **API Endpoint**: `https://api.browserbase.com/v1/sessions` +- **Features**: + - Multiple regions support + - Context persistence + - Live view URLs + - Session recordings + - Proxy support +- **API Key**: `X-BB-API-Key` header +- **Documentation**: https://docs.browserbase.com/reference/api/create-a-session + +### 3. **HyperBrowser** 🚧 (To be implemented) +- **API Endpoint**: `https://api.hyperbrowser.ai/api/session` +- **Features**: + - Stealth mode + - Advanced proxy configuration (country/state/city) + - Profile management + - Web recording + - CAPTCHA solving + - Ad blocking + - Browser fingerprinting +- **API Key**: `x-api-key` header +- **Documentation**: https://docs.hyperbrowser.ai/reference/api-reference/sessions + +### 4. **Steel** 🚧 (To be implemented) +- **API Endpoint**: `https://api.steel.dev/v1/sessions` +- **Features**: + - Session management + - Browser automation + - Proxy support +- **API Key**: `steel_api_key` header or `STEEL_API_KEY` env variable +- **Documentation**: https://docs.steel.dev/api-reference + +### 5. 
**Kernel** ❌ (Not yet available) +- **Status**: API not yet available for browser sessions +- **Documentation**: N/A + +## Provider Lifecycle + +Each provider follows a similar lifecycle pattern: + +1. **Initialization** + - Set up API credentials + - Configure base URLs and default options + +2. **Session Creation** (`launch()`) + - Make API request to create a new browser session + - Handle provider-specific options (proxy, stealth, etc.) + - Return CDP WebSocket URL for Playwright connection + +3. **Session Management** + - Track session IDs and metadata + - Provide status checks + - Handle session-specific features (live view, recordings, etc.) + +4. **Session Termination** (`close()`) + - Clean up resources + - End the browser session via API + - Handle any provider-specific cleanup + +## Implementation Guide + +To add a new provider: + +1. Create a new file in this directory (e.g., `browserbase.py`) +2. Inherit from `BrowserProvider` base class +3. Implement required methods: + - `__init__()` - Initialize with API credentials + - `launch()` - Create a new session and return CDP URL + - `close()` - Terminate the session + - `get_status()` - Return session status +4. Add provider to the registry in `__init__.py` +5. 
Update environment variables in the main README + +## Environment Variables + +Each provider uses specific environment variables: + +- **AnchorBrowser**: `ANCHOR_API_KEY` +- **BrowserBase**: `BROWSERBASE_API_KEY` +- **HyperBrowser**: `HYPERBROWSER_API_KEY` +- **Steel**: `STEEL_API_KEY` + +## Common Features Across Providers + +| Feature | AnchorBrowser | BrowserBase | HyperBrowser | Steel | +|---------|---------------|-------------|--------------|-------| +| Proxy Support | ✅ | ✅ | ✅ | ✅ | +| CAPTCHA Solving | ✅ | ❓ | ✅ | ❓ | +| Ad Blocking | ✅ | ❓ | ✅ | ❓ | +| Session Recording | ❌ | ✅ | ✅ | ❓ | +| Live View | ✅ | ✅ | ✅ | ❓ | +| Profile Persistence | ❌ | ✅ | ✅ | ❓ | +| Multi-Region | ❌ | ✅ | ✅ | ❓ | \ No newline at end of file diff --git a/environments/online_mind2web/src/hud_controller/providers/__init__.py b/environments/online_mind2web/src/hud_controller/providers/__init__.py new file mode 100644 index 00000000..3ed34b64 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/providers/__init__.py @@ -0,0 +1,33 @@ +"""Browser provider implementations for remote browser control.""" + +from .base import BrowserProvider +from .anchorbrowser import AnchorBrowserProvider +from .browserbase import BrowserBaseProvider +from .steel import SteelProvider +from .hyperbrowser import HyperBrowserProvider +# from .kernel import KernelProvider # Not yet implemented + +__all__ = [ + "BrowserProvider", + "AnchorBrowserProvider", + "BrowserBaseProvider", + "SteelProvider", + "HyperBrowserProvider", + # "KernelProvider", # Not yet implemented +] + +# Provider registry for easy lookup +PROVIDERS = { + "anchorbrowser": AnchorBrowserProvider, + "browserbase": BrowserBaseProvider, + "steel": SteelProvider, + "hyperbrowser": HyperBrowserProvider, + # "kernel": KernelProvider, # Not yet implemented +} + + +def get_provider(name: str) -> type[BrowserProvider]: + """Get a provider class by name.""" + if name not in PROVIDERS: + raise ValueError(f"Unknown provider: {name}. 
class AnchorBrowserProvider(BrowserProvider):
    """AnchorBrowser provider for remote browser control.

    AnchorBrowser provides cloud-based browser instances with features like:
    - Proxy support
    - CAPTCHA solving
    - Ad blocking
    - Popup blocking
    """

    def __init__(self, config: Dict[str, Any] | None = None):
        """Read credentials and endpoint from ``config`` or the environment.

        Args:
            config: Optional dict; ``api_key`` and ``base_url`` are read from it.

        Raises:
            ValueError: If no API key is found in ``config`` or ``ANCHOR_API_KEY``.
        """
        super().__init__(config)
        settings = config or {}
        # Fall back to the environment even when a config dict is supplied
        # that simply does not carry an api_key (e.g. only base_url is set).
        self.api_key = settings.get("api_key") or os.getenv("ANCHOR_API_KEY")
        self.base_url = settings.get("base_url", "https://api.anchorbrowser.io")
        self._session_data: Dict[str, Any] | None = None  # Initialize session data storage

        if not self.api_key:
            raise ValueError("AnchorBrowser API key not provided")

    def _headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every AnchorBrowser API call."""
        return {"anchor-api-key": str(self.api_key), "Content-Type": "application/json"}

    async def launch(self, **kwargs) -> str:
        """Launch an AnchorBrowser instance.

        Args:
            **kwargs: Launch options. Only these are read:
                - max_duration: value sent as ``session.timeout.max_duration``
                  (default: 120)
                - idle_timeout: value sent as ``session.timeout.idle_timeout``
                  (default: 30)

            Ad blocking, popup blocking and CAPTCHA solving are always
            requested as active. The proxy is not taken from ``kwargs``: it
            comes from ``get_proxy_config()``, and when that yields nothing an
            ``anchor_residential`` proxy is used with the country taken from
            ``PROXY_COUNTRY`` (default ``"us"``).

        Returns:
            CDP URL for connecting to the browser

        Raises:
            httpx.HTTPStatusError: If the session-creation request fails.
            RuntimeError: If the response lacks a session ID or CDP URL.
        """
        # Build request payload
        request_data = {
            "session": {
                "timeout": {
                    "max_duration": kwargs.get("max_duration", 120),
                    "idle_timeout": kwargs.get("idle_timeout", 30),
                },
            },
            "browser": {
                "adblock": {"active": True},
                "popup_blocker": {"active": True},
                "captcha_solver": {"active": True},
            },
        }

        proxy_config = await get_proxy_config()

        # Default to residential proxy if nothing configured
        if not proxy_config:
            proxy_config = {
                "type": "anchor_residential",
                "active": True,
                "country_code": os.getenv("PROXY_COUNTRY", "us"),
            }
            logger.info("Using default AnchorBrowser residential proxy")

        # Add proxy to request data
        request_data["session"]["proxy"] = proxy_config

        # Make API request
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/v1/sessions",
                json=request_data,
                headers=self._headers(),
                timeout=30.0,
            )
            response.raise_for_status()

        # Extract session data; tolerate an explicit null "data" field so the
        # checks below report the problem instead of an AttributeError.
        session_data = response.json().get("data") or {}
        self._instance_id = session_data.get("id")
        self._session_data = session_data  # Store full session data
        self._cdp_url = session_data.get("cdp_url")

        if not self._instance_id:
            raise RuntimeError("Failed to get session ID from AnchorBrowser response")
        if not self._cdp_url:
            raise RuntimeError("Failed to get CDP URL from AnchorBrowser response")

        self._is_running = True

        logger.info(f"Launched AnchorBrowser session: {self._instance_id}")
        logger.info(f"Using proxy type: {proxy_config.get('type')}")
        return self._cdp_url

    def close(self) -> None:
        """Terminate the AnchorBrowser session.

        Best effort: API errors are logged rather than raised, and the local
        session state is always reset.
        """
        if not self._instance_id:
            return

        try:
            logger.info(f"Terminating AnchorBrowser session: {self._instance_id}")
            response = requests.delete(
                f"{self.base_url}/v1/sessions/{self._instance_id}",
                headers=self._headers(),
                timeout=25.0,
            )
            response.raise_for_status()

            logger.info(f"Terminated AnchorBrowser session: {self._instance_id}")
        except Exception as e:
            logger.error(f"Error terminating session {self._instance_id}: {e}")
        finally:
            self._is_running = False
            self._cdp_url = None
            self._instance_id = None
            # Drop the cached payload so get_live_view_url() does not keep
            # returning the URL of a session that no longer exists.
            self._session_data = None

    async def get_status(self) -> Dict[str, Any]:
        """Get status including AnchorBrowser-specific info.

        Returns:
            The base status dict, plus ``session_status`` when the status
            endpoint answers with HTTP 200. Lookup failures are logged and
            leave the base status untouched.
        """
        status = await super().get_status()

        # Add AnchorBrowser-specific status
        if self._instance_id and self._is_running:
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(
                        f"{self.base_url}/v1/sessions/{self._instance_id}/status",
                        headers=self._headers(),
                        timeout=10.0,
                    )
                    if response.status_code == 200:
                        session_status = response.json().get("data", {})
                        status["session_status"] = session_status
            except Exception as e:
                logger.warning(f"Failed to get session status: {e}")

        return status

    def get_live_view_url(self) -> Optional[str]:
        """Get the live view URL for the AnchorBrowser instance.

        Returns:
            The ``live_view_url`` field of the stored session payload, or
            ``None`` when no session data is held.
        """
        if self._session_data:
            return self._session_data.get("live_view_url")
        return None
b/environments/online_mind2web/src/hud_controller/providers/base.py new file mode 100644 index 00000000..32807635 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/providers/base.py @@ -0,0 +1,96 @@ +"""Base class for browser providers.""" + +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any +import logging + +logger = logging.getLogger(__name__) + + +class BrowserProvider(ABC): + """Abstract base class for browser providers. + + Each provider manages the lifecycle of a remote browser instance + and provides access to its Chrome DevTools Protocol (CDP) endpoint. + """ + + def __init__(self, config: Dict[str, Any] | None = None): + """Initialize the provider with optional configuration. + + Args: + config: Provider-specific configuration + """ + self.config = config or {} + self._cdp_url: Optional[str] = None + self._instance_id: Optional[str] = None + self._is_running = False + + @abstractmethod + async def launch(self, **kwargs) -> str: + """Launch a browser instance and return its CDP URL. + + Args: + **kwargs: Provider-specific launch options + + Returns: + CDP URL (e.g., "ws://localhost:9222/devtools/browser/xxx") + + Raises: + Exception: If launch fails + """ + pass + + @abstractmethod + def close(self) -> None: + """Close the browser instance and cleanup resources. + + Raises: + Exception: If close fails + """ + pass + + async def get_status(self) -> Dict[str, Any]: + """Get the current status of the browser instance. + + Returns: + Dictionary with status information including: + - is_running: Whether the browser is running + - cdp_url: The CDP URL if available + - instance_id: Provider-specific instance identifier + - additional provider-specific status info + """ + return { + "is_running": self._is_running, + "cdp_url": self._cdp_url, + "instance_id": self._instance_id, + "provider": self.__class__.__name__, + } + + def get_live_view_url(self) -> Optional[str]: + """Get the live view URL for the browser instance. 
class BrowserBaseProvider(BrowserProvider):
    """BrowserBase provider for remote browser control.

    BrowserBase provides cloud browser instances with features like:
    - Multiple regions support
    - Context persistence
    - Live view URLs
    - Session recordings
    - Proxy support

    API Documentation: https://docs.browserbase.com/reference/api/create-a-session
    """

    # Optional launch() kwargs that are forwarded verbatim to the API payload.
    _PASSTHROUGH_OPTIONS = ("region", "keepAlive", "contextId", "browserSettings", "proxies")

    def __init__(self, config: Dict[str, Any] | None = None):
        """Read credentials, endpoint and project from ``config`` or the environment.

        Args:
            config: Optional dict; ``api_key``, ``base_url`` and ``project_id``
                are read from it.

        Raises:
            ValueError: If no API key is found in ``config`` or
                ``BROWSERBASE_API_KEY``.
        """
        super().__init__(config)
        settings = config or {}
        # Fall back to the environment even when a config dict is supplied
        # that does not carry the value itself.
        self.api_key = settings.get("api_key") or os.getenv("BROWSERBASE_API_KEY")
        self.base_url = settings.get("base_url", "https://api.browserbase.com")
        self.project_id = settings.get("project_id") or os.getenv("BROWSERBASE_PROJECT_ID")
        self._session_data: Dict[str, Any] | None = None
        # Initialised here so the getters never need hasattr() checks.
        self._live_view_url: Optional[str] = None
        self._selenium_remote_url: Optional[str] = None
        self._session_project_id: Optional[str] = None

        if not self.api_key:
            raise ValueError("BrowserBase API key not provided")

    def _headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every BrowserBase API call."""
        return {"X-BB-API-Key": str(self.api_key), "Content-Type": "application/json"}

    async def launch(self, **kwargs) -> str:
        """Launch a BrowserBase instance.

        Args:
            **kwargs: Launch options including:
                - projectId: Project ID (required if not set in config)
                - region: Browser region (e.g., "us-west-2")
                - keepAlive: Keep session alive after disconnect
                - contextId: Reuse browser context
                - browserSettings: Additional browser settings
                - proxies: Enable proxy support

        Returns:
            CDP URL for connecting to the browser

        Raises:
            ValueError: If no project ID is available.
            httpx.HTTPStatusError: If the session-creation request fails.
            RuntimeError: If the response lacks a session ID or connect URL.
        """
        # Build request payload
        request_data = {"projectId": kwargs.get("projectId", self.project_id)}

        # Add optional parameters only when the caller supplied them
        for option in self._PASSTHROUGH_OPTIONS:
            if option in kwargs:
                request_data[option] = kwargs[option]

        # Ensure we have a project ID
        if not request_data.get("projectId"):
            raise ValueError("BrowserBase project ID not provided")

        # Make API request
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{self.base_url}/v1/sessions",
                json=request_data,
                headers=self._headers(),
                timeout=30.0,
            )
            response.raise_for_status()

        # Extract session data
        data = response.json()
        self._session_data = data
        self._instance_id = data.get("id")
        self._session_project_id = request_data["projectId"]

        if not self._instance_id:
            raise RuntimeError("Failed to get session ID from BrowserBase response")

        # Get CDP URL - BrowserBase returns connectUrl directly
        self._cdp_url = data.get("connectUrl")
        if not self._cdp_url:
            raise RuntimeError("Failed to get connect URL from BrowserBase response")

        self._is_running = True

        logger.info(f"Launched BrowserBase session: {self._instance_id}")
        logger.info(f"CDP URL: {self._cdp_url}")

        # Store additional URLs for reference
        self._live_view_url = data.get("liveViewUrl")
        self._selenium_remote_url = data.get("seleniumRemoteUrl")

        return self._cdp_url

    def close(self) -> None:
        """Terminate the BrowserBase session.

        Best effort: API errors are logged rather than raised, and the local
        session state is always reset.
        """
        if not self._instance_id:
            return

        try:
            # BrowserBase sessions automatically close after disconnect unless keepAlive is true.
            # NOTE(review): the Browserbase "Update a Session" reference lists
            # REQUEST_RELEASE as the status used to end a session (with the
            # project ID alongside it) - confirm against the current API docs.
            payload: Dict[str, Any] = {"status": "REQUEST_RELEASE"}
            if self._session_project_id:
                payload["projectId"] = self._session_project_id

            with httpx.Client() as client:
                response = client.post(
                    f"{self.base_url}/v1/sessions/{self._instance_id}",
                    json=payload,
                    headers=self._headers(),
                    timeout=30.0,
                )
                # BrowserBase may return 404 if session already ended
                if response.status_code != 404:
                    response.raise_for_status()

            logger.info(f"Terminated BrowserBase session: {self._instance_id}")
        except Exception as e:
            logger.error(f"Error terminating session {self._instance_id}: {e}")
        finally:
            self._is_running = False
            self._cdp_url = None
            self._instance_id = None
            # Forget per-session data so the getters do not hand out URLs of
            # a session that has been closed.
            self._session_data = None
            self._live_view_url = None
            self._selenium_remote_url = None
            self._session_project_id = None

    async def get_status(self) -> Dict[str, Any]:
        """Get status including BrowserBase-specific info.

        Returns:
            The base status dict, extended with fields copied from the session
            resource when the lookup answers with HTTP 200. Lookup failures
            are logged and leave the base status untouched.
        """
        status = await super().get_status()

        # Add BrowserBase-specific status
        if self._instance_id and self._is_running:
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(
                        f"{self.base_url}/v1/sessions/{self._instance_id}",
                        headers=self._headers(),
                        timeout=10.0,
                    )
                    if response.status_code == 200:
                        session_data = response.json()
                        status["session_data"] = session_data
                        status["status"] = session_data.get("status", "UNKNOWN")
                        status["region"] = session_data.get("region")
                        status["proxy_bytes"] = session_data.get("proxyBytes")
                        status["cpu_usage"] = session_data.get("avgCpuUsage")
                        status["memory_usage"] = session_data.get("memoryUsage")
            except Exception as e:
                logger.warning(f"Failed to get session status: {e}")

        return status

    def get_live_view_url(self) -> Optional[str]:
        """Get the live view URL for the BrowserBase instance (``None`` if no session)."""
        return self._live_view_url

    def get_selenium_remote_url(self) -> Optional[str]:
        """Get the Selenium remote URL for the BrowserBase instance (``None`` if no session)."""
        return self._selenium_remote_url
async def _decodo_proxy() -> Optional[Dict[str, Any]]:
    """Build a Decodo proxy configuration from environment variables.

    Reads ``DECODO_USERNAME`` and ``DECODO_PASSWORD`` (both required),
    ``DECODO_HOST`` (default ``us.decodo.com``) and ``DECODO_ROTATING``
    (default ``true``).

    In rotating mode port 10000 is returned without probing. Otherwise five
    distinct random ports in 10001-11000 are probed by requesting
    ``http://httpbin.org/ip`` through the proxy, and the first port that
    answers with HTTP 200 is used.

    Returns:
        A dict with ``type``, ``server``, ``username``, ``password`` and
        ``active`` keys, or ``None`` when credentials are missing or no
        probed port works.
    """
    from urllib.parse import quote

    user = os.getenv("DECODO_USERNAME")
    pwd = os.getenv("DECODO_PASSWORD")
    if not user or not pwd:
        return None
    host = os.getenv("DECODO_HOST", "us.decodo.com")

    def _config(port: int) -> Dict[str, Any]:
        # Single place that shapes the result for both modes.
        return {
            "type": "custom",
            "server": f"{host}:{port}",
            "username": user,
            "password": pwd,
            "active": True,
        }

    if os.getenv("DECODO_ROTATING", "true").lower() == "true":
        logger.info("Using Decodo rotating proxy (port 10000)")
        return _config(10000)

    logger.info("Searching Decodo ports 10001-11000 …")
    # The credentials become the URL's userinfo component, so reserved
    # characters (@ : / # ...) must be percent-encoded or the URL is misparsed.
    userinfo = f"{quote(user, safe='')}:{quote(pwd, safe='')}"
    # random.sample yields five distinct ports without a retry loop.
    for port in random.sample(range(10001, 11001), 5):
        proxy_url = f"http://{userinfo}@{host}:{port}"
        try:
            async with httpx.AsyncClient(proxy=proxy_url, timeout=5.0) as client:
                response = await client.get("http://httpbin.org/ip")
        except Exception as exc:
            # A dead port is expected while probing; record only the error
            # type so credentials embedded in the proxy URL never reach logs.
            logger.debug("Decodo port %s probe failed: %s", port, type(exc).__name__)
            continue
        if response.status_code == 200:
            logger.info("Decodo port %s works", port)
            return _config(port)
    logger.warning("No working Decodo port found")
    return None
async def launch(self, **kwargs) -> str:
    """Create a HyperBrowser session and return its CDP endpoint.

    Args:
        **kwargs: Session options. ``useStealth`` and ``useProxy`` are always
            sent (default ``False``). When ``useProxy`` is truthy, either a
            custom proxy (``proxyServer``, ``proxyServerUsername``,
            ``proxyServerPassword``) or a location (``proxyCountry``,
            default ``"US"``, plus optional ``proxyState`` / ``proxyCity``)
            is added. The remaining options are forwarded only when the
            caller supplies them: ``solveCaptchas``, ``adblock``,
            ``trackers``, ``annoyances``, ``enableWebRecording``,
            ``enableVideoWebRecording``, ``profile``, ``acceptCookies``,
            ``extensionIds``, ``browserArgs``, ``timeoutMinutes``.

    Returns:
        The ``wsEndpoint`` value from the API response.

    Raises:
        httpx.HTTPStatusError: The API answered with an error status.
        Exception: The response lacks ``id`` or ``wsEndpoint``.
    """
    proxy_enabled = kwargs.get("useProxy", False)
    payload = {
        "useStealth": kwargs.get("useStealth", False),
        "useProxy": proxy_enabled,
    }

    if proxy_enabled:
        if "proxyServer" in kwargs:
            # Caller-supplied proxy; credentials are sent even when absent (None).
            payload["proxyServer"] = kwargs["proxyServer"]
            payload["proxyServerUsername"] = kwargs.get("proxyServerUsername")
            payload["proxyServerPassword"] = kwargs.get("proxyServerPassword")
        else:
            # Use HyperBrowser's residential proxy
            payload["proxyCountry"] = kwargs.get("proxyCountry", "US")
            for location_key in ("proxyState", "proxyCity"):
                if location_key in kwargs:
                    payload[location_key] = kwargs[location_key]

    # Options copied verbatim, in this order, only when explicitly passed.
    forwarded_options = (
        "solveCaptchas",
        "adblock",
        "trackers",
        "annoyances",
        "enableWebRecording",
        "enableVideoWebRecording",
        "profile",
        "acceptCookies",
        "extensionIds",
        "browserArgs",
        "timeoutMinutes",
    )
    for option in forwarded_options:
        if option in kwargs:
            payload[option] = kwargs[option]

    async with httpx.AsyncClient() as http:
        reply = await http.post(
            f"{self.base_url}/api/session",
            json=payload,
            headers={"x-api-key": str(self.api_key), "Content-Type": "application/json"},
            timeout=30.0,
        )
        reply.raise_for_status()

    session = reply.json()
    self._session_data = session

    self._instance_id = session.get("id")
    if not self._instance_id:
        raise Exception("Failed to get session ID from HyperBrowser response")

    # HyperBrowser exposes the CDP WebSocket as wsEndpoint.
    self._cdp_url = session.get("wsEndpoint")
    if not self._cdp_url:
        raise Exception("Failed to get WebSocket endpoint from HyperBrowser response")

    self._is_running = True

    logger.info(f"Launched HyperBrowser session: {self._instance_id}")
    logger.info(f"CDP URL: {self._cdp_url}")

    # Kept for the URL accessor methods.
    self._session_url = session.get("sessionUrl")
    self._live_url = session.get("liveUrl")
    self._token = session.get("token")

    return self._cdp_url
async def get_status(self) -> Dict[str, Any]:
    """Return the base provider status enriched with HyperBrowser session data.

    When a session is recorded and marked running, the session is fetched
    from ``/api/session/{id}``. On HTTP 200 the raw payload plus its
    ``status``, ``startTime`` and ``endTime`` fields are merged into the
    result. Any failure is logged as a warning and the base status is
    returned unchanged.
    """
    status = await super().get_status()

    if not (self._instance_id and self._is_running):
        return status

    try:
        async with httpx.AsyncClient() as http:
            reply = await http.get(
                f"{self.base_url}/api/session/{self._instance_id}",
                headers={
                    "x-api-key": str(self.api_key),
                    "Content-Type": "application/json",
                },
                timeout=10.0,
            )
            if reply.status_code == 200:
                details = reply.json()
                status["session_data"] = details
                status["status"] = details.get("status", "UNKNOWN")
                status["start_time"] = details.get("startTime")
                status["end_time"] = details.get("endTime")
    except Exception as e:
        # Status enrichment is optional; never let it break the caller.
        logger.warning(f"Failed to get session status: {e}")

    return status
class KernelProvider(BrowserProvider):
    """Placeholder provider for the Kernel browser-as-a-service platform.

    Neither lifecycle method is implemented yet; both raise
    ``NotImplementedError``.
    """

    # Shared message so both stubs stay in sync.
    _NOT_IMPLEMENTED = "Kernel provider not yet implemented"

    async def launch(self, **kwargs) -> str:
        """Always raise ``NotImplementedError``; launching is not supported yet."""
        raise NotImplementedError(self._NOT_IMPLEMENTED)

    def close(self) -> None:
        """Always raise ``NotImplementedError``; closing is not supported yet."""
        raise NotImplementedError(self._NOT_IMPLEMENTED)
async def launch(self, **kwargs) -> str:
    """Create a Steel session and return the URL used to connect to it.

    Args:
        **kwargs: Session options. Each of the following is always sent,
            using the listed default when the caller omits it:
            ``sessionId`` (``""``), ``userAgent`` (``""``), ``useProxy``
            (``False``), ``proxyUrl`` (``""``), ``blockAds`` (``False``),
            ``solveCaptcha`` (``False``), ``timeout`` (``1800000`` ms, i.e.
            30 minutes), ``concurrency`` (``1``), ``isSelenium``
            (``False``), ``region`` (``"lax"``) and ``dimensions``
            (``{"width": 1920, "height": 1080}``). ``sessionContext`` and
            ``stealthConfig`` are forwarded only when supplied.

    Returns:
        The ``wsUrl`` from the API response, or a URL built from the session
        ID when the response carries none.

    Raises:
        httpx.HTTPStatusError: The API answered with an error status.
        Exception: The response lacks a session ``id``.
    """
    # Option name -> value sent when the caller does not provide one.
    option_defaults = {
        "sessionId": "",
        "userAgent": "",
        "useProxy": False,
        "proxyUrl": "",
        "blockAds": False,
        "solveCaptcha": False,
        "timeout": 1800000,  # 30 minutes default
        "concurrency": 1,
        "isSelenium": False,
        "region": "lax",
        "dimensions": {"width": 1920, "height": 1080},
    }
    payload = {name: kwargs.get(name, fallback) for name, fallback in option_defaults.items()}

    # Optional blocks have no default and are omitted unless given.
    for optional_key in ("sessionContext", "stealthConfig"):
        if optional_key in kwargs:
            payload[optional_key] = kwargs[optional_key]

    async with httpx.AsyncClient() as http:
        reply = await http.post(
            f"{self.base_url}/v1/sessions",
            json=payload,
            headers={"Content-Type": "application/json", "Steel-Api-Key": str(self.api_key)},
            timeout=30.0,
        )
        reply.raise_for_status()

    session = reply.json()
    self._session_data = session

    self._instance_id = session.get("id")
    if not self._instance_id:
        raise Exception("Failed to get session ID from Steel response")

    ws_url = session.get("wsUrl")
    if not ws_url:
        # Fallback to constructing URL if not provided
        # NOTE(review): this host is fixed and does not follow self.base_url.
        ws_url = f"wss://api.steel.dev/sessions/{self._instance_id}"
    self._cdp_url = ws_url

    self._is_running = True

    logger.info(f"Launched Steel session: {self._instance_id}")
    logger.info(f"CDP URL: {self._cdp_url}")

    # Kept for the URL accessor methods.
    self._debug_url = session.get("debugUrl")
    self._live_view_url = session.get("liveViewUrl")

    return self._cdp_url
async def get_status(self) -> Dict[str, Any]:
    """Return the base provider status enriched with Steel session data.

    When a session is recorded and marked running, the session is fetched
    from ``/v1/sessions/{id}``. On HTTP 200 the raw payload plus its
    ``status`` and ``context`` fields are merged into the result. Any
    failure is logged as a warning and the base status is returned as is.

    Returns:
        The status dict from the base class, possibly extended with
        ``session_data``, ``status`` and ``context``.
    """
    status = await super().get_status()

    # Add Steel-specific status
    if self._instance_id and self._is_running:
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    f"{self.base_url}/v1/sessions/{self._instance_id}",
                    headers={
                        # Same auth header as the other Steel calls in this
                        # provider; "steel_api_key" is not the header those
                        # requests authenticate with, so this lookup was
                        # effectively unauthenticated.
                        "Steel-Api-Key": str(self.api_key),
                        "Content-Type": "application/json",
                    },
                    timeout=10.0,
                )
                if response.status_code == 200:
                    session_data = response.json()
                    status["session_data"] = session_data
                    status["status"] = session_data.get("status", "UNKNOWN")
                    status["context"] = session_data.get("context")
        except Exception as e:
            # Status enrichment is optional; never let it break the caller.
            logger.warning(f"Failed to get session status: {e}")

    return status
+ + Returns: + Context data that can be passed to launch() to restore state + """ + if not self._instance_id: + return None + + try: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{self.base_url}/v1/sessions/{self._instance_id}/context", + headers={ + "Content-Type": "application/json", + "Steel-Api-Key": str(self.api_key), + }, + timeout=10.0, + ) + response.raise_for_status() + return response.json() + except Exception as e: + logger.error(f"Failed to save context: {e}") + return None diff --git a/environments/online_mind2web/src/hud_controller/server.py b/environments/online_mind2web/src/hud_controller/server.py new file mode 100644 index 00000000..08bbca30 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/server.py @@ -0,0 +1,354 @@ +"""MCP server for remote browser environment.""" + +import sys +import logging +import os +import asyncio +from datetime import datetime +from typing import Optional, TypedDict, Any + +# Configure stderr logging +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", + force=True, +) +logger = logging.getLogger(__name__) + +from hud.server import MCPServer +from hud.server.context import attach_context + +# Import tools +from .tools import BrowserExecutor, AnthropicComputerToolWithRecord, OpenAIComputerToolWithRecord +from hud.tools.computer import ( + AnthropicComputerTool, + OpenAIComputerTool, + HudComputerTool, +) +from hud.tools import PlaywrightTool + +# Import setup and evaluate hubs +from .setup import setup as setup_hub +from .evaluate import evaluate as evaluate_hub + +# Import providers +from .providers import get_provider, BrowserProvider + +# Global persistent context (initialized during startup) +persistent_ctx = None +playwright_tool = None +browser_executor: Optional[BrowserExecutor] = None + +# Create Hud FastMCP instance +mcp = MCPServer( + name="HUD Remote Browser Environment", + 
@mcp.initialize
async def initialize_environment(ctx):
    """Initialize the remote browser environment with progress reporting.

    Steps, in order:

    1. Attach to the persistent context server over a Unix socket, retrying
       up to 10 times with a 1 second pause between attempts.
    2. If the context is not yet initialized, read ``BROWSER_PROVIDER``,
       build the provider with API keys taken from the environment, launch
       a browser and store provider config, launch options and telemetry in
       the context. Otherwise reuse the CDP URL already held by the context.
    3. Connect a ``PlaywrightTool`` to the CDP URL and register it together
       with the computer tools and the setup/evaluate hubs on ``mcp``.
    4. For a new session, navigate to ``BROWSER_URL`` when it is set and
       mark the context as initialized.

    Args:
        ctx: Initialization context. Only ``ctx.meta.progressToken`` and
            ``ctx.session.send_progress_notification`` are used, and both
            are treated as optional.

    Raises:
        ValueError: ``BROWSER_PROVIDER`` is unset, or the existing context
            has no CDP URL.
        Exception: Any other failure is logged with its traceback and
            re-raised unchanged.
    """
    global persistent_ctx, playwright_tool, browser_executor

    # Extract progress token from context if available
    progress_token = None
    if ctx.meta and hasattr(ctx.meta, "progressToken"):
        progress_token = ctx.meta.progressToken

    async def send_progress(progress: int, message: str):
        # Notify the client when a progress token exists; always log locally.
        # A failed notification is only a warning so it cannot abort startup.
        if progress_token and hasattr(ctx, "session"):
            try:
                await ctx.session.send_progress_notification(
                    progress_token=progress_token,
                    progress=progress,
                    total=100,
                    message=message,
                )
            except Exception as e:
                logger.warning(f"Failed to send progress notification: {e}")
        logger.info(f"[{progress}%] {message}")

    try:
        await send_progress(5, "Connecting to persistent context...")

        # Connect to persistent context server
        max_retries = 10
        retry_delay = 1.0

        for attempt in range(max_retries):
            try:
                persistent_ctx = attach_context("/tmp/hud_remote_browser_ctx.sock")
                if persistent_ctx is None:
                    raise ConnectionError("Failed to attach to context server")
                logger.info("Connected to persistent remote browser context")

                # Log current state
                state = persistent_ctx.get_state_summary()
                logger.info(f"Context state: {state}")

                if persistent_ctx.get_is_initialized():
                    logger.info("Resuming with existing browser session")
                else:
                    logger.info("Starting fresh browser session")
                break

            except Exception as e:
                # Retry until the final attempt; the last failure propagates
                # to the outer handler.
                if attempt < max_retries - 1:
                    logger.warning(
                        f"Context server not ready yet (attempt {attempt + 1}/{max_retries}): {e}"
                    )
                    await asyncio.sleep(retry_delay)
                else:
                    logger.error(
                        f"Failed to connect to context server after {max_retries} attempts: {e}"
                    )
                    logger.error(
                        "The context server should be started automatically. Check container logs."
                    )
                    raise

        await send_progress(10, "Connected to persistent context")

        # At this point, persistent_ctx is guaranteed to be set
        # NOTE(review): assert is stripped under `python -O`; the retry loop
        # above already raises when attachment fails.
        assert persistent_ctx is not None

        # Check if we need to initialize a new browser session
        if not persistent_ctx.get_is_initialized():
            await send_progress(15, "Initializing new browser session...")

            # Get provider configuration from environment
            provider_name = os.getenv("BROWSER_PROVIDER")
            if not provider_name:
                error_msg = (
                    "BROWSER_PROVIDER environment variable is required. "
                    "Supported providers: anchorbrowser, steel, browserbase, hyperbrowser, kernel"
                )
                logger.error(error_msg)
                raise ValueError(error_msg)

            provider_name = provider_name.lower()
            await send_progress(20, f"Using browser provider: {provider_name}")

            # Initialize the browser provider
            provider_class = get_provider(provider_name)
            provider_config = {}

            # Add provider-specific configuration
            # Any other provider name leaves provider_config empty.
            if provider_name == "anchorbrowser":
                provider_config["api_key"] = os.getenv("ANCHOR_API_KEY")
                provider_config["base_url"] = os.getenv(
                    "ANCHOR_BASE_URL", "https://api.anchorbrowser.io"
                )
            elif provider_name == "steel":
                provider_config["api_key"] = os.getenv("STEEL_API_KEY")
                provider_config["base_url"] = os.getenv("STEEL_BASE_URL", "https://api.steel.dev")
            elif provider_name == "browserbase":
                provider_config["api_key"] = os.getenv("BROWSERBASE_API_KEY")
                provider_config["project_id"] = os.getenv("BROWSERBASE_PROJECT_ID")
            elif provider_name == "hyperbrowser":
                provider_config["api_key"] = os.getenv("HYPERBROWSER_API_KEY")
            elif provider_name == "kernel":
                provider_config["api_key"] = os.getenv("KERNEL_API_KEY")

            # Store provider config in context
            persistent_ctx.set_provider_config(provider_config)

            browser_provider = provider_class(provider_config)
            persistent_ctx.set_browser_provider(browser_provider)
            await send_progress(30, "Browser provider initialized")

            # Launch the browser and get CDP URL
            await send_progress(40, "Launching remote browser...")

            # Build launch options
            launch_options = {}

            # Add other launch options from environment
            # int() raises ValueError on a non-numeric value; that surfaces
            # through the outer handler as an initialization failure.
            # NOTE(review): these keys are snake_case; confirm the selected
            # provider's launch() actually reads them.
            max_duration = os.getenv("BROWSER_MAX_DURATION")
            if max_duration:
                launch_options["max_duration"] = int(max_duration)
            idle_timeout = os.getenv("BROWSER_IDLE_TIMEOUT")
            if idle_timeout:
                launch_options["idle_timeout"] = int(idle_timeout)

            # Store launch options in context
            persistent_ctx.set_launch_options(launch_options)

            # Create browser session
            cdp_url = await browser_provider.launch(**launch_options)

            # Build and store telemetry data
            # live_url and instance_id fall back to None for providers that
            # lack the accessor or attribute.
            telemetry_data = {
                "provider": provider_name,
                "status": "running",
                "live_url": browser_provider.get_live_view_url()
                if hasattr(browser_provider, "get_live_view_url")
                else None,
                "cdp_url": cdp_url,
                "instance_id": browser_provider._instance_id
                if hasattr(browser_provider, "_instance_id")
                else None,
                "timestamp": datetime.now().isoformat(),
            }
            persistent_ctx.set_telemetry(telemetry_data)

            await send_progress(60, f"Browser launched")
        else:
            # Reuse existing browser session
            await send_progress(20, "Reusing existing browser session...")

            # Get existing CDP URL from context
            cdp_url = persistent_ctx.get_cdp_url()
            if not cdp_url:
                raise ValueError("No CDP URL in persistent context")

            await send_progress(60, f"Using existing CDP URL")

        # Initialize PlaywrightToolWithMemory with CDP URL from context
        # This reconnects to the existing browser session on reloads
        # playwright_tool = PlaywrightTool(cdp_url=cdp_url)
        playwright_tool = PlaywrightTool(cdp_url=cdp_url)

        # Ensure browser is connected before registering tools
        await playwright_tool._ensure_browser()
        await send_progress(65, "Browser connection established")

        # Add playwright tool to MCP
        mcp.add_tool(playwright_tool)
        await send_progress(70, "Playwright tool registered")

        # Initialize browser executor
        browser_executor = BrowserExecutor(playwright_tool)
        await send_progress(75, "Browser executor initialized")

        # Create and register computer tools with default dimensions
        # The recording variants are registered in place of the plain
        # Anthropic/OpenAI tools, which remain commented out below.
        mcp.add_tool(HudComputerTool(executor=browser_executor))
        # mcp.add_tool(AnthropicComputerTool(executor=browser_executor))
        # mcp.add_tool(OpenAIComputerTool(executor=browser_executor))
        mcp.add_tool(AnthropicComputerToolWithRecord(executor=browser_executor))
        mcp.add_tool(OpenAIComputerToolWithRecord(executor=browser_executor))

        await send_progress(80, "Registered hud computer tools")

        # Set the persistent context as environment for setup and evaluate hubs
        setup_hub.env = persistent_ctx
        evaluate_hub.env = persistent_ctx

        # Also store the current playwright tool on the persistent context
        # Note: This is NOT pickled/persisted, it's just for current session access
        persistent_ctx.playwright_tool = playwright_tool

        # Mount the hubs
        mcp.mount(setup_hub)
        mcp.mount(evaluate_hub)
        await send_progress(90, "Setup and evaluate tools registered")

        # Navigate to initial URL if specified (only for new sessions)
        if not persistent_ctx.get_is_initialized():
            initial_url = os.getenv("BROWSER_URL")
            if initial_url:
                await send_progress(95, f"Navigating to {initial_url}")
                await playwright_tool.navigate(initial_url)

            # Mark as initialized
            persistent_ctx.set_initialized(True)

        await send_progress(100, "Remote browser environment ready!")

    except Exception as e:
        # Log the failure with its traceback, then re-raise so the caller
        # sees the original error.
        logger.error(f"Initialization failed: {e}")
        import traceback

        logger.error(f"Traceback: {traceback.format_exc()}")
        raise
@setup.tool("set_cookies")
async def set_cookies(ctx: Context, cookies: List[Dict[str, Any]]):
    """Add the given cookies to the current page's browser context.

    Args:
        ctx: Tool invocation context (unused).
        cookies: Cookie dictionaries; each cookie's ``name`` (or
            ``"unnamed"`` when absent) is echoed in the result message.

    Returns:
        A ``TextContent`` describing success, a missing page, or the error
        raised while adding the cookies.
    """
    logger.info(f"Setting {len(cookies)} cookies")

    # The playwright tool is attached to the hub environment at server start.
    tool = getattr(setup.env, "playwright_tool", None)
    page = getattr(tool, "page", None) if tool else None
    if not page:
        logger.error("No browser page available")
        return TextContent(text="No browser page available", type="text")

    try:
        await page.context.add_cookies(cookies)

        logger.info(f"Successfully set {len(cookies)} cookies")
        cookie_names = ", ".join(c.get("name", "unnamed") for c in cookies)
        return TextContent(
            text=f"Set {len(cookies)} cookies: {cookie_names}",
            type="text",
        )
    except Exception as e:
        logger.error(f"Failed to set cookies: {e}")
        return TextContent(text=f"Failed to set cookies: {str(e)}", type="text")
@setup.tool("click_element")
async def click_element(ctx: Context, selector: str, timeout: int = 30000):
    """Wait for an element matching ``selector`` and click it.

    Args:
        ctx: Tool invocation context (unused).
        selector: CSS selector for the element.
        timeout: Maximum time to wait for the element, in milliseconds.

    Returns:
        A ``TextContent`` describing success, a missing page, or the error
        raised while waiting for or clicking the element.
    """
    logger.info(f"Clicking element with selector: {selector}")

    # The playwright tool is attached to the hub environment at server start.
    tool = getattr(setup.env, "playwright_tool", None)
    page = getattr(tool, "page", None) if tool else None
    if not page:
        logger.error("No browser page available")
        return TextContent(text="No browser page available", type="text")

    try:
        target = await page.wait_for_selector(selector, timeout=timeout)
        await target.click()

        logger.info(f"Successfully clicked element: {selector}")
        return TextContent(text=f"Clicked element: {selector}", type="text")
    except Exception as e:
        logger.error(f"Failed to click element: {e}")
        return TextContent(text=f"Failed to click element: {str(e)}", type="text")
+ + Args: + selector: CSS selector for the input element + text: Text to fill in the input + timeout: Maximum time to wait for element (ms) + + Returns: + Setup result with status + """ + logger.info(f"Filling input {selector} with text") + + # Get the playwright tool from the environment context + persistent_ctx = setup.env + playwright_tool = getattr(persistent_ctx, "playwright_tool", None) + if not playwright_tool or not hasattr(playwright_tool, "page") or not playwright_tool.page: + logger.error("No browser page available") + return TextContent(text="No browser page available", type="text") + + try: + # Wait for element and fill + element = await playwright_tool.page.wait_for_selector(selector, timeout=timeout) + await element.fill(text) + + logger.info(f"Successfully filled input: {selector}") + return TextContent(text=f"Filled input {selector} with {len(text)} characters", type="text") + except Exception as e: + logger.error(f"Failed to fill input: {e}") + return TextContent(text=f"Failed to fill input: {str(e)}", type="text") + + +@setup.tool("select_option") +async def select_option(ctx: Context, selector: str, value: str, timeout: int = 30000): + """Select an option in a dropdown. 
+ + Args: + selector: CSS selector for the select element + value: Value of the option to select + timeout: Maximum time to wait for element (ms) + + Returns: + Setup result with status + """ + logger.info(f"Selecting option {value} in {selector}") + + # Get the playwright tool from the environment context + persistent_ctx = setup.env + playwright_tool = getattr(persistent_ctx, "playwright_tool", None) + if not playwright_tool or not hasattr(playwright_tool, "page") or not playwright_tool.page: + logger.error("No browser page available") + return TextContent(text="No browser page available", type="text") + + try: + # Wait for element and select option + await playwright_tool.page.select_option(selector, value, timeout=timeout) + + logger.info(f"Successfully selected option: {value}") + return TextContent(text=f"Selected option '{value}' in {selector}", type="text") + except Exception as e: + logger.error(f"Failed to select option: {e}") + return TextContent(text=f"Failed to select option: {str(e)}", type="text") diff --git a/environments/online_mind2web/src/hud_controller/setup/navigate.py b/environments/online_mind2web/src/hud_controller/setup/navigate.py new file mode 100644 index 00000000..6bebedd7 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/setup/navigate.py @@ -0,0 +1,41 @@ +"""Navigation setup for remote browser environment.""" + +import logging +from fastmcp import Context +from mcp.types import TextContent +from . import setup + +logger = logging.getLogger(__name__) + + +@setup.tool("navigate_to_url") +async def navigate_to_url(ctx: Context, url: str, wait_for_load_state: str = "networkidle"): + """Navigate browser to a specific URL. 
+ + Args: + url: The URL to navigate to + wait_for_load_state: State to wait for after navigation + + Returns: + Setup result with navigation status + """ + logger.info(f"Navigating to URL: {url}") + + # Get the playwright tool from the environment context + persistent_ctx = setup.env + playwright_tool = getattr(persistent_ctx, "playwright_tool", None) + if not playwright_tool or not hasattr(playwright_tool, "page") or not playwright_tool.page: + logger.error("No playwright tool available") + return TextContent(text="No browser available for navigation", type="text") + + # Navigate using the playwright tool + result = await playwright_tool.navigate(url, wait_for_load_state) + + if result.get("success"): + logger.info(f"Successfully navigated to {url}") + return TextContent( + text=f"Navigated to {url} - Title: {result.get('title', 'Unknown')}", type="text" + ) + else: + logger.error(f"Failed to navigate: {result.get('error')}") + return TextContent(text=f"Navigation failed: {result.get('error')}", type="text") diff --git a/environments/online_mind2web/src/hud_controller/tools/__init__.py b/environments/online_mind2web/src/hud_controller/tools/__init__.py new file mode 100644 index 00000000..3fa53196 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/tools/__init__.py @@ -0,0 +1,11 @@ +"""Tools module for remote browser environment.""" + +from .executor import BrowserExecutor +from .anthropic import AnthropicComputerToolWithRecord +from .openai import OpenAIComputerToolWithRecord + +__all__ = [ + "BrowserExecutor", + "AnthropicComputerToolWithRecord", + "OpenAIComputerToolWithRecord", +] diff --git a/environments/online_mind2web/src/hud_controller/tools/anthropic.py b/environments/online_mind2web/src/hud_controller/tools/anthropic.py new file mode 100644 index 00000000..58f86a92 --- /dev/null +++ b/environments/online_mind2web/src/hud_controller/tools/anthropic.py @@ -0,0 +1,250 @@ +"""PlaywrightTool with memory/history tracking for remote browser 
class AnthropicComputerToolWithRecord(AnthropicComputerTool):
    """Anthropic computer tool that also persists screenshots and an action log.

    Every call is delegated to ``AnthropicComputerTool``. Afterwards two callbacks may
    fire: ``on_screenshot_action`` writes a PNG under ``/screenshot`` and
    ``on_recorded_action`` appends one line to ``/action_history/action_history.txt``.
    """

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        width: int = computer_settings.ANTHROPIC_COMPUTER_WIDTH,
        height: int = computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.ANTHROPIC_RESCALE_IMAGES,
        # What the agent sees as the tool's name, title, and description
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
        **kwargs: Any,
    ):
        """Forward all arguments to the base tool and register the recording callbacks."""
        super().__init__(
            executor=executor,
            platform_type=platform_type,
            display_num=display_num,
            width=width,
            height=height,
            rescale_images=rescale_images,
            name=name,
            title=title,
            description=description,
            **kwargs,
        )
        self.add_callback("on_screenshot_action", self._on_screenshot_action)
        self.add_callback("on_recorded_action", self._on_recorded_action)

    async def _on_screenshot_action(self, **_) -> None:
        """Capture a screenshot via the executor and save it under ``/screenshot``.

        Best-effort: any failure is logged at debug level and swallowed so that a
        missing or not-yet-ready executor never breaks the tool call itself.
        """
        try:
            # Check if executor is available and properly initialized
            if not hasattr(self, "executor") or self.executor is None:
                logger.debug("Executor not yet initialized, skipping screenshot")
                return

            # Additional check for executor readiness
            if not hasattr(self.executor, "screenshot"):
                logger.debug("Executor screenshot method not available, skipping screenshot")
                return

            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                screenshot_dir = "/screenshot"
                os.makedirs(screenshot_dir, exist_ok=True)

                # Trim microseconds to milliseconds for the filename.
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
                filepath = os.path.join(screenshot_dir, f"screenshot_{timestamp}.png")

                with open(filepath, "wb") as f:
                    f.write(base64.b64decode(screenshot_base64))

                logger.info(f"Saved screenshot to {filepath}")

        except Exception as e:
            logger.debug(f"Screenshot callback failed (this is normal during initialization): {e}")

    async def _on_recorded_action(
        self,
        action=None,
        coordinate=None,
        text=None,
        start_coordinate=None,
        scroll_direction=None,
        scroll_amount=None,
        **_,
    ):
        """Append the unified representation of an action to the action-history file.

        Does nothing when ``action`` is empty. Failures are logged as warnings and
        swallowed so that recording never interferes with the action itself.
        """
        if not action:
            return

        try:
            action_repr = self._to_action_repr(
                action, coordinate, text, start_coordinate, scroll_direction, scroll_amount
            )

            action_history_dir = "/action_history"
            os.makedirs(action_history_dir, exist_ok=True)
            action_file = os.path.join(action_history_dir, "action_history.txt")

            with open(action_file, "a", encoding="utf-8") as f:
                f.write(f"{action_repr}\n")

            logger.info(f"Recorded action: {action_repr}")

        except Exception as e:
            logger.warning(f"Failed to record action: {e}")

    async def __call__(
        self,
        action: str = Field(..., description="The action to perform on the computer"),
        coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The coordinate to interact with on the computer [x, y]"
        ),
        text: str | None = Field(
            None, description="The text to type on the computer or key to press"
        ),
        start_coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The starting coordinate for drag actions [x, y]"
        ),
        scroll_direction: str | None = Field(
            None, description="The direction to scroll (up, down, left, right)"
        ),
        scroll_amount: int | None = Field(None, description="The amount to scroll"),
        duration: float | None = Field(None, description="The duration of the action in seconds"),
        take_screenshot_on_click: bool = Field(
            True, description="Whether to take a screenshot after clicking"
        ),
    ) -> list[ContentBlock]:
        """Run the action through the base tool, then fire the recording callbacks.

        Returns:
            The content blocks produced by ``AnthropicComputerTool.__call__``, unchanged.
        """
        result = await super().__call__(
            action=action,
            coordinate=coordinate,
            text=text,
            start_coordinate=start_coordinate,
            scroll_direction=scroll_direction,
            scroll_amount=scroll_amount,
            duration=duration,
            take_screenshot_on_click=take_screenshot_on_click,
        )

        # Actions after which a screenshot is saved. An explicit "screenshot" action is
        # deliberately absent: it never triggered the callback.
        screenshot_actions = {
            "left_click",
            "click",
            "double_click",
            "triple_click",
            "right_click",
            "middle_click",
            "mouse_move",
            "move",
            "type",
            "key",
            "scroll",
            "left_click_drag",
            "drag",
            "wait",
            "hold_key",
            "left_mouse_down",
            "left_mouse_up",
        }
        if action in screenshot_actions and take_screenshot_on_click:
            await self._trigger_callbacks("on_screenshot_action")

        # Only actions that change page state are written to the action history.
        recorded_actions = {
            "left_click",
            "click",
            "double_click",
            "triple_click",
            "right_click",
            "middle_click",
            "type",
            "key",
            "scroll",
            "left_click_drag",
            "drag",
        }
        if action in recorded_actions:
            await self._trigger_callbacks(
                "on_recorded_action",
                action=action,
                coordinate=coordinate,
                text=text,
                start_coordinate=start_coordinate,
                scroll_direction=scroll_direction,
                scroll_amount=scroll_amount,
            )
        return result

    def _to_action_repr(
        self,
        action,
        coordinate=None,
        text=None,
        start_coordinate=None,
        scroll_direction=None,
        scroll_amount=None,
    ):
        """Build the one-line ``<attributes> -> ACTION`` representation of an action.

        Examples:
            ``<coordinate=[10, 20]> -> CLICK``
            ``<> -> TYPE hello@example.com``
            ``<coordinate=[5, 6], start=[1, 2], end=[5, 6]> -> DRAG``

        Returns:
            The formatted string; attributes that are unset or falsy are omitted.
        """
        # Upper-case, drop the "LEFT_" qualifier and the remaining underscores:
        # "left_click" -> "CLICK", "double_click" -> "DOUBLECLICK".
        action_name = action.upper().replace("LEFT_", "").replace("_", "")
        # "left_click_drag" becomes "CLICKDRAG" after the stripping above. The previous
        # comparison against "LEFTCLICKDRAG" could never match, so drags issued under
        # that name were logged as CLICKDRAG without their start/end coordinates.
        if action_name == "CLICKDRAG":
            action_name = "DRAG"

        attributes = []

        if coordinate:
            attributes.append(f"coordinate={coordinate}")

        if start_coordinate and action_name == "DRAG":
            attributes.append(f"start={start_coordinate}")
            if coordinate:
                attributes.append(f"end={coordinate}")

        if scroll_direction:
            attributes.append(f"direction={scroll_direction}")

        if scroll_amount:
            attributes.append(f"amount={scroll_amount}")

        element_part = f"<{', '.join(attributes)}>" if attributes else "<>"

        # Only typing and key presses carry their text into the action part.
        if text and action_name in ["TYPE", "KEY"]:
            action_part = f"{action_name} {text}"
        else:
            action_part = action_name

        return f"{element_part} -> {action_part}"
class BrowserExecutor(BaseExecutor):
    """
    Executor that performs all actions within a browser viewport using Playwright.

    This allows HudComputerTool (and its subclasses like AnthropicComputerTool
    and OpenAIComputerTool) to work with remote browser environments.

    The executor translates computer control actions into browser page actions,
    making it possible to control web applications as if they were desktop apps.

    Every action method returns a ``ContentResult``; failures are logged and reported
    through its ``error`` field instead of being raised.
    """

    def __init__(self, playwright_tool, display_num: int | None = None):
        """
        Initialize the browser executor.

        Args:
            playwright_tool: PlaywrightToolWithMemory instance for browser control
            display_num: Not used for browser executor, kept for compatibility
        """
        super().__init__(display_num)
        self.playwright_tool = playwright_tool
        logger.info("BrowserExecutor initialized with Playwright backend")

    def _map_key(self, key: str) -> str:
        """Map a key name to Playwright format; unknown names pass through (stripped)."""
        key = key.strip()
        mapped = PLAYWRIGHT_KEY_MAP.get(key.lower(), key)
        logger.debug(f"Mapping key '{key}' -> '{mapped}'")
        return mapped

    async def _ensure_page(self):
        """Ensure browser and page are available.

        Raises:
            RuntimeError: If the Playwright tool has no page after browser start-up.
        """
        await self.playwright_tool._ensure_browser()
        if not self.playwright_tool.page:
            raise RuntimeError("No browser page available")
        return self.playwright_tool.page

    async def _press_modifiers(
        self, page, hold_keys: list[str] | None, pressed: list[str]
    ) -> None:
        """Press each key in ``hold_keys`` and record the mapped names in ``pressed``.

        ``pressed`` is filled incrementally so that the caller's ``finally`` block can
        release exactly the keys that went down, even if a later press fails.
        """
        for key in hold_keys or []:
            mapped_key = self._map_key(key)
            await page.keyboard.down(mapped_key)
            pressed.append(mapped_key)

    async def _release_modifiers(self, page, pressed: list[str]) -> None:
        """Release previously pressed modifier keys, most recently pressed first."""
        for mapped_key in reversed(pressed):
            await page.keyboard.up(mapped_key)

    async def _with_screenshot(self, result: ContentResult, take_screenshot: bool) -> ContentResult:
        """Return ``result``, with a fresh screenshot appended when requested."""
        if take_screenshot:
            return result + ContentResult(base64_image=await self.screenshot())
        return result

    async def screenshot(self) -> str | None:
        """Take a viewport screenshot and return it base64 encoded, or None on failure."""
        try:
            page = await self._ensure_page()
            screenshot_bytes = await page.screenshot(full_page=False)
            screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
            logger.debug("Browser screenshot captured")
            return screenshot_b64
        except Exception as e:
            logger.error(f"Screenshot failed: {e}")
            return None

    async def click(
        self,
        x: int | None = None,
        y: int | None = None,
        button: Literal["left", "right", "middle", "back", "forward"] = "left",
        pattern: list[int] | None = None,
        hold_keys: list[str] | None = None,
        take_screenshot: bool = True,
    ) -> ContentResult:
        """Click at coordinates in the browser viewport.

        Args:
            x: Viewport x coordinate (required).
            y: Viewport y coordinate (required).
            button: Mouse button; "back" and "forward" fall back to a left click.
            pattern: Delays in ms between successive clicks of a multi-click, so
                ``[100]`` is a double click and ``[100, 100]`` a triple click.
            hold_keys: Keys held down for the duration of the click.
            take_screenshot: Whether to append a screenshot to the result.
        """
        try:
            page = await self._ensure_page()

            if x is None or y is None:
                return ContentResult(error="Coordinates required for click")

            button_map = {
                "left": "left",
                "right": "right",
                "middle": "middle",
                "back": "left",  # Browser doesn't have back button
                "forward": "left",  # Browser doesn't have forward button
            }
            pw_button = button_map[button]

            pressed: list[str] = []
            try:
                await self._press_modifiers(page, hold_keys, pressed)

                if pattern:
                    # A pattern of N delays means N + 1 clicks. Each press carries an
                    # increasing click_count so the page sees a real double/triple click
                    # rather than a series of unrelated single clicks.
                    await page.mouse.move(x, y)
                    for index in range(len(pattern) + 1):
                        if index and pattern[index - 1] > 0:
                            await page.wait_for_timeout(pattern[index - 1])
                        await page.mouse.down(button=pw_button, click_count=index + 1)
                        await page.mouse.up(button=pw_button, click_count=index + 1)
                else:
                    await page.mouse.click(x, y, button=pw_button)
            finally:
                # Never leave modifiers stuck down, even when the click itself failed.
                await self._release_modifiers(page, pressed)

            logger.debug(f"Clicked at ({x}, {y}) with button {button}")

            return await self._with_screenshot(
                ContentResult(output=f"Clicked at ({x}, {y})"), take_screenshot
            )

        except Exception as e:
            logger.error(f"Click failed: {e}")
            return ContentResult(error=str(e))

    async def write(
        self,
        text: str,
        enter_after: bool = False,
        hold_keys: list[str] | None = None,
        take_screenshot: bool = True,
    ) -> ContentResult:
        """Type text in the browser, optionally pressing Enter afterwards."""
        try:
            page = await self._ensure_page()

            pressed: list[str] = []
            try:
                await self._press_modifiers(page, hold_keys, pressed)

                await page.keyboard.type(text)

                if enter_after:
                    await page.keyboard.press("Enter")
            finally:
                await self._release_modifiers(page, pressed)

            logger.debug(f"Typed text: {text[:50]}...")

            return await self._with_screenshot(
                ContentResult(output=f"Typed: {text}"), take_screenshot
            )

        except Exception as e:
            logger.error(f"Type failed: {e}")
            return ContentResult(error=str(e))

    async def press(
        self,
        keys: list[str],
        take_screenshot: bool = True,
    ) -> ContentResult:
        """Press keyboard keys in the browser as one simultaneous combination."""
        try:
            page = await self._ensure_page()

            # Map keys to Playwright format, then capitalize single lowercase letters
            # (e.g. 'a' -> 'A').
            # NOTE(review): Playwright treats single-character keys as case-sensitive,
            # so a lone "a" is sent as "A" — confirm this is intended outside shortcuts.
            mapped_keys = [
                key.upper() if len(key) == 1 and key.isalpha() and key.islower() else key
                for key in (self._map_key(raw) for raw in keys)
            ]

            logger.info(f"Mapped keys: {mapped_keys}")

            key_combination = "+".join(mapped_keys)
            await page.keyboard.press(key_combination)

            logger.debug(f"Pressed keys: {keys} (mapped to: {mapped_keys})")

            return await self._with_screenshot(
                ContentResult(output=f"Pressed: {key_combination}"), take_screenshot
            )

        except Exception as e:
            logger.error(f"Key press failed: {e}")
            return ContentResult(error=str(e))

    async def scroll(
        self,
        x: int | None = None,
        y: int | None = None,
        scroll_x: int | None = None,
        scroll_y: int | None = None,
        hold_keys: list[str] | None = None,
        take_screenshot: bool = True,
    ) -> ContentResult:
        """Scroll in the browser viewport.

        The wheel event is dispatched at (x, y); when either coordinate is missing the
        viewport centre is used (400, 300 if the viewport size is unknown). Keys in
        ``hold_keys`` are held while the wheel event is sent.
        """
        try:
            page = await self._ensure_page()

            if x is None or y is None:
                viewport = page.viewport_size
                x = viewport["width"] // 2 if viewport else 400
                y = viewport["height"] // 2 if viewport else 300

            delta_x = scroll_x or 0
            delta_y = scroll_y or 0

            pressed: list[str] = []
            try:
                await self._press_modifiers(page, hold_keys, pressed)
                await page.mouse.move(x, y)
                await page.mouse.wheel(delta_x, delta_y)
            finally:
                await self._release_modifiers(page, pressed)

            logger.debug(f"Scrolled at ({x}, {y}) by ({delta_x}, {delta_y})")

            return await self._with_screenshot(
                ContentResult(output=f"Scrolled by ({delta_x}, {delta_y})"), take_screenshot
            )

        except Exception as e:
            logger.error(f"Scroll failed: {e}")
            return ContentResult(error=str(e))

    async def move(
        self,
        x: int | None = None,
        y: int | None = None,
        take_screenshot: bool = True,
    ) -> ContentResult:
        """Move mouse to coordinates in the browser."""
        try:
            page = await self._ensure_page()

            if x is None or y is None:
                return ContentResult(error="Coordinates required for move")

            await page.mouse.move(x, y)

            logger.debug(f"Moved mouse to ({x}, {y})")

            return await self._with_screenshot(
                ContentResult(output=f"Moved to ({x}, {y})"), take_screenshot
            )

        except Exception as e:
            logger.error(f"Move failed: {e}")
            return ContentResult(error=str(e))

    async def drag(
        self,
        path: list[tuple[int, int]],
        button: Literal["left", "right", "middle"] = "left",
        hold_keys: list[str] | None = None,
        take_screenshot: bool = True,
    ) -> ContentResult:
        """Drag along a path in the browser.

        The button goes down at the first point, the pointer visits every following
        point in order, and the button is released at the last one.
        """
        try:
            page = await self._ensure_page()

            if not path or len(path) < 2:
                return ContentResult(error="Path must have at least 2 points")

            pressed: list[str] = []
            button_down = False
            try:
                await self._press_modifiers(page, hold_keys, pressed)

                start_x, start_y = path[0]
                await page.mouse.move(start_x, start_y)
                await page.mouse.down(button=button)
                button_down = True

                for x, y in path[1:]:
                    await page.mouse.move(x, y)
            finally:
                # Release the button and modifiers even if a move failed half-way,
                # otherwise later actions would run with them still held.
                try:
                    if button_down:
                        await page.mouse.up(button=button)
                finally:
                    await self._release_modifiers(page, pressed)

            logger.debug(f"Dragged from {path[0]} through {len(path)} points")

            return await self._with_screenshot(
                ContentResult(output=f"Dragged through {len(path)} points"), take_screenshot
            )

        except Exception as e:
            logger.error(f"Drag failed: {e}")
            return ContentResult(error=str(e))
class OpenAIComputerToolWithRecord(OpenAIComputerTool):
    """OpenAI computer tool that also persists screenshots and an action log.

    Every call is delegated to ``OpenAIComputerTool``. Afterwards two callbacks may
    fire: ``on_screenshot_action`` writes a PNG under ``/screenshot`` and
    ``on_recorded_action`` appends one line to ``/action_history/action_history.txt``.
    """

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        width: int = computer_settings.OPENAI_COMPUTER_WIDTH,
        height: int = computer_settings.OPENAI_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.OPENAI_RESCALE_IMAGES,
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initialize with OpenAI's default dimensions.

        Args:
            width: Target width for rescaling (default: 1024 for OpenAI)
            height: Target height for rescaling (default: 768 for OpenAI)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
            name: Tool name for MCP registration (defaults to "openai_computer")
            title: Human-readable display name for the tool (defaults to "OpenAI Computer Tool")
            description: Tool description (a generic description is used if not provided)
        """
        super().__init__(
            executor=executor,
            platform_type=platform_type,
            display_num=display_num,
            width=width,
            height=height,
            rescale_images=rescale_images,
            name=name or "openai_computer",
            title=title or "OpenAI Computer Tool",
            description=description or "Control computer with mouse, keyboard, and screenshots",
            **kwargs,
        )
        self.add_callback("on_screenshot_action", self._on_screenshot_action)
        self.add_callback("on_recorded_action", self._on_recorded_action)

    async def _on_screenshot_action(self, **kwargs) -> None:
        """Capture a screenshot via the executor and save it under ``/screenshot``.

        Best-effort: any failure is logged at debug level and swallowed, because the
        callback can fire before the executor is ready.
        """
        try:
            # Check if executor is available and properly initialized
            if not hasattr(self, "executor") or self.executor is None:
                logger.debug("Executor not yet initialized, skipping screenshot")
                return

            # Additional check for executor readiness
            if not hasattr(self.executor, "screenshot"):
                logger.debug("Executor screenshot method not available, skipping screenshot")
                return

            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                screenshot_dir = "/screenshot"
                os.makedirs(screenshot_dir, exist_ok=True)

                # Trim microseconds to milliseconds for the filename.
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
                filepath = os.path.join(screenshot_dir, f"screenshot_{timestamp}.png")

                with open(filepath, "wb") as f:
                    f.write(base64.b64decode(screenshot_base64))

                logger.info(f"Saved screenshot to {filepath}")

        except Exception as e:
            # Debug level on purpose: this is expected during initialization.
            logger.debug(f"Screenshot callback failed (this is normal during initialization): {e}")

    async def _on_recorded_action(
        self,
        type=None,
        x=None,
        y=None,
        text=None,
        path=None,
        scroll_x=None,
        scroll_y=None,
        keys=None,
        **_,
    ):
        """Append the unified representation of an action to the action-history file.

        Does nothing when ``type`` is empty. Failures are logged as warnings and
        swallowed so that recording never interferes with the action itself.
        """
        if not type:
            return

        try:
            action_repr = self._to_action_repr(type, x, y, text, path, scroll_x, scroll_y, keys)

            action_history_dir = "/action_history"
            os.makedirs(action_history_dir, exist_ok=True)
            action_file = os.path.join(action_history_dir, "action_history.txt")

            with open(action_file, "a", encoding="utf-8") as f:
                f.write(f"{action_repr}\n")

            logger.info(f"Recorded action: {action_repr}")

        except Exception as e:
            logger.warning(f"Failed to record action: {e}")

    def _to_action_repr(
        self,
        type,
        x=None,
        y=None,
        text=None,
        path=None,
        scroll_x=None,
        scroll_y=None,
        keys=None,
    ):
        """Build the one-line ``<attributes> -> ACTION`` representation of an action.

        Examples:
            ``<coordinate=[10, 20]> -> CLICK``
            ``<> -> TYPE hello@example.com``
            ``<> -> KEY ctrl+a``
            ``<start=[1, 2], end=[5, 6]> -> DRAG``
            ``<direction=down, amount=300> -> SCROLL``

        Args:
            type: Action type as passed to ``__call__`` (e.g. "click", "keypress").
            x, y: Pointer coordinates; both must be set to be recorded.
            text: Typed text, rendered for TYPE (and KEY when ``keys`` is absent).
            path: Drag path as a list of ``{"x": ..., "y": ...}`` dicts.
            scroll_x, scroll_y: Scroll deltas; vertical takes precedence when both are set.
            keys: Keys of a keypress action, rendered joined with "+".
        """
        # "double_click" -> "DOUBLECLICK"; "keypress" is reported as "KEY".
        action_name = type.upper().replace("_", "")
        if action_name == "KEYPRESS":
            action_name = "KEY"

        attributes = []

        if x is not None and y is not None:
            attributes.append(f"coordinate=[{x}, {y}]")

        if path and action_name == "DRAG" and len(path) >= 2:
            start, end = path[0], path[-1]
            attributes.append(f"start=[{start['x']}, {start['y']}]")
            attributes.append(f"end=[{end['x']}, {end['y']}]")

        # A zero or missing delta records nothing.
        if scroll_y:
            attributes.append("direction=down" if scroll_y > 0 else "direction=up")
            attributes.append(f"amount={abs(scroll_y)}")
        elif scroll_x:
            attributes.append("direction=right" if scroll_x > 0 else "direction=left")
            attributes.append(f"amount={abs(scroll_x)}")

        element_part = f"<{', '.join(attributes)}>" if attributes else "<>"

        if action_name == "KEY" and keys:
            # keypress carries its payload in `keys`, not `text`; without this the
            # history would only contain a bare "KEY" with no indication of what
            # was pressed.
            action_part = f"{action_name} {'+'.join(str(key) for key in keys)}"
        elif text and action_name in ["TYPE", "KEY"]:
            action_part = f"{action_name} {text}"
        else:
            action_part = action_name

        return f"{element_part} -> {action_part}"

    async def __call__(
        self,
        type: str = Field(..., description="The action type to perform"),
        # Coordinate parameters
        x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
        y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
        # Button parameter
        button: str | None = Field(
            None, description="Mouse button for click actions (left, right, middle, wheel)"
        ),
        # Text parameter
        text: str | None = Field(None, description="Text to type or response text"),
        # Scroll parameters
        scroll_x: int | None = Field(None, description="Horizontal scroll amount"),
        scroll_y: int | None = Field(None, description="Vertical scroll amount"),
        # Wait parameter
        ms: int | None = Field(None, description="Time to wait in milliseconds"),
        # Key press parameter
        keys: list[str] | None = Field(None, description="Keys to press"),
        # Drag parameter
        path: list[dict[str, int]] | None = Field(
            None, description="Path for drag actions as list of {x, y} dicts"
        ),
        # Custom action parameter
        action: str | None = Field(None, description="Custom action name"),
    ) -> list[ContentBlock]:
        """Run the action through the base tool, then fire the recording callbacks.

        Returns:
            The content blocks produced by ``OpenAIComputerTool.__call__``, unchanged.
        """
        result = await super().__call__(
            type=type,
            x=x,
            y=y,
            button=button,
            text=text,
            scroll_x=scroll_x,
            scroll_y=scroll_y,
            ms=ms,
            keys=keys,
            path=path,
            action=action,
        )

        # Action types after which a screenshot is saved to disk.
        screenshot_action_type = {
            "screenshot",
            "click",
            "double_click",
            "scroll",
            "type",
            "move",
            "keypress",
            "drag",
            "wait",
        }
        if type in screenshot_action_type:
            await self._trigger_callbacks("on_screenshot_action")

        # Only actions that change page state are written to the action history.
        recorded_actions = {
            "click",
            "double_click",
            "type",
            "keypress",
            "scroll",
            "drag",
        }
        if type in recorded_actions:
            await self._trigger_callbacks(
                "on_recorded_action",
                type=type,
                x=x,
                y=y,
                text=text,
                path=path,
                scroll_x=scroll_x,
                scroll_y=scroll_y,
                keys=keys,
            )

        return result
If you worry that you might make a typo, prefer copying and pasting the text instead of reading and typing. If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one. You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation. If you believe the task to be infeasible or impossible to do as described in the prompt, you can terminate execution by explicitly responding \"task is infeasible\" or simply \"failed\" without additional information or tool calls. Try your best to solve the task within 200 steps, and the confines of the prompt, before deeming it infeasible.", + "metadata": { + "dataset": "Online-Mind2Web", + "website": "https://www.flightaware.com/", + "task_type": "web_navigation", + "reference_length": 6 + } +} +] \ No newline at end of file