hud-evals · Genteki · Oct 13, 2025 · Oct 13, 2025 · Oct 14, 2025 · Oct 23, 2025
diff --git a/environments/online_mind2web/.gitignore b/environments/online_mind2web/.gitignore
@@ -0,0 +1,2 @@
+gcp.json
+test.ipynb
diff --git a/environments/online_mind2web/Dockerfile b/environments/online_mind2web/Dockerfile
@@ -0,0 +1,36 @@
+# Use our HUD base browser image with Playwright and uv pre-installed
+FROM hudpython/base-browser:latest
+
+# Create app-specific working directory
+WORKDIR /app
+
+# Copy project files
+COPY pyproject.toml ./
+COPY src/ ./src/
+
+# Install the package using the existing venv at /opt/venv
+# The --python flag tells uv to use this specific Python instead of creating a new venv
+RUN uv pip install --python /opt/venv -e .
+
+# Create directories for logs and data
+RUN mkdir -p /app/logs /app/data
+
+ENV DISPLAY_WIDTH=1448
+ENV DISPLAY_HEIGHT=944
+
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+# Note: Environment variables for browser providers should be set at runtime:
+# - BROWSER_PROVIDER: anchorbrowser, steel, browserbase, hyperbrowser, kernel
+# - Provider-specific API keys: ANCHOR_API_KEY, STEEL_API_KEY, etc.
+# - GCP_CREDENTIALS_JSON: For Google Sheets functionality (if needed)
+
+# Run remote browser with persistent context
+CMD ["sh", "-c", "\
+    # Start context server in background \
+    python3 -m hud_controller.context >&2 & \
+    # Wait a bit for context server to start \
+    sleep 2 && \
+    # Run MCP server in foreground with exec \
+    exec python3 -m hud_controller.server \
+"]
diff --git a/environments/online_mind2web/README.md b/environments/online_mind2web/README.md
@@ -0,0 +1,36 @@
+# HUD Online Mind2Web Taskset
+
+Based on hud remote-browser, this MCP server provides environment for Online-Mind2Web task exacution and evaluation.
+
+## Running with Docker
+
+The Docker image supports both production and development modes using the same Dockerfile.
+
+### Building the Image
+
+```bash
+# Production build (default)
+docker build -t hud-om2w:latest .
+```
+
+### Running the Test Task
+```bash
+hud eval ./test_task.json 
+```
+
+### Running Whole Online-Mind2Web Dataset From HuggingFace
+```bash
+hud eval Genteki/Online-Mind2Web --full --max-concurrent=5
+```
+
+### Different Evaluation Method
+
+To chosse different evaluation method, you can change different `task["evaluate_tool"]["evaluate"]["name"]` value in task json file. Here are the different evaluation method we support for you:
+
+| Evaluation Method | Final Screenshot | Screenshot History | Action Histroy | 
+|:---|:---:|:---:| :---: |
+| `autonomous` | ✔ | ✗ | ✔ |
+| `webjudge` | ✔ | ✔ | ✔ |
+| `overall_judge`[^1] | - | - | - |
+
+[^1]: `overall_judge` will execute all evaluation methods above and return the average of the rewards of them.
diff --git a/environments/online_mind2web/pyproject.toml b/environments/online_mind2web/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "hud-om2w"
+version = "0.1.0"
+description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
+requires-python = ">=3.11,<3.13"
+dependencies = [ "hud-python>=0.4.12", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]
+
+[build-system]
+requires = [ "hatchling",]
+build-backend = "hatchling.build"
+
+[project.scripts]
+hud-om2w = "hud_controller.__main__:main"
+
+[tool.hud]
+image = "hud-om2w:dev"
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = [ "src/hud_controller",]
diff --git a/environments/online_mind2web/src/hud_controller/__init__.py b/environments/online_mind2web/src/hud_controller/__init__.py
@@ -0,0 +1,3 @@
+"""Online Mind2Web Env, From ../remote-browser"""
+
+__version__ = "0.1.0"
diff --git a/environments/online_mind2web/src/hud_controller/context.py b/environments/online_mind2web/src/hud_controller/context.py
@@ -0,0 +1,139 @@
+"""
+Context server for remote browser environment that persists state across hot-reloads.
+
+Run this as a separate process to maintain browser session state during development:
+    python -m hud_controller.context
+"""
+
+import asyncio
+import logging
+from datetime import datetime
+from typing import Dict, Any, Optional
+from hud.server.context import run_context_server
+
+logger = logging.getLogger(__name__)
+
+
+class RemoteBrowserContext:
+    """Context that holds remote browser state across reloads."""
+
+    def __init__(self):
+        """Initialize the remote browser context."""
+        self.browser_provider = None
+        self.is_initialized = False
+        self.provider_config: Optional[Dict[str, Any]] = None
+        self.launch_options: Optional[Dict[str, Any]] = None
+        self._startup_complete = False
+        self.playwright_tool = None  # Store the playwright tool
+        self._telemetry: Optional[Dict[str, Any]] = None  # Store full telemetry data
+
+        logger.info("[RemoteBrowserContext] Created new remote browser context")
+
+    def startup(self):
+        """One-time startup when context server starts."""
+        if self._startup_complete:
+            logger.info("[RemoteBrowserContext] Startup already complete, skipping")
+            return
+
+        logger.info("[RemoteBrowserContext] Performing one-time startup")
+        self._startup_complete = True
+
+    # === Proxy-friendly methods for multiprocessing.Manager ===
+    # Note: These are needed because direct attribute access doesn't always
+    # work correctly through the multiprocessing proxy
+
+    def get_browser_provider(self):
+        """Get the browser provider instance."""
+        return self.browser_provider
+
+    def set_browser_provider(self, provider) -> None:
+        """Set the browser provider instance."""
+        self.browser_provider = provider
+        if provider:
+            self.provider_name = provider.__class__.__name__.replace("Provider", "").lower()
+            logger.info(f"[RemoteBrowserContext] Set browser provider: {self.provider_name}")
+
+    def get_cdp_url(self) -> Optional[str]:
+        """Get the CDP URL from telemetry."""
+        return self._telemetry.get("cdp_url") if self._telemetry else None
+
+    def get_is_initialized(self) -> bool:
+        """Check if environment is initialized."""
+        return self.is_initialized
+
+    def set_initialized(self, value: bool) -> None:
+        """Set initialization status."""
+        self.is_initialized = value
+        logger.info(f"[RemoteBrowserContext] Initialization status: {value}")
+
+    def get_provider_config(self) -> Optional[Dict[str, Any]]:
+        """Get provider configuration."""
+        return self.provider_config
+
+    def set_provider_config(self, config: Dict[str, Any]) -> None:
+        """Set provider configuration."""
+        self.provider_config = config
+        logger.info(f"[RemoteBrowserContext] Set provider config")
+
+    def get_launch_options(self) -> Optional[Dict[str, Any]]:
+        """Get launch options."""
+        return self.launch_options
+
+    def set_launch_options(self, options: Dict[str, Any]) -> None:
+        """Set launch options."""
+        self.launch_options = options
+        logger.info(f"[RemoteBrowserContext] Set launch options")
+
+    def get_playwright_tool(self):
+        """Get the playwright tool instance."""
+        return self.playwright_tool
+
+    def set_playwright_tool(self, tool) -> None:
+        """Set the playwright tool instance."""
+        self.playwright_tool = tool
+        logger.info(f"[RemoteBrowserContext] Set playwright tool")
+
+    def set_telemetry(self, telemetry: Dict[str, Any]) -> None:
+        """Set the full telemetry data."""
+        self._telemetry = telemetry
+        logger.info(f"[RemoteBrowserContext] Set telemetry: {telemetry}")
+
+    def get_state_summary(self) -> Dict[str, Any]:
+        """Get a summary of the current state."""
+        return {
+            "is_initialized": self.is_initialized,
+            "startup_complete": self._startup_complete,
+            "provider_name": self._telemetry.get("provider") if self._telemetry else None,
+            "has_cdp_url": self.get_cdp_url() is not None,
+            "has_browser_provider": self.browser_provider is not None,
+            "has_playwright_tool": self.playwright_tool is not None,
+        }
+
+    def get_telemetry(self) -> Dict[str, Any]:
+        """Get telemetry data from the browser provider."""
+        # If we have stored telemetry, return it
+        if self._telemetry:
+            return self._telemetry
+
+        # Otherwise return basic telemetry data
+        return {
+            "provider": "unknown",
+            "status": "not_initialized",
+            "live_url": None,
+            "cdp_url": None,
+            "instance_id": None,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+
+if __name__ == "__main__":
+    # Run the context server with RemoteBrowserContext
+    context = RemoteBrowserContext()
+    context.startup()
+
+    # Log initial state
+    logger.info(f"[Context] Starting remote browser context server")
+    logger.info(f"[Context] Initial state: {context.get_state_summary()}")
+
+    # Run the context server
+    asyncio.run(run_context_server(context, "/tmp/hud_remote_browser_ctx.sock"))
diff --git a/environments/online_mind2web/src/hud_controller/evaluate/__init__.py b/environments/online_mind2web/src/hud_controller/evaluate/__init__.py
@@ -0,0 +1,11 @@
+"""Evaluation layer for remote browser environment."""
+
+from __future__ import annotations
+
+from hud.tools.base import BaseHub
+
+evaluate = BaseHub("evaluate")
+
+from . import autonomous_eval, webjudge, overall_judge
+
+__all__ = ["evaluate"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		"""Online Mind2Web Env, From ../remote-browser"""

		__version__ = "0.1.0"