Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions environments/online_mind2web/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
gcp.json
test.ipynb
36 changes: 36 additions & 0 deletions environments/online_mind2web/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Use our HUD base browser image with Playwright and uv pre-installed
FROM hudpython/base-browser:latest

# Create app-specific working directory
WORKDIR /app

# Copy project files
COPY pyproject.toml ./
COPY src/ ./src/

# Install the package using the existing venv at /opt/venv
# The --python flag tells uv to use this specific Python instead of creating a new venv
RUN uv pip install --python /opt/venv -e .

# Create directories for logs and data
RUN mkdir -p /app/logs /app/data

ENV DISPLAY_WIDTH=1448
ENV DISPLAY_HEIGHT=944

ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1
# Note: Environment variables for browser providers should be set at runtime:
# - BROWSER_PROVIDER: anchorbrowser, steel, browserbase, hyperbrowser, kernel
# - Provider-specific API keys: ANCHOR_API_KEY, STEEL_API_KEY, etc.
# - GCP_CREDENTIALS_JSON: For Google Sheets functionality (if needed)

# Run remote browser with persistent context
CMD ["sh", "-c", "\
# Start context server in background \
python3 -m hud_controller.context >&2 & \
# Wait a bit for context server to start \
sleep 2 && \
# Run MCP server in foreground with exec \
exec python3 -m hud_controller.server \
"]
36 changes: 36 additions & 0 deletions environments/online_mind2web/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# HUD Online Mind2Web Taskset

Based on the HUD remote-browser environment, this MCP server provides an environment for Online-Mind2Web task execution and evaluation.

## Running with Docker

The Docker image supports both production and development modes using the same Dockerfile.

### Building the Image

```bash
# Production build (default)
docker build -t hud-om2w:latest .
```

### Running the Test Task
```bash
hud eval ./test_task.json
```

### Running Whole Online-Mind2Web Dataset From HuggingFace
```bash
hud eval Genteki/Online-Mind2Web --full --max-concurrent=5
```

### Different Evaluation Method

To choose a different evaluation method, change the `task["evaluate_tool"]["evaluate"]["name"]` value in the task JSON file. The supported evaluation methods are:

| Evaluation Method | Final Screenshot | Screenshot History | Action History |
|:---|:---:|:---:| :---: |
| `autonomous` | ✔ | ✗ | ✔ |
| `webjudge` | ✔ | ✔ | ✔ |
| `overall_judge`[^1] | - | - | - |

[^1]: `overall_judge` runs all of the evaluation methods above and returns the average of their rewards.
22 changes: 22 additions & 0 deletions environments/online_mind2web/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[project]
name = "hud-om2w"
version = "0.1.0"
description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
requires-python = ">=3.11,<3.13"
dependencies = [ "hud-python>=0.4.12", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]

[build-system]
requires = [ "hatchling",]
build-backend = "hatchling.build"

[project.scripts]
hud-om2w = "hud_controller.__main__:main"

[tool.hud]
image = "hud-om2w:dev"

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.build.targets.wheel]
packages = [ "src/hud_controller",]
3 changes: 3 additions & 0 deletions environments/online_mind2web/src/hud_controller/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Online Mind2Web Env, From ../remote-browser"""

__version__ = "0.1.0"
139 changes: 139 additions & 0 deletions environments/online_mind2web/src/hud_controller/context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"""
Context server for remote browser environment that persists state across hot-reloads.

Run this as a separate process to maintain browser session state during development:
python -m hud_controller.context
"""

import asyncio
import logging
from datetime import datetime
from typing import Dict, Any, Optional
from hud.server.context import run_context_server

logger = logging.getLogger(__name__)


class RemoteBrowserContext:
    """Hold remote browser state so it survives reloads of the server process.

    The object is a plain state container: a browser provider, a playwright
    tool, provider/launch configuration, an "initialized" flag and the last
    telemetry dict that was stored. Every field is exposed through explicit
    getter/setter methods (see the note on proxy-friendly methods below).
    """

    def __init__(self):
        """Initialize every tracked field to its empty default."""
        self.browser_provider = None
        # Short lowercase provider name derived in set_browser_provider().
        # Declared here so the attribute always exists, even before a
        # provider has been set.
        self.provider_name: Optional[str] = None
        self.is_initialized = False
        self.provider_config: Optional[Dict[str, Any]] = None
        self.launch_options: Optional[Dict[str, Any]] = None
        self._startup_complete = False
        self.playwright_tool = None  # Store the playwright tool
        self._telemetry: Optional[Dict[str, Any]] = None  # Store full telemetry data

        logger.info("[RemoteBrowserContext] Created new remote browser context")

    def startup(self):
        """Run the one-time startup step; repeated calls are no-ops."""
        if self._startup_complete:
            logger.info("[RemoteBrowserContext] Startup already complete, skipping")
            return

        logger.info("[RemoteBrowserContext] Performing one-time startup")
        self._startup_complete = True

    # === Proxy-friendly methods for multiprocessing.Manager ===
    # Note: These are needed because direct attribute access doesn't always
    # work correctly through the multiprocessing proxy

    def get_browser_provider(self):
        """Return the stored browser provider instance (or None)."""
        return self.browser_provider

    def set_browser_provider(self, provider) -> None:
        """Store the browser provider and derive ``provider_name`` from it.

        The name is the provider's class name with "Provider" removed,
        lowercased. Passing a falsy provider clears both the provider and
        the derived name so the two can never disagree.
        """
        self.browser_provider = provider
        if not provider:
            # Clearing the provider: drop the derived name so it cannot go stale.
            self.provider_name = None
            return

        self.provider_name = provider.__class__.__name__.replace("Provider", "").lower()
        logger.info(f"[RemoteBrowserContext] Set browser provider: {self.provider_name}")

    def get_cdp_url(self) -> Optional[str]:
        """Return the ``"cdp_url"`` entry of the stored telemetry, if any."""
        return self._telemetry.get("cdp_url") if self._telemetry else None

    def get_is_initialized(self) -> bool:
        """Return the initialization flag."""
        return self.is_initialized

    def set_initialized(self, value: bool) -> None:
        """Set the initialization flag and log the new value."""
        self.is_initialized = value
        logger.info(f"[RemoteBrowserContext] Initialization status: {value}")

    def get_provider_config(self) -> Optional[Dict[str, Any]]:
        """Return the stored provider configuration (or None)."""
        return self.provider_config

    def set_provider_config(self, config: Dict[str, Any]) -> None:
        """Store the provider configuration."""
        self.provider_config = config
        logger.info("[RemoteBrowserContext] Set provider config")

    def get_launch_options(self) -> Optional[Dict[str, Any]]:
        """Return the stored launch options (or None)."""
        return self.launch_options

    def set_launch_options(self, options: Dict[str, Any]) -> None:
        """Store the launch options."""
        self.launch_options = options
        logger.info("[RemoteBrowserContext] Set launch options")

    def get_playwright_tool(self):
        """Return the stored playwright tool instance (or None)."""
        return self.playwright_tool

    def set_playwright_tool(self, tool) -> None:
        """Store the playwright tool instance."""
        self.playwright_tool = tool
        logger.info("[RemoteBrowserContext] Set playwright tool")

    def set_telemetry(self, telemetry: Dict[str, Any]) -> None:
        """Store the full telemetry dict and log it."""
        self._telemetry = telemetry
        logger.info(f"[RemoteBrowserContext] Set telemetry: {telemetry}")

    def get_state_summary(self) -> Dict[str, Any]:
        """Return a small dict describing which pieces of state are populated.

        ``provider_name`` here is read from the telemetry's ``"provider"``
        entry, not from the ``provider_name`` attribute.
        """
        return {
            "is_initialized": self.is_initialized,
            "startup_complete": self._startup_complete,
            "provider_name": self._telemetry.get("provider") if self._telemetry else None,
            "has_cdp_url": self.get_cdp_url() is not None,
            "has_browser_provider": self.browser_provider is not None,
            "has_playwright_tool": self.playwright_tool is not None,
        }

    def get_telemetry(self) -> Dict[str, Any]:
        """Return the stored telemetry, or a placeholder when none is stored.

        The placeholder marks the provider as "unknown" / "not_initialized"
        and carries the current local time as an ISO-8601 timestamp.
        """
        # If we have stored (non-empty) telemetry, return it unchanged
        if self._telemetry:
            return self._telemetry

        # Otherwise return basic telemetry data
        return {
            "provider": "unknown",
            "status": "not_initialized",
            "live_url": None,
            "cdp_url": None,
            "instance_id": None,
            "timestamp": datetime.now().isoformat(),
        }


if __name__ == "__main__":
    # Script entry point: create the shared context, run its one-time
    # startup, then hand it to the context server.
    socket_path = "/tmp/hud_remote_browser_ctx.sock"

    context = RemoteBrowserContext()
    context.startup()

    # Record the starting state before the server takes over.
    logger.info(f"[Context] Starting remote browser context server")
    initial_state = context.get_state_summary()
    logger.info(f"[Context] Initial state: {initial_state}")

    # Runs until run_context_server() returns.
    asyncio.run(run_context_server(context, socket_path))
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Evaluation layer for remote browser environment."""

from __future__ import annotations

from hud.tools.base import BaseHub

# Hub object named "evaluate"; it is the only name this package exports
# (see __all__ below).
evaluate = BaseHub("evaluate")

# NOTE(review): this import is placed after the `evaluate` assignment.
# Presumably each submodule imports `evaluate` from this package to register
# itself on the hub, so moving it above the assignment would break that.
# Confirm before reordering.
from . import autonomous_eval, webjudge, overall_judge

__all__ = ["evaluate"]
Loading
Loading