diff --git a/environments/from_mcp_template/.env.example b/environments/from_mcp_template/.env.example new file mode 100644 index 00000000..07846201 --- /dev/null +++ b/environments/from_mcp_template/.env.example @@ -0,0 +1,7 @@ +# HUD API Configuration +# Get your API key from https://hud.so/account +HUD_API_KEY="" + +# Anthropic API Configuration (optional) +# Required for using Claude agents - get from https://console.anthropic.com/ +ANTHROPIC_API_KEY="" diff --git a/environments/from_mcp_template/Dockerfile b/environments/from_mcp_template/Dockerfile new file mode 100644 index 00000000..5f36b1e4 --- /dev/null +++ b/environments/from_mcp_template/Dockerfile @@ -0,0 +1,12 @@ +FROM public.ecr.aws/docker/library/python:3.11-bookworm + +WORKDIR /app + +# Install git for dependency installation +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies +COPY pyproject.toml ./ +COPY controller/ ./controller/ +COPY environment/ ./environment/ +RUN pip install --no-cache-dir -e . diff --git a/environments/from_mcp_template/README.md b/environments/from_mcp_template/README.md new file mode 100644 index 00000000..d12d9b7c --- /dev/null +++ b/environments/from_mcp_template/README.md @@ -0,0 +1,108 @@ +# test-test + +## Environment design pattern +- Controller (Think of this as a frontend in web development) + - Creates the UX and manages the lifecycle of an app (in this case for an agent) + - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with +- Environment (Think of this as a backend in web development) + - Owns all long‑lived states of the environment and exposes the environment data structure + - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`) + +IMPORTANT: Make sure all logs are going to stderr instead of stdout, which is reserved for MCP communication + +### Testing your environment +```bash +# 1. 
Configure your API keys (optional - only needed for evaluation) +# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY + +# 2. Start the environment (optional: with --inspector or --interactive) +hud dev --build --interactive + +# 3. Choose your preferred way to test: + +# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY) +hud eval tasks.json --agent claude + +# Option B: Interactive notebook test_env.ipynb (great for learning!) + +# Option C: Simple Python script (runs all tasks from tasks.json) +python test_task.py +``` + +## Iterating on your environment +This is usually the process for making any environment better: +```bash +# 1. Start the environment and interact with it directly (or give MCP server to an agent): +hud dev --build --interactive + +# 2. If the environment cannot start or fails inexplicably: +hud debug test_env:dev # Or your env name that appears when you run hud dev +# After fixing the error, go back to 1. + +# 3. When the environment is in a stable state: +hud build +hud push # Requires docker login + +# 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run: +hud rl +# This is a good test to see if your environment and tasks are high quality! +``` + +## Layout +``` +controller/ + __init__.py # mcp + shared HTTP client + __main__.py # python -m controller → mcp.run() + hooks.py # @mcp.initialize / @mcp.shutdown + tools.py # @mcp.tool act / setup / evaluate + +./environment + ├── __init__.py + └── server.py # FastAPI app: /health, /act, /reset, /state +``` + +## Publishing Your Environment + +Once your environment is ready, you can share it with the community: + +### 1. Push to Registry +```bash +# Build and push your environment (requires docker hub login and hud api key) +hud build +hud push +``` + +### 2. Create a Dataset + +Create a dataset on HuggingFace with your tasks: + +**Option A: Upload manually** +1. Upload your `tasks.json` to HuggingFace +2. 
Make sure it's **public** to appear on leaderboards + +**Option B: Use the SDK** +```python +from hud.datasets import save_tasks +import json + +# Load your tasks +with open("tasks.json") as f: + tasks = json.load(f) + +# Push to HuggingFace +save_tasks(tasks, repo_id="your-org/your-dataset") +``` + +### 3. Run and Track Performance + +```bash +# Run Claude on your benchmark +hud eval "your-org/your-dataset" --agent claude + +# View results at: +# hud.so/leaderboards/your-org/your-dataset +``` + +**Note**: Only public HuggingFace datasets appear as leaderboards! + +📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards) + diff --git a/environments/from_mcp_template/controller/README.md b/environments/from_mcp_template/controller/README.md new file mode 100644 index 00000000..411e1b9d --- /dev/null +++ b/environments/from_mcp_template/controller/README.md @@ -0,0 +1,16 @@ +# Controller + +Frontend for the agent: defines tools, minimal state, calls the environment over HTTP. + +What to implement +- Shared client in `__init__.py` (one `httpx.AsyncClient`) +- Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`) +- Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions + +Run +```bash +hud run controller --transport http --reload +# Helper endpoints: http://localhost:8765/hud and /hud/tools +``` + +Principle: the controller is UX, not state. Keep long‑lived state in the environment. 
diff --git a/environments/from_mcp_template/controller/__init__.py b/environments/from_mcp_template/controller/__init__.py new file mode 100644 index 00000000..58e7efdf --- /dev/null +++ b/environments/from_mcp_template/controller/__init__.py @@ -0,0 +1,27 @@ +"""Controller package - registers hooks and tools.""" + +import sys +import os +import httpx +import logging +from hud.server import MCPServer + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", + force=True, # Force all loggers to use stderr +) + +# Suppress httpx INFO logs to avoid cluttering MCP protocol +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.WARNING) # Only show warnings and errors +httpcore_logger = logging.getLogger("httpcore") +httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors + +mcp = MCPServer() + +# Import tools and hooks to register them with the server +from . import tools, hooks + +__all__ = ["mcp"] diff --git a/environments/from_mcp_template/controller/__main__.py b/environments/from_mcp_template/controller/__main__.py new file mode 100644 index 00000000..81f2ce81 --- /dev/null +++ b/environments/from_mcp_template/controller/__main__.py @@ -0,0 +1,4 @@ +from controller import mcp + +if __name__ == "__main__": + mcp.run() diff --git a/environments/from_mcp_template/controller/tools.py b/environments/from_mcp_template/controller/tools.py new file mode 100644 index 00000000..67d87f83 --- /dev/null +++ b/environments/from_mcp_template/controller/tools.py @@ -0,0 +1,4 @@ +"""Controller tools that call the environment API.""" + +from controller import mcp +from hud.tools.types import EvaluationResult diff --git a/environments/from_mcp_template/pyproject.toml b/environments/from_mcp_template/pyproject.toml new file mode 100644 index 00000000..4b3c6595 --- /dev/null +++ b/environments/from_mcp_template/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "test_test" 
+version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = [ "hud-python==0.4.41" ] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "test_test:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/environments/from_mcp_template/tasks.json b/environments/from_mcp_template/tasks.json new file mode 100644 index 00000000..2c39448c --- /dev/null +++ b/environments/from_mcp_template/tasks.json @@ -0,0 +1,13 @@ +[ + { + "prompt": "Do something in this strange new environment", + "mcp_config": { + "local": { + "url": "http://localhost:8765/mcp" + } + }, + "setup_tool": {}, + "integration_test_tool": {}, + "evaluate_tool": {} + } +] diff --git a/environments/from_mcp_template/test_env.ipynb b/environments/from_mcp_template/test_env.ipynb new file mode 100644 index 00000000..813d40ea --- /dev/null +++ b/environments/from_mcp_template/test_env.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure to `pip install hud-python[agents]` before running this notebook\n", + "\n", + "### Step 1: Create a Task\n", + "\n", + "A Task combines:\n", + "- **Prompt**: What we want an agent to accomplish\n", + "- **MCP Config**: How to spawn the environment\n", + "- **Setup Tool**: How to prepare the environment\n", + "- **Evaluate Tool**: How to check if the task succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hud.datasets import Task\n", + "from hud.types import MCPToolCall\n", + "\n", + "# Create a task that uses our test_test environment\n", + "# See tasks.json for how to build a loadable task dataset\n", + "task = Task(\n", + " prompt=\"Increment the counter to reach 10\",\n", + " mcp_config={\n", + " \"test_test\": {\"url\": 
\"http://localhost:8765/mcp\"},\n", + " },\n", + " setup_tool=MCPToolCall(name=\"setup\", arguments={}),\n", + " evaluate_tool=MCPToolCall(name=\"evaluate\", arguments={\"target\": 10}),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Initialize MCP Client\n", + "\n", + "Run `hud dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hud.clients import MCPClient\n", + "\n", + "# Create the client\n", + "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\n", + "\n", + "# Initialize it (this connects to our dev server)\n", + "await client.initialize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Run Setup\n", + "\n", + "Call the setup tool to prepare the environment according to the task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the setup from our task\n", + "setup_result = await client.call_tool(task.setup_tool) # type: ignore\n", + "print(f\"Setup result: {setup_result}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Perform Actions\n", + "\n", + "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Increment the counter 10 times\n", + "for i in range(10):\n", + " result = await client.call_tool(name=\"act\", arguments={})\n", + " print(f\"Step {i + 1}: {result.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Evaluate Success\n", + "\n", + "Check if we completed the task according to the evaluation criteria." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the evaluation from our task\n", + "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\n", + "\n", + "# The result is a list with one TextContent item containing JSON\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: Cleanup\n", + "\n", + "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus: Running with an AI Agent\n", + "\n", + "Instead of manually calling tools, you can have an AI agent solve the task automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\n", + "from hud.agents import ClaudeAgent\n", + "\n", + "# Create an agent\n", + "agent = ClaudeAgent(\n", + " model=\"claude-sonnet-4-20250514\",\n", + " allowed_tools=[\"act\"], # Only allow the act tool\n", + ")\n", + "\n", + "# Run the task\n", + "result = await agent.run(task)\n", + "print(f\"Final reward: {result.reward}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Next Steps\n", + "\n", + "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\n", + "2. **Build complex environments**: Replace the simple counter with your actual application\n", + "3. 
**Test with agents**: Use different AI models to solve your tasks\n", + "\n", + "For more examples, check out:\n", + "- `environments/text_2048/` - A complete 2048 game environment\n", + "- `environments/browser/` - A full browser automation environment with GUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/environments/from_mcp_template/test_task.py b/environments/from_mcp_template/test_task.py new file mode 100644 index 00000000..28f7d083 --- /dev/null +++ b/environments/from_mcp_template/test_task.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +"""Simple example of running tasks from tasks.json. 
async def run_task(task_data: dict):
    """Run one task end to end against the local dev MCP server.

    Builds a ``Task`` from ``task_data``, connects an ``MCPClient`` using the
    task's ``mcp_config``, calls the task's setup tool, invokes the ``act``
    tool ten times, and finally calls the task's evaluate tool.

    Args:
        task_data: One entry of ``tasks.json``; passed as keyword arguments
            to ``Task``.

    Returns:
        The ``content`` of the evaluation tool result, or ``None`` when the
        run was aborted because the server could not be reached.

    Raises:
        Exception: Any error whose message does not mention "connection" is
            re-raised unchanged.
    """
    task = Task(**task_data)
    client = MCPClient(mcp_config=task.mcp_config)

    try:
        print("Initializing client...")
        await client.initialize()

        result = await client.call_tool(task.setup_tool)  # type: ignore
        print(f"✅ Setup: {result.content}")

        print("\n🔄 Performing actions:")
        for _ in range(10):
            result = await client.call_tool(name="act", arguments={})
            print(f" {result.content}")

        result = await client.call_tool(task.evaluate_tool)  # type: ignore
        print(f"\n📊 Evaluation: {result.content}")

        return result.content
    except Exception as e:
        # Only connection problems are turned into a friendly hint; anything
        # else propagates with its original traceback (bare ``raise``).
        if "connection" not in str(e).lower():
            raise
        print(
            "❌ Could not connect. Make sure 'hud dev --build' is running in another terminal."
        )
        return None
    finally:
        # Always release the client, whether the run succeeded or failed.
        await client.shutdown()


async def main():
    """Run every task listed in ``tasks.json`` (current directory), in order."""
    # Use a context manager so the file handle is closed deterministically,
    # and pin the encoding so the file reads the same on every platform.
    with open("tasks.json", encoding="utf-8") as f:
        tasks = json.load(f)

    for task_data in tasks:
        await run_task(task_data)


if __name__ == "__main__":
    asyncio.run(main())
# JSON-Schema primitive type -> Python annotation used in generated stubs.
_JSON_SCHEMA_TYPE_MAP = {
    "string": "str",
    "number": "float",
    "integer": "int",
    "boolean": "bool",
    "array": "list",
    "object": "dict",
}


def _safe_identifier(name: str) -> str:
    """Turn an arbitrary tool/parameter name into a valid Python identifier."""
    import keyword  # local import: only needed while generating stubs

    ident = "".join(
        ch if ch.isascii() and (ch.isalnum() or ch == "_") else "_" for ch in str(name)
    )
    if not ident or ident[0].isdigit():
        ident = f"_{ident}"
    if keyword.iskeyword(ident):
        ident = f"{ident}_"
    return ident


def _python_annotation(prop_info: Any) -> str:
    """Map one JSON-Schema property definition to a Python annotation string."""
    json_type = prop_info.get("type") if isinstance(prop_info, dict) else None
    if isinstance(json_type, list):
        # JSON Schema allows e.g. ["string", "null"]; use the single non-null
        # member when there is exactly one, otherwise fall back to Any.
        concrete = [t for t in json_type if t != "null"]
        json_type = concrete[0] if len(concrete) == 1 else None
    if not isinstance(json_type, str):
        return "Any"
    return _JSON_SCHEMA_TYPE_MAP.get(json_type, "Any")


def _docstring_literal(text: str) -> str:
    """Render ``text`` as a triple-double-quoted literal that evaluates to ``text``."""
    # A trailing quote would merge with the closing delimiter, so it is split
    # off and escaped separately from the rest of the text.
    body, tail = (text[:-1], '\\"') if text.endswith('"') else (text, "")
    escaped = body.replace("\\", "\\\\").replace('"""', '\\"\\"\\"')
    return f'"""{escaped}{tail}"""'


def _generate_tool_stubs(tools_file: Path, tools: list[Any]) -> None:
    """Append one ``@mcp.tool`` stub per MCP tool schema to ``tools_file``.

    Each stub is an ``async def`` that raises ``NotImplementedError``. Required
    parameters are emitted before optional ones so the generated signature is
    valid Python. Tool and parameter names that are not valid identifiers are
    sanitized; when a tool's function name had to change, the original name is
    kept via ``@mcp.tool(name=...)``. ``from typing import Any`` is added when a
    stub needs it and the file does not already contain that import.

    Args:
        tools_file: Path to the controller ``tools.py`` file (must exist).
        tools: Tool objects exposing ``name`` and, optionally, ``description``
            and ``inputSchema`` attributes. An empty list leaves the file
            untouched.
    """
    if not tools:
        return

    content = tools_file.read_text(encoding="utf-8")

    stubs: list[str] = []
    uses_any = False
    for tool in tools:
        schema = getattr(tool, "inputSchema", None) or {}
        properties = schema.get("properties") or {}
        required = set(schema.get("required") or [])

        required_params: list[str] = []
        optional_params: list[str] = []
        for prop_name, prop_info in properties.items():
            annotation = _python_annotation(prop_info)
            uses_any = uses_any or annotation == "Any"
            param = f"{_safe_identifier(prop_name)}: {annotation}"
            if prop_name in required:
                required_params.append(param)
            else:
                optional_params.append(f"{param} | None = None")
        # Defaults must come last or the generated ``def`` is a SyntaxError.
        params_str = ", ".join(required_params + optional_params)

        func_name = _safe_identifier(tool.name)
        # Keep the tool's published name when the function name had to change.
        decorator = "@mcp.tool" if func_name == tool.name else f"@mcp.tool(name={tool.name!r})"
        description = getattr(tool, "description", None) or f"TODO: Describe {tool.name}."

        stubs.append(
            f"{decorator}\n"
            f"async def {func_name}({params_str}) -> str:\n"
            f"    {_docstring_literal(description)}\n"
            f'    raise NotImplementedError("TODO: Implement {func_name}")'
        )

    sections = [content.rstrip()] if content.strip() else []
    if uses_any and "from typing import Any" not in content:
        sections.append("from typing import Any")
    sections.extend(stubs)

    tools_file.write_text("\n\n\n".join(sections) + "\n", encoding="utf-8")


async def analyze_external_mcp_server(url: str) -> list[Any]:
    """Fetch raw tool schemas from an external MCP server.

    Args:
        url: MCP server URL (e.g., https://mcp.deepwiki.com/sse)

    Returns:
        List of raw tool objects as returned by ``MCPClient.list_tools()``.
    """
    import logging

    from hud.clients import MCPClient

    client = MCPClient(mcp_config={"external": {"url": url}}, auto_trace=False)

    try:
        await client.initialize()
        return await client.list_tools()
    finally:
        try:
            await client.shutdown()
        except Exception:
            # Shutdown is best-effort: a cleanup failure must not mask the
            # result (or the original error), but it should leave a trace.
            logging.getLogger(__name__).debug(
                "Ignoring error while shutting down MCP client", exc_info=True
            )
browser)" + ) + preset_normalized = "blank" + env_folder = PRESET_MAP[preset_normalized] + branch = GITHUB_BRANCH # Check if directory exists if target_dir.exists() and any(target_dir.iterdir()): @@ -204,9 +296,8 @@ def create_environment( else: hud_console.warning(f"Overwriting existing files in {target_dir}") - # Download preset from GitHub - env_folder = PRESET_MAP[preset_normalized] - if env_folder is None: + # Validate env_folder (already set above based on from_mcp flag) + if not from_mcp and env_folder is None: hud_console.error("Internal error: preset mapping missing folder name") raise typer.Exit(1) @@ -214,7 +305,7 @@ def create_environment( hud_console.section_title("Downloading template from public SDK") source_url = ( f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}/tree/" - f"{GITHUB_BRANCH}/environments/{env_folder}" + f"{branch}/environments/{env_folder}" ) hud_console.info("Source: " + source_url) @@ -223,10 +314,11 @@ def create_environment( started = time.time() files_created_dl: list[str] = [] try: + assert env_folder is not None # Already validated above _download_tarball_subdir( owner=GITHUB_OWNER, repo=GITHUB_REPO, - ref=GITHUB_BRANCH, + ref=branch, subdir=env_folder, dest_dir=target_dir, files_created=files_created_dl, @@ -268,3 +360,22 @@ def create_environment( hud_console.info("\n3. Review the README in this preset for specific instructions.") hud_console.info("\n4. 
Customize as needed.") + + # Analyze external MCP server if URL provided + if from_mcp is not None: + import asyncio + hud_console.section_title("Fetching tools from MCP server") + try: + tools = asyncio.run(analyze_external_mcp_server(from_mcp)) + hud_console.success(f"Found {len(tools)} tools from {from_mcp}") + + # Generate tool stubs and write to tools.py + tools_file = target_dir / "controller" / "tools.py" + if tools_file.exists(): + hud_console.info(f"Generating tool stubs in {tools_file.relative_to(target_dir)}") + _generate_tool_stubs(tools_file, tools) + hud_console.success(f"Generated {len(tools)} tool stubs") + else: + hud_console.warning(f"tools.py not found at {tools_file}") + except Exception as e: + hud_console.warning(f"Could not fetch tools: {e}") \ No newline at end of file