11 changes: 6 additions & 5 deletions examples/run_evaluation.py
@@ -44,6 +44,7 @@
from datasets import load_dataset
from hud.agents import ClaudeAgent, OperatorAgent
from hud.datasets import Task, run_dataset
from hud.types import AgentType

logger = logging.getLogger(__name__)

@@ -58,14 +59,14 @@


def _build_agent(
agent_type: Literal["claude", "openai"],
agent_type: Literal[AgentType.CLAUDE, AgentType.OPENAI],
*,
model: str | None = None,
allowed_tools: list[str] | None = None,
) -> ClaudeAgent | OperatorAgent:
"""Create and return the requested agent type."""

if agent_type == "openai":
if agent_type == AgentType.OPENAI:
# Only pass allowed_tools if explicitly provided
# This allows tasks to specify their own via agent_config
if allowed_tools:
@@ -105,7 +106,7 @@ def _build_agent(
async def run_single_task(
dataset_name: str,
*,
agent_type: Literal["claude", "openai"] = "claude",
agent_type: Literal[AgentType.CLAUDE, AgentType.OPENAI] = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_steps: int = 10,
@@ -144,7 +145,7 @@ async def run_single_task(
async def run_full_dataset(
dataset_name: str,
*,
agent_type: Literal["claude", "openai"] = "claude",
agent_type: Literal[AgentType.CLAUDE, AgentType.OPENAI] = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_concurrent: int = 50,
@@ -154,7 +155,7 @@ async def run_full_dataset(

# Build agent class + config for run_dataset – we pass the *class* and a minimal
# config dict, run_dataset will create a fresh agent per task.
if agent_type == "openai":
if agent_type == AgentType.OPENAI:
agent_class = OperatorAgent
agent_config: dict[str, Any] = {
"validate_api_key": False,
23 changes: 14 additions & 9 deletions hud/cli/__init__.py
@@ -12,6 +12,8 @@
from rich.panel import Panel
from rich.table import Table

from hud.types import AgentType

from . import list_func as list_module
from .analyze import (
analyze_environment,
@@ -844,7 +846,7 @@ def eval(
hud_console = HUDConsole()

if integration_test:
agent = "integration_test"
agent = AgentType.INTEGRATION_TEST

# If no source provided, reuse RL helper to find a tasks file interactively
if source is None:
@@ -891,17 +893,17 @@ def eval(
# Add standard agent choices
choices.extend(
[
{"name": "Claude 4 Sonnet", "value": "claude"},
{"name": "OpenAI Computer Use", "value": "openai"},
{"name": "vLLM (Local Server)", "value": "vllm"},
{"name": "LiteLLM (Multi-provider)", "value": "litellm"},
{"name": "Claude 4 Sonnet", "value": AgentType.CLAUDE},
{"name": "OpenAI Computer Use", "value": AgentType.OPENAI},
{"name": "vLLM (Local Server)", "value": AgentType.VLLM},
{"name": "LiteLLM (Multi-provider)", "value": AgentType.LITELLM},
]
)

agent = hud_console.select("Select an agent to use:", choices=choices, default=0)

# Handle HUD model selection
if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
if agent and agent not in [e.value for e in AgentType]:
# Find remote model name
model = agent
if not vllm_base_url:
@@ -918,20 +920,23 @@ def eval(
hud_console.error(f"Model {model} not found")
raise typer.Exit(1)
model = base_model
agent = "vllm" # Use vLLM backend for HUD models
agent = AgentType.VLLM # Use vLLM backend for HUD models
hud_console.info(f"Using HUD model: {model} (trained on {base_model})")

# Validate agent choice
valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
valid_agents = [e.value for e in AgentType]
if agent not in valid_agents:
hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
raise typer.Exit(1)

# Type narrowing: agent is now guaranteed to be an AgentType value after validation
agent = AgentType(agent)
Contributor:
Hmm, do we need this?

Author:
Ideally no, but hud_console.select doesn't yet have a generic return type based on the values of the choices, so by default agent here is str | Literal[AgentType.VLLM].


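To illustrate the typing point discussed above, here is a minimal, self-contained sketch. The select stub below is a hypothetical stand-in, not the real hud_console.select signature; it just shows why the checker sees a plain string and why the explicit AgentType(...) conversion narrows it.

```python
from enum import Enum


class AgentType(str, Enum):
    CLAUDE = "claude"
    VLLM = "vllm"


def select(prompt: str, choices: list[dict]) -> str:
    # Hypothetical stand-in: without a generic return type tied to the choice
    # values, a type checker can only infer a plain string here.
    return choices[0]["value"]


agent = select("Select an agent to use:", [{"name": "Claude", "value": AgentType.CLAUDE}])
# `agent` is typed as `str` at this point; converting narrows it explicitly.
agent = AgentType(agent)
assert agent is AgentType.CLAUDE
```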
# Run the command
eval_command(
source=source,
full=full,
agent=agent, # type: ignore
agent=agent,
model=model,
allowed_tools=allowed_tools,
max_concurrent=max_concurrent,
47 changes: 24 additions & 23 deletions hud/cli/eval.py
@@ -5,13 +5,14 @@
import asyncio
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

import typer

import hud
from hud.cli.utils.env_check import ensure_built, find_environment_dir
from hud.settings import settings
from hud.types import AgentType
from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
from hud.utils.hud_console import HUDConsole

@@ -113,7 +114,7 @@ def _build_vllm_config(


def build_agent(
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
agent_type: AgentType,
*,
model: str | None = None,
allowed_tools: list[str] | None = None,
@@ -123,11 +124,11 @@ def build_agent(
"""Create and return the requested agent type."""

# Import agents lazily to avoid dependency issues
if agent_type == "integration_test":
if agent_type == AgentType.INTEGRATION_TEST:
from hud.agents.misc.integration_test_agent import IntegrationTestRunner

return IntegrationTestRunner(verbose=verbose)
elif agent_type == "vllm":
elif agent_type == AgentType.VLLM:
# Create a generic OpenAI agent for vLLM server
try:
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
@@ -147,7 +148,7 @@ def build_agent(
)
return GenericOpenAIChatAgent(**config)

elif agent_type == "openai":
elif agent_type == AgentType.OPENAI:
try:
from hud.agents import OperatorAgent
except ImportError as e:
@@ -165,7 +166,7 @@ def build_agent(
else:
return OperatorAgent(verbose=verbose)

elif agent_type == "litellm":
elif agent_type == AgentType.LITELLM:
try:
from hud.agents.lite_llm import LiteAgent
except ImportError as e:
@@ -209,7 +210,7 @@ def build_agent(
async def run_single_task(
source: str,
*,
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
agent_type: AgentType = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_steps: int = 10,
@@ -268,14 +269,14 @@ async def run_single_task(

# Use grouped evaluation if group_size > 1
agent_config: dict[str, Any] = {}
if agent_type == "integration_test":
if agent_type == AgentType.INTEGRATION_TEST:
from hud.agents.misc.integration_test_agent import IntegrationTestRunner

agent_class = IntegrationTestRunner
agent_config = {"verbose": verbose}
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools
elif agent_type == "vllm":
elif agent_type == AgentType.VLLM:
# Special handling for vLLM
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent

@@ -288,14 +289,14 @@ async def run_single_task(
allowed_tools=allowed_tools,
verbose=verbose,
)
elif agent_type == "openai":
elif agent_type == AgentType.OPENAI:
from hud.agents import OperatorAgent

agent_class = OperatorAgent
agent_config = {"verbose": verbose}
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools
elif agent_type == "litellm":
elif agent_type == AgentType.LITELLM:
from hud.agents.lite_llm import LiteAgent

agent_class = LiteAgent
@@ -305,7 +306,7 @@ async def run_single_task(
}
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools
elif agent_type == "claude":
elif agent_type == AgentType.CLAUDE:
from hud.agents import ClaudeAgent

agent_class = ClaudeAgent
@@ -353,7 +354,7 @@ async def run_single_task(
async def run_full_dataset(
source: str,
*,
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
agent_type: AgentType = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_concurrent: int = 30,
@@ -395,12 +396,12 @@ async def run_full_dataset(

# Build agent class + config for run_dataset
agent_config: dict[str, Any]
if agent_type == "integration_test": # --integration-test mode
if agent_type == AgentType.INTEGRATION_TEST: # --integration-test mode
from hud.agents.misc.integration_test_agent import IntegrationTestRunner

agent_class = IntegrationTestRunner
agent_config = {"verbose": verbose}
elif agent_type == "vllm":
elif agent_type == AgentType.VLLM:
try:
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent

@@ -419,7 +420,7 @@ async def run_full_dataset(
allowed_tools=allowed_tools,
verbose=verbose,
)
elif agent_type == "openai":
elif agent_type == AgentType.OPENAI:
try:
from hud.agents import OperatorAgent

@@ -435,7 +436,7 @@ async def run_full_dataset(
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools

elif agent_type == "litellm":
elif agent_type == AgentType.LITELLM:
try:
from hud.agents.lite_llm import LiteAgent

@@ -539,8 +540,8 @@ def eval_command(
"--full",
help="Run the entire dataset (omit for single-task debug mode)",
),
agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
"claude",
agent: AgentType = typer.Option( # noqa: B008
AgentType.CLAUDE,
"--agent",
help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
),
@@ -648,21 +649,21 @@ def eval_command(

# We pass integration_test as the agent_type
if integration_test:
agent = "integration_test"
agent = AgentType.INTEGRATION_TEST

# Check for required API keys
if agent == "claude":
if agent == AgentType.CLAUDE:
if not settings.anthropic_api_key:
hud_console.error("ANTHROPIC_API_KEY is required for Claude agent")
hud_console.info(
"Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
)
raise typer.Exit(1)
elif agent == "openai" and not settings.openai_api_key:
elif agent == AgentType.OPENAI and not settings.openai_api_key:
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
raise typer.Exit(1)
elif agent == "vllm":
elif agent == AgentType.VLLM:
if model:
hud_console.info(f"Using vLLM with model: {model}")
else:
12 changes: 6 additions & 6 deletions hud/cli/tests/test_eval.py
@@ -11,7 +11,7 @@
build_agent,
run_single_task,
)
from hud.types import Task, Trace
from hud.types import AgentType, Task, Trace


class TestBuildAgent:
@@ -26,7 +26,7 @@ def test_builds_integration_test_agent(self) -> None:
mock_runner.return_value = mock_instance

# Test with verbose=False
result = build_agent("integration_test", verbose=False)
result = build_agent(AgentType.INTEGRATION_TEST, verbose=False)

mock_runner.assert_called_once_with(verbose=False)
assert result == mock_instance
@@ -40,7 +40,7 @@ def test_builds_claude_agent(self) -> None:
mock_runner.return_value = mock_instance

# Test with verbose=False
result = build_agent("claude", verbose=False)
result = build_agent(AgentType.CLAUDE, verbose=False)

mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
assert result == mock_instance
@@ -55,7 +55,7 @@ def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:

# Test with verbose=False
result = build_agent(
"claude",
AgentType.CLAUDE,
model="claude-sonnet-4-20250514",
allowed_tools=["act"],
verbose=True,
@@ -97,7 +97,7 @@ async def test_applies_agent_config_from_task(self) -> None:
patch("hud.cli.eval.find_environment_dir", return_value=None),
patch("hud.cli.eval.hud.trace"),
):
await run_single_task("test.json", agent_type="integration_test", max_steps=10)
await run_single_task("test.json", agent_type=AgentType.INTEGRATION_TEST, max_steps=10)

# Verify agent.run was called with the task containing agent_config
mock_agent.run.assert_called_once()
@@ -119,7 +119,7 @@ async def test_runs_with_group_size_greater_than_one(self) -> None:
mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]

await run_single_task(
"test.json", agent_type="integration_test", group_size=3, max_steps=10
"test.json", agent_type=AgentType.INTEGRATION_TEST, group_size=3, max_steps=10
)

# Verify run_tasks_grouped was called with correct group_size
10 changes: 10 additions & 0 deletions hud/types.py
@@ -5,6 +5,7 @@
import logging
import uuid
from collections import defaultdict
from enum import Enum
from string import Template
from typing import Any, Literal

@@ -18,6 +19,14 @@
logger = logging.getLogger(__name__)


class AgentType(str, Enum):
CLAUDE = "claude"
OPENAI = "openai"
VLLM = "vllm"
LITELLM = "litellm"
INTEGRATION_TEST = "integration_test"


class Task(BaseModel):
"""
A task configuration that can be used to create a task.
@@ -319,6 +328,7 @@ def populate_from_context(self) -> None:

__all__ = [
"AgentResponse",
"AgentType",
"MCPToolCall",
"MCPToolResult",
"Trace",
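Since the new AgentType subclasses str, existing string comparisons and value-based lookups from raw CLI input keep working. A quick sketch of the assumed behavior (standard for str-backed enums, using the hud.types module added in this diff):

```python
from hud.types import AgentType

assert AgentType.CLAUDE == "claude"         # str-backed members compare equal to their raw values
assert AgentType("vllm") is AgentType.VLLM  # round-trips from a raw CLI string
print([e.value for e in AgentType])
# ['claude', 'openai', 'vllm', 'litellm', 'integration_test']
```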