11 changes: 6 additions & 5 deletions examples/run_evaluation.py
@@ -44,6 +44,7 @@
from datasets import load_dataset
from hud.agents import ClaudeAgent, OperatorAgent
from hud.datasets import Task, run_dataset
from hud.types import AgentType

logger = logging.getLogger(__name__)

@@ -58,14 +59,14 @@


def _build_agent(
agent_type: Literal["claude", "openai"],
agent_type: Literal[AgentType.CLAUDE, AgentType.OPENAI],
*,
model: str | None = None,
allowed_tools: list[str] | None = None,
) -> ClaudeAgent | OperatorAgent:
"""Create and return the requested agent type."""

if agent_type == "openai":
if agent_type == AgentType.OPENAI:
# Only pass allowed_tools if explicitly provided
# This allows tasks to specify their own via agent_config
if allowed_tools:
@@ -105,7 +106,7 @@ def _build_agent(
async def run_single_task(
dataset_name: str,
*,
agent_type: Literal["claude", "openai"] = "claude",
agent_type: Literal[AgentType.CLAUDE, AgentType.OPENAI] = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_steps: int = 10,
@@ -144,7 +145,7 @@ async def run_single_task(
async def run_full_dataset(
dataset_name: str,
*,
agent_type: Literal["claude", "openai"] = "claude",
agent_type: Literal[AgentType.CLAUDE, AgentType.OPENAI] = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_concurrent: int = 50,
@@ -154,7 +155,7 @@ async def run_full_dataset(

# Build agent class + config for run_dataset – we pass the *class* and a minimal
# config dict, run_dataset will create a fresh agent per task.
if agent_type == "openai":
if agent_type == AgentType.OPENAI:
agent_class = OperatorAgent
agent_config: dict[str, Any] = {
"validate_api_key": False,
23 changes: 14 additions & 9 deletions hud/cli/__init__.py
@@ -12,6 +12,8 @@
from rich.panel import Panel
from rich.table import Table

from hud.types import AgentType

from . import list_func as list_module
from .analyze import (
analyze_environment,
@@ -844,7 +846,7 @@ def eval(
hud_console = HUDConsole()

if integration_test:
agent = "integration_test"
agent = AgentType.INTEGRATION_TEST

# If no source provided, reuse RL helper to find a tasks file interactively
if source is None:
@@ -891,17 +893,17 @@ def eval(
# Add standard agent choices
choices.extend(
[
{"name": "Claude 4 Sonnet", "value": "claude"},
{"name": "OpenAI Computer Use", "value": "openai"},
{"name": "vLLM (Local Server)", "value": "vllm"},
{"name": "LiteLLM (Multi-provider)", "value": "litellm"},
{"name": "Claude 4 Sonnet", "value": AgentType.CLAUDE},
{"name": "OpenAI Computer Use", "value": AgentType.OPENAI},
{"name": "vLLM (Local Server)", "value": AgentType.VLLM},
{"name": "LiteLLM (Multi-provider)", "value": AgentType.LITELLM},
]
)

agent = hud_console.select("Select an agent to use:", choices=choices, default=0)

# Handle HUD model selection
if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
if agent and agent not in [e.value for e in AgentType]:
# Find remote model name
model = agent
if not vllm_base_url:
@@ -918,20 +920,23 @@ def eval(
hud_console.error(f"Model {model} not found")
raise typer.Exit(1)
model = base_model
agent = "vllm" # Use vLLM backend for HUD models
agent = AgentType.VLLM # Use vLLM backend for HUD models
hud_console.info(f"Using HUD model: {model} (trained on {base_model})")

# Validate agent choice
valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
valid_agents = [e.value for e in AgentType]
if agent not in valid_agents:
hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
raise typer.Exit(1)

# Type narrowing: agent is now guaranteed to be an AgentType value after validation
agent = AgentType(agent)
Contributor:
Hmm, do we need this?

Author:
Ideally no, but hud_console.select doesn't yet have a generic return type based on the values of the choices, so by default agent here is str | Literal[AgentType.VLLM].


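To illustrate the typing point discussed above, here is a minimal, self-contained sketch. The select stub below is a hypothetical stand-in, not the real hud_console.select signature; it just shows why the checker sees a plain string and why the explicit AgentType(...) conversion narrows it.

```python
from enum import Enum


class AgentType(str, Enum):
    CLAUDE = "claude"
    VLLM = "vllm"


def select(prompt: str, choices: list[dict]) -> str:
    # Hypothetical stand-in: without a generic return type tied to the choice
    # values, a type checker can only infer a plain string here.
    return choices[0]["value"]


agent = select("Select an agent to use:", [{"name": "Claude", "value": AgentType.CLAUDE}])
# `agent` is typed as `str` at this point; converting narrows it explicitly.
agent = AgentType(agent)
assert agent is AgentType.CLAUDE
```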
# Run the command
eval_command(
source=source,
full=full,
agent=agent, # type: ignore
agent=agent,
model=model,
allowed_tools=allowed_tools,
max_concurrent=max_concurrent,
47 changes: 24 additions & 23 deletions hud/cli/eval.py
@@ -5,13 +5,14 @@
import asyncio
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

import typer

import hud
from hud.cli.utils.env_check import ensure_built, find_environment_dir
from hud.settings import settings
from hud.types import AgentType
from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
from hud.utils.hud_console import HUDConsole

@@ -113,7 +114,7 @@ def _build_vllm_config(


def build_agent(
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
agent_type: AgentType,
*,
model: str | None = None,
allowed_tools: list[str] | None = None,
@@ -123,11 +124,11 @@ def build_agent(
"""Create and return the requested agent type."""

# Import agents lazily to avoid dependency issues
if agent_type == "integration_test":
if agent_type == AgentType.INTEGRATION_TEST:
from hud.agents.misc.integration_test_agent import IntegrationTestRunner

return IntegrationTestRunner(verbose=verbose)
elif agent_type == "vllm":
elif agent_type == AgentType.VLLM:
# Create a generic OpenAI agent for vLLM server
try:
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
@@ -147,7 +148,7 @@ def build_agent(
)
return GenericOpenAIChatAgent(**config)

elif agent_type == "openai":
elif agent_type == AgentType.OPENAI:
try:
from hud.agents import OperatorAgent
except ImportError as e:
@@ -165,7 +166,7 @@ def build_agent(
else:
return OperatorAgent(verbose=verbose)

elif agent_type == "litellm":
elif agent_type == AgentType.LITELLM:
try:
from hud.agents.lite_llm import LiteAgent
except ImportError as e:
@@ -209,7 +210,7 @@ def build_agent(
async def run_single_task(
source: str,
*,
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
agent_type: AgentType = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_steps: int = 10,
@@ -268,14 +269,14 @@ async def run_single_task(

# Use grouped evaluation if group_size > 1
agent_config: dict[str, Any] = {}
if agent_type == "integration_test":
if agent_type == AgentType.INTEGRATION_TEST:
from hud.agents.misc.integration_test_agent import IntegrationTestRunner

agent_class = IntegrationTestRunner
agent_config = {"verbose": verbose}
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools
elif agent_type == "vllm":
elif agent_type == AgentType.VLLM:
# Special handling for vLLM
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent

@@ -288,14 +289,14 @@ async def run_single_task(
allowed_tools=allowed_tools,
verbose=verbose,
)
elif agent_type == "openai":
elif agent_type == AgentType.OPENAI:
from hud.agents import OperatorAgent

agent_class = OperatorAgent
agent_config = {"verbose": verbose}
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools
elif agent_type == "litellm":
elif agent_type == AgentType.LITELLM:
from hud.agents.lite_llm import LiteAgent

agent_class = LiteAgent
@@ -305,7 +306,7 @@ async def run_single_task(
}
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools
elif agent_type == "claude":
elif agent_type == AgentType.CLAUDE:
from hud.agents import ClaudeAgent

agent_class = ClaudeAgent
@@ -353,7 +354,7 @@ async def run_single_task(
async def run_full_dataset(
source: str,
*,
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
agent_type: AgentType = AgentType.CLAUDE,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_concurrent: int = 30,
@@ -395,12 +396,12 @@ async def run_full_dataset(

# Build agent class + config for run_dataset
agent_config: dict[str, Any]
if agent_type == "integration_test": # --integration-test mode
if agent_type == AgentType.INTEGRATION_TEST: # --integration-test mode
from hud.agents.misc.integration_test_agent import IntegrationTestRunner

agent_class = IntegrationTestRunner
agent_config = {"verbose": verbose}
elif agent_type == "vllm":
elif agent_type == AgentType.VLLM:
try:
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent

@@ -419,7 +420,7 @@ async def run_full_dataset(
allowed_tools=allowed_tools,
verbose=verbose,
)
elif agent_type == "openai":
elif agent_type == AgentType.OPENAI:
try:
from hud.agents import OperatorAgent

@@ -435,7 +436,7 @@ async def run_full_dataset(
if allowed_tools:
agent_config["allowed_tools"] = allowed_tools

elif agent_type == "litellm":
elif agent_type == AgentType.LITELLM:
try:
from hud.agents.lite_llm import LiteAgent

@@ -539,8 +540,8 @@ def eval_command(
"--full",
help="Run the entire dataset (omit for single-task debug mode)",
),
agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
"claude",
agent: AgentType = typer.Option( # noqa: B008
AgentType.CLAUDE,
"--agent",
help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
),
@@ -648,21 +649,21 @@ def eval_command(

# We pass integration_test as the agent_type
if integration_test:
agent = "integration_test"
agent = AgentType.INTEGRATION_TEST

# Check for required API keys
if agent == "claude":
if agent == AgentType.CLAUDE:
if not settings.anthropic_api_key:
hud_console.error("ANTHROPIC_API_KEY is required for Claude agent")
hud_console.info(
"Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
)
raise typer.Exit(1)
elif agent == "openai" and not settings.openai_api_key:
elif agent == AgentType.OPENAI and not settings.openai_api_key:
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
raise typer.Exit(1)
elif agent == "vllm":
elif agent == AgentType.VLLM:
if model:
hud_console.info(f"Using vLLM with model: {model}")
else:
12 changes: 6 additions & 6 deletions hud/cli/tests/test_eval.py
@@ -11,7 +11,7 @@
build_agent,
run_single_task,
)
from hud.types import Task, Trace
from hud.types import AgentType, Task, Trace


class TestBuildAgent:
@@ -26,7 +26,7 @@ def test_builds_integration_test_agent(self) -> None:
mock_runner.return_value = mock_instance

# Test with verbose=False
result = build_agent("integration_test", verbose=False)
result = build_agent(AgentType.INTEGRATION_TEST, verbose=False)

mock_runner.assert_called_once_with(verbose=False)
assert result == mock_instance
@@ -40,7 +40,7 @@ def test_builds_claude_agent(self) -> None:
mock_runner.return_value = mock_instance

# Test with verbose=False
result = build_agent("claude", verbose=False)
result = build_agent(AgentType.CLAUDE, verbose=False)

mock_runner.assert_called_once_with(model="claude-sonnet-4-20250514", verbose=False)
assert result == mock_instance
@@ -55,7 +55,7 @@ def test_builds_claude_agent_with_custom_model_and_allowed_tools(self) -> None:

# Test with verbose=False
result = build_agent(
"claude",
AgentType.CLAUDE,
model="claude-sonnet-4-20250514",
allowed_tools=["act"],
verbose=True,
@@ -97,7 +97,7 @@ async def test_applies_agent_config_from_task(self) -> None:
patch("hud.cli.eval.find_environment_dir", return_value=None),
patch("hud.cli.eval.hud.trace"),
):
await run_single_task("test.json", agent_type="integration_test", max_steps=10)
await run_single_task("test.json", agent_type=AgentType.INTEGRATION_TEST, max_steps=10)

# Verify agent.run was called with the task containing agent_config
mock_agent.run.assert_called_once()
@@ -119,7 +119,7 @@ async def test_runs_with_group_size_greater_than_one(self) -> None:
mock_grouped.return_value = [{"task": mock_task, "rewards": [1.0, 0.5]}]

await run_single_task(
"test.json", agent_type="integration_test", group_size=3, max_steps=10
"test.json", agent_type=AgentType.INTEGRATION_TEST, group_size=3, max_steps=10
)

# Verify run_tasks_grouped was called with correct group_size
10 changes: 10 additions & 0 deletions hud/types.py
@@ -5,6 +5,7 @@
import logging
import uuid
from collections import defaultdict
from enum import Enum
from string import Template
from typing import Any, Literal

@@ -18,6 +19,14 @@
logger = logging.getLogger(__name__)


class AgentType(str, Enum):
CLAUDE = "claude"
OPENAI = "openai"
VLLM = "vllm"
LITELLM = "litellm"
INTEGRATION_TEST = "integration_test"


class Task(BaseModel):
"""
A task configuration that can be used to create a task.
@@ -319,6 +328,7 @@ def populate_from_context(self) -> None:

__all__ = [
"AgentResponse",
"AgentType",
"MCPToolCall",
"MCPToolResult",
"Trace",
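Since the new AgentType subclasses str, existing string comparisons and value-based lookups from raw CLI input keep working. A quick sketch of the assumed behavior (standard for str-backed enums, using the hud.types module added in this diff):

```python
from hud.types import AgentType

assert AgentType.CLAUDE == "claude"         # str-backed members compare equal to their raw values
assert AgentType("vllm") is AgentType.VLLM  # round-trips from a raw CLI string
print([e.value for e in AgentType])
# ['claude', 'openai', 'vllm', 'litellm', 'integration_test']
```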