From 5a08f83858743e77adbbe07ca83dcc978f1c74b8 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Thu, 9 Oct 2025 22:22:04 +0200 Subject: [PATCH 1/4] WIP --- examples/04_openrouter_quickstart.py | 47 +++ examples/mcp_sum_server.py | 23 ++ hud/agents/__init__.py | 2 + hud/agents/openrouter.py | 592 +++++++++++++++++++++++++++ hud/agents/tests/test_openrouter.py | 205 ++++++++++ hud/cli/__init__.py | 8 +- hud/cli/eval.py | 38 +- hud/utils/agent_factories.py | 16 + 8 files changed, 923 insertions(+), 8 deletions(-) create mode 100644 examples/04_openrouter_quickstart.py create mode 100644 examples/mcp_sum_server.py create mode 100644 hud/agents/openrouter.py create mode 100644 hud/agents/tests/test_openrouter.py diff --git a/examples/04_openrouter_quickstart.py b/examples/04_openrouter_quickstart.py new file mode 100644 index 00000000..2ac56044 --- /dev/null +++ b/examples/04_openrouter_quickstart.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import asyncio +from pathlib import Path + +from hud.agents.openrouter import OpenRouterAgent +from hud.utils.hud_console import HUDConsole + + +async def main() -> None: + hud_console = HUDConsole() + + # Inline FastMCP sum task (no external JSON needed) + server_path = Path(__file__).parent / "mcp_sum_server.py" + task = { + "id": "sum-demo", + "prompt": "Call the `sum` tool to add 7 and 5, then reply with the total in natural language.", + "mcp_config": { + "local": { + "command": "python", + "args": [str(server_path)], + } + }, + "agent_config": { + "allowed_tools": ["sum"], + "system_prompt": ( + "You are a concise math assistant. Always call the `sum` tool when asked to add " + "numbers, wait for the result, then explain the answer in one sentence." + ), + }, + } + + # Instantiate the OpenRouter agent (uses OPENROUTER_API_KEY from env) + agent = OpenRouterAgent(model_name="z-ai/glm-4.5v", verbose=True) + + hud_console.info("Running task with OpenRouter agent...") + result = await agent.run(task, max_steps=3) + + hud_console.info("\nFinal content:") + hud_console.info(result.content or "") + hud_console.success(f"Reward: {result.reward}") + + +if __name__ == "__main__": + asyncio.run(main()) + + diff --git a/examples/mcp_sum_server.py b/examples/mcp_sum_server.py new file mode 100644 index 00000000..7c26d123 --- /dev/null +++ b/examples/mcp_sum_server.py @@ -0,0 +1,23 @@ +"""FastMCP server exposing a simple sum tool. + +Run with: `python examples/mcp_sum_server.py`. +""" + +from __future__ import annotations + +from fastmcp import FastMCP + + +server = FastMCP("SumServer") + + +@server.tool() +def sum(a: int, b: int) -> dict[str, int]: + """Return the sum of two integers.""" + return {"result": a + b} + + +if __name__ == "__main__": + server.run() + + diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py index 7470adb3..55a531ca 100644 --- a/hud/agents/__init__.py +++ b/hud/agents/__init__.py @@ -4,10 +4,12 @@ from .claude import ClaudeAgent from .openai import OperatorAgent from .openai_chat_generic import GenericOpenAIChatAgent +from .openrouter import OpenRouterAgent __all__ = [ "ClaudeAgent", "GenericOpenAIChatAgent", "MCPAgent", "OperatorAgent", + "OpenRouterAgent", ] diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py new file mode 100644 index 00000000..4306a386 --- /dev/null +++ b/hud/agents/openrouter.py @@ -0,0 +1,592 @@ +"""OpenRouter agent that uses the Responses API with prompt caching.""" + +from __future__ import annotations + +import json +import logging +import uuid +from typing import Any, Iterable + +import mcp.types as types +from openai import AsyncOpenAI + +from hud import instrument +from hud.settings import settings +from hud.types import AgentResponse, MCPToolCall, MCPToolResult + +from .openai_chat_generic import GenericOpenAIChatAgent + +logger = logging.getLogger(__name__) + +_DEFAULT_BASE_URL = "https://openrouter.ai/api/alpha" +_DEFAULT_HEADERS = { + "HTTP-Referer": "https://hud.so", + "X-Title": "HUD Python SDK", + "Accept": "application/json", +} + +_DEFAULT_COMPLETION_KWARGS: dict[str, Any] = { + "temperature": 0.1, + "max_output_tokens": 1024, +} + + +class OpenRouterAgent(GenericOpenAIChatAgent): + """MCP-enabled agent that talks to OpenRouter through the Responses API.""" + + def __init__( + self, + *, + api_key: str | None = None, + base_url: str | None = None, + model_name: str = "z-ai/glm-4.5v", + default_headers: dict[str, str] | None = None, + cache_control: dict[str, Any] | bool | None = True, + cacheable_roles: Iterable[str] | None = None, + openai_client: AsyncOpenAI | None = None, + completion_kwargs: dict[str, Any] | None = None, + **agent_kwargs: Any, + ) -> None: + api_key = api_key or settings.openrouter_api_key + if not api_key: + raise ValueError( + "OpenRouter API key not found. Set OPENROUTER_API_KEY or pass api_key explicitly." + ) + + base_url = base_url or _DEFAULT_BASE_URL + + headers: dict[str, str] = dict(_DEFAULT_HEADERS) + if default_headers: + headers.update(default_headers) + + client = openai_client or AsyncOpenAI( + api_key=api_key, + base_url=base_url, + default_headers=headers, + ) + + super().__init__( + openai_client=client, + model_name=model_name, + completion_kwargs=completion_kwargs, + **agent_kwargs, + ) + + self._responses_kwargs = { + "tool_choice": "auto", + **_DEFAULT_COMPLETION_KWARGS, + **dict(self.completion_kwargs), + } + self.completion_kwargs.clear() + + self._cache_control = self._normalize_cache_control(cache_control) + self._cacheable_roles = tuple(cacheable_roles or ("system", "user", "tool")) + + @staticmethod + def _normalize_cache_control( + cache_control: dict[str, Any] | bool | str | None, + ) -> dict[str, Any] | None: + if cache_control is False: + return None + if cache_control is None: + return {"type": "ephemeral"} + if cache_control is True: + return {"type": "ephemeral"} + if isinstance(cache_control, dict): + return cache_control + return {"type": str(cache_control)} + + def _should_cache(self, role: str) -> bool: + return self._cache_control is not None and role in self._cacheable_roles + + def _text_item(self, text: str, role: str) -> dict[str, Any]: + item: dict[str, Any] = {"type": "input_text", "text": text} + if self._should_cache(role): + item["cache_control"] = self._cache_control + return item + + def _image_item(self, image_payload: Any, role: str) -> dict[str, Any]: + url: str | None = None + detail = None + + if isinstance(image_payload, dict): + # Standard OpenAI-style wrapper + if "image_url" in image_payload and isinstance(image_payload["image_url"], dict): + img = image_payload["image_url"] + url = img.get("url") + detail = img.get("detail") or image_payload.get("detail") + # Direct url / data uri + elif image_payload.get("url"): + url = image_payload.get("url") + detail = image_payload.get("detail") + # Raw base64 payload from computer/tool results + elif image_payload.get("data"): + mime = ( + image_payload.get("mimeType") + or image_payload.get("mime_type") + or "image/png" + ) + data = image_payload.get("data") + if data: + url = f"data:{mime};base64,{data}" + detail = image_payload.get("detail") + elif isinstance(image_payload.get("source"), dict): + source = image_payload["source"] + data = source.get("data") + mime = source.get("media_type") or source.get("mime_type") or "image/png" + if data: + url = f"data:{mime};base64,{data}" + detail = source.get("detail") + elif isinstance(image_payload, str): + url = image_payload + + item: dict[str, Any] = {"type": "input_image"} + if url: + item["image_url"] = url + item["detail"] = str(detail or "auto") + if self._should_cache(role): + item["cache_control"] = self._cache_control + return item + + def _convert_message_content(self, role: str, content: Any) -> list[dict[str, Any]]: + if content is None: + return [] + + blocks: list[dict[str, Any]] = [] + if isinstance(content, str): + blocks.append(self._text_item(content, role)) + return blocks + + if isinstance(content, dict): + content = [content] + + if isinstance(content, list): + for entry in content: + if isinstance(entry, str): + blocks.append(self._text_item(entry, role)) + elif isinstance(entry, dict): + entry_copy = dict(entry) + entry_type = entry_copy.get("type") + if entry_type in {"text", "input_text", None}: + text = entry_copy.get("text") or "" + blocks.append(self._text_item(text, role)) + elif entry_type in {"image_url", "input_image"}: + payload = entry_copy.get("image_url", entry_copy.get("image")) or entry_copy + blocks.append(self._image_item(payload, role)) + elif entry_type in {"image", "output_image", "rendered"}: + blocks.append(self._image_item(entry_copy, role)) + elif entry_type == "tool_result": + text = entry_copy.get("text", "") + blocks.append(self._text_item(text, role)) + else: + text_value = entry_copy.get("text") or json.dumps(entry_copy) + blocks.append(self._text_item(text_value, role)) + else: + blocks.append(self._text_item(str(entry), role)) + return blocks + + blocks.append(self._text_item(str(content), role)) + return blocks + + def _convert_messages(self, messages: list[Any]) -> list[dict[str, Any]]: + converted: list[dict[str, Any]] = [] + for message in messages: + if not isinstance(message, dict): + logger.debug("Skipping non-dict message: %s", message) + continue + + if "type" in message and "role" not in message: + converted.append(message) + continue + + role = message.get("role") or "user" + + if role == "assistant" and message.get("tool_calls"): + content_items = self._convert_message_content(role, message.get("content")) + if content_items: + converted.append({"role": "assistant", "content": content_items}) + for tool_call in message.get("tool_calls", []): + converted.append(self._convert_tool_call(tool_call)) + continue + + if role == "tool": + converted.extend(self._convert_tool_message(message)) + continue + + payload: dict[str, Any] = {"role": role} + content_items = self._convert_message_content(role, message.get("content")) + if content_items: + payload["content"] = content_items + if message.get("name"): + payload["name"] = message["name"] + if message.get("metadata"): + payload["metadata"] = message["metadata"] + converted.append(payload) + + return converted + + @staticmethod + def _jsonify_schema(value: Any) -> Any: + from pydantic import BaseModel + from pydantic.fields import FieldInfo + + if isinstance(value, (str, int, float, bool)) or value is None: + return value + + if isinstance(value, dict): + return {str(k): OpenRouterAgent._jsonify_schema(v) for k, v in value.items()} + + if isinstance(value, (list, tuple, set)): + return [OpenRouterAgent._jsonify_schema(v) for v in value] + + try: + return json.loads(json.dumps(value)) + except Exception: + if isinstance(value, BaseModel): + return OpenRouterAgent._jsonify_schema(value.model_dump()) + if isinstance(value, FieldInfo): + data: dict[str, Any] = {} + if value.annotation is not None: + data.setdefault( + "type", + getattr(value.annotation, "__name__", str(value.annotation)), + ) + if value.description: + data["description"] = value.description + if value.title: + data["title"] = value.title + if value.default not in (None, Ellipsis): + data["default"] = OpenRouterAgent._jsonify_schema(value.default) + if value.json_schema_extra: + extra = OpenRouterAgent._jsonify_schema(value.json_schema_extra) + if isinstance(extra, dict): + data.update(extra) + return data or str(value) + if hasattr(value, "model_dump"): + return OpenRouterAgent._jsonify_schema(value.model_dump()) + if hasattr(value, "__dict__") and value.__dict__: + return OpenRouterAgent._jsonify_schema( + { + k: v + for k, v in value.__dict__.items() + if not k.startswith("_") + } + ) + return str(value) + + @staticmethod + def _convert_tools_for_responses(tools: list[dict] | None) -> list[dict]: + if not tools: + return [] + + converted: list[dict] = [] + for tool in tools: + if not isinstance(tool, dict): + continue + + if tool.get("type") == "function" and isinstance(tool.get("function"), dict): + fn = tool["function"] + name = fn.get("name") + params = fn.get("parameters", {}) + description = fn.get("description", "") + + if not isinstance(name, str) or not name: + logger.debug("Skipping tool with missing name: %s", tool) + continue + + converted.append( + { + "type": "function", + "name": name, + "description": str(description or ""), + "parameters": OpenRouterAgent._jsonify_schema(params), + } + ) + else: + converted.append(OpenRouterAgent._jsonify_schema(tool)) + + return converted + + def _convert_tool_call(self, tool_call: dict[str, Any]) -> dict[str, Any]: + if not isinstance(tool_call, dict): + return {} + + function = tool_call.get("function") or {} + name = function.get("name") or tool_call.get("name") or "tool_call" + raw_arguments = function.get("arguments") + + if isinstance(raw_arguments, dict): + arguments = json.dumps(self._jsonify_schema(raw_arguments)) + elif isinstance(raw_arguments, str): + try: + parsed = json.loads(raw_arguments) + except json.JSONDecodeError: + arguments = raw_arguments + else: + arguments = json.dumps(self._jsonify_schema(parsed)) + elif raw_arguments is None: + arguments = "{}" + else: + arguments = json.dumps(self._jsonify_schema(raw_arguments)) + + call_id = ( + tool_call.get("id") + or function.get("id") + or function.get("call_id") + or f"call_{uuid.uuid4().hex}" + ) + + return { + "type": "function_call", + "id": call_id, + "name": name, + "arguments": arguments or "{}", + } + + def _convert_tool_message(self, message: dict[str, Any]) -> list[dict[str, Any]]: + entries: list[dict[str, Any]] = [] + call_id = message.get("tool_call_id") or message.get("id") or f"call_{uuid.uuid4().hex}" + + text_parts: list[str] = [] + image_payloads: list[Any] = [] + + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict): + item_type = item.get("type") + if item_type in {"text", "input_text"} and item.get("text"): + text_parts.append(str(item.get("text"))) + elif item_type in {"image", "input_image", "image_url", "output_image", "rendered"}: + image_payloads.append(item) + elif isinstance(item, str): + text_parts.append(item) + elif isinstance(content, str): + text_parts.append(content) + + structured = message.get("structuredContent") + if structured and not text_parts: + try: + text_parts.append(json.dumps(structured)) + except Exception: + text_parts.append(str(structured)) + + output_text = "\n".join(part for part in text_parts if part) or "" + + entries.append( + { + "type": "function_call_output", + "id": message.get("id") or call_id, + "call_id": call_id, + "output": output_text, + } + ) + + for payload in image_payloads: + entries.append( + { + "role": "user", + "content": [self._image_item(payload, "user")], + } + ) + + return entries + + async def format_tool_results( + self, + tool_calls: list[MCPToolCall], + tool_results: list[MCPToolResult], + ) -> list[dict[str, Any]]: + converted: list[dict[str, Any]] = [] + + for call, result in zip(tool_calls, tool_results, strict=False): + call_id = call.id or call.name or f"call_{uuid.uuid4().hex}" + + text_parts: list[str] = [] + image_payloads: list[Any] = [] + + for item in result.content or []: + if isinstance(item, types.TextContent): + text_parts.append(item.text) + elif isinstance(item, types.ImageContent): + image_payloads.append( + { + "mimeType": item.mimeType, + "data": item.data, + "detail": getattr(item, "detail", None), + } + ) + elif isinstance(item, dict): + if item.get("type") in {"text", "input_text"}: + text_parts.append(str(item.get("text", ""))) + elif item.get("type") in {"image", "input_image", "image_url", "output_image", "rendered"}: + image_payloads.append(item) + elif isinstance(item, str): + text_parts.append(item) + + if result.structuredContent and not text_parts: + try: + text_parts.append(json.dumps(result.structuredContent)) + except Exception: + text_parts.append(str(result.structuredContent)) + + if getattr(result, "isError", False): + text_parts.append(getattr(result, "error", "Tool execution failed.")) + + output_text = "\n".join(part for part in text_parts if part) or "" + + converted.append( + { + "type": "function_call_output", + "id": call_id, + "call_id": call_id, + "output": output_text, + } + ) + + for payload in image_payloads: + converted.append( + { + "role": "user", + "content": [self._image_item(payload, "user")], + } + ) + + return converted + + @staticmethod + def _parse_arguments(arguments: Any) -> dict[str, Any]: + if isinstance(arguments, dict): + return arguments + if isinstance(arguments, str) and arguments: + try: + parsed = json.loads(arguments) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + logger.debug("Failed to decode arguments: %s", arguments) + return {} + + def _to_mcp_tool_call(self, payload: dict[str, Any]) -> MCPToolCall: + tool_name = payload.get("name") or payload.get("function", {}).get("name") or "" + call_id = payload.get("id") or payload.get("tool_call_id") or payload.get("call_id") + if not call_id: + call_id = tool_name + arguments = payload.get("arguments") + if not arguments and "function" in payload: + arguments = payload["function"].get("arguments") + parsed_arguments = self._parse_arguments(arguments) + return MCPToolCall(id=call_id, name=tool_name, arguments=parsed_arguments) + + def _coerce_response_payload(self, response: Any) -> dict[str, Any]: + """Convert OpenRouter SDK return types into a plain dictionary.""" + + if response is None: + return {} + + if isinstance(response, dict): + return response + + for attr in ("model_dump", "dict", "to_dict"): + if hasattr(response, attr): + try: + payload = getattr(response, attr)() + except Exception as exc: # pragma: no cover - defensive + logger.debug("Failed to read response via %s: %s", attr, exc) + else: + if isinstance(payload, dict): + return payload + + snapshot = getattr(response, "__dict__", None) + if isinstance(snapshot, dict): + return snapshot + + logger.error("Unexpected response carrier from OpenRouter: %r", response) + raise TypeError("Unexpected response type from OpenRouter") + + def _extract_response(self, response: Any) -> AgentResponse: + data = self._coerce_response_payload(response) + if not isinstance(data, dict): + raise TypeError("Unexpected response type from OpenRouter") + + output = data.get("output", []) + text_parts: list[str] = [] + tool_calls: list[MCPToolCall] = [] + reasoning_parts: list[str] = [] + + for item in output: + item_type = item.get("type") if isinstance(item, dict) else None + if item_type == "message": + contents = item.get("content", []) + if isinstance(contents, list): + for block in contents: + if not isinstance(block, dict): + continue + block_type = block.get("type") + if block_type in {"output_text", "text"}: + text = block.get("text") + if text: + text_parts.append(text) + elif block_type == "reasoning" and block.get("text"): + reasoning_parts.append(block["text"]) + for tc in item.get("tool_calls", []) or []: + if isinstance(tc, dict): + tool_calls.append(self._to_mcp_tool_call(tc)) + elif item_type in {"tool_call", "function_call"} and isinstance(item, dict): + tool_calls.append(self._to_mcp_tool_call(item)) + elif item_type == "reasoning" and isinstance(item, dict): + summary = item.get("summary") + if isinstance(summary, list): + for block in summary: + if isinstance(block, dict) and block.get("text"): + reasoning_parts.append(block["text"]) + elif isinstance(summary, str): + reasoning_parts.append(summary) + + merged_text = "\n".join(reasoning_parts + text_parts).strip() + status = data.get("status", "completed") + done = not tool_calls and status != "in_progress" + return AgentResponse( + content=merged_text, + tool_calls=tool_calls, + done=done, + raw=response, + ) + + @instrument( + span_type="agent", + record_args=False, + record_result=True, + ) + async def get_response(self, messages: list[Any]) -> AgentResponse: + converted_messages = self._convert_messages(messages) + tools = self._convert_tools_for_responses(self.get_tool_schemas()) + + protected_keys = {"model", "input", "tools"} + extra = {k: v for k, v in self._responses_kwargs.items() if k not in protected_keys} + # If tools are provided and tool_choice isn't explicitly set, require tool use + if tools and "tool_choice" not in extra: + extra["tool_choice"] = "required" + + try: + payload: dict[str, Any] = { + "model": self.model_name, + "input": converted_messages, + **extra, + } + if tools: + payload["tools"] = tools + + response = await self.oai.responses.create(**payload) + except Exception as exc: + error_content = f"Error getting response {exc}" + logger.exception("OpenRouter call failed: %s", exc) + return AgentResponse( + content=error_content, + tool_calls=[], + done=True, + isError=True, + raw=None, + ) + + return self._extract_response(response) diff --git a/hud/agents/tests/test_openrouter.py b/hud/agents/tests/test_openrouter.py new file mode 100644 index 00000000..d3010e0d --- /dev/null +++ b/hud/agents/tests/test_openrouter.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import pytest +from unittest.mock import AsyncMock, MagicMock + +import mcp.types as types + +from hud.agents.openrouter import OpenRouterAgent +from hud.settings import settings +from hud.types import MCPToolCall, MCPToolResult + + +@pytest.fixture(autouse=True) +def disable_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: + """Disable HUD telemetry during unit tests.""" + monkeypatch.setattr(settings, "telemetry_enabled", False) + monkeypatch.setattr(settings, "api_key", None) + + +class FakeResponse: + def __init__(self, payload: dict) -> None: + self._payload = payload + + def model_dump(self) -> dict: + return self._payload + + +@pytest.mark.asyncio +async def test_openrouter_agent_builds_cached_messages() -> None: + responses_create = AsyncMock( + return_value=FakeResponse({"output": [{"type": "message", "content": []}], "status": "completed"}) + ) + mock_client = MagicMock() + mock_client.responses.create = responses_create + + agent = OpenRouterAgent( + api_key="test-key", + openai_client=mock_client, + cache_control={"type": "ephemeral"}, + ) + agent._available_tools = [] # mimic initialized agent + + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + {"role": "assistant", "content": "Previous reply"}, + ] + + await agent.get_response(messages) + + await_call = responses_create.await_args + assert await_call is not None + kwargs = await_call.kwargs + assert kwargs["model"] == agent.model_name + input_payload = kwargs["input"] + + system_block = input_payload[0]["content"][0] + user_block = input_payload[1]["content"][0] + assistant_block = input_payload[2]["content"][0] + + assert system_block["cache_control"] == {"type": "ephemeral"} + assert user_block["cache_control"] == {"type": "ephemeral"} + assert "cache_control" not in assistant_block + + +@pytest.mark.asyncio +async def test_openrouter_agent_parses_tool_calls() -> None: + responses_create = AsyncMock( + return_value=FakeResponse( + { + "output": [ + { + "type": "message", + "content": [{"type": "output_text", "text": "Calling tool"}], + "tool_calls": [ + { + "id": "call_1", + "function": {"name": "search", "arguments": "{\"query\": \"hud\"}"}, + } + ], + } + ], + "status": "requires_action", + } + ) + ) + mock_client = MagicMock() + mock_client.responses.create = responses_create + + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + agent._available_tools = [] + + result = await agent.get_response( + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + ] + ) + + assert not result.done + assert result.tool_calls[0].name == "search" + assert result.tool_calls[0].arguments == {"query": "hud"} + + +@pytest.mark.asyncio +async def test_openrouter_agent_returns_text_response() -> None: + responses_create = AsyncMock( + return_value=FakeResponse( + { + "output": [ + { + "type": "message", + "content": [{"type": "output_text", "text": "Hi there"}], + } + ], + "status": "completed", + } + ) + ) + mock_client = MagicMock() + mock_client.responses.create = responses_create + + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + agent._available_tools = [] + + result = await agent.get_response( + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + ] + ) + + assert result.done + assert result.content == "Hi there" + assert result.tool_calls == [] + + +def test_openrouter_agent_sanitizes_fieldinfo_in_tools() -> None: + mock_client = MagicMock() + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + + from pydantic import Field + + tools = [ + { + "type": "function", + "function": { + "name": "click", + "description": "Click an element", + "parameters": { + "type": "object", + "properties": { + "selector": Field(default="", description="CSS selector"), + }, + "required": ["selector"], + }, + }, + } + ] + + converted = agent._convert_tools_for_responses(tools) + selector_schema = converted[0]["parameters"]["properties"]["selector"] + assert isinstance(selector_schema, dict) + assert selector_schema.get("description") == "CSS selector" + + +def test_openrouter_agent_converts_image_blocks() -> None: + mock_client = MagicMock() + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + + content = [ + { + "type": "image", + "mimeType": "image/png", + "data": "dGVzdA==", + "detail": "high", + } + ] + + message_blocks = agent._convert_messages([{"role": "user", "content": content}]) + image_block = message_blocks[0]["content"][0] + assert image_block["type"] == "input_image" + assert image_block["image_url"].startswith("data:image/png;base64,") + assert image_block["detail"] == "high" + + +@pytest.mark.asyncio +async def test_format_tool_results_produces_function_call_output() -> None: + mock_client = MagicMock() + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + + tool_call = MCPToolCall(id="call-1", name="playwright", arguments={}) + tool_result = MCPToolResult( + content=[ + types.TextContent(type="text", text="navigation complete"), + types.ImageContent(type="image", data="dGVzdA==", mimeType="image/png"), + ] + ) + + formatted = await agent.format_tool_results([tool_call], [tool_result]) + + assert formatted[0]["type"] == "function_call_output" + assert formatted[0]["call_id"] == "call-1" + assert formatted[1]["role"] == "user" + assert formatted[1]["content"][0]["type"] == "input_image" diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 3708cf0e..99771913 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -777,7 +777,8 @@ def eval( agent: str | None = typer.Argument( None, help=( - "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively." # noqa: E501 + "Agent backend to use (claude, openai computer use, openrouter responses, " + "vllm, or litellm). If not provided, will prompt interactively." ), ), full: bool = typer.Option( @@ -893,6 +894,7 @@ def eval( [ {"name": "Claude 4 Sonnet", "value": "claude"}, {"name": "OpenAI Computer Use", "value": "openai"}, + {"name": "OpenRouter (Responses)", "value": "openrouter"}, {"name": "vLLM (Local Server)", "value": "vllm"}, {"name": "LiteLLM (Multi-provider)", "value": "litellm"}, ] @@ -901,7 +903,7 @@ def eval( agent = hud_console.select("Select an agent to use:", choices=choices, default=0) # Handle HUD model selection - if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]: + if agent and agent not in ["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"]: # Find remote model name model = agent if not vllm_base_url: @@ -922,7 +924,7 @@ def eval( hud_console.info(f"Using HUD model: {model} (trained on {base_model})") # Validate agent choice - valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"] + valid_agents = ["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] if agent not in valid_agents: hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}") raise typer.Exit(1) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index e8afceac..4900ba85 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -113,7 +113,7 @@ def _build_vllm_config( def build_agent( - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"], + agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"], *, model: str | None = None, allowed_tools: list[str] | None = None, @@ -180,6 +180,21 @@ def build_agent( allowed_tools=allowed_tools, verbose=verbose, ) + elif agent_type == "openrouter": + try: + from hud.agents.openrouter import OpenRouterAgent + except ImportError as e: + hud_console.error( + "OpenRouter agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + return OpenRouterAgent( + model_name=model or "z-ai/glm-4.6", + allowed_tools=allowed_tools, + verbose=verbose, + ) # Fallback Claude agent (Anthropic) try: @@ -209,7 +224,7 @@ def build_agent( async def run_single_task( source: str, *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = "claude", model: str | None = None, allowed_tools: list[str] | None = None, max_steps: int = 10, @@ -305,6 +320,16 @@ async def run_single_task( } if allowed_tools: agent_config["allowed_tools"] = allowed_tools + elif agent_type == "openrouter": + from hud.agents.openrouter import OpenRouterAgent + + agent_class = OpenRouterAgent + agent_config = { + "model_name": model or "z-ai/glm-4.5v", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools elif agent_type == "claude": from hud.agents import ClaudeAgent @@ -353,7 +378,7 @@ async def run_single_task( async def run_full_dataset( source: str, *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = "claude", model: str | None = None, allowed_tools: list[str] | None = None, max_concurrent: int = 30, @@ -539,10 +564,13 @@ def eval_command( "--full", help="Run the entire dataset (omit for single-task debug mode)", ), - agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option( + agent: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = typer.Option( "claude", "--agent", - help="Agent backend to use (claude, openai, vllm for local server, or litellm)", + help=( + "Agent backend to use (claude, openai computer use, openrouter responses, " + "vllm for local server, or litellm)" + ), ), model: str | None = typer.Option( None, diff --git a/hud/utils/agent_factories.py b/hud/utils/agent_factories.py index e15cb240..37b9fa7a 100644 --- a/hud/utils/agent_factories.py +++ b/hud/utils/agent_factories.py @@ -8,6 +8,7 @@ from hud.agents.grounded_openai import GroundedOpenAIChatAgent from hud.agents.openai_chat_generic import GenericOpenAIChatAgent +from hud.agents.openrouter import OpenRouterAgent from hud.tools.grounding import GrounderConfig @@ -82,3 +83,18 @@ def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent: return GroundedOpenAIChatAgent( openai_client=openai_client, grounder_config=grounder_config, **kwargs ) + + +def create_openrouter_agent(**kwargs: Any) -> OpenRouterAgent: + """Factory for OpenRouterAgent with run_dataset compatibility.""" + + api_key = kwargs.pop("api_key", None) + base_url = kwargs.pop("base_url", None) + cache_control = kwargs.pop("cache_control", True) + + return OpenRouterAgent( + api_key=api_key, + base_url=base_url, + cache_control=cache_control, + **kwargs, + ) From 8281d6bc948287d295d800f248ae870def754023 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Sat, 11 Oct 2025 23:06:03 +0200 Subject: [PATCH 2/4] litllm, glm-4.5v cua loop --- examples/04_openrouter_quickstart.py | 47 -- hud/agents/glm45v.py | 820 +++++++++++++++++++++++ hud/agents/openrouter.py | 960 ++++++++++++--------------- hud/agents/tests/test_openrouter.py | 237 ++----- hud/cli/__init__.py | 2 +- hud/cli/eval.py | 4 +- hud/utils/agent_factories.py | 11 +- pyproject.toml | 2 +- 8 files changed, 1298 insertions(+), 785 deletions(-) delete mode 100644 examples/04_openrouter_quickstart.py create mode 100644 hud/agents/glm45v.py diff --git a/examples/04_openrouter_quickstart.py b/examples/04_openrouter_quickstart.py deleted file mode 100644 index 2ac56044..00000000 --- a/examples/04_openrouter_quickstart.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -import asyncio -from pathlib import Path - -from hud.agents.openrouter import OpenRouterAgent -from hud.utils.hud_console import HUDConsole - - -async def main() -> None: - hud_console = HUDConsole() - - # Inline FastMCP sum task (no external JSON needed) - server_path = Path(__file__).parent / "mcp_sum_server.py" - task = { - "id": "sum-demo", - "prompt": "Call the `sum` tool to add 7 and 5, then reply with the total in natural language.", - "mcp_config": { - "local": { - "command": "python", - "args": [str(server_path)], - } - }, - "agent_config": { - "allowed_tools": ["sum"], - "system_prompt": ( - "You are a concise math assistant. Always call the `sum` tool when asked to add " - "numbers, wait for the result, then explain the answer in one sentence." - ), - }, - } - - # Instantiate the OpenRouter agent (uses OPENROUTER_API_KEY from env) - agent = OpenRouterAgent(model_name="z-ai/glm-4.5v", verbose=True) - - hud_console.info("Running task with OpenRouter agent...") - result = await agent.run(task, max_steps=3) - - hud_console.info("\nFinal content:") - hud_console.info(result.content or "") - hud_console.success(f"Reward: {result.reward}") - - -if __name__ == "__main__": - asyncio.run(main()) - - diff --git a/hud/agents/glm45v.py b/hud/agents/glm45v.py new file mode 100644 index 00000000..e7ff0fdc --- /dev/null +++ b/hud/agents/glm45v.py @@ -0,0 +1,820 @@ +"""glm-4.5v computer-use agent backed by litellm + openrouter.""" + +from __future__ import annotations + +import json +import logging +import re +from typing import Any, ClassVar + +import litellm +import mcp.types as types +from litellm.types.utils import ModelResponse + +from hud.agents.base import MCPAgent +from hud.tools.computer.settings import computer_settings +from hud.types import AgentResponse, MCPToolCall, MCPToolResult +from hud import instrument +from hud.agents.openrouter import ( + _convert_json_action_to_items, + _decode_image_dimensions, + _extract_user_instruction, + _make_click_item, + _make_double_click_item, + _make_drag_item, + _make_failed_tool_call_items, + _make_keypress_item, + _make_output_text_item, + _make_reasoning_item, + _make_screenshot_item, + _make_scroll_item, + _make_type_item, + _make_wait_item, + _parse_json_action_string, + _random_id, + get_last_image_from_messages, +) + +logger = logging.getLogger(__name__) + + +DEFAULT_SYSTEM_PROMPT = """ +You are an autonomous computer-using agent. Follow these guidelines: + +1. Do not ask for permission; act decisively to finish the task. +2. Always ground actions in the latest screenshot and task instructions. +3. Use the provided mouse/keyboard tools precisely (coordinates are 0-999). +4. Keep memory concise—store only facts that matter for later steps. +5. When the task is complete, reply with DONE() and include the final answer. +6. If the task is impossible, reply with FAIL() and explain briefly. +""".strip() + + +GLM_ACTION_SPACE = """ +### {left,right,middle}_click + +Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')` +{ + 'name': ['left_click', 'right_click', 'middle_click'], + 'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being clicked.' + } + }, + 'required': ['start_box'] + } +} + +### hover + +Call rule: `hover(start_box='[x,y]', element_info='')` +{ + 'name': 'hover', + 'description': 'Move the mouse pointer to the specified coordinates without performing any click action.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being hovered over.' + } + }, + 'required': ['start_box'] + } +} + +### left_double_click + +Call rule: `left_double_click(start_box='[x,y]', element_info='')` +{ + 'name': 'left_double_click', + 'description': 'Perform a left mouse double-click at the specified coordinates on the screen.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being double-clicked.' + } + }, + 'required': ['start_box'] + } +} + +### left_drag + +Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')` +{ + 'name': 'left_drag', + 'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.' + }, + 'end_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being dragged.' + } + }, + 'required': ['start_box', 'end_box'] + } +} + +### key + +Call rule: `key(keys='')` +{ + 'name': 'key', + 'description': 'Simulate pressing a single key or combination of keys on the keyboard.', + 'parameters': { + 'type': 'object', + 'properties': { + 'keys': { + 'type': 'string', + 'description': "The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab')." + } + }, + 'required': ['keys'] + } +} + +### type + +Call rule: `type(content='')` +{ + 'name': 'type', + 'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.', + 'parameters': { + 'type': 'object', + 'properties': { + 'content': { + 'type': 'string', + 'description': 'The text content to be typed into the active text field.' + } + }, + 'required': ['content'] + } +} + +### scroll + +Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')` +{ + 'name': 'scroll', + 'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.' + }, + 'direction': { + 'type': 'string', + 'enum': ['down', 'up'], + 'description': "The direction to scroll: 'down' or 'up'." + }, + 'step': { + 'type': 'integer', + 'default': 5, + 'description': 'Number of wheel steps to scroll, default is 5.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being scrolled.' + } + }, + 'required': ['start_box', 'direction'] + } +} + +### WAIT + +Call rule: `WAIT()` +{ + 'name': 'WAIT', + 'description': 'Wait for 5 seconds before proceeding to the next action.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + +### DONE + +Call rule: `DONE()` +{ + 'name': 'DONE', + 'description': 'Indicate that the current task has been completed successfully and no further actions are needed.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + +### FAIL + +Call rule: `FAIL()` +{ + 'name': 'FAIL', + 'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +}""" + + + +def convert_responses_items_to_glm45v_pc_prompt( + messages: list[dict[str, Any]], + task: str, + memory: str = "[]", +) -> list[dict[str, Any]]: + action_space = GLM_ACTION_SPACE + head_text = ( + "You are a GUI Agent, and your primary task is to respond accurately to user" + " requests or questions. In addition to directly answering the user's queries," + " you can also use tools or perform GUI operations directly until you fulfill" + " the user's request or provide a correct answer. You should carefully read and" + " understand the images and questions provided by the user, and engage in" + " thinking and reflection when appropriate. The coordinates involved are all" + " represented in thousandths (0-999)." + "\n\n# Task:\n" + f"{task}\n\n# Task Platform\nUbuntu\n\n# Action Space\n{action_space}\n\n" + "# Historical Actions and Current Memory\nHistory:" + ) + + tail_text = ( + "\nMemory:\n" + f"{memory}\n" + "# Output Format\nPlain text explanation with action(param='...')\n" + "Memory:\n[{\"key\": \"value\"}, ...]\n\n# Some Additional Notes\n" + "- I'll give you the most recent 4 history screenshots(shrunked to 50%*50%) along with the historical action steps.\n" + "- You should put the key information you *have to remember* in a seperated memory part and I'll give it to you in the next round." + " The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory." + " Even if you don't need to remember anything, you should also output an empty list.\n" + "- If elevated privileges are needed, credentials are referenced as .\n" + "- For any mail account interactions, credentials are referenced as .\n\n" + "Current Screenshot:\n" + ) + + history: list[dict[str, Any]] = [] + history_images: list[str] = [] + current_step: list[dict[str, Any]] = [] + step_num = 0 + + for message in messages: + if not isinstance(message, dict): + continue + msg_type = message.get("type") + + if msg_type in {"reasoning", "message", "computer_call", "computer_call_output"}: + current_step.append(message) + + if msg_type == "computer_call_output" and current_step: + step_num += 1 + + bot_thought = "" + action_text = "" + for item in current_step: + if item.get("type") == "message" and item.get("role") == "assistant": + content = item.get("content") or [] + if isinstance(content, list): + for block in content: + if isinstance(block, dict) and block.get("type") == "output_text": + bot_thought = block.get("text", "") + break + if item.get("type") == "computer_call": + action_text = json.dumps(item.get("action", {})) + + history.append({ + "step_num": step_num, + "bot_thought": bot_thought, + "action_text": action_text, + }) + + output = message.get("output") or {} + if isinstance(output, dict) and output.get("type") == "input_image": + url = output.get("image_url") + if isinstance(url, str): + history_images.append(url) + + current_step = [] + + content: list[dict[str, Any]] = [] + current_text = head_text + + total_steps = len(history) + image_tail = min(4, len(history_images)) + + for idx, step in enumerate(history): + step_no = step["step_num"] + bot_thought = step["bot_thought"] + action_text = step["action_text"] + + if idx < total_steps - image_tail: + current_text += ( + f"\nstep {step_no}: Screenshot:(Omitted in context.)" + f" Thought: {bot_thought}\nAction: {action_text}" + ) + else: + current_text += f"\nstep {step_no}: Screenshot:" + content.append({"type": "text", "text": current_text}) + image_idx = idx - (total_steps - image_tail) + if 0 <= image_idx < len(history_images): + content.append({"type": "image_url", "image_url": {"url": history_images[image_idx]}}) + current_text = f" Thought: {bot_thought}\nAction: {action_text}" + + current_text += tail_text + content.append({"type": "text", "text": current_text}) + return content + + +def convert_glm_completion_to_responses_items( + response: ModelResponse, + image_width: int, + image_height: int, + parsed_response: dict[str, str] | None = None, +) -> list[dict[str, Any]]: + items: list[dict[str, Any]] = [] + + if not getattr(response, "choices", None): + return items + + choice = response.choices[0] + message = getattr(choice, "message", None) + if not message: + return items + + content = getattr(message, "content", "") or "" + reasoning_content = getattr(message, "reasoning_content", None) + + if reasoning_content: + items.append(_make_reasoning_item(str(reasoning_content))) + + parsed = parsed_response or parse_glm_response(content) + action = parsed.get("action", "") + action_text = parsed.get("action_text", "") + + if action_text: + clean_text = action_text + if action: + clean_text = clean_text.replace(action, "").strip() + clean_text = re.sub(r"Memory:\s*\[.*?\]\s*$", "", clean_text, flags=re.DOTALL).strip() + if clean_text: + items.append(_make_output_text_item(clean_text)) + + if action: + call_id = _random_id() + handled_json = False + + json_action = _parse_json_action_string(action) + if json_action: + json_entries = _convert_json_action_to_items( + json_action, + call_id=call_id, + image_width=image_width, + image_height=image_height, + ) + if json_entries: + items.extend(json_entries) + handled_json = True + + if action.startswith("left_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: + x, y = int(match.group(1)), int(match.group(2)) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + if not handled_json: + items.append(_make_click_item(actual_x, actual_y, call_id=call_id)) + elif action.startswith("right_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: + x, y = int(match.group(1)), int(match.group(2)) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + if not handled_json: + items.append(_make_click_item(actual_x, actual_y, button="right", call_id=call_id)) + elif action.startswith("left_double_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: + x, y = int(match.group(1)), int(match.group(2)) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + if not handled_json: + items.append(_make_double_click_item(actual_x, actual_y, call_id=call_id)) + elif action.startswith("left_drag"): + start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action) + if start_match and end_match: + x1, y1 = int(start_match.group(1)), int(start_match.group(2)) + x2, y2 = int(end_match.group(1)), int(end_match.group(2)) + actual_x1 = int((x1 / 999.0) * image_width) + actual_y1 = int((y1 / 999.0) * image_height) + actual_x2 = int((x2 / 999.0) * image_width) + actual_y2 = int((y2 / 999.0) * image_height) + path = [ + {"x": actual_x1, "y": actual_y1}, + {"x": actual_x2, "y": actual_y2}, + ] + if not handled_json: + items.append(_make_drag_item(path, call_id=call_id)) + elif action.startswith("key"): + key_match = re.search(r"keys='([^']+)'", action) + if key_match: + keys = key_match.group(1) + key_list = keys.split("+") if "+" in keys else [keys] + if not handled_json: + items.append(_make_keypress_item(key_list, call_id=call_id)) + elif action.startswith("type"): + content_match = re.search(r"content='([^']*)'", action) + if content_match: + text = content_match.group(1) + if not handled_json: + items.append(_make_type_item(text, call_id=call_id)) + elif action.startswith("scroll"): + coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + direction_match = re.search(r"direction='([^']+)'", action) + if coord_match and direction_match: + x, y = int(coord_match.group(1)), int(coord_match.group(2)) + direction = direction_match.group(1) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + scroll_x = 0 + scroll_y = 0 + if direction == "up": + scroll_y = -5 + elif direction == "down": + scroll_y = 5 + elif direction == "left": + scroll_x = -5 + elif direction == "right": + scroll_x = 5 + if not handled_json: + items.append(_make_scroll_item(actual_x, actual_y, scroll_x, scroll_y, call_id=call_id)) + elif action == "WAIT()": + if not handled_json: + items.append(_make_wait_item(call_id=call_id)) + + return items + + +def parse_glm_response(response: str) -> dict[str, str]: + pattern = r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>" + match = re.search(pattern, response) + if match: + action = match.group(1).strip() + else: + action_pattern = r"[\w_]+\([^)]*\)" + matches = re.findall(action_pattern, response) + action = matches[0] if matches else "" + + memory_pattern = r"Memory:(.*?)$" + memory_match = re.search(memory_pattern, response, re.DOTALL) + memory = memory_match.group(1).strip() if memory_match else "[]" + + action_text_pattern = r"^(.*?)Memory:" + action_text_match = re.search(action_text_pattern, response, re.DOTALL) + action_text = action_text_match.group(1).strip() if action_text_match else response + if action_text: + action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "") + + return { + "action": action or "", + "action_text": action_text, + "memory": memory, + } + + + + + + +class Glm45vAgent(MCPAgent): + """LiteLLM-backed GLM-4.5V agent that speaks MCP.""" + + metadata: ClassVar[dict[str, Any]] = { + "display_width": computer_settings.OPENAI_COMPUTER_WIDTH, + "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT, + } + + required_tools: ClassVar[list[str]] = ["openai_computer"] + + def __init__( + self, + *, + model_name: str = "z-ai/glm-4.5v", + completion_kwargs: dict[str, Any] | None = None, + system_prompt: str | None = None, + **agent_kwargs: Any, + ) -> None: + super().__init__(**agent_kwargs) + # Normalize to canonical openrouter// + if not model_name.startswith("openrouter/"): + self.model_name = f"openrouter/{model_name}" + else: + self.model_name = model_name + self.completion_kwargs = completion_kwargs or {} + combined_prompt = DEFAULT_SYSTEM_PROMPT + if system_prompt: + combined_prompt = f"{combined_prompt}\n\n{system_prompt}" + + if self.system_prompt: + self.system_prompt = f"{self.system_prompt}\n\n{combined_prompt}" + else: + self.system_prompt = combined_prompt + self._memory = "[]" + self._last_instruction = "" + self._task_description = "" + + async def get_system_messages(self) -> list[Any]: + return [] + + @instrument(span_type="agent", record_args=False) + async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: + content_items: list[dict[str, Any]] = [] + text_parts: list[str] = [] + for block in blocks: + if isinstance(block, types.TextContent): + text_parts.append(block.text) + elif isinstance(block, types.ImageContent): + content_items.append( + { + "type": "message", + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:{getattr(block, 'mimeType', 'image/png')};base64,{block.data}", + }, + } + ], + } + ) + + if text_parts: + content_items.insert( + 0, + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + }, + ) + + return content_items + + def _glm_tool_call_to_mcp(self, item: dict[str, Any]) -> MCPToolCall: + call_id = item.get("call_id") or _random_id() + action = item.get("action") or {} + action_type = action.get("type", "") + + arguments: dict[str, Any] = {"type": action_type} + for key in ("x", "y", "scroll_x", "scroll_y"): + if key in action: + arguments[key] = action[key] + if "button" in action: + arguments["button"] = action["button"] + if "keys" in action: + arguments["keys"] = action["keys"] + if "text" in action: + arguments["text"] = action["text"] + if "path" in action: + arguments["path"] = action["path"] + + return MCPToolCall(id=call_id, name="openai_computer", arguments=arguments) + + @instrument(span_type="agent", record_args=False) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + instruction = _extract_user_instruction(messages) + if instruction: + self._last_instruction = instruction # type: ignore[attr-defined] + self._task_description = instruction + task_instruction = self._task_description or getattr(self, "_last_instruction", "") + + screenshot_b64 = get_last_image_from_messages(messages) + if not screenshot_b64: + call_id = _random_id() + screenshot_call = _make_screenshot_item(call_id) + messages.append(screenshot_call) + logger.debug("glm45v requesting initial screenshot") + tool_call = MCPToolCall( + id=call_id, + name="openai_computer", + arguments={"type": "screenshot"}, + ) + return AgentResponse( + content="capturing initial screenshot", + tool_calls=[tool_call], + done=False, + ) + + self.console.debug(f"glm45v task instruction: {task_instruction}") + self.console.debug(f"glm45v memory (pre-step): {self._memory}") + + prompt_content = convert_responses_items_to_glm45v_pc_prompt( + messages=messages, + task=task_instruction, + memory=self._memory, + ) + prompt_content.append( + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}} + ) + + system_prompt = self.system_prompt or "You are a helpful GUI agent assistant." + litellm_messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt_content}, + ] + + api_kwargs = {"model": self.model_name, "messages": litellm_messages} + api_kwargs.update(self.completion_kwargs) + + try: + response = await litellm.acompletion(**api_kwargs) + except Exception as exc: # pragma: no cover - network errors + logger.exception("glm45v completion failed: %s", exc) + return AgentResponse( + content=f"GLM-4.5V request failed: {exc}", + tool_calls=[], + done=True, + isError=True, + ) + + choice = response.choices[0] + message = getattr(choice, "message", None) + response_content = getattr(message, "content", "") if message else "" + parsed = parse_glm_response(response_content or "") if response_content else { + "memory": self._memory, + } + if parsed.get("memory"): + self._memory = parsed["memory"] + logger.debug("glm45v model content: %s", response_content) + trimmed = response_content[:400] if response_content else "" + self.console.debug(f"glm45v model content: {trimmed}") + self.console.debug(f"glm45v parsed response: {parsed}") + + image_width, image_height = _decode_image_dimensions(screenshot_b64) + response_items = convert_glm_completion_to_responses_items( + response, + image_width=image_width, + image_height=image_height, + parsed_response=parsed, + ) + + messages.extend(response_items) + + text_parts: list[str] = [] + reasoning_parts: list[str] = [] + tool_calls: list[MCPToolCall] = [] + + for item in response_items: + if not isinstance(item, dict): + continue + if item.get("type") == "message" and item.get("role") == "assistant": + for block in item.get("content", []) or []: + if isinstance(block, dict) and block.get("type") == "output_text": + text = block.get("text") + if isinstance(text, str): + text_parts.append(text) + elif item.get("type") == "reasoning": + summary = item.get("summary", []) + for block in summary: + if isinstance(block, dict) and block.get("text"): + reasoning_parts.append(block["text"]) + elif item.get("type") == "computer_call": + tool_calls.append(self._glm_tool_call_to_mcp(item)) + + content_text = "\n".join(text_parts).strip() + reasoning_text = "\n".join(reasoning_parts).strip() + + if not tool_calls: + self.console.info_log( + f"glm45v returned no tool calls. content='{content_text}' reasoning='{reasoning_text}'" + ) + self.console.info_log(f"glm45v parsed response: {parsed}") + + return AgentResponse( + content=content_text or None, + reasoning=reasoning_text or None, + tool_calls=tool_calls, + done=not tool_calls, + raw=response, + ) + + @instrument(span_type="agent", record_args=False) + async def format_tool_results( + self, + tool_calls: list[MCPToolCall], + tool_results: list[MCPToolResult], + ) -> list[dict[str, Any]]: + rendered: list[dict[str, Any]] = [] + + for call, result in zip(tool_calls, tool_results, strict=False): + call_args = call.arguments or {} + if result.isError: + error_text = "".join( + content.text + for content in result.content + if isinstance(content, types.TextContent) + ) + rendered.extend( + _make_failed_tool_call_items( + tool_name=call_args.get("type", call.name), + tool_kwargs=call_args, + error_message=error_text or "Unknown error", + call_id=call.id, + ) + ) + continue + + screenshot_found = False + for content in result.content: + if isinstance(content, types.ImageContent): + rendered.append( + { + "type": "computer_call_output", + "call_id": call.id, + "output": { + "type": "input_image", + "image_url": f"data:{content.mimeType};base64,{content.data}", + }, + } + ) + screenshot_found = True + break + + text_parts = [ + content.text + for content in result.content + if isinstance(content, types.TextContent) and content.text + ] + if text_parts: + rendered.append( + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + } + ) + + if not screenshot_found and not text_parts: + rendered.append( + { + "type": "computer_call_output", + "call_id": call.id, + "output": {"type": "input_text", "text": "Tool executed"}, + } + ) + + return rendered + + +__all__ = ["Glm45vAgent"] diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index 4306a386..c9445258 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -1,592 +1,452 @@ -"""OpenRouter agent that uses the Responses API with prompt caching.""" +"""OpenRouter agent facade plus shared tooling helpers.""" from __future__ import annotations +import base64 import json -import logging +import re import uuid -from typing import Any, Iterable +from importlib import import_module +from io import BytesIO +from typing import Any, Dict, Type -import mcp.types as types -from openai import AsyncOpenAI +from PIL import Image -from hud import instrument -from hud.settings import settings -from hud.types import AgentResponse, MCPToolCall, MCPToolResult +from hud.agents.base import MCPAgent +from hud.tools.computer.settings import computer_settings -from .openai_chat_generic import GenericOpenAIChatAgent +# Shared helper utilities for computer-use adapters +def _random_id() -> str: + return f"call_{uuid.uuid4().hex[:8]}" -logger = logging.getLogger(__name__) -_DEFAULT_BASE_URL = "https://openrouter.ai/api/alpha" -_DEFAULT_HEADERS = { - "HTTP-Referer": "https://hud.so", - "X-Title": "HUD Python SDK", - "Accept": "application/json", -} +def _make_reasoning_item(reasoning: str) -> dict[str, Any]: + return { + "id": _random_id(), + "type": "reasoning", + "summary": [{"type": "summary_text", "text": reasoning}], + } -_DEFAULT_COMPLETION_KWARGS: dict[str, Any] = { - "temperature": 0.1, - "max_output_tokens": 1024, -} +def _make_output_text_item(content: str) -> dict[str, Any]: + return { + "id": _random_id(), + "type": "message", + "role": "assistant", + "status": "completed", + "content": [{"type": "output_text", "text": content, "annotations": []}], + } -class OpenRouterAgent(GenericOpenAIChatAgent): - """MCP-enabled agent that talks to OpenRouter through the Responses API.""" - - def __init__( - self, - *, - api_key: str | None = None, - base_url: str | None = None, - model_name: str = "z-ai/glm-4.5v", - default_headers: dict[str, str] | None = None, - cache_control: dict[str, Any] | bool | None = True, - cacheable_roles: Iterable[str] | None = None, - openai_client: AsyncOpenAI | None = None, - completion_kwargs: dict[str, Any] | None = None, - **agent_kwargs: Any, - ) -> None: - api_key = api_key or settings.openrouter_api_key - if not api_key: - raise ValueError( - "OpenRouter API key not found. Set OPENROUTER_API_KEY or pass api_key explicitly." - ) - base_url = base_url or _DEFAULT_BASE_URL +def _make_computer_call_item(action: dict[str, Any], call_id: str | None = None) -> dict[str, Any]: + call_id = call_id or _random_id() + return { + "id": _random_id(), + "call_id": call_id, + "type": "computer_call", + "status": "completed", + "pending_safety_checks": [], + "action": action, + } - headers: dict[str, str] = dict(_DEFAULT_HEADERS) - if default_headers: - headers.update(default_headers) - client = openai_client or AsyncOpenAI( - api_key=api_key, - base_url=base_url, - default_headers=headers, - ) +def _make_click_item(x: int, y: int, button: str = "left", call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "click", "x": x, "y": y, "button": button}, call_id) - super().__init__( - openai_client=client, - model_name=model_name, - completion_kwargs=completion_kwargs, - **agent_kwargs, - ) - self._responses_kwargs = { - "tool_choice": "auto", - **_DEFAULT_COMPLETION_KWARGS, - **dict(self.completion_kwargs), - } - self.completion_kwargs.clear() +def _make_double_click_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "double_click", "x": x, "y": y}, call_id) - self._cache_control = self._normalize_cache_control(cache_control) - self._cacheable_roles = tuple(cacheable_roles or ("system", "user", "tool")) - @staticmethod - def _normalize_cache_control( - cache_control: dict[str, Any] | bool | str | None, - ) -> dict[str, Any] | None: - if cache_control is False: - return None - if cache_control is None: - return {"type": "ephemeral"} - if cache_control is True: - return {"type": "ephemeral"} - if isinstance(cache_control, dict): - return cache_control - return {"type": str(cache_control)} - - def _should_cache(self, role: str) -> bool: - return self._cache_control is not None and role in self._cacheable_roles - - def _text_item(self, text: str, role: str) -> dict[str, Any]: - item: dict[str, Any] = {"type": "input_text", "text": text} - if self._should_cache(role): - item["cache_control"] = self._cache_control - return item - - def _image_item(self, image_payload: Any, role: str) -> dict[str, Any]: - url: str | None = None - detail = None - - if isinstance(image_payload, dict): - # Standard OpenAI-style wrapper - if "image_url" in image_payload and isinstance(image_payload["image_url"], dict): - img = image_payload["image_url"] - url = img.get("url") - detail = img.get("detail") or image_payload.get("detail") - # Direct url / data uri - elif image_payload.get("url"): - url = image_payload.get("url") - detail = image_payload.get("detail") - # Raw base64 payload from computer/tool results - elif image_payload.get("data"): - mime = ( - image_payload.get("mimeType") - or image_payload.get("mime_type") - or "image/png" - ) - data = image_payload.get("data") - if data: - url = f"data:{mime};base64,{data}" - detail = image_payload.get("detail") - elif isinstance(image_payload.get("source"), dict): - source = image_payload["source"] - data = source.get("data") - mime = source.get("media_type") or source.get("mime_type") or "image/png" - if data: - url = f"data:{mime};base64,{data}" - detail = source.get("detail") - elif isinstance(image_payload, str): - url = image_payload - - item: dict[str, Any] = {"type": "input_image"} - if url: - item["image_url"] = url - item["detail"] = str(detail or "auto") - if self._should_cache(role): - item["cache_control"] = self._cache_control - return item - - def _convert_message_content(self, role: str, content: Any) -> list[dict[str, Any]]: - if content is None: - return [] - - blocks: list[dict[str, Any]] = [] - if isinstance(content, str): - blocks.append(self._text_item(content, role)) - return blocks - - if isinstance(content, dict): - content = [content] - - if isinstance(content, list): - for entry in content: - if isinstance(entry, str): - blocks.append(self._text_item(entry, role)) - elif isinstance(entry, dict): - entry_copy = dict(entry) - entry_type = entry_copy.get("type") - if entry_type in {"text", "input_text", None}: - text = entry_copy.get("text") or "" - blocks.append(self._text_item(text, role)) - elif entry_type in {"image_url", "input_image"}: - payload = entry_copy.get("image_url", entry_copy.get("image")) or entry_copy - blocks.append(self._image_item(payload, role)) - elif entry_type in {"image", "output_image", "rendered"}: - blocks.append(self._image_item(entry_copy, role)) - elif entry_type == "tool_result": - text = entry_copy.get("text", "") - blocks.append(self._text_item(text, role)) - else: - text_value = entry_copy.get("text") or json.dumps(entry_copy) - blocks.append(self._text_item(text_value, role)) - else: - blocks.append(self._text_item(str(entry), role)) - return blocks - - blocks.append(self._text_item(str(content), role)) - return blocks - - def _convert_messages(self, messages: list[Any]) -> list[dict[str, Any]]: - converted: list[dict[str, Any]] = [] - for message in messages: - if not isinstance(message, dict): - logger.debug("Skipping non-dict message: %s", message) - continue - - if "type" in message and "role" not in message: - converted.append(message) - continue - - role = message.get("role") or "user" - - if role == "assistant" and message.get("tool_calls"): - content_items = self._convert_message_content(role, message.get("content")) - if content_items: - converted.append({"role": "assistant", "content": content_items}) - for tool_call in message.get("tool_calls", []): - converted.append(self._convert_tool_call(tool_call)) - continue - - if role == "tool": - converted.extend(self._convert_tool_message(message)) - continue - - payload: dict[str, Any] = {"role": role} - content_items = self._convert_message_content(role, message.get("content")) - if content_items: - payload["content"] = content_items - if message.get("name"): - payload["name"] = message["name"] - if message.get("metadata"): - payload["metadata"] = message["metadata"] - converted.append(payload) - - return converted +def _make_move_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "move", "x": x, "y": y}, call_id) + + +def _make_drag_item(path: list[dict[str, int]], call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "drag", "path": path}, call_id) + + +def _make_keypress_item(keys: list[str], call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "keypress", "keys": keys}, call_id) + + +def _make_type_item(text: str, call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "type", "text": text}, call_id) - @staticmethod - def _jsonify_schema(value: Any) -> Any: - from pydantic import BaseModel - from pydantic.fields import FieldInfo - if isinstance(value, (str, int, float, bool)) or value is None: - return value +def _make_scroll_item( + x: int, + y: int, + scroll_x: int, + scroll_y: int, + call_id: str | None = None, +) -> dict[str, Any]: + action = {"type": "scroll", "x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y} + return _make_computer_call_item(action, call_id) - if isinstance(value, dict): - return {str(k): OpenRouterAgent._jsonify_schema(v) for k, v in value.items()} - if isinstance(value, (list, tuple, set)): - return [OpenRouterAgent._jsonify_schema(v) for v in value] +def _make_wait_item(call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "wait"}, call_id) + +def _make_screenshot_item(call_id: str) -> dict[str, Any]: + return _make_computer_call_item({"type": "screenshot"}, call_id) + + +def _make_failed_tool_call_items( + tool_name: str, + tool_kwargs: dict[str, Any], + error_message: str, + call_id: str, +) -> list[dict[str, Any]]: + call = _make_computer_call_item({"type": tool_name, **tool_kwargs}, call_id) + call["status"] = "failed" + failure_text = _make_output_text_item(f"Tool {tool_name} failed: {error_message}") + failure_text["role"] = "assistant" + return [call, failure_text] + + +def _coerce_to_pixel_coordinates( + x_val: Any, + y_val: Any, + *, + width: int, + height: int, +) -> tuple[int, int] | None: + try: + x_float = float(x_val) + y_float = float(y_val) + except (TypeError, ValueError): + return None + + def clamp(value: int, maximum: int) -> int: + return max(0, min(maximum - 1, value)) + + abs_x = abs(x_float) + abs_y = abs(y_float) + if abs_x <= 1.0 and abs_y <= 1.0: + px = int(x_float * width) + py = int(y_float * height) + elif abs_x <= 999.0 and abs_y <= 999.0: + px = int((x_float / 999.0) * width) + py = int((y_float / 999.0) * height) + else: + px = int(x_float) + py = int(y_float) + + return clamp(px, width), clamp(py, height) + + +def _parse_coordinate_box(value: Any) -> tuple[float, float] | None: + if isinstance(value, (list, tuple)) and len(value) >= 2: try: - return json.loads(json.dumps(value)) - except Exception: - if isinstance(value, BaseModel): - return OpenRouterAgent._jsonify_schema(value.model_dump()) - if isinstance(value, FieldInfo): - data: dict[str, Any] = {} - if value.annotation is not None: - data.setdefault( - "type", - getattr(value.annotation, "__name__", str(value.annotation)), - ) - if value.description: - data["description"] = value.description - if value.title: - data["title"] = value.title - if value.default not in (None, Ellipsis): - data["default"] = OpenRouterAgent._jsonify_schema(value.default) - if value.json_schema_extra: - extra = OpenRouterAgent._jsonify_schema(value.json_schema_extra) - if isinstance(extra, dict): - data.update(extra) - return data or str(value) - if hasattr(value, "model_dump"): - return OpenRouterAgent._jsonify_schema(value.model_dump()) - if hasattr(value, "__dict__") and value.__dict__: - return OpenRouterAgent._jsonify_schema( - { - k: v - for k, v in value.__dict__.items() - if not k.startswith("_") - } - ) - return str(value) + return float(value[0]), float(value[1]) + except (TypeError, ValueError): + return None - @staticmethod - def _convert_tools_for_responses(tools: list[dict] | None) -> list[dict]: - if not tools: - return [] - - converted: list[dict] = [] - for tool in tools: - if not isinstance(tool, dict): - continue - - if tool.get("type") == "function" and isinstance(tool.get("function"), dict): - fn = tool["function"] - name = fn.get("name") - params = fn.get("parameters", {}) - description = fn.get("description", "") - - if not isinstance(name, str) or not name: - logger.debug("Skipping tool with missing name: %s", tool) - continue - - converted.append( - { - "type": "function", - "name": name, - "description": str(description or ""), - "parameters": OpenRouterAgent._jsonify_schema(params), - } - ) - else: - converted.append(OpenRouterAgent._jsonify_schema(tool)) - - return converted - - def _convert_tool_call(self, tool_call: dict[str, Any]) -> dict[str, Any]: - if not isinstance(tool_call, dict): - return {} - - function = tool_call.get("function") or {} - name = function.get("name") or tool_call.get("name") or "tool_call" - raw_arguments = function.get("arguments") - - if isinstance(raw_arguments, dict): - arguments = json.dumps(self._jsonify_schema(raw_arguments)) - elif isinstance(raw_arguments, str): - try: - parsed = json.loads(raw_arguments) - except json.JSONDecodeError: - arguments = raw_arguments - else: - arguments = json.dumps(self._jsonify_schema(parsed)) - elif raw_arguments is None: - arguments = "{}" + if isinstance(value, str): + stripped = value.strip() + try: + loaded = json.loads(stripped) + except Exception: + matches = re.findall(r"-?\d+(?:\.\d+)?", stripped) + if len(matches) >= 2: + return float(matches[0]), float(matches[1]) else: - arguments = json.dumps(self._jsonify_schema(raw_arguments)) + if isinstance(loaded, (list, tuple)) and len(loaded) >= 2: + try: + return float(loaded[0]), float(loaded[1]) + except (TypeError, ValueError): + return None + return None + + +def _coerce_box_to_pixels( + box: Any, + *, + width: int, + height: int, +) -> tuple[int, int] | None: + coords = _parse_coordinate_box(box) + if not coords: + return None + return _coerce_to_pixel_coordinates(coords[0], coords[1], width=width, height=height) + + +def _parse_json_action_string(action_text: str) -> dict[str, Any] | None: + candidate = action_text.strip() + if not (candidate.startswith("{") and candidate.endswith("}")): + return None + + attempts = [candidate] + if "\\" in candidate: + try: + attempts.append(candidate.encode("utf-8").decode("unicode_escape")) + except Exception: + pass + attempts.append(candidate.replace("\\\"", '"')) - call_id = ( - tool_call.get("id") - or function.get("id") - or function.get("call_id") - or f"call_{uuid.uuid4().hex}" - ) + for attempt in attempts: + try: + return json.loads(attempt) + except Exception: + continue - return { - "type": "function_call", - "id": call_id, - "name": name, - "arguments": arguments or "{}", - } - - def _convert_tool_message(self, message: dict[str, Any]) -> list[dict[str, Any]]: - entries: list[dict[str, Any]] = [] - call_id = message.get("tool_call_id") or message.get("id") or f"call_{uuid.uuid4().hex}" - - text_parts: list[str] = [] - image_payloads: list[Any] = [] - - content = message.get("content") - if isinstance(content, list): - for item in content: - if isinstance(item, dict): - item_type = item.get("type") - if item_type in {"text", "input_text"} and item.get("text"): - text_parts.append(str(item.get("text"))) - elif item_type in {"image", "input_image", "image_url", "output_image", "rendered"}: - image_payloads.append(item) - elif isinstance(item, str): - text_parts.append(item) - elif isinstance(content, str): - text_parts.append(content) - - structured = message.get("structuredContent") - if structured and not text_parts: - try: - text_parts.append(json.dumps(structured)) - except Exception: - text_parts.append(str(structured)) - - output_text = "\n".join(part for part in text_parts if part) or "" - - entries.append( - { - "type": "function_call_output", - "id": message.get("id") or call_id, - "call_id": call_id, - "output": output_text, - } - ) + return None - for payload in image_payloads: - entries.append( - { - "role": "user", - "content": [self._image_item(payload, "user")], - } - ) +def _convert_json_action_to_items( + json_action: dict[str, Any], + *, + call_id: str, + image_width: int, + image_height: int, +) -> list[dict[str, Any]]: + entries: list[dict[str, Any]] = [] + action_type = str(json_action.get("type", "")).lower() + if not action_type: return entries - async def format_tool_results( - self, - tool_calls: list[MCPToolCall], - tool_results: list[MCPToolResult], - ) -> list[dict[str, Any]]: - converted: list[dict[str, Any]] = [] - - for call, result in zip(tool_calls, tool_results, strict=False): - call_id = call.id or call.name or f"call_{uuid.uuid4().hex}" - - text_parts: list[str] = [] - image_payloads: list[Any] = [] - - for item in result.content or []: - if isinstance(item, types.TextContent): - text_parts.append(item.text) - elif isinstance(item, types.ImageContent): - image_payloads.append( - { - "mimeType": item.mimeType, - "data": item.data, - "detail": getattr(item, "detail", None), - } - ) - elif isinstance(item, dict): - if item.get("type") in {"text", "input_text"}: - text_parts.append(str(item.get("text", ""))) - elif item.get("type") in {"image", "input_image", "image_url", "output_image", "rendered"}: - image_payloads.append(item) - elif isinstance(item, str): - text_parts.append(item) - - if result.structuredContent and not text_parts: - try: - text_parts.append(json.dumps(result.structuredContent)) - except Exception: - text_parts.append(str(result.structuredContent)) - - if getattr(result, "isError", False): - text_parts.append(getattr(result, "error", "Tool execution failed.")) - - output_text = "\n".join(part for part in text_parts if part) or "" - - converted.append( - { - "type": "function_call_output", - "id": call_id, - "call_id": call_id, - "output": output_text, - } + if action_type in {"type", "text"}: + text_value = json_action.get("content") or json_action.get("text") or "" + if text_value: + entries.append(_make_type_item(str(text_value), call_id=call_id)) + elif action_type in {"click", "left_click"}: + start_box = ( + json_action.get("start_box") + or json_action.get("startBox") + or json_action.get("position") + ) + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if coords: + button = str(json_action.get("button", "left") or "left").lower() + entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) + elif action_type in {"right_click", "middle_click"}: + start_box = json_action.get("start_box") or json_action.get("startBox") + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if coords: + button = "right" if action_type == "right_click" else "middle" + entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) + elif action_type in {"double_click", "left_double_click"}: + start_box = json_action.get("start_box") or json_action.get("startBox") + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, ) + if coords: + entries.append(_make_double_click_item(coords[0], coords[1], call_id=call_id)) + elif action_type in {"drag", "left_drag"}: + start_box = json_action.get("start_box") or json_action.get("startBox") + end_box = json_action.get("end_box") or json_action.get("endBox") + start_coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + end_coords = _coerce_box_to_pixels(end_box, width=image_width, height=image_height) + if not start_coords and json_action.get("x") is not None and json_action.get("y") is not None: + start_coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if start_coords and end_coords: + path = [ + {"x": start_coords[0], "y": start_coords[1]}, + {"x": end_coords[0], "y": end_coords[1]}, + ] + entries.append(_make_drag_item(path, call_id=call_id)) + elif action_type == "scroll": + start_box = json_action.get("start_box") or json_action.get("startBox") + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + direction = str(json_action.get("direction", "")).lower() + step = int(json_action.get("step", 5) or 5) + if coords: + scroll_x = 0 + scroll_y = 0 + if direction == "up": + scroll_y = -abs(step) + elif direction == "down": + scroll_y = abs(step) + elif direction == "left": + scroll_x = -abs(step) + elif direction == "right": + scroll_x = abs(step) + entries.append( + _make_scroll_item(coords[0], coords[1], scroll_x, scroll_y, call_id=call_id) + ) + elif action_type in {"hover", "move"}: + target_box = ( + json_action.get("start_box") + or json_action.get("startBox") + or json_action.get("position") + ) + coords = _coerce_box_to_pixels(target_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if coords: + entries.append(_make_move_item(coords[0], coords[1], call_id=call_id)) + elif action_type in {"keypress", "key", "key_press"}: + keys = json_action.get("keys") + key_list: list[str] = [] + if isinstance(keys, str): + key_list = [segment.strip() for segment in keys.split("+") if segment.strip()] + elif isinstance(keys, list): + key_list = [str(segment).strip() for segment in keys if str(segment).strip()] + if key_list: + entries.append(_make_keypress_item(key_list, call_id=call_id)) + elif action_type == "wait": + entries.append(_make_wait_item(call_id=call_id)) + elif action_type == "screenshot": + entries.append(_make_screenshot_item(call_id)) + + return entries + + +def _decode_image_dimensions(image_b64: str) -> tuple[int, int]: + try: + data = base64.b64decode(image_b64) + with Image.open(BytesIO(data)) as img: + return img.size + except Exception: # pragma: no cover - defensive fallback + return computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT + + +def _extract_user_instruction(messages: list[dict[str, Any]]) -> str: + for message in messages: + if not isinstance(message, dict): + continue + if message.get("type") == "message" and message.get("role") == "user": + content = message.get("content") or [] + if isinstance(content, list): + for block in content: + if isinstance(block, dict) and block.get("type") in {"text", "input_text"}: + text = block.get("text") + if isinstance(text, str) and text.strip(): + return text.strip() + return "" + + +def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None: + for message in reversed(messages): + if not isinstance(message, dict): + continue + msg_type = message.get("type") + if msg_type == "computer_call_output": + output = message.get("output") or {} + if isinstance(output, dict): + image_url = output.get("image_url") + if isinstance(image_url, str) and image_url.startswith("data:image/"): + return image_url.split(",", 1)[1] + if msg_type == "message" and message.get("role") == "user": + content = message.get("content") + if isinstance(content, list): + for item in reversed(content): + if isinstance(item, dict) and item.get("type") == "image_url": + url_obj = item.get("image_url") + if isinstance(url_obj, dict): + url = url_obj.get("url") + if isinstance(url, str) and url.startswith("data:image/"): + return url.split(",", 1)[1] + return None + +# Adapter dispatch +_ADAPTER_REGISTRY: Dict[str, str] = { + "z-ai/glm-4.5v": "hud.agents.glm45v:Glm45vAgent", +} - for payload in image_payloads: - converted.append( - { - "role": "user", - "content": [self._image_item(payload, "user")], - } - ) - return converted +def _load_adapter(path: str) -> Type[MCPAgent]: + module_name, class_name = path.split(":", 1) + module = import_module(module_name) + return getattr(module, class_name) - @staticmethod - def _parse_arguments(arguments: Any) -> dict[str, Any]: - if isinstance(arguments, dict): - return arguments - if isinstance(arguments, str) and arguments: - try: - parsed = json.loads(arguments) - if isinstance(parsed, dict): - return parsed - except json.JSONDecodeError: - logger.debug("Failed to decode arguments: %s", arguments) - return {} - - def _to_mcp_tool_call(self, payload: dict[str, Any]) -> MCPToolCall: - tool_name = payload.get("name") or payload.get("function", {}).get("name") or "" - call_id = payload.get("id") or payload.get("tool_call_id") or payload.get("call_id") - if not call_id: - call_id = tool_name - arguments = payload.get("arguments") - if not arguments and "function" in payload: - arguments = payload["function"].get("arguments") - parsed_arguments = self._parse_arguments(arguments) - return MCPToolCall(id=call_id, name=tool_name, arguments=parsed_arguments) - - def _coerce_response_payload(self, response: Any) -> dict[str, Any]: - """Convert OpenRouter SDK return types into a plain dictionary.""" - - if response is None: - return {} - - if isinstance(response, dict): - return response - - for attr in ("model_dump", "dict", "to_dict"): - if hasattr(response, attr): - try: - payload = getattr(response, attr)() - except Exception as exc: # pragma: no cover - defensive - logger.debug("Failed to read response via %s: %s", attr, exc) - else: - if isinstance(payload, dict): - return payload - - snapshot = getattr(response, "__dict__", None) - if isinstance(snapshot, dict): - return snapshot - - logger.error("Unexpected response carrier from OpenRouter: %r", response) - raise TypeError("Unexpected response type from OpenRouter") - - def _extract_response(self, response: Any) -> AgentResponse: - data = self._coerce_response_payload(response) - if not isinstance(data, dict): - raise TypeError("Unexpected response type from OpenRouter") - - output = data.get("output", []) - text_parts: list[str] = [] - tool_calls: list[MCPToolCall] = [] - reasoning_parts: list[str] = [] - - for item in output: - item_type = item.get("type") if isinstance(item, dict) else None - if item_type == "message": - contents = item.get("content", []) - if isinstance(contents, list): - for block in contents: - if not isinstance(block, dict): - continue - block_type = block.get("type") - if block_type in {"output_text", "text"}: - text = block.get("text") - if text: - text_parts.append(text) - elif block_type == "reasoning" and block.get("text"): - reasoning_parts.append(block["text"]) - for tc in item.get("tool_calls", []) or []: - if isinstance(tc, dict): - tool_calls.append(self._to_mcp_tool_call(tc)) - elif item_type in {"tool_call", "function_call"} and isinstance(item, dict): - tool_calls.append(self._to_mcp_tool_call(item)) - elif item_type == "reasoning" and isinstance(item, dict): - summary = item.get("summary") - if isinstance(summary, list): - for block in summary: - if isinstance(block, dict) and block.get("text"): - reasoning_parts.append(block["text"]) - elif isinstance(summary, str): - reasoning_parts.append(summary) - - merged_text = "\n".join(reasoning_parts + text_parts).strip() - status = data.get("status", "completed") - done = not tool_calls and status != "in_progress" - return AgentResponse( - content=merged_text, - tool_calls=tool_calls, - done=done, - raw=response, - ) - @instrument( - span_type="agent", - record_args=False, - record_result=True, - ) - async def get_response(self, messages: list[Any]) -> AgentResponse: - converted_messages = self._convert_messages(messages) - tools = self._convert_tools_for_responses(self.get_tool_schemas()) - - protected_keys = {"model", "input", "tools"} - extra = {k: v for k, v in self._responses_kwargs.items() if k not in protected_keys} - # If tools are provided and tool_choice isn't explicitly set, require tool use - if tools and "tool_choice" not in extra: - extra["tool_choice"] = "required" +class OpenRouterAgent: + """Dispatch wrapper that selects the correct OpenRouter adapter by model.""" + def __init__(self, *, model_name: str = "z-ai/glm-4.5v", **kwargs: Any) -> None: + normalized = self._normalize_model_name(model_name) try: - payload: dict[str, Any] = { - "model": self.model_name, - "input": converted_messages, - **extra, - } - if tools: - payload["tools"] = tools - - response = await self.oai.responses.create(**payload) - except Exception as exc: - error_content = f"Error getting response {exc}" - logger.exception("OpenRouter call failed: %s", exc) - return AgentResponse( - content=error_content, - tool_calls=[], - done=True, - isError=True, - raw=None, - ) + adapter_path = _ADAPTER_REGISTRY[normalized] + except KeyError as exc: # pragma: no cover - defensive + raise ValueError(f"Unsupported OpenRouter model: {model_name}") from exc + + adapter_cls = _load_adapter(adapter_path) + canonical_model = f"openrouter/{normalized}" + self.model_name = canonical_model + self._adapter = adapter_cls(model_name=canonical_model, **kwargs) - return self._extract_response(response) + @staticmethod + def _normalize_model_name(raw_model: str | None) -> str: + if not raw_model: + raise ValueError("Model name must be provided for OpenRouterAgent") + key = raw_model.strip() + if key.startswith("openrouter/"): + key = key[len("openrouter/") :] + key = key.lower() + if key in _ADAPTER_REGISTRY: + return key + raise ValueError(f"Unknown OpenRouter model: {raw_model}") + + def __getattr__(self, item: str) -> Any: + return getattr(self._adapter, item) + + def __dir__(self) -> list[str]: + base_dir = set(super().__dir__()) + base_dir.update(self.__dict__.keys()) + base_dir.update(dir(self._adapter)) + return sorted(base_dir) + + +__all__ = [ + "OpenRouterAgent", + "_random_id", + "_make_reasoning_item", + "_make_output_text_item", + "_make_computer_call_item", + "_make_click_item", + "_make_double_click_item", + "_make_drag_item", + "_make_keypress_item", + "_make_type_item", + "_make_scroll_item", + "_make_wait_item", + "_make_screenshot_item", + "_make_failed_tool_call_items", + "_coerce_to_pixel_coordinates", + "_parse_coordinate_box", + "_coerce_box_to_pixels", + "_parse_json_action_string", + "_convert_json_action_to_items", + "_decode_image_dimensions", + "_extract_user_instruction", + "get_last_image_from_messages", +] diff --git a/hud/agents/tests/test_openrouter.py b/hud/agents/tests/test_openrouter.py index d3010e0d..7328586e 100644 --- a/hud/agents/tests/test_openrouter.py +++ b/hud/agents/tests/test_openrouter.py @@ -1,205 +1,94 @@ from __future__ import annotations import pytest -from unittest.mock import AsyncMock, MagicMock -import mcp.types as types +from types import SimpleNamespace +from typing import Any -from hud.agents.openrouter import OpenRouterAgent -from hud.settings import settings -from hud.types import MCPToolCall, MCPToolResult +def _import_agents(): + import mcp.types as types + from hud.agents.glm45v import Glm45vAgent + from hud.agents.openrouter import OpenRouterAgent + from hud.types import MCPToolResult + return Glm45vAgent, OpenRouterAgent, MCPToolResult, types -@pytest.fixture(autouse=True) -def disable_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: - """Disable HUD telemetry during unit tests.""" - monkeypatch.setattr(settings, "telemetry_enabled", False) - monkeypatch.setattr(settings, "api_key", None) +def test_openrouter_agent_defaults_to_glm45v() -> None: + Glm45vAgent, OpenRouterAgent, _, _ = _import_agents() + agent = OpenRouterAgent() + assert isinstance(agent._adapter, Glm45vAgent) + assert agent.model_name == "openrouter/z-ai/glm-4.5v" -class FakeResponse: - def __init__(self, payload: dict) -> None: - self._payload = payload +def test_openrouter_agent_normalizes_alias() -> None: + _, OpenRouterAgent, _, _ = _import_agents() + agent = OpenRouterAgent(model_name="Z-AI/GLM-4.5V") + assert agent.model_name == "openrouter/z-ai/glm-4.5v" - def model_dump(self) -> dict: - return self._payload - -@pytest.mark.asyncio -async def test_openrouter_agent_builds_cached_messages() -> None: - responses_create = AsyncMock( - return_value=FakeResponse({"output": [{"type": "message", "content": []}], "status": "completed"}) - ) - mock_client = MagicMock() - mock_client.responses.create = responses_create - - agent = OpenRouterAgent( - api_key="test-key", - openai_client=mock_client, - cache_control={"type": "ephemeral"}, - ) - agent._available_tools = [] # mimic initialized agent - - messages = [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, - {"role": "assistant", "content": "Previous reply"}, - ] - - await agent.get_response(messages) - - await_call = responses_create.await_args - assert await_call is not None - kwargs = await_call.kwargs - assert kwargs["model"] == agent.model_name - input_payload = kwargs["input"] - - system_block = input_payload[0]["content"][0] - user_block = input_payload[1]["content"][0] - assistant_block = input_payload[2]["content"][0] - - assert system_block["cache_control"] == {"type": "ephemeral"} - assert user_block["cache_control"] == {"type": "ephemeral"} - assert "cache_control" not in assistant_block +def test_openrouter_agent_rejects_unknown_model() -> None: + _, OpenRouterAgent, _, _ = _import_agents() + with pytest.raises(ValueError): + OpenRouterAgent(model_name="unknown/model") @pytest.mark.asyncio -async def test_openrouter_agent_parses_tool_calls() -> None: - responses_create = AsyncMock( - return_value=FakeResponse( - { - "output": [ - { - "type": "message", - "content": [{"type": "output_text", "text": "Calling tool"}], - "tool_calls": [ - { - "id": "call_1", - "function": {"name": "search", "arguments": "{\"query\": \"hud\"}"}, - } - ], - } - ], - "status": "requires_action", - } - ) - ) - mock_client = MagicMock() - mock_client.responses.create = responses_create +async def test_openrouter_agent_parses_tool_calls(monkeypatch: pytest.MonkeyPatch) -> None: + Glm45vAgent, OpenRouterAgent, MCPToolResult, types = _import_agents() + png_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO61uFYAAAAASUVORK5CYII=" - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - agent._available_tools = [] + async def fake_completion(*_: Any, **__: Any) -> Any: + message = SimpleNamespace(content=( + "I will click the button.\n" + "<|begin_of_box|>{\"type\": \"click\", \"start_box\": [100, 200]}<|end_of_box|>\n" + "Memory:[]" + ), reasoning_content=None) + choice = SimpleNamespace(message=message) + return SimpleNamespace(choices=[choice]) - result = await agent.get_response( - [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, - ] - ) + monkeypatch.setattr("hud.agents.glm45v.litellm.acompletion", fake_completion) - assert not result.done - assert result.tool_calls[0].name == "search" - assert result.tool_calls[0].arguments == {"query": "hud"} - - -@pytest.mark.asyncio -async def test_openrouter_agent_returns_text_response() -> None: - responses_create = AsyncMock( - return_value=FakeResponse( - { - "output": [ - { - "type": "message", - "content": [{"type": "output_text", "text": "Hi there"}], - } - ], - "status": "completed", - } - ) - ) - mock_client = MagicMock() - mock_client.responses.create = responses_create + agent = OpenRouterAgent(model_name="z-ai/glm-4.5v") - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - agent._available_tools = [] - - result = await agent.get_response( - [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, - ] - ) - - assert result.done - assert result.content == "Hi there" - assert result.tool_calls == [] - - -def test_openrouter_agent_sanitizes_fieldinfo_in_tools() -> None: - mock_client = MagicMock() - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - - from pydantic import Field - - tools = [ + messages: list[dict[str, Any]] = [ { - "type": "function", - "function": { - "name": "click", - "description": "Click an element", - "parameters": { - "type": "object", - "properties": { - "selector": Field(default="", description="CSS selector"), - }, - "required": ["selector"], - }, - }, - } - ] - - converted = agent._convert_tools_for_responses(tools) - selector_schema = converted[0]["parameters"]["properties"]["selector"] - assert isinstance(selector_schema, dict) - assert selector_schema.get("description") == "CSS selector" - - -def test_openrouter_agent_converts_image_blocks() -> None: - mock_client = MagicMock() - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - - content = [ + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "click the highlighted cell"}], + }, { - "type": "image", - "mimeType": "image/png", - "data": "dGVzdA==", - "detail": "high", - } + "type": "computer_call_output", + "call_id": "initial", + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{png_base64}", + }, + }, ] - message_blocks = agent._convert_messages([{"role": "user", "content": content}]) - image_block = message_blocks[0]["content"][0] - assert image_block["type"] == "input_image" - assert image_block["image_url"].startswith("data:image/png;base64,") - assert image_block["detail"] == "high" + response = await agent.get_response(list(messages)) + assert not response.done + assert response.tool_calls, "expected at least one tool call" -@pytest.mark.asyncio -async def test_format_tool_results_produces_function_call_output() -> None: - mock_client = MagicMock() - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + tool_call = response.tool_calls[0] + assert tool_call.name == "openai_computer" + assert tool_call.arguments["type"] == "click" + # coordinates are normalized from the 1x1 PNG back to pixel space -> 0/0 + assert tool_call.arguments["x"] == 0 + assert tool_call.arguments["y"] == 0 - tool_call = MCPToolCall(id="call-1", name="playwright", arguments={}) tool_result = MCPToolResult( content=[ - types.TextContent(type="text", text="navigation complete"), - types.ImageContent(type="image", data="dGVzdA==", mimeType="image/png"), + types.ImageContent(type="image", data=png_base64, mimeType="image/png"), + types.TextContent(type="text", text="button pressed"), ] ) - formatted = await agent.format_tool_results([tool_call], [tool_result]) + rendered = await agent.format_tool_results([tool_call], [tool_result]) - assert formatted[0]["type"] == "function_call_output" - assert formatted[0]["call_id"] == "call-1" - assert formatted[1]["role"] == "user" - assert formatted[1]["content"][0]["type"] == "input_image" + assert any(item.get("type") == "computer_call_output" for item in rendered) + assert any( + item.get("type") == "message" and item.get("role") == "user" + for item in rendered + ) diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 99771913..c1701f5c 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -894,7 +894,7 @@ def eval( [ {"name": "Claude 4 Sonnet", "value": "claude"}, {"name": "OpenAI Computer Use", "value": "openai"}, - {"name": "OpenRouter (Responses)", "value": "openrouter"}, + {"name": "OpenRouter", "value": "openrouter"}, {"name": "vLLM (Local Server)", "value": "vllm"}, {"name": "LiteLLM (Multi-provider)", "value": "litellm"}, ] diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 4900ba85..2b63222d 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -191,7 +191,7 @@ def build_agent( raise typer.Exit(1) from e return OpenRouterAgent( - model_name=model or "z-ai/glm-4.6", + model_name=model or "z-ai/glm-4.5v", allowed_tools=allowed_tools, verbose=verbose, ) @@ -568,7 +568,7 @@ def eval_command( "claude", "--agent", help=( - "Agent backend to use (claude, openai computer use, openrouter responses, " + "Agent backend to use (claude, openai computer use, openrouter, " "vllm for local server, or litellm)" ), ), diff --git a/hud/utils/agent_factories.py b/hud/utils/agent_factories.py index 37b9fa7a..f42248a4 100644 --- a/hud/utils/agent_factories.py +++ b/hud/utils/agent_factories.py @@ -88,13 +88,4 @@ def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent: def create_openrouter_agent(**kwargs: Any) -> OpenRouterAgent: """Factory for OpenRouterAgent with run_dataset compatibility.""" - api_key = kwargs.pop("api_key", None) - base_url = kwargs.pop("base_url", None) - cache_control = kwargs.pop("cache_control", True) - - return OpenRouterAgent( - api_key=api_key, - base_url=base_url, - cache_control=cache_control, - **kwargs, - ) + return OpenRouterAgent(**kwargs) diff --git a/pyproject.toml b/pyproject.toml index dc6c77b4..0cfc9dfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,7 @@ dev = [ "langchain", "langchain-openai", "langchain-anthropic", - "litellm>=1.55.0", + "litellm", # Jupyter support "ipykernel", "ipython <9", From 45fe54e9f59f1fe1cc1f1b792eefebe98b115c62 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Sat, 11 Oct 2025 23:13:49 +0200 Subject: [PATCH 3/4] eval run_full_dataset fix --- hud/cli/eval.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 2b63222d..7719e84d 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -479,6 +479,39 @@ async def run_full_dataset( if allowed_tools: agent_config["allowed_tools"] = allowed_tools + elif agent_type == "openrouter": + try: + # Use adapter class directly so it satisfies type[MCPAgent] + from hud.agents.openrouter import ( + OpenRouterAgent, + _ADAPTER_REGISTRY, + _load_adapter, + ) + except ImportError as e: + hud_console.error( + "OpenRouter agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Normalize model and resolve adapter + raw_model = model or "z-ai/glm-4.5v" + try: + normalized = OpenRouterAgent._normalize_model_name(raw_model) + adapter_path = _ADAPTER_REGISTRY[normalized] + except Exception as e: + hud_console.error(f"Unsupported OpenRouter model: {raw_model}") + raise typer.Exit(1) from e + + adapter_cls = _load_adapter(adapter_path) + agent_class = adapter_cls + agent_config = { + "model_name": f"openrouter/{normalized}", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + else: try: from hud.agents import ClaudeAgent From beb181618fc8a6fcbdf892ba3fdd4a2b28ddaf93 Mon Sep 17 00:00:00 2001 From: ilya <95108691+shfunc@users.noreply.github.com> Date: Mon, 13 Oct 2025 09:07:25 +0200 Subject: [PATCH 4/4] Update pyproject.toml, litellm version fix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0cfc9dfa..dc6c77b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,7 @@ dev = [ "langchain", "langchain-openai", "langchain-anthropic", - "litellm", + "litellm>=1.55.0", # Jupyter support "ipykernel", "ipython <9",