From 5a08f83858743e77adbbe07ca83dcc978f1c74b8 Mon Sep 17 00:00:00 2001
From: shinbehavior <hagforall@proton.me>
Date: Thu, 9 Oct 2025 22:22:04 +0200
Subject: [PATCH 1/4] WIP

---
 examples/04_openrouter_quickstart.py |  47 +++
 examples/mcp_sum_server.py           |  23 ++
 hud/agents/__init__.py               |   2 +
 hud/agents/openrouter.py             | 592 +++++++++++++++++++++++++++
 hud/agents/tests/test_openrouter.py  | 205 ++++++++++
 hud/cli/__init__.py                  |   8 +-
 hud/cli/eval.py                      |  38 +-
 hud/utils/agent_factories.py         |  16 +
 8 files changed, 923 insertions(+), 8 deletions(-)
 create mode 100644 examples/04_openrouter_quickstart.py
 create mode 100644 examples/mcp_sum_server.py
 create mode 100644 hud/agents/openrouter.py
 create mode 100644 hud/agents/tests/test_openrouter.py
diff --git a/examples/04_openrouter_quickstart.py b/examples/04_openrouter_quickstart.py
new file mode 100644
index 00000000..2ac56044
--- /dev/null
+++ b/examples/04_openrouter_quickstart.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+from hud.agents.openrouter import OpenRouterAgent
+from hud.utils.hud_console import HUDConsole
+
+
+async def main() -> None:
+    """Run the inline FastMCP sum task once through OpenRouterAgent and print the result."""
+    import sys  # local import keeps the example's top-level imports untouched
+    hud_console = HUDConsole()
+
+    # Inline FastMCP sum task (no external JSON needed)
+    server_path = Path(__file__).parent / "mcp_sum_server.py"
+    task = {
+        "id": "sum-demo",
+        "prompt": "Call the `sum` tool to add 7 and 5, then reply with the total in natural language.",
+        "mcp_config": {
+            "local": {
+                "command": sys.executable,  # same interpreter/venv as this script, not PATH's
+                "args": [str(server_path)],
+            }
+        },
+        "agent_config": {
+            "allowed_tools": ["sum"],
+            "system_prompt": (
+                "You are a concise math assistant. Always call the `sum` tool when asked to add "
+                "numbers, wait for the result, then explain the answer in one sentence."
+            ),
+        },
+    }
+
+    # Instantiate the OpenRouter agent (uses OPENROUTER_API_KEY from env)
+    agent = OpenRouterAgent(model_name="z-ai/glm-4.5v", verbose=True)
+    hud_console.info("Running task with OpenRouter agent...")
+    result = await agent.run(task, max_steps=3)
+    hud_console.info("\nFinal content:")
+    hud_console.info(result.content or "<empty>")
+    hud_console.success(f"Reward: {result.reward}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+
diff --git a/examples/mcp_sum_server.py b/examples/mcp_sum_server.py
new file mode 100644
index 00000000..7c26d123
--- /dev/null
+++ b/examples/mcp_sum_server.py
@@ -0,0 +1,23 @@
+"""FastMCP server exposing a simple sum tool.
+
+Run with: `python examples/mcp_sum_server.py`.
+"""
+
+from __future__ import annotations
+
+from fastmcp import FastMCP
+
+
+server = FastMCP("SumServer")
+
+
+@server.tool()
+def sum(a: int, b: int) -> dict[str, int]:  # shadows builtin sum; the example task allows a tool named "sum"
+    """Return the sum of two integers."""
+    return {"result": a + b}
+
+
+if __name__ == "__main__":
+    server.run()
+
+
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
index 7470adb3..55a531ca 100644
--- a/hud/agents/__init__.py
+++ b/hud/agents/__init__.py
@@ -4,10 +4,12 @@
 from .claude import ClaudeAgent
 from .openai import OperatorAgent
 from .openai_chat_generic import GenericOpenAIChatAgent
+from .openrouter import OpenRouterAgent
 
 __all__ = [
     "ClaudeAgent",
     "GenericOpenAIChatAgent",
     "MCPAgent",
     "OperatorAgent",
+    "OpenRouterAgent",
 ]
diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py
new file mode 100644
index 00000000..4306a386
--- /dev/null
+++ b/hud/agents/openrouter.py
@@ -0,0 +1,592 @@
+"""OpenRouter agent that uses the Responses API with prompt caching."""
+
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from typing import Any, Iterable
+
+import mcp.types as types
+from openai import AsyncOpenAI
+
+from hud import instrument
+from hud.settings import settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+
+from .openai_chat_generic import GenericOpenAIChatAgent
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_BASE_URL = "https://openrouter.ai/api/alpha"
+_DEFAULT_HEADERS = {
+    "HTTP-Referer": "https://hud.so",
+    "X-Title": "HUD Python SDK",
+    "Accept": "application/json",
+}
+
+_DEFAULT_COMPLETION_KWARGS: dict[str, Any] = {
+    "temperature": 0.1,
+    "max_output_tokens": 1024,
+}
+
+
+class OpenRouterAgent(GenericOpenAIChatAgent):
+    """MCP-enabled agent that talks to OpenRouter through the Responses API."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        model_name: str = "z-ai/glm-4.5v",
+        default_headers: dict[str, str] | None = None,
+        cache_control: dict[str, Any] | bool | None = True,
+        cacheable_roles: Iterable[str] | None = None,
+        openai_client: AsyncOpenAI | None = None,
+        completion_kwargs: dict[str, Any] | None = None,
+        **agent_kwargs: Any,
+    ) -> None:
+        """Create the agent, building an OpenRouter client unless ``openai_client`` is given.
+
+        Raises:
+            ValueError: no API key is available and no client was injected.
+        """
+        api_key = api_key or settings.openrouter_api_key
+        if not api_key and openai_client is None:
+            raise ValueError(
+                "OpenRouter API key not found. Set OPENROUTER_API_KEY or pass api_key explicitly."
+            )
+
+        client = openai_client or AsyncOpenAI(
+            api_key=api_key,
+            base_url=base_url or _DEFAULT_BASE_URL,
+            # Caller-supplied headers win over the module defaults.
+            default_headers={**_DEFAULT_HEADERS, **(default_headers or {})},
+        )
+
+        super().__init__(
+            openai_client=client,
+            model_name=model_name,
+            completion_kwargs=completion_kwargs,
+            **agent_kwargs,
+        )
+
+        self._responses_kwargs = {  # later entries override earlier ones
+            "tool_choice": "auto",
+            **_DEFAULT_COMPLETION_KWARGS,
+            **dict(self.completion_kwargs),
+        }
+        self.completion_kwargs.clear()  # the merged copy above is what get_response sends
+
+        self._cache_control = self._normalize_cache_control(cache_control)
+        self._cacheable_roles = tuple(cacheable_roles or ("system", "user", "tool"))
+
+    @staticmethod
+    def _normalize_cache_control(
+        cache_control: dict[str, Any] | bool | str | None,
+    ) -> dict[str, Any] | None:
+        """Map the user-facing cache setting onto a cache_control dict, or None to disable."""
+        # Identity checks keep 0/1 and "" out of the bool/None branches.
+        if cache_control is False:
+            return None
+        if cache_control is None or cache_control is True:
+            return {"type": "ephemeral"}
+        if not isinstance(cache_control, dict):
+            return {"type": str(cache_control)}
+        return cache_control
+
+    def _should_cache(self, role: str) -> bool:  # caching enabled and role is cacheable
+        return not (self._cache_control is None or role not in self._cacheable_roles)
+
+    def _text_item(self, text: str, role: str) -> dict[str, Any]:
+        """Wrap *text* as an ``input_text`` block, tagging it for caching when *role* allows."""
+        if not self._should_cache(role):
+            return {"type": "input_text", "text": text}
+        return {"type": "input_text", "text": text, "cache_control": self._cache_control}
+
+    def _image_item(self, image_payload: Any, role: str) -> dict[str, Any]:
+        """Build an ``input_image`` block from a dict payload or a URL / data-URI string."""
+        url: str | None = None
+        detail = None
+        if isinstance(image_payload, dict):
+            # Standard OpenAI-style wrapper: {"image_url": {"url": ..., "detail": ...}}
+            if "image_url" in image_payload and isinstance(image_payload["image_url"], dict):
+                img = image_payload["image_url"]
+                url = img.get("url")
+                detail = img.get("detail") or image_payload.get("detail")
+            # Direct url / data uri
+            elif image_payload.get("url"):
+                url = image_payload.get("url")
+                detail = image_payload.get("detail")
+            # Raw base64 payload from computer/tool results
+            elif image_payload.get("data"):
+                mime = (
+                    image_payload.get("mimeType")
+                    or image_payload.get("mime_type")
+                    or "image/png"
+                )
+                data = image_payload.get("data")
+                if data:
+                    url = f"data:{mime};base64,{data}"
+                detail = image_payload.get("detail")
+            elif isinstance(image_payload.get("source"), dict):  # nested {"source": {...}} form
+                source = image_payload["source"]
+                data = source.get("data")
+                mime = source.get("media_type") or source.get("mime_type") or "image/png"
+                if data:
+                    url = f"data:{mime};base64,{data}"
+                detail = source.get("detail")
+        elif isinstance(image_payload, str):
+            url = image_payload
+        # "image_url" is only set when a URL was resolved; "detail" defaults to "auto".
+        item: dict[str, Any] = {"type": "input_image"}
+        if url:
+            item["image_url"] = url
+        item["detail"] = str(detail or "auto")
+        if self._should_cache(role):
+            item["cache_control"] = self._cache_control
+        return item
+
+    def _convert_message_content(self, role: str, content: Any) -> list[dict[str, Any]]:
+        """Turn message content (None, str, dict or list) into a list of input blocks."""
+        if content is None:
+            return []
+        blocks: list[dict[str, Any]] = []
+        if isinstance(content, str):
+            blocks.append(self._text_item(content, role))
+            return blocks
+        # A single dict is handled as a one-element list below.
+        if isinstance(content, dict):
+            content = [content]
+
+        if isinstance(content, list):
+            for entry in content:
+                if isinstance(entry, str):
+                    blocks.append(self._text_item(entry, role))
+                elif isinstance(entry, dict):
+                    entry_copy = dict(entry)
+                    entry_type = entry_copy.get("type")
+                    if entry_type in {"text", "input_text", None}:
+                        text = entry_copy.get("text") or ""
+                        blocks.append(self._text_item(text, role))
+                    elif entry_type in {"image_url", "input_image"}:
+                        payload = entry_copy.get("image_url", entry_copy.get("image")) or entry_copy
+                        blocks.append(self._image_item(payload, role))
+                    elif entry_type in {"image", "output_image", "rendered"}:
+                        blocks.append(self._image_item(entry_copy, role))
+                    elif entry_type == "tool_result":
+                        text = entry_copy.get("text", "")
+                        blocks.append(self._text_item(text, role))
+                    else:
+                        text_value = entry_copy.get("text") or json.dumps(entry_copy)
+                        blocks.append(self._text_item(text_value, role))
+                else:
+                    blocks.append(self._text_item(str(entry), role))
+            return blocks
+        # Any other content type is stringified into a single text block.
+        blocks.append(self._text_item(str(content), role))
+        return blocks
+
+    def _convert_messages(self, messages: list[Any]) -> list[dict[str, Any]]:
+        """Convert chat-style messages into the ``input`` item list sent by get_response."""
+        converted: list[dict[str, Any]] = []
+        for message in messages:
+            if not isinstance(message, dict):
+                logger.debug("Skipping non-dict message: %s", message)
+                continue
+            # Typed items without a role are forwarded unchanged.
+            if "type" in message and "role" not in message:
+                converted.append(message)
+                continue
+
+            role = message.get("role") or "user"
+            # Assistant tool calls are emitted as separate items after any text content.
+            if role == "assistant" and message.get("tool_calls"):
+                content_items = self._convert_message_content(role, message.get("content"))
+                if content_items:
+                    converted.append({"role": "assistant", "content": content_items})
+                for tool_call in message.get("tool_calls", []):
+                    converted.append(self._convert_tool_call(tool_call))
+                continue
+
+            if role == "tool":
+                converted.extend(self._convert_tool_message(message))
+                continue
+
+            payload: dict[str, Any] = {"role": role}
+            content_items = self._convert_message_content(role, message.get("content"))
+            if content_items:
+                payload["content"] = content_items
+            if message.get("name"):
+                payload["name"] = message["name"]
+            if message.get("metadata"):
+                payload["metadata"] = message["metadata"]
+            converted.append(payload)
+        return converted
+
+    @staticmethod
+    def _jsonify_schema(value: Any) -> Any:
+        """Recursively coerce *value* into JSON-serialisable data.
+
+        Handles plain containers, pydantic models and ``FieldInfo`` objects; anything
+        else falls back to its public ``__dict__`` or ``str(value)``.
+        """
+        from pydantic import BaseModel
+        from pydantic.fields import FieldInfo
+
+        if isinstance(value, (str, int, float, bool)) or value is None:
+            return value
+        if isinstance(value, dict):
+            return {str(k): OpenRouterAgent._jsonify_schema(v) for k, v in value.items()}
+        if isinstance(value, (list, tuple, set)):
+            return [OpenRouterAgent._jsonify_schema(v) for v in value]
+
+        # Objects json can already encode round-trip as-is; the rest are special-cased.
+        try:
+            return json.loads(json.dumps(value))
+        except Exception:
+            if isinstance(value, BaseModel):
+                return OpenRouterAgent._jsonify_schema(value.model_dump())
+            if isinstance(value, FieldInfo):
+                data: dict[str, Any] = {}
+                if value.annotation is not None:
+                    data["type"] = getattr(value.annotation, "__name__", str(value.annotation))
+                if value.description:
+                    data["description"] = value.description
+                if value.title:
+                    data["title"] = value.title
+                # Only a real static default is emitted: required fields and factory-backed
+                # fields carry pydantic's "undefined" sentinel, which must not leak into
+                # the schema as a string.
+                has_static_default = not value.is_required() and value.default_factory is None
+                if has_static_default and value.default is not None:
+                    data["default"] = OpenRouterAgent._jsonify_schema(value.default)
+                if value.json_schema_extra:
+                    extra = OpenRouterAgent._jsonify_schema(value.json_schema_extra)
+                    if isinstance(extra, dict):
+                        data.update(extra)
+                return data or str(value)
+            if hasattr(value, "model_dump"):
+                return OpenRouterAgent._jsonify_schema(value.model_dump())
+            if hasattr(value, "__dict__") and value.__dict__:
+                public = {k: v for k, v in value.__dict__.items() if not k.startswith("_")}
+                return OpenRouterAgent._jsonify_schema(public)
+            return str(value)
+
+    @staticmethod
+    def _convert_tools_for_responses(tools: list[dict] | None) -> list[dict]:
+        """Flatten chat-completions tool schemas into the Responses API tool layout.
+
+        Entries shaped ``{"type": "function", "function": {...}}`` are flattened; any other
+        dict goes through ``_jsonify_schema``; non-dicts and unnamed functions are dropped.
+        """
+        result: list[dict] = []
+        for spec in tools or []:
+            if not isinstance(spec, dict):
+                continue
+
+            fn = spec.get("function")
+            if spec.get("type") != "function" or not isinstance(fn, dict):
+                result.append(OpenRouterAgent._jsonify_schema(spec))
+                continue
+
+            tool_name = fn.get("name")
+            if not (isinstance(tool_name, str) and tool_name):
+                logger.debug("Skipping tool with missing name: %s", spec)
+                continue
+
+            result.append(
+                {
+                    "type": "function",
+                    "name": tool_name,
+                    "description": str(fn.get("description", "") or ""),
+                    "parameters": OpenRouterAgent._jsonify_schema(fn.get("parameters", {})),
+                }
+            )
+
+        return result
+
+    def _convert_tool_call(self, tool_call: dict[str, Any]) -> dict[str, Any]:
+        """Convert a chat-style tool call into a Responses API ``function_call`` input item.
+
+        ``call_id`` is emitted next to ``id`` because ``function_call_output`` items refer
+        back to their call through ``call_id``. Non-dict input yields an empty dict.
+        """
+        if not isinstance(tool_call, dict):
+            return {}
+
+        function = tool_call.get("function") or {}
+        name = function.get("name") or tool_call.get("name") or "tool_call"
+        raw_arguments = function.get("arguments")
+        if raw_arguments is None:
+            arguments = "{}"
+        elif isinstance(raw_arguments, str):
+            try:
+                arguments = json.dumps(self._jsonify_schema(json.loads(raw_arguments)))
+            except json.JSONDecodeError:  # malformed JSON is forwarded untouched
+                arguments = raw_arguments
+        else:
+            arguments = json.dumps(self._jsonify_schema(raw_arguments))
+
+        call_id = (
+            tool_call.get("id")
+            or function.get("id")
+            or function.get("call_id")
+            or f"call_{uuid.uuid4().hex}"
+        )
+        return {
+            "type": "function_call",
+            "id": call_id,
+            "call_id": call_id,
+            "name": name,
+            "arguments": arguments or "{}",
+        }
+
+    def _convert_tool_message(self, message: dict[str, Any]) -> list[dict[str, Any]]:
+        """Convert a ``tool`` role message into a function_call_output item plus image messages."""
+        entries: list[dict[str, Any]] = []
+        call_id = message.get("tool_call_id") or message.get("id") or f"call_{uuid.uuid4().hex}"
+        text_parts: list[str] = []
+        image_payloads: list[Any] = []
+
+        content = message.get("content")
+        if isinstance(content, list):
+            for item in content:
+                if isinstance(item, dict):
+                    item_type = item.get("type")
+                    if item_type in {"text", "input_text"} and item.get("text"):
+                        text_parts.append(str(item.get("text")))
+                    elif item_type in {"image", "input_image", "image_url", "output_image", "rendered"}:
+                        image_payloads.append(item)
+                elif isinstance(item, str):
+                    text_parts.append(item)
+        elif isinstance(content, str):
+            text_parts.append(content)
+        # structuredContent is only serialised when no text parts were collected.
+        structured = message.get("structuredContent")
+        if structured and not text_parts:
+            try:
+                text_parts.append(json.dumps(structured))
+            except Exception:
+                text_parts.append(str(structured))
+
+        output_text = "\n".join(part for part in text_parts if part) or ""
+
+        entries.append(
+            {
+                "type": "function_call_output",
+                "id": message.get("id") or call_id,
+                "call_id": call_id,
+                "output": output_text,
+            }
+        )
+        # Images are appended as separate user messages after the output item.
+        for payload in image_payloads:
+            entries.append(
+                {
+                    "role": "user",
+                    "content": [self._image_item(payload, "user")],
+                }
+            )
+
+        return entries
+
+    async def format_tool_results(
+        self,
+        tool_calls: list[MCPToolCall],
+        tool_results: list[MCPToolResult],
+    ) -> list[dict[str, Any]]:
+        """Pair calls with results positionally and emit function_call_output items."""
+        converted: list[dict[str, Any]] = []
+        for call, result in zip(tool_calls, tool_results, strict=False):
+            call_id = call.id or call.name or f"call_{uuid.uuid4().hex}"
+
+            text_parts: list[str] = []
+            image_payloads: list[Any] = []
+
+            for item in result.content or []:
+                if isinstance(item, types.TextContent):
+                    text_parts.append(item.text)
+                elif isinstance(item, types.ImageContent):
+                    image_payloads.append(
+                        {
+                            "mimeType": item.mimeType,
+                            "data": item.data,
+                            "detail": getattr(item, "detail", None),
+                        }
+                    )
+                elif isinstance(item, dict):
+                    if item.get("type") in {"text", "input_text"}:
+                        text_parts.append(str(item.get("text", "")))
+                    elif item.get("type") in {"image", "input_image", "image_url", "output_image", "rendered"}:
+                        image_payloads.append(item)
+                elif isinstance(item, str):
+                    text_parts.append(item)
+            # structuredContent is only serialised when the result carried no text.
+            if result.structuredContent and not text_parts:
+                try:
+                    text_parts.append(json.dumps(result.structuredContent))
+                except Exception:
+                    text_parts.append(str(result.structuredContent))
+
+            if getattr(result, "isError", False):
+                text_parts.append(getattr(result, "error", "Tool execution failed."))
+
+            output_text = "\n".join(part for part in text_parts if part) or ""
+
+            converted.append(
+                {
+                    "type": "function_call_output",
+                    "id": call_id,
+                    "call_id": call_id,
+                    "output": output_text,
+                }
+            )
+            # Images follow as separate user messages after the output item.
+            for payload in image_payloads:
+                converted.append(
+                    {
+                        "role": "user",
+                        "content": [self._image_item(payload, "user")],
+                    }
+                )
+
+        return converted
+
+    @staticmethod
+    def _parse_arguments(arguments: Any) -> dict[str, Any]:
+        """Return tool arguments as a dict, decoding JSON strings; anything else gives ``{}``."""
+        if isinstance(arguments, dict):
+            return arguments
+        decoded: Any = None
+        if isinstance(arguments, str) and arguments:
+            try:
+                decoded = json.loads(arguments)
+            except json.JSONDecodeError:
+                logger.debug("Failed to decode arguments: %s", arguments)
+        return decoded if isinstance(decoded, dict) else {}
+
+    def _to_mcp_tool_call(self, payload: dict[str, Any]) -> MCPToolCall:
+        """Build an MCPToolCall from a Responses ``function_call`` item or a chat-style call."""
+        function = payload.get("function")
+        function = function if isinstance(function, dict) else {}
+        tool_name = payload.get("name") or function.get("name") or ""
+        # Prefer call_id: function_call_output must echo it, while ``id`` names the item itself.
+        call_id = payload.get("call_id") or payload.get("tool_call_id") or payload.get("id")
+        arguments = payload.get("arguments") or function.get("arguments")
+        parsed = self._parse_arguments(arguments)
+        return MCPToolCall(id=call_id or tool_name, name=tool_name, arguments=parsed)
+
+    def _coerce_response_payload(self, response: Any) -> dict[str, Any]:
+        """Return *response* as a plain dict via its dump methods or its instance ``__dict__``."""
+
+        if response is None:
+            return {}
+
+        if isinstance(response, dict):
+            return response
+
+        for method_name in ("model_dump", "dict", "to_dict"):
+            if not hasattr(response, method_name):
+                continue
+            try:
+                candidate = getattr(response, method_name)()
+            except Exception as exc:  # pragma: no cover - defensive
+                logger.debug("Failed to read response via %s: %s", method_name, exc)
+                continue
+            if isinstance(candidate, dict):
+                return candidate
+
+        attrs = getattr(response, "__dict__", None)
+        if not isinstance(attrs, dict):
+            logger.error("Unexpected response carrier from OpenRouter: %r", response)
+            raise TypeError("Unexpected response type from OpenRouter")
+        return attrs
+
+    def _extract_response(self, response: Any) -> AgentResponse:
+        """Collect text, reasoning and tool calls from a Responses payload.
+
+        A missing or null ``output`` is treated as an empty list instead of raising.
+        """
+        data = self._coerce_response_payload(response)  # always a dict, otherwise it raises
+        output = data.get("output") or []
+        text_parts: list[str] = []
+        tool_calls: list[MCPToolCall] = []
+        reasoning_parts: list[str] = []
+
+        for item in output:
+            item_type = item.get("type") if isinstance(item, dict) else None
+            if item_type == "message":
+                contents = item.get("content", [])
+                if isinstance(contents, list):
+                    for block in contents:
+                        if not isinstance(block, dict):
+                            continue
+                        block_type = block.get("type")
+                        if block_type in {"output_text", "text"}:
+                            text = block.get("text")
+                            if text:
+                                text_parts.append(text)
+                        elif block_type == "reasoning" and block.get("text"):
+                            reasoning_parts.append(block["text"])
+                for tc in item.get("tool_calls", []) or []:
+                    if isinstance(tc, dict):
+                        tool_calls.append(self._to_mcp_tool_call(tc))
+            elif item_type in {"tool_call", "function_call"} and isinstance(item, dict):
+                tool_calls.append(self._to_mcp_tool_call(item))
+            elif item_type == "reasoning" and isinstance(item, dict):
+                summary = item.get("summary")
+                if isinstance(summary, list):
+                    for block in summary:
+                        if isinstance(block, dict) and block.get("text"):
+                            reasoning_parts.append(block["text"])
+                elif isinstance(summary, str):
+                    reasoning_parts.append(summary)
+        merged_text = "\n".join(reasoning_parts + text_parts).strip()  # reasoning comes first
+        status = data.get("status", "completed")
+        done = not tool_calls and status != "in_progress"
+        return AgentResponse(
+            content=merged_text,
+            tool_calls=tool_calls,
+            done=done,
+            raw=response,
+        )
+
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
+    async def get_response(self, messages: list[Any]) -> AgentResponse:
+        """Query the Responses endpoint; request failures are returned as an error response."""
+        converted_messages = self._convert_messages(messages)
+        tools = self._convert_tools_for_responses(self.get_tool_schemas())
+        protected_keys = {"model", "input", "tools"}
+        extra = {k: v for k, v in self._responses_kwargs.items() if k not in protected_keys}
+        # Fallback only: __init__ seeds tool_choice="auto", so this applies if that key is absent
+        if tools and "tool_choice" not in extra:
+            extra["tool_choice"] = "required"
+
+        try:
+            payload: dict[str, Any] = {
+                "model": self.model_name,
+                "input": converted_messages,
+                **extra,
+            }
+            if tools:
+                payload["tools"] = tools
+
+            response = await self.oai.responses.create(**payload)
+        except Exception as exc:
+            error_content = f"Error getting response {exc}"
+            logger.exception("OpenRouter call failed: %s", exc)
+            return AgentResponse(
+                content=error_content,
+                tool_calls=[],
+                done=True,
+                isError=True,
+                raw=None,
+            )
+
+        return self._extract_response(response)
diff --git a/hud/agents/tests/test_openrouter.py b/hud/agents/tests/test_openrouter.py
new file mode 100644
index 00000000..d3010e0d
--- /dev/null
+++ b/hud/agents/tests/test_openrouter.py
@@ -0,0 +1,205 @@
+from __future__ import annotations
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+import mcp.types as types
+
+from hud.agents.openrouter import OpenRouterAgent
+from hud.settings import settings
+from hud.types import MCPToolCall, MCPToolResult
+
+
+@pytest.fixture(autouse=True)
+def disable_telemetry(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Switch off telemetry and clear the HUD API key for every test in this module."""
+    for attr_name, value in (("telemetry_enabled", False), ("api_key", None)):
+        monkeypatch.setattr(settings, attr_name, value)
+
+
+class FakeResponse:  # test double: model_dump() returns the payload given at construction
+    def __init__(self, payload: dict) -> None:
+        self._payload = payload
+
+    def model_dump(self) -> dict:
+        return self._payload
+
+
+@pytest.mark.asyncio
+async def test_openrouter_agent_builds_cached_messages() -> None:
+    """System and user blocks carry cache_control; assistant blocks do not."""
+    responses_create = AsyncMock(
+        return_value=FakeResponse({"output": [{"type": "message", "content": []}], "status": "completed"})
+    )
+    mock_client = MagicMock()
+    mock_client.responses.create = responses_create
+
+    agent = OpenRouterAgent(
+        api_key="test-key",
+        openai_client=mock_client,
+        cache_control={"type": "ephemeral"},
+    )
+    agent._available_tools = []  # mimic initialized agent
+
+    messages = [
+        {"role": "system", "content": "You are helpful."},
+        {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
+        {"role": "assistant", "content": "Previous reply"},
+    ]
+
+    await agent.get_response(messages)
+    await_call = responses_create.await_args
+    assert await_call is not None
+    kwargs = await_call.kwargs
+    assert kwargs["model"] == agent.model_name
+    input_payload = kwargs["input"]
+
+    system_block = input_payload[0]["content"][0]
+    user_block = input_payload[1]["content"][0]
+    assistant_block = input_payload[2]["content"][0]
+
+    assert system_block["cache_control"] == {"type": "ephemeral"}
+    assert user_block["cache_control"] == {"type": "ephemeral"}
+    assert "cache_control" not in assistant_block
+
+
+@pytest.mark.asyncio
+async def test_openrouter_agent_parses_tool_calls() -> None:
+    """Tool calls nested in a message item are parsed and keep the response not-done."""
+    responses_create = AsyncMock(
+        return_value=FakeResponse(
+            {
+                "output": [
+                    {
+                        "type": "message",
+                        "content": [{"type": "output_text", "text": "Calling tool"}],
+                        "tool_calls": [
+                            {
+                                "id": "call_1",
+                                "function": {"name": "search", "arguments": "{\"query\": \"hud\"}"},
+                            }
+                        ],
+                    }
+                ],
+                "status": "requires_action",
+            }
+        )
+    )
+    mock_client = MagicMock()
+    mock_client.responses.create = responses_create
+    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
+    agent._available_tools = []
+
+    result = await agent.get_response(
+        [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
+        ]
+    )
+
+    assert not result.done
+    assert result.tool_calls[0].name == "search"
+    assert result.tool_calls[0].arguments == {"query": "hud"}
+
+
+@pytest.mark.asyncio
+async def test_openrouter_agent_returns_text_response() -> None:
+    """A completed text-only response is returned as done with no tool calls."""
+    responses_create = AsyncMock(
+        return_value=FakeResponse(
+            {
+                "output": [
+                    {
+                        "type": "message",
+                        "content": [{"type": "output_text", "text": "Hi there"}],
+                    }
+                ],
+                "status": "completed",
+            }
+        )
+    )
+    mock_client = MagicMock()
+    mock_client.responses.create = responses_create
+    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
+    agent._available_tools = []
+
+    result = await agent.get_response(
+        [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
+        ]
+    )
+
+    assert result.done
+    assert result.content == "Hi there"
+    assert result.tool_calls == []
+
+
+def test_openrouter_agent_sanitizes_fieldinfo_in_tools() -> None:
+    """A pydantic Field inside a tool schema is converted to a plain dict."""
+    mock_client = MagicMock()
+    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
+    from pydantic import Field
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "click",
+                "description": "Click an element",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "selector": Field(default="", description="CSS selector"),
+                    },
+                    "required": ["selector"],
+                },
+            },
+        }
+    ]
+
+    converted = agent._convert_tools_for_responses(tools)
+    selector_schema = converted[0]["parameters"]["properties"]["selector"]
+    assert isinstance(selector_schema, dict)
+    assert selector_schema.get("description") == "CSS selector"
+
+
+def test_openrouter_agent_converts_image_blocks() -> None:
+    """A raw base64 image block becomes an input_image item with a data URI."""
+    mock_client = MagicMock()
+    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
+    content = [
+        {
+            "type": "image",
+            "mimeType": "image/png",
+            "data": "dGVzdA==",
+            "detail": "high",
+        }
+    ]
+
+    message_blocks = agent._convert_messages([{"role": "user", "content": content}])
+    image_block = message_blocks[0]["content"][0]
+    assert image_block["type"] == "input_image"
+    assert image_block["image_url"].startswith("data:image/png;base64,")
+    assert image_block["detail"] == "high"
+
+
+@pytest.mark.asyncio
+async def test_format_tool_results_produces_function_call_output() -> None:
+    """Text goes into a function_call_output item; the image follows as a user message."""
+    mock_client = MagicMock()
+    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
+    tool_call = MCPToolCall(id="call-1", name="playwright", arguments={})
+    tool_result = MCPToolResult(
+        content=[
+            types.TextContent(type="text", text="navigation complete"),
+            types.ImageContent(type="image", data="dGVzdA==", mimeType="image/png"),
+        ]
+    )
+
+    formatted = await agent.format_tool_results([tool_call], [tool_result])
+
+    assert formatted[0]["type"] == "function_call_output"
+    assert formatted[0]["call_id"] == "call-1"
+    assert formatted[1]["role"] == "user"
+    assert formatted[1]["content"][0]["type"] == "input_image"
diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index 3708cf0e..99771913 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -777,7 +777,8 @@ def eval(
     agent: str | None = typer.Argument(
         None,
         help=(
-            "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively."  # noqa: E501
+            "Agent backend to use (claude, openai computer use, openrouter responses, "
+            "vllm, or litellm). If not provided, will prompt interactively."
         ),
     ),
     full: bool = typer.Option(
@@ -893,6 +894,7 @@ def eval(
             [
                 {"name": "Claude 4 Sonnet", "value": "claude"},
                 {"name": "OpenAI Computer Use", "value": "openai"},
+                {"name": "OpenRouter (Responses)", "value": "openrouter"},
                 {"name": "vLLM (Local Server)", "value": "vllm"},
                 {"name": "LiteLLM (Multi-provider)", "value": "litellm"},
             ]
@@ -901,7 +903,7 @@ def eval(
         agent = hud_console.select("Select an agent to use:", choices=choices, default=0)
 
     # Handle HUD model selection
-    if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]:
+    if agent and agent not in ["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"]:
         # Find remote model name
         model = agent
         if not vllm_base_url:
@@ -922,7 +924,7 @@ def eval(
         hud_console.info(f"Using HUD model: {model} (trained on {base_model})")
 
     # Validate agent choice
-    valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"]
+    valid_agents = ["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"]
     if agent not in valid_agents:
         hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}")
         raise typer.Exit(1)
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index e8afceac..4900ba85 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -113,7 +113,7 @@ def _build_vllm_config(
 
 
 def build_agent(
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
+    agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"],
     *,
     model: str | None = None,
     allowed_tools: list[str] | None = None,
@@ -180,6 +180,21 @@ def build_agent(
             allowed_tools=allowed_tools,
             verbose=verbose,
         )
+    elif agent_type == "openrouter":
+        try:
+            from hud.agents.openrouter import OpenRouterAgent
+        except ImportError as e:
+            hud_console.error(
+                "OpenRouter agent dependencies are not installed. "
+                "Please install with: pip install 'hud-python[agent]'"
+            )
+            raise typer.Exit(1) from e
+
+        return OpenRouterAgent(
+            model_name=model or "z-ai/glm-4.6",
+            allowed_tools=allowed_tools,
+            verbose=verbose,
+        )
 
     # Fallback Claude agent (Anthropic)
     try:
@@ -209,7 +224,7 @@ def build_agent(
 async def run_single_task(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
+    agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_steps: int = 10,
@@ -305,6 +320,16 @@ async def run_single_task(
         }
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
+    elif agent_type == "openrouter":
+        from hud.agents.openrouter import OpenRouterAgent
+
+        agent_class = OpenRouterAgent
+        agent_config = {
+            "model_name": model or "z-ai/glm-4.5v",
+            "verbose": verbose,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
     elif agent_type == "claude":
         from hud.agents import ClaudeAgent
 
@@ -353,7 +378,7 @@ async def run_single_task(
 async def run_full_dataset(
     source: str,
     *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
+    agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = "claude",
     model: str | None = None,
     allowed_tools: list[str] | None = None,
     max_concurrent: int = 30,
@@ -539,10 +564,13 @@ def eval_command(
         "--full",
         help="Run the entire dataset (omit for single-task debug mode)",
     ),
-    agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
+    agent: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = typer.Option(
         "claude",
         "--agent",
-        help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
+        help=(
+            "Agent backend to use (claude, openai computer use, openrouter responses, "
+            "vllm for local server, or litellm)"
+        ),
     ),
     model: str | None = typer.Option(
         None,
diff --git a/hud/utils/agent_factories.py b/hud/utils/agent_factories.py
index e15cb240..37b9fa7a 100644
--- a/hud/utils/agent_factories.py
+++ b/hud/utils/agent_factories.py
@@ -8,6 +8,7 @@
 
 from hud.agents.grounded_openai import GroundedOpenAIChatAgent
 from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.agents.openrouter import OpenRouterAgent
 from hud.tools.grounding import GrounderConfig
 
 
@@ -82,3 +83,18 @@ def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
     return GroundedOpenAIChatAgent(
         openai_client=openai_client, grounder_config=grounder_config, **kwargs
     )
+
+
+def create_openrouter_agent(**kwargs: Any) -> OpenRouterAgent:
+    """Factory for OpenRouterAgent with run_dataset compatibility."""
+
+    api_key = kwargs.pop("api_key", None)
+    base_url = kwargs.pop("base_url", None)
+    cache_control = kwargs.pop("cache_control", True)
+
+    return OpenRouterAgent(
+        api_key=api_key,
+        base_url=base_url,
+        cache_control=cache_control,
+        **kwargs,
+    )

From 8281d6bc948287d295d800f248ae870def754023 Mon Sep 17 00:00:00 2001
From: shinbehavior <hagforall@proton.me>
Date: Sat, 11 Oct 2025 23:06:03 +0200
Subject: [PATCH 2/4] litellm, glm-4.5v cua loop

---
 examples/04_openrouter_quickstart.py |  47 --
 hud/agents/glm45v.py                 | 820 +++++++++++++++++++++++
 hud/agents/openrouter.py             | 960 ++++++++++++---------------
 hud/agents/tests/test_openrouter.py  | 237 ++-----
 hud/cli/__init__.py                  |   2 +-
 hud/cli/eval.py                      |   4 +-
 hud/utils/agent_factories.py         |  11 +-
 pyproject.toml                       |   2 +-
 8 files changed, 1298 insertions(+), 785 deletions(-)
 delete mode 100644 examples/04_openrouter_quickstart.py
 create mode 100644 hud/agents/glm45v.py

diff --git a/examples/04_openrouter_quickstart.py b/examples/04_openrouter_quickstart.py
deleted file mode 100644
index 2ac56044..00000000
--- a/examples/04_openrouter_quickstart.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-from pathlib import Path
-
-from hud.agents.openrouter import OpenRouterAgent
-from hud.utils.hud_console import HUDConsole
-
-
-async def main() -> None:
-    hud_console = HUDConsole()
-
-    # Inline FastMCP sum task (no external JSON needed)
-    server_path = Path(__file__).parent / "mcp_sum_server.py"
-    task = {
-        "id": "sum-demo",
-        "prompt": "Call the `sum` tool to add 7 and 5, then reply with the total in natural language.",
-        "mcp_config": {
-            "local": {
-                "command": "python",
-                "args": [str(server_path)],
-            }
-        },
-        "agent_config": {
-            "allowed_tools": ["sum"],
-            "system_prompt": (
-                "You are a concise math assistant. Always call the `sum` tool when asked to add "
-                "numbers, wait for the result, then explain the answer in one sentence."
-            ),
-        },
-    }
-
-    # Instantiate the OpenRouter agent (uses OPENROUTER_API_KEY from env)
-    agent = OpenRouterAgent(model_name="z-ai/glm-4.5v", verbose=True)
-
-    hud_console.info("Running task with OpenRouter agent...")
-    result = await agent.run(task, max_steps=3)
-
-    hud_console.info("\nFinal content:")
-    hud_console.info(result.content or "<empty>")
-    hud_console.success(f"Reward: {result.reward}")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
-
-
diff --git a/hud/agents/glm45v.py b/hud/agents/glm45v.py
new file mode 100644
index 00000000..e7ff0fdc
--- /dev/null
+++ b/hud/agents/glm45v.py
@@ -0,0 +1,820 @@
+"""glm-4.5v computer-use agent backed by litellm + openrouter."""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import Any, ClassVar
+
+import litellm
+import mcp.types as types
+from litellm.types.utils import ModelResponse
+
+from hud.agents.base import MCPAgent
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from hud import instrument
+from hud.agents.openrouter import (
+    _convert_json_action_to_items,
+    _decode_image_dimensions,
+    _extract_user_instruction,
+    _make_click_item,
+    _make_double_click_item,
+    _make_drag_item,
+    _make_failed_tool_call_items,
+    _make_keypress_item,
+    _make_output_text_item,
+    _make_reasoning_item,
+    _make_screenshot_item,
+    _make_scroll_item,
+    _make_type_item,
+    _make_wait_item,
+    _parse_json_action_string,
+    _random_id,
+    get_last_image_from_messages,
+)
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_SYSTEM_PROMPT = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. Do not ask for permission; act decisively to finish the task.
+2. Always ground actions in the latest screenshot and task instructions.
+3. Use the provided mouse/keyboard tools precisely (coordinates are 0-999).
+4. Keep memory concise—store only facts that matter for later steps.
+5. When the task is complete, reply with DONE() and include the final answer.
+6. If the task is impossible, reply with FAIL() and explain briefly.
+""".strip()
+
+
+GLM_ACTION_SPACE = """
+### {left,right,middle}_click
+
+Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')`
+{
+    'name': ['left_click', 'right_click', 'middle_click'],
+    'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being clicked.'
+            }
+        },
+        'required': ['start_box']
+    }
+}
+
+### hover
+
+Call rule: `hover(start_box='[x,y]', element_info='')`
+{
+    'name': 'hover',
+    'description': 'Move the mouse pointer to the specified coordinates without performing any click action.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being hovered over.'
+            }
+        },
+        'required': ['start_box']
+    }
+}
+
+### left_double_click
+
+Call rule: `left_double_click(start_box='[x,y]', element_info='')`
+{
+    'name': 'left_double_click',
+    'description': 'Perform a left mouse double-click at the specified coordinates on the screen.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being double-clicked.'
+            }
+        },
+        'required': ['start_box']
+    }
+}
+
+### left_drag
+
+Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')`
+{
+    'name': 'left_drag',
+    'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.'
+            },
+            'end_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being dragged.'
+            }
+        },
+        'required': ['start_box', 'end_box']
+    }
+}
+
+### key
+
+Call rule: `key(keys='')`
+{
+    'name': 'key',
+    'description': 'Simulate pressing a single key or combination of keys on the keyboard.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'keys': {
+                'type': 'string',
+                'description': "The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab')."
+            }
+        },
+        'required': ['keys']
+    }
+}
+
+### type
+
+Call rule: `type(content='')`
+{
+    'name': 'type',
+    'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'content': {
+                'type': 'string',
+                'description': 'The text content to be typed into the active text field.'
+            }
+        },
+        'required': ['content']
+    }
+}
+
+### scroll
+
+Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')`
+{
+    'name': 'scroll',
+    'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.',
+    'parameters': {
+        'type': 'object',
+        'properties': {
+            'start_box': {
+                'type': 'array',
+                'items': {
+                    'type': 'integer'
+                },
+                'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.'
+            },
+            'direction': {
+                'type': 'string',
+                'enum': ['down', 'up'],
+                'description': "The direction to scroll: 'down' or 'up'."
+            },
+            'step': {
+                'type': 'integer',
+                'default': 5,
+                'description': 'Number of wheel steps to scroll, default is 5.'
+            },
+            'element_info': {
+                'type': 'string',
+                'description': 'Optional text description of the UI element being scrolled.'
+            }
+        },
+        'required': ['start_box', 'direction']
+    }
+}
+
+### WAIT
+
+Call rule: `WAIT()`
+{
+    'name': 'WAIT',
+    'description': 'Wait for 5 seconds before proceeding to the next action.',
+    'parameters': {
+        'type': 'object',
+        'properties': {},
+        'required': []
+    }
+}
+
+### DONE
+
+Call rule: `DONE()`
+{
+    'name': 'DONE',
+    'description': 'Indicate that the current task has been completed successfully and no further actions are needed.',
+    'parameters': {
+        'type': 'object',
+        'properties': {},
+        'required': []
+    }
+}
+
+### FAIL
+
+Call rule: `FAIL()`
+{
+    'name': 'FAIL',
+    'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.',
+    'parameters': {
+        'type': 'object',
+        'properties': {},
+        'required': []
+    }
+}"""
+
+
+
+def convert_responses_items_to_glm45v_pc_prompt(
+    messages: list[dict[str, Any]],
+    task: str,
+    memory: str = "[]",
+) -> list[dict[str, Any]]:
+    """Build the GLM-4.5V user content (text/image blocks) from Responses-style items.
+
+    Args:
+        messages: Conversation items; only dicts typed ``reasoning``, ``message``,
+            ``computer_call`` or ``computer_call_output`` feed the step history.
+        task: Task text placed under ``# Task:``.
+        memory: Memory string echoed back to the model under ``Memory:``.
+
+    Returns:
+        Chat-completions style ``text`` / ``image_url`` blocks ending with the
+        ``Current Screenshot:`` marker; the current screenshot is not appended here.
+    """
+    head_text = (
+        "You are a GUI Agent, and your primary task is to respond accurately to user"
+        " requests or questions. In addition to directly answering the user's queries,"
+        " you can also use tools or perform GUI operations directly until you fulfill"
+        " the user's request or provide a correct answer. You should carefully read and"
+        " understand the images and questions provided by the user, and engage in"
+        " thinking and reflection when appropriate. The coordinates involved are all"
+        " represented in thousandths (0-999)."
+        "\n\n# Task:\n"
+        f"{task}\n\n# Task Platform\nUbuntu\n\n# Action Space\n{GLM_ACTION_SPACE}\n\n"
+        "# Historical Actions and Current Memory\nHistory:"
+    )
+
+    tail_text = (
+        "\nMemory:\n"
+        f"{memory}\n"
+        "# Output Format\nPlain text explanation with action(param='...')\n"
+        "Memory:\n[{\"key\": \"value\"}, ...]\n\n# Some Additional Notes\n"
+        "- I'll give you the most recent 4 history screenshots(shrunked to 50%*50%) along with the historical action steps.\n"
+        "- You should put the key information you *have to remember* in a seperated memory part and I'll give it to you in the next round."
+        " The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory."
+        " Even if you don't need to remember anything, you should also output an empty list.\n"
+        "- If elevated privileges are needed, credentials are referenced as <OS_PASSWORD>.\n"
+        "- For any mail account interactions, credentials are referenced as <MAIL_PASSWORD>.\n\n"
+        "Current Screenshot:\n"
+    )
+
+    # Fold the flat item stream into steps; a computer_call_output closes a step.
+    history: list[dict[str, Any]] = []
+    history_images: list[str] = []
+    current_step: list[dict[str, Any]] = []
+    for message in messages:
+        if not isinstance(message, dict):
+            continue
+        msg_type = message.get("type")
+        if msg_type in {"reasoning", "message", "computer_call", "computer_call_output"}:
+            current_step.append(message)
+        if msg_type != "computer_call_output" or not current_step:
+            continue
+
+        bot_thought = ""
+        action_text = ""
+        for item in current_step:
+            if item.get("type") == "message" and item.get("role") == "assistant":
+                blocks = item.get("content") or []
+                for block in blocks if isinstance(blocks, list) else ():
+                    if isinstance(block, dict) and block.get("type") == "output_text":
+                        bot_thought = block.get("text", "")
+                        break
+            if item.get("type") == "computer_call":
+                action_text = json.dumps(item.get("action", {}))
+        history.append(
+            {"step_num": len(history) + 1, "bot_thought": bot_thought, "action_text": action_text}
+        )
+
+        output = message.get("output") or {}
+        if isinstance(output, dict) and output.get("type") == "input_image":
+            url = output.get("image_url")
+            if isinstance(url, str):
+                history_images.append(url)
+        current_step = []
+
+    content: list[dict[str, Any]] = []
+    current_text = head_text
+    total_steps = len(history)
+    image_tail = min(4, len(history_images))
+    # Only the most recent screenshots are inlined, so take them from the END of
+    # the list: indexing from the front paired the last steps with the OLDEST
+    # screenshots as soon as a run produced more than four of them.
+    recent_images = history_images[-image_tail:] if image_tail else []
+    for idx, step in enumerate(history):
+        step_no = step["step_num"]
+        summary = f" Thought: {step['bot_thought']}\nAction: {step['action_text']}"
+        if idx < total_steps - image_tail:
+            current_text += f"\nstep {step_no}: Screenshot:(Omitted in context.){summary}"
+            continue
+        current_text += f"\nstep {step_no}: Screenshot:"
+        content.append({"type": "text", "text": current_text})
+        image_idx = idx - (total_steps - image_tail)
+        if 0 <= image_idx < len(recent_images):
+            content.append({"type": "image_url", "image_url": {"url": recent_images[image_idx]}})
+        current_text = summary
+
+    current_text += tail_text
+    content.append({"type": "text", "text": current_text})
+    return content
+
+
+def convert_glm_completion_to_responses_items(
+    response: ModelResponse,
+    image_width: int,
+    image_height: int,
+    parsed_response: dict[str, str] | None = None,
+) -> list[dict[str, Any]]:
+    """Translate a GLM chat completion into Responses-style items.
+
+    Emits, in order: a reasoning item (when the message carries
+    ``reasoning_content``), an output-text item holding the explanation with the
+    action call and trailing ``Memory: [...]`` section removed, and the item(s)
+    for the parsed action.
+
+    Args:
+        response: Completion to convert; only ``choices[0].message`` is read.
+        image_width: Screenshot width in pixels, used to scale 0-999 x coordinates.
+        image_height: Screenshot height in pixels, used to scale 0-999 y coordinates.
+        parsed_response: Optional pre-parsed ``parse_glm_response`` result; when
+            omitted (or empty) the message content is parsed here.
+
+    Returns:
+        The generated items. Empty when the response has no choices or message;
+        no action item is added for unrecognised actions (including ``DONE()``
+        and ``FAIL()``) or when a recognised action lacks its required arguments.
+    """
+    items: list[dict[str, Any]] = []
+
+    if not getattr(response, "choices", None):
+        return items
+
+    message = getattr(response.choices[0], "message", None)
+    if not message:
+        return items
+
+    content = getattr(message, "content", "") or ""
+    reasoning_content = getattr(message, "reasoning_content", None)
+    if reasoning_content:
+        items.append(_make_reasoning_item(str(reasoning_content)))
+
+    parsed = parsed_response or parse_glm_response(content)
+    action = parsed.get("action", "")
+    action_text = parsed.get("action_text", "")
+
+    if action_text:
+        # Keep only the free-text explanation: drop the action call and the memory tail.
+        clean_text = action_text
+        if action:
+            clean_text = clean_text.replace(action, "").strip()
+        clean_text = re.sub(r"Memory:\s*\[.*?\]\s*$", "", clean_text, flags=re.DOTALL).strip()
+        if clean_text:
+            items.append(_make_output_text_item(clean_text))
+
+    if not action:
+        return items
+
+    call_id = _random_id()
+
+    # If the helpers recognise the action as a JSON payload and convert it into
+    # items, that result wins and the call-syntax parsing below is skipped.
+    json_action = _parse_json_action_string(action)
+    if json_action:
+        json_entries = _convert_json_action_to_items(
+            json_action,
+            call_id=call_id,
+            image_width=image_width,
+            image_height=image_height,
+        )
+        if json_entries:
+            items.extend(json_entries)
+            return items
+
+    def scaled_point(name: str) -> Any:
+        """Return ``(x, y)`` pixels for ``<name>='[x,y]'`` (0-999 grid), else ``None``."""
+        found = re.search(rf"{name}='?\[(\d+),\s*(\d+)\]'?", action)
+        if not found:
+            return None
+        x, y = int(found.group(1)), int(found.group(2))
+        return int((x / 999.0) * image_width), int((y / 999.0) * image_height)
+
+    start = scaled_point("start_box")
+
+    if action.startswith("left_click"):
+        if start is not None:
+            items.append(_make_click_item(start[0], start[1], call_id=call_id))
+    elif action.startswith("right_click"):
+        if start is not None:
+            items.append(_make_click_item(start[0], start[1], button="right", call_id=call_id))
+    elif action.startswith("left_double_click"):
+        if start is not None:
+            items.append(_make_double_click_item(start[0], start[1], call_id=call_id))
+    elif action.startswith("left_drag"):
+        end = scaled_point("end_box")
+        if start is not None and end is not None:
+            path = [
+                {"x": start[0], "y": start[1]},
+                {"x": end[0], "y": end[1]},
+            ]
+            items.append(_make_drag_item(path, call_id=call_id))
+    elif action.startswith("key"):
+        key_match = re.search(r"keys='([^']+)'", action)
+        if key_match:
+            # "ctrl+c" -> ["ctrl", "c"]; a single key yields a one-element list.
+            items.append(_make_keypress_item(key_match.group(1).split("+"), call_id=call_id))
+    elif action.startswith("type"):
+        content_match = re.search(r"content='([^']*)'", action)
+        if content_match:
+            items.append(_make_type_item(content_match.group(1), call_id=call_id))
+    elif action.startswith("scroll"):
+        direction_match = re.search(r"direction='([^']+)'", action)
+        if start is not None and direction_match:
+            # Honour the model's `step` argument (the action space documents a
+            # default of 5) instead of always scrolling by a fixed 5.
+            step_match = re.search(r"[(,]\s*step='?(\d+)'?", action)
+            amount = int(step_match.group(1)) if step_match else 5
+            deltas = {
+                "up": (0, -amount),
+                "down": (0, amount),
+                "left": (-amount, 0),
+                "right": (amount, 0),
+            }
+            scroll_x, scroll_y = deltas.get(direction_match.group(1), (0, 0))
+            items.append(
+                _make_scroll_item(start[0], start[1], scroll_x, scroll_y, call_id=call_id)
+            )
+    elif action == "WAIT()":
+        items.append(_make_wait_item(call_id=call_id))
+
+    return items
+
+
+def parse_glm_response(response: str) -> dict[str, str]:
+    """Split a raw GLM reply into its ``action``, ``action_text`` and ``memory`` parts.
+
+    The action is the text inside ``<|begin_of_box|>...<|end_of_box|>`` when those
+    markers are present, otherwise the first ``name(...)`` call found (``""`` if
+    none). ``memory`` defaults to ``"[]"`` when the reply has no ``Memory:`` section.
+    """
+    boxed = re.search(r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>", response)
+    if boxed is not None:
+        action = boxed.group(1).strip()
+    else:
+        # No box markers: fall back to the first call-like token in the reply.
+        fallback = re.search(r"[\w_]+\([^)]*\)", response)
+        action = fallback.group(0) if fallback else ""
+
+    # Text after the first "Memory:" is the memory; text before it is the action text.
+    memory_hit = re.search(r"Memory:(.*?)$", response, re.DOTALL)
+    memory = "[]" if memory_hit is None else memory_hit.group(1).strip()
+
+    prefix_hit = re.search(r"^(.*?)Memory:", response, re.DOTALL)
+    action_text = response if prefix_hit is None else prefix_hit.group(1).strip()
+    for marker in ("<|begin_of_box|>", "<|end_of_box|>"):
+        action_text = action_text.replace(marker, "")
+
+    return {"action": action or "", "action_text": action_text, "memory": memory}
+
+
+
+
+
+
+class Glm45vAgent(MCPAgent):
+    """LiteLLM-backed GLM-4.5V agent that speaks MCP."""
+
+    metadata: ClassVar[dict[str, Any]] = {
+        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
+        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
+    }
+
+    required_tools: ClassVar[list[str]] = ["openai_computer"]
+
+    def __init__(
+        self,
+        *,
+        model_name: str = "z-ai/glm-4.5v",
+        completion_kwargs: dict[str, Any] | None = None,
+        system_prompt: str | None = None,
+        **agent_kwargs: Any,
+    ) -> None:
+        """Configure the agent's model id, completion options and system prompt.
+
+        ``model_name`` is prefixed with ``openrouter/`` unless it already has it;
+        ``completion_kwargs`` are merged into every ``litellm.acompletion`` call.
+        """
+        super().__init__(**agent_kwargs)
+        prefix = "openrouter/"
+        self.model_name = model_name if model_name.startswith(prefix) else f"{prefix}{model_name}"
+        self.completion_kwargs = completion_kwargs or {}
+
+        # Prompt order: whatever the base class left in self.system_prompt, then the
+        # built-in defaults, then the caller-supplied prompt; empty parts are skipped.
+        prompt_parts = [self.system_prompt, DEFAULT_SYSTEM_PROMPT, system_prompt]
+        self.system_prompt = "\n\n".join(part for part in prompt_parts if part)
+
+        self._memory = "[]"
+        self._last_instruction = ""
+        self._task_description = ""
+
+    async def get_system_messages(self) -> list[Any]:
+        return []  # no separate system messages; get_response builds the system turn itself
+
+    @instrument(span_type="agent", record_args=False)
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
+        """Convert MCP content blocks into user ``message`` items.
+
+        Text blocks are newline-joined into a single leading ``input_text``
+        message; every image block becomes its own ``image_url`` message carrying
+        a base64 data URL. Blocks of any other type are ignored.
+
+        Args:
+            blocks: Content blocks to convert.
+
+        Returns:
+            The text message (if any text was present) followed by the image
+            messages in their original order.
+        """
+
+        def as_user_message(part: dict[str, Any]) -> dict[str, Any]:
+            return {"type": "message", "role": "user", "content": [part]}
+
+        # Gather all text first so it can be emitted as one message ahead of the images.
+        texts = [block.text for block in blocks if isinstance(block, types.TextContent)]
+        formatted: list[dict[str, Any]] = []
+        if texts:
+            formatted.append(as_user_message({"type": "input_text", "text": "\n".join(texts)}))
+
+        for block in blocks:
+            if isinstance(block, types.TextContent) or not isinstance(block, types.ImageContent):
+                continue
+            # mimeType falls back to PNG when the block does not define it.
+            mime = getattr(block, "mimeType", "image/png")
+            url = f"data:{mime};base64,{block.data}"
+            formatted.append(as_user_message({"type": "image_url", "image_url": {"url": url}}))
+
+        return formatted
+
+    def _glm_tool_call_to_mcp(self, item: dict[str, Any]) -> MCPToolCall:
+        """Map an item's ``action`` onto an ``openai_computer`` MCP tool call.
+
+        The action ``type`` is always forwarded (``""`` when missing); coordinates,
+        scroll deltas, ``button``, ``keys``, ``text`` and ``path`` are copied only
+        when present. A random id is used when the item has no ``call_id``.
+        """
+        action = item.get("action") or {}
+        # Optional action fields, in the order they are copied into the arguments.
+        forwarded = ("x", "y", "scroll_x", "scroll_y", "button", "keys", "text", "path")
+
+        arguments: dict[str, Any] = {"type": action.get("type", "")}
+        arguments.update({key: action[key] for key in forwarded if key in action})
+
+        return MCPToolCall(
+            id=item.get("call_id") or _random_id(),
+            name="openai_computer",
+            arguments=arguments,
+        )
+
+    @instrument(span_type="agent", record_args=False)
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+        instruction = _extract_user_instruction(messages)
+        if instruction:
+            self._last_instruction = instruction  # type: ignore[attr-defined]
+            self._task_description = instruction
+        task_instruction = self._task_description or getattr(self, "_last_instruction", "")
+        # When the helper finds no image in `messages`, ask for a screenshot before the model.
+        screenshot_b64 = get_last_image_from_messages(messages)
+        if not screenshot_b64:
+            call_id = _random_id()
+            screenshot_call = _make_screenshot_item(call_id)
+            messages.append(screenshot_call)  # recorded in the caller's list, not a copy
+            logger.debug("glm45v requesting initial screenshot")
+            tool_call = MCPToolCall(
+                id=call_id,
+                name="openai_computer",
+                arguments={"type": "screenshot"},
+            )
+            return AgentResponse(
+                content="capturing initial screenshot",
+                tool_calls=[tool_call],
+                done=False,
+            )
+
+        self.console.debug(f"glm45v task instruction: {task_instruction}")
+        self.console.debug(f"glm45v memory (pre-step): {self._memory}")
+        # Build the prompt from history, task and memory, then append the screenshot as PNG.
+        prompt_content = convert_responses_items_to_glm45v_pc_prompt(
+            messages=messages,
+            task=task_instruction,
+            memory=self._memory,
+        )
+        prompt_content.append(
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}}
+        )
+
+        system_prompt = self.system_prompt or "You are a helpful GUI agent assistant."
+        litellm_messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": prompt_content},
+        ]
+        # completion_kwargs are applied last, so they can override "model" and "messages".
+        api_kwargs = {"model": self.model_name, "messages": litellm_messages}
+        api_kwargs.update(self.completion_kwargs)
+
+        try:
+            response = await litellm.acompletion(**api_kwargs)
+        except Exception as exc:  # pragma: no cover - network errors
+            logger.exception("glm45v completion failed: %s", exc)
+            return AgentResponse(
+                content=f"GLM-4.5V request failed: {exc}",
+                tool_calls=[],
+                done=True,
+                isError=True,
+            )
+        # NOTE(review): assumes the response contains at least one choice - confirm.
+        choice = response.choices[0]
+        message = getattr(choice, "message", None)
+        response_content = getattr(message, "content", "") if message else ""
+        parsed = parse_glm_response(response_content or "") if response_content else {
+            "memory": self._memory,
+        }
+        if parsed.get("memory"):  # otherwise the previously stored memory is kept
+            self._memory = parsed["memory"]
+        logger.debug("glm45v model content: %s", response_content)
+        trimmed = response_content[:400] if response_content else ""
+        self.console.debug(f"glm45v model content: {trimmed}")
+        self.console.debug(f"glm45v parsed response: {parsed}")
+
+        image_width, image_height = _decode_image_dimensions(screenshot_b64)
+        response_items = convert_glm_completion_to_responses_items(
+            response,
+            image_width=image_width,
+            image_height=image_height,
+            parsed_response=parsed,
+        )
+        # The new items are added to the caller's `messages` list in place.
+        messages.extend(response_items)
+
+        text_parts: list[str] = []
+        reasoning_parts: list[str] = []
+        tool_calls: list[MCPToolCall] = []
+        # Sort the items into assistant text, reasoning summaries and computer calls.
+        for item in response_items:
+            if not isinstance(item, dict):
+                continue
+            if item.get("type") == "message" and item.get("role") == "assistant":
+                for block in item.get("content", []) or []:
+                    if isinstance(block, dict) and block.get("type") == "output_text":
+                        text = block.get("text")
+                        if isinstance(text, str):
+                            text_parts.append(text)
+            elif item.get("type") == "reasoning":
+                summary = item.get("summary", [])
+                for block in summary:
+                    if isinstance(block, dict) and block.get("text"):
+                        reasoning_parts.append(block["text"])
+            elif item.get("type") == "computer_call":
+                tool_calls.append(self._glm_tool_call_to_mcp(item))
+
+        content_text = "\n".join(text_parts).strip()
+        reasoning_text = "\n".join(reasoning_parts).strip()
+
+        if not tool_calls:
+            self.console.info_log(
+                f"glm45v returned no tool calls. content='{content_text}' reasoning='{reasoning_text}'"
+            )
+            self.console.info_log(f"glm45v parsed response: {parsed}")
+        # A turn that produced no tool calls is reported as finished (done=True).
+        return AgentResponse(
+            content=content_text or None,
+            reasoning=reasoning_text or None,
+            tool_calls=tool_calls,
+            done=not tool_calls,
+            raw=response,
+        )
+
+    @instrument(span_type="agent", record_args=False)
+    async def format_tool_results(
+        self,
+        tool_calls: list[MCPToolCall],
+        tool_results: list[MCPToolResult],
+    ) -> list[dict[str, Any]]:
+        """Render executed tool calls and their results as conversation items.
+
+        Calls and results are paired by position; unpaired extras are ignored.
+
+        * An error result is rendered by ``_make_failed_tool_call_items``, using the
+          concatenated text content (or "Unknown error") as the error message.
+        * Otherwise the first image becomes a ``computer_call_output`` item and any
+          non-empty text becomes a single user message.
+        * A successful result with neither image nor text yields a "Tool executed" output.
+        """
+
+        def call_output(call_id: str, output: dict[str, Any]) -> dict[str, Any]:
+            return {
+                "type": "computer_call_output",
+                "call_id": call_id,
+                "output": output,
+            }
+
+        items: list[dict[str, Any]] = []
+        for call, result in zip(tool_calls, tool_results, strict=False):
+            arguments = call.arguments or {}
+            texts = [part.text for part in result.content if isinstance(part, types.TextContent)]
+
+            # Failures are reported via the shared helper and skip the success handling.
+            if result.isError:
+                items.extend(
+                    _make_failed_tool_call_items(
+                        tool_name=arguments.get("type", call.name),
+                        tool_kwargs=arguments,
+                        error_message="".join(texts) or "Unknown error",
+                        call_id=call.id,
+                    )
+                )
+                continue
+
+            # Only the first image of a result is forwarded.
+            image = next(
+                (part for part in result.content if isinstance(part, types.ImageContent)),
+                None,
+            )
+            if image is not None:
+                image_url = f"data:{image.mimeType};base64,{image.data}"
+                items.append(call_output(call.id, {"type": "input_image", "image_url": image_url}))
+
+            # Empty text blocks are dropped before joining.
+            non_empty_texts = [text for text in texts if text]
+            if non_empty_texts:
+                items.append(
+                    {
+                        "type": "message",
+                        "role": "user",
+                        "content": [{"type": "input_text", "text": "\n".join(non_empty_texts)}],
+                    }
+                )
+
+            if image is None and not non_empty_texts:
+                # Nothing to show: emit a placeholder output for this call id.
+                items.append(call_output(call.id, {"type": "input_text", "text": "Tool executed"}))
+
+        return items
+
+
+__all__ = ["Glm45vAgent"]
diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py
index 4306a386..c9445258 100644
--- a/hud/agents/openrouter.py
+++ b/hud/agents/openrouter.py
@@ -1,592 +1,452 @@
-"""OpenRouter agent that uses the Responses API with prompt caching."""
+"""OpenRouter agent facade plus shared tooling helpers."""
 
 from __future__ import annotations
 
+import base64
 import json
-import logging
+import re
 import uuid
-from typing import Any, Iterable
+from importlib import import_module
+from io import BytesIO
+from typing import Any, Dict, Type
 
-import mcp.types as types
-from openai import AsyncOpenAI
+from PIL import Image
 
-from hud import instrument
-from hud.settings import settings
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from hud.agents.base import MCPAgent
+from hud.tools.computer.settings import computer_settings
 
-from .openai_chat_generic import GenericOpenAIChatAgent
+# Shared helper utilities for computer-use adapters
+def _random_id() -> str:  # "call_" plus the first 8 hex digits of a random UUID4
+    return f"call_{uuid.uuid4().hex[:8]}"
 
-logger = logging.getLogger(__name__)
 
-_DEFAULT_BASE_URL = "https://openrouter.ai/api/alpha"
-_DEFAULT_HEADERS = {
-    "HTTP-Referer": "https://hud.so",
-    "X-Title": "HUD Python SDK",
-    "Accept": "application/json",
-}
+def _make_reasoning_item(reasoning: str) -> dict[str, Any]:
+    """Wrap ``reasoning`` as the single summary entry of a new ``reasoning`` item."""
+    summary_entry = {"type": "summary_text", "text": reasoning}
+    item: dict[str, Any] = {"id": _random_id(), "type": "reasoning"}
+    item["summary"] = [summary_entry]
+    return item
 
-_DEFAULT_COMPLETION_KWARGS: dict[str, Any] = {
-    "temperature": 0.1,
-    "max_output_tokens": 1024,
-}
 
+def _make_output_text_item(content: str) -> dict[str, Any]:
+    """Build a completed assistant ``message`` item whose only block is ``content``."""
+    text_block = {"type": "output_text", "text": content, "annotations": []}
+    item: dict[str, Any] = {"id": _random_id(), "type": "message", "role": "assistant"}
+    # Keys are inserted in the order: id, type, role, status, content.
+    item["status"] = "completed"
+    item["content"] = [text_block]
+    return item
 
-class OpenRouterAgent(GenericOpenAIChatAgent):
-    """MCP-enabled agent that talks to OpenRouter through the Responses API."""
-
-    def __init__(
-        self,
-        *,
-        api_key: str | None = None,
-        base_url: str | None = None,
-        model_name: str = "z-ai/glm-4.5v",
-        default_headers: dict[str, str] | None = None,
-        cache_control: dict[str, Any] | bool | None = True,
-        cacheable_roles: Iterable[str] | None = None,
-        openai_client: AsyncOpenAI | None = None,
-        completion_kwargs: dict[str, Any] | None = None,
-        **agent_kwargs: Any,
-    ) -> None:
-        api_key = api_key or settings.openrouter_api_key
-        if not api_key:
-            raise ValueError(
-                "OpenRouter API key not found. Set OPENROUTER_API_KEY or pass api_key explicitly."
-            )
 
-        base_url = base_url or _DEFAULT_BASE_URL
+def _make_computer_call_item(action: dict[str, Any], call_id: str | None = None) -> dict[str, Any]:
+    """Wrap ``action`` in a completed ``computer_call`` item, generating a call id if needed."""
+    item: dict[str, Any] = {"id": _random_id(), "call_id": call_id or _random_id()}
+    item.update(
+        type="computer_call",
+        status="completed",
+        pending_safety_checks=[],
+        action=action,
+    )
+    return item
 
-        headers: dict[str, str] = dict(_DEFAULT_HEADERS)
-        if default_headers:
-            headers.update(default_headers)
 
-        client = openai_client or AsyncOpenAI(
-            api_key=api_key,
-            base_url=base_url,
-            default_headers=headers,
-        )
+def _make_click_item(x: int, y: int, button: str = "left", call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "click", "x": x, "y": y, "button": button}, call_id)  # "click" action
 
-        super().__init__(
-            openai_client=client,
-            model_name=model_name,
-            completion_kwargs=completion_kwargs,
-            **agent_kwargs,
-        )
 
-        self._responses_kwargs = {
-            "tool_choice": "auto",
-            **_DEFAULT_COMPLETION_KWARGS,
-            **dict(self.completion_kwargs),
-        }
-        self.completion_kwargs.clear()
+def _make_double_click_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "double_click", "x": x, "y": y}, call_id)  # no "button" key
 
-        self._cache_control = self._normalize_cache_control(cache_control)
-        self._cacheable_roles = tuple(cacheable_roles or ("system", "user", "tool"))
 
-    @staticmethod
-    def _normalize_cache_control(
-        cache_control: dict[str, Any] | bool | str | None,
-    ) -> dict[str, Any] | None:
-        if cache_control is False:
-            return None
-        if cache_control is None:
-            return {"type": "ephemeral"}
-        if cache_control is True:
-            return {"type": "ephemeral"}
-        if isinstance(cache_control, dict):
-            return cache_control
-        return {"type": str(cache_control)}
-
-    def _should_cache(self, role: str) -> bool:
-        return self._cache_control is not None and role in self._cacheable_roles
-
-    def _text_item(self, text: str, role: str) -> dict[str, Any]:
-        item: dict[str, Any] = {"type": "input_text", "text": text}
-        if self._should_cache(role):
-            item["cache_control"] = self._cache_control
-        return item
-
-    def _image_item(self, image_payload: Any, role: str) -> dict[str, Any]:
-        url: str | None = None
-        detail = None
-
-        if isinstance(image_payload, dict):
-            # Standard OpenAI-style wrapper
-            if "image_url" in image_payload and isinstance(image_payload["image_url"], dict):
-                img = image_payload["image_url"]
-                url = img.get("url")
-                detail = img.get("detail") or image_payload.get("detail")
-            # Direct url / data uri
-            elif image_payload.get("url"):
-                url = image_payload.get("url")
-                detail = image_payload.get("detail")
-            # Raw base64 payload from computer/tool results
-            elif image_payload.get("data"):
-                mime = (
-                    image_payload.get("mimeType")
-                    or image_payload.get("mime_type")
-                    or "image/png"
-                )
-                data = image_payload.get("data")
-                if data:
-                    url = f"data:{mime};base64,{data}"
-                detail = image_payload.get("detail")
-            elif isinstance(image_payload.get("source"), dict):
-                source = image_payload["source"]
-                data = source.get("data")
-                mime = source.get("media_type") or source.get("mime_type") or "image/png"
-                if data:
-                    url = f"data:{mime};base64,{data}"
-                detail = source.get("detail")
-        elif isinstance(image_payload, str):
-            url = image_payload
-
-        item: dict[str, Any] = {"type": "input_image"}
-        if url:
-            item["image_url"] = url
-        item["detail"] = str(detail or "auto")
-        if self._should_cache(role):
-            item["cache_control"] = self._cache_control
-        return item
-
-    def _convert_message_content(self, role: str, content: Any) -> list[dict[str, Any]]:
-        if content is None:
-            return []
-
-        blocks: list[dict[str, Any]] = []
-        if isinstance(content, str):
-            blocks.append(self._text_item(content, role))
-            return blocks
-
-        if isinstance(content, dict):
-            content = [content]
-
-        if isinstance(content, list):
-            for entry in content:
-                if isinstance(entry, str):
-                    blocks.append(self._text_item(entry, role))
-                elif isinstance(entry, dict):
-                    entry_copy = dict(entry)
-                    entry_type = entry_copy.get("type")
-                    if entry_type in {"text", "input_text", None}:
-                        text = entry_copy.get("text") or ""
-                        blocks.append(self._text_item(text, role))
-                    elif entry_type in {"image_url", "input_image"}:
-                        payload = entry_copy.get("image_url", entry_copy.get("image")) or entry_copy
-                        blocks.append(self._image_item(payload, role))
-                    elif entry_type in {"image", "output_image", "rendered"}:
-                        blocks.append(self._image_item(entry_copy, role))
-                    elif entry_type == "tool_result":
-                        text = entry_copy.get("text", "")
-                        blocks.append(self._text_item(text, role))
-                    else:
-                        text_value = entry_copy.get("text") or json.dumps(entry_copy)
-                        blocks.append(self._text_item(text_value, role))
-                else:
-                    blocks.append(self._text_item(str(entry), role))
-            return blocks
-
-        blocks.append(self._text_item(str(content), role))
-        return blocks
-
-    def _convert_messages(self, messages: list[Any]) -> list[dict[str, Any]]:
-        converted: list[dict[str, Any]] = []
-        for message in messages:
-            if not isinstance(message, dict):
-                logger.debug("Skipping non-dict message: %s", message)
-                continue
-
-            if "type" in message and "role" not in message:
-                converted.append(message)
-                continue
-
-            role = message.get("role") or "user"
-
-            if role == "assistant" and message.get("tool_calls"):
-                content_items = self._convert_message_content(role, message.get("content"))
-                if content_items:
-                    converted.append({"role": "assistant", "content": content_items})
-                for tool_call in message.get("tool_calls", []):
-                    converted.append(self._convert_tool_call(tool_call))
-                continue
-
-            if role == "tool":
-                converted.extend(self._convert_tool_message(message))
-                continue
-
-            payload: dict[str, Any] = {"role": role}
-            content_items = self._convert_message_content(role, message.get("content"))
-            if content_items:
-                payload["content"] = content_items
-            if message.get("name"):
-                payload["name"] = message["name"]
-            if message.get("metadata"):
-                payload["metadata"] = message["metadata"]
-            converted.append(payload)
-
-        return converted
+def _make_move_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "move", "x": x, "y": y}, call_id)  # "move" action
+
+
+def _make_drag_item(path: list[dict[str, int]], call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "drag", "path": path}, call_id)  # path passed as-is
+
+
+def _make_keypress_item(keys: list[str], call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "keypress", "keys": keys}, call_id)  # keys passed as-is
+
+
+def _make_type_item(text: str, call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "type", "text": text}, call_id)  # "type" action
 
-    @staticmethod
-    def _jsonify_schema(value: Any) -> Any:
-        from pydantic import BaseModel
-        from pydantic.fields import FieldInfo
 
-        if isinstance(value, (str, int, float, bool)) or value is None:
-            return value
+def _make_scroll_item(
+    x: int, y: int, scroll_x: int, scroll_y: int, call_id: str | None = None
+) -> dict[str, Any]:
+    """Build a ``computer_call`` item that scrolls at the point (x, y).
+
+    ``scroll_x`` and ``scroll_y`` are passed through unchanged as the scroll amounts.
+    """
+    scroll = {"type": "scroll", "x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y}
+    return _make_computer_call_item(scroll, call_id)
 
-        if isinstance(value, dict):
-            return {str(k): OpenRouterAgent._jsonify_schema(v) for k, v in value.items()}
 
-        if isinstance(value, (list, tuple, set)):
-            return [OpenRouterAgent._jsonify_schema(v) for v in value]
+def _make_wait_item(call_id: str | None = None) -> dict[str, Any]:
+    return _make_computer_call_item({"type": "wait"}, call_id)  # action has no fields besides "type"
 
+
+def _make_screenshot_item(call_id: str) -> dict[str, Any]:  # unlike its siblings, call_id is required
+    return _make_computer_call_item({"type": "screenshot"}, call_id)
+
+
+def _make_failed_tool_call_items(
+    tool_name: str,
+    tool_kwargs: dict[str, Any],
+    error_message: str,
+    call_id: str,
+) -> list[dict[str, Any]]:
+    """Return a ``computer_call`` item marked failed plus an assistant message with the error."""
+    failed_call = _make_computer_call_item({"type": tool_name, **tool_kwargs}, call_id)
+    failed_call["status"] = "failed"
+    note = _make_output_text_item(f"Tool {tool_name} failed: {error_message}")
+    return [failed_call, {**note, "role": "assistant"}]
+
+
+def _coerce_to_pixel_coordinates(
+    x_val: Any,
+    y_val: Any,
+    *,
+    width: int,
+    height: int,
+) -> tuple[int, int] | None:
+    """Map a model-supplied point to pixel coordinates clamped inside the image.
+
+    If both magnitudes are <= 1 the pair is scaled as fractions of the image; if both
+    are <= 999 it is scaled from a 0-999 range; otherwise it is taken as raw pixels.
+    Returns ``None`` for input that is not a finite number (e.g. None, "abc", NaN, inf).
+    """
+    try:
+        x_float = float(x_val)
+        y_float = float(y_val)
+        if abs(x_float) <= 1.0 and abs(y_float) <= 1.0:
+            px, py = int(x_float * width), int(y_float * height)
+        elif abs(x_float) <= 999.0 and abs(y_float) <= 999.0:
+            px = int((x_float / 999.0) * width)
+            py = int((y_float / 999.0) * height)
+        else:
+            px, py = int(x_float), int(y_float)
+    except (TypeError, ValueError, OverflowError):
+        # int() raises ValueError on NaN and OverflowError on infinity.
+        return None
+
+    # Keep the point inside [0, width - 1] x [0, height - 1].
+    return max(0, min(width - 1, px)), max(0, min(height - 1, py))
+
+
+def _parse_coordinate_box(value: Any) -> tuple[float, float] | None:
+    if isinstance(value, (list, tuple)) and len(value) >= 2:
         try:
-            return json.loads(json.dumps(value))
-        except Exception:
-            if isinstance(value, BaseModel):
-                return OpenRouterAgent._jsonify_schema(value.model_dump())
-            if isinstance(value, FieldInfo):
-                data: dict[str, Any] = {}
-                if value.annotation is not None:
-                    data.setdefault(
-                        "type",
-                        getattr(value.annotation, "__name__", str(value.annotation)),
-                    )
-                if value.description:
-                    data["description"] = value.description
-                if value.title:
-                    data["title"] = value.title
-                if value.default not in (None, Ellipsis):
-                    data["default"] = OpenRouterAgent._jsonify_schema(value.default)
-                if value.json_schema_extra:
-                    extra = OpenRouterAgent._jsonify_schema(value.json_schema_extra)
-                    if isinstance(extra, dict):
-                        data.update(extra)
-                return data or str(value)
-            if hasattr(value, "model_dump"):
-                return OpenRouterAgent._jsonify_schema(value.model_dump())
-            if hasattr(value, "__dict__") and value.__dict__:
-                return OpenRouterAgent._jsonify_schema(
-                    {
-                        k: v
-                        for k, v in value.__dict__.items()
-                        if not k.startswith("_")
-                    }
-                )
-            return str(value)
+            return float(value[0]), float(value[1])
+        except (TypeError, ValueError):
+            return None
 
-    @staticmethod
-    def _convert_tools_for_responses(tools: list[dict] | None) -> list[dict]:
-        if not tools:
-            return []
-
-        converted: list[dict] = []
-        for tool in tools:
-            if not isinstance(tool, dict):
-                continue
-
-            if tool.get("type") == "function" and isinstance(tool.get("function"), dict):
-                fn = tool["function"]
-                name = fn.get("name")
-                params = fn.get("parameters", {})
-                description = fn.get("description", "")
-
-                if not isinstance(name, str) or not name:
-                    logger.debug("Skipping tool with missing name: %s", tool)
-                    continue
-
-                converted.append(
-                    {
-                        "type": "function",
-                        "name": name,
-                        "description": str(description or ""),
-                        "parameters": OpenRouterAgent._jsonify_schema(params),
-                    }
-                )
-            else:
-                converted.append(OpenRouterAgent._jsonify_schema(tool))
-
-        return converted
-
-    def _convert_tool_call(self, tool_call: dict[str, Any]) -> dict[str, Any]:
-        if not isinstance(tool_call, dict):
-            return {}
-
-        function = tool_call.get("function") or {}
-        name = function.get("name") or tool_call.get("name") or "tool_call"
-        raw_arguments = function.get("arguments")
-
-        if isinstance(raw_arguments, dict):
-            arguments = json.dumps(self._jsonify_schema(raw_arguments))
-        elif isinstance(raw_arguments, str):
-            try:
-                parsed = json.loads(raw_arguments)
-            except json.JSONDecodeError:
-                arguments = raw_arguments
-            else:
-                arguments = json.dumps(self._jsonify_schema(parsed))
-        elif raw_arguments is None:
-            arguments = "{}"
+    if isinstance(value, str):
+        stripped = value.strip()
+        try:
+            loaded = json.loads(stripped)
+        except Exception:
+            matches = re.findall(r"-?\d+(?:\.\d+)?", stripped)
+            if len(matches) >= 2:
+                return float(matches[0]), float(matches[1])
         else:
-            arguments = json.dumps(self._jsonify_schema(raw_arguments))
+            if isinstance(loaded, (list, tuple)) and len(loaded) >= 2:
+                try:
+                    return float(loaded[0]), float(loaded[1])
+                except (TypeError, ValueError):
+                    return None
+    return None
+
+
+def _coerce_box_to_pixels(
+    box: Any,
+    *,
+    width: int,
+    height: int,
+) -> tuple[int, int] | None:
+    """Parse ``box`` into an (x, y) pair and convert it to pixels; ``None`` if either step fails."""
+    if point := _parse_coordinate_box(box):
+        return _coerce_to_pixel_coordinates(*point, width=width, height=height)
+    return None
+
+
+def _parse_json_action_string(action_text: str) -> dict[str, Any] | None:
+    """Parse a JSON object out of ``action_text``, tolerating backslash-escaped quotes."""
+    text = action_text.strip()
+    if not (text.startswith("{") and text.endswith("}")):
+        return None
+    attempts = [text]
+    if "\\" in text:
+        try:  # latin-1 + backslashreplace keeps non-ASCII text intact through unicode_escape
+            attempts.append(text.encode("latin-1", "backslashreplace").decode("unicode_escape"))
+        except UnicodeDecodeError:
+            pass  # malformed escape sequence; the remaining attempts may still parse
+        attempts.append(text.replace("\\\"", '"'))
 
-        call_id = (
-            tool_call.get("id")
-            or function.get("id")
-            or function.get("call_id")
-            or f"call_{uuid.uuid4().hex}"
-        )
+    for attempt in attempts:
+        try:
+            return json.loads(attempt)
+        except (ValueError, RecursionError):
+            continue  # not valid JSON in this form; try the next candidate
 
-        return {
-            "type": "function_call",
-            "id": call_id,
-            "name": name,
-            "arguments": arguments or "{}",
-        }
-
-    def _convert_tool_message(self, message: dict[str, Any]) -> list[dict[str, Any]]:
-        entries: list[dict[str, Any]] = []
-        call_id = message.get("tool_call_id") or message.get("id") or f"call_{uuid.uuid4().hex}"
-
-        text_parts: list[str] = []
-        image_payloads: list[Any] = []
-
-        content = message.get("content")
-        if isinstance(content, list):
-            for item in content:
-                if isinstance(item, dict):
-                    item_type = item.get("type")
-                    if item_type in {"text", "input_text"} and item.get("text"):
-                        text_parts.append(str(item.get("text")))
-                    elif item_type in {"image", "input_image", "image_url", "output_image", "rendered"}:
-                        image_payloads.append(item)
-                elif isinstance(item, str):
-                    text_parts.append(item)
-        elif isinstance(content, str):
-            text_parts.append(content)
-
-        structured = message.get("structuredContent")
-        if structured and not text_parts:
-            try:
-                text_parts.append(json.dumps(structured))
-            except Exception:
-                text_parts.append(str(structured))
-
-        output_text = "\n".join(part for part in text_parts if part) or ""
-
-        entries.append(
-            {
-                "type": "function_call_output",
-                "id": message.get("id") or call_id,
-                "call_id": call_id,
-                "output": output_text,
-            }
-        )
+    return None
 
-        for payload in image_payloads:
-            entries.append(
-                {
-                    "role": "user",
-                    "content": [self._image_item(payload, "user")],
-                }
-            )
 
+def _convert_json_action_to_items(
+    json_action: dict[str, Any],
+    *,
+    call_id: str,
+    image_width: int,
+    image_height: int,
+) -> list[dict[str, Any]]:
+    entries: list[dict[str, Any]] = []
+    action_type = str(json_action.get("type", "")).lower()
+    if not action_type:
         return entries
 
-    async def format_tool_results(
-        self,
-        tool_calls: list[MCPToolCall],
-        tool_results: list[MCPToolResult],
-    ) -> list[dict[str, Any]]:
-        converted: list[dict[str, Any]] = []
-
-        for call, result in zip(tool_calls, tool_results, strict=False):
-            call_id = call.id or call.name or f"call_{uuid.uuid4().hex}"
-
-            text_parts: list[str] = []
-            image_payloads: list[Any] = []
-
-            for item in result.content or []:
-                if isinstance(item, types.TextContent):
-                    text_parts.append(item.text)
-                elif isinstance(item, types.ImageContent):
-                    image_payloads.append(
-                        {
-                            "mimeType": item.mimeType,
-                            "data": item.data,
-                            "detail": getattr(item, "detail", None),
-                        }
-                    )
-                elif isinstance(item, dict):
-                    if item.get("type") in {"text", "input_text"}:
-                        text_parts.append(str(item.get("text", "")))
-                    elif item.get("type") in {"image", "input_image", "image_url", "output_image", "rendered"}:
-                        image_payloads.append(item)
-                elif isinstance(item, str):
-                    text_parts.append(item)
-
-            if result.structuredContent and not text_parts:
-                try:
-                    text_parts.append(json.dumps(result.structuredContent))
-                except Exception:
-                    text_parts.append(str(result.structuredContent))
-
-            if getattr(result, "isError", False):
-                text_parts.append(getattr(result, "error", "Tool execution failed."))
-
-            output_text = "\n".join(part for part in text_parts if part) or ""
-
-            converted.append(
-                {
-                    "type": "function_call_output",
-                    "id": call_id,
-                    "call_id": call_id,
-                    "output": output_text,
-                }
+    if action_type in {"type", "text"}:
+        text_value = json_action.get("content") or json_action.get("text") or ""
+        if text_value:
+            entries.append(_make_type_item(str(text_value), call_id=call_id))
+    elif action_type in {"click", "left_click"}:
+        start_box = (
+            json_action.get("start_box")
+            or json_action.get("startBox")
+            or json_action.get("position")
+        )
+        coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height)
+        if not coords and json_action.get("x") is not None and json_action.get("y") is not None:
+            coords = _coerce_to_pixel_coordinates(
+                json_action.get("x"),
+                json_action.get("y"),
+                width=image_width,
+                height=image_height,
+            )
+        if coords:
+            button = str(json_action.get("button", "left") or "left").lower()
+            entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id))
+    elif action_type in {"right_click", "middle_click"}:
+        start_box = json_action.get("start_box") or json_action.get("startBox")
+        coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height)
+        if not coords and json_action.get("x") is not None and json_action.get("y") is not None:
+            coords = _coerce_to_pixel_coordinates(
+                json_action.get("x"),
+                json_action.get("y"),
+                width=image_width,
+                height=image_height,
+            )
+        if coords:
+            button = "right" if action_type == "right_click" else "middle"
+            entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id))
+    elif action_type in {"double_click", "left_double_click"}:
+        start_box = json_action.get("start_box") or json_action.get("startBox")
+        coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height)
+        if not coords and json_action.get("x") is not None and json_action.get("y") is not None:
+            coords = _coerce_to_pixel_coordinates(
+                json_action.get("x"),
+                json_action.get("y"),
+                width=image_width,
+                height=image_height,
             )
+        if coords:
+            entries.append(_make_double_click_item(coords[0], coords[1], call_id=call_id))
+    elif action_type in {"drag", "left_drag"}:
+        start_box = json_action.get("start_box") or json_action.get("startBox")
+        end_box = json_action.get("end_box") or json_action.get("endBox")
+        start_coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height)
+        end_coords = _coerce_box_to_pixels(end_box, width=image_width, height=image_height)
+        if not start_coords and json_action.get("x") is not None and json_action.get("y") is not None:
+            start_coords = _coerce_to_pixel_coordinates(
+                json_action.get("x"),
+                json_action.get("y"),
+                width=image_width,
+                height=image_height,
+            )
+        if start_coords and end_coords:
+            path = [
+                {"x": start_coords[0], "y": start_coords[1]},
+                {"x": end_coords[0], "y": end_coords[1]},
+            ]
+            entries.append(_make_drag_item(path, call_id=call_id))
+    elif action_type == "scroll":
+        start_box = json_action.get("start_box") or json_action.get("startBox")
+        coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height)
+        if not coords and json_action.get("x") is not None and json_action.get("y") is not None:
+            coords = _coerce_to_pixel_coordinates(
+                json_action.get("x"),
+                json_action.get("y"),
+                width=image_width,
+                height=image_height,
+            )
+        direction = str(json_action.get("direction", "")).lower()
+        step = int(json_action.get("step", 5) or 5)
+        if coords:
+            scroll_x = 0
+            scroll_y = 0
+            if direction == "up":
+                scroll_y = -abs(step)
+            elif direction == "down":
+                scroll_y = abs(step)
+            elif direction == "left":
+                scroll_x = -abs(step)
+            elif direction == "right":
+                scroll_x = abs(step)
+            entries.append(
+                _make_scroll_item(coords[0], coords[1], scroll_x, scroll_y, call_id=call_id)
+            )
+    elif action_type in {"hover", "move"}:
+        target_box = (
+            json_action.get("start_box")
+            or json_action.get("startBox")
+            or json_action.get("position")
+        )
+        coords = _coerce_box_to_pixels(target_box, width=image_width, height=image_height)
+        if not coords and json_action.get("x") is not None and json_action.get("y") is not None:
+            coords = _coerce_to_pixel_coordinates(
+                json_action.get("x"),
+                json_action.get("y"),
+                width=image_width,
+                height=image_height,
+            )
+        if coords:
+            entries.append(_make_move_item(coords[0], coords[1], call_id=call_id))
+    elif action_type in {"keypress", "key", "key_press"}:
+        keys = json_action.get("keys")
+        key_list: list[str] = []
+        if isinstance(keys, str):
+            key_list = [segment.strip() for segment in keys.split("+") if segment.strip()]
+        elif isinstance(keys, list):
+            key_list = [str(segment).strip() for segment in keys if str(segment).strip()]
+        if key_list:
+            entries.append(_make_keypress_item(key_list, call_id=call_id))
+    elif action_type == "wait":
+        entries.append(_make_wait_item(call_id=call_id))
+    elif action_type == "screenshot":
+        entries.append(_make_screenshot_item(call_id))
+
+    return entries
+
+
+def _decode_image_dimensions(image_b64: str) -> tuple[int, int]:
+    """Return the decoded image's ``size``; fall back to the configured computer dimensions."""
+    try:
+        with Image.open(BytesIO(base64.b64decode(image_b64))) as decoded:
+            return decoded.size
+    except Exception:  # pragma: no cover - defensive fallback
+        return computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT
+
+
+def _extract_user_instruction(messages: list[dict[str, Any]]) -> str:
+    """Return the first non-blank text block (stripped) of a user message, or an empty string."""
+    for entry in messages:
+        if not isinstance(entry, dict) or (entry.get("type"), entry.get("role")) != ("message", "user"):
+            continue
+        blocks = entry.get("content") or []
+        # Only list-shaped content is scanned; any other content shape is skipped.
+        for part in blocks if isinstance(blocks, list) else []:
+            if isinstance(part, dict) and part.get("type") in {"text", "input_text"}:
+                candidate = part.get("text")
+                if isinstance(candidate, str) and candidate.strip():
+                    return candidate.strip()
+    return ""
+
+
+def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None:
+    """Return the payload after the comma of the newest ``data:image/`` URL, or None."""
+    for message in reversed(messages):
+        if not isinstance(message, dict):
+            continue
+        msg_type = message.get("type")
+        urls: list[Any] = []  # candidate URLs of this message, newest first
+        if msg_type == "computer_call_output":
+            output = message.get("output")
+            urls.append(output.get("image_url") if isinstance(output, dict) else None)
+        elif msg_type == "message" and message.get("role") == "user":
+            content = message.get("content")
+            for item in reversed(content if isinstance(content, list) else []):
+                if isinstance(item, dict) and item.get("type") == "image_url":
+                    url_obj = item.get("image_url")
+                    urls.append(url_obj.get("url") if isinstance(url_obj, dict) else None)
+        for url in urls:
+            if isinstance(url, str) and url.startswith("data:image/"):
+                _, sep, payload = url.partition(",")
+                if sep:  # a data URL without "," has no payload; skip it instead of raising
+                    return payload
+    return None
+
+# Adapter dispatch: lowercase model id (no "openrouter/" prefix) -> "module:ClassName" path.
+_ADAPTER_REGISTRY: Dict[str, str] = {
+    "z-ai/glm-4.5v": "hud.agents.glm45v:Glm45vAgent",
+}
 
-            for payload in image_payloads:
-                converted.append(
-                    {
-                        "role": "user",
-                        "content": [self._image_item(payload, "user")],
-                    }
-                )
 
-        return converted
+def _load_adapter(path: str) -> Type[MCPAgent]:
+    """Import the module named before ":" in ``path`` and return the attribute named after it."""
+    module_path, attr_name = path.split(":", 1)
+    return getattr(import_module(module_path), attr_name)
 
-    @staticmethod
-    def _parse_arguments(arguments: Any) -> dict[str, Any]:
-        if isinstance(arguments, dict):
-            return arguments
-        if isinstance(arguments, str) and arguments:
-            try:
-                parsed = json.loads(arguments)
-                if isinstance(parsed, dict):
-                    return parsed
-            except json.JSONDecodeError:
-                logger.debug("Failed to decode arguments: %s", arguments)
-        return {}
-
-    def _to_mcp_tool_call(self, payload: dict[str, Any]) -> MCPToolCall:
-        tool_name = payload.get("name") or payload.get("function", {}).get("name") or ""
-        call_id = payload.get("id") or payload.get("tool_call_id") or payload.get("call_id")
-        if not call_id:
-            call_id = tool_name
-        arguments = payload.get("arguments")
-        if not arguments and "function" in payload:
-            arguments = payload["function"].get("arguments")
-        parsed_arguments = self._parse_arguments(arguments)
-        return MCPToolCall(id=call_id, name=tool_name, arguments=parsed_arguments)
-
-    def _coerce_response_payload(self, response: Any) -> dict[str, Any]:
-        """Convert OpenRouter SDK return types into a plain dictionary."""
-
-        if response is None:
-            return {}
-
-        if isinstance(response, dict):
-            return response
-
-        for attr in ("model_dump", "dict", "to_dict"):
-            if hasattr(response, attr):
-                try:
-                    payload = getattr(response, attr)()
-                except Exception as exc:  # pragma: no cover - defensive
-                    logger.debug("Failed to read response via %s: %s", attr, exc)
-                else:
-                    if isinstance(payload, dict):
-                        return payload
-
-        snapshot = getattr(response, "__dict__", None)
-        if isinstance(snapshot, dict):
-            return snapshot
-
-        logger.error("Unexpected response carrier from OpenRouter: %r", response)
-        raise TypeError("Unexpected response type from OpenRouter")
-
-    def _extract_response(self, response: Any) -> AgentResponse:
-        data = self._coerce_response_payload(response)
-        if not isinstance(data, dict):
-            raise TypeError("Unexpected response type from OpenRouter")
-
-        output = data.get("output", [])
-        text_parts: list[str] = []
-        tool_calls: list[MCPToolCall] = []
-        reasoning_parts: list[str] = []
-
-        for item in output:
-            item_type = item.get("type") if isinstance(item, dict) else None
-            if item_type == "message":
-                contents = item.get("content", [])
-                if isinstance(contents, list):
-                    for block in contents:
-                        if not isinstance(block, dict):
-                            continue
-                        block_type = block.get("type")
-                        if block_type in {"output_text", "text"}:
-                            text = block.get("text")
-                            if text:
-                                text_parts.append(text)
-                        elif block_type == "reasoning" and block.get("text"):
-                            reasoning_parts.append(block["text"])
-                for tc in item.get("tool_calls", []) or []:
-                    if isinstance(tc, dict):
-                        tool_calls.append(self._to_mcp_tool_call(tc))
-            elif item_type in {"tool_call", "function_call"} and isinstance(item, dict):
-                tool_calls.append(self._to_mcp_tool_call(item))
-            elif item_type == "reasoning" and isinstance(item, dict):
-                summary = item.get("summary")
-                if isinstance(summary, list):
-                    for block in summary:
-                        if isinstance(block, dict) and block.get("text"):
-                            reasoning_parts.append(block["text"])
-                elif isinstance(summary, str):
-                    reasoning_parts.append(summary)
-
-        merged_text = "\n".join(reasoning_parts + text_parts).strip()
-        status = data.get("status", "completed")
-        done = not tool_calls and status != "in_progress"
-        return AgentResponse(
-            content=merged_text,
-            tool_calls=tool_calls,
-            done=done,
-            raw=response,
-        )
 
-    @instrument(
-        span_type="agent",
-        record_args=False,
-        record_result=True,
-    )
-    async def get_response(self, messages: list[Any]) -> AgentResponse:
-        converted_messages = self._convert_messages(messages)
-        tools = self._convert_tools_for_responses(self.get_tool_schemas())
-
-        protected_keys = {"model", "input", "tools"}
-        extra = {k: v for k, v in self._responses_kwargs.items() if k not in protected_keys}
-        # If tools are provided and tool_choice isn't explicitly set, require tool use
-        if tools and "tool_choice" not in extra:
-            extra["tool_choice"] = "required"
+class OpenRouterAgent:
+    """Dispatch wrapper that selects the correct OpenRouter adapter by model."""
 
+    def __init__(self, *, model_name: str = "z-ai/glm-4.5v", **kwargs: Any) -> None:
+        normalized = self._normalize_model_name(model_name)
         try:
-            payload: dict[str, Any] = {
-                "model": self.model_name,
-                "input": converted_messages,
-                **extra,
-            }
-            if tools:
-                payload["tools"] = tools
-
-            response = await self.oai.responses.create(**payload)
-        except Exception as exc:
-            error_content = f"Error getting response {exc}"
-            logger.exception("OpenRouter call failed: %s", exc)
-            return AgentResponse(
-                content=error_content,
-                tool_calls=[],
-                done=True,
-                isError=True,
-                raw=None,
-            )
+            adapter_path = _ADAPTER_REGISTRY[normalized]
+        except KeyError as exc:  # pragma: no cover - defensive
+            raise ValueError(f"Unsupported OpenRouter model: {model_name}") from exc
+
+        adapter_cls = _load_adapter(adapter_path)
+        canonical_model = f"openrouter/{normalized}"
+        self.model_name = canonical_model
+        self._adapter = adapter_cls(model_name=canonical_model, **kwargs)
 
-        return self._extract_response(response)
+    @staticmethod
+    def _normalize_model_name(raw_model: str | None) -> str:
+        """Return the registry key for ``raw_model``; raise ValueError if empty or unknown."""
+        if not raw_model:
+            raise ValueError("Model name must be provided for OpenRouterAgent")
+        key = raw_model.strip().lower()  # lowercase first so an "OpenRouter/" prefix is stripped too
+        if key.startswith("openrouter/"):
+            key = key[len("openrouter/") :]
+        if key not in _ADAPTER_REGISTRY:
+            raise ValueError(f"Unknown OpenRouter model: {raw_model}")
+        return key
+
+    def __getattr__(self, item: str) -> Any:  # fallback: delegate unknown attributes to the adapter
+        return getattr(object.__getattribute__(self, "_adapter"), item)  # AttributeError, not recursion, if unset
+
+    def __dir__(self) -> list[str]:
+        """Return the sorted union of this wrapper's and the adapter's attribute names."""
+        own_names = set(super().__dir__()) | set(self.__dict__)
+        adapter_names = set(dir(self._adapter))
+        return sorted(own_names | adapter_names)
+
+
+__all__ = [
+    "OpenRouterAgent",
+    "_random_id",
+    "_make_reasoning_item",
+    "_make_output_text_item",
+    "_make_computer_call_item",
+    "_make_click_item",
+    "_make_double_click_item",
+    "_make_drag_item",
+    "_make_keypress_item",
+    "_make_type_item",
+    "_make_scroll_item",
+    "_make_wait_item",
+    "_make_screenshot_item",
+    "_make_failed_tool_call_items",
+    "_coerce_to_pixel_coordinates",
+    "_parse_coordinate_box",
+    "_coerce_box_to_pixels",
+    "_parse_json_action_string",
+    "_convert_json_action_to_items",
+    "_decode_image_dimensions",
+    "_extract_user_instruction",
+    "get_last_image_from_messages",
+]
diff --git a/hud/agents/tests/test_openrouter.py b/hud/agents/tests/test_openrouter.py
index d3010e0d..7328586e 100644
--- a/hud/agents/tests/test_openrouter.py
+++ b/hud/agents/tests/test_openrouter.py
@@ -1,205 +1,94 @@
 from __future__ import annotations
 
 import pytest
-from unittest.mock import AsyncMock, MagicMock
 
-import mcp.types as types
+from types import SimpleNamespace
+from typing import Any
 
-from hud.agents.openrouter import OpenRouterAgent
-from hud.settings import settings
-from hud.types import MCPToolCall, MCPToolResult
+def _import_agents():
+    import mcp.types as types
+    from hud.agents.glm45v import Glm45vAgent
+    from hud.agents.openrouter import OpenRouterAgent
+    from hud.types import MCPToolResult
+    return Glm45vAgent, OpenRouterAgent, MCPToolResult, types
 
 
-@pytest.fixture(autouse=True)
-def disable_telemetry(monkeypatch: pytest.MonkeyPatch) -> None:
-    """Disable HUD telemetry during unit tests."""
-    monkeypatch.setattr(settings, "telemetry_enabled", False)
-    monkeypatch.setattr(settings, "api_key", None)
+def test_openrouter_agent_defaults_to_glm45v() -> None:
+    Glm45vAgent, OpenRouterAgent, _, _ = _import_agents()
+    agent = OpenRouterAgent()
+    assert isinstance(agent._adapter, Glm45vAgent)
+    assert agent.model_name == "openrouter/z-ai/glm-4.5v"
 
 
-class FakeResponse:
-    def __init__(self, payload: dict) -> None:
-        self._payload = payload
+def test_openrouter_agent_normalizes_alias() -> None:
+    _, OpenRouterAgent, _, _ = _import_agents()
+    agent = OpenRouterAgent(model_name="Z-AI/GLM-4.5V")
+    assert agent.model_name == "openrouter/z-ai/glm-4.5v"
 
-    def model_dump(self) -> dict:
-        return self._payload
 
-
-@pytest.mark.asyncio
-async def test_openrouter_agent_builds_cached_messages() -> None:
-    responses_create = AsyncMock(
-        return_value=FakeResponse({"output": [{"type": "message", "content": []}], "status": "completed"})
-    )
-    mock_client = MagicMock()
-    mock_client.responses.create = responses_create
-
-    agent = OpenRouterAgent(
-        api_key="test-key",
-        openai_client=mock_client,
-        cache_control={"type": "ephemeral"},
-    )
-    agent._available_tools = []  # mimic initialized agent
-
-    messages = [
-        {"role": "system", "content": "You are helpful."},
-        {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
-        {"role": "assistant", "content": "Previous reply"},
-    ]
-
-    await agent.get_response(messages)
-
-    await_call = responses_create.await_args
-    assert await_call is not None
-    kwargs = await_call.kwargs
-    assert kwargs["model"] == agent.model_name
-    input_payload = kwargs["input"]
-
-    system_block = input_payload[0]["content"][0]
-    user_block = input_payload[1]["content"][0]
-    assistant_block = input_payload[2]["content"][0]
-
-    assert system_block["cache_control"] == {"type": "ephemeral"}
-    assert user_block["cache_control"] == {"type": "ephemeral"}
-    assert "cache_control" not in assistant_block
+def test_openrouter_agent_rejects_unknown_model() -> None:
+    _, OpenRouterAgent, _, _ = _import_agents()
+    with pytest.raises(ValueError, match="Unknown OpenRouter model"):  # pin the rejection path
+        OpenRouterAgent(model_name="unknown/model")
 
 
 @pytest.mark.asyncio
-async def test_openrouter_agent_parses_tool_calls() -> None:
-    responses_create = AsyncMock(
-        return_value=FakeResponse(
-            {
-                "output": [
-                    {
-                        "type": "message",
-                        "content": [{"type": "output_text", "text": "Calling tool"}],
-                        "tool_calls": [
-                            {
-                                "id": "call_1",
-                                "function": {"name": "search", "arguments": "{\"query\": \"hud\"}"},
-                            }
-                        ],
-                    }
-                ],
-                "status": "requires_action",
-            }
-        )
-    )
-    mock_client = MagicMock()
-    mock_client.responses.create = responses_create
+async def test_openrouter_agent_parses_tool_calls(monkeypatch: pytest.MonkeyPatch) -> None:
+    Glm45vAgent, OpenRouterAgent, MCPToolResult, types = _import_agents()
+    png_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO61uFYAAAAASUVORK5CYII="
 
-    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
-    agent._available_tools = []
+    async def fake_completion(*_: Any, **__: Any) -> Any:
+        message = SimpleNamespace(content=(
+            "I will click the button.\n"
+            "<|begin_of_box|>{\"type\": \"click\", \"start_box\": [100, 200]}<|end_of_box|>\n"
+            "Memory:[]"
+        ), reasoning_content=None)
+        choice = SimpleNamespace(message=message)
+        return SimpleNamespace(choices=[choice])
 
-    result = await agent.get_response(
-        [
-            {"role": "system", "content": "You are helpful."},
-            {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
-        ]
-    )
+    monkeypatch.setattr("hud.agents.glm45v.litellm.acompletion", fake_completion)
 
-    assert not result.done
-    assert result.tool_calls[0].name == "search"
-    assert result.tool_calls[0].arguments == {"query": "hud"}
-
-
-@pytest.mark.asyncio
-async def test_openrouter_agent_returns_text_response() -> None:
-    responses_create = AsyncMock(
-        return_value=FakeResponse(
-            {
-                "output": [
-                    {
-                        "type": "message",
-                        "content": [{"type": "output_text", "text": "Hi there"}],
-                    }
-                ],
-                "status": "completed",
-            }
-        )
-    )
-    mock_client = MagicMock()
-    mock_client.responses.create = responses_create
+    agent = OpenRouterAgent(model_name="z-ai/glm-4.5v")
 
-    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
-    agent._available_tools = []
-
-    result = await agent.get_response(
-        [
-            {"role": "system", "content": "You are helpful."},
-            {"role": "user", "content": [{"type": "text", "text": "Hello"}]},
-        ]
-    )
-
-    assert result.done
-    assert result.content == "Hi there"
-    assert result.tool_calls == []
-
-
-def test_openrouter_agent_sanitizes_fieldinfo_in_tools() -> None:
-    mock_client = MagicMock()
-    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
-
-    from pydantic import Field
-
-    tools = [
+    messages: list[dict[str, Any]] = [
         {
-            "type": "function",
-            "function": {
-                "name": "click",
-                "description": "Click an element",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "selector": Field(default="", description="CSS selector"),
-                    },
-                    "required": ["selector"],
-                },
-            },
-        }
-    ]
-
-    converted = agent._convert_tools_for_responses(tools)
-    selector_schema = converted[0]["parameters"]["properties"]["selector"]
-    assert isinstance(selector_schema, dict)
-    assert selector_schema.get("description") == "CSS selector"
-
-
-def test_openrouter_agent_converts_image_blocks() -> None:
-    mock_client = MagicMock()
-    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
-
-    content = [
+            "type": "message",
+            "role": "user",
+            "content": [{"type": "input_text", "text": "click the highlighted cell"}],
+        },
         {
-            "type": "image",
-            "mimeType": "image/png",
-            "data": "dGVzdA==",
-            "detail": "high",
-        }
+            "type": "computer_call_output",
+            "call_id": "initial",
+            "output": {
+                "type": "input_image",
+                "image_url": f"data:image/png;base64,{png_base64}",
+            },
+        },
     ]
 
-    message_blocks = agent._convert_messages([{"role": "user", "content": content}])
-    image_block = message_blocks[0]["content"][0]
-    assert image_block["type"] == "input_image"
-    assert image_block["image_url"].startswith("data:image/png;base64,")
-    assert image_block["detail"] == "high"
+    response = await agent.get_response(list(messages))
 
+    assert not response.done
+    assert response.tool_calls, "expected at least one tool call"
 
-@pytest.mark.asyncio
-async def test_format_tool_results_produces_function_call_output() -> None:
-    mock_client = MagicMock()
-    agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client)
+    tool_call = response.tool_calls[0]
+    assert tool_call.name == "openai_computer"
+    assert tool_call.arguments["type"] == "click"
+    # coordinates are normalized from the 1x1 PNG back to pixel space -> 0/0
+    assert tool_call.arguments["x"] == 0
+    assert tool_call.arguments["y"] == 0
 
-    tool_call = MCPToolCall(id="call-1", name="playwright", arguments={})
     tool_result = MCPToolResult(
         content=[
-            types.TextContent(type="text", text="navigation complete"),
-            types.ImageContent(type="image", data="dGVzdA==", mimeType="image/png"),
+            types.ImageContent(type="image", data=png_base64, mimeType="image/png"),
+            types.TextContent(type="text", text="button pressed"),
         ]
     )
 
-    formatted = await agent.format_tool_results([tool_call], [tool_result])
+    rendered = await agent.format_tool_results([tool_call], [tool_result])
 
-    assert formatted[0]["type"] == "function_call_output"
-    assert formatted[0]["call_id"] == "call-1"
-    assert formatted[1]["role"] == "user"
-    assert formatted[1]["content"][0]["type"] == "input_image"
+    assert any(item.get("type") == "computer_call_output" for item in rendered)
+    assert any(
+        item.get("type") == "message" and item.get("role") == "user"
+        for item in rendered
+    )
diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py
index 99771913..c1701f5c 100644
--- a/hud/cli/__init__.py
+++ b/hud/cli/__init__.py
@@ -894,7 +894,7 @@ def eval(
             [
                 {"name": "Claude 4 Sonnet", "value": "claude"},
                 {"name": "OpenAI Computer Use", "value": "openai"},
-                {"name": "OpenRouter (Responses)", "value": "openrouter"},
+                {"name": "OpenRouter", "value": "openrouter"},
                 {"name": "vLLM (Local Server)", "value": "vllm"},
                 {"name": "LiteLLM (Multi-provider)", "value": "litellm"},
             ]
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 4900ba85..2b63222d 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -191,7 +191,7 @@ def build_agent(
             raise typer.Exit(1) from e
 
         return OpenRouterAgent(
-            model_name=model or "z-ai/glm-4.6",
+            model_name=model or "z-ai/glm-4.5v",
             allowed_tools=allowed_tools,
             verbose=verbose,
         )
@@ -568,7 +568,7 @@ def eval_command(
         "claude",
         "--agent",
         help=(
-            "Agent backend to use (claude, openai computer use, openrouter responses, "
+            "Agent backend to use (claude, openai computer use, openrouter, "
             "vllm for local server, or litellm)"
         ),
     ),
diff --git a/hud/utils/agent_factories.py b/hud/utils/agent_factories.py
index 37b9fa7a..f42248a4 100644
--- a/hud/utils/agent_factories.py
+++ b/hud/utils/agent_factories.py
@@ -88,13 +88,4 @@ def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent:
 def create_openrouter_agent(**kwargs: Any) -> OpenRouterAgent:
     """Factory for OpenRouterAgent with run_dataset compatibility."""
 
-    api_key = kwargs.pop("api_key", None)
-    base_url = kwargs.pop("base_url", None)
-    cache_control = kwargs.pop("cache_control", True)
-
-    return OpenRouterAgent(
-        api_key=api_key,
-        base_url=base_url,
-        cache_control=cache_control,
-        **kwargs,
-    )
+    return OpenRouterAgent(**kwargs)
diff --git a/pyproject.toml b/pyproject.toml
index dc6c77b4..0cfc9dfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -135,7 +135,7 @@ dev = [
     "langchain",
     "langchain-openai",
     "langchain-anthropic",
-    "litellm>=1.55.0",
+    "litellm",
     # Jupyter support
     "ipykernel",
     "ipython <9",

From 45fe54e9f59f1fe1cc1f1b792eefebe98b115c62 Mon Sep 17 00:00:00 2001
From: shinbehavior <hagforall@proton.me>
Date: Sat, 11 Oct 2025 23:13:49 +0200
Subject: [PATCH 3/4] eval run_full_dataset fix

---
 hud/cli/eval.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 2b63222d..7719e84d 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -479,6 +479,39 @@ async def run_full_dataset(
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
 
+    elif agent_type == "openrouter":
+        try:
+            # Use adapter class directly so it satisfies type[MCPAgent]
+            from hud.agents.openrouter import (
+                OpenRouterAgent,
+                _ADAPTER_REGISTRY,
+                _load_adapter,
+            )
+        except ImportError as e:
+            hud_console.error(
+                "OpenRouter agent dependencies are not installed. "
+                "Please install with: pip install 'hud-python[agent]'"
+            )
+            raise typer.Exit(1) from e
+
+        # Normalize model and resolve adapter
+        raw_model = model or "z-ai/glm-4.5v"
+        try:
+            normalized = OpenRouterAgent._normalize_model_name(raw_model)
+            adapter_path = _ADAPTER_REGISTRY[normalized]
+        except Exception as e:
+            hud_console.error(f"Unsupported OpenRouter model: {raw_model}")
+            raise typer.Exit(1) from e
+
+        adapter_cls = _load_adapter(adapter_path)
+        agent_class = adapter_cls
+        agent_config = {
+            "model_name": f"openrouter/{normalized}",
+            "verbose": verbose,
+        }
+        if allowed_tools:
+            agent_config["allowed_tools"] = allowed_tools
+
     else:
         try:
             from hud.agents import ClaudeAgent

From beb181618fc8a6fcbdf892ba3fdd4a2b28ddaf93 Mon Sep 17 00:00:00 2001
From: ilya <95108691+shfunc@users.noreply.github.com>
Date: Mon, 13 Oct 2025 09:07:25 +0200
Subject: [PATCH 4/4] Update pyproject.toml, litellm version fix

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0cfc9dfa..dc6c77b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -135,7 +135,7 @@ dev = [
     "langchain",
     "langchain-openai",
     "langchain-anthropic",
-    "litellm",
+    "litellm>=1.55.0",
     # Jupyter support
     "ipykernel",
     "ipython <9",