Skip to content
Open
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ac2d5f0
feat: stabilize liveweb arena eval execution
Mar 13, 2026
2209b06
feat: update .gitignore to include .cache directory
Mar 13, 2026
5349b48
fix: bypass proxy for local llm endpoints
Mar 14, 2026
b85717a
fix: harden browser lifecycle for rl rollouts
Mar 14, 2026
698516a
fix: harden liveweb protocol and site handling
Mar 14, 2026
e80c114
fix: retry normalized openlibrary search queries
Mar 15, 2026
1d6bf13
fix: retry transient taostats api fetches
Mar 15, 2026
5b81a32
fix: make batch eval incremental and deterministic
Mar 16, 2026
632d3e4
feat: add recoverable liveweb format retries
Mar 17, 2026
9827b57
feat: tune liveweb format recovery sampling
Mar 17, 2026
71ae7bf
feat: audit liveweb reachability and recovery failures
Mar 18, 2026
69570ad
fix: restore cache manager compatibility helper
Mar 18, 2026
e30da8f
fix: bound recovery context and refine browser audit
Mar 18, 2026
99a826a
Stabilize browser and cache handling for noisy data sites
Mar 20, 2026
bef5001
Improve protocol parsing and LLM failure diagnostics
Mar 20, 2026
76afd0b
Add structured attribution for blocked domains and taostats failures
Mar 20, 2026
171826d
Disable Kimi reasoning in OpenRouter requests
Mar 20, 2026
c24bcf8
Stabilize taostats cache setup and failure attribution
Mar 20, 2026
aef7546
Harden taostats list actions and UI target attribution
Mar 20, 2026
17701c5
Fail fast on repeated disallowed-domain navigation
Mar 20, 2026
63a2427
Split strict eval from fast collection runtime
Mar 22, 2026
c5c3594
Harden LiveWeb recovery and browser proxy flow
Mar 23, 2026
77d3085
Add think ablation experiment tooling
Mar 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,6 @@ logs/
*.orig

# uv
.python-version
.python-version
.cache/
tmp/
540 changes: 451 additions & 89 deletions env.py

Large diffs are not rendered by default.

349 changes: 331 additions & 18 deletions liveweb_arena/core/agent_loop.py

Large diffs are not rendered by default.

271 changes: 246 additions & 25 deletions liveweb_arena/core/agent_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"""

import json
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple

Expand Down Expand Up @@ -170,6 +171,10 @@ def parse_response(self, raw: str, tool_calls: Optional[List[Any]] = None) -> Op
def serialize_step(self, step: TrajectoryStep) -> List[dict]:
"""Serialize a trajectory step as conversation messages for training export."""

def classify_format_failure(self, raw: str, tool_calls: Optional[List[Any]] = None) -> str:
    """Classify a parse failure for local recovery purposes.

    The base protocol treats every failure as unrecoverable ("terminal");
    concrete protocols override this to flag recoverable formats.
    """
    return "terminal"


# Shared step prompt (observation format is protocol-independent)
_STEP_PROMPT_TEMPLATE = """## Current Page State
Expand All @@ -190,7 +195,7 @@ def serialize_step(self, step: TrajectoryStep) -> List[dict]:

# Presumably appended to the final step's prompt to force a terminating
# "stop" action — TODO confirm against the step-prompt builder.
_LAST_STEP_WARNING = """

**THIS IS YOUR LAST STEP!** You MUST use the "stop" action now and provide your best answers based on the information you have gathered. Do not attempt any other action. Do not explore. Each answer must be a short final string only."""


def _build_step_prompt_common(
Expand Down Expand Up @@ -260,23 +265,25 @@ def _build_tools(self) -> List[dict]:
return tools

def build_system_prompt(self, task: CompositeTask) -> str:
    """Compose the system prompt: plugin source hints, response-format
    rules, the combined task intent, and browsing tips."""
    # Concatenate the per-plugin usage hints under one heading, if any.
    hints = ""
    if task.plugin_hints:
        hints = "## Available Information Sources\n\n"
        for _, usage_hint in task.plugin_hints.items():
            hints += usage_hint + "\n\n"

    return (
        "You are a web automation agent that interacts with real websites to complete tasks.\n\n"
        "You have access to a browser and can navigate to any website to gather information.\n"
        "Use the provided tools to interact with the browser.\n\n"
        f"{hints}"
        "## Response Rules\n\n"
        "- You must use OpenAI function calling / tool calls for every action.\n"
        "- Do not output natural language explanations before or after a tool call.\n"
        "- Do not output chain-of-thought, hidden reasoning, analysis, or planning text.\n"
        "- Do not output <think>, </think>, <tool_call>, XML, markdown code fences, or raw JSON in message text.\n"
        "- Your assistant message should contain only the tool call selected by the model.\n"
        "- If you have enough information, call the stop tool immediately with the final answers.\n"
        "- Never describe what you are about to do in text; just call the correct tool.\n\n"
        f"{task.combined_intent}\n\n"
        "## Tips\n\n"
        "- First analyze the task and decide which website to visit\n"
        "- Use the goto tool to navigate to the appropriate URL\n"
        "- Homepage/list data may be inaccurate. Always visit detail pages for precise values\n"
        "- When done, use the stop tool with your answers\n"
        "- Only visit allowed domains\n"
        "- Finish data collection for one subtask, then move on\n"
        "- Keep responses action-only; no commentary\n"
    )

def build_step_prompt(
Expand All @@ -298,45 +305,259 @@ def format_step(step: TrajectoryStep) -> str:
obs, trajectory, current_step, max_steps,
self._max_recent_steps, format_step,
)
return prompt + "\nWhat is your next action? Use one of the available tools."
return (
prompt
+ "\nWhat is your next action? Use one of the available tools."
+ "\nReturn only a tool call. Do not output any text explanation, <think> block, XML tag, markdown, or raw JSON."
)

def get_tools(self) -> List[dict]:
    """Return the precomputed tool schema list stored on self._tools."""
    return self._tools

def parse_response(self, raw: str, tool_calls: Optional[List[Any]] = None) -> Optional[BrowserAction]:
    """Parse tool_calls from LLM response.

    Prefers the structured ``tool_calls`` channel; when that yields nothing,
    falls back to text-encoded tool-call styles found in ``raw``:
    - <tool_call>{...}</tool_call>
    - raw JSON object {"name": "...", "arguments": {...}}
    - [tool_call: stop({...})]

    Returns None when no valid action can be recovered from either channel.
    """
    parsed = self._parse_primary_tool_call(tool_calls)
    if parsed is None and raw:
        parsed = self._parse_qwen_fallback(raw)
    if parsed is None:
        return None

    fn_name, params = parsed

    # Normalize stop action format for compatibility with existing agent_loop
    if fn_name == "stop":
        answers = params.get("answers", {})
        params = {"final": {"answers": answers}}

    return BrowserAction(action_type=fn_name, params=params)

def classify_format_failure(self, raw: str, tool_calls: Optional[List[Any]] = None) -> str:
    """Decide whether a failed parse looks locally recoverable.

    Anything that already parses is "none"; truncated or text-encoded
    tool-call shapes map to recoverable_* labels; plain prose is terminal.
    """
    # A successful parse on either channel means there is nothing to recover.
    if self._parse_primary_tool_call(tool_calls) is not None:
        return "none"
    if raw and self._parse_qwen_fallback(raw) is not None:
        return "none"

    text = (raw or "").strip()
    # Structured tool_calls that still failed to parse look like truncated JSON.
    if tool_calls:
        return "recoverable_truncated_tool_json"
    if not text:
        return "recoverable_empty"

    # Ignore a leading <think> block before classifying the remainder.
    text = re.sub(r"^\s*<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
    if not text:
        return "recoverable_empty"

    if "<tool_call>" in text or "</tool_call>" in text:
        return "recoverable_truncated_tool_json"
    if re.match(r"^\s*\{", text):
        return "recoverable_truncated_tool_json"
    if re.match(r"^\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\(", text):
        return "recoverable_qwen_tool_text"
    if text.startswith("[tool_call:"):
        return "recoverable_truncated_tool_json"
    return "terminal_natural_language"

def debug_parse_metadata(self, raw: str, tool_calls: Optional[List[Any]] = None) -> Dict[str, Any]:
    """Return diagnostic metadata about which parser branch handled a response.

    Mirrors parse_response()'s decision tree without building an action, so
    parse failures can be attributed to a specific fallback branch in logs.
    Returns a dict with "protocol_parser_branch" and "tool_calls_preview".
    """
    preview_calls: list[dict[str, Any]] = []
    if tool_calls:
        # Preview at most two calls; arguments are truncated for log safety.
        for call in tool_calls[:2]:
            fn_name, fn_args = self._preview_call_fields(call)
            preview_calls.append(
                {
                    "name": fn_name,
                    "arguments_preview": str(fn_args)[:200] if fn_args is not None else None,
                }
            )

    branch = "none"
    if self._parse_primary_tool_call(tool_calls) is not None:
        branch = "tool_calls"
    else:
        stripped = (raw or "").strip()
        if stripped:
            stripped = re.sub(r"^\s*<think>.*?</think>\s*", "", stripped, flags=re.DOTALL).strip()
            # The fallback parser is pure, so parse once and reuse the result
            # for every branch test below instead of re-parsing up to 4 times.
            fallback_ok = self._parse_qwen_fallback(raw) is not None
            tag_match = re.search(r"<tool_call>\s*(.*?)\s*</tool_call>", stripped, re.DOTALL)
            bracket_tool_match = re.fullmatch(r"\[\s*tool_call:\s*(.+?)\s*\]", stripped, flags=re.DOTALL)
            if tag_match and fallback_ok:
                branch = "tool_call_tag"
            elif bracket_tool_match and fallback_ok:
                branch = "bracket_tool_call"
            elif re.match(r"^\s*\{", stripped) and fallback_ok:
                branch = "raw_json_object"
            elif re.match(r"^\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\(", stripped) and fallback_ok:
                branch = "qwen_function_text"
            elif re.match(r"^\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\(", stripped):
                branch = "qwen_function_text_unparsed"
            elif stripped.startswith("[tool_call:"):
                branch = "bracket_tool_call_unparsed"
            elif "<tool_call>" in stripped or "</tool_call>" in stripped:
                branch = "tool_call_tag_unparsed"
            elif re.match(r"^\s*\{", stripped):
                branch = "raw_json_object_unparsed"
            else:
                branch = "natural_language"
    return {
        "protocol_parser_branch": branch,
        "tool_calls_preview": preview_calls,
    }

def _preview_call_fields(self, call: Any) -> Tuple[Optional[str], Any]:
    """Extract (name, arguments) from a tool call in any supported shape:
    SDK object, dataclass with dict function, or plain dict."""
    if hasattr(call, "function") and hasattr(call.function, "name"):
        return call.function.name, call.function.arguments
    if hasattr(call, "function") and isinstance(call.function, dict):
        return call.function.get("name"), call.function.get("arguments")
    if isinstance(call, dict):
        function = call.get("function", {})
        return function.get("name"), function.get("arguments")
    return None, None

def _parse_primary_tool_call(self, tool_calls: Optional[List[Any]]) -> Optional[Tuple[str, Dict[str, Any]]]:
    """Extract and normalize the first structured tool call.

    Returns (name, params) via _normalize_tool_call, or None when there are
    no tool calls or the first one is invalid.
    """
    if not tool_calls:
        return None

    # Use the first tool call — handle both OpenAI SDK objects and dicts
    call = tool_calls[0]
    if hasattr(call, "function") and hasattr(call.function, "name"):
        # OpenAI SDK object (from streaming)
        fn_name = call.function.name
        fn_args = call.function.arguments
    elif hasattr(call, "function") and isinstance(call.function, dict):
        # ToolCall dataclass (from chat_with_tools)
        fn_name = call.function.get("name")
        fn_args = call.function.get("arguments", "{}")
    else:
        # Plain dict
        fn_name = call.get("function", {}).get("name")
        fn_args = call.get("function", {}).get("arguments", "{}")

    return self._normalize_tool_call(fn_name, fn_args)

def _parse_qwen_fallback(self, raw: str) -> Optional[Tuple[str, Dict[str, Any]]]:
    """Recover a tool call from free-form text (think blocks, <tool_call>
    tags, fenced/bracketed payloads), or return None."""
    text = raw.strip()
    if not text:
        return None

    # Drop a leading <think> block before hunting for the payload.
    text = re.sub(r"^\s*<think>.*?</think>\s*", "", text, flags=re.DOTALL)
    if not text:
        return None

    tagged = re.search(r"<tool_call>\s*(.*?)\s*</tool_call>", text, re.DOTALL)
    if tagged:
        text = tagged.group(1).strip()

    for candidate in self._qwen_payload_candidates(text):
        payload = self._parse_qwen_payload(candidate)
        if not isinstance(payload, dict):
            # One repair attempt: close unbalanced braces in function-call text.
            fixed = self._repair_function_text_payload(candidate)
            if fixed and fixed != candidate:
                payload = self._parse_qwen_payload(fixed)
        if not isinstance(payload, dict):
            continue
        normalized = self._normalize_tool_call(
            payload.get("name"), payload.get("arguments", {})
        )
        if normalized is not None:
            return normalized
    return None

def _qwen_payload_candidates(self, payload_text: str) -> List[str]:
    """Produce ordered, de-duplicated payload variants of the raw text:
    as-is, fence-stripped, tag-stripped, bracket-unwrapped, and unquoted."""
    ordered: List[str] = []
    known: set[str] = set()

    def push(text: str) -> None:
        cleaned = text.strip()
        if cleaned and cleaned not in known:
            known.add(cleaned)
            ordered.append(cleaned)

    push(payload_text)
    base = payload_text.strip()
    # Markdown code fences (``` / ```json) removed.
    push(re.sub(r"^```(?:json)?\s*|\s*```$", "", base, flags=re.DOTALL).strip())
    # A wrapping XML-ish tag pair removed.
    push(re.sub(r"^<[^>]+>\s*|\s*</[^>]+>$", "", base, flags=re.DOTALL).strip())

    bracketed = re.fullmatch(r"\[\s*tool_call:\s*(.+?)\s*\]", base, flags=re.DOTALL)
    if bracketed:
        push(bracketed.group(1))

    decorated = re.fullmatch(r"([`_]+)\s*(.+?)\s*\1", base, flags=re.DOTALL)
    if decorated:
        push(decorated.group(2))
    push(base.strip("`_ \n\t"))
    return ordered

def _parse_qwen_payload(self, payload_text: str) -> Optional[Dict[str, Any]]:
    """Parse one candidate payload into {"name": ..., "arguments": {...}}.

    Accepts either a raw JSON object or ``fn_name({...})`` function-call
    text (with optional backtick/underscore decoration). Returns None when
    the payload cannot be parsed or names an unknown action.
    """
    stripped = payload_text.strip()
    try:
        payload = json.loads(stripped)
    except json.JSONDecodeError:
        payload = None
    if isinstance(payload, dict):
        return payload

    # Fall back to function-call text: strip decoration, then match
    # fn_name({...}) with a JSON-object argument body.
    sanitized = stripped.strip("` \n\t")
    sanitized = re.sub(r"^_+", "", sanitized)
    sanitized = re.sub(r"_+\s*\)$", ")", sanitized)
    match = re.fullmatch(
        r"([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*(\{.*\})\s*\)\s*",
        sanitized,
        flags=re.DOTALL,
    )
    if not match:
        # Try once more after closing any unbalanced braces.
        repaired = self._repair_function_text_payload(sanitized)
        if repaired is not None:
            match = re.fullmatch(
                r"([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*(\{.*\})\s*\)\s*",
                repaired,
                flags=re.DOTALL,
            )
    if not match:
        return None

    fn_name = match.group(1)
    if fn_name not in VALID_ACTION_TYPES:
        return None

    try:
        fn_args = json.loads(match.group(2))
    except json.JSONDecodeError:
        return None
    if not isinstance(fn_args, dict):
        return None
    return {"name": fn_name, "arguments": fn_args}

def _repair_function_text_payload(self, payload_text: str) -> Optional[str]:
    """Close unbalanced braces in ``name({...)``-style tool-call text.

    Returns the repaired ``name({...})`` string, or None when the text is
    not a function call with a brace-opened body that is missing closers.
    """
    text = payload_text.strip()
    head = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", text)
    if head is None or not text.endswith(")"):
        return None

    name = head.group(1)
    inner = text[head.end():-1].strip()
    if not inner.startswith("{"):
        return None

    missing = inner.count("{") - inner.count("}")
    if missing <= 0:
        # Balanced (or over-closed) bodies are not repairable here.
        return None

    repaired = inner + "}" * missing
    return f"{name}({repaired})"

def _normalize_tool_call(self, fn_name: Any, fn_args: Any) -> Optional[Tuple[str, Dict[str, Any]]]:
    """Validate a (name, arguments) pair, decoding JSON-string arguments.

    Returns (name, params) only for a known action name with dict params;
    otherwise None.
    """
    if not isinstance(fn_name, str) or fn_name not in VALID_ACTION_TYPES:
        return None

    if isinstance(fn_args, str):
        try:
            params = json.loads(fn_args)
        except json.JSONDecodeError:
            return None
    else:
        params = fn_args

    return (fn_name, params) if isinstance(params, dict) else None

def serialize_step(self, step: TrajectoryStep) -> List[dict]:
"""Serialize as tool_call + tool response messages (standard OpenAI format)."""
Expand Down
Loading