Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 39 additions & 4 deletions liveweb_arena/core/agent_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from .browser import BrowserSession
from .cache import CacheFatalError
from .memory_patch import apply_memory_patch
from .models import BrowserAction, CompositeTask, TrajectoryStep
from .agent_protocol import AgentProtocol
from ..utils.llm_client import LLMClient, LLMFatalError
Expand Down Expand Up @@ -59,6 +60,8 @@ class AgentLoop:
The loop maintains trajectory state internally for partial recovery on timeout.
"""

MEMORY_MAX_PATCH_ADD_CHARS = 120

def __init__(
self,
session: BrowserSession,
Expand All @@ -81,6 +84,7 @@ def __init__(
self._trajectory: List[TrajectoryStep] = []
self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "last_call": None}
self._final_answer = None
self._working_memory = ""

def get_trajectory(self) -> List[TrajectoryStep]:
"""Get current trajectory (for partial recovery on timeout)"""
Expand All @@ -94,6 +98,17 @@ def get_final_answer(self) -> Any:
"""Get final answer if available"""
return self._final_answer

def get_working_memory(self) -> str:
"""Get current working memory document."""
return self._working_memory

def _apply_memory_patch(self, patch_text: str) -> str:
"""Apply a simplified diff patch to the working memory document."""
result = apply_memory_patch(self._working_memory, patch_text, self.MEMORY_MAX_PATCH_ADD_CHARS)
if result.applied:
self._working_memory = result.document
return result.message

async def _call_llm(
self, system_prompt: str, user_prompt: str, model: str,
temperature: float, seed: Optional[int],
Expand Down Expand Up @@ -146,6 +161,7 @@ async def run(
self._trajectory = []
self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "last_call": None}
self._final_answer = None
self._working_memory = ""
self._max_steps_reached = False
self._parse_failed = False

Expand Down Expand Up @@ -208,7 +224,11 @@ async def run(
current_obs = obs
step_num = effective_step - 1 # 0-indexed step number for trajectory
user_prompt = self._protocol.build_step_prompt(
current_obs, self._trajectory, effective_step, self._max_steps
current_obs,
self._trajectory,
effective_step,
self._max_steps,
working_memory=self._working_memory,
)

try:
Expand Down Expand Up @@ -256,23 +276,33 @@ async def run(
action_result="Parse failed - model output not valid JSON",
prompt=user_prompt,
raw_response=raw_response,
memory_snapshot=self.get_working_memory(),
)
self._trajectory.append(step)
self._parse_failed = True
break

memory_patch_result = None
memory_patch = action.params.get("memory_patch")

if action.action_type == "stop":
if memory_patch is not None:
memory_patch_result = self._apply_memory_patch(memory_patch)
final_params = action.params.get("final", {})
self._final_answer = final_params if final_params else action.params
log("Agent", f"Completed: {self._final_answer}")

action_result = "Task completed"
if memory_patch_result:
action_result += f" | {memory_patch_result}"
step = TrajectoryStep(
step_num=step_num,
observation=current_obs,
action=action,
action_result="Task completed",
action_result=action_result,
prompt=user_prompt,
raw_response=raw_response,
memory_snapshot=self.get_working_memory(),
)
self._trajectory.append(step)

Expand All @@ -287,9 +317,9 @@ async def run(
log("Agent", f"Action: {action.action_type}")
old_url = obs.url if obs else None

# Execute action - browser handles navigation errors internally
# and returns error pages as valid observations
try:
# Execute action - browser handles navigation errors internally
# and returns error pages as valid observations
obs = await self._session.execute_action(action)
action_result = "Success"

Expand All @@ -309,13 +339,18 @@ async def run(
# Non-navigation action failed
action_result = f"Failed: {e}"

if memory_patch is not None:
memory_patch_result = self._apply_memory_patch(memory_patch)
action_result = f"{action_result} | {memory_patch_result}"

step = TrajectoryStep(
step_num=step_num,
observation=current_obs,
action=action,
action_result=action_result,
prompt=user_prompt,
raw_response=raw_response,
memory_snapshot=self.get_working_memory(),
)
self._trajectory.append(step)

Expand Down
40 changes: 34 additions & 6 deletions liveweb_arena/core/agent_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def build_step_prompt(
trajectory: List[TrajectoryStep],
current_step: int,
max_steps: int,
working_memory: Optional[str] = None,
) -> str:
"""Build the per-step user message with current observation."""

Expand Down Expand Up @@ -182,9 +183,14 @@ def serialize_step(self, step: TrajectoryStep) -> List[dict]:
{accessibility_tree}
```

## State

### Recent Actions
{recent_actions}

### Working Memory
{working_memory}

**Step {current_step}/{max_steps}** ({remaining_steps} steps remaining){last_step_warning}
"""

Expand All @@ -198,6 +204,7 @@ def _build_step_prompt_common(
trajectory: List[TrajectoryStep],
current_step: int,
max_steps: int,
working_memory: Optional[str] = None,
max_recent_steps: int = 5,
format_step_fn=None,
) -> str:
Expand All @@ -217,6 +224,8 @@ def _build_step_prompt_common(
else:
recent_actions = "(no actions yet)"

working_memory_text = (working_memory or "").strip() or "(empty)"

remaining_steps = max_steps - current_step
last_step_warning = _LAST_STEP_WARNING if remaining_steps == 0 else ""

Expand All @@ -225,13 +234,12 @@ def _build_step_prompt_common(
title=obs.title,
accessibility_tree=obs.accessibility_tree,
recent_actions=recent_actions,
working_memory=working_memory_text,
current_step=current_step,
max_steps=max_steps,
remaining_steps=remaining_steps,
last_step_warning=last_step_warning,
)


class FunctionCallingProtocol(AgentProtocol):
"""
Standard OpenAI function calling protocol.
Expand All @@ -249,12 +257,21 @@ def _build_tools(self) -> List[dict]:
"""Build OpenAI-format tool definitions from BROWSER_ACTIONS."""
tools = []
for name, spec in BROWSER_ACTIONS.items():
parameters = json.loads(json.dumps(spec["parameters"]))
parameters.setdefault("properties", {})["memory_patch"] = {
"type": "string",
"description": (
"Optional simplified diff patch for working memory. "
"Use the format '@@' on the first line, then lines starting with '- ' "
"to delete an exact memory line or '+ ' to add a new memory line."
),
}
tools.append({
"type": "function",
"function": {
"name": name,
"description": spec["description"],
"parameters": spec["parameters"],
"parameters": parameters,
},
})
return tools
Expand Down Expand Up @@ -285,6 +302,7 @@ def build_step_prompt(
trajectory: List[TrajectoryStep],
current_step: int = 1,
max_steps: int = 30,
working_memory: Optional[str] = None,
) -> str:
def format_step(step: TrajectoryStep) -> str:
if step.action:
Expand All @@ -296,9 +314,13 @@ def format_step(step: TrajectoryStep) -> str:

prompt = _build_step_prompt_common(
obs, trajectory, current_step, max_steps,
self._max_recent_steps, format_step,
working_memory, self._max_recent_steps, format_step,
)
return (
prompt
+ "\nWhat is your next action? Use one of the available tools."
+ "\nYou may optionally include one memory_patch string in your tool arguments."
)
return prompt + "\nWhat is your next action? Use one of the available tools."

def get_tools(self) -> List[dict]:
return self._tools
Expand Down Expand Up @@ -334,7 +356,10 @@ def parse_response(self, raw: str, tool_calls: Optional[List[Any]] = None) -> Op
# Normalize stop action format for compatibility with existing agent_loop
if fn_name == "stop":
answers = params.get("answers", {})
memory_patch = params.get("memory_patch")
params = {"final": {"answers": answers}}
if memory_patch is not None:
params["memory_patch"] = memory_patch

return BrowserAction(action_type=fn_name, params=params)

Expand All @@ -353,8 +378,11 @@ def serialize_step(self, step: TrajectoryStep) -> List[dict]:
# Denormalize stop params back to tool format
final = step.action.params.get("final", {})
args = {"answers": final.get("answers", {})}
memory_patch = step.action.params.get("memory_patch")
if memory_patch is not None:
args["memory_patch"] = memory_patch
else:
args = step.action.params
args = dict(step.action.params)

tool_call_id = f"call_{step.step_num}"
messages.append({
Expand Down
1 change: 1 addition & 0 deletions liveweb_arena/core/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ def _load_cache(self, cache_file: Path, need_api: bool, allow_stale: bool) -> Op
return None

if not allow_stale and cached.is_expired(self.ttl):
self._delete_cache(cache_file)
return None

# Check if cache is complete
Expand Down
64 changes: 64 additions & 0 deletions liveweb_arena/core/memory_patch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Shared working-memory diff patch utilities."""

from dataclasses import dataclass


@dataclass(frozen=True)
class MemoryPatchResult:
    """Outcome of attempting to apply a working-memory patch."""

    document: str  # resulting memory document (unchanged when the patch is rejected)
    message: str   # human-readable status line, suitable for appending to an action result
    applied: bool  # True only when the document was actually modified


def apply_memory_patch(
    document: str,
    patch_text: str,
    max_patch_add_chars: int,
) -> MemoryPatchResult:
    """Apply a simplified line-based diff patch to a memory document.

    The expected patch format is a first line of exactly ``@@`` followed by
    lines prefixed ``- `` (delete one exact matching line) or ``+ `` (append
    one line). Any malformed input rejects the whole patch and leaves the
    document untouched; the returned message explains why.
    """

    def rejected(reason: str) -> MemoryPatchResult:
        # Rejection is atomic: the caller's document comes back unmodified.
        return MemoryPatchResult(document=document, message=reason, applied=False)

    if not isinstance(patch_text, str):
        return rejected("Memory patch ignored: patch must be a string")

    # Drop blank lines and trailing whitespace before parsing.
    body = [raw.rstrip() for raw in patch_text.splitlines() if raw.strip()]
    if not body or body[0] != "@@":
        return rejected("Memory patch ignored: invalid diff header")

    to_delete: list[str] = []
    to_add: list[str] = []

    for entry in body[1:]:
        prefix, payload = entry[:2], entry[2:]
        if prefix == "- ":
            to_delete.append(payload)
        elif prefix == "+ ":
            if not payload:
                return rejected("Memory patch ignored: empty addition")
            to_add.append(payload)
        else:
            return rejected("Memory patch ignored: invalid diff line")

    # Cap how much new content a single patch may introduce.
    added_chars = sum(len(text) for text in to_add)
    if added_chars > max_patch_add_chars:
        return rejected(
            f"Memory patch ignored: added content exceeds {max_patch_add_chars} characters"
        )

    # Deletions must match existing lines exactly; any miss rejects the patch.
    working = document.splitlines()
    for target in to_delete:
        if target not in working:
            return rejected("Memory patch ignored: deletion target not found")
        working.remove(target)

    working.extend(to_add)
    return MemoryPatchResult(
        document="\n".join(working),
        message=f"Memory patch applied: -{len(to_delete)}, +{added_chars} chars",
        applied=True,
    )
1 change: 1 addition & 0 deletions liveweb_arena/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ class TrajectoryStep:
action_result: str = ""
prompt: Optional[str] = None # Actual prompt sent to LLM
raw_response: Optional[str] = None # Raw LLM response (used for history and conversation)
memory_snapshot: str = ""
15 changes: 9 additions & 6 deletions liveweb_arena/core/task_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ class TaskRegistry:
86: ("openmeteo", "openmeteo_comparison"),
87: ("openmeteo", "openmeteo_hourly_extrema"),
88: ("openmeteo", "openmeteo_forecast_trend"),
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
97: ("openlibrary", "openlibrary_author_comparison"),
98: ("openlibrary", "openlibrary_reading_stats_filter"),
99: ("openmeteo", "openmeteo_hourly_threshold"),
100: ("openmeteo", "openmeteo_sunrise_sunset"),
101: ("openmeteo", "openmeteo_hourly_time_of"),
Expand All @@ -156,11 +159,10 @@ class TaskRegistry:
92: ("arxiv", "arxiv_category_comparison"),
94: ("arxiv", "arxiv_multi_author_filter"),
95: ("arxiv", "arxiv_title_length_extrema"),

# Open Library templates — engagement & comparison
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
97: ("openlibrary", "openlibrary_author_comparison"),
98: ("openlibrary", "openlibrary_reading_stats_filter"),
110: ("openmeteo", "openmeteo_daily_precip_peak_day"),
111: ("openlibrary", "openlibrary_subject_nested_work_title"),
112: ("arxiv", "arxiv_category_infer_title_substring"),
113: ("arxiv", "arxiv_category_infer_author_filter"),
}

# Template versions - each version's combinations come AFTER all previous versions
Expand Down Expand Up @@ -190,10 +192,11 @@ class TaskRegistry:
# Version 6: ArXiv templates
[90, 91, 92, 94, 95],
# Version 7: Open Library engagement & comparison templates (PR #13)
# NOTE: PR #14 (openmeteo IDs 99-101) must use Version 8.
[96, 97, 98],
# Version 8: Additional Open Meteo templates
[99, 100, 101],
# Version 9: Cross-site templates (daylight calendar, subject hub, arXiv category feeds)
[110, 111, 112, 113],
]

# Combination registry: list of template ID tuples
Expand Down
4 changes: 4 additions & 0 deletions liveweb_arena/plugins/arxiv/templates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
from .multi_author_filter import ArxivMultiAuthorFilterTemplate
from .title_length_extrema import ArxivTitleLengthExtremaTemplate
from .category_comparison import ArxivCategoryComparisonTemplate
from .category_infer_title_substring import ArxivCategoryInferTitleSubstringTemplate
from .category_infer_author_filter import ArxivCategoryInferAuthorFilterTemplate

__all__ = [
"ArxivPaperInfoTemplate",
"ArxivAuthorExtremaTemplate",
"ArxivMultiAuthorFilterTemplate",
"ArxivTitleLengthExtremaTemplate",
"ArxivCategoryComparisonTemplate",
"ArxivCategoryInferTitleSubstringTemplate",
"ArxivCategoryInferAuthorFilterTemplate",
]
Loading