Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 39 additions & 4 deletions liveweb_arena/core/agent_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from .browser import BrowserSession
from .cache import CacheFatalError
from .memory_patch import apply_memory_patch
from .models import BrowserAction, CompositeTask, TrajectoryStep
from .agent_protocol import AgentProtocol
from ..utils.llm_client import LLMClient, LLMFatalError
Expand Down Expand Up @@ -59,6 +60,8 @@ class AgentLoop:
The loop maintains trajectory state internally for partial recovery on timeout.
"""

MEMORY_MAX_PATCH_ADD_CHARS = 120

def __init__(
self,
session: BrowserSession,
Expand All @@ -81,6 +84,7 @@ def __init__(
self._trajectory: List[TrajectoryStep] = []
self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "last_call": None}
self._final_answer = None
self._working_memory = ""

def get_trajectory(self) -> List[TrajectoryStep]:
"""Get current trajectory (for partial recovery on timeout)"""
Expand All @@ -94,6 +98,17 @@ def get_final_answer(self) -> Any:
"""Get final answer if available"""
return self._final_answer

def get_working_memory(self) -> str:
"""Get current working memory document."""
return self._working_memory

def _apply_memory_patch(self, patch_text: str) -> str:
"""Apply a simplified diff patch to the working memory document."""
result = apply_memory_patch(self._working_memory, patch_text, self.MEMORY_MAX_PATCH_ADD_CHARS)
if result.applied:
self._working_memory = result.document
return result.message

async def _call_llm(
self, system_prompt: str, user_prompt: str, model: str,
temperature: float, seed: Optional[int],
Expand Down Expand Up @@ -146,6 +161,7 @@ async def run(
self._trajectory = []
self._total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "last_call": None}
self._final_answer = None
self._working_memory = ""
self._max_steps_reached = False
self._parse_failed = False

Expand Down Expand Up @@ -208,7 +224,11 @@ async def run(
current_obs = obs
step_num = effective_step - 1 # 0-indexed step number for trajectory
user_prompt = self._protocol.build_step_prompt(
current_obs, self._trajectory, effective_step, self._max_steps
current_obs,
self._trajectory,
effective_step,
self._max_steps,
working_memory=self._working_memory,
)

try:
Expand Down Expand Up @@ -256,23 +276,33 @@ async def run(
action_result="Parse failed - model output not valid JSON",
prompt=user_prompt,
raw_response=raw_response,
memory_snapshot=self.get_working_memory(),
)
self._trajectory.append(step)
self._parse_failed = True
break

memory_patch_result = None
memory_patch = action.params.get("memory_patch")

if action.action_type == "stop":
if memory_patch is not None:
memory_patch_result = self._apply_memory_patch(memory_patch)
final_params = action.params.get("final", {})
self._final_answer = final_params if final_params else action.params
log("Agent", f"Completed: {self._final_answer}")

action_result = "Task completed"
if memory_patch_result:
action_result += f" | {memory_patch_result}"
step = TrajectoryStep(
step_num=step_num,
observation=current_obs,
action=action,
action_result="Task completed",
action_result=action_result,
prompt=user_prompt,
raw_response=raw_response,
memory_snapshot=self.get_working_memory(),
)
self._trajectory.append(step)

Expand All @@ -287,9 +317,9 @@ async def run(
log("Agent", f"Action: {action.action_type}")
old_url = obs.url if obs else None

# Execute action - browser handles navigation errors internally
# and returns error pages as valid observations
try:
# Execute action - browser handles navigation errors internally
# and returns error pages as valid observations
obs = await self._session.execute_action(action)
action_result = "Success"

Expand All @@ -309,13 +339,18 @@ async def run(
# Non-navigation action failed
action_result = f"Failed: {e}"

if memory_patch is not None:
memory_patch_result = self._apply_memory_patch(memory_patch)
action_result = f"{action_result} | {memory_patch_result}"

step = TrajectoryStep(
step_num=step_num,
observation=current_obs,
action=action,
action_result=action_result,
prompt=user_prompt,
raw_response=raw_response,
memory_snapshot=self.get_working_memory(),
)
self._trajectory.append(step)

Expand Down
40 changes: 34 additions & 6 deletions liveweb_arena/core/agent_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def build_step_prompt(
trajectory: List[TrajectoryStep],
current_step: int,
max_steps: int,
working_memory: Optional[str] = None,
) -> str:
"""Build the per-step user message with current observation."""

Expand Down Expand Up @@ -182,9 +183,14 @@ def serialize_step(self, step: TrajectoryStep) -> List[dict]:
{accessibility_tree}
```

## State

### Recent Actions
{recent_actions}

### Working Memory
{working_memory}

**Step {current_step}/{max_steps}** ({remaining_steps} steps remaining){last_step_warning}
"""

Expand All @@ -198,6 +204,7 @@ def _build_step_prompt_common(
trajectory: List[TrajectoryStep],
current_step: int,
max_steps: int,
working_memory: Optional[str] = None,
max_recent_steps: int = 5,
format_step_fn=None,
) -> str:
Expand All @@ -217,6 +224,8 @@ def _build_step_prompt_common(
else:
recent_actions = "(no actions yet)"

working_memory_text = (working_memory or "").strip() or "(empty)"

remaining_steps = max_steps - current_step
last_step_warning = _LAST_STEP_WARNING if remaining_steps == 0 else ""

Expand All @@ -225,13 +234,12 @@ def _build_step_prompt_common(
title=obs.title,
accessibility_tree=obs.accessibility_tree,
recent_actions=recent_actions,
working_memory=working_memory_text,
current_step=current_step,
max_steps=max_steps,
remaining_steps=remaining_steps,
last_step_warning=last_step_warning,
)


class FunctionCallingProtocol(AgentProtocol):
"""
Standard OpenAI function calling protocol.
Expand All @@ -249,12 +257,21 @@ def _build_tools(self) -> List[dict]:
"""Build OpenAI-format tool definitions from BROWSER_ACTIONS."""
tools = []
for name, spec in BROWSER_ACTIONS.items():
parameters = json.loads(json.dumps(spec["parameters"]))
parameters.setdefault("properties", {})["memory_patch"] = {
"type": "string",
"description": (
"Optional simplified diff patch for working memory. "
"Use the format '@@' on the first line, then lines starting with '- ' "
"to delete an exact memory line or '+ ' to add a new memory line."
),
}
tools.append({
"type": "function",
"function": {
"name": name,
"description": spec["description"],
"parameters": spec["parameters"],
"parameters": parameters,
},
})
return tools
Expand Down Expand Up @@ -285,6 +302,7 @@ def build_step_prompt(
trajectory: List[TrajectoryStep],
current_step: int = 1,
max_steps: int = 30,
working_memory: Optional[str] = None,
) -> str:
def format_step(step: TrajectoryStep) -> str:
if step.action:
Expand All @@ -296,9 +314,13 @@ def format_step(step: TrajectoryStep) -> str:

prompt = _build_step_prompt_common(
obs, trajectory, current_step, max_steps,
self._max_recent_steps, format_step,
working_memory, self._max_recent_steps, format_step,
)
return (
prompt
+ "\nWhat is your next action? Use one of the available tools."
+ "\nYou may optionally include one memory_patch string in your tool arguments."
)
return prompt + "\nWhat is your next action? Use one of the available tools."

def get_tools(self) -> List[dict]:
return self._tools
Expand Down Expand Up @@ -334,7 +356,10 @@ def parse_response(self, raw: str, tool_calls: Optional[List[Any]] = None) -> Op
# Normalize stop action format for compatibility with existing agent_loop
if fn_name == "stop":
answers = params.get("answers", {})
memory_patch = params.get("memory_patch")
params = {"final": {"answers": answers}}
if memory_patch is not None:
params["memory_patch"] = memory_patch

return BrowserAction(action_type=fn_name, params=params)

Expand All @@ -353,8 +378,11 @@ def serialize_step(self, step: TrajectoryStep) -> List[dict]:
# Denormalize stop params back to tool format
final = step.action.params.get("final", {})
args = {"answers": final.get("answers", {})}
memory_patch = step.action.params.get("memory_patch")
if memory_patch is not None:
args["memory_patch"] = memory_patch
else:
args = step.action.params
args = dict(step.action.params)

tool_call_id = f"call_{step.step_num}"
messages.append({
Expand Down
1 change: 1 addition & 0 deletions liveweb_arena/core/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ def _load_cache(self, cache_file: Path, need_api: bool, allow_stale: bool) -> Op
return None

if not allow_stale and cached.is_expired(self.ttl):
self._delete_cache(cache_file)
return None

# Check if cache is complete
Expand Down
64 changes: 64 additions & 0 deletions liveweb_arena/core/memory_patch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Shared working-memory diff patch utilities."""

from dataclasses import dataclass


@dataclass(frozen=True)
class MemoryPatchResult:
    """Outcome of attempting to apply a working-memory patch."""

    document: str  # resulting memory document (unchanged when the patch is rejected)
    message: str   # human-readable status line, suitable for appending to an action result
    applied: bool  # True only when the document was actually modified


def apply_memory_patch(
    document: str,
    patch_text: str,
    max_patch_add_chars: int,
) -> MemoryPatchResult:
    """Apply a simplified line-based diff patch to a memory document.

    The expected patch format is a first line of exactly ``@@`` followed by
    lines prefixed ``- `` (delete one exact matching line) or ``+ `` (append
    one line). Any malformed input rejects the whole patch and leaves the
    document untouched; the returned message explains why.
    """

    def rejected(reason: str) -> MemoryPatchResult:
        # Rejection is atomic: the caller's document comes back unmodified.
        return MemoryPatchResult(document=document, message=reason, applied=False)

    if not isinstance(patch_text, str):
        return rejected("Memory patch ignored: patch must be a string")

    # Drop blank lines and trailing whitespace before parsing.
    body = [raw.rstrip() for raw in patch_text.splitlines() if raw.strip()]
    if not body or body[0] != "@@":
        return rejected("Memory patch ignored: invalid diff header")

    to_delete: list[str] = []
    to_add: list[str] = []

    for entry in body[1:]:
        prefix, payload = entry[:2], entry[2:]
        if prefix == "- ":
            to_delete.append(payload)
        elif prefix == "+ ":
            if not payload:
                return rejected("Memory patch ignored: empty addition")
            to_add.append(payload)
        else:
            return rejected("Memory patch ignored: invalid diff line")

    # Cap how much new content a single patch may introduce.
    added_chars = sum(len(text) for text in to_add)
    if added_chars > max_patch_add_chars:
        return rejected(
            f"Memory patch ignored: added content exceeds {max_patch_add_chars} characters"
        )

    # Deletions must match existing lines exactly; any miss rejects the patch.
    working = document.splitlines()
    for target in to_delete:
        if target not in working:
            return rejected("Memory patch ignored: deletion target not found")
        working.remove(target)

    working.extend(to_add)
    return MemoryPatchResult(
        document="\n".join(working),
        message=f"Memory patch applied: -{len(to_delete)}, +{added_chars} chars",
        applied=True,
    )
1 change: 1 addition & 0 deletions liveweb_arena/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,4 @@ class TrajectoryStep:
action_result: str = ""
prompt: Optional[str] = None # Actual prompt sent to LLM
raw_response: Optional[str] = None # Raw LLM response (used for history and conversation)
memory_snapshot: str = ""
15 changes: 9 additions & 6 deletions liveweb_arena/core/task_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ class TaskRegistry:
86: ("openmeteo", "openmeteo_comparison"),
87: ("openmeteo", "openmeteo_hourly_extrema"),
88: ("openmeteo", "openmeteo_forecast_trend"),
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
97: ("openlibrary", "openlibrary_author_comparison"),
98: ("openlibrary", "openlibrary_reading_stats_filter"),
99: ("openmeteo", "openmeteo_hourly_threshold"),
100: ("openmeteo", "openmeteo_sunrise_sunset"),
101: ("openmeteo", "openmeteo_hourly_time_of"),
Expand All @@ -156,11 +159,10 @@ class TaskRegistry:
92: ("arxiv", "arxiv_category_comparison"),
94: ("arxiv", "arxiv_multi_author_filter"),
95: ("arxiv", "arxiv_title_length_extrema"),

# Open Library templates — engagement & comparison
96: ("openlibrary", "openlibrary_author_engagement_extrema"),
97: ("openlibrary", "openlibrary_author_comparison"),
98: ("openlibrary", "openlibrary_reading_stats_filter"),
110: ("openmeteo", "openmeteo_daily_precip_peak_day"),
111: ("openlibrary", "openlibrary_subject_nested_work_title"),
112: ("arxiv", "arxiv_category_infer_title_substring"),
113: ("arxiv", "arxiv_category_infer_author_filter"),
}

# Template versions - each version's combinations come AFTER all previous versions
Expand Down Expand Up @@ -190,10 +192,11 @@ class TaskRegistry:
# Version 6: ArXiv templates
[90, 91, 92, 94, 95],
# Version 7: Open Library engagement & comparison templates (PR #13)
# NOTE: PR #14 (openmeteo IDs 99-101) must use Version 8.
[96, 97, 98],
# Version 8: Additional Open Meteo templates
[99, 100, 101],
# Version 9: Cross-site templates (daylight calendar, subject hub, arXiv category feeds)
[110, 111, 112, 113],
]

# Combination registry: list of template ID tuples
Expand Down
4 changes: 4 additions & 0 deletions liveweb_arena/plugins/arxiv/templates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
from .multi_author_filter import ArxivMultiAuthorFilterTemplate
from .title_length_extrema import ArxivTitleLengthExtremaTemplate
from .category_comparison import ArxivCategoryComparisonTemplate
from .category_infer_title_substring import ArxivCategoryInferTitleSubstringTemplate
from .category_infer_author_filter import ArxivCategoryInferAuthorFilterTemplate

__all__ = [
"ArxivPaperInfoTemplate",
"ArxivAuthorExtremaTemplate",
"ArxivMultiAuthorFilterTemplate",
"ArxivTitleLengthExtremaTemplate",
"ArxivCategoryComparisonTemplate",
"ArxivCategoryInferTitleSubstringTemplate",
"ArxivCategoryInferAuthorFilterTemplate",
]
Loading