From 95458eafdd81ffdc0d7066beb4e26240ee843ba6 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Fri, 13 Mar 2026 17:10:33 +0800 Subject: [PATCH 01/33] Fix sandbox image permissions for arbitrary users --- platform/sandbox/Dockerfile.base | 10 ++++++---- platform/tests/test_sandbox_env_contract.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/platform/sandbox/Dockerfile.base b/platform/sandbox/Dockerfile.base index d26014d..7c7f017 100644 --- a/platform/sandbox/Dockerfile.base +++ b/platform/sandbox/Dockerfile.base @@ -17,11 +17,13 @@ COPY sandbox/requirements.txt /app/requirements.txt RUN pip install --no-cache-dir -r /app/requirements.txt # Copy agent server -COPY sandbox/agent_server.py /app/agent_server.py -COPY sandbox/tool_policy.py /app/tool_policy.py +COPY --chown=agent:agent sandbox/agent_server.py /app/agent_server.py +COPY --chown=agent:agent sandbox/tool_policy.py /app/tool_policy.py -# Copy skills -COPY skills/ /skills/ +# Copy skills and keep them readable for sandbox processes that run as +# the workspace owner instead of the image's default agent user. 
+COPY --chown=agent:agent skills/ /skills/ +RUN chmod -R a+rX /app /skills EXPOSE 9090 diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index a4782b9..b472466 100644 --- a/platform/tests/test_sandbox_env_contract.py +++ b/platform/tests/test_sandbox_env_contract.py @@ -143,3 +143,19 @@ def test_coding_sandbox_image_provides_java_toolchain(): assert "JAVA8_HOME" in content assert "JAVA17_HOME" in content assert "ENV JAVA_HOME=/opt/jdk17" in content + + +def test_base_sandbox_image_makes_runtime_entrypoints_world_readable(): + dockerfile_path = Path(__file__).resolve().parents[1] / "sandbox" / "Dockerfile.base" + content = dockerfile_path.read_text(encoding="utf-8") + + assert "COPY --chown=agent:agent sandbox/agent_server.py /app/agent_server.py" in content + assert "COPY --chown=agent:agent sandbox/tool_policy.py /app/tool_policy.py" in content + assert "RUN chmod -R a+rX /app /skills" in content + + +def test_base_sandbox_image_copies_skills_with_agent_ownership(): + dockerfile_path = Path(__file__).resolve().parents[1] / "sandbox" / "Dockerfile.base" + content = dockerfile_path.read_text(encoding="utf-8") + + assert "COPY --chown=agent:agent skills/ /skills/" in content From f30f0c2cceaa50e17040bafd61256d685d5d6a44 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Sat, 14 Mar 2026 13:01:12 +0800 Subject: [PATCH 02/33] chore: raise circuit breaker defaults and optimize coding image --- platform/app/config.py | 4 ++-- platform/sandbox/Dockerfile.coding | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/platform/app/config.py b/platform/app/config.py index 8e0a8f1..efa949b 100644 --- a/platform/app/config.py +++ b/platform/app/config.py @@ -38,8 +38,8 @@ class Settings(BaseSettings): DB_POOL_TIMEOUT: int = 30 # Circuit breaker configuration - CB_MAX_TOKENS_PER_TASK: int = 200000 - CB_MAX_COST_PER_TASK_RMB: float = 50.0 + CB_MAX_TOKENS_PER_TASK: int = 400000 + 
CB_MAX_COST_PER_TASK_RMB: float = 100.0 CB_TOKEN_PRICE_PER_1K: float = 0.01 # Webhook secrets (empty = skip verification) diff --git a/platform/sandbox/Dockerfile.coding b/platform/sandbox/Dockerfile.coding index 9299e6d..a20e3f8 100644 --- a/platform/sandbox/Dockerfile.coding +++ b/platform/sandbox/Dockerfile.coding @@ -2,14 +2,24 @@ FROM eclipse-temurin:8-jdk AS jdk8 FROM eclipse-temurin:17-jdk AS jdk17 FROM silicon-agent-sandbox:base +ARG GRADLE_VERSION=8.5 + USER root # Development toolchain RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential gcc g++ make \ nodejs npm \ + unzip \ && rm -rf /var/lib/apt/lists/* +# Install Gradle from the official distribution so we can reuse the bundled +# JDK 17 and avoid pulling Debian's extra OpenJDK runtime chain. +RUN wget -q "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip" -O /tmp/gradle.zip && \ + unzip -q /tmp/gradle.zip -d /opt && \ + ln -s "/opt/gradle-${GRADLE_VERSION}/bin/gradle" /usr/local/bin/gradle && \ + rm -f /tmp/gradle.zip + COPY --from=jdk8 /opt/java/openjdk /opt/jdk8 COPY --from=jdk17 /opt/java/openjdk /opt/jdk17 @@ -18,6 +28,9 @@ ENV JAVA17_HOME=/opt/jdk17 ENV JAVA_HOME=/opt/jdk17 ENV PATH="${JAVA_HOME}/bin:${PATH}" +RUN ln -sf /opt/jdk17/bin/java /usr/local/bin/java && \ + ln -sf /opt/jdk17/bin/javac /usr/local/bin/javac + # Common Python dev tools RUN pip install --no-cache-dir \ pytest ruff black mypy \ From 6ec415b1e35285ba0ba3b4a794153b85b18a0f2d Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Sat, 14 Mar 2026 14:24:42 +0800 Subject: [PATCH 03/33] fix(sandbox): force system gradle with wrapper fallback and timeout --- platform/app/config.py | 6 + platform/app/worker/sandbox.py | 10 ++ platform/sandbox/agent_server.py | 147 ++++++++++++++++++++ platform/tests/test_sandbox_agent_server.py | 88 ++++++++++++ platform/tests/test_sandbox_env_contract.py | 6 + 5 files changed, 257 insertions(+) diff --git a/platform/app/config.py 
b/platform/app/config.py index efa949b..f435136 100644 --- a/platform/app/config.py +++ b/platform/app/config.py @@ -41,6 +41,9 @@ class Settings(BaseSettings): CB_MAX_TOKENS_PER_TASK: int = 400000 CB_MAX_COST_PER_TASK_RMB: float = 100.0 CB_TOKEN_PRICE_PER_1K: float = 0.01 + # Per-stage token budgets (JSON string). Empty means disabled. + # Example: {"parse": 60000, "code": 250000} + CB_STAGE_TOKEN_BUDGETS: str = '{"parse": 60000, "code": 250000}' # Webhook secrets (empty = skip verification) JIRA_WEBHOOK_SECRET: str = "" @@ -96,6 +99,9 @@ class Settings(BaseSettings): SANDBOX_ROLES: str = '["coding", "test"]' SANDBOX_DUMP_MODEL_API_RESPONSE: bool = True SANDBOX_MODEL_API_RAW_LOG_HOST_DIR: str = "/tmp/silicon_agent/model_api_logs" + SANDBOX_FORCE_SYSTEM_GRADLE: bool = True + SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS: int = 480 + SANDBOX_ALLOW_WRAPPER_FALLBACK: bool = True # Memory & compression configuration MEMORY_ENABLED: bool = True diff --git a/platform/app/worker/sandbox.py b/platform/app/worker/sandbox.py index f84a703..7cf63f0 100644 --- a/platform/app/worker/sandbox.py +++ b/platform/app/worker/sandbox.py @@ -482,6 +482,16 @@ def _build_docker_run_cmd( ) if capture_model_api_raw and container_raw_log_path: parts.extend(["-e", f"SANDBOX_MODEL_API_RAW_LOG_PATH={container_raw_log_path}"]) + parts.extend( + [ + "-e", + f"SANDBOX_FORCE_SYSTEM_GRADLE={'true' if settings.SANDBOX_FORCE_SYSTEM_GRADLE else 'false'}", + "-e", + f"SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS={int(settings.SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS)}", + "-e", + f"SANDBOX_ALLOW_WRAPPER_FALLBACK={'true' if settings.SANDBOX_ALLOW_WRAPPER_FALLBACK else 'false'}", + ] + ) parts.append(image) diff --git a/platform/sandbox/agent_server.py b/platform/sandbox/agent_server.py index 1edf197..e3d71b5 100644 --- a/platform/sandbox/agent_server.py +++ b/platform/sandbox/agent_server.py @@ -14,6 +14,7 @@ import logging import os import re +import shlex import sys import time from datetime import datetime, timezone @@ -76,6 
+77,10 @@ r"sourceCompatibility\s*=\s*(?:['\"]?17['\"]?|JavaVersion\.VERSION_17)", r"targetCompatibility\s*=\s*(?:['\"]?17['\"]?|JavaVersion\.VERSION_17)", ) +_GRADLE_WRAPPER_CMD_RE = re.compile(r"(? str: @@ -168,6 +173,88 @@ def _sanitize_reasoning_kwargs_for_model( return sanitized_kwargs +def _env_flag(name: str, default: bool) -> bool: + raw = (os.environ.get(name) or "").strip().lower() + if not raw: + return default + return raw in {"1", "true", "yes", "on"} + + +def _env_int(name: str, default: int) -> int: + raw = (os.environ.get(name) or "").strip() + if not raw: + return default + try: + parsed = int(raw) + except ValueError: + return default + return parsed if parsed > 0 else default + + +def _rewrite_gradle_wrapper_command(command: str) -> tuple[str, bool]: + rewritten, count = _GRADLE_WRAPPER_CMD_RE.subn("gradle", command) + return rewritten, count > 0 + + +def _build_gradle_command( + *, + original_command: str, + rewritten_command: str, + timeout_seconds: int, + allow_wrapper_fallback: bool, +) -> tuple[str, str]: + payload = rewritten_command + strategy = "system" + if allow_wrapper_fallback and rewritten_command != original_command: + strategy = "wrapper_fallback" + payload = ( + f"{rewritten_command}; __sa_rc=$?; " + f"if [ $__sa_rc -eq 126 ] || [ $__sa_rc -eq 127 ]; then {original_command}; " + f"else exit $__sa_rc; fi" + ) + if timeout_seconds > 0: + payload = f"timeout {timeout_seconds}s bash -lc {shlex.quote(payload)}" + return payload, strategy + + +async def _run_runtime_preflight_once() -> None: + global _RUNTIME_PREFLIGHT_DONE + if _RUNTIME_PREFLIGHT_DONE: + return + + async with _RUNTIME_PREFLIGHT_LOCK: + if _RUNTIME_PREFLIGHT_DONE: + return + + async def _run_line(cmd: str, *, timeout: float = 5.0) -> str: + proc = await asyncio.create_subprocess_exec( + "sh", "-lc", cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + try: + out, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout) + except 
Exception: + with contextlib.suppress(Exception): + proc.kill() + await proc.communicate() + return "" + return (out or b"").decode("utf-8", errors="ignore").strip() + + gradle_line = await _run_line("gradle -v 2>&1 | head -n 5") + java_line = await _run_line("java -version 2>&1 | head -n 3") + if gradle_line: + logger.info("sandbox_runtime_preflight gradle=%s", gradle_line.replace("\n", " | ")) + else: + logger.info("sandbox_runtime_preflight gradle=unavailable") + if java_line: + logger.info("sandbox_runtime_preflight java=%s", java_line.replace("\n", " | ")) + else: + logger.info("sandbox_runtime_preflight java=unavailable") + + _RUNTIME_PREFLIGHT_DONE = True + + def _extract_gemini_thought_signatures_from_response(response_obj: Any) -> dict[str, str]: """Extract OpenAI-compat thought signatures keyed by tool_call id.""" if hasattr(response_obj, "model_dump"): @@ -463,6 +550,64 @@ async def _execute_tool_base(self, tool_call, on_output=None) -> str: async def _execute_tool(self, tool_call, on_output=None): return await self._execute_tool_with_policy(tool_call, on_output=on_output) + def _preprocess_validated_tool_call( + self, + *, + tool_name: str, + args: dict[str, Any], + tool_call: dict[str, Any], + ) -> tuple[dict[str, Any], dict[str, Any], str | None, str | None]: + if tool_name != "execute": + return super()._preprocess_validated_tool_call( + tool_name=tool_name, + args=args, + tool_call=tool_call, + ) + + command = str(args.get("command") or "").strip() + if not command or not _GRADLE_ANY_CMD_RE.search(command): + return super()._preprocess_validated_tool_call( + tool_name=tool_name, + args=args, + tool_call=tool_call, + ) + + force_system_gradle = _env_flag("SANDBOX_FORCE_SYSTEM_GRADLE", True) + allow_wrapper_fallback = _env_flag("SANDBOX_ALLOW_WRAPPER_FALLBACK", True) + timeout_seconds = _env_int("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", 480) + rewritten = command + rewritten_flag = False + strategy = "wrapper" + if force_system_gradle: + rewritten, 
rewritten_flag = _rewrite_gradle_wrapper_command(command) + if rewritten_flag: + rewritten, strategy = _build_gradle_command( + original_command=command, + rewritten_command=rewritten, + timeout_seconds=timeout_seconds, + allow_wrapper_fallback=allow_wrapper_fallback, + ) + elif timeout_seconds > 0: + strategy = "system" + rewritten = f"timeout {timeout_seconds}s bash -lc {shlex.quote(rewritten)}" + elif timeout_seconds > 0: + rewritten = f"timeout {timeout_seconds}s bash -lc {shlex.quote(rewritten)}" + + if rewritten != command: + logger.info( + "gradle_command_rewrite strategy=%s rewritten=%s original_command=%s rewritten_command=%s", + strategy, + str(rewritten_flag).lower(), + command, + rewritten, + ) + + updated_args = dict(args) + updated_args["command"] = rewritten + normalized_tool_call = dict(tool_call) + normalized_tool_call["arguments"] = json.dumps(updated_args, ensure_ascii=False) + return normalized_tool_call, updated_args, None, None + def _on_tool_validation_error( self, *, @@ -747,6 +892,7 @@ async def handle_execute(request: web.Request) -> web.Response: parsed["workdir"], parsed["timeout"], ) + await _run_runtime_preflight_once() _configure_java_runtime_for_workspace(parsed["workdir"]) try: @@ -818,6 +964,7 @@ async def handle_execute_stream(request: web.Request) -> web.StreamResponse: parsed["workdir"], parsed["timeout"], ) + await _run_runtime_preflight_once() _configure_java_runtime_for_workspace(parsed["workdir"]) try: diff --git a/platform/tests/test_sandbox_agent_server.py b/platform/tests/test_sandbox_agent_server.py index a0bb670..5935ad9 100644 --- a/platform/tests/test_sandbox_agent_server.py +++ b/platform/tests/test_sandbox_agent_server.py @@ -175,3 +175,91 @@ def test_configure_java_runtime_sets_java_home_and_path(tmp_path, monkeypatch): assert selected == 8 assert os.environ["JAVA_HOME"] == "/opt/jdk8" assert os.environ["PATH"].split(":")[0] == "/opt/jdk8/bin" + + +def test_rewrite_gradle_wrapper_command_simple(): + agent_server = 
_load_agent_server_with_fake_skillkit() + rewritten, changed = agent_server._rewrite_gradle_wrapper_command("./gradlew compileJava --no-daemon") + assert changed is True + assert rewritten == "gradle compileJava --no-daemon" + + +def test_rewrite_gradle_wrapper_command_keeps_prefix_chain(): + agent_server = _load_agent_server_with_fake_skillkit() + rewritten, changed = agent_server._rewrite_gradle_wrapper_command("cd /workspace && ./gradlew test") + assert changed is True + assert rewritten == "cd /workspace && gradle test" + + +def test_container_runner_rewrites_gradlew_when_force_enabled(monkeypatch): + agent_server = _load_agent_server_with_fake_skillkit() + monkeypatch.setenv("SANDBOX_FORCE_SYSTEM_GRADLE", "true") + monkeypatch.setenv("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", "480") + monkeypatch.setenv("SANDBOX_ALLOW_WRAPPER_FALLBACK", "false") + runner = agent_server._create_runner( + { + "skill_dirs": [], + "system_prompt": "system", + "max_turns": 5, + "enable_tools": True, + "model": "gpt-4o", + "temperature": None, + "max_tokens": None, + "workdir": "/workspace", + "allowed_tools": {"execute"}, + } + ) + tool_call = {"name": "execute", "arguments": '{"command":"cd /workspace && ./gradlew test"}'} + normalized, args, err, result = runner._preprocess_validated_tool_call( + tool_name="execute", + args={"command": "cd /workspace && ./gradlew test"}, + tool_call=tool_call, + ) + assert err is None + assert result is None + assert args["command"].startswith("timeout 480s bash -lc ") + assert "gradle test" in args["command"] + assert "./gradlew test" not in args["command"] + assert normalized["arguments"] + + +def test_container_runner_keeps_gradlew_when_force_disabled(monkeypatch): + agent_server = _load_agent_server_with_fake_skillkit() + monkeypatch.setenv("SANDBOX_FORCE_SYSTEM_GRADLE", "false") + monkeypatch.setenv("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", "480") + monkeypatch.setenv("SANDBOX_ALLOW_WRAPPER_FALLBACK", "true") + runner = agent_server._create_runner( + 
{ + "skill_dirs": [], + "system_prompt": "system", + "max_turns": 5, + "enable_tools": True, + "model": "gpt-4o", + "temperature": None, + "max_tokens": None, + "workdir": "/workspace", + "allowed_tools": {"execute"}, + } + ) + _, args, err, result = runner._preprocess_validated_tool_call( + tool_name="execute", + args={"command": "cd /workspace && ./gradlew test"}, + tool_call={"name": "execute", "arguments": '{"command":"cd /workspace && ./gradlew test"}'}, + ) + assert err is None + assert result is None + assert args["command"].startswith("timeout 480s bash -lc ") + assert "./gradlew test" in args["command"] + + +def test_build_gradle_command_enables_wrapper_fallback(): + agent_server = _load_agent_server_with_fake_skillkit() + cmd, strategy = agent_server._build_gradle_command( + original_command="cd /workspace && ./gradlew compileJava", + rewritten_command="cd /workspace && gradle compileJava", + timeout_seconds=480, + allow_wrapper_fallback=True, + ) + assert strategy == "wrapper_fallback" + assert "if [ $__sa_rc -eq 126 ] || [ $__sa_rc -eq 127 ]" in cmd + assert "./gradlew compileJava" in cmd diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index b472466..7d71959 100644 --- a/platform/tests/test_sandbox_env_contract.py +++ b/platform/tests/test_sandbox_env_contract.py @@ -46,6 +46,9 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path "SANDBOX_MODEL_API_RAW_LOG_HOST_DIR", str(raw_log_dir), ) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_FORCE_SYSTEM_GRADLE", True) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", 480) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_ALLOW_WRAPPER_FALLBACK", True) backend = DockerSandboxBackend() cmd = backend._build_docker_run_cmd( @@ -66,6 +69,9 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path assert env["AGENT_PORT"] == "19090" assert 
env["SANDBOX_DUMP_MODEL_API_RESPONSE"] == "true" assert env["SANDBOX_MODEL_API_RAW_LOG_PATH"] == "/model_api_logs/task-123.jsonl" + assert env["SANDBOX_FORCE_SYSTEM_GRADLE"] == "true" + assert env["SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS"] == "480" + assert env["SANDBOX_ALLOW_WRAPPER_FALLBACK"] == "true" assert f"type=bind,src={raw_log_dir},dst=/model_api_logs" in mounts From 7a829d9856161c73130dd2e8b01742119e118a23 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Sat, 14 Mar 2026 14:58:40 +0800 Subject: [PATCH 04/33] fix(sandbox): keep gradlew and stabilize wrapper cache path --- platform/app/config.py | 5 +- platform/app/worker/sandbox.py | 32 +++++- platform/sandbox/agent_server.py | 117 ++++++++++++-------- platform/tests/test_sandbox_agent_server.py | 74 +++---------- platform/tests/test_sandbox_env_contract.py | 16 ++- 5 files changed, 128 insertions(+), 116 deletions(-) diff --git a/platform/app/config.py b/platform/app/config.py index f435136..5a9c837 100644 --- a/platform/app/config.py +++ b/platform/app/config.py @@ -99,9 +99,10 @@ class Settings(BaseSettings): SANDBOX_ROLES: str = '["coding", "test"]' SANDBOX_DUMP_MODEL_API_RESPONSE: bool = True SANDBOX_MODEL_API_RAW_LOG_HOST_DIR: str = "/tmp/silicon_agent/model_api_logs" - SANDBOX_FORCE_SYSTEM_GRADLE: bool = True SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS: int = 480 - SANDBOX_ALLOW_WRAPPER_FALLBACK: bool = True + SANDBOX_GRADLE_USER_HOME_HOST_DIR: str = "/var/lib/silicon_agent/gradle-cache" + SANDBOX_GRADLE_WRAPPER_PREWARM: bool = True + SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS: int = 180 # Memory & compression configuration MEMORY_ENABLED: bool = True diff --git a/platform/app/worker/sandbox.py b/platform/app/worker/sandbox.py index 7cf63f0..d7cee61 100644 --- a/platform/app/worker/sandbox.py +++ b/platform/app/worker/sandbox.py @@ -11,9 +11,11 @@ from __future__ import annotations import asyncio +import contextlib import inspect import json import logging +import os import time from pathlib import Path 
from typing import Any, Awaitable, Callable, Dict, Optional @@ -457,6 +459,26 @@ def _build_docker_run_cmd( ) capture_model_api_raw = False + gradle_user_home_mount = "/gradle-cache" + gradle_user_home_host = Path(settings.SANDBOX_GRADLE_USER_HOME_HOST_DIR).expanduser() + try: + gradle_user_home_host.mkdir(parents=True, exist_ok=True) + with contextlib.suppress(OSError): + os.chmod(gradle_user_home_host, 0o777) + parts.extend( + [ + "--mount", + f"type=bind,src={gradle_user_home_host},dst={gradle_user_home_mount}", + ] + ) + except Exception: + logger.warning( + "Failed to prepare gradle cache mount directory: %s", + gradle_user_home_host, + exc_info=True, + ) + gradle_user_home_mount = "/tmp/.gradle" + if settings.SANDBOX_READONLY_ROOT: parts.append("--read-only") parts.extend(["--tmpfs", "/tmp:size=512m"]) @@ -478,18 +500,20 @@ def _build_docker_run_cmd( [ "-e", f"SANDBOX_DUMP_MODEL_API_RESPONSE={'true' if capture_model_api_raw else 'false'}", + "-e", + f"GRADLE_USER_HOME={gradle_user_home_mount}", + "-e", + f"SANDBOX_GRADLE_WRAPPER_PREWARM={'true' if settings.SANDBOX_GRADLE_WRAPPER_PREWARM else 'false'}", + "-e", + f"SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS={int(settings.SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS)}", ] ) if capture_model_api_raw and container_raw_log_path: parts.extend(["-e", f"SANDBOX_MODEL_API_RAW_LOG_PATH={container_raw_log_path}"]) parts.extend( [ - "-e", - f"SANDBOX_FORCE_SYSTEM_GRADLE={'true' if settings.SANDBOX_FORCE_SYSTEM_GRADLE else 'false'}", "-e", f"SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS={int(settings.SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS)}", - "-e", - f"SANDBOX_ALLOW_WRAPPER_FALLBACK={'true' if settings.SANDBOX_ALLOW_WRAPPER_FALLBACK else 'false'}", ] ) diff --git a/platform/sandbox/agent_server.py b/platform/sandbox/agent_server.py index e3d71b5..d62ccc9 100644 --- a/platform/sandbox/agent_server.py +++ b/platform/sandbox/agent_server.py @@ -77,10 +77,11 @@ r"sourceCompatibility\s*=\s*(?:['\"]?17['\"]?|JavaVersion\.VERSION_17)", 
r"targetCompatibility\s*=\s*(?:['\"]?17['\"]?|JavaVersion\.VERSION_17)", ) -_GRADLE_WRAPPER_CMD_RE = re.compile(r"(? str: @@ -191,30 +192,15 @@ def _env_int(name: str, default: int) -> int: return parsed if parsed > 0 else default -def _rewrite_gradle_wrapper_command(command: str) -> tuple[str, bool]: - rewritten, count = _GRADLE_WRAPPER_CMD_RE.subn("gradle", command) - return rewritten, count > 0 - - -def _build_gradle_command( - *, - original_command: str, - rewritten_command: str, - timeout_seconds: int, - allow_wrapper_fallback: bool, -) -> tuple[str, str]: - payload = rewritten_command - strategy = "system" - if allow_wrapper_fallback and rewritten_command != original_command: - strategy = "wrapper_fallback" - payload = ( - f"{rewritten_command}; __sa_rc=$?; " - f"if [ $__sa_rc -eq 126 ] || [ $__sa_rc -eq 127 ]; then {original_command}; " - f"else exit $__sa_rc; fi" - ) - if timeout_seconds > 0: - payload = f"timeout {timeout_seconds}s bash -lc {shlex.quote(payload)}" - return payload, strategy +def _wrap_with_timeout(command: str, timeout_seconds: int) -> str: + if timeout_seconds <= 0: + return command + payload = ( + "if command -v timeout >/dev/null 2>&1; " + f"then timeout {timeout_seconds}s bash -lc {shlex.quote(command)}; " + f"else bash -lc {shlex.quote(command)}; fi" + ) + return payload async def _run_runtime_preflight_once() -> None: @@ -255,6 +241,61 @@ async def _run_line(cmd: str, *, timeout: float = 5.0) -> str: _RUNTIME_PREFLIGHT_DONE = True +async def _run_gradle_wrapper_prewarm_once(workdir: str) -> None: + global _WRAPPER_PREWARM_DONE + if _WRAPPER_PREWARM_DONE: + return + if not _env_flag("SANDBOX_GRADLE_WRAPPER_PREWARM", True): + return + + gradlew = Path(workdir) / "gradlew" + if not gradlew.is_file(): + return + + timeout_seconds = _env_int("SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS", 180) + cmd = _wrap_with_timeout(f"cd {shlex.quote(workdir)} && ./gradlew --version", timeout_seconds) + + async with _WRAPPER_PREWARM_LOCK: + if 
_WRAPPER_PREWARM_DONE: + return + + started = time.monotonic() + proc = await asyncio.create_subprocess_exec( + "sh", "-lc", cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + try: + out, _ = await proc.communicate() + elapsed_ms = int((time.monotonic() - started) * 1000) + output = (out or b"").decode("utf-8", errors="ignore") + if proc.returncode == 0: + _WRAPPER_PREWARM_DONE = True + logger.info( + "gradle_wrapper_prewarm status=success workdir=%s elapsed_ms=%d gradle_user_home=%s", + workdir, + elapsed_ms, + os.environ.get("GRADLE_USER_HOME", ""), + ) + return + + logger.warning( + "gradle_wrapper_prewarm status=failed workdir=%s elapsed_ms=%d exit_code=%s output=%s", + workdir, + elapsed_ms, + proc.returncode, + output[-600:].replace("\n", " | "), + ) + except Exception as exc: + elapsed_ms = int((time.monotonic() - started) * 1000) + logger.warning( + "gradle_wrapper_prewarm status=error workdir=%s elapsed_ms=%d error=%s", + workdir, + elapsed_ms, + exc, + ) + + def _extract_gemini_thought_signatures_from_response(response_obj: Any) -> dict[str, str]: """Extract OpenAI-compat thought signatures keyed by tool_call id.""" if hasattr(response_obj, "model_dump"): @@ -572,32 +613,14 @@ def _preprocess_validated_tool_call( tool_call=tool_call, ) - force_system_gradle = _env_flag("SANDBOX_FORCE_SYSTEM_GRADLE", True) - allow_wrapper_fallback = _env_flag("SANDBOX_ALLOW_WRAPPER_FALLBACK", True) timeout_seconds = _env_int("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", 480) rewritten = command - rewritten_flag = False - strategy = "wrapper" - if force_system_gradle: - rewritten, rewritten_flag = _rewrite_gradle_wrapper_command(command) - if rewritten_flag: - rewritten, strategy = _build_gradle_command( - original_command=command, - rewritten_command=rewritten, - timeout_seconds=timeout_seconds, - allow_wrapper_fallback=allow_wrapper_fallback, - ) - elif timeout_seconds > 0: - strategy = "system" - rewritten = f"timeout {timeout_seconds}s bash -lc 
{shlex.quote(rewritten)}" - elif timeout_seconds > 0: - rewritten = f"timeout {timeout_seconds}s bash -lc {shlex.quote(rewritten)}" + if timeout_seconds > 0: + rewritten = _wrap_with_timeout(rewritten, timeout_seconds) if rewritten != command: logger.info( - "gradle_command_rewrite strategy=%s rewritten=%s original_command=%s rewritten_command=%s", - strategy, - str(rewritten_flag).lower(), + "gradle_command_wrap strategy=wrapper rewritten=true original_command=%s rewritten_command=%s", command, rewritten, ) @@ -894,6 +917,7 @@ async def handle_execute(request: web.Request) -> web.Response: ) await _run_runtime_preflight_once() _configure_java_runtime_for_workspace(parsed["workdir"]) + await _run_gradle_wrapper_prewarm_once(parsed["workdir"]) try: runner = _create_runner(parsed) @@ -966,6 +990,7 @@ async def handle_execute_stream(request: web.Request) -> web.StreamResponse: ) await _run_runtime_preflight_once() _configure_java_runtime_for_workspace(parsed["workdir"]) + await _run_gradle_wrapper_prewarm_once(parsed["workdir"]) try: runner = _create_runner(parsed) diff --git a/platform/tests/test_sandbox_agent_server.py b/platform/tests/test_sandbox_agent_server.py index 5935ad9..d3e7bdc 100644 --- a/platform/tests/test_sandbox_agent_server.py +++ b/platform/tests/test_sandbox_agent_server.py @@ -177,25 +177,9 @@ def test_configure_java_runtime_sets_java_home_and_path(tmp_path, monkeypatch): assert os.environ["PATH"].split(":")[0] == "/opt/jdk8/bin" -def test_rewrite_gradle_wrapper_command_simple(): +def test_container_runner_keeps_gradlew_and_wraps_timeout(monkeypatch): agent_server = _load_agent_server_with_fake_skillkit() - rewritten, changed = agent_server._rewrite_gradle_wrapper_command("./gradlew compileJava --no-daemon") - assert changed is True - assert rewritten == "gradle compileJava --no-daemon" - - -def test_rewrite_gradle_wrapper_command_keeps_prefix_chain(): - agent_server = _load_agent_server_with_fake_skillkit() - rewritten, changed = 
agent_server._rewrite_gradle_wrapper_command("cd /workspace && ./gradlew test") - assert changed is True - assert rewritten == "cd /workspace && gradle test" - - -def test_container_runner_rewrites_gradlew_when_force_enabled(monkeypatch): - agent_server = _load_agent_server_with_fake_skillkit() - monkeypatch.setenv("SANDBOX_FORCE_SYSTEM_GRADLE", "true") monkeypatch.setenv("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", "480") - monkeypatch.setenv("SANDBOX_ALLOW_WRAPPER_FALLBACK", "false") runner = agent_server._create_runner( { "skill_dirs": [], @@ -209,57 +193,27 @@ def test_container_runner_rewrites_gradlew_when_force_enabled(monkeypatch): "allowed_tools": {"execute"}, } ) - tool_call = {"name": "execute", "arguments": '{"command":"cd /workspace && ./gradlew test"}'} normalized, args, err, result = runner._preprocess_validated_tool_call( - tool_name="execute", - args={"command": "cd /workspace && ./gradlew test"}, - tool_call=tool_call, - ) - assert err is None - assert result is None - assert args["command"].startswith("timeout 480s bash -lc ") - assert "gradle test" in args["command"] - assert "./gradlew test" not in args["command"] - assert normalized["arguments"] - - -def test_container_runner_keeps_gradlew_when_force_disabled(monkeypatch): - agent_server = _load_agent_server_with_fake_skillkit() - monkeypatch.setenv("SANDBOX_FORCE_SYSTEM_GRADLE", "false") - monkeypatch.setenv("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", "480") - monkeypatch.setenv("SANDBOX_ALLOW_WRAPPER_FALLBACK", "true") - runner = agent_server._create_runner( - { - "skill_dirs": [], - "system_prompt": "system", - "max_turns": 5, - "enable_tools": True, - "model": "gpt-4o", - "temperature": None, - "max_tokens": None, - "workdir": "/workspace", - "allowed_tools": {"execute"}, - } - ) - _, args, err, result = runner._preprocess_validated_tool_call( tool_name="execute", args={"command": "cd /workspace && ./gradlew test"}, tool_call={"name": "execute", "arguments": '{"command":"cd /workspace && ./gradlew 
test"}'}, ) assert err is None assert result is None - assert args["command"].startswith("timeout 480s bash -lc ") + assert "timeout 480s bash -lc" in args["command"] assert "./gradlew test" in args["command"] + assert "gradle test" not in args["command"] + assert normalized["arguments"] -def test_build_gradle_command_enables_wrapper_fallback(): +def test_run_gradle_wrapper_prewarm_once_marks_done(tmp_path, monkeypatch): agent_server = _load_agent_server_with_fake_skillkit() - cmd, strategy = agent_server._build_gradle_command( - original_command="cd /workspace && ./gradlew compileJava", - rewritten_command="cd /workspace && gradle compileJava", - timeout_seconds=480, - allow_wrapper_fallback=True, - ) - assert strategy == "wrapper_fallback" - assert "if [ $__sa_rc -eq 126 ] || [ $__sa_rc -eq 127 ]" in cmd - assert "./gradlew compileJava" in cmd + gradlew = tmp_path / "gradlew" + gradlew.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + gradlew.chmod(0o755) + monkeypatch.setenv("SANDBOX_GRADLE_WRAPPER_PREWARM", "true") + monkeypatch.setenv("SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS", "30") + agent_server._WRAPPER_PREWARM_DONE = False + import asyncio + asyncio.run(agent_server._run_gradle_wrapper_prewarm_once(str(tmp_path))) + assert agent_server._WRAPPER_PREWARM_DONE is True diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index 7d71959..8a574d9 100644 --- a/platform/tests/test_sandbox_env_contract.py +++ b/platform/tests/test_sandbox_env_contract.py @@ -46,9 +46,15 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path "SANDBOX_MODEL_API_RAW_LOG_HOST_DIR", str(raw_log_dir), ) - monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_FORCE_SYSTEM_GRADLE", True) monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", 480) - monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_ALLOW_WRAPPER_FALLBACK", True) + gradle_cache_dir = tmp_path / "gradle_cache" + 
monkeypatch.setattr( + sandbox_mod.settings, + "SANDBOX_GRADLE_USER_HOME_HOST_DIR", + str(gradle_cache_dir), + ) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM", True) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS", 180) backend = DockerSandboxBackend() cmd = backend._build_docker_run_cmd( @@ -69,10 +75,12 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path assert env["AGENT_PORT"] == "19090" assert env["SANDBOX_DUMP_MODEL_API_RESPONSE"] == "true" assert env["SANDBOX_MODEL_API_RAW_LOG_PATH"] == "/model_api_logs/task-123.jsonl" - assert env["SANDBOX_FORCE_SYSTEM_GRADLE"] == "true" assert env["SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS"] == "480" - assert env["SANDBOX_ALLOW_WRAPPER_FALLBACK"] == "true" + assert env["GRADLE_USER_HOME"] == "/gradle-cache" + assert env["SANDBOX_GRADLE_WRAPPER_PREWARM"] == "true" + assert env["SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS"] == "180" assert f"type=bind,src={raw_log_dir},dst=/model_api_logs" in mounts + assert f"type=bind,src={gradle_cache_dir},dst=/gradle-cache" in mounts def test_build_docker_run_cmd_disables_raw_model_dump_when_config_off(monkeypatch, tmp_path): From 053b93962bb4dd2ec6fa0c56509eed860c42ff77 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Sat, 14 Mar 2026 15:02:23 +0800 Subject: [PATCH 05/33] chore(sandbox): use workspace gradle cache path --- platform/app/config.py | 2 +- platform/app/worker/sandbox.py | 24 +-------------------- platform/tests/test_sandbox_env_contract.py | 10 ++------- 3 files changed, 4 insertions(+), 32 deletions(-) diff --git a/platform/app/config.py b/platform/app/config.py index 5a9c837..931e181 100644 --- a/platform/app/config.py +++ b/platform/app/config.py @@ -100,7 +100,7 @@ class Settings(BaseSettings): SANDBOX_DUMP_MODEL_API_RESPONSE: bool = True SANDBOX_MODEL_API_RAW_LOG_HOST_DIR: str = "/tmp/silicon_agent/model_api_logs" SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS: int = 480 - 
SANDBOX_GRADLE_USER_HOME_HOST_DIR: str = "/var/lib/silicon_agent/gradle-cache" + SANDBOX_GRADLE_USER_HOME: str = "/workspace/.gradle" SANDBOX_GRADLE_WRAPPER_PREWARM: bool = True SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS: int = 180 diff --git a/platform/app/worker/sandbox.py b/platform/app/worker/sandbox.py index d7cee61..049c985 100644 --- a/platform/app/worker/sandbox.py +++ b/platform/app/worker/sandbox.py @@ -11,11 +11,9 @@ from __future__ import annotations import asyncio -import contextlib import inspect import json import logging -import os import time from pathlib import Path from typing import Any, Awaitable, Callable, Dict, Optional @@ -459,26 +457,6 @@ def _build_docker_run_cmd( ) capture_model_api_raw = False - gradle_user_home_mount = "/gradle-cache" - gradle_user_home_host = Path(settings.SANDBOX_GRADLE_USER_HOME_HOST_DIR).expanduser() - try: - gradle_user_home_host.mkdir(parents=True, exist_ok=True) - with contextlib.suppress(OSError): - os.chmod(gradle_user_home_host, 0o777) - parts.extend( - [ - "--mount", - f"type=bind,src={gradle_user_home_host},dst={gradle_user_home_mount}", - ] - ) - except Exception: - logger.warning( - "Failed to prepare gradle cache mount directory: %s", - gradle_user_home_host, - exc_info=True, - ) - gradle_user_home_mount = "/tmp/.gradle" - if settings.SANDBOX_READONLY_ROOT: parts.append("--read-only") parts.extend(["--tmpfs", "/tmp:size=512m"]) @@ -501,7 +479,7 @@ def _build_docker_run_cmd( "-e", f"SANDBOX_DUMP_MODEL_API_RESPONSE={'true' if capture_model_api_raw else 'false'}", "-e", - f"GRADLE_USER_HOME={gradle_user_home_mount}", + f"GRADLE_USER_HOME={settings.SANDBOX_GRADLE_USER_HOME}", "-e", f"SANDBOX_GRADLE_WRAPPER_PREWARM={'true' if settings.SANDBOX_GRADLE_WRAPPER_PREWARM else 'false'}", "-e", diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index 8a574d9..e1161f0 100644 --- a/platform/tests/test_sandbox_env_contract.py +++ 
b/platform/tests/test_sandbox_env_contract.py @@ -47,12 +47,7 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path str(raw_log_dir), ) monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", 480) - gradle_cache_dir = tmp_path / "gradle_cache" - monkeypatch.setattr( - sandbox_mod.settings, - "SANDBOX_GRADLE_USER_HOME_HOST_DIR", - str(gradle_cache_dir), - ) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_USER_HOME", "/workspace/.gradle") monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM", True) monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS", 180) @@ -76,11 +71,10 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path assert env["SANDBOX_DUMP_MODEL_API_RESPONSE"] == "true" assert env["SANDBOX_MODEL_API_RAW_LOG_PATH"] == "/model_api_logs/task-123.jsonl" assert env["SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS"] == "480" - assert env["GRADLE_USER_HOME"] == "/gradle-cache" + assert env["GRADLE_USER_HOME"] == "/workspace/.gradle" assert env["SANDBOX_GRADLE_WRAPPER_PREWARM"] == "true" assert env["SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS"] == "180" assert f"type=bind,src={raw_log_dir},dst=/model_api_logs" in mounts - assert f"type=bind,src={gradle_cache_dir},dst=/gradle-cache" in mounts def test_build_docker_run_cmd_disables_raw_model_dump_when_config_off(monkeypatch, tmp_path): From a72aa15cde34302a7611810d4b3cad8935048041 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Sat, 14 Mar 2026 15:16:25 +0800 Subject: [PATCH 06/33] chore(sandbox): move gradle cache out of worktrees --- platform/app/config.py | 3 ++- platform/app/worker/sandbox.py | 20 +++++++++++++++++++- platform/tests/test_sandbox_env_contract.py | 7 +++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/platform/app/config.py b/platform/app/config.py index 931e181..8215624 100644 --- a/platform/app/config.py +++ 
b/platform/app/config.py @@ -100,7 +100,8 @@ class Settings(BaseSettings): SANDBOX_DUMP_MODEL_API_RESPONSE: bool = True SANDBOX_MODEL_API_RAW_LOG_HOST_DIR: str = "/tmp/silicon_agent/model_api_logs" SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS: int = 480 - SANDBOX_GRADLE_USER_HOME: str = "/workspace/.gradle" + SANDBOX_GRADLE_CACHE_HOST_DIR: str = "/var/lib/silicon_agent/gradle-cache" + SANDBOX_GRADLE_USER_HOME: str = "/var/lib/silicon_agent/gradle-cache" SANDBOX_GRADLE_WRAPPER_PREWARM: bool = True SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS: int = 180 diff --git a/platform/app/worker/sandbox.py b/platform/app/worker/sandbox.py index 049c985..b8861d3 100644 --- a/platform/app/worker/sandbox.py +++ b/platform/app/worker/sandbox.py @@ -457,6 +457,24 @@ def _build_docker_run_cmd( ) capture_model_api_raw = False + gradle_cache_host_dir = Path(settings.SANDBOX_GRADLE_CACHE_HOST_DIR).expanduser() + gradle_cache_container_dir = str(settings.SANDBOX_GRADLE_USER_HOME).strip() or "/var/lib/silicon_agent/gradle-cache" + try: + gradle_cache_host_dir.mkdir(parents=True, exist_ok=True) + parts.extend( + [ + "--mount", + f"type=bind,src={gradle_cache_host_dir},dst={gradle_cache_container_dir}", + ] + ) + except Exception: + logger.warning( + "Failed to prepare gradle cache mount directory: %s", + gradle_cache_host_dir, + exc_info=True, + ) + gradle_cache_container_dir = "/tmp/.gradle" + if settings.SANDBOX_READONLY_ROOT: parts.append("--read-only") parts.extend(["--tmpfs", "/tmp:size=512m"]) @@ -479,7 +497,7 @@ def _build_docker_run_cmd( "-e", f"SANDBOX_DUMP_MODEL_API_RESPONSE={'true' if capture_model_api_raw else 'false'}", "-e", - f"GRADLE_USER_HOME={settings.SANDBOX_GRADLE_USER_HOME}", + f"GRADLE_USER_HOME={gradle_cache_container_dir}", "-e", f"SANDBOX_GRADLE_WRAPPER_PREWARM={'true' if settings.SANDBOX_GRADLE_WRAPPER_PREWARM else 'false'}", "-e", diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index e1161f0..e8f1323 100644 --- 
a/platform/tests/test_sandbox_env_contract.py +++ b/platform/tests/test_sandbox_env_contract.py @@ -47,7 +47,9 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path str(raw_log_dir), ) monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", 480) - monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_USER_HOME", "/workspace/.gradle") + gradle_cache_dir = tmp_path / "gradle_cache" + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_CACHE_HOST_DIR", str(gradle_cache_dir)) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_USER_HOME", "/var/lib/silicon_agent/gradle-cache") monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM", True) monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS", 180) @@ -71,10 +73,11 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path assert env["SANDBOX_DUMP_MODEL_API_RESPONSE"] == "true" assert env["SANDBOX_MODEL_API_RAW_LOG_PATH"] == "/model_api_logs/task-123.jsonl" assert env["SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS"] == "480" - assert env["GRADLE_USER_HOME"] == "/workspace/.gradle" + assert env["GRADLE_USER_HOME"] == "/var/lib/silicon_agent/gradle-cache" assert env["SANDBOX_GRADLE_WRAPPER_PREWARM"] == "true" assert env["SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS"] == "180" assert f"type=bind,src={raw_log_dir},dst=/model_api_logs" in mounts + assert f"type=bind,src={gradle_cache_dir},dst=/var/lib/silicon_agent/gradle-cache" in mounts def test_build_docker_run_cmd_disables_raw_model_dump_when_config_off(monkeypatch, tmp_path): From 1f80ab2c90e7fe4f875d0385bb560260056796e5 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 14:31:05 +0800 Subject: [PATCH 07/33] docs: add java gradle sandbox design and plan --- .../2026-03-19-java-gradle-sandbox-design.md | 169 ++++++++++ ...3-19-java-gradle-sandbox-implementation.md | 308 ++++++++++++++++++ 2 files changed, 477 
insertions(+) create mode 100644 docs/plans/2026-03-19-java-gradle-sandbox-design.md create mode 100644 docs/plans/2026-03-19-java-gradle-sandbox-implementation.md diff --git a/docs/plans/2026-03-19-java-gradle-sandbox-design.md b/docs/plans/2026-03-19-java-gradle-sandbox-design.md new file mode 100644 index 0000000..ae26e0d --- /dev/null +++ b/docs/plans/2026-03-19-java-gradle-sandbox-design.md @@ -0,0 +1,169 @@ +# Java Gradle Sandbox Design + +## Context + +The sandbox needs to run a broad set of Java 8 or Java 17 Gradle + Spring Boot projects with poor network conditions. The current repository already ships a dual-JDK coding image, preserves `./gradlew`, mounts a shared Gradle cache, and prewarms wrapper downloads. The remaining gap is making that setup reliable across different project wrappers and dependency graphs without forcing per-project customization. + +## Goals + +- Provide a general-purpose sandbox image that can run most Java 8 or Java 17 Gradle + Spring Boot projects without online dependency fetching on the common path. +- Keep project-specific Gradle behavior aligned with each repository by preferring the project wrapper. +- Default to Java 8 when version detection is inconclusive. +- Offer a manual override for edge cases and keep failure handling bounded and observable. + +## Non-Goals + +- Supporting every private Maven repository or every unusual plugin offline on day one. +- Replacing project wrappers with a single global Gradle version. +- Solving long-term dependency distribution purely through the sandbox image when an internal artifact proxy becomes available later. + +## Decision Summary + +- Ship both JDK 8 and JDK 17 in the sandbox image. +- Prefer `./gradlew` for build and test commands; use system `gradle` only as a fallback when a project has no wrapper. +- Expand Java version detection to cover Gradle toolchains, Maven compiler properties, `.java-version`, `.tool-versions`, and existing compatibility markers. 
+- If detection fails, default to Java 8. +- Allow a manual override with `SANDBOX_JAVA_VERSION=8|17`. +- Preload common Gradle distributions plus common Spring Boot plugin and dependency caches for offline-first execution. +- Keep a writable runtime cache for uncommon dependencies that are not covered by the prewarmed cache. +- Permit at most one Java-version retry when build output clearly shows a version mismatch. + +## Architecture + +### 1. Image Layering + +- `platform/sandbox/Dockerfile.base` remains the common runtime base for Python, Node, and shared agent tooling. +- `platform/sandbox/Dockerfile.coding` remains the main Java-capable image and should continue to include: + - Temurin JDK 8 + - Temurin JDK 17 + - a system Gradle installation for diagnostics and last-resort fallback + - a prewarmed Gradle cache directory with wrapper distributions and common modules +- `platform/sandbox/Dockerfile.test` layers browser and test tooling on top of `coding`. + +This preserves the current image split while adding an explicit offline dependency layer. + +### 2. Runtime Decision Flow + +The sandbox runtime should resolve Java and Gradle execution in this order: + +1. Check `SANDBOX_JAVA_VERSION` for an explicit override. +2. Auto-detect Java from project files. +3. If no reliable signal is found, select Java 8. +4. Prefer `./gradlew` for Gradle commands. +5. Use system `gradle` only when the workspace lacks `gradlew`. +6. If the command fails with a clear Java-version mismatch, switch once to the other supported JDK and retry one time. + +This flow keeps project compatibility high while avoiding silent loops or repeated environment churn. + +### 3. Java Detection Rules + +The runtime should scan these files when present: + +- `pom.xml` +- `build.gradle` +- `build.gradle.kts` +- `gradle.properties` +- `settings.gradle` +- `settings.gradle.kts` +- `.java-version` +- `.tool-versions` + +Signals should be ranked in this order: + +1. Explicit `SANDBOX_JAVA_VERSION` +2. 
Gradle toolchain declarations such as `JavaLanguageVersion.of(8|17)` +3. `sourceCompatibility` and `targetCompatibility` +4. Maven compiler `source`, `target`, `release`, and `java.version` +5. version manager hints from `.java-version` or `.tool-versions` + +If conflicting markers exist, the highest-ranked explicit signal wins, and the runtime logs the winning rule. + +### 4. Gradle and Wrapper Strategy + +`./gradlew` should remain the primary execution path because it carries the project-specific Gradle version and plugin resolution behavior. The sandbox should not try to standardize project builds onto one system Gradle version. + +Instead, the sandbox should preload the resources that wrappers usually need to fetch: + +- wrapper distributions for representative Gradle versions in the 6.x, 7.x, and 8.x lines +- Gradle plugin metadata and jars commonly used by Spring Boot builds +- common Maven Central modules used by Spring Boot starters and test dependencies + +System `gradle` stays available for diagnostics such as `gradle -v` and as a fallback when `gradlew` is absent. + +### 5. Offline Cache Model + +The cache model should have two layers: + +- A prewarmed base cache baked into the image or injected as a prepared cache artifact +- A writable runtime cache mounted as `GRADLE_USER_HOME` for project-specific misses + +The prewarmed cache should include: + +- wrapper distributions for selected Gradle versions +- plugin portal artifacts for Spring Boot and dependency management plugins +- common modules and metadata under Gradle's module cache + +The writable layer captures rare dependencies without forcing a full image rebuild. + +### 6. 
Cache Refresh Strategy + +Refresh the base cache by running a representative project matrix in a controlled environment: + +- Java 8 + Spring Boot 2.x + Gradle 6.x +- Java 8 + Spring Boot 2.x + Gradle 7.x +- Java 17 + Spring Boot 2.7 + Gradle 7.x +- Java 17 + Spring Boot 3.x + Gradle 8.x + +For each representative project, run: + +- `./gradlew --no-daemon help` +- `./gradlew --no-daemon dependencies` +- `./gradlew --no-daemon testClasses` + +This strategy prefetches the Gradle distributions, plugins, starter dependencies, test dependencies, and most metadata needed by common projects. + +## Error Handling + +- If Java detection fails, log that no explicit signal was found and continue with Java 8. +- If the manual override references an unavailable JDK, fail fast with a clear configuration error. +- Retry only once when build output clearly indicates a Java-version incompatibility. +- If both Java 8 and Java 17 fail, surface the original command, selected JDK, retry decision, and the final error in logs and task output. + +## Testing Strategy + +Add or extend tests in the sandbox suite to verify: + +- default Java 8 selection when no markers are present +- Java 17 detection for Gradle toolchains +- Java 8 detection for legacy Gradle or Maven properties +- explicit override handling through environment variables +- wrapper-first execution behavior +- Gradle cache and prewarm environment wiring through the Docker run contract +- coding image assertions for dual JDK plus offline cache preparation hooks + +Add representative sandbox fixtures or smoke validation inputs for: + +- Java 8 + Spring Boot 2.x +- Java 17 + Spring Boot 2.7 +- Java 17 + Spring Boot 3.x + +## Risks And Mitigations + +- Large image size: keep the writable cache separate and refresh only the shared offline layer when possible. +- Cache staleness: refresh the matrix on a scheduled cadence or alongside sandbox release candidates. 
+- Private repositories: document that private artifacts still need runtime access or a mirrored internal repository. +- False-positive detection: keep a manual override and log the matched detection rule for every task. + +## Recommended Implementation Order + +1. Expand Java detection and add explicit override support. +2. Change runtime defaulting behavior to Java 8 plus bounded retry. +3. Add offline cache preparation hooks in the coding image. +4. Extend Docker env wiring and contract tests for any new cache paths or override variables. +5. Add representative fixture-based verification for Java 8 and Java 17 Spring Boot projects. + +## Open Assumptions + +- Most target projects are standard Gradle + Spring Boot applications using public Maven-style dependencies. +- The sandbox may still need runtime network access for rare or private dependencies, but the common path should succeed offline. +- The current untracked `gradle-8.5-wrapper-cache.tgz` artifact is treated as unrelated local state and is not part of this design. diff --git a/docs/plans/2026-03-19-java-gradle-sandbox-implementation.md b/docs/plans/2026-03-19-java-gradle-sandbox-implementation.md new file mode 100644 index 0000000..f66c609 --- /dev/null +++ b/docs/plans/2026-03-19-java-gradle-sandbox-implementation.md @@ -0,0 +1,308 @@ +# Java Gradle Sandbox Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Make the sandbox reliably run common Java 8 and Java 17 Gradle + Spring Boot projects offline-first by default, while preferring each project's Gradle wrapper and defaulting unknown projects to Java 8. + +**Architecture:** Extend the sandbox runtime to choose Java deterministically, prefer `./gradlew`, and optionally retry once on clear Java-version mismatches. Expand the coding image with prewarmed Gradle distribution and dependency caches, then lock the behavior down with targeted unit and contract tests. 
+ +**Tech Stack:** Docker, Temurin JDK 8/17, Gradle wrapper, Python 3.11, aiohttp sandbox agent, pytest + +--- + +### Task 1: Expand Java version detection inputs + +**Files:** +- Modify: `platform/sandbox/agent_server.py` +- Test: `platform/tests/test_sandbox_agent_server.py` + +**Step 1: Write the failing tests** + +```python +def test_detect_java_version_from_gradle_toolchain(tmp_path): + agent_server = _load_agent_server_with_fake_skillkit() + gradle = tmp_path / "build.gradle.kts" + gradle.write_text( + 'java { toolchain { languageVersion.set(JavaLanguageVersion.of(17)) } }', + encoding="utf-8", + ) + assert agent_server._detect_java_major_version(str(tmp_path)) == 17 + + +def test_detect_java_version_defaults_to_none_without_markers(tmp_path): + agent_server = _load_agent_server_with_fake_skillkit() + (tmp_path / "settings.gradle").write_text('rootProject.name = "demo"', encoding="utf-8") + assert agent_server._detect_java_major_version(str(tmp_path)) is None +``` + +**Step 2: Run test to verify it fails** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "toolchain or defaults_to_none" -v` +Expected: FAIL because toolchain markers are not detected yet. 
+ +**Step 3: Write minimal implementation** + +```python +_JAVA_DETECT_FILES = ( + "pom.xml", + "build.gradle", + "build.gradle.kts", + "gradle.properties", + "settings.gradle", + "settings.gradle.kts", + ".java-version", + ".tool-versions", +) + +_JAVA17_PATTERNS = ( + r"JavaLanguageVersion\.of\(\s*17\s*\)", + r"languageVersion\s*(?:=|\.set\()\s*JavaLanguageVersion\.of\(\s*17\s*\)", +) +``` + +**Step 4: Run test to verify it passes** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "toolchain or defaults_to_none" -v` +Expected: PASS + +**Step 5: Commit** + +```bash +git add platform/sandbox/agent_server.py platform/tests/test_sandbox_agent_server.py +git commit -m "feat: expand sandbox java version detection" +``` + +### Task 2: Add explicit Java override and Java 8 defaulting + +**Files:** +- Modify: `platform/sandbox/agent_server.py` +- Modify: `platform/app/config.py` +- Modify: `platform/app/worker/sandbox.py` +- Test: `platform/tests/test_sandbox_agent_server.py` +- Test: `platform/tests/test_sandbox_env_contract.py` + +**Step 1: Write the failing tests** + +```python +def test_configure_java_runtime_respects_explicit_override(tmp_path, monkeypatch): + agent_server = _load_agent_server_with_fake_skillkit() + monkeypatch.setenv("SANDBOX_JAVA_VERSION", "17") + monkeypatch.setenv("JAVA17_HOME", "/opt/jdk17") + monkeypatch.setenv("PATH", "/usr/bin:/bin") + selected = agent_server._configure_java_runtime_for_workspace(str(tmp_path)) + assert selected == 17 + + +def test_build_docker_run_cmd_includes_java_override_env(monkeypatch, tmp_path): + from app.worker import sandbox as sandbox_mod + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_DEFAULT_JAVA_VERSION", 8) + backend = DockerSandboxBackend() + cmd = backend._build_docker_run_cmd("sbx-test", "sandbox-image:latest", "/tmp/workspace", "task-123") + env = _extract_env_vars_from_docker_cmd(cmd) + assert env["SANDBOX_DEFAULT_JAVA_VERSION"] == "8" +``` + +**Step 2: Run test to verify it 
fails** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py tests/test_sandbox_env_contract.py -k "override or DEFAULT_JAVA_VERSION" -v` +Expected: FAIL because the env plumbing and defaulting do not exist yet. + +**Step 3: Write minimal implementation** + +```python +override_raw = (os.environ.get("SANDBOX_JAVA_VERSION") or "").strip() +if override_raw in {"8", "17"}: + major = int(override_raw) +else: + major = _detect_java_major_version(workdir) or _env_int("SANDBOX_DEFAULT_JAVA_VERSION", 8) +``` + +**Step 4: Run test to verify it passes** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py tests/test_sandbox_env_contract.py -k "override or DEFAULT_JAVA_VERSION" -v` +Expected: PASS + +**Step 5: Commit** + +```bash +git add platform/sandbox/agent_server.py platform/app/config.py platform/app/worker/sandbox.py platform/tests/test_sandbox_agent_server.py platform/tests/test_sandbox_env_contract.py +git commit -m "feat: add sandbox java override and defaulting" +``` + +### Task 3: Add bounded Java-version fallback on known mismatch errors + +**Files:** +- Modify: `platform/sandbox/agent_server.py` +- Test: `platform/tests/test_sandbox_agent_server.py` + +**Step 1: Write the failing tests** + +```python +def test_should_retry_gradle_command_on_java_version_mismatch(): + assert _should_retry_with_other_java("Unsupported class file major version 61") is True + assert _should_retry_with_other_java("Execution failed for task ':test'") is False +``` + +**Step 2: Run test to verify it fails** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "retry_gradle_command_on_java_version_mismatch" -v` +Expected: FAIL because the helper does not exist yet. 
+ +**Step 3: Write minimal implementation** + +```python +_JAVA_MISMATCH_PATTERNS = ( + r"Unsupported class file major version", + r"invalid source release", + r"release version .* not supported", +) + +def _should_retry_with_other_java(output: str) -> bool: + return any(re.search(pattern, output, re.IGNORECASE) for pattern in _JAVA_MISMATCH_PATTERNS) +``` + +**Step 4: Run test to verify it passes** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "retry_gradle_command_on_java_version_mismatch" -v` +Expected: PASS + +**Step 5: Commit** + +```bash +git add platform/sandbox/agent_server.py platform/tests/test_sandbox_agent_server.py +git commit -m "feat: add sandbox java mismatch fallback" +``` + +### Task 4: Add offline Gradle cache preparation to the coding image + +**Files:** +- Modify: `platform/sandbox/Dockerfile.coding` +- Create: `platform/sandbox/scripts/prewarm_gradle_cache.sh` +- Test: `platform/tests/test_sandbox_env_contract.py` + +**Step 1: Write the failing test** + +```python +def test_coding_sandbox_image_prepares_offline_gradle_cache(): + dockerfile_path = Path(__file__).resolve().parents[1] / "sandbox" / "Dockerfile.coding" + content = dockerfile_path.read_text(encoding="utf-8") + assert "prewarm_gradle_cache.sh" in content + assert "GRADLE_USER_HOME" in content +``` + +**Step 2: Run test to verify it fails** + +Run: `cd platform && pytest tests/test_sandbox_env_contract.py -k "offline_gradle_cache" -v` +Expected: FAIL because the prewarm hook is not present yet. 
+ +**Step 3: Write minimal implementation** + +```bash +#!/usr/bin/env bash +set -euo pipefail +export GRADLE_USER_HOME="${GRADLE_USER_HOME:-/opt/gradle-offline-cache}" +for version in 6.9.4 7.6.4 8.5; do + gradle -g "$GRADLE_USER_HOME" -v >/dev/null 2>&1 || true +done +``` + +**Step 4: Run test to verify it passes** + +Run: `cd platform && pytest tests/test_sandbox_env_contract.py -k "offline_gradle_cache" -v` +Expected: PASS + +**Step 5: Commit** + +```bash +git add platform/sandbox/Dockerfile.coding platform/sandbox/scripts/prewarm_gradle_cache.sh platform/tests/test_sandbox_env_contract.py +git commit -m "feat: prewarm offline gradle cache in sandbox image" +``` + +### Task 5: Add representative sandbox fixture coverage + +**Files:** +- Create: `platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle` +- Create: `platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts` +- Modify: `platform/tests/test_sandbox_agent_server.py` + +**Step 1: Write the failing tests** + +```python +def test_detect_java_version_from_java8_fixture(): + fixture = Path(__file__).resolve().parent / "fixtures" / "sandbox" / "java8-springboot-gradle" + agent_server = _load_agent_server_with_fake_skillkit() + assert agent_server._detect_java_major_version(str(fixture)) == 8 + + +def test_detect_java_version_from_java17_fixture(): + fixture = Path(__file__).resolve().parent / "fixtures" / "sandbox" / "java17-springboot-gradle" + agent_server = _load_agent_server_with_fake_skillkit() + assert agent_server._detect_java_major_version(str(fixture)) == 17 +``` + +**Step 2: Run test to verify it fails** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "java8_fixture or java17_fixture" -v` +Expected: FAIL because the fixtures do not exist yet. 
+ +**Step 3: Write minimal implementation** + +```groovy +plugins { + id 'org.springframework.boot' version '2.7.18' +} + +sourceCompatibility = JavaVersion.VERSION_1_8 +``` + +```kotlin +plugins { + id("org.springframework.boot") version "3.2.4" +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} +``` + +**Step 4: Run test to verify it passes** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "java8_fixture or java17_fixture" -v` +Expected: PASS + +**Step 5: Commit** + +```bash +git add platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts platform/tests/test_sandbox_agent_server.py +git commit -m "test: add sandbox java fixture coverage" +``` + +### Task 6: Run focused regression coverage + +**Files:** +- Test: `platform/tests/test_sandbox_agent_server.py` +- Test: `platform/tests/test_sandbox_env_contract.py` + +**Step 1: Run the focused regression suite** + +Run: `cd platform && pytest tests/test_sandbox_agent_server.py tests/test_sandbox_env_contract.py -v` +Expected: PASS + +**Step 2: Run a Dockerfile assertion smoke** + +Run: `cd platform && pytest tests/test_sandbox_env_contract.py -k "coding_sandbox_image or offline_gradle_cache" -v` +Expected: PASS + +**Step 3: Review git diff** + +Run: `git diff --stat` +Expected: Only sandbox runtime, image, script, fixture, and test files changed. 
+ +**Step 4: Commit the verified implementation** + +```bash +git add platform/sandbox/agent_server.py platform/app/config.py platform/app/worker/sandbox.py platform/sandbox/Dockerfile.coding platform/sandbox/scripts/prewarm_gradle_cache.sh platform/tests/test_sandbox_agent_server.py platform/tests/test_sandbox_env_contract.py platform/tests/fixtures/sandbox +git commit -m "feat: harden sandbox java gradle offline support" +``` From 7fdbfc6f69139b5f835cd6aeba79d65f806a709e Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 14:37:24 +0800 Subject: [PATCH 08/33] chore: ignore local worktrees --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 90ef702..f2b852e 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,6 @@ platform/htmlcov/ platform/coverage.xml platform/**/*.cover platform/**/*.py,cover + +# Local git worktrees +.worktrees/ From a10fcecd561f14fc50b8996d00f80cf4d35ccb3d Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 14:43:59 +0800 Subject: [PATCH 09/33] feat: expand sandbox java version detection --- platform/sandbox/agent_server.py | 10 +++++++++ platform/tests/test_sandbox_agent_server.py | 23 +++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/platform/sandbox/agent_server.py b/platform/sandbox/agent_server.py index d62ccc9..380e2f2 100644 --- a/platform/sandbox/agent_server.py +++ b/platform/sandbox/agent_server.py @@ -64,18 +64,28 @@ "build.gradle", "build.gradle.kts", "gradle.properties", + "settings.gradle", + "settings.gradle.kts", + ".java-version", + ".tool-versions", ) _JAVA8_PATTERNS = ( r"\s*(?:1\.8|8)\s*", r"\s*(?:1\.8|8)\s*", r"sourceCompatibility\s*=\s*(?:['\"]?1\.8['\"]?|JavaVersion\.VERSION_1_8)", r"targetCompatibility\s*=\s*(?:['\"]?1\.8['\"]?|JavaVersion\.VERSION_1_8)", + r"JavaLanguageVersion\.of\(\s*(?:1\.8|8)\s*\)", + r"^\s*8(?:\.\d+)?\s*$", + r"(?m)^\s*java\s+(?:temurin-)?(?:1\.8|8)\s*$", ) _JAVA17_PATTERNS = ( r"\s*17\s*", 
r"\s*17\s*", r"sourceCompatibility\s*=\s*(?:['\"]?17['\"]?|JavaVersion\.VERSION_17)", r"targetCompatibility\s*=\s*(?:['\"]?17['\"]?|JavaVersion\.VERSION_17)", + r"JavaLanguageVersion\.of\(\s*17\s*\)", + r"^\s*17(?:\.\d+)?\s*$", + r"(?m)^\s*java\s+(?:temurin-)?17\s*$", ) _GRADLE_ANY_CMD_RE = re.compile(r"(? Date: Thu, 19 Mar 2026 14:49:47 +0800 Subject: [PATCH 10/33] feat: add sandbox java override and defaulting --- platform/app/config.py | 1 + platform/app/worker/sandbox.py | 2 ++ platform/sandbox/agent_server.py | 11 ++++++--- platform/tests/test_sandbox_agent_server.py | 27 +++++++++++++++++++++ platform/tests/test_sandbox_env_contract.py | 2 ++ 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/platform/app/config.py b/platform/app/config.py index 8215624..db012ab 100644 --- a/platform/app/config.py +++ b/platform/app/config.py @@ -102,6 +102,7 @@ class Settings(BaseSettings): SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS: int = 480 SANDBOX_GRADLE_CACHE_HOST_DIR: str = "/var/lib/silicon_agent/gradle-cache" SANDBOX_GRADLE_USER_HOME: str = "/var/lib/silicon_agent/gradle-cache" + SANDBOX_DEFAULT_JAVA_VERSION: int = 8 SANDBOX_GRADLE_WRAPPER_PREWARM: bool = True SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS: int = 180 diff --git a/platform/app/worker/sandbox.py b/platform/app/worker/sandbox.py index b8861d3..83b1be6 100644 --- a/platform/app/worker/sandbox.py +++ b/platform/app/worker/sandbox.py @@ -499,6 +499,8 @@ def _build_docker_run_cmd( "-e", f"GRADLE_USER_HOME={gradle_cache_container_dir}", "-e", + f"SANDBOX_DEFAULT_JAVA_VERSION={int(settings.SANDBOX_DEFAULT_JAVA_VERSION)}", + "-e", f"SANDBOX_GRADLE_WRAPPER_PREWARM={'true' if settings.SANDBOX_GRADLE_WRAPPER_PREWARM else 'false'}", "-e", f"SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS={int(settings.SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS)}", diff --git a/platform/sandbox/agent_server.py b/platform/sandbox/agent_server.py index 380e2f2..b99d8cf 100644 --- a/platform/sandbox/agent_server.py +++ 
b/platform/sandbox/agent_server.py @@ -130,9 +130,14 @@ def _detect_java_major_version(workdir: str) -> int | None: def _configure_java_runtime_for_workspace(workdir: str) -> int | None: - major = _detect_java_major_version(workdir) - if major is None: - return None + override_raw = (os.environ.get("SANDBOX_JAVA_VERSION") or "").strip() + if override_raw in {"8", "17"}: + major = int(override_raw) + else: + default_major = _env_int("SANDBOX_DEFAULT_JAVA_VERSION", 8) + if default_major not in {8, 17}: + default_major = 8 + major = _detect_java_major_version(workdir) or default_major java_home_key = "JAVA8_HOME" if major == 8 else "JAVA17_HOME" target_java_home = (os.environ.get(java_home_key) or "").strip() diff --git a/platform/tests/test_sandbox_agent_server.py b/platform/tests/test_sandbox_agent_server.py index db864ff..0429b32 100644 --- a/platform/tests/test_sandbox_agent_server.py +++ b/platform/tests/test_sandbox_agent_server.py @@ -200,6 +200,33 @@ def test_configure_java_runtime_sets_java_home_and_path(tmp_path, monkeypatch): assert os.environ["PATH"].split(":")[0] == "/opt/jdk8/bin" +def test_configure_java_runtime_respects_explicit_override(tmp_path, monkeypatch): + agent_server = _load_agent_server_with_fake_skillkit() + monkeypatch.setenv("SANDBOX_JAVA_VERSION", "17") + monkeypatch.setenv("JAVA17_HOME", "/opt/jdk17") + monkeypatch.setenv("JAVA_HOME", "/opt/jdk8") + monkeypatch.setenv("PATH", "/opt/jdk8/bin:/usr/bin:/bin") + + selected = agent_server._configure_java_runtime_for_workspace(str(tmp_path)) + assert selected == 17 + assert os.environ["JAVA_HOME"] == "/opt/jdk17" + assert os.environ["PATH"].split(":")[0] == "/opt/jdk17/bin" + + +def test_configure_java_runtime_defaults_to_java8_without_markers(tmp_path, monkeypatch): + agent_server = _load_agent_server_with_fake_skillkit() + monkeypatch.delenv("SANDBOX_JAVA_VERSION", raising=False) + monkeypatch.setenv("SANDBOX_DEFAULT_JAVA_VERSION", "8") + monkeypatch.setenv("JAVA8_HOME", "/opt/jdk8") + 
monkeypatch.setenv("JAVA_HOME", "/opt/jdk17") + monkeypatch.setenv("PATH", "/opt/jdk17/bin:/usr/bin:/bin") + + selected = agent_server._configure_java_runtime_for_workspace(str(tmp_path)) + assert selected == 8 + assert os.environ["JAVA_HOME"] == "/opt/jdk8" + assert os.environ["PATH"].split(":")[0] == "/opt/jdk8/bin" + + def test_container_runner_keeps_gradlew_and_wraps_timeout(monkeypatch): agent_server = _load_agent_server_with_fake_skillkit() monkeypatch.setenv("SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS", "480") diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index e8f1323..305d873 100644 --- a/platform/tests/test_sandbox_env_contract.py +++ b/platform/tests/test_sandbox_env_contract.py @@ -52,6 +52,7 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_USER_HOME", "/var/lib/silicon_agent/gradle-cache") monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM", True) monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS", 180) + monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_DEFAULT_JAVA_VERSION", 8) backend = DockerSandboxBackend() cmd = backend._build_docker_run_cmd( @@ -74,6 +75,7 @@ def test_build_docker_run_cmd_includes_skillkit_compat_env(monkeypatch, tmp_path assert env["SANDBOX_MODEL_API_RAW_LOG_PATH"] == "/model_api_logs/task-123.jsonl" assert env["SANDBOX_GRADLE_CMD_TIMEOUT_SECONDS"] == "480" assert env["GRADLE_USER_HOME"] == "/var/lib/silicon_agent/gradle-cache" + assert env["SANDBOX_DEFAULT_JAVA_VERSION"] == "8" assert env["SANDBOX_GRADLE_WRAPPER_PREWARM"] == "true" assert env["SANDBOX_GRADLE_WRAPPER_PREWARM_TIMEOUT_SECONDS"] == "180" assert f"type=bind,src={raw_log_dir},dst=/model_api_logs" in mounts From a968add6df5f448a20b7777958aa049b21e2c314 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 14:53:59 +0800 Subject: [PATCH 11/33] feat: 
detect java mismatch retry conditions --- platform/sandbox/agent_server.py | 14 ++++++++++++++ platform/tests/test_sandbox_agent_server.py | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/platform/sandbox/agent_server.py b/platform/sandbox/agent_server.py index b99d8cf..d497b10 100644 --- a/platform/sandbox/agent_server.py +++ b/platform/sandbox/agent_server.py @@ -87,6 +87,12 @@ r"^\s*17(?:\.\d+)?\s*$", r"(?m)^\s*java\s+(?:temurin-)?17\s*$", ) +_JAVA_VERSION_MISMATCH_PATTERNS = ( + r"Unsupported class file major version", + r"invalid source release", + r"release version \d+ not supported", + r"Could not target platform", +) _GRADLE_ANY_CMD_RE = re.compile(r"(? int | None: return major +def _should_retry_with_other_java(output: str) -> bool: + text = str(output or "") + return any( + re.search(pattern, text, re.IGNORECASE) + for pattern in _JAVA_VERSION_MISMATCH_PATTERNS + ) + + def _is_gemini_model(model: str | None) -> bool: return "gemini" in ((model or "").lower()) diff --git a/platform/tests/test_sandbox_agent_server.py b/platform/tests/test_sandbox_agent_server.py index 0429b32..6b1a3f0 100644 --- a/platform/tests/test_sandbox_agent_server.py +++ b/platform/tests/test_sandbox_agent_server.py @@ -267,3 +267,10 @@ def test_run_gradle_wrapper_prewarm_once_marks_done(tmp_path, monkeypatch): import asyncio asyncio.run(agent_server._run_gradle_wrapper_prewarm_once(str(tmp_path))) assert agent_server._WRAPPER_PREWARM_DONE is True + + +def test_should_retry_with_other_java_on_version_mismatch(): + agent_server = _load_agent_server_with_fake_skillkit() + assert agent_server._should_retry_with_other_java("Unsupported class file major version 61") + assert agent_server._should_retry_with_other_java("invalid source release: 17") + assert not agent_server._should_retry_with_other_java("Execution failed for task ':test'") From 989454cc40f162d87903a729bfb3fcfce90ea1a0 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 14:56:21 +0800 Subject: 
[PATCH 12/33] feat: prewarm offline gradle cache in sandbox image --- platform/sandbox/Dockerfile.coding | 5 +++++ .../sandbox/scripts/prewarm_gradle_cache.sh | 21 +++++++++++++++++++ platform/tests/test_sandbox_env_contract.py | 9 ++++++++ 3 files changed, 35 insertions(+) create mode 100644 platform/sandbox/scripts/prewarm_gradle_cache.sh diff --git a/platform/sandbox/Dockerfile.coding b/platform/sandbox/Dockerfile.coding index a20e3f8..4cae582 100644 --- a/platform/sandbox/Dockerfile.coding +++ b/platform/sandbox/Dockerfile.coding @@ -3,6 +3,7 @@ FROM eclipse-temurin:17-jdk AS jdk17 FROM silicon-agent-sandbox:base ARG GRADLE_VERSION=8.5 +ENV GRADLE_PREWARM_USER_HOME=/opt/gradle-prewarm USER root @@ -22,6 +23,7 @@ RUN wget -q "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}- COPY --from=jdk8 /opt/java/openjdk /opt/jdk8 COPY --from=jdk17 /opt/java/openjdk /opt/jdk17 +COPY sandbox/scripts/prewarm_gradle_cache.sh /usr/local/bin/prewarm_gradle_cache.sh ENV JAVA8_HOME=/opt/jdk8 ENV JAVA17_HOME=/opt/jdk17 @@ -31,6 +33,9 @@ ENV PATH="${JAVA_HOME}/bin:${PATH}" RUN ln -sf /opt/jdk17/bin/java /usr/local/bin/java && \ ln -sf /opt/jdk17/bin/javac /usr/local/bin/javac +RUN chmod +x /usr/local/bin/prewarm_gradle_cache.sh && \ + /usr/local/bin/prewarm_gradle_cache.sh + # Common Python dev tools RUN pip install --no-cache-dir \ pytest ruff black mypy \ diff --git a/platform/sandbox/scripts/prewarm_gradle_cache.sh b/platform/sandbox/scripts/prewarm_gradle_cache.sh new file mode 100644 index 0000000..ddf5193 --- /dev/null +++ b/platform/sandbox/scripts/prewarm_gradle_cache.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +set -euo pipefail + +GRADLE_PREWARM_USER_HOME="${GRADLE_PREWARM_USER_HOME:-/opt/gradle-prewarm}" +mkdir -p "${GRADLE_PREWARM_USER_HOME}" + +run_gradle() { + local version="$1" + local install_dir="/opt/gradle-${version}" + + if [ ! 
-x "${install_dir}/bin/gradle" ]; then + return 0 + fi + + "${install_dir}/bin/gradle" \ + --gradle-user-home "${GRADLE_PREWARM_USER_HOME}" \ + --no-daemon \ + -v >/dev/null 2>&1 || true +} + +run_gradle "8.5" diff --git a/platform/tests/test_sandbox_env_contract.py b/platform/tests/test_sandbox_env_contract.py index 305d873..66124a3 100644 --- a/platform/tests/test_sandbox_env_contract.py +++ b/platform/tests/test_sandbox_env_contract.py @@ -158,6 +158,15 @@ def test_coding_sandbox_image_provides_java_toolchain(): assert "ENV JAVA_HOME=/opt/jdk17" in content +def test_coding_sandbox_image_prepares_offline_gradle_cache(): + dockerfile_path = Path(__file__).resolve().parents[1] / "sandbox" / "Dockerfile.coding" + content = dockerfile_path.read_text(encoding="utf-8") + + assert "sandbox/scripts/prewarm_gradle_cache.sh" in content + assert "GRADLE_PREWARM_USER_HOME" in content + assert "prewarm_gradle_cache.sh" in content + + def test_base_sandbox_image_makes_runtime_entrypoints_world_readable(): dockerfile_path = Path(__file__).resolve().parents[1] / "sandbox" / "Dockerfile.base" content = dockerfile_path.read_text(encoding="utf-8") From 9cc499da1c3e0e260d26b94fe79db768b7c433d5 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 14:56:47 +0800 Subject: [PATCH 13/33] test: add sandbox java fixture coverage --- .../java17-springboot-gradle/build.gradle.kts | 13 +++++++++++ .../java8-springboot-gradle/build.gradle | 9 ++++++++ platform/tests/test_sandbox_agent_server.py | 23 +++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts create mode 100644 platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle diff --git a/platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts b/platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts new file mode 100644 index 0000000..836d6da --- /dev/null +++ 
b/platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts @@ -0,0 +1,13 @@ +plugins { + java + id("org.springframework.boot") version "3.2.4" +} + +group = "com.example" +version = "0.0.1-SNAPSHOT" + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} diff --git a/platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle b/platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle new file mode 100644 index 0000000..1ea6620 --- /dev/null +++ b/platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle @@ -0,0 +1,9 @@ +plugins { + id 'java' + id 'org.springframework.boot' version '2.7.18' +} + +group = 'com.example' +version = '0.0.1-SNAPSHOT' + +sourceCompatibility = JavaVersion.VERSION_1_8 diff --git a/platform/tests/test_sandbox_agent_server.py b/platform/tests/test_sandbox_agent_server.py index 6b1a3f0..10eeaec 100644 --- a/platform/tests/test_sandbox_agent_server.py +++ b/platform/tests/test_sandbox_agent_server.py @@ -3,6 +3,7 @@ import importlib import os import sys +from pathlib import Path from types import ModuleType, SimpleNamespace @@ -274,3 +275,25 @@ def test_should_retry_with_other_java_on_version_mismatch(): assert agent_server._should_retry_with_other_java("Unsupported class file major version 61") assert agent_server._should_retry_with_other_java("invalid source release: 17") assert not agent_server._should_retry_with_other_java("Execution failed for task ':test'") + + +def test_detect_java_version_from_java8_fixture(): + agent_server = _load_agent_server_with_fake_skillkit() + fixture = ( + Path(__file__).resolve().parent + / "fixtures" + / "sandbox" + / "java8-springboot-gradle" + ) + assert agent_server._detect_java_major_version(str(fixture)) == 8 + + +def test_detect_java_version_from_java17_fixture(): + agent_server = _load_agent_server_with_fake_skillkit() + fixture = ( + Path(__file__).resolve().parent + / "fixtures" + / "sandbox" + / "java17-springboot-gradle" + ) 
+ assert agent_server._detect_java_major_version(str(fixture)) == 17 From 688480cdf57b790de747c2b55e0985a5b86af591 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 15:34:19 +0800 Subject: [PATCH 14/33] fix(project): default sandbox image to coding image --- platform/app/services/project_service.py | 3 ++- platform/tests/test_project_service.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/platform/app/services/project_service.py b/platform/app/services/project_service.py index d74b6b8..03b146e 100644 --- a/platform/app/services/project_service.py +++ b/platform/app/services/project_service.py @@ -7,6 +7,7 @@ from sqlalchemy import func, or_, select from sqlalchemy.ext.asyncio import AsyncSession +from app.config import settings from app.models.project import ProjectModel from app.schemas.project import ( ProjectCreateRequest, @@ -75,7 +76,7 @@ async def create_project(self, request: ProjectCreateRequest) -> ProjectResponse repo_local_path=request.repo_local_path, branch=request.branch, description=request.description, - sandbox_image=request.sandbox_image, + sandbox_image=request.sandbox_image or settings.SANDBOX_IMAGE, ) self.session.add(project) await self.session.commit() diff --git a/platform/tests/test_project_service.py b/platform/tests/test_project_service.py index 4f5ccf2..a6507ad 100644 --- a/platform/tests/test_project_service.py +++ b/platform/tests/test_project_service.py @@ -302,6 +302,20 @@ async def test_create_project_minimal(): await _cleanup_project(resp.id) +@pytest.mark.asyncio +async def test_create_project_defaults_sandbox_image(): + """create_project falls back to the configured sandbox image when none is provided.""" + name = _unique_name("svc-create-default-image") + request = ProjectCreateRequest(name=name, display_name="Default Image") + async with async_session_factory() as session: + svc = ProjectService(session) + resp = await svc.create_project(request) + + assert 
resp.sandbox_image == "silicon-agent-sandbox:coding" + + await _cleanup_project(resp.id) + + # ── update_project ──────────────────────────────────────────────────────────── From f2f285afec162f70201228d23d34d02164cd0613 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 16:41:39 +0800 Subject: [PATCH 15/33] docs: plan coding and testing convergence --- ...03-19-coding-testing-convergence-design.md | 154 ++++++++++++ ...ding-testing-convergence-implementation.md | 224 ++++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 docs/plans/2026-03-19-coding-testing-convergence-design.md create mode 100644 docs/plans/2026-03-19-coding-testing-convergence-implementation.md diff --git a/docs/plans/2026-03-19-coding-testing-convergence-design.md b/docs/plans/2026-03-19-coding-testing-convergence-design.md new file mode 100644 index 0000000..cdf6889 --- /dev/null +++ b/docs/plans/2026-03-19-coding-testing-convergence-design.md @@ -0,0 +1,154 @@ +# Coding And Testing Convergence Design + +**Date:** 2026-03-19 + +**Problem** + +Simple tasks are spending too many turns in the `coding` and `test` stages. The current behavior allows broad repository exploration, and when the model hits the turn limit the continuation prompt is too weak to force the stage back toward concrete actions. In practice this leads to repeated `read/ls/find` calls, continuation loops, and slow delivery even when the requested change is small. + +**Goal** + +Improve convergence for all `coding` and `test` stage executions with the smallest possible execution-chain change. The design should avoid changes to stage orchestration, model routing, sandbox selection, or task lifecycle. The work should only tighten stage instructions and continuation behavior. + +**Non-Goals** + +- Do not change task templates or stage topology. +- Do not add new execution modes or sandbox backends. +- Do not introduce repository-specific heuristics such as “simple task mode”. 
+- Do not add exploration budgets, tool counters, or runtime interruption logic in this iteration. +- Do not change non-`coding` / non-`test` stages unless required for shared helper plumbing. + +**Current State** + +- Stage prompts are built in [prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py). +- In-process continuation handling lives in [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py). +- `coding` and `test` already have stage guardrails, but they focus on stage boundaries rather than convergence. +- Continuation prompts currently say only “请继续完成上面的输出,从你停下的地方继续。”, which encourages more prose instead of concrete progress. + +**Constraints** + +- Keep the implementation localized to prompt and continuation layers. +- Preserve compatibility with existing event logging and continuation flow. +- Keep Chinese prompt style consistent with the rest of the worker prompts. +- Avoid changing default max-turn behavior in this iteration. + +**Approach Options** + +1. Guardrails only +Add stricter `coding` and `test` guardrails in `prompts.py`. + +Pros: +- Smallest code diff. +- No helper changes. + +Cons: +- Only affects the first prompt. +- Does not solve continuation loops directly. + +2. Guardrails plus stage-specific continuation prompts +Strengthen `coding` and `test` stage guardrails and replace the generic continuation prompt with stage-specific convergence prompts. + +Pros: +- Directly addresses the observed failure mode. +- Still limited to prompt-generation and continuation code. +- No model or orchestration changes. + +Cons: +- Slightly larger change surface than guardrails alone. + +3. Guardrails plus continuation prompts plus lower max turns +Do option 2 and also reduce `coding` / `test` max-turn ceilings. + +Pros: +- More aggressive convergence. + +Cons: +- Higher risk of hurting legitimate longer tasks. +- Harder to tune safely without broader validation. 
+ +**Recommendation** + +Choose option 2. + +It is the best fit for the stated goal: stronger convergence for all `coding` and `test` stages with minimal execution-chain change. It improves both the initial stage instruction and the continuation loop without changing orchestration or routing. + +**Design** + +### 1. Tighten `coding` stage guardrail + +Update the `code` entry in `STAGE_GUARDRAILS` so that it explicitly instructs the agent to: + +- avoid broad repository exploration; +- use already available context first; +- read more files only when a missing detail blocks implementation; +- move quickly to concrete file edits; +- run the smallest necessary validation; +- stop after implementation and a brief summary. + +The guardrail should discourage “understand the whole repo first” behavior and push the agent toward the minimum set of reads needed to safely modify code. + +### 2. Tighten `test` stage guardrail + +Update the `test` entry in `STAGE_GUARDRAILS` so that it explicitly instructs the agent to: + +- focus only on validation directly tied to the current change; +- prefer the fastest relevant verification path; +- stop once acceptance is sufficiently proven; +- avoid expanding into smoke/E2E/performance work unless explicitly requested; +- report the concrete blocker if validation cannot proceed. + +This keeps the `test` stage from growing into an open-ended general validation phase. + +### 3. Replace generic continuation prompts with stage-specific convergence prompts + +Update `_handle_continuations()` in `executor.py` so that: + +- `coding` continuations tell the agent to stop broad exploration, use the information already gathered, and produce concrete edits or a single evidenced blocker; +- `test` continuations tell the agent to stop expanding coverage, run the smallest relevant validation, and return concrete results or a blocker; +- other stages can keep the generic continuation wording, or use a neutral fallback prompt. 
+ +The continuation prompt should be action-oriented. Its purpose is not to continue reasoning indefinitely; it is to force the stage back toward a terminating action. + +**Data Flow** + +1. `build_user_prompt()` continues to assemble the stage prompt as today. +2. `coding` / `test` guardrails now embed convergence-specific instructions. +3. If the model returns the truncation sentinel, `_handle_continuations()` chooses a stage-aware prompt. +4. The runner continues with `reset=False`, but the continuation now carries explicit instructions to finish implementation or verification instead of continuing exploratory dialogue. + +**Error Handling** + +- Existing continuation retry and logging behavior remains unchanged. +- If a continuation still fails or times out, the current stage error handling path stays in place. +- No changes are needed to fallback logic, task logs schema, or retry scheduling. + +**Testing Strategy** + +Add focused tests only. + +- Prompt tests in [test_prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py): + - verify `coding` guardrail includes the new convergence instructions; + - verify `test` guardrail includes the new minimal-validation instructions. +- Executor tests in [test_executor_stage_logs.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py) or a nearby executor-focused test module: + - verify `coding` continuation uses the stage-specific prompt; + - verify `test` continuation uses the stage-specific prompt; + - verify non-target stages still use the generic fallback prompt. + +**Risks** + +- Over-constraining `coding` may reduce necessary repo discovery for legitimately complex changes. +- Over-constraining `test` may cause the agent to stop too early if prompts are too absolute. +- Prompt wording that is too long may dilute the core instruction. + +**Mitigations** + +- Phrase the new guardrails as “use available context first” rather than “never explore”. 
+- Allow one additional critical file lookup in continuation prompts when truly needed. +- Leave max-turn settings unchanged for now to isolate the impact of prompt changes. + +**Success Criteria** + +- `coding` stage continuations stop looping on generic prose and push toward edits or a blocker. +- `test` stage continuations stop expanding test scope after sufficient evidence is available. +- The change is limited to prompt and continuation logic. +- Existing worker flow, logging, and task lifecycle remain unchanged. diff --git a/docs/plans/2026-03-19-coding-testing-convergence-implementation.md b/docs/plans/2026-03-19-coding-testing-convergence-implementation.md new file mode 100644 index 0000000..0436482 --- /dev/null +++ b/docs/plans/2026-03-19-coding-testing-convergence-implementation.md @@ -0,0 +1,224 @@ +# Coding And Testing Convergence Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Tighten `coding` and `test` stage convergence by strengthening stage guardrails and using stage-specific continuation prompts. + +**Architecture:** Keep the execution chain unchanged and localize the work to prompt assembly and continuation handling. `prompts.py` will define stronger convergence instructions for `coding` and `test`, and `executor.py` will choose stage-aware continuation prompts when the runner hits the truncation sentinel. 
+ +**Tech Stack:** Python, pytest, async worker executor, stage prompt generation + +--- + +### Task 1: Strengthen stage guardrails in prompt generation + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` + +**Step 1: Write the failing tests** + +Add prompt assertions that prove: + +- the `code` guardrail tells the agent to avoid broad repo exploration and move toward concrete edits plus minimal validation; +- the `test` guardrail tells the agent to use the smallest relevant verification path and stop once evidence is sufficient. + +Example assertions: + +```python +def test_code_prompt_emphasizes_convergence(): + ctx = _minimal_ctx(stage_name="code") + result = build_user_prompt(ctx) + assert "不要为了理解整个仓库而广泛探索" in result + assert "最小必要验证" in result + + +def test_test_prompt_emphasizes_minimal_validation(): + ctx = _minimal_ctx(stage_name="test") + result = build_user_prompt(ctx) + assert "最小、最相关、最快的验证路径" in result + assert "满足验收标准" in result +``` + +**Step 2: Run the tests to verify they fail** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py -k "convergence or minimal_validation" -q +``` + +Expected: FAIL because the current guardrails do not contain the new phrases. + +**Step 3: Write the minimal implementation** + +Update `STAGE_GUARDRAILS["code"]` and `STAGE_GUARDRAILS["test"]` in [prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py) so they: + +- push `coding` toward immediate implementation and minimal verification; +- push `test` toward targeted validation and fast termination once evidence is enough. + +Keep the wording concise and consistent with the existing Chinese prompt style. 
+ +**Step 4: Run the tests to verify they pass** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py -k "convergence or minimal_validation" -q +``` + +Expected: PASS. + +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py +git commit -m "fix(worker): tighten coding and test stage guardrails" +``` + +### Task 2: Add stage-specific continuation prompts + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` + +**Step 1: Write the failing tests** + +Add executor-focused tests that exercise `_handle_continuations()` and prove: + +- `coding` continuations use a convergence prompt that stops broad exploration and asks for concrete edits or one blocker; +- `test` continuations use a convergence prompt that asks for the smallest relevant validation result; +- non-`coding` / non-`test` stages still use the generic continuation prompt. + +Use a fake runner whose `chat()` captures the continuation prompt and returns a non-truncated response. + +Example structure: + +```python +@pytest.mark.asyncio +async def test_handle_continuations_uses_coding_specific_prompt(): + runner = FakeRunner(["done"]) + tracker = FakeTracker() + output, _tokens = await _handle_continuations( + runner, + "[Max turns reached. Please continue the conversation.]", + {"stage_name": "code"}, + tracker, + ) + assert "不要继续广泛浏览代码库" in runner.prompts[0] +``` + +If `_handle_continuations()` does not currently know the stage, first design the smallest helper change that allows the caller to pass it in. + +**Step 2: Run the tests to verify they fail** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . 
.venv/bin/activate && pytest tests/test_executor_stage_logs.py -k "continuation and coding or continuation and test" -q +``` + +Expected: FAIL because continuation prompts are currently generic. + +**Step 3: Write the minimal implementation** + +In [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py): + +- introduce a small helper that returns the continuation prompt for a given stage name; +- use stage-aware prompts for `code` and `test`; +- keep a generic fallback for other stages; +- thread the current stage name into `_handle_continuations()` with the smallest possible call-site change. + +Do not change retry counts, timeout behavior, logging contracts, or runner reset behavior. + +**Step 4: Run the tests to verify they pass** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_executor_stage_logs.py -k "continuation and coding or continuation and test" -q +``` + +Expected: PASS. + +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py +git commit -m "fix(worker): add convergent continuation prompts" +``` + +### Task 3: Run regression checks for prompt and executor paths + +**Files:** +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` + +**Step 1: Run the prompt tests** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py -q +``` + +Expected: PASS. + +**Step 2: Run the executor tests** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_executor_stage_logs.py -q +``` + +Expected: PASS. 
+ +**Step 3: Run a focused combined smoke check** + +Run: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py tests/test_executor_stage_logs.py -q +``` + +Expected: PASS with no new failures in the touched prompt and continuation logic. + +**Step 4: Commit the verification state** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/docs/plans/2026-03-19-coding-testing-convergence-design.md /Users/jowang/Documents/github/silicon_agent/docs/plans/2026-03-19-coding-testing-convergence-implementation.md +git commit -m "docs: plan coding and testing convergence changes" +``` + +### Task 4: Optional manual validation on the VM + +**Files:** +- Modify: none +- Test: none + +**Step 1: Restart the VM worker with the intended config** + +Run on the VM after code deployment: + +```bash +grep -n 'SANDBOX_ENABLED' /home/stb_admin/silicon_agent/platform/.env +``` + +Expected: whichever value is desired for the validation session. + +**Step 2: Re-run a previously slow task shape** + +Use a simple API task similar to “create a helloworld interface”. + +**Step 3: Inspect logs** + +Check that: + +- `coding` does not spend most turns on broad repo exploration; +- continuation prompts no longer produce repeated generic continuation loops; +- `test` stops after targeted validation. + +**Step 4: Record any follow-up gaps** + +If the new prompt strategy is still too weak, capture concrete examples before considering max-turn tuning in a later change. 
From 8f42207d650264e96d58d6d2db25041002321a6e Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 16:45:40 +0800 Subject: [PATCH 16/33] fix(worker): tighten coding and test stage guardrails --- platform/app/worker/prompts.py | 9 ++++++--- platform/tests/test_prompts.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/platform/app/worker/prompts.py b/platform/app/worker/prompts.py index 597b8b4..defd21c 100644 --- a/platform/app/worker/prompts.py +++ b/platform/app/worker/prompts.py @@ -130,13 +130,16 @@ STAGE_GUARDRAILS: Dict[str, str] = { "code": ( "只完成当前阶段,不要提前执行后续阶段任务。\n" - "你可以为了验证实现而运行必要命令,但不要提前生成最终签收/验收报告," - "也不要调用 signoff、review、smoke、e2e-test 等后续阶段能力。\n" + "不要为了理解整个仓库而广泛探索,优先基于已知信息直接实现。\n" + "只有在缺少关键实现信息时才少量补读文件;你可以为了验证实现而运行必要命令," + "但目标必须是最小必要验证。\n" + "不要提前生成最终签收/验收报告,也不要调用 signoff、review、smoke、e2e-test 等后续阶段能力。\n" "完成实现并简要总结本阶段改动后结束。" ), "test": ( "只完成当前阶段,不要提前执行后续阶段任务。\n" - "请聚焦当前任务直接相关的自动化测试与验证;如果相关测试已经通过,且已覆盖验收标准,请立即停止。\n" + "请聚焦当前任务直接相关的自动化测试与验证,优先最小、最相关、最快的验证路径。\n" + "如果相关测试已经通过,且已满足验收标准,请立即停止。\n" "不要继续扩展额外类型的测试,例如 E2E、冒烟、性能或签收报告,除非任务明确要求。" ), "signoff": ( diff --git a/platform/tests/test_prompts.py b/platform/tests/test_prompts.py index a1900c6..e0a16e4 100644 --- a/platform/tests/test_prompts.py +++ b/platform/tests/test_prompts.py @@ -32,6 +32,20 @@ def test_minimal_title_only(): assert STAGE_INSTRUCTIONS["code"] in result +def test_code_guardrail_emphasizes_convergence(): + ctx = _minimal_ctx(stage_name="code") + result = build_user_prompt(ctx) + assert "不要为了理解整个仓库而广泛探索" in result + assert "最小必要验证" in result + + +def test_test_guardrail_emphasizes_minimal_validation(): + ctx = _minimal_ctx(stage_name="test", agent_role="test") + result = build_user_prompt(ctx) + assert "最小、最相关、最快的验证路径" in result + assert "满足验收标准" in result + + # --------------------------------------------------------------------------- # With description # 
--------------------------------------------------------------------------- From 0fa8ef967f4006d10996d0b789407b135e34bdba Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 16:47:29 +0800 Subject: [PATCH 17/33] fix(worker): add convergent continuation prompts --- platform/app/worker/executor.py | 14 +++- platform/tests/test_executor_stage_logs.py | 85 ++++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index 60ba1ce..ba7ef21 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -655,11 +655,21 @@ def detach_all_handlers(self) -> None: # Extracted helpers: continuations and stage success # --------------------------------------------------------------------------- +def _build_continuation_prompt(stage_name: str | None) -> str: + normalized = (stage_name or "").strip().lower() + if normalized == "code": + return "请停止继续广泛探索,基于已知信息直接补全代码修改。" + if normalized == "test": + return "请停止扩展测试范围,直接执行最小、最相关的验证并给出结果。" + return "请继续完成上面的输出,从你停下的地方继续。" + + async def _handle_continuations( runner: Any, output: str, runtime_overrides: dict[str, Any], tracker: StageEventTracker, + stage_name: str | None = None, ) -> tuple[str, int]: """Follow up with continuation prompts when the LLM output was truncated.""" _MAX_CONTINUATIONS = 3 @@ -669,7 +679,7 @@ async def _handle_continuations( while _TRUNCATION_SENTINEL in (output or "") and continuations < _MAX_CONTINUATIONS: continuations += 1 continuation_started = time.monotonic() - prompt = "请继续完成上面的输出,从你停下的地方继续。" + prompt = _build_continuation_prompt(stage_name or tracker.stage_name) chat_correlation = await tracker.emit_chat_sent( request_body={ "prompt": prompt, @@ -1017,7 +1027,7 @@ async def execute_stage( total_tokens = runner.cumulative_usage.total_tokens output, total_tokens = await _handle_continuations( - runner, output, runtime_overrides, tracker + runner, output, runtime_overrides, 
tracker, stage.stage_name ) # Phase 2.2: Evaluator-optimizer loop (if configured for this stage) diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index 07d4147..125e76c 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -136,6 +136,34 @@ async def emit_update(self, *, log_id: str, updates: dict, priority: str = 'norm return True +class _ContinuationRunner: + def __init__(self, *, response_text: str = 'done') -> None: + self.config = SimpleNamespace(model='test-model') + self.cumulative_usage = SimpleNamespace(total_tokens=11) + self.prompts: list[str] = [] + self.response_text = response_text + + async def chat(self, prompt: str, reset: bool = True, **_: object): + self.prompts.append(prompt) + return SimpleNamespace(text_content=self.response_text) + + +class _ContinuationTracker: + def __init__(self, stage_name: str, agent_role: str = 'coding') -> None: + self.stage_name = stage_name + self.agent_role = agent_role + self.sent: list[dict[str, object]] = [] + self.received: list[dict[str, object]] = [] + + async def emit_chat_sent(self, **kwargs): + self.sent.append(kwargs) + return f"sent-{len(self.sent)}" + + async def emit_chat_received(self, *args, **kwargs): + self.received.append({"args": args, "kwargs": kwargs}) + return True + + class _CancelledRunner(_FakeRunner): async def chat(self, _prompt: str, reset: bool = True, **_: object): await self.events.emit( @@ -572,6 +600,63 @@ async def test_execute_stage_cancellation_still_finalizes_started_logs(monkeypat assert isinstance(turn_updates[-1]['updates']['duration_ms'], float) +@pytest.mark.asyncio +async def test_handle_continuations_uses_coding_specific_prompt(): + runner = _ContinuationRunner() + tracker = _ContinuationTracker(stage_name='code', agent_role='coding') + + output, total_tokens = await executor._handle_continuations( + runner, + "[Max turns reached. 
Please continue the conversation.]", + {}, + tracker, + ) + + assert total_tokens == 11 + assert output == 'done' + assert runner.prompts == [ + '请停止继续广泛探索,基于已知信息直接补全代码修改。' + ] + + +@pytest.mark.asyncio +async def test_handle_continuations_uses_test_specific_prompt(): + runner = _ContinuationRunner() + tracker = _ContinuationTracker(stage_name='test', agent_role='test') + + output, total_tokens = await executor._handle_continuations( + runner, + "[Max turns reached. Please continue the conversation.]", + {}, + tracker, + ) + + assert total_tokens == 11 + assert output == 'done' + assert runner.prompts == [ + '请停止扩展测试范围,直接执行最小、最相关的验证并给出结果。' + ] + + +@pytest.mark.asyncio +async def test_handle_continuations_uses_generic_prompt_for_other_stage(): + runner = _ContinuationRunner() + tracker = _ContinuationTracker(stage_name='review', agent_role='review') + + output, total_tokens = await executor._handle_continuations( + runner, + "[Max turns reached. Please continue the conversation.]", + {}, + tracker, + ) + + assert total_tokens == 11 + assert output == 'done' + assert runner.prompts == [ + '请继续完成上面的输出,从你停下的地方继续。' + ] + + @pytest.mark.asyncio async def test_execute_stage_sandboxed_emits_standardized_pipeline_events(monkeypatch): from app.worker import agents as worker_agents From d180f82cabfd6fadbc40e527c252b5b5788ced84 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 17:00:17 +0800 Subject: [PATCH 18/33] feat(tasks): add task clone endpoint --- platform/app/api/v1/tasks.py | 8 +++ platform/app/services/task_service.py | 18 +++++++ platform/tests/test_task_service.py | 64 +++++++++++++++++++++++ platform/tests/test_tasks_api.py | 74 +++++++++++++++++++++++++++ 4 files changed, 164 insertions(+) diff --git a/platform/app/api/v1/tasks.py b/platform/app/api/v1/tasks.py index 227149b..99ec6d0 100644 --- a/platform/app/api/v1/tasks.py +++ b/platform/app/api/v1/tasks.py @@ -95,6 +95,14 @@ async def cancel_task(task_id: str, service: TaskService = 
Depends(get_task_serv return task +@router.post("/{task_id}/clone", response_model=TaskDetailResponse, status_code=201) +async def clone_task(task_id: str, service: TaskService = Depends(get_task_service)): + task = await service.clone_task(task_id) + if task is None: + raise HTTPException(status_code=404, detail="Task not found") + return task + + @router.post("/{task_id}/retry", response_model=TaskDetailResponse) async def retry_task(task_id: str, service: TaskService = Depends(get_task_service)): task = await service.retry_task(task_id) diff --git a/platform/app/services/task_service.py b/platform/app/services/task_service.py index bbcaa75..05a3e24 100644 --- a/platform/app/services/task_service.py +++ b/platform/app/services/task_service.py @@ -129,6 +129,24 @@ async def create_task(self, request: TaskCreateRequest) -> TaskDetailResponse: task = result.scalar_one() return self._task_to_response(task) + async def clone_task(self, task_id: str) -> Optional[TaskDetailResponse]: + """Create a new task by copying only safe creation fields from a source task.""" + source_task = await self._load_task_with_relations_optional(task_id) + if source_task is None: + return None + + return await self.create_task( + TaskCreateRequest( + jira_id=source_task.jira_id, + title=source_task.title, + description=source_task.description, + template_id=source_task.template_id, + project_id=source_task.project_id, + yunxiao_task_id=source_task.yunxiao_task_id, + github_issue_number=getattr(source_task, "github_issue_number", None), + ) + ) + async def get_task(self, task_id: str) -> Optional[TaskDetailResponse]: result = await self.session.execute( select(TaskModel) diff --git a/platform/tests/test_task_service.py b/platform/tests/test_task_service.py index 0020842..219c14b 100644 --- a/platform/tests/test_task_service.py +++ b/platform/tests/test_task_service.py @@ -351,6 +351,70 @@ async def test_create_task_template_not_found(): assert result.id == "new-task-4" +# 
--------------------------------------------------------------------------- +# clone_task +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_clone_task_not_found(): + """clone_task returns None when source task is missing.""" + session = _make_session() + session.execute.return_value = _mock_result(scalar_one_or_none=None) + svc = TaskService(session) + + result = await svc.clone_task("missing-task") + + assert result is None + + +@pytest.mark.asyncio +async def test_clone_task_reuses_create_task_with_whitelisted_fields(): + """clone_task should create a fresh task from copy-safe source fields only.""" + session = _make_session() + source = _make_task( + id="task-source", + title="Clone Me", + description="Original task body", + status="failed", + jira_id="JIRA-1", + template_id="tmpl-1", + project_id="proj-1", + target_branch="silicon_agent/source", + yunxiao_task_id="YX-1", + branch_name="feature/source", + pr_url="https://example.com/pr/1", + ) + session.execute.return_value = _mock_result(scalar_one_or_none=source) + svc = TaskService(session) + cloned = _make_task( + id="task-clone", + title="Clone Me", + description="Original task body", + status="pending", + jira_id="JIRA-1", + template_id="tmpl-1", + project_id="proj-1", + target_branch="silicon_agent/clone", + yunxiao_task_id="YX-1", + ) + svc.create_task = AsyncMock(return_value=svc._task_to_response(cloned)) + + result = await svc.clone_task("task-source") + + assert result.id == "task-clone" + svc.create_task.assert_awaited_once() + request = svc.create_task.await_args.args[0] + assert isinstance(request, TaskCreateRequest) + assert request.title == "Clone Me" + assert request.description == "Original task body" + assert request.jira_id == "JIRA-1" + assert request.template_id == "tmpl-1" + assert request.project_id == "proj-1" + assert request.yunxiao_task_id == "YX-1" + assert request.target_branch is None + + # 
--------------------------------------------------------------------------- # get_task (lines 133-136) # --------------------------------------------------------------------------- diff --git a/platform/tests/test_tasks_api.py b/platform/tests/test_tasks_api.py index c1c3cc7..8558dd0 100644 --- a/platform/tests/test_tasks_api.py +++ b/platform/tests/test_tasks_api.py @@ -261,6 +261,80 @@ async def test_get_task_404(client): assert resp.status_code == 404 +@pytest.mark.asyncio +async def test_clone_task_creates_fresh_pending_copy(client, seed_template_with_stages): + """POST /api/v1/tasks/{id}/clone creates a new task without inheriting runtime state.""" + template_id = seed_template_with_stages + create_resp = await client.post("/api/v1/tasks", json={ + "title": "TT Clone Source", + "description": "Clone this task", + "template_id": template_id, + "jira_id": "TT-123", + "project_id": None, + "yunxiao_task_id": "YX-123", + }) + assert create_resp.status_code == 201 + source = create_resp.json() + + async with async_session_factory() as session: + result = await session.execute(select(TaskModel).where(TaskModel.id == source["id"])) + task = result.scalar_one() + task.status = "failed" + task.branch_name = "feature/original" + task.pr_url = "https://example.com/pr/123" + + stage_result = await session.execute( + select(TaskStageModel).where(TaskStageModel.task_id == source["id"]) + ) + stages = stage_result.scalars().all() + stages[0].status = "completed" + stages[1].status = "failed" + stages[1].retry_count = 2 + stages[1].error_message = "compile failed" + await session.commit() + + clone_resp = await client.post(f"/api/v1/tasks/{source['id']}/clone") + assert clone_resp.status_code == 201 + cloned = clone_resp.json() + + assert cloned["id"] != source["id"] + assert cloned["title"] == source["title"] + assert cloned["description"] == source["description"] + assert cloned["jira_id"] == source["jira_id"] + assert cloned["template_id"] == source["template_id"] + assert 
cloned["yunxiao_task_id"] == source["yunxiao_task_id"] + assert cloned["status"] == "pending" + assert cloned["branch_name"] is None + assert cloned["pr_url"] is None + assert cloned["target_branch"] == f"silicon_agent/{cloned['id'].rsplit('-', 1)[-1]}" + assert len(cloned["stages"]) == 3 + for stage in cloned["stages"]: + assert stage["status"] == "pending" + assert stage["retry_count"] == 0 + assert stage["error_message"] is None + + async with async_session_factory() as session: + stage_result = await session.execute( + select(TaskStageModel).where(TaskStageModel.task_id.in_([source["id"], cloned["id"]])) + ) + for stage in stage_result.scalars().all(): + await session.delete(stage) + + task_result = await session.execute( + select(TaskModel).where(TaskModel.id.in_([source["id"], cloned["id"]])) + ) + for task in task_result.scalars().all(): + await session.delete(task) + await session.commit() + + +@pytest.mark.asyncio +async def test_clone_task_404(client): + """POST /api/v1/tasks/{id}/clone returns 404 for nonexistent task.""" + resp = await client.post("/api/v1/tasks/tt-nonexistent-id/clone") + assert resp.status_code == 404 + + # ── List Tasks Tests ────────────────────────────────────── From 028f24d2da779e1f9a7e861b45859469b7c948f9 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 17:26:10 +0800 Subject: [PATCH 19/33] fix(worker): tighten coding and test turn budgets --- platform/app/worker/executor.py | 41 ++++++++++++++++++---- platform/app/worker/prompts.py | 7 ++-- platform/tests/test_executor_stage_logs.py | 17 +++++++-- platform/tests/test_prompts.py | 3 ++ 4 files changed, 57 insertions(+), 11 deletions(-) diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index ba7ef21..e74f148 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -77,6 +77,26 @@ def _build_runtime_overrides( } +_DEFAULT_STAGE_MAX_TURNS: dict[str, int] = { + "spec": 10, + "coding": 6, + "doc": 10, + 
"test": 6, +} + +_STAGE_MAX_TURN_CAPS: dict[str, int] = { + "coding": 6, + "test": 6, +} + + +def _resolve_stage_max_turns(agent_role: str, override: Optional[int]) -> int: + default_value = _DEFAULT_STAGE_MAX_TURNS.get(agent_role, 10) + requested = override if isinstance(override, int) and override > 0 else default_value + cap = _STAGE_MAX_TURN_CAPS.get(agent_role) + return min(requested, cap) if cap else requested + + def _chat_kwargs_for_runner(runner: Any, runtime_overrides: dict[str, Any]) -> dict[str, Any]: kwargs: dict[str, Any] = {} try: @@ -658,9 +678,16 @@ def detach_all_handlers(self) -> None: def _build_continuation_prompt(stage_name: str | None) -> str: normalized = (stage_name or "").strip().lower() if normalized == "code": - return "请停止继续广泛探索,基于已知信息直接补全代码修改。" + return ( + "请停止继续广泛探索。基于已知信息直接修改代码;" + "如果仍缺信息,只允许再查看 1 个最关键文件,然后必须完成修改并给出最小验证结果。" + ) if normalized == "test": - return "请停止扩展测试范围,直接执行最小、最相关的验证并给出结果。" + return ( + "请停止扩展测试范围。只做最小、最相关的验证;" + "如果验证命令失败,必须直接给出失败命令、关键报错和唯一阻塞点," + "不要再用代码阅读代替测试结论。" + ) return "请继续完成上面的输出,从你停下的地方继续。" @@ -867,13 +894,14 @@ async def execute_stage( user_prompt = build_user_prompt(ctx) runtime_overrides = _build_runtime_overrides(agent, stage_model) + stage_max_turns = _resolve_stage_max_turns(stage.agent_role, runtime_overrides["max_turns"]) runner = get_agent( stage.agent_role, task_id, model=runtime_overrides["model"], temperature=runtime_overrides["temperature"], max_tokens=runtime_overrides["max_tokens"], - max_turns=runtime_overrides["max_turns"], + max_turns=stage_max_turns, extra_skill_dirs=runtime_overrides["extra_skill_dirs"], system_prompt_append=runtime_overrides["system_prompt_append"], ) @@ -905,6 +933,7 @@ async def execute_stage( "agent_role": stage.agent_role, "temperature": runtime_overrides.get("temperature"), "max_tokens": runtime_overrides.get("max_tokens"), + "max_turns": stage_max_turns, "attempt": attempt + 1, "timeout_seconds": settings.WORKER_STAGE_TIMEOUT, }, @@ -978,7 +1007,7 @@ async def 
execute_stage( model=runtime_overrides["model"], temperature=runtime_overrides["temperature"], max_tokens=runtime_overrides["max_tokens"], - max_turns=runtime_overrides["max_turns"], + max_turns=stage_max_turns, extra_skill_dirs=runtime_overrides["extra_skill_dirs"], system_prompt_append=runtime_overrides["system_prompt_append"], ) @@ -1279,8 +1308,7 @@ async def execute_stage_sandboxed( from app.worker.agents import _get_skill_dirs skill_dirs = [f"/skills/{d.name}" for d in _get_skill_dirs(stage.agent_role)] - max_turns_map = {"spec": 20, "coding": 20, "doc": 20, "test": 20} - max_turns = max_turns_map.get(stage.agent_role, 10) + max_turns = _resolve_stage_max_turns(stage.agent_role, runtime_overrides["max_turns"]) # 5. Log the request via shared pipeline contract pipeline = get_task_log_pipeline() @@ -1301,6 +1329,7 @@ async def execute_stage_sandboxed( "model": resolved_model, "temperature": runtime_overrides.get("temperature"), "max_tokens": runtime_overrides.get("max_tokens"), + "max_turns": max_turns, "stage": stage.stage_name, "agent_role": stage.agent_role, "prompt": user_prompt, diff --git a/platform/app/worker/prompts.py b/platform/app/worker/prompts.py index defd21c..851f5f6 100644 --- a/platform/app/worker/prompts.py +++ b/platform/app/worker/prompts.py @@ -131,14 +131,17 @@ "code": ( "只完成当前阶段,不要提前执行后续阶段任务。\n" "不要为了理解整个仓库而广泛探索,优先基于已知信息直接实现。\n" - "只有在缺少关键实现信息时才少量补读文件;你可以为了验证实现而运行必要命令," - "但目标必须是最小必要验证。\n" + "只有在缺少关键实现信息时才少量补读文件;最多再检查 3 个关键文件或执行 1 次探索性目录命令," + "之后必须开始修改代码。\n" + "你可以为了验证实现而运行必要命令,但目标必须是最小必要验证。\n" "不要提前生成最终签收/验收报告,也不要调用 signoff、review、smoke、e2e-test 等后续阶段能力。\n" "完成实现并简要总结本阶段改动后结束。" ), "test": ( "只完成当前阶段,不要提前执行后续阶段任务。\n" "请聚焦当前任务直接相关的自动化测试与验证,优先最小、最相关、最快的验证路径。\n" + "最多再补读 2 个关键文件、执行 2 条验证命令;超过后必须停止扩展并给出结论。\n" + "如果验证命令失败,必须明确给出失败命令、关键报错和阻塞点;不要只根据代码阅读就判定测试通过。\n" "如果相关测试已经通过,且已满足验收标准,请立即停止。\n" "不要继续扩展额外类型的测试,例如 E2E、冒烟、性能或签收报告,除非任务明确要求。" ), diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py 
index 125e76c..b34a9f4 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -466,7 +466,7 @@ def _capture_runner( assert captured_params['model'] == 'gpt-5.1-codex-mini' assert captured_params['temperature_override'] == 0.2 assert captured_params['max_tokens_override'] == 1200 - assert captured_params['max_turns'] == 18 + assert captured_params['max_turns'] == 6 assert captured_params['extra_skill_dirs'] == ['/tmp/skills'] assert captured_params['system_prompt_append'] == 'extra prompt' assert captured_params['temperature'] == 0.2 @@ -615,7 +615,7 @@ async def test_handle_continuations_uses_coding_specific_prompt(): assert total_tokens == 11 assert output == 'done' assert runner.prompts == [ - '请停止继续广泛探索,基于已知信息直接补全代码修改。' + '请停止继续广泛探索。基于已知信息直接修改代码;如果仍缺信息,只允许再查看 1 个最关键文件,然后必须完成修改并给出最小验证结果。' ] @@ -634,10 +634,21 @@ async def test_handle_continuations_uses_test_specific_prompt(): assert total_tokens == 11 assert output == 'done' assert runner.prompts == [ - '请停止扩展测试范围,直接执行最小、最相关的验证并给出结果。' + '请停止扩展测试范围。只做最小、最相关的验证;如果验证命令失败,必须直接给出失败命令、关键报错和唯一阻塞点,不要再用代码阅读代替测试结论。' ] +def test_resolve_stage_max_turns_caps_coding_and_test(): + assert executor._resolve_stage_max_turns('coding', None) == 6 + assert executor._resolve_stage_max_turns('coding', 18) == 6 + assert executor._resolve_stage_max_turns('test', 12) == 6 + + +def test_resolve_stage_max_turns_preserves_other_roles(): + assert executor._resolve_stage_max_turns('doc', None) == 10 + assert executor._resolve_stage_max_turns('doc', 18) == 18 + + @pytest.mark.asyncio async def test_handle_continuations_uses_generic_prompt_for_other_stage(): runner = _ContinuationRunner() diff --git a/platform/tests/test_prompts.py b/platform/tests/test_prompts.py index e0a16e4..45504bb 100644 --- a/platform/tests/test_prompts.py +++ b/platform/tests/test_prompts.py @@ -37,6 +37,7 @@ def test_code_guardrail_emphasizes_convergence(): result = build_user_prompt(ctx) assert "不要为了理解整个仓库而广泛探索" in 
result assert "最小必要验证" in result + assert "最多再检查 3 个关键文件" in result def test_test_guardrail_emphasizes_minimal_validation(): @@ -44,6 +45,8 @@ def test_test_guardrail_emphasizes_minimal_validation(): result = build_user_prompt(ctx) assert "最小、最相关、最快的验证路径" in result assert "满足验收标准" in result + assert "执行 2 条验证命令" in result + assert "不要只根据代码阅读就判定测试通过" in result # --------------------------------------------------------------------------- From befdeddbdd16e40339ecf2ade77abb4f9487269a Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 17:39:32 +0800 Subject: [PATCH 20/33] docs: plan exploration budget convergence --- ...9-exploration-budget-convergence-design.md | 133 ++++++++++++++++++ ...ation-budget-convergence-implementation.md | 103 ++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 docs/plans/2026-03-19-exploration-budget-convergence-design.md create mode 100644 docs/plans/2026-03-19-exploration-budget-convergence-implementation.md diff --git a/docs/plans/2026-03-19-exploration-budget-convergence-design.md b/docs/plans/2026-03-19-exploration-budget-convergence-design.md new file mode 100644 index 0000000..f465442 --- /dev/null +++ b/docs/plans/2026-03-19-exploration-budget-convergence-design.md @@ -0,0 +1,133 @@ +# Exploration Budget Convergence Design + +## Background + +Recent live validation on VM shows that tightening `coding` and `test` stage guardrails plus lowering `max_turns` improved behavior, but did not remove the root failure mode: + +- `coding` still spends most turns on repository exploration (`read/find/ls/cat`) before acting. +- `test` can still drift into explanation-only output after verification failures. +- `max_turns` now bounds damage, but it does not prevent the turns from being spent on the wrong behavior. + +The goal of this design is to add a stronger convergence mechanism with minimal chain changes. 
We will keep the existing stage model, sandbox model, and AgentRunner integration intact, and only strengthen the executor behavior for `coding` and `test`. + +## Goals + +- Reduce wasted `coding` turns spent on exploration before code edits. +- Reduce `test` drift after failed validation commands. +- Preserve the current task/stage architecture and runtime interfaces. +- Apply consistently to host execution and sandbox execution. + +## Non-Goals + +- No new stages such as `explore` or `verify`. +- No AgentRunner API changes. +- No sandbox protocol changes. +- No model-routing redesign. + +## Recommended Approach + +Use executor-level exploration budgets with a single forced-convergence prompt. + +### Why this approach + +This is the smallest effective change that targets the actual failure mode. The system already captures stage events and tool-call lifecycle data inside the executor. Instead of only changing prompts or reducing `max_turns`, we can observe repeated exploration behavior and intervene once, at the executor layer, before the stage fully degenerates. + +This keeps the architecture stable while giving the runtime one stronger lever than prompt text alone. + +## Alternatives Considered + +### 1. Lower `max_turns` further + +This is easy, but it does not solve the root issue. The model can still waste the smaller budget on exploration and then get truncated earlier. + +### 2. Split `coding` and `test` into sub-phases + +This would likely be more effective, but it changes stage orchestration and increases behavioral complexity. It is larger than needed for the immediate problem. + +### 3. Add tool-level hard blocking in AgentRunner + +This could be very strong, but it requires changes below the executor boundary and is not the minimal-path solution. + +## Design + +### 1. 
Exploration budget tracking + +Inside the executor, track lightweight exploration signals for `coding` and `test`: + +- read-like tool usage +- directory/list/search style commands +- repeated tool-only turns without implementation or verification progress + +The budget should stay intentionally simple and heuristic-based. The goal is not perfect classification. The goal is to catch obvious drift. + +Suggested first-pass behavior: + +- `coding`: trigger after too many exploration actions in the same stage before meaningful implementation progress +- `test`: trigger after too many exploration actions, or after failed validation attempts followed by further drift + +### 2. Forced-convergence prompt + +When the budget is exceeded, inject one explicit recovery prompt. + +For `coding`, the prompt should require: + +- stop exploring +- directly modify files +- run only minimal verification +- if still blocked, report the single blocker clearly + +For `test`, the prompt should require: + +- stop expanding the test surface +- give the smallest relevant validation result +- if a command failed, report the failed command, key error, and blocker +- do not declare success based only on code inspection + +This should happen once per stage execution, not repeatedly. + +### 3. No repeated soft looping + +After a forced-convergence prompt has been issued, the executor should not continue to allow the same stage to loop through broad exploration again. The intention is: + +- one normal execution window +- one forced convergence recovery chance +- then end based on the resulting output or failure + +This avoids replacing one loop with another. + +### 4. Minimal runtime surface area + +The behavior should live in `executor.py` so it applies uniformly to: + +- host/in-process task execution +- sandboxed task execution + +The prompt texts may remain in executor helper functions rather than introducing a larger new prompt framework. 
+ +## Error Handling + +If budget tracking cannot confidently classify a tool action, it should ignore it rather than overreact. + +If the forced-convergence prompt itself fails, the stage should continue to use the current lifecycle behavior and surface the latest failure normally. The new logic should not hide existing error messages. + +## Testing Strategy + +Add focused executor tests that cover: + +- exploration budget exceeded in `coding` triggers the forced-convergence path +- exploration budget exceeded in `test` triggers the correct test-specific prompt +- failed test-command flow requires blocker-style follow-up rather than success-style summary +- only one forced-convergence injection happens per stage +- non-target roles are unaffected + +## Success Criteria + +Live task behavior should improve in these ways: + +- fewer `llm_turn_sent` / `tool_call_executed` events before code changes or validation +- fewer `Max turns reached` events in `coding` and `test` +- fewer `test` outputs that claim success after command failure without citing blockers + +## Rollout Notes + +This should ship behind existing behavior with no config migration. If needed, thresholds can remain hard-coded for the first version and be externalized later only if real usage shows that tuning is necessary. diff --git a/docs/plans/2026-03-19-exploration-budget-convergence-implementation.md b/docs/plans/2026-03-19-exploration-budget-convergence-implementation.md new file mode 100644 index 0000000..9de4975 --- /dev/null +++ b/docs/plans/2026-03-19-exploration-budget-convergence-implementation.md @@ -0,0 +1,103 @@ +# Exploration Budget Convergence Implementation Plan + +## Objective + +Implement executor-level exploration-budget convergence for `coding` and `test` stages, with one forced-convergence recovery prompt and targeted regression coverage. + +## Scope + +- Modify executor behavior only. +- Add or update tests for executor behavior. 
+- Do not alter stage orchestration, sandbox APIs, or AgentRunner interfaces. + +## Planned Changes + +### 1. Add lightweight stage exploration state + +Update [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) to maintain per-stage exploration state during execution. + +Planned state: + +- exploration action counter +- test-command failure flag or summary +- whether forced convergence was already injected + +### 2. Define budget heuristics + +Add small helper functions in [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) to classify tool behavior and decide whether the stage has exceeded its budget. + +Initial heuristics should stay simple: + +- `coding`: repeated read/search/list behavior without implementation progress +- `test`: repeated read/search/list behavior, or failure of validation commands followed by continued drift + +### 3. Inject one forced-convergence prompt + +Extend the stage execution loop in [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) so that when the budget is exceeded: + +- a stage-specific forced prompt is sent once +- the prompt differs from the existing generic continuation prompt +- subsequent looping does not repeatedly inject the same recovery prompt + +### 4. Keep current completion/failure semantics + +Preserve existing lifecycle behavior: + +- normal successful stage completion still goes through existing summary/finalization +- existing error handling and retry/fallback behavior remains intact +- no new persisted schema changes + +### 5. 
Add regression tests + +Update [test_executor_stage_logs.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py) to cover: + +- coding exploration budget breach +- test exploration budget breach +- failed test command followed by forced blocker-style convergence +- one-time forced-convergence injection +- unaffected behavior for other roles + +If needed, add prompt-text expectation coverage to [test_prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py) only when shared prompt helpers are updated. + +## Verification + +Run: + +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_prompts.py -q +``` + +If implementation reaches live validation, use a cloned VM task similar to the previous `helloworld` runs and compare: + +- `llm_turn_sent` +- `tool_call_executed` +- `Max turns reached` + +for `code` and `test` stages. + +## Risks + +- Heuristics may be too aggressive and cut off valid exploration in legitimate tasks. +- Heuristics may be too weak and not materially improve live behavior. +- Tool classification may miss edge cases where an `execute` command is exploratory vs. truly validating. + +## Mitigations + +- Keep the first-pass thresholds conservative. +- Inject one recovery prompt before failing or finishing, instead of immediately aborting. +- Limit the initial implementation to `coding` and `test` only. + +## Rollback + +Revert the executor helpers and test changes in: + +- [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) +- [test_executor_stage_logs.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py) + +## Exit Criteria + +- Targeted tests pass. +- The new logic is isolated to executor-level behavior. +- A subsequent live validation can reasonably show lower exploration churn than the current capped-`max_turns` behavior. 
From c45e48ce352e464e488d9b5570b21ba6e85c79d1 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 17:42:52 +0800 Subject: [PATCH 21/33] fix(worker): force convergence after exploration drift --- platform/app/worker/executor.py | 162 +++++++++++++++++++++ platform/tests/test_executor_stage_logs.py | 90 ++++++++++++ 2 files changed, 252 insertions(+) diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index e74f148..5ae84d9 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -257,6 +257,48 @@ def _clear_current_task_cancellation_state() -> None: current.uncancel() +_EXPLORATION_EXECUTE_PREFIXES = ( + "ls ", + "find ", + "pwd", + "cat ", + "head ", + "tail ", + "tree", + "rg ", +) +_VERIFICATION_EXECUTE_MARKERS = ( + "pytest", + "unittest", + "gradlew test", + "gradlew build", + "gradlew testclasses", + "mvn test", + "npm test", + "pnpm test", + "yarn test", + "go test", + "cargo test", +) + + +def _classify_tool_activity(tool_name: str, args: dict[str, Any]) -> str: + normalized_tool = (tool_name or "").strip().lower() + if normalized_tool in {"write", "edit"}: + return "implementation" + if normalized_tool == "read": + return "exploration" + + if normalized_tool in {"execute", "execute_script"}: + command = str(args.get("command") or "").strip().lower() + if any(marker in command for marker in _VERIFICATION_EXECUTE_MARKERS): + return "verification" + if any(command.startswith(prefix) for prefix in _EXPLORATION_EXECUTE_PREFIXES): + return "exploration" + + return "other" + + # --------------------------------------------------------------------------- # StageEventTracker – encapsulates mutable tracking state and event helpers # --------------------------------------------------------------------------- @@ -286,6 +328,12 @@ def __init__( self._instrumented_runners: list[Any] = [] self._instrumented_runner_ids: set[int] = set() self._completed_tool_runs: list[dict[str, str]] = [] + 
self._exploration_actions = 0 + self._implementation_actions = 0 + self._verification_attempts = 0 + self._verification_failures = 0 + self._successful_verifications = 0 + self._forced_convergence_used = False # -- public emit helpers -------------------------------------------------- @@ -372,6 +420,37 @@ async def emit_chat_received( def get_completed_tool_runs(self) -> list[dict[str, str]]: return list(self._completed_tool_runs) + def should_force_convergence(self) -> bool: + if self._forced_convergence_used: + return False + + normalized = self.stage_name.strip().lower() + if normalized == "code": + return self._implementation_actions == 0 and self._exploration_actions >= 4 + if normalized == "test": + if self._verification_failures > 0 and self._successful_verifications == 0: + return True + return self._verification_attempts == 0 and self._exploration_actions >= 3 + return False + + def mark_forced_convergence_used(self) -> None: + self._forced_convergence_used = True + + def record_tool_activity(self, tool_name: str, args: dict[str, Any], status: str) -> None: + activity = _classify_tool_activity(tool_name, args) + if activity == "exploration": + self._exploration_actions += 1 + return + if activity == "implementation": + self._implementation_actions += 1 + return + if activity == "verification": + self._verification_attempts += 1 + if status == "success": + self._successful_verifications += 1 + else: + self._verification_failures += 1 + # -- runner event registration ------------------------------------------- def register_runner_events(self, current_runner: Any) -> None: @@ -515,6 +594,7 @@ async def _on_after_tool_result(event: Any) -> None: args = {} output = str(getattr(event, "result", "")) status = infer_tool_status(output) + tracker.record_tool_activity(tool_name, args, status) run_info = tracker._tool_runs.get(tool_call_id) duration_ms: Optional[float] = None @@ -691,6 +771,87 @@ def _build_continuation_prompt(stage_name: str | None) -> str: return 
"请继续完成上面的输出,从你停下的地方继续。" +def _build_forced_convergence_prompt(stage_name: str | None) -> str: + normalized = (stage_name or "").strip().lower() + if normalized == "code": + return ( + "你已经在当前阶段花了过多轮次进行探索。现在禁止继续浏览仓库。" + "请直接做最小代码修改,并只执行最小必要验证。" + "如果仍然无法完成,请只输出唯一阻塞点和证据。" + ) + if normalized == "test": + return ( + "你已经在当前阶段花了过多轮次进行探索。现在禁止继续扩展测试范围。" + "请直接执行最小、最相关的验证。" + "如果验证命令失败,必须明确给出失败命令、关键报错和唯一阻塞点;" + "不要仅凭代码阅读判断测试通过。" + ) + return "请立即收敛到当前阶段的最终结果,不要继续扩展。" + + +async def _run_forced_convergence( + runner: Any, + output: str, + runtime_overrides: dict[str, Any], + tracker: StageEventTracker, + stage_name: str | None = None, +) -> str: + if not tracker.should_force_convergence(): + return output + + tracker.mark_forced_convergence_used() + prompt = _build_forced_convergence_prompt(stage_name or tracker.stage_name) + chat_started = time.monotonic() + chat_correlation = await tracker.emit_chat_sent( + request_body={ + "prompt": prompt, + "model": getattr(getattr(runner, "config", None), "model", None), + "stage": tracker.stage_name, + "agent_role": tracker.agent_role, + "temperature": runtime_overrides.get("temperature"), + "max_tokens": runtime_overrides.get("max_tokens"), + "forced_convergence": True, + "timeout_seconds": settings.WORKER_STAGE_TIMEOUT, + }, + ) + + try: + followup_kwargs = _chat_kwargs_for_runner(runner, runtime_overrides) + response = await asyncio.wait_for( + runner.chat(prompt, reset=False, **followup_kwargs), + timeout=settings.WORKER_STAGE_TIMEOUT, + ) + forced_text = response.text_content or "" + await tracker.emit_chat_received( + chat_correlation, + status="success", + response_body={"forced_convergence": True, "content": forced_text}, + duration_ms=round((time.monotonic() - chat_started) * 1000, 2), + ) + cleaned = output.replace( + "[Max turns reached. 
Please continue the conversation.]", + "", + ).strip() + return f"{cleaned}\n\n{forced_text}".strip() if cleaned else forced_text + except asyncio.CancelledError: + _clear_current_task_cancellation_state() + await tracker.emit_chat_received( + chat_correlation, + status="cancelled", + response_body={"forced_convergence": True, "error": "cancelled"}, + duration_ms=round((time.monotonic() - chat_started) * 1000, 2), + ) + raise + except Exception as exc: + await tracker.emit_chat_received( + chat_correlation, + status="failed", + response_body={"forced_convergence": True, "error": str(exc)}, + duration_ms=round((time.monotonic() - chat_started) * 1000, 2), + ) + return output + + async def _handle_continuations( runner: Any, output: str, @@ -702,6 +863,7 @@ async def _handle_continuations( _MAX_CONTINUATIONS = 3 _TRUNCATION_SENTINEL = "Max turns reached" continuations = 0 + output = await _run_forced_convergence(runner, output, runtime_overrides, tracker, stage_name) while _TRUNCATION_SENTINEL in (output or "") and continuations < _MAX_CONTINUATIONS: continuations += 1 diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index b34a9f4..148f088 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -154,6 +154,12 @@ def __init__(self, stage_name: str, agent_role: str = 'coding') -> None: self.agent_role = agent_role self.sent: list[dict[str, object]] = [] self.received: list[dict[str, object]] = [] + self._forced_convergence_used = False + self._implementation_actions = 0 + self._exploration_actions = 0 + self._verification_failures = 0 + self._successful_verifications = 0 + self._verification_attempts = 0 async def emit_chat_sent(self, **kwargs): self.sent.append(kwargs) @@ -163,6 +169,20 @@ async def emit_chat_received(self, *args, **kwargs): self.received.append({"args": args, "kwargs": kwargs}) return True + def should_force_convergence(self) -> bool: + if 
self._forced_convergence_used: + return False + if self.stage_name == 'code': + return self._implementation_actions == 0 and self._exploration_actions >= 4 + if self.stage_name == 'test': + if self._verification_failures > 0 and self._successful_verifications == 0: + return True + return self._verification_attempts == 0 and self._exploration_actions >= 3 + return False + + def mark_forced_convergence_used(self) -> None: + self._forced_convergence_used = True + class _CancelledRunner(_FakeRunner): async def chat(self, _prompt: str, reset: bool = True, **_: object): @@ -231,6 +251,36 @@ def test_is_tool_call_error_matches_gemini_thought_signature_error(): assert executor._is_tool_call_error(err) is True +def test_classify_tool_activity_marks_exploration_implementation_and_verification(): + assert executor._classify_tool_activity('read', {}) == 'exploration' + assert executor._classify_tool_activity('edit', {}) == 'implementation' + assert executor._classify_tool_activity('execute', {'command': 'find src -name "*.java"'}) == 'exploration' + assert executor._classify_tool_activity('execute', {'command': './gradlew test'}) == 'verification' + + +def test_stage_event_tracker_force_convergence_budget_for_code_and_test(): + tracker = executor.StageEventTracker( + pipeline=_FakePipeline(), + task_id='task-1', + stage_id='stage-1', + stage_name='code', + agent_role='coding', + ) + for _ in range(4): + tracker.record_tool_activity('read', {}, 'success') + assert tracker.should_force_convergence() is True + + test_tracker = executor.StageEventTracker( + pipeline=_FakePipeline(), + task_id='task-2', + stage_id='stage-2', + stage_name='test', + agent_role='test', + ) + test_tracker.record_tool_activity('execute', {'command': './gradlew test'}, 'failed') + assert test_tracker.should_force_convergence() is True + + @pytest.mark.asyncio async def test_execute_stage_falls_back_to_text_only_on_thought_signature_error(monkeypatch): session = SimpleNamespace(commit=AsyncMock()) @@ 
-638,6 +688,46 @@ async def test_handle_continuations_uses_test_specific_prompt(): ] +@pytest.mark.asyncio +async def test_handle_continuations_injects_forced_convergence_for_coding_budget(): + runner = _ContinuationRunner() + tracker = _ContinuationTracker(stage_name='code', agent_role='coding') + tracker._exploration_actions = 4 + + output, total_tokens = await executor._handle_continuations( + runner, + 'partial summary', + {}, + tracker, + ) + + assert total_tokens == 11 + assert output == 'partial summary\n\ndone' + assert runner.prompts == [ + '你已经在当前阶段花了过多轮次进行探索。现在禁止继续浏览仓库。请直接做最小代码修改,并只执行最小必要验证。如果仍然无法完成,请只输出唯一阻塞点和证据。' + ] + + +@pytest.mark.asyncio +async def test_handle_continuations_injects_forced_convergence_for_failed_test_verification(): + runner = _ContinuationRunner() + tracker = _ContinuationTracker(stage_name='test', agent_role='test') + tracker._verification_failures = 1 + + output, total_tokens = await executor._handle_continuations( + runner, + 'analysis only', + {}, + tracker, + ) + + assert total_tokens == 11 + assert output == 'analysis only\n\ndone' + assert runner.prompts == [ + '你已经在当前阶段花了过多轮次进行探索。现在禁止继续扩展测试范围。请直接执行最小、最相关的验证。如果验证命令失败,必须明确给出失败命令、关键报错和唯一阻塞点;不要仅凭代码阅读判断测试通过。' + ] + + def test_resolve_stage_max_turns_caps_coding_and_test(): assert executor._resolve_stage_max_turns('coding', None) == 6 assert executor._resolve_stage_max_turns('coding', 18) == 6 From 5611d6d15635107284418429b2a7e970af408fba Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:01:50 +0800 Subject: [PATCH 22/33] feat(web): surface continuation markers in react timeline --- web/src/components/ReActTimeline/index.tsx | 109 ++++++++++++++++++-- web/src/components/ReActTimeline/styles.css | 25 ++++- web/tests/ReActTimeline.test.ts | 80 ++++++++++++++ 3 files changed, 207 insertions(+), 7 deletions(-) create mode 100644 web/tests/ReActTimeline.test.ts diff --git a/web/src/components/ReActTimeline/index.tsx b/web/src/components/ReActTimeline/index.tsx 
index e976c22..8609489 100644 --- a/web/src/components/ReActTimeline/index.tsx +++ b/web/src/components/ReActTimeline/index.tsx @@ -19,6 +19,11 @@ import './styles.css'; const { Text, Paragraph } = Typography; +export interface TurnBadge { + label: string; + color: string; +} + interface ReActTurn { id: string; turnNumber: number; @@ -34,6 +39,8 @@ interface ReActViewProps { loading?: boolean; } +const MAX_TURNS_SENTINEL = '[Max turns reached. Please continue the conversation.]'; + function getLogContent(log?: TaskLogEvent): string { if (!log || !log.response_body) return ''; const raw = (log.response_body as Record).content; @@ -56,6 +63,64 @@ function getLogContent(log?: TaskLogEvent): string { return ''; } +function getRecordValue(record: Record | null | undefined, key: string): unknown { + if (!record) return undefined; + return record[key]; +} + +function getContinuationNumber(log?: TaskLogEvent): number | null { + const requestValue = getRecordValue(log?.request_body, 'continuation'); + const responseValue = getRecordValue(log?.response_body, 'continuation'); + const candidate = requestValue ?? 
responseValue; + if (typeof candidate === 'number' && Number.isFinite(candidate)) return candidate; + if (typeof candidate === 'string' && candidate.trim()) { + const parsed = Number(candidate); + if (Number.isFinite(parsed)) return parsed; + } + return null; +} + +function hasForcedConvergence(log?: TaskLogEvent): boolean { + return Boolean(getRecordValue(log?.request_body, 'forced_convergence') || getRecordValue(log?.response_body, 'forced_convergence')); +} + +export function getTurnBadges(log?: TaskLogEvent): TurnBadge[] { + if (!log) return []; + + const badges: TurnBadge[] = []; + const continuation = getContinuationNumber(log); + if (continuation != null) { + badges.push({ label: `Continuation #${continuation}`, color: 'blue' }); + } + + if (hasForcedConvergence(log)) { + badges.push({ label: 'Forced Convergence', color: 'gold' }); + } + + return badges; +} + +export function stripMaxTurnsSentinel(text: string): { text: string; truncated: boolean } { + if (!text.includes(MAX_TURNS_SENTINEL)) { + return { text, truncated: false }; + } + + return { + text: text.replace(MAX_TURNS_SENTINEL, '\n').replace(/\n{2,}/g, '\n').trim(), + truncated: true, + }; +} + +export function getThoughtDisplay(log?: TaskLogEvent): { text: string; truncated: boolean; badges: TurnBadge[] } { + const content = getLogContent(log); + const { text, truncated } = stripMaxTurnsSentinel(content); + return { + text, + truncated, + badges: getTurnBadges(log), + }; +} + function parseReActTurns(logs: TaskLogEvent[]): ReActTurn[] { const sorted = [...logs].sort((a, b) => a.event_seq - b.event_seq); const turnsMap = new Map(); @@ -100,7 +165,21 @@ function parseReActTurns(logs: TaskLogEvent[]): ReActTurn[] { return Array.from(turnsMap.values()); } -const ExpandablePromptBlock: React.FC<{ content: string; title: string; maxHeight?: number }> = ({ content, title, maxHeight = 150 }) => { +const BadgeRow: React.FC<{ badges: TurnBadge[] }> = ({ badges }) => { + if (badges.length === 0) return null; 
+ + return ( +
+ {badges.map((badge) => ( + + {badge.label} + + ))} +
+ ); +}; + +const ExpandablePromptBlock: React.FC<{ content: string; title: string; badges?: TurnBadge[]; maxHeight?: number }> = ({ content, title, badges = [], maxHeight = 150 }) => { const [expanded, setExpanded] = useState(false); const [isOverflowing, setIsOverflowing] = useState(false); const contentRef = useRef(null); @@ -127,6 +206,8 @@ const ExpandablePromptBlock: React.FC<{ content: string; title: string; maxHeigh )} + +
= ({ thoughtText, durationMs, defaultExpanded = false }) => { +}> = ({ thoughtText, durationMs, badges = [], truncated = false, defaultExpanded = false }) => { const [expanded, setExpanded] = useState(defaultExpanded); // In some cases duration_ms might be tiny or null; default to <1s or omit @@ -164,6 +247,13 @@ const CursorStyleThoughtBlock: React.FC<{ {expanded ? : } {label}
+ + {truncated && ( +
+ Max turns reached + 系统已截断当前轮次并请求继续,下面展示的是后续收敛输出。 +
+ )} {expanded && (
{thoughtText} @@ -214,11 +304,13 @@ export const ReActTimeline: React.FC = ({ logs, loading }) => { const isRunning = turn.thought?.status === 'running' || turn.action?.status === 'running' || turn.observation?.status === 'running'; const promptContent = turn.prompt?.request_body?.prompt as string; + const promptBadges = getTurnBadges(turn.prompt); const isLLMRunning = turn.thought_sent?.status === 'running' && !turn.thought; const streamLogId = turn.thought_sent?.id; - let thoughtText = getLogContent(turn.thought); + const thoughtDisplay = getThoughtDisplay(turn.thought); + let thoughtText = thoughtDisplay.text; if (isLLMRunning && streamLogId) { const streamLines = linesByLog[streamLogId]; if (streamLines && streamLines.length > 0) { @@ -226,6 +318,9 @@ export const ReActTimeline: React.FC = ({ logs, loading }) => { } } + const thoughtBadges = getTurnBadges(turn.thought); + const thoughtTruncated = thoughtDisplay.truncated || thoughtText.includes(MAX_TURNS_SENTINEL); + if (thoughtText && thoughtText.includes('')) { const match = thoughtText.match(/([\s\S]*?)(?:<\/thought>|$)/); if (match) { @@ -234,7 +329,7 @@ export const ReActTimeline: React.FC = ({ logs, loading }) => { } // AI Response Segment needs to show up if it's currently running, even if text is empty yet - const hasAIActivity = thoughtText || turn.action?.command || isLLMRunning; + const hasAIActivity = thoughtText || thoughtTruncated || thoughtBadges.length > 0 || turn.action?.command || isLLMRunning; const actionCommand = turn.action?.command || turn.observation?.command; const actionArgs = turn.action?.command_args || turn.observation?.command_args; @@ -252,7 +347,7 @@ export const ReActTimeline: React.FC = ({ logs, loading }) => {
} className="message-avatar" style={{ backgroundColor: '#87d068' }} />
- +
)} @@ -265,10 +360,12 @@ export const ReActTimeline: React.FC = ({ logs, loading }) => { Silicon Agent {/* Thought formatted with Cursor-style collapse */} - {thoughtText && ( + {(thoughtText || thoughtTruncated || thoughtBadges.length > 0) && ( )} diff --git a/web/src/components/ReActTimeline/styles.css b/web/src/components/ReActTimeline/styles.css index 4f61435..74981b5 100644 --- a/web/src/components/ReActTimeline/styles.css +++ b/web/src/components/ReActTimeline/styles.css @@ -31,6 +31,29 @@ color: #1f1f1f; } +.react-gemini-badge-row { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-bottom: 10px; +} + +.react-gemini-badge { + margin-inline-end: 0 !important; +} + +.react-gemini-truncation-note { + display: flex; + align-items: center; + gap: 8px; + margin: 6px 0 10px; + padding: 8px 10px; + border-radius: 8px; + background: #fafafa; + border: 1px dashed #d9d9d9; + color: #595959; +} + .message-bubble { border-radius: 8px; padding: 12px 16px; @@ -189,4 +212,4 @@ padding-left: 14px; border-left: 2px solid #e8e8e8; margin-left: 5px; -} \ No newline at end of file +} diff --git a/web/tests/ReActTimeline.test.ts b/web/tests/ReActTimeline.test.ts new file mode 100644 index 0000000..3beea40 --- /dev/null +++ b/web/tests/ReActTimeline.test.ts @@ -0,0 +1,80 @@ +import { describe, expect, it } from 'vitest'; +import type { TaskLogEvent } from '@/services/taskLogApi'; +import { getThoughtDisplay, getTurnBadges, stripMaxTurnsSentinel } from '@/components/ReActTimeline'; + +function makeLog(overrides: Partial): TaskLogEvent { + return { + id: 'log-1', + task_id: 'task-1', + stage_id: 'stage-1', + stage_name: 'coding', + agent_role: 'coding', + correlation_id: 'chat-1', + event_seq: 1, + event_type: 'agent_runner_chat_received', + event_source: 'llm', + status: 'success', + request_body: null, + response_body: null, + command: null, + command_args: null, + workspace: null, + execution_mode: null, + duration_ms: null, + result: null, + output_summary: null, + 
output_truncated: false, + missing_fields: [], + created_at: '2026-03-19T00:00:00.000Z', + ...overrides, + }; +} + +describe('ReActTimeline helper transforms', () => { + it('extracts continuation and forced convergence badges from task logs', () => { + const log = makeLog({ + request_body: { continuation: 2, forced_convergence: true }, + }); + + expect(getTurnBadges(log)).toEqual([ + { label: 'Continuation #2', color: 'blue' }, + { label: 'Forced Convergence', color: 'gold' }, + ]); + }); + + it('prefers response metadata when request body is absent', () => { + const log = makeLog({ + response_body: { continuation: '1', forced_convergence: true }, + }); + + expect(getTurnBadges(log)).toEqual([ + { label: 'Continuation #1', color: 'blue' }, + { label: 'Forced Convergence', color: 'gold' }, + ]); + }); + + it('strips the max-turn sentinel while preserving surrounding text', () => { + expect( + stripMaxTurnsSentinel( + 'alpha\n[Max turns reached. Please continue the conversation.]\nbeta' + ) + ).toEqual({ + text: 'alpha\nbeta', + truncated: true, + }); + }); + + it('treats sentinel-only thoughts as truncated system notes', () => { + const thought = getThoughtDisplay( + makeLog({ + response_body: { content: '[Max turns reached. 
Please continue the conversation.]' }, + }) + ); + + expect(thought).toEqual({ + text: '', + truncated: true, + badges: [], + }); + }); +}); From 499145f263f34d2cee5b8ab4c589acee025b469b Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:08:18 +0800 Subject: [PATCH 23/33] docs: plan stage reset and preflight optimization --- ...-03-19-stage-reset-and-preflight-design.md | 188 +++++++++++ ...tage-reset-and-preflight-implementation.md | 314 ++++++++++++++++++ 2 files changed, 502 insertions(+) create mode 100644 docs/plans/2026-03-19-stage-reset-and-preflight-design.md create mode 100644 docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md diff --git a/docs/plans/2026-03-19-stage-reset-and-preflight-design.md b/docs/plans/2026-03-19-stage-reset-and-preflight-design.md new file mode 100644 index 0000000..0168430 --- /dev/null +++ b/docs/plans/2026-03-19-stage-reset-and-preflight-design.md @@ -0,0 +1,188 @@ +# Stage Reset And Preflight Design + +## Background + +Recent task log analysis shows that extreme token usage in `coding` and `test` is dominated by repeated multi-turn ReAct loops rather than any one oversized file. + +The current cost pattern comes from three factors compounding together: + +1. A large fixed prompt base per stage: + - system prompt + - role skill directories + - repo context + - project memory +2. Repeated `runner.chat(..., reset=False)` continuation within the same stage. +3. Repeated exploration turns (`ls`, `find`, `read`, lightweight shell discovery) before implementation or validation. + +Even after recent turn-budget and convergence work, live runs still show the model spending too many turns on exploration, while each additional turn carries increasing historical context. 
+ +## Goal + +Reduce token cost and exploration churn in `coding` and `test` by implementing both: + +- stage-local rolling resets with compact checkpoints +- deterministic platform-side preflight scan summaries + +## Non-Goals + +- No task/stage graph redesign +- No provider-specific prompt caching as the primary fix +- No changes to task log API shape +- No model routing redesign + +## Why These Two Changes Together + +Either change alone helps, but together they address both sides of the problem: + +- rolling reset reduces repeated historical context +- preflight scan removes avoidable exploration turns + +This is the strongest cost/control improvement available without changing the overall stage model. + +## Recommended Design + +### 1. Rolling Stage Reset With Compact Checkpoints + +For `coding` and `test`, do not let one long-running stage conversation accumulate unbounded history. + +Instead, after a small number of exploration or tool rounds, or after truncation pressure is observed, the executor should: + +1. collect a compact checkpoint +2. restart the runner conversation with `reset=True` +3. continue from the checkpoint rather than from full raw history + +The checkpoint should contain only the minimum needed state: + +- current task objective +- current stage goal +- confirmed facts discovered so far +- files already changed +- latest meaningful tool results +- the immediate next required action + +This preserves continuity while preventing the stage from carrying every prior prompt, tool reply, and continuation through the entire run. + +### 2. Platform-Side Preflight Scan Summary + +Before `coding` and `test`, the platform should run a small deterministic repo scan and inject a compact summary into the stage prompt. + +This replaces a large portion of the model’s exploratory shell work. 
+ +For `coding`, preflight should gather things like: + +- key package / module roots +- likely implementation entrypoints +- existing controller / handler / service examples +- common response wrapper or domain model locations +- test framework presence +- build file hints + +For `test`, preflight should gather things like: + +- relevant existing test files +- framework and runner clues +- the most likely target test directories +- existing test pattern examples + +The preflight output should be short and structured, designed to replace multiple `find`, `ls`, and `read` rounds with one injected context block. + +### 3. Executor Ownership + +The changes should remain executor-driven so they apply consistently to: + +- host execution +- sandbox execution + +The stage executor should become responsible for: + +- deciding when a stage conversation has accumulated too much churn +- building the compact checkpoint +- restarting the stage chat cleanly + +The repo scan should be generated before stage execution and passed in as a small extra context block, similar to how repo context and project memory are already injected today. + +### 4. Minimal Context Surface + +This design should avoid adding yet another large context block. + +The preflight summary should therefore be: + +- tightly capped +- role-specific +- intentionally factual rather than verbose + +Likewise, checkpoint summaries should be much smaller than carrying the full multi-turn history forward. + +## Alternatives Considered + +### 1. Prompt Caching First + +This helps billing for repeated static prompt prefixes, but it does not solve the growing-history problem. It is still useful later, but it should not be the first or only fix. + +### 2. Lower `max_turns` Further + +This constrains runaway behavior, but it does not ensure that the remaining turns are spent effectively. + +### 3. Prompt-Only “Use Fewer Bash Calls” + +This is helpful as a guardrail, but not reliable enough as the main control. 
Deterministic preflight is more stable than asking the model to be disciplined. + +## Data Flow + +### Coding + +1. Engine prepares repo context and project memory as today. +2. New coding preflight scan runs and produces compact summary text. +3. Executor starts coding stage with the summary included. +4. If the stage begins to accumulate too many exploration/tool rounds, executor builds a checkpoint and re-enters with a fresh chat. +5. The stage proceeds from compressed current state rather than full historical turns. + +### Test + +1. Engine prepares existing compressed prior outputs as today. +2. New test preflight scan runs and provides framework/test-location summary. +3. Executor starts test stage with the summary included. +4. If test churn accumulates, executor rebuilds the stage from a compact checkpoint. +5. Validation continues from current facts instead of raw conversation history. + +## Error Handling + +If preflight scan fails: + +- stage should continue without it +- failure should be logged +- no task failure should occur solely because preflight was unavailable + +If rolling reset checkpoint generation fails: + +- stage should fall back to current behavior +- failure should be logged +- existing stage lifecycle semantics should remain intact + +## Testing Strategy + +Add focused coverage for: + +- coding preflight summary generation +- test preflight summary generation +- executor restart path after exploration churn +- checkpoint prompt includes only reduced current-state data +- non-target roles remain unchanged + +## Success Criteria + +Compared to current live baselines, successful improvement should show: + +- fewer exploration tool calls before implementation +- fewer continuation rounds +- materially lower total tokens in `coding` and `test` +- fewer repeated `Max turns reached` events + +## Rollout Order + +Recommended rollout sequence: + +1. implement preflight summary generation +2. implement rolling reset/checkpoint logic +3. 
validate on the same VM task family currently used for comparison +4. only after that, decide whether prompt caching is still worth prioritizing diff --git a/docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md b/docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md new file mode 100644 index 0000000..12ed084 --- /dev/null +++ b/docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md @@ -0,0 +1,314 @@ +# Stage Reset And Preflight Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Reduce `coding` and `test` token usage by combining deterministic preflight scan summaries with executor-driven rolling conversation resets. + +**Architecture:** Add small role-specific preflight summaries ahead of `coding` and `test`, then teach the executor to restart long-running stage conversations from compact checkpoints instead of carrying full multi-turn history forever. Keep the existing stage model and runtime entrypoints intact. + +**Tech Stack:** Python, FastAPI worker runtime, SQLAlchemy task pipeline, SkillKit AgentRunner, pytest + +--- + +### Task 1: Add Preflight Summary Builders + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` + +**Step 1: Write the failing test** + +Add tests for a helper that produces compact preflight text for `coding` and `test` from workspace facts. + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py -q +``` + +Expected: FAIL because the new preflight helper does not exist yet. 
+ +**Step 3: Write minimal implementation** + +Add helpers that: +- gather lightweight repo facts for `coding` / `test` +- cap output size aggressively +- degrade gracefully when data is missing + +Keep the first version simple and deterministic. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py -q +``` + +Expected: PASS for the new helper coverage. + +**Step 5: Commit** + +```bash +git add platform/app/worker/engine.py platform/app/worker/executor.py platform/tests/test_executor_stage_logs.py +git commit -m "feat(worker): add coding and test preflight summaries" +``` + +### Task 2: Inject Preflight Into Stage Execution + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` + +**Step 1: Write the failing test** + +Add tests showing that `coding` / `test` prompts include the preflight summary block when available. + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_prompts.py -q +``` + +Expected: FAIL because prompts do not yet include the new block. + +**Step 3: Write minimal implementation** + +Extend stage context and prompt assembly so the preflight summary is included only for the roles that need it, and is clearly labeled. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_prompts.py -q +``` + +Expected: PASS. 
+ +**Step 5: Commit** + +```bash +git add platform/app/worker/engine.py platform/app/worker/prompts.py platform/tests/test_prompts.py +git commit -m "feat(worker): inject role preflight summaries" +``` + +### Task 3: Add Rolling Reset Checkpoint Builder + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` + +**Step 1: Write the failing test** + +Add tests for a helper that turns current stage state into a compact restart checkpoint. + +Cover: +- task objective +- current stage goal +- recent tool digest +- next required action + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py -q +``` + +Expected: FAIL because checkpoint logic does not exist yet. + +**Step 3: Write minimal implementation** + +Build a compact textual checkpoint helper in the executor and keep it size-capped. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py -q +``` + +Expected: PASS. + +**Step 5: Commit** + +```bash +git add platform/app/worker/executor.py platform/tests/test_executor_stage_logs.py +git commit -m "feat(worker): build compact stage restart checkpoints" +``` + +### Task 4: Restart Long-Running Stage Conversations From Checkpoints + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` + +**Step 1: Write the failing test** + +Add tests showing that when churn thresholds are exceeded: +- executor restarts the stage from a checkpoint +- the follow-up chat uses `reset=True` +- repeated raw-history continuation is reduced + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd platform +. 
.venv/bin/activate +pytest tests/test_executor_stage_logs.py -q +``` + +Expected: FAIL because executor still only continues in-place. + +**Step 3: Write minimal implementation** + +Add restart logic for `coding` and `test` only: +- detect churn threshold +- build checkpoint +- restart with a fresh chat +- preserve existing lifecycle logging + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py -q +``` + +Expected: PASS. + +**Step 5: Commit** + +```bash +git add platform/app/worker/executor.py platform/tests/test_executor_stage_logs.py +git commit -m "fix(worker): reset stage chats from compact checkpoints" +``` + +### Task 5: End-to-End Regression Verification + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` + +**Step 1: Add missing regression coverage** + +Ensure tests cover: +- fallback behavior when preflight is unavailable +- non-target roles unaffected +- prompt size stays bounded +- forced convergence and rolling reset do not fight each other + +**Step 2: Run targeted regression suite** + +Run: +```bash +cd platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_prompts.py -q +``` + +Expected: PASS. + +**Step 3: Commit** + +```bash +git add platform/tests/test_executor_stage_logs.py platform/tests/test_prompts.py +git commit -m "test(worker): cover stage reset and preflight regressions" +``` + +### Task 6: Live Validation On VM + +**Files:** +- No local code changes required unless fixes are needed + +**Step 1: Deploy latest branch to VM** + +Pull latest code and restart backend in the current host-execution mode. 
+ +**Step 2: Clone known comparison task** + +Use: +```bash +POST /api/v1/tasks/339f8bd3-c5f2-4da5-8267-15a6ec3aaaa3/clone +``` + +**Step 3: Compare live metrics** + +Capture: +- `llm_turn_sent` +- `tool_call_executed` +- `Max turns reached` +- total tokens + +for `code` and `test`. + +**Step 4: Record comparison** + +Compare against the recent baselines already observed on VM and summarize whether: +- exploration rounds dropped +- token usage dropped +- repeated truncation dropped + +**Step 5: Commit only if code changed during validation** + +```bash +git add <changed-files> +git commit -m "fix(worker): adjust stage reset thresholds" +``` + +## Verification + +Primary local verification: + +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_prompts.py -q +``` + +Secondary live verification: + +- deploy current branch to VM +- clone the known `helloworld` task +- compare `code/test` stage metrics with earlier runs + +## Risks + +- Restart checkpoints may omit important context and hurt correctness. +- Preflight summaries may become too verbose and recreate the same token problem in a different form. +- Rolling reset may interact awkwardly with current continuation and forced-convergence behavior. + +## Mitigations + +- Keep checkpoint format deliberately compact and task-focused. +- Hard-cap preflight and checkpoint text. +- Limit the first version to `coding` and `test`. +- Add targeted tests around restart behavior. 
+ +## Rollback + +Rollback would revert changes in: + +- `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` +- `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` +- `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` +- `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` From 5f04a5f55f1d165f3376a0c6b39e698433b1e731 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:21:49 +0800 Subject: [PATCH 24/33] feat(worker): reset stage context after exploration drift --- platform/app/worker/engine.py | 137 ++++++++++ platform/app/worker/executor.py | 256 ++++++++++++++---- platform/app/worker/prompts.py | 4 + platform/sandbox/agent_server.py | 60 +++- platform/tests/test_engine_stage_execution.py | 60 ++++ platform/tests/test_executor_stage_logs.py | 58 ++++ platform/tests/test_prompts.py | 13 + platform/tests/test_sandbox_agent_server.py | 19 ++ platform/tests/test_worker.py | 42 ++- 9 files changed, 595 insertions(+), 54 deletions(-) diff --git a/platform/app/worker/engine.py b/platform/app/worker/engine.py index b322854..d74fefd 100644 --- a/platform/app/worker/engine.py +++ b/platform/app/worker/engine.py @@ -4,6 +4,7 @@ import asyncio import json import logging +import os import shutil import tempfile import time @@ -47,6 +48,24 @@ _running = False _task: Optional[asyncio.Task] = None +_PREFLIGHT_SKIP_DIRS = { + ".git", + ".hg", + ".svn", + "node_modules", + ".venv", + "venv", + "build", + "dist", + "target", + ".gradle", + ".idea", + "__pycache__", +} +_PREFLIGHT_MAX_FILES = 2000 +_PREFLIGHT_MAX_DEPTH = 6 +_PREFLIGHT_MAX_CHARS = 1200 + async def _safe_broadcast(event: str, data: dict) -> None: """Broadcast a WebSocket event, swallowing any errors.""" @@ -1702,6 +1721,8 @@ async def _execute_single_stage( except Exception: logger.warning("Failed to load 
memory for role %s", stage.agent_role, exc_info=True) + preflight_summary = _build_stage_preflight_summary(stage.stage_name, workspace_path) + # Build compressed prior context via sliding window # Phase 1.5: Cross-stage context recall — override compression for specified stages context_from = sdef.get("context_from") @@ -1816,6 +1837,7 @@ async def _execute_single_stage( compressed_outputs=compressed_prior if compressed_prior else None, project_memory=project_memory, repo_context=repo_context, + preflight_summary=preflight_summary, retry_context=retry_context, stage_model=stage_model, custom_instruction=custom_instruction, @@ -1827,6 +1849,7 @@ async def _execute_single_stage( compressed_outputs=compressed_prior if compressed_prior else None, project_memory=project_memory, repo_context=repo_context, + preflight_summary=preflight_summary, retry_context=retry_context, stage_model=stage_model, workdir_override=effective_workdir, @@ -2802,6 +2825,120 @@ def _build_repo_context(project) -> str: return "\n\n".join(parts) +def _iter_preflight_files(workspace_path: str): + root = Path(workspace_path) + scanned = 0 + for dirpath, dirnames, filenames in os.walk(root): + current = Path(dirpath) + try: + rel = current.relative_to(root) + depth = len(rel.parts) + except ValueError: + depth = 0 + dirnames[:] = [ + name for name in dirnames + if name not in _PREFLIGHT_SKIP_DIRS and depth < _PREFLIGHT_MAX_DEPTH + ] + for filename in filenames: + scanned += 1 + yield current / filename + if scanned >= _PREFLIGHT_MAX_FILES: + return + + +def _format_preflight_section(title: str, items: list[str], *, limit: int = 4) -> str: + if not items: + return "" + unique: list[str] = [] + seen: set[str] = set() + for item in items: + if item in seen: + continue + seen.add(item) + unique.append(item) + if len(unique) >= limit: + break + if not unique: + return "" + return f"- {title}: {', '.join(unique)}" + + +def _build_stage_preflight_summary(stage_name: str, workspace_path: Optional[str]) -> 
Optional[str]: + normalized = (stage_name or "").strip().lower() + if normalized not in {"code", "coding", "test"}: + return None + if not workspace_path: + return None + + root = Path(workspace_path) + if not root.exists() or not root.is_dir(): + return None + + build_files: list[str] = [] + source_roots: list[str] = [] + impl_examples: list[str] = [] + test_examples: list[str] = [] + + build_file_names = { + "build.gradle", + "build.gradle.kts", + "settings.gradle", + "settings.gradle.kts", + "pom.xml", + "package.json", + "pyproject.toml", + "go.mod", + "cargo.toml", + } + impl_keywords = ("controller", "handler", "service", "api", "route", "response") + test_keywords = ("test", "spec") + + for path in _iter_preflight_files(str(root)): + try: + rel = path.relative_to(root).as_posix() + except ValueError: + rel = path.as_posix() + lower_rel = rel.lower() + name = path.name.lower() + + if name in build_file_names: + build_files.append(rel) + if any(token in lower_rel for token in ("src/main", "app/", "cmd/", "internal/", "lib/")): + parent = str(Path(rel).parent).replace("\\", "/") + if parent and parent != ".": + source_roots.append(parent) + if any(keyword in name for keyword in impl_keywords) or any( + segment in lower_rel for segment in ("/controller/", "/handler/", "/service/", "/api/") + ): + impl_examples.append(rel) + if any(keyword in name for keyword in test_keywords) or any( + segment in lower_rel for segment in ("/test/", "/tests/", "/__tests__/") + ): + test_examples.append(rel) + + lines = [] + if normalized in {"code", "coding"}: + lines.append(_format_preflight_section("构建文件", build_files, limit=3)) + lines.append(_format_preflight_section("源码目录", source_roots, limit=3)) + lines.append(_format_preflight_section("实现参考", impl_examples, limit=4)) + lines.append(_format_preflight_section("测试参考", test_examples, limit=3)) + if not any(lines): + lines.append("- 未发现明显的实现参考,请直接聚焦最小修改并谨慎验证。") + else: + lines.append(_format_preflight_section("构建文件", 
build_files, limit=3)) + lines.append(_format_preflight_section("测试参考", test_examples, limit=5)) + lines.append(_format_preflight_section("实现参考", impl_examples, limit=3)) + if not any(lines): + lines.append("- 未发现明显测试样例,请优先选择最小、最快的验证路径。") + + summary = "\n".join(line for line in lines if line).strip() + if not summary: + return None + if len(summary) > _PREFLIGHT_MAX_CHARS: + summary = summary[:_PREFLIGHT_MAX_CHARS] + "\n...(预扫摘要已截断)" + return summary + + async def _fail_task(session: AsyncSession, task: TaskModel, reason: str) -> None: """Mark task as failed, broadcast, and send external notification.""" failed_at = datetime.now(timezone.utc) diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index 5ae84d9..256bbce 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -230,6 +230,15 @@ def _resolve_stage_output_summary( return _clip_text(resolved, _output_summary_limit(stage_name)) +def _stage_goal_summary(stage_name: str | None) -> str: + normalized = (stage_name or "").strip().lower() + if normalized in {"code", "coding"}: + return "直接完成最小必要代码修改,并提供最小验证结果。" + if normalized == "test": + return "直接完成最小、最相关的验证,并明确成功或阻塞结论。" + return "完成当前阶段的最终结果。" + + # --------------------------------------------------------------------------- # Module-level helpers extracted from execute_stage # --------------------------------------------------------------------------- @@ -280,6 +289,7 @@ def _clear_current_task_cancellation_state() -> None: "go test", "cargo test", ) +_RESTART_OUTPUT_CHARS = 1500 def _classify_tool_activity(tool_name: str, args: dict[str, Any]) -> str: @@ -789,6 +799,109 @@ def _build_forced_convergence_prompt(stage_name: str | None) -> str: return "请立即收敛到当前阶段的最终结果,不要继续扩展。" +def _build_stage_restart_prompt( + restart_context: dict[str, Any] | None, + tracker: StageEventTracker, + output: str, + *, + reason: str, +) -> str: + context = restart_context or {} + title = str(context.get("task_title") or 
"").strip() + description = str(context.get("task_description") or "").strip() + stage_name = str(context.get("stage_name") or tracker.stage_name).strip() + preflight_summary = str(context.get("preflight_summary") or "").strip() + partial_output = _clip_text((output or "").replace("[Max turns reached. Please continue the conversation.]", "").strip(), _RESTART_OUTPUT_CHARS) + tool_digest = _format_tool_digest(tracker.get_completed_tool_runs(), limit=4) + action_prompt = ( + _build_forced_convergence_prompt(stage_name) + if reason == "forced_convergence" + else _build_continuation_prompt(stage_name) + ) + + parts: list[str] = [] + if title: + parts.append(f"## 任务\n**{title}**") + if description: + parts.append(description) + parts.append(f"\n## 当前阶段\n{stage_name}") + parts.append(_stage_goal_summary(stage_name)) + if preflight_summary: + parts.append(f"\n## 阶段预扫摘要\n{preflight_summary}") + if partial_output: + parts.append(f"\n## 当前阶段已有部分输出\n{partial_output}") + if tool_digest: + parts.append(f"\n## 最近关键工具结果\n{tool_digest}") + parts.append("\n## 下一步要求") + parts.append(action_prompt) + parts.append("不要重新展开整段历史;只基于上面的当前状态继续完成必要工作。") + return "\n".join(parts).strip() + + +async def _run_stage_restart( + runner: Any, + output: str, + runtime_overrides: dict[str, Any], + tracker: StageEventTracker, + *, + reason: str, + restart_index: int, + restart_context: dict[str, Any] | None = None, +) -> str: + prompt = _build_stage_restart_prompt(restart_context, tracker, output, reason=reason) + chat_started = time.monotonic() + chat_correlation = await tracker.emit_chat_sent( + request_body={ + "prompt": prompt, + "model": getattr(getattr(runner, "config", None), "model", None), + "stage": tracker.stage_name, + "agent_role": tracker.agent_role, + "temperature": runtime_overrides.get("temperature"), + "max_tokens": runtime_overrides.get("max_tokens"), + "restart": restart_index, + "restart_reason": reason, + "reset": True, + "timeout_seconds": settings.WORKER_STAGE_TIMEOUT, + }, + 
) + try: + restart_kwargs = _chat_kwargs_for_runner(runner, runtime_overrides) + response = await asyncio.wait_for( + runner.chat(prompt, reset=True, **restart_kwargs), + timeout=settings.WORKER_STAGE_TIMEOUT, + ) + restart_text = response.text_content or "" + await tracker.emit_chat_received( + chat_correlation, + status="success", + response_body={ + "restart": restart_index, + "restart_reason": reason, + "content": restart_text, + }, + duration_ms=round((time.monotonic() - chat_started) * 1000, 2), + ) + cleaned = (output or "").replace("[Max turns reached. Please continue the conversation.]", "").strip() + return f"{cleaned}\n\n{restart_text}".strip() if cleaned else restart_text + except asyncio.CancelledError: + _clear_current_task_cancellation_state() + await tracker.emit_chat_received( + chat_correlation, + status="cancelled", + response_body={"restart": restart_index, "restart_reason": reason, "error": "cancelled"}, + duration_ms=round((time.monotonic() - chat_started) * 1000, 2), + ) + raise + except Exception as exc: + await tracker.emit_chat_received( + chat_correlation, + status="failed", + response_body={"restart": restart_index, "restart_reason": reason, "error": str(exc)}, + duration_ms=round((time.monotonic() - chat_started) * 1000, 2), + ) + return output + + async def _run_forced_convergence( runner: Any, output: str, @@ -858,64 +971,93 @@ async def _handle_continuations( runtime_overrides: dict[str, Any], tracker: StageEventTracker, stage_name: str | None = None, + restart_context: dict[str, Any] | None = None, ) -> tuple[str, int]: """Follow up with continuation prompts when the LLM output was truncated.""" _MAX_CONTINUATIONS = 3 _TRUNCATION_SENTINEL = "Max turns reached" - continuations = 0 - output = await _run_forced_convergence(runner, output, runtime_overrides, tracker, stage_name) - - while _TRUNCATION_SENTINEL in (output or "") and continuations < _MAX_CONTINUATIONS: - continuations += 1 - continuation_started = time.monotonic() - prompt 
= _build_continuation_prompt(stage_name or tracker.stage_name) - chat_correlation = await tracker.emit_chat_sent( - request_body={ - "prompt": prompt, - "model": getattr(getattr(runner, "config", None), "model", None), - "stage": tracker.stage_name, - "agent_role": tracker.agent_role, - "temperature": runtime_overrides.get("temperature"), - "max_tokens": runtime_overrides.get("max_tokens"), - "continuation": continuations, - "timeout_seconds": settings.WORKER_STAGE_TIMEOUT, - }, - ) - try: - continuation_kwargs = _chat_kwargs_for_runner(runner, runtime_overrides) - cont_response = await asyncio.wait_for( - runner.chat(prompt, reset=False, **continuation_kwargs), - timeout=settings.WORKER_STAGE_TIMEOUT, - ) - cont_text = cont_response.text_content or "" - await tracker.emit_chat_received( - chat_correlation, - status="success", - response_body={"continuation": continuations, "content": cont_text}, - duration_ms=round((time.monotonic() - continuation_started) * 1000, 2), + restarts = 0 + + if restart_context is None: + continuations = 0 + output = await _run_forced_convergence(runner, output, runtime_overrides, tracker, stage_name) + + while _TRUNCATION_SENTINEL in (output or "") and continuations < _MAX_CONTINUATIONS: + continuations += 1 + continuation_started = time.monotonic() + prompt = _build_continuation_prompt(stage_name or tracker.stage_name) + chat_correlation = await tracker.emit_chat_sent( + request_body={ + "prompt": prompt, + "model": getattr(getattr(runner, "config", None), "model", None), + "stage": tracker.stage_name, + "agent_role": tracker.agent_role, + "temperature": runtime_overrides.get("temperature"), + "max_tokens": runtime_overrides.get("max_tokens"), + "continuation": continuations, + "timeout_seconds": settings.WORKER_STAGE_TIMEOUT, + }, ) - output = output.replace( - f"[{_TRUNCATION_SENTINEL}. 
Please continue the conversation.]", - "", - ).strip() - output = f"{output}\n\n{cont_text}".strip() - except asyncio.CancelledError: - _clear_current_task_cancellation_state() - await tracker.emit_chat_received( - chat_correlation, - status="cancelled", - response_body={"continuation": continuations, "error": "cancelled"}, - duration_ms=round((time.monotonic() - continuation_started) * 1000, 2), + try: + continuation_kwargs = _chat_kwargs_for_runner(runner, runtime_overrides) + cont_response = await asyncio.wait_for( + runner.chat(prompt, reset=False, **continuation_kwargs), + timeout=settings.WORKER_STAGE_TIMEOUT, + ) + cont_text = cont_response.text_content or "" + await tracker.emit_chat_received( + chat_correlation, + status="success", + response_body={"continuation": continuations, "content": cont_text}, + duration_ms=round((time.monotonic() - continuation_started) * 1000, 2), + ) + output = output.replace( + f"[{_TRUNCATION_SENTINEL}. Please continue the conversation.]", + "", + ).strip() + output = f"{output}\n\n{cont_text}".strip() + except asyncio.CancelledError: + _clear_current_task_cancellation_state() + await tracker.emit_chat_received( + chat_correlation, + status="cancelled", + response_body={"continuation": continuations, "error": "cancelled"}, + duration_ms=round((time.monotonic() - continuation_started) * 1000, 2), + ) + raise + except Exception as e: + await tracker.emit_chat_received( + chat_correlation, + status="failed", + response_body={"continuation": continuations, "error": str(e)}, + duration_ms=round((time.monotonic() - continuation_started) * 1000, 2), + ) + break + else: + if tracker.should_force_convergence(): + restarts += 1 + tracker.mark_forced_convergence_used() + output = await _run_stage_restart( + runner, + output, + runtime_overrides, + tracker, + reason="forced_convergence", + restart_index=restarts, + restart_context=restart_context, ) - raise - except Exception as e: - await tracker.emit_chat_received( - chat_correlation, - 
status="failed", - response_body={"continuation": continuations, "error": str(e)}, - duration_ms=round((time.monotonic() - continuation_started) * 1000, 2), + + while _TRUNCATION_SENTINEL in (output or "") and restarts < _MAX_CONTINUATIONS: + restarts += 1 + output = await _run_stage_restart( + runner, + output, + runtime_overrides, + tracker, + reason="truncation", + restart_index=restarts, + restart_context=restart_context, ) - break total_tokens = runner.cumulative_usage.total_tokens return output, total_tokens @@ -995,6 +1137,7 @@ async def execute_stage( compressed_outputs: Optional[List[Dict[str, str]]] = None, project_memory: Optional[str] = None, repo_context: Optional[str] = None, + preflight_summary: Optional[str] = None, retry_context: Optional[Dict[str, str]] = None, stage_model: Optional[str] = None, workdir_override: Optional[str] = None, @@ -1049,6 +1192,7 @@ async def execute_stage( compressed_outputs=compressed_outputs, project_memory=project_memory, repo_context=repo_context, + preflight_summary=preflight_summary, retry_context=retry_context, custom_instruction=custom_instruction, gate_rejection_context=gate_rejection_context, @@ -1216,9 +1360,15 @@ async def execute_stage( elapsed = time.monotonic() - start_time output = response.text_content total_tokens = runner.cumulative_usage.total_tokens + restart_context = { + "task_title": task.title, + "task_description": task.description, + "stage_name": stage.stage_name, + "preflight_summary": preflight_summary, + } output, total_tokens = await _handle_continuations( - runner, output, runtime_overrides, tracker, stage.stage_name + runner, output, runtime_overrides, tracker, stage.stage_name, restart_context ) # Phase 2.2: Evaluator-optimizer loop (if configured for this stage) @@ -1395,6 +1545,7 @@ async def execute_stage_sandboxed( compressed_outputs: Optional[List[Dict[str, str]]] = None, project_memory: Optional[str] = None, repo_context: Optional[str] = None, + preflight_summary: Optional[str] = 
None, retry_context: Optional[Dict[str, str]] = None, stage_model: Optional[str] = None, custom_instruction: Optional[str] = None, @@ -1451,6 +1602,7 @@ async def execute_stage_sandboxed( compressed_outputs=compressed_outputs, project_memory=project_memory, repo_context=repo_context, + preflight_summary=preflight_summary, retry_context=retry_context, custom_instruction=custom_instruction, gate_rejection_context=gate_rejection_context, diff --git a/platform/app/worker/prompts.py b/platform/app/worker/prompts.py index 851f5f6..438febf 100644 --- a/platform/app/worker/prompts.py +++ b/platform/app/worker/prompts.py @@ -169,6 +169,7 @@ class StageContext: compressed_outputs: Optional[List[Dict[str, str]]] = None # sliding-window compressed project_memory: Optional[str] = None # injected project memory text repo_context: Optional[str] = None # injected repo context (tech stack + dir tree) + preflight_summary: Optional[str] = None # deterministic stage-local workspace scan summary # Smart retry: failure context from previous attempt (Ralph Loop V2 pattern) retry_context: Optional[Dict[str, str]] = None # {"error": msg, "prior_output": text} # Phase 1.4: Custom instruction from template stage definition @@ -198,6 +199,9 @@ def build_user_prompt(ctx: StageContext) -> str: if ctx.project_memory: parts.append(f"\n## 项目上下文(来自历史任务)\n{ctx.project_memory}") + if ctx.preflight_summary: + parts.append(f"\n## 阶段预扫摘要\n{ctx.preflight_summary}") + # Use compressed outputs (sliding-window) when available, otherwise raw prior = ctx.compressed_outputs if ctx.compressed_outputs is not None else ctx.prior_outputs if prior: diff --git a/platform/sandbox/agent_server.py b/platform/sandbox/agent_server.py index d497b10..fc983c3 100644 --- a/platform/sandbox/agent_server.py +++ b/platform/sandbox/agent_server.py @@ -782,6 +782,58 @@ def _create_runner(parsed: dict[str, Any]) -> ContainerAgentRunner: return runner +def _build_restart_prompt( + user_prompt: str, + text_content: str, + 
tool_calls: list[dict[str, Any]], + *, + reason: str, +) -> str: + prompt_excerpt = (user_prompt or "").strip() + if len(prompt_excerpt) > 1200: + prompt_excerpt = prompt_excerpt[:1200] + "\n...(任务上下文已截断)" + partial_output = (text_content or "").replace( + "[Max turns reached. Please continue the conversation.]", + "", + ).strip() + if len(partial_output) > 1200: + partial_output = partial_output[:1200] + "\n...(已有输出已截断)" + + digest_lines: list[str] = [] + for item in tool_calls[-4:]: + status = str(item.get("status") or "success").upper() + tool_name = str(item.get("tool_name") or "tool") + preview = str(item.get("result_preview") or "").strip() + if len(preview) > 240: + preview = preview[:240] + "...[truncated]" + line = f"- [{status}] {tool_name}" + if preview: + line += f"\n 结果: {preview}" + digest_lines.append(line) + + action_prompt = ( + "请停止继续广泛探索,直接完成最小必要工作。" + if reason == "forced_convergence" + else "请不要重复整段历史,只基于当前状态继续完成剩余必要内容。" + ) + + parts = [ + "## 原始任务摘要", + prompt_excerpt or "(无)", + ] + if partial_output: + parts.extend(["\n## 当前阶段已有部分输出", partial_output]) + if digest_lines: + parts.extend(["\n## 最近关键工具结果", "\n".join(digest_lines)]) + parts.extend( + [ + "\n## 下一步要求", + action_prompt, + ] + ) + return "\n".join(parts).strip() + + async def _run_stage_chat( runner: ContainerAgentRunner, *, @@ -807,8 +859,14 @@ async def _run_stage_chat( max_continuations, ) try: + restart_prompt = _build_restart_prompt( + user_prompt, + text_content, + runner.tool_calls_log, + reason="truncation", + ) cont = await asyncio.wait_for( - runner.chat("请继续完成上面的输出,从你停下的地方继续。", reset=False), + runner.chat(restart_prompt, reset=True), timeout=timeout, ) cont_text = cont.text_content or "" diff --git a/platform/tests/test_engine_stage_execution.py b/platform/tests/test_engine_stage_execution.py index 53a06ed..0e4a308 100644 --- a/platform/tests/test_engine_stage_execution.py +++ b/platform/tests/test_engine_stage_execution.py @@ -164,6 +164,66 @@ async def 
test_execute_single_stage_reflection_disabled_uses_plain_context(monke await session.commit() +@pytest.mark.asyncio +async def test_execute_single_stage_passes_preflight_summary(monkeypatch, tmp_path): + monkeypatch.setattr(engine.settings, "SANDBOX_ENABLED", False) + monkeypatch.setattr(engine.settings, "MEMORY_ENABLED", False) + execute_stage_mock = AsyncMock(return_value="stage output") + monkeypatch.setattr(engine, "execute_stage", execute_stage_mock) + monkeypatch.setattr(engine, "execute_stage_sandboxed", execute_stage_mock) + monkeypatch.setattr(engine, "_emit_system_log", AsyncMock(return_value="log-id")) + monkeypatch.setattr(engine, "_close_started_system_log", AsyncMock()) + + (tmp_path / "build.gradle").write_text("plugins {}", encoding="utf-8") + (tmp_path / "src/main/java/demo/controller").mkdir(parents=True) + (tmp_path / "src/main/java/demo/controller/HelloController.java").write_text("class X {}", encoding="utf-8") + + task_id = "tt-exec-preflight-1" + async with async_session_factory() as session: + session.add(TaskModel(id=task_id, title="Preflight Test", status="running")) + await session.commit() + + async with async_session_factory() as session: + task = await session.get(TaskModel, task_id) + stage = SimpleNamespace( + id="stage-preflight-1", + stage_name="coding", + agent_role="coding", + error_message=None, + output_summary=None, + output_structured=None, + execution_count=0, + status="pending", + ) + + from app.worker.compressor import CompressionResult + compression = CompressionResult() + + result = await engine._execute_single_stage( + session, # type: ignore[arg-type] + task, # type: ignore[arg-type] + stage, # type: ignore[arg-type] + 0, + [], + compression, + None, + None, + {}, + str(tmp_path), + None, + ) + + assert result == "stage output" + call_kwargs = execute_stage_mock.call_args.kwargs + assert "HelloController.java" in (call_kwargs.get("preflight_summary") or "") + + async with async_session_factory() as session: + t = await 
session.get(TaskModel, task_id) + if t: + await session.delete(t) + await session.commit() + + @pytest.mark.asyncio async def test_execute_single_stage_uses_sandbox(monkeypatch): """sandbox_info is truthy AND agent_role='coding' → calls execute_stage_sandboxed.""" diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index 148f088..aab3be6 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -141,10 +141,12 @@ def __init__(self, *, response_text: str = 'done') -> None: self.config = SimpleNamespace(model='test-model') self.cumulative_usage = SimpleNamespace(total_tokens=11) self.prompts: list[str] = [] + self.resets: list[bool] = [] self.response_text = response_text async def chat(self, prompt: str, reset: bool = True, **_: object): self.prompts.append(prompt) + self.resets.append(reset) return SimpleNamespace(text_content=self.response_text) @@ -183,6 +185,9 @@ def should_force_convergence(self) -> bool: def mark_forced_convergence_used(self) -> None: self._forced_convergence_used = True + def get_completed_tool_runs(self): + return [] + class _CancelledRunner(_FakeRunner): async def chat(self, _prompt: str, reset: bool = True, **_: object): @@ -728,6 +733,59 @@ async def test_handle_continuations_injects_forced_convergence_for_failed_test_v ] +@pytest.mark.asyncio +async def test_handle_continuations_uses_checkpoint_restart_with_reset_true(): + runner = _ContinuationRunner(response_text='final answer') + tracker = _ContinuationTracker(stage_name='code', agent_role='coding') + + output, total_tokens = await executor._handle_continuations( + runner, + "[Max turns reached. 
Please continue the conversation.]", + {}, + tracker, + 'code', + { + 'task_title': 'Hello Task', + 'task_description': 'Implement hello endpoint', + 'stage_name': 'code', + 'preflight_summary': '- 构建文件: build.gradle', + }, + ) + + assert total_tokens == 11 + assert output == 'final answer' + assert runner.resets == [True] + assert '## 任务\n**Hello Task**' in runner.prompts[0] + assert '## 阶段预扫摘要' in runner.prompts[0] + assert '不要重新展开整段历史' in runner.prompts[0] + + +@pytest.mark.asyncio +async def test_handle_continuations_restarts_from_checkpoint_for_forced_convergence(): + runner = _ContinuationRunner(response_text='implemented result') + tracker = _ContinuationTracker(stage_name='code', agent_role='coding') + tracker._exploration_actions = 4 + + output, total_tokens = await executor._handle_continuations( + runner, + 'partial summary', + {}, + tracker, + 'code', + { + 'task_title': 'Hello Task', + 'task_description': 'Implement hello endpoint', + 'stage_name': 'code', + 'preflight_summary': '- 实现参考: src/main/java/demo/HelloController.java', + }, + ) + + assert total_tokens == 11 + assert output == 'partial summary\n\nimplemented result' + assert runner.resets == [True] + assert 'restart_reason' not in runner.prompts[0] + + def test_resolve_stage_max_turns_caps_coding_and_test(): assert executor._resolve_stage_max_turns('coding', None) == 6 assert executor._resolve_stage_max_turns('coding', 18) == 6 diff --git a/platform/tests/test_prompts.py b/platform/tests/test_prompts.py index 45504bb..44cb262 100644 --- a/platform/tests/test_prompts.py +++ b/platform/tests/test_prompts.py @@ -100,6 +100,19 @@ def test_without_project_memory(): assert "## 项目上下文(来自历史任务)" not in result +def test_with_preflight_summary(): + ctx = _minimal_ctx(preflight_summary="- 构建文件: build.gradle\n- 实现参考: src/main/java/demo/HelloController.java") + result = build_user_prompt(ctx) + assert "## 阶段预扫摘要" in result + assert "HelloController.java" in result + + +def test_without_preflight_summary(): + 
ctx = _minimal_ctx(preflight_summary=None) + result = build_user_prompt(ctx) + assert "## 阶段预扫摘要" not in result + + # --------------------------------------------------------------------------- # With prior_outputs (raw) # --------------------------------------------------------------------------- diff --git a/platform/tests/test_sandbox_agent_server.py b/platform/tests/test_sandbox_agent_server.py index 10eeaec..7676227 100644 --- a/platform/tests/test_sandbox_agent_server.py +++ b/platform/tests/test_sandbox_agent_server.py @@ -270,6 +270,25 @@ def test_run_gradle_wrapper_prewarm_once_marks_done(tmp_path, monkeypatch): assert agent_server._WRAPPER_PREWARM_DONE is True +def test_build_restart_prompt_includes_task_excerpt_and_tool_digest(): + agent_server = _load_agent_server_with_fake_skillkit() + prompt = agent_server._build_restart_prompt( + "## 任务\n实现 hello 接口", + "[Max turns reached. Please continue the conversation.]", + [ + { + "tool_name": "execute", + "result_preview": "src/main/java/demo/HelloController.java", + "status": "success", + } + ], + reason="truncation", + ) + assert "原始任务摘要" in prompt + assert "最近关键工具结果" in prompt + assert "HelloController.java" in prompt + + def test_should_retry_with_other_java_on_version_mismatch(): agent_server = _load_agent_server_with_fake_skillkit() assert agent_server._should_retry_with_other_java("Unsupported class file major version 61") diff --git a/platform/tests/test_worker.py b/platform/tests/test_worker.py index c4780f5..9353ffe 100644 --- a/platform/tests/test_worker.py +++ b/platform/tests/test_worker.py @@ -1,11 +1,17 @@ """Unit tests for worker engine pure functions.""" import asyncio import json +from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest -from app.worker.engine import _parse_gates, _sort_stages, _build_repo_context +from app.worker.engine import ( + _parse_gates, + _sort_stages, + _build_repo_context, + _build_stage_preflight_summary, +) from 
app.worker.engine import _safe_broadcast as engine_safe_broadcast from app.worker.executor import _safe_broadcast as executor_safe_broadcast @@ -108,6 +114,40 @@ def test_build_repo_context_default_branch(self): assert "main" in result +class TestBuildStagePreflightSummary: + def test_build_stage_preflight_summary_for_coding(self, tmp_path: Path): + (tmp_path / "build.gradle").write_text("plugins {}", encoding="utf-8") + (tmp_path / "src/main/java/demo/controller").mkdir(parents=True) + (tmp_path / "src/test/java/demo/controller").mkdir(parents=True) + (tmp_path / "src/main/java/demo/controller/HelloController.java").write_text("class X {}", encoding="utf-8") + (tmp_path / "src/test/java/demo/controller/HelloControllerTest.java").write_text("class T {}", encoding="utf-8") + + result = _build_stage_preflight_summary("coding", str(tmp_path)) + + assert result is not None + assert "构建文件" in result + assert "实现参考" in result + assert "测试参考" in result + assert "HelloController.java" in result + + def test_build_stage_preflight_summary_for_test(self, tmp_path: Path): + (tmp_path / "pom.xml").write_text("", encoding="utf-8") + (tmp_path / "src/test/java/demo").mkdir(parents=True) + (tmp_path / "src/main/java/demo/service").mkdir(parents=True) + (tmp_path / "src/test/java/demo/DemoServiceTest.java").write_text("class T {}", encoding="utf-8") + (tmp_path / "src/main/java/demo/service/DemoService.java").write_text("class S {}", encoding="utf-8") + + result = _build_stage_preflight_summary("test", str(tmp_path)) + + assert result is not None + assert "构建文件" in result + assert "测试参考" in result + assert "DemoServiceTest.java" in result + + def test_build_stage_preflight_summary_ignores_other_stages(self, tmp_path: Path): + assert _build_stage_preflight_summary("signoff", str(tmp_path)) is None + + class TestSafeBroadcast: @pytest.mark.asyncio async def test_engine_safe_broadcast_swallows_errors(self): From f8a343a4104f757d6eb5235dda55946b7805dd44 Mon Sep 17 00:00:00 2001 From: 
"Johnny.Wang" Date: Thu, 19 Mar 2026 18:39:30 +0800 Subject: [PATCH 25/33] docs: plan static context token optimization --- ...tatic-context-token-optimization-design.md | 207 ++++++++++++ ...ntext-token-optimization-implementation.md | 308 ++++++++++++++++++ 2 files changed, 515 insertions(+) create mode 100644 docs/plans/2026-03-19-static-context-token-optimization-design.md create mode 100644 docs/plans/2026-03-19-static-context-token-optimization-implementation.md diff --git a/docs/plans/2026-03-19-static-context-token-optimization-design.md b/docs/plans/2026-03-19-static-context-token-optimization-design.md new file mode 100644 index 0000000..8fe5c92 --- /dev/null +++ b/docs/plans/2026-03-19-static-context-token-optimization-design.md @@ -0,0 +1,207 @@ +# Static Context Token Optimization Design + +## Background + +Recent live task analysis confirms that high token cost is still dominated by repeated stage-local chat calls, especially in `code` and `test`. + +We have already improved behavior in three ways: + +- stronger `coding` / `test` convergence guardrails +- exploration budgets and forced convergence +- preflight summaries plus checkpoint-style restart prompts + +Those changes reduced some waste, but they did not fully remove the biggest fixed cost: each stage still repeatedly carries a large static prompt base. + +Today that base includes some combination of: + +- system prompt +- role skill directories and tool schema exposure +- repo context +- project memory +- prior stage outputs + +As a result, even when the agent is doing the right kind of work, each extra turn still re-pays too much prompt cost. + +## Goal + +Reduce token consumption primarily by shrinking repeated static prompt/context overhead before investing in provider-specific prompt caching. 
+ +## Non-Goals + +- Do not redesign the task/stage graph +- Do not replace the current AgentRunner stack +- Do not make prompt caching the immediate primary fix +- Do not remove the existing preflight or restart work + +## Key Observation + +The next best savings are not from forcing the model to be “more disciplined” in the abstract. They come from making each stage carry less static baggage per turn. + +That means the platform should prioritize: + +1. smaller models for low-complexity stages +2. fewer tools and skills exposed per stage +3. less injected repo and memory context where preflight already covers the need +4. making restart/checkpoint execution the preferred continuation path + +## Recommended Approach + +### Phase 1: Cheap, Low-Risk Static Context Reduction + +Implement four low-risk optimizations first. + +#### 1. Role-Based Model Routing + +Use the existing `LLM_ROLE_MODEL_MAP` support more aggressively. + +Recommended default direction: + +- `parse` → lighter text/tool-capable model +- `code` → strongest coding/reasoning model +- `test` → lighter model unless the task or template explicitly requests stronger reasoning +- `signoff` → lighter text-oriented model + +This does not reduce token count directly, but it reduces cost immediately and aligns model strength with stage complexity. + +#### 2. Make `signoff` Text-Only + +`signoff` should stop re-entering tool-driven exploration by default. + +It should instead rely on: + +- prior stage outputs +- structured summaries +- the latest verified results already produced by earlier stages + +This reduces unnecessary tool schema exposure and avoids another mini ReAct loop at the end of the task. + +#### 3. Per-Stage Tool / Skill Pruning + +The current role defaults still expose too much shared capability in later-stage prompts. 
+ +We should trim stage exposure so the agent only receives the tools and skill directories it realistically needs: + +- `code` should keep the core file and execution tools, but not unrelated later-stage abilities +- `test` should focus on read/edit/execute verification tools +- `signoff` should default to no tools + +This reduces prompt bloat and narrows action space. + +#### 4. Shrink `repo_context` and `project_memory` for `code` / `test` + +Now that deterministic preflight summaries exist, `code` and `test` no longer need the full original repo-context payload on every turn. + +We should split context into: + +- broad repo context for earlier planning stages +- slim execution context for `code` / `test` + +For execution stages, the injected context should favor: + +- concise stack/build facts +- minimal path hints +- short relevant memory excerpts + +and avoid re-sending large directory trees or verbose historical notes when preflight already covers the local execution target. + +### Phase 2: Make Checkpoint Restart the Main Continuation Strategy + +We already introduced restart/checkpoint machinery, but it should evolve from a rescue path into the preferred continuation model for `code` / `test`. + +The continuation path should increasingly favor: + +- `reset=True` +- a compact checkpoint prompt +- only the immediate execution state + +and increasingly avoid: + +- replaying full `repo_context` +- replaying full `project_memory` +- replaying large prior output blocks + +The restart payload should be limited to: + +- task objective +- stage goal +- preflight summary +- already confirmed edits or findings +- last 2 to 3 meaningful tool results +- immediate next action + +### Phase 3: Provider-Aware Prompt Caching + +Prompt caching is still worth evaluating, but only after the fixed prompt base is made smaller and more stable. 
+ +Reasons not to lead with it: + +- provider support may vary across current model paths +- it does not solve growing-history behavior by itself +- it is more invasive than the earlier fixes + +Once Phases 1 and 2 land, caching can be evaluated on a cleaner and more stable prompt shape. + +## Alternatives Considered + +### 1. Prompt Caching First + +Useful later, but not the best immediate step because it does not reduce prompt size or continuation churn on its own. + +### 2. More Aggressive `max_turns` Reduction + +Helps cap damage, but still allows the remaining turns to carry the same large static prefix. + +### 3. Prompting the Agent to Batch Shell Commands + +Helpful as a guardrail, but weaker than platform-side prompt/context reduction. It depends on model compliance and does not address repeated schema/context cost. + +## Data Flow Changes + +### Parse / Spec-Like Stages + +These stages may continue to receive broader repo context because they are responsible for planning and synthesis. + +### Code / Test Stages + +These stages should increasingly receive: + +- slim repo facts +- short role-specific memory +- deterministic preflight summary +- compact restart checkpoints on continuation + +instead of the current larger blended context shape. + +### Signoff Stage + +This stage should default to text-only summary and evaluation behavior, without tool re-entry. 
+ +## Testing Strategy + +Add or extend focused coverage for: + +- role-model resolution per stage +- signoff text-only routing +- per-role skill/tool narrowing +- prompt construction with slim execution-stage context +- restart/checkpoint payload staying compact +- non-execution stages still receiving the broader context they need + +## Success Criteria + +Compared to current baselines, success should show: + +- lower total tokens in `code` and `test` +- fewer repeated large chat payloads +- fewer tool-driven loops in `signoff` +- lower average cost per task even before prompt caching + +## Rollout Order + +1. tighten model routing defaults +2. make `signoff` text-only +3. prune per-stage tool/skill exposure +4. slim `repo_context` / `project_memory` for execution stages +5. promote compact checkpoint restart into the main continuation path +6. re-measure on the same VM task family +7. only then decide whether prompt caching should be the next investment diff --git a/docs/plans/2026-03-19-static-context-token-optimization-implementation.md b/docs/plans/2026-03-19-static-context-token-optimization-implementation.md new file mode 100644 index 0000000..935266f --- /dev/null +++ b/docs/plans/2026-03-19-static-context-token-optimization-implementation.md @@ -0,0 +1,308 @@ +# Static Context Token Optimization Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Reduce `code` / `test` token cost by shrinking repeated static prompt overhead before attempting provider-specific prompt caching. + +**Architecture:** Reuse the existing worker pipeline, but make execution stages cheaper by routing lighter models where appropriate, removing unnecessary tool exposure, slimming injected repo/memory context, and making compact checkpoint restart the default continuation shape. Preserve the existing stage graph and existing preflight work. 
+ +**Tech Stack:** Python, FastAPI worker runtime, SkillKit AgentRunner, pytest + +--- + +### Task 1: Tighten Role-Based Model Routing Defaults + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/config.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/.env.example` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py` + +**Step 1: Write the failing test** + +Add coverage showing that `parse`, `test`, and `signoff` can resolve to lightweight models through `LLM_ROLE_MODEL_MAP`, while `code` can still resolve to the stronger coding model. + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_agents_api.py -q +``` + +Expected: FAIL until the default config and resolution behavior match the new mapping expectation. + +**Step 3: Write minimal implementation** + +Update config defaults and environment examples so the role-model map favors: + +- light model for `parse` +- strong model for `code` +- light model for `test` +- light model for `signoff` + +Keep stage-level override precedence unchanged. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_agents_api.py -q +``` + +Expected: PASS. 
+ +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py /Users/jowang/Documents/github/silicon_agent/platform/app/config.py /Users/jowang/Documents/github/silicon_agent/platform/.env.example /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py +git commit -m "config(worker): tune role model routing defaults" +``` + +### Task 2: Make Signoff Text-Only By Default + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py` + +**Step 1: Write the failing test** + +Add tests showing that `signoff` uses the text-only runner path and does not request tool execution by default. + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_engine_stage_execution.py -q +``` + +Expected: FAIL because `signoff` still flows through the general tool-enabled path. + +**Step 3: Write minimal implementation** + +Route `signoff` through `get_agent_text_only(...)` and keep it based on prior outputs plus prompt guidance, without reopening tool-driven exploration. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_engine_stage_execution.py -q +``` + +Expected: PASS. 
+ +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py +git commit -m "fix(worker): make signoff text only by default" +``` + +### Task 3: Prune Stage Tool And Skill Exposure + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` + +**Step 1: Write the failing test** + +Add tests showing that: + +- `code` only gets core implementation tools/skills +- `test` only gets core verification tools/skills +- `signoff` gets no tool-enabled skill exposure + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_agents.py tests/test_executor_stage_logs.py -q +``` + +Expected: FAIL because role defaults still expose too much shared capability. + +**Step 3: Write minimal implementation** + +Tighten `ROLE_TOOLS` and role skill directory selection so execution stages only carry what they actually need. Preserve sandbox parity for the container path. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_agents.py tests/test_executor_stage_logs.py -q +``` + +Expected: PASS. 
+ +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py +git commit -m "refactor(worker): prune stage tool and skill exposure" +``` + +### Task 4: Slim Repo Context And Project Memory For Execution Stages + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_worker.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py` + +**Step 1: Write the failing test** + +Add tests showing that `code` / `test` receive a slim execution-context variant while planning stages can still receive broader repo context. + +Cover: + +- reduced repo tree content for execution stages +- reduced memory excerpt size for execution stages +- preflight summary still present + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_prompts.py tests/test_worker.py tests/test_engine_stage_execution.py -q +``` + +Expected: FAIL because execution stages still receive the broader context shape. + +**Step 3: Write minimal implementation** + +Split context construction into broad planning context versus slim execution context. Keep concise build/stack facts, but avoid re-sending large directory trees and long memory blocks to `code` / `test`. 
+ +**Step 4: Run test to verify it passes** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_prompts.py tests/test_worker.py tests/test_engine_stage_execution.py -q +``` + +Expected: PASS. + +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py /Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_worker.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py +git commit -m "feat(worker): slim execution stage context payloads" +``` + +### Task 5: Make Compact Restart The Primary Continuation Shape + +**Files:** +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` +- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` +- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_sandbox_agent_server.py` + +**Step 1: Write the failing test** + +Add tests showing that after exploration drift or truncation, the stage continuation path prefers compact `reset=True` restart payloads and logs restart metadata consistently. + +Cover: + +- restart metadata on host path +- restart metadata on sandbox path +- reduced carry-forward payload shape + +**Step 2: Run test to verify it fails** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_sandbox_agent_server.py -q +``` + +Expected: FAIL because current live behavior still does not consistently surface restart metadata and still leaks too much old context into continuation. 
+ +**Step 3: Write minimal implementation** + +Promote compact restart to the default continuation path for `code` / `test`, ensure restart logging is explicit, and keep carried state limited to the compact checkpoint. + +**Step 4: Run test to verify it passes** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_executor_stage_logs.py tests/test_sandbox_agent_server.py -q +``` + +Expected: PASS. + +**Step 5: Commit** + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_sandbox_agent_server.py +git commit -m "fix(worker): prefer compact restart continuations" +``` + +### Task 6: Regression And Live Validation + +**Files:** +- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py` +- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py` +- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` +- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` +- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py` + +**Step 1: Run targeted local regression** + +Run: +```bash +cd /Users/jowang/Documents/github/silicon_agent/platform +. .venv/bin/activate +pytest tests/test_agents.py tests/test_agents_api.py tests/test_prompts.py tests/test_executor_stage_logs.py tests/test_engine_stage_execution.py tests/test_sandbox_agent_server.py -q +``` + +Expected: PASS. 
+ +**Step 2: Deploy to VM and validate against the known hello-world task family** + +Use the VM flow already established for: + +- pulling `origin/codex/raise-cb-and-optimize-coding-image` +- restarting backend +- cloning task `339f8bd3-c5f2-4da5-8267-15a6ec3aaaa3` + +Capture: + +- total task tokens +- `parse` / `code` / `test` stage tokens +- `chat_sent` +- `tool_calls` +- `max_turn_markers` +- `forced_convergence` +- `restart_count` + +**Step 3: Commit** + +If only test updates were needed: + +```bash +git add /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_sandbox_agent_server.py +git commit -m "test(worker): cover static context token optimization regressions" +``` From 9098814bcc19d676025c609f181c75439e4643fe Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:43:13 +0800 Subject: [PATCH 26/33] config(worker): tune role model routing defaults --- platform/.env.example | 1 + platform/app/config.py | 5 +++-- platform/app/services/agent_service.py | 2 +- platform/tests/test_agents_api.py | 22 ++++++++++++++++++++++ 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/platform/.env.example b/platform/.env.example index cf58eb0..ced5996 100644 --- a/platform/.env.example +++ b/platform/.env.example @@ -10,6 +10,7 @@ LLM_API_KEY=your-api-key-here LLM_BASE_URL=https://api.openai.com LLM_MODEL=gpt-4o-mini LLM_TIMEOUT=120.0 +LLM_ROLE_MODEL_MAP={"orchestrator":"gpt-4o-mini","test":"gpt-4o-mini"} EXTRA_SKILL_DIR_WHITELIST= # SkillKit compatibility env vars. 
diff --git a/platform/app/config.py b/platform/app/config.py index db012ab..33636eb 100644 --- a/platform/app/config.py +++ b/platform/app/config.py @@ -16,8 +16,9 @@ class Settings(BaseSettings): LLM_TIMEOUT: float = 120.0 # Per-role model routing (JSON string: {"coding": "gpt-4o", "review": "claude-sonnet-4-20250514"}) - # Unspecified roles fall back to LLM_MODEL - LLM_ROLE_MODEL_MAP: str = "{}" + # Unspecified roles fall back to LLM_MODEL. Keep lightweight defaults on + # orchestrator/test so parse + signoff stay cheaper unless env overrides them. + LLM_ROLE_MODEL_MAP: str = '{"orchestrator":"gpt-4o-mini","test":"gpt-4o-mini"}' # Comma-separated absolute path prefixes allowed in agent config `extra_skill_dirs`. # Empty means only built-in platform/skills directory is allowed. EXTRA_SKILL_DIR_WHITELIST: str = "" diff --git a/platform/app/services/agent_service.py b/platform/app/services/agent_service.py index 07f8352..1241b03 100644 --- a/platform/app/services/agent_service.py +++ b/platform/app/services/agent_service.py @@ -154,8 +154,8 @@ def _build_role_defaults(self, available_models: list[str]) -> dict[str, str]: for role, _ in AGENT_ROLES: model = ( role_model_map.get(role) - or FALLBACK_ROLE_DEFAULT_MODELS.get(role) or settings.LLM_MODEL + or FALLBACK_ROLE_DEFAULT_MODELS.get(role) ) if model not in available_models and available_models: if settings.LLM_MODEL in available_models: diff --git a/platform/tests/test_agents_api.py b/platform/tests/test_agents_api.py index 0cd3099..e24773b 100644 --- a/platform/tests/test_agents_api.py +++ b/platform/tests/test_agents_api.py @@ -154,6 +154,28 @@ async def list_models(self): assert data["role_defaults"]["coding"] in data["available_models"] +@pytest.mark.asyncio +async def test_get_agent_config_options_uses_lightweight_orchestrator_and_test_defaults( + client, monkeypatch +): + """Config options should keep orchestrator/test on lighter defaults while coding follows global.""" + + monkeypatch.setattr(settings, 
"LLM_API_KEY", "") + monkeypatch.setattr(settings, "LLM_MODEL", "gpt-5.1-codex") + monkeypatch.setattr( + settings, + "LLM_ROLE_MODEL_MAP", + '{"orchestrator":"gpt-4o-mini","test":"gpt-4o-mini"}', + ) + + resp = await client.get("/api/v1/agents/config/options") + assert resp.status_code == 200 + data = resp.json() + assert data["role_defaults"]["orchestrator"] == "gpt-4o-mini" + assert data["role_defaults"]["test"] == "gpt-4o-mini" + assert data["role_defaults"]["coding"] == "gpt-5.1-codex" + + @pytest.mark.asyncio async def test_update_agent_config_rejects_extra_skill_dirs_outside_whitelist( client, seed_agent, tmp_path, monkeypatch From c9ef94bcf7fac96a44b07a42af9f0cfe19cdad4b Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:44:59 +0800 Subject: [PATCH 27/33] fix(worker): make signoff text only by default --- platform/app/worker/executor.py | 3 +- platform/tests/test_executor_stage_logs.py | 60 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index 256bbce..283d8c8 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -1201,7 +1201,8 @@ async def execute_stage( runtime_overrides = _build_runtime_overrides(agent, stage_model) stage_max_turns = _resolve_stage_max_turns(stage.agent_role, runtime_overrides["max_turns"]) - runner = get_agent( + runner_factory = get_agent_text_only if _is_signoff_stage(stage.stage_name) else get_agent + runner = runner_factory( stage.agent_role, task_id, model=runtime_overrides["model"], diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index aab3be6..3c2e07e 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -358,6 +358,66 @@ def _fallback_runner( assert len(fallback_events) == 1 +@pytest.mark.asyncio +async def 
test_execute_stage_uses_text_only_runner_for_signoff(monkeypatch): + session = SimpleNamespace(commit=AsyncMock()) + task = SimpleNamespace( + id='task-signoff-1', + title='task title', + description='task description', + total_tokens=0, + total_cost_rmb=0.0, + ) + stage = SimpleNamespace( + id='stage-signoff-1', + stage_name='signoff', + agent_role='orchestrator', + status='pending', + started_at=None, + completed_at=None, + duration_seconds=None, + tokens_used=0, + output_summary=None, + ) + + fake_pipeline = _FakePipeline() + text_only_called = {'value': False} + + monkeypatch.setattr(executor, 'get_task_log_pipeline', lambda: fake_pipeline) + monkeypatch.setattr(executor, '_get_agent', AsyncMock(return_value=None)) + monkeypatch.setattr(executor, '_safe_broadcast', AsyncMock()) + monkeypatch.setattr(executor, 'build_user_prompt', lambda _ctx: 'signoff prompt') + + def _unexpected_agent(*args, **kwargs): + raise AssertionError('signoff should not use tool-enabled get_agent') + + def _text_only_runner( + _role, + _task_id, + model=None, + temperature=None, + max_tokens=None, + max_turns=None, + extra_skill_dirs=None, + system_prompt_append=None, + ): + text_only_called['value'] = True + return _FakeRunner() + + monkeypatch.setattr(executor, 'get_agent', _unexpected_agent) + monkeypatch.setattr(executor, 'get_agent_text_only', _text_only_runner) + + result = await executor.execute_stage( + session=session, + task=task, + stage=stage, + prior_outputs=[], + ) + + assert result == 'stage output' + assert text_only_called['value'] is True + + def test_apply_runner_workspace_override_replaces_prompt_and_cwd(): runner = SimpleNamespace( default_cwd='/tmp/old-workspace', From 851b6c32fcb5f9ef1584ac1fad0bca212dcd188f Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:46:14 +0800 Subject: [PATCH 28/33] refactor(worker): prune stage tool and skill exposure --- platform/app/worker/agents.py | 4 ++-- platform/tests/test_agents.py | 14 ++++++++++++++ 2 
files changed, 16 insertions(+), 2 deletions(-) diff --git a/platform/app/worker/agents.py b/platform/app/worker/agents.py index 8ec75e7..198a97a 100644 --- a/platform/app/worker/agents.py +++ b/platform/app/worker/agents.py @@ -57,8 +57,8 @@ _ROLE_SKILL_DIRS: dict[str, list[str]] = { "orchestrator": ["shared", "orchestrator"], "spec": ["shared", "spec"], - "coding": ["shared", "coding"], - "test": ["shared", "test"], + "coding": ["coding"], + "test": ["test"], "review": ["shared", "review"], "smoke": ["shared", "smoke"], "doc": ["shared", "doc"], diff --git a/platform/tests/test_agents.py b/platform/tests/test_agents.py index fbfe1fb..df5b092 100644 --- a/platform/tests/test_agents.py +++ b/platform/tests/test_agents.py @@ -21,6 +21,20 @@ def test_test_has_core_tools(): assert {"read", "write", "edit", "execute", "execute_script", "skill"}.issubset(ROLE_TOOLS["test"]) +def test_coding_skill_dirs_exclude_shared_by_default(): + dirs = agents_mod._get_skill_dirs("coding") + rendered = [p.name for p in dirs] + assert "coding" in rendered + assert "shared" not in rendered + + +def test_test_skill_dirs_exclude_shared_by_default(): + dirs = agents_mod._get_skill_dirs("test") + rendered = [p.name for p in dirs] + assert "test" in rendered + assert "shared" not in rendered + + def test_spec_no_execute(): tools = ROLE_TOOLS["spec"] assert "execute" not in tools From 581211371d1ab2c59069a8e649d3685c60b4074d Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:47:53 +0800 Subject: [PATCH 29/33] feat(worker): slim execution stage context payloads --- platform/app/worker/prompts.py | 40 ++++++++++++++++++++++++++++++---- platform/tests/test_prompts.py | 26 ++++++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/platform/app/worker/prompts.py b/platform/app/worker/prompts.py index 438febf..6766728 100644 --- a/platform/app/worker/prompts.py +++ b/platform/app/worker/prompts.py @@ -4,6 +4,10 @@ from dataclasses import dataclass from 
typing import Dict, List, Optional +_EXECUTION_STAGE_NAMES = {"code", "coding", "test"} +_EXECUTION_REPO_CONTEXT_LIMIT = 900 +_EXECUTION_MEMORY_LIMIT = 700 + # --------------------------------------------------------------------------- # System prompts per agent role @@ -178,6 +182,20 @@ class StageContext: gate_rejection_context: Optional[Dict[str, str]] = None # {"comment": ..., "retry": "2/3"} +def _clip_stage_context(value: Optional[str], *, limit: int, marker: str) -> Optional[str]: + text = (value or "").strip() + if not text: + return None + if len(text) <= limit: + return text + keep_len = max(0, limit - len(marker)) + return text[:keep_len].rstrip() + marker + + +def _is_execution_stage(stage_name: str) -> bool: + return (stage_name or "").strip().lower() in _EXECUTION_STAGE_NAMES + + def build_user_prompt(ctx: StageContext) -> str: """Build the user prompt text for an AgentRunner chat call. @@ -191,13 +209,27 @@ def build_user_prompt(ctx: StageContext) -> str: if ctx.task_description: parts.append(f"\n{ctx.task_description}") + repo_context = ctx.repo_context + project_memory = ctx.project_memory + if _is_execution_stage(ctx.stage_name): + repo_context = _clip_stage_context( + repo_context, + limit=_EXECUTION_REPO_CONTEXT_LIMIT, + marker="...(执行阶段上下文已截断)", + ) + project_memory = _clip_stage_context( + project_memory, + limit=_EXECUTION_MEMORY_LIMIT, + marker="...(执行阶段记忆已截断)", + ) + # Inject repo context (tech stack + directory structure) - if ctx.repo_context: - parts.append(f"\n## 项目代码库信息\n{ctx.repo_context}") + if repo_context: + parts.append(f"\n## 项目代码库信息\n{repo_context}") # Inject project memory from historical tasks - if ctx.project_memory: - parts.append(f"\n## 项目上下文(来自历史任务)\n{ctx.project_memory}") + if project_memory: + parts.append(f"\n## 项目上下文(来自历史任务)\n{project_memory}") if ctx.preflight_summary: parts.append(f"\n## 阶段预扫摘要\n{ctx.preflight_summary}") diff --git a/platform/tests/test_prompts.py b/platform/tests/test_prompts.py index 
44cb262..3f2c08a 100644 --- a/platform/tests/test_prompts.py +++ b/platform/tests/test_prompts.py @@ -77,6 +77,23 @@ def test_with_repo_context(): assert "Python 3.11 / FastAPI" in result +def test_code_stage_clips_large_repo_context(): + repo_context = "STACK\n" + ("src/main/java/demo/File.java\n" * 200) + ctx = _minimal_ctx(stage_name="code", agent_role="coding", repo_context=repo_context) + result = build_user_prompt(ctx) + assert "## 项目代码库信息" in result + assert "...(执行阶段上下文已截断)" in result + assert len(result) < len(repo_context) + 500 + + +def test_spec_stage_keeps_full_repo_context(): + repo_context = "STACK\n" + ("src/main/java/demo/File.java\n" * 40) + ctx = _minimal_ctx(stage_name="spec", agent_role="spec", repo_context=repo_context) + result = build_user_prompt(ctx) + assert "...(执行阶段上下文已截断)" not in result + assert repo_context in result + + def test_without_repo_context(): ctx = _minimal_ctx(repo_context=None) result = build_user_prompt(ctx) @@ -94,6 +111,15 @@ def test_with_project_memory(): assert "Previous task: added auth module." 
in result +def test_test_stage_clips_large_project_memory(): + project_memory = "Memory line\n" * 300 + ctx = _minimal_ctx(stage_name="test", agent_role="test", project_memory=project_memory) + result = build_user_prompt(ctx) + assert "## 项目上下文(来自历史任务)" in result + assert "...(执行阶段记忆已截断)" in result + assert len(result) < len(project_memory) + 500 + + def test_without_project_memory(): ctx = _minimal_ctx(project_memory=None) result = build_user_prompt(ctx) From 43a3962c93379a33a817daff9aedd39ee438e812 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 18:51:00 +0800 Subject: [PATCH 30/33] fix(worker): prefer compact restart continuations --- platform/app/worker/executor.py | 22 +++++++++++++--- platform/tests/test_executor_stage_logs.py | 29 ++++++++++++---------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index 283d8c8..a0d4242 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -239,6 +239,11 @@ def _stage_goal_summary(stage_name: str | None) -> str: return "完成当前阶段的最终结果。" +def _prefer_restart_continuations(stage_name: str | None) -> bool: + normalized = (stage_name or "").strip().lower() + return normalized in {"code", "coding", "test"} + + # --------------------------------------------------------------------------- # Module-level helpers extracted from execute_stage # --------------------------------------------------------------------------- @@ -860,6 +865,7 @@ async def _run_stage_restart( "max_tokens": runtime_overrides.get("max_tokens"), "restart": restart_index, "restart_reason": reason, + "forced_convergence": reason == "forced_convergence", "reset": True, "timeout_seconds": settings.WORKER_STAGE_TIMEOUT, }, @@ -877,6 +883,7 @@ async def _run_stage_restart( response_body={ "restart": restart_index, "restart_reason": reason, + "forced_convergence": reason == "forced_convergence", "content": restart_text, }, 
duration_ms=round((time.monotonic() - chat_started) * 1000, 2), @@ -896,7 +903,12 @@ async def _run_stage_restart( await tracker.emit_chat_received( chat_correlation, status="failed", - response_body={"restart": restart_index, "restart_reason": reason, "error": str(exc)}, + response_body={ + "restart": restart_index, + "restart_reason": reason, + "forced_convergence": reason == "forced_convergence", + "error": str(exc), + }, duration_ms=round((time.monotonic() - chat_started) * 1000, 2), ) return output @@ -977,15 +989,19 @@ async def _handle_continuations( _MAX_CONTINUATIONS = 3 _TRUNCATION_SENTINEL = "Max turns reached" restarts = 0 + effective_stage_name = stage_name or tracker.stage_name + + if restart_context is None and _prefer_restart_continuations(effective_stage_name): + restart_context = {"stage_name": effective_stage_name} if restart_context is None: continuations = 0 - output = await _run_forced_convergence(runner, output, runtime_overrides, tracker, stage_name) + output = await _run_forced_convergence(runner, output, runtime_overrides, tracker, effective_stage_name) while _TRUNCATION_SENTINEL in (output or "") and continuations < _MAX_CONTINUATIONS: continuations += 1 continuation_started = time.monotonic() - prompt = _build_continuation_prompt(stage_name or tracker.stage_name) + prompt = _build_continuation_prompt(effective_stage_name) chat_correlation = await tracker.emit_chat_sent( request_body={ "prompt": prompt, diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index 3c2e07e..2cead39 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -729,9 +729,9 @@ async def test_handle_continuations_uses_coding_specific_prompt(): assert total_tokens == 11 assert output == 'done' - assert runner.prompts == [ - '请停止继续广泛探索。基于已知信息直接修改代码;如果仍缺信息,只允许再查看 1 个最关键文件,然后必须完成修改并给出最小验证结果。' - ] + assert runner.resets == [True] + assert '## 当前阶段\ncode' in 
runner.prompts[0] + assert '请停止继续广泛探索。基于已知信息直接修改代码' in runner.prompts[0] @pytest.mark.asyncio @@ -748,9 +748,9 @@ async def test_handle_continuations_uses_test_specific_prompt(): assert total_tokens == 11 assert output == 'done' - assert runner.prompts == [ - '请停止扩展测试范围。只做最小、最相关的验证;如果验证命令失败,必须直接给出失败命令、关键报错和唯一阻塞点,不要再用代码阅读代替测试结论。' - ] + assert runner.resets == [True] + assert '## 当前阶段\ntest' in runner.prompts[0] + assert '请停止扩展测试范围。只做最小、最相关的验证' in runner.prompts[0] @pytest.mark.asyncio @@ -768,9 +768,8 @@ async def test_handle_continuations_injects_forced_convergence_for_coding_budget assert total_tokens == 11 assert output == 'partial summary\n\ndone' - assert runner.prompts == [ - '你已经在当前阶段花了过多轮次进行探索。现在禁止继续浏览仓库。请直接做最小代码修改,并只执行最小必要验证。如果仍然无法完成,请只输出唯一阻塞点和证据。' - ] + assert runner.resets == [True] + assert '禁止继续浏览仓库' in runner.prompts[0] @pytest.mark.asyncio @@ -788,9 +787,8 @@ async def test_handle_continuations_injects_forced_convergence_for_failed_test_v assert total_tokens == 11 assert output == 'analysis only\n\ndone' - assert runner.prompts == [ - '你已经在当前阶段花了过多轮次进行探索。现在禁止继续扩展测试范围。请直接执行最小、最相关的验证。如果验证命令失败,必须明确给出失败命令、关键报错和唯一阻塞点;不要仅凭代码阅读判断测试通过。' - ] + assert runner.resets == [True] + assert '禁止继续扩展测试范围' in runner.prompts[0] @pytest.mark.asyncio @@ -818,6 +816,9 @@ async def test_handle_continuations_uses_checkpoint_restart_with_reset_true(): assert '## 任务\n**Hello Task**' in runner.prompts[0] assert '## 阶段预扫摘要' in runner.prompts[0] assert '不要重新展开整段历史' in runner.prompts[0] + assert tracker.sent[0]['request_body']['restart'] == 1 + assert tracker.sent[0]['request_body']['restart_reason'] == 'truncation' + assert tracker.sent[0]['request_body']['reset'] is True @pytest.mark.asyncio @@ -843,7 +844,9 @@ async def test_handle_continuations_restarts_from_checkpoint_for_forced_converge assert total_tokens == 11 assert output == 'partial summary\n\nimplemented result' assert runner.resets == [True] - assert 'restart_reason' not in runner.prompts[0] + assert 
tracker.sent[0]['request_body']['restart'] == 1 + assert tracker.sent[0]['request_body']['restart_reason'] == 'forced_convergence' + assert tracker.sent[0]['request_body']['forced_convergence'] is True def test_resolve_stage_max_turns_caps_coding_and_test(): From 7763091f1dc187c62932c06de81379810f23d8f2 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 19:08:39 +0800 Subject: [PATCH 31/33] refactor(worker): aggressively slim execution context --- platform/app/worker/agents.py | 8 +++--- platform/app/worker/executor.py | 4 +-- platform/app/worker/prompts.py | 16 ++++++----- platform/tests/test_agents.py | 10 ++++--- platform/tests/test_executor_stage_logs.py | 32 ++++++++++++++++++++++ platform/tests/test_prompts.py | 12 ++++++++ 6 files changed, 65 insertions(+), 17 deletions(-) diff --git a/platform/app/worker/agents.py b/platform/app/worker/agents.py index 198a97a..83baf5e 100644 --- a/platform/app/worker/agents.py +++ b/platform/app/worker/agents.py @@ -40,8 +40,8 @@ ROLE_TOOLS: dict[str, set[str]] = { "orchestrator": {"read", "execute", "skill"}, "spec": {"read", "write", "edit", "skill"}, - "coding": {"read", "write", "edit", "execute", "execute_script", "skill"}, - "test": {"read", "write", "edit", "execute", "execute_script", "skill"}, + "coding": {"read", "write", "edit", "execute", "execute_script"}, + "test": {"read", "write", "edit", "execute", "execute_script"}, "review": {"read", "execute", "skill"}, "smoke": {"read", "execute", "skill"}, "doc": {"read", "write", "edit", "skill"}, @@ -57,8 +57,8 @@ _ROLE_SKILL_DIRS: dict[str, list[str]] = { "orchestrator": ["shared", "orchestrator"], "spec": ["shared", "spec"], - "coding": ["coding"], - "test": ["test"], + "coding": [], + "test": [], "review": ["shared", "review"], "smoke": ["shared", "smoke"], "doc": ["shared", "doc"], diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index a0d4242..626edbf 100644 --- a/platform/app/worker/executor.py +++ 
b/platform/app/worker/executor.py @@ -294,7 +294,7 @@ def _clear_current_task_cancellation_state() -> None: "go test", "cargo test", ) -_RESTART_OUTPUT_CHARS = 1500 +_RESTART_OUTPUT_CHARS = 700 def _classify_tool_activity(tool_name: str, args: dict[str, Any]) -> str: @@ -817,7 +817,7 @@ def _build_stage_restart_prompt( stage_name = str(context.get("stage_name") or tracker.stage_name).strip() preflight_summary = str(context.get("preflight_summary") or "").strip() partial_output = _clip_text((output or "").replace("[Max turns reached. Please continue the conversation.]", "").strip(), _RESTART_OUTPUT_CHARS) - tool_digest = _format_tool_digest(tracker.get_completed_tool_runs(), limit=4) + tool_digest = _format_tool_digest(tracker.get_completed_tool_runs(), limit=2) action_prompt = ( _build_forced_convergence_prompt(stage_name) if reason == "forced_convergence" diff --git a/platform/app/worker/prompts.py b/platform/app/worker/prompts.py index 6766728..2cde7ea 100644 --- a/platform/app/worker/prompts.py +++ b/platform/app/worker/prompts.py @@ -5,8 +5,7 @@ from typing import Dict, List, Optional _EXECUTION_STAGE_NAMES = {"code", "coding", "test"} -_EXECUTION_REPO_CONTEXT_LIMIT = 900 -_EXECUTION_MEMORY_LIMIT = 700 +_EXECUTION_MEMORY_LIMIT = 320 # --------------------------------------------------------------------------- @@ -212,11 +211,14 @@ def build_user_prompt(ctx: StageContext) -> str: repo_context = ctx.repo_context project_memory = ctx.project_memory if _is_execution_stage(ctx.stage_name): - repo_context = _clip_stage_context( - repo_context, - limit=_EXECUTION_REPO_CONTEXT_LIMIT, - marker="...(执行阶段上下文已截断)", - ) + if ctx.preflight_summary: + repo_context = None + else: + repo_context = _clip_stage_context( + repo_context, + limit=320, + marker="...(执行阶段上下文已截断)", + ) project_memory = _clip_stage_context( project_memory, limit=_EXECUTION_MEMORY_LIMIT, diff --git a/platform/tests/test_agents.py b/platform/tests/test_agents.py index df5b092..7b7df35 100644 --- 
a/platform/tests/test_agents.py +++ b/platform/tests/test_agents.py @@ -14,24 +14,26 @@ def test_role_tools_all_valid(): def test_coding_has_core_tools(): - assert {"read", "write", "edit", "execute", "execute_script", "skill"}.issubset(ROLE_TOOLS["coding"]) + assert {"read", "write", "edit", "execute", "execute_script"}.issubset(ROLE_TOOLS["coding"]) + assert "skill" not in ROLE_TOOLS["coding"] def test_test_has_core_tools(): - assert {"read", "write", "edit", "execute", "execute_script", "skill"}.issubset(ROLE_TOOLS["test"]) + assert {"read", "write", "edit", "execute", "execute_script"}.issubset(ROLE_TOOLS["test"]) + assert "skill" not in ROLE_TOOLS["test"] def test_coding_skill_dirs_exclude_shared_by_default(): dirs = agents_mod._get_skill_dirs("coding") rendered = [p.name for p in dirs] - assert "coding" in rendered + assert rendered == [] assert "shared" not in rendered def test_test_skill_dirs_exclude_shared_by_default(): dirs = agents_mod._get_skill_dirs("test") rendered = [p.name for p in dirs] - assert "test" in rendered + assert rendered == [] assert "shared" not in rendered diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index 2cead39..2a2cf94 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -189,6 +189,18 @@ def get_completed_tool_runs(self): return [] +class _DigestTracker(_ContinuationTracker): + def __init__(self, stage_name: str, agent_role: str = 'coding') -> None: + super().__init__(stage_name=stage_name, agent_role=agent_role) + self._items = [ + {"status": "success", "command": f"cmd-{i}", "result_preview": f"preview-{i}"} + for i in range(4) + ] + + def get_completed_tool_runs(self): + return self._items + + class _CancelledRunner(_FakeRunner): async def chat(self, _prompt: str, reset: bool = True, **_: object): await self.events.emit( @@ -849,6 +861,26 @@ async def 
test_handle_continuations_restarts_from_checkpoint_for_forced_converge assert tracker.sent[0]['request_body']['forced_convergence'] is True +def test_build_stage_restart_prompt_limits_tool_digest_items(): + tracker = _DigestTracker(stage_name='code', agent_role='coding') + prompt = executor._build_stage_restart_prompt( + { + 'task_title': 'Hello Task', + 'task_description': 'Implement hello endpoint', + 'stage_name': 'code', + 'preflight_summary': '- 构建文件: build.gradle', + }, + tracker, + 'partial output', + reason='truncation', + ) + + assert 'cmd-3' in prompt + assert 'cmd-2' in prompt + assert 'cmd-1' not in prompt + assert 'cmd-0' not in prompt + + def test_resolve_stage_max_turns_caps_coding_and_test(): assert executor._resolve_stage_max_turns('coding', None) == 6 assert executor._resolve_stage_max_turns('coding', 18) == 6 diff --git a/platform/tests/test_prompts.py b/platform/tests/test_prompts.py index 3f2c08a..91915db 100644 --- a/platform/tests/test_prompts.py +++ b/platform/tests/test_prompts.py @@ -94,6 +94,18 @@ def test_spec_stage_keeps_full_repo_context(): assert repo_context in result +def test_code_stage_omits_repo_context_when_preflight_present(): + ctx = _minimal_ctx( + stage_name="code", + agent_role="coding", + repo_context="STACK\nsrc/main/java/demo/File.java", + preflight_summary="- 构建文件: build.gradle", + ) + result = build_user_prompt(ctx) + assert "## 项目代码库信息" not in result + assert "## 阶段预扫摘要" in result + + def test_without_repo_context(): ctx = _minimal_ctx(repo_context=None) result = build_user_prompt(ctx) From 63719c36409dfda636769e0632ab1a025e88eb9d Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" Date: Thu, 19 Mar 2026 21:29:54 +0800 Subject: [PATCH 32/33] feat(worker): tighten execution context and preflight summaries --- platform/app/worker/compressor.py | 2 +- platform/app/worker/engine.py | 87 +++++++++++-- platform/app/worker/executor.py | 11 +- platform/app/worker/prompts.py | 145 ++++++++++++++++++++- 
platform/tests/test_compressor.py | 16 +++ platform/tests/test_executor_stage_logs.py | 8 ++ platform/tests/test_prompts.py | 37 +++++- platform/tests/test_worker.py | 38 +++++- 8 files changed, 320 insertions(+), 24 deletions(-) diff --git a/platform/app/worker/compressor.py b/platform/app/worker/compressor.py index c641049..31601a7 100644 --- a/platform/app/worker/compressor.py +++ b/platform/app/worker/compressor.py @@ -17,7 +17,7 @@ # Fallback truncation limits when LLM is unavailable _L0_FALLBACK_CHARS = 200 _L1_FALLBACK_CHARS = 1500 -_L2_MAX_CHARS = 20_000 # Hard cap on full-text prior output to prevent token explosion +_L2_MAX_CHARS = 4_000 # Hard cap on full-text prior output to prevent token explosion @dataclass diff --git a/platform/app/worker/engine.py b/platform/app/worker/engine.py index d74fefd..515ae0f 100644 --- a/platform/app/worker/engine.py +++ b/platform/app/worker/engine.py @@ -64,7 +64,7 @@ } _PREFLIGHT_MAX_FILES = 2000 _PREFLIGHT_MAX_DEPTH = 6 -_PREFLIGHT_MAX_CHARS = 1200 +_PREFLIGHT_MAX_CHARS = 600 async def _safe_broadcast(event: str, data: dict) -> None: @@ -2863,6 +2863,68 @@ def _format_preflight_section(title: str, items: list[str], *, limit: int = 4) - return f"- {title}: {', '.join(unique)}" +def _rank_preflight_path(rel_path: str, *, kind: str) -> tuple[int, int, int, str]: + lowered = rel_path.lower() + score = 0 + + if kind == "impl": + if "src/main/" in lowered: + score += 4 + if any(token in lowered for token in ("/controller/", "/handler/", "/service/", "/api/", "response")): + score += 5 + if "/src/test/" in lowered or lowered.startswith("src/test/"): + score -= 6 + elif kind == "test": + if any(token in lowered for token in ("/controller/", "/api/", "controller", "api")): + score += 5 + if lowered.endswith("test.java") or lowered.endswith("tests.java") or lowered.endswith("_test.go"): + score += 3 + if any(token in lowered for token in ("basetest", "sdk/", "mybatisgenerator", "mapper")): + score -= 4 + + return (-score, 
len(rel_path.split("/")), len(rel_path), rel_path) + + +def _pick_preflight_paths(items: list[str], *, kind: str, limit: int) -> list[str]: + unique = list(dict.fromkeys(items)) + return sorted(unique, key=lambda value: _rank_preflight_path(value, kind=kind))[:limit] + + +def _infer_validation_command(build_files: list[str]) -> str: + lowered = {item.lower() for item in build_files} + if "build.gradle" in lowered or "build.gradle.kts" in lowered: + return "./gradlew test" + if "pom.xml" in lowered: + return "./mvnw test" + if "package.json" in lowered: + return "npm test" + if "pyproject.toml" in lowered: + return "pytest" + if "go.mod" in lowered: + return "go test ./..." + if "cargo.toml" in lowered: + return "cargo test" + return "优先执行最小相关验证命令" + + +def _infer_coding_edit_target(source_roots: list[str], impl_examples: list[str]) -> str: + if impl_examples: + first = impl_examples[0] + parent = str(Path(first).parent).replace("\\", "/") + return parent if parent and parent != "." else first + if source_roots: + return source_roots[0] + return "优先在现有 controller/service 相邻目录做最小修改" + + +def _infer_test_target(test_examples: list[str], impl_examples: list[str]) -> str: + if test_examples: + return test_examples[0] + if impl_examples: + return impl_examples[0] + return "优先补充与当前改动直接相关的最小测试" + + def _build_stage_preflight_summary(stage_name: str, workspace_path: Optional[str]) -> Optional[str]: normalized = (stage_name or "").strip().lower() if normalized not in {"code", "coding", "test"}: @@ -2916,18 +2978,27 @@ def _build_stage_preflight_summary(stage_name: str, workspace_path: Optional[str ): test_examples.append(rel) + build_files = list(dict.fromkeys(build_files)) + source_roots = list(dict.fromkeys(source_roots)) + impl_examples = _pick_preflight_paths(impl_examples, kind="impl", limit=3) + test_examples = _pick_preflight_paths(test_examples, kind="test", limit=3) + validation_command = _infer_validation_command(build_files) + lines = [] if normalized in {"code", 
"coding"}: - lines.append(_format_preflight_section("构建文件", build_files, limit=3)) - lines.append(_format_preflight_section("源码目录", source_roots, limit=3)) - lines.append(_format_preflight_section("实现参考", impl_examples, limit=4)) - lines.append(_format_preflight_section("测试参考", test_examples, limit=3)) + lines.append(_format_preflight_section("构建入口", build_files, limit=2)) + lines.append(f"- 推荐修改落点: {_infer_coding_edit_target(source_roots, impl_examples)}") + lines.append(_format_preflight_section("最相关实现参考", impl_examples, limit=2)) + lines.append(_format_preflight_section("最相关测试参考", test_examples, limit=2)) + lines.append(f"- 推荐最小验证命令: {validation_command}") if not any(lines): lines.append("- 未发现明显的实现参考,请直接聚焦最小修改并谨慎验证。") else: - lines.append(_format_preflight_section("构建文件", build_files, limit=3)) - lines.append(_format_preflight_section("测试参考", test_examples, limit=5)) - lines.append(_format_preflight_section("实现参考", impl_examples, limit=3)) + lines.append(_format_preflight_section("构建入口", build_files, limit=2)) + lines.append(f"- 推荐验证落点: {_infer_test_target(test_examples, impl_examples)}") + lines.append(_format_preflight_section("最相关测试参考", test_examples, limit=2)) + lines.append(_format_preflight_section("对应实现参考", impl_examples, limit=2)) + lines.append(f"- 推荐最小验证命令: {validation_command}") if not any(lines): lines.append("- 未发现明显测试样例,请优先选择最小、最快的验证路径。") diff --git a/platform/app/worker/executor.py b/platform/app/worker/executor.py index 626edbf..671c8e5 100644 --- a/platform/app/worker/executor.py +++ b/platform/app/worker/executor.py @@ -192,7 +192,16 @@ def _is_signoff_stage(stage_name: str) -> bool: def _output_summary_limit(stage_name: str) -> int: # Cap stage output stored in DB to limit downstream prior-context injection. 
- return 50_000 + normalized = (stage_name or "").strip().lower() + if normalized == "parse": + return 600 + if normalized in {"code", "coding", "test"}: + return 1200 + if _is_signoff_stage(normalized): + return 1500 + if normalized in {"spec", "approve", "review", "doc"}: + return 1800 + return 1500 def _format_tool_digest(tool_items: list[dict[str, str]], limit: int = 6) -> str: diff --git a/platform/app/worker/prompts.py b/platform/app/worker/prompts.py index 2cde7ea..9162177 100644 --- a/platform/app/worker/prompts.py +++ b/platform/app/worker/prompts.py @@ -1,11 +1,26 @@ """Role-based system prompts and stage instruction templates for Agent Worker.""" from __future__ import annotations +import re from dataclasses import dataclass from typing import Dict, List, Optional _EXECUTION_STAGE_NAMES = {"code", "coding", "test"} _EXECUTION_MEMORY_LIMIT = 320 +_EXECUTION_REPO_HINT_LIMIT = 720 +_EXECUTION_PRIOR_LIMITS = { + "parse": 520, + "approve": 520, + "spec": 720, + "review": 720, + "doc": 720, + "code": 960, + "coding": 960, + "test": 960, + "signoff": 960, +} +_EXECUTION_PRIOR_MARKER = "\n...(前序阶段产出已截断)" +_REPO_SECTION_PATTERN = re.compile(r"^###\s+(?P<title>[^\n]+)\n", re.MULTILINE) # --------------------------------------------------------------------------- @@ -195,6 +210,128 @@ def _is_execution_stage(stage_name: str) -> bool: return (stage_name or "").strip().lower() in _EXECUTION_STAGE_NAMES +def _extract_repo_section(repo_context: str, title: str) -> str: + text = (repo_context or "").strip() + if not text: + return "" + matches = list(_REPO_SECTION_PATTERN.finditer(text)) + for index, match in enumerate(matches): + if match.group("title").strip() != title: + continue + start = match.end() + end = matches[index + 1].start() if index + 1 < len(matches) else len(text) + return text[start:end].strip() + return "" + + +def _collect_tree_matches( + tree_lines: list[str], + *, + predicates: tuple[str, ...], + limit: int, + require_file: bool = False, +) -> list[str]: 
+ matches: list[str] = [] + seen: set[str] = set() + for raw in tree_lines: + line = raw.strip() + lowered = line.lower() + if not line or line.startswith("...(目录树已截断)"): + continue + if require_file and "." not in line.rsplit("/", 1)[-1]: + continue + if not any(token in lowered for token in predicates): + continue + if line in seen: + continue + seen.add(line) + matches.append(line) + if len(matches) >= limit: + break + return matches + + +def _build_execution_repo_hint(repo_context: Optional[str]) -> Optional[str]: + text = (repo_context or "").strip() + if not text: + return None + + tech_stack = _extract_repo_section(text, "技术栈") + repo_tree = _extract_repo_section(text, "目录结构") + repo_tree_lines = [line for line in repo_tree.splitlines() if line.strip()] + + build_files = _collect_tree_matches( + repo_tree_lines, + predicates=( + "build.gradle", + "build.gradle.kts", + "pom.xml", + "package.json", + "pyproject.toml", + "go.mod", + "cargo.toml", + ), + limit=3, + require_file=True, + ) + source_roots = _collect_tree_matches( + repo_tree_lines, + predicates=("src/main", "app/", "app\\", "server/", "lib/", "internal/"), + limit=3, + ) + test_roots = _collect_tree_matches( + repo_tree_lines, + predicates=("src/test", "tests/", "__tests__", "spec/"), + limit=3, + ) + impl_refs = _collect_tree_matches( + repo_tree_lines, + predicates=("controller", "handler", "service", "api", "route", "response"), + limit=2, + require_file=True, + ) + + parts: list[str] = [] + if tech_stack: + parts.append(f"- 技术栈: {tech_stack[:180].strip()}") + if build_files: + parts.append(f"- 构建入口: {', '.join(build_files)}") + if source_roots: + parts.append(f"- 源码目录: {', '.join(source_roots)}") + if test_roots: + parts.append(f"- 测试目录: {', '.join(test_roots)}") + if impl_refs: + parts.append(f"- 参考实现: {', '.join(impl_refs)}") + + if not parts: + return _clip_stage_context( + text, + limit=_EXECUTION_REPO_HINT_LIMIT, + marker="...(执行阶段仓库信息已截断)", + ) + + return _clip_stage_context( + 
"\n".join(parts), + limit=_EXECUTION_REPO_HINT_LIMIT, + marker="...(执行阶段仓库信息已截断)", + ) + + +def _clip_execution_prior_outputs(prior: List[Dict[str, str]]) -> List[Dict[str, str]]: + clipped: List[Dict[str, str]] = [] + for item in prior: + stage = str(item.get("stage") or "").strip() + output = str(item.get("output") or "") + limit = _EXECUTION_PRIOR_LIMITS.get(stage.lower(), 720) + clipped_output = _clip_stage_context( + output, + limit=limit, + marker=_EXECUTION_PRIOR_MARKER, + ) or "" + clipped.append({"stage": stage, "output": clipped_output}) + return clipped + + def build_user_prompt(ctx: StageContext) -> str: """Build the user prompt text for an AgentRunner chat call. @@ -214,11 +351,7 @@ def build_user_prompt(ctx: StageContext) -> str: if ctx.preflight_summary: repo_context = None else: - repo_context = _clip_stage_context( - repo_context, - limit=320, - marker="...(执行阶段上下文已截断)", - ) + repo_context = _build_execution_repo_hint(repo_context) project_memory = _clip_stage_context( project_memory, limit=_EXECUTION_MEMORY_LIMIT, @@ -238,6 +371,8 @@ def build_user_prompt(ctx: StageContext) -> str: # Use compressed outputs (sliding-window) when available, otherwise raw prior = ctx.compressed_outputs if ctx.compressed_outputs is not None else ctx.prior_outputs + if prior and _is_execution_stage(ctx.stage_name): + prior = _clip_execution_prior_outputs(prior) if prior: parts.append("\n## 前序阶段产出") for po in prior: diff --git a/platform/tests/test_compressor.py b/platform/tests/test_compressor.py index 05ef85d..a4046cb 100644 --- a/platform/tests/test_compressor.py +++ b/platform/tests/test_compressor.py @@ -56,6 +56,22 @@ def test_compression_result_sliding_window(): assert ctx[3]["output"] == "l2_3_full_content" +def test_compression_result_caps_immediate_prior_l2(): + cr = CompressionResult() + cr.add( + CompressedOutput( + stage_name="parse", + l0="short", + l1="brief", + l2="x" * 10_000, + ) + ) + + ctx = cr.build_prior_context(1) + assert 
ctx[0]["output"].endswith("...(输出已截断)") + assert len(ctx[0]["output"]) < 10_000 + + @pytest.mark.asyncio async def test_compress_stage_output_fallback(): """When compression is disabled, should use fallback.""" diff --git a/platform/tests/test_executor_stage_logs.py b/platform/tests/test_executor_stage_logs.py index 2a2cf94..8b9ae2d 100644 --- a/platform/tests/test_executor_stage_logs.py +++ b/platform/tests/test_executor_stage_logs.py @@ -260,6 +260,14 @@ async def execute_stage(self, info, **kwargs): return self._result +def test_output_summary_limit_is_stage_specific(): + assert executor._output_summary_limit('parse') <= 600 + assert executor._output_summary_limit('code') <= 1200 + assert executor._output_summary_limit('test') <= 1200 + assert executor._output_summary_limit('signoff') <= 1600 + assert executor._output_summary_limit('spec') > executor._output_summary_limit('parse') + + def test_is_tool_call_error_matches_gemini_thought_signature_error(): err = RuntimeError( "Error code: 400 - [{'error': {'code': 400, 'message': " diff --git a/platform/tests/test_prompts.py b/platform/tests/test_prompts.py index 91915db..c8f9ca4 100644 --- a/platform/tests/test_prompts.py +++ b/platform/tests/test_prompts.py @@ -78,12 +78,24 @@ def test_with_repo_context(): def test_code_stage_clips_large_repo_context(): - repo_context = "STACK\n" + ("src/main/java/demo/File.java\n" * 200) + repo_context = ( + "### 技术栈\nJava 17, Spring Boot, Gradle\n\n" + "### 目录结构\n" + "build.gradle\n" + "src/main/java/demo/controller/HelloController.java\n" + "src/main/java/demo/service/HelloService.java\n" + "src/test/java/demo/controller/HelloControllerTest.java\n" + "docs/design.md\n" + ) ctx = _minimal_ctx(stage_name="code", agent_role="coding", repo_context=repo_context) result = build_user_prompt(ctx) assert "## 项目代码库信息" in result - assert "...(执行阶段上下文已截断)" in result - assert len(result) < len(repo_context) + 500 + assert "- 技术栈: Java 17, Spring Boot, Gradle" in result + assert "- 构建入口: 
build.gradle" in result + assert "- 源码目录:" in result + assert "- 测试目录:" in result + assert "- 参考实现:" in result + assert "### 目录结构" not in result def test_spec_stage_keeps_full_repo_context(): @@ -169,6 +181,25 @@ def test_with_prior_outputs_raw(): assert "Spec document:" in result +def test_execution_stage_clips_prior_outputs_aggressively(): + long_parse = "需求分析\n" + ("parse-line\n" * 200) + long_spec = "技术方案\n" + ("spec-line\n" * 200) + ctx = _minimal_ctx( + stage_name="code", + agent_role="coding", + prior_outputs=[ + {"stage": "parse", "output": long_parse}, + {"stage": "spec", "output": long_spec}, + ], + ) + result = build_user_prompt(ctx) + assert "## 前序阶段产出" in result + assert "...(前序阶段产出已截断)" in result + assert "parse-line\nparse-line\nparse-line" in result + assert result.count("parse-line") < 80 + assert result.count("spec-line") < 100 + + def test_with_empty_prior_outputs(): ctx = _minimal_ctx(prior_outputs=[]) result = build_user_prompt(ctx) diff --git a/platform/tests/test_worker.py b/platform/tests/test_worker.py index 9353ffe..6cc064a 100644 --- a/platform/tests/test_worker.py +++ b/platform/tests/test_worker.py @@ -118,31 +118,57 @@ class TestBuildStagePreflightSummary: def test_build_stage_preflight_summary_for_coding(self, tmp_path: Path): (tmp_path / "build.gradle").write_text("plugins {}", encoding="utf-8") (tmp_path / "src/main/java/demo/controller").mkdir(parents=True) + (tmp_path / "src/main/java/demo/service").mkdir(parents=True) (tmp_path / "src/test/java/demo/controller").mkdir(parents=True) (tmp_path / "src/main/java/demo/controller/HelloController.java").write_text("class X {}", encoding="utf-8") + (tmp_path / "src/main/java/demo/service/HelloService.java").write_text("class S {}", encoding="utf-8") (tmp_path / "src/test/java/demo/controller/HelloControllerTest.java").write_text("class T {}", encoding="utf-8") result = _build_stage_preflight_summary("coding", str(tmp_path)) assert result is not None - assert "构建文件" in result - assert 
"实现参考" in result - assert "测试参考" in result + assert "构建入口" in result + assert "推荐修改落点" in result + assert "最相关实现参考" in result + assert "最相关测试参考" in result + assert "推荐最小验证命令" in result assert "HelloController.java" in result + assert "./gradlew test" in result def test_build_stage_preflight_summary_for_test(self, tmp_path: Path): (tmp_path / "pom.xml").write_text("<project/>", encoding="utf-8") (tmp_path / "src/test/java/demo").mkdir(parents=True) + (tmp_path / "src/test/java/demo/controller").mkdir(parents=True) (tmp_path / "src/main/java/demo/service").mkdir(parents=True) + (tmp_path / "src/main/java/demo/controller").mkdir(parents=True) (tmp_path / "src/test/java/demo/DemoServiceTest.java").write_text("class T {}", encoding="utf-8") + (tmp_path / "src/test/java/demo/controller/HelloControllerTest.java").write_text("class HC {}", encoding="utf-8") (tmp_path / "src/main/java/demo/service/DemoService.java").write_text("class S {}", encoding="utf-8") + (tmp_path / "src/main/java/demo/controller/HelloController.java").write_text("class C {}", encoding="utf-8") + + result = _build_stage_preflight_summary("test", str(tmp_path)) + + assert result is not None + assert "构建入口" in result + assert "推荐验证落点" in result + assert "最相关测试参考" in result + assert "对应实现参考" in result + assert "推荐最小验证命令: ./mvnw test" in result + assert "HelloControllerTest.java" in result + + def test_build_stage_preflight_summary_prioritizes_controller_tests(self, tmp_path: Path): + (tmp_path / "build.gradle").write_text("plugins {}", encoding="utf-8") + (tmp_path / "src/test/java/demo/sdk").mkdir(parents=True) + (tmp_path / "src/test/java/demo/controller").mkdir(parents=True) + (tmp_path / "src/main/java/demo/controller").mkdir(parents=True) + (tmp_path / "src/test/java/demo/sdk/TaobaoApiTest.java").write_text("class T {}", encoding="utf-8") + (tmp_path / "src/test/java/demo/controller/HelloControllerTest.java").write_text("class C {}", encoding="utf-8") + (tmp_path / 
"src/main/java/demo/controller/HelloController.java").write_text("class X {}", encoding="utf-8") result = _build_stage_preflight_summary("test", str(tmp_path)) assert result is not None - assert "构建文件" in result - assert "测试参考" in result - assert "DemoServiceTest.java" in result + assert result.index("HelloControllerTest.java") < result.index("TaobaoApiTest.java") def test_build_stage_preflight_summary_ignores_other_stages(self, tmp_path: Path): assert _build_stage_preflight_summary("signoff", str(tmp_path)) is None From 30fd109ba72a7068171df9ea54d9bea6b7722ea4 Mon Sep 17 00:00:00 2001 From: "Johnny.Wang" <Johnny.Wang@starbucks.cn> Date: Thu, 19 Mar 2026 21:40:45 +0800 Subject: [PATCH 33/33] chore: drop plan docs from branch --- ...03-19-coding-testing-convergence-design.md | 154 --------- ...ding-testing-convergence-implementation.md | 224 ------------- ...9-exploration-budget-convergence-design.md | 133 -------- ...ation-budget-convergence-implementation.md | 103 ------ .../2026-03-19-java-gradle-sandbox-design.md | 169 ---------- ...3-19-java-gradle-sandbox-implementation.md | 308 ----------------- ...-03-19-stage-reset-and-preflight-design.md | 188 ----------- ...tage-reset-and-preflight-implementation.md | 314 ------------------ ...tatic-context-token-optimization-design.md | 207 ------------ ...ntext-token-optimization-implementation.md | 308 ----------------- 10 files changed, 2108 deletions(-) delete mode 100644 docs/plans/2026-03-19-coding-testing-convergence-design.md delete mode 100644 docs/plans/2026-03-19-coding-testing-convergence-implementation.md delete mode 100644 docs/plans/2026-03-19-exploration-budget-convergence-design.md delete mode 100644 docs/plans/2026-03-19-exploration-budget-convergence-implementation.md delete mode 100644 docs/plans/2026-03-19-java-gradle-sandbox-design.md delete mode 100644 docs/plans/2026-03-19-java-gradle-sandbox-implementation.md delete mode 100644 docs/plans/2026-03-19-stage-reset-and-preflight-design.md delete mode 
100644 docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md delete mode 100644 docs/plans/2026-03-19-static-context-token-optimization-design.md delete mode 100644 docs/plans/2026-03-19-static-context-token-optimization-implementation.md diff --git a/docs/plans/2026-03-19-coding-testing-convergence-design.md b/docs/plans/2026-03-19-coding-testing-convergence-design.md deleted file mode 100644 index cdf6889..0000000 --- a/docs/plans/2026-03-19-coding-testing-convergence-design.md +++ /dev/null @@ -1,154 +0,0 @@ -# Coding And Testing Convergence Design - -**Date:** 2026-03-19 - -**Problem** - -Simple tasks are spending too many turns in the `coding` and `test` stages. The current behavior allows broad repository exploration, and when the model hits the turn limit the continuation prompt is too weak to force the stage back toward concrete actions. In practice this leads to repeated `read/ls/find` calls, continuation loops, and slow delivery even when the requested change is small. - -**Goal** - -Improve convergence for all `coding` and `test` stage executions with the smallest possible execution-chain change. The design should avoid changes to stage orchestration, model routing, sandbox selection, or task lifecycle. The work should only tighten stage instructions and continuation behavior. - -**Non-Goals** - -- Do not change task templates or stage topology. -- Do not add new execution modes or sandbox backends. -- Do not introduce repository-specific heuristics such as “simple task mode”. -- Do not add exploration budgets, tool counters, or runtime interruption logic in this iteration. -- Do not change non-`coding` / non-`test` stages unless required for shared helper plumbing. - -**Current State** - -- Stage prompts are built in [prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py). -- In-process continuation handling lives in [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py). 
-- `coding` and `test` already have stage guardrails, but they focus on stage boundaries rather than convergence. -- Continuation prompts currently say only “请继续完成上面的输出,从你停下的地方继续。”, which encourages more prose instead of concrete progress. - -**Constraints** - -- Keep the implementation localized to prompt and continuation layers. -- Preserve compatibility with existing event logging and continuation flow. -- Keep Chinese prompt style consistent with the rest of the worker prompts. -- Avoid changing default max-turn behavior in this iteration. - -**Approach Options** - -1. Guardrails only -Add stricter `coding` and `test` guardrails in `prompts.py`. - -Pros: -- Smallest code diff. -- No helper changes. - -Cons: -- Only affects the first prompt. -- Does not solve continuation loops directly. - -2. Guardrails plus stage-specific continuation prompts -Strengthen `coding` and `test` stage guardrails and replace the generic continuation prompt with stage-specific convergence prompts. - -Pros: -- Directly addresses the observed failure mode. -- Still limited to prompt-generation and continuation code. -- No model or orchestration changes. - -Cons: -- Slightly larger change surface than guardrails alone. - -3. Guardrails plus continuation prompts plus lower max turns -Do option 2 and also reduce `coding` / `test` max-turn ceilings. - -Pros: -- More aggressive convergence. - -Cons: -- Higher risk of hurting legitimate longer tasks. -- Harder to tune safely without broader validation. - -**Recommendation** - -Choose option 2. - -It is the best fit for the stated goal: stronger convergence for all `coding` and `test` stages with minimal execution-chain change. It improves both the initial stage instruction and the continuation loop without changing orchestration or routing. - -**Design** - -### 1. 
Tighten `coding` stage guardrail - -Update the `code` entry in `STAGE_GUARDRAILS` so that it explicitly instructs the agent to: - -- avoid broad repository exploration; -- use already available context first; -- read more files only when a missing detail blocks implementation; -- move quickly to concrete file edits; -- run the smallest necessary validation; -- stop after implementation and a brief summary. - -The guardrail should discourage “understand the whole repo first” behavior and push the agent toward the minimum set of reads needed to safely modify code. - -### 2. Tighten `test` stage guardrail - -Update the `test` entry in `STAGE_GUARDRAILS` so that it explicitly instructs the agent to: - -- focus only on validation directly tied to the current change; -- prefer the fastest relevant verification path; -- stop once acceptance is sufficiently proven; -- avoid expanding into smoke/E2E/performance work unless explicitly requested; -- report the concrete blocker if validation cannot proceed. - -This keeps the `test` stage from growing into an open-ended general validation phase. - -### 3. Replace generic continuation prompts with stage-specific convergence prompts - -Update `_handle_continuations()` in `executor.py` so that: - -- `coding` continuations tell the agent to stop broad exploration, use the information already gathered, and produce concrete edits or a single evidenced blocker; -- `test` continuations tell the agent to stop expanding coverage, run the smallest relevant validation, and return concrete results or a blocker; -- other stages can keep the generic continuation wording, or use a neutral fallback prompt. - -The continuation prompt should be action-oriented. Its purpose is not to continue reasoning indefinitely; it is to force the stage back toward a terminating action. - -**Data Flow** - -1. `build_user_prompt()` continues to assemble the stage prompt as today. -2. `coding` / `test` guardrails now embed convergence-specific instructions. -3. 
If the model returns the truncation sentinel, `_handle_continuations()` chooses a stage-aware prompt. -4. The runner continues with `reset=False`, but the continuation now carries explicit instructions to finish implementation or verification instead of continuing exploratory dialogue. - -**Error Handling** - -- Existing continuation retry and logging behavior remains unchanged. -- If a continuation still fails or times out, the current stage error handling path stays in place. -- No changes are needed to fallback logic, task logs schema, or retry scheduling. - -**Testing Strategy** - -Add focused tests only. - -- Prompt tests in [test_prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py): - - verify `coding` guardrail includes the new convergence instructions; - - verify `test` guardrail includes the new minimal-validation instructions. -- Executor tests in [test_executor_stage_logs.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py) or a nearby executor-focused test module: - - verify `coding` continuation uses the stage-specific prompt; - - verify `test` continuation uses the stage-specific prompt; - - verify non-target stages still use the generic fallback prompt. - -**Risks** - -- Over-constraining `coding` may reduce necessary repo discovery for legitimately complex changes. -- Over-constraining `test` may cause the agent to stop too early if prompts are too absolute. -- Prompt wording that is too long may dilute the core instruction. - -**Mitigations** - -- Phrase the new guardrails as “use available context first” rather than “never explore”. -- Allow one additional critical file lookup in continuation prompts when truly needed. -- Leave max-turn settings unchanged for now to isolate the impact of prompt changes. - -**Success Criteria** - -- `coding` stage continuations stop looping on generic prose and push toward edits or a blocker. 
-- `test` stage continuations stop expanding test scope after sufficient evidence is available. -- The change is limited to prompt and continuation logic. -- Existing worker flow, logging, and task lifecycle remain unchanged. diff --git a/docs/plans/2026-03-19-coding-testing-convergence-implementation.md b/docs/plans/2026-03-19-coding-testing-convergence-implementation.md deleted file mode 100644 index 0436482..0000000 --- a/docs/plans/2026-03-19-coding-testing-convergence-implementation.md +++ /dev/null @@ -1,224 +0,0 @@ -# Coding And Testing Convergence Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Tighten `coding` and `test` stage convergence by strengthening stage guardrails and using stage-specific continuation prompts. - -**Architecture:** Keep the execution chain unchanged and localize the work to prompt assembly and continuation handling. `prompts.py` will define stronger convergence instructions for `coding` and `test`, and `executor.py` will choose stage-aware continuation prompts when the runner hits the truncation sentinel. - -**Tech Stack:** Python, pytest, async worker executor, stage prompt generation - ---- - -### Task 1: Strengthen stage guardrails in prompt generation - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` - -**Step 1: Write the failing tests** - -Add prompt assertions that prove: - -- the `code` guardrail tells the agent to avoid broad repo exploration and move toward concrete edits plus minimal validation; -- the `test` guardrail tells the agent to use the smallest relevant verification path and stop once evidence is sufficient. 
- -Example assertions: - -```python -def test_code_prompt_emphasizes_convergence(): - ctx = _minimal_ctx(stage_name="code") - result = build_user_prompt(ctx) - assert "不要为了理解整个仓库而广泛探索" in result - assert "最小必要验证" in result - - -def test_test_prompt_emphasizes_minimal_validation(): - ctx = _minimal_ctx(stage_name="test") - result = build_user_prompt(ctx) - assert "最小、最相关、最快的验证路径" in result - assert "满足验收标准" in result -``` - -**Step 2: Run the tests to verify they fail** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py -k "convergence or minimal_validation" -q -``` - -Expected: FAIL because the current guardrails do not contain the new phrases. - -**Step 3: Write the minimal implementation** - -Update `STAGE_GUARDRAILS["code"]` and `STAGE_GUARDRAILS["test"]` in [prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py) so they: - -- push `coding` toward immediate implementation and minimal verification; -- push `test` toward targeted validation and fast termination once evidence is enough. - -Keep the wording concise and consistent with the existing Chinese prompt style. - -**Step 4: Run the tests to verify they pass** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py -k "convergence or minimal_validation" -q -``` - -Expected: PASS. 
- -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py -git commit -m "fix(worker): tighten coding and test stage guardrails" -``` - -### Task 2: Add stage-specific continuation prompts - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` - -**Step 1: Write the failing tests** - -Add executor-focused tests that exercise `_handle_continuations()` and prove: - -- `coding` continuations use a convergence prompt that stops broad exploration and asks for concrete edits or one blocker; -- `test` continuations use a convergence prompt that asks for the smallest relevant validation result; -- non-`coding` / non-`test` stages still use the generic continuation prompt. - -Use a fake runner whose `chat()` captures the continuation prompt and returns a non-truncated response. - -Example structure: - -```python -@pytest.mark.asyncio -async def test_handle_continuations_uses_coding_specific_prompt(): - runner = FakeRunner(["done"]) - tracker = FakeTracker() - output, _tokens = await _handle_continuations( - runner, - "[Max turns reached. Please continue the conversation.]", - {"stage_name": "code"}, - tracker, - ) - assert "不要继续广泛浏览代码库" in runner.prompts[0] -``` - -If `_handle_continuations()` does not currently know the stage, first design the smallest helper change that allows the caller to pass it in. - -**Step 2: Run the tests to verify they fail** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_executor_stage_logs.py -k "continuation and coding or continuation and test" -q -``` - -Expected: FAIL because continuation prompts are currently generic. 
- -**Step 3: Write the minimal implementation** - -In [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py): - -- introduce a small helper that returns the continuation prompt for a given stage name; -- use stage-aware prompts for `code` and `test`; -- keep a generic fallback for other stages; -- thread the current stage name into `_handle_continuations()` with the smallest possible call-site change. - -Do not change retry counts, timeout behavior, logging contracts, or runner reset behavior. - -**Step 4: Run the tests to verify they pass** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_executor_stage_logs.py -k "continuation and coding or continuation and test" -q -``` - -Expected: PASS. - -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py -git commit -m "fix(worker): add convergent continuation prompts" -``` - -### Task 3: Run regression checks for prompt and executor paths - -**Files:** -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` - -**Step 1: Run the prompt tests** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_prompts.py -q -``` - -Expected: PASS. - -**Step 2: Run the executor tests** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . .venv/bin/activate && pytest tests/test_executor_stage_logs.py -q -``` - -Expected: PASS. - -**Step 3: Run a focused combined smoke check** - -Run: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform && . 
.venv/bin/activate && pytest tests/test_prompts.py tests/test_executor_stage_logs.py -q -``` - -Expected: PASS with no new failures in the touched prompt and continuation logic. - -**Step 4: Commit the verification state** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/docs/plans/2026-03-19-coding-testing-convergence-design.md /Users/jowang/Documents/github/silicon_agent/docs/plans/2026-03-19-coding-testing-convergence-implementation.md -git commit -m "docs: plan coding and testing convergence changes" -``` - -### Task 4: Optional manual validation on the VM - -**Files:** -- Modify: none -- Test: none - -**Step 1: Restart the VM worker with the intended config** - -Run on the VM after code deployment: - -```bash -grep -n 'SANDBOX_ENABLED' /home/stb_admin/silicon_agent/platform/.env -``` - -Expected: whichever value is desired for the validation session. - -**Step 2: Re-run a previously slow task shape** - -Use a simple API task similar to “create a helloworld interface”. - -**Step 3: Inspect logs** - -Check that: - -- `coding` does not spend most turns on broad repo exploration; -- continuation prompts no longer produce repeated generic continuation loops; -- `test` stops after targeted validation. - -**Step 4: Record any follow-up gaps** - -If the new prompt strategy is still too weak, capture concrete examples before considering max-turn tuning in a later change. 
diff --git a/docs/plans/2026-03-19-exploration-budget-convergence-design.md b/docs/plans/2026-03-19-exploration-budget-convergence-design.md deleted file mode 100644 index f465442..0000000 --- a/docs/plans/2026-03-19-exploration-budget-convergence-design.md +++ /dev/null @@ -1,133 +0,0 @@ -# Exploration Budget Convergence Design - -## Background - -Recent live validation on VM shows that tightening `coding` and `test` stage guardrails plus lowering `max_turns` improved behavior, but did not remove the root failure mode: - -- `coding` still spends most turns on repository exploration (`read/find/ls/cat`) before acting. -- `test` can still drift into explanation-only output after verification failures. -- `max_turns` now bounds damage, but it does not prevent the turns from being spent on the wrong behavior. - -The goal of this design is to add a stronger convergence mechanism with minimal chain changes. We will keep the existing stage model, sandbox model, and AgentRunner integration intact, and only strengthen the executor behavior for `coding` and `test`. - -## Goals - -- Reduce wasted `coding` turns spent on exploration before code edits. -- Reduce `test` drift after failed validation commands. -- Preserve the current task/stage architecture and runtime interfaces. -- Apply consistently to host execution and sandbox execution. - -## Non-Goals - -- No new stages such as `explore` or `verify`. -- No AgentRunner API changes. -- No sandbox protocol changes. -- No model-routing redesign. - -## Recommended Approach - -Use executor-level exploration budgets with a single forced-convergence prompt. - -### Why this approach - -This is the smallest effective change that targets the actual failure mode. The system already captures stage events and tool-call lifecycle data inside the executor. Instead of only changing prompts or reducing `max_turns`, we can observe repeated exploration behavior and intervene once, at the executor layer, before the stage fully degenerates. 
- -This keeps the architecture stable while giving the runtime one stronger lever than prompt text alone. - -## Alternatives Considered - -### 1. Lower `max_turns` further - -This is easy, but it does not solve the root issue. The model can still waste the smaller budget on exploration and then get truncated earlier. - -### 2. Split `coding` and `test` into sub-phases - -This would likely be more effective, but it changes stage orchestration and increases behavioral complexity. It is larger than needed for the immediate problem. - -### 3. Add tool-level hard blocking in AgentRunner - -This could be very strong, but it requires changes below the executor boundary and is not the minimal-path solution. - -## Design - -### 1. Exploration budget tracking - -Inside the executor, track lightweight exploration signals for `coding` and `test`: - -- read-like tool usage -- directory/list/search style commands -- repeated tool-only turns without implementation or verification progress - -The budget should stay intentionally simple and heuristic-based. The goal is not perfect classification. The goal is to catch obvious drift. - -Suggested first-pass behavior: - -- `coding`: trigger after too many exploration actions in the same stage before meaningful implementation progress -- `test`: trigger after too many exploration actions, or after failed validation attempts followed by further drift - -### 2. Forced-convergence prompt - -When the budget is exceeded, inject one explicit recovery prompt. 
- -For `coding`, the prompt should require: - -- stop exploring -- directly modify files -- run only minimal verification -- if still blocked, report the single blocker clearly - -For `test`, the prompt should require: - -- stop expanding the test surface -- give the smallest relevant validation result -- if a command failed, report the failed command, key error, and blocker -- do not declare success based only on code inspection - -This should happen once per stage execution, not repeatedly. - -### 3. No repeated soft looping - -After a forced-convergence prompt has been issued, the executor should not continue to allow the same stage to loop through broad exploration again. The intention is: - -- one normal execution window -- one forced convergence recovery chance -- then end based on the resulting output or failure - -This avoids replacing one loop with another. - -### 4. Minimal runtime surface area - -The behavior should live in `executor.py` so it applies uniformly to: - -- host/in-process task execution -- sandboxed task execution - -The prompt texts may remain in executor helper functions rather than introducing a larger new prompt framework. - -## Error Handling - -If budget tracking cannot confidently classify a tool action, it should ignore it rather than overreact. - -If the forced-convergence prompt itself fails, the stage should continue to use the current lifecycle behavior and surface the latest failure normally. The new logic should not hide existing error messages. 
- -## Testing Strategy - -Add focused executor tests that cover: - -- exploration budget exceeded in `coding` triggers the forced-convergence path -- exploration budget exceeded in `test` triggers the correct test-specific prompt -- failed test-command flow requires blocker-style follow-up rather than success-style summary -- only one forced-convergence injection happens per stage -- non-target roles are unaffected - -## Success Criteria - -Live task behavior should improve in these ways: - -- fewer `llm_turn_sent` / `tool_call_executed` events before code changes or validation -- fewer `Max turns reached` events in `coding` and `test` -- fewer `test` outputs that claim success after command failure without citing blockers - -## Rollout Notes - -This should ship behind existing behavior with no config migration. If needed, thresholds can remain hard-coded for the first version and be externalized later only if real usage shows that tuning is necessary. diff --git a/docs/plans/2026-03-19-exploration-budget-convergence-implementation.md b/docs/plans/2026-03-19-exploration-budget-convergence-implementation.md deleted file mode 100644 index 9de4975..0000000 --- a/docs/plans/2026-03-19-exploration-budget-convergence-implementation.md +++ /dev/null @@ -1,103 +0,0 @@ -# Exploration Budget Convergence Implementation Plan - -## Objective - -Implement executor-level exploration-budget convergence for `coding` and `test` stages, with one forced-convergence recovery prompt and targeted regression coverage. - -## Scope - -- Modify executor behavior only. -- Add or update tests for executor behavior. -- Do not alter stage orchestration, sandbox APIs, or AgentRunner interfaces. - -## Planned Changes - -### 1. Add lightweight stage exploration state - -Update [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) to maintain per-stage exploration state during execution. 
- -Planned state: - -- exploration action counter -- test-command failure flag or summary -- whether forced convergence was already injected - -### 2. Define budget heuristics - -Add small helper functions in [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) to classify tool behavior and decide whether the stage has exceeded its budget. - -Initial heuristics should stay simple: - -- `coding`: repeated read/search/list behavior without implementation progress -- `test`: repeated read/search/list behavior, or failure of validation commands followed by continued drift - -### 3. Inject one forced-convergence prompt - -Extend the stage execution loop in [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) so that when the budget is exceeded: - -- a stage-specific forced prompt is sent once -- the prompt differs from the existing generic continuation prompt -- subsequent looping does not repeatedly inject the same recovery prompt - -### 4. Keep current completion/failure semantics - -Preserve existing lifecycle behavior: - -- normal successful stage completion still goes through existing summary/finalization -- existing error handling and retry/fallback behavior remains intact -- no new persisted schema changes - -### 5. Add regression tests - -Update [test_executor_stage_logs.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py) to cover: - -- coding exploration budget breach -- test exploration budget breach -- failed test command followed by forced blocker-style convergence -- one-time forced-convergence injection -- unaffected behavior for other roles - -If needed, add prompt-text expectation coverage to [test_prompts.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py) only when shared prompt helpers are updated. - -## Verification - -Run: - -```bash -cd platform -. 
.venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_prompts.py -q -``` - -If implementation reaches live validation, use a cloned VM task similar to the previous `helloworld` runs and compare: - -- `llm_turn_sent` -- `tool_call_executed` -- `Max turns reached` - -for `code` and `test` stages. - -## Risks - -- Heuristics may be too aggressive and cut off valid exploration in legitimate tasks. -- Heuristics may be too weak and not materially improve live behavior. -- Tool classification may miss edge cases where an `execute` command is exploratory vs. truly validating. - -## Mitigations - -- Keep the first-pass thresholds conservative. -- Inject one recovery prompt before failing or finishing, instead of immediately aborting. -- Limit the initial implementation to `coding` and `test` only. - -## Rollback - -Revert the executor helpers and test changes in: - -- [executor.py](/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py) -- [test_executor_stage_logs.py](/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py) - -## Exit Criteria - -- Targeted tests pass. -- The new logic is isolated to executor-level behavior. -- A subsequent live validation can reasonably show lower exploration churn than the current capped-`max_turns` behavior. diff --git a/docs/plans/2026-03-19-java-gradle-sandbox-design.md b/docs/plans/2026-03-19-java-gradle-sandbox-design.md deleted file mode 100644 index ae26e0d..0000000 --- a/docs/plans/2026-03-19-java-gradle-sandbox-design.md +++ /dev/null @@ -1,169 +0,0 @@ -# Java Gradle Sandbox Design - -## Context - -The sandbox needs to run a broad set of Java 8 or Java 17 Gradle + Spring Boot projects with poor network conditions. The current repository already ships a dual-JDK coding image, preserves `./gradlew`, mounts a shared Gradle cache, and prewarms wrapper downloads. 
The remaining gap is making that setup reliable across different project wrappers and dependency graphs without forcing per-project customization. - -## Goals - -- Provide a general-purpose sandbox image that can run most Java 8 or Java 17 Gradle + Spring Boot projects without online dependency fetching on the common path. -- Keep project-specific Gradle behavior aligned with each repository by preferring the project wrapper. -- Default to Java 8 when version detection is inconclusive. -- Offer a manual override for edge cases and keep failure handling bounded and observable. - -## Non-Goals - -- Supporting every private Maven repository or every unusual plugin offline on day one. -- Replacing project wrappers with a single global Gradle version. -- Solving long-term dependency distribution purely through the sandbox image when an internal artifact proxy becomes available later. - -## Decision Summary - -- Ship both JDK 8 and JDK 17 in the sandbox image. -- Prefer `./gradlew` for build and test commands; use system `gradle` only as a fallback when a project has no wrapper. -- Expand Java version detection to cover Gradle toolchains, Maven compiler properties, `.java-version`, `.tool-versions`, and existing compatibility markers. -- If detection fails, default to Java 8. -- Allow a manual override with `SANDBOX_JAVA_VERSION=8|17`. -- Preload common Gradle distributions plus common Spring Boot plugin and dependency caches for offline-first execution. -- Keep a writable runtime cache for uncommon dependencies that are not covered by the prewarmed cache. -- Permit at most one Java-version retry when build output clearly shows a version mismatch. - -## Architecture - -### 1. Image Layering - -- `platform/sandbox/Dockerfile.base` remains the common runtime base for Python, Node, and shared agent tooling. 
-- `platform/sandbox/Dockerfile.coding` remains the main Java-capable image and should continue to include: - - Temurin JDK 8 - - Temurin JDK 17 - - a system Gradle installation for diagnostics and last-resort fallback - - a prewarmed Gradle cache directory with wrapper distributions and common modules -- `platform/sandbox/Dockerfile.test` layers browser and test tooling on top of `coding`. - -This preserves the current image split while adding an explicit offline dependency layer. - -### 2. Runtime Decision Flow - -The sandbox runtime should resolve Java and Gradle execution in this order: - -1. Check `SANDBOX_JAVA_VERSION` for an explicit override. -2. Auto-detect Java from project files. -3. If no reliable signal is found, select Java 8. -4. Prefer `./gradlew` for Gradle commands. -5. Use system `gradle` only when the workspace lacks `gradlew`. -6. If the command fails with a clear Java-version mismatch, switch once to the other supported JDK and retry one time. - -This flow keeps project compatibility high while avoiding silent loops or repeated environment churn. - -### 3. Java Detection Rules - -The runtime should scan these files when present: - -- `pom.xml` -- `build.gradle` -- `build.gradle.kts` -- `gradle.properties` -- `settings.gradle` -- `settings.gradle.kts` -- `.java-version` -- `.tool-versions` - -Signals should be ranked in this order: - -1. Explicit `SANDBOX_JAVA_VERSION` -2. Gradle toolchain declarations such as `JavaLanguageVersion.of(8|17)` -3. `sourceCompatibility` and `targetCompatibility` -4. Maven compiler `source`, `target`, `release`, and `java.version` -5. version manager hints from `.java-version` or `.tool-versions` - -If conflicting markers exist, the highest-ranked explicit signal wins, and the runtime logs the winning rule. - -### 4. Gradle and Wrapper Strategy - -`./gradlew` should remain the primary execution path because it carries the project-specific Gradle version and plugin resolution behavior. 
The sandbox should not try to standardize project builds onto one system Gradle version. - -Instead, the sandbox should preload the resources that wrappers usually need to fetch: - -- wrapper distributions for representative Gradle versions in the 6.x, 7.x, and 8.x lines -- Gradle plugin metadata and jars commonly used by Spring Boot builds -- common Maven Central modules used by Spring Boot starters and test dependencies - -System `gradle` stays available for diagnostics such as `gradle -v` and as a fallback when `gradlew` is absent. - -### 5. Offline Cache Model - -The cache model should have two layers: - -- A prewarmed base cache baked into the image or injected as a prepared cache artifact -- A writable runtime cache mounted as `GRADLE_USER_HOME` for project-specific misses - -The prewarmed cache should include: - -- wrapper distributions for selected Gradle versions -- plugin portal artifacts for Spring Boot and dependency management plugins -- common modules and metadata under Gradle's module cache - -The writable layer captures rare dependencies without forcing a full image rebuild. - -### 6. Cache Refresh Strategy - -Refresh the base cache by running a representative project matrix in a controlled environment: - -- Java 8 + Spring Boot 2.x + Gradle 6.x -- Java 8 + Spring Boot 2.x + Gradle 7.x -- Java 17 + Spring Boot 2.7 + Gradle 7.x -- Java 17 + Spring Boot 3.x + Gradle 8.x - -For each representative project, run: - -- `./gradlew --no-daemon help` -- `./gradlew --no-daemon dependencies` -- `./gradlew --no-daemon testClasses` - -This strategy prefetches the Gradle distributions, plugins, starter dependencies, test dependencies, and most metadata needed by common projects. - -## Error Handling - -- If Java detection fails, log that no explicit signal was found and continue with Java 8. -- If the manual override references an unavailable JDK, fail fast with a clear configuration error. 
-- Retry only once when build output clearly indicates a Java-version incompatibility. -- If both Java 8 and Java 17 fail, surface the original command, selected JDK, retry decision, and the final error in logs and task output. - -## Testing Strategy - -Add or extend tests in the sandbox suite to verify: - -- default Java 8 selection when no markers are present -- Java 17 detection for Gradle toolchains -- Java 8 detection for legacy Gradle or Maven properties -- explicit override handling through environment variables -- wrapper-first execution behavior -- Gradle cache and prewarm environment wiring through the Docker run contract -- coding image assertions for dual JDK plus offline cache preparation hooks - -Add representative sandbox fixtures or smoke validation inputs for: - -- Java 8 + Spring Boot 2.x -- Java 17 + Spring Boot 2.7 -- Java 17 + Spring Boot 3.x - -## Risks And Mitigations - -- Large image size: keep the writable cache separate and refresh only the shared offline layer when possible. -- Cache staleness: refresh the matrix on a scheduled cadence or alongside sandbox release candidates. -- Private repositories: document that private artifacts still need runtime access or a mirrored internal repository. -- False-positive detection: keep a manual override and log the matched detection rule for every task. - -## Recommended Implementation Order - -1. Expand Java detection and add explicit override support. -2. Change runtime defaulting behavior to Java 8 plus bounded retry. -3. Add offline cache preparation hooks in the coding image. -4. Extend Docker env wiring and contract tests for any new cache paths or override variables. -5. Add representative fixture-based verification for Java 8 and Java 17 Spring Boot projects. - -## Open Assumptions - -- Most target projects are standard Gradle + Spring Boot applications using public Maven-style dependencies. 
-- The sandbox may still need runtime network access for rare or private dependencies, but the common path should succeed offline. -- The current untracked `gradle-8.5-wrapper-cache.tgz` artifact is treated as unrelated local state and is not part of this design. diff --git a/docs/plans/2026-03-19-java-gradle-sandbox-implementation.md b/docs/plans/2026-03-19-java-gradle-sandbox-implementation.md deleted file mode 100644 index f66c609..0000000 --- a/docs/plans/2026-03-19-java-gradle-sandbox-implementation.md +++ /dev/null @@ -1,308 +0,0 @@ -# Java Gradle Sandbox Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Make the sandbox reliably run common Java 8 and Java 17 Gradle + Spring Boot projects offline-first by default, while preferring each project's Gradle wrapper and defaulting unknown projects to Java 8. - -**Architecture:** Extend the sandbox runtime to choose Java deterministically, prefer `./gradlew`, and optionally retry once on clear Java-version mismatches. Expand the coding image with prewarmed Gradle distribution and dependency caches, then lock the behavior down with targeted unit and contract tests. 
- -**Tech Stack:** Docker, Temurin JDK 8/17, Gradle wrapper, Python 3.11, aiohttp sandbox agent, pytest - ---- - -### Task 1: Expand Java version detection inputs - -**Files:** -- Modify: `platform/sandbox/agent_server.py` -- Test: `platform/tests/test_sandbox_agent_server.py` - -**Step 1: Write the failing tests** - -```python -def test_detect_java_version_from_gradle_toolchain(tmp_path): - agent_server = _load_agent_server_with_fake_skillkit() - gradle = tmp_path / "build.gradle.kts" - gradle.write_text( - 'java { toolchain { languageVersion.set(JavaLanguageVersion.of(17)) } }', - encoding="utf-8", - ) - assert agent_server._detect_java_major_version(str(tmp_path)) == 17 - - -def test_detect_java_version_defaults_to_none_without_markers(tmp_path): - agent_server = _load_agent_server_with_fake_skillkit() - (tmp_path / "settings.gradle").write_text('rootProject.name = "demo"', encoding="utf-8") - assert agent_server._detect_java_major_version(str(tmp_path)) is None -``` - -**Step 2: Run test to verify it fails** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "toolchain or defaults_to_none" -v` -Expected: FAIL because toolchain markers are not detected yet. 
- -**Step 3: Write minimal implementation** - -```python -_JAVA_DETECT_FILES = ( - "pom.xml", - "build.gradle", - "build.gradle.kts", - "gradle.properties", - "settings.gradle", - "settings.gradle.kts", - ".java-version", - ".tool-versions", -) - -_JAVA17_PATTERNS = ( - r"JavaLanguageVersion\.of\(\s*17\s*\)", - r"languageVersion\s*(?:=|\.set\()\s*JavaLanguageVersion\.of\(\s*17\s*\)", -) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "toolchain or defaults_to_none" -v` -Expected: PASS - -**Step 5: Commit** - -```bash -git add platform/sandbox/agent_server.py platform/tests/test_sandbox_agent_server.py -git commit -m "feat: expand sandbox java version detection" -``` - -### Task 2: Add explicit Java override and Java 8 defaulting - -**Files:** -- Modify: `platform/sandbox/agent_server.py` -- Modify: `platform/app/config.py` -- Modify: `platform/app/worker/sandbox.py` -- Test: `platform/tests/test_sandbox_agent_server.py` -- Test: `platform/tests/test_sandbox_env_contract.py` - -**Step 1: Write the failing tests** - -```python -def test_configure_java_runtime_respects_explicit_override(tmp_path, monkeypatch): - agent_server = _load_agent_server_with_fake_skillkit() - monkeypatch.setenv("SANDBOX_JAVA_VERSION", "17") - monkeypatch.setenv("JAVA17_HOME", "/opt/jdk17") - monkeypatch.setenv("PATH", "/usr/bin:/bin") - selected = agent_server._configure_java_runtime_for_workspace(str(tmp_path)) - assert selected == 17 - - -def test_build_docker_run_cmd_includes_java_override_env(monkeypatch, tmp_path): - from app.worker import sandbox as sandbox_mod - monkeypatch.setattr(sandbox_mod.settings, "SANDBOX_DEFAULT_JAVA_VERSION", 8) - backend = DockerSandboxBackend() - cmd = backend._build_docker_run_cmd("sbx-test", "sandbox-image:latest", "/tmp/workspace", "task-123") - env = _extract_env_vars_from_docker_cmd(cmd) - assert env["SANDBOX_DEFAULT_JAVA_VERSION"] == "8" -``` - -**Step 2: Run test to verify it 
fails** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py tests/test_sandbox_env_contract.py -k "override or DEFAULT_JAVA_VERSION" -v` -Expected: FAIL because the env plumbing and defaulting do not exist yet. - -**Step 3: Write minimal implementation** - -```python -override_raw = (os.environ.get("SANDBOX_JAVA_VERSION") or "").strip() -if override_raw in {"8", "17"}: - major = int(override_raw) -else: - major = _detect_java_major_version(workdir) or _env_int("SANDBOX_DEFAULT_JAVA_VERSION", 8) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py tests/test_sandbox_env_contract.py -k "override or DEFAULT_JAVA_VERSION" -v` -Expected: PASS - -**Step 5: Commit** - -```bash -git add platform/sandbox/agent_server.py platform/app/config.py platform/app/worker/sandbox.py platform/tests/test_sandbox_agent_server.py platform/tests/test_sandbox_env_contract.py -git commit -m "feat: add sandbox java override and defaulting" -``` - -### Task 3: Add bounded Java-version fallback on known mismatch errors - -**Files:** -- Modify: `platform/sandbox/agent_server.py` -- Test: `platform/tests/test_sandbox_agent_server.py` - -**Step 1: Write the failing tests** - -```python -def test_should_retry_gradle_command_on_java_version_mismatch(): - assert _should_retry_with_other_java("Unsupported class file major version 61") is True - assert _should_retry_with_other_java("Execution failed for task ':test'") is False -``` - -**Step 2: Run test to verify it fails** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "retry_gradle_command_on_java_version_mismatch" -v` -Expected: FAIL because the helper does not exist yet. 
- -**Step 3: Write minimal implementation** - -```python -_JAVA_MISMATCH_PATTERNS = ( - r"Unsupported class file major version", - r"invalid source release", - r"release version .* not supported", -) - -def _should_retry_with_other_java(output: str) -> bool: - return any(re.search(pattern, output, re.IGNORECASE) for pattern in _JAVA_MISMATCH_PATTERNS) -``` - -**Step 4: Run test to verify it passes** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "retry_gradle_command_on_java_version_mismatch" -v` -Expected: PASS - -**Step 5: Commit** - -```bash -git add platform/sandbox/agent_server.py platform/tests/test_sandbox_agent_server.py -git commit -m "feat: add sandbox java mismatch fallback" -``` - -### Task 4: Add offline Gradle cache preparation to the coding image - -**Files:** -- Modify: `platform/sandbox/Dockerfile.coding` -- Create: `platform/sandbox/scripts/prewarm_gradle_cache.sh` -- Test: `platform/tests/test_sandbox_env_contract.py` - -**Step 1: Write the failing test** - -```python -def test_coding_sandbox_image_prepares_offline_gradle_cache(): - dockerfile_path = Path(__file__).resolve().parents[1] / "sandbox" / "Dockerfile.coding" - content = dockerfile_path.read_text(encoding="utf-8") - assert "prewarm_gradle_cache.sh" in content - assert "GRADLE_USER_HOME" in content -``` - -**Step 2: Run test to verify it fails** - -Run: `cd platform && pytest tests/test_sandbox_env_contract.py -k "offline_gradle_cache" -v` -Expected: FAIL because the prewarm hook is not present yet. 
- -**Step 3: Write minimal implementation** - -```bash -#!/usr/bin/env bash -set -euo pipefail -export GRADLE_USER_HOME="${GRADLE_USER_HOME:-/opt/gradle-offline-cache}" -for version in 6.9.4 7.6.4 8.5; do - gradle -g "$GRADLE_USER_HOME" -v >/dev/null 2>&1 || true -done -``` - -**Step 4: Run test to verify it passes** - -Run: `cd platform && pytest tests/test_sandbox_env_contract.py -k "offline_gradle_cache" -v` -Expected: PASS - -**Step 5: Commit** - -```bash -git add platform/sandbox/Dockerfile.coding platform/sandbox/scripts/prewarm_gradle_cache.sh platform/tests/test_sandbox_env_contract.py -git commit -m "feat: prewarm offline gradle cache in sandbox image" -``` - -### Task 5: Add representative sandbox fixture coverage - -**Files:** -- Create: `platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle` -- Create: `platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts` -- Modify: `platform/tests/test_sandbox_agent_server.py` - -**Step 1: Write the failing tests** - -```python -def test_detect_java_version_from_java8_fixture(): - fixture = Path(__file__).resolve().parent / "fixtures" / "sandbox" / "java8-springboot-gradle" - agent_server = _load_agent_server_with_fake_skillkit() - assert agent_server._detect_java_major_version(str(fixture)) == 8 - - -def test_detect_java_version_from_java17_fixture(): - fixture = Path(__file__).resolve().parent / "fixtures" / "sandbox" / "java17-springboot-gradle" - agent_server = _load_agent_server_with_fake_skillkit() - assert agent_server._detect_java_major_version(str(fixture)) == 17 -``` - -**Step 2: Run test to verify it fails** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "java8_fixture or java17_fixture" -v` -Expected: FAIL because the fixtures do not exist yet. 
- -**Step 3: Write minimal implementation** - -```groovy -plugins { - id 'org.springframework.boot' version '2.7.18' -} - -sourceCompatibility = JavaVersion.VERSION_1_8 -``` - -```kotlin -plugins { - id("org.springframework.boot") version "3.2.4" -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(17)) - } -} -``` - -**Step 4: Run test to verify it passes** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py -k "java8_fixture or java17_fixture" -v` -Expected: PASS - -**Step 5: Commit** - -```bash -git add platform/tests/fixtures/sandbox/java8-springboot-gradle/build.gradle platform/tests/fixtures/sandbox/java17-springboot-gradle/build.gradle.kts platform/tests/test_sandbox_agent_server.py -git commit -m "test: add sandbox java fixture coverage" -``` - -### Task 6: Run focused regression coverage - -**Files:** -- Test: `platform/tests/test_sandbox_agent_server.py` -- Test: `platform/tests/test_sandbox_env_contract.py` - -**Step 1: Run the focused regression suite** - -Run: `cd platform && pytest tests/test_sandbox_agent_server.py tests/test_sandbox_env_contract.py -v` -Expected: PASS - -**Step 2: Run a Dockerfile assertion smoke** - -Run: `cd platform && pytest tests/test_sandbox_env_contract.py -k "coding_sandbox_image or offline_gradle_cache" -v` -Expected: PASS - -**Step 3: Review git diff** - -Run: `git diff --stat` -Expected: Only sandbox runtime, image, script, fixture, and test files changed. 
- -**Step 4: Commit the verified implementation** - -```bash -git add platform/sandbox/agent_server.py platform/app/config.py platform/app/worker/sandbox.py platform/sandbox/Dockerfile.coding platform/sandbox/scripts/prewarm_gradle_cache.sh platform/tests/test_sandbox_agent_server.py platform/tests/test_sandbox_env_contract.py platform/tests/fixtures/sandbox -git commit -m "feat: harden sandbox java gradle offline support" -``` diff --git a/docs/plans/2026-03-19-stage-reset-and-preflight-design.md b/docs/plans/2026-03-19-stage-reset-and-preflight-design.md deleted file mode 100644 index 0168430..0000000 --- a/docs/plans/2026-03-19-stage-reset-and-preflight-design.md +++ /dev/null @@ -1,188 +0,0 @@ -# Stage Reset And Preflight Design - -## Background - -Recent task log analysis shows that extreme token usage in `coding` and `test` is dominated by repeated multi-turn ReAct loops rather than any one oversized file. - -The current cost pattern comes from three factors compounding together: - -1. A large fixed prompt base per stage: - - system prompt - - role skill directories - - repo context - - project memory -2. Repeated `runner.chat(..., reset=False)` continuation within the same stage. -3. Repeated exploration turns (`ls`, `find`, `read`, lightweight shell discovery) before implementation or validation. - -Even after recent turn-budget and convergence work, live runs still show the model spending too many turns on exploration, while each additional turn carries increasing historical context. 
- -## Goal - -Reduce token cost and exploration churn in `coding` and `test` by implementing both: - -- stage-local rolling resets with compact checkpoints -- deterministic platform-side preflight scan summaries - -## Non-Goals - -- No task/stage graph redesign -- No provider-specific prompt caching as the primary fix -- No changes to task log API shape -- No model routing redesign - -## Why These Two Changes Together - -Either change alone helps, but together they address both sides of the problem: - -- rolling reset reduces repeated historical context -- preflight scan removes avoidable exploration turns - -This is the strongest cost/control improvement available without changing the overall stage model. - -## Recommended Design - -### 1. Rolling Stage Reset With Compact Checkpoints - -For `coding` and `test`, do not let one long-running stage conversation accumulate unbounded history. - -Instead, after a small number of exploration or tool rounds, or after truncation pressure is observed, the executor should: - -1. collect a compact checkpoint -2. restart the runner conversation with `reset=True` -3. continue from the checkpoint rather than from full raw history - -The checkpoint should contain only the minimum needed state: - -- current task objective -- current stage goal -- confirmed facts discovered so far -- files already changed -- latest meaningful tool results -- the immediate next required action - -This preserves continuity while preventing the stage from carrying every prior prompt, tool reply, and continuation through the entire run. - -### 2. Platform-Side Preflight Scan Summary - -Before `coding` and `test`, the platform should run a small deterministic repo scan and inject a compact summary into the stage prompt. - -This replaces a large portion of the model’s exploratory shell work. 
- -For `coding`, preflight should gather things like: - -- key package / module roots -- likely implementation entrypoints -- existing controller / handler / service examples -- common response wrapper or domain model locations -- test framework presence -- build file hints - -For `test`, preflight should gather things like: - -- relevant existing test files -- framework and runner clues -- the most likely target test directories -- existing test pattern examples - -The preflight output should be short and structured, designed to replace multiple `find`, `ls`, and `read` rounds with one injected context block. - -### 3. Executor Ownership - -The changes should remain executor-driven so they apply consistently to: - -- host execution -- sandbox execution - -The stage executor should become responsible for: - -- deciding when a stage conversation has accumulated too much churn -- building the compact checkpoint -- restarting the stage chat cleanly - -The repo scan should be generated before stage execution and passed in as a small extra context block, similar to how repo context and project memory are already injected today. - -### 4. Minimal Context Surface - -This design should avoid adding yet another large context block. - -The preflight summary should therefore be: - -- tightly capped -- role-specific -- intentionally factual rather than verbose - -Likewise, checkpoint summaries should be much smaller than carrying the full multi-turn history forward. - -## Alternatives Considered - -### 1. Prompt Caching First - -This helps billing for repeated static prompt prefixes, but it does not solve the growing-history problem. It is still useful later, but it should not be the first or only fix. - -### 2. Lower `max_turns` Further - -This constrains runaway behavior, but it does not ensure that the remaining turns are spent effectively. - -### 3. Prompt-Only “Use Fewer Bash Calls” - -This is helpful as a guardrail, but not reliable enough as the main control. 
Deterministic preflight is more stable than asking the model to be disciplined. - -## Data Flow - -### Coding - -1. Engine prepares repo context and project memory as today. -2. New coding preflight scan runs and produces compact summary text. -3. Executor starts coding stage with the summary included. -4. If the stage begins to accumulate too many exploration/tool rounds, executor builds a checkpoint and re-enters with a fresh chat. -5. The stage proceeds from compressed current state rather than full historical turns. - -### Test - -1. Engine prepares existing compressed prior outputs as today. -2. New test preflight scan runs and provides framework/test-location summary. -3. Executor starts test stage with the summary included. -4. If test churn accumulates, executor rebuilds the stage from a compact checkpoint. -5. Validation continues from current facts instead of raw conversation history. - -## Error Handling - -If preflight scan fails: - -- stage should continue without it -- failure should be logged -- no task failure should occur solely because preflight was unavailable - -If rolling reset checkpoint generation fails: - -- stage should fall back to current behavior -- failure should be logged -- existing stage lifecycle semantics should remain intact - -## Testing Strategy - -Add focused coverage for: - -- coding preflight summary generation -- test preflight summary generation -- executor restart path after exploration churn -- checkpoint prompt includes only reduced current-state data -- non-target roles remain unchanged - -## Success Criteria - -Compared to current live baselines, successful improvement should show: - -- fewer exploration tool calls before implementation -- fewer continuation rounds -- materially lower total tokens in `coding` and `test` -- fewer repeated `Max turns reached` events - -## Rollout Order - -Recommended rollout sequence: - -1. implement preflight summary generation -2. implement rolling reset/checkpoint logic -3. 
validate on the same VM task family currently used for comparison -4. only after that, decide whether prompt caching is still worth prioritizing diff --git a/docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md b/docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md deleted file mode 100644 index 12ed084..0000000 --- a/docs/plans/2026-03-19-stage-reset-and-preflight-implementation.md +++ /dev/null @@ -1,314 +0,0 @@ -# Stage Reset And Preflight Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Reduce `coding` and `test` token usage by combining deterministic preflight scan summaries with executor-driven rolling conversation resets. - -**Architecture:** Add small role-specific preflight summaries ahead of `coding` and `test`, then teach the executor to restart long-running stage conversations from compact checkpoints instead of carrying full multi-turn history forever. Keep the existing stage model and runtime entrypoints intact. - -**Tech Stack:** Python, FastAPI worker runtime, SQLAlchemy task pipeline, SkillKit AgentRunner, pytest - ---- - -### Task 1: Add Preflight Summary Builders - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` - -**Step 1: Write the failing test** - -Add tests for a helper that produces compact preflight text for `coding` and `test` from workspace facts. - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py -q -``` - -Expected: FAIL because the new preflight helper does not exist yet. 
- -**Step 3: Write minimal implementation** - -Add helpers that: -- gather lightweight repo facts for `coding` / `test` -- cap output size aggressively -- degrade gracefully when data is missing - -Keep the first version simple and deterministic. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py -q -``` - -Expected: PASS for the new helper coverage. - -**Step 5: Commit** - -```bash -git add platform/app/worker/engine.py platform/app/worker/executor.py platform/tests/test_executor_stage_logs.py -git commit -m "feat(worker): add coding and test preflight summaries" -``` - -### Task 2: Inject Preflight Into Stage Execution - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` - -**Step 1: Write the failing test** - -Add tests showing that `coding` / `test` prompts include the preflight summary block when available. - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_prompts.py -q -``` - -Expected: FAIL because prompts do not yet include the new block. - -**Step 3: Write minimal implementation** - -Extend stage context and prompt assembly so the preflight summary is included only for the roles that need it, and is clearly labeled. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_prompts.py -q -``` - -Expected: PASS. 
- -**Step 5: Commit** - -```bash -git add platform/app/worker/engine.py platform/app/worker/prompts.py platform/tests/test_prompts.py -git commit -m "feat(worker): inject role preflight summaries" -``` - -### Task 3: Add Rolling Reset Checkpoint Builder - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` - -**Step 1: Write the failing test** - -Add tests for a helper that turns current stage state into a compact restart checkpoint. - -Cover: -- task objective -- current stage goal -- recent tool digest -- next required action - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py -q -``` - -Expected: FAIL because checkpoint logic does not exist yet. - -**Step 3: Write minimal implementation** - -Build a compact textual checkpoint helper in the executor and keep it size-capped. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py -q -``` - -Expected: PASS. - -**Step 5: Commit** - -```bash -git add platform/app/worker/executor.py platform/tests/test_executor_stage_logs.py -git commit -m "feat(worker): build compact stage restart checkpoints" -``` - -### Task 4: Restart Long-Running Stage Conversations From Checkpoints - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` - -**Step 1: Write the failing test** - -Add tests showing that when churn thresholds are exceeded: -- executor restarts the stage from a checkpoint -- the follow-up chat uses `reset=True` -- repeated raw-history continuation is reduced - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd platform -. 
.venv/bin/activate -pytest tests/test_executor_stage_logs.py -q -``` - -Expected: FAIL because executor still only continues in-place. - -**Step 3: Write minimal implementation** - -Add restart logic for `coding` and `test` only: -- detect churn threshold -- build checkpoint -- restart with a fresh chat -- preserve existing lifecycle logging - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py -q -``` - -Expected: PASS. - -**Step 5: Commit** - -```bash -git add platform/app/worker/executor.py platform/tests/test_executor_stage_logs.py -git commit -m "fix(worker): reset stage chats from compact checkpoints" -``` - -### Task 5: End-to-End Regression Verification - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` - -**Step 1: Add missing regression coverage** - -Ensure tests cover: -- fallback behavior when preflight is unavailable -- non-target roles unaffected -- prompt size stays bounded -- forced convergence and rolling reset do not fight each other - -**Step 2: Run targeted regression suite** - -Run: -```bash -cd platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_prompts.py -q -``` - -Expected: PASS. - -**Step 3: Commit** - -```bash -git add platform/tests/test_executor_stage_logs.py platform/tests/test_prompts.py -git commit -m "test(worker): cover stage reset and preflight regressions" -``` - -### Task 6: Live Validation On VM - -**Files:** -- No local code changes required unless fixes are needed - -**Step 1: Deploy latest branch to VM** - -Pull latest code and restart backend in the current host-execution mode. 
- -**Step 2: Clone known comparison task** - -Use: -```bash -POST /api/v1/tasks/339f8bd3-c5f2-4da5-8267-15a6ec3aaaa3/clone -``` - -**Step 3: Compare live metrics** - -Capture: -- `llm_turn_sent` -- `tool_call_executed` -- `Max turns reached` -- total tokens - -for `code` and `test`. - -**Step 4: Record comparison** - -Compare against the recent baselines already observed on VM and summarize whether: -- exploration rounds dropped -- token usage dropped -- repeated truncation dropped - -**Step 5: Commit only if code changed during validation** - -```bash -git add <files> -git commit -m "fix(worker): adjust stage reset thresholds" -``` - -## Verification - -Primary local verification: - -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_prompts.py -q -``` - -Secondary live verification: - -- deploy current branch to VM -- clone the known `helloworld` task -- compare `code/test` stage metrics with earlier runs - -## Risks - -- Restart checkpoints may omit important context and hurt correctness. -- Preflight summaries may become too verbose and recreate the same token problem in a different form. -- Rolling reset may interact awkwardly with current continuation and forced-convergence behavior. - -## Mitigations - -- Keep checkpoint format deliberately compact and task-focused. -- Hard-cap preflight and checkpoint text. -- Limit the first version to `coding` and `test`. -- Add targeted tests around restart behavior. 
- -## Rollback - -Rollback would revert changes in: - -- `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` -- `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` -- `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` -- `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` diff --git a/docs/plans/2026-03-19-static-context-token-optimization-design.md b/docs/plans/2026-03-19-static-context-token-optimization-design.md deleted file mode 100644 index 8fe5c92..0000000 --- a/docs/plans/2026-03-19-static-context-token-optimization-design.md +++ /dev/null @@ -1,207 +0,0 @@ -# Static Context Token Optimization Design - -## Background - -Recent live task analysis confirms that high token cost is still dominated by repeated stage-local chat calls, especially in `code` and `test`. - -We have already improved behavior in three ways: - -- stronger `coding` / `test` convergence guardrails -- exploration budgets and forced convergence -- preflight summaries plus checkpoint-style restart prompts - -Those changes reduced some waste, but they did not fully remove the biggest fixed cost: each stage still repeatedly carries a large static prompt base. - -Today that base includes some combination of: - -- system prompt -- role skill directories and tool schema exposure -- repo context -- project memory -- prior stage outputs - -As a result, even when the agent is doing the right kind of work, each extra turn still re-pays too much prompt cost. - -## Goal - -Reduce token consumption primarily by shrinking repeated static prompt/context overhead before investing in provider-specific prompt caching. 
- -## Non-Goals - -- Do not redesign the task/stage graph -- Do not replace the current AgentRunner stack -- Do not make prompt caching the immediate primary fix -- Do not remove the existing preflight or restart work - -## Key Observation - -The next best savings are not from forcing the model to be “more disciplined” in the abstract. They come from making each stage carry less static baggage per turn. - -That means the platform should prioritize: - -1. smaller models for low-complexity stages -2. fewer tools and skills exposed per stage -3. less injected repo and memory context where preflight already covers the need -4. making restart/checkpoint execution the preferred continuation path - -## Recommended Approach - -### Phase 1: Cheap, Low-Risk Static Context Reduction - -Implement four low-risk optimizations first. - -#### 1. Role-Based Model Routing - -Use the existing `LLM_ROLE_MODEL_MAP` support more aggressively. - -Recommended default direction: - -- `parse` → lighter text/tool-capable model -- `code` → strongest coding/reasoning model -- `test` → lighter model unless the task or template explicitly requests stronger reasoning -- `signoff` → lighter text-oriented model - -This does not reduce token count directly, but it reduces cost immediately and aligns model strength with stage complexity. - -#### 2. Make `signoff` Text-Only - -`signoff` should stop re-entering tool-driven exploration by default. - -It should instead rely on: - -- prior stage outputs -- structured summaries -- the latest verified results already produced by earlier stages - -This reduces unnecessary tool schema exposure and avoids another mini ReAct loop at the end of the task. - -#### 3. Per-Stage Tool / Skill Pruning - -The current role defaults still expose too much shared capability in later-stage prompts. 
- -We should trim stage exposure so the agent only receives the tools and skill directories it realistically needs: - -- `code` should keep the core file and execution tools, but not unrelated later-stage abilities -- `test` should focus on read/edit/execute verification tools -- `signoff` should default to no tools - -This reduces prompt bloat and narrows action space. - -#### 4. Shrink `repo_context` and `project_memory` for `code` / `test` - -Now that deterministic preflight summaries exist, `code` and `test` no longer need the full original repo-context payload on every turn. - -We should split context into: - -- broad repo context for earlier planning stages -- slim execution context for `code` / `test` - -For execution stages, the injected context should favor: - -- concise stack/build facts -- minimal path hints -- short relevant memory excerpts - -and avoid re-sending large directory trees or verbose historical notes when preflight already covers the local execution target. - -### Phase 2: Make Checkpoint Restart the Main Continuation Strategy - -We already introduced restart/checkpoint machinery, but it should evolve from a rescue path into the preferred continuation model for `code` / `test`. - -The continuation path should increasingly favor: - -- `reset=True` -- a compact checkpoint prompt -- only the immediate execution state - -and increasingly avoid: - -- replaying full `repo_context` -- replaying full `project_memory` -- replaying large prior output blocks - -The restart payload should be limited to: - -- task objective -- stage goal -- preflight summary -- already confirmed edits or findings -- last 2 to 3 meaningful tool results -- immediate next action - -### Phase 3: Provider-Aware Prompt Caching - -Prompt caching is still worth evaluating, but only after the fixed prompt base is made smaller and more stable. 
- -Reasons not to lead with it: - -- provider support may vary across current model paths -- it does not solve growing-history behavior by itself -- it is more invasive than the earlier fixes - -Once Phases 1 and 2 land, caching can be evaluated on a cleaner and more stable prompt shape. - -## Alternatives Considered - -### 1. Prompt Caching First - -Useful later, but not the best immediate step because it does not reduce prompt size or continuation churn on its own. - -### 2. More Aggressive `max_turns` Reduction - -Helps cap damage, but still allows the remaining turns to carry the same large static prefix. - -### 3. Prompting the Agent to Batch Shell Commands - -Helpful as a guardrail, but weaker than platform-side prompt/context reduction. It depends on model compliance and does not address repeated schema/context cost. - -## Data Flow Changes - -### Parse / Spec-Like Stages - -These stages may continue to receive broader repo context because they are responsible for planning and synthesis. - -### Code / Test Stages - -These stages should increasingly receive: - -- slim repo facts -- short role-specific memory -- deterministic preflight summary -- compact restart checkpoints on continuation - -instead of the current larger blended context shape. - -### Signoff Stage - -This stage should default to text-only summary and evaluation behavior, without tool re-entry. 
- -## Testing Strategy - -Add or extend focused coverage for: - -- role-model resolution per stage -- signoff text-only routing -- per-role skill/tool narrowing -- prompt construction with slim execution-stage context -- restart/checkpoint payload staying compact -- non-execution stages still receiving the broader context they need - -## Success Criteria - -Compared to current baselines, success should show: - -- lower total tokens in `code` and `test` -- fewer repeated large chat payloads -- fewer tool-driven loops in `signoff` -- lower average cost per task even before prompt caching - -## Rollout Order - -1. tighten model routing defaults -2. make `signoff` text-only -3. prune per-stage tool/skill exposure -4. slim `repo_context` / `project_memory` for execution stages -5. promote compact checkpoint restart into the main continuation path -6. re-measure on the same VM task family -7. only then decide whether prompt caching should be the next investment diff --git a/docs/plans/2026-03-19-static-context-token-optimization-implementation.md b/docs/plans/2026-03-19-static-context-token-optimization-implementation.md deleted file mode 100644 index 935266f..0000000 --- a/docs/plans/2026-03-19-static-context-token-optimization-implementation.md +++ /dev/null @@ -1,308 +0,0 @@ -# Static Context Token Optimization Implementation Plan - -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. - -**Goal:** Reduce `code` / `test` token cost by shrinking repeated static prompt overhead before attempting provider-specific prompt caching. - -**Architecture:** Reuse the existing worker pipeline, but make execution stages cheaper by routing lighter models where appropriate, removing unnecessary tool exposure, slimming injected repo/memory context, and making compact checkpoint restart the default continuation shape. Preserve the existing stage graph and existing preflight work. 
- -**Tech Stack:** Python, FastAPI worker runtime, SkillKit AgentRunner, pytest - ---- - -### Task 1: Tighten Role-Based Model Routing Defaults - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/config.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/.env.example` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py` - -**Step 1: Write the failing test** - -Add coverage showing that `parse`, `test`, and `signoff` can resolve to lightweight models through `LLM_ROLE_MODEL_MAP`, while `code` can still resolve to the stronger coding model. - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_agents_api.py -q -``` - -Expected: FAIL until the default config and resolution behavior match the new mapping expectation. - -**Step 3: Write minimal implementation** - -Update config defaults and environment examples so the role-model map favors: - -- light model for `parse` -- strong model for `code` -- light model for `test` -- light model for `signoff` - -Keep stage-level override precedence unchanged. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_agents_api.py -q -``` - -Expected: PASS. 
- -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py /Users/jowang/Documents/github/silicon_agent/platform/app/config.py /Users/jowang/Documents/github/silicon_agent/platform/.env.example /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py -git commit -m "config(worker): tune role model routing defaults" -``` - -### Task 2: Make Signoff Text-Only By Default - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py` - -**Step 1: Write the failing test** - -Add tests showing that `signoff` uses the text-only runner path and does not request tool execution by default. - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_engine_stage_execution.py -q -``` - -Expected: FAIL because `signoff` still flows through the general tool-enabled path. - -**Step 3: Write minimal implementation** - -Route `signoff` through `get_agent_text_only(...)` and keep it based on prior outputs plus prompt guidance, without reopening tool-driven exploration. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_engine_stage_execution.py -q -``` - -Expected: PASS. 
- -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py -git commit -m "fix(worker): make signoff text only by default" -``` - -### Task 3: Prune Stage Tool And Skill Exposure - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` - -**Step 1: Write the failing test** - -Add tests showing that: - -- `code` only gets core implementation tools/skills -- `test` only gets core verification tools/skills -- `signoff` gets no tool-enabled skill exposure - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_agents.py tests/test_executor_stage_logs.py -q -``` - -Expected: FAIL because role defaults still expose too much shared capability. - -**Step 3: Write minimal implementation** - -Tighten `ROLE_TOOLS` and role skill directory selection so execution stages only carry what they actually need. Preserve sandbox parity for the container path. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_agents.py tests/test_executor_stage_logs.py -q -``` - -Expected: PASS. 
- -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/agents.py /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py -git commit -m "refactor(worker): prune stage tool and skill exposure" -``` - -### Task 4: Slim Repo Context And Project Memory For Execution Stages - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_worker.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py` - -**Step 1: Write the failing test** - -Add tests showing that `code` / `test` receive a slim execution-context variant while planning stages can still receive broader repo context. - -Cover: - -- reduced repo tree content for execution stages -- reduced memory excerpt size for execution stages -- preflight summary still present - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_prompts.py tests/test_worker.py tests/test_engine_stage_execution.py -q -``` - -Expected: FAIL because execution stages still receive the broader context shape. - -**Step 3: Write minimal implementation** - -Split context construction into broad planning context versus slim execution context. Keep concise build/stack facts, but avoid re-sending large directory trees and long memory blocks to `code` / `test`. 
- -**Step 4: Run test to verify it passes** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_prompts.py tests/test_worker.py tests/test_engine_stage_execution.py -q -``` - -Expected: PASS. - -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/engine.py /Users/jowang/Documents/github/silicon_agent/platform/app/worker/prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_worker.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py -git commit -m "feat(worker): slim execution stage context payloads" -``` - -### Task 5: Make Compact Restart The Primary Continuation Shape - -**Files:** -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py` -- Modify: `/Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` -- Test: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_sandbox_agent_server.py` - -**Step 1: Write the failing test** - -Add tests showing that after exploration drift or truncation, the stage continuation path prefers compact `reset=True` restart payloads and logs restart metadata consistently. - -Cover: - -- restart metadata on host path -- restart metadata on sandbox path -- reduced carry-forward payload shape - -**Step 2: Run test to verify it fails** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_sandbox_agent_server.py -q -``` - -Expected: FAIL because current live behavior still does not consistently surface restart metadata and still leaks too much old context into continuation. 
- -**Step 3: Write minimal implementation** - -Promote compact restart to the default continuation path for `code` / `test`, ensure restart logging is explicit, and keep carried state limited to the compact checkpoint. - -**Step 4: Run test to verify it passes** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_executor_stage_logs.py tests/test_sandbox_agent_server.py -q -``` - -Expected: PASS. - -**Step 5: Commit** - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/app/worker/executor.py /Users/jowang/Documents/github/silicon_agent/platform/sandbox/agent_server.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_sandbox_agent_server.py -git commit -m "fix(worker): prefer compact restart continuations" -``` - -### Task 6: Regression And Live Validation - -**Files:** -- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py` -- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py` -- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py` -- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py` -- Modify if needed: `/Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py` - -**Step 1: Run targeted local regression** - -Run: -```bash -cd /Users/jowang/Documents/github/silicon_agent/platform -. .venv/bin/activate -pytest tests/test_agents.py tests/test_agents_api.py tests/test_prompts.py tests/test_executor_stage_logs.py tests/test_engine_stage_execution.py tests/test_sandbox_agent_server.py -q -``` - -Expected: PASS. 
- -**Step 2: Deploy to VM and validate against the known hello-world task family** - -Use the VM flow already established for: - -- pulling `origin/codex/raise-cb-and-optimize-coding-image` -- restarting backend -- cloning task `339f8bd3-c5f2-4da5-8267-15a6ec3aaaa3` - -Capture: - -- total task tokens -- `parse` / `code` / `test` stage tokens -- `chat_sent` -- `tool_calls` -- `max_turn_markers` -- `forced_convergence` -- `restart_count` - -**Step 3: Commit** - -If only test updates were needed: - -```bash -git add /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_agents_api.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_prompts.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_executor_stage_logs.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_engine_stage_execution.py /Users/jowang/Documents/github/silicon_agent/platform/tests/test_sandbox_agent_server.py -git commit -m "test(worker): cover static context token optimization regressions" -```