Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions services/api/api/sandbox/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,23 @@
# Parsed once at import time from KUBERNETES_ATTACH_LOG_TAIL_LINES (default
# "200"); a non-integer value raises ValueError on import.
# NOTE(review): presumably the number of trailing log lines requested when
# attaching to a sandbox — confirm at the usage site.
_ATTACH_LOG_TAIL_LINES = int(os.getenv("KUBERNETES_ATTACH_LOG_TAIL_LINES", "200"))
# Name of the sandbox (agent) container; `_agent_container_terminated` matches
# container-status entries against it.
_CONTAINER_NAME = "sandbox"
# NOTE(review): presumably the numeric UID the agent runs as — confirm where used.
_AGENT_UID = 1001


def _agent_container_terminated(pod: Any) -> bool:
    """Return True when the sandbox (agent) container has terminated.

    The pod's ``phase`` stays ``Running`` as long as any container is alive, so
    a crashed/OOMKilled agent container is hidden by the surviving per-sandbox
    tool-server sidecar. This inspects the agent container's own state so the
    caller can stop treating such a session as live.

    Args:
        pod: Pod object to inspect. Every attribute is read defensively with
            ``getattr``, so a missing ``status``, a missing or empty
            ``container_statuses``, or a missing ``state`` all count as
            "not terminated" instead of raising.

    Returns:
        ``True`` only if the first container-status entry named
        ``_CONTAINER_NAME`` has a non-``None`` ``state.terminated``;
        ``False`` otherwise, including when no such entry exists.
    """
    statuses = getattr(getattr(pod, "status", None), "container_statuses", None) or []
    for container_status in statuses:
        # Skip sidecars: only the agent container's own state matters here.
        if getattr(container_status, "name", None) != _CONTAINER_NAME:
            continue
        state = getattr(container_status, "state", None)
        # The first entry carrying the agent container's name decides the result.
        return getattr(state, "terminated", None) is not None
    # No status entry for the agent container was found.
    return False
# Overlay paths: `_SANDBOX_OVERLAY_DIR` is the `org` subdirectory of the root.
# NOTE(review): presumably these are paths inside the sandbox container — confirm.
_SANDBOX_OVERLAY_ROOT = "/home/agent/overlay"
_SANDBOX_OVERLAY_DIR = f"{_SANDBOX_OVERLAY_ROOT}/org"
# Writable dir the tool-server sidecar installs overlay tool deps into. The
Expand Down Expand Up @@ -1900,6 +1917,15 @@ async def status_by_id(self, sandbox_id: str) -> str:
return "stopped"
phase = (pod.status.phase or "").lower()
if phase == "running":
# Pod phase stays "Running" while *any* container runs, so a
# crashed or OOMKilled agent container is masked by the surviving
# per-sandbox tool-server sidecar. Report "stopped" when the agent
# container itself has terminated, so callers finalize the
# execution (failed) instead of reconnecting to a dead container's
# stdout forever — a stream that EOFs immediately on every attach,
# which otherwise reclaim-loops until the hard deadline.
if _agent_container_terminated(pod):
return "stopped"
return "running"
if phase == "pending":
return "created"
Expand Down
54 changes: 54 additions & 0 deletions services/api/tests/test_sandbox_kubernetes_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2171,3 +2171,57 @@ async def test_create_isolates_codex_api_key_from_claude_sandbox(
proxy_yaml = fake_core.created_configmaps[0][1]["data"]["proxy.yaml"]
assert "OPENAI_API_KEY" in proxy_yaml
assert "ANTHROPIC_API_KEY" not in proxy_yaml


def _status_pod(*, phase: str, sandbox_terminated: bool) -> SimpleNamespace:
    """Build a minimal fake pod with a ``sandbox`` and a ``tool-server`` container.

    Args:
        phase: Value exposed as ``pod.status.phase``.
        sandbox_terminated: When true, the ``sandbox`` container's
            ``state.terminated`` is an OOMKilled record (exit code 137);
            otherwise it is ``None``.

    Returns:
        A ``SimpleNamespace`` tree exposing ``metadata.deletion_timestamp``
        (always ``None``), ``status.phase`` and ``status.container_statuses``.
        The ``tool-server`` entry is never terminated.
    """
    terminated = (
        SimpleNamespace(reason="OOMKilled", exit_code=137)
        if sandbox_terminated
        else None
    )
    return SimpleNamespace(
        metadata=SimpleNamespace(deletion_timestamp=None),
        status=SimpleNamespace(
            phase=phase,
            container_statuses=[
                SimpleNamespace(
                    name="sandbox", state=SimpleNamespace(terminated=terminated)
                ),
                SimpleNamespace(
                    name="tool-server", state=SimpleNamespace(terminated=None)
                ),
            ],
        ),
    )


@pytest.mark.asyncio
async def test_status_by_id_reports_stopped_when_agent_container_terminated(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A pod whose sandbox container OOMKilled must not read as ``running``.

    The pod phase stays ``Running`` because the per-sandbox tool-server sidecar
    survives the agent container's OOM kill. Reporting ``running`` makes the
    execution worker reclaim-loop on a stream that EOFs immediately on every
    reattach (dead container's stdout) until the hard deadline, instead of
    finalizing the execution as failed.
    """
    from api.sandbox.kubernetes import KubernetesExecutorBackend

    monkeypatch.setenv("KUBERNETES_NAMESPACE", "centaur")
    backend = KubernetesExecutorBackend()

    async def _noop() -> None:
        return None

    # Replace client setup with a no-op and inject the fake core API directly
    # (presumably so no real cluster client is constructed — confirm).
    monkeypatch.setattr(backend, "_ensure_clients", _noop)
    fake_core = FakeCoreApi()
    backend._core = fake_core  # type: ignore[attr-defined]

    # Phase is "Running" but the agent container has terminated → "stopped".
    fake_core.pods_to_read = [_status_pod(phase="Running", sandbox_terminated=True)]
    assert await backend.status_by_id("sandbox-x") == "stopped"

    # Control: agent container still alive → genuinely running.
    fake_core.pods_to_read = [_status_pod(phase="Running", sandbox_terminated=False)]
    assert await backend.status_by_id("sandbox-x") == "running"