getsentry · dcramer · May 21, 2026 · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/packages/junior-evals/README.md b/packages/junior-evals/README.md
@@ -67,6 +67,7 @@ Harness override knobs (in `EvalOverrides`):
 - `auto_complete_mcp_oauth`: after our app genuinely starts an MCP OAuth flow for the listed providers, the harness immediately completes the fake provider callback.
 - `auto_complete_oauth`: after our app genuinely starts a generic OAuth flow for the listed providers, the harness immediately completes the fake provider callback.
 - `fail_reply_call`: force a non-retryable reply failure on a specific call.
+- `faults.sandbox_bash_stream_interrupts`: inject a fixed number of eval-only sandbox bash stream interruptions so the real agent must recover from failed command results.
 - `mock_image_generation`: stub the image-generation HTTP response with a valid image payload while still exercising the real attachment path.
 - `plugin_dirs`: load plugin fixtures from eval-local directories without adding workspace packages.
 - `reply_texts`: override returned reply text per call.

diff --git a/packages/junior-evals/evals/behavior-harness.ts b/packages/junior-evals/evals/behavior-harness.ts
@@ -130,6 +130,9 @@ export interface EvalOverrides {
   auto_complete_oauth?: string[];
   enable_test_credentials?: boolean;
   fail_reply_call?: number;
+  faults?: {
+    sandbox_bash_stream_interrupts?: number;
+  };
   mock_image_generation?: boolean;
   plugin_dirs?: string[];
   plugin_packages?: string[];
@@ -397,6 +400,8 @@ const HARNESS_ENV_KEYS = [
   "EVAL_TEST_CREDENTIAL_TOKEN",
   "JUNIOR_BASE_URL",
   "JUNIOR_EXTRA_PLUGIN_ROOTS",
+  "JUNIOR_EVAL_ENABLE_FAULTS",
+  "JUNIOR_EVAL_FAULT_SANDBOX_BASH_STREAM_INTERRUPTS",
   "JUNIOR_STATE_ADAPTER",
   "SLACK_BOT_TOKEN",
 ] as const;
@@ -942,6 +947,18 @@ async function setupHarnessEnvironment(
         scenario.overrides.test_credential_token;
     }
   }
+  const sandboxBashStreamInterrupts =
+    scenario.overrides?.faults?.sandbox_bash_stream_interrupts;
+  if ( // Enable eval fault injection only for a finite count that is still >= 1 once floored.
+    typeof sandboxBashStreamInterrupts === "number" &&
+    Number.isFinite(sandboxBashStreamInterrupts) &&
+    sandboxBashStreamInterrupts >= 1 // a value in (0, 1) would floor to "0" while still switching faults on
+  ) {
+    process.env.JUNIOR_EVAL_ENABLE_FAULTS = "1";
+    process.env.JUNIOR_EVAL_FAULT_SANDBOX_BASH_STREAM_INTERRUPTS = String(
+      Math.floor(sandboxBashStreamInterrupts), // the env var carries a whole-number count
+    );
+  }
   process.env.JUNIOR_BASE_URL = "https://junior.example.com";
   process.env.JUNIOR_STATE_ADAPTER = "memory";
   process.env.JUNIOR_EXTRA_PLUGIN_ROOTS = JSON.stringify(configuredPluginDirs);

diff --git a/packages/junior-evals/evals/core/lifecycle-and-resilience.eval.ts b/packages/junior-evals/evals/core/lifecycle-and-resilience.eval.ts
@@ -68,4 +68,37 @@ describeEval("Lifecycle and Resilience", slackEvals, (it) => {
       }),
     });
   });
+
+  it("when a sandbox command stream is interrupted, recover and finish the request", async ({
+    run,
+  }) => {
+    await run({
+      overrides: {
+        faults: {
+          sandbox_bash_stream_interrupts: 1, // one injected interruption; the fixture skill retries the command once
+        },
+        skill_dirs: ["evals/fixtures/skills"], // directory containing the resilient-working-directory fixture skill
+      },
+      events: [
+        mention(
+          "/resilient-working-directory list files in the working directory",
+        ),
+      ],
+      taskTimeout: 120_000, // same 120000 ms value the fixture skill passes as the bash `timeout_ms`
+      criteria: rubric({
+        contract:
+          "A transient sandbox command-stream interruption is treated as recoverable tool output, not a terminal assistant failure.",
+        pass: [
+          "observed_tool_invocations includes at least two `bash` calls, showing the agent retried after the injected interruption.",
+          "assistant_posts contains exactly one final reply.",
+          "The reply includes `Working directory files:` and a fenced list of files from the successful retry.",
+        ],
+        fail: [
+          "Do not post a generic assistant failure reply.",
+          "Do not stop after reporting only the injected stream interruption.",
+          "Do not mention Sentry event IDs, stack traces, or provider internals.",
+        ],
+      }),
+    });
+  });
 });
diff --git a/packages/junior-evals/evals/fixtures/skills/resilient-working-directory/SKILL.md b/packages/junior-evals/evals/fixtures/skills/resilient-working-directory/SKILL.md
@@ -0,0 +1,28 @@
+---
+name: resilient-working-directory
+description: Use for /resilient-working-directory eval requests that verify command interruption recovery.
+allowed-tools: bash
+---
+
+Generate a short response for `/resilient-working-directory` requests in eval runs.
+
+## Step 1: List Files
+
+Call `bash` with this input:
+
+```json
+{ "command": "ls -1", "timeout_ms": 120000, "max_output_chars": 12000 }
+```
+
+## Step 2: Recover Once
+
+If the command result has `ok: false` and `stderr` says the command stream ended before the command finished, call the same `bash` command one more time.
+
+## Step 3: Return Result
+
+- If the final command result has `ok: true`, return markdown with:
+  - `Working directory files:`
+  - a fenced code block containing `stdout`
+- If the final command result has `ok: false`, return markdown with:
+  - `Working directory files: unavailable`
+  - `Error:` and `stderr`
diff --git a/packages/junior/src/chat/prompt.ts b/packages/junior/src/chat/prompt.ts
@@ -438,7 +438,7 @@ const EXECUTION_CONTRACT_RULES = [
 const CONVERSATION_RULES = [
   "- In thread follow-ups, answer from prior thread context; do not repeat resolved clarifying questions.",
   "- Preserve attribution roles from thread context: the requester is the person asking now, which may differ from the original reporter or subject.",
-  "- On resumed turns, post a brief continuation notice, then the resumed answer as a separate message.",
+  "- Runtime owns continuation and authorization notices; on resumed turns, answer with the final requested content only.",
 ];
 
 const SLACK_ACTION_RULES = [

diff --git a/packages/junior/src/chat/respond.ts b/packages/junior/src/chat/respond.ts
@@ -81,11 +81,12 @@ import {
   toAgentThinkingLevel,
   type TurnThinkingSelection,
 } from "@/chat/services/turn-thinking-level";
-import type { AgentTurnUsage } from "@/chat/usage";
+import { hasAgentTurnUsage, type AgentTurnUsage } from "@/chat/usage";
 import {
   loadTurnCheckpoint,
   persistCompletedCheckpoint,
   persistAuthPauseCheckpoint,
+  persistRunningCheckpoint,
   persistTimeoutCheckpoint,
 } from "@/chat/services/turn-checkpoint";
 import { createMcpAuthOrchestration } from "@/chat/services/mcp-auth-orchestration";
@@ -185,6 +186,16 @@ function trimRouterAttachmentText(text: string): string {
     : `${normalized.slice(0, MAX_ROUTER_ATTACHMENT_PREVIEW_CHARS)}...`;
 }
 
+function extractSliceUsage( // Summarizes usage from the assistant messages located at or after `beforeMessageCount`.
+  messages: PiMessage[],
+  beforeMessageCount: number, // Entries before this index are skipped so usage from prior slices (already reported) is not counted again.
+): AgentTurnUsage | undefined {
+  const usage = extractGenAiUsageSummary(
+    ...messages.slice(beforeMessageCount).filter(isAssistantMessage), // only assistant messages feed the summary
+  );
+  return hasAgentTurnUsage(usage) ? usage : undefined; // a summary that fails hasAgentTurnUsage is returned as undefined
+}
+
 function supportsRouterTextPreview(mediaType: string): boolean {
   const baseMediaType = mediaType.split(";", 1)[0]?.trim().toLowerCase();
   if (!baseMediaType) {
@@ -394,6 +405,14 @@ export async function generateAssistantReply(
   let timedOut = false;
   let turnUsage: AgentTurnUsage | undefined;
   let thinkingSelection: TurnThinkingSelection | undefined;
+  const checkpointLogContext = { // Correlation and bot identifiers passed as `logContext` to the checkpoint persistence calls below.
+    threadId: context.correlation?.threadId,
+    requesterId: context.correlation?.requesterId,
+    channelId: context.correlation?.channelId,
+    runId: context.correlation?.runId,
+    assistantUserName: botConfig.userName,
+    modelId: botConfig.modelId,
+  };
 
   const getSandboxMetadata = () =>
     sandboxExecutor
@@ -917,8 +936,31 @@ export async function generateAssistantReply(
     });
     let hasEmittedText = false;
     let needsSeparator = false;
+    const persistSafeBoundary = async ( // Persists a running checkpoint for the current slice from the given message snapshot.
+      messages: PiMessage[],
+    ): Promise<void> => {
+      if ( // No-op unless the turn session is usable and both session identifiers are set.
+        !checkpointState.canUseTurnSession ||
+        !sessionConversationId ||
+        !sessionId
+      ) {
+        return;
+      }
+
+      await persistRunningCheckpoint({
+        conversationId: sessionConversationId,
+        sessionId,
+        sliceId: currentSliceId,
+        messages, // callers pass a copied array rather than the live agent state
+        loadedSkillNames: loadedSkillNamesForResume,
+        logContext: checkpointLogContext,
+      });
+    };
 
     const unsubscribe = agent.subscribe((event) => {
+      if (event.type === "turn_end" && event.toolResults.length > 0) {
+        return persistSafeBoundary([...agent!.state.messages]);
+      }
       if (event.type === "message_start") {
         Promise.resolve(context.onAssistantMessageStart?.()).catch((error) => {
           logWarn(
@@ -977,13 +1019,20 @@ export async function generateAssistantReply(
         spanContext,
         async () => {
           let promptResult: unknown;
+          const freshPromptMessage: PiMessage = {
+            role: "user",
+            content: promptContentParts,
+            timestamp: Date.now(),
+          } as PiMessage;
+          if (!resumedFromCheckpoint) {
+            await persistSafeBoundary([
+              ...agent.state.messages,
+              freshPromptMessage,
+            ]);
+          }
           const promptPromise = resumedFromCheckpoint
             ? agent.continue()
-            : agent.prompt({
-                role: "user",
-                content: promptContentParts,
-                timestamp: Date.now(),
-              });
+            : agent.prompt(freshPromptMessage);
 
           let timeoutId: ReturnType<typeof setTimeout> | undefined;
           const timeoutPromise = new Promise<never>((_, reject) => {
@@ -1044,9 +1093,7 @@ export async function generateAssistantReply(
             agent.state,
             ...outputMessages,
           );
-          turnUsage = Object.values(usageSummary).some(
-            (value) => value !== undefined,
-          )
+          turnUsage = hasAgentTurnUsage(usageSummary)
             ? usageSummary
             : undefined;
           setSpanAttributes({
@@ -1082,10 +1129,13 @@ export async function generateAssistantReply(
     ) {
       await persistCompletedCheckpoint({
         conversationId: sessionConversationId,
+        currentDurationMs: Date.now() - replyStartedAtMs,
+        currentUsage: turnUsage,
         sessionId,
         sliceId: currentSliceId,
         allMessages: agent.state.messages,
         loadedSkillNames: activeSkills.map((skill) => skill.name),
+        logContext: checkpointLogContext,
       });
     }
 
@@ -1114,21 +1164,19 @@ export async function generateAssistantReply(
     });
   } catch (error) {
     if (timedOut && timeoutResumeConversationId && timeoutResumeSessionId) {
+      turnUsage =
+        turnUsage ??
+        extractSliceUsage(timeoutResumeMessages, beforeMessageCount);
       const checkpoint = await persistTimeoutCheckpoint({
         conversationId: timeoutResumeConversationId,
         sessionId: timeoutResumeSessionId,
         currentSliceId: timeoutResumeSliceId,
+        currentDurationMs: Date.now() - replyStartedAtMs,
+        currentUsage: turnUsage,
         messages: timeoutResumeMessages,
         loadedSkillNames: loadedSkillNamesForResume,
         errorMessage: error instanceof Error ? error.message : String(error),
-        logContext: {
-          threadId: context.correlation?.threadId,
-          requesterId: context.correlation?.requesterId,
-          channelId: context.correlation?.channelId,
-          runId: context.correlation?.runId,
-          assistantUserName: botConfig.userName,
-          modelId: botConfig.modelId,
-        },
+        logContext: checkpointLogContext,
       });
       if (checkpoint) {
         throw new RetryableTurnError(
@@ -1151,36 +1199,21 @@ export async function generateAssistantReply(
       timeoutResumeSessionId
     ) {
       if (!turnUsage && timeoutResumeMessages.length > 0) {
-        // Match the canonical slice-scoped extraction: sum usage from new
-        // assistant messages produced during this slice, not the full
-        // message history (which may include prior slices whose usage was
-        // already reported in earlier footers).
-        const fallbackUsage = extractGenAiUsageSummary(
-          ...timeoutResumeMessages
-            .slice(beforeMessageCount)
-            .filter(isAssistantMessage),
+        turnUsage = extractSliceUsage(
+          timeoutResumeMessages,
+          beforeMessageCount,
         );
-        turnUsage = Object.values(fallbackUsage).some(
-          (value) => value !== undefined,
-        )
-          ? fallbackUsage
-          : undefined;
       }
       const nextSliceId = await persistAuthPauseCheckpoint({
         conversationId: timeoutResumeConversationId,
         sessionId: timeoutResumeSessionId,
         currentSliceId: timeoutResumeSliceId,
+        currentDurationMs: Date.now() - replyStartedAtMs,
+        currentUsage: turnUsage,
         messages: timeoutResumeMessages,
         loadedSkillNames: loadedSkillNamesForResume,
         errorMessage: error.message,
-        logContext: {
-          threadId: context.correlation?.threadId,
-          requesterId: context.correlation?.requesterId,
-          channelId: context.correlation?.channelId,
-          runId: context.correlation?.runId,
-          assistantUserName: botConfig.userName,
-          modelId: botConfig.modelId,
-        },
+        logContext: checkpointLogContext,
       });
       throw new RetryableTurnError(
         error.kind === "plugin" ? "plugin_auth_resume" : "mcp_auth_resume",

diff --git a/packages/junior/src/chat/runtime/reply-executor.ts b/packages/junior/src/chat/runtime/reply-executor.ts
@@ -61,12 +61,12 @@ import { buildDeterministicTurnId } from "@/chat/runtime/turn";
 import { markTurnCompleted, markTurnFailed } from "@/chat/runtime/turn";
 import { startActiveTurn } from "@/chat/runtime/turn";
 import { isRedundantReactionAckText } from "@/chat/services/reply-delivery-plan";
-import { deleteSlackMessage } from "@/chat/slack/outbound";
+import { deleteSlackMessage, postSlackMessage } from "@/chat/slack/outbound";
 import {
   finalizeFailedTurnReply,
   getAgentTurnDiagnosticsAttributes,
 } from "@/chat/services/turn-failure-response";
-import { buildTurnContinuationResponse } from "@/chat/services/turn-continuation-response";
+import { buildSlackTurnContinuationNotice } from "@/chat/slack/turn-continuation-notice";
 import { buildAuthPauseResponse } from "@/chat/services/auth-pause-response";
 import { maybeApplyProviderDefaultConfigRequest } from "@/chat/services/provider-default-config";
 
@@ -232,9 +232,22 @@ export function createReplyToThread(deps: ReplyExecutorDeps) {
         const postTurnContinuationNotice = async (): Promise<void> => {
           try {
             await beforeFirstResponsePost();
-            await thread.post(
-              buildSlackOutputMessage(buildTurnContinuationResponse()),
-            );
+            const notice = buildSlackTurnContinuationNotice({ conversationId });
+            const shouldUseSlackFooter =
+              Boolean(notice.blocks?.length) &&
+              Boolean(channelId && threadTs) &&
+              (thread.adapter as { name?: string } | undefined)?.name ===
+                "slack";
+            if (shouldUseSlackFooter && channelId && threadTs) {
+              await postSlackMessage({
+                channelId,
+                threadTs,
+                ...notice,
+              });
+              return;
+            }
+
+            await thread.post(buildSlackOutputMessage(notice.text));
           } catch (error) {
             logException(
               error,