Martian-Engineering · jalehman · Mar 20, 2026 · Mar 20, 2026
diff --git a/.changeset/tiny-ants-share.md b/.changeset/tiny-ants-share.md
@@ -0,0 +1,5 @@
+---
+"@martian-engineering/lossless-claw": minor
+---
+
+Add runtime-assisted transcript GC for summarized externalized tool results so active session transcripts can shrink after oversized tool output has been condensed and preserved in `large_files`.
diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl
@@ -157,3 +157,18 @@
 {"type":"create","timestamp":"2026-03-19T22:55:53.112027Z","issue_id":"lossless-claw-744","payload":{"description":"Observed on 2026-03-19 against live OpenClaw using ~/.openclaw/lcm.db with largeFileThresholdTokens=8000.\n\nRepro:\n1. Ask the agent to print the first 200000 characters of extensions/diffs/assets/viewer-runtime.js.\n2. LCM stores the payload inline in messages/message_parts for conversation 642.\n3. No new large_files row is created and no [LCM Tool Output: file_...] placeholder appears in messages.content.\n\nEvidence from live DB:\n- messages 118156 and 118158 are role=tool in conversation 642\n- message_parts rows for those messages are part_type=text with metadata.originalRole=toolResult, toolName=exec\n- total placeholder count in messages is 0\n- large_files has no rows created in the last day\n\nLikely cause:\ninterceptLargeToolResults() only handles params.message.role === \"toolResult\" with array content items of type tool_result/toolResult/function_call_output. Live exec output is arriving as stored role=tool with plain text parts, so the interceptor never rewrites it.\n\nExpected:\nOversized plain-text tool outputs from live OpenClaw exec/tool calls should externalize into large_files and leave an [LCM Tool Output: file_...] placeholder.","priority":"1","title":"Large tool output externalization misses plain-text tool parts from live OpenClaw exec results","type":"bug"}}
 {"type":"status_update","timestamp":"2026-03-19T23:24:13.028377Z","issue_id":"lossless-claw-744","payload":{"status":"in_progress"}}
 {"type":"close","timestamp":"2026-03-19T23:27:39.59025Z","issue_id":"lossless-claw-744","payload":{}}
+{"type":"create","timestamp":"2026-03-20T23:33:25.840447Z","issue_id":"lossless-claw-71a","payload":{"description":"Add the LCM-side query/helper described in the spec to identify transcript GC candidates. Candidates must be tool-result messages, outside the protected fresh tail, already covered by summaries via summary_messages, and not currently protected in context_items. The helper should return enough metadata to build compact replacements in oldest-first batches. Suggested file touch map: src/store/summary-store.ts, src/engine.ts, test/*summary-store*, test/*engine*.","priority":"1","title":"Select transcript-GC candidates from summarized tool results","type":"task"}}
+{"type":"create","timestamp":"2026-03-20T23:33:25.840444Z","issue_id":"lossless-claw-3ea","payload":{"description":"Implement Phase 2 from specs/tool-result-externalization-and-incremental-bootstrap.md now that OpenClaw has merged context-engine transcript maintenance support. Scope: conservative transcript GC for summarized oversized tool results via ContextEngine.maintain(), runtimeContext.rewriteTranscriptEntries(), candidate selection from LCM state, and integration tests. Goal: shrink active session JSONL after content is safely condensed while preserving large_files-backed recall and crash-safe transcript correctness. Reference: specs/tool-result-externalization-and-incremental-bootstrap.md and OpenClaw PR #51191.","priority":"1","title":"Phase 2: runtime-assisted transcript GC","type":"epic"}}
+{"type":"create","timestamp":"2026-03-20T23:33:25.840453Z","issue_id":"lossless-claw-b6e","payload":{"description":"Implement LosslessClawEngine.maintain() using the merged OpenClaw maintenance API. Build replacement toolResult messages from existing large_files-backed placeholders, align candidates to transcript entry ids conservatively, call runtimeContext.rewriteTranscriptEntries(), and add tests proving rewrites run only for eligible summarized tool outputs on bootstrap/turn/compaction paths. Suggested file touch map: src/engine.ts, test/*engine*, test/*integration*.","priority":"1","title":"Implement maintain() transcript rewrites and tests","type":"task"}}
+{"type":"rename","timestamp":"2026-03-20T23:33:41.143578Z","issue_id":"lossless-claw-71a","payload":{"new_id":"lossless-claw-3ea.1"}}
+{"type":"dep_add","timestamp":"2026-03-20T23:33:41.143578Z","issue_id":"lossless-claw-3ea.1","payload":{"dep_type":"parent-child","depends_on":"lossless-claw-3ea"}}
+{"type":"rename","timestamp":"2026-03-20T23:33:41.205701Z","issue_id":"lossless-claw-b6e","payload":{"new_id":"lossless-claw-3ea.2"}}
+{"type":"dep_add","timestamp":"2026-03-20T23:33:41.205701Z","issue_id":"lossless-claw-3ea.2","payload":{"dep_type":"parent-child","depends_on":"lossless-claw-3ea"}}
+{"type":"dep_add","timestamp":"2026-03-20T23:33:41.266466Z","issue_id":"lossless-claw-3ea","payload":{"dep_type":"blocks","depends_on":"lossless-claw-3ea.1"}}
+{"type":"dep_add","timestamp":"2026-03-20T23:33:41.370143Z","issue_id":"lossless-claw-3ea","payload":{"dep_type":"blocks","depends_on":"lossless-claw-3ea.2"}}
+{"type":"dep_add","timestamp":"2026-03-20T23:33:41.445513Z","issue_id":"lossless-claw-3ea.2","payload":{"dep_type":"blocks","depends_on":"lossless-claw-3ea.1"}}
+{"type":"status_update","timestamp":"2026-03-20T23:33:41.51239Z","issue_id":"lossless-claw-3ea.1","payload":{"status":"in_progress"}}
+{"type":"status_update","timestamp":"2026-03-20T23:41:13.844213Z","issue_id":"lossless-claw-3ea.2","payload":{"status":"in_progress"}}
+{"type":"close","timestamp":"2026-03-20T23:41:13.915384Z","issue_id":"lossless-claw-3ea.1","payload":{}}
+{"type":"close","timestamp":"2026-03-20T23:41:13.977321Z","issue_id":"lossless-claw-3ea.2","payload":{}}
+{"type":"close","timestamp":"2026-03-20T23:41:14.053332Z","issue_id":"lossless-claw-3ea","payload":{}}
diff --git a/specs/tool-result-externalization-and-incremental-bootstrap.md b/specs/tool-result-externalization-and-incremental-bootstrap.md
@@ -0,0 +1,192 @@
+# Tool Result Externalization, Transcript GC, and Incremental Bootstrap
+
+**Status:** In progress  
+**Date:** 2026-03-20  
+**Scope:** `lossless-claw` plugin with small OpenClaw runtime/API support  
+**Priority:** High
+
+## Problem
+
+`lossless-claw` bounds model context growth, but long-lived tool-heavy sessions can still grow their active session JSONL without bound.
+
+Without transcript maintenance:
+
+- large `toolResult` payloads remain inline in the active transcript
+- restart/bootstrap cost grows with transcript size
+- crashes force the same oversized history to be replayed
+- LCM compaction helps the model context, but not the hot transcript on disk
+
+The design here addresses three related concerns:
+
+1. externalize oversized tool output into `large_files`
+2. GC old transcript entries once their content is safely condensed
+3. make bootstrap cost proportional to transcript deltas instead of full history size
+
+## Current Implementation Status
+
+### Implemented in `lossless-claw`
+
+#### Phase 1: Incremental bootstrap and ingest-time externalization
+
+These pieces are implemented on `main`:
+
+- `large_files` storage with retrieval-friendly `file_...` references
+- ingest-time externalization of oversized tool-result payloads
+- compact `[LCM Tool Output: ...]` placeholders in stored message content
+- `message_parts.metadata` linkage for `externalizedFileId`, `originalByteSize`, and `toolOutputExternalized`
+- `conversation_bootstrap_state` persistence
+- unchanged-file bootstrap fast path
+- append-only tail-import bootstrap fast path
+- streaming fallback bootstrap parsing
+- constrained FTS indexing for externalized placeholders
+
+Relevant code:
+
+- [engine.ts](../src/engine.ts)
+- [large-files.ts](../src/large-files.ts)
+- [summary-store.ts](../src/store/summary-store.ts)
+- [conversation-store.ts](../src/store/conversation-store.ts)
+
+#### Phase 2: Runtime-assisted transcript GC, first pass
+
+This branch adds the first transcript-GC pass:
+
+- `SummaryStore.listTranscriptGcCandidates()` returns summarized tool-result messages that are:
+  - already externalized into `large_files`
+  - covered by `summary_messages`
+  - no longer present as raw `context_items`
+- `LcmContextEngine.maintain()` rebuilds compact replacement `toolResult` messages from stored `message_parts`
+- transcript rewrite requests are sent through OpenClaw's runtime-owned `rewriteTranscriptEntries()` hook
+- alignment is conservative and only proceeds when a candidate can be matched to a unique active transcript entry by `toolCallId`
+
+This intentionally skips ambiguous cases instead of attempting unsafe transcript surgery.
+
+Relevant code:
+
+- [engine.ts](../src/engine.ts)
+- [assembler.ts](../src/assembler.ts)
+- [summary-store.ts](../src/store/summary-store.ts)
+
+### Implemented in OpenClaw
+
+OpenClaw now provides the runtime support this design needed:
+
+- `ContextEngine.maintain()`
+- `runtimeContext.rewriteTranscriptEntries()`
+- safe branch-and-reappend transcript rewrites owned by the runtime
+- maintenance call sites after bootstrap, successful turns, and compaction
+
+That runtime support landed upstream via OpenClaw PR `#51191`.
+
+## Design
+
+### Proposal A: Tool-result externalization
+
+Oversized tool outputs should live in `large_files`, not inline in ordinary message storage.
+
+Current behavior:
+
+- tool outputs above the configured threshold are stored out-of-line
+- LCM persists a compact tool-output placeholder instead of the raw blob
+- retrieval remains possible via `file_...` references
+
+### Proposal B: Transcript GC
+
+Once old tool-result content has been safely condensed, the active transcript should no longer retain the giant inline blob.
+
+The first pass uses this eligibility rule:
+
+1. message is a tool-result row in LCM
+2. content was already externalized during ingest
+3. message is linked through `summary_messages`
+4. message is no longer a raw `context_items` entry
+5. the active transcript contains a unique matching tool-result entry for the same `toolCallId`
+
+When all of those are true, `maintain()` asks the runtime to replace the active transcript entry with the compact placeholder-backed `toolResult`.
+
+### Proposal C: Incremental bootstrap
+
+Bootstrap should skip or tail-import when the transcript is unchanged or append-only.
+
+Current behavior:
+
+- unchanged transcript: skip bootstrap work
+- append-only transcript: ingest only the tail
+- suspicious rewrite/truncation: fall back to full streaming reconciliation
+
+## Why This Matters
+
+This work addresses an operational problem, not just a model-context problem.
+
+Benefits:
+
+- active session transcripts stop accumulating unbounded large tool blobs
+- restarts become cheaper over time
+- crash recovery avoids repeatedly paying for the same oversized raw history
+- recall remains intact through `large_files`
+
+## Remaining Work
+
+The implementation is useful now, but it is not the full end state.
+
+### 1. Handle legacy inline oversized tool results
+
+The current transcript-GC pass only rewrites tool results that were already externalized during ingest.
+
+Still needed:
+
+- nominate old oversized inline tool results that predate externalization
+- externalize their raw content during maintenance if needed
+- then rewrite those transcript entries
+
+### 2. Improve transcript-entry alignment
+
+The current pass aligns transcript entries by unique `toolCallId`.
+
+That is safe, but conservative. It skips cases where:
+
+- the same `toolCallId` matches more than one active transcript entry
+- the active transcript shape cannot be matched with confidence
+
+Still needed:
+
+- a more robust mapping strategy, or
+- additive persistence of stable transcript entry ids
+
+### 3. Tighten eligibility and fresh-tail protection
+
+Today the only effective protection rule is "summarized and no longer a raw context item"; there is no explicit fresh-tail policy yet.
+
+Still needed:
+
+- an explicit fresh-tail policy
+- optional size/noise thresholds for GC
+- bounded batch tuning and observability for maintenance passes
+
+### 4. Add end-to-end runtime integration coverage
+
+Focused unit coverage exists for candidate selection and rewrite request generation.
+
+Still needed:
+
+- integration coverage against the real merged OpenClaw maintenance lifecycle
+- verification of bootstrap/turn/compaction-triggered rewrites in realistic session files
+
+### 5. Phase 3 preventive hygiene
+
+The current model is still mostly reactive.
+
+Still needed:
+
+- write-time transcript paths that avoid landing giant inline tool blobs in the first place where possible
+- optional normalization of repeated low-value progress spam
+
+## Recommendation
+
+Keep the current first pass narrow and safe, and continue Phase 2 with:
+
+1. legacy inline tool-result cleanup
+2. stronger transcript-entry identity/alignment
+3. end-to-end integration coverage
+
+That sequence preserves correctness while moving steadily toward bounded transcript growth in real long-lived sessions.
diff --git a/src/assembler.ts b/src/assembler.ts
@@ -420,7 +420,8 @@ export function blockFromPart(part: MessagePartRecord): unknown {
   return { type: "text", text: "" };
 }
 
-function contentFromParts(
+/** @internal Exported for transcript-maintenance reconstruction. */
+export function contentFromParts(
   parts: MessagePartRecord[],
   role: "user" | "assistant" | "toolResult",
   fallbackContent: string,
@@ -449,7 +450,8 @@ function contentFromParts(
   return blocks;
 }
 
-function pickToolCallId(parts: MessagePartRecord[]): string | undefined {
+/** @internal Exported for transcript-maintenance reconstruction. */
+export function pickToolCallId(parts: MessagePartRecord[]): string | undefined {
   for (const part of parts) {
     if (typeof part.toolCallId === "string" && part.toolCallId.length > 0) {
       return part.toolCallId;
@@ -478,7 +480,8 @@ function pickToolCallId(parts: MessagePartRecord[]): string | undefined {
   return undefined;
 }
 
-function pickToolName(parts: MessagePartRecord[]): string | undefined {
+/** @internal Exported for transcript-maintenance reconstruction. */
+export function pickToolName(parts: MessagePartRecord[]): string | undefined {
   for (const part of parts) {
     if (typeof part.toolName === "string" && part.toolName.length > 0) {
       return part.toolName;
@@ -507,7 +510,8 @@ function pickToolName(parts: MessagePartRecord[]): string | undefined {
   return undefined;
 }
 
-function pickToolIsError(parts: MessagePartRecord[]): boolean | undefined {
+/** @internal Exported for transcript-maintenance reconstruction. */
+export function pickToolIsError(parts: MessagePartRecord[]): boolean | undefined {
   for (const part of parts) {
     const decoded = parseJson(part.metadata);
     if (!decoded || typeof decoded !== "object") {