diff --git a/.changeset/skip-media-only-messages.md b/.changeset/skip-media-only-messages.md new file mode 100644 index 00000000..f4249201 --- /dev/null +++ b/.changeset/skip-media-only-messages.md @@ -0,0 +1,5 @@ +--- +"@martian-engineering/lossless-claw": patch +--- + +Skip media-only messages from the summarization pipeline. Messages whose text content (after stripping `MEDIA:/` file path references) is below 50 characters are excluded from summarizer input, avoiding wasted API calls on content that cannot be meaningfully compressed. diff --git a/src/compaction.ts b/src/compaction.ts index a78db8b5..0e9c0e3c 100644 --- a/src/compaction.ts +++ b/src/compaction.ts @@ -144,6 +144,20 @@ const FALLBACK_MAX_CHARS = 512 * 4; const DEFAULT_LEAF_CHUNK_TOKENS = 20_000; const CONDENSED_MIN_INPUT_RATIO = 0.1; +/** + * Minimum text length (after stripping file/media references) for a message + * to be worth sending to the summarizer. Messages below this threshold are + * typically media-only (an image attachment with no accompanying text). + */ +const MEDIA_ONLY_MIN_TEXT_LENGTH = 50; + +const MEDIA_PATH_RE = /MEDIA:\/\S+/g; + +function isMediaOnlyContent(content: string): boolean { + const stripped = content.replace(MEDIA_PATH_RE, "").trim(); + return stripped.length < MEDIA_ONLY_MIN_TEXT_LENGTH; +} + function dedupeOrderedIds(ids: Iterable): string[] { const seen = new Set(); const ordered: string[] = []; @@ -1063,7 +1077,21 @@ export class CompactionEngine { } } - const concatenated = messageContents + // Skip media-only messages that cannot be meaningfully summarized. + const summarizable = messageContents.filter( + (message) => !isMediaOnlyContent(message.content), + ); + + // If every message in this chunk is media-only, skip the entire leaf pass + // rather than sending an empty string to the summarizer. + if (summarizable.length === 0) { + console.warn( + `[lcm] skipping leaf chunk: all ${messageContents.length} messages are media-only; conversationId=${conversationId}`, + ); + return null; + } + + const concatenated = summarizable .map((message) => `[${formatTimestamp(message.createdAt, this.config.timezone)}]\n${message.content}`) .join("\n\n"); const fileIds = dedupeOrderedIds(