diff --git a/src/app/api/query/tour/route.ts b/src/app/api/query/tour/route.ts index 1d2a586..70f95ad 100644 --- a/src/app/api/query/tour/route.ts +++ b/src/app/api/query/tour/route.ts @@ -5,7 +5,7 @@ import { callLLM } from '@/lib/llm'; export const maxDuration = 300; import { serializeNodesForQuery, buildTourPrompt } from '@/lib/agents/query'; import type { TourResponse, TourChapter, QuerySerializedNode } from '@/lib/agents/query'; -import { extractJsonObject } from '@/lib/utils/json'; +import { parseLlmJsonLoose } from '@/lib/llm/parse'; const EMPTY_TOUR: TourResponse = { chapters: [ @@ -103,8 +103,7 @@ export async function POST(_request: Request): Promise { } try { - const extracted = extractJsonObject(llmText); - const parsed = JSON.parse(extracted) as unknown; + const parsed = parseLlmJsonLoose(llmText); const tour = normalizeTour(parsed); if (!tour) { console.error('[tour] Invalid tour structure after normalize. Raw (200):', llmText.slice(0, 200)); diff --git a/src/lib/agents/distillation.ts b/src/lib/agents/distillation.ts index 95550e5..eb3272f 100644 --- a/src/lib/agents/distillation.ts +++ b/src/lib/agents/distillation.ts @@ -1,7 +1,7 @@ import { callLLM } from '@/lib/llm'; import { z } from 'zod'; import { getDistillableTypes } from '@/lib/config/captureTypes'; -import { extractJsonObject } from '@/lib/utils/json'; +import { parseLlmJson } from '@/lib/llm/parse'; import type { SupabaseClient } from '@supabase/supabase-js'; interface NodeSummary { @@ -53,7 +53,7 @@ export async function runDistillation( let groups: Array<{ node_ids: string[]; rationale: string }> = []; try { - const parsed = clusterSchema.parse(JSON.parse(extractJsonObject(clusterResult.content))); + const parsed = parseLlmJson(clusterResult.content, clusterSchema); groups = parsed.groups ?? 
[]; } catch { return { created: 0, errors: ['Cluster LLM response was not valid JSON or schema'] }; @@ -85,7 +85,7 @@ export async function runDistillation( maxTokens: 1024, }); - const synthesis = synthesisSchema.parse(JSON.parse(extractJsonObject(synthResult.content))); + const synthesis = parseLlmJson(synthResult.content, synthesisSchema); const { error } = await supabase.from('distillation_candidates').insert({ node_ids: groupNodes.map(n => n.id), diff --git a/src/lib/agents/extraction.ts b/src/lib/agents/extraction.ts index c905fcb..24f005d 100644 --- a/src/lib/agents/extraction.ts +++ b/src/lib/agents/extraction.ts @@ -6,7 +6,7 @@ import { getLlmNodeTypeEnum, getLlmNodeTypeDescriptions, } from '@/lib/config/captureTypes'; -import { extractJsonObject } from '@/lib/utils/json'; +import { parseLlmJsonLoose } from '@/lib/llm/parse'; // Computed once at module load from taxonomy config — change captureTypes.ts to update these. const LLM_NODE_TYPE_ENUM = getLlmNodeTypeEnum(); @@ -188,12 +188,9 @@ export function buildExtractionPrompt( } export function parseExtractionResponse(content: string): LlmExtraction { - const stripped = content.replace(/^```(?:json)?\n?/m, '').replace(/\n?```$/m, '').trim(); - const cleaned = extractJsonObject(stripped); - let parsed: unknown; try { - parsed = JSON.parse(cleaned); + parsed = parseLlmJsonLoose(content); } catch { // LLM returned natural language instead of JSON — likely a PDF it cannot read throw new Error('PDF_UNREADABLE'); @@ -271,16 +268,18 @@ export function buildMeetingExtractionPrompt( } export function parseMeetingExtractionResponse(content: string): MeetingExtraction { - const stripped = content.replace(/^```(?:json)?\n?/m, '').replace(/\n?```$/m, '').trim(); - const cleaned = extractJsonObject(stripped); - const parsed = JSON.parse(cleaned); + const parsed = parseLlmJsonLoose(content); + if (typeof parsed !== 'object' || parsed === null) { + throw new Error('Meeting extraction response must be a JSON object'); + } + 
const obj = parsed as Record; const required = ['meeting_title', 'meeting_summary', 'extracted_nodes']; for (const field of required) { - if (!(field in parsed)) { + if (!(field in obj)) { throw new Error(`Missing required field: ${field}`); } } - if (!Array.isArray(parsed.extracted_nodes) || parsed.extracted_nodes.length === 0) { + if (!Array.isArray(obj.extracted_nodes) || obj.extracted_nodes.length === 0) { throw new Error('extracted_nodes must be a non-empty array'); } return parsed as MeetingExtraction; @@ -349,14 +348,16 @@ export function buildDocumentExtractionPrompt( } export function parseDocumentExtractionResponse(content: string): DocumentExtraction { - const stripped = content.replace(/^```(?:json)?\n?/m, '').replace(/\n?```$/m, '').trim(); - const cleaned = extractJsonObject(stripped); - const parsed = JSON.parse(cleaned); + const parsed = parseLlmJsonLoose(content); + if (typeof parsed !== 'object' || parsed === null) { + throw new Error('Document extraction response must be a JSON object'); + } + const obj = parsed as Record; const required = ['document_title', 'document_summary', 'extracted_nodes']; for (const field of required) { - if (!(field in parsed)) throw new Error(`Missing required field: ${field}`); + if (!(field in obj)) throw new Error(`Missing required field: ${field}`); } - if (!Array.isArray(parsed.extracted_nodes) || parsed.extracted_nodes.length === 0) { + if (!Array.isArray(obj.extracted_nodes) || obj.extracted_nodes.length === 0) { throw new Error('extracted_nodes must be a non-empty array'); } return parsed as DocumentExtraction; diff --git a/src/lib/agents/process.ts b/src/lib/agents/process.ts index 1e5a7f9..8c56b2a 100644 --- a/src/lib/agents/process.ts +++ b/src/lib/agents/process.ts @@ -1,5 +1,6 @@ import type { Node } from '@/lib/types/nodes'; import { callLLM } from '@/lib/llm'; +import { parseLlmJsonLoose } from '@/lib/llm/parse'; import type { GoalContext } from '@/lib/agents/extraction'; export interface 
SuggestedNodeImpact { @@ -59,8 +60,9 @@ Return ONLY valid JSON array: Only include commitments where the learning is clearly relevant. If a commitment is unaffected, exclude it.`; function parseJsonResponse(content: string): T { - const cleaned = content.replace(/^```(?:json)?\n?/m, '').replace(/\n?```$/m, '').trim(); - return JSON.parse(cleaned) as T; + // Handles both object and array responses (step 3 returns an array), plus + // ```json fences and trailing prose. + return parseLlmJsonLoose(content) as T; } export async function suggestAffectedNodes( diff --git a/src/lib/agents/reflection.ts b/src/lib/agents/reflection.ts index 437285c..32993ac 100644 --- a/src/lib/agents/reflection.ts +++ b/src/lib/agents/reflection.ts @@ -1,3 +1,5 @@ +import { parseLlmJsonLoose } from '@/lib/llm/parse'; + // ─── Types ──────────────────────────────────────────────────────────────────── export interface ReflectionContext { @@ -175,13 +177,7 @@ const REQUIRED_FIELDS = [ ] as const; export function parseReflectionResponse(content: string): ReflectionReport { - // Strip markdown code fences if present - const cleaned = content - .replace(/^```(?:json)?\n?/m, '') - .replace(/\n?```$/m, '') - .trim(); - - const parsed: unknown = JSON.parse(cleaned); + const parsed: unknown = parseLlmJsonLoose(content); if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) { throw new Error('Reflection response must be a JSON object'); diff --git a/src/lib/agents/setup.ts b/src/lib/agents/setup.ts index 50e8bd8..0676a20 100644 --- a/src/lib/agents/setup.ts +++ b/src/lib/agents/setup.ts @@ -1,4 +1,5 @@ import { callLLM } from '@/lib/llm'; +import { parseLlmJsonLoose } from '@/lib/llm/parse'; const GOAL_SUGGEST_PROMPT = `You are helping a team articulate their strategic goals for a knowledge management system. The user will describe what they're trying to do in plain language. 
@@ -51,7 +52,7 @@ export async function suggestGoal(userInput: string): Promise { let parsed: unknown; try { - parsed = JSON.parse(response.content); + parsed = parseLlmJsonLoose(response.content); } catch { throw new Error('Failed to parse goal suggestion'); } @@ -80,7 +81,7 @@ export async function processSeedChat(input: SeedChatInput): Promise; const reasoning = typeof obj['reasoning'] === 'string' ? obj['reasoning'] : ''; diff --git a/src/lib/llm/__tests__/parse.test.ts b/src/lib/llm/__tests__/parse.test.ts new file mode 100644 index 0000000..2606c31 --- /dev/null +++ b/src/lib/llm/__tests__/parse.test.ts @@ -0,0 +1,74 @@ +import { describe, it, expect } from 'vitest'; +import { z } from 'zod'; +import { extractJson, parseLlmJson, tryParseLlmJson, parseLlmJsonLoose } from '../parse'; + +describe('extractJson', () => { + it('returns a bare object unchanged', () => { + expect(extractJson('{"a":1}')).toBe('{"a":1}'); + }); + + it('strips ```json code fences', () => { + expect(JSON.parse(extractJson('```json\n{"a":1}\n```'))).toEqual({ a: 1 }); + }); + + it('strips plain ``` fences and leading prose', () => { + expect(JSON.parse(extractJson('Here you go:\n```\n{"a":1}\n```'))).toEqual({ a: 1 }); + }); + + it('drops trailing commentary after the object', () => { + expect(JSON.parse(extractJson('{"a":1}\n\nHope that helps!'))).toEqual({ a: 1 }); + }); + + it('extracts a top-level array (process.ts step-3 shape)', () => { + expect(JSON.parse(extractJson('```json\n[{"id":1},{"id":2}]\n```'))).toEqual([{ id: 1 }, { id: 2 }]); + }); + + it('picks whichever of { or [ comes first', () => { + // a brace inside prose before the real array should not win if the array is the payload... 
+ // here the object genuinely comes first, so it wins: + expect(JSON.parse(extractJson('{"wrap":[1,2]}'))).toEqual({ wrap: [1, 2] }); + }); + + it('ignores braces inside strings', () => { + expect(JSON.parse(extractJson('{"text":"a } b { c"}'))).toEqual({ text: 'a } b { c' }); + }); +}); + +describe('parseLlmJson', () => { + const schema = z.object({ name: z.string(), n: z.number() }); + + it('parses + validates a fenced response', () => { + expect(parseLlmJson('```json\n{"name":"x","n":2}\n```', schema)).toEqual({ name: 'x', n: 2 }); + }); + + it('throws on schema mismatch', () => { + expect(() => parseLlmJson('{"name":"x"}', schema)).toThrow(); + }); + + it('throws on invalid JSON', () => { + expect(() => parseLlmJson('not json', schema)).toThrow(); + }); +}); + +describe('tryParseLlmJson', () => { + const schema = z.object({ ok: z.boolean() }); + + it('returns the value on success', () => { + expect(tryParseLlmJson('{"ok":true}', schema)).toEqual({ ok: true }); + }); + + it('returns null on failure instead of throwing', () => { + expect(tryParseLlmJson('garbage', schema)).toBeNull(); + expect(tryParseLlmJson('{"ok":"nope"}', schema)).toBeNull(); + }); +}); + +describe('parseLlmJsonLoose', () => { + it('parses fenced JSON without a schema', () => { + expect(parseLlmJsonLoose('```json\n{"a":1}\n```')).toEqual({ a: 1 }); + }); + + it('throws on invalid JSON', () => { + expect(() => parseLlmJsonLoose('definitely not json')).toThrow(); + }); +}); diff --git a/src/lib/llm/parse.ts b/src/lib/llm/parse.ts new file mode 100644 index 0000000..f81d5e5 --- /dev/null +++ b/src/lib/llm/parse.ts @@ -0,0 +1,63 @@ +import type { z } from 'zod'; + +/** + * Extracts a JSON value from LLM output, tolerating ```json code fences, + * leading prose, and trailing commentary. Returns the substring spanning the + * first balanced object `{...}` or array `[...]` (whichever appears first). 
/**
 * Minimal structural view of a validator: anything exposing `parse` that
 * either returns the validated value or throws. Typed structurally so the
 * helpers below do not depend on a particular schema class; Zod schemas are
 * expected to satisfy it through their `.parse` method.
 */
interface SchemaLike<T> {
  parse(data: unknown): T;
}

/** Index of the first `{` or `[` at or after `from`, or -1 when neither occurs. */
function nextBracket(text: string, from: number): number {
  const objIdx = text.indexOf('{', from);
  const arrIdx = text.indexOf('[', from);
  if (objIdx === -1) return arrIdx;
  if (arrIdx === -1) return objIdx;
  return Math.min(objIdx, arrIdx);
}

/**
 * Index of the bracket that closes the one at `start`, or -1 when the text
 * ends first. Only the bracket kind found at `start` is counted, and brackets
 * inside double-quoted strings (including after `\"` escapes) are ignored.
 */
function findMatchingClose(text: string, start: number): number {
  const open = text[start];
  const close = open === '{' ? '}' : ']';
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const c = text[i];
    if (escaped) {
      // The character after a backslash inside a string is never structural.
      escaped = false;
      continue;
    }
    if (inString) {
      if (c === '\\') escaped = true;
      else if (c === '"') inString = false;
      continue;
    }
    if (c === '"') inString = true;
    else if (c === open) depth++;
    else if (c === close && --depth === 0) return i;
  }
  return -1;
}

/** True when `candidate` is accepted by `JSON.parse`. */
function isParsableJson(candidate: string): boolean {
  try {
    JSON.parse(candidate);
    return true;
  } catch {
    return false;
  }
}

/**
 * Extracts a JSON value from LLM output, tolerating ```json code fences,
 * leading prose, and trailing commentary.
 *
 * Walks the top-level bracketed spans (`{...}` or `[...]`) from left to right
 * and returns the first one that is valid JSON. A bracket that appears in
 * prose before the payload ("Here is the list [as requested]: {...}")
 * therefore no longer hides the payload. Spans that fail to parse are skipped
 * as a whole, never searched inside, so a truncated payload is not replaced
 * by one of its inner fragments.
 *
 * @param text Raw LLM response text.
 * @returns The first parsable bracketed span. When no span parses, returns the
 *   first bracketed span (or the remainder of the text from the first bracket
 *   when it is never closed) so that the caller's `JSON.parse` reports the
 *   failure. When the text contains no bracket at all, returns `text.trim()`.
 */
export function extractJson(text: string): string {
  const first = nextBracket(text, 0);
  if (first === -1) return text.trim();

  let start = first;
  while (start !== -1) {
    const end = findMatchingClose(text, start);
    if (end === -1) break; // unbalanced: the span would run to the end of the text
    const candidate = text.slice(start, end + 1);
    if (isParsableJson(candidate)) return candidate;
    start = nextBracket(text, end + 1);
  }

  // Nothing parsed: hand back the first span so the caller's parse error
  // refers to the same text it would have seen before.
  const firstEnd = findMatchingClose(text, first);
  return firstEnd === -1 ? text.slice(first) : text.slice(first, firstEnd + 1);
}

/**
 * Parse LLM output as JSON and validate it against a schema.
 *
 * @param content Raw LLM response text.
 * @param schema Validator whose `parse` returns the typed value or throws.
 * @returns The value returned by `schema.parse`.
 * @throws SyntaxError when no JSON can be parsed out of `content`; otherwise
 *   whatever `schema.parse` throws on a mismatch.
 */
export function parseLlmJson<T>(content: string, schema: SchemaLike<T>): T {
  return schema.parse(JSON.parse(extractJson(content)));
}

/**
 * Non-throwing variant of {@link parseLlmJson}.
 *
 * @returns The validated value, or `null` on any parse or validation failure.
 */
export function tryParseLlmJson<T>(content: string, schema: SchemaLike<T>): T | null {
  try {
    return parseLlmJson(content, schema);
  } catch {
    return null;
  }
}
+ */ +export function parseLlmJsonLoose(content: string): unknown { + return JSON.parse(extractJson(content)); +} diff --git a/src/lib/signals/webScanner.ts b/src/lib/signals/webScanner.ts index 1389179..1bacf97 100644 --- a/src/lib/signals/webScanner.ts +++ b/src/lib/signals/webScanner.ts @@ -1,6 +1,7 @@ import { extractKeywords, filterRelevant } from './relevanceFilter'; import { ingestSignals, type SignalInput } from './signalIngestor'; import { callLLM } from '@/lib/llm'; +import { parseLlmJson } from '@/lib/llm/parse'; import { z } from 'zod'; const extractedSignalSchema = z.object({ @@ -114,7 +115,7 @@ export async function scanWebForTopics(userId: string): Promise<{ created: numbe let extracted: z.infer[] = []; try { - const parsed = extractionOutputSchema.parse(JSON.parse(extractionResult.content)); + const parsed = parseLlmJson(extractionResult.content, extractionOutputSchema); extracted = parsed.signals ?? []; } catch { // non-fatal: malformed or schema-invalid LLM output