From e09debafc60d26b7e91a31bf1ea3e6ef52d593e1 Mon Sep 17 00:00:00 2001 From: shaiananvari8 <228813044+shaiananvari8@users.noreply.github.com> Date: Sun, 31 May 2026 12:18:17 -0500 Subject: [PATCH] Improve scientific RAG source grounding --- ui/__tests__/scientific-sources.test.ts | 111 +++++++ ui/pages/api/fetch-documents.ts | 52 +++- ui/pages/api/inject-documents.ts | 105 ++++--- ui/pages/api/rag-chat.ts | 74 ++--- ui/utils/server/scientific-sources.ts | 392 ++++++++++++++++++++++++ 5 files changed, 633 insertions(+), 101 deletions(-) create mode 100644 ui/__tests__/scientific-sources.test.ts create mode 100644 ui/utils/server/scientific-sources.ts diff --git a/ui/__tests__/scientific-sources.test.ts b/ui/__tests__/scientific-sources.test.ts new file mode 100644 index 0000000..3623689 --- /dev/null +++ b/ui/__tests__/scientific-sources.test.ts @@ -0,0 +1,111 @@ +import { + buildSemanticScholarEntries, + buildUploadedDocumentEntry, + detectScientificSection, + formatRetrievedScientificSources, + parseBoundedInteger, + parseSemanticScholarReferences, +} from '@/utils/server/scientific-sources'; + +import { describe, expect, it } from 'vitest'; + +describe('scientific source helpers', () => { + it('parses Semantic Scholar references from direct arrays and wrapped fields', () => { + expect( + parseSemanticScholarReferences( + JSON.stringify({ + references: [{ paperId: 'abc', title: 'Reference title' }], + }), + ), + ).toEqual([{ paperId: 'abc', title: 'Reference title' }]); + expect( + parseSemanticScholarReferences([{ title: 'A' }, { title: 'B' }]), + ).toHaveLength(2); + }); + + it('builds citation-ready Semantic Scholar entries', () => { + const entries = buildSemanticScholarEntries( + [ + { + paperId: '649def34f8be52c8b66281af98ae884c09aef38b', + title: 'Attention Is All You Need', + abstract: 'We propose a new simple network architecture.', + authors: [{ name: 'Ashish Vaswani' }, { name: 'Noam Shazeer' }], + year: 2017, + venue: 'NeurIPS', + externalIds: { DOI: '10.5555/3295222.3295349' }, + }, + ], + () => 'id-1', + ); + + expect(entries).toHaveLength(1); + expect(entries[0].metadata).toMatchObject({ + sourceType: 'semantic_scholar_reference', + citationKey: 'scholar:649def34f8be52c8b66281af98ae884c09aef38b:2017', + authors: 'Ashish Vaswani, Noam Shazeer', + doi: '10.5555/3295222.3295349', + year: 2017, + }); + expect(entries[0].document).toContain('Abstract: We propose'); + }); + + it('builds uploaded document metadata with stable page and chunk citations', () => { + const entry = buildUploadedDocumentEntry( + { + pageContent: 'Methods\nWe measured retrieval accuracy.', + metadata: { + source: '/tmp/papers/demo.pdf', + pdf: { info: { Title: 'Demo Study' } }, + loc: { pageNumber: 3 }, + }, + }, + 2, + 'chunk-id', + ); + + expect(entry.metadata).toMatchObject({ + citationKey: 'doc:demo-study:p3:c3', + page: 3, + section: 'methods', + sourceType: 'uploaded_document', + title: 'Demo Study', + }); + }); + + it('formats mixed Chroma results into prompt context with exact citation keys', () => { + const formatted = formatRetrievedScientificSources({ + documents: [['Uploaded excerpt', 'Reference abstract']], + metadatas: [ + [ + { + sourceType: 'uploaded_document', + citationKey: 'doc:demo:p1:c1', + title: 'Demo', + page: 1, + }, + { + sourceType: 'semantic_scholar_reference', + citationKey: 'scholar:paper:2024', + title: 'Paper', + year: 2024, + }, + ], + ], + distances: [[0.12, 0.34]], + }); + + expect(formatted.sources).toHaveLength(2); + expect(formatted.context).toContain('Citation: [doc:demo:p1:c1]'); + expect(formatted.context).toContain('Semantic Scholar reference'); + expect(formatted.context).toContain('Retrieval distance: 0.3400'); + }); + + it('detects scientific sections and clamps retrieval counts', () => { + expect( + detectScientificSection('Results\nThe measured recall improved.'), + ).toBe('results'); + expect(parseBoundedInteger('50', 8, 1, 20)).toBe(20); + expect(parseBoundedInteger('nope', 8, 1, 20)).toBe(8); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..e20a9a6 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,25 +1,51 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { +import { + formatRetrievedScientificSources, + parseBoundedInteger, +} from '@/utils/server/scientific-sources'; + +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse, +) { try { + if (req.method !== 'POST') { + return res.status(405).end(); + } + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; + const query = + typeof req.body.input === 'string' ? req.body.input.trim() : ''; + + if (!query) { + return res.status(400).json({ error: 'Missing query input' }); + } + + const nResults = parseBoundedInteger(req.body.nResults, 8, 1, 20); const embedder = new TransformersEmbeddingFunction(); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + const results = await collection.query({ + nResults, + queryTexts: [query], + }); + const formatted = formatRetrievedScientificSources(results); - res.status(200).json(results); + res.status(200).json({ + ...results, + ...formatted, + }); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); @@ -29,4 +55,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..db2ec71 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -1,11 +1,16 @@ import type { NextApiRequest, NextApiResponse } from 'next'; +import { + SCIENTIFIC_TEXT_SEPARATORS, + buildSemanticScholarEntries, + buildUploadedDocumentEntry, + parseSemanticScholarReferences, +} from '@/utils/server/scientific-sources'; + import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; - -import path from 'path'; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { v4 as uuidv4 } from 'uuid'; export const config = { @@ -33,22 +38,48 @@ export default async function handler( path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); - - const originalDocs = await loader.load(); - - console.log(JSON.stringify(originalDocs)); - - const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, - }); + chunkSize: 900, + chunkOverlap: 140, + separators: SCIENTIFIC_TEXT_SEPARATORS, + }); - const docs = await splitter.splitDocuments(originalDocs); - - // Process the documents and perform other logic - const { ids, metadatas, documentContents } = processDocuments(docs); + const pdfFiles = Array.isArray(files.pdf) + ? files.pdf + : files.pdf + ? [files.pdf] + : []; + const references = parseSemanticScholarReferences( + fields.references ?? + fields.semanticScholarReferences ?? + fields.savedReferences, + ); + const uploadedDocumentEntries = []; + + for (const file of pdfFiles) { + const loader = new PDFLoader(file.filepath); + const originalDocs = await loader.load(); + const docs = await splitter.splitDocuments(originalDocs); + + uploadedDocumentEntries.push( + ...docs.map((document, index) => + buildUploadedDocumentEntry(document, index, uuidv4()), + ), + ); + } + + const semanticScholarEntries = buildSemanticScholarEntries( + references, + uuidv4, + ); + const entries = [...uploadedDocumentEntries, ...semanticScholarEntries]; + + if (entries.length === 0) { + return res.status(400).json({ + error: + 'Upload at least one PDF or provide Semantic Scholar references.', + }); + } const embedder = new TransformersEmbeddingFunction(); const collection = await client.getOrCreateCollection({ @@ -57,14 +88,16 @@ export default async function handler( }); await collection.add({ - ids, - metadatas, - documents: documentContents, + ids: entries.map((entry) => entry.id), + metadatas: entries.map((entry) => entry.metadata), + documents: entries.map((entry) => entry.document), }); res.status(200).json({ message: 'Documents processed successfully', - documentCount: ids.length, + documentCount: uploadedDocumentEntries.length, + referenceCount: semanticScholarEntries.length, + sourceCount: entries.length, }); }); } catch (error) { @@ -74,33 +107,3 @@ export default async function handler( .json({ message: 'An error occurred while processing the documents' }); } } - -function processDocuments(docs: any) { - const ids = []; - const metadatas = []; - const documentContents = []; - - for (const document of docs) { - // Generate an ID for each document, or use some existing unique identifier - const id = uuidv4(); - ids.push(id); - - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; - - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; - metadatas.push(metadata); - - // Add the page content to the documents array - documentContents.push(document.pageContent); - } - - return { ids, metadatas, documentContents }; -} diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..a00664d 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,6 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' +import { formatRetrievedScientificSources } from '@/utils/server/scientific-sources'; import { ChatBody, Message } from '@/types/chat'; @@ -9,46 +9,42 @@ import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module import tiktokenModel from '@dqbd/tiktoken/encoders/cl100k_base.json'; import { Tiktoken, init } from '@dqbd/tiktoken/lite/init'; +import { codeBlock, oneLine } from 'common-tags'; export const config = { runtime: 'edge', }; -// Function to fetch and format documents -async function fetchAndFormatDocuments(lastMessageContent: string) { +async function fetchAndFormatDocuments( + lastMessageContent: string, + req: Request, +) { try { - console.log("fetching documents") - const response = await fetch('http://localhost:3000/api/fetch-documents', { + const url = new URL('/api/fetch-documents', req.url); + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ input: lastMessageContent, nResults: 8 }), }); - + if (!response.ok) { throw new Error(`Error fetching documents: ${response.statusText}`); } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - - console.log(result); + const result = + typeof data.context === 'string' + ? data.context + : formatRetrievedScientificSources(data).context; return result; - } catch (error) { console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + throw error; } } - - - - const handler = async (req: Request): Promise => { - try { const { model, messages, key, prompt, temperature } = (await req.json()) as ChatBody; @@ -62,10 +58,10 @@ const handler = async (req: Request): Promise => { let promptToSend = codeBlock` ${oneLine` - You are a very enthusiastic AI assistant who loves - to help people! Given the following information from - relevant documentation, answer the user's question using - only that information, outputted in markdown format. + You are a careful scientific research assistant. Given + the following retrieved evidence from uploaded documents + and Semantic Scholar references, answer the user's question + using only that evidence, outputted in markdown format. `} ${oneLine` @@ -75,7 +71,9 @@ const handler = async (req: Request): Promise => { `} ${oneLine` - Always include citations from the documentation. + Cite factual claims with the exact bracketed citation keys + shown in the evidence, such as [doc:paper:p2:c1] or + [scholar:paper-id:2024]. `} `; @@ -85,8 +83,11 @@ const handler = async (req: Request): Promise => { const lastMessage = messages[messages.length - 1]; - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); - + const relevantDocuments = await fetchAndFormatDocuments( + lastMessage.content, + req, + ); + let temperatureToUse = temperature; if (temperatureToUse == null) { temperatureToUse = DEFAULT_TEMPERATURE; @@ -97,22 +98,18 @@ const handler = async (req: Request): Promise => { let tokenCount = prompt_tokens.length; let messagesToSend: Message[] = []; - encoding.free(); - console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - - messagesToSend = [ + messagesToSend = [ { - role: "user", + role: 'user', content: codeBlock` Here is the relevant documentation: ${relevantDocuments} `, }, { - role: "user", + role: 'user', content: codeBlock` ${oneLine` Answer my next question using only the above documentation. @@ -132,22 +129,25 @@ const handler = async (req: Request): Promise => { ${oneLine` - Output as markdown with citations based on the documentation. `} + ${oneLine` + - Cite each factual claim with the exact bracketed citation key + from the evidence. + `} `, }, { - role: "user", + role: 'user', content: codeBlock` Here is my question: ${oneLine`${lastMessage.content}`} `, }, - ] - + ]; const stream = await OpenAIStream( model, promptToSend, - 0, + temperatureToUse, key, messagesToSend, ); diff --git a/ui/utils/server/scientific-sources.ts b/ui/utils/server/scientific-sources.ts new file mode 100644 index 0000000..aba4b7b --- /dev/null +++ b/ui/utils/server/scientific-sources.ts @@ -0,0 +1,392 @@ +export type ScientificSourceType = + | 'uploaded_document' + | 'semantic_scholar_reference'; + +export type ScientificSourceMetadata = { + sourceType: ScientificSourceType; + citationKey: string; + title: string; + section?: string; + page?: number; + chunkIndex?: number; + source?: string; + paperId?: string; + authors?: string; + year?: number; + venue?: string; + doi?: string; + url?: string; +}; + +export type SemanticScholarReference = { + paperId?: unknown; + title?: unknown; + abstract?: unknown; + tldr?: unknown; + authors?: unknown; + year?: unknown; + venue?: unknown; + doi?: unknown; + url?: unknown; + externalIds?: unknown; +}; + +export type ScientificSourceEntry = { + id: string; + metadata: ScientificSourceMetadata; + document: string; +}; + +export type RetrievedScientificSource = ScientificSourceMetadata & { + distance?: number; + excerpt: string; +}; + +export const SCIENTIFIC_TEXT_SEPARATORS = [ + '\nAbstract', + '\nIntroduction', + '\nBackground', + '\nMethods', + '\nMethodology', + '\nResults', + '\nDiscussion', + '\nConclusion', + '\nReferences', + '\n\n', + '\n', + '. ', + ' ', + '', +]; + +const SECTION_PATTERNS: Array<[string, RegExp]> = [ + ['abstract', /\babstract\b/i], + ['introduction', /\bintroduction\b/i], + ['methods', /\b(methods?|methodology|materials and methods)\b/i], + ['results', /\bresults?\b/i], + ['discussion', /\bdiscussion\b/i], + ['conclusion', /\bconclusions?\b/i], + ['references', /\breferences?\b/i], +]; + +export function detectScientificSection(text: string): string { + const headingWindow = normaliseWhitespace(text).slice(0, 240); + const match = SECTION_PATTERNS.find(([, pattern]) => + pattern.test(headingWindow), + ); + + return match?.[0] ?? 'body'; +} + +export function parseBoundedInteger( + value: unknown, + fallback: number, + min: number, + max: number, +): number { + const parsed = + typeof value === 'number' + ? value + : typeof value === 'string' + ? Number.parseInt(value, 10) + : NaN; + + if (!Number.isFinite(parsed)) return fallback; + + return Math.min(max, Math.max(min, Math.trunc(parsed))); +} + +export function parseSemanticScholarReferences( + raw: unknown, +): SemanticScholarReference[] { + if (raw == null) return []; + + const flattened = Array.isArray(raw) ? raw : [raw]; + + return flattened.flatMap((value) => { + const parsed = parseReferenceValue(value); + + if (Array.isArray(parsed)) return parsed; + if (isRecord(parsed) && Array.isArray(parsed.references)) { + return parsed.references; + } + if (isRecord(parsed) && Array.isArray(parsed.savedReferences)) { + return parsed.savedReferences; + } + if (isRecord(parsed) && Array.isArray(parsed.papers)) { + return parsed.papers; + } + if (isRecord(parsed)) return [parsed]; + + return []; + }); +} + +export function buildSemanticScholarEntries( + references: SemanticScholarReference[], + makeId: () => string, +): ScientificSourceEntry[] { + return references + .map((reference) => buildSemanticScholarEntry(reference, makeId())) + .filter((entry): entry is ScientificSourceEntry => entry !== null); +} + +export function buildUploadedDocumentEntry( + document: { + metadata?: Record; + pageContent?: string; + }, + chunkIndex: number, + id: string, +): ScientificSourceEntry { + const source = stringValue(document.metadata?.source); + const fallbackTitle = source ? basename(source) : 'Uploaded document'; + const titleFromMetadata = stringValue(document.metadata?.pdf?.info?.Title); + const title = titleFromMetadata || fallbackTitle; + const page = numberValue(document.metadata?.loc?.pageNumber) ?? 1; + const section = detectScientificSection(document.pageContent ?? ''); + const citationKey = buildUploadedDocumentCitationKey(title, page, chunkIndex); + + return { + id, + document: document.pageContent ?? '', + metadata: { + sourceType: 'uploaded_document', + citationKey, + title, + page, + chunkIndex, + source, + section, + }, + }; +} + +export function formatRetrievedScientificSources(results: any): { + context: string; + sources: RetrievedScientificSource[]; +} { + const documents = firstResultRow(results?.documents); + const metadatas = firstResultRow( + results?.metadatas, + ); + const distances = firstResultRow(results?.distances); + const seen = new Set(); + const sources: RetrievedScientificSource[] = []; + + for (let index = 0; index < documents.length; index += 1) { + const document = normaliseWhitespace(documents[index] ?? ''); + if (!document) continue; + + const metadata = normaliseMetadata(metadatas[index]); + const key = metadata.citationKey || `source-${index + 1}`; + const dedupeKey = `${key}:${document.slice(0, 120)}`; + + if (seen.has(dedupeKey)) continue; + seen.add(dedupeKey); + + sources.push({ + ...metadata, + citationKey: key, + distance: numberValue(distances[index]), + excerpt: document.slice(0, 1200), + }); + } + + return { + sources, + context: sources.map(formatSourceForPrompt).join('\n\n'), + }; +} + +function buildSemanticScholarEntry( + reference: SemanticScholarReference, + id: string, +): ScientificSourceEntry | null { + const title = stringValue(reference.title); + const abstract = stringValue(reference.abstract); + const tldr = extractTldr(reference.tldr); + const paperId = stringValue(reference.paperId); + + if (!title && !abstract && !tldr) return null; + + const authors = formatAuthors(reference.authors); + const year = numberValue(reference.year); + const venue = stringValue(reference.venue); + const doi = extractDoi(reference); + const url = stringValue(reference.url); + const citationKey = buildReferenceCitationKey(paperId || title, year); + const document = [ + title ? `Title: ${title}` : '', + authors ? `Authors: ${authors}` : '', + year ? `Year: ${year}` : '', + venue ? `Venue: ${venue}` : '', + doi ? `DOI: ${doi}` : '', + url ? `URL: ${url}` : '', + tldr ? `TLDR: ${tldr}` : '', + abstract ? `Abstract: ${abstract}` : '', + ] + .filter(Boolean) + .join('\n'); + + return { + id, + document, + metadata: { + sourceType: 'semantic_scholar_reference', + citationKey, + title: title || paperId || 'Semantic Scholar reference', + paperId, + authors, + year, + venue, + doi, + url, + section: 'reference', + }, + }; +} + +function formatSourceForPrompt(source: RetrievedScientificSource): string { + const details = [ + `Citation: [${source.citationKey}]`, + `Type: ${ + source.sourceType === 'semantic_scholar_reference' + ? 'Semantic Scholar reference' + : 'Uploaded document' + }`, + `Title: ${source.title}`, + source.authors ? `Authors: ${source.authors}` : '', + source.year ? `Year: ${source.year}` : '', + source.venue ? `Venue: ${source.venue}` : '', + source.doi ? `DOI: ${source.doi}` : '', + source.url ? `URL: ${source.url}` : '', + source.page ? `Page: ${source.page}` : '', + source.section ? `Section: ${source.section}` : '', + typeof source.distance === 'number' + ? `Retrieval distance: ${source.distance.toFixed(4)}` + : '', + `Excerpt: ${source.excerpt}`, + ].filter(Boolean); + + return details.join('\n'); +} + +function parseReferenceValue(value: unknown): unknown { + if (typeof value !== 'string') return value; + + try { + return JSON.parse(value); + } catch { + return null; + } +} + +function buildUploadedDocumentCitationKey( + title: string, + page: number, + chunkIndex: number, +): string { + return `doc:${slugify(title)}:p${page}:c${chunkIndex + 1}`; +} + +function buildReferenceCitationKey(seed: string, year?: number): string { + const suffix = year ? `:${year}` : ''; + + return `scholar:${slugify(seed || 'reference')}${suffix}`; +} + +function normaliseMetadata(metadata: any): ScientificSourceMetadata { + return { + sourceType: + metadata?.sourceType === 'semantic_scholar_reference' + ? 'semantic_scholar_reference' + : 'uploaded_document', + citationKey: stringValue(metadata?.citationKey), + title: stringValue(metadata?.title) || 'Untitled source', + section: stringValue(metadata?.section), + page: numberValue(metadata?.page), + chunkIndex: numberValue(metadata?.chunkIndex), + source: stringValue(metadata?.source), + paperId: stringValue(metadata?.paperId), + authors: stringValue(metadata?.authors), + year: numberValue(metadata?.year), + venue: stringValue(metadata?.venue), + doi: stringValue(metadata?.doi), + url: stringValue(metadata?.url), + }; +} + +function firstResultRow(value: unknown): T[] { + if (!Array.isArray(value)) return []; + if (Array.isArray(value[0])) return value[0] as T[]; + + return value as T[]; +} + +function extractTldr(raw: unknown): string { + if (typeof raw === 'string') return raw; + if (isRecord(raw)) return stringValue(raw.text); + + return ''; +} + +function extractDoi(reference: SemanticScholarReference): string { + const doi = stringValue(reference.doi); + if (doi) return doi; + + if (isRecord(reference.externalIds)) { + return stringValue(reference.externalIds.DOI); + } + + return ''; +} + +function formatAuthors(raw: unknown): string { + if (!Array.isArray(raw)) return ''; + + return raw + .map((author) => + typeof author === 'string' + ? author + : isRecord(author) + ? stringValue(author.name) + : '', + ) + .filter(Boolean) + .slice(0, 12) + .join(', '); +} + +function basename(value: string): string { + return value.split(/[\\/]/).pop() || value; +} + +function slugify(value: string): string { + const slug = normaliseWhitespace(value) + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 60); + + return slug || 'source'; +} + +function normaliseWhitespace(value: string): string { + return value.replace(/\s+/g, ' ').trim(); +} + +function stringValue(value: unknown): string { + return typeof value === 'string' ? value.trim() : ''; +} + +function numberValue(value: unknown): number | undefined { + const parsed = + typeof value === 'number' ? value : Number.parseInt(String(value), 10); + + return Number.isFinite(parsed) ? parsed : undefined; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null; +}