diff --git a/.gitignore b/.gitignore
index 9bbb966..88cc081 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,5 @@ yarn-error.log*
 .turbo
 todo.md
 plan.md
+eval/cache
+eval/results
diff --git a/eval/chunkers/ast.ts b/eval/chunkers/ast.ts
new file mode 100644
index 0000000..3794ce7
--- /dev/null
+++ b/eval/chunkers/ast.ts
@@ -0,0 +1,40 @@
+/**
+ * AST-aware chunker wrapper for evaluation
+ *
+ * Wraps the astchunk library for use in the evaluation harness.
+ * Uses the built-in contextualizedText for better embedding quality.
+ */
+
+import { chunk } from '../../src'
+
+/**
+ * Chunk a file using AST-aware chunking and return results
+ * in a format compatible with the evaluation harness
+ *
+ * @param filepath - Path to the file
+ * @param code - Source code content
+ * @param maxNws - Maximum NWS characters per chunk (default: 1500)
+ */
+export async function chunkFile(
+  filepath: string,
+  code: string,
+  maxNws: number = 1500,
+): Promise<
+  Array<{
+    id: string
+    text: string
+    startLine: number
+    endLine: number
+  }>
+> {
+  const chunks = await chunk(filepath, code, {
+    maxChunkSize: maxNws,
+  })
+
+  return chunks.map((c) => ({
+    id: `${filepath}:${c.lineRange.start}-${c.lineRange.end}`,
+    text: c.contextualizedText,
+    startLine: c.lineRange.start,
+    endLine: c.lineRange.end,
+  }))
+}
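+
+// Usage sketch (illustrative only; the path and line ranges are hypothetical):
+//   const results = await chunkFile('pkg/module.py', source)
+//   // → [{ id: 'pkg/module.py:1-38', text: '<contextualized text>', startLine: 1, endLine: 38 }]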
diff --git a/eval/debug_chunks.ts b/eval/debug_chunks.ts
new file mode 100644
index 0000000..26ea5b5
--- /dev/null
+++ b/eval/debug_chunks.ts
@@ -0,0 +1,77 @@
+import { readFileSync } from 'node:fs'
+import { join } from 'node:path'
+import { chunk } from '../src'
+import { chunkFixed } from './chunkers/fixed'
+
+// Check deepmind_tracr/tracr/craft/transformers.py
+// Assume we're looking for lines 100-150
+const testFile = join(
+  import.meta.dir,
+  'data/repoeval/repositories/function_level/deepmind_tracr/tracr/craft/transformers.py',
+)
+const code = readFileSync(testFile, 'utf-8')
+const targetStart = 100
+const targetEnd = 150
+
+console.log('File:', testFile)
+console.log('Target lines:', targetStart, '-', targetEnd)
+console.log('')
+
+function countNws(text: string): number {
+  let count = 0
+  for (let i = 0; i < text.length; i++) {
+    if (text.charCodeAt(i) > 32) count++
+  }
+  return count
+}
+
+function overlaps(
+  chunkStart: number,
+  chunkEnd: number,
+  tStart: number,
+  tEnd: number,
+): boolean {
+  return !(chunkEnd < tStart || chunkStart > tEnd)
+}
+
+for (const maxSize of [1500, 1800]) {
+  console.log(`\n=== Max chunk size: ${maxSize} ===`)
+
+  const astChunks = await chunk(testFile, code, { maxChunkSize: maxSize })
+  const fixedChunks = chunkFixed(code, maxSize)
+
+  console.log('\nAST chunks:')
+  for (const c of astChunks) {
+    const overlap = overlaps(
+      c.lineRange.start,
+      c.lineRange.end,
+      targetStart,
+      targetEnd,
+    )
+    console.log(
+      `  Lines ${c.lineRange.start}-${c.lineRange.end} (${countNws(c.text)} NWS) ${overlap ? '*** RELEVANT ***' : ''}`,
+    )
+  }
+
+  console.log('\nFixed chunks:')
+  for (const c of fixedChunks) {
+    const overlap = overlaps(c.startLine, c.endLine, targetStart, targetEnd)
+    console.log(
+      `  Lines ${c.startLine}-${c.endLine} (${c.nwsCount} NWS) ${overlap ? '*** RELEVANT ***' : ''}`,
+    )
+  }
+
+  const astRelevant = astChunks.filter((c) =>
+    overlaps(c.lineRange.start, c.lineRange.end, targetStart, targetEnd),
+  )
+  const fixedRelevant = fixedChunks.filter((c) =>
+    overlaps(c.startLine, c.endLine, targetStart, targetEnd),
+  )
+
+  console.log(
+    `\nRelevant chunks: AST=${astRelevant.length}, Fixed=${fixedRelevant.length}`,
+  )
+  console.log(
+    `Total chunks: AST=${astChunks.length}, Fixed=${fixedChunks.length}`,
+  )
+}
diff --git a/eval/download.ts b/eval/download.ts
new file mode 100644
index 0000000..c2e05af
--- /dev/null
+++ b/eval/download.ts
@@ -0,0 +1,149 @@
+/**
+ * Download RepoEval benchmark data
+ *
+ * Downloads:
+ * 1. Task datasets (queries, ground truth) from the Microsoft CodeT repo
+ * 2. Function-level Python repositories for chunking
+ */
+
+import { existsSync } from 'node:fs'
+import { mkdir, writeFile } from 'node:fs/promises'
+import { join } from 'node:path'
+
+const DATA_DIR = join(import.meta.dir, 'data', 'repoeval')
+const DATASETS_DIR = join(DATA_DIR, 'datasets')
+const REPOS_DIR = join(DATA_DIR, 'repositories', 'function_level')
+
+// Function-level repositories from RepoEval
+const REPOS_FUNCTION = [
+  'amazon-science_patchcore-inspection',
+  'deepmind_tracr',
+  'facebookresearch_omnivore',
+  'google_lightweight_mmm',
+  'lucidrains_imagen-pytorch',
+  'maxhumber_redframes',
+]
+
+async function downloadAndExtractZip(
+  url: string,
+  destDir: string,
+): Promise<void> {
+  console.log(`Downloading from ${url}...`)
+
+  const response = await fetch(url)
+  if (!response.ok) {
+    throw new Error(`Failed to download: ${response.statusText}`)
+  }
+
+  const arrayBuffer = await response.arrayBuffer()
+  const tempZipPath = join(destDir, '_temp.zip')
+
+  await mkdir(destDir, { recursive: true })
+  await writeFile(tempZipPath, new Uint8Array(arrayBuffer))
+
+  // Use the system unzip command
+  const proc = Bun.spawn(['unzip', '-o', '-q', tempZipPath, '-d', destDir], {
+    cwd: destDir,
+  })
+  await proc.exited
+
+  // Clean up temp file
+  await Bun.spawn(['rm', tempZipPath]).exited
+
+  console.log(`Extracted to ${destDir}`)
+}
+
+async function downloadDatasets(): Promise<void> {
+  if (existsSync(DATASETS_DIR)) {
+    console.log('Datasets already downloaded, skipping...')
+    return
+  }
+
+  const datasetsUrl =
+    'https://github.com/microsoft/CodeT/raw/main/RepoCoder/datasets/datasets.zip'
+  await downloadAndExtractZip(datasetsUrl, DATASETS_DIR)
+}
+
+async function downloadRepositories(): Promise<void> {
+  if (existsSync(REPOS_DIR)) {
+    console.log('Repositories already downloaded, skipping...')
+    return
+  }
+
+  // Using the cleaned version from Veronicium's fork
+  const reposUrl =
+    'https://github.com/Veronicium/repoeval_debug/raw/main/function_level.zip'
+  await downloadAndExtractZip(reposUrl, REPOS_DIR)
+}
+
+export interface RepoEvalTask {
+  prompt: string
+  metadata: {
+    task_id: string
+    ground_truth: string
+    fpath_tuple: string[]
+    line_no: number
+    lineno: number
+    context_start_lineno: number
+  }
+}
+
+export async function loadTasks(
+  contextLength: '1k' | '2k' | '4k' = '2k',
+): Promise<RepoEvalTask[]> {
+  const fileName = `function_level_completion_${contextLength}_context_codex.test.jsonl`
+  const filePath = join(DATASETS_DIR, fileName)
+
+  const content = await Bun.file(filePath).text()
+  const lines = content.trim().split('\n')
+
+  const tasks: RepoEvalTask[] = []
+  const repo2idx: Record<string, number> = {}
+
+  for (const line of lines) {
+    const task = JSON.parse(line) as RepoEvalTask
+
+    // Clean up task_id format
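+    // e.g. a raw id of the form "deepmind--tracr/idx" (hypothetical example)
+    // becomes "deepmind_tracr/0", "deepmind_tracr/1", ... numbered per repo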
+    const repo = task.metadata.task_id.replace('--', '_').split('/')[0]
+    if (!REPOS_FUNCTION.includes(repo)) continue
+
+    if (!(repo in repo2idx)) {
+      repo2idx[repo] = 0
+    }
+
+    task.metadata.task_id = task.metadata.task_id
+      .replace('--', '_')
+      .replace('idx', String(repo2idx[repo]))
+    task.metadata.line_no = task.metadata.lineno
+    repo2idx[repo]++
+
+    tasks.push(task)
+  }
+
+  return tasks
+}
+
+export function getReposDir(): string {
+  return REPOS_DIR
+}
+
+export function getRepos(): string[] {
+  return REPOS_FUNCTION
+}
+
+export async function download(): Promise<void> {
+  console.log('Downloading RepoEval benchmark data...\n')
+
+  await mkdir(DATA_DIR, { recursive: true })
+
+  await downloadDatasets()
+  await downloadRepositories()
+
+  console.log('\nDownload complete!')
+  console.log(`Data stored in: ${DATA_DIR}`)
+}
+
+// Run if executed directly
+if (import.meta.main) {
+  await download()
+}
diff --git a/eval/embeddings.ts b/eval/embeddings.ts
new file mode 100644
index 0000000..8e242e1
--- /dev/null
+++ b/eval/embeddings.ts
@@ -0,0 +1,205 @@
+/**
+ * OpenAI embeddings wrapper with disk caching
+ */
+
+import { createHash } from 'node:crypto'
+import { existsSync } from 'node:fs'
+import { mkdir, readFile, writeFile } from 'node:fs/promises'
+import { join } from 'node:path'
+import OpenAI from 'openai'
+
+const CACHE_DIR = join(import.meta.dir, 'cache', 'embeddings')
+const MODEL = 'text-embedding-3-small'
+const BATCH_SIZE = 100
+
+let client: OpenAI | null = null
+
+function getClient(): OpenAI {
+  if (!client) {
+    client = new OpenAI()
+  }
+  return client
+}
+
+/**
+ * Create a cache key from text content
+ */
+function cacheKey(text: string): string {
+  return createHash('sha256').update(text).digest('hex').slice(0, 16)
+}
+
+/**
+ * Get the cache file path for a text
+ */
+function cachePath(text: string): string {
+  const key = cacheKey(text)
+  // Use first 2 chars as a subdirectory to avoid too many files in one dir
+  return join(CACHE_DIR, key.slice(0, 2), `${key}.json`)
+}
+
+/**
+ * Try to load an embedding from the cache
+ */
+async function loadFromCache(text: string): Promise<number[] | null> {
+  const path = cachePath(text)
+  if (!existsSync(path)) {
+    return null
+  }
+  try {
+    const data = await readFile(path, 'utf-8')
+    return JSON.parse(data) as number[]
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Save an embedding to the cache
+ */
+async function saveToCache(text: string, embedding: number[]): Promise<void> {
+  const path = cachePath(text)
+  const dir = join(path, '..')
+  await mkdir(dir, { recursive: true })
+  await writeFile(path, JSON.stringify(embedding))
+}
+
+/**
+ * Embed a batch of texts using the OpenAI API
+ */
+async function embedBatch(texts: string[]): Promise<number[][]> {
+  const openai = getClient()
+
+  // Filter out empty texts and track their indices
+  const nonEmptyTexts: string[] = []
+  const indexMap: number[] = []
+
+  for (let i = 0; i < texts.length; i++) {
+    const text = texts[i].trim()
+    if (text.length > 0) {
+      nonEmptyTexts.push(text)
+      indexMap.push(i)
+    }
+  }
+
+  if (nonEmptyTexts.length === 0) {
+    // Return zero vectors for all empty inputs
+    return texts.map(() => new Array(1536).fill(0))
+  }
+
+  const response = await openai.embeddings.create({
+    model: MODEL,
+    input: nonEmptyTexts,
+  })
+
+  // Sort by index to maintain order
+  const sorted = response.data.sort(
+    (a: { index: number }, b: { index: number }) => a.index - b.index,
+  )
+  const embeddings = sorted.map((d: { embedding: number[] }) => d.embedding)
+
+  // Map back to original indices, filling zeros for empty texts
+  const result: number[][] = texts.map(() => new Array(1536).fill(0))
+  for (let i = 0; i < indexMap.length; i++) {
+    result[indexMap[i]] = embeddings[i]
+  }
+
+  return result
+}
+
+/**
+ * Embed texts with caching
+ *
+ * @param texts - Array of texts to embed
+ * @param onProgress - Optional callback for progress updates
+ * @returns Array of embeddings (same order as input texts)
+ */
+export async function embedTexts(
+  texts: string[],
+  onProgress?: (done: number, total: number) => void,
+): Promise<number[][]> {
+  await mkdir(CACHE_DIR, { recursive: true })
+
+  const results: (number[] | null)[] = new Array(texts.length).fill(null)
+  const uncachedIndices: number[] = []
+  const uncachedTexts: string[] = []
+
+  // Check the cache for each text
+  for (let i = 0; i < texts.length; i++) {
+    const cached = await loadFromCache(texts[i])
+    if (cached) {
+      results[i] = cached
+    } else {
+      uncachedIndices.push(i)
+      uncachedTexts.push(texts[i])
+    }
+  }
+
+  const cachedCount = texts.length - uncachedTexts.length
+  if (cachedCount > 0) {
+    console.log(`  Found ${cachedCount}/${texts.length} embeddings in cache`)
+  }
+
+  // Embed uncached texts in batches
+  for (let i = 0; i < uncachedTexts.length; i += BATCH_SIZE) {
+    const batch = uncachedTexts.slice(i, i + BATCH_SIZE)
+    const batchIndices = uncachedIndices.slice(i, i + BATCH_SIZE)
+
+    const embeddings = await embedBatch(batch)
+
+    // Save to cache and store results
+    for (let j = 0; j < embeddings.length; j++) {
+      const originalIdx = batchIndices[j]
+      results[originalIdx] = embeddings[j]
+      await saveToCache(batch[j], embeddings[j])
+    }
+
+    if (onProgress) {
+      onProgress(
+        Math.min(i + BATCH_SIZE, uncachedTexts.length),
+        uncachedTexts.length,
+      )
+    }
+  }
+
+  return results as number[][]
+}
+
+/**
+ * Compute cosine similarity between two vectors
+ */
+export function cosineSimilarity(a: number[], b: number[]): number {
+  let dotProduct = 0
+  let normA = 0
+  let normB = 0
+
+  for (let i = 0; i < a.length; i++) {
+    dotProduct += a[i] * b[i]
+    normA += a[i] * a[i]
+    normB += b[i] * b[i]
+  }
+
+  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
+}
+
+/**
+ * Find the top-k most similar items
+ *
+ * @param queryEmbedding - The query embedding
+ * @param corpusEmbeddings - Array of corpus embeddings
+ * @param k - Number of top results to return
+ * @returns Array of { index, score } sorted by score descending
+ */
+export function topK(
+  queryEmbedding: number[],
+  corpusEmbeddings: number[][],
+  k: number,
+): Array<{ index: number; score: number }> {
+  const scores = corpusEmbeddings.map((emb, idx) => ({
+    index: idx,
+    score: cosineSimilarity(queryEmbedding, emb),
+  }))
+
+  scores.sort((a, b) => b.score - a.score)
+
+  return scores.slice(0, k)
+}
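+
+// Usage sketch (hypothetical call site): embed a corpus plus a query, then rank.
+//   const [queryEmb, ...docEmbs] = await embedTexts([query, ...docTexts])
+//   const hits = topK(queryEmb, docEmbs, 10) // → [{ index, score }, ...] by score desc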
diff --git a/eval/run.ts b/eval/run.ts
new file mode 100644
index 0000000..8c05d74
--- /dev/null
+++ b/eval/run.ts
@@ -0,0 +1,414 @@
+/**
+ * RepoEval Retrieval Evaluation Runner
+ *
+ * Compares AST-aware chunking vs fixed-size chunking on code retrieval.
+ *
+ * Usage:
+ *   bun eval/run.ts
+ */
+
+import { readdirSync, statSync } from 'node:fs'
+import { mkdir, readFile, writeFile } from 'node:fs/promises'
+import { join } from 'node:path'
+import { chunkFile as chunkWithAST } from './chunkers/ast'
+import { chunkFile as chunkWithFixed } from './chunkers/fixed'
+import {
+  download,
+  getRepos,
+  getReposDir,
+  loadTasks,
+  type RepoEvalTask,
+} from './download'
+import { embedTexts, topK } from './embeddings'
+import { aggregateMetrics, computeMetrics } from './metrics'
+
+const RESULTS_DIR = join(import.meta.dir, 'results')
+const K_VALUES = [5, 10] // Top-k values for retrieval
+const MAX_CHUNK_SIZE = 1500 // NWS characters per chunk
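+
+// Assumed contract of './metrics' (the module itself is not part of this diff):
+//   computeMetrics(retrievedIds: string[], relevantSet: Set<string>, k: number)
+//     → { precision, recall, ndcg } with precision = hits@k / k,
+//       recall = hits@k / |relevant|, ndcg = DCG@k / IDCG@k (binary gains)
+//   aggregateMetrics(perQuery: MetricsAtK[]) → mean of each field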
+
+interface ChunkInfo {
+  id: string
+  text: string
+  startLine: number
+  endLine: number
+  filepath: string
+}
+
+interface MetricsAtK {
+  precision: number
+  recall: number
+  ndcg: number
+}
+
+interface QueryResult {
+  taskId: string
+  prompt: string
+  groundTruthLines: { start: number; end: number }
+  groundTruthFile: string
+  retrievedChunks: Array<{ id: string; score: number; rank: number }>
+  relevantChunkIds: string[]
+  metrics: Record<number, MetricsAtK> // metrics per k value
+}
+
+interface EvalResult {
+  chunker: 'ast' | 'fixed'
+  repo: string
+  summary: Record<number, MetricsAtK> // summary per k value
+  queryResults: QueryResult[]
+  config: { kValues: number[]; maxChunkSize: number }
+  timestamp: string
+}
+
+/**
+ * Recursively find all Python files in a directory
+ */
+function findPythonFiles(dir: string): string[] {
+  const files: string[] = []
+
+  function walk(currentDir: string) {
+    const entries = readdirSync(currentDir)
+    for (const entry of entries) {
+      const fullPath = join(currentDir, entry)
+      const stat = statSync(fullPath)
+      if (stat.isDirectory()) {
+        walk(fullPath)
+      } else if (entry.endsWith('.py')) {
+        files.push(fullPath)
+      }
+    }
+  }
+
+  walk(dir)
+  return files
+}
+
+/**
+ * Check whether a chunk overlaps a line range
+ */
+function chunksOverlap(
+  chunk: { startLine: number; endLine: number },
+  target: { start: number; end: number },
+): boolean {
+  return !(chunk.endLine < target.start || chunk.startLine > target.end)
+}
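+
+// e.g. a chunk at lines 90-120 overlaps a target of 100-150 (shares 100-120),
+// while a chunk at 151-180 does not; both bounds are inclusive.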
+
+/**
+ * Run evaluation for a single repository
+ */
+async function evaluateRepo(
+  repo: string,
+  tasks: RepoEvalTask[],
+  chunkerType: 'ast' | 'fixed',
+): Promise<EvalResult> {
+  console.log(`\n  Evaluating ${repo} with ${chunkerType} chunker...`)
+
+  const repoDir = join(getReposDir(), repo)
+  const pyFiles = findPythonFiles(repoDir)
+  console.log(`  Found ${pyFiles.length} Python files`)
+
+  // Step 1: Chunk all files
+  console.log('  Chunking files...')
+  const allChunks: ChunkInfo[] = []
+
+  for (const filepath of pyFiles) {
+    const code = await readFile(filepath, 'utf-8')
+    const relPath = filepath.replace(`${repoDir}/`, '')
+
+    try {
+      const chunks =
+        chunkerType === 'ast'
+          ? await chunkWithAST(filepath, code, MAX_CHUNK_SIZE)
+          : await chunkWithFixed(filepath, code, MAX_CHUNK_SIZE)
+
+      for (const chunk of chunks) {
+        allChunks.push({
+          ...chunk,
+          filepath: relPath,
+        })
+      }
+    } catch (err) {
+      // Skip files that can't be parsed
+      console.log(`  Warning: Failed to chunk ${relPath}: ${err}`)
+    }
+  }
+
+  console.log(`  Created ${allChunks.length} chunks`)
+
+  // Step 2: Embed all chunks
+  console.log('  Embedding chunks...')
+  const chunkTexts = allChunks.map((c) => c.text)
+  const chunkEmbeddings = await embedTexts(chunkTexts, (done, total) => {
+    process.stdout.write(`\r  Embedding chunks: ${done}/${total}`)
+  })
+  console.log('')
+
+  // Step 3: Embed queries and retrieve
+  console.log('  Embedding queries and retrieving...')
+  const queryTexts = tasks.map((t) => t.prompt)
+  const queryEmbeddings = await embedTexts(queryTexts)
+
+  // Step 4: For each query, retrieve top-k and compute metrics
+  const queryResults: QueryResult[] = []
+
+  // Debug: show sample filepaths from chunks
+  const sampleFilepaths = [...new Set(allChunks.map((c) => c.filepath))].slice(
+    0,
+    5,
+  )
+  if (tasks.length > 0) {
+    console.log(
+      `  Debug: Sample chunk filepaths: ${sampleFilepaths.join(', ')}`,
+    )
+    console.log(
+      `  Debug: Sample task fpath_tuple: ${tasks[0].metadata.fpath_tuple.join('/')}`,
+    )
+    console.log(
+      `  Debug: Target file (after slice): ${tasks[0].metadata.fpath_tuple.slice(1).join('/')}`,
+    )
+  }
+
+  const maxK = Math.max(...K_VALUES)
+
+  for (let i = 0; i < tasks.length; i++) {
+    const task = tasks[i]
+    const queryEmb = queryEmbeddings[i]
+
+    // Get top-k chunks (use max k to get all we need)
+    const topKResults = topK(queryEmb, chunkEmbeddings, maxK)
+
+    // Determine ground truth: chunks that overlap with the target location
+    // fpath_tuple is ["repo_name", "path", "to", "file.py"]; skip the first element
+    const targetFile = task.metadata.fpath_tuple.slice(1).join('/')
+    const targetLines = {
+      start: task.metadata.context_start_lineno,
+      end: task.metadata.line_no,
+    }
+
+    // Find all chunks that are relevant (overlap with ground truth)
+    const relevantChunkIds = allChunks
+      .filter((c) => c.filepath === targetFile && chunksOverlap(c, targetLines))
+      .map((c) => c.id)
+
+    // Debug first query
+    if (i === 0) {
+      console.log(`  Debug first query:`)
+      console.log(`    Target file: "${targetFile}"`)
+      console.log(`    Target lines: ${targetLines.start}-${targetLines.end}`)
+      console.log(`    Relevant chunks found: ${relevantChunkIds.length}`)
+      console.log(
+        `    Top retrieved chunk: ${allChunks[topKResults[0]?.index]?.filepath}`,
+      )
+    }
+
+    const relevantSet = new Set(relevantChunkIds)
+
+    // Get retrieved chunk IDs
+    const retrievedIds = topKResults.map((r) => allChunks[r.index].id)
+
+    // Compute metrics for each k value
+    const metrics: Record<number, MetricsAtK> = {}
+    for (const k of K_VALUES) {
+      metrics[k] = computeMetrics(retrievedIds, relevantSet, k)
+    }
+
+    queryResults.push({
+      taskId: task.metadata.task_id,
+      prompt: `${task.prompt.slice(0, 200)}...`, // Truncate for readability
+      groundTruthLines: targetLines,
+      groundTruthFile: targetFile,
+      retrievedChunks: topKResults.map((r, rank) => ({
+        id: allChunks[r.index].id,
+        score: r.score,
+        rank: rank + 1,
+      })),
+      relevantChunkIds,
+      metrics,
+    })
+  }
+
+  // Aggregate metrics for each k value
+  const summary: Record<number, MetricsAtK> = {}
+  for (const k of K_VALUES) {
+    summary[k] = aggregateMetrics(queryResults.map((q) => q.metrics[k]))
+  }
+
+  return {
+    chunker: chunkerType,
+    repo,
+    summary,
+    queryResults,
+    config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE },
+    timestamp: new Date().toISOString(),
+  }
+}
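+
+// Note: the relevant set is chunker-relative — any chunk overlapping the
+// ground-truth span counts, so the two chunkers can have different relevant
+// counts (and thus recall denominators) for the same query.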
+
+/**
+ * Format metrics as a table row for a specific k
+ */
+function formatMetricsRow(label: string, metrics: MetricsAtK): string {
+  return `${label.padEnd(20)} | ${(metrics.ndcg * 100).toFixed(1).padStart(6)} | ${(metrics.precision * 100).toFixed(1).padStart(6)} | ${(metrics.recall * 100).toFixed(1).padStart(6)}`
+}
+
+/**
+ * Print the metrics table for all k values
+ */
+function printMetricsTable(
+  astSummary: Record<number, MetricsAtK>,
+  fixedSummary: Record<number, MetricsAtK>,
+  indent = '',
+): void {
+  for (const k of K_VALUES) {
+    console.log(`${indent}k=${k}:`)
+    console.log(indent + '-'.repeat(50))
+    console.log(
+      `${indent}${'Chunker'.padEnd(20)} | ${'nDCG'.padStart(6)} | ${'P@k'.padStart(6)} | ${'R@k'.padStart(6)}`,
+    )
+    console.log(indent + '-'.repeat(50))
+    console.log(indent + formatMetricsRow('AST', astSummary[k]))
+    console.log(indent + formatMetricsRow('Fixed', fixedSummary[k]))
+    console.log(indent + '-'.repeat(50))
+    console.log('')
+  }
+}
+
+async function main() {
+  console.log('RepoEval Retrieval Evaluation')
+  console.log('=============================\n')
+
+  // Step 1: Download data if needed
+  await download()
+
+  // Step 2: Load tasks
+  console.log('\nLoading tasks...')
+  const allTasks = await loadTasks('2k')
+  console.log(`Loaded ${allTasks.length} tasks`)
+
+  // Group tasks by repo
+  const tasksByRepo = new Map<string, RepoEvalTask[]>()
+  for (const task of allTasks) {
+    const repo = task.metadata.task_id.split('/')[0]
+    if (!tasksByRepo.has(repo)) {
+      tasksByRepo.set(repo, [])
+    }
+    const repoTasks = tasksByRepo.get(repo)
+    if (repoTasks) {
+      repoTasks.push(task)
+    }
+  }
+
+  // Step 3: Run evaluation for each repo and chunker
+  await mkdir(RESULTS_DIR, { recursive: true })
+
+  const allResults: EvalResult[] = []
+  const repos = getRepos()
+
+  for (const repo of repos) {
+    const tasks = tasksByRepo.get(repo)
+    if (!tasks || tasks.length === 0) {
+      console.log(`\nSkipping ${repo}: no tasks found`)
+      continue
+    }
+
+    console.log(`\n${'='.repeat(60)}`)
+    console.log(`Repository: ${repo} (${tasks.length} tasks)`)
+    console.log('='.repeat(60))
+
+    // Evaluate with the AST chunker
+    const astResult = await evaluateRepo(repo, tasks, 'ast')
+    allResults.push(astResult)
+
+    // Evaluate with the fixed chunker
+    const fixedResult = await evaluateRepo(repo, tasks, 'fixed')
+    allResults.push(fixedResult)
+
+    // Print comparison
+    console.log(`\n  Results for ${repo}:`)
+    printMetricsTable(astResult.summary, fixedResult.summary, '  ')
+  }
+
+  // Step 4: Compute overall summary
+  console.log(`\n${'='.repeat(60)}`)
+  console.log('OVERALL SUMMARY')
+  console.log('='.repeat(60))
+
+  const astResults = allResults.filter((r) => r.chunker === 'ast')
+  const fixedResults = allResults.filter((r) => r.chunker === 'fixed')
+
+  // Aggregate metrics for each k value
+  const astOverall: Record<number, MetricsAtK> = {}
+  const fixedOverall: Record<number, MetricsAtK> = {}
+  for (const k of K_VALUES) {
+    astOverall[k] = aggregateMetrics(astResults.map((r) => r.summary[k]))
+    fixedOverall[k] = aggregateMetrics(fixedResults.map((r) => r.summary[k]))
+  }
+
+  console.log('')
+  printMetricsTable(astOverall, fixedOverall)
+
+  // Compute improvements for each k
+  console.log('Improvement (AST vs Fixed):')
+  for (const k of K_VALUES) {
+    const ndcgImprovement =
+      ((astOverall[k].ndcg - fixedOverall[k].ndcg) / fixedOverall[k].ndcg) * 100
+    const precImprovement =
+      ((astOverall[k].precision - fixedOverall[k].precision) /
+        fixedOverall[k].precision) *
+      100
+    const recallImprovement =
+      ((astOverall[k].recall - fixedOverall[k].recall) /
+        fixedOverall[k].recall) *
+      100
+
+    console.log(`  k=${k}:`)
+    console.log(
+      `    nDCG:      ${ndcgImprovement >= 0 ? '+' : ''}${ndcgImprovement.toFixed(1)}%`,
+    )
+    console.log(
+      `    Precision: ${precImprovement >= 0 ? '+' : ''}${precImprovement.toFixed(1)}%`,
+    )
+    console.log(
+      `    Recall:    ${recallImprovement >= 0 ? '+' : ''}${recallImprovement.toFixed(1)}%`,
+    )
+  }
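+
+  // Note: relative improvement is (ast - fixed) / fixed, so any k where the
+  // fixed baseline metric is exactly 0 will print Infinity or NaN here.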
+
+  // Step 5: Save results
+  const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
+
+  // Save summary
+  const summaryPath = join(RESULTS_DIR, `summary_${timestamp}.json`)
+  await writeFile(
+    summaryPath,
+    JSON.stringify(
+      {
+        overall: {
+          ast: astOverall,
+          fixed: fixedOverall,
+        },
+        perRepo: Object.fromEntries(
+          repos.map((repo) => [
+            repo,
+            {
+              ast: astResults.find((r) => r.repo === repo)?.summary,
+              fixed: fixedResults.find((r) => r.repo === repo)?.summary,
+            },
+          ]),
+        ),
+        config: { kValues: K_VALUES, maxChunkSize: MAX_CHUNK_SIZE },
+        timestamp: new Date().toISOString(),
+      },
+      null,
+      2,
+    ),
+  )
+  console.log(`\nSaved summary to: ${summaryPath}`)
+
+  // Save detailed results
+  const detailedPath = join(RESULTS_DIR, `detailed_${timestamp}.json`)
+  await writeFile(detailedPath, JSON.stringify(allResults, null, 2))
+  console.log(`Saved detailed results to: ${detailedPath}`)
+}
+
+// Run if executed directly
+if (import.meta.main) {
+  main().catch(console.error)
+}
diff --git a/package.json b/package.json
index 4aa3f6c..008a5c2 100644
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
     "dev": "bunup --watch",
     "lint": "biome check .",
     "lint:fix": "biome check --write .",
+    "format": "biome format --write .",
     "release": "bumpp --commit --push --tag",
     "test": "bun test",
     "test:coverage": "bun test --coverage",
diff --git a/src/chunking/index.ts b/src/chunking/index.ts
index 9bfd95d..96f262f 100644
--- a/src/chunking/index.ts
+++ b/src/chunking/index.ts
@@ -4,6 +4,7 @@ import {
   getRelevantImports,
   getScopeForRange,
 } from '../context'
+import { formatChunkWithContext } from '../context/format'
 import { getSiblings } from '../context/siblings'
 import type {
   ASTWindow,
@@ -43,6 +44,7 @@ export const DEFAULT_CHUNK_OPTIONS: Required<ChunkOptions> = {
   siblingDetail: 'signatures',
   filterImports: false,
   language: 'typescript',
+  overlapLines: 10,
 }
 
 /**
@@ -291,18 +293,43 @@ export const chunk = (
   const windowArray = Array.from(mergedWindows)
   const totalChunks = windowArray.length
 
-  const chunks: Chunk[] = windowArray.map((window, index) => {
-    // Rebuild text from window
-    const text = rebuildText(window, code)
+  // First pass: rebuild text for all windows (needed for overlap)
+  const rebuiltTexts = windowArray.map((window) =>
+    rebuildText(window, code),
+  )
 
+  // Second pass: build chunks with overlap
+  const chunks: Chunk[] = rebuiltTexts.map((text, index) => {
     // Build context
     const context =
       opts.contextMode === 'none'
        ? { scope: [], entities: [], siblings: [], imports: [] }
        : buildContext(text, scopeTree, opts, filepath, language)
+    // Compute overlap text from the previous chunk if applicable
+    let overlapText: string | undefined
+    if (opts.overlapLines > 0 && index > 0) {
+      const prevText = rebuiltTexts[index - 1]?.text
+      if (prevText) {
+        const prevLines = prevText.split('\n')
+        const overlapLineCount = Math.min(
+          opts.overlapLines,
+          prevLines.length,
+        )
+        overlapText = prevLines.slice(-overlapLineCount).join('\n')
+      }
+    }
+
+    // Build contextualized text for embeddings (includes overlap)
+    const contextualizedText = formatChunkWithContext(
+      text.text,
+      context,
+      overlapText,
+    )
+
     return {
       text: text.text,
+      contextualizedText,
       byteRange: text.byteRange,
       lineRange: text.lineRange,
       context,
@@ -366,6 +393,7 @@ export async function* streamChunks(
   // Stream chunks as they are generated
   // totalChunks is -1 since we don't know the total count while streaming
   let index = 0
+  let prevText: string | undefined
   for (const window of mergedWindows) {
     // Rebuild text from window
     const text = rebuildText(window, code)
@@ -376,14 +404,32 @@ export async function* streamChunks(
        ? { scope: [], entities: [], siblings: [], imports: [] }
        : buildContext(text, scopeTree, opts, filepath, language)
 
+    // Compute overlap text from the previous chunk if applicable
+    let overlapText: string | undefined
+    if (opts.overlapLines > 0 && prevText) {
+      const prevLines = prevText.split('\n')
+      const overlapLineCount = Math.min(opts.overlapLines, prevLines.length)
+      overlapText = prevLines.slice(-overlapLineCount).join('\n')
+    }
+
+    // Build contextualized text for embeddings (includes overlap)
+    const contextualizedText = formatChunkWithContext(
+      text.text,
+      context,
+      overlapText,
+    )
+
     yield {
       text: text.text,
+      contextualizedText,
       byteRange: text.byteRange,
       lineRange: text.lineRange,
       context,
       index,
       totalChunks: -1, // Unknown during streaming
     }
+
+    prevText = text.text
     index++
   }
 }
diff --git a/src/context/format.ts b/src/context/format.ts
new file mode 100644
index 0000000..ea14ddf
--- /dev/null
+++ b/src/context/format.ts
@@ -0,0 +1,99 @@
+/**
+ * Format chunks with semantic context for embedding
+ *
+ * Prepends the scope chain, entity signatures, and import context
+ * to improve embedding similarity for semantic search.
+ */
+
+import type { ChunkContext } from '../types'
+
+/**
+ * Format chunk text with semantic context prepended
+ *
+ * Creates a contextualized version of the chunk text that includes:
+ * - File path (last 3 segments)
+ * - Scope chain (e.g., "MyClass > process")
+ * - Entity signatures defined in this chunk
+ * - Import dependencies
+ * - Sibling context for continuity
+ * - Optional overlap from the previous chunk
+ *
+ * This format is optimized for embedding models to capture
+ * semantic relationships between code chunks.
+ *
+ * @param text - The raw chunk text
+ * @param context - The chunk's semantic context
+ * @param overlapText - Optional text from the previous chunk to include for continuity
+ * @returns Formatted text with context prepended
+ */
+export function formatChunkWithContext(
+  text: string,
+  context: ChunkContext,
+  overlapText?: string,
+): string {
+  const parts: string[] = []
+
+  // Add file path for context (last 3 segments)
+  if (context.filepath) {
+    const relPath = context.filepath.split('/').slice(-3).join('/')
+    parts.push(`# ${relPath}`)
+  }
+
+  // Add scope chain (e.g., "Scope: MyClass > process")
+  if (context.scope.length > 0) {
+    const scopePath = context.scope
+      .map((s) => s.name)
+      .reverse()
+      .join(' > ')
+    parts.push(`# Scope: ${scopePath}`)
+  }
+
+  // Add entity signatures in this chunk
+  const signatures = context.entities
+    .filter((e) => e.signature && e.type !== 'import')
+    .map((e) => e.signature)
+  if (signatures.length > 0) {
+    parts.push(`# Defines: ${signatures.join(', ')}`)
+  }
+
+  // Add imports context (what this code depends on)
+  if (context.imports.length > 0) {
+    const importNames = context.imports
+      .slice(0, 10) // Limit to avoid noise
+      .map((i) => i.name)
+      .join(', ')
+    parts.push(`# Uses: ${importNames}`)
+  }
+
+  // Add sibling context for continuity (the chunk comes after its 'before' siblings)
+  const beforeSiblings = context.siblings
+    .filter((s) => s.position === 'before')
+    .map((s) => s.name)
+  const afterSiblings = context.siblings
+    .filter((s) => s.position === 'after')
+    .map((s) => s.name)
+
+  if (beforeSiblings.length > 0) {
+    parts.push(`# After: ${beforeSiblings.join(', ')}`)
+  }
+  if (afterSiblings.length > 0) {
+    parts.push(`# Before: ${afterSiblings.join(', ')}`)
+  }
+
+  // Add separator before code
+  if (parts.length > 0) {
+    parts.push('')
+  }
+
+  // Add overlap from the previous chunk if provided
+  if (overlapText) {
+    parts.push('# ...')
+    parts.push(overlapText)
+    parts.push('# ---')
+  }
+
+  // Add the actual chunk code
+  parts.push(text)
+
+  return parts.join('\n')
+}
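+
+// Illustrative output for a hypothetical chunk inside a class `Retriever`:
+//
+//   # src/search/retriever.ts
+//   # Scope: Retriever > rank
+//   # Defines: rank(query: string): Result[]
+//   # Uses: cosineSim, topK
+//   # After: constructor, buildIndex
+//   # Before: serialize
+//
+//   <chunk code>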
diff --git a/src/index.ts b/src/index.ts
index 4eeca9f..0b50057 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -19,8 +19,8 @@ export {
 
 // Chunker factory
 export { createChunker } from './chunker'
-
-// Re-export language utilities for advanced usage
+// Context formatting utility for custom embedding text generation
+export { formatChunkWithContext } from './context/format'
 export { detectLanguage, LANGUAGE_EXTENSIONS } from './parser/languages'
 
 // All public types
diff --git a/src/types.ts b/src/types.ts
index 2bafcde..e32d144 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -225,6 +225,14 @@ export interface ChunkContext {
 export interface Chunk {
   /** The actual text content */
   text: string
+  /**
+   * Text with semantic context prepended for embedding
+   *
+   * Includes file path, scope chain, entity signatures, imports,
+   * and sibling context to improve embedding quality for semantic search.
+   * Use this field when creating embeddings for RAG systems.
+   */
+  contextualizedText: string
   /** Byte range in original source */
   byteRange: ByteRange
   /** Line range in original source */
@@ -251,6 +259,14 @@ export interface ChunkOptions {
   filterImports?: boolean
   /** Override language detection */
   language?: Language
+  /**
+   * Number of lines to overlap from the previous chunk (default: 10,
+   * per DEFAULT_CHUNK_OPTIONS)
+   *
+   * When set, each chunk's contextualizedText will include the last N lines
+   * from the previous chunk, improving recall for queries that target
+   * code at chunk boundaries. The raw `text` field is not affected.
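+   *
+   * @example
+   * // hypothetical usage: carry the last 5 lines of each previous chunk
+   * // into the next chunk's contextualizedText
+   * const chunks = await chunk('src/app.ts', code, { overlapLines: 5 })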
+   */
+  overlapLines?: number
 }
 
 /**