diff --git a/README.md b/README.md index 2919d23..53c9d0f 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Uses tree-sitter to split source code at semantic boundaries (functions, classes - **Rich context**: Scope chain, imports, siblings, entity signatures - **Contextualized text**: Pre-formatted for embedding models - **Multi-language**: TypeScript, JavaScript, Python, Rust, Go, Java +- **Batch processing**: Process entire codebases with controlled concurrency - **Streaming**: Process large files incrementally - **Effect support**: First-class Effect integration @@ -143,6 +144,48 @@ for (const file of files) { } ``` +### Batch Processing + +Process multiple files concurrently with error handling per file: + +```typescript +import { chunkBatch } from 'code-chunk' + +const files = [ + { filepath: 'src/user.ts', code: userCode }, + { filepath: 'src/auth.ts', code: authCode }, + { filepath: 'lib/utils.py', code: utilsCode }, +] + +const results = await chunkBatch(files, { + maxChunkSize: 1500, + concurrency: 10, + onProgress: (done, total, path, success) => { + console.log(`[${done}/${total}] ${path}: ${success ? 'ok' : 'failed'}`) + } +}) + +for (const result of results) { + if (result.error) { + console.error(`Failed: ${result.filepath}`, result.error) + } else { + await indexChunks(result.filepath, result.chunks) + } +} +``` + +Stream results as they complete: + +```typescript +import { chunkBatchStream } from 'code-chunk' + +for await (const result of chunkBatchStream(files, { concurrency: 5 })) { + if (result.chunks) { + await indexChunks(result.filepath, result.chunks) + } +} +``` + ### Effect Integration For Effect-based pipelines: @@ -198,7 +241,43 @@ Effect-native streaming API for composable pipelines. Create a reusable chunker instance with default options. 
-**Returns:** `Chunker` with `chunk()` and `stream()` methods +**Returns:** `Chunker` with `chunk()`, `stream()`, `chunkBatch()`, and `chunkBatchStream()` methods + +--- + +### `chunkBatch(files, options?)` + +Process multiple files concurrently with per-file error handling. + +**Parameters:** +- `files`: Array of `{ filepath, code, options? }` +- `options`: Batch options (extends ChunkOptions with `concurrency` and `onProgress`) + +**Returns:** `Promise<BatchResult[]>` where each result has `{ filepath, chunks, error }` + +--- + +### `chunkBatchStream(files, options?)` + +Stream batch results as files complete processing. + +**Returns:** `AsyncGenerator<BatchResult>` + +--- + +### `chunkBatchEffect(files, options?)` + +Effect-native batch processing. + +**Returns:** `Effect.Effect<BatchResult[]>` + +--- + +### `chunkBatchStreamEffect(files, options?)` + +Effect-native streaming batch processing. + +**Returns:** `Stream.Stream<BatchResult>` --- @@ -218,7 +297,7 @@ Detect programming language from file extension. --- -### Options +### ChunkOptions | Option | Type | Default | Description | |--------|------|---------|-------------| @@ -229,6 +308,15 @@ Detect programming language from file extension.
| `language` | `Language` | auto | Override language detection | | `overlapLines` | `number` | `10` | Lines from previous chunk to include in `contextualizedText` | +### BatchOptions + +Extends `ChunkOptions` with: + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `concurrency` | `number` | `10` | Maximum files to process concurrently | +| `onProgress` | `function` | - | Callback `(completed, total, filepath, success) => void` | + --- ### Supported Languages diff --git a/packages/code-chunk/README.md b/packages/code-chunk/README.md index 2919d23..53c9d0f 100644 --- a/packages/code-chunk/README.md +++ b/packages/code-chunk/README.md @@ -19,6 +19,7 @@ Uses tree-sitter to split source code at semantic boundaries (functions, classes - **Rich context**: Scope chain, imports, siblings, entity signatures - **Contextualized text**: Pre-formatted for embedding models - **Multi-language**: TypeScript, JavaScript, Python, Rust, Go, Java +- **Batch processing**: Process entire codebases with controlled concurrency - **Streaming**: Process large files incrementally - **Effect support**: First-class Effect integration @@ -143,6 +144,48 @@ for (const file of files) { } ``` +### Batch Processing + +Process multiple files concurrently with error handling per file: + +```typescript +import { chunkBatch } from 'code-chunk' + +const files = [ + { filepath: 'src/user.ts', code: userCode }, + { filepath: 'src/auth.ts', code: authCode }, + { filepath: 'lib/utils.py', code: utilsCode }, +] + +const results = await chunkBatch(files, { + maxChunkSize: 1500, + concurrency: 10, + onProgress: (done, total, path, success) => { + console.log(`[${done}/${total}] ${path}: ${success ? 
'ok' : 'failed'}`) + } +}) + +for (const result of results) { + if (result.error) { + console.error(`Failed: ${result.filepath}`, result.error) + } else { + await indexChunks(result.filepath, result.chunks) + } +} +``` + +Stream results as they complete: + +```typescript +import { chunkBatchStream } from 'code-chunk' + +for await (const result of chunkBatchStream(files, { concurrency: 5 })) { + if (result.chunks) { + await indexChunks(result.filepath, result.chunks) + } +} +``` + ### Effect Integration For Effect-based pipelines: @@ -198,7 +241,43 @@ Effect-native streaming API for composable pipelines. Create a reusable chunker instance with default options. -**Returns:** `Chunker` with `chunk()` and `stream()` methods +**Returns:** `Chunker` with `chunk()`, `stream()`, `chunkBatch()`, and `chunkBatchStream()` methods + +--- + +### `chunkBatch(files, options?)` + +Process multiple files concurrently with per-file error handling. + +**Parameters:** +- `files`: Array of `{ filepath, code, options? }` +- `options`: Batch options (extends ChunkOptions with `concurrency` and `onProgress`) + +**Returns:** `Promise<BatchResult[]>` where each result has `{ filepath, chunks, error }` + +--- + +### `chunkBatchStream(files, options?)` + +Stream batch results as files complete processing. + +**Returns:** `AsyncGenerator<BatchResult>` + +--- + +### `chunkBatchEffect(files, options?)` + +Effect-native batch processing. + +**Returns:** `Effect.Effect<BatchResult[]>` + +--- + +### `chunkBatchStreamEffect(files, options?)` + +Effect-native streaming batch processing. + +**Returns:** `Stream.Stream<BatchResult>` --- @@ -218,7 +297,7 @@ Detect programming language from file extension. --- -### Options +### ChunkOptions | Option | Type | Default | Description | |--------|------|---------|-------------| @@ -229,6 +308,15 @@ Detect programming language from file extension.
| `language` | `Language` | auto | Override language detection |
 | `overlapLines` | `number` | `10` | Lines from previous chunk to include in `contextualizedText` |
 
+### BatchOptions
+
+Extends `ChunkOptions` with:
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `concurrency` | `number` | `10` | Maximum files to process concurrently |
+| `onProgress` | `function` | - | Callback `(completed, total, filepath, success) => void` |
+
 ---
 
 ### Supported Languages
diff --git a/packages/code-chunk/src/batch.ts b/packages/code-chunk/src/batch.ts
new file mode 100644
index 0000000..988e85c
--- /dev/null
+++ b/packages/code-chunk/src/batch.ts
@@ -0,0 +1,302 @@
+import { Effect, Queue, Stream } from 'effect'
+import { ChunkingError, UnsupportedLanguageError } from './chunk'
+import { chunk as chunkInternal } from './chunking'
+import { extractEntities } from './extract'
+import { parseCode } from './parser'
+import { detectLanguage } from './parser/languages'
+import { buildScopeTree } from './scope'
+import type {
+  BatchFileError,
+  BatchFileResult,
+  BatchOptions,
+  BatchResult,
+  Chunk,
+  ChunkOptions,
+  FileInput,
+  Language,
+} from './types'
+
+/** Number of files processed at once when `concurrency` is not given. */
+const DEFAULT_CONCURRENCY = 10
+
+/**
+ * Turn the caller-supplied `concurrency` into a usable worker count.
+ *
+ * A value below 1 would start no workers and end the batch with every file
+ * silently dropped, so the count is clamped to at least 1. NaN falls back to
+ * the default, fractions are floored, and the count never exceeds `total`.
+ */
+const resolveWorkerCount = (concurrency: number, total: number): number => {
+  const requested = Number.isNaN(concurrency)
+    ? DEFAULT_CONCURRENCY
+    : Math.floor(concurrency)
+  return Math.max(1, Math.min(requested, total))
+}
+
+/**
+ * Chunk one file, folding every failure into the returned value.
+ *
+ * Per-file options override the batch-level options. The effect never fails:
+ * an undetectable language or a pipeline error comes back as a
+ * `BatchFileError`, so one bad file cannot abort the rest of the batch.
+ */
+const chunkFileEffect = (
+  file: FileInput,
+  batchOptions: ChunkOptions = {},
+): Effect.Effect<BatchResult> => {
+  const mergedOptions = { ...batchOptions, ...file.options }
+
+  return Effect.gen(function* () {
+    const language: Language | null =
+      mergedOptions.language ?? detectLanguage(file.filepath)
+
+    if (!language) {
+      return {
+        filepath: file.filepath,
+        chunks: null,
+        error: new UnsupportedLanguageError(file.filepath),
+      } satisfies BatchFileError
+    }
+
+    const parseResult = yield* Effect.tryPromise({
+      try: () => parseCode(file.code, language),
+      catch: (error: unknown) =>
+        new ChunkingError('Failed to parse code', error),
+    })
+
+    const entities = yield* Effect.mapError(
+      extractEntities(parseResult.tree.rootNode, language, file.code),
+      (error: unknown) =>
+        new ChunkingError('Failed to extract entities', error),
+    )
+
+    const scopeTree = yield* Effect.mapError(
+      buildScopeTree(entities),
+      (error: unknown) =>
+        new ChunkingError('Failed to build scope tree', error),
+    )
+
+    const chunks = yield* Effect.mapError(
+      chunkInternal(
+        parseResult.tree.rootNode,
+        file.code,
+        scopeTree,
+        language,
+        mergedOptions,
+        file.filepath,
+      ),
+      (error: unknown) => new ChunkingError('Failed to chunk code', error),
+    )
+
+    // When the parser reported an error alongside the tree, record it on
+    // every chunk's context.
+    const finalChunks: Chunk[] = parseResult.error
+      ? chunks.map((c: Chunk) => ({
+          ...c,
+          context: { ...c.context, parseError: parseResult.error ?? undefined },
+        }))
+      : chunks
+
+    return {
+      filepath: file.filepath,
+      chunks: finalChunks,
+      error: null,
+    } satisfies BatchFileResult
+  }).pipe(
+    Effect.catchAll((error) =>
+      Effect.succeed({
+        filepath: file.filepath,
+        chunks: null,
+        error: error instanceof Error ? error : new Error(String(error)),
+      } satisfies BatchFileError),
+    ),
+  )
+}
+
+/**
+ * Chunk many files as an Effect stream of per-file results.
+ *
+ * Files are pulled from a shared queue by up to `concurrency` workers, so
+ * results are emitted in completion order, which may differ from input order
+ * when more than one worker runs. Per-file failures are reported through
+ * `BatchResult.error`; the stream has no typed failure of its own.
+ *
+ * @param files - Files to chunk
+ * @param options - Chunking options plus `concurrency` and `onProgress`
+ * @returns Stream emitting one `BatchResult` per input file
+ */
+export const chunkBatchStreamEffect = (
+  files: FileInput[],
+  options: BatchOptions = {},
+): Stream.Stream<BatchResult> => {
+  const {
+    concurrency = DEFAULT_CONCURRENCY,
+    onProgress,
+    ...chunkOptions
+  } = options
+  const total = files.length
+
+  if (total === 0) {
+    return Stream.empty
+  }
+
+  return Stream.unwrap(
+    Effect.gen(function* () {
+      const queue = yield* Queue.unbounded<FileInput>()
+      // `null` is the end-of-batch sentinel consumed by `takeWhile` below.
+      const resultsQueue = yield* Queue.unbounded<BatchResult | null>()
+
+      yield* Effect.forEach(files, (file) => Queue.offer(queue, file), {
+        discard: true,
+      })
+
+      let completed = 0
+
+      const worker = Effect.gen(function* () {
+        while (true) {
+          const maybeFile = yield* Queue.poll(queue)
+          if (maybeFile._tag === 'None') {
+            break
+          }
+          const file = maybeFile.value
+          const result = yield* chunkFileEffect(file, chunkOptions)
+          completed++
+          if (onProgress) {
+            try {
+              onProgress(completed, total, file.filepath, result.error === null)
+            } catch {
+              // Progress reporting is best-effort: a throwing callback must
+              // not kill this worker and strand the files still queued.
+            }
+          }
+          yield* Queue.offer(resultsQueue, result)
+        }
+      })
+
+      // Always publish the sentinel, even if a worker dies unexpectedly;
+      // otherwise the consumer would wait on the results queue forever.
+      yield* Effect.fork(
+        Effect.all(
+          Array.from(
+            { length: resolveWorkerCount(concurrency, total) },
+            () => worker,
+          ),
+          { concurrency: 'unbounded', discard: true },
+        ).pipe(Effect.ensuring(Queue.offer(resultsQueue, null))),
+      )
+
+      return Stream.fromQueue(resultsQueue).pipe(
+        Stream.takeWhile((result): result is BatchResult => result !== null),
+      )
+    }),
+  )
+}
+
+/**
+ * Chunk many files and collect all results into an array.
+ *
+ * @param files - Files to chunk
+ * @param options - Chunking options plus `concurrency` and `onProgress`
+ * @returns Effect succeeding with one `BatchResult` per input file, in
+ *   completion order
+ */
+export const chunkBatchEffect = (
+  files: FileInput[],
+  options: BatchOptions = {},
+): Effect.Effect<BatchResult[]> => {
+  return Stream.runCollect(chunkBatchStreamEffect(files, options)).pipe(
+    Effect.map((chunk) => Array.from(chunk)),
+  )
+}
+
+/**
+ * Promise-based batch chunking.
+ *
+ * @param files - Files to chunk
+ * @param options - Chunking options plus `concurrency` and `onProgress`
+ * @returns One `BatchResult` per input file; a failed file carries `error`
+ *   and `chunks: null`
+ */
+export async function chunkBatch(
+  files: FileInput[],
+  options?: BatchOptions,
+): Promise<BatchResult[]> {
+  return Effect.runPromise(chunkBatchEffect(files, options))
+}
+
+/**
+ * Async-generator batch chunking: yields each file's result once it is ready.
+ *
+ * Leaving the loop early (`break`, `return`, or a thrown error) interrupts the
+ * underlying Effect run instead of letting it chunk every remaining file in
+ * the background, and an unexpected failure of that run is rethrown to the
+ * consumer rather than leaving it waiting forever.
+ *
+ * @param files - Files to chunk
+ * @param options - Chunking options plus `concurrency` and `onProgress`
+ * @returns Async generator of per-file results in completion order
+ */
+export async function* chunkBatchStream(
+  files: FileInput[],
+  options?: BatchOptions,
+): AsyncGenerator<BatchResult> {
+  const buffered: BatchResult[] = []
+  const state: {
+    finished: boolean
+    failure: { error: unknown } | null
+    wake: (() => void) | null
+  } = { finished: false, failure: null, wake: null }
+
+  // Resume the consumer if it is currently parked waiting for data.
+  const notify = (): void => {
+    const wake = state.wake
+    state.wake = null
+    wake?.()
+  }
+
+  const drain = chunkBatchStreamEffect(files, options).pipe(
+    Stream.runForEach((result) =>
+      Effect.sync(() => {
+        buffered.push(result)
+        notify()
+      }),
+    ),
+  )
+
+  const controller = new AbortController()
+  // Both outcomes are handled here, so `run` itself never rejects.
+  const run = Effect.runPromise(drain, { signal: controller.signal }).then(
+    () => {
+      state.finished = true
+      notify()
+    },
+    (error: unknown) => {
+      state.finished = true
+      state.failure = { error }
+      notify()
+    },
+  )
+
+  try {
+    while (true) {
+      const next = buffered.shift()
+      if (next !== undefined) {
+        yield next
+        continue
+      }
+      if (state.finished) {
+        break
+      }
+      await new Promise<void>((resolve) => {
+        state.wake = resolve
+      })
+    }
+    if (state.failure) {
+      throw state.failure.error
+    }
+  } finally {
+    // No-op after normal completion; interrupts the run on early exit.
+    controller.abort()
+    await run
+  }
+}
diff --git a/packages/code-chunk/src/chunker.ts b/packages/code-chunk/src/chunker.ts
index 475ee1c..ae4131a 100644
--- a/packages/code-chunk/src/chunker.ts
+++ b/packages/code-chunk/src/chunker.ts
@@ -1,13 +1,18 @@
+import {
+  chunkBatch as batchFn,
+  chunkBatchStream as batchStreamFn,
+} from './batch'
 import { chunk as chunkFn, chunkStream as streamFn } from './chunk'
 import { DEFAULT_CHUNK_OPTIONS } from './chunking'
-import type { Chunk, Chunker, ChunkOptions } from './types'
+import type {
+  BatchOptions,
+  BatchResult,
+  Chunk,
+  Chunker,
+  ChunkOptions,
+  FileInput,
+} from './types'
 
-/**
- * Implementation of the Chunker interface
- *
- * Provides a stateful wrapper around the chunk and stream functions that
- * stores default options and allows per-call overrides.
- */
 class ChunkerImpl implements Chunker {
   private readonly defaultOptions: ChunkOptions
 
@@ -15,14 +20,6 @@ class ChunkerImpl implements Chunker {
     this.defaultOptions = { ...DEFAULT_CHUNK_OPTIONS, ...options }
   }
 
-  /**
-   * Chunk source code into pieces with context
-   *
-   * @param filepath - The file path (used for language detection)
-   * @param code - The source code to chunk
-   * @param options - Optional overrides for chunking options
-   * @returns Promise resolving to array of chunks
-   */
   async chunk(
     filepath: string,
     code: string,
@@ -32,14 +29,6 @@ class ChunkerImpl implements Chunker {
     return chunkFn(filepath, code, mergedOptions)
   }
 
-  /**
-   * Stream chunks as they are generated
-   *
-   * @param filepath - The file path (used for language detection)
-   * @param code - The source code to chunk
-   * @param options - Optional overrides for chunking options
-   * @returns Async iterable of chunks
-   */
   async *stream(
     filepath: string,
     code: string,
@@ -48,6 +37,36 @@ class ChunkerImpl implements Chunker {
     const mergedOptions = { ...this.defaultOptions, ...options }
     yield* streamFn(filepath, code, mergedOptions)
   }
+
+  /**
+   * Chunk several files, applying this chunker's default options
+   *
+   * @param files - Files to chunk
+   * @param options - Optional overrides for batch and chunking options
+   * @returns Promise resolving to one result per file
+   */
+  async chunkBatch(
+    files: FileInput[],
+    options?: BatchOptions,
+  ): Promise<BatchResult[]> {
+    const mergedOptions = { ...this.defaultOptions, ...options }
+    return batchFn(files, mergedOptions)
+  }
+
+  /**
+   * Stream per-file results, applying this chunker's default options
+   *
+   * @param files - Files to chunk
+   * @param options - Optional overrides for batch and chunking options
+   * @returns Async generator of per-file results
+   */
+  async *chunkBatchStream(
+    files: FileInput[],
+    options?: BatchOptions,
+  ): AsyncGenerator<BatchResult> {
+    const mergedOptions = { ...this.defaultOptions, ...options }
+    yield* batchStreamFn(files, mergedOptions)
+  }
 }
 
 /**
diff --git a/packages/code-chunk/src/index.ts b/packages/code-chunk/src/index.ts
index f4dd19e..f8a1780 100644
--- a/packages/code-chunk/src/index.ts
+++ b/packages/code-chunk/src/index.ts
@@ -8,6 +8,13 @@
  * @packageDocumentation
  */
 
+// Batch processing
+export {
+  chunkBatch,
+  chunkBatchEffect,
+  chunkBatchStream,
+  chunkBatchStreamEffect,
+} from './batch'
 // Main chunking function
 export {
   ChunkingError,
@@ -26,6 +33,10 @@ export { detectLanguage, LANGUAGE_EXTENSIONS } from './parser/languages'
 // All public types
 export type {
   ASTWindow,
+  BatchFileError,
+  BatchFileResult,
+  BatchOptions,
+  BatchResult,
   ByteRange,
   Chunk,
   ChunkContext,
@@ -35,6 +46,7 @@ export type {
   EntityInfo,
   EntityType,
   ExtractedEntity,
+  FileInput,
   ImportInfo,
   Language,
   LineRange,
diff --git a/packages/code-chunk/src/types.ts b/packages/code-chunk/src/types.ts
index b148075..b96c63c 100644
--- a/packages/code-chunk/src/types.ts
+++ b/packages/code-chunk/src/types.ts
@@ -273,31 +273,36 @@ export interface ChunkOptions {
  * Interface for a chunker instance
  */
 export interface Chunker {
-  /**
-   * Chunk source code into pieces with context
-   * @param filepath The file path (used for language detection)
-   * @param source The source code to chunk
-   * @param options Chunking options
-   * @returns Array of chunks
-   */
   chunk(
     filepath: string,
     source: string,
     options?: ChunkOptions,
   ): Promise<Chunk[]>
 
-  /**
-   * Stream chunks as they are generated
-   * @param filepath The file path (used for language detection)
-   * @param source The source code to chunk
-   * @param options Chunking options
-   * @returns Async iterable of chunks
-   */
   stream(
     filepath: string,
     source: string,
     options?: ChunkOptions,
   ): AsyncIterable<Chunk>
+
+  /**
+   * Chunk several files, reporting failures per file
+   * @param files Files to chunk
+   * @param options Batch and chunking options
+   * @returns One result per file
+   */
+  chunkBatch(files: FileInput[], options?: BatchOptions): Promise<BatchResult[]>
+
+  /**
+   * Stream per-file results as they become available
+   * @param files Files to chunk
+   * @param options Batch and chunking options
+   * @returns Async generator of per-file results
+   */
+  chunkBatchStream(
+    files: FileInput[],
+    options?: BatchOptions,
+  ): AsyncGenerator<BatchResult>
 }
 
 // ============================================================================
@@ -344,3 +349,72 @@ export interface WasmConfig {
    */
   languages: Partial>
 }
+
+// ============================================================================
+// Batch Processing Types
+// ============================================================================
+
+/**
+ * Input for batch processing - represents a single file to chunk
+ */
+export interface FileInput {
+  /** File path (used for language detection) */
+  filepath: string
+  /** Source code content */
+  code: string
+  /** Optional per-file chunking options (overrides batch options) */
+  options?: ChunkOptions
+}
+
+/**
+ * Successful result for a single file in batch processing
+ */
+export interface BatchFileResult {
+  /** File path that was processed */
+  filepath: string
+  /** Generated chunks */
+  chunks: Chunk[]
+  /** No error on success */
+  error: null
+}
+
+/**
+ * Error result for a single file in batch processing
+ */
+export interface BatchFileError {
+  /** File path that failed */
+  filepath: string
+  /** No chunks on error */
+  chunks: null
+  /** The error that occurred */
+  error: Error
+}
+
+/**
+ * Result for a single file in batch processing - either success or error
+ */
+export type BatchResult = BatchFileResult | BatchFileError
+
+/**
+ * Options for batch processing
+ */
+export interface BatchOptions extends ChunkOptions {
+  /**
+   * Maximum number of files to process concurrently
+   * @default 10
+   */
+  concurrency?: number
+  /**
+   * Progress callback called after each file is processed
+   * @param completed - Number of files completed so far
+   * @param total - Total number of files to process
+   * @param filepath - Path of the file that was just processed
+   * @param success - Whether the file was processed successfully
+   */
+  onProgress?: (
+    completed: number,
+    total: number,
+    filepath: string,
+    success: boolean,
+  ) => void
+}
diff --git a/packages/code-chunk/src/wasm.ts b/packages/code-chunk/src/wasm.ts
index fa2865a..f138374 100644
--- a/packages/code-chunk/src/wasm.ts
+++ b/packages/code-chunk/src/wasm.ts
@@ -1,31 +1,45 @@
 import { Effect } from 'effect'
-
-import type {
-  Chunk,
-  Chunker,
-  ChunkOptions,
-  Language,
-  WasmConfig,
-} from './types'
-
 import {
   chunk as chunkInternal,
   DEFAULT_CHUNK_OPTIONS,
   streamChunks as streamChunksInternal,
 } from './chunking'
 import { extractEntities } from './extract'
-import { WasmParser } from './parser/wasm'
 import { detectLanguage } from './parser/languages'
+import { WasmParser } from './parser/wasm'
 import { buildScopeTree }
from './scope'
+import type {
+  BatchOptions,
+  BatchResult,
+  Chunk,
+  Chunker,
+  ChunkOptions,
+  FileInput,
+  Language,
+  WasmConfig,
+} from './types'
 
+export { formatChunkWithContext } from './context/format'
+export { detectLanguage, LANGUAGE_EXTENSIONS } from './parser/languages'
+export {
+  createWasmParser,
+  WasmGrammarError,
+  WasmParser,
+  WasmParserError,
+} from './parser/wasm'
 export type {
+  BatchFileError,
+  BatchFileResult,
+  BatchOptions,
+  BatchResult,
   Chunk,
   ChunkContext,
   ChunkEntityInfo,
-  ChunkOptions,
   Chunker,
+  ChunkOptions,
   EntityInfo,
   EntityType,
+  FileInput,
   ImportInfo,
   Language,
   LineRange,
@@ -34,15 +48,6 @@ export type {
   WasmConfig,
 } from './types'
 
-export { formatChunkWithContext } from './context/format'
-export {
-  WasmGrammarError,
-  WasmParser,
-  WasmParserError,
-  createWasmParser,
-} from './parser/wasm'
-export { detectLanguage, LANGUAGE_EXTENSIONS } from './parser/languages'
-
 export class WasmChunkingError extends Error {
   readonly _tag = 'WasmChunkingError'
   override readonly cause?: unknown
@@ -175,6 +180,78 @@ class WasmChunker implements Chunker {
       }
     }
   }
+
+  /**
+   * Chunk several files and collect every per-file result
+   *
+   * Collects `chunkBatchStream`, so results keep the input order and a
+   * failing file is reported through its `error` field.
+   *
+   * @param files - Files to chunk
+   * @param options - Optional overrides for batch and chunking options
+   * @returns Promise resolving to one result per file
+   */
+  async chunkBatch(
+    files: FileInput[],
+    options?: BatchOptions,
+  ): Promise<BatchResult[]> {
+    const results: BatchResult[] = []
+    for await (const result of this.chunkBatchStream(files, options)) {
+      results.push(result)
+    }
+    return results
+  }
+
+  /**
+   * Chunk several files, yielding per-file results window by window
+   *
+   * Files are processed in consecutive windows of `concurrency` files; each
+   * window is awaited as a whole and its results are yielded in input order.
+   *
+   * @param files - Files to chunk
+   * @param options - Optional overrides for batch and chunking options
+   * @returns Async generator of per-file results
+   */
+  async *chunkBatchStream(
+    files: FileInput[],
+    options?: BatchOptions,
+  ): AsyncGenerator<BatchResult> {
+    const { concurrency = 10, onProgress, ...chunkOptions } = options ?? {}
+    const mergedOptions = { ...this.defaultOptions, ...chunkOptions }
+    const total = files.length
+    // A window below 1 would never advance the loop (0 or negative) or would
+    // end it after an empty first window (NaN), so use at least one file.
+    const windowSize = Math.max(1, Math.floor(concurrency) || 1)
+
+    const processFile = async (file: FileInput): Promise<BatchResult> => {
+      try {
+        const fileOptions = { ...mergedOptions, ...file.options }
+        const chunks = await this.chunk(file.filepath, file.code, fileOptions)
+        return { filepath: file.filepath, chunks, error: null }
+      } catch (error) {
+        return {
+          filepath: file.filepath,
+          chunks: null,
+          error: error instanceof Error ? error : new Error(String(error)),
+        }
+      }
+    }
+
+    let completed = 0
+
+    for (let i = 0; i < total; i += windowSize) {
+      const batch = files.slice(i, i + windowSize)
+      const batchResults = await Promise.all(batch.map(processFile))
+
+      for (const result of batchResults) {
+        completed++
+        // `processFile` copies the path onto the result, so no index lookup
+        // into the window is needed to report progress.
+        onProgress?.(completed, total, result.filepath, result.error === null)
+        yield result
+      }
+    }
+  }
 }
 
 export async function createChunker(
diff --git a/packages/code-chunk/test/batch.test.ts b/packages/code-chunk/test/batch.test.ts
new file mode 100644
index 0000000..3095268
--- /dev/null
+++ b/packages/code-chunk/test/batch.test.ts
@@ -0,0 +1,448 @@
+import { describe, expect, test } from 'bun:test'
+import {
+  type BatchResult,
+  chunkBatch,
+  chunkBatchStream,
+  createChunker,
+  type FileInput,
+} from '../src'
+
+const tsCode1 = `export function add(a: number, b: number): number {
+  return a + b
+}
+
+export function subtract(a: number, b: number): number {
+  return a - b
+}`
+
+const tsCode2 = `export class Calculator {
+  private result: number = 0
+
+  add(n: number): this {
+    this.result += n
+    return this
+  }
+
+  getValue(): number {
+    return this.result
+  }
+}`
+
+const pyCode = `def greet(name: str) -> str:
+    return f"Hello, {name}!"
+
+def farewell(name: str) -> str:
+    return f"Goodbye, {name}!"`
+
+const goCode = `package main
+
+func Sum(nums []int) int {
+  total := 0
+  for _, n := range nums {
+    total += n
+  }
+  return total
+}`
+
+describe('chunkBatch', () => {
+  test('processes multiple files and returns results for each', async () => {
+    const files: FileInput[] = [
+      { filepath: 'math.ts', code: tsCode1 },
+      { filepath: 'calc.ts', code: tsCode2 },
+      { filepath: 'greet.py', code: pyCode },
+    ]
+
+    const results = await chunkBatch(files)
+
+    expect(results).toHaveLength(3)
+
+    for (const result of results) {
+      expect(result.error).toBeNull()
+      expect(result.chunks).not.toBeNull()
+      expect(result.chunks!.length).toBeGreaterThan(0)
+    }
+
+    const mathResult = results.find((r) => r.filepath === 'math.ts')
+    expect(mathResult?.chunks).not.toBeNull()
+    expect(mathResult?.chunks?.[0]?.context.language).toBe('typescript')
+
+    const pyResult = results.find((r) => r.filepath === 'greet.py')
+    expect(pyResult?.chunks).not.toBeNull()
+    expect(pyResult?.chunks?.[0]?.context.language).toBe('python')
+  })
+
+  test('handles unsupported file types gracefully', async () => {
+    const files: FileInput[] = [
+      { filepath: 'valid.ts', code: tsCode1 },
+      { filepath: 'invalid.xyz', code: 'some content' },
+      { filepath: 'also-valid.py', code: pyCode },
+    ]
+
+    const results = await chunkBatch(files)
+
+    expect(results).toHaveLength(3)
+
+    const validTs = results.find((r) => r.filepath === 'valid.ts')
+    expect(validTs?.error).toBeNull()
+    expect(validTs?.chunks).not.toBeNull()
+
+    const invalid = results.find((r) => r.filepath === 'invalid.xyz')
+    expect(invalid?.error).not.toBeNull()
+    expect(invalid?.chunks).toBeNull()
+    expect(invalid?.error?.message).toContain('Unsupported file type')
+
+    const validPy = results.find((r) => r.filepath === 'also-valid.py')
+    expect(validPy?.error).toBeNull()
+    expect(validPy?.chunks).not.toBeNull()
+  })
+
+  test('returns empty array for empty input', async () => {
+    const results = await chunkBatch([])
+    expect(results).toHaveLength(0)
+  })
+
+  test('respects maxChunkSize option', async () => {
+    const largeCode = Array.from(
+      { length: 20 },
+      (_, i) =>
+        `export function func${i}(x: number): number { return x * ${i} }`,
+    ).join('\n\n')
+
+    const files: FileInput[] = [{ filepath: 'large.ts', code: largeCode }]
+
+    const smallChunks = await chunkBatch(files, { maxChunkSize: 100 })
+    const largeChunks = await chunkBatch(files, { maxChunkSize: 2000 })
+
+    const smallResult = smallChunks[0]
+    const largeResult = largeChunks[0]
+
+    expect(smallResult?.chunks).not.toBeNull()
+    expect(largeResult?.chunks).not.toBeNull()
+    expect(smallResult!.chunks!.length).toBeGreaterThan(
+      largeResult!.chunks!.length,
+    )
+  })
+
+  test('respects per-file options override', async () => {
+    const code = Array.from(
+      { length: 10 },
+      (_, i) => `export function func${i}(): void {}`,
+    ).join('\n')
+
+    const files: FileInput[] = [
+      { filepath: 'default.ts', code },
+      { filepath: 'small.ts', code, options: { maxChunkSize: 50 } },
+    ]
+
+    const results = await chunkBatch(files, { maxChunkSize: 2000 })
+
+    const defaultResult = results.find((r) => r.filepath === 'default.ts')
+    const smallResult = results.find((r) => r.filepath === 'small.ts')
+
+    expect(defaultResult?.chunks).not.toBeNull()
+    expect(smallResult?.chunks).not.toBeNull()
+    expect(smallResult!.chunks!.length).toBeGreaterThan(
+      defaultResult!.chunks!.length,
+    )
+  })
+
+  test('calls onProgress callback for each file', async () => {
+    const files: FileInput[] = [
+      { filepath: 'a.ts', code: tsCode1 },
+      { filepath: 'b.ts', code: tsCode2 },
+      { filepath: 'c.py', code: pyCode },
+    ]
+
+    const progressCalls: Array<{
+      completed: number
+      total: number
+      filepath: string
+      success: boolean
+    }> = []
+
+    await chunkBatch(files, {
+      onProgress: (completed, total, filepath, success) => {
+        progressCalls.push({ completed, total, filepath, success })
+      },
+    })
+
+    expect(progressCalls).toHaveLength(3)
+
+    for (const call of progressCalls) {
+      expect(call.total).toBe(3)
+      expect(call.success).toBe(true)
+    }
+
+    const completedValues = progressCalls.map((c) => c.completed).sort((a, b) => a - b)
+    expect(completedValues).toEqual([1, 2, 3])
+  })
+
+  test('onProgress reports failures correctly', async () => {
+    const files: FileInput[] = [
+      { filepath: 'valid.ts', code: tsCode1 },
+      { filepath: 'invalid.xyz', code: 'content' },
+    ]
+
+    const progressCalls: Array<{ filepath: string; success: boolean }> = []
+
+    await chunkBatch(files, {
+      onProgress: (_, __, filepath, success) => {
+        progressCalls.push({ filepath, success })
+      },
+    })
+
+    const validCall = progressCalls.find((c) => c.filepath === 'valid.ts')
+    const invalidCall = progressCalls.find((c) => c.filepath === 'invalid.xyz')
+
+    expect(validCall?.success).toBe(true)
+    expect(invalidCall?.success).toBe(false)
+  })
+
+  test('respects concurrency option', async () => {
+    const files: FileInput[] = Array.from({ length: 20 }, (_, i) => ({
+      filepath: `file${i}.ts`,
+      code: `export const x${i} = ${i}`,
+    }))
+
+    // Wall-clock comparisons are flaky on shared CI runners, so assert on
+    // results: every concurrency setting must process every file once.
+    for (const concurrency of [1, 10]) {
+      const results = await chunkBatch(files, { concurrency })
+      const paths = results.map((r) => r.filepath).sort()
+      const expected = files.map((f) => f.filepath).sort()
+
+      expect(paths).toEqual(expected)
+    }
+  })
+
+  test('handles mixed language files', async () => {
+    const files: FileInput[] = [
+      { filepath: 'app.ts', code: tsCode1 },
+      { filepath: 'utils.py', code: pyCode },
+      { filepath: 'main.go', code: goCode },
+    ]
+
+    const results = await chunkBatch(files)
+
+    expect(results).toHaveLength(3)
+
+    const tsResult = results.find((r) => r.filepath === 'app.ts')
+    const pyResult = results.find((r) => r.filepath === 'utils.py')
+    const goResult = results.find((r) => r.filepath === 'main.go')
+
+    expect(tsResult?.chunks?.[0]?.context.language).toBe('typescript')
+    expect(pyResult?.chunks?.[0]?.context.language).toBe('python')
+    expect(goResult?.chunks?.[0]?.context.language).toBe('go')
+  })
+
+  test('chunks contain valid byte and line ranges', async () => {
+    const files: FileInput[] = [{ filepath: 'test.ts', code: tsCode1 }]
+
+    const results = await chunkBatch(files)
+    const chunks = results[0]?.chunks
+
+    expect(chunks).not.toBeNull()
+
+    for (const chunk of chunks!) {
+      expect(chunk.byteRange.start).toBeGreaterThanOrEqual(0)
+      expect(chunk.byteRange.end).toBeGreaterThan(chunk.byteRange.start)
+      expect(chunk.lineRange.start).toBeGreaterThanOrEqual(0)
+      expect(chunk.lineRange.end).toBeGreaterThanOrEqual(chunk.lineRange.start)
+
+      const sliced = tsCode1.slice(chunk.byteRange.start, chunk.byteRange.end)
+      expect(chunk.text).toBe(sliced)
+    }
+  })
+})
+
+describe('chunkBatchStream', () => {
+  test('yields results as they complete', async () => {
+    const files: FileInput[] = [
+      { filepath: 'a.ts', code: tsCode1 },
+      { filepath: 'b.ts', code: tsCode2 },
+      { filepath: 'c.py', code: pyCode },
+    ]
+
+    const results: BatchResult[] = []
+    for await (const result of chunkBatchStream(files)) {
+      results.push(result)
+    }
+
+    expect(results).toHaveLength(3)
+
+    for (const result of results) {
+      expect(result.error).toBeNull()
+      expect(result.chunks).not.toBeNull()
+    }
+  })
+
+  test('handles errors in stream', async () => {
+    const files: FileInput[] = [
+      { filepath: 'valid.ts', code: tsCode1 },
+      { filepath: 'invalid.xyz', code: 'content' },
+    ]
+
+    const results: BatchResult[] = []
+    for await (const result of chunkBatchStream(files)) {
+      results.push(result)
+    }
+
+    expect(results).toHaveLength(2)
+
+    const hasError = results.some((r) => r.error !== null)
+    const hasSuccess = results.some((r) => r.error === null)
+
+    expect(hasError).toBe(true)
+    expect(hasSuccess).toBe(true)
+  })
+
+  test('streams empty for empty input', async () => {
+    const results: BatchResult[] = []
+    for await (const result of chunkBatchStream([])) {
+      results.push(result)
+    }
+
+    expect(results).toHaveLength(0)
+  })
+
+  test('respects options in stream mode', async () => {
+    const files: FileInput[] = [{ filepath: 'test.ts', code: tsCode1 }]
+
+    const results: BatchResult[] = []
+    for await (const result of chunkBatchStream(files, { maxChunkSize: 50 })) {
+      results.push(result)
+    }
+
+    expect(results).toHaveLength(1)
+    expect(results[0]?.chunks).not.toBeNull()
+    expect(results[0]!.chunks!.length).toBeGreaterThan(1)
+  })
+
+  test('calls onProgress in stream mode', async () => {
+    const files: FileInput[] = [
+      { filepath: 'a.ts', code: tsCode1 },
+      { filepath: 'b.ts', code: tsCode2 },
+    ]
+
+    const progressCalls: number[] = []
+
+    for await (const _ of chunkBatchStream(files, {
+      onProgress: (completed) => {
+        progressCalls.push(completed)
+      },
+    })) {
+    }
+
+    expect(progressCalls).toEqual([1, 2])
+  })
+})
+
+describe('createChunker batch methods', () => {
+  test('chunker.chunkBatch uses default options', async () => {
+    const chunker = createChunker({ maxChunkSize: 100 })
+
+    const largeCode = Array.from(
+      { length: 10 },
+      (_, i) =>
+        `export function func${i}(x: number): number { return x * ${i} }`,
+    ).join('\n\n')
+
+    const files: FileInput[] = [{ filepath: 'test.ts', code: largeCode }]
+
+    const results = await chunker.chunkBatch(files)
+
+    expect(results).toHaveLength(1)
+    expect(results[0]?.chunks).not.toBeNull()
+    expect(results[0]!.chunks!.length).toBeGreaterThan(1)
+  })
+
+  test('chunker.chunkBatch allows option overrides', async () => {
+    const chunker = createChunker({ maxChunkSize: 100 })
+
+    const files: FileInput[] = [{ filepath: 'test.ts', code: tsCode1 }]
+
+    const smallChunks = await chunker.chunkBatch(files)
+    const largeChunks = await chunker.chunkBatch(files, { maxChunkSize: 5000 })
+
+    expect(smallChunks[0]!.chunks!.length).toBeGreaterThanOrEqual(
+      largeChunks[0]!.chunks!.length,
+    )
+  })
+
+  test('chunker.chunkBatchStream yields results', async () => {
+    const chunker = createChunker()
+
+    const files: FileInput[] = [
+      { filepath: 'a.ts', code: tsCode1 },
+      { filepath: 'b.py', code: pyCode },
+    ]
+
+    const results: BatchResult[] = []
+    for await (const result of chunker.chunkBatchStream(files)) {
+      results.push(result)
+    }
+
+    expect(results).toHaveLength(2)
+  })
+})
+
+describe('batch processing edge cases', () => {
+  test('handles empty files', async () => {
+    const files: FileInput[] = [
+      { filepath: 'empty.ts', code: '' },
+      { filepath: 'whitespace.ts', code: ' \n\n ' },
+    ]
+
+    const results = await chunkBatch(files)
+
+    expect(results).toHaveLength(2)
+
+    for (const result of results) {
+      expect(result.error).toBeNull()
+      expect(result.chunks).toHaveLength(0)
+    }
+  })
+
+  test('handles malformed code gracefully', async () => {
+    const files: FileInput[] = [
+      { filepath: 'broken.ts', code: 'function broken( { return' },
+      { filepath: 'valid.ts', code: tsCode1 },
+    ]
+
+    const results = await chunkBatch(files)
+
+    expect(results).toHaveLength(2)
+
+    const validResult = results.find((r) => r.filepath === 'valid.ts')
+    expect(validResult?.error).toBeNull()
+    expect(validResult?.chunks).not.toBeNull()
+  })
+
+  test('handles large number of files', async () => {
+    const files: FileInput[] = Array.from({ length: 100 }, (_, i) => ({
+      filepath: `file${i}.ts`,
+      code: `export const value${i} = ${i}`,
+    }))
+
+    const results = await chunkBatch(files, { concurrency: 20 })
+
+    expect(results).toHaveLength(100)
+
+    const successCount = results.filter((r) => r.error === null).length
+    expect(successCount).toBe(100)
+  })
+
+  test('preserves file order in results', async () => {
+    const files: FileInput[] = [
+      { filepath: 'first.ts', code: 'export const a = 1' },
+      { filepath: 'second.ts', code: 'export const b = 2' },
+      { filepath: 'third.ts', code: 'export const c = 3' },
+    ]
+
+    const results = await chunkBatch(files, { concurrency: 1 })
+
+    expect(results[0]?.filepath).toBe('first.ts')
+    expect(results[1]?.filepath).toBe('second.ts')
+    expect(results[2]?.filepath).toBe('third.ts')
+  })
+})
diff --git a/packages/code-chunk/test/wasm.test.ts b/packages/code-chunk/test/wasm.test.ts
index 53a5123..6473e74 100644
--- a/packages/code-chunk/test/wasm.test.ts
+++ b/packages/code-chunk/test/wasm.test.ts
@@ -1,24 +1,22 @@
+import { describe, expect, test } from 'bun:test'
 import { readFile } from 'node:fs/promises'
 import { resolve } from 'node:path'
-import { describe, expect, test } from 'bun:test'
 
 import type { WasmConfig } from '../src/types'
 import {
+  createChunker,
+  UnsupportedLanguageError,
   WasmChunkingError,
   WasmGrammarError,
   WasmParser,
   WasmParserError,
-  createChunker,
-  UnsupportedLanguageError,
 } from '../src/wasm'
 
 async function loadWasmBinary(packagePath: string): Promise {
-  // node_modules is at monorepo root, 2 dirs up from packages/code-chunk
+  const monorepoRoot = resolve(import.meta.dir, '..', '..', '..')
   const fullPath = resolve(
-    process.cwd(),
-    '..',
-    '..',
+    monorepoRoot,
     'node_modules',
     ...packagePath.split('/'),
   )