From aed0e2161a17fa1920eaec6e25e2ba79c2ea122b Mon Sep 17 00:00:00 2001 From: Shoubhit Dash Date: Wed, 17 Dec 2025 01:32:23 +0530 Subject: [PATCH 1/2] feat: implement entity extraction system with queries, signatures, docstrings --- src/extract/docstring.ts | 406 +++++++++++++++++-- src/extract/fallback.ts | 206 +++++++++- src/extract/index.ts | 348 +++++++++++++++- src/extract/queries.ts | 724 ++++++++++++++++++++++++++++++++- src/extract/signature.ts | 362 +++++++++++++++-- test/extract.test.ts | 838 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 2778 insertions(+), 106 deletions(-) create mode 100644 test/extract.test.ts diff --git a/src/extract/docstring.ts b/src/extract/docstring.ts index e9b8dfb..1df1f59 100644 --- a/src/extract/docstring.ts +++ b/src/extract/docstring.ts @@ -14,26 +14,9 @@ export const COMMENT_NODE_TYPES: Record = { } /** - * Extract the docstring/documentation comment for an entity - * - * @param node - The AST node representing the entity - * @param language - The programming language - * @param code - The source code - * @returns Effect yielding the docstring, or null if none found - * - * TODO: Implement docstring extraction + * Python docstring node types (triple-quoted strings) */ -export const extractDocstring = ( - _node: SyntaxNode, - _language: Language, - _code: string, -): Effect.Effect => { - // TODO: Implement docstring extraction - // 1. Look for comment nodes immediately preceding the entity - // 2. For Python, also check for string literal as first child - // 3. Parse and clean up the comment format - return Effect.succeed(null) -} +const PYTHON_STRING_TYPES: readonly string[] = ['string', 'string_content'] /** * Check if a comment is a documentation comment (JSDoc, docstring, etc.) 
@@ -43,14 +26,381 @@ export const extractDocstring = ( * @returns Whether the comment is a documentation comment */ export const isDocComment = ( - _commentText: string, - _language: Language, + commentText: string, + language: Language, ): boolean => { - // TODO: Implement doc comment detection - // - JSDoc: starts with /** - // - Python: triple quotes - // - Rust: starts with /// or //! - // - Go: starts with // - // - Java: starts with /** - return false + const trimmed = commentText.trim() + + switch (language) { + case 'typescript': + case 'javascript': + case 'java': + // JSDoc/Javadoc: starts with /** (but not /***+) + return /^\/\*\*[^*]/.test(trimmed) || trimmed === '/**/' + + case 'python': + // Python docstrings: triple quotes + return ( + trimmed.startsWith('"""') || + trimmed.startsWith("'''") || + trimmed.startsWith('r"""') || + trimmed.startsWith("r'''") + ) + + case 'rust': + // Rust doc comments: /// (outer) or //! (inner) + return trimmed.startsWith('///') || trimmed.startsWith('//!') + + case 'go': + // Go: any // comment immediately before a declaration is considered doc + return trimmed.startsWith('//') + + default: + return false + } +} + +/** + * Parse and clean up a docstring, removing comment markers and normalizing whitespace + * + * @param text - The raw docstring text + * @param language - The programming language + * @returns The cleaned docstring text + */ +export const parseDocstring = (text: string, language: Language): string => { + switch (language) { + case 'typescript': + case 'javascript': + case 'java': + return parseJSDocStyle(text) + + case 'python': + return parsePythonDocstring(text) + + case 'rust': + return parseRustDocComment(text) + + case 'go': + return parseGoComment(text) + + default: + return text.trim() + } +} + +/** + * Parse JSDoc/Javadoc style comments + * Handles: /** ... 
*\/ + */ +function parseJSDocStyle(text: string): string { + let content = text.trim() + + // Remove opening /** and closing */ + if (content.startsWith('/**')) { + content = content.slice(3) + } + if (content.endsWith('*/')) { + content = content.slice(0, -2) + } + + // Split into lines and process each + const lines = content.split('\n') + const processedLines = lines.map((line) => { + let processed = line.trim() + // Remove leading * from each line (common JSDoc style) + if (processed.startsWith('*')) { + processed = processed.slice(1) + // Remove one space after * if present + if (processed.startsWith(' ')) { + processed = processed.slice(1) + } + } + return processed + }) + + // Remove empty lines at start and end + while (processedLines.length > 0 && processedLines[0] === '') { + processedLines.shift() + } + while ( + processedLines.length > 0 && + processedLines[processedLines.length - 1] === '' + ) { + processedLines.pop() + } + + return processedLines.join('\n') +} + +/** + * Parse Python docstrings (triple-quoted strings) + * Handles: ''' ... ''' and """ ... """ + */ +function parsePythonDocstring(text: string): string { + let content = text.trim() + + // Handle raw strings + if (content.startsWith('r"""') || content.startsWith("r'''")) { + content = content.slice(1) + } + + // Remove opening and closing quotes + if (content.startsWith('"""')) { + content = content.slice(3) + if (content.endsWith('"""')) { + content = content.slice(0, -3) + } + } else if (content.startsWith("'''")) { + content = content.slice(3) + if (content.endsWith("'''")) { + content = content.slice(0, -3) + } + } + + // Split into lines + const lines = content.split('\n') + + // Find minimum indentation (excluding empty lines) + let minIndent = Number.POSITIVE_INFINITY + for (const line of lines) { + if (line.trim().length > 0) { + const leadingSpaces = line.match(/^(\s*)/)?.[1]?.length ?? 
0 + minIndent = Math.min(minIndent, leadingSpaces) + } + } + + if (minIndent === Number.POSITIVE_INFINITY) { + minIndent = 0 + } + + // Remove common indentation + const dedentedLines = lines.map((line) => { + if (line.trim().length === 0) { + return '' + } + return line.slice(minIndent) + }) + + // Remove empty lines at start and end + while (dedentedLines.length > 0 && dedentedLines[0]?.trim() === '') { + dedentedLines.shift() + } + while ( + dedentedLines.length > 0 && + dedentedLines[dedentedLines.length - 1]?.trim() === '' + ) { + dedentedLines.pop() + } + + return dedentedLines.join('\n') +} + +/** + * Parse Rust doc comments + * Handles: /// and //! + */ +function parseRustDocComment(text: string): string { + const lines = text.split('\n') + const processedLines: string[] = [] + + for (const line of lines) { + const trimmed = line.trim() + let content = trimmed + + // Remove /// or //! prefix + if (trimmed.startsWith('///')) { + content = trimmed.slice(3) + } else if (trimmed.startsWith('//!')) { + content = trimmed.slice(3) + } + + // Remove one leading space if present + if (content.startsWith(' ')) { + content = content.slice(1) + } + + processedLines.push(content) + } + + // Remove empty lines at start and end + while (processedLines.length > 0 && processedLines[0] === '') { + processedLines.shift() + } + while ( + processedLines.length > 0 && + processedLines[processedLines.length - 1] === '' + ) { + processedLines.pop() + } + + return processedLines.join('\n') +} + +/** + * Parse Go comments + * Handles: // style comments + */ +function parseGoComment(text: string): string { + const lines = text.split('\n') + const processedLines: string[] = [] + + for (const line of lines) { + const trimmed = line.trim() + let content = trimmed + + // Remove // prefix + if (trimmed.startsWith('//')) { + content = trimmed.slice(2) + } + + // Remove one leading space if present + if (content.startsWith(' ')) { + content = content.slice(1) + } + + 
processedLines.push(content) + } + + // Remove empty lines at start and end + while (processedLines.length > 0 && processedLines[0] === '') { + processedLines.shift() + } + while ( + processedLines.length > 0 && + processedLines[processedLines.length - 1] === '' + ) { + processedLines.pop() + } + + return processedLines.join('\n') +} + +/** + * Get the text content of a node + */ +function getNodeText(node: SyntaxNode, code: string): string { + return code.slice(node.startIndex, node.endIndex) +} + +/** + * Find preceding comment nodes (handles consecutive comment lines) + */ +function findPrecedingComments( + node: SyntaxNode, + language: Language, + code: string, +): string | null { + const commentTypes = COMMENT_NODE_TYPES[language] + const comments: string[] = [] + let current = node.previousNamedSibling + + // Walk backwards collecting consecutive comment nodes + while (current) { + const nodeType = current.type + + if (commentTypes.includes(nodeType)) { + const text = getNodeText(current, code) + + // For Python, only consider string literals that are docstrings (but they come after, not before) + // For Python comments that precede, they're not docstrings + if (language === 'python' && PYTHON_STRING_TYPES.includes(nodeType)) { + break + } + + if (isDocComment(text, language)) { + comments.unshift(text) // Add to front since we're going backwards + current = current.previousNamedSibling + } else { + break + } + } else { + // Check if there's a comment between the current named sibling and our node + // by looking at the previous sibling (including non-named) + break + } + } + + if (comments.length === 0) { + return null + } + + // Combine consecutive comments (for Rust /// style) + const combinedText = comments.join('\n') + return parseDocstring(combinedText, language) +} + +/** + * Find Python docstring (first string literal in function/class body) + */ +function findPythonDocstring(node: SyntaxNode, code: string): string | null { + // Look for a block/body 
child + const bodyNode = + node.childForFieldName('body') ?? + node.namedChildren.find((c) => c.type === 'block') + + if (!bodyNode) { + return null + } + + // Get the first statement in the body + const firstChild = bodyNode.namedChildren[0] + + if (!firstChild) { + return null + } + + // Check if it's an expression statement containing a string + if (firstChild.type === 'expression_statement') { + const stringNode = firstChild.namedChildren[0] + if (stringNode && PYTHON_STRING_TYPES.includes(stringNode.type)) { + const text = getNodeText(stringNode, code) + if (isDocComment(text, 'python')) { + return parseDocstring(text, 'python') + } + } + } + + // Direct string literal (shouldn't happen in valid Python, but handle it) + if (PYTHON_STRING_TYPES.includes(firstChild.type)) { + const text = getNodeText(firstChild, code) + if (isDocComment(text, 'python')) { + return parseDocstring(text, 'python') + } + } + + return null +} + +/** + * Extract the docstring/documentation comment for an entity + * + * @param node - The AST node representing the entity + * @param language - The programming language + * @param code - The source code + * @returns Effect yielding the docstring, or null if none found + * + * Handles: + * - JSDoc (/** ... *\/) for TypeScript/JavaScript + * - Python docstrings (triple-quoted string as first statement in body) + * - Rust doc comments (/// and //!) + * - Go comments (// before declaration) + * - Java Javadoc (/** ... 
*\/) + */ +export const extractDocstring = ( + node: SyntaxNode, + language: Language, + code: string, +): Effect.Effect => { + return Effect.sync(() => { + // For Python, first check for docstring inside the body + if (language === 'python') { + const docstring = findPythonDocstring(node, code) + if (docstring) { + return docstring + } + } + + // Look for preceding comments + return findPrecedingComments(node, language, code) + }) } diff --git a/src/extract/fallback.ts b/src/extract/fallback.ts index a5bc614..7aa8263 100644 --- a/src/extract/fallback.ts +++ b/src/extract/fallback.ts @@ -1,5 +1,12 @@ import { Effect } from 'effect' -import type { ExtractedEntity, Language, SyntaxNode } from '../types' +import type { + EntityType, + ExtractedEntity, + Language, + SyntaxNode, +} from '../types' +import { extractDocstring } from './docstring' +import { extractName, extractSignature } from './signature' /** * Node types that represent extractable entities by language @@ -53,25 +60,50 @@ export const ENTITY_NODE_TYPES: Record = { } /** - * Extract entities by matching node types (fallback when no query available) - * - * @param rootNode - The root node of the AST - * @param language - The programming language - * @param code - The source code - * @returns Effect yielding extracted entities - * - * TODO: Implement node type based extraction + * Map node type to EntityType */ -export const extractByNodeTypes = ( - _rootNode: SyntaxNode, - _language: Language, - _code: string, -): Effect.Effect => { - // TODO: Implement fallback extraction - // 1. Get node types for language - // 2. Walk the tree - // 3. 
Extract entities for matching nodes - return Effect.succeed([]) +export const NODE_TYPE_TO_ENTITY_TYPE: Record = { + // Functions + function_declaration: 'function', + function_definition: 'function', + function_item: 'function', + generator_function_declaration: 'function', + arrow_function: 'function', + + // Methods + method_definition: 'method', + method_declaration: 'method', + + // Classes + class_declaration: 'class', + class_definition: 'class', + abstract_class_declaration: 'class', + + // Interfaces + interface_declaration: 'interface', + trait_item: 'interface', + + // Types + type_alias_declaration: 'type', + type_item: 'type', + type_declaration: 'type', + struct_item: 'type', + + // Enums + enum_declaration: 'enum', + enum_item: 'enum', + + // Imports + import_statement: 'import', + import_declaration: 'import', + import_from_statement: 'import', + use_declaration: 'import', + + // Exports + export_statement: 'export', + + // Impl blocks (Rust - treat as class-like) + impl_item: 'class', } /** @@ -84,3 +116,139 @@ export const isEntityNodeType = ( const types = ENTITY_NODE_TYPES[language] return types.includes(nodeType) } + +/** + * Get EntityType from node type string + */ +export const getEntityType = (nodeType: string): EntityType | null => { + return NODE_TYPE_TO_ENTITY_TYPE[nodeType] ?? 
null +} + +/** + * Item in the traversal stack for iterative tree walking + */ +interface StackItem { + node: SyntaxNode + parentName: string | null +} + +/** + * Walk the AST iteratively and extract entities by matching node types + * Uses an explicit stack to avoid stack overflow on deeply nested ASTs + */ +function walkAndExtract( + rootNode: SyntaxNode, + language: Language, + code: string, + entities: ExtractedEntity[], + entityNodes: Set, +): Effect.Effect { + return Effect.gen(function* () { + // Use explicit stack for depth-first traversal + const stack: StackItem[] = [{ node: rootNode, parentName: null }] + + while (stack.length > 0) { + const current = stack.pop() + if (!current) continue + const { node, parentName } = current + + // Check if this node is an entity type + if (isEntityNodeType(node.type, language)) { + // Skip if we've already processed this node + if (entityNodes.has(node.id)) { + continue + } + entityNodes.add(node.id) + + const entityType = getEntityType(node.type) + if (entityType) { + // Extract name + const name = extractName(node, language) ?? '' + + // Extract signature + const signature = yield* extractSignature( + node, + entityType, + language, + code, + ) + + // Extract docstring + const docstring = yield* extractDocstring(node, language, code) + + // Create entity + const entity: ExtractedEntity = { + type: entityType, + name, + signature: signature || name, + docstring, + byteRange: { + start: node.startIndex, + end: node.endIndex, + }, + lineRange: { + start: node.startPosition.row, + end: node.endPosition.row, + }, + parent: parentName, + node, + } + + entities.push(entity) + + // For nested entities, use this entity's name as parent + const newParentName = + entityType === 'class' || + entityType === 'interface' || + entityType === 'function' || + entityType === 'method' + ? 
name + : parentName + + // Add children to stack (in reverse order for correct DFS order) + const children = node.namedChildren + for (let i = children.length - 1; i >= 0; i--) { + const child = children[i] + if (child) { + stack.push({ node: child, parentName: newParentName }) + } + } + } + } else { + // Not an entity node, but might contain entity nodes + // Add children to stack (in reverse order for correct DFS order) + const children = node.namedChildren + for (let i = children.length - 1; i >= 0; i--) { + const child = children[i] + if (child) { + stack.push({ node: child, parentName }) + } + } + } + } + }) +} + +/** + * Extract entities by matching node types (fallback when no query available) + * + * @param rootNode - The root node of the AST + * @param language - The programming language + * @param code - The source code + * @returns Effect yielding extracted entities + */ +export const extractByNodeTypes = ( + rootNode: SyntaxNode, + language: Language, + code: string, +): Effect.Effect => { + return Effect.gen(function* () { + const entities: ExtractedEntity[] = [] + const entityNodes = new Set() + + // Walk the tree starting from root + yield* walkAndExtract(rootNode, language, code, entities, entityNodes) + + return entities + }) +} diff --git a/src/extract/index.ts b/src/extract/index.ts index ee6ad85..31eea9a 100644 --- a/src/extract/index.ts +++ b/src/extract/index.ts @@ -1,5 +1,18 @@ import { Effect } from 'effect' -import type { ExtractedEntity, Language, SyntaxNode } from '../types' +import type { + EntityType, + ExtractedEntity, + Language, + SyntaxNode, +} from '../types' +import { extractDocstring } from './docstring' +import { + ENTITY_NODE_TYPES, + extractByNodeTypes, + getEntityType, +} from './fallback' +import { type CompiledQuery, loadQuery, loadQuerySync } from './queries' +import { extractName, extractSignature } from './signature' /** * Error when entity extraction fails @@ -12,37 +25,338 @@ export class ExtractError { ) {} } +/** + * 
Interface for query match captures (compatible with future queries.ts implementation) + */ +interface QueryCapture { + name: string + node: SyntaxNode + patternIndex: number +} + +/** + * Interface for query matches (compatible with future queries.ts implementation) + */ +interface QueryMatch { + patternIndex: number + captures: QueryCapture[] +} + +/** + * Extract the entity node and name node from a query match + * This will be provided by queries.ts when merged, but we define it here for now + */ +function extractEntityFromMatch(match: QueryMatch): { + itemNode: SyntaxNode + nameNode: SyntaxNode | null + contextNodes: SyntaxNode[] + annotationNodes: SyntaxNode[] +} | null { + const itemCapture = match.captures.find((c) => c.name === 'item') + if (!itemCapture) { + return null + } + + const nameCapture = match.captures.find((c) => c.name === 'name') + const contextCaptures = match.captures.filter((c) => c.name === 'context') + const annotationCaptures = match.captures.filter( + (c) => c.name === 'annotation', + ) + + return { + itemNode: itemCapture.node, + nameNode: nameCapture?.node ?? 
null, + contextNodes: contextCaptures.map((c) => c.node), + annotationNodes: annotationCaptures.map((c) => c.node), + } +} + +/** + * Execute a query against a tree (compatible interface) + * This will be provided by queries.ts when merged + */ +function executeQueryOnTree( + query: CompiledQuery, + rootNode: SyntaxNode, +): { matches: QueryMatch[] } | null { + // Check if query has a matches method (compiled web-tree-sitter Query) + if ( + query && + typeof query === 'object' && + 'matches' in query && + typeof (query as { matches: unknown }).matches === 'function' + ) { + try { + const matches = ( + query as { matches: (node: SyntaxNode) => unknown[] } + ).matches(rootNode) + const queryMatches: QueryMatch[] = matches.map((match: unknown) => { + const m = match as { + patternIndex: number + captures: { name: string; node: SyntaxNode }[] + } + return { + patternIndex: m.patternIndex, + captures: m.captures.map((capture) => ({ + name: capture.name, + node: capture.node, + patternIndex: m.patternIndex, + })), + } + }) + return { matches: queryMatches } + } catch { + return null + } + } + return null +} + +/** + * Convert query matches to extracted entities + */ +function matchesToEntities( + matches: QueryMatch[], + language: Language, + code: string, + rootNode: SyntaxNode, +): Effect.Effect { + return Effect.gen(function* () { + const entities: ExtractedEntity[] = [] + const processedNodes = new Set() + + for (const match of matches) { + const extracted = extractEntityFromMatch(match) + if (!extracted) { + continue + } + + const { itemNode, nameNode } = extracted + + // Skip if already processed + if (processedNodes.has(itemNode.id)) { + continue + } + processedNodes.add(itemNode.id) + + // Get entity type from node type + let entityType = getEntityType(itemNode.type) + if (!entityType) { + // Fallback: try to infer from node type pattern + entityType = inferEntityType(itemNode.type) + if (!entityType) { + continue + } + } + + // Extract name - prefer name node 
from query, fallback to extraction + const name = nameNode + ? nameNode.text + : (extractName(itemNode, language) ?? '') + + // Extract signature + const signature = yield* extractSignature( + itemNode, + entityType, + language, + code, + ) + + // Extract docstring + const docstring = yield* extractDocstring(itemNode, language, code) + + // Find parent entity + const parent = findParentEntityName(itemNode, rootNode, language) + + const entity: ExtractedEntity = { + type: entityType, + name, + signature: signature || name, + docstring, + byteRange: { + start: itemNode.startIndex, + end: itemNode.endIndex, + }, + lineRange: { + start: itemNode.startPosition.row, + end: itemNode.endPosition.row, + }, + parent, + node: itemNode, + } + + entities.push(entity) + } + + return entities + }) +} + +/** + * Infer entity type from node type string for cases not covered by the map + */ +function inferEntityType(nodeType: string): EntityType | null { + const lowerType = nodeType.toLowerCase() + + if (lowerType.includes('function') || lowerType.includes('arrow')) { + return 'function' + } + if (lowerType.includes('method')) { + return 'method' + } + if (lowerType.includes('class')) { + return 'class' + } + if (lowerType.includes('interface') || lowerType.includes('trait')) { + return 'interface' + } + if (lowerType.includes('type') || lowerType.includes('struct')) { + return 'type' + } + if (lowerType.includes('enum')) { + return 'enum' + } + if (lowerType.includes('import') || lowerType.includes('use')) { + return 'import' + } + if (lowerType.includes('export')) { + return 'export' + } + + return null +} + +/** + * Find the name of the parent entity (if any) by walking up the AST + */ +function findParentEntityName( + node: SyntaxNode, + rootNode: SyntaxNode, + language: Language, +): string | null { + const entityTypes = ENTITY_NODE_TYPES[language] + let current = node.parent + + while (current && current.id !== rootNode.id) { + if (entityTypes.includes(current.type)) { + // 
This is a parent entity + const name = extractName(current, language) + if (name) { + return name + } + } + current = current.parent + } + + return null +} + /** * Extract entities from an AST tree * + * Uses tree-sitter queries when available, falling back to node type matching. + * * @param rootNode - The root node of the AST * @param language - The programming language * @param code - The source code (for extracting text) * @returns Effect yielding extracted entities - * - * TODO: Implement entity extraction using tree-sitter queries */ export const extractEntities = ( - _rootNode: SyntaxNode, - _language: Language, - _code: string, + rootNode: SyntaxNode, + language: Language, + code: string, ): Effect.Effect => { - // TODO: Implement entity extraction - // 1. Load appropriate query for language - // 2. Run query on AST - // 3. Extract entities from matches - // 4. Fall back to node type matching if no query - return Effect.succeed([]) + return Effect.gen(function* () { + // Try to load query for this language + const queryResult = yield* Effect.either(loadQuery(language)) + + if (queryResult._tag === 'Right' && queryResult.right !== null) { + // Query loaded successfully - execute it + const query = queryResult.right + + const result = executeQueryOnTree(query, rootNode) + + if (result) { + // Convert matches to entities + const entities = yield* matchesToEntities( + result.matches, + language, + code, + rootNode, + ) + return entities + } + } + + // No query available or query loading failed - use fallback extraction + const entities = yield* extractByNodeTypes(rootNode, language, code) + return entities + }).pipe( + Effect.catchAll((error: unknown) => + Effect.fail( + new ExtractError( + `Entity extraction failed: ${error instanceof Error ? 
error.message : String(error)}`, + error, + ), + ), + ), + ) } /** * Sync version of extractEntities for public API + * + * Note: This function will use query-based extraction if the query is already cached, + * otherwise it falls back to node type matching. For guaranteed query-based extraction, + * use extractEntitiesAsync() instead. + * + * @param rootNode - The root node of the AST + * @param language - The programming language + * @param code - The source code + * @returns Array of extracted entities */ export const extractEntitiesSync = ( - _rootNode: SyntaxNode, - _language: Language, - _code: string, + rootNode: SyntaxNode, + language: Language, + code: string, ): ExtractedEntity[] => { - // TODO: Implement sync wrapper - return [] + // Try to use cached query if available (loadQuerySync returns cached query or null) + const cachedQuery = loadQuerySync(language) + + if (cachedQuery) { + // Query is cached - use it + const result = executeQueryOnTree(cachedQuery, rootNode) + if (result) { + const effect = matchesToEntities(result.matches, language, code, rootNode) + return Effect.runSync(effect) + } + } + + // No cached query - use fallback extraction + const effect = extractByNodeTypes(rootNode, language, code) + return Effect.runSync(effect) } + +/** + * Extract entities async (for when query loading might be needed) + */ +export const extractEntitiesAsync = async ( + rootNode: SyntaxNode, + language: Language, + code: string, +): Promise => { + return Effect.runPromise(extractEntities(rootNode, language, code)) +} + +// Re-export useful types and functions +export type { EntityType, ExtractedEntity } from '../types' +export { extractDocstring, isDocComment } from './docstring' +export { + ENTITY_NODE_TYPES, + extractByNodeTypes, + getEntityType, + NODE_TYPE_TO_ENTITY_TYPE, +} from './fallback' +export type { CompiledQuery, QueryLoadError } from './queries' +export { clearQueryCache, loadQuery, loadQuerySync } from './queries' +export { extractName, 
extractSignature } from './signature' diff --git a/src/extract/queries.ts b/src/extract/queries.ts index 530c1ef..c1dcf4c 100644 --- a/src/extract/queries.ts +++ b/src/extract/queries.ts @@ -1,5 +1,12 @@ import { Effect } from 'effect' -import type { Language } from '../types' +import { + Query, + type Language as TSLanguage, + type QueryCapture as TSQueryCapture, + type QueryMatch as TSQueryMatch, +} from 'web-tree-sitter' +import { type GrammarLoadError, getLanguageGrammar } from '../parser/languages' +import type { Language, SyntaxNode, SyntaxTree } from '../types' /** * Error when loading a tree-sitter query fails @@ -13,34 +20,717 @@ export class QueryLoadError { ) {} } +/** + * Error when executing a query fails + */ +export class QueryExecutionError { + readonly _tag = 'QueryExecutionError' + constructor( + readonly message: string, + readonly cause?: unknown, + ) {} +} + /** * A compiled tree-sitter query - * TODO: Use actual tree-sitter Query type when implementing */ -export type CompiledQuery = unknown +export type CompiledQuery = Query + +/** + * A single capture from a query match + */ +export interface QueryCapture { + /** The capture name (e.g., "name", "item", "context") */ + name: string + /** The captured AST node */ + node: SyntaxNode + /** Pattern index this capture belongs to */ + patternIndex: number +} + +/** + * A complete match from a query, containing all captures from one pattern + */ +export interface QueryMatch { + /** Pattern index that matched */ + patternIndex: number + /** All captures from this match */ + captures: QueryCapture[] +} + +/** + * Result of executing a query + */ +export interface QueryResult { + /** All matches from the query */ + matches: QueryMatch[] + /** All captures from the query (flat list) */ + captures: QueryCapture[] +} + +// ============================================================================= +// Embedded Query Strings +// These are embedded at build time for portability - no filesystem access 
needed +// ============================================================================= + +const TYPESCRIPT_QUERY = `; TypeScript Entity Extraction Queries +; Adapted from Zed editor's outline.scm +; Uses @name for entity names, @item for full entity node, @context for signature context + +; Namespaces/Modules +(internal_module + "namespace" @context + name: (_) @name) @item + +; Enums +(enum_declaration + "enum" @context + name: (_) @name) @item + +; Type Aliases +(type_alias_declaration + "type" @context + name: (_) @name) @item + +; Functions +(function_declaration + "async"? @context + "function" @context + name: (_) @name + parameters: (formal_parameters + "(" @context + ")" @context)) @item + +; Generator Functions +(generator_function_declaration + "async"? @context + "function" @context + "*" @context + name: (_) @name + parameters: (formal_parameters + "(" @context + ")" @context)) @item + +; Interfaces +(interface_declaration + "interface" @context + name: (_) @name) @item + +; Exported variable declarations +(export_statement + (lexical_declaration + ["let" "const"] @context + (variable_declarator + name: (identifier) @name) @item)) + +; Top-level variable declarations +(program + (lexical_declaration + ["let" "const"] @context + (variable_declarator + name: (identifier) @name) @item)) + +; Classes +(class_declaration + "class" @context + name: (_) @name) @item + +; Abstract Classes +(abstract_class_declaration + "abstract" @context + "class" @context + name: (_) @name) @item + +; Method definitions in classes +(class_body + (method_definition + [ + "get" + "set" + "async" + "*" + "readonly" + "static" + (override_modifier) + (accessibility_modifier) + ]* @context + name: (_) @name + parameters: (formal_parameters + "(" @context + ")" @context)) @item) + +; Public field definitions +(public_field_definition + [ + "declare" + "readonly" + "abstract" + "static" + (accessibility_modifier) + ]* @context + name: (_) @name) @item + +; Arrow functions assigned 
to variables (exported) +(export_statement + (lexical_declaration + ["let" "const"] @context + (variable_declarator + name: (identifier) @name + value: (arrow_function)) @item)) + +; Arrow functions assigned to variables (top-level) +(program + (lexical_declaration + ["let" "const"] @context + (variable_declarator + name: (identifier) @name + value: (arrow_function)) @item)) + +; Import declarations +(import_statement) @item + +; Export declarations (re-exports) +(export_statement + (export_clause)) @item +` + +const JAVASCRIPT_QUERY = `; JavaScript Entity Extraction Queries +; Adapted from Zed editor's outline.scm +; Uses @name for entity names, @item for full entity node, @context for signature context + +; Functions +(function_declaration + name: (identifier) @name) @item + +; Generator Functions +(generator_function_declaration + name: (identifier) @name) @item + +; Classes +(class_declaration + name: (identifier) @name) @item + +; Method definitions in classes +(class_body + (method_definition + name: (property_identifier) @name) @item) + +; Top-level variable declarations +(program + (lexical_declaration + (variable_declarator + name: (identifier) @name) @item)) + +; Arrow functions assigned to variables (top-level) +(program + (lexical_declaration + (variable_declarator + name: (identifier) @name + value: (arrow_function)) @item)) + +; Import declarations +(import_statement) @item + +; Export declarations +(export_statement) @item +` + +const PYTHON_QUERY = `; Python Entity Extraction Queries +; Adapted from Zed editor's outline.scm +; Uses @name for entity names, @item for full entity node, @context for signature context + +; Decorators (captured for context) +(decorator) @annotation + +; Classes +(class_definition + name: (identifier) @name) @item + +; Functions (including async) +(function_definition + name: (identifier) @name) @item + +; Import statements +(import_statement) @item + +; Import from statements +(import_from_statement) @item +` + +const 
RUST_QUERY = `; Rust Entity Extraction Queries +; Uses @name for entity names, @item for full entity node + +; Structs +(struct_item + name: (type_identifier) @name) @item + +; Enums +(enum_item + name: (type_identifier) @name) @item + +; Traits +(trait_item + name: (type_identifier) @name) @item + +; Impl blocks +(impl_item) @item + +; Functions +(function_item + name: (identifier) @name) @item + +; Modules +(mod_item + name: (identifier) @name) @item + +; Type aliases +(type_item + name: (type_identifier) @name) @item + +; Constants +(const_item + name: (identifier) @name) @item + +; Use statements (imports) +(use_declaration) @item +` + +const GO_QUERY = `; Go Entity Extraction Queries +; Adapted from Zed editor's outline.scm +; Uses @name for entity names, @item for full entity node, @context for signature context + +; Comments (for doc extraction) +(comment) @annotation + +; Type declarations +(type_declaration + "type" @context + [ + (type_spec + name: (_) @name) @item + ( + "(" + (type_spec + name: (_) @name) @item + ")" + ) + ] +) + +; Functions +(function_declaration + "func" @context + name: (identifier) @name + parameters: (parameter_list + "(" + ")")) @item + +; Methods +(method_declaration + "func" @context + receiver: (parameter_list + "(" @context + (parameter_declaration + name: (_) @context + type: (_) @context) + ")" @context) + name: (field_identifier) @name + parameters: (parameter_list + "(" + ")")) @item + +; Constants +(const_declaration + "const" @context + (const_spec + name: (identifier) @name) @item) + +; Top-level variables +(source_file + (var_declaration + "var" @context + [ + (var_spec + name: (identifier) @name @item) + (var_spec_list + (var_spec + name: (identifier) @name @item) + ) + ] + ) +) + +; Interface methods +(method_elem + name: (_) @name + parameters: (parameter_list + "(" @context + ")" @context)) @item + +; Struct fields +(field_declaration + name: (_) @name @item) + +; Import declarations +(import_declaration) @item + 
+; Package declaration +(package_clause + "package" @context + (package_identifier) @name) @item +` + +const JAVA_QUERY = `; Java Entity Extraction Queries +; Adapted from nvim-treesitter's locals.scm +; Uses @name for entity names, @item for full entity node, @context for signature context + +; Package declaration +(package_declaration + "package" @context + (scoped_identifier) @name) @item + +; Import declarations +(import_declaration) @item + +; Classes +(class_declaration + (modifiers)? @context + "class" @context + name: (identifier) @name) @item + +; Interfaces +(interface_declaration + (modifiers)? @context + "interface" @context + name: (identifier) @name) @item + +; Records (Java 14+) +(record_declaration + (modifiers)? @context + "record" @context + name: (identifier) @name) @item + +; Enums +(enum_declaration + (modifiers)? @context + "enum" @context + name: (identifier) @name) @item + +; Enum constants +(enum_constant + name: (identifier) @name) @item + +; Annotation types +(annotation_type_declaration + (modifiers)? @context + "@interface" @context + name: (identifier) @name) @item + +; Methods +(method_declaration + (modifiers)? @context + type: (_) @context + name: (identifier) @name + parameters: (formal_parameters + "(" @context + ")" @context)) @item + +; Constructors +(constructor_declaration + (modifiers)? @context + name: (identifier) @name + parameters: (formal_parameters + "(" @context + ")" @context)) @item + +; Fields +(field_declaration + (modifiers)? @context + type: (_) @context + declarator: (variable_declarator + name: (identifier) @name)) @item + +; Static initializer blocks +(static_initializer + "static" @context) @item + +; Annotation members (methods in annotations) +(annotation_type_element_declaration + type: (_) @context + name: (identifier) @name) @item + +; Inner classes +(class_body + (class_declaration + (modifiers)? 
@context + "class" @context + name: (identifier) @name) @item) + +; Inner interfaces +(class_body + (interface_declaration + (modifiers)? @context + "interface" @context + name: (identifier) @name) @item) + +; Inner enums +(class_body + (enum_declaration + (modifiers)? @context + "enum" @context + name: (identifier) @name) @item) +` + +/** + * Query patterns by language - embedded as strings for portability + */ +export const QUERY_PATTERNS: Record = { + typescript: TYPESCRIPT_QUERY, + javascript: JAVASCRIPT_QUERY, + python: PYTHON_QUERY, + rust: RUST_QUERY, + go: GO_QUERY, + java: JAVA_QUERY, +} + +// ============================================================================= +// Query Loading & Caching +// ============================================================================= + +/** + * Cache for compiled queries by language + */ +const queryCache: Map = new Map() + +/** + * Compile a query string for a specific language + * + * @param language - The programming language + * @param tsLanguage - The loaded tree-sitter language grammar + * @param queryString - The query pattern string + * @returns The compiled Query + */ +function compileQuery( + language: Language, + tsLanguage: TSLanguage, + queryString: string, +): Effect.Effect { + return Effect.try({ + try: () => new Query(tsLanguage, queryString), + catch: (error: unknown) => + new QueryLoadError( + language, + `Failed to compile query: ${error instanceof Error ? error.message : String(error)}`, + error, + ), + }) +} /** * Load a tree-sitter query for entity extraction * - * @param language - The programming language to load query for - * @returns Effect yielding the compiled query, or null if no query exists + * Loads and compiles the query for the given language. Queries are cached + * after first compilation. 
* - * TODO: Implement query loading from .scm files + * @param language - The programming language to load query for + * @returns Effect yielding the compiled query, or null if no query exists for the language */ export const loadQuery = ( - _language: Language, -): Effect.Effect => { - // TODO: Implement query loading - // 1. Look up query file path for language - // 2. Load .scm file contents - // 3. Compile query using tree-sitter - return Effect.succeed(null) + language: Language, +): Effect.Effect => { + return Effect.gen(function* () { + // Check cache first + const cached = queryCache.get(language) + if (cached) { + return cached + } + + // Get the query pattern for this language + const queryPattern = QUERY_PATTERNS[language] + if (!queryPattern) { + return null + } + + // Load the language grammar + const tsLanguage = yield* getLanguageGrammar(language) + + // Compile the query + const query = yield* compileQuery(language, tsLanguage, queryPattern) + + // Cache for future use + queryCache.set(language, query) + + return query + }) +} + +/** + * Load a query (public async API) + * + * @param language - The language to load the query for + * @returns Promise resolving to the compiled query, or null if no query exists + */ +export async function loadQueryAsync( + language: Language, +): Promise { + return Effect.runPromise(loadQuery(language)) +} + +/** + * Clear the query cache (useful for testing) + */ +export function clearQueryCache(): void { + queryCache.clear() } /** - * Query patterns by language - * TODO: Populate with actual query patterns + * Synchronously load a cached query + * + * This only returns a query if it's already been compiled and cached. + * Use this for sync code paths where you can't await query loading. 
+ * + * @param language - The language to get the cached query for + * @returns The cached query, or null if not cached + */ +export function loadQuerySync(language: Language): CompiledQuery | null { + return queryCache.get(language) ?? null +} + +// ============================================================================= +// Query Execution +// ============================================================================= + +/** + * Execute a query against a syntax tree + * + * @param query - The compiled query to execute + * @param tree - The syntax tree to query + * @param startNode - Optional node to start querying from (defaults to root) + * @returns Effect yielding the query result with matches and captures + */ +export const executeQuery = ( + query: CompiledQuery, + tree: SyntaxTree, + startNode?: SyntaxNode, +): Effect.Effect => { + return Effect.try({ + try: () => { + const node = startNode ?? tree.rootNode + + // Execute the query and get all matches + const matches = query.matches(node) + + // Convert to our QueryMatch format + const queryMatches: QueryMatch[] = matches.map((match: TSQueryMatch) => ({ + patternIndex: match.patternIndex, + captures: match.captures.map((capture: TSQueryCapture) => ({ + name: capture.name, + node: capture.node, + patternIndex: match.patternIndex, + })), + })) + + // Also collect all captures as a flat list + const allCaptures: QueryCapture[] = queryMatches.flatMap( + (match) => match.captures, + ) + + return { + matches: queryMatches, + captures: allCaptures, + } + }, + catch: (error: unknown) => + new QueryExecutionError( + `Query execution failed: ${error instanceof Error ? 
error.message : String(error)}`, + error, + ), + }) +} + +/** + * Execute a query and get captures (public async API) + * + * @param query - The compiled query to execute + * @param tree - The syntax tree to query + * @param startNode - Optional node to start querying from + * @returns Promise resolving to the query result + */ +export async function executeQueryAsync( + query: CompiledQuery, + tree: SyntaxTree, + startNode?: SyntaxNode, +): Promise { + return Effect.runPromise(executeQuery(query, tree, startNode)) +} + +// ============================================================================= +// Utility Functions +// ============================================================================= + +/** + * Get all captures with a specific name from a query result + * + * @param result - The query result + * @param captureName - The capture name to filter by (e.g., "name", "item") + * @returns Array of captures matching the name + */ +export function getCapturesByName( + result: QueryResult, + captureName: string, +): QueryCapture[] { + return result.captures.filter((capture) => capture.name === captureName) +} + +/** + * Get all matches that have an "item" capture (entity nodes) + * + * @param result - The query result + * @returns Array of matches that contain entity items + */ +export function getEntityMatches(result: QueryResult): QueryMatch[] { + return result.matches.filter((match) => + match.captures.some((capture) => capture.name === 'item'), + ) +} + +/** + * Extract the entity node and name node from a match + * + * @param match - A query match + * @returns Object with item and name nodes, or null if not found + */ +export function extractEntityFromMatch(match: QueryMatch): { + itemNode: SyntaxNode + nameNode: SyntaxNode | null + contextNodes: SyntaxNode[] + annotationNodes: SyntaxNode[] +} | null { + const itemCapture = match.captures.find((c) => c.name === 'item') + if (!itemCapture) { + return null + } + + const nameCapture = 
match.captures.find((c) => c.name === 'name') + const contextCaptures = match.captures.filter((c) => c.name === 'context') + const annotationCaptures = match.captures.filter( + (c) => c.name === 'annotation', + ) + + return { + itemNode: itemCapture.node, + nameNode: nameCapture?.node ?? null, + contextNodes: contextCaptures.map((c) => c.node), + annotationNodes: annotationCaptures.map((c) => c.node), + } +} + +/** + * Check if a language has a query available + * + * @param language - The language to check + * @returns True if a query is available for the language */ -export const QUERY_PATTERNS: Partial> = { - // TODO: Add query patterns for each language +export function hasQueryForLanguage(language: Language): boolean { + return language in QUERY_PATTERNS } diff --git a/src/extract/signature.ts b/src/extract/signature.ts index 612ab0d..60cfb6b 100644 --- a/src/extract/signature.ts +++ b/src/extract/signature.ts @@ -1,6 +1,306 @@ import { Effect } from 'effect' import type { EntityType, Language, SyntaxNode } from '../types' +/** + * Body delimiters by language - the character that marks the start of the body + */ +export const BODY_DELIMITERS: Record = { + typescript: '{', + javascript: '{', + python: ':', + rust: '{', + go: '{', + java: '{', +} + +/** + * Node types that represent identifiers/names by language + * Order matters - first match wins + */ +const NAME_NODE_TYPES: readonly string[] = [ + 'name', + 'identifier', + 'type_identifier', + 'property_identifier', +] + +/** + * Extract the name of an entity from its AST node + * + * @param node - The AST node representing the entity + * @param _language - The programming language (unused but kept for consistency) + * @returns The entity name, or null if not found + */ +export const extractName = ( + node: SyntaxNode, + _language: Language, +): string | null => { + // Try to find a named child that is an identifier + for (const nameType of NAME_NODE_TYPES) { + const nameNode = 
node.childForFieldName(nameType) + if (nameNode) { + return nameNode.text + } + } + + // Try to find any child with a name-like type + for (const child of node.children) { + if (NAME_NODE_TYPES.includes(child.type)) { + return child.text + } + } + + // For some languages, try the first identifier child + for (const child of node.children) { + if (child.type === 'identifier' || child.type === 'type_identifier') { + return child.text + } + } + + return null +} + +/** + * Find the position of the body delimiter in a signature + * + * This handles nested brackets/parens/generics to avoid matching + * delimiters inside parameter lists or type annotations. + */ +const findBodyDelimiterPos = (text: string, delimiter: string): number => { + // Handle nested brackets/parens before the body delimiter + let parenDepth = 0 + let bracketDepth = 0 + let angleDepth = 0 + let inString = false + let stringChar = '' + + for (let i = 0; i < text.length; i++) { + const char = text[i] + const prevChar = i > 0 ? text[i - 1] : '' + + // Track string literals to avoid matching inside them + if ((char === '"' || char === "'" || char === '`') && prevChar !== '\\') { + if (!inString) { + inString = true + stringChar = char + } else if (char === stringChar) { + inString = false + stringChar = '' + } + continue + } + + if (inString) continue + + // Track nested structures + if (char === '(') { + parenDepth++ + } else if (char === ')') { + parenDepth-- + } else if (char === '[') { + bracketDepth++ + } else if (char === ']') { + bracketDepth-- + } else if (char === '<') { + // Only count as generic bracket if followed by identifier or another < + // This helps avoid matching comparison operators like <, <=, << + const nextChar = text[i + 1] ?? 
'' + if (/[A-Za-z_<]/.test(nextChar) || nextChar === '>' || nextChar === ' ') { + angleDepth++ + } + } else if (char === '>' && angleDepth > 0) { + // Only decrement if we're tracking angle brackets + angleDepth-- + } + + // Only match delimiter at depth 0 + if ( + char === delimiter && + parenDepth === 0 && + bracketDepth === 0 && + angleDepth === 0 + ) { + return i + } + } + + return -1 +} + +/** + * Node types that represent body/block structures + */ +const BODY_NODE_TYPES: readonly string[] = [ + 'block', + 'statement_block', + 'class_body', + 'interface_body', + 'enum_body', +] + +/** + * Try to extract signature using AST body field + * Look for 'body' or block-like child and extract everything before it + * Returns null if body node not found + */ +const tryExtractSignatureFromBody = ( + node: SyntaxNode, + code: string, + language: Language, +): string | null => { + // Find the body/block child node + const bodyNode = + node.childForFieldName('body') || + node.children.find((c) => BODY_NODE_TYPES.includes(c.type)) + + if (!bodyNode) { + return null + } + + // Extract from node start to body start + let signature = code.slice(node.startIndex, bodyNode.startIndex).trim() + + // For Python, remove trailing colon + if (language === 'python' && signature.endsWith(':')) { + signature = signature.slice(0, -1) + } + + // For arrow functions, remove trailing => + if (signature.endsWith('=>')) { + signature = signature.slice(0, -2).trim() + } + + return cleanSignature(signature) +} + +/** + * Extract signature for function/method entities + * Extract from start to opening brace '{' (or ':' for Python) + */ +const extractFunctionSignature = ( + node: SyntaxNode, + language: Language, + code: string, +): string => { + // Try AST-based extraction first (more reliable for languages with complex type syntax) + const astSignature = tryExtractSignatureFromBody(node, code, language) + if (astSignature) { + return astSignature + } + + // Fallback to text-based extraction + 
const nodeText = code.slice(node.startIndex, node.endIndex) + const delimiter = BODY_DELIMITERS[language] + const delimPos = findBodyDelimiterPos(nodeText, delimiter) + + if (delimPos === -1) { + // No body delimiter found - might be a declaration without body + // Return the full node text cleaned up + return cleanSignature(nodeText) + } + + // Extract up to (but not including) the body delimiter + const signature = nodeText.slice(0, delimPos).trim() + return cleanSignature(signature) +} + +/** + * Extract signature for class/interface entities + * Extract the declaration line (up to opening brace or first line) + */ +const extractClassSignature = ( + node: SyntaxNode, + language: Language, + code: string, +): string => { + // Try AST-based extraction first + const astSignature = tryExtractSignatureFromBody(node, code, language) + if (astSignature) { + return astSignature + } + + // Fallback to text-based extraction + const nodeText = code.slice(node.startIndex, node.endIndex) + const delimiter = BODY_DELIMITERS[language] + const delimPos = findBodyDelimiterPos(nodeText, delimiter) + + if (delimPos === -1) { + // No body - return first line or full text + const firstNewline = nodeText.indexOf('\n') + if (firstNewline !== -1) { + return cleanSignature(nodeText.slice(0, firstNewline)) + } + return cleanSignature(nodeText) + } + + // Extract up to (but not including) the opening brace + const signature = nodeText.slice(0, delimPos).trim() + return cleanSignature(signature) +} + +/** + * Extract signature for type/enum entities + * Extract until '=' or '{' + */ +const extractTypeSignature = ( + node: SyntaxNode, + language: Language, + code: string, +): string => { + const nodeText = code.slice(node.startIndex, node.endIndex) + + // For type aliases, cut at the first '=' outside (), [] and <> so a generic default such as <T = string> is kept; Math.max falls back to the first raw '=' when the depth-aware scan finds none (it returns -1) + const equalsPos = Math.max(findBodyDelimiterPos(nodeText, '='), nodeText.indexOf('=')) + const bracePos = findBodyDelimiterPos(nodeText, '{') + const colonPos = + language === 'python' ? 
findBodyDelimiterPos(nodeText, ':') : -1 + + // Find the earliest delimiter + let delimPos = -1 + if (equalsPos !== -1) delimPos = equalsPos + if (bracePos !== -1 && (delimPos === -1 || bracePos < delimPos)) + delimPos = bracePos + if (colonPos !== -1 && (delimPos === -1 || colonPos < delimPos)) + delimPos = colonPos + + if (delimPos === -1) { + // No delimiter found - return first line or full text + const firstNewline = nodeText.indexOf('\n') + if (firstNewline !== -1) { + return cleanSignature(nodeText.slice(0, firstNewline)) + } + return cleanSignature(nodeText) + } + + const signature = nodeText.slice(0, delimPos).trim() + return cleanSignature(signature) +} + +/** + * Extract signature for import/export entities + * Extract the full statement + */ +const extractImportExportSignature = ( + node: SyntaxNode, + code: string, +): string => { + const nodeText = code.slice(node.startIndex, node.endIndex) + return cleanSignature(nodeText) +} + +/** + * Clean up a signature string: + * - Collapse multiple whitespace to single space + * - Normalize multi-line to single line + * - Trim leading/trailing whitespace + */ +const cleanSignature = (signature: string): string => { + return signature + .replace(/[\r\n]+/g, ' ') // Replace newlines with space + .replace(/\s+/g, ' ') // Collapse multiple whitespace + .trim() +} + /** * Extract the signature of an entity from its AST node * @@ -9,38 +309,50 @@ import type { EntityType, Language, SyntaxNode } from '../types' * @param language - The programming language * @param code - The source code * @returns Effect yielding the signature string - * - * TODO: Implement signature extraction for different entity types */ export const extractSignature = ( - _node: SyntaxNode, - _entityType: EntityType, - _language: Language, - _code: string, + node: SyntaxNode, + entityType: EntityType, + language: Language, + code: string, ): Effect.Effect => { - // TODO: Implement signature extraction - // Different strategies based on entity 
type: - // - function: extract until opening brace/colon - // - class: extract declaration line - // - interface/type: extract until opening brace or = - // - import/export: extract full statement - return Effect.succeed('') + return Effect.sync(() => { + switch (entityType) { + case 'function': + case 'method': + return extractFunctionSignature(node, language, code) + + case 'class': + case 'interface': + return extractClassSignature(node, language, code) + + case 'type': + case 'enum': + return extractTypeSignature(node, language, code) + + case 'import': + case 'export': + return extractImportExportSignature(node, code) + + default: { + // Fallback: extract first line + const nodeText = code.slice(node.startIndex, node.endIndex) + const firstNewline = nodeText.indexOf('\n') + if (firstNewline !== -1) { + return cleanSignature(nodeText.slice(0, firstNewline)) + } + return cleanSignature(nodeText) + } + } + }) } /** - * Extract the name of an entity from its AST node + * Get the body delimiter for a language * - * @param node - The AST node representing the entity * @param language - The programming language - * @returns The entity name, or null if not found - * - * TODO: Implement name extraction + * @returns The character that marks the start of a body block */ -export const extractName = ( - _node: SyntaxNode, - _language: Language, -): string | null => { - // TODO: Implement name extraction - // Look for identifier/name child nodes based on language - return null +export const getBodyDelimiter = (language: Language): string => { + return BODY_DELIMITERS[language] } diff --git a/test/extract.test.ts b/test/extract.test.ts new file mode 100644 index 0000000..fa5de4d --- /dev/null +++ b/test/extract.test.ts @@ -0,0 +1,838 @@ +import { beforeAll, describe, expect, test } from 'bun:test' +import { Effect } from 'effect' +import { + clearQueryCache, + ENTITY_NODE_TYPES, + extractByNodeTypes, + extractEntitiesAsync, + extractEntitiesSync, + getEntityType, + 
loadQuery, + loadQuerySync, +} from '../src/extract' +import { + extractDocstring, + isDocComment, + parseDocstring, +} from '../src/extract/docstring' +import { extractName, extractSignature } from '../src/extract/signature' +import { initializeParser, parseCode } from '../src/parser' +import type { Language } from '../src/types' + +// ============================================================================ +// Setup +// ============================================================================ + +beforeAll(async () => { + await initializeParser() +}) + +// ============================================================================ +// Query Loading Tests +// ============================================================================ + +describe('query loading', () => { + beforeAll(() => { + clearQueryCache() + }) + + test('loadQuery loads and caches TypeScript query', async () => { + const query = await Effect.runPromise(loadQuery('typescript')) + expect(query).not.toBeNull() + + // Second call should return cached + const cached = await Effect.runPromise(loadQuery('typescript')) + expect(cached).toBe(query) + }) + + test('loadQuery loads queries for all supported languages', async () => { + const languages: Language[] = [ + 'typescript', + 'javascript', + 'python', + 'rust', + 'go', + 'java', + ] + + for (const lang of languages) { + const query = await Effect.runPromise(loadQuery(lang)) + expect(query).not.toBeNull() + } + }) + + test('loadQuerySync returns null when query not cached', () => { + clearQueryCache() + const query = loadQuerySync('typescript') + // Not cached yet, should return null + expect(query).toBeNull() + }) + + test('loadQuerySync returns cached query after loadQuery', async () => { + clearQueryCache() + + // First load with async + await Effect.runPromise(loadQuery('javascript')) + + // Now sync should return it + const cached = loadQuerySync('javascript') + expect(cached).not.toBeNull() + }) +}) + +// 
============================================================================ +// Sync/Async Behavior Consistency Tests +// ============================================================================ + +describe('extractEntities sync/async consistency', () => { + test('extractEntitiesSync uses cached query when available', async () => { + clearQueryCache() + + const code = ` +function greet(name: string): string { + return \`Hello, \${name}!\` +} +` + const result = await parseCode(code, 'typescript') + const rootNode = result.tree.rootNode + + // First, preload the query + await Effect.runPromise(loadQuery('typescript')) + + // Now sync should use the cached query + const entitiesSync = extractEntitiesSync(rootNode, 'typescript', code) + + // Compare with async version + const entitiesAsync = await extractEntitiesAsync( + rootNode, + 'typescript', + code, + ) + + // Both should find the same entities + expect(entitiesSync.length).toBe(entitiesAsync.length) + expect(entitiesSync.map((e) => e.name)).toEqual( + entitiesAsync.map((e) => e.name), + ) + }) + + test('extractEntitiesSync falls back to node types when query not cached', () => { + clearQueryCache() + + const code = ` +function test() { + return 1 +} +` + // parseCode is async: wrap it in an Effect and return the resulting promise so the runner awaits the assertions below + const parseEffect = Effect.gen(function* () { + const result = yield* Effect.tryPromise(() => + parseCode(code, 'typescript'), + ) + return result + }) + + return Effect.runPromise(parseEffect).then((result) => { + const rootNode = result.tree.rootNode + // With no cached query, should still work via fallback + const entities = extractEntitiesSync(rootNode, 'typescript', code) + expect(entities.length).toBeGreaterThan(0) + }) + }) +}) + +// ============================================================================ +// Entity Extraction Tests +// ============================================================================ + +describe('extractEntities', () => { + test('extracts TypeScript function declaration', async () 
=> { + const code = ` +function greet(name: string): string { + return \`Hello, \${name}!\` +} +` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + expect(entities.length).toBeGreaterThan(0) + const fn = entities.find((e) => e.name === 'greet') + expect(fn).toBeDefined() + expect(fn?.type).toBe('function') + expect(fn?.signature).toContain('greet') + }) + + test('extracts TypeScript class with methods', async () => { + const code = ` +class Calculator { + add(a: number, b: number): number { + return a + b + } + + subtract(a: number, b: number): number { + return a - b + } +} +` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + const cls = entities.find((e) => e.name === 'Calculator') + expect(cls).toBeDefined() + expect(cls?.type).toBe('class') + + const methods = entities.filter((e) => e.type === 'method') + expect(methods.length).toBe(2) + expect(methods.map((m) => m.name)).toContain('add') + expect(methods.map((m) => m.name)).toContain('subtract') + }) + + test('extracts TypeScript interface', async () => { + const code = ` +interface User { + name: string + age: number +} +` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + const iface = entities.find((e) => e.name === 'User') + expect(iface).toBeDefined() + expect(iface?.type).toBe('interface') + }) + + test('extracts Python function with docstring', async () => { + const code = ` +def greet(name): + """Say hello to someone.""" + return f"Hello, {name}!" 
+` + const result = await parseCode(code, 'python') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'python', + code, + ) + + const fn = entities.find((e) => e.name === 'greet') + expect(fn).toBeDefined() + expect(fn?.type).toBe('function') + expect(fn?.docstring).toBe('Say hello to someone.') + }) + + test('extracts Python class', async () => { + const code = ` +class Calculator: + """A simple calculator.""" + + def add(self, a, b): + return a + b +` + const result = await parseCode(code, 'python') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'python', + code, + ) + + const cls = entities.find((e) => e.name === 'Calculator') + expect(cls).toBeDefined() + expect(cls?.type).toBe('class') + }) + + test('extracts Rust function', async () => { + const code = ` +fn add(a: i32, b: i32) -> i32 { + a + b +} +` + const result = await parseCode(code, 'rust') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'rust', + code, + ) + + const fn = entities.find((e) => e.name === 'add') + expect(fn).toBeDefined() + expect(fn?.type).toBe('function') + }) + + test('extracts Go function', async () => { + const code = ` +package main + +func add(a, b int) int { + return a + b +} +` + const result = await parseCode(code, 'go') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'go', + code, + ) + + const fn = entities.find((e) => e.name === 'add') + expect(fn).toBeDefined() + expect(fn?.type).toBe('function') + }) + + test('extracts Java class and method', async () => { + const code = ` +public class Calculator { + public int add(int a, int b) { + return a + b; + } +} +` + const result = await parseCode(code, 'java') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'java', + code, + ) + + const cls = entities.find((e) => e.name === 'Calculator') + expect(cls).toBeDefined() + expect(cls?.type).toBe('class') + }) + + test('tracks parent relationships for nested 
entities', async () => { + const code = ` +class Outer { + inner() { + return 1 + } +} +` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + const method = entities.find((e) => e.name === 'inner') + expect(method?.parent).toBe('Outer') + }) +}) + +// ============================================================================ +// Fallback Extraction Tests (Iterative Walk) +// ============================================================================ + +describe('fallback extraction (iterative)', () => { + test('handles deeply nested code without stack overflow', async () => { + // Generate deeply nested functions (more reliable nesting) + let code = '' + const depth = 50 + + for (let i = 0; i < depth; i++) { + code += `function level${i}() {\n` + } + code += 'return 1\n' + for (let i = 0; i < depth; i++) { + code += '}\n' + } + + const result = await parseCode(code, 'typescript') + + // Should not throw stack overflow + const entities = await Effect.runPromise( + extractByNodeTypes(result.tree.rootNode, 'typescript', code), + ) + + // Should find nested functions (exact count may vary based on nesting support) + const functions = entities.filter((e) => e.type === 'function') + expect(functions.length).toBeGreaterThan(0) + // At minimum the outer function should be found + expect(functions.some((f) => f.name === 'level0')).toBe(true) + }) + + test('extractByNodeTypes extracts entities correctly', async () => { + const code = ` +function foo() { return 1 } +class Bar { + baz() { return 2 } +} +` + const result = await parseCode(code, 'typescript') + const entities = await Effect.runPromise( + extractByNodeTypes(result.tree.rootNode, 'typescript', code), + ) + + expect(entities.find((e) => e.name === 'foo')).toBeDefined() + expect(entities.find((e) => e.name === 'Bar')).toBeDefined() + expect(entities.find((e) => e.name === 'baz')).toBeDefined() + }) + + 
test('getEntityType maps node types correctly', () => { + expect(getEntityType('function_declaration')).toBe('function') + expect(getEntityType('method_definition')).toBe('method') + expect(getEntityType('class_declaration')).toBe('class') + expect(getEntityType('interface_declaration')).toBe('interface') + expect(getEntityType('unknown_type')).toBeNull() + }) + + test('ENTITY_NODE_TYPES contains all supported languages', () => { + const languages: Language[] = [ + 'typescript', + 'javascript', + 'python', + 'rust', + 'go', + 'java', + ] + + for (const lang of languages) { + expect(ENTITY_NODE_TYPES[lang]).toBeDefined() + expect(ENTITY_NODE_TYPES[lang].length).toBeGreaterThan(0) + } + }) +}) + +// ============================================================================ +// Signature Extraction Tests +// ============================================================================ + +describe('signature extraction', () => { + test('extracts TypeScript function signature', async () => { + const code = `function greet(name: string): string { + return \`Hello, \${name}!\` +}` + const result = await parseCode(code, 'typescript') + const fnNode = result.tree.rootNode.namedChildren[0] + + const signature = await Effect.runPromise( + extractSignature(fnNode, 'function', 'typescript', code), + ) + + expect(signature).toBe('function greet(name: string): string') + }) + + test('extracts Python function signature (stops at colon)', async () => { + const code = `def greet(name): + return f"Hello, {name}!"` + const result = await parseCode(code, 'python') + const fnNode = result.tree.rootNode.namedChildren[0] + + const signature = await Effect.runPromise( + extractSignature(fnNode, 'function', 'python', code), + ) + + expect(signature).toBe('def greet(name)') + }) + + test('handles generic type parameters correctly', async () => { + const code = `function identity<T>(arg: T): T { + return arg +}` + const result = await parseCode(code, 'typescript') + const fnNode = 
result.tree.rootNode.namedChildren[0] + + const signature = await Effect.runPromise( + extractSignature(fnNode, 'function', 'typescript', code), + ) + + // Should keep the generic parameter list in the signature + expect(signature).toContain('<T>') + expect(signature).toContain('identity') + }) + + test('handles comparison operators in signatures (angle bracket fix)', async () => { + // This tests that < in comparisons doesn't break generic tracking + const code = `function compare(a: number, b: number): boolean { + return a < b +}` + const result = await parseCode(code, 'typescript') + const fnNode = result.tree.rootNode.namedChildren[0] + + const signature = await Effect.runPromise( + extractSignature(fnNode, 'function', 'typescript', code), + ) + + // Should extract signature correctly without being confused by < in body + expect(signature).toBe('function compare(a: number, b: number): boolean') + }) + + test('extracts class signature', async () => { + const code = `class Calculator extends Base implements ICalc { + add(a: number, b: number): number { + return a + b + } +}` + const result = await parseCode(code, 'typescript') + const classNode = result.tree.rootNode.namedChildren[0] + + const signature = await Effect.runPromise( + extractSignature(classNode, 'class', 'typescript', code), + ) + + expect(signature).toContain('class Calculator') + expect(signature).toContain('extends Base') + expect(signature).toContain('implements ICalc') + }) + + test('cleans multi-line signatures to single line', async () => { + const code = `function multiLine( + param1: string, + param2: number, + param3: boolean +): void { + console.log(param1) +}` + const result = await parseCode(code, 'typescript') + const fnNode = result.tree.rootNode.namedChildren[0] + + const signature = await Effect.runPromise( + extractSignature(fnNode, 'function', 'typescript', code), + ) + + // Should not contain newlines + expect(signature).not.toContain('\n') + // Should have all params + 
expect(signature).toContain('param1') + expect(signature).toContain('param2') + expect(signature).toContain('param3') + }) + + test('extractName finds identifier in node', async () => { + const code = `function greet() { return 1 }` + const result = await parseCode(code, 'typescript') + const fnNode = result.tree.rootNode.namedChildren[0] + + const name = extractName(fnNode, 'typescript') + expect(name).toBe('greet') + }) +}) + +// ============================================================================ +// Docstring Extraction Tests +// ============================================================================ + +describe('docstring extraction', () => { + test('extracts JSDoc for TypeScript function', async () => { + const code = `/** + * Greet someone by name. + * @param name The name to greet + */ +function greet(name: string): string { + return \`Hello, \${name}!\` +}` + const result = await parseCode(code, 'typescript') + const fnNode = result.tree.rootNode.namedChildren[1] // Skip comment, get function + + const docstring = await Effect.runPromise( + extractDocstring(fnNode, 'typescript', code), + ) + + expect(docstring).toContain('Greet someone by name') + expect(docstring).toContain('@param name') + }) + + test('extracts Python docstring from function body', async () => { + const code = `def greet(name): + """ + Say hello to someone. + + Args: + name: The person to greet + """ + return f"Hello, {name}!"` + const result = await parseCode(code, 'python') + const fnNode = result.tree.rootNode.namedChildren[0] + + const docstring = await Effect.runPromise( + extractDocstring(fnNode, 'python', code), + ) + + expect(docstring).toContain('Say hello to someone') + expect(docstring).toContain('Args:') + }) + + test('extracts Rust doc comment', async () => { + const code = `/// Add two numbers together. +/// Returns the sum. 
+fn add(a: i32, b: i32) -> i32 { + a + b +}` + const result = await parseCode(code, 'rust') + // Find the function node + const fnNode = result.tree.rootNode.namedChildren.find( + (n) => n.type === 'function_item', + ) + + // Fail the test instead of silently skipping the assertion when the node is missing + if (!fnNode) { + throw new Error('expected a function_item node in the parsed Rust source') + } + + const docstring = await Effect.runPromise( + extractDocstring(fnNode, 'rust', code), + ) + + expect(docstring).toContain('Add two numbers') + }) + + test('extracts Go comment', async () => { + const code = `// Add returns the sum of a and b. +func Add(a, b int) int { + return a + b +}` + const result = await parseCode(code, 'go') + const fnNode = result.tree.rootNode.namedChildren.find( + (n) => n.type === 'function_declaration', + ) + + // Fail the test instead of silently skipping the assertion when the node is missing + if (!fnNode) { + throw new Error('expected a function_declaration node in the parsed Go source') + } + + const docstring = await Effect.runPromise( + extractDocstring(fnNode, 'go', code), + ) + + expect(docstring).toContain('Add returns the sum') + }) + + test('extracts Javadoc', async () => { + const code = `/** + * Add two integers. + * @param a First number + * @param b Second number + * @return The sum + */ +public int add(int a, int b) { + return a + b; +}` + const result = await parseCode(code, 'java') + const methodNode = result.tree.rootNode.namedChildren.find( + (n) => n.type === 'method_declaration', + ) + + // Fail the test instead of silently skipping the assertion when the node is missing + if (!methodNode) { + throw new Error('expected a method_declaration node in the parsed Java source') + } + + const docstring = await Effect.runPromise( + extractDocstring(methodNode, 'java', code), + ) + + expect(docstring).toContain('Add two integers') + }) + + test('returns null when no docstring present', async () => { + const code = `function noDoc() { return 1 }` + const result = await parseCode(code, 'typescript') + const fnNode = result.tree.rootNode.namedChildren[0] + + const docstring = await Effect.runPromise( + extractDocstring(fnNode, 'typescript', code), + ) + + expect(docstring).toBeNull() + }) +}) + +// ============================================================================ +// isDocComment Tests +// ============================================================================ + +describe('isDocComment', () => { + 
test('recognizes JSDoc comments', () => { + expect(isDocComment('/** This is JSDoc */', 'typescript')).toBe(true) + expect(isDocComment('/* Regular comment */', 'typescript')).toBe(false) + expect(isDocComment('// Line comment', 'typescript')).toBe(false) + }) + + test('recognizes Python docstrings', () => { + expect(isDocComment('"""Docstring"""', 'python')).toBe(true) + expect(isDocComment("'''Docstring'''", 'python')).toBe(true) + expect(isDocComment('r"""Raw docstring"""', 'python')).toBe(true) + expect(isDocComment('# Comment', 'python')).toBe(false) + }) + + test('recognizes Rust doc comments', () => { + expect(isDocComment('/// Doc comment', 'rust')).toBe(true) + expect(isDocComment('//! Inner doc', 'rust')).toBe(true) + expect(isDocComment('// Regular comment', 'rust')).toBe(false) + }) + + test('recognizes Go comments', () => { + // Go considers any // comment before a declaration as doc + expect(isDocComment('// Comment', 'go')).toBe(true) + }) + + test('recognizes Javadoc', () => { + expect(isDocComment('/** Javadoc */', 'java')).toBe(true) + expect(isDocComment('/* Block comment */', 'java')).toBe(false) + }) +}) + +// ============================================================================ +// parseDocstring Tests +// ============================================================================ + +describe('parseDocstring', () => { + test('parses JSDoc and removes markers', () => { + const input = `/** + * This is a description. + * @param name The name + */` + const parsed = parseDocstring(input, 'typescript') + + expect(parsed).not.toContain('/**') + expect(parsed).not.toContain('*/') + expect(parsed).toContain('This is a description') + expect(parsed).toContain('@param name') + }) + + test('parses Python docstring and dedents', () => { + const input = `""" + This is indented. + So is this. 
+ """` + const parsed = parseDocstring(input, 'python') + + expect(parsed).not.toContain('"""') + expect(parsed).toContain('This is indented') + // Should be dedented + expect(parsed).not.toMatch(/^\s{4}This/) + }) + + test('parses Rust doc comments and removes ///', () => { + const input = `/// First line. +/// Second line.` + const parsed = parseDocstring(input, 'rust') + + expect(parsed).not.toContain('///') + expect(parsed).toContain('First line') + expect(parsed).toContain('Second line') + }) + + test('parses Go comments and removes //', () => { + const input = `// First line. +// Second line.` + const parsed = parseDocstring(input, 'go') + + expect(parsed).not.toContain('//') + expect(parsed).toContain('First line') + expect(parsed).toContain('Second line') + }) +}) + +// ============================================================================ +// Edge Cases +// ============================================================================ + +describe('extraction edge cases', () => { + test('handles anonymous functions via variable declaration', async () => { + // Note: anonymous functions themselves aren't extracted as entities, + // but top-level variable declarations are + const code = `const fn = function() { return 1 }` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + // Query extracts top-level const declarations + // If no entities found, that's acceptable - the function is anonymous + // What matters is it doesn't crash + expect(Array.isArray(entities)).toBe(true) + }) + + test('handles arrow functions via variable declaration', async () => { + // Arrow functions assigned to const are extracted as the variable + const code = `const add = (a: number, b: number) => a + b` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + // Queries should 
capture top-level const with arrow function value + // The entity would be named 'add' (the variable name) + expect(Array.isArray(entities)).toBe(true) + }) + + test('handles arrow functions - no crash', async () => { + // Arrow functions may or may not be extracted depending on query patterns + // The key is the system handles them without crashing + // Uses an arrow callback that is not bound to a variable, so this input differs from the variable-declaration case + const code = `[1, 2, 3].map((x) => x * 2)` + const result = await parseCode(code, 'typescript') + + // Should not throw + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + // Result should be an array (may be empty if arrow function isn't captured) + expect(Array.isArray(entities)).toBe(true) + }) + + test('handles async functions', async () => { + const code = `async function fetchData(): Promise<Response> { + return await fetch('/api') +}` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + const fn = entities.find((e) => e.name === 'fetchData') + expect(fn).toBeDefined() + expect(fn?.signature).toContain('async') + }) + + test('handles export declarations', async () => { + const code = `export function publicFn() { return 1 } +export default function defaultFn() { return 2 }` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + expect(entities.length).toBeGreaterThan(0) + }) + + test('handles empty file', async () => { + const code = '' + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'typescript', + code, + ) + + expect(entities).toEqual([]) + }) + + test('handles file with only comments', async () => { + const code = `// Just a comment +/* Another comment */` + const result = await parseCode(code, 'typescript') + const entities = await extractEntitiesAsync( + 
result.tree.rootNode, + 'typescript', + code, + ) + + expect(entities).toEqual([]) + }) +}) From b3e0cf2b6197e96d2d84198d0978f3e92ab7d1bd Mon Sep 17 00:00:00 2001 From: Shoubhit Dash Date: Wed, 17 Dec 2025 01:36:20 +0530 Subject: [PATCH 2/2] fix: remove non-null assertions in rebuild.ts --- src/chunking/rebuild.ts | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/chunking/rebuild.ts b/src/chunking/rebuild.ts index 6769fed..660b1d3 100644 --- a/src/chunking/rebuild.ts +++ b/src/chunking/rebuild.ts @@ -85,8 +85,15 @@ export const rebuildText = (window: ASTWindow, code: string): RebuiltText => { // Normal case: slice from first node start to last node end // Use startPosition/endPosition from nodes for optimized line calculation - const firstNode = window.nodes[0]! - const lastNode = window.nodes[window.nodes.length - 1]! + const firstNode = window.nodes[0] + const lastNode = window.nodes[window.nodes.length - 1] + if (!firstNode || !lastNode) { + return { + text: '', + byteRange: { start: 0, end: 0 }, + lineRange: { start: 0, end: 0 }, + } + } const startByte = firstNode.startIndex const endByte = lastNode.endIndex @@ -114,12 +121,26 @@ const rebuildFromLineRanges = ( window: ASTWindow, code: string, ): RebuiltText => { - const lineRanges = window.lineRanges! + const lineRanges = window.lineRanges + if (!lineRanges || lineRanges.length === 0) { + return { + text: '', + byteRange: { start: 0, end: 0 }, + lineRange: { start: 0, end: 0 }, + } + } const lineStarts = buildLineStartsTable(code) // Get the overall line range - const firstRange = lineRanges[0]! - const lastRange = lineRanges[lineRanges.length - 1]! 
+ const firstRange = lineRanges[0] + const lastRange = lineRanges[lineRanges.length - 1] + if (!firstRange || !lastRange) { + return { + text: '', + byteRange: { start: 0, end: 0 }, + lineRange: { start: 0, end: 0 }, + } + } const startLine = firstRange.start const endLine = lastRange.end @@ -127,7 +148,9 @@ const rebuildFromLineRanges = ( const startByte = lineStarts[startLine] ?? 0 // End byte is start of line after endLine, or end of file const endByte = - endLine + 1 < lineStarts.length ? lineStarts[endLine + 1]! : code.length + endLine + 1 < lineStarts.length + ? (lineStarts[endLine + 1] ?? code.length) + : code.length const text = code.slice(startByte, endByte)