diff --git a/.gitignore b/.gitignore
index ddb53d6..d34e3d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .env
 __pycache__/
 database/
-processed/
\ No newline at end of file
+processed/
+venv
diff --git a/README.md b/README.md
index 81e508b..3b1a1cf 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Blog Links:
 [An attempt to build cursor's @codebase feature - RAG on codebases - part 2](https://blog.lancedb.com/building-rag-on-codebases-part-2/)
 
-A powerful code search and query system that lets you explore codebases using natural language. Ask questions about your code and get contextual answers powered by LanceDB, OpenAI gpt4o-mini/gpt4o and Answerdotai's colbert-small-v1 reranker. Supports Python, Rust, JavaScript and Java with a clean, minimal UI.
+A powerful code search and query system that lets you explore codebases using natural language. Ask questions about your code and get contextual answers powered by LanceDB, OpenAI gpt4o-mini/gpt4o and Answerdotai's colbert-small-v1 reranker. Supports Python, Rust, JavaScript, C# and Java with a clean, minimal UI.
 
 > **Note**: New OpenAI/Anthropic accounts may experience token rate limits. Consider using an established account.
 
diff --git a/llm_comments.py b/llm_comments.py
index a094bd4..f66b1c6 100644
--- a/llm_comments.py
+++ b/llm_comments.py
@@ -64,7 +64,7 @@ async def process_batch_anthropic(batch_texts, semaphore):
             *[
                 anthropic_client.messages.create(
                     max_tokens=1024,
-                    system='''You are an expert programmer who specializes in java, python, rust and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''',
+                    system='''You are an expert programmer who specializes in java, python, rust, csharp and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''',
                     messages=[
                         {"role": "user", "content": text},
                     ],
@@ -82,7 +82,7 @@ async def process_batch_openai(batch_texts, semaphore):
             openai_client.chat.completions.create(
                 max_tokens=1024,
                 messages=[
-                    {"role" : "system", "content" : '''You are an expert programmer who specializes in java, python, rust and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''' },
+                    {"role" : "system", "content" : '''You are an expert programmer who specializes in java, python, rust, csharp and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''' },
                    {"role": "user", "content": text}
                 ],
                 model="gpt-3.5-turbo"
diff --git a/preprocessing.py b/preprocessing.py
index 7672074..13821d8 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -1,5 +1,7 @@
 import os
 import sys
+import logging
+
 from treesitter import Treesitter, LanguageEnum
 from collections import defaultdict
 import csv
@@ -27,7 +29,7 @@
     ".aws-sam",
     ".terraform"
 ]
-WHITELIST_FILES = [".java", ".py", ".js", ".rs"]
+WHITELIST_FILES = [".java", ".py", ".js", ".rs", ".cs"]
 BLACKLIST_FILES = ["docker-compose.yml"]
 
 NODE_TYPES = {
@@ -47,6 +49,10 @@
         "class": "class_declaration",
         "method": "method_definition"
     },
+    "c_sharp": {
+        "class": "class_declaration",
+        "method": "method_declaration"
+    },
     # Add other languages as needed
 }
 
@@ -71,6 +77,11 @@
         "method": "call_expression",
         "child_field_name": "function"
     },
+    "c_sharp": {
+        "class": "identifier",
+        "method": "invocation_expression",
+        "child_field_name": "function"
+    },
     # Add other languages as needed
 }
 
@@ -80,6 +91,7 @@ def get_language_from_extension(file_ext):
         ".py": LanguageEnum.PYTHON,
         ".js": LanguageEnum.JAVASCRIPT,
         ".rs": LanguageEnum.RUST,
+        ".cs": LanguageEnum.CSHARP,
         # Add other extensions and languages as needed
     }
     return FILE_EXTENSION_LANGUAGE_MAP.get(file_ext)
@@ -115,7 +127,11 @@ def parse_code_files(file_list):
         treesitter_parser = Treesitter.create_treesitter(language)
         for file_path in files:
             with open(file_path, "r", encoding="utf-8") as file:
-                code = file.read()
+                try:
+                    code = file.read()
+                except UnicodeDecodeError:
+                    logging.warning(f"Skipping file due to encoding issues: {file_path}")
+                    continue
                 file_bytes = code.encode()
 
                 class_nodes, method_nodes = treesitter_parser.parse(file_bytes)
@@ -162,7 +178,11 @@ def find_references(file_list, class_names, method_names):
         treesitter_parser = Treesitter.create_treesitter(language)
         for file_path in files:
             with open(file_path, "r", encoding="utf-8") as file:
-                code = file.read()
+                try:
+                    code = file.read()
+                except UnicodeDecodeError:
+                    logging.warning(f"Skipping file due to encoding issues: {file_path}")
+                    continue
                 file_bytes = code.encode()
 
                 tree = treesitter_parser.parser.parse(file_bytes)
diff --git a/treesitter.py b/treesitter.py
index e98d85e..91a95a1 100644
--- a/treesitter.py
+++ b/treesitter.py
@@ -11,6 +11,7 @@ class LanguageEnum(Enum):
     PYTHON = "python"
     RUST = "rust"
     JAVASCRIPT = "javascript"
+    CSHARP = "c_sharp"
     UNKNOWN = "unknown"
 
 LANGUAGE_QUERIES = {
@@ -74,6 +75,23 @@ class LanguageEnum(Enum):
         ((comment) @comment)
         """
     },
+    LanguageEnum.CSHARP: {
+        'class_query': """
+        (class_declaration
+            name: (identifier) @class.name)
+        """,
+        'method_query': """
+        [
+            (method_declaration
+                name: (identifier) @method.name)
+            (constructor_declaration
+                name: (identifier) @method.name)
+        ]
+        """,
+        'doc_query': """
+        ((comment) @comment)
+        """
+    }
+    # Add other languages as needed
 }