sankalp1999 · pandamicro · Feb 8, 2025 · Feb 10, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .env
 __pycache__/
 database/
-processed/
+processed/
+venv
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ Blog Links:
 
 [An attempt to build cursor's @codebase feature - RAG on codebases - part 2](https://blog.lancedb.com/building-rag-on-codebases-part-2/)
 
-A powerful code search and query system that lets you explore codebases using natural language. Ask questions about your code and get contextual answers powered by LanceDB, OpenAI gpt4o-mini/gpt4o and Answerdotai's colbert-small-v1 reranker. Supports Python, Rust, JavaScript and Java with a clean, minimal UI.
+A powerful code search and query system that lets you explore codebases using natural language. Ask questions about your code and get contextual answers powered by LanceDB, OpenAI gpt4o-mini/gpt4o and Answerdotai's colbert-small-v1 reranker. Supports Python, Rust, JavaScript, C# and Java with a clean, minimal UI.
 
 > **Note**: New OpenAI/Anthropic accounts may experience token rate limits. Consider using an established account.
 

diff --git a/llm_comments.py b/llm_comments.py
@@ -64,7 +64,7 @@ async def process_batch_anthropic(batch_texts, semaphore):
             *[
                 anthropic_client.messages.create(
                     max_tokens=1024,
-                    system='''You are an expert programmer who specializes in java, python, rust and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''',
+                    system='''You are an expert programmer who specializes in java, python, rust, csharp and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''',
                     messages=[
                         {"role": "user", "content": text},
                     ],
@@ -82,7 +82,7 @@ async def process_batch_openai(batch_texts, semaphore):
                 openai_client.chat.completions.create(
                     max_tokens=1024,
                     messages=[
-                        {"role" : "system", "content" : '''You are an expert programmer who specializes in java, python, rust and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''' },
+                        {"role" : "system", "content" : '''You are an expert programmer who specializes in java, python, rust, csharp and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''' },
                         {"role": "user", "content": text}
                     ],
                     model="gpt-3.5-turbo"

diff --git a/preprocessing.py b/preprocessing.py
@@ -1,5 +1,7 @@
 import os
 import sys
+import logging
+
 from treesitter import Treesitter, LanguageEnum
 from collections import defaultdict
 import csv
@@ -27,7 +29,7 @@
     ".aws-sam",
     ".terraform"
 ]
-WHITELIST_FILES = [".java", ".py", ".js", ".rs"]
+WHITELIST_FILES = [".java", ".py", ".js", ".rs", ".cs"]
 BLACKLIST_FILES = ["docker-compose.yml"]
 
 NODE_TYPES = {
@@ -47,6 +49,10 @@
         "class": "class_declaration",
         "method": "method_definition"
     },
+    "c_sharp": {
+        "class": "class_definition",
+        "method": "function_definition"
+    },
     # Add other languages as needed
 }
 
@@ -71,6 +77,11 @@
         "method": "call_expression",
         "child_field_name": "function"
     },
+    "c_sharp": {
+        "class": "identifier",
+        "method": "call",
+        "child_field_name": "function"
+    },
     # Add other languages as needed
 }
 
@@ -80,6 +91,7 @@ def get_language_from_extension(file_ext):
         ".py": LanguageEnum.PYTHON,
         ".js": LanguageEnum.JAVASCRIPT,
         ".rs": LanguageEnum.RUST,
+        ".cs": LanguageEnum.CSHARP,
         # Add other extensions and languages as needed
     }
     return FILE_EXTENSION_LANGUAGE_MAP.get(file_ext)
@@ -115,7 +127,11 @@ def parse_code_files(file_list):
         treesitter_parser = Treesitter.create_treesitter(language)
         for file_path in files:
             with open(file_path, "r", encoding="utf-8") as file:
-                code = file.read()
+                try:
+                    code = file.read()
+                except UnicodeDecodeError:
+                    logging.warning(f"Skipping file due to encoding issues: {file_path}")
+                    continue
                 file_bytes = code.encode()
                 class_nodes, method_nodes = treesitter_parser.parse(file_bytes)
 
@@ -162,7 +178,11 @@ def find_references(file_list, class_names, method_names):
         treesitter_parser = Treesitter.create_treesitter(language)
         for file_path in files:
             with open(file_path, "r", encoding="utf-8") as file:
-                code = file.read()
+                try:
+                    code = file.read()
+                except UnicodeDecodeError:
+                    logging.warning(f"Skipping file due to encoding issues: {file_path}")
+                    continue
                 file_bytes = code.encode()
                 tree = treesitter_parser.parser.parse(file_bytes)
 

diff --git a/treesitter.py b/treesitter.py
@@ -11,6 +11,7 @@ class LanguageEnum(Enum):
     PYTHON = "python"
     RUST = "rust"
     JAVASCRIPT = "javascript"
+    CSHARP = "c_sharp"
     UNKNOWN = "unknown"
 
 LANGUAGE_QUERIES = {
@@ -74,6 +75,23 @@ class LanguageEnum(Enum):
             ((comment) @comment)
         """
     },
+    LanguageEnum.CSHARP: {  # tree-sitter queries for C# classes, methods/constructors and comments
+        'class_query': """
+            (class_declaration
+                name: (identifier) @class.name)
+        """,
+        'method_query': """
+            [
+                (method_declaration
+                    name: (identifier) @method.name)
+                (constructor_declaration
+                    name: (identifier) @method.name)
+            ]
+        """,
+        'doc_query': """
+            ((comment) @comment)
+        """
+    },
     # Add other languages as needed
 }