Skip to content

Add csharp language support #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: feature/optimization
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.env
__pycache__/
database/
processed/
processed/
venv
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Blog Links:

[An attempt to build cursor's @codebase feature - RAG on codebases - part 2](https://blog.lancedb.com/building-rag-on-codebases-part-2/)

A powerful code search and query system that lets you explore codebases using natural language. Ask questions about your code and get contextual answers powered by LanceDB, OpenAI gpt4o-mini/gpt4o and Answerdotai's colbert-small-v1 reranker. Supports Python, Rust, JavaScript and Java with a clean, minimal UI.
A powerful code search and query system that lets you explore codebases using natural language. Ask questions about your code and get contextual answers powered by LanceDB, OpenAI gpt4o-mini/gpt4o and Answerdotai's colbert-small-v1 reranker. Supports Python, Rust, JavaScript, C# and Java with a clean, minimal UI.

> **Note**: New OpenAI/Anthropic accounts may experience token rate limits. Consider using an established account.

Expand Down
4 changes: 2 additions & 2 deletions llm_comments.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ async def process_batch_anthropic(batch_texts, semaphore):
*[
anthropic_client.messages.create(
max_tokens=1024,
system='''You are an expert programmer who specializes in java, python, rust and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''',
system='''You are an expert programmer who specializes in java, python, rust, csharp and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''',
messages=[
{"role": "user", "content": text},
],
Expand All @@ -82,7 +82,7 @@ async def process_batch_openai(batch_texts, semaphore):
openai_client.chat.completions.create(
max_tokens=1024,
messages=[
{"role" : "system", "content" : '''You are an expert programmer who specializes in java, python, rust and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''' },
{"role" : "system", "content" : '''You are an expert programmer who specializes in java, python, rust, csharp and javascript. Describe the code in less than 3 lines. Do not add fluff like "the code you provided".''' },
{"role": "user", "content": text}
],
model="gpt-3.5-turbo"
Expand Down
26 changes: 23 additions & 3 deletions preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import sys
import logging

from treesitter import Treesitter, LanguageEnum
from collections import defaultdict
import csv
Expand Down Expand Up @@ -27,7 +29,7 @@
".aws-sam",
".terraform"
]
WHITELIST_FILES = [".java", ".py", ".js", ".rs"]
WHITELIST_FILES = [".java", ".py", ".js", ".rs", ".cs"]
BLACKLIST_FILES = ["docker-compose.yml"]

NODE_TYPES = {
Expand All @@ -47,6 +49,10 @@
"class": "class_declaration",
"method": "method_definition"
},
"c_sharp": {
"class": "class_definition",
"method": "function_definition"
},
# Add other languages as needed
}

Expand All @@ -71,6 +77,11 @@
"method": "call_expression",
"child_field_name": "function"
},
"c_sharp": {
"class": "identifier",
"method": "call",
"child_field_name": "function"
},
# Add other languages as needed
}

Expand All @@ -80,6 +91,7 @@ def get_language_from_extension(file_ext):
".py": LanguageEnum.PYTHON,
".js": LanguageEnum.JAVASCRIPT,
".rs": LanguageEnum.RUST,
".cs": LanguageEnum.CSHARP,
# Add other extensions and languages as needed
}
return FILE_EXTENSION_LANGUAGE_MAP.get(file_ext)
Expand Down Expand Up @@ -115,7 +127,11 @@ def parse_code_files(file_list):
treesitter_parser = Treesitter.create_treesitter(language)
for file_path in files:
with open(file_path, "r", encoding="utf-8") as file:
code = file.read()
try:
code = file.read()
except UnicodeDecodeError:
logging.warning(f"Skipping file due to encoding issues: {file_path}")
continue
file_bytes = code.encode()
class_nodes, method_nodes = treesitter_parser.parse(file_bytes)

Expand Down Expand Up @@ -162,7 +178,11 @@ def find_references(file_list, class_names, method_names):
treesitter_parser = Treesitter.create_treesitter(language)
for file_path in files:
with open(file_path, "r", encoding="utf-8") as file:
code = file.read()
try:
code = file.read()
except UnicodeDecodeError:
logging.warning(f"Skipping file due to encoding issues: {file_path}")
continue
file_bytes = code.encode()
tree = treesitter_parser.parser.parse(file_bytes)

Expand Down
18 changes: 18 additions & 0 deletions treesitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class LanguageEnum(Enum):
PYTHON = "python"
RUST = "rust"
JAVASCRIPT = "javascript"
CSHARP = "c_sharp"
UNKNOWN = "unknown"

LANGUAGE_QUERIES = {
Expand Down Expand Up @@ -74,6 +75,23 @@ class LanguageEnum(Enum):
((comment) @comment)
"""
},
# Tree-sitter queries used for C# sources.
LanguageEnum.CSHARP: {
# Captures the name identifier of each class_declaration as @class.name.
# NOTE(review): only class_declaration is matched; struct, interface and
# record declarations are not captured by this query -- confirm intended.
'class_query': """
(class_declaration
name: (identifier) @class.name)
""",
# Captures the name identifier of both method_declaration and
# constructor_declaration nodes under the same @method.name capture.
'method_query': """
[
(method_declaration
name: (identifier) @method.name)
(constructor_declaration
name: (identifier) @method.name)
]
""",
# Captures every comment node as @comment.
'doc_query': """
((comment) @comment)
"""
}
# Add other languages as needed
}

Expand Down