
Commit 3582e4e

Allow AI analysis of large repositories
Use only the category of files relevant to each question (doc, source, or cmake) to answer it. If the files in the relevant category are too many or too large, split the file list into batches, generate an answer for each batch, and then generate a summary of the answers.
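A rough sketch of the batching idea described above (split_into_batches is a hypothetical standalone helper, not part of this commit; it assumes a fixed character budget per request, mirroring the INFERENCE_MAX_CHARACTERS limit introduced in the diff):

def split_into_batches(files, max_characters):
    """Group delimited file contents into batches that each stay under max_characters."""
    batches = [""]
    for filename, content in files.items():
        delimited = f"\n=== FILE: {filename} ===\n{content}\n=== END FILE: {filename} ===\n"
        if len(batches[-1]) + len(delimited) < max_characters:
            batches[-1] += delimited  # the file still fits into the current batch
        else:
            batches.append(delimited)  # start a new batch with this file
    return batches

Each batch is sent as its own request; if more than one batch was needed, the per-batch answers are merged by a final summarization request (see the diff below).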
1 parent b025cb7 commit 3582e4e

File tree

1 file changed: +115, -60 lines


scripts/extension_ai_analysis.py

Lines changed: 115 additions & 60 deletions
@@ -21,19 +21,20 @@
 INFERENCE_MODEL = "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
 INFERENCE_RESPONSE_PER_MINUTE_LIMIT = 5
 INFERENCE_API_KEY = os.getenv("NEBULA_API_KEY")
+INFERENCE_MAX_CHARACTERS = 100000 # max characters in all files provided to the model, approximately 25k tokens

 QUESTIONS = [
-    "Is there a EXTENSION_DESCRIPTION variable in the CMakeLists.txt file that describes what the extension does in a few sentences that can be understood by a person knowledgeable in medical image computing?",
-    "Does the README.md file contain a short description, 1-2 sentences, which summarizes what the extension is usable for?",
-    "Does the README.md file contain at least one image that illustrates what the extension can do, preferably a screenshot? Ignore contents of CMakeLists.txt file.",
-    "Does the README.md file contain description of contained modules: at one sentence for each module?",
-    "Does the README.md file contain publication: link to publication and/or to PubMed reference or a 'How to cite' section?",
-    "Does the documentation contain step-by-step tutorial? Does the tutorial tell where to get sample data from?"
-    "Does this code download any executable code from the internet or uploads any data to the internet?",
-    "Is any code executed at the file scope when a module is imported?",
-    "Are any Python packages imported at the file scope that are not from the Python Standard Library and not from Slicer, vtk, SimpleITK, numpy, and scipy?",
-    "Does it directly use pip_install to install pytorch?",
-    "Does it store large amount of downloaded content on local disk other than installing Python packages? Does it provide a way for the user to remove that content?",
+    ["Is there a EXTENSION_DESCRIPTION variable in the CMakeLists.txt file that describes what the extension does in a few sentences that can be understood by a person knowledgeable in medical image computing?", ["cmake"]],
+    ["Does the README.md file contain a short description, 1-2 sentences, which summarizes what the extension is usable for?", ["doc"]],
+    ["Does the README.md file contain at least one image that illustrates what the extension can do, preferably a screenshot?", ["doc"]],
+    ["Does the README.md file contain description of contained modules: at one sentence for each module?", ["doc"]],
+    ["Does the README.md file contain publication: link to publication and/or to PubMed reference or a 'How to cite' section?", ["doc"]],
+    ["Does the documentation contain step-by-step tutorial? Does the tutorial tell where to get sample data from?", ["doc"]],
+    ["Does this code download any executable code from the internet or uploads any data to the internet?", ["source"]],
+    ["Is any code executed at the file scope when a module is imported?", ["source"]],
+    ["Are any Python packages imported at the file scope that are not from the Python Standard Library and not from Slicer, vtk, SimpleITK, numpy, and scipy?", ["source"]],
+    ["Does it directly use pip_install to install pytorch?", ["source"]],
+    ["Does it store large amount of downloaded content on local disk other than installing Python packages? Does it provide a way for the user to remove that content?", ["source"]],
 ]

 def parse_json(extension_file_path):
@@ -105,72 +106,126 @@ def clone_repository(metadata, cloned_repository_folder):


 def collect_analyzed_files(folder):
-    """Load all .py files in a folder, recursively."""
-    scripts = {}
+    """Load all .py files in a folder, recursively.
+    returns a dict of categories (doc, source, cmake), each containing a dict of filename->content"""
+    found_files = { "doc": {}, "source": {}, "cmake": {} }
     for root, dirs, files in os.walk(folder):
         for filename in files:
             fullpath = os.path.join(root, filename)
             relative_path = os.path.relpath(fullpath, start=folder).replace("\\", "/")
-            if filename.endswith(".py") or filename.endswith(".md") or relative_path == "CMakeLists.txt":
-                with open(fullpath, "r", encoding="utf-8") as f:
-                    # get relative path to folder, in linux-style
-                    scripts[relative_path] = f.read()
-    return scripts
-
-
-def analyze_extension(extension_name, metadata, cloned_repository_folder):
+            category = None
+            if filename.endswith(".py"):
+                category = "source"
+            elif filename.endswith(".md"):
+                category = "doc"
+            elif relative_path == "CMakeLists.txt":
+                category = "cmake"
+            if category is None:
+                continue
+            with open(fullpath, "r", encoding="utf-8") as f:
+                # get relative path to folder, in linux-style
+                found_files[category][relative_path] = f.read()
+    return found_files

+def ask_question(system_msg, question):
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {INFERENCE_API_KEY}"
     }

-    scripts = collect_analyzed_files(cloned_repository_folder)
-
-    system_msg = \
-        "You are a quality control expert that checks community-contributed files that contain code and documentation." \
-        " Do not talk about things in general, only strictly about the content provided." \
-        " Relevant files of the extension repository are provided below." \
-        " Each file is delimited by lines with '=== FILE: filename ===' and '=== END FILE: filename ==='."
-    for filename in scripts:
-        system_msg += f"\n=== FILE: {filename} ===\n"
-        system_msg += scripts[filename]
-        system_msg += f"\n=== END FILE: {filename} ===\n"
-
-    # Send the system prompt only once, then continue the conversation
     messages = [
-        {"role": "system", "content": system_msg}
+        {"role": "system", "content": system_msg},
+        {"role": "user", "content": question}
     ]

-    for index, question in enumerate(QUESTIONS):
-        messages.append({"role": "user", "content": question})
-        data = {
-            "messages": messages,
-            "model": INFERENCE_MODEL,
-            "max_tokens": None,
-            "temperature": 1,
-            "top_p": 0.9,
-            "stream": False
-        }
-        response = requests.post(INFERENCE_URL, headers=headers, json=data)
+    data = {
+        "messages": messages,
+        "model": INFERENCE_MODEL,
+        "max_tokens": None,
+        "temperature": 1,
+        "top_p": 0.9,
+        "stream": False
+    }
+
+    response = requests.post(INFERENCE_URL, headers=headers, json=data)
+
+    # wait according to response per minute limit
+    delay = 60 / INFERENCE_RESPONSE_PER_MINUTE_LIMIT
+    import time
+    time.sleep(delay)
+
+    try:
+        answer = response.json()["choices"][0]["message"]["content"]
+    except Exception as e:
+        raise RuntimeError(f"Error or unexpected response: {response.json()["error"]["message"]}")
+
+    return answer
+
+
+def analyze_extension(extension_name, metadata, cloned_repository_folder):
+
+    files = collect_analyzed_files(cloned_repository_folder)
+
+    for index, [question, categories] in enumerate(QUESTIONS):
+
         print("\n------------------------------------------------------")
         print(f"Question {index+1}: {question}")
         print("------------------------------------------------------")
-        try:
-            answer = response.json()["choices"][0]["message"]["content"]

-            print(answer)
-            messages.append({"role": "assistant", "content": answer})
-        except Exception as e:
-            print("Error or unexpected response:", response.json()["error"]["message"])
-            if index == 0:
-                # if the first question fails, likely the system prompt is too long, so stop here
-                raise RuntimeError("Stopping further questions since the first question failed.")
+        file_content_batches = [""]
+
+        # Add files of the categories relevant for the question
+        # The context of each query is limited, therefore if there are too many/too large input files in the relevant categories,
+        # then we split them into batches, ask the question for each batch, and then generate a summary of the answers.
+        for category in categories:
+            files_in_category = files.get(category, {})
+            for filename in files_in_category:
+                next_file = f"\n=== FILE: {filename} ===\n" + files_in_category[filename] + f"\n=== END FILE: {filename} ===\n"
+                if len(file_content_batches[-1]) + len(next_file) < INFERENCE_MAX_CHARACTERS:
+                    # We can add this file to the current batch
+                    file_content_batches[-1] += next_file
+                else:
+                    # Start a new batch
+                    file_content_batches.append(next_file)
+
+        if not file_content_batches[0].strip():
+            print("No relevant files found for this question.")
+            continue

-        # wait according to response per minute limit
-        delay = 60 / INFERENCE_RESPONSE_PER_MINUTE_LIMIT
-        import time
-        time.sleep(delay)
+        role_description = \
+            "You are a quality control expert that checks community-contributed files that contain code and documentation." \
+            " Do not talk about things in general, only strictly about the content provided."
+
+        answers = []
+
+        for batch_index, file_content in enumerate(file_content_batches):
+
+            system_msg = role_description
+            system_msg += " Relevant files of the extension repository are provided below."
+            system_msg += " Each file is delimited by lines with '=== FILE: filename ===' and '=== END FILE: filename ==='.\n"
+            system_msg += file_content
+
+            try:
+                answer = ask_question(system_msg, question)
+                answers.append(answer)
+            except Exception as e:
+                answers = [f"Error or unexpected response: {e}"]
+                break
+
+        if len(answers) == 1:
+            print(answers[0])
+        else:
+            # Multiple batches of files were used to answer this question, generate a summary
+            system_msg = role_description
+            question = "The answer to the question is spread over multiple parts. Please summarize the answer in a concise way, combining all relevant information from the different parts. " \
+                "Here are the different parts of the answer:\n\n"
+            for part_index, part in enumerate(answers):
+                question += f"--- PART {part_index+1} ---\n{part}\n"
+            try:
+                answer = ask_question(system_msg, question)
+            except Exception as e:
+                answer = f"Error or unexpected response: {e}"
+            print(answer)


 def main():
@@ -211,7 +266,7 @@ def main():
     # Clean up temporary directory
     success_cleanup = safe_cleanup_directory(cloned_repository_folder)

-    print("=====================================================")
+    print("\n=====================================================\n")


 if __name__ == "__main__":
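The summarization stage mentioned in the commit message, shown in isolation as a hypothetical helper (summarize_partial_answers is not part of the commit; it reuses the ask_question function and the prompt wording from the diff above):

def summarize_partial_answers(partial_answers, role_description):
    """Merge per-batch answers into a single concise answer with one follow-up request."""
    if len(partial_answers) == 1:
        return partial_answers[0]
    summary_request = "The answer to the question is spread over multiple parts. Please summarize the answer in a concise way, " \
        "combining all relevant information from the different parts. Here are the different parts of the answer:\n\n"
    for part_index, part in enumerate(partial_answers):
        summary_request += f"--- PART {part_index+1} ---\n{part}\n"
    return ask_question(role_description, summary_request)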
