-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathextract.py
97 lines (74 loc) · 3.07 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from bs4 import BeautifulSoup
import re
# Useful functions for extracting code from LLM responses
def extract_code_markdown(answer):
    """Return the contents of the first Markdown code fence in ``answer``.

    The opening fence is three or more backticks optionally followed by a
    word-character language tag. The code runs up to the next triple
    backtick or, when no closing fence exists, to the end of the string.

    Returns the stripped code, or None when there is no opening fence or
    the fenced text is blank.
    """
    match = re.search(r'`{3,}(\w*)', answer)
    if match is None:
        return None

    start_token = match.group(0)
    start_index = match.start()

    # codellama special: the language name sometimes sits on its own line
    # right after a bare fence ("```\npython\n..."); skip over it.
    after_fence = start_index + len(start_token)
    if answer[after_fence + 1:after_fence + 8] in ('python\n', 'Python\n'):
        start_index += 7
    code_start = start_index + len(start_token)

    # Search for the closing fence right where the code starts. Skipping a
    # character here would miss a fence that closes immediately (e.g.
    # "```python```") and return the closing backticks as if they were code.
    end_index = answer.find("```", code_start)
    if end_index == -1:
        # No closing fence: assume everything after the opener is code.
        end_index = len(answer)

    code_text = answer[code_start:end_index].strip()
    return code_text or None
def remove_indentation(code_block):
    """Shift ``code_block`` left by the indentation width of its first line.

    Each line loses at most as many leading characters as the first line has
    leading whitespace, and never more than its own leading whitespace, so a
    line that is indented less than the first line keeps all of its content.

    Returns the re-joined text; lines are split and joined on '\\n'.
    """
    lines = code_block.split('\n')
    first_line_indent = len(lines[0]) - len(lines[0].lstrip())
    if first_line_indent == 0:
        return code_block

    dedented = []
    for line in lines:
        own_indent = len(line) - len(line.lstrip())
        # Clamp to the line's own indent so non-whitespace is never cut off.
        dedented.append(line[min(first_line_indent, own_indent):])
    return '\n'.join(dedented)
def extract_code_html(answer):
    """Return the text of the longest ``<code>`` element in ``answer``.

    The answer is parsed with BeautifulSoup's "html.parser". Candidates are
    compared by the length of their raw text; the winner is passed through
    ``remove_indentation`` before being returned. On a tie the earliest
    element wins.

    Returns None when the answer contains no ``<code>`` element.
    """
    soup = BeautifulSoup(answer, "html.parser")
    candidates = [item.get_text() for item in soup.find_all('code')]
    if not candidates:
        return None
    # Pick the winner on raw lengths and dedent only once at the end;
    # comparing a raw length with an already-dedented one could let a
    # shorter block replace a longer one.
    return remove_indentation(max(candidates, key=len))
def extract_code_codellama_python(answer):
    """Return the text between ``[PYTHON]`` and ``[/PYTHON]`` in ``answer``.

    When the closing tag is missing, everything after the opening tag is
    taken. The extracted text is returned as-is (not stripped).

    Returns None when there is no opening tag or the extracted text is
    blank.
    """
    start_token = '[PYTHON]'
    end_token = '[/PYTHON]'

    start_index = answer.find(start_token)
    if start_index == -1:
        return None
    code_start = start_index + len(start_token)

    end_index = answer.find(end_token, code_start)
    if end_index == -1:
        # No closing tag: slicing with -1 would silently drop the final
        # character, so take the rest of the answer instead.
        end_index = len(answer)

    code_text = answer[code_start:end_index]
    return code_text if code_text.strip() else None
# Fallback if the model forgot to use any quotes
def extract_code_fallback(answer):
    """Treat the whole answer as code, trimming surrounding whitespace."""
    return answer.strip()
# Remove reasoning
def remove_think_tags(text):
    """Delete every ``<think>...</think>`` span from ``text``.

    Matching is non-greedy and spans newlines, so each closed block is
    removed separately. An opening tag without a closing tag is left alone.
    """
    think_block = re.compile(r'<think>.*?</think>', flags=re.DOTALL)
    return think_block.sub('', text)
def extract_code(answer, stop_at_prefix=None):
    """Extract code from an LLM answer, trying each known format in turn.

    ``<think>`` blocks are removed first. Extractors are then tried in this
    order, each only if the previous one produced nothing: ``[PYTHON]``
    tags, HTML ``<code>`` elements, Markdown fences, and finally the whole
    stripped answer.

    Args:
        answer: Raw model output.
        stop_at_prefix: Optional iterable of string prefixes. The code is
            cut just before the first line that starts with any of them.

    Returns:
        The extracted code string.
    """
    answer = remove_think_tags(answer)

    code = None
    if '[PYTHON]' in answer:
        code = extract_code_codellama_python(answer)
    if code is None and '<code>' in answer:
        code = extract_code_html(answer)
    if code is None and '```' in answer:
        code = extract_code_markdown(answer)
    if code is None:
        code = extract_code_fallback(answer)

    # None replaces the former mutable [] default; both mean "no cut".
    prefixes = tuple(stop_at_prefix) if stop_at_prefix else ()
    if code is not None and prefixes:
        lines = code.split('\n')
        for i, line in enumerate(lines):
            if any(line.startswith(prefix) for prefix in prefixes):
                code = '\n'.join(lines[:i])
                # Stop at the FIRST matching line. Continuing to scan would
                # let a later match overwrite this cut with a longer one.
                break
    return code