Skip to content

Commit 3ebe8ae

Browse files
[WIP] Auto-generation of finetuning dataset
1 parent 9f1bf65 commit 3ebe8ae

File tree

11 files changed

+656
-1
lines changed

11 files changed

+656
-1
lines changed

Diff for: docs/TypeEvalPy_JSON_schema.py

+37
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
1+
from typing import List, Optional
2+
3+
from pydantic import BaseModel
4+
5+
6+
class TypeEvalPySchemaLocalVariable(BaseModel):
    """Schema entry recording the type(s) of a variable.

    This record has no ``function`` field, so it presumably describes a
    variable that lives outside any function.  The per-field notes below are
    inferred from the field names -- confirm against the TypeEvalPy docs.
    """

    # Source file the entry refers to.
    file: str
    # Position of the annotated location within ``file``.
    line_number: int
    col_offset: int
    # Type names recorded for the variable (a list, so more than one may apply).
    type: List[str]
    # Name of the variable.
    variable: str
12+
13+
14+
class TypeEvalPySchemaLocalVariableInsideFunction(BaseModel):
    """Schema entry recording the type(s) of a variable inside a function.

    The per-field notes below are inferred from the field names -- confirm
    against the TypeEvalPy docs.
    """

    # Source file the entry refers to.
    file: str
    # Position of the annotated location within ``file``.
    line_number: int
    col_offset: int
    # Type names recorded for the variable (a list, so more than one may apply).
    type: List[str]
    # Name of the function that contains the variable.
    function: str
    # Name of the variable.
    variable: str
21+
22+
23+
class TypeEvalPySchemaParameter(BaseModel):
    """Schema entry recording the type(s) of a function parameter.

    The per-field notes below are inferred from the field names -- confirm
    against the TypeEvalPy docs.
    """

    # Source file the entry refers to.
    file: str
    # Position of the annotated location within ``file``.
    line_number: int
    col_offset: int
    # Type names recorded for the parameter (a list, so more than one may apply).
    type: List[str]
    # Name of the function that declares the parameter.
    function: str
    # Name of the parameter.
    parameter: str
30+
31+
32+
class TypeEvalPySchemaFunctionReturn(BaseModel):
    """Schema entry recording the return type(s) of a function.

    The per-field notes below are inferred from the field names -- confirm
    against the TypeEvalPy docs.
    """

    # Source file the entry refers to.
    file: str
    # Position of the annotated location within ``file``.
    line_number: int
    col_offset: int
    # Type names recorded for the return value (a list, so more than one may apply).
    type: List[str]
    # Name of the function; the only optional field, defaulting to None.
    function: Optional[str] = None
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,155 @@
1+
import itertools
2+
import json
3+
import os
4+
import random
5+
import re
6+
import shutil
7+
8+
9+
def generate_value_for_type(chosen_type):
    """Generate a random value for ``chosen_type`` and return it with the type name.

    Args:
        chosen_type: One of ``"int"``, ``"float"``, ``"str"``, ``"bool"``,
            ``"list"`` or ``"dict"``.

    Returns:
        A ``(value, type_name)`` tuple where ``type_name`` equals
        ``chosen_type``.  For ``"int"`` and ``"float"`` the value is a number;
        for every other type it is a string holding the source text of a
        literal (for example ``'"abcde"'``, ``"True"``, ``"[1, 2, 3]"``).

    Raises:
        ValueError: If ``chosen_type`` is not one of the supported names.
            Previously the function fell through and returned ``None``, which
            made callers fail later with an unrelated unpacking error.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"

    def quoted_word():
        # Wrapped in double quotes so the text is a valid str literal.
        return '"' + "".join(random.choices(letters, k=5)) + '"'

    def int_texts(count=3):
        return [str(random.randint(1, 100)) for _ in range(count)]

    if chosen_type == "int":
        return random.randint(1, 100), "int"
    if chosen_type == "float":
        return round(random.uniform(1, 100), 2), "float"
    if chosen_type == "str":
        return quoted_word(), "str"
    if chosen_type == "bool":
        return random.choice(["True", "False"]), "bool"
    if chosen_type == "list":
        return "[" + ", ".join(int_texts()) + "]", "list"
    if chosen_type == "dict":
        # Keys are drawn before values, keeping the random draw order stable.
        keys = [quoted_word() for _ in range(3)]
        values = int_texts()
        pairs = ", ".join(f"{k}: {v}" for k, v in zip(keys, values))
        return "{" + pairs + "}", "dict"

    raise ValueError(f"Unsupported data type: {chosen_type!r}")
35+
36+
37+
def find_placeholders(code_or_json):
    """Return the distinct ``<valueN>`` placeholders found in the text.

    The result is a list sorted by plain string comparison, with each
    placeholder appearing once however often it occurs in the input.
    """
    matches = re.findall(r"<value\d+>", code_or_json)
    distinct = {match for match in matches}
    return sorted(distinct)
41+
42+
43+
def generate_data_type_permutations(placeholders, data_types):
    """Generate the data-type tuples to assign to the placeholders.

    Args:
        placeholders: Sequence of placeholder names; only its length is used.
        data_types: Sequence of data-type names to draw from.

    Returns:
        An iterator of tuples, each ``len(placeholders)`` long.

        * More placeholders than data types: types have to repeat, so
          ``itertools.combinations_with_replacement`` is used.  The previous
          call to ``itertools.combinations`` yields nothing whenever the
          requested length exceeds the number of items, which silently
          produced no cases at all.
        * Otherwise: ``itertools.permutations``, i.e. every ordered selection
          without repeating a type.
    """
    slot_count = len(placeholders)
    if slot_count > len(data_types):
        return itertools.combinations_with_replacement(data_types, slot_count)
    return itertools.permutations(data_types, slot_count)
51+
52+
53+
def replace_placeholders_and_generate_json(code, json_template_str, data_type_mapping):
    """Fill a code template with random values and build its ground-truth JSON.

    Args:
        code: Code template text containing the placeholders.
        json_template_str: JSON text that parses to an iterable of entries,
            each with a ``"type"`` key.
        data_type_mapping: Mapping of placeholder -> data-type name.

    Returns:
        ``(code, json_data)``: the code with every placeholder replaced by a
        freshly generated value of the mapped type, and the parsed JSON in
        which every entry whose ``"type"`` contains a placeholder has that
        field set to ``[data_type]``.
    """
    assignments = list(data_type_mapping.items())

    # One value per placeholder, so all its occurrences share the same literal.
    for token, type_name in assignments:
        literal, _type_echo = generate_value_for_type(type_name)
        code = code.replace(token, str(literal))

    ground_truth = json.loads(json_template_str)
    for entry in ground_truth:
        for token, type_name in assignments:
            if token in entry["type"]:
                entry["type"] = [type_name]

    return code, ground_truth
66+
67+
68+
def save_files(code, json_data, output_folder, case_name, type_name, case_number):
    """Write one generated case (code plus ground truth) into its own folder.

    The folder ``<output_folder>/<case_number>_<case_name>_<type_name>`` is
    created if needed and receives ``main.py`` and ``main_gt.json``.

    Args:
        code: Python source text to store in ``main.py``.
        json_data: JSON-serialisable ground truth to store in ``main_gt.json``.
        output_folder: Directory under which the case folder is created.
        case_name: Template name used in the folder name.
        type_name: Data-type label used in the folder name.
        case_number: Case identifier.  An ``int`` is zero-padded to three
            digits; any other value (the callers in this file pass strings
            such as ``"1_1"``) is used verbatim.
    """
    # ``:03`` is a numeric format spec.  On a str it raises ValueError before
    # Python 3.10 and pads with zeros on the right from 3.10 on, so it is only
    # applied to integers.
    if isinstance(case_number, int):
        case_label = f"{case_number:03}"
    else:
        case_label = str(case_number)

    case_folder = os.path.join(
        output_folder, f"{case_label}_{case_name}_{type_name}"
    )
    os.makedirs(case_folder, exist_ok=True)

    code_file_path = os.path.join(case_folder, "main.py")
    json_file_path = os.path.join(case_folder, "main_gt.json")

    # Explicit UTF-8 (the default Python source encoding) instead of the
    # platform locale, so non-ASCII template text is written consistently.
    with open(code_file_path, "w", encoding="utf-8") as file:
        file.write(code)
    print(f"Saved Python code to {code_file_path}")

    with open(json_file_path, "w", encoding="utf-8") as file:
        json.dump(json_data, file, indent=4)
    print(f"Saved JSON ground truth to {json_file_path}")
85+
86+
87+
def read_templates(directory):
    """Read every ``.txt`` template in ``directory``.

    Each file is split on ``"\\n# "`` and must provide, in this order, the
    sections ``# Name: ...``, ``# Type: ...``, ``# Data Types: a, b, ...``,
    ``# Python Code Template`` (followed by the code) and ``# JSON Template``
    (followed by the JSON).  Because of that separator, a line starting with
    ``"# "`` inside the code section would be treated as a new section.

    Files are processed in sorted name order so that the numbering derived
    from the result is the same on every run; ``os.listdir`` alone returns
    entries in arbitrary order.

    Args:
        directory: Folder containing the template files.

    Returns:
        A list of ``(name, template_type, data_types, code_template,
        json_template)`` tuples, where ``data_types`` is a list of strings.

    Raises:
        IndexError: If a file does not contain all expected sections.
    """
    templates = []
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".txt"):  # Only text files are templates
            continue

        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as file:
            content = file.read()

        parts = content.split("\n# ")
        name = parts[0].replace("# Name: ", "").strip()
        template_type = parts[1].replace("Type: ", "").strip()
        data_types = parts[2].replace("Data Types: ", "").strip().split(", ")
        code_template = parts[3].split("Python Code Template\n", 1)[1].strip()
        json_template = parts[4].split("JSON Template", 1)[1].strip()
        templates.append(
            (name, template_type, data_types, code_template, json_template)
        )
    return templates
105+
106+
107+
# Example usage
# NOTE(review): everything below runs at module level, including the recursive
# delete of ``output_folder``.
output_folder = "/mnt/Projects/PhD/Research/TypeEvalPy/git_sources/TypeEvalPy_LLM/.scrapy/generated_dataset"
templates = read_templates("./templates")

# Start from an empty output tree; a missing folder is not an error.
shutil.rmtree(output_folder, ignore_errors=True)

case_number = 1  # advances once per template
total_cases = 1  # advances once per generated case, across all templates
for name, template_type, data_types, code_template, json_template in templates:
    # Build (placeholder mapping, folder label) pairs lazily so that
    # generation and saving stay interleaved case by case.
    if template_type == "Simple":
        # Simple templates use the single placeholder "value".
        variants = (({"value": data_type}, data_type) for data_type in data_types)
    elif template_type == "Complex":
        placeholders = find_placeholders(code_template + json_template)
        variants = (
            (dict(zip(placeholders, combo)), "_".join(combo))
            for combo in generate_data_type_permutations(placeholders, data_types)
        )
    else:
        # Any other template type produces no cases.
        variants = ()

    for data_type_mapping, type_name in variants:
        replaced_code, json_data = replace_placeholders_and_generate_json(
            code_template, json_template, data_type_mapping
        )
        save_files(
            replaced_code,
            json_data,
            output_folder,
            name,
            type_name,
            f"{case_number}_{total_cases}",
        )
        total_cases += 1

    # NOTE(review): the original indentation is not visible; this increment is
    # assumed to run once per template -- confirm.
    case_number += 1

0 commit comments

Comments
 (0)