Skip to content

Commit 27a5541

Browse files
authored
Merge pull request #76 from ghyathmoussa/main
fix: Enhance QA curation to support CoT examples and format conversion
2 parents: 2c722d1 + 0f0d452; commit: 27a5541

File tree

2 files changed

+57
-13
lines changed

2 files changed

+57
-13
lines changed

synthetic_data_kit/core/curate.py

Lines changed: 46 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -49,13 +49,28 @@ def curate_qa_pairs(
4949
with open(input_path, 'r', encoding='utf-8') as f:
5050
data = json.load(f)
5151

52-
# Extract QA pairs
52+
# Extract QA pairs or CoT examples
5353
qa_pairs = data.get("qa_pairs", [])
54+
cot_examples = data.get("cot_examples", [])
5455
summary = data.get("summary", "")
5556

56-
# If there are no QA pairs or they're already filtered
57+
# Determine which format we're working with
58+
is_cot_format = False
59+
if cot_examples and not qa_pairs:
60+
# Convert CoT examples to QA format for curation
61+
qa_pairs = []
62+
for example in cot_examples:
63+
qa_pair = {
64+
"question": example.get("question", ""),
65+
"answer": example.get("answer", ""),
66+
"reasoning": example.get("reasoning", "") # Keep reasoning for reference
67+
}
68+
qa_pairs.append(qa_pair)
69+
is_cot_format = True
70+
71+
# If there are no QA pairs or CoT examples
5772
if not qa_pairs:
58-
raise ValueError("No QA pairs found in the input file")
73+
raise ValueError("No QA pairs or CoT examples found in the input file")
5974

6075
# Initialize LLM client
6176
client = LLMClient(
@@ -269,13 +284,34 @@ def curate_qa_pairs(
269284
# Convert to conversation format
270285
conversations = convert_to_conversation_format(filtered_pairs)
271286

272-
# Create result with filtered pairs
273-
result = {
274-
"summary": summary,
275-
"qa_pairs": filtered_pairs,
276-
"conversations": conversations,
277-
"metrics": metrics
278-
}
287+
# Create result with filtered pairs in the appropriate format
288+
if is_cot_format:
289+
# Convert back to CoT format
290+
filtered_cot_examples = []
291+
for pair in filtered_pairs:
292+
cot_example = {
293+
"question": pair.get("question", ""),
294+
"reasoning": pair.get("reasoning", ""),
295+
"answer": pair.get("answer", "")
296+
}
297+
# Keep rating if it exists
298+
if "rating" in pair:
299+
cot_example["rating"] = pair["rating"]
300+
filtered_cot_examples.append(cot_example)
301+
302+
result = {
303+
"summary": summary,
304+
"cot_examples": filtered_cot_examples,
305+
"conversations": conversations,
306+
"metrics": metrics
307+
}
308+
else:
309+
result = {
310+
"summary": summary,
311+
"qa_pairs": filtered_pairs,
312+
"conversations": conversations,
313+
"metrics": metrics
314+
}
279315

280316
# Ensure output directory exists
281317
os.makedirs(os.path.dirname(output_path), exist_ok=True)

tests/unit/test_error_handling.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -158,11 +158,19 @@ def test_curate_input_validation(patch_config, test_env):
158158
{
159159
"question": "What is synthetic data?",
160160
"answer": "Synthetic data is artificially generated data.",
161-
},
161+
}
162+
]
163+
164+
cot_examples = [
165+
{
166+
"question": "What is synthetic data?",
167+
"reasoning": "Synthetic data is artificially generated data.",
168+
"answer": "Synthetic data is artificially generated data.",
169+
}
162170
]
163171

164172
with tempfile.NamedTemporaryFile(mode="w+", suffix=".json", delete=False) as f:
165-
json.dump({"qa_pairs": qa_pairs}, f)
173+
json.dump({"qa_pairs": qa_pairs, "cot_examples": cot_examples}, f)
166174
file_path = f.name
167175

168176
# Create temporary output directory
@@ -182,7 +190,7 @@ def test_curate_input_validation(patch_config, test_env):
182190
curate.curate_qa_pairs(input_path=empty_file_path, output_path=output_path)
183191

184192
# Check that the error message is helpful
185-
assert "No QA pairs found" in str(excinfo.value)
193+
assert "No QA pairs or CoT examples found" in str(excinfo.value)
186194
finally:
187195
# Clean up
188196
if os.path.exists(file_path):

0 commit comments

Comments
 (0)