@@ -49,13 +49,28 @@ def curate_qa_pairs(
4949 with open (input_path , 'r' , encoding = 'utf-8' ) as f :
5050 data = json .load (f )
5151
52- # Extract QA pairs
52+ # Extract QA pairs or CoT examples
5353 qa_pairs = data .get ("qa_pairs" , [])
54+ cot_examples = data .get ("cot_examples" , [])
5455 summary = data .get ("summary" , "" )
5556
56- # If there are no QA pairs or they're already filtered
57+ # Determine which format we're working with
58+ is_cot_format = False
59+ if cot_examples and not qa_pairs :
60+ # Convert CoT examples to QA format for curation
61+ qa_pairs = []
62+ for example in cot_examples :
63+ qa_pair = {
64+ "question" : example .get ("question" , "" ),
65+ "answer" : example .get ("answer" , "" ),
66+ "reasoning" : example .get ("reasoning" , "" ) # Keep reasoning for reference
67+ }
68+ qa_pairs .append (qa_pair )
69+ is_cot_format = True
70+
71+ # If there are no QA pairs or CoT examples
5772 if not qa_pairs :
58- raise ValueError ("No QA pairs found in the input file" )
73+ raise ValueError ("No QA pairs or CoT examples found in the input file" )
5974
6075 # Initialize LLM client
6176 client = LLMClient (
@@ -269,13 +284,34 @@ def curate_qa_pairs(
269284 # Convert to conversation format
270285 conversations = convert_to_conversation_format (filtered_pairs )
271286
272- # Create result with filtered pairs
273- result = {
274- "summary" : summary ,
275- "qa_pairs" : filtered_pairs ,
276- "conversations" : conversations ,
277- "metrics" : metrics
278- }
287+ # Create result with filtered pairs in the appropriate format
288+ if is_cot_format :
289+ # Convert back to CoT format
290+ filtered_cot_examples = []
291+ for pair in filtered_pairs :
292+ cot_example = {
293+ "question" : pair .get ("question" , "" ),
294+ "reasoning" : pair .get ("reasoning" , "" ),
295+ "answer" : pair .get ("answer" , "" )
296+ }
297+ # Keep rating if it exists
298+ if "rating" in pair :
299+ cot_example ["rating" ] = pair ["rating" ]
300+ filtered_cot_examples .append (cot_example )
301+
302+ result = {
303+ "summary" : summary ,
304+ "cot_examples" : filtered_cot_examples ,
305+ "conversations" : conversations ,
306+ "metrics" : metrics
307+ }
308+ else :
309+ result = {
310+ "summary" : summary ,
311+ "qa_pairs" : filtered_pairs ,
312+ "conversations" : conversations ,
313+ "metrics" : metrics
314+ }
279315
280316 # Ensure output directory exists
281317 os .makedirs (os .path .dirname (output_path ), exist_ok = True )
0 commit comments