diff --git a/src/benchpress/tasks/aime24.py b/src/benchpress/tasks/aime24.py index 19dd01f..e94a180 100644 --- a/src/benchpress/tasks/aime24.py +++ b/src/benchpress/tasks/aime24.py @@ -126,15 +126,13 @@ async def evaluate_example( } ) - # Extract answers using the simplified extraction system + # Extract answers using the extraction system candidates = extract_answer(model_output, context) # Get the best answer (highest confidence) - extracted_answer = "" - if candidates: - extracted_answer = candidates[0].text + extracted_answer = candidates[0].text if candidates else "" - # Use comprehensive answer comparison + # Compare answers and determine correctness correct = compare_answers(extracted_answer, example.answer, domain="aime24") # Build metadata dictionary @@ -148,16 +146,27 @@ async def evaluate_example( # Add extraction details if available if candidates: best_candidate = candidates[0] - metadata["extraction_method"] = best_candidate.pattern_name - metadata["method"] = best_candidate.pattern_name # Alternative key for backward compatibility - metadata["extraction_confidence"] = float(best_candidate.confidence) - metadata["confidence"] = float(best_candidate.confidence) # Alternative key - - if best_candidate.metadata: - for key, value in best_candidate.metadata.items(): - metadata[key] = value + metadata.update({ + "extraction_method": best_candidate.pattern_name, + "method": best_candidate.pattern_name, # Alternative key for backward compatibility + "extraction_confidence": float(best_candidate.confidence), + "confidence": float(best_candidate.confidence), # Alternative key + "pattern_type": best_candidate.metadata.get("pattern_type", "unknown") + }) + + # Add alternative candidates info if available + if len(candidates) > 1: + metadata["alternative_answers"] = [ + { + "text": c.text, + "method": c.pattern_name, + "confidence": float(c.confidence) + } + for c in candidates[1:3] # Just include top alternatives + ] return TaskResult( + question=example.question, example_id=example.id, model_id="", # Will be filled in by the evaluation engine model_output=model_output, diff --git a/src/benchpress/tasks/gpqa.py b/src/benchpress/tasks/gpqa.py index affa4d7..fbb41b5 100644 --- a/src/benchpress/tasks/gpqa.py +++ b/src/benchpress/tasks/gpqa.py @@ -6,7 +6,7 @@ from ..datasets.gpqa_dataset import GpqaDataset from ..datasets.gpqa_hf_dataset import GpqaHfDataset from ..examples.gpqa import GpqaExample -from ..extraction import ExtractedAnswer, ExtractionContext +from ..extraction import ExtractedAnswer, ExtractionContext, extract_answer from ..utils import get_hf_token from ..utils.math_comparison import compare_answers from .base import BaseTask, TaskResult @@ -16,6 +16,7 @@ @register_task class GpqaTask(BaseTask[GpqaExample]): """GPQA Diamond benchmark task implementation.""" + # We use extract_answer directly from extraction module instead of instance attr def __init__( self, @@ -97,35 +98,22 @@ async def evaluate_example( metadata={"subject": example.subject} ) - # Extract all candidate answers - candidate_answers = self._extractor.extract(model_output, extraction_context) + # Extract answers using the extraction system + candidates = extract_answer(model_output, extraction_context) - # Use the highest confidence answer if available - if candidate_answers and candidate_answers[0].confidence >= 0.3: - extracted_answer = candidate_answers[0] + # Get best answer or use fallback extraction + if candidates and candidates[0].confidence >= 0.3: + extracted_answer = candidates[0] else: - # Basic fallback extraction for GPQA - answer_pattern = r"(?:answer|result|solution):\s*(.+?)(?:$|\n)" - match = re.search(answer_pattern, model_output.lower(), re.DOTALL) - - if match: - extracted_text = match.group(1).strip() - extracted_answer = ExtractedAnswer( - text=extracted_text, - pattern_name="fallback_regex", - confidence=0.5, - metadata={"pattern_type": "fallback"} - ) - else: - # If no explicit answer format, try to extract the last sentence - sentences = re.split(r"(?<=[.!?])\s+", model_output) - extracted_text = sentences[-1].strip() if sentences else "" - extracted_answer = ExtractedAnswer( - text=extracted_text, - pattern_name="fallback_last_sentence", - confidence=0.2, # Low confidence for this method - metadata={"pattern_type": "fallback"} - ) + # Use last sentence as a fallback + sentences = re.split(r"(?<=[.!?])\s+", model_output) + extracted_text = sentences[-1].strip() if sentences else "" + extracted_answer = ExtractedAnswer( + text=extracted_text, + pattern_name="fallback_last_sentence", + confidence=0.2, + metadata={"pattern_type": "fallback"} + ) # Use our comprehensive comparison approach for more consistent results # This will handle various formatting differences @@ -139,22 +127,29 @@ async def evaluate_example( # Prepare the metadata with extraction information metadata = { "extracted_answer": extracted_answer.text, - "extraction_confidence": extracted_answer.confidence, + "extraction_confidence": float(extracted_answer.confidence), "extraction_method": extracted_answer.pattern_name, + "method": extracted_answer.pattern_name, # For backward compatibility + "confidence": float(extracted_answer.confidence), # For backward compatibility "expected_answer": example.answer, "subject": example.subject, "difficulty": example.difficulty, + "pattern_type": extracted_answer.metadata.get("pattern_type", "unknown") } - # Include alternatives if available (other candidates) - alternative_answers = candidate_answers[1:] if len(candidate_answers) > 1 else [] - if alternative_answers: + # Include alternatives if available + if len(candidates) > 1: metadata["alternative_answers"] = [ - {"text": alt.text, "confidence": alt.confidence} - for alt in alternative_answers + { + "text": c.text, + "method": c.pattern_name, + "confidence": float(c.confidence) + } + for c in candidates[1:3] # Just include top alternatives ] return TaskResult( + question=example.question, example_id=example.id, model_id="", # Will be filled in by the evaluation engine model_output=model_output, diff --git a/src/benchpress/tasks/math500.py b/src/benchpress/tasks/math500.py index 0891e54..45cb690 100644 --- a/src/benchpress/tasks/math500.py +++ b/src/benchpress/tasks/math500.py @@ -1,6 +1,5 @@ """MATH-500 benchmark task.""" -import re from typing import List, Optional from ..datasets.math500_hf_dataset import Math500HfDataset @@ -54,79 +53,7 @@ def prompt_template(self) -> str: - No additional text - Just the number or expression itself""" - def _normalize_math_answer(self, answer: str) -> str: - """Normalize a math answer for more robust comparison. - - Args: - answer: The answer string to normalize - - Returns: - Normalized answer string - """ - if not answer: - return "" - - # Remove "ANSWER:" marker - answer = re.sub( - r'^ANSWER:\s*', - '', - answer - ) - - # Special case for coordinate pairs with fractions - the issue we're fixing - # Pattern for LaTeX coordinate pairs with fractions like \left( 3, \frac{\pi}{2} \right) - latex_coord_match = re.search(r'\\left\s*\(\s*(\d+)\s*,\s*\\frac\s*\{\\pi\}\s*\{(\d+)\}\s*\\right\s*\)', answer) - if latex_coord_match: - x_value = latex_coord_match.group(1) - denom = latex_coord_match.group(2) - return f"({x_value},π/{denom})" - - # Regular coordinate pairs like (3,π/2) - simple_coord_match = re.search(r'\(\s*(\d+)\s*,\s*π/(\d+)\s*\)', answer) - if simple_coord_match: - x_value = simple_coord_match.group(1) - denom = simple_coord_match.group(2) - return f"({x_value},π/{denom})" - - # Replace LaTeX fractions with division notation - answer = re.sub(r"\\frac{([^}]+)}{([^}]+)}", r"\1/\2", answer) - answer = re.sub(r"\\dfrac{([^}]+)}{([^}]+)}", r"\1/\2", answer) - - # Remove LaTeX formatting - answer = answer.replace("\\left", "") - answer = answer.replace("\\right", "") - answer = answer.replace("\\", "") - answer = answer.replace("{", "") - answer = answer.replace("}", "") - answer = answer.replace("$", "") - answer = answer.replace(" ", "") - - # Replace LaTeX special symbols - answer = answer.replace("pi", "π") - - # Normalize fractions (both numeric and symbolic) - try: - # Check for numeric fractions first - if "/" in answer: - parts = answer.split("/") - if len(parts) == 2: - # For numeric fractions, standardize the form but don't convert to decimal - if all(part.strip().isdigit() for part in parts): - num = int(parts[0].strip()) - denom = int(parts[1].strip()) - if denom != 0: # Avoid division by zero - answer = f"{num}/{denom}" - # For symbolic fractions like p/q, n/k, standardize to lowercase - elif len(parts[0].strip()) == 1 and len(parts[1].strip()) == 1: - p1 = parts[0].strip() - p2 = parts[1].strip() - if p1.isalpha() and p2.isalpha(): - answer = f"{p1.lower()}/{p2.lower()}" - except Exception: - # If normalization fails, keep the original - pass - - return answer.strip().lower() + # Removed _normalize_math_answer - now using the central utility in extraction.processors async def load_examples(self) -> List[Math500Example]: """Load MATH-500 examples from HuggingFace dataset. @@ -183,47 +110,37 @@ async def evaluate_example( } ) - # Extract answers using the simplified extraction system + # Extract answers using the extraction system candidates = extract_answer(model_output, context) # Get the best answer (highest confidence) - extracted_answer = "" - if candidates: - extracted_answer = candidates[0].text + extracted_answer = candidates[0].text if candidates else "" - # Get the expected answer - expected_answer = example.answer - - # Compare answers using our comprehensive multi-tier comparison approach - # This checks raw, normalized, and mathematical equivalence - correct = compare_answers(extracted_answer, expected_answer, domain="math500") + # Compare answers and determine correctness + correct = compare_answers(extracted_answer, example.answer, domain="math500") # Create detailed metadata metadata: dict[str, object] = { "extracted_answer": extracted_answer, - "expected_answer": expected_answer, + "expected_answer": example.answer, "category": example.category, "difficulty": example.difficulty, } # Add extraction details if available if candidates: - # Set both canonical and alternative keys for backward compatibility best_candidate = candidates[0] - metadata["extraction_method"] = best_candidate.pattern_name - metadata["method"] = best_candidate.pattern_name # Alternative key - - # Convert confidence to float and store in two formats - confidence_float = float(best_candidate.confidence) - metadata["extraction_confidence"] = confidence_float - metadata["confidence"] = confidence_float # Alternative key - - # Store info about how it was extracted - metadata["pattern_type"] = best_candidate.metadata.get("pattern_type", "unknown") + metadata.update({ + "extraction_method": best_candidate.pattern_name, + "method": best_candidate.pattern_name, # Alternative key for backward compatibility + "extraction_confidence": float(best_candidate.confidence), + "confidence": float(best_candidate.confidence), # Alternative key + "pattern_type": best_candidate.metadata.get("pattern_type", "unknown") + }) # Add alternative candidates info if available if len(candidates) > 1: - alt_answers = [ + metadata["alternative_answers"] = [ { "text": c.text, "method": c.pattern_name, @@ -231,7 +148,6 @@ async def evaluate_example( } for c in candidates[1:3] # Just include top alternatives ] - metadata["alternative_answers"] = alt_answers return TaskResult( question=example.question,