pranav100000 · pranav100000 · Mar 24, 2025
diff --git a/src/benchpress/tasks/aime24.py b/src/benchpress/tasks/aime24.py
@@ -126,15 +126,13 @@ async def evaluate_example(
             }
         )
 
-        # Extract answers using the simplified extraction system
+        # Extract answers using the extraction system
         candidates = extract_answer(model_output, context)
 
         # Get the best answer (highest confidence)
-        extracted_answer = ""
-        if candidates:
-            extracted_answer = candidates[0].text
+        extracted_answer = candidates[0].text if candidates else ""
 
-        # Use comprehensive answer comparison
+        # Compare answers and determine correctness
         correct = compare_answers(extracted_answer, example.answer, domain="aime24")
 
         # Build metadata dictionary
@@ -148,16 +146,27 @@ async def evaluate_example(
         # Add extraction details if available
         if candidates:
             best_candidate = candidates[0]
-            metadata["extraction_method"] = best_candidate.pattern_name
-            metadata["method"] = best_candidate.pattern_name  # Alternative key for backward compatibility
-            metadata["extraction_confidence"] = float(best_candidate.confidence)
-            metadata["confidence"] = float(best_candidate.confidence)  # Alternative key
-
-            if best_candidate.metadata:
-                for key, value in best_candidate.metadata.items():
-                    metadata[key] = value
+            metadata.update({
+                "extraction_method": best_candidate.pattern_name,
+                "method": best_candidate.pattern_name,  # Alternative key for backward compatibility
+                "extraction_confidence": float(best_candidate.confidence),
+                "confidence": float(best_candidate.confidence),  # Alternative key
+                "pattern_type": best_candidate.metadata.get("pattern_type", "unknown")
+            })
+
+            # Add alternative candidates info if available
+            if len(candidates) > 1:
+                metadata["alternative_answers"] = [
+                    {
+                        "text": c.text,
+                        "method": c.pattern_name,
+                        "confidence": float(c.confidence)
+                    }
+                    for c in candidates[1:3]  # Just include top alternatives
+                ]
 
         return TaskResult(
+            question=example.question,
             example_id=example.id,
             model_id="",  # Will be filled in by the evaluation engine
             model_output=model_output,

diff --git a/src/benchpress/tasks/gpqa.py b/src/benchpress/tasks/gpqa.py
@@ -6,7 +6,7 @@
 from ..datasets.gpqa_dataset import GpqaDataset
 from ..datasets.gpqa_hf_dataset import GpqaHfDataset
 from ..examples.gpqa import GpqaExample
-from ..extraction import ExtractedAnswer, ExtractionContext
+from ..extraction import ExtractedAnswer, ExtractionContext, extract_answer
 from ..utils import get_hf_token
 from ..utils.math_comparison import compare_answers
 from .base import BaseTask, TaskResult
@@ -16,6 +16,7 @@
 @register_task
 class GpqaTask(BaseTask[GpqaExample]):
     """GPQA Diamond benchmark task implementation."""
+    # Answers are extracted with the module-level extract_answer function, not an instance attribute.
 
     def __init__(
         self,
@@ -97,35 +98,22 @@ async def evaluate_example(
             metadata={"subject": example.subject}
         )
 
-        # Extract all candidate answers
-        candidate_answers = self._extractor.extract(model_output, extraction_context)
+        # Extract answers using the extraction system
+        candidates = extract_answer(model_output, extraction_context)
 
-        # Use the highest confidence answer if available
-        if candidate_answers and candidate_answers[0].confidence >= 0.3:
-            extracted_answer = candidate_answers[0]
+        # Use the top candidate when its confidence is at least 0.3; otherwise fall back to the last sentence
+        if candidates and candidates[0].confidence >= 0.3:
+            extracted_answer = candidates[0]
         else:
-            # Basic fallback extraction for GPQA
-            answer_pattern = r"(?:answer|result|solution):\s*(.+?)(?:$|\n)"
-            match = re.search(answer_pattern, model_output.lower(), re.DOTALL)
-
-            if match:
-                extracted_text = match.group(1).strip()
-                extracted_answer = ExtractedAnswer(
-                    text=extracted_text,
-                    pattern_name="fallback_regex",
-                    confidence=0.5,
-                    metadata={"pattern_type": "fallback"}
-                )
-            else:
-                # If no explicit answer format, try to extract the last sentence
-                sentences = re.split(r"(?<=[.!?])\s+", model_output)
-                extracted_text = sentences[-1].strip() if sentences else ""
-                extracted_answer = ExtractedAnswer(
-                    text=extracted_text,
-                    pattern_name="fallback_last_sentence",
-                    confidence=0.2,  # Low confidence for this method
-                    metadata={"pattern_type": "fallback"}
-                )
+            # Use last sentence as a fallback
+            sentences = re.split(r"(?<=[.!?])\s+", model_output)
+            extracted_text = sentences[-1].strip() if sentences else ""
+            extracted_answer = ExtractedAnswer(
+                text=extracted_text,
+                pattern_name="fallback_last_sentence",
+                confidence=0.2,
+                metadata={"pattern_type": "fallback"}
+            )
 
         # Use our comprehensive comparison approach for more consistent results
         # This will handle various formatting differences
@@ -139,22 +127,29 @@ async def evaluate_example(
         # Prepare the metadata with extraction information
         metadata = {
             "extracted_answer": extracted_answer.text,
-            "extraction_confidence": extracted_answer.confidence,
+            "extraction_confidence": float(extracted_answer.confidence),
             "extraction_method": extracted_answer.pattern_name,
+            "method": extracted_answer.pattern_name,  # For backward compatibility
+            "confidence": float(extracted_answer.confidence),  # For backward compatibility
             "expected_answer": example.answer,
             "subject": example.subject,
             "difficulty": example.difficulty,
+            "pattern_type": extracted_answer.metadata.get("pattern_type", "unknown")
         }
 
-        # Include alternatives if available (other candidates)
-        alternative_answers = candidate_answers[1:] if len(candidate_answers) > 1 else []
-        if alternative_answers:
+        # Include alternatives if available
+        if len(candidates) > 1:
             metadata["alternative_answers"] = [
-                {"text": alt.text, "confidence": alt.confidence}
-                for alt in alternative_answers
+                {
+                    "text": c.text,
+                    "method": c.pattern_name,
+                    "confidence": float(c.confidence)
+                }
+                for c in candidates[1:3]  # Just include top alternatives
             ]
 
         return TaskResult(
+            question=example.question,
             example_id=example.id,
             model_id="",  # Will be filled in by the evaluation engine
             model_output=model_output,

diff --git a/src/benchpress/tasks/math500.py b/src/benchpress/tasks/math500.py
@@ -1,6 +1,5 @@
 """MATH-500 benchmark task."""
 
-import re
 from typing import List, Optional
 
 from ..datasets.math500_hf_dataset import Math500HfDataset
@@ -54,79 +53,7 @@ def prompt_template(self) -> str:
 - No additional text
 - Just the number or expression itself"""
 
-    def _normalize_math_answer(self, answer: str) -> str:
-        """Normalize a math answer for more robust comparison.
-
-        Args:
-            answer: The answer string to normalize
-
-        Returns:
-            Normalized answer string
-        """
-        if not answer:
-            return ""
-
-        # Remove "ANSWER:" marker
-        answer = re.sub(
-            r'^ANSWER:\s*',
-            '',
-            answer
-        )
-
-        # Special case for coordinate pairs with fractions - the issue we're fixing
-        # Pattern for LaTeX coordinate pairs with fractions like \left( 3, \frac{\pi}{2} \right)
-        latex_coord_match = re.search(r'\\left\s*\(\s*(\d+)\s*,\s*\\frac\s*\{\\pi\}\s*\{(\d+)\}\s*\\right\s*\)', answer)
-        if latex_coord_match:
-            x_value = latex_coord_match.group(1)
-            denom = latex_coord_match.group(2)
-            return f"({x_value},π/{denom})"
-
-        # Regular coordinate pairs like (3,π/2)
-        simple_coord_match = re.search(r'\(\s*(\d+)\s*,\s*π/(\d+)\s*\)', answer)
-        if simple_coord_match:
-            x_value = simple_coord_match.group(1)
-            denom = simple_coord_match.group(2)
-            return f"({x_value},π/{denom})"
-
-        # Replace LaTeX fractions with division notation
-        answer = re.sub(r"\\frac{([^}]+)}{([^}]+)}", r"\1/\2", answer)
-        answer = re.sub(r"\\dfrac{([^}]+)}{([^}]+)}", r"\1/\2", answer)
-
-        # Remove LaTeX formatting
-        answer = answer.replace("\\left", "")
-        answer = answer.replace("\\right", "")
-        answer = answer.replace("\\", "")
-        answer = answer.replace("{", "")
-        answer = answer.replace("}", "")
-        answer = answer.replace("$", "")
-        answer = answer.replace(" ", "")
-
-        # Replace LaTeX special symbols
-        answer = answer.replace("pi", "π")
-
-        # Normalize fractions (both numeric and symbolic)
-        try:
-            # Check for numeric fractions first
-            if "/" in answer:
-                parts = answer.split("/")
-                if len(parts) == 2:
-                    # For numeric fractions, standardize the form but don't convert to decimal
-                    if all(part.strip().isdigit() for part in parts):
-                        num = int(parts[0].strip())
-                        denom = int(parts[1].strip())
-                        if denom != 0:  # Avoid division by zero
-                            answer = f"{num}/{denom}"
-                    # For symbolic fractions like p/q, n/k, standardize to lowercase
-                    elif len(parts[0].strip()) == 1 and len(parts[1].strip()) == 1:
-                        p1 = parts[0].strip()
-                        p2 = parts[1].strip()
-                        if p1.isalpha() and p2.isalpha():
-                            answer = f"{p1.lower()}/{p2.lower()}"
-        except Exception:
-            # If normalization fails, keep the original
-            pass
-
-        return answer.strip().lower()
+    # NOTE: _normalize_math_answer was removed; normalization now uses the central utility in extraction.processors.
 
     async def load_examples(self) -> List[Math500Example]:
         """Load MATH-500 examples from HuggingFace dataset.
@@ -183,55 +110,44 @@ async def evaluate_example(
             }
         )
 
-        # Extract answers using the simplified extraction system
+        # Extract answers using the extraction system
         candidates = extract_answer(model_output, context)
 
         # Get the best answer (highest confidence)
-        extracted_answer = ""
-        if candidates:
-            extracted_answer = candidates[0].text
+        extracted_answer = candidates[0].text if candidates else ""
 
-        # Get the expected answer
-        expected_answer = example.answer
-
-        # Compare answers using our comprehensive multi-tier comparison approach
-        # This checks raw, normalized, and mathematical equivalence
-        correct = compare_answers(extracted_answer, expected_answer, domain="math500")
+        # Compare answers and determine correctness
+        correct = compare_answers(extracted_answer, example.answer, domain="math500")
 
         # Create detailed metadata
         metadata: dict[str, object] = {
             "extracted_answer": extracted_answer,
-            "expected_answer": expected_answer,
+            "expected_answer": example.answer,
             "category": example.category,
             "difficulty": example.difficulty,
         }
 
         # Add extraction details if available
         if candidates:
-            # Set both canonical and alternative keys for backward compatibility
             best_candidate = candidates[0]
-            metadata["extraction_method"] = best_candidate.pattern_name
-            metadata["method"] = best_candidate.pattern_name  # Alternative key
-
-            # Convert confidence to float and store in two formats
-            confidence_float = float(best_candidate.confidence)
-            metadata["extraction_confidence"] = confidence_float
-            metadata["confidence"] = confidence_float  # Alternative key
-
-            # Store info about how it was extracted
-            metadata["pattern_type"] = best_candidate.metadata.get("pattern_type", "unknown")
+            metadata.update({
+                "extraction_method": best_candidate.pattern_name,
+                "method": best_candidate.pattern_name,  # Alternative key for backward compatibility
+                "extraction_confidence": float(best_candidate.confidence),
+                "confidence": float(best_candidate.confidence),  # Alternative key
+                "pattern_type": best_candidate.metadata.get("pattern_type", "unknown")
+            })
 
             # Add alternative candidates info if available
             if len(candidates) > 1:
-                alt_answers = [
+                metadata["alternative_answers"] = [
                     {
                         "text": c.text,
                         "method": c.pattern_name,
                         "confidence": float(c.confidence)
                     }
                     for c in candidates[1:3]  # Just include top alternatives
                 ]
-                metadata["alternative_answers"] = alt_answers
 
         return TaskResult(
             question=example.question,