Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions src/benchpress/tasks/aime24.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,13 @@ async def evaluate_example(
}
)

# Extract answers using the simplified extraction system
# Extract answers using the extraction system
candidates = extract_answer(model_output, context)

# Get the best answer (highest confidence)
extracted_answer = ""
if candidates:
extracted_answer = candidates[0].text
extracted_answer = candidates[0].text if candidates else ""

# Use comprehensive answer comparison
# Compare answers and determine correctness
correct = compare_answers(extracted_answer, example.answer, domain="aime24")

# Build metadata dictionary
Expand All @@ -148,16 +146,27 @@ async def evaluate_example(
# Add extraction details if available
if candidates:
best_candidate = candidates[0]
metadata["extraction_method"] = best_candidate.pattern_name
metadata["method"] = best_candidate.pattern_name # Alternative key for backward compatibility
metadata["extraction_confidence"] = float(best_candidate.confidence)
metadata["confidence"] = float(best_candidate.confidence) # Alternative key

if best_candidate.metadata:
for key, value in best_candidate.metadata.items():
metadata[key] = value
metadata.update({
"extraction_method": best_candidate.pattern_name,
"method": best_candidate.pattern_name, # Alternative key for backward compatibility
"extraction_confidence": float(best_candidate.confidence),
"confidence": float(best_candidate.confidence), # Alternative key
"pattern_type": best_candidate.metadata.get("pattern_type", "unknown")
})

# Add alternative candidates info if available
if len(candidates) > 1:
metadata["alternative_answers"] = [
{
"text": c.text,
"method": c.pattern_name,
"confidence": float(c.confidence)
}
for c in candidates[1:3] # Just include top alternatives
]

return TaskResult(
question=example.question,
example_id=example.id,
model_id="", # Will be filled in by the evaluation engine
model_output=model_output,
Expand Down
63 changes: 29 additions & 34 deletions src/benchpress/tasks/gpqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ..datasets.gpqa_dataset import GpqaDataset
from ..datasets.gpqa_hf_dataset import GpqaHfDataset
from ..examples.gpqa import GpqaExample
from ..extraction import ExtractedAnswer, ExtractionContext
from ..extraction import ExtractedAnswer, ExtractionContext, extract_answer
from ..utils import get_hf_token
from ..utils.math_comparison import compare_answers
from .base import BaseTask, TaskResult
Expand All @@ -16,6 +16,7 @@
@register_task
class GpqaTask(BaseTask[GpqaExample]):
"""GPQA Diamond benchmark task implementation."""
# We use extract_answer directly from extraction module instead of instance attr

def __init__(
self,
Expand Down Expand Up @@ -97,35 +98,22 @@ async def evaluate_example(
metadata={"subject": example.subject}
)

# Extract all candidate answers
candidate_answers = self._extractor.extract(model_output, extraction_context)
# Extract answers using the extraction system
candidates = extract_answer(model_output, extraction_context)

# Use the highest confidence answer if available
if candidate_answers and candidate_answers[0].confidence >= 0.3:
extracted_answer = candidate_answers[0]
# Get best answer or use fallback extraction
if candidates and candidates[0].confidence >= 0.3:
extracted_answer = candidates[0]
else:
# Basic fallback extraction for GPQA
answer_pattern = r"(?:answer|result|solution):\s*(.+?)(?:$|\n)"
match = re.search(answer_pattern, model_output.lower(), re.DOTALL)

if match:
extracted_text = match.group(1).strip()
extracted_answer = ExtractedAnswer(
text=extracted_text,
pattern_name="fallback_regex",
confidence=0.5,
metadata={"pattern_type": "fallback"}
)
else:
# If no explicit answer format, try to extract the last sentence
sentences = re.split(r"(?<=[.!?])\s+", model_output)
extracted_text = sentences[-1].strip() if sentences else ""
extracted_answer = ExtractedAnswer(
text=extracted_text,
pattern_name="fallback_last_sentence",
confidence=0.2, # Low confidence for this method
metadata={"pattern_type": "fallback"}
)
# Use last sentence as a fallback
sentences = re.split(r"(?<=[.!?])\s+", model_output)
extracted_text = sentences[-1].strip() if sentences else ""
extracted_answer = ExtractedAnswer(
text=extracted_text,
pattern_name="fallback_last_sentence",
confidence=0.2,
metadata={"pattern_type": "fallback"}
)

# Use our comprehensive comparison approach for more consistent results
# This will handle various formatting differences
Expand All @@ -139,22 +127,29 @@ async def evaluate_example(
# Prepare the metadata with extraction information
metadata = {
"extracted_answer": extracted_answer.text,
"extraction_confidence": extracted_answer.confidence,
"extraction_confidence": float(extracted_answer.confidence),
"extraction_method": extracted_answer.pattern_name,
"method": extracted_answer.pattern_name, # For backward compatibility
"confidence": float(extracted_answer.confidence), # For backward compatibility
"expected_answer": example.answer,
"subject": example.subject,
"difficulty": example.difficulty,
"pattern_type": extracted_answer.metadata.get("pattern_type", "unknown")
}

# Include alternatives if available (other candidates)
alternative_answers = candidate_answers[1:] if len(candidate_answers) > 1 else []
if alternative_answers:
# Include alternatives if available
if len(candidates) > 1:
metadata["alternative_answers"] = [
{"text": alt.text, "confidence": alt.confidence}
for alt in alternative_answers
{
"text": c.text,
"method": c.pattern_name,
"confidence": float(c.confidence)
}
for c in candidates[1:3] # Just include top alternatives
]

return TaskResult(
question=example.question,
example_id=example.id,
model_id="", # Will be filled in by the evaluation engine
model_output=model_output,
Expand Down
112 changes: 14 additions & 98 deletions src/benchpress/tasks/math500.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""MATH-500 benchmark task."""

import re
from typing import List, Optional

from ..datasets.math500_hf_dataset import Math500HfDataset
Expand Down Expand Up @@ -54,79 +53,7 @@ def prompt_template(self) -> str:
- No additional text
- Just the number or expression itself"""

def _normalize_math_answer(self, answer: str) -> str:
"""Normalize a math answer for more robust comparison.

Args:
answer: The answer string to normalize

Returns:
Normalized answer string
"""
if not answer:
return ""

# Remove "ANSWER:" marker
answer = re.sub(
r'^ANSWER:\s*',
'',
answer
)

# Special case for coordinate pairs with fractions - the issue we're fixing
# Pattern for LaTeX coordinate pairs with fractions like \left( 3, \frac{\pi}{2} \right)
latex_coord_match = re.search(r'\\left\s*\(\s*(\d+)\s*,\s*\\frac\s*\{\\pi\}\s*\{(\d+)\}\s*\\right\s*\)', answer)
if latex_coord_match:
x_value = latex_coord_match.group(1)
denom = latex_coord_match.group(2)
return f"({x_value},π/{denom})"

# Regular coordinate pairs like (3,π/2)
simple_coord_match = re.search(r'\(\s*(\d+)\s*,\s*π/(\d+)\s*\)', answer)
if simple_coord_match:
x_value = simple_coord_match.group(1)
denom = simple_coord_match.group(2)
return f"({x_value},π/{denom})"

# Replace LaTeX fractions with division notation
answer = re.sub(r"\\frac{([^}]+)}{([^}]+)}", r"\1/\2", answer)
answer = re.sub(r"\\dfrac{([^}]+)}{([^}]+)}", r"\1/\2", answer)

# Remove LaTeX formatting
answer = answer.replace("\\left", "")
answer = answer.replace("\\right", "")
answer = answer.replace("\\", "")
answer = answer.replace("{", "")
answer = answer.replace("}", "")
answer = answer.replace("$", "")
answer = answer.replace(" ", "")

# Replace LaTeX special symbols
answer = answer.replace("pi", "π")

# Normalize fractions (both numeric and symbolic)
try:
# Check for numeric fractions first
if "/" in answer:
parts = answer.split("/")
if len(parts) == 2:
# For numeric fractions, standardize the form but don't convert to decimal
if all(part.strip().isdigit() for part in parts):
num = int(parts[0].strip())
denom = int(parts[1].strip())
if denom != 0: # Avoid division by zero
answer = f"{num}/{denom}"
# For symbolic fractions like p/q, n/k, standardize to lowercase
elif len(parts[0].strip()) == 1 and len(parts[1].strip()) == 1:
p1 = parts[0].strip()
p2 = parts[1].strip()
if p1.isalpha() and p2.isalpha():
answer = f"{p1.lower()}/{p2.lower()}"
except Exception:
# If normalization fails, keep the original
pass

return answer.strip().lower()
# Removed _normalize_math_answer - now using the central utility in extraction.processors

async def load_examples(self) -> List[Math500Example]:
"""Load MATH-500 examples from HuggingFace dataset.
Expand Down Expand Up @@ -183,55 +110,44 @@ async def evaluate_example(
}
)

# Extract answers using the simplified extraction system
# Extract answers using the extraction system
candidates = extract_answer(model_output, context)

# Get the best answer (highest confidence)
extracted_answer = ""
if candidates:
extracted_answer = candidates[0].text
extracted_answer = candidates[0].text if candidates else ""

# Get the expected answer
expected_answer = example.answer

# Compare answers using our comprehensive multi-tier comparison approach
# This checks raw, normalized, and mathematical equivalence
correct = compare_answers(extracted_answer, expected_answer, domain="math500")
# Compare answers and determine correctness
correct = compare_answers(extracted_answer, example.answer, domain="math500")

# Create detailed metadata
metadata: dict[str, object] = {
"extracted_answer": extracted_answer,
"expected_answer": expected_answer,
"expected_answer": example.answer,
"category": example.category,
"difficulty": example.difficulty,
}

# Add extraction details if available
if candidates:
# Set both canonical and alternative keys for backward compatibility
best_candidate = candidates[0]
metadata["extraction_method"] = best_candidate.pattern_name
metadata["method"] = best_candidate.pattern_name # Alternative key

# Convert confidence to float and store in two formats
confidence_float = float(best_candidate.confidence)
metadata["extraction_confidence"] = confidence_float
metadata["confidence"] = confidence_float # Alternative key

# Store info about how it was extracted
metadata["pattern_type"] = best_candidate.metadata.get("pattern_type", "unknown")
metadata.update({
"extraction_method": best_candidate.pattern_name,
"method": best_candidate.pattern_name, # Alternative key for backward compatibility
"extraction_confidence": float(best_candidate.confidence),
"confidence": float(best_candidate.confidence), # Alternative key
"pattern_type": best_candidate.metadata.get("pattern_type", "unknown")
})

# Add alternative candidates info if available
if len(candidates) > 1:
alt_answers = [
metadata["alternative_answers"] = [
{
"text": c.text,
"method": c.pattern_name,
"confidence": float(c.confidence)
}
for c in candidates[1:3] # Just include top alternatives
]
metadata["alternative_answers"] = alt_answers

return TaskResult(
question=example.question,
Expand Down