Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
*.7z

# PyInstaller
# Usually these files are written by a python script from a template
Expand All @@ -52,7 +53,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/

analysis_output/
# Translations
*.mo
*.pot
Expand Down
74 changes: 73 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,76 @@ export MEDARC_DISABLE_TOKEN_TRACKING=true
- If provider doesn't return usage data, defaults to 0
- Model tokens include all inference API calls
- Judge tokens include all LLM-as-judge calls via `judge()` method (e.g., FactScore: 6-20 verification calls per example)
- **Note**: Some judge implementations (e.g., FactScore claim extraction) make additional API calls (claim extraction) that are currently not tracked not part of judge() calls or get stored in state["responses"]. These represent a small overhead (~10-20% of total judge tokens) and are present in existing implementations like MedRedQA, keep in mind when calculating.
- **Note**: Some judge implementations (e.g., FactScore claim extraction) make additional API calls (claim extraction) that are currently not tracked — they are neither part of `judge()` calls nor stored in `state["responses"]`. These represent a small overhead (~10-20% of total judge tokens) and are present in existing implementations like MedRedQA; keep this in mind when calculating token totals.

## MCQ Answer Analysis

Post-hoc analysis of MCQ benchmark results. Extracts model answers using the same parsing pipeline as evaluation, generates confusion matrices, and computes cross-rollout consistency metrics.

### Usage

```bash
# Single model
python scripts/mcq_answer_analysis.py \
--logs-dir /path/to/model_results \
--output-dir ./analysis_output \
--model model-name

# All models
python scripts/mcq_answer_analysis.py \
--logs-dir /path/to/all_models \
--output-dir ./analysis_output \
--all-models

# Specific benchmark only
python scripts/mcq_answer_analysis.py \
--logs-dir /path/to/all_models \
--output-dir ./analysis_output \
--all-models \
--benchmark medqa
```

### Output Files

| File | Description |
|------|-------------|
| `{model}_{benchmark}.csv` | Per-example analysis with parsed answers |
| `{model}_{benchmark}_confusion.csv` | Confusion matrix (correct → predicted) |
| `{model}_{benchmark}_rollouts.csv` | Cross-rollout comparison (if multiple rollouts) |
| `{model}_summary.json` | Aggregate statistics per benchmark |
| `{model}_benchmark_metrics.csv` | Summary metrics table |
| `all_models_metrics.csv` | Cross-model comparison (with `--all-models`) |

### Metrics

- **Accuracy**: Standard correctness rate
- **Parsing success rate**: % of completions with extracted answer
- **Variation rate**: % of questions with different answers across rollouts
- **Semantic consistency**: Among varied answers, % with same answer text (different letter, same content)
- **Positional bias**: Per-position selection rate vs ground truth distribution

### Model Parsed Answer Logging

New evaluations automatically log parsed answers to `info` dict in `results.jsonl`:

```json
{
"info": {
"model_parsed_answer": "B",
"parsing_method": "anchored_token"
}
}
```

This enables exact reproducibility in post-hoc analysis. For older results without logging, the analysis script applies the full parsing pipeline (environment-specific XML/boxed extraction → MCQ answer parsing).

To enable logging in custom environments, pass `info=info` to `multiple_choice_accuracy()`:

```python
is_correct = multiple_choice_accuracy(
llm_answer=parsed,
answer_letter=answer,
answer_text=answer_text,
info=info # Enables parsed answer logging
)
```
2 changes: 1 addition & 1 deletion environments/careqa/careqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = Non
"""Reward based on shared multiple-choice accuracy grading."""
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0


Expand Down
2 changes: 1 addition & 1 deletion environments/head_qa_v2/head_qa_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def cot_prompt(example: dict[str, Any]) -> dict[str, Any]:
def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text") if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0


Expand Down
2 changes: 1 addition & 1 deletion environments/longhealth/longhealth.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict | None
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("correct_answer_text", None) if info else None
is_correct = multiple_choice_accuracy(
llm_answer=parsed, answer_letter=answer, answer_text=answer_text, prefix="The correct answer is"
llm_answer=parsed, answer_letter=answer, answer_text=answer_text, prefix="The correct answer is", info=info
)
return 1.0 if is_correct else 0.0

Expand Down
2 changes: 1 addition & 1 deletion environments/m_arc/m_arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def load_environment(
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/med_mcqa/med_mcqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def _map_example(example: dict[str, Any]) -> dict[str, Any] | None:
def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/medbullets/medbullets.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def load_environment(
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/medconceptsqa/medconceptsqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def _map(row: dict, idx: int | None = None) -> dict:
def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
4 changes: 2 additions & 2 deletions environments/medexqa/medexqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def _is_correct(parser, completion, answer: str, info: dict | None = None) -> bo
completion_text = completion or ""
parsed = parser.parse_answer(completion) or completion_text
answer_text = (info or {}).get("answer_text", "")
return multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
return multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)

def combined_reward(parser, completion, answer, **kwargs) -> float:
"""Gate explanation scoring on MCQ correctness."""
Expand Down Expand Up @@ -322,7 +322,7 @@ async def combined_judge_reward(judge, prompt, completion, answer, state: State,
model_rational = getattr(parsed, "explanation", None)

is_correct = multiple_choice_accuracy(
llm_answer=model_answer, answer_letter=answer, answer_text=answer_text
llm_answer=model_answer, answer_letter=answer, answer_text=answer_text, info=info
)

if not is_correct:
Expand Down
2 changes: 1 addition & 1 deletion environments/medqa/medqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = Non
"""Reward based on shared multiple-choice accuracy grading."""
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0


Expand Down
2 changes: 1 addition & 1 deletion environments/medxpertqa/medxpertqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def _map(example: dict) -> dict:
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/metamedqa/metamedqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _map(ex: dict, idx: int | None = None):
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/mmlu_pro_health/mmlu_pro_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def _convert_options(row: dict) -> dict:
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
1 change: 1 addition & 0 deletions environments/pubmedqa/pubmedqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = Non
llm_answer=parsed,
answer_letter=answer,
answer_text=answer_text,
info=info,
)
return 1.0 if is_correct else 0.0

Expand Down
74 changes: 59 additions & 15 deletions medarc_verifiers/rewards/multiple_choice_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ class MCQAccuracyResult:
correct_answer: Optional[str] = None
"""The correct answer for reference, if available."""

parsed_answer: Optional[str] = None
"""Parsed answer token (letter/number) extracted from the model output, if available."""


def _nfkc_casefold(text: str) -> str:
"""Unicode normalize + casefold for robust text comparison."""
Expand Down Expand Up @@ -213,6 +216,7 @@ def multiple_choice_accuracy(
accept_answer_text: bool = True,
strip_tex: bool = True,
return_details: bool = False,
info: Optional[dict] = None,
) -> bool | MCQAccuracyResult:
"""
Grade a multiple-choice answer with layered strategies:
Expand All @@ -230,26 +234,50 @@ def multiple_choice_accuracy(
accept_answer_text: Whether to fall back to text matching
strip_tex: Whether to strip LaTeX formatting
return_details: If True, return MCQAccuracyResult dataclass instead of bool

Returns:
bool (if return_details=False) or MCQAccuracyResult (if return_details=True)
"""

def _result(
is_correct: bool, method: str, predicted: str | None, actual: str | None, return_details: bool
is_correct: bool,
method: str,
matched: str | None,
actual: str | None,
return_details: bool,
parsed: str | None = None,
log_method: str | None = None,
) -> bool | MCQAccuracyResult:
"""Helper to format return value."""
"""Helper to format return value.

Args:
is_correct: Whether the answer was graded as correct
method: The parsing method for MCQAccuracyResult (original behavior)
matched: The answer that matched correctly (None if incorrect)
actual: The correct answer letter
return_details: Whether to return MCQAccuracyResult or bool
parsed: What the model actually said (regardless of correctness)
log_method: The actual parsing method for info dict logging (defaults to method)
"""
# Log parsed answer to info dict if provided
if info is not None:
info["model_parsed_answer"] = parsed
info["parsing_method"] = log_method if log_method is not None else method

if not return_details:
return is_correct
return MCQAccuracyResult(

result = MCQAccuracyResult(
is_correct=is_correct,
method=method,
matched_answer=predicted,
matched_answer=matched,
correct_answer=actual,
parsed_answer=parsed,
)

return result

if not llm_answer:
return _result(False, "none", None, None, return_details)
return _result(False, "none", None, None, return_details, parsed=None)

# Normalize the response
llm_answer = _remove_think_tags(llm_answer)
Expand All @@ -269,19 +297,28 @@ def _result(
raise ValueError(f"Invalid answer_letter '{answer_letter=}'. Must be a single letter or digit string.")

explicit_choice_found = False
model_predicted = None # Track what the model actually said
parse_method = "none"

# Strategy 1: Only answer letter anywhere (without anchoring)
if answer_letter == _norm_letter(llm_answer):
return _result(True, "direct_answer", llm_answer, answer_letter, return_details)
normalized_llm = _norm_letter(llm_answer)
if normalized_llm and len(llm_answer.strip()) <= 3:
model_predicted = normalized_llm
parse_method = "direct_answer"
if normalized_llm == answer_letter:
return _result(True, "direct_answer", llm_answer, answer_letter, return_details, parsed=normalized_llm)

# Strategy 2: Accept leading option token like "B. answer ..."
leading_match = LEADING_OPTION_PATTERN.match(llm_answer_original)
if leading_match and answer_letter:
predicted = _norm_letter(leading_match.group(1))
if predicted and model_predicted is None:
model_predicted = predicted
parse_method = "anchored_token"
if _token_kind_matches_answer_letter(predicted, answer_letter):
explicit_choice_found = True
if predicted == answer_letter:
return _result(True, "anchored_token", predicted, answer_letter, return_details)
return _result(True, "anchored_token", predicted, answer_letter, return_details, parsed=predicted)

# Strategy 3: Anchored token (prefix matches first, fallback to generic anchors)
prefix_matches = []
Expand All @@ -290,7 +327,7 @@ def _result(
if prefix_norm:
flexible_prefix = re.escape(prefix_norm).replace(r"\ ", r"\s+")
prefix_pattern = re.compile(
rf"{flexible_prefix}\s*[:\-–—]?\s*(?:is\s*)?(?P<neg>not\s+|isn[']t\s+)?\(?\s*(?P<opt>[A-Za-z]|\d{{1,2}})\s*[\)\.:]?(?![\w+\-/])",
rf"{flexible_prefix}\s*[:\-–—]?\s*(?:is\s*)?(?P<neg>not\s+|isn['']t\s+)?\(?\s*(?P<opt>[A-Za-z]|\d{{1,2}})\s*[\)\.:]?(?![\w+\-/])",
re.IGNORECASE,
)
prefix_matches = list(prefix_pattern.finditer(llm_answer))
Expand All @@ -299,10 +336,13 @@ def _result(
if anchored_matches and answer_letter:
last_match = anchored_matches[-1]
predicted = _norm_letter(last_match.group("opt"))
if predicted and last_match.group("neg") is None:
model_predicted = predicted
parse_method = "anchored_token"
if last_match.group("neg") is None and _token_kind_matches_answer_letter(predicted, answer_letter):
explicit_choice_found = True
if predicted == answer_letter and last_match.group("neg") is None:
return _result(True, "anchored_token", predicted, answer_letter, return_details)
return _result(True, "anchored_token", predicted, answer_letter, return_details, parsed=predicted)

# Strategy 4: Last token in the answer tail, ignore negative contexts like "C is incorrect",
if not explicit_choice_found and answer_letter:
Expand All @@ -318,8 +358,12 @@ def _result(
continue
if _negative_after_option(tail, token_match):
continue
if model_predicted is None:
model_predicted = predicted
parse_method = "last_token"
if predicted == answer_letter:
return _result(True, "last_token", predicted, answer_letter, return_details)
return _result(True, "last_token", predicted, answer_letter, return_details, parsed=predicted)
break # Take the first valid token we find

# Strategy 5: Exact answer text match if there's no explicit choice found
# Only search at beginning and end to avoid matching reasoning in the middle
Expand All @@ -343,11 +387,11 @@ def _result(
# Check beginning first
match = pattern.search(beginning_region)
if match and not _negated_near(beginning_region, match):
return _result(True, "answer_text", beginning_region, answer_text, return_details)
return _result(True, "answer_text", beginning_region, answer_text, return_details, parsed=model_predicted)

# Then check end (after reasoning)
match = pattern.search(end_region)
if match and not _negated_near(end_region, match):
return _result(True, "answer_text", end_region, answer_text, return_details)
return _result(True, "answer_text", end_region, answer_text, return_details, parsed=model_predicted)

return _result(False, "none", None, None, return_details)
return _result(False, "none", None, None, return_details, parsed=model_predicted, log_method=parse_method)
Loading