Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
*.7z

# PyInstaller
# Usually these files are written by a python script from a template
Expand All @@ -52,7 +53,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/

analysis_output/
# Translations
*.mo
*.pot
Expand Down
74 changes: 73 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,76 @@ export MEDARC_DISABLE_TOKEN_TRACKING=true
- If provider doesn't return usage data, defaults to 0
- Model tokens include all inference API calls
- Judge tokens include all LLM-as-judge calls via `judge()` method (e.g., FactScore: 6-20 verification calls per example)
- **Note**: Some judge implementations (e.g., FactScore claim extraction) make additional API calls (claim extraction) that are currently not tracked not part of judge() calls or get stored in state["responses"]. These represent a small overhead (~10-20% of total judge tokens) and are present in existing implementations like MedRedQA, keep in mind when calculating.
- **Note**: Some judge implementations (e.g., FactScore claim extraction) make additional API calls (claim extraction) that are currently not tracked — they are neither part of `judge()` calls nor stored in `state["responses"]`. These represent a small overhead (~10-20% of total judge tokens) and are present in existing implementations like MedRedQA; keep this in mind when calculating token totals.

## MCQ Answer Analysis

Post-hoc analysis of MCQ benchmark results. Extracts model answers using the same parsing pipeline as evaluation, generates confusion matrices, and computes cross-rollout consistency metrics.

### Usage

```bash
# Single model
python scripts/mcq_answer_analysis.py \
--logs-dir /path/to/model_results \
--output-dir ./analysis_output \
--model model-name

# All models
python scripts/mcq_answer_analysis.py \
--logs-dir /path/to/all_models \
--output-dir ./analysis_output \
--all-models

# Specific benchmark only
python scripts/mcq_answer_analysis.py \
--logs-dir /path/to/all_models \
--output-dir ./analysis_output \
--all-models \
--benchmark medqa
```

### Output Files

| File | Description |
|------|-------------|
| `{model}_{benchmark}.csv` | Per-example analysis with parsed answers |
| `{model}_{benchmark}_confusion.csv` | Confusion matrix (correct → predicted) |
| `{model}_{benchmark}_rollouts.csv` | Cross-rollout comparison (if multiple rollouts) |
| `{model}_summary.json` | Aggregate statistics per benchmark |
| `{model}_benchmark_metrics.csv` | Summary metrics table |
| `all_models_metrics.csv` | Cross-model comparison (with `--all-models`) |

### Metrics

- **Accuracy**: Standard correctness rate
- **Parsing success rate**: % of completions with extracted answer
- **Variation rate**: % of questions with different answers across rollouts
- **Semantic consistency**: Among varied answers, % with same answer text (different letter, same content)
- **Positional bias**: Per-position selection rate vs ground truth distribution

### Model Parsed Answer Logging

New evaluations automatically log parsed answers to `info` dict in `results.jsonl`:

```json
{
"info": {
"model_parsed_answer": "B",
"parsing_method": "anchored_token"
}
}
```

This enables exact reproducibility in post-hoc analysis. For older results without logging, the analysis script applies the full parsing pipeline (environment-specific XML/boxed extraction → MCQ answer parsing).

To enable logging in custom environments, pass `info=info` to `multiple_choice_accuracy()`:

```python
is_correct = multiple_choice_accuracy(
llm_answer=parsed,
answer_letter=answer,
answer_text=answer_text,
info=info # Enables parsed answer logging
)
```
2 changes: 1 addition & 1 deletion environments/careqa/careqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = Non
"""Reward based on shared multiple-choice accuracy grading."""
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0


Expand Down
2 changes: 1 addition & 1 deletion environments/head_qa_v2/head_qa_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def cot_prompt(example: dict[str, Any]) -> dict[str, Any]:
def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text") if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0


Expand Down
2 changes: 1 addition & 1 deletion environments/longhealth/longhealth.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict | None
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("correct_answer_text", None) if info else None
is_correct = multiple_choice_accuracy(
llm_answer=parsed, answer_letter=answer, answer_text=answer_text, prefix="The correct answer is"
llm_answer=parsed, answer_letter=answer, answer_text=answer_text, prefix="The correct answer is", info=info
)
return 1.0 if is_correct else 0.0

Expand Down
2 changes: 1 addition & 1 deletion environments/m_arc/m_arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def load_environment(
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/med_mcqa/med_mcqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def _map_example(example: dict[str, Any]) -> dict[str, Any] | None:
def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/medbullets/medbullets.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def load_environment(
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/medconceptsqa/medconceptsqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def _map(row: dict, idx: int | None = None) -> dict:
def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
4 changes: 2 additions & 2 deletions environments/medexqa/medexqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def _is_correct(parser, completion, answer: str, info: dict | None = None) -> bo
completion_text = completion or ""
parsed = parser.parse_answer(completion) or completion_text
answer_text = (info or {}).get("answer_text", "")
return multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
return multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)

def combined_reward(parser, completion, answer, **kwargs) -> float:
"""Gate explanation scoring on MCQ correctness."""
Expand Down Expand Up @@ -322,7 +322,7 @@ async def combined_judge_reward(judge, prompt, completion, answer, state: State,
model_rational = getattr(parsed, "explanation", None)

is_correct = multiple_choice_accuracy(
llm_answer=model_answer, answer_letter=answer, answer_text=answer_text
llm_answer=model_answer, answer_letter=answer, answer_text=answer_text, info=info
)

if not is_correct:
Expand Down
2 changes: 1 addition & 1 deletion environments/medqa/medqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = Non
"""Reward based on shared multiple-choice accuracy grading."""
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0


Expand Down
2 changes: 1 addition & 1 deletion environments/medxpertqa/medxpertqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def _map(example: dict) -> dict:
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/metamedqa/metamedqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _map(ex: dict, idx: int | None = None):
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
2 changes: 1 addition & 1 deletion environments/mmlu_pro_health/mmlu_pro_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def _convert_options(row: dict) -> dict:
def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
parsed = parser.parse_answer(completion) or ""
answer_text = info.get("answer_text", None) if info else None
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text)
is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text, info=info)
return 1.0 if is_correct else 0.0

rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
Expand Down
1 change: 1 addition & 0 deletions environments/pubmedqa/pubmedqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = Non
llm_answer=parsed,
answer_letter=answer,
answer_text=answer_text,
info=info,
)
return 1.0 if is_correct else 0.0

Expand Down
74 changes: 59 additions & 15 deletions medarc_verifiers/rewards/multiple_choice_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ class MCQAccuracyResult:
correct_answer: Optional[str] = None
"""The correct answer for reference, if available."""

parsed_answer: Optional[str] = None
"""Parsed answer token (letter/number) extracted from the model output, if available."""


def _nfkc_casefold(text: str) -> str:
"""Unicode normalize + casefold for robust text comparison."""
Expand Down Expand Up @@ -213,6 +216,7 @@ def multiple_choice_accuracy(
accept_answer_text: bool = True,
strip_tex: bool = True,
return_details: bool = False,
info: Optional[dict] = None,
) -> bool | MCQAccuracyResult:
"""
Grade a multiple-choice answer with layered strategies:
Expand All @@ -230,26 +234,50 @@ def multiple_choice_accuracy(
accept_answer_text: Whether to fall back to text matching
strip_tex: Whether to strip LaTeX formatting
return_details: If True, return MCQAccuracyResult dataclass instead of bool

Returns:
bool (if return_details=False) or MCQAccuracyResult (if return_details=True)
"""

def _result(
is_correct: bool, method: str, predicted: str | None, actual: str | None, return_details: bool
is_correct: bool,
method: str,
matched: str | None,
actual: str | None,
return_details: bool,
parsed: str | None = None,
log_method: str | None = None,
) -> bool | MCQAccuracyResult:
"""Helper to format return value."""
"""Helper to format return value.

Args:
is_correct: Whether the answer was graded as correct
method: The parsing method for MCQAccuracyResult (original behavior)
matched: The answer that matched correctly (None if incorrect)
actual: The correct answer letter
return_details: Whether to return MCQAccuracyResult or bool
parsed: What the model actually said (regardless of correctness)
log_method: The actual parsing method for info dict logging (defaults to method)
"""
# Log parsed answer to info dict if provided
if info is not None:
info["model_parsed_answer"] = parsed
info["parsing_method"] = log_method if log_method is not None else method

if not return_details:
return is_correct
return MCQAccuracyResult(

result = MCQAccuracyResult(
is_correct=is_correct,
method=method,
matched_answer=predicted,
matched_answer=matched,
correct_answer=actual,
parsed_answer=parsed,
)

return result

if not llm_answer:
return _result(False, "none", None, None, return_details)
return _result(False, "none", None, None, return_details, parsed=None)

# Normalize the response
llm_answer = _remove_think_tags(llm_answer)
Expand All @@ -269,19 +297,28 @@ def _result(
raise ValueError(f"Invalid answer_letter '{answer_letter=}'. Must be a single letter or digit string.")

explicit_choice_found = False
model_predicted = None # Track what the model actually said
parse_method = "none"

# Strategy 1: Only answer letter anywhere (without anchoring)
if answer_letter == _norm_letter(llm_answer):
return _result(True, "direct_answer", llm_answer, answer_letter, return_details)
normalized_llm = _norm_letter(llm_answer)
if normalized_llm and len(llm_answer.strip()) <= 3:
model_predicted = normalized_llm
parse_method = "direct_answer"
if normalized_llm == answer_letter:
return _result(True, "direct_answer", llm_answer, answer_letter, return_details, parsed=normalized_llm)

# Strategy 2: Accept leading option token like "B. answer ..."
leading_match = LEADING_OPTION_PATTERN.match(llm_answer_original)
if leading_match and answer_letter:
predicted = _norm_letter(leading_match.group(1))
if predicted and model_predicted is None:
model_predicted = predicted
parse_method = "anchored_token"
if _token_kind_matches_answer_letter(predicted, answer_letter):
explicit_choice_found = True
if predicted == answer_letter:
return _result(True, "anchored_token", predicted, answer_letter, return_details)
return _result(True, "anchored_token", predicted, answer_letter, return_details, parsed=predicted)

# Strategy 3: Anchored token (prefix matches first, fallback to generic anchors)
prefix_matches = []
Expand All @@ -290,7 +327,7 @@ def _result(
if prefix_norm:
flexible_prefix = re.escape(prefix_norm).replace(r"\ ", r"\s+")
prefix_pattern = re.compile(
rf"{flexible_prefix}\s*[:\-–—]?\s*(?:is\s*)?(?P<neg>not\s+|isn[']t\s+)?\(?\s*(?P<opt>[A-Za-z]|\d{{1,2}})\s*[\)\.:]?(?![\w+\-/])",
rf"{flexible_prefix}\s*[:\-–—]?\s*(?:is\s*)?(?P<neg>not\s+|isn['']t\s+)?\(?\s*(?P<opt>[A-Za-z]|\d{{1,2}})\s*[\)\.:]?(?![\w+\-/])",
re.IGNORECASE,
)
prefix_matches = list(prefix_pattern.finditer(llm_answer))
Expand All @@ -299,10 +336,13 @@ def _result(
if anchored_matches and answer_letter:
last_match = anchored_matches[-1]
predicted = _norm_letter(last_match.group("opt"))
if predicted and last_match.group("neg") is None:
model_predicted = predicted
parse_method = "anchored_token"
if last_match.group("neg") is None and _token_kind_matches_answer_letter(predicted, answer_letter):
explicit_choice_found = True
if predicted == answer_letter and last_match.group("neg") is None:
return _result(True, "anchored_token", predicted, answer_letter, return_details)
return _result(True, "anchored_token", predicted, answer_letter, return_details, parsed=predicted)

# Strategy 4: Last token in the answer tail, ignore negative contexts like "C is incorrect",
if not explicit_choice_found and answer_letter:
Expand All @@ -318,8 +358,12 @@ def _result(
continue
if _negative_after_option(tail, token_match):
continue
if model_predicted is None:
model_predicted = predicted
parse_method = "last_token"
if predicted == answer_letter:
return _result(True, "last_token", predicted, answer_letter, return_details)
return _result(True, "last_token", predicted, answer_letter, return_details, parsed=predicted)
break # Take the first valid token we find

# Strategy 5: Exact answer text match if there's no explicit choice found
# Only search at beginning and end to avoid matching reasoning in the middle
Expand All @@ -343,11 +387,11 @@ def _result(
# Check beginning first
match = pattern.search(beginning_region)
if match and not _negated_near(beginning_region, match):
return _result(True, "answer_text", beginning_region, answer_text, return_details)
return _result(True, "answer_text", beginning_region, answer_text, return_details, parsed=model_predicted)

# Then check end (after reasoning)
match = pattern.search(end_region)
if match and not _negated_near(end_region, match):
return _result(True, "answer_text", end_region, answer_text, return_details)
return _result(True, "answer_text", end_region, answer_text, return_details, parsed=model_predicted)

return _result(False, "none", None, None, return_details)
return _result(False, "none", None, None, return_details, parsed=model_predicted, log_method=parse_method)
Loading