Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
outputs/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
Expand Down Expand Up @@ -205,3 +207,5 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/

uv.lock
58 changes: 58 additions & 0 deletions environments/medbullets/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# medbullets

### Overview
- **Environment ID**: `medbullets`
- **Short description**: USMLE-style multiple-choice questions from Medbullets.
- **Tags**: medical, clinical, single-turn, multiple-choice, USMLE, train, evaluation

### Datasets
- **Primary dataset(s)**: `Medbullets-4` and `Medbullets-5`
- **Source links**: [Paper](https://arxiv.org/pdf/2402.18060), [Github](https://github.com/HanjieChen/ChallengeClinicalQA), [HF Dataset](https://huggingface.co/datasets/mkieffer/Medbullets)
- **Split sizes**:

| Split | Choices | Count |
| ----------- | --------------- | ------- |
| `op4_test` | {A, B, C, D} | **308** |
| `op5_test` | {A, B, C, D, E} | **308** |

`op5_test` contains the same content as `op4_test`, but with one additional answer choice to increase difficulty. Note that while the content is the same, the letter choice corresponding to the correct answer is sometimes different between these splits.


### Task
- **Type**: single-turn
- **Parser**: `Parser` or `ThinkParser`, with `extract_fn=extract_boxed_answer` for strict letter-in-\boxed{}-format parsing
- **Rubric overview**: Binary scoring based on correctly boxed letter choice and optional think tag formatting

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval medbullets
```

Configure model and sampling:

```bash
uv run vf-eval medbullets \
-m gpt-4.1-mini \
-n -1 -r 3 -t 1024 -T 0.7 \
-a '{"use_think": false, "num_options": 4, "shuffle": true}'
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments

| Arg | Type | Default | Description |
| -------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `num_options` | int | `4` | Number of options: `4` → {A, B, C, D}; `5` → {A, B, C, D, E} |
| `use_think`          | bool | `False` | Whether to check for `<think>...</think>` formatting with `ThinkParser` |
| `shuffle` | bool | `False` | Whether to shuffle answer choices |


### Metrics

| Metric | Meaning |
| ------ | ------- |
| `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0 |
164 changes: 164 additions & 0 deletions environments/medbullets/medbullets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import hashlib
import random
import re

import verifiers as vf
from datasets import Dataset, load_dataset
from datasets.utils.logging import disable_progress_bar
from verifiers.utils.data_utils import (
    BOXED_SYSTEM_PROMPT,
    THINK_BOXED_SYSTEM_PROMPT,
    extract_boxed_answer,
)

disable_progress_bar()  # suppress datasets mapping progress bar


def _build_question(question: str, options: dict[str, str]) -> str:
opts = "\n".join(f"{k}. {v}" for k, v in options.items())
return f"Question: {question}\n\n{opts}"


def _to_vf_format(ds: Dataset, num_options: int, shuffle: bool) -> Dataset:
    """
    Shape each row for SingleTurnEnv's defaults:
      - 'question': string the env will turn into chat messages
      - 'answer': top-level gold letter (A/B/C/D[/E])
      - 'info': keep all original fields for bookkeeping

    Rows whose gold answer is not a single letter in A-E are dropped.

    Args:
      - num_options: 4 or 5; if 4, strips out option "E"
      - shuffle: whether to (deterministically) shuffle the answer choices
    """
    VALID = ("A", "B", "C", "D", "E")

    def _has_valid_answer(row: dict) -> bool:
        # Dataset.map requires the mapped fn to return a dict, so invalid
        # rows must be dropped *before* mapping (returning None from the
        # map fn raises inside `datasets`).
        return (row.get("answer") or "").strip().upper() in VALID

    def _format_row(row: dict) -> dict:
        question = row.get("question", "") or ""  # question stem
        opts = (
            row.get("options", {}) or {}
        )  # answer choices, map of letter to answer text

        # strip option E if num_options == 4
        if num_options == 4:
            opts = {k: v for k, v in opts.items() if k != "E"}

        # lift the answer to top-level, normalized to a single letter
        # (guaranteed to be in VALID by the filter above)
        answer_letter = (row.get("answer") or "").strip().upper()

        # shuffle answer choices if requested
        if shuffle and answer_letter in opts:
            option_pairs = list(opts.items())

            # seed from a stable digest of the question text so the shuffle
            # is reproducible across runs and processes; builtin hash() on
            # strings is randomized per-process (PYTHONHASHSEED) and must
            # not be used for a "deterministic" seed
            digest = hashlib.sha256(question.encode("utf-8")).digest()
            rng = random.Random(int.from_bytes(digest[:4], "big"))
            rng.shuffle(option_pairs)

            # rebuild options dict with new letter assignments
            letters = VALID[: len(option_pairs)]
            opts = {letters[i]: text for i, (_, text) in enumerate(option_pairs)}

            # relocate the gold answer by its ORIGINAL letter, not by its
            # text: option texts are not guaranteed unique
            answer_letter = next(
                letters[i]
                for i, (orig_letter, _) in enumerate(option_pairs)
                if orig_letter == answer_letter
            )

        instruction = "The following are multiple choice questions (with answers) about health. Think step by step and then output the single letter answer at the end like \\boxed{A}.\n\n"
        prompt = instruction + _build_question(question, opts)

        # keep the full source row for debugging; reflect any shuffle so
        # info stays consistent with the top-level fields
        info = dict(row)
        if shuffle:
            info["answer"] = answer_letter
            info["options"] = opts

        return {
            "question": prompt,
            "answer": answer_letter,
            "info": info,
        }

    valid_ds = ds.filter(_has_valid_answer)
    return valid_ds.map(_format_row, remove_columns=valid_ds.column_names)


def load_environment(
    num_options: int = 4, use_think: bool = False, shuffle: bool = False, **kwargs
) -> vf.Environment:
    """
    Single-turn Medbullets environment using HuggingFace `mkieffer/Medbullets` dataset.

    Each example is normalized to the fields expected by `vf.SingleTurnEnv`:
        {
            "question": "<stem + formatted options>",  # string used as the user prompt
            "answer": "<A|B|C|D|E>",                   # top-level gold letter
            "info": { ...original example fields... }  # full source row for debugging
        }

    Args:
        num_options: 4 loads split `op4_test`; 5 loads split `op5_test`.
        use_think: if True, use `ThinkParser` with the think-style system prompt.
        shuffle: whether to shuffle answer choices per question.

    Raises:
        ValueError: if `num_options` is not 4 or 5.

    The parser extracts \\boxed{A|B|C|D|E} from completions; the reward is
    1.0 on an exact match between parsed letter and gold letter, else 0.0.
    """

    # -------- load dataset --------
    if num_options == 4:
        # 4 options: {"A", "B", "C", "D"}
        test_raw = load_dataset("mkieffer/Medbullets", split="op4_test")
    elif num_options == 5:
        # 5 options: {"A", "B", "C", "D", "E"}
        test_raw = load_dataset("mkieffer/Medbullets", split="op5_test")
    else:
        raise ValueError("'num_options' must be 4 or 5")

    test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle)
    del test_raw  # free memory

    parser = (
        vf.ThinkParser(extract_fn=extract_boxed_answer)
        if use_think
        else vf.Parser(extract_fn=extract_boxed_answer)
    )
    system_prompt = THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT

    # compile once here instead of on every reward call
    text_wrapper_re = re.compile(r"\\text\{(.+)\}")
    # matches: "H", "H.", "H:", "(H)", "(H).", "H. Some text", "(A) Some text", etc.
    leading_letter_re = re.compile(r"^\(?([A-J])\)?(?:[.:\s]|$)")

    def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
        """Return 1.0 iff the parsed boxed letter equals the gold letter."""
        response = parser.parse_answer(completion) or ""
        response = response.strip()

        # remove \text{...} wrapper if present
        text_match = text_wrapper_re.match(response)
        if text_match:
            response = text_match.group(1).strip()

        # try to extract a letter at the beginning of the response
        letter_match = leading_letter_re.match(response)
        if letter_match:
            extracted_letter = letter_match.group(1)
            return 1.0 if extracted_letter.upper() == answer.upper() else 0.0
        return 0.0

    rubric = vf.Rubric(
        funcs=[correct_answer_reward_func],
        weights=[1.0],
        parser=parser,
    )

    return vf.SingleTurnEnv(
        eval_dataset=test_ds,
        system_prompt=system_prompt,
        parser=parser,
        rubric=rubric,
        **kwargs,
    )
21 changes: 21 additions & 0 deletions environments/medbullets/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[project]
name = "medbullets"
description = "Single-turn medical MCQ"
tags = ["medical", "clinical", "single-turn", "multiple-choice", "usmle", "train", "evaluation"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.3.post0",
]

[tool.prime.environment]
loader = "medbullets:load_environment"
display_name = "Medbullets"
visibility = "PUBLIC"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["medbullets.py"]