Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
outputs/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
Expand Down Expand Up @@ -205,3 +207,5 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/

uv.lock
58 changes: 58 additions & 0 deletions environments/medbullets/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# medbullets

### Overview
- **Environment ID**: `medbullets`
- **Short description**: USMLE-style multiple-choice questions from Medbullets.
- **Tags**: medical, clinical, single-turn, multiple-choice, USMLE, train, evaluation

### Datasets
- **Primary dataset(s)**: `Medbullets-4` and `Medbullets-5`
- **Source links**: [Paper](https://arxiv.org/pdf/2402.18060), [Github](https://github.com/HanjieChen/ChallengeClinicalQA), [HF Dataset](https://huggingface.co/datasets/mkieffer/Medbullets)
- **Split sizes**:

| Split | Choices | Count |
| ----------- | --------------- | ------- |
| `op4_test` | {A, B, C, D} | **308** |
| `op5_test` | {A, B, C, D, E} | **308** |

`op5_test` contains the same content as `op4_test`, but with one additional answer choice to increase difficulty. Note that while the content is the same, the letter choice corresponding to the correct answer is sometimes different between these splits.


### Task
- **Type**: single-turn
- **Parser**: `Parser` or `ThinkParser`, with `extract_fn=extract_boxed_answer` for strict letter-in-\boxed{}-format parsing
- **Rubric overview**: Binary scoring based on correctly boxed letter choice and optional think tag formatting

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval medbullets
```

Configure model and sampling:

```bash
uv run vf-eval medbullets \
-m gpt-4.1-mini \
-n -1 -r 3 -t 1024 -T 0.7 \
-a '{"use_think": false, "num_options": 4, "shuffle": true}'
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.

### Environment Arguments

| Arg | Type | Default | Description |
| -------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `num_options` | int | `4` | Number of options: `4` → {A, B, C, D}; `5` → {A, B, C, D, E} |
| `use_think`          | bool | `False` | Whether to check for `<think>...</think>` formatting with `ThinkParser` |
| `shuffle` | bool | `False` | Whether to shuffle answer choices |


### Metrics

| Metric | Meaning |
| ------ | ------- |
| `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0 |
164 changes: 164 additions & 0 deletions environments/medbullets/medbullets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import hashlib
import random
import re

import verifiers as vf
from datasets import Dataset, load_dataset
from datasets.utils.logging import disable_progress_bar
from verifiers.utils.data_utils import (
    BOXED_SYSTEM_PROMPT,
    THINK_BOXED_SYSTEM_PROMPT,
    extract_boxed_answer,
)

disable_progress_bar()  # suppress datasets mapping progress bar


def _build_question(question: str, options: dict[str, str]) -> str:
opts = "\n".join(f"{k}. {v}" for k, v in options.items())
return f"Question: {question}\n\n{opts}"


def _to_vf_format(ds: Dataset, num_options: int, shuffle: bool) -> Dataset:
    """
    Shape each row for SingleTurnEnv's defaults:
      - 'question': string the env will turn into chat messages
      - 'answer': top-level gold letter (A/B/C/D[/E])
      - 'info': keep all original fields for bookkeeping

    Rows whose gold answer is not a single letter in A-E are dropped.

    Args:
      - num_options: 4 or 5; if 4, strips out option "E"
      - shuffle: whether to (deterministically) shuffle the answer choices
    """
    VALID = ("A", "B", "C", "D", "E")

    def _has_valid_answer(row: dict) -> bool:
        # Dataset.map requires the mapped fn to return a dict, so invalid
        # rows must be dropped *before* mapping (returning None from the
        # map fn raises inside `datasets`).
        return (row.get("answer") or "").strip().upper() in VALID

    def _format_row(row: dict) -> dict:
        question = row.get("question", "") or ""  # question stem
        opts = (
            row.get("options", {}) or {}
        )  # answer choices, map of letter to answer text

        # strip option E if num_options == 4
        if num_options == 4:
            opts = {k: v for k, v in opts.items() if k != "E"}

        # lift the answer to top-level, normalized to a single letter
        # (guaranteed to be in VALID by the filter above)
        answer_letter = (row.get("answer") or "").strip().upper()

        # shuffle answer choices if requested
        if shuffle and answer_letter in opts:
            option_pairs = list(opts.items())

            # seed from a stable digest of the question text so the shuffle
            # is reproducible across runs and processes; builtin hash() on
            # strings is randomized per-process (PYTHONHASHSEED) and must
            # not be used for a "deterministic" seed
            digest = hashlib.sha256(question.encode("utf-8")).digest()
            rng = random.Random(int.from_bytes(digest[:4], "big"))
            rng.shuffle(option_pairs)

            # rebuild options dict with new letter assignments
            letters = VALID[: len(option_pairs)]
            opts = {letters[i]: text for i, (_, text) in enumerate(option_pairs)}

            # relocate the gold answer by its ORIGINAL letter, not by its
            # text: option texts are not guaranteed unique
            answer_letter = next(
                letters[i]
                for i, (orig_letter, _) in enumerate(option_pairs)
                if orig_letter == answer_letter
            )

        instruction = "The following are multiple choice questions (with answers) about health. Think step by step and then output the single letter answer at the end like \\boxed{A}.\n\n"
        prompt = instruction + _build_question(question, opts)

        # keep the full source row for debugging; reflect any shuffle so
        # info stays consistent with the top-level fields
        info = dict(row)
        if shuffle:
            info["answer"] = answer_letter
            info["options"] = opts

        return {
            "question": prompt,
            "answer": answer_letter,
            "info": info,
        }

    valid_ds = ds.filter(_has_valid_answer)
    return valid_ds.map(_format_row, remove_columns=valid_ds.column_names)


def load_environment(
    num_options: int = 4, use_think: bool = False, shuffle: bool = False, **kwargs
) -> vf.Environment:
    """
    Single-turn Medbullets environment using HuggingFace `mkieffer/Medbullets` dataset.

    Each example is normalized to the fields expected by `vf.SingleTurnEnv`:
        {
            "question": "<stem + formatted options>",  # string used as the user prompt
            "answer": "<A|B|C|D|E>",                   # top-level gold letter
            "info": { ...original example fields... }  # full source row for debugging
        }

    Args:
        num_options: 4 loads split `op4_test`; 5 loads split `op5_test`.
        use_think: if True, use `ThinkParser` with the think-style system prompt.
        shuffle: whether to shuffle answer choices per question.

    Raises:
        ValueError: if `num_options` is not 4 or 5.

    The parser extracts \\boxed{A|B|C|D|E} from completions; the reward is
    1.0 on an exact match between parsed letter and gold letter, else 0.0.
    """

    # -------- load dataset --------
    if num_options == 4:
        # 4 options: {"A", "B", "C", "D"}
        test_raw = load_dataset("mkieffer/Medbullets", split="op4_test")
    elif num_options == 5:
        # 5 options: {"A", "B", "C", "D", "E"}
        test_raw = load_dataset("mkieffer/Medbullets", split="op5_test")
    else:
        raise ValueError("'num_options' must be 4 or 5")

    test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle)
    del test_raw  # free memory

    parser = (
        vf.ThinkParser(extract_fn=extract_boxed_answer)
        if use_think
        else vf.Parser(extract_fn=extract_boxed_answer)
    )
    system_prompt = THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT

    # compile once here instead of on every reward call
    text_wrapper_re = re.compile(r"\\text\{(.+)\}")
    # matches: "H", "H.", "H:", "(H)", "(H).", "H. Some text", "(A) Some text", etc.
    leading_letter_re = re.compile(r"^\(?([A-J])\)?(?:[.:\s]|$)")

    def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
        """Return 1.0 iff the parsed boxed letter equals the gold letter."""
        response = parser.parse_answer(completion) or ""
        response = response.strip()

        # remove \text{...} wrapper if present
        text_match = text_wrapper_re.match(response)
        if text_match:
            response = text_match.group(1).strip()

        # try to extract a letter at the beginning of the response
        letter_match = leading_letter_re.match(response)
        if letter_match:
            extracted_letter = letter_match.group(1)
            return 1.0 if extracted_letter.upper() == answer.upper() else 0.0
        return 0.0

    rubric = vf.Rubric(
        funcs=[correct_answer_reward_func],
        weights=[1.0],
        parser=parser,
    )

    return vf.SingleTurnEnv(
        eval_dataset=test_ds,
        system_prompt=system_prompt,
        parser=parser,
        rubric=rubric,
        **kwargs,
    )
21 changes: 21 additions & 0 deletions environments/medbullets/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[project]
name = "medbullets"
description = "Single-turn medical MCQ"
tags = ["medical", "clinical", "single-turn", "multiple-choice", "usmle", "train", "evaluation"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.3.post0",
]

[tool.prime.environment]
loader = "medbullets:load_environment"
display_name = "Medbullets"
visibility = "PUBLIC"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["medbullets.py"]