Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions environments/medhalt/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# MedHALT

Evaluation environment for the MedHALT (Medical Domain Hallucination Test) dataset.

## Overview

MedHALT tests medical language models on multiple-choice questions from various medical exams. This environment implements evaluation for two configurations:

- `reasoning_FCT`: False Confidence Test questions
- `reasoning_nota`: None-of-the-above questions

## Dataset

- **Source**: [openlifescienceai/Med-HALT](https://huggingface.co/datasets/openlifescienceai/Med-HALT)
- **Paper**: https://arxiv.org/abs/2307.15343
- **GitHub**: https://github.com/medhalt/medhalt
- **Size**: ~18,866 examples per configuration

## Installation
```bash
vf-install medhalt
```

## Quickstart

Run an evaluation with default settings:
```bash
uv run vf-eval medhalt
```

## Usage

To run an evaluation using `vf-eval` with the OpenAI API:
```bash
export OPENAI_API_KEY=sk-...
uv run vf-eval \
-m gpt-4o-mini \
-n 100 \
-s \
medhalt
```

Replace `sk-...` with your actual API key.

### With Ollama
```bash
uv run vf-eval \
-m qwen2.5:3b \
-n 100 \
--api-base-url http://localhost:11434/v1 \
--api-key ollama \
medhalt
```

### Specify configuration
```bash
# Test on None-of-the-Above questions
uv run vf-eval medhalt --config-name reasoning_nota

# Enable answer shuffling
uv run vf-eval medhalt --shuffle-answers --shuffle-seed 42
```

## Parameters

- `config_name`: One of `["reasoning_FCT", "reasoning_nota"]` (default: `"reasoning_FCT"`)
- `split`: Dataset split (default: `"train"`)
- `num_examples`: Limit number of examples (default: `None` for all)
- `shuffle_answers`: Randomize answer order (default: `False`)
- `shuffle_seed`: Seed for shuffling (default: `42`)

## Testing

A test script is provided to verify the environment works:
```bash
cd environments/medhalt
python test_medhalt.py --config reasoning_FCT --num-examples 10
```

## Example Results

Tested on qwen2.5:3b with 1000 examples:

| Config | Accuracy | Notes |
|--------|----------|-------|
| reasoning_FCT | 50.7% | Functional reasoning questions |
| reasoning_nota | 33.1% | None-of-the-above questions |

Results vary by model size and capability. Random guessing baseline is 25% for 4-option questions.
166 changes: 166 additions & 0 deletions environments/medhalt/medhalt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""MedHALT: Medical Domain Hallucination Test

This environment implements evaluation for the MedHALT dataset, which tests
medical language models on multiple-choice questions from various medical exams.

Dataset: https://huggingface.co/datasets/openlifescienceai/Med-HALT
Paper: https://arxiv.org/abs/2307.15343 (code: https://github.com/medhalt/medhalt)

Supported configurations:
- reasoning_FCT: False Confidence Test questions
- reasoning_nota: None-of-the-above questions

"""

from datasets import load_dataset
from typing import Optional, List, Dict, Any
import verifiers as vf
import re
import json
import ast
import random

def load_environment(
    config_name: str = "reasoning_FCT",
    split: str = "train",
    num_examples: Optional[int] = None,
    shuffle_answers: bool = False,
    shuffle_seed: int | None = 42,
) -> vf.SingleTurnEnv:
    """Load MedHALT environment for medical LLM evaluation.

    Args:
        config_name: One of ["reasoning_FCT", "reasoning_nota"]
        split: Dataset split to use
        num_examples: Optional limit on number of examples
        shuffle_answers: Whether to randomize answer order
        shuffle_seed: Random seed for answer shuffling

    Returns:
        A SingleTurnEnv ready for evaluation

    Raises:
        ValueError: If config_name is not a supported configuration.
    """
    valid_configs = ["reasoning_FCT", "reasoning_nota"]
    if config_name not in valid_configs:
        raise ValueError(f"Invalid config_name: {config_name}. Expected one of {valid_configs}")

    raw_dataset = load_dataset("openlifescienceai/Med-HALT", config_name, split=split)

    # Truncate BEFORE formatting so we never map examples we won't use
    # (the dataset has ~19k rows per config).
    if num_examples is not None:
        raw_dataset = raw_dataset.select(range(min(num_examples, len(raw_dataset))))

    dataset = raw_dataset.map(
        lambda ex: _format_example(ex, shuffle_answers, shuffle_seed),
        remove_columns=raw_dataset.column_names,
        load_from_cache_file=False,
    )

    parser = vf.Parser()

    # Reward function: 1.0 iff the letter extracted from the model's
    # completion matches the gold answer letter (case-insensitive).
    def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
        parsed = parser.parse_answer(completion) or ""
        predicted = _extract_letter(parsed, info.get("num_options", 4) if info else 4)
        return 1.0 if predicted and predicted.upper() == answer.upper() else 0.0

    rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)

    return vf.SingleTurnEnv(dataset=dataset, parser=parser, rubric=rubric)

def _format_example(example: dict[str, Any], shuffle_answers: bool = False, shuffle_seed: int | None = 42) -> dict[str, Any]:
"""Format a MedHALT example into verifiers format."""

question = example.get('question', '').strip()

# Parse options string into dict
options_raw = example.get('options', {})
if isinstance(options_raw, str):
try:
options_raw = ast.literal_eval(options_raw)
except Exception as e:
raise ValueError(f"Failed to parse options: {e}") from e

# Build options list in order (skip the 'correct answer' key if present)
options_list = []
i = 0
while str(i) in options_raw:
text = options_raw[str(i)]
if text and str(text).strip():
options_list.append(str(text).strip())
i += 1

# Get correct answer index directly from data
if 'correct_index' not in example:
raise ValueError("Missing correct_index field")

original_answer_idx = int(example['correct_index'])

# Validate index is in range
if original_answer_idx < 0 or original_answer_idx >= len(options_list):
raise ValueError(
f"correct_index {original_answer_idx} out of range for {len(options_list)} options. "
f"Question: {question[:50]}..."
)

# Shuffle if requested
if shuffle_answers:
rng = random.Random(f"{shuffle_seed}_{question}")
indices = list(range(len(options_list)))
rng.shuffle(indices)

options_list = [options_list[i] for i in indices]
answer_idx = indices.index(original_answer_idx)
else:
answer_idx = original_answer_idx

# Build option_map with letters
option_map = {chr(65 + i): text for i, text in enumerate(options_list)}

# Build prompt
options_text = '\n'.join(f"{letter}. {text}" for letter, text in option_map.items())
prompt_text = f"""Answer the following multiple-choice medical question.

Question: {question}

Options:
{options_text}

Provide your answer as a single letter."""

correct_letter = chr(65 + answer_idx)

return {
'question': prompt_text,
'answer': correct_letter,
'choices': list(option_map.keys()),
'info': {
'question_text': question,
'options': option_map,
'num_options': len(option_map),
}
}

def _extract_letter(text: str, num_options: int = 4) -> str | None:
"""Extract answer letter (A, B, C, D, ...) from model response.

Args:
text: Model's response text
num_options: Number of valid options (determines valid letters)

Returns:
Extracted letter or None if not found
"""
if not text:
return None

text = text.strip().upper()
valid_letters = [chr(65 + i) for i in range(num_options)]

# Direct match (just "A" or "B")
if len(text) == 1 and text in valid_letters:
return text

# Find first valid letter in the response
for char in text:
if char in valid_letters:
return char

return None
16 changes: 16 additions & 0 deletions environments/medhalt/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[project]
name = "medhalt"
description = "MedHALT evaluation environment for medical LLMs"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"verifiers",
"datasets",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

38 changes: 38 additions & 0 deletions environments/medhalt/test_medhalt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Validation script for MedHALT environment"""

import argparse
from openai import AsyncOpenAI
import verifiers as vf

async def main():
    """Smoke-test the MedHALT environment against a local Ollama server.

    Loads the environment for the chosen config, evaluates the chosen model
    via Ollama's OpenAI-compatible endpoint, and prints accuracy. Requires
    Ollama listening on http://localhost:11434.
    """
    parser = argparse.ArgumentParser(description='Validate MedHALT environment')
    parser.add_argument('--config', default='reasoning_FCT', choices=['reasoning_FCT', 'reasoning_nota'])
    parser.add_argument('--num-examples', type=int, default=10, help='Number of examples to test')
    parser.add_argument('--model', default='qwen2.5:3b', help='Model to use')

    args = parser.parse_args()

    print(f"Testing MedHALT environment: {args.config}")

    # Load environment
    env = vf.load_environment('medhalt', config_name=args.config, num_examples=args.num_examples)
    print(f"✓ Loaded {len(env.dataset)} examples")

    # Test with Ollama (OpenAI-compatible API)
    client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
    results = await env.evaluate(client, args.model, num_examples=args.num_examples)

    # Calculate metrics; guard against an empty result set (would
    # otherwise raise ZeroDivisionError).
    correct = sum(results.reward)
    total = len(results.reward)
    accuracy = (correct / total * 100) if total else 0.0

    print("\nResults:")
    print(f"  Total: {total}")
    print(f"  Correct: {int(correct)}")
    print(f"  Accuracy: {accuracy:.1f}%")
    print("\n✅ Validation complete")

if __name__ == "__main__":
    # asyncio is only needed when the file is run as a script, so it is
    # imported lazily here rather than at module top.
    import asyncio
    asyncio.run(main())