diff --git a/environments/medhalt/README.md b/environments/medhalt/README.md new file mode 100644 index 00000000..6484ade4 --- /dev/null +++ b/environments/medhalt/README.md @@ -0,0 +1,88 @@ +# MedHALT + +Evaluation environment for the MedHALT (Medical Domain Hallucination Test) dataset. + +## Overview + +MedHALT tests medical language models on multiple-choice questions from various medical exams. This environment implements evaluation for two configurations: + +- `reasoning_FCT`: Functional correct thinking questions +- `reasoning_nota`: None-of-the-above questions + +## Dataset + +- **Source**: [openlifescienceai/Med-HALT](https://huggingface.co/datasets/openlifescienceai/Med-HALT) +- **Paper**: https://arxiv.org/abs/2307.15343 +- **GitHub**: https://github.com/medhalt/medhalt +- **Size**: ~18,866 examples per configuration + +## Installation +```bash +vf-install medhalt +``` + +## Quickstart + +Run an evaluation with default settings: +```bash +uv run vf-eval medhalt +``` + +## Usage + +To run an evaluation using `vf-eval` with the OpenAI API: +```bash +export OPENAI_API_KEY=sk-... +uv run vf-eval \ + -m gpt-4o-mini \ + -n 100 \ + -s \ + medhalt +``` + +Replace `OPENAI_API_KEY` with your actual API key. 
+ +### With Ollama +```bash +uv run vf-eval \ + -m qwen2.5:3b \ + -n 100 \ + --api-base-url http://localhost:11434/v1 \ + --api-key ollama \ + medhalt +``` + +### Specify configuration +```bash +# Test on None-of-the-Above questions +uv run vf-eval medhalt --config-name reasoning_nota + +# Enable answer shuffling +uv run vf-eval medhalt --shuffle-answers --shuffle-seed 42 + +## Parameters + +- `config_name`: One of `["reasoning_FCT", "reasoning_nota"]` (default: `"reasoning_FCT"`) +- `split`: Dataset split (default: `"train"`) +- `num_examples`: Limit number of examples (default: `None` for all) +- `shuffle_answers`: Randomize answer order (default: `False`) +- `shuffle_seed`: Seed for shuffling (default: `42`) + +## Testing + +A test script is provided to verify the environment works: +```bash +cd environments/medhalt +python test_medhalt.py --config reasoning_FCT --num-examples 10 +``` + +## Example Results + +Tested on qwen2.5:3b with 1000 examples: + +| Config | Accuracy | Notes | +|--------|----------|-------| +| reasoning_FCT | 50.7% | Functional reasoning questions | +| reasoning_nota | 33.1% | None-of-the-above questions | + +Results vary by model size and capability. Random guessing baseline is 25% for 4-option questions. \ No newline at end of file diff --git a/environments/medhalt/medhalt.py b/environments/medhalt/medhalt.py new file mode 100644 index 00000000..5a04437c --- /dev/null +++ b/environments/medhalt/medhalt.py @@ -0,0 +1,166 @@ +"""MedHALT: Medical Domain Hallucination Test + +This environment implements evaluation for the MedHALT dataset, which tests +medical language models on multiple-choice questions from various medical exams. 
def load_environment(
    config_name: str = "reasoning_FCT",
    split: str = "train",
    num_examples: Optional[int] = None,
    shuffle_answers: bool = False,
    shuffle_seed: int | None = 42,
) -> vf.SingleTurnEnv:
    """Load MedHALT environment for medical LLM evaluation.

    Args:
        config_name: One of ["reasoning_FCT", "reasoning_nota"]
        split: Dataset split to use
        num_examples: Optional limit on number of examples
        shuffle_answers: Whether to randomize answer order
        shuffle_seed: Random seed for answer shuffling

    Returns:
        A SingleTurnEnv ready for evaluation

    Raises:
        ValueError: If config_name is not a supported configuration.
    """
    valid_configs = ["reasoning_FCT", "reasoning_nota"]
    if config_name not in valid_configs:
        raise ValueError(f"Invalid config_name: {config_name}. Expected one of {valid_configs}")

    raw_dataset = load_dataset("openlifescienceai/Med-HALT", config_name, split=split)

    # Truncate BEFORE formatting so we only map the rows that will actually
    # be evaluated (the full config is ~18k rows).
    if num_examples is not None:
        raw_dataset = raw_dataset.select(range(min(num_examples, len(raw_dataset))))

    dataset = raw_dataset.map(
        lambda ex: _format_example(ex, shuffle_answers, shuffle_seed),
        remove_columns=raw_dataset.column_names,
        load_from_cache_file=False,
    )

    # Reward function: 1.0 when the letter extracted from the model's parsed
    # completion matches the gold answer letter, else 0.0.
    def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
        parsed = parser.parse_answer(completion) or ""
        predicted = _extract_letter(parsed, info.get("num_options", 4) if info else 4)
        return 1.0 if predicted and predicted.upper() == answer.upper() else 0.0

    parser = vf.Parser()
    rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)

    return vf.SingleTurnEnv(dataset=dataset, parser=parser, rubric=rubric)
vf.SingleTurnEnv(dataset=dataset, parser=parser, rubric=rubric) + +def _format_example(example: dict[str, Any], shuffle_answers: bool = False, shuffle_seed: int | None = 42) -> dict[str, Any]: + """Format a MedHALT example into verifiers format.""" + + question = example.get('question', '').strip() + + # Parse options string into dict + options_raw = example.get('options', {}) + if isinstance(options_raw, str): + try: + options_raw = ast.literal_eval(options_raw) + except Exception as e: + raise ValueError(f"Failed to parse options: {e}") from e + + # Build options list in order (skip the 'correct answer' key if present) + options_list = [] + i = 0 + while str(i) in options_raw: + text = options_raw[str(i)] + if text and str(text).strip(): + options_list.append(str(text).strip()) + i += 1 + + # Get correct answer index directly from data + if 'correct_index' not in example: + raise ValueError("Missing correct_index field") + + original_answer_idx = int(example['correct_index']) + + # Validate index is in range + if original_answer_idx < 0 or original_answer_idx >= len(options_list): + raise ValueError( + f"correct_index {original_answer_idx} out of range for {len(options_list)} options. " + f"Question: {question[:50]}..." + ) + + # Shuffle if requested + if shuffle_answers: + rng = random.Random(f"{shuffle_seed}_{question}") + indices = list(range(len(options_list))) + rng.shuffle(indices) + + options_list = [options_list[i] for i in indices] + answer_idx = indices.index(original_answer_idx) + else: + answer_idx = original_answer_idx + + # Build option_map with letters + option_map = {chr(65 + i): text for i, text in enumerate(options_list)} + + # Build prompt + options_text = '\n'.join(f"{letter}. {text}" for letter, text in option_map.items()) + prompt_text = f"""Answer the following multiple-choice medical question. 
+ +Question: {question} + +Options: +{options_text} + +Provide your answer as a single letter.""" + + correct_letter = chr(65 + answer_idx) + + return { + 'question': prompt_text, + 'answer': correct_letter, + 'choices': list(option_map.keys()), + 'info': { + 'question_text': question, + 'options': option_map, + 'num_options': len(option_map), + } + } + +def _extract_letter(text: str, num_options: int = 4) -> str | None: + """Extract answer letter (A, B, C, D, ...) from model response. + + Args: + text: Model's response text + num_options: Number of valid options (determines valid letters) + + Returns: + Extracted letter or None if not found + """ + if not text: + return None + + text = text.strip().upper() + valid_letters = [chr(65 + i) for i in range(num_options)] + + # Direct match (just "A" or "B") + if len(text) == 1 and text in valid_letters: + return text + + # Find first valid letter in the response + for char in text: + if char in valid_letters: + return char + + return None \ No newline at end of file diff --git a/environments/medhalt/pyproject.toml b/environments/medhalt/pyproject.toml new file mode 100644 index 00000000..27bdb939 --- /dev/null +++ b/environments/medhalt/pyproject.toml @@ -0,0 +1,16 @@ +[project] +name = "medhalt" +description = "MedHALT evaluation environment for medical LLMs" +tags = ["placeholder-tag", "train", "eval"] +version = "0.1.0" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "verifiers", + "datasets", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + diff --git a/environments/medhalt/test_medhalt.py b/environments/medhalt/test_medhalt.py new file mode 100644 index 00000000..8e056074 --- /dev/null +++ b/environments/medhalt/test_medhalt.py @@ -0,0 +1,38 @@ +"""Validation script for MedHALT environment""" + +import argparse +from openai import AsyncOpenAI +import verifiers as vf + +async def main(): + parser = argparse.ArgumentParser(description='Validate MedHALT 
async def main():
    """Run a small MedHALT evaluation against a local Ollama server and print accuracy."""
    parser = argparse.ArgumentParser(description='Validate MedHALT environment')
    parser.add_argument('--config', default='reasoning_FCT', choices=['reasoning_FCT', 'reasoning_nota'])
    parser.add_argument('--num-examples', type=int, default=10, help='Number of examples to test')
    parser.add_argument('--model', default='qwen2.5:3b', help='Model to use')

    args = parser.parse_args()

    print(f"Testing MedHALT environment: {args.config}")

    # Load environment
    env = vf.load_environment('medhalt', config_name=args.config, num_examples=args.num_examples)
    print(f"āœ“ Loaded {len(env.dataset)} examples")

    # Test with Ollama (OpenAI-compatible endpoint)
    client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
    results = await env.evaluate(client, args.model, num_examples=args.num_examples)

    # Calculate metrics; guard against an empty result set so an evaluation
    # that produced no rollouts reports 0% instead of raising ZeroDivisionError.
    correct = sum(results.reward)
    total = len(results.reward)
    accuracy = (correct / total * 100) if total else 0.0

    print("\nResults:")
    print(f"  Total: {total}")
    print(f"  Correct: {int(correct)}")
    print(f"  Accuracy: {accuracy:.1f}%")
    print("\nāœ… Validation complete")