Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions environments/medhalt/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# MedHALT

Evaluation environment for the MedHALT (Medical Domain Hallucination Test) dataset.

## Overview

MedHALT tests medical language models on multiple-choice questions from various medical exams. This environment implements evaluation for two configurations:

- `reasoning_FCT`: False Confidence Test questions
- `reasoning_nota`: None-of-the-above questions

## Dataset

- **Source**: [openlifescienceai/Med-HALT](https://huggingface.co/datasets/openlifescienceai/Med-HALT)
- **Paper**: https://arxiv.org/abs/2307.15343
- **GitHub**: https://github.com/medhalt/medhalt
- **Size**: ~18,866 examples per configuration

## Installation
```bash
vf-install medhalt
```

## Quickstart

Run an evaluation with default settings:
```bash
uv run vf-eval medhalt
```

## Usage

To run an evaluation using `vf-eval` with the OpenAI API:
```bash
export OPENAI_API_KEY=sk-...
uv run vf-eval \
-m gpt-4o-mini \
-n 100 \
-s \
medhalt
```

Replace `sk-...` with your actual API key.

### With Ollama
```bash
uv run vf-eval \
-m qwen2.5:3b \
-n 100 \
--api-base-url http://localhost:11434/v1 \
--api-key ollama \
medhalt
```

### Specify configuration
```bash
# Test on None-of-the-Above questions
uv run vf-eval medhalt --config-name reasoning_nota

# Enable answer shuffling
uv run vf-eval medhalt --shuffle-answers --shuffle-seed 42
```

## Parameters

- `config_name`: One of `["reasoning_FCT", "reasoning_nota"]` (default: `"reasoning_FCT"`)
- `split`: Dataset split (default: `"train"`)
- `num_examples`: Limit number of examples (default: `None` for all)
- `shuffle_answers`: Randomize answer order (default: `False`)
- `shuffle_seed`: Seed for shuffling (default: `42`)

## Testing

A test script is provided to verify the environment works:
```bash
cd environments/medhalt
python test_medhalt.py --config reasoning_FCT --num-examples 10
```

## Example Results

Tested on qwen2.5:3b with 1000 examples:

| Config | Accuracy | Notes |
|--------|----------|-------|
| reasoning_FCT | 50.7% | Functional reasoning questions |
| reasoning_nota | 33.1% | None-of-the-above questions |

Results vary by model size and capability. Random guessing baseline is 25% for 4-option questions.
166 changes: 166 additions & 0 deletions environments/medhalt/medhalt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""MedHALT: Medical Domain Hallucination Test

This environment implements evaluation for the MedHALT dataset, which tests
medical language models on multiple-choice questions from various medical exams.

Dataset: https://huggingface.co/datasets/openlifescienceai/Med-HALT
Paper: https://arxiv.org/abs/2307.15343 (code: https://github.com/medhalt/medhalt)

Supported configurations:
- reasoning_FCT: False Confidence Test questions
- reasoning_nota: None-of-the-above questions

"""

from datasets import load_dataset
from typing import Optional, List, Dict, Any
import verifiers as vf
import re
import json
import ast
import random

def load_environment(
    config_name: str = "reasoning_FCT",
    split: str = "train",
    num_examples: Optional[int] = None,
    shuffle_answers: bool = False,
    shuffle_seed: int | None = 42,
) -> vf.SingleTurnEnv:
    """Load MedHALT environment for medical LLM evaluation.

    Args:
        config_name: One of ["reasoning_FCT", "reasoning_nota"]
        split: Dataset split to use
        num_examples: Optional limit on number of examples
        shuffle_answers: Whether to randomize answer order
        shuffle_seed: Random seed for answer shuffling

    Returns:
        A SingleTurnEnv ready for evaluation

    Raises:
        ValueError: If config_name is not a supported configuration.
    """
    valid_configs = ["reasoning_FCT", "reasoning_nota"]
    if config_name not in valid_configs:
        raise ValueError(f"Invalid config_name: {config_name}. Expected one of {valid_configs}")

    raw_dataset = load_dataset("openlifescienceai/Med-HALT", config_name, split=split)

    # Truncate BEFORE formatting so we never map examples we won't use
    # (the dataset has ~19k rows per config).
    if num_examples is not None:
        raw_dataset = raw_dataset.select(range(min(num_examples, len(raw_dataset))))

    dataset = raw_dataset.map(
        lambda ex: _format_example(ex, shuffle_answers, shuffle_seed),
        remove_columns=raw_dataset.column_names,
        load_from_cache_file=False,
    )

    parser = vf.Parser()

    # Reward function: 1.0 iff the letter extracted from the model's
    # completion matches the gold answer letter (case-insensitive).
    def accuracy(completion: Any, answer: str, parser: vf.Parser, info: dict[str, Any] | None = None) -> float:
        parsed = parser.parse_answer(completion) or ""
        predicted = _extract_letter(parsed, info.get("num_options", 4) if info else 4)
        return 1.0 if predicted and predicted.upper() == answer.upper() else 0.0

    rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)

    return vf.SingleTurnEnv(dataset=dataset, parser=parser, rubric=rubric)

def _format_example(example: dict[str, Any], shuffle_answers: bool = False, shuffle_seed: int | None = 42) -> dict[str, Any]:
"""Format a MedHALT example into verifiers format."""

question = example.get('question', '').strip()

# Parse options string into dict
options_raw = example.get('options', {})
if isinstance(options_raw, str):
try:
options_raw = ast.literal_eval(options_raw)
except Exception as e:
raise ValueError(f"Failed to parse options: {e}") from e

# Build options list in order (skip the 'correct answer' key if present)
options_list = []
i = 0
while str(i) in options_raw:
text = options_raw[str(i)]
if text and str(text).strip():
options_list.append(str(text).strip())
i += 1

# Get correct answer index directly from data
if 'correct_index' not in example:
raise ValueError("Missing correct_index field")

original_answer_idx = int(example['correct_index'])

# Validate index is in range
if original_answer_idx < 0 or original_answer_idx >= len(options_list):
raise ValueError(
f"correct_index {original_answer_idx} out of range for {len(options_list)} options. "
f"Question: {question[:50]}..."
)

# Shuffle if requested
if shuffle_answers:
rng = random.Random(f"{shuffle_seed}_{question}")
indices = list(range(len(options_list)))
rng.shuffle(indices)

options_list = [options_list[i] for i in indices]
answer_idx = indices.index(original_answer_idx)
else:
answer_idx = original_answer_idx

# Build option_map with letters
option_map = {chr(65 + i): text for i, text in enumerate(options_list)}

# Build prompt
options_text = '\n'.join(f"{letter}. {text}" for letter, text in option_map.items())
prompt_text = f"""Answer the following multiple-choice medical question.

Question: {question}

Options:
{options_text}

Provide your answer as a single letter."""

correct_letter = chr(65 + answer_idx)

return {
'question': prompt_text,
'answer': correct_letter,
'choices': list(option_map.keys()),
'info': {
'question_text': question,
'options': option_map,
'num_options': len(option_map),
}
}

def _extract_letter(text: str, num_options: int = 4) -> str | None:
"""Extract answer letter (A, B, C, D, ...) from model response.

Args:
text: Model's response text
num_options: Number of valid options (determines valid letters)

Returns:
Extracted letter or None if not found
"""
if not text:
return None

text = text.strip().upper()
valid_letters = [chr(65 + i) for i in range(num_options)]

# Direct match (just "A" or "B")
if len(text) == 1 and text in valid_letters:
return text

# Find first valid letter in the response
for char in text:
if char in valid_letters:
return char

return None
16 changes: 16 additions & 0 deletions environments/medhalt/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[project]
name = "medhalt"
description = "MedHALT evaluation environment for medical LLMs"
tags = ["placeholder-tag", "train", "eval"]
version = "0.1.0"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"verifiers",
"datasets",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

38 changes: 38 additions & 0 deletions environments/medhalt/test_medhalt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Validation script for MedHALT environment"""

import argparse
from openai import AsyncOpenAI
import verifiers as vf

async def main():
    """Smoke-test the MedHALT environment against a local Ollama server.

    Loads the environment for the chosen config, evaluates the chosen model
    via Ollama's OpenAI-compatible endpoint, and prints accuracy. Requires
    Ollama listening on http://localhost:11434.
    """
    parser = argparse.ArgumentParser(description='Validate MedHALT environment')
    parser.add_argument('--config', default='reasoning_FCT', choices=['reasoning_FCT', 'reasoning_nota'])
    parser.add_argument('--num-examples', type=int, default=10, help='Number of examples to test')
    parser.add_argument('--model', default='qwen2.5:3b', help='Model to use')

    args = parser.parse_args()

    print(f"Testing MedHALT environment: {args.config}")

    # Load environment
    env = vf.load_environment('medhalt', config_name=args.config, num_examples=args.num_examples)
    print(f"✓ Loaded {len(env.dataset)} examples")

    # Test with Ollama (OpenAI-compatible API)
    client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
    results = await env.evaluate(client, args.model, num_examples=args.num_examples)

    # Calculate metrics; guard against an empty result set (would
    # otherwise raise ZeroDivisionError).
    correct = sum(results.reward)
    total = len(results.reward)
    accuracy = (correct / total * 100) if total else 0.0

    print("\nResults:")
    print(f"  Total: {total}")
    print(f"  Correct: {int(correct)}")
    print(f"  Accuracy: {accuracy:.1f}%")
    print("\n✅ Validation complete")

if __name__ == "__main__":
    # asyncio is only needed when the file is run as a script, so it is
    # imported lazily here rather than at module top.
    import asyncio
    asyncio.run(main())