diff --git a/configs/envs/bioasq.yaml b/configs/envs/bioasq.yaml new file mode 100644 index 00000000..144c4345 --- /dev/null +++ b/configs/envs/bioasq.yaml @@ -0,0 +1,17 @@ +- id: bioasq + module: bioasq + rollouts_per_example: 1 + num_examples: -1 + verbose: false + env_args: + answer_format: xml + judge_model: gpt-4o-mini + +- id: bioasq-boxed + module: bioasq + rollouts_per_example: 1 + num_examples: -1 + verbose: false + env_args: + answer_format: boxed + judge_model: gpt-4o-mini \ No newline at end of file diff --git a/environments/bioasq/README.md b/environments/bioasq/README.md new file mode 100644 index 00000000..dae2a3c2 --- /dev/null +++ b/environments/bioasq/README.md @@ -0,0 +1,242 @@ +# BioASQ + +## Overview + +- **Environment ID:** `bioasq` +- **Short description:** + BioASQ is a benchmark for large-scale biomedical semantic question answering that evaluates a model’s ability to generate comprehensive, expert-level answers grounded in scientific literature. +- **Task Type:** + Single-turn biomedical question answering and multi-document summarization. + +--- + +## Dataset + +BioASQ-QA reflects real-world information needs of biomedical experts. The benchmark is challenging because systems must reason over both structured and unstructured biomedical evidence to generate paragraph-sized *ideal answers*. 
+ +- **Split sizes:** + - **Training:** 5,389 biomedical questions (compiled from previous BioASQ editions) + - **Test:** 340 new questions across 4 batches (85 questions per batch) + - **Total:** 5,729 questions with ideal answers and supporting evidence +- **Source:** https://huggingface.co/datasets/kroshan/BioASQ +- **Official Website:** http://bioasq.org/participate +- **Implementation based on:** BioASQ 2025 Overview Paper (Task 13b) + +--- + +## Task + +- **Type:** Single-turn +- **Rubric:** `JudgeRubric` (LLM-as-a-Judge evaluation adapted from official BioASQ manual assessment criteria) +- **Task description:** + Given a biomedical question and a set of relevant evidence snippets, generate a comprehensive *ideal answer*—a paragraph-sized summary intended for biomedical professionals. +- **Prompt template:** + + > *Here is a biomedical question and several relevant snippets. Provide a comprehensive answer strictly based on these snippets.* + +--- + +## Evaluation Dimensions + +Each response is scored on a **1–5 scale** (5 is best) following official BioASQ manual evaluation criteria: + +- **Precision (1–5):** + Accuracy and relevance of biomedical facts. Unsupported information is penalized. +- **Recall (1–5):** + Coverage of all important biomedical concepts present in the gold answer. +- **Repetition (1–5):** + Redundancy in the response (1 = excessive repetition, 5 = none). +- **Readability (1–5):** + Clarity, organization, and professional readability. + +--- + +## Quickstart + +Run evaluation with default settings (dataset downloads automatically): + +```bash +uv run vf-eval bioasq +```` + +### Use a custom judge model + +```bash +uv run vf-eval bioasq -m gpt-4o --env-args '{"judge_model": "gpt-4o"}' +``` + +### Enable chain-of-thought (CoT) prompting + +```bash +uv run vf-eval bioasq --env-args '{"use_think": true}' +``` + +### Notes + +* Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. 
+* By default, the environment uses **`gpt-4o-mini`** as the judge model. + +--- + +## Environment Arguments + +| Argument | Type | Default | Description | +| ---------------- | ---- | --------------- | ------------------------------------------------------------ | +| `cache_dir` | str | `None` | Path to cache directory | +| `use_think` | bool | `False` | Enable chain-of-thought prompting using `...` | +| `judge_model` | str | `"gpt-4o-mini"` | LLM used to judge ideal answers | +| `judge_base_url` | str | `None` | Custom base URL for judge API | +| `judge_api_key` | str | `None` | API key for judge model | + +--- + +## Results Dataset Structure + +### Core Evaluation Fields + +* **`prompt`** + Biomedical question and associated context snippets presented to the model. + Represented as a list of message objects: `(role, content)`. + +* **`completion`** + Model-generated paragraph-sized ideal answer. + +* **`reward`** + Scalar score in the range **0.0–1.0**, computed as: + +```text +(precision/5 + recall/5 + repetition/5 + readability/5) / 4 +``` + +--- + +## Example Metadata (`info`) + +The `info` field contains BioASQ-specific metadata for each example: + +* **`filename`** – Unique question identifier +* **`question_type`** – One of: `yesno`, `factoid`, `list`, `summary` +* **`ideal_answer`** – Gold standard paragraph-sized reference answer +* **`exact_answer`** – Expected exact answer (format depends on question type) +* **`documents`** – List of relevant PubMed document URLs +* **`context`** – Concatenated text snippets used as supporting evidence +* **`judge_feedback`** – Judge explanations and per-dimension scores + +### Notes + +* The `question` field contains the biomedical question text. +* The `answer` field contains the gold ideal answer. +* Scores are normalized to 0–1 before averaging. +* If judge parsing fails, dimension scores default to `None` and are excluded. 
+ +--- + +## Data Processing + +The dataset follows specifications from the **BioASQ 2025 paper**: + +* **Question Types:** Yes/No, Factoid, List, Summary (Section 2.1) +* **Ideal Answers:** Paragraph-sized summaries for all question types +* **Exact Answers:** + + * Yes/No for yes-no questions + * Entity name(s) for factoid and list questions +* **Supporting Evidence:** PubMed documents and curated text snippets + +--- + +## Dataset Examples + +### Example 1: Yes/No Question + +**Question:** +Is the protein Papilin secreted? + +**Question Type:** `yesno` + +**Ideal Answer (Reference):** +Yes, Papilin is a secreted protein. It is an extracellular matrix glycoprotein that contains proteoglycan-like domains and is secreted into the basement membrane. + +--- + +### Example 2: Factoid Question + +**Question:** +Which gene is mutated in Huntington's disease? + +**Question Type:** `factoid` + +**Ideal Answer (Reference):** +Huntington's disease is caused by mutations in the HTT gene (also known as the Huntingtin gene). The mutation involves an expansion of CAG trinucleotide repeats in the HTT gene. + +--- + +### Example 3: List Question + +**Question:** +Which are the different isoforms of the mammalian Notch receptor? + +**Question Type:** `list` + +**Ideal Answer (Reference):** +In mammals, there are four Notch receptor isoforms: Notch1, Notch2, Notch3, and Notch4. + +--- + +### Example 4: Summary Question + +**Question:** +What is the mechanism of action of pembrolizumab? + +**Question Type:** `summary` + +**Ideal Answer (Reference):** +Pembrolizumab is a humanized monoclonal antibody that targets the programmed cell death protein 1 (PD-1) receptor. It blocks the interaction between PD-1 and its ligands PD-L1 and PD-L2, preventing pathway-mediated immune inhibition and enhancing anti-tumor immunity. 
+ +--- + +## Dataset Statistics + +Breakdown based on the **BioASQ 2025 paper (Table 1)**: + +| Dataset | Questions | Yes/No | List | Factoid | Summary | Avg Docs | Avg Snippets | +| ------------ | --------- | --------- | --------- | --------- | --------- | -------- | ------------ | +| Training | 5,389 | 1,459 | 1,047 | 1,600 | 1,283 | 9.74 | 12.78 | +| Test Batch 1 | 85 | 17 | 23 | 26 | 19 | 2.68 | 3.74 | +| Test Batch 2 | 85 | 17 | 19 | 27 | 22 | 2.71 | 3.06 | +| Test Batch 3 | 85 | 22 | 22 | 20 | 21 | 3.00 | 3.66 | +| Test Batch 4 | 85 | 26 | 19 | 22 | 18 | 3.15 | 3.92 | +| **Total** | **5,729** | **1,541** | **1,130** | **1,695** | **1,363** | **9.33** | **12.23** | + +--- + +## References + +```bibtex +@article{nentidis2025bioasq, + title={Overview of BioASQ 2025: The thirteenth BioASQ challenge on large-scale biomedical semantic indexing and question answering}, + author={Nentidis, Anastasios and Katsimpras, Georgios and Krithara, Anastasia and others}, + journal={arXiv preprint arXiv:2508.20554}, + year={2025} +} + +@article{tsatsaronis2015bioasq, + title={An overview of the BIOASQ large-scale biomedical semantic indexing and question answering competition}, + author={Tsatsaronis, George and Balikas, Georgios and Malakasiotis, Prodromos and others}, + journal={BMC Bioinformatics}, + volume={16}, + pages={138}, + year={2015} +} + +@article{krithara2023bioasq, + title={BioASQ-QA: A manually curated corpus for Biomedical Question Answering}, + author={Krithara, Anastasia and Nentidis, Anastasios and Bougiatiotis, Konstantinos and Paliouras, Georgios}, + journal={Scientific Data}, + volume={10}, + number={1}, + pages={170}, + year={2023} +} +``` + diff --git a/environments/bioasq/bioasq/__init__.py b/environments/bioasq/bioasq/__init__.py new file mode 100644 index 00000000..e69ea893 --- /dev/null +++ b/environments/bioasq/bioasq/__init__.py @@ -0,0 +1,3 @@ +from .bioasq import load_environment + +__all__ = ["load_environment"] \ No newline at end of file diff 
--git a/environments/bioasq/bioasq/bioasq.py b/environments/bioasq/bioasq/bioasq.py new file mode 100644 index 00000000..d9c0f917 --- /dev/null +++ b/environments/bioasq/bioasq/bioasq.py @@ -0,0 +1,188 @@ +from typing import Any, Dict, Optional + +import verifiers as vf +from datasets import load_dataset +from datasets.utils.logging import disable_progress_bar +from medarc_verifiers.parsers import JSONParser +from medarc_verifiers.prompts import XML_SYSTEM_PROMPT, AnswerFormat +from medarc_verifiers.utils import default_judge_api_key, judge_sampling_args_and_headers +from openai import AsyncOpenAI +from verifiers.types import Info, Messages, State +from verifiers.utils.data_utils import BOXED_SYSTEM_PROMPT, extract_boxed_answer +from .judge_prompts import JUDGE_DIMENSIONS, JUDGE_OUTPUT_JSON, JUDGE_TEMPLATE + +disable_progress_bar() # suppress datasets progress indicators + +# System prompt aligned with BioASQ Task 1b synthesis requirement +PROMPT = "Provide a comprehensive answer to the following biomedical question strictly based on the provided snippets." 
+ +def _parse_bioasq_hf(example: dict[str, Any]) -> dict[str, Any]: + """Parses Hugging Face BioASQ format into the Med-LM-Env structure.""" + # Note: Structure depends on the specific HF version; + # BioASQ 1b/13b typically uses 'body', 'ideal_answer', and 'snippets' + question_text = example.get("body", example.get("question", "")) + ideal_answer = example.get("ideal_answer", "") + snippets = example.get("snippets", []) + + # Handle different snippet formats (list of strings vs list of dicts) + snippet_texts = [] + for s in snippets: + if isinstance(s, dict): + snippet_texts.append(s.get("text", "")) + else: + snippet_texts.append(str(s)) + + return { + "question": question_text, + "answer": ideal_answer, + "info": { + "question_type": example.get("type", "summary"), + "ideal_answer": ideal_answer, + "context": "\n".join(snippet_texts), + "documents": example.get("documents", []) + } + } + +def _coerce_score(value: Any) -> float | None: + """Convert score value to float or None if invalid.""" + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + value = value.strip() + if not value: + return None + try: + return float(value) + except ValueError: + return None + return None + + +def _compute_normalized_reward( + scores: dict[str, dict[str, Any]], + min_score: float | None = None, + max_score: float | None = None, +) -> float: + """Accumulate per-dimension judge scores normalized from [min_score, max_score] to [0.0, 1.0]""" + min_score = min_score if min_score is not None else 1 + max_score = max_score if max_score is not None else 5 + + total_dims = len(JUDGE_DIMENSIONS) + if total_dims == 0: + return 0.0 + + accumulated = 0.0 + for dimension in JUDGE_DIMENSIONS: + score = _coerce_score(scores.get(dimension, {}).get("score")) + if score is None: + continue + clamped = max(0.0, min(max_score, score)) + accumulated += clamped / max_score + + return max(0.0, min(1.0, accumulated / total_dims)) + + 
def _extract_completion_text(completion: Messages, parser: vf.Parser) -> str:
    """Extract the answer text from a completion, respecting the parser.

    Tries ``parser.parse_answer`` first (so XML/boxed answers are unwrapped
    before judging); falls back to the content of the final chat message, or
    ``str(completion)`` as a last resort.
    """
    parsed = parser.parse_answer(completion)
    if parsed:
        return str(parsed)
    if isinstance(completion, list) and completion:
        last_msg = completion[-1]
        if isinstance(last_msg, dict):
            return str(last_msg.get("content", ""))
    return str(completion)


def load_environment(
    answer_format: AnswerFormat | str = AnswerFormat.XML,
    judge_model: str = "gpt-4o-mini",
    judge_base_url: str | None = None,
    judge_api_key: str | None = None,
    system_prompt: Optional[str] = None,
    **kwargs: Any,
) -> vf.Environment:
    """Load BioASQ environment for biomedical question answering evaluation.

    Args:
        answer_format: Format for model responses (XML or BOXED).
        judge_model: Model to use for LLM-as-judge evaluation.
        judge_base_url: Base URL for judge model API.
        judge_api_key: API key for judge model.
        system_prompt: Custom system prompt (defaults to the format-specific prompt).
        **kwargs: Additional arguments passed to SingleTurnEnv.

    Raises:
        ValueError: If ``answer_format`` is not a supported format.
    """
    # Load the Hugging Face export and normalize each record.
    raw_ds = load_dataset("kroshan/BioASQ", split="train")

    def _to_row(example: dict[str, Any]) -> dict[str, Any]:
        row = _parse_bioasq_hf(example)
        # The reward function only receives ``info``, not the full dataset
        # row, so make sure the question text is reachable from it.
        row["info"].setdefault("question", row["question"])
        return row

    dataset = raw_ds.map(_to_row)

    # -------- normalize answer_format --------
    answer_format = AnswerFormat(answer_format) if isinstance(answer_format, str) else answer_format

    if answer_format == AnswerFormat.XML:
        system_prompt = system_prompt or XML_SYSTEM_PROMPT
        parser = vf.XMLParser(fields=["answer"], answer_field="answer")
    elif answer_format == AnswerFormat.BOXED:
        system_prompt = system_prompt or BOXED_SYSTEM_PROMPT
        parser = vf.Parser(extract_fn=extract_boxed_answer)
    else:
        raise ValueError(f"Unsupported answer format: {answer_format=}")

    # -------- setup judge --------
    api_key = default_judge_api_key(judge_base_url) if judge_api_key is None else judge_api_key
    sampling_args, default_headers = judge_sampling_args_and_headers(judge_model, judge_base_url)

    judge_parser = JSONParser(fields=list(JUDGE_DIMENSIONS))
    judge_rubric = vf.JudgeRubric(
        judge_client=AsyncOpenAI(base_url=judge_base_url, api_key=api_key, default_headers=default_headers),
        judge_model=judge_model,
        judge_prompt="{question}",  # gets filled in during judge_rubric.judge() call
        parser=parser,
        judge_sampling_args=sampling_args,
    )

    async def judge_rubric_reward(completion: Messages, info: Info, state: State, **kwargs: Any) -> float:
        """Score a completion with the LLM judge; returns a reward in [0, 1]."""
        question = str(info.get("question") or "")
        context = str(info.get("context") or "")
        gold_answer = str(info.get("ideal_answer") or "")
        completion_text = _extract_completion_text(completion, parser)

        judge_prompt = JUDGE_TEMPLATE.format(
            question=question,
            context=context,
            response=completion_text,
            gold_answer=gold_answer,
            output_format=JUDGE_OUTPUT_JSON,
        )

        # judge_prompt is substituted for the {question} placeholder inside
        # judge_rubric.judge().
        judge_raw = await judge_rubric.judge(judge_prompt, completion_text, gold_answer, state)
        parsed = judge_parser.parse(str(judge_raw), strip=True)

        if parsed is None:
            # Judge output could not be parsed: record empty scores so the
            # reward is 0 while the raw judge text remains in the feedback.
            parsed = {dimension: {"score": None, "explanation": None, "raw": None} for dimension in JUDGE_DIMENSIONS}

        normalized = _compute_normalized_reward(parsed)

        info.setdefault("judge_feedback", []).append(
            {
                "scores": parsed,
                "raw_judge": judge_raw,
            }
        )

        return normalized

    judge_rubric.add_reward_func(judge_rubric_reward, weight=1.0)

    return vf.SingleTurnEnv(
        dataset=dataset,
        system_prompt=system_prompt,
        rubric=judge_rubric,
        parser=parser,
        **kwargs,
    )
# Judge template adapted from BioASQ manual assessment criteria
# Evaluates biomedical ideal answers for question answering
# Grounded in official Task 1b synthesis requirements

# Dimension names the judge must score; these are also the keys expected in
# its JSON/XML output and by the environment's JSONParser.
JUDGE_DIMENSIONS = ("precision", "recall", "repetition", "readability")

# Filled via str.format with: question, context, response, gold_answer,
# output_format (one of JUDGE_OUTPUT_JSON / JUDGE_OUTPUT_XML).
JUDGE_TEMPLATE = """\
You are a biomedical expert tasked with evaluating the quality of a generated answer to a biomedical question.

Your goal is to assess how well the generated answer addresses the question and how it compares to the reference answer in terms of precision, recall, repetition, and readability.

The biomedical question will be provided in these tags:
<question>
{question}
</question>

Supporting evidence (Context snippets used to derive the answer):
<context>
{context}
</context>

The generated response will be provided in these tags:
<response>
{response}
</response>

The reference answer will be provided in these tags:
<gold_answer>
{gold_answer}
</gold_answer>

Carefully review the <response> based on the <question>, the <gold_answer>, and the supporting <context>.

For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent), and provide a short justification for your score.

Evaluation Criteria:
1. Precision (1-5)
- Does the generated response provide accurate biomedical information that is relevant to the question? Penalize information not supported by the context.

2. Recall (1-5)
- Does the response include all important biomedical concepts and facts mentioned in the reference answer?

3. Repetition (1-5)
- Does the response avoid unnecessary repetition? (1 = lots of repetition, 5 = no repetition)

4. Readability (1-5)
- Is the response written clearly and organized in a way that is easy to read for biomedical professionals?

Output Format:
{output_format}
"""

JUDGE_OUTPUT_JSON = """
Output your evaluation as a single valid JSON object matching the following structure:
{
  "precision": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  },
  "recall": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  },
  "repetition": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  },
  "readability": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\\") to maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

JUDGE_OUTPUT_XML = """
Output your evaluation as a single valid XML object matching the following structure:
<evaluation>
  <precision>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </precision>
  <recall>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </recall>
  <repetition>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </repetition>
  <readability>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </readability>
</evaluation>

Ensure the output is valid XML:
- Escape special characters in text nodes: & as &amp;, < as &lt;, > as &gt;, " as &quot;, ' as &apos;.
- Do not include any additional information in the output.
"""
+""" \ No newline at end of file diff --git a/environments/bioasq/pyproject.toml b/environments/bioasq/pyproject.toml new file mode 100644 index 00000000..203d176a --- /dev/null +++ b/environments/bioasq/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "bioasq" +description = "BioASQ evaluation environment for biomedical question answering" +readme = "README.md" +tags = ["medical", "biomedical", "qa", "question-answering", "llm-judge", "single-turn"] +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "verifiers>=0.1.5.post0", + "medarc_verifiers>=0.1.0", + "openai", + "datasets>=2.13.0", +] + +[tool.prime.environment] +loader = "bioasq:load_environment" +display_name = "BioASQ" +visibility = "PUBLIC" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.uv.sources] +medarc_verifiers = { git = "https://github.com/MedARC-AI/med-lm-envs" } \ No newline at end of file