diff --git a/configs/envs/bioasq.yaml b/configs/envs/bioasq.yaml
new file mode 100644
index 00000000..144c4345
--- /dev/null
+++ b/configs/envs/bioasq.yaml
@@ -0,0 +1,17 @@
+- id: bioasq
+ module: bioasq
+ rollouts_per_example: 1
+ num_examples: -1
+ verbose: false
+ env_args:
+ answer_format: xml
+ judge_model: gpt-4o-mini
+
+- id: bioasq-boxed
+ module: bioasq
+ rollouts_per_example: 1
+ num_examples: -1
+ verbose: false
+ env_args:
+ answer_format: boxed
+ judge_model: gpt-4o-mini
\ No newline at end of file
diff --git a/environments/bioasq/README.md b/environments/bioasq/README.md
new file mode 100644
index 00000000..dae2a3c2
--- /dev/null
+++ b/environments/bioasq/README.md
@@ -0,0 +1,242 @@
+# BioASQ
+
+## Overview
+
+- **Environment ID:** `bioasq`
+- **Short description:**
+ BioASQ is a benchmark for large-scale biomedical semantic question answering that evaluates a model’s ability to generate comprehensive, expert-level answers grounded in scientific literature.
+- **Task Type:**
+ Single-turn biomedical question answering and multi-document summarization.
+
+---
+
+## Dataset
+
+BioASQ-QA reflects real-world information needs of biomedical experts. The benchmark is challenging because systems must reason over both structured and unstructured biomedical evidence to generate paragraph-sized *ideal answers*.
+
+- **Split sizes:**
+ - **Training:** 5,389 biomedical questions (compiled from previous BioASQ editions)
+ - **Test:** 340 new questions across 4 batches (85 questions per batch)
+ - **Total:** 5,729 questions with ideal answers and supporting evidence
+- **Source:** https://huggingface.co/datasets/kroshan/BioASQ
+- **Official Website:** http://bioasq.org/participate
+- **Implementation based on:** BioASQ 2025 Overview Paper (Task 13b)
+
+---
+
+## Task
+
+- **Type:** Single-turn
+- **Rubric:** `JudgeRubric` (LLM-as-a-Judge evaluation adapted from official BioASQ manual assessment criteria)
+- **Task description:**
+ Given a biomedical question and a set of relevant evidence snippets, generate a comprehensive *ideal answer*—a paragraph-sized summary intended for biomedical professionals.
+- **Prompt template:**
+
+ > *Here is a biomedical question and several relevant snippets. Provide a comprehensive answer strictly based on these snippets.*
+
+---
+
+## Evaluation Dimensions
+
+Each response is scored on a **1–5 scale** (5 is best) following official BioASQ manual evaluation criteria:
+
+- **Precision (1–5):**
+ Accuracy and relevance of biomedical facts. Unsupported information is penalized.
+- **Recall (1–5):**
+ Coverage of all important biomedical concepts present in the gold answer.
+- **Repetition (1–5):**
+ Redundancy in the response (1 = excessive repetition, 5 = none).
+- **Readability (1–5):**
+ Clarity, organization, and professional readability.
+
+---
+
+## Quickstart
+
+Run evaluation with default settings (dataset downloads automatically):
+
+```bash
+uv run vf-eval bioasq
+```
+
+### Use a custom judge model
+
+```bash
+uv run vf-eval bioasq -m gpt-4o --env-args '{"judge_model": "gpt-4o"}'
+```
+
+### Enable chain-of-thought (CoT) prompting
+
+```bash
+uv run vf-eval bioasq --env-args '{"use_think": true}'
+```
+
+### Notes
+
+* Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.
+* By default, the environment uses **`gpt-4o-mini`** as the judge model.
+
+---
+
+## Environment Arguments
+
+| Argument | Type | Default | Description |
+| ---------------- | ---- | --------------- | ------------------------------------------------------------ |
+| `cache_dir` | str | `None` | Path to cache directory |
+| `use_think`      | bool | `False`         | Enable chain-of-thought prompting using `<think>...</think>` tags |
+| `judge_model` | str | `"gpt-4o-mini"` | LLM used to judge ideal answers |
+| `judge_base_url` | str | `None` | Custom base URL for judge API |
+| `judge_api_key` | str | `None` | API key for judge model |
+
+---
+
+## Results Dataset Structure
+
+### Core Evaluation Fields
+
+* **`prompt`**
+ Biomedical question and associated context snippets presented to the model.
+ Represented as a list of message objects: `(role, content)`.
+
+* **`completion`**
+ Model-generated paragraph-sized ideal answer.
+
+* **`reward`**
+ Scalar score in the range **0.0–1.0**, computed as:
+
+```text
+(precision/5 + recall/5 + repetition/5 + readability/5) / 4
+```
+
+---
+
+## Example Metadata (`info`)
+
+The `info` field contains BioASQ-specific metadata for each example:
+
+* **`filename`** – Unique question identifier
+* **`question_type`** – One of: `yesno`, `factoid`, `list`, `summary`
+* **`ideal_answer`** – Gold standard paragraph-sized reference answer
+* **`exact_answer`** – Expected exact answer (format depends on question type)
+* **`documents`** – List of relevant PubMed document URLs
+* **`context`** – Concatenated text snippets used as supporting evidence
+* **`judge_feedback`** – Judge explanations and per-dimension scores
+
+### Notes
+
+* The `question` field contains the biomedical question text.
+* The `answer` field contains the gold ideal answer.
+* Scores are normalized to 0–1 before averaging.
+* If judge parsing fails, dimension scores default to `None` and contribute 0 to the averaged reward.
+
+---
+
+## Data Processing
+
+The dataset follows specifications from the **BioASQ 2025 paper**:
+
+* **Question Types:** Yes/No, Factoid, List, Summary (Section 2.1)
+* **Ideal Answers:** Paragraph-sized summaries for all question types
+* **Exact Answers:**
+
+ * Yes/No for yes-no questions
+ * Entity name(s) for factoid and list questions
+* **Supporting Evidence:** PubMed documents and curated text snippets
+
+---
+
+## Dataset Examples
+
+### Example 1: Yes/No Question
+
+**Question:**
+Is the protein Papilin secreted?
+
+**Question Type:** `yesno`
+
+**Ideal Answer (Reference):**
+Yes, Papilin is a secreted protein. It is an extracellular matrix glycoprotein that contains proteoglycan-like domains and is secreted into the basement membrane.
+
+---
+
+### Example 2: Factoid Question
+
+**Question:**
+Which gene is mutated in Huntington's disease?
+
+**Question Type:** `factoid`
+
+**Ideal Answer (Reference):**
+Huntington's disease is caused by mutations in the HTT gene (also known as the Huntingtin gene). The mutation involves an expansion of CAG trinucleotide repeats in the HTT gene.
+
+---
+
+### Example 3: List Question
+
+**Question:**
+Which are the different isoforms of the mammalian Notch receptor?
+
+**Question Type:** `list`
+
+**Ideal Answer (Reference):**
+In mammals, there are four Notch receptor isoforms: Notch1, Notch2, Notch3, and Notch4.
+
+---
+
+### Example 4: Summary Question
+
+**Question:**
+What is the mechanism of action of pembrolizumab?
+
+**Question Type:** `summary`
+
+**Ideal Answer (Reference):**
+Pembrolizumab is a humanized monoclonal antibody that targets the programmed cell death protein 1 (PD-1) receptor. It blocks the interaction between PD-1 and its ligands PD-L1 and PD-L2, preventing pathway-mediated immune inhibition and enhancing anti-tumor immunity.
+
+---
+
+## Dataset Statistics
+
+Breakdown based on the **BioASQ 2025 paper (Table 1)**:
+
+| Dataset | Questions | Yes/No | List | Factoid | Summary | Avg Docs | Avg Snippets |
+| ------------ | --------- | --------- | --------- | --------- | --------- | -------- | ------------ |
+| Training | 5,389 | 1,459 | 1,047 | 1,600 | 1,283 | 9.74 | 12.78 |
+| Test Batch 1 | 85 | 17 | 23 | 26 | 19 | 2.68 | 3.74 |
+| Test Batch 2 | 85 | 17 | 19 | 27 | 22 | 2.71 | 3.06 |
+| Test Batch 3 | 85 | 22 | 22 | 20 | 21 | 3.00 | 3.66 |
+| Test Batch 4 | 85 | 26 | 19 | 22 | 18 | 3.15 | 3.92 |
+| **Total** | **5,729** | **1,541** | **1,130** | **1,695** | **1,363** | **9.33** | **12.23** |
+
+---
+
+## References
+
+```bibtex
+@article{nentidis2025bioasq,
+ title={Overview of BioASQ 2025: The thirteenth BioASQ challenge on large-scale biomedical semantic indexing and question answering},
+ author={Nentidis, Anastasios and Katsimpras, Georgios and Krithara, Anastasia and others},
+ journal={arXiv preprint arXiv:2508.20554},
+ year={2025}
+}
+
+@article{tsatsaronis2015bioasq,
+ title={An overview of the BIOASQ large-scale biomedical semantic indexing and question answering competition},
+ author={Tsatsaronis, George and Balikas, Georgios and Malakasiotis, Prodromos and others},
+ journal={BMC Bioinformatics},
+ volume={16},
+ pages={138},
+ year={2015}
+}
+
+@article{krithara2023bioasq,
+ title={BioASQ-QA: A manually curated corpus for Biomedical Question Answering},
+ author={Krithara, Anastasia and Nentidis, Anastasios and Bougiatiotis, Konstantinos and Paliouras, Georgios},
+ journal={Scientific Data},
+ volume={10},
+ number={1},
+ pages={170},
+ year={2023}
+}
+```
+
diff --git a/environments/bioasq/bioasq/__init__.py b/environments/bioasq/bioasq/__init__.py
new file mode 100644
index 00000000..e69ea893
--- /dev/null
+++ b/environments/bioasq/bioasq/__init__.py
@@ -0,0 +1,3 @@
+from .bioasq import load_environment
+
+__all__ = ["load_environment"]
\ No newline at end of file
diff --git a/environments/bioasq/bioasq/bioasq.py b/environments/bioasq/bioasq/bioasq.py
new file mode 100644
index 00000000..d9c0f917
--- /dev/null
+++ b/environments/bioasq/bioasq/bioasq.py
@@ -0,0 +1,188 @@
+from typing import Any, Dict, Optional
+
+import verifiers as vf
+from datasets import load_dataset
+from datasets.utils.logging import disable_progress_bar
+from medarc_verifiers.parsers import JSONParser
+from medarc_verifiers.prompts import XML_SYSTEM_PROMPT, AnswerFormat
+from medarc_verifiers.utils import default_judge_api_key, judge_sampling_args_and_headers
+from openai import AsyncOpenAI
+from verifiers.types import Info, Messages, State
+from verifiers.utils.data_utils import BOXED_SYSTEM_PROMPT, extract_boxed_answer
+from .judge_prompts import JUDGE_DIMENSIONS, JUDGE_OUTPUT_JSON, JUDGE_TEMPLATE
+
+disable_progress_bar() # suppress datasets progress indicators
+
+# System prompt aligned with BioASQ Task 1b synthesis requirement
+PROMPT = "Provide a comprehensive answer to the following biomedical question strictly based on the provided snippets."
+
+def _parse_bioasq_hf(example: dict[str, Any]) -> dict[str, Any]:
+ """Parses Hugging Face BioASQ format into the Med-LM-Env structure."""
+ # Note: Structure depends on the specific HF version;
+ # BioASQ 1b/13b typically uses 'body', 'ideal_answer', and 'snippets'
+ question_text = example.get("body", example.get("question", ""))
+ ideal_answer = example.get("ideal_answer", "")
+ snippets = example.get("snippets", [])
+
+ # Handle different snippet formats (list of strings vs list of dicts)
+ snippet_texts = []
+ for s in snippets:
+ if isinstance(s, dict):
+ snippet_texts.append(s.get("text", ""))
+ else:
+ snippet_texts.append(str(s))
+
+ return {
+ "question": question_text,
+ "answer": ideal_answer,
+ "info": {
+ "question_type": example.get("type", "summary"),
+ "ideal_answer": ideal_answer,
+ "context": "\n".join(snippet_texts),
+ "documents": example.get("documents", [])
+ }
+ }
+
+def _coerce_score(value: Any) -> float | None:
+ """Convert score value to float or None if invalid."""
+ if value is None:
+ return None
+ if isinstance(value, (int, float)):
+ return float(value)
+ if isinstance(value, str):
+ value = value.strip()
+ if not value:
+ return None
+ try:
+ return float(value)
+ except ValueError:
+ return None
+ return None
+
+
+def _compute_normalized_reward(
+    scores: dict[str, dict[str, Any]],
+    min_score: float | None = None,
+    max_score: float | None = None,
+) -> float:
+    """Accumulate per-dimension judge scores normalized from [min_score, max_score] to [0.0, 1.0]"""
+    min_score = min_score if min_score is not None else 1  # NOTE(review): min_score is never used below — confirm whether (score - min) / (max - min) normalization was intended
+    max_score = max_score if max_score is not None else 5  # BioASQ manual scores run 1-5
+
+    total_dims = len(JUDGE_DIMENSIONS)
+    if total_dims == 0:  # defensive: avoid division by zero if the dimension tuple is ever emptied
+        return 0.0
+
+    accumulated = 0.0
+    for dimension in JUDGE_DIMENSIONS:
+        score = _coerce_score(scores.get(dimension, {}).get("score"))
+        if score is None:  # unparsable/missing scores add 0 but still count in the denominator
+            continue
+        clamped = max(0.0, min(max_score, score))  # clamp to [0, max_score] before scaling
+        accumulated += clamped / max_score  # i.e. score/5 per dimension, matching the README formula
+
+    return max(0.0, min(1.0, accumulated / total_dims))
+
+
+def _extract_completion_text(completion: Messages, parser: vf.Parser) -> str:
+ """Extract completion text, respecting parser if available."""
+ if isinstance(completion, list) and completion:
+ last_msg = completion[-1]
+ if isinstance(last_msg, dict):
+ return str(last_msg.get("content", ""))
+ return str(completion)
+
+
+def load_environment(
+ answer_format: AnswerFormat | str = AnswerFormat.XML,
+ judge_model: str = "gpt-4o-mini",
+ judge_base_url: str | None = None,
+ judge_api_key: str | None = None,
+ system_prompt: Optional[str] = None,
+ **kwargs: Any,
+) -> vf.Environment:
+ """Load BioASQ environment for biomedical question answering evaluation.
+
+ Args:
+ answer_format: Format for model responses (XML or BOXED)
+ judge_model: Model to use for LLM-as-judge evaluation
+ judge_base_url: Base URL for judge model API
+ judge_api_key: API key for judge model
+ system_prompt: Custom system prompt (defaults to BioASQ-specific prompt)
+ **kwargs: Additional arguments passed to SingleTurnEnv
+ """
+ # Load from the provided Hugging Face path
+ raw_ds = load_dataset("kroshan/BioASQ", split="train")
+ dataset = raw_ds.map(lambda x, idx: _parse_bioasq_hf(x), with_indices=True)
+
+ # -------- normalize answer_format --------
+ answer_format = AnswerFormat(answer_format) if isinstance(answer_format, str) else answer_format
+
+ if answer_format == AnswerFormat.XML:
+ system_prompt = system_prompt or XML_SYSTEM_PROMPT
+ parser_fields = ["answer"]
+ parser = vf.XMLParser(fields=parser_fields, answer_field="answer")
+ elif answer_format == AnswerFormat.BOXED:
+ system_prompt = system_prompt or BOXED_SYSTEM_PROMPT
+ parser = vf.Parser(extract_fn=extract_boxed_answer)
+ else:
+ raise ValueError(f"Unsupported answer format: {answer_format=}")
+
+ # -------- setup judge --------
+ api_key = default_judge_api_key(judge_base_url) if judge_api_key is None else judge_api_key
+ sampling_args, default_headers = judge_sampling_args_and_headers(judge_model, judge_base_url)
+
+ judge_parser = JSONParser(fields=list(JUDGE_DIMENSIONS))
+ judge_rubric = vf.JudgeRubric(
+ judge_client=AsyncOpenAI(base_url=judge_base_url, api_key=api_key, default_headers=default_headers),
+ judge_model=judge_model,
+ judge_prompt="{question}", # gets filled in during judge_rubric.judge() call
+ parser=parser,
+ judge_sampling_args=sampling_args,
+ )
+
+ async def judge_rubric_reward(completion: Messages, info: Info, state: State, **kwargs: Any) -> float:
+ question = str(info.get("question") or "")
+ context = str(info.get("context") or "")
+ gold_answer = str(info.get("ideal_answer") or "")
+ completion_text = _extract_completion_text(completion, parser)
+
+ judge_prompt = JUDGE_TEMPLATE.format(
+ question=question,
+ context=context,
+ response=completion_text,
+ gold_answer=gold_answer,
+ output_format=JUDGE_OUTPUT_JSON,
+ )
+
+ # judge_prompt assigned to question var inside judge_rubric.judge() method
+ try:
+ judge_raw = await judge_rubric.judge(judge_prompt, completion_text, gold_answer, state)
+ parsed = judge_parser.parse(str(judge_raw), strip=True)
+ except AttributeError:
+ judge_raw = await judge_rubric.judge(judge_prompt, completion_text, gold_answer, state)
+ parsed = judge_parser.parse(str(judge_raw), strip=True)
+
+ if parsed is None:
+ parsed = {dimension: {"score": None, "explanation": None, "raw": None} for dimension in JUDGE_DIMENSIONS}
+
+ normalized = _compute_normalized_reward(parsed)
+
+ info.setdefault("judge_feedback", []).append(
+ {
+ "scores": parsed,
+ "raw_judge": judge_raw,
+ }
+ )
+
+ return normalized
+
+ judge_rubric.add_reward_func(judge_rubric_reward, weight=1.0)
+
+ return vf.SingleTurnEnv(
+ dataset=dataset,
+ system_prompt=system_prompt,
+ rubric=judge_rubric,
+ parser=parser,
+ **kwargs,
+ )
\ No newline at end of file
diff --git a/environments/bioasq/bioasq/judge_prompts.py b/environments/bioasq/bioasq/judge_prompts.py
new file mode 100644
index 00000000..668ec507
--- /dev/null
+++ b/environments/bioasq/bioasq/judge_prompts.py
@@ -0,0 +1,104 @@
+# Judge template adapted from BioASQ manual assessment criteria
+# Evaluates biomedical ideal answers for question answering
+# Grounded in official Task 1b synthesis requirements
+
+JUDGE_DIMENSIONS = ("precision", "recall", "repetition", "readability")
+
+JUDGE_TEMPLATE = """\
+You are a biomedical expert tasked with evaluating the quality of a generated answer to a biomedical question.
+
+Your goal is to assess how well the generated answer addresses the question and how it compares to the reference answer in terms of precision, recall, repetition, and readability.
+
+The biomedical question will be provided in these tags:
+
+{question}
+
+
+Supporting evidence (Context snippets used to derive the answer):
+
+{context}
+
+
+The generated response will be provided in these tags:
+
+{response}
+
+
+The reference answer will be provided in these tags:
+
+{gold_answer}
+
+
+Carefully review the based on the and the supporting .
+
+For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent), and provide a short justification for your score.
+
+Evaluation Criteria:
+1. Precision (1-5)
+- Does the generated response provide accurate biomedical information that is relevant to the question? Penalize information not supported by the context.
+
+2. Recall (1-5)
+- Does the response include all important biomedical concepts and facts mentioned in the reference answer?
+
+3. Repetition (1-5)
+- Does the response avoid unnecessary repetition? (1 = lots of repetition, 5 = no repetition)
+
+4. Readability (1-5)
+- Is the response written clearly and organized in a way that is easy to read for biomedical professionals?
+
+Output Format:
+{output_format}
+"""
+
+JUDGE_OUTPUT_JSON = """
+Output your evaluation as a single valid JSON object matching the following structure:
+{
+ "precision": {
+ "explanation": "Brief explanation of why this score was given.",
+ "score": 0
+ },
+ "recall": {
+ "explanation": "Brief explanation of why this score was given.",
+ "score": 0
+ },
+ "repetition": {
+ "explanation": "Brief explanation of why this score was given.",
+ "score": 0
+ },
+ "readability": {
+ "explanation": "Brief explanation of why this score was given.",
+ "score": 0
+ }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\\") to maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+JUDGE_OUTPUT_XML = """
+Output your evaluation as a single valid XML object matching the following structure:
+
+
+ Brief explanation of why this score was given.
+ 0
+
+
+ Brief explanation of why this score was given.
+ 0
+
+
+ Brief explanation of why this score was given.
+ 0
+
+
+ Brief explanation of why this score was given.
+ 0
+
+
+
+Ensure the output is valid XML:
+- Escape special characters in text nodes: & as &, < as <, > as >, " as ", ' as '.
+- Do not include any additional information in the output.
+"""
\ No newline at end of file
diff --git a/environments/bioasq/pyproject.toml b/environments/bioasq/pyproject.toml
new file mode 100644
index 00000000..203d176a
--- /dev/null
+++ b/environments/bioasq/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "bioasq"
+description = "BioASQ evaluation environment for biomedical question answering"
+readme = "README.md"
+tags = ["medical", "biomedical", "qa", "question-answering", "llm-judge", "single-turn"]
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+ "verifiers>=0.1.5.post0",
+ "medarc_verifiers>=0.1.0",
+ "openai",
+ "datasets>=2.13.0",
+]
+
+[tool.prime.environment]
+loader = "bioasq:load_environment"
+display_name = "BioASQ"
+visibility = "PUBLIC"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.uv.sources]
+medarc_verifiers = { git = "https://github.com/MedARC-AI/med-lm-envs" }
\ No newline at end of file