diff --git a/configs/envs/bioasq.yaml b/configs/envs/bioasq.yaml new file mode 100644 index 00000000..144c4345 --- /dev/null +++ b/configs/envs/bioasq.yaml @@ -0,0 +1,17 @@ +- id: bioasq + module: bioasq + rollouts_per_example: 1 + num_examples: -1 + verbose: false + env_args: + answer_format: xml + judge_model: gpt-4o-mini + +- id: bioasq-boxed + module: bioasq + rollouts_per_example: 1 + num_examples: -1 + verbose: false + env_args: + answer_format: boxed + judge_model: gpt-4o-mini \ No newline at end of file diff --git a/environments/bioasq/README.md b/environments/bioasq/README.md new file mode 100644 index 00000000..dae2a3c2 --- /dev/null +++ b/environments/bioasq/README.md @@ -0,0 +1,242 @@ +# BioASQ + +## Overview + +- **Environment ID:** `bioasq` +- **Short description:** + BioASQ is a benchmark for large-scale biomedical semantic question answering that evaluates a model’s ability to generate comprehensive, expert-level answers grounded in scientific literature. +- **Task Type:** + Single-turn biomedical question answering and multi-document summarization. + +--- + +## Dataset + +BioASQ-QA reflects real-world information needs of biomedical experts. The benchmark is challenging because systems must reason over both structured and unstructured biomedical evidence to generate paragraph-sized *ideal answers*. 
+ +- **Split sizes:** + - **Training:** 5,389 biomedical questions (compiled from previous BioASQ editions) + - **Test:** 340 new questions across 4 batches (85 questions per batch) + - **Total:** 5,729 questions with ideal answers and supporting evidence +- **Source:** https://huggingface.co/datasets/kroshan/BioASQ +- **Official Website:** http://bioasq.org/participate +- **Implementation based on:** BioASQ 2025 Overview Paper (Task 13b) + +--- + +## Task + +- **Type:** Single-turn +- **Rubric:** `JudgeRubric` (LLM-as-a-Judge evaluation adapted from official BioASQ manual assessment criteria) +- **Task description:** + Given a biomedical question and a set of relevant evidence snippets, generate a comprehensive *ideal answer*—a paragraph-sized summary intended for biomedical professionals. +- **Prompt template:** + + > *Here is a biomedical question and several relevant snippets. Provide a comprehensive answer strictly based on these snippets.* + +--- + +## Evaluation Dimensions + +Each response is scored on a **1–5 scale** (5 is best) following official BioASQ manual evaluation criteria: + +- **Precision (1–5):** + Accuracy and relevance of biomedical facts. Unsupported information is penalized. +- **Recall (1–5):** + Coverage of all important biomedical concepts present in the gold answer. +- **Repetition (1–5):** + Redundancy in the response (1 = excessive repetition, 5 = none). +- **Readability (1–5):** + Clarity, organization, and professional readability. + +--- + +## Quickstart + +Run evaluation with default settings (dataset downloads automatically): + +```bash +uv run vf-eval bioasq +```` + +### Use a custom judge model + +```bash +uv run vf-eval bioasq -m gpt-4o --env-args '{"judge_model": "gpt-4o"}' +``` + +### Enable chain-of-thought (CoT) prompting + +```bash +uv run vf-eval bioasq --env-args '{"use_think": true}' +``` + +### Notes + +* Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. 
+* By default, the environment uses **`gpt-4o-mini`** as the judge model. + +--- + +## Environment Arguments + +| Argument | Type | Default | Description | +| ---------------- | ---- | --------------- | ------------------------------------------------------------ | +| `cache_dir` | str | `None` | Path to cache directory | +| `use_think` | bool | `False` | Enable chain-of-thought prompting using `...` | +| `judge_model` | str | `"gpt-4o-mini"` | LLM used to judge ideal answers | +| `judge_base_url` | str | `None` | Custom base URL for judge API | +| `judge_api_key` | str | `None` | API key for judge model | + +--- + +## Results Dataset Structure + +### Core Evaluation Fields + +* **`prompt`** + Biomedical question and associated context snippets presented to the model. + Represented as a list of message objects: `(role, content)`. + +* **`completion`** + Model-generated paragraph-sized ideal answer. + +* **`reward`** + Scalar score in the range **0.0–1.0**, computed as: + +```text +(precision/5 + recall/5 + repetition/5 + readability/5) / 4 +``` + +--- + +## Example Metadata (`info`) + +The `info` field contains BioASQ-specific metadata for each example: + +* **`filename`** – Unique question identifier +* **`question_type`** – One of: `yesno`, `factoid`, `list`, `summary` +* **`ideal_answer`** – Gold standard paragraph-sized reference answer +* **`exact_answer`** – Expected exact answer (format depends on question type) +* **`documents`** – List of relevant PubMed document URLs +* **`context`** – Concatenated text snippets used as supporting evidence +* **`judge_feedback`** – Judge explanations and per-dimension scores + +### Notes + +* The `question` field contains the biomedical question text. +* The `answer` field contains the gold ideal answer. +* Scores are normalized to 0–1 before averaging. +* If judge parsing fails, dimension scores default to `None` and are excluded. 
+ +--- + +## Data Processing + +The dataset follows specifications from the **BioASQ 2025 paper**: + +* **Question Types:** Yes/No, Factoid, List, Summary (Section 2.1) +* **Ideal Answers:** Paragraph-sized summaries for all question types +* **Exact Answers:** + + * Yes/No for yes-no questions + * Entity name(s) for factoid and list questions +* **Supporting Evidence:** PubMed documents and curated text snippets + +--- + +## Dataset Examples + +### Example 1: Yes/No Question + +**Question:** +Is the protein Papilin secreted? + +**Question Type:** `yesno` + +**Ideal Answer (Reference):** +Yes, Papilin is a secreted protein. It is an extracellular matrix glycoprotein that contains proteoglycan-like domains and is secreted into the basement membrane. + +--- + +### Example 2: Factoid Question + +**Question:** +Which gene is mutated in Huntington's disease? + +**Question Type:** `factoid` + +**Ideal Answer (Reference):** +Huntington's disease is caused by mutations in the HTT gene (also known as the Huntingtin gene). The mutation involves an expansion of CAG trinucleotide repeats in the HTT gene. + +--- + +### Example 3: List Question + +**Question:** +Which are the different isoforms of the mammalian Notch receptor? + +**Question Type:** `list` + +**Ideal Answer (Reference):** +In mammals, there are four Notch receptor isoforms: Notch1, Notch2, Notch3, and Notch4. + +--- + +### Example 4: Summary Question + +**Question:** +What is the mechanism of action of pembrolizumab? + +**Question Type:** `summary` + +**Ideal Answer (Reference):** +Pembrolizumab is a humanized monoclonal antibody that targets the programmed cell death protein 1 (PD-1) receptor. It blocks the interaction between PD-1 and its ligands PD-L1 and PD-L2, preventing pathway-mediated immune inhibition and enhancing anti-tumor immunity. 
+ +--- + +## Dataset Statistics + +Breakdown based on the **BioASQ 2025 paper (Table 1)**: + +| Dataset | Questions | Yes/No | List | Factoid | Summary | Avg Docs | Avg Snippets | +| ------------ | --------- | --------- | --------- | --------- | --------- | -------- | ------------ | +| Training | 5,389 | 1,459 | 1,047 | 1,600 | 1,283 | 9.74 | 12.78 | +| Test Batch 1 | 85 | 17 | 23 | 26 | 19 | 2.68 | 3.74 | +| Test Batch 2 | 85 | 17 | 19 | 27 | 22 | 2.71 | 3.06 | +| Test Batch 3 | 85 | 22 | 22 | 20 | 21 | 3.00 | 3.66 | +| Test Batch 4 | 85 | 26 | 19 | 22 | 18 | 3.15 | 3.92 | +| **Total** | **5,729** | **1,541** | **1,130** | **1,695** | **1,363** | **9.33** | **12.23** | + +--- + +## References + +```bibtex +@article{nentidis2025bioasq, + title={Overview of BioASQ 2025: The thirteenth BioASQ challenge on large-scale biomedical semantic indexing and question answering}, + author={Nentidis, Anastasios and Katsimpras, Georgios and Krithara, Anastasia and others}, + journal={arXiv preprint arXiv:2508.20554}, + year={2025} +} + +@article{tsatsaronis2015bioasq, + title={An overview of the BIOASQ large-scale biomedical semantic indexing and question answering competition}, + author={Tsatsaronis, George and Balikas, Georgios and Malakasiotis, Prodromos and others}, + journal={BMC Bioinformatics}, + volume={16}, + pages={138}, + year={2015} +} + +@article{krithara2023bioasq, + title={BioASQ-QA: A manually curated corpus for Biomedical Question Answering}, + author={Krithara, Anastasia and Nentidis, Anastasios and Bougiatiotis, Konstantinos and Paliouras, Georgios}, + journal={Scientific Data}, + volume={10}, + number={1}, + pages={170}, + year={2023} +} +``` + diff --git a/environments/bioasq/bioasq/__init__.py b/environments/bioasq/bioasq/__init__.py new file mode 100644 index 00000000..e69ea893 --- /dev/null +++ b/environments/bioasq/bioasq/__init__.py @@ -0,0 +1,3 @@ +from .bioasq import load_environment + +__all__ = ["load_environment"] \ No newline at end of file diff 
--git a/environments/bioasq/bioasq/bioasq.py b/environments/bioasq/bioasq/bioasq.py new file mode 100644 index 00000000..d9c0f917 --- /dev/null +++ b/environments/bioasq/bioasq/bioasq.py @@ -0,0 +1,188 @@ +from typing import Any, Dict, Optional + +import verifiers as vf +from datasets import load_dataset +from datasets.utils.logging import disable_progress_bar +from medarc_verifiers.parsers import JSONParser +from medarc_verifiers.prompts import XML_SYSTEM_PROMPT, AnswerFormat +from medarc_verifiers.utils import default_judge_api_key, judge_sampling_args_and_headers +from openai import AsyncOpenAI +from verifiers.types import Info, Messages, State +from verifiers.utils.data_utils import BOXED_SYSTEM_PROMPT, extract_boxed_answer +from .judge_prompts import JUDGE_DIMENSIONS, JUDGE_OUTPUT_JSON, JUDGE_TEMPLATE + +disable_progress_bar() # suppress datasets progress indicators + +# System prompt aligned with BioASQ Task 1b synthesis requirement +PROMPT = "Provide a comprehensive answer to the following biomedical question strictly based on the provided snippets." 
+ +def _parse_bioasq_hf(example: dict[str, Any]) -> dict[str, Any]: + """Parses Hugging Face BioASQ format into the Med-LM-Env structure.""" + # Note: Structure depends on the specific HF version; + # BioASQ 1b/13b typically uses 'body', 'ideal_answer', and 'snippets' + question_text = example.get("body", example.get("question", "")) + ideal_answer = example.get("ideal_answer", "") + snippets = example.get("snippets", []) + + # Handle different snippet formats (list of strings vs list of dicts) + snippet_texts = [] + for s in snippets: + if isinstance(s, dict): + snippet_texts.append(s.get("text", "")) + else: + snippet_texts.append(str(s)) + + return { + "question": question_text, + "answer": ideal_answer, + "info": { + "question_type": example.get("type", "summary"), + "ideal_answer": ideal_answer, + "context": "\n".join(snippet_texts), + "documents": example.get("documents", []) + } + } + +def _coerce_score(value: Any) -> float | None: + """Convert score value to float or None if invalid.""" + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + value = value.strip() + if not value: + return None + try: + return float(value) + except ValueError: + return None + return None + + +def _compute_normalized_reward( + scores: dict[str, dict[str, Any]], + min_score: float | None = None, + max_score: float | None = None, +) -> float: + """Accumulate per-dimension judge scores normalized from [min_score, max_score] to [0.0, 1.0]""" + min_score = min_score if min_score is not None else 1 + max_score = max_score if max_score is not None else 5 + + total_dims = len(JUDGE_DIMENSIONS) + if total_dims == 0: + return 0.0 + + accumulated = 0.0 + for dimension in JUDGE_DIMENSIONS: + score = _coerce_score(scores.get(dimension, {}).get("score")) + if score is None: + continue + clamped = max(0.0, min(max_score, score)) + accumulated += clamped / max_score + + return max(0.0, min(1.0, accumulated / total_dims)) + + 
def _extract_completion_text(completion: Messages, parser: vf.Parser) -> str:
    """Extract the answer text from a completion, respecting the parser.

    Tries ``parser.parse_answer`` first (so XML/boxed answers are unwrapped
    before judging); falls back to the content of the final chat message, or
    ``str(completion)`` as a last resort.
    """
    parsed = parser.parse_answer(completion)
    if parsed:
        return str(parsed)
    if isinstance(completion, list) and completion:
        last_msg = completion[-1]
        if isinstance(last_msg, dict):
            return str(last_msg.get("content", ""))
    return str(completion)


def load_environment(
    answer_format: AnswerFormat | str = AnswerFormat.XML,
    judge_model: str = "gpt-4o-mini",
    judge_base_url: str | None = None,
    judge_api_key: str | None = None,
    system_prompt: Optional[str] = None,
    **kwargs: Any,
) -> vf.Environment:
    """Load BioASQ environment for biomedical question answering evaluation.

    Args:
        answer_format: Format for model responses (XML or BOXED).
        judge_model: Model to use for LLM-as-judge evaluation.
        judge_base_url: Base URL for judge model API.
        judge_api_key: API key for judge model.
        system_prompt: Custom system prompt (defaults to the format-specific prompt).
        **kwargs: Additional arguments passed to SingleTurnEnv.

    Raises:
        ValueError: If ``answer_format`` is not a supported format.
    """
    # Load the Hugging Face export and normalize each record.
    raw_ds = load_dataset("kroshan/BioASQ", split="train")

    def _to_row(example: dict[str, Any]) -> dict[str, Any]:
        row = _parse_bioasq_hf(example)
        # The reward function only receives ``info``, not the full dataset
        # row, so make sure the question text is reachable from it.
        row["info"].setdefault("question", row["question"])
        return row

    dataset = raw_ds.map(_to_row)

    # -------- normalize answer_format --------
    answer_format = AnswerFormat(answer_format) if isinstance(answer_format, str) else answer_format

    if answer_format == AnswerFormat.XML:
        system_prompt = system_prompt or XML_SYSTEM_PROMPT
        parser = vf.XMLParser(fields=["answer"], answer_field="answer")
    elif answer_format == AnswerFormat.BOXED:
        system_prompt = system_prompt or BOXED_SYSTEM_PROMPT
        parser = vf.Parser(extract_fn=extract_boxed_answer)
    else:
        raise ValueError(f"Unsupported answer format: {answer_format=}")

    # -------- setup judge --------
    api_key = default_judge_api_key(judge_base_url) if judge_api_key is None else judge_api_key
    sampling_args, default_headers = judge_sampling_args_and_headers(judge_model, judge_base_url)

    judge_parser = JSONParser(fields=list(JUDGE_DIMENSIONS))
    judge_rubric = vf.JudgeRubric(
        judge_client=AsyncOpenAI(base_url=judge_base_url, api_key=api_key, default_headers=default_headers),
        judge_model=judge_model,
        judge_prompt="{question}",  # gets filled in during judge_rubric.judge() call
        parser=parser,
        judge_sampling_args=sampling_args,
    )

    async def judge_rubric_reward(completion: Messages, info: Info, state: State, **kwargs: Any) -> float:
        """Score a completion with the LLM judge; returns a reward in [0, 1]."""
        question = str(info.get("question") or "")
        context = str(info.get("context") or "")
        gold_answer = str(info.get("ideal_answer") or "")
        completion_text = _extract_completion_text(completion, parser)

        judge_prompt = JUDGE_TEMPLATE.format(
            question=question,
            context=context,
            response=completion_text,
            gold_answer=gold_answer,
            output_format=JUDGE_OUTPUT_JSON,
        )

        # judge_prompt is substituted for the {question} placeholder inside
        # judge_rubric.judge().
        judge_raw = await judge_rubric.judge(judge_prompt, completion_text, gold_answer, state)
        parsed = judge_parser.parse(str(judge_raw), strip=True)

        if parsed is None:
            # Judge output could not be parsed: record empty scores so the
            # reward is 0 while the raw judge text remains in the feedback.
            parsed = {dimension: {"score": None, "explanation": None, "raw": None} for dimension in JUDGE_DIMENSIONS}

        normalized = _compute_normalized_reward(parsed)

        info.setdefault("judge_feedback", []).append(
            {
                "scores": parsed,
                "raw_judge": judge_raw,
            }
        )

        return normalized

    judge_rubric.add_reward_func(judge_rubric_reward, weight=1.0)

    return vf.SingleTurnEnv(
        dataset=dataset,
        system_prompt=system_prompt,
        rubric=judge_rubric,
        parser=parser,
        **kwargs,
    )
# Judge template adapted from BioASQ manual assessment criteria
# Evaluates biomedical ideal answers for question answering
# Grounded in official Task 1b synthesis requirements

# Dimension names the judge must score; these are also the keys expected in
# its JSON/XML output and by the environment's JSONParser.
JUDGE_DIMENSIONS = ("precision", "recall", "repetition", "readability")

# Filled via str.format with: question, context, response, gold_answer,
# output_format (one of JUDGE_OUTPUT_JSON / JUDGE_OUTPUT_XML).
JUDGE_TEMPLATE = """\
You are a biomedical expert tasked with evaluating the quality of a generated answer to a biomedical question.

Your goal is to assess how well the generated answer addresses the question and how it compares to the reference answer in terms of precision, recall, repetition, and readability.

The biomedical question will be provided in these tags:
<question>
{question}
</question>

Supporting evidence (Context snippets used to derive the answer):
<context>
{context}
</context>

The generated response will be provided in these tags:
<response>
{response}
</response>

The reference answer will be provided in these tags:
<gold_answer>
{gold_answer}
</gold_answer>

Carefully review the <response> based on the <question>, the <gold_answer>, and the supporting <context>.

For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent), and provide a short justification for your score.

Evaluation Criteria:
1. Precision (1-5)
- Does the generated response provide accurate biomedical information that is relevant to the question? Penalize information not supported by the context.

2. Recall (1-5)
- Does the response include all important biomedical concepts and facts mentioned in the reference answer?

3. Repetition (1-5)
- Does the response avoid unnecessary repetition? (1 = lots of repetition, 5 = no repetition)

4. Readability (1-5)
- Is the response written clearly and organized in a way that is easy to read for biomedical professionals?

Output Format:
{output_format}
"""

JUDGE_OUTPUT_JSON = """
Output your evaluation as a single valid JSON object matching the following structure:
{
  "precision": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  },
  "recall": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  },
  "repetition": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  },
  "readability": {
    "explanation": "Brief explanation of why this score was given.",
    "score": 0
  }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\\") to maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

JUDGE_OUTPUT_XML = """
Output your evaluation as a single valid XML object matching the following structure:
<evaluation>
  <precision>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </precision>
  <recall>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </recall>
  <repetition>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </repetition>
  <readability>
    <explanation>Brief explanation of why this score was given.</explanation>
    <score>0</score>
  </readability>
</evaluation>

Ensure the output is valid XML:
- Escape special characters in text nodes: & as &amp;, < as &lt;, > as &gt;, " as &quot;, ' as &apos;.
- Do not include any additional information in the output.
"""
+""" \ No newline at end of file diff --git a/environments/bioasq/pyproject.toml b/environments/bioasq/pyproject.toml new file mode 100644 index 00000000..203d176a --- /dev/null +++ b/environments/bioasq/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "bioasq" +description = "BioASQ evaluation environment for biomedical question answering" +readme = "README.md" +tags = ["medical", "biomedical", "qa", "question-answering", "llm-judge", "single-turn"] +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "verifiers>=0.1.5.post0", + "medarc_verifiers>=0.1.0", + "openai", + "datasets>=2.13.0", +] + +[tool.prime.environment] +loader = "bioasq:load_environment" +display_name = "BioASQ" +visibility = "PUBLIC" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.uv.sources] +medarc_verifiers = { git = "https://github.com/MedARC-AI/med-lm-envs" } \ No newline at end of file