From 2b6668201b4c58bcad5d71091a61866af024b8ed Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Tue, 26 May 2026 10:10:42 -0700 Subject: [PATCH 1/2] feat(accuracy): GPQA-Diamond lighteval-aligned benchmark loader (AIP-880) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement ``GPQADiamondBenchmark`` mirroring the trt-llm benchmark recipe's ``acc_bench_lighteval.py:gpqa_diamond`` configuration: loads ``Idavidrein/gpqa`` (subset ``gpqa_diamond``, train split) and renders the simple-evals prompt template: Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. {Question} A) {A} B) {B} C) {C} D) {D} The four answer choices (1 correct + 3 distractors) are shuffled into A/B/C/D positions via **SHA-256-seeded deterministic shuffling** — the one intentional deviation from the recipe's stochastic ``random.randint(0, 3)``. Seeding off the question text gives a stable, locale-independent, Python-version-independent permutation so gold positions reproduce across runs while still distributed uniformly. Pair with ``LightevalGPQAGrader`` (the recipe's ``gpqa_metric``). Every row's ``task`` is the constant ``gpqa_diamond``; ``metadata.domain`` (from ``row["High-level domain"]``), ``metadata.subdomain``, and ``metadata.generation_size`` are carried so results can be broken down by physics/chemistry/biology in post-hoc analysis. Built on top of AIP-879 in the lighteval sub-stack (875 → 876 → 879 → 880). No heavy optional dep — ``datasets`` is core — so CI gets 100% line + branch coverage out of the box. 
Updates the stub registry: drop ``gpqa_diamond`` from ``test_accuracy_config.STUB_BENCHMARKS``, drop ``is_implemented: false`` from the ``gpqa_diamond`` plugins.yaml entry, switch ``default_grader`` to ``lighteval_gpqa``, add the ``gpqa_diamond`` row to ``docs/accuracy/accuracy-benchmarking.md``, and move it from "Still Stubbed" to "Implemented" in ``accuracy_stubs.md`` (refreshing the Status Summary, Method Count Summary, and Suggested Implementation Order accordingly). Signed-off-by: Elias Bermudez --- docs/accuracy/accuracy-benchmarking.md | 1 + docs/accuracy/accuracy_stubs.md | 17 +- .../accuracy/benchmarks/gpqa_diamond.py | 203 +++++++++++- src/aiperf/plugin/plugins.yaml | 10 +- tests/unit/accuracy/test_accuracy_config.py | 1 - .../accuracy/test_gpqa_diamond_benchmark.py | 296 ++++++++++++++++++ 6 files changed, 506 insertions(+), 22 deletions(-) create mode 100644 tests/unit/accuracy/test_gpqa_diamond_benchmark.py diff --git a/docs/accuracy/accuracy-benchmarking.md b/docs/accuracy/accuracy-benchmarking.md index adabfe136..ee0cc7ff3 100644 --- a/docs/accuracy/accuracy-benchmarking.md +++ b/docs/accuracy/accuracy-benchmarking.md @@ -77,6 +77,7 @@ system message). 
| `aime24` | `lighteval_expr` | 0 | `HuggingFaceH4/aime_2024` (trt-llm/lighteval reference, bare problem text, `expr_gold_metric`) | | `aime25` | `lighteval_expr` | 0 | `yentinglin/aime_2025` (trt-llm/lighteval reference, bare problem text, `expr_gold_metric`) | | `math_500` | `lighteval_latex` | 0 | `HuggingFaceH4/MATH-500` (trt-llm/lighteval reference, gold is full solution containing `\boxed{answer}`, `latex_gold_metric`) | +| `gpqa_diamond` | `lighteval_gpqa` | 0 | `Idavidrein/gpqa` subset `gpqa_diamond` (trt-llm/lighteval reference, simple-evals template with SHA-256-seeded deterministic A/B/C/D shuffling, `gpqa_metric`) | ## CLI Flags diff --git a/docs/accuracy/accuracy_stubs.md b/docs/accuracy/accuracy_stubs.md index aeafe8703..30370b6b5 100644 --- a/docs/accuracy/accuracy_stubs.md +++ b/docs/accuracy/accuracy_stubs.md @@ -7,7 +7,7 @@ This document catalogs every stubbed method in the accuracy benchmarking scaffolding. The scaffolding is fully integrated into the plugin system, CLI, and config pipeline — the performance benchmarking path is unaffected. -**Status summary:** With the MATH-500 loader landing on top of the AIME25 / AIME24 / BigBench / HellaSwag stack, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `ExactMatchGrader`, `MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`, `AIME25Benchmark`, and `Math500Benchmark` are fully implemented; the remaining benchmarks (`gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. 
+**Status summary:** With the GPQA-Diamond loader landing on top of the MATH-500 / AIME25 / AIME24 / BigBench / HellaSwag stack, all seven graders and eight benchmark loaders are now implemented; only `lcb_codegeneration` remains stubbed (LiveCodeBench code-generation) and ships behind `NotImplementedError` until the AIP-881 branch lands. Use the implemented classes as canonical references when filling in the remaining stub. ## Table of Contents @@ -176,13 +176,13 @@ All benchmarks use `AIPerfLoggerMixin` and must implement 1 method. | 5 | `AIME24Benchmark` | `benchmarks/aime24.py` | `aime24` | `lighteval_expr` | 0 | **IMPLEMENTED.** Loads `HuggingFaceH4/aime_2024` (train split) and emits the bare problem text as a single user message — no instruction prefix, no few-shot priming. Mirrors the trt-llm benchmark recipe's `acc_bench_lighteval.py` configuration (`few_shots_split=None`, `generation_size=32768`). Pairs with `lighteval_expr` for the recipe's `expr_gold_metric` extraction. | | 6 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `lighteval_expr` | 0 | **IMPLEMENTED.** Same lighteval-aligned shape as `AIME24Benchmark` but pointed at `yentinglin/aime_2025` (the recipe's `aime25` task config). Identical prompt rendering, generation size, and grader pairing. | | 7 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `lighteval_latex` | 0 | **IMPLEMENTED.** Loads `HuggingFaceH4/MATH-500` (test split). Same lighteval-aligned shape as AIME24/25, but `ground_truth` is the full `solution` text (containing `\boxed{answer}`); `LightevalLatexGrader` extracts the boxed expression at grade time. Per-row `task` = `subject` so the accuracy CSV breaks down by MATH subject. | +| 8 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `lighteval_gpqa` | 0 | **IMPLEMENTED.** Loads `Idavidrein/gpqa` (subset `gpqa_diamond`, train split). 
Renders the simple-evals prompt template with **SHA-256-seeded deterministic A/B/C/D shuffling** of the correct + 3 distractor answers — one intentional deviation from the recipe's stochastic `random.randint(0, 3)` so gold positions reproduce across runs. Per-row `task` is the constant `gpqa_diamond`; `High-level domain` and `Subdomain` are carried in `metadata` (`domain`, `subdomain`) for post-hoc physics/chemistry/biology breakdowns. | ### Still Stubbed | # | Class | File | Plugin Key | Default Grader | Default N-Shots | |---|-------|------|------------|----------------|-----------------| -| 1 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | -| 2 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | +| 1 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | **Each benchmark has 1 method to implement:** @@ -309,13 +309,13 @@ All stubs are registered in `src/aiperf/plugin/plugins.yaml` and `src/aiperf/plu | Component | Implemented | Still Stubbed | Methods per Stub | Remaining Methods | |-----------|-------------|---------------|------------------|-------------------| | Graders | 7 (all) | 0 | — | 0 | -| Benchmarks | 7 (incl. MMLU, AIME, HellaSwag, BigBench, AIME24, AIME25, Math500) | 2 | 1 (`load_problems`) | 2 | +| Benchmarks | 8 (incl. 
MMLU, AIME, HellaSwag, BigBench, AIME24, AIME25, Math500, GPQADiamond) | 1 | 1 (`load_problems`) | 1 | | Record Processor | 1 (`AccuracyRecordProcessor`) | 0 | — | 0 | | Results Processor | 1 (`AccuracyResultsProcessor`) | 0 | — | 0 | | Console Exporter | 1 (`AccuracyConsoleExporter`) | 0 | — | 0 | | Data Exporter | 1 (`AccuracyDataExporter`) | 0 | — | 0 | | Stub-plugin Validator | 0 | 1 | 1 (`AccuracyConfig._reject_stub_plugins`) | 1 | -| **Total** | **18** | **3** | | **3** | +| **Total** | **19** | **2** | | **2** | ### Self-Disabling Pattern @@ -323,11 +323,10 @@ Processors and exporters raise their `Disabled` exception **in `__init__`** when ### Suggested Implementation Order -The processors, exporters, all graders, and seven benchmarks (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`, `AIME24Benchmark`, `AIME25Benchmark`, `Math500Benchmark`) are already wired end-to-end. The remaining work is the two stub benchmarks; mirror the existing loader whose grader matches: +The processors, exporters, all graders, and eight benchmarks are already wired end-to-end. The remaining work is the single stub benchmark: -1. **`gpqa_diamond`** — mirror `MMLUBenchmark` (`benchmarks/mmlu.py`); pair with the `lighteval_gpqa` grader. -2. **`lcb_codegeneration`** — mirror `MMLUBenchmark`'s scaffolding; pair with the `code_execution` grader. -3. **Stub-plugin validator** — update `AccuracyConfig._reject_stub_plugins()` whenever a benchmark moves from stubbed to supported. +1. **`lcb_codegeneration`** — mirror `MMLUBenchmark`'s scaffolding; pair with the `code_execution` grader. +2. **Stub-plugin validator** — update `AccuracyConfig._reject_stub_plugins()` when `lcb_codegeneration` lands so the validator no longer rejects it. 
### Key Files for Reference diff --git a/src/aiperf/accuracy/benchmarks/gpqa_diamond.py b/src/aiperf/accuracy/benchmarks/gpqa_diamond.py index 69086ee45..7773ea893 100644 --- a/src/aiperf/accuracy/benchmarks/gpqa_diamond.py +++ b/src/aiperf/accuracy/benchmarks/gpqa_diamond.py @@ -1,31 +1,218 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +"""GPQA-Diamond benchmark loader, aligned with the trt-llm lighteval reference. + +Mirrors ``acc_bench_lighteval.py:gpqa_diamond``: + + gpqa_diamond = LightevalTaskConfig( + name="gpqa:diamond", + prompt_function=gpqa_prompt_fn, + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_diamond", + evaluation_splits=["train"], + few_shots_split=None, + generation_size=32768, + metric=[gpqa_metric], + stop_sequence=[], + trust_dataset=True, + ) + +The recipe's ``gpqa_prompt_fn`` builds the simple-evals template: + + Answer the following multiple choice question. The last line of + your response should be of the following format: 'Answer: $LETTER' + (without quotes) where LETTER is one of ABCD. Think step by step + before answering. + + {Question} + + A) {A} + B) {B} + C) {C} + D) {D} + +The recipe's prompt_fn shuffles options with ``random.randint(0, 3)`` +(stochastic, different per call). Aiperf instead uses **SHA-256-seeded +deterministic shuffling** (per the user direction during the alignment +review) so gold positions are reproducible across runs while still +distributed uniformly. This is the one intentional deviation from the +trt-llm reference, documented in +``docs/accuracy/accuracy-benchmarking.md``. + +Pair with ``LightevalGPQAGrader`` (default), which extracts via +``IndicesExtractionConfig(prefix_for_extraction="NativeLetters")`` to +match the recipe's ``gpqa_metric``. 
+ +Reference: + trt-llm-benchmark-recipe/src/accuracy/acc_bench_lighteval.py:108,170 +""" + from __future__ import annotations -from typing import TYPE_CHECKING +import asyncio +import hashlib +import random +from typing import TYPE_CHECKING, Any -from aiperf.accuracy.models import BenchmarkProblem +from datasets import Dataset, load_dataset + +from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem from aiperf.common.mixins import AIPerfLoggerMixin if TYPE_CHECKING: from aiperf.config.resolution.plan import BenchmarkRun +DATASET_NAME = "Idavidrein/gpqa" +DATASET_CONFIG = "gpqa_diamond" +TASK_NAME = "gpqa_diamond" + +# lighteval's gpqa_diamond task config: ``generation_size=32768``. +DEFAULT_GENERATION_SIZE = 32768 + +# 4 choices per question (1 correct + 3 distractors). +NUM_CHOICES = 4 + +# Width of the SHA-256-derived seed when modded down to a 32-bit +# Python ``random.Random`` seed. +_SEED_MODULUS = 2**32 + +# Schema field names in the Idavidrein/gpqa dataset (Title Case with +# spaces — the upstream's choice). +QUESTION_FIELD = "Question" +CORRECT_ANSWER_FIELD = "Correct Answer" +INCORRECT_ANSWER_FIELDS = ( + "Incorrect Answer 1", + "Incorrect Answer 2", + "Incorrect Answer 3", +) +DOMAIN_FIELD = "High-level domain" +SUBDOMAIN_FIELD = "Subdomain" + +# Recipe's ``gpqa_prompt_fn`` template. The model is told to emit +# ``Answer: $LETTER`` so ``LightevalGPQAGrader`` (with +# ``IndicesExtractionConfig(prefix_for_extraction="NativeLetters")``) +# can extract the letter cleanly. +_PROMPT_TEMPLATE = ( + "Answer the following multiple choice question. The last line of " + "your response should be of the following format: 'Answer: $LETTER' " + "(without quotes) where LETTER is one of ABCD. Think step by step " + "before answering.\n\n" + "{Question}\n\n" + "A) {A}\n" + "B) {B}\n" + "C) {C}\n" + "D) {D}" +) + + +def _seeded_shuffle_indices(key: str, n: int) -> list[int]: + """Return a deterministic permutation of ``range(n)`` seeded by ``key``. 
+ + Uses the low-order 32 bits of SHA-256(key) as the seed for Python's + ``random.Random``. This gives a stable, locale-independent, + Python-version-independent permutation: regenerating prompts on a + new machine produces identical letter orderings. + + The recipe shuffles via ``random.randint(0, 3)`` (stochastic per + call) — see the module docstring for why aiperf chose + determinism instead. + """ + digest = hashlib.sha256(key.encode("utf-8")).hexdigest() + seed = int(digest, 16) % _SEED_MODULUS + rng = random.Random(seed) + indices = list(range(n)) + rng.shuffle(indices) + return indices + class GPQADiamondBenchmark(AIPerfLoggerMixin): - """Registered placeholder for a future GPQA Diamond loader. + """GPQA-Diamond lighteval-aligned benchmark loader. - `load_problems()` intentionally raises NotImplementedError in this release; - use the MMLU benchmark when a working accuracy loader is required. + Loads ``Idavidrein/gpqa`` (config ``gpqa_diamond``, train split). + Each row's correct + 3 incorrect answers are deterministically + shuffled into A/B/C/D positions and rendered with the simple-evals + template (matching ``gpqa_prompt_fn``). Pair with + ``LightevalGPQAGrader`` for grading parity with the recipe. """ - def __init__(self, run: BenchmarkRun, **kwargs) -> None: + def __init__(self, run: BenchmarkRun, **kwargs: Any) -> None: super().__init__(**kwargs) self.run = run async def load_problems( self, tasks: list[str] | None, n_shots: int, enable_cot: bool ) -> list[BenchmarkProblem]: - raise NotImplementedError( - "gpqa_diamond benchmark is not yet implemented; only 'mmlu' is available in this release." + """Load GPQA-Diamond problems lighteval-style. + + Args: + tasks: Ignored — lighteval's gpqa_diamond task has no + subtask filtering (per-row High-level domain is in + metadata for post-run reporting). + n_shots: Ignored — the lighteval reference is zero-shot + (``few_shots_split=None``). 
+ enable_cot: Ignored — the simple-evals template already + includes "Think step by step before answering." + + Returns: + One ``BenchmarkProblem`` per dataset row, in dataset order. + ``ground_truth`` is the gold letter ("A", "B", "C", or + "D") so ``LightevalGPQAGrader`` can pass it directly into + its ``Doc.choices=["A","B","C","D"], gold_index=...`` + shape. + """ + ds: Dataset = await asyncio.to_thread( + load_dataset, DATASET_NAME, DATASET_CONFIG, split="train" + ) + return await asyncio.to_thread(self._build_problems, ds) + + def _build_problems(self, ds: Dataset) -> list[BenchmarkProblem]: + problems: list[BenchmarkProblem] = [] + for row in ds: + choices, gold_letter = self._build_choices(row) + prompt = self._format_prompt(row, choices) + messages: list[AccuracyChatMessage] = [{"role": "user", "content": prompt}] + problems.append( + BenchmarkProblem( + prompt=prompt, + ground_truth=gold_letter, + task=TASK_NAME, + metadata={ + "domain": row.get(DOMAIN_FIELD, ""), + "subdomain": row.get(SUBDOMAIN_FIELD, ""), + "generation_size": DEFAULT_GENERATION_SIZE, + }, + raw_messages=messages, + ) + ) + return problems + + @staticmethod + def _build_choices(row: dict[str, Any]) -> tuple[list[str], str]: + """Assemble 4 lettered choices and report the gold letter. + + Uses SHA-256-seeded permutation (see ``_seeded_shuffle_indices``) + — deterministic per-question shuffle, distinct from the + recipe's stochastic ``random.randint(0, 3)``. 
+ """ + raw = [ + row[CORRECT_ANSWER_FIELD], + row[INCORRECT_ANSWER_FIELDS[0]], + row[INCORRECT_ANSWER_FIELDS[1]], + row[INCORRECT_ANSWER_FIELDS[2]], + ] + order = _seeded_shuffle_indices(row[QUESTION_FIELD], len(raw)) + ordered = [raw[i] for i in order] + gold_index = order.index(0) + gold_letter = "ABCD"[gold_index] + return ordered, gold_letter + + def _format_prompt(self, row: dict[str, Any], choices: list[str]) -> str: + """Render the simple-evals template byte-equal to the recipe.""" + return _PROMPT_TEMPLATE.format( + Question=row[QUESTION_FIELD], + A=choices[0], + B=choices[1], + C=choices[2], + D=choices[3], ) diff --git a/src/aiperf/plugin/plugins.yaml b/src/aiperf/plugin/plugins.yaml index d18e42880..0cc1d950f 100644 --- a/src/aiperf/plugin/plugins.yaml +++ b/src/aiperf/plugin/plugins.yaml @@ -1517,12 +1517,14 @@ accuracy_benchmark: gpqa_diamond: class: aiperf.accuracy.benchmarks.gpqa_diamond:GPQADiamondBenchmark description: | - GPQA Diamond benchmark with graduate-level science questions in physics, - chemistry, and biology requiring expert-level reasoning. + GPQA Diamond benchmark, aligned with the trt-llm benchmark recipe's + lighteval-backed configuration (Idavidrein/gpqa subset gpqa_diamond + + lighteval ``gpqa_metric``). Uses the simple-evals prompt template + with SHA-256-seeded deterministic A/B/C/D shuffling so gold + positions are reproducible across runs. 
metadata: - default_grader: multiple_choice + default_grader: lighteval_gpqa default_n_shots: 0 - is_implemented: false lcb_codegeneration: class: aiperf.accuracy.benchmarks.lcb_codegeneration:LCBCodeGenerationBenchmark diff --git a/tests/unit/accuracy/test_accuracy_config.py b/tests/unit/accuracy/test_accuracy_config.py index 109e8e3fc..a3e6f5b52 100644 --- a/tests/unit/accuracy/test_accuracy_config.py +++ b/tests/unit/accuracy/test_accuracy_config.py @@ -23,7 +23,6 @@ # This branch (AIP-874) implements ``aime``, ``math``, and ``code_execution``, # so those names are absent from the stub lists. STUB_BENCHMARKS = ( - "gpqa_diamond", "lcb_codegeneration", ) STUB_GRADERS: tuple[str, ...] = () diff --git a/tests/unit/accuracy/test_gpqa_diamond_benchmark.py b/tests/unit/accuracy/test_gpqa_diamond_benchmark.py new file mode 100644 index 000000000..2fdc837fc --- /dev/null +++ b/tests/unit/accuracy/test_gpqa_diamond_benchmark.py @@ -0,0 +1,296 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for ``GPQADiamondBenchmark`` after lighteval alignment. + +Pins: +1. The simple-evals prompt template byte-equal to the recipe's + ``gpqa_prompt_fn``. +2. Deterministic SHA-256-seeded shuffling (intentional deviation from + the recipe's stochastic ``random.randint``). +3. ``ground_truth`` is the bare gold letter (``"A"``..``"D"``), the + shape ``LightevalGPQAGrader`` expects. 
+""" + +from __future__ import annotations + +from collections import Counter +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from aiperf.accuracy.benchmarks.gpqa_diamond import ( + DEFAULT_GENERATION_SIZE, + NUM_CHOICES, + TASK_NAME, + GPQADiamondBenchmark, + _seeded_shuffle_indices, +) +from aiperf.accuracy.models import BenchmarkProblem +from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType +from tests.unit.conftest import make_benchmark_run + + +def _make_run(): + return make_benchmark_run( + model_names=["test-model"], + endpoint_type=EndpointType.COMPLETIONS, + streaming=False, + accuracy={"benchmark": AccuracyBenchmarkType.GPQA_DIAMOND}, + ) + + +def _make_row( + question: str = "What is 2+2?", + correct: str = "4", + incorrect: tuple[str, str, str] = ("3", "5", "6"), + domain: str = "Physics", + subdomain: str = "Mechanics", +) -> dict[str, Any]: + return { + "Question": question, + "Correct Answer": correct, + "Incorrect Answer 1": incorrect[0], + "Incorrect Answer 2": incorrect[1], + "Incorrect Answer 3": incorrect[2], + "High-level domain": domain, + "Subdomain": subdomain, + } + + +def _make_fake_dataset(rows: list[dict[str, Any]]) -> MagicMock: + ds = MagicMock() + ds.__iter__ = MagicMock(side_effect=lambda: iter(rows)) + ds.__len__ = MagicMock(return_value=len(rows)) + ds.__getitem__ = MagicMock(side_effect=lambda i: rows[i]) + return ds + + +class TestPromptTemplateMatchesRecipe: + """The flat prompt is byte-equal to the recipe's + ``gpqa_prompt_fn`` simple-evals template.""" + + @pytest.mark.asyncio + async def test_prompt_uses_simple_evals_template(self) -> None: + rows = [_make_row(question="Q?", correct="W", incorrect=("X", "Y", "Z"))] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + prompt = 
problems[0].prompt + assert prompt.startswith( + "Answer the following multiple choice question. The last " + "line of your response should be of the following format: " + "'Answer: $LETTER'" + ) + assert "Think step by step before answering." in prompt + assert "Q?" in prompt + # The four-letter format uses ``A) `` / ``B) `` etc — NOT + # ``A. ``. The grader's ``Answer: $LETTER`` extractor matches + # against ``ABCD`` regardless, but the prompt format itself + # should match the recipe. + assert "A) " in prompt + assert "B) " in prompt + assert "C) " in prompt + assert "D) " in prompt + + @pytest.mark.asyncio + async def test_all_four_choices_present(self) -> None: + rows = [_make_row(correct="GOLD", incorrect=("DECOY1", "DECOY2", "DECOY3"))] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + prompt = problems[0].prompt + assert "GOLD" in prompt + assert "DECOY1" in prompt + assert "DECOY2" in prompt + assert "DECOY3" in prompt + + +class TestGroundTruthIsBareLetter: + """``LightevalGPQAGrader`` expects the gold as a bare letter + (``"A"``..``"D"``). 
The previous SHA-seeded grader stored + ``" A"`` (leading-space CHOICES convention) — we no longer use + that since lighteval's ``IndicesExtractionConfig`` doesn't need + it.""" + + @pytest.mark.asyncio + async def test_ground_truth_is_letter(self) -> None: + rows = [_make_row(question=f"Q{i}", correct="GOLD") for i in range(3)] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + for p in problems: + assert p.ground_truth in ("A", "B", "C", "D") + + @pytest.mark.asyncio + async def test_ground_truth_letter_indexes_into_correct_text(self) -> None: + rows = [_make_row(question="Q1", correct="GOLD")] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + gold_letter = problems[0].ground_truth + # The line ``A) GOLD`` (or B) C) D)) for the gold_letter slot + # must appear in the prompt. 
+ assert f"\n{gold_letter}) GOLD" in problems[0].prompt + + +class TestSeededShuffleIsDeterministic: + """SHA-256-seeded permutation: same key → same permutation, + distinct keys → distinct permutations, distribution roughly + uniform across many keys.""" + + def test_same_key_same_permutation(self) -> None: + a = _seeded_shuffle_indices("hello", NUM_CHOICES) + b = _seeded_shuffle_indices("hello", NUM_CHOICES) + assert a == b + + def test_different_keys_different_permutations(self) -> None: + a = _seeded_shuffle_indices("alpha", NUM_CHOICES) + b = _seeded_shuffle_indices("beta", NUM_CHOICES) + assert a != b + + def test_distribution_across_many_keys(self) -> None: + positions = Counter() + for i in range(1000): + order = _seeded_shuffle_indices(f"q-{i}", NUM_CHOICES) + positions[order.index(0)] += 1 + # Each of the 4 slots should land ~250 times; allow ±20%. + for slot in range(NUM_CHOICES): + assert 200 <= positions[slot] <= 300, ( + f"slot {slot} got {positions[slot]} (expected ~250)" + ) + + +class TestNShotsAndCoTAreIgnored: + @pytest.mark.asyncio + async def test_n_shots_argument_does_not_affect_prompt(self) -> None: + rows = [_make_row()] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + zero_shot = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + five_shot = await bench.load_problems( + tasks=None, n_shots=5, enable_cot=False + ) + assert zero_shot[0].prompt == five_shot[0].prompt + + +class TestLoadProblemsCore: + @pytest.mark.asyncio + async def test_returns_one_problem_per_row(self) -> None: + rows = [_make_row(question=f"Q{i}") for i in range(3)] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert 
len(problems) == 3 + assert all(isinstance(p, BenchmarkProblem) for p in problems) + + @pytest.mark.asyncio + async def test_metadata_carries_domain_and_gen_size(self) -> None: + rows = [_make_row(domain="Chemistry", subdomain="Organic")] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + meta = problems[0].metadata + assert meta["domain"] == "Chemistry" + assert meta["subdomain"] == "Organic" + assert meta["generation_size"] == DEFAULT_GENERATION_SIZE + assert DEFAULT_GENERATION_SIZE == 32768 + + @pytest.mark.asyncio + async def test_task_name_is_constant(self) -> None: + rows = [_make_row(domain="Physics"), _make_row(domain="Biology")] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert all(p.task == TASK_NAME for p in problems) + + +class TestPathologicalDatasetRows: + @pytest.mark.asyncio + async def test_empty_dataset_returns_empty_list(self) -> None: + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset([]), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems == [] + + @pytest.mark.asyncio + async def test_optional_subdomain_field_missing(self) -> None: + rows = [ + { + "Question": "Q?", + "Correct Answer": "yes", + "Incorrect Answer 1": "no", + "Incorrect Answer 2": "maybe", + "Incorrect Answer 3": "perhaps", + "High-level domain": "Physics", + # Subdomain absent. 
+ } + ] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert problems[0].metadata["subdomain"] == "" + + @pytest.mark.asyncio + async def test_unicode_question_text_preserved(self) -> None: + rows = [_make_row(question="∮ E·dl = ?")] + with patch( + "aiperf.accuracy.benchmarks.gpqa_diamond.load_dataset", + return_value=_make_fake_dataset(rows), + ): + bench = GPQADiamondBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=None, n_shots=0, enable_cot=False + ) + assert "∮ E·dl" in problems[0].prompt From a4e98349b8634096d90fca5a853a1ed09a60716c Mon Sep 17 00:00:00 2001 From: Elias Bermudez Date: Tue, 2 Jun 2026 16:45:47 -0700 Subject: [PATCH 2/2] style(accuracy): collapse single-element STUB_BENCHMARKS tuple (AIP-880) ``ruff format`` collapses the now-single-element ``STUB_BENCHMARKS`` tuple in ``test_accuracy_config.py`` onto a single line. Before the previous commit in this series the tuple had two entries (``gpqa_diamond`` + ``lcb_codegeneration``) which justified the multi-line layout; now that ``gpqa_diamond`` is implemented there's only ``lcb_codegeneration`` left and ruff's formatter would otherwise flag this on CI as a needed reformat. Signed-off-by: Elias Bermudez --- tests/unit/accuracy/test_accuracy_config.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit/accuracy/test_accuracy_config.py b/tests/unit/accuracy/test_accuracy_config.py index a3e6f5b52..373a7c36c 100644 --- a/tests/unit/accuracy/test_accuracy_config.py +++ b/tests/unit/accuracy/test_accuracy_config.py @@ -22,9 +22,7 @@ # implementation (and remove the ``is_implemented: false`` from the YAML). # This branch (AIP-874) implements ``aime``, ``math``, and ``code_execution``, # so those names are absent from the stub lists. 
-STUB_BENCHMARKS = ( - "lcb_codegeneration", -) +STUB_BENCHMARKS = ("lcb_codegeneration",) STUB_GRADERS: tuple[str, ...] = ()