diff --git a/docs/accuracy/accuracy-benchmarking.md b/docs/accuracy/accuracy-benchmarking.md index d64ad7fab..c58b7ac96 100644 --- a/docs/accuracy/accuracy-benchmarking.md +++ b/docs/accuracy/accuracy-benchmarking.md @@ -73,6 +73,7 @@ system message). | `mmlu` | `multiple_choice` | 5 | `lighteval/mmlu` (57 subjects) | | `aime` | `math` | 8 | `Maxwell-Jia/AIME_2024` (trt-llm reference, 8-shot CoT) | | `hellaswag` | `exact_match` | 10 | `Rowan/hellaswag` (trt-llm/DeepEval reference; one few-shot per unique activity_label) | +| `bigbench` | `exact_match` | 3 | `lukaemon/bbh` (trt-llm/DeepEval reference; 27 subtasks, canonical CoT/non-CoT prompt files) | ## CLI Flags diff --git a/docs/accuracy/accuracy_stubs.md b/docs/accuracy/accuracy_stubs.md index ea85ea2e7..ad8e8c71c 100644 --- a/docs/accuracy/accuracy_stubs.md +++ b/docs/accuracy/accuracy_stubs.md @@ -7,7 +7,7 @@ This document catalogs every stubbed method in the accuracy benchmarking scaffolding. The scaffolding is fully integrated into the plugin system, CLI, and config pipeline — the performance benchmarking path is unaffected. -**Status summary:** With the HellaSwag loader landing on top of AIP-874, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `ExactMatchGrader`, `MMLUBenchmark`, `AIMEBenchmark`, and `HellaSwagBenchmark` are fully implemented; the remaining benchmarks (`bigbench`, `aime24`, `aime25`, `math_500`, `gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. 
+**Status summary:** With the BigBench-Hard loader landing on top of the HellaSwag stack, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `ExactMatchGrader`, `MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, and `BigBenchBenchmark` are fully implemented; the remaining benchmarks (`aime24`, `aime25`, `math_500`, `gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. ## Table of Contents @@ -172,17 +172,17 @@ All benchmarks use `AIPerfLoggerMixin` and must implement 1 method. | 1 | `MMLUBenchmark` | `benchmarks/mmlu.py` | `mmlu` | `multiple_choice` | 5 | **IMPLEMENTED in PR #815** — canonical reference for new benchmarks. Downloads via HuggingFace datasets, handles few-shot formatting and CoT. | | 2 | `AIMEBenchmark` | `benchmarks/aime.py` | `aime` | `math` | 8 | **IMPLEMENTED.** Loads `Maxwell-Jia/AIME_2024`, instructs the model to wrap its final integer in `\boxed{}`, supports few-shot priming and chain-of-thought. `default_enable_cot=true`. | | 3 | `HellaSwagBenchmark` | `benchmarks/hellaswag.py` | `hellaswag` | `exact_match` | 10 | **IMPLEMENTED.** Loads `Rowan/hellaswag` (validation split filtered per task by `activity_label`; train split feeds the "one few-shot per unique activity_label" rule). Prompt rendering delegates to `deepeval.benchmarks.HellaSwag`'s `HellaSwagTemplate.generate_output`, so output is byte-equal to the trt-llm recipe's DeepEval-backed path. Pairs with `exact_match` for strict `Scorer.exact_match_score` semantics. Requires the `[accuracy]` extras (deepeval). | +| 4 | `BigBenchBenchmark` | `benchmarks/bigbench.py` | `bigbench` | `exact_match` | 3 | **IMPLEMENTED.** Loads `lukaemon/bbh` (27 BBH subtasks). 
Prompt rendering delegates to `deepeval.benchmarks.BigBenchHard`'s `BigBenchHardTemplate.generate_output`, which reads the 27 canonical CoT/shot prompt files DeepEval ships as package data. Pairs with `exact_match` for the recipe's strict `Scorer.exact_match_score` semantics. `default_n_shots=3`, `default_enable_cot=true`. Requires the `[accuracy]` extras (deepeval). | ### Still Stubbed | # | Class | File | Plugin Key | Default Grader | Default N-Shots | |---|-------|------|------------|----------------|-----------------| -| 1 | `BigBenchBenchmark` | `benchmarks/bigbench.py` | `bigbench` | `exact_match` | 3 | -| 2 | `AIME24Benchmark` | `benchmarks/aime24.py` | `aime24` | `math` | 0 | -| 3 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `math` | 0 | -| 4 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `math` | 0 | -| 5 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | -| 6 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | +| 1 | `AIME24Benchmark` | `benchmarks/aime24.py` | `aime24` | `math` | 0 | +| 2 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `math` | 0 | +| 3 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `math` | 0 | +| 4 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | +| 5 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | **Each benchmark has 1 method to implement:** @@ -308,14 +308,14 @@ All stubs are registered in `src/aiperf/plugin/plugins.yaml` and `src/aiperf/plu | Component | Implemented | Still Stubbed | Methods per Stub | Remaining Methods | |-----------|-------------|---------------|------------------|-------------------| -| Graders | 1 (`MultipleChoiceGrader`) | 3 | 2 (`grade`, `extract_answer`) | 6 | -| Benchmarks | 1 (`MMLUBenchmark`) | 8 | 1 (`load_problems`) | 8 | 
+| Graders | 7 (all) | 0 | — | 0 | +| Benchmarks | 4 (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`) | 5 | 1 (`load_problems`) | 5 | | Record Processor | 1 (`AccuracyRecordProcessor`) | 0 | — | 0 | | Results Processor | 1 (`AccuracyResultsProcessor`) | 0 | — | 0 | | Console Exporter | 1 (`AccuracyConsoleExporter`) | 0 | — | 0 | | Data Exporter | 1 (`AccuracyDataExporter`) | 0 | — | 0 | | Stub-plugin Validator | 0 | 1 | 1 (`AccuracyConfig._reject_stub_plugins`) | 1 | -| **Total** | **6** | **13** | | **15** | +| **Total** | **15** | **6** | | **6** | ### Self-Disabling Pattern @@ -323,11 +323,12 @@ Processors and exporters raise their `Disabled` exception **in `__init__`** when ### Suggested Implementation Order -The processors, exporters, and one grader/benchmark pair are already wired end-to-end. Start from the already-working pipeline: +The processors, exporters, all graders, and four benchmarks (`MMLUBenchmark`, `AIMEBenchmark`, `HellaSwagBenchmark`, `BigBenchBenchmark`) are already wired end-to-end. The remaining work is the five stub benchmarks; mirror the existing loader whose grader matches: -1. **Graders** — use `MultipleChoiceGrader` as reference; implement `ExactMatchGrader` next (simplest), then `MathGrader` -2. **Benchmarks** — use `MMLUBenchmark` as reference; implement dataset loading for each remaining benchmark -3. **Stub-plugin validator** — update `AccuracyConfig._reject_stub_plugins()` when a benchmark or grader moves from stubbed to supported +1. **`aime24`, `aime25`, `math_500`** — mirror `AIMEBenchmark` (`benchmarks/aime.py`); pair with the `math` grader. +2. **`gpqa_diamond`** — mirror `MMLUBenchmark` (`benchmarks/mmlu.py`); pair with the `multiple_choice` grader. +3. **`lcb_codegeneration`** — mirror `MMLUBenchmark`'s scaffolding; pair with the `code_execution` grader. +4. **Stub-plugin validator** — update `AccuracyConfig._reject_stub_plugins()` whenever a benchmark moves from stubbed to supported. 
### Key Files for Reference diff --git a/pyproject.toml b/pyproject.toml index b4fb39ed4..643211651 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -219,6 +219,7 @@ markers = [ "server_unit: marks tests as unit tests for the mock server", "fern: marks tests that validate Fern documentation (requires fern CLI)", "network: marks tests that require network access", + "requires_deepeval: tests that need the real deepeval install (i.e. the [accuracy] extras) — skipped when only the fake-deepeval harness is registered", ] # Better console output console_output_style = "progress" diff --git a/src/aiperf/accuracy/benchmarks/bigbench.py b/src/aiperf/accuracy/benchmarks/bigbench.py index cf838f35f..07ede45ad 100644 --- a/src/aiperf/accuracy/benchmarks/bigbench.py +++ b/src/aiperf/accuracy/benchmarks/bigbench.py @@ -1,31 +1,228 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +"""BigBench-Hard benchmark loader, aligned with the trt-llm DeepEval reference. + +The trt-llm benchmark recipe routes ``bigbench`` through DeepEval's +``deepeval.benchmarks.BigBenchHard`` class +(``trt-llm-benchmark-recipe/src/tools/acc_benchmark.py:338-356``). This +loader produces prompts byte-equal to what DeepEval's +``BigBenchHardTemplate.generate_output`` produces, by importing and +calling that template directly. The 27 canonical CoT and non-CoT +prompt files (one per BBH subtask) ship inside DeepEval as package +data — DeepEval's template reads them via ``importlib.resources`` at +load time. + +Pair with ``ExactMatchGrader`` for strict ``pred.strip() == +gold.strip()`` semantics matching DeepEval's +``Scorer.exact_match_score``. 
+ +Reference: + deepeval/benchmarks/big_bench_hard/big_bench_hard.py + deepeval/benchmarks/big_bench_hard/template.py + deepeval/benchmarks/big_bench_hard/cot_prompts/*.txt (27 files) + deepeval/benchmarks/big_bench_hard/shot_prompts/*.txt (27 files) + trt-llm-benchmark-recipe/src/tools/acc_benchmark.py:338 +""" + from __future__ import annotations -from typing import TYPE_CHECKING +import asyncio +from typing import TYPE_CHECKING, Any -from aiperf.accuracy.models import BenchmarkProblem +from datasets import Dataset, load_dataset + +from aiperf.accuracy.models import AccuracyChatMessage, BenchmarkProblem from aiperf.common.mixins import AIPerfLoggerMixin if TYPE_CHECKING: from aiperf.config.resolution.plan import BenchmarkRun +try: + from deepeval.benchmarks.big_bench_hard.big_bench_hard import ( + bbh_confinement_statements_dict, + ) + from deepeval.benchmarks.big_bench_hard.task import BigBenchHardTask + from deepeval.benchmarks.big_bench_hard.template import ( + BigBenchHardTemplate, + ) + + _HAS_DEEPEVAL = True +except ImportError: # pragma: no cover - exercised only without optional dep + _HAS_DEEPEVAL = False + BigBenchHardTask = None # type: ignore[assignment] + BigBenchHardTemplate = None # type: ignore[assignment] + bbh_confinement_statements_dict = None # type: ignore[assignment] + + +_MISSING_DEEPEVAL_HINT = ( + "deepeval is not installed; BigBench-Hard's prompt templates and " + "the per-task confinement dict (the trt-llm reference) cannot be " + "loaded. Install with: uv pip install 'aiperf[accuracy]'." +) + +DATASET_NAME = "lukaemon/bbh" +TASK_NAME = "bigbench" + +# DeepEval's BigBenchHard caps n_shots at 3 (the canonical CoT files +# only contain 3 worked examples each). We mirror both bounds. +DEFAULT_N_SHOTS = 3 +MAX_N_SHOTS = 3 + +# DeepEval's BigBenchHard default is ``enable_cot=True``. +DEFAULT_ENABLE_COT = True + +# CoT solutions can run several hundred tokens; non-CoT answers are +# typically a single bare token. 
1024 covers both with headroom. +DEFAULT_GENERATION_SIZE = 1024 + +# Schema field names in lukaemon/bbh. +INPUT_FIELD = "input" +TARGET_FIELD = "target" + + +def _resolve_tasks(tasks: list[str] | None) -> list[Any]: + """Convert ``--accuracy-tasks`` strings to ``BigBenchHardTask`` enums. + + DeepEval evaluates one task at a time. Aiperf accepts either: + - ``None`` / empty / ``["all"]`` (case-insensitive) → every + BigBenchHardTask enum (27 subtasks). + - Lower-snake-case strings matching the enum's ``value`` + (e.g. ``"boolean_expressions"``). + - Upper-snake-case enum names (e.g. ``"BOOLEAN_EXPRESSIONS"``) + for parity with the recipe's ``getattr(BigBenchHardTask, + task_name.upper(), None)`` lookup. + + Mixing ``"all"`` with other task names is rejected so typos like + ``["all", "NOT_A_TASK"]`` don't silently bypass validation — that + used to slip through and return every task while swallowing the + invalid entry (the parallel HellaSwag bug fixed in AIP-877). + + Unknown names raise ``ValueError`` with the full valid list so + typos fail loudly. + """ + if not tasks: + return list(BigBenchHardTask) + lowered = [t.lower() for t in tasks] + if "all" in lowered: + if lowered == ["all"]: + return list(BigBenchHardTask) + raise ValueError( + "'all' cannot be mixed with other task names. Pass 'all' " + "by itself (or omit --accuracy-tasks) to select every BBH " + f"subtask, or list specific subtasks. Got: {tasks!r}" + ) + valid_values = {t.value for t in BigBenchHardTask} + resolved: list[Any] = [] + unknown: list[str] = [] + for name in tasks: + if name in valid_values: + resolved.append(next(t for t in BigBenchHardTask if t.value == name)) + continue + enum_member = getattr(BigBenchHardTask, name.upper(), None) + if enum_member is not None: + resolved.append(enum_member) + else: + unknown.append(name) + if unknown: + raise ValueError( + f"Unknown BBH subtask(s): {unknown}. 
class BigBenchBenchmark(AIPerfLoggerMixin):
    """BigBench-Hard benchmark loader, byte-equal to DeepEval's prompts.

    Iterates the requested BBH subtasks and renders each problem's
    prompt via ``BigBenchHardTemplate.generate_output``, then appends the
    per-task confinement statement from
    ``bbh_confinement_statements_dict``. Pair with ``ExactMatchGrader``
    for the recipe's strict equality scoring.
    """

    def __init__(self, run: BenchmarkRun, **kwargs: Any) -> None:
        """Store the run and fail fast when deepeval is unavailable.

        Raises:
            RuntimeError: If the optional deepeval import at module load
                failed (``_HAS_DEEPEVAL`` is false).
        """
        super().__init__(**kwargs)
        if not _HAS_DEEPEVAL:
            raise RuntimeError(_MISSING_DEEPEVAL_HINT)
        self.run = run

    async def load_problems(
        self, tasks: list[str] | None, n_shots: int, enable_cot: bool
    ) -> list[BenchmarkProblem]:
        """Load BBH problems and format them DeepEval-style.

        Args:
            tasks: Subtask names (lower-snake-case enum values like
                ``boolean_expressions`` or upper-snake-case enum names
                like ``BOOLEAN_EXPRESSIONS``). ``None`` / ``["all"]``
                selects every subtask. A subtask named more than once is
                loaded only once.
            n_shots: Number of few-shot examples, ``0..MAX_N_SHOTS``.
            enable_cot: Forwarded to the template; selects the CoT
                prompt variant when True and the non-CoT variant when
                False.

        Returns:
            One ``BenchmarkProblem`` per row across all selected
            subtasks, in task order. ``task`` is the subtask name so
            results aggregate per-subtask.

        Raises:
            ValueError: If ``n_shots`` is negative or exceeds
                ``MAX_N_SHOTS``, or if ``_resolve_tasks`` rejects a task
                name.
        """
        # The documented range is 0..MAX_N_SHOTS; reject negatives here
        # instead of handing them to the template unchecked.
        if n_shots < 0:
            raise ValueError(
                f"n_shots must be between 0 and {MAX_N_SHOTS} (got {n_shots})."
            )
        if n_shots > MAX_N_SHOTS:
            raise ValueError(
                f"BBH supports at most {MAX_N_SHOTS} few-shot examples "
                f"(got {n_shots}); DeepEval asserts ``n_shots <= 3`` "
                f"because the canonical prompt files ship exactly "
                f"{MAX_N_SHOTS} worked examples per subtask."
            )
        # ``_resolve_tasks`` accepts both the enum value and the enum
        # name, so one subtask can be named twice. ``dict.fromkeys``
        # drops repeats while keeping first-seen order, so no subtask is
        # downloaded and scored twice.
        task_enums = list(dict.fromkeys(_resolve_tasks(tasks)))
        problems: list[BenchmarkProblem] = []
        for task in task_enums:
            # Loading without a split yields a mapping keyed by split
            # name; only the "test" split is used.
            splits = await asyncio.to_thread(load_dataset, DATASET_NAME, task.value)
            # Prompt rendering is synchronous work over every row, so it
            # is pushed to a worker thread as well.
            sub_problems = await asyncio.to_thread(
                self._build_subtask_problems,
                splits["test"],
                task,
                n_shots,
                enable_cot,
            )
            problems.extend(sub_problems)
        return problems

    def _build_subtask_problems(
        self,
        ds: Dataset,
        task: Any,
        n_shots: int,
        enable_cot: bool,
    ) -> list[BenchmarkProblem]:
        """Render every row of one subtask split into a ``BenchmarkProblem``.

        Args:
            ds: Iterable of rows exposing ``INPUT_FIELD`` and
                ``TARGET_FIELD`` keys.
            task: The subtask enum member; its ``value`` becomes the
                problem's ``task`` and ``bbh_task`` metadata.
            n_shots: Forwarded to the prompt template.
            enable_cot: Forwarded to the prompt template.

        Returns:
            One problem per row, in row order.

        Raises:
            KeyError: If ``task`` has no entry in
                ``bbh_confinement_statements_dict``.
        """
        # Single strict lookup per subtask: the prompt suffix and the
        # metadata entry are guaranteed to be the same string, and a
        # missing entry fails loudly instead of being recorded as "".
        confinement = bbh_confinement_statements_dict[task]
        problems: list[BenchmarkProblem] = []
        for row in ds:
            template_prompt = BigBenchHardTemplate.generate_output(
                input=row[INPUT_FIELD],
                task=task,
                n_shots=n_shots,
                enable_cot=enable_cot,
            )
            prompt = f"{template_prompt}{confinement}"
            messages: list[AccuracyChatMessage] = [{"role": "user", "content": prompt}]
            problems.append(
                BenchmarkProblem(
                    prompt=prompt,
                    # ``BenchmarkProblem.ground_truth`` is typed ``str`` in
                    # strict mode; coerce defensively so a non-string
                    # target column doesn't break the loader.
                    ground_truth=str(row[TARGET_FIELD]),
                    task=task.value,
                    metadata={
                        "bbh_task": task.value,
                        "confinement": confinement,
                        "generation_size": DEFAULT_GENERATION_SIZE,
                    },
                    raw_messages=messages,
                )
            )
        return problems
@enum.unique
class BigBenchHardTask(enum.Enum):
    """27 BBH subtasks. Values mirror the real deepeval enum exactly.

    Each member's value is its name in lower-snake-case. ``@enum.unique``
    makes a duplicated value raise ``ValueError`` at import time instead
    of silently turning the second member into an alias, which would
    shrink the iterable member count below 27.
    """

    BOOLEAN_EXPRESSIONS = "boolean_expressions"
    CAUSAL_JUDGEMENT = "causal_judgement"
    DATE_UNDERSTANDING = "date_understanding"
    DISAMBIGUATION_QA = "disambiguation_qa"
    DYCK_LANGUAGES = "dyck_languages"
    FORMAL_FALLACIES = "formal_fallacies"
    GEOMETRIC_SHAPES = "geometric_shapes"
    HYPERBATON = "hyperbaton"
    LOGICAL_DEDUCTION_FIVE_OBJECTS = "logical_deduction_five_objects"
    LOGICAL_DEDUCTION_SEVEN_OBJECTS = "logical_deduction_seven_objects"
    LOGICAL_DEDUCTION_THREE_OBJECTS = "logical_deduction_three_objects"
    MOVIE_RECOMMENDATION = "movie_recommendation"
    MULTISTEP_ARITHMETIC_TWO = "multistep_arithmetic_two"
    NAVIGATE = "navigate"
    OBJECT_COUNTING = "object_counting"
    PENGUINS_IN_A_TABLE = "penguins_in_a_table"
    REASONING_ABOUT_COLORED_OBJECTS = "reasoning_about_colored_objects"
    RUIN_NAMES = "ruin_names"
    SALIENT_TRANSLATION_ERROR_DETECTION = "salient_translation_error_detection"
    SNARKS = "snarks"
    SPORTS_UNDERSTANDING = "sports_understanding"
    TEMPORAL_SEQUENCES = "temporal_sequences"
    TRACKING_SHUFFLED_OBJECTS_FIVE_OBJECTS = "tracking_shuffled_objects_five_objects"
    TRACKING_SHUFFLED_OBJECTS_SEVEN_OBJECTS = "tracking_shuffled_objects_seven_objects"
    TRACKING_SHUFFLED_OBJECTS_THREE_OBJECTS = "tracking_shuffled_objects_three_objects"
    WEB_OF_LIES = "web_of_lies"
    WORD_SORTING = "word_sorting"
Full answer not needed.", + BigBenchHardTask.DYCK_LANGUAGES: "\n\nOutput only the sequence of parentheses characters separated by white space. Full answer not needed.", + BigBenchHardTask.FORMAL_FALLACIES: "\n\nOutput 'invalid' or 'valid'. Full answer not needed.", + BigBenchHardTask.GEOMETRIC_SHAPES: "\n\nOutput '(A)', '(B)', '(C)', '(D)', '(E)', '(F)', '(G)', '(H)', '(I)', '(J)', or '(K)'. Full answer not needed.", + BigBenchHardTask.HYPERBATON: "\n\nOutput '(A)' or'(B)'. Full answer not needed.", + BigBenchHardTask.LOGICAL_DEDUCTION_FIVE_OBJECTS: "\n\nOutput '(A)', '(B)', '(C)', '(D)', or '(E)'. Full answer not needed.", + BigBenchHardTask.LOGICAL_DEDUCTION_SEVEN_OBJECTS: "\n\nOutput '(A)', '(B)', '(C)', '(D)', '(E)', '(F)', or '(G)'. Full answer not needed.", + BigBenchHardTask.LOGICAL_DEDUCTION_THREE_OBJECTS: "\n\nOutput '(A)', '(B)', or '(C)'. Full answer not needed.", + BigBenchHardTask.MOVIE_RECOMMENDATION: "\n\nOutput '(A)', '(B)', '(C)', '(D)', or '(E)'. Full answer not needed.", + BigBenchHardTask.MULTISTEP_ARITHMETIC_TWO: "\n\nOutput the numerical answer. Full answer not needed.", + BigBenchHardTask.NAVIGATE: "\n\nOutput 'Yes' or 'No'. Full answer not needed.", + BigBenchHardTask.OBJECT_COUNTING: "\n\nOutput the numerical answer. Full answer not needed.", + BigBenchHardTask.PENGUINS_IN_A_TABLE: "\n\nOutput '(A)', '(B)', '(C)', '(D)', or '(E)'. Full answer not needed.", + BigBenchHardTask.REASONING_ABOUT_COLORED_OBJECTS: "\n\nOutput '(A)', '(B)', '(C)', '(D)', '(E)', '(F)', '(G)', '(H)', '(I)', '(J)', '(K)', '(L)', '(M)', '(N)', '(O)', '(P)', '(Q)', or '(R)'. Full answer not needed.", + BigBenchHardTask.RUIN_NAMES: "\n\nOutput '(A)', '(B)', '(C)', or '(D)'. Full answer not needed.", + BigBenchHardTask.SALIENT_TRANSLATION_ERROR_DETECTION: "\n\nOutput '(A)', '(B)', '(C)', '(D)', '(E)', or '(F)'. Full answer not needed.", + BigBenchHardTask.SNARKS: "\n\nOutput '(A)' or'(B)'. 
class BigBenchHardTemplate:
    """Synthetic replacement for deepeval's BBH prompt template.

    The rendered text is intentionally different from upstream's bytes.
    What it does guarantee:

    - Each extra shot appends one more marker block, so a larger
      ``n_shots`` yields a strictly longer prompt.
    - The marker contains ``"step by step"`` only when ``enable_cot`` is
      true.
    - The prompt always ends with ``"Q: {input}\\nA: "``.
    """

    @classmethod
    def generate_output(
        cls,
        input: str,  # noqa: A002 - mirrors upstream kw name
        task: BigBenchHardTask,
        n_shots: int,
        enable_cot: bool,
    ) -> str:
        """Return the fake prompt: header, ``n_shots`` markers, then the query."""
        preamble = f"Task description: [fake] subtask={task.value}."
        if enable_cot:
            one_shot = "\n[fake CoT shot] Let's think step by step.\n"
        else:
            one_shot = "\n[fake shot]\n"
        question = f"\n\nQ: {input}\nA: "
        return preamble + one_shot * n_shots + question
@pytest.fixture(autouse=True)
def _patch_bigbench_deepeval_names(request, monkeypatch):
    """Point the bigbench loader's deepeval names at the harness fakes.

    Does nothing when the real deepeval is importable, so environments
    with the ``[accuracy]`` extras exercise the real upstream objects.
    Otherwise four module attributes of ``bigbench.py`` are replaced for
    the duration of the test: ``_HAS_DEEPEVAL`` becomes ``True`` and
    ``BigBenchHardTask``, ``BigBenchHardTemplate`` and
    ``bbh_confinement_statements_dict`` become their ``fake_deepeval``
    counterparts. ``monkeypatch`` restores the originals afterwards.
    """
    if _real_deepeval_available():
        return
    try:
        import aiperf.accuracy.benchmarks.bigbench as bigbench_mod
    except ImportError:
        # The loader module itself is unimportable, so there is nothing
        # to patch; tests that need it will fail on their own import.
        return
    stand_ins = {
        "_HAS_DEEPEVAL": True,
        "BigBenchHardTask": fake_deepeval.BigBenchHardTask,
        "BigBenchHardTemplate": fake_deepeval.BigBenchHardTemplate,
        "bbh_confinement_statements_dict": (
            fake_deepeval.bbh_confinement_statements_dict
        ),
    }
    for attr_name, replacement in stand_ins.items():
        monkeypatch.setattr(bigbench_mod, attr_name, replacement)
``ground_truth`` is the bare ``target`` string from + ``lukaemon/bbh`` (DeepEval's convention for exact_match_score). +3. ``confinement`` carried in metadata maps per-task to the right + "Output 'X' or 'Y'..." string. +4. Per-task task field so the accuracy CSV breaks down per BBH + subtask. + +Most tests in this file run against ``tests.harness.fake_deepeval`` — a +small stand-in that mirrors the 27-task enum and confinement dict +exactly but generates a synthetic (non-byte-equal) prompt template. +Tests that pin the real upstream prompt bytes are marked +``@pytest.mark.requires_deepeval`` and skip when only the fake is +registered. The fake is wired in ``tests/unit/accuracy/conftest.py`` +(autouse, function scope) so the ``aiperf[accuracy]`` extras are no +longer a hard prerequisite for running this file. +""" + +from __future__ import annotations + +from collections.abc import Callable +from typing import TYPE_CHECKING, Any +from unittest.mock import MagicMock, patch + +import pytest + +from aiperf.accuracy.benchmarks.bigbench import ( + DEFAULT_ENABLE_COT, + DEFAULT_GENERATION_SIZE, + DEFAULT_N_SHOTS, + MAX_N_SHOTS, + BigBenchBenchmark, + _resolve_tasks, +) +from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType +from tests.unit.conftest import make_benchmark_run + +if TYPE_CHECKING: + from aiperf.config import BenchmarkRun + + +def _make_run() -> BenchmarkRun: + return make_benchmark_run( + model_names=["test-model"], + endpoint_type=EndpointType.COMPLETIONS, + streaming=False, + accuracy={"benchmark": AccuracyBenchmarkType.BIGBENCH}, + ) + + +def _make_row(input_text: str = "What is 2+2?", target: str = "4") -> dict[str, Any]: + return {"input": input_text, "target": target} + + +def _make_fake_dataset(rows: list[dict[str, Any]]) -> dict[str, Any]: + """Mock ``load_dataset`` return value (a dict-like with split keys).""" + test_split = MagicMock() + test_split.__iter__ = MagicMock(side_effect=lambda: iter(rows)) + test_split.__len__ = 
MagicMock(return_value=len(rows)) + test_split.__getitem__ = MagicMock(side_effect=lambda i: rows[i]) + return {"test": test_split} + + +def _per_task_loader( + per_task: dict[str, list[dict[str, Any]]], +) -> Callable[..., dict[str, Any]]: + """``load_dataset`` patch that dispatches by task name.""" + + def loader( + _dataset_name: str, + task_name: str | None = None, + **_kwargs: Any, + ) -> dict[str, Any]: + return _make_fake_dataset( + per_task.get(task_name, []) if task_name is not None else [] + ) + + return loader + + +class TestDefaultsMatchDeepEval: + """Defaults mirror ``deepeval.benchmarks.BigBenchHard``.""" + + def test_default_n_shots_is_3(self) -> None: + assert DEFAULT_N_SHOTS == 3 + + def test_max_n_shots_is_3(self) -> None: + """DeepEval asserts ``n_shots <= 3`` because the bundled prompt + files only contain 3 worked examples.""" + assert MAX_N_SHOTS == 3 + + def test_default_enable_cot_is_true(self) -> None: + assert DEFAULT_ENABLE_COT is True + + def test_default_generation_size_is_1024(self) -> None: + assert DEFAULT_GENERATION_SIZE == 1024 + + +class TestResolveTasks: + def test_none_returns_all_27_subtasks(self) -> None: + result = _resolve_tasks(None) + assert len(result) == 27 + + def test_all_returns_all_27_subtasks(self) -> None: + result = _resolve_tasks(["all"]) + assert len(result) == 27 + + def test_lower_snake_case_value_resolves(self) -> None: + result = _resolve_tasks(["boolean_expressions"]) + assert len(result) == 1 + assert result[0].value == "boolean_expressions" + + def test_upper_snake_case_enum_name_resolves(self) -> None: + result = _resolve_tasks(["BOOLEAN_EXPRESSIONS"]) + assert len(result) == 1 + assert result[0].value == "boolean_expressions" + + def test_unknown_subtask_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown BBH subtask"): + _resolve_tasks(["not_a_real_task"]) + + def test_unknown_subtask_lists_valid(self) -> None: + with pytest.raises(ValueError) as exc_info: + 
_resolve_tasks(["not_a_real_task"]) + # All 27 should appear in the error; spot-check three of them. + assert "boolean_expressions" in str(exc_info.value) + assert "navigate" in str(exc_info.value) + assert "object_counting" in str(exc_info.value) + + +@pytest.mark.requires_deepeval +class TestPromptByteEqualWithDeepEval: + """The flat prompt must be byte-equal to what + ``BigBenchHardTemplate.generate_output`` produces — same template, + same CoT files, same n_shots, same enable_cot. + + These assertions read specific strings out of DeepEval's bundled CoT + and shot prompt ``.txt`` files (e.g. ``"Task description: Evaluate + the result of a random Boolean expression."``). The fake harness + cannot reproduce those bytes, so the class is tagged + ``requires_deepeval``; the marker skips it when only the fake is + registered (i.e. when the ``[accuracy]`` extras aren't installed). + """ + + @pytest.mark.asyncio + async def test_cot_prompt_starts_with_task_description(self) -> None: + per_task = {"boolean_expressions": [_make_row("True and False is", "False")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["boolean_expressions"], + n_shots=3, + enable_cot=True, + ) + prompt = problems[0].prompt + # DeepEval's template prepends "Task description: " then the + # canonical first paragraph. For boolean_expressions that + # paragraph is "Evaluate the result of a random Boolean expression." + assert prompt.startswith( + "Task description: Evaluate the result of a random Boolean expression."
+ ) + + @pytest.mark.asyncio + async def test_query_appended_before_confinement(self) -> None: + per_task = {"boolean_expressions": [_make_row("True and False is", "False")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["boolean_expressions"], + n_shots=3, + enable_cot=True, + ) + prompt = problems[0].prompt + # DeepEval's template appends "\n\nQ: {input}\nA: " at the end of + # its output. The loader then appends the per-task confinement + # statement so the LLM sees the constraint as part of the prompt + # (matches the trt-llm benchmark recipe's flow). For + # boolean_expressions that confinement starts with "\n\nOutput + # 'True' or 'False'." so the Q/A pair sits immediately before it. + assert "Q: True and False is\nA: \n\nOutput 'True' or 'False'." in prompt + assert prompt.endswith("Full answer not needed.") + + @pytest.mark.asyncio + async def test_cot_vs_no_cot_use_different_prompt_files(self) -> None: + per_task = {"navigate": [_make_row("Walk forward 5 steps.", "No")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + cot = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + no_cot = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=False + ) + # CoT version has "Let's think step by step." worked examples; + # non-CoT has bare Q/A pairs.
+ assert "step by step" in cot[0].prompt.lower() or "Let's" in cot[0].prompt + assert cot[0].prompt != no_cot[0].prompt + + @pytest.mark.asyncio + async def test_zero_shot_takes_only_task_description(self) -> None: + """``n_shots=0`` should emit just ``"Task description: "`` followed by the test query — no worked examples.""" + per_task = {"boolean_expressions": [_make_row("True and True is", "True")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["boolean_expressions"], + n_shots=0, + enable_cot=True, + ) + prompt = problems[0].prompt + # Only the task description and the query, no worked examples + # (the CoT files use "Let's think step by step." in shot + # examples; with n_shots=0 that phrase shouldn't appear). + assert "Q: True and True is\nA: " in prompt + # The 0-shot vs 3-shot length comparison lives in + # ``TestNShotsAffectsPromptLength`` below. 
+ + +class TestNShotsAffectsPromptLength: + @pytest.mark.asyncio + async def test_more_shots_make_longer_prompt(self) -> None: + per_task = {"boolean_expressions": [_make_row("True is", "True")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + zero = await bench.load_problems( + tasks=["boolean_expressions"], n_shots=0, enable_cot=True + ) + three = await bench.load_problems( + tasks=["boolean_expressions"], n_shots=3, enable_cot=True + ) + assert len(three[0].prompt) > len(zero[0].prompt) + + +class TestNShotsCap: + @pytest.mark.asyncio + async def test_n_shots_above_3_raises(self) -> None: + bench = BigBenchBenchmark(run=_make_run()) + with pytest.raises(ValueError, match="at most 3"): + await bench.load_problems(tasks=None, n_shots=4, enable_cot=True) + + +class TestGroundTruthIsBareTarget: + @pytest.mark.asyncio + async def test_ground_truth_is_target_string(self) -> None: + per_task = { + "navigate": [ + _make_row("Walk left, then right.", "No"), + _make_row("Walk forward 5 steps.", "Yes"), + ] + } + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + assert [p.ground_truth for p in problems] == ["No", "Yes"] + + +class TestConfinementInMetadata: + """The per-task confinement string is carried in metadata so callers + that need DeepEval's structured-fallback shape (or want to log it) + can read it.""" + + @pytest.mark.asyncio + async def test_boolean_expressions_confinement(self) -> None: + per_task = {"boolean_expressions": [_make_row("Q?", "True")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + 
tasks=["boolean_expressions"], n_shots=3, enable_cot=True + ) + assert "True" in problems[0].metadata["confinement"] + assert "False" in problems[0].metadata["confinement"] + + @pytest.mark.asyncio + async def test_navigate_confinement(self) -> None: + per_task = {"navigate": [_make_row("Q?", "Yes")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + assert "Yes" in problems[0].metadata["confinement"] + assert "No" in problems[0].metadata["confinement"] + + +class TestPerTaskAggregation: + @pytest.mark.asyncio + async def test_task_field_is_subtask_name(self) -> None: + per_task = { + "navigate": [_make_row("Q1", "Yes")], + "object_counting": [_make_row("Q2", "5")], + } + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate", "object_counting"], + n_shots=3, + enable_cot=True, + ) + tasks = {p.task for p in problems} + assert tasks == {"navigate", "object_counting"} + + +class TestPathologicalDatasetRows: + @pytest.mark.asyncio + async def test_empty_subtask_returns_empty(self) -> None: + per_task = {"navigate": []} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + assert problems == [] + + @pytest.mark.asyncio + async def test_unicode_in_target_preserved(self) -> None: + per_task = {"navigate": [_make_row("Q?", "café")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await 
bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + assert problems[0].ground_truth == "café" + + @pytest.mark.asyncio + async def test_chat_message_is_single_user(self) -> None: + per_task = {"navigate": [_make_row("Q?", "Yes")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + msgs = problems[0].raw_messages + assert msgs is not None + assert len(msgs) == 1 + assert msgs[0]["role"] == "user" + + +class TestResolveTasksAdversarial: + """Edge cases on ``--accuracy-tasks`` parsing not covered by + ``TestResolveTasks``.""" + + def test_empty_list_returns_all_27_subtasks(self) -> None: + """A bare ``--accuracy-tasks`` with no values reaches the resolver + as ``[]`` (falsy) — equivalent to ``None`` / ``["all"]``.""" + assert len(_resolve_tasks([])) == 27 + + def test_mixed_case_all_returns_all_27_subtasks(self) -> None: + """``"All"`` / ``"ALL"`` must match case-insensitively. The + docstring promises this; pin it so a future case-sensitive + refactor breaks the test loudly.""" + assert len(_resolve_tasks(["All"])) == 27 + assert len(_resolve_tasks(["ALL"])) == 27 + + def test_all_mixed_with_typo_raises(self) -> None: + """``["all", "NOT_A_REAL_TASK"]`` used to silently return every + subtask and swallow the typo (the parallel HellaSwag bug AIP-877 + fixed). 
Must now raise so a user typo fails loudly instead of + running the whole 27-task benchmark.""" + with pytest.raises(ValueError, match="'all' cannot be mixed"): + _resolve_tasks(["all", "not_a_real_task"]) + + def test_all_mixed_with_valid_name_also_raises(self) -> None: + """Even when both names would individually be accepted, mixing + ``"all"`` with anything else is ambiguous and must fail.""" + with pytest.raises(ValueError, match="'all' cannot be mixed"): + _resolve_tasks(["all", "navigate"]) + + def test_whitespace_in_task_name_raises(self) -> None: + """A whitespace-bearing name is not silently trimmed — pin the + loud-failure mode so accidental YAML spacing is caught.""" + with pytest.raises(ValueError, match="Unknown BBH subtask"): + _resolve_tasks([" boolean_expressions "]) + + def test_hyphenated_task_name_raises(self) -> None: + """Hyphens aren't normalized. ``"boolean-expressions"`` upper- + cases to ``"BOOLEAN-EXPRESSIONS"`` which is not a valid enum + attribute, so the resolver raises.""" + with pytest.raises(ValueError, match="Unknown BBH subtask"): + _resolve_tasks(["boolean-expressions"]) + + def test_mixed_valid_and_invalid_lists_only_invalid(self) -> None: + """When some names resolve and others don't, the unknown-list + portion of the error must contain only the unknown name — no + false positive on the valid one.""" + with pytest.raises(ValueError) as exc_info: + _resolve_tasks(["navigate", "not_a_real"]) + msg = str(exc_info.value) + assert "not_a_real" in msg + # The error also lists the full valid set after "Valid subtasks:" + # for guidance, so narrow the check to the unknown-list portion. + unknown_portion = msg.split("Valid subtasks:")[0] + assert "'navigate'" not in unknown_portion + + def test_duplicate_task_names_resolve_to_duplicate_enums(self) -> None: + """The resolver does not deduplicate. 
Passing the same task + twice yields two entries and will trigger ``load_dataset`` + twice for the same subtask — pin the behavior so callers know + the cost.""" + result = _resolve_tasks(["navigate", "navigate"]) + assert len(result) == 2 + assert result[0] is result[1] + + +class TestConstructorWithoutDeepEval: + """The constructor refuses to build when the ``[accuracy]`` extras + aren't available — otherwise downstream ``BigBenchHardTemplate`` + calls would crash with an unhelpful ``NameError``.""" + + def test_missing_deepeval_raises_with_install_hint(self) -> None: + with ( + patch("aiperf.accuracy.benchmarks.bigbench._HAS_DEEPEVAL", False), + pytest.raises(RuntimeError, match=r"aiperf\[accuracy\]"), + ): + BigBenchBenchmark(run=_make_run()) + + +class TestOutputInvariants: + """Per-problem fields that should always agree do agree.""" + + @pytest.mark.asyncio + async def test_prompt_equals_first_chat_message_content(self) -> None: + """``prompt`` (the flat completions string) and the lone chat + message's ``content`` must be byte-equal — drift here would + mean completions vs chat endpoints render different prompts + for the same problem.""" + per_task = {"navigate": [_make_row("Q?", "Yes")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + msgs = problems[0].raw_messages + assert msgs is not None + assert problems[0].prompt == msgs[0]["content"] + + @pytest.mark.asyncio + async def test_metadata_bbh_task_matches_task_field(self) -> None: + """``problem.task`` and ``problem.metadata['bbh_task']`` must + match for every problem — the accuracy CSV reads the former, + downstream tooling the latter, and both refer to the same + subtask.""" + per_task = { + "navigate": [_make_row("Q1", "Yes")], + "object_counting": [_make_row("Q2", "5")], + } + with patch( + 
"aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate", "object_counting"], + n_shots=3, + enable_cot=True, + ) + for p in problems: + assert p.task == p.metadata["bbh_task"] + + @pytest.mark.asyncio + async def test_generation_size_is_plumbed_through_metadata(self) -> None: + """``DEFAULT_GENERATION_SIZE=1024`` is carried in per-problem + metadata so request-level overrides can read it without + round-tripping the module constant.""" + per_task = {"navigate": [_make_row("Q?", "Yes")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + assert problems[0].metadata["generation_size"] == DEFAULT_GENERATION_SIZE + + @pytest.mark.asyncio + async def test_multitask_order_preserves_task_input_order(self) -> None: + """When ``tasks=[A, B]``, every problem for A precedes every + problem for B in the output list. 
The accuracy CSV's per-task + grouping depends on this contiguity.""" + per_task = { + "navigate": [_make_row("nav-q", "Yes")], + "object_counting": [ + _make_row("oc-q1", "1"), + _make_row("oc-q2", "2"), + ], + } + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate", "object_counting"], + n_shots=3, + enable_cot=True, + ) + assert [p.task for p in problems] == [ + "navigate", + "object_counting", + "object_counting", + ] + + +class TestLoadDatasetInvocation: + """Pin the ``load_dataset(DATASET_NAME, task.value)`` call shape so + a rename of ``DATASET_NAME`` or an accidental kwarg/positional + reorder is caught.""" + + @pytest.mark.asyncio + async def test_load_dataset_called_once_per_task_with_canonical_args( + self, + ) -> None: + per_task = { + "navigate": [_make_row("Q1", "Yes")], + "object_counting": [_make_row("Q2", "1")], + } + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ) as mock_load: + bench = BigBenchBenchmark(run=_make_run()) + await bench.load_problems( + tasks=["navigate", "object_counting"], + n_shots=3, + enable_cot=True, + ) + # Two requested tasks → exactly two load_dataset calls, each + # with the canonical dataset name positional and the subtask + # value positional. Asserting via call_args_list catches both a + # rename of DATASET_NAME and a future swap to kwargs. + assert [c.args for c in mock_load.call_args_list] == [ + ("lukaemon/bbh", "navigate"), + ("lukaemon/bbh", "object_counting"), + ] + + +class TestPathologicalRowContent: + """Hostile row content the upstream dataset could theoretically + ship.""" + + @pytest.mark.asyncio + async def test_empty_input_string_still_renders_prompt(self) -> None: + """A blank ``input`` is unusual but shouldn't crash — DeepEval's + template just appends it verbatim. 
Pin the passthrough so we + notice if DeepEval ever rejects empty inputs.""" + per_task = {"navigate": [_make_row(input_text="", target="Yes")]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["navigate"], n_shots=3, enable_cot=True + ) + # Prompt still rendered — the task description and shot + # examples are always present even when the input is empty. + assert len(problems) == 1 + assert len(problems[0].prompt) > 0 + + @pytest.mark.asyncio + async def test_numeric_target_coerced_to_string(self) -> None: + """A numeric ``target`` (e.g. an int from a future BBH schema + change) is coerced to ``str`` by the loader before constructing + the ``BenchmarkProblem``. ``BenchmarkProblem.ground_truth`` is + in strict mode, so the loader's defensive ``str(...)`` is the + contract callers rely on for string equality in graders.""" + per_task = {"object_counting": [{"input": "Count items", "target": 42}]} + with patch( + "aiperf.accuracy.benchmarks.bigbench.load_dataset", + side_effect=_per_task_loader(per_task), + ): + bench = BigBenchBenchmark(run=_make_run()) + problems = await bench.load_problems( + tasks=["object_counting"], n_shots=3, enable_cot=True + ) + assert problems[0].ground_truth == "42"