diff --git a/docs/accuracy/accuracy-benchmarking.md b/docs/accuracy/accuracy-benchmarking.md index 6e24d229a..d64ad7fab 100644 --- a/docs/accuracy/accuracy-benchmarking.md +++ b/docs/accuracy/accuracy-benchmarking.md @@ -72,6 +72,7 @@ system message). |---|---|---|---| | `mmlu` | `multiple_choice` | 5 | `lighteval/mmlu` (57 subjects) | | `aime` | `math` | 8 | `Maxwell-Jia/AIME_2024` (trt-llm reference, 8-shot CoT) | +| `hellaswag` | `exact_match` | 10 | `Rowan/hellaswag` (trt-llm/DeepEval reference; one few-shot per unique activity_label) | ## CLI Flags diff --git a/docs/accuracy/accuracy_stubs.md b/docs/accuracy/accuracy_stubs.md index 027acb9c9..ea85ea2e7 100644 --- a/docs/accuracy/accuracy_stubs.md +++ b/docs/accuracy/accuracy_stubs.md @@ -7,7 +7,7 @@ This document catalogs every stubbed method in the accuracy benchmarking scaffolding. The scaffolding is fully integrated into the plugin system, CLI, and config pipeline — the performance benchmarking path is unaffected. -**Status summary:** As of the AIME loader landing on top of PR #815, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `MMLUBenchmark`, and `AIMEBenchmark` are fully implemented; the remaining grader (`exact_match`) and benchmarks (`hellaswag`, `bigbench`, `aime24`, `aime25`, `math_500`, `gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. 
+**Status summary:** With the HellaSwag loader landing on top of AIP-874, `MultipleChoiceGrader`, `MathGrader`, `CodeExecutionGrader`, `LightevalExprGrader`, `LightevalLatexGrader`, `LightevalGPQAGrader`, `ExactMatchGrader`, `MMLUBenchmark`, `AIMEBenchmark`, and `HellaSwagBenchmark` are fully implemented; the remaining benchmarks (`bigbench`, `aime24`, `aime25`, `math_500`, `gpqa_diamond`, `lcb_codegeneration`) are still stubs and ship behind `NotImplementedError` until each follow-up branch lands. Use the implemented classes as canonical references when filling in the remaining stubs. ## Table of Contents @@ -146,11 +146,11 @@ class BaseGrader(AIPerfLoggerMixin): | 5 | `LightevalLatexGrader` | `graders/lighteval_grader.py` | `lighteval_latex` | **IMPLEMENTED with the AIME loader.** Same shape as `LightevalExprGrader` but the gold extractor uses `LatexExtractionConfig` — matches the trt-llm recipe's `latex_gold_metric`. Used by AIP-879 (MATH-500). Requires the `[accuracy]` extras. | | 6 | `LightevalGPQAGrader` | `graders/lighteval_grader.py` | `lighteval_gpqa` | **IMPLEMENTED with the AIME loader.** Wraps `MultilingualExtractiveMatchMetric` with `IndicesExtractionConfig(prefix_for_extraction="NativeLetters")` to extract A/B/C/D in both gold and prediction — matches the trt-llm recipe's `gpqa_metric`. Used by AIP-880 (GPQA-Diamond). Requires the `[accuracy]` extras. | +| 7 | `ExactMatchGrader` | `graders/exact_match.py` | `exact_match` | **IMPLEMENTED with the HellaSwag loader.** Strict `pred.strip() == gold.strip()` grader matching DeepEval's `Scorer.exact_match_score` (case-sensitive, no normalization). Used by HellaSwag and BigBench-Hard for trt-llm reference parity. 
| + ### Still Stubbed -| # | Class | File | Plugin Key | Description | -|---|-------|------|------------|-------------| -| 1 | `ExactMatchGrader` | `graders/exact_match.py` | `exact_match` | Exact string matching against ground truth | +_All graders are now implemented._ **Each grader has 2 methods to implement:** @@ -171,18 +171,18 @@ All benchmarks use `AIPerfLoggerMixin` and must implement 1 method. |---|-------|------|------------|----------------|-----------------|-------| | 1 | `MMLUBenchmark` | `benchmarks/mmlu.py` | `mmlu` | `multiple_choice` | 5 | **IMPLEMENTED in PR #815** — canonical reference for new benchmarks. Downloads via HuggingFace datasets, handles few-shot formatting and CoT. | | 2 | `AIMEBenchmark` | `benchmarks/aime.py` | `aime` | `math` | 8 | **IMPLEMENTED.** Loads `Maxwell-Jia/AIME_2024`, instructs the model to wrap its final integer in `\boxed{}`, supports few-shot priming and chain-of-thought. `default_enable_cot=true`. | +| 3 | `HellaSwagBenchmark` | `benchmarks/hellaswag.py` | `hellaswag` | `exact_match` | 10 | **IMPLEMENTED.** Loads `Rowan/hellaswag` (validation split filtered per task by `activity_label`; train split feeds the "one few-shot per unique activity_label" rule). Prompt rendering delegates to `deepeval.benchmarks.HellaSwag`'s `HellaSwagTemplate.generate_output`, so output is byte-equal to the trt-llm recipe's DeepEval-backed path. Pairs with `exact_match` for strict `Scorer.exact_match_score` semantics. Requires the `[accuracy]` extras (deepeval). 
| ### Still Stubbed | # | Class | File | Plugin Key | Default Grader | Default N-Shots | |---|-------|------|------------|----------------|-----------------| -| 1 | `HellaSwagBenchmark` | `benchmarks/hellaswag.py` | `hellaswag` | `multiple_choice` | 0 | -| 2 | `BigBenchBenchmark` | `benchmarks/bigbench.py` | `bigbench` | `exact_match` | 3 | -| 3 | `AIME24Benchmark` | `benchmarks/aime24.py` | `aime24` | `math` | 0 | -| 4 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `math` | 0 | -| 5 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `math` | 0 | -| 6 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | -| 7 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | +| 1 | `BigBenchBenchmark` | `benchmarks/bigbench.py` | `bigbench` | `exact_match` | 3 | +| 2 | `AIME24Benchmark` | `benchmarks/aime24.py` | `aime24` | `math` | 0 | +| 3 | `AIME25Benchmark` | `benchmarks/aime25.py` | `aime25` | `math` | 0 | +| 4 | `Math500Benchmark` | `benchmarks/math_500.py` | `math_500` | `math` | 0 | +| 5 | `GPQADiamondBenchmark` | `benchmarks/gpqa_diamond.py` | `gpqa_diamond` | `multiple_choice` | 0 | +| 6 | `LCBCodeGenerationBenchmark` | `benchmarks/lcb_codegeneration.py` | `lcb_codegeneration` | `code_execution` | 0 | **Each benchmark has 1 method to implement:** diff --git a/pyproject.toml b/pyproject.toml index ff66739f7..2ac1060ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,6 +118,13 @@ accuracy = [ "latex2sympy2-extended>=1.0.6", "lighteval>=0.13.0", "sympy>=1.14.0", + # deepeval is the upstream reference for the HellaSwag and + # BigBench-Hard benchmark prompt templates. Aiperf calls + # ``HellaSwagTemplate.generate_output`` (and + # ``BigBenchHardTemplate.generate_output`` on the BigBench branch) + # directly so prompts are byte-equal to what the trt-llm benchmark + # recipe ships. 
+ "deepeval>=2.9.0,<5.0.0", ] [dependency-groups] diff --git a/src/aiperf/accuracy/benchmarks/hellaswag.py b/src/aiperf/accuracy/benchmarks/hellaswag.py index 4a0f5c4ef..4910fcf9e 100644 --- a/src/aiperf/accuracy/benchmarks/hellaswag.py +++ b/src/aiperf/accuracy/benchmarks/hellaswag.py @@ -1,31 +1,317 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +"""HellaSwag benchmark loader, aligned with the trt-llm DeepEval reference. + +The trt-llm benchmark recipe routes ``hellaswag`` through DeepEval's +``deepeval.benchmarks.HellaSwag`` class +(``trt-llm-benchmark-recipe/src/tools/acc_benchmark.py:319-336``). This +loader produces prompts byte-equal to what DeepEval's +``HellaSwagTemplate.generate_output`` produces, by importing and calling +that template directly. Pair with ``ExactMatchGrader`` for the +recipe's ``Scorer.exact_match_score`` semantics (strict +``pred.strip() == gold.strip()``). + +DeepEval's prompt format (verbatim): + + The following are multiple choice questions (with answers) are + sentence completion problems about . + + + + + ... (n_shots times, drawn one per unique activity_label from train) + + + A. + B. + C. + D. + Answer: + +The grammar in line 1 ("questions (with answers) are sentence +completion") is verbatim from DeepEval — we don't fix it; reproducing +the bug is part of reference parity. + +Dataset revision policy: + ``load_dataset("Rowan/hellaswag")`` is intentionally **not** pinned + to a commit ``revision=``. The trt-llm benchmark recipe also leaves + this unpinned, and matching the recipe's resolution behavior is + part of reference parity. If Rowan/hellaswag is re-uploaded or + rebased upstream, downstream callers should expect the byte-equal + pin against ``HellaSwagTemplate.generate_output`` to drift in + lockstep with whatever the recipe would resolve. Pin a SHA here + only if/when the recipe pins one. 
+ "deepeval>=2.9.0,<5.0.0", ] [dependency-groups] diff --git a/src/aiperf/accuracy/benchmarks/hellaswag.py b/src/aiperf/accuracy/benchmarks/hellaswag.py index 4a0f5c4ef..4910fcf9e 100644 --- a/src/aiperf/accuracy/benchmarks/hellaswag.py +++ b/src/aiperf/accuracy/benchmarks/hellaswag.py @@ -1,31 +1,317 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +"""HellaSwag benchmark loader, aligned with the trt-llm DeepEval reference. + +The trt-llm benchmark recipe routes ``hellaswag`` through DeepEval's +``deepeval.benchmarks.HellaSwag`` class +(``trt-llm-benchmark-recipe/src/tools/acc_benchmark.py:319-336``). This +loader produces prompts byte-equal to what DeepEval's +``HellaSwagTemplate.generate_output`` produces, by importing and calling +that template directly. Pair with ``ExactMatchGrader`` for the +recipe's ``Scorer.exact_match_score`` semantics (strict +``pred.strip() == gold.strip()``). + +DeepEval's prompt format (verbatim): + + The following are multiple choice questions (with answers) are + sentence completion problems about {task.value}. + + {few-shot example 1: ctx, A.-D. endings, "Answer: X"} + {few-shot example 2: same layout} + + ... (n_shots times, drawn one per unique activity_label from train) + + {ctx} + A. {endings[0]} + B. {endings[1]} + C. {endings[2]} + D. {endings[3]} + Answer: + +The grammar in line 1 ("questions (with answers) are sentence +completion") is verbatim from DeepEval — we don't fix it; reproducing +the bug is part of reference parity. + +Dataset revision policy: + ``load_dataset("Rowan/hellaswag")`` is intentionally **not** pinned + to a commit ``revision=``. The trt-llm benchmark recipe also leaves + this unpinned, and matching the recipe's resolution behavior is + part of reference parity. If Rowan/hellaswag is re-uploaded or + rebased upstream, downstream callers should expect the byte-equal + pin against ``HellaSwagTemplate.generate_output`` to drift in + lockstep with whatever the recipe would resolve. Pin a SHA here + only if/when the recipe pins one. 
+DEFAULT_GENERATION_SIZE = 5 + +# DeepEval's ``Rowan/hellaswag`` schema: ``activity_label`` selects the +# subtask, ``label`` is the integer gold index 0..3. +ACTIVITY_LABEL_FIELD = "activity_label" +LABEL_FIELD = "label" + +# DeepEval's default ``confinement_instructions`` string for HellaSwag, +# appended to the prompt when the model doesn't support +# ``model.generate(prompt, schema=MultipleChoiceSchema)`` — i.e. for +# every non-DeepEval-aware OpenAI-compatible endpoint, which is the +# only path aiperf takes. Without it, models emit verbose responses +# like ``"The answer is A."`` and the strict ``ExactMatchGrader`` +# (which mirrors ``Scorer.exact_match_score``) under-grades them +# vs DeepEval's reference numbers. +# +# Source: ``deepeval/benchmarks/hellaswag/hellaswag.py`` — +# ``HellaSwag.__init__`` default when ``confinement_instructions=None``. +DEEPEVAL_CONFINEMENT = "Output 'A', 'B', 'C', or 'D'. Full answer not needed." + + +def _build_unique_activity_label_shots_set(train_set: Any) -> list[dict[str, Any]]: + """Mirror DeepEval's ``shots_dataset`` construction. + + DeepEval iterates the train split and collects the FIRST row for + each unique ``activity_label`` value + (``hellaswag.py:255-261``). We reproduce that exactly. + """ + shots_set: list[dict[str, Any]] = [] + categories_seen: set[str] = set() + for data in train_set: + category = data[ACTIVITY_LABEL_FIELD] + if category not in categories_seen: + categories_seen.add(category) + shots_set.append(data) + return shots_set + + +def _resolve_tasks(tasks: list[str] | None) -> list[Any]: + """Convert ``--accuracy-tasks`` CLI strings to ``HellaSwagTask`` enums. + + DeepEval evaluates one task at a time (see + ``HellaSwag.evaluate``). Aiperf accepts either: + - ``None`` / empty / ``["all"]`` (case-insensitive) → every + HellaSwagTask enum. + - A list of activity_label strings (case-insensitive, + space-separated as in the dataset, e.g. ``"Applying sunscreen"`` + or ``"applying sunscreen"``). 
+ + Mixing ``"all"`` with other task names is rejected so typos like + ``["all", "NOT_A_TASK"]`` don't silently bypass validation — that + used to slip through and return every task while swallowing the + invalid entry. + + Unknown tasks raise ``ValueError`` listing the valid set so typos + fail loudly. + """ + if not tasks: + return list(HellaSwagTask) + lowered = [t.lower() for t in tasks] + if "all" in lowered: + if lowered == ["all"]: + return list(HellaSwagTask) + raise ValueError( + "'all' cannot be mixed with other task names. Pass 'all' " + "by itself (or omit --accuracy-tasks) to select every task, " + f"or list specific activity labels. Got: {tasks!r}" + ) + resolved: list[Any] = [] + unknown: list[str] = [] + for name in tasks: + member = _LOWER_VALUE_TO_TASK.get(name.lower()) + if member is not None: + resolved.append(member) + continue + # Fall back to upper-snake-case enum name, matching the recipe's + # ``getattr(HellaSwagTask, task_name.upper(), None)`` lookup. + enum_member = getattr(HellaSwagTask, name.upper(), None) + if enum_member is not None: + resolved.append(enum_member) + else: + unknown.append(name) + if unknown: + valid_values = sorted(t.value for t in HellaSwagTask) + raise ValueError( + f"Unknown HellaSwag task(s): {unknown}. Valid task values " + f"include {valid_values[:5]}... ({len(valid_values)} total). " + "Pass space-separated activity_label values " + "(e.g. 'Applying sunscreen') or upper-snake-case enum " + "names (e.g. 'APPLYING_SUNSCREEN')." + ) + return resolved + class HellaSwagBenchmark(AIPerfLoggerMixin): - """Registered placeholder for a future HellaSwag loader. + """HellaSwag benchmark loader, byte-equal to DeepEval's prompts. - `load_problems()` intentionally raises NotImplementedError in this release; - use the MMLU benchmark when a working accuracy loader is required. + Loads ``Rowan/hellaswag`` (validation split, filtered per-task by + ``activity_label``). 
Few-shot examples drawn from the train split + using DeepEval's "one per unique activity_label" rule. Pair with + ``ExactMatchGrader`` (strict equality) for grading parity. """ - def __init__(self, run: BenchmarkRun, **kwargs) -> None: + def __init__(self, run: BenchmarkRun, **kwargs: Any) -> None: super().__init__(**kwargs) + if not _HAS_DEEPEVAL: + raise RuntimeError(_MISSING_DEEPEVAL_HINT) self.run = run async def load_problems( self, tasks: list[str] | None, n_shots: int, enable_cot: bool ) -> list[BenchmarkProblem]: - raise NotImplementedError( - "hellaswag benchmark is not yet implemented; only 'mmlu' is available in this release." + """Load HellaSwag problems and format them DeepEval-style. + + Args: + tasks: Activity-label strings (case-insensitive against the + ``HellaSwagTask`` enum's ``value``) or upper-snake-case + enum names. ``None`` / ``["all"]`` selects every + category. Unknown names raise ``ValueError``. + n_shots: Few-shot count; values above ``MAX_N_SHOTS`` (15) + raise ``ValueError``. The recipe's ``DeepEval.HellaSwag`` default is 10. + enable_cot: Ignored — DeepEval's HellaSwag has no + chain-of-thought variant. Accepting the parameter + keeps the protocol uniform across benchmarks. + + Returns: + One ``BenchmarkProblem`` per labeled validation row across + the selected tasks. ``ground_truth`` is a bare ``A``/``B``/ + ``C``/``D`` letter (DeepEval's convention). + """ + if enable_cot: + self.info( + "--accuracy-enable-cot is ignored for HellaSwag " + "(DeepEval's HellaSwag has no CoT variant)." + ) + if n_shots > MAX_N_SHOTS: + raise ValueError( + f"HellaSwag supports at most {MAX_N_SHOTS} few-shot " + f"examples (got {n_shots}); DeepEval asserts " + "``n_shots <= 15``." + ) + # Validate ``tasks`` BEFORE the HF download: an invalid + # ``--accuracy-tasks`` value would otherwise trigger a + # multi-MB ``load_dataset`` call (and potential network/cache + # failure) just to surface the user's typo. 
+ selected_tasks = _resolve_tasks(tasks) + ds: DatasetDict = await asyncio.to_thread(load_dataset, DATASET_NAME) + return await asyncio.to_thread( + self._build_problems, ds, selected_tasks, n_shots ) + + def _build_problems( + self, ds: DatasetDict, tasks: list[Any], n_shots: int + ) -> list[BenchmarkProblem]: + train_set = ds["train"] + shots_set = _build_unique_activity_label_shots_set(train_set) + val_set = ds["validation"] + problems: list[BenchmarkProblem] = [] + choices = ["A", "B", "C", "D"] + # Pre-bucket validation rows by activity_label so the per-task + # loop is O(val_rows + tasks) instead of O(tasks × val_rows). + # With --accuracy-tasks=all (~190 tasks) over the ~10K-row + # validation split the naive nested scan does ~1.9M dict + # lookups; one pass over val_set is enough. + by_label: dict[str, list[dict[str, Any]]] = {} + for row in val_set: + by_label.setdefault(row.get(ACTIVITY_LABEL_FIELD), []).append(row) + for task in tasks: + for row in by_label.get(task.value, ()): + label_raw = row.get(LABEL_FIELD) + if label_raw == "" or label_raw is None: + continue + # DeepEval renders the question via the template's + # ``format_question(include_answer=False)`` to feed + # ``generate_output`` as ``input``. + input_text = HellaSwagTemplate.format_question( + row, include_answer=False + ) + template_prompt = HellaSwagTemplate.generate_output( + input=input_text, + train_set=shots_set, + task=task, + n_shots=n_shots, + ) + # Append DeepEval's confinement instruction. DeepEval's + # ``predict()`` does this when ``model.generate`` doesn't + # accept a ``schema`` kwarg (the normal case for every + # OpenAI-compatible endpoint aiperf hits). Without the + # append, ``ExactMatchGrader`` systematically grades + # verbose-but-correct responses ("The answer is A.") as + # wrong vs DeepEval's reference numbers. 
+ prompt = f"{template_prompt}\n\n{DEEPEVAL_CONFINEMENT}" + gold_letter = choices[int(label_raw)] + problems.append( + BenchmarkProblem( + prompt=prompt, + ground_truth=gold_letter, + # Per-row task is the activity_label so the + # accuracy CSV breaks down per category. + task=task.value, + metadata={ + ACTIVITY_LABEL_FIELD: task.value, + "generation_size": DEFAULT_GENERATION_SIZE, + }, + raw_messages=self._build_chat_messages(prompt), + ) + ) + return problems + + @staticmethod + def _build_chat_messages(prompt: str) -> list[AccuracyChatMessage]: + """DeepEval sends the full prompt as a single string, no + multi-turn chat. Mirror that for both completions and chat + endpoints by emitting a single user message with the rendered + prompt.""" + return [{"role": "user", "content": prompt}] diff --git a/src/aiperf/accuracy/graders/exact_match.py b/src/aiperf/accuracy/graders/exact_match.py index 433d16320..ac7f9fff1 100644 --- a/src/aiperf/accuracy/graders/exact_match.py +++ b/src/aiperf/accuracy/graders/exact_match.py @@ -1,9 +1,35 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +"""Strict-equality grader matching DeepEval's ``Scorer.exact_match_score``. + +The trt-llm benchmark recipe routes ``hellaswag`` and ``bigbench`` +through ``deepeval.benchmarks`` (see +``trt-llm-benchmark-recipe/src/tools/acc_benchmark.py``). Both +benchmarks score with DeepEval's ``Scorer.exact_match_score``, which +is just:: + + if not prediction: + return 0 + return 1 if prediction.strip() == target.strip() else 0 + +Strict, case-sensitive, no normalization. We mirror this byte-for- +byte so aiperf's accuracy numbers reproduce the recipe's. + +This is a deliberately conservative grader. Models that emit +``"The answer is A."`` instead of bare ``"A"`` will score 0 — as they +do in DeepEval. 
The escape hatch in DeepEval is structured generation +via ``MultipleChoiceSchema``; aiperf's equivalent is to enforce +``--accuracy-system-prompt`` constraints and request structured +outputs at the LLM-server level. + +Reference: + deepeval/scorer/scorer.py:Scorer.exact_match_score +""" + from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from aiperf.accuracy.graders.base import BaseGrader from aiperf.accuracy.models import GradingResult @@ -13,19 +39,39 @@ class ExactMatchGrader(BaseGrader): - """Grades responses by exact string matching against ground truth.""" + """Strict ``pred.strip() == gold.strip()`` grader. - def __init__(self, run: BenchmarkRun, **kwargs) -> None: + Mirrors DeepEval's ``Scorer.exact_match_score``: + + - Empty / whitespace-only response → score 0 (``unparsed=True`` unless the gold is also empty). + - Otherwise score 1 only when the stripped prediction equals the + stripped gold byte-for-byte. Case-sensitive, no normalization. + + Used by HellaSwag and BigBench-Hard for trt-llm reference parity. + """ + + def __init__(self, run: BenchmarkRun, **kwargs: Any) -> None: super().__init__(run=run, **kwargs) + def extract_answer(self, response_text: str, **kwargs: Any) -> str: + """Return the response stripped of outer whitespace, no other transforms.""" + return response_text.strip() if response_text else "" + async def grade( - self, response_text: str, ground_truth: str, **kwargs + self, response_text: str, ground_truth: str, **kwargs: Any ) -> GradingResult: - raise NotImplementedError( - "exact_match grader is not yet implemented; only 'multiple_choice' is available in this release." - ) - - def extract_answer(self, response_text: str, **kwargs) -> str: - raise NotImplementedError( - "exact_match grader is not yet implemented; only 'multiple_choice' is available in this release." 
+ pred = response_text.strip() if response_text else "" + gold = ground_truth.strip() if ground_truth else "" + unparsed = pred == "" and gold != "" + correct = bool(pred) and pred == gold + return GradingResult( + correct=correct, + unparsed=unparsed, + confidence=1.0 if correct else 0.0, + reasoning=( + f"strict equality: stripped pred '{pred}' vs gold '{gold}'; " + f"match={correct}" + (" (empty response)" if unparsed else "") + ), + extracted_answer=pred, + ground_truth=gold, ) diff --git a/src/aiperf/plugin/plugins.yaml b/src/aiperf/plugin/plugins.yaml index 19da1ecd7..c2c35f3f3 100644 --- a/src/aiperf/plugin/plugins.yaml +++ b/src/aiperf/plugin/plugins.yaml @@ -1145,10 +1145,9 @@ accuracy_grader: exact_match: class: aiperf.accuracy.graders.exact_match:ExactMatchGrader description: | - Exact match grader that compares extracted answers against ground truth - using exact string matching. Suitable for factual Q&A and classification tasks. - metadata: - is_implemented: false + Strict ``pred.strip() == gold.strip()`` grader matching DeepEval's + ``Scorer.exact_match_score``. Used by HellaSwag and BigBench-Hard for + trt-llm reference parity (case-sensitive, no normalization). math: class: aiperf.accuracy.graders.math:MathGrader @@ -1228,11 +1227,15 @@ accuracy_benchmark: hellaswag: class: aiperf.accuracy.benchmarks.hellaswag:HellaSwagBenchmark description: | - HellaSwag commonsense reasoning benchmark for sentence completion tasks. + HellaSwag commonsense reasoning benchmark, aligned with the trt-llm + benchmark recipe's DeepEval-backed configuration. Prompts are byte-equal + to ``deepeval.benchmarks.HellaSwag`` (n_shots=10, one few-shot per unique + activity_label). Pairs with ``exact_match`` for the recipe's strict + ``Scorer.exact_match_score`` semantics. Requires the ``[accuracy]`` + install (deepeval ships the prompt template). 
metadata: - default_grader: multiple_choice - default_n_shots: 0 - is_implemented: false + default_grader: exact_match + default_n_shots: 10 bigbench: class: aiperf.accuracy.benchmarks.bigbench:BigBenchBenchmark diff --git a/tests/unit/accuracy/test_accuracy_config.py b/tests/unit/accuracy/test_accuracy_config.py index 7afa43722..509565d26 100644 --- a/tests/unit/accuracy/test_accuracy_config.py +++ b/tests/unit/accuracy/test_accuracy_config.py @@ -23,7 +23,6 @@ # This branch (AIP-874) implements ``aime``, ``math``, and ``code_execution``, # so those names are absent from the stub lists. STUB_BENCHMARKS = ( - "hellaswag", "bigbench", "aime24", "aime25", @@ -31,7 +30,7 @@ "gpqa_diamond", "lcb_codegeneration", ) -STUB_GRADERS = ("exact_match",) +STUB_GRADERS: tuple[str, ...] = () class TestAcceptsImplemented: @@ -88,8 +87,8 @@ def test_accuracyconfig_with_uppercase_stub_name_raises_validationerror( ) -> None: """Case-insensitive enum lookup must not bypass the validator.""" with pytest.raises(ValidationError) as exc: - AccuracyConfig(benchmark="HELLASWAG") - assert "hellaswag" in str(exc.value) + AccuracyConfig(benchmark="BIGBENCH") + assert "bigbench" in str(exc.value) class TestRejectsStubGrader: diff --git a/tests/unit/accuracy/test_exact_match_grader.py b/tests/unit/accuracy/test_exact_match_grader.py new file mode 100644 index 000000000..58cbfc8a1 --- /dev/null +++ b/tests/unit/accuracy/test_exact_match_grader.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for ``ExactMatchGrader`` after DeepEval alignment. + +The grader is now strict — ``pred.strip() == gold.strip()``, no other +normalization — to match ``Scorer.exact_match_score`` from +``deepeval.scorer``. These tests pin every reproducible aspect of +that contract. 
+""" + +from __future__ import annotations + +import pytest +from pytest import param + +from aiperf.accuracy.graders.exact_match import ExactMatchGrader +from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType +from tests.unit.conftest import make_benchmark_run + + +def _make_run(): + return make_benchmark_run( + model_names=["test-model"], + endpoint_type=EndpointType.COMPLETIONS, + streaming=False, + accuracy={"benchmark": AccuracyBenchmarkType.HELLASWAG}, + ) + + +@pytest.fixture +def grader() -> ExactMatchGrader: + return ExactMatchGrader(run=_make_run()) + + +class TestStrictEquality: + """``pred.strip() == gold.strip()``, byte-for-byte case-sensitive.""" + + @pytest.mark.parametrize( + "pred,gold,expected_correct", + [ + param("A", "A", True, id="bare-letter-match"), + param(" A ", "A", True, id="prediction-whitespace-stripped"), + param("A", " A ", True, id="gold-whitespace-stripped"), + param("A", "B", False, id="mismatch"), + param("a", "A", False, id="case-sensitive-lower-vs-upper"), + param("A.", "A", False, id="trailing-period-NOT-forgiven"), + param('"A"', "A", False, id="surrounding-quotes-NOT-stripped"), + param("yes", "Yes", False, id="case-mismatch-not-equal"), + param("Yes", "Yes", True, id="case-exact-match"), + ], + ) # fmt: skip + @pytest.mark.asyncio + async def test_strict_equality_cases( + self, + grader: ExactMatchGrader, + pred: str, + gold: str, + expected_correct: bool, + ) -> None: + result = await grader.grade(pred, gold) + assert result.correct is expected_correct + assert result.confidence == (1.0 if expected_correct else 0.0) + + +class TestEmptyAndUnparsed: + @pytest.mark.asyncio + async def test_empty_response_unparsed_and_incorrect( + self, grader: ExactMatchGrader + ) -> None: + result = await grader.grade("", "A") + assert result.correct is False + assert result.unparsed is True + + @pytest.mark.asyncio + async def test_whitespace_only_response_unparsed( + self, grader: ExactMatchGrader + ) -> None: + result = 
await grader.grade(" \n\t ", "A") + assert result.correct is False + assert result.unparsed is True + + @pytest.mark.asyncio + async def test_empty_pred_and_empty_gold_neither_correct_nor_unparsed( + self, grader: ExactMatchGrader + ) -> None: + """If both are empty, gold is meaningless — not unparsed.""" + result = await grader.grade("", "") + assert result.correct is False + assert result.unparsed is False + + +class TestMultiLineNotForgiven: + """DeepEval doesn't take "first non-empty line" — it strips the + full response. So multi-line responses fail unless the entire + stripped content matches the gold.""" + + @pytest.mark.asyncio + async def test_multi_line_response_does_not_match_single_letter( + self, grader: ExactMatchGrader + ) -> None: + result = await grader.grade("A\nbecause...", "A") + assert result.correct is False + + @pytest.mark.asyncio + async def test_explanation_prefix_does_not_match( + self, grader: ExactMatchGrader + ) -> None: + result = await grader.grade("The answer is A.", "A") + assert result.correct is False + + +class TestUnicodeAndNonAscii: + @pytest.mark.asyncio + async def test_unicode_match(self, grader: ExactMatchGrader) -> None: + result = await grader.grade("café", "café") + assert result.correct is True + + @pytest.mark.asyncio + async def test_unicode_case_sensitive(self, grader: ExactMatchGrader) -> None: + result = await grader.grade("Café", "café") + assert result.correct is False + + +class TestExtractAnswerInterface: + def test_extract_answer_strips_only(self, grader: ExactMatchGrader) -> None: + assert grader.extract_answer(" A ") == "A" + + def test_extract_answer_preserves_inner(self, grader: ExactMatchGrader) -> None: + """No first-line / no quote-strip / no punct-strip — unlike + the previous over-engineered ExactMatchGrader.""" + assert grader.extract_answer("hello world") == "hello world" + + def test_extract_answer_empty(self, grader: ExactMatchGrader) -> None: + assert grader.extract_answer("") == "" + assert 
grader.extract_answer(" ") == "" + + +class TestGradingResultFields: + @pytest.mark.asyncio + async def test_reasoning_includes_stripped_forms( + self, grader: ExactMatchGrader + ) -> None: + result = await grader.grade(" A ", "A") + assert "stripped pred 'A'" in result.reasoning + assert "gold 'A'" in result.reasoning + + @pytest.mark.asyncio + async def test_extracted_answer_is_stripped_pred( + self, grader: ExactMatchGrader + ) -> None: + result = await grader.grade(" Yes ", "Yes") + assert result.extracted_answer == "Yes" diff --git a/tests/unit/accuracy/test_hellaswag_benchmark.py b/tests/unit/accuracy/test_hellaswag_benchmark.py new file mode 100644 index 000000000..7eeae1657 --- /dev/null +++ b/tests/unit/accuracy/test_hellaswag_benchmark.py @@ -0,0 +1,538 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for ``HellaSwagBenchmark`` after DeepEval alignment. + +Pins: +1. Prompt is byte-equal to ``deepeval.benchmarks.HellaSwag``'s + ``HellaSwagTemplate.generate_output`` output. +2. Few-shot draw rule is "one per unique activity_label" (matches + DeepEval's ``categories_seen`` dedupe loop). +3. Validation split is filtered per task by ``activity_label == + task.value``. +4. ``ground_truth`` is a bare ``A``/``B``/``C``/``D`` letter + (DeepEval's convention for ``Scorer.exact_match_score``). + +These tests run against the real ``deepeval`` install (it's in the +``[accuracy]`` extras), so ``HellaSwagTemplate`` is available. +""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +from pytest import param + +# ``HellaSwagBenchmark`` calls into ``deepeval.benchmarks.HellaSwag``'s +# bundled prompt template; without the ``[accuracy]`` extras installed, +# the constructor raises ``RuntimeError`` and every test in this file +# would fail. 
# Skip the whole module when deepeval is missing so CI
# environments that intentionally don't install the heavy extras still
# pass.
pytest.importorskip(
    "deepeval", reason="HellaSwag tests require the [accuracy] extras (deepeval)"
)

from aiperf.accuracy.benchmarks.hellaswag import (  # noqa: E402
    DEEPEVAL_CONFINEMENT,
    DEFAULT_GENERATION_SIZE,
    DEFAULT_N_SHOTS,
    MAX_N_SHOTS,
    HellaSwagBenchmark,
    _build_unique_activity_label_shots_set,
    _resolve_tasks,
)
from aiperf.plugin.enums import AccuracyBenchmarkType, EndpointType  # noqa: E402
from tests.unit.conftest import make_benchmark_run  # noqa: E402

# Dotted path of the ``load_dataset`` name as looked up by the HellaSwag
# benchmark module. Every test that loads problems patches this target so
# no test touches the real dataset.
_LOAD_DATASET_TARGET = "aiperf.accuracy.benchmarks.hellaswag.load_dataset"


def _make_run():
    """Build a benchmark run for the HellaSwag accuracy benchmark.

    The run targets a single model (``"test-model"``) on a non-streaming
    completions endpoint.
    """
    return make_benchmark_run(
        model_names=["test-model"],
        endpoint_type=EndpointType.COMPLETIONS,
        streaming=False,
        accuracy={"benchmark": AccuracyBenchmarkType.HELLASWAG},
    )


def _make_row(
    activity_label: str = "Applying sunscreen",
    ctx: str = "[header] A man is in the bathroom. [step] He",
    endings: list[str] | None = None,
    label: str | int | None = 0,
) -> dict[str, Any]:
    """Build one HellaSwag-shaped row as a plain dict.

    Args:
        activity_label: Value stored under ``"activity_label"``.
        ctx: Value stored under ``"ctx"``.
        endings: The candidate endings; when ``None`` a fixed list of
            four endings is used.
        label: Value stored under ``"label"``. Tests pass ints, numeric
            strings, ``""`` and ``None`` to exercise label handling.

    Returns:
        A dict with the keys ``activity_label``, ``ctx``, ``endings``
        and ``label``.
    """
    if endings is None:
        endings = [
            "applies sunscreen.",
            "watches TV.",
            "starts singing.",
            "cooks breakfast.",
        ]
    return {
        "activity_label": activity_label,
        "ctx": ctx,
        "endings": endings,
        "label": label,
    }


def _make_fake_split(rows: list[dict[str, Any]]) -> MagicMock:
    """Wrap ``rows`` in a ``MagicMock`` that acts like a dataset split.

    The mock supports iteration, ``len()`` and indexing, all backed by
    ``rows``.
    """
    split = MagicMock()
    # ``side_effect`` builds a fresh iterator on every ``iter()`` call, so
    # the split can be iterated more than once.
    split.__iter__ = MagicMock(side_effect=lambda: iter(rows))
    split.__len__ = MagicMock(return_value=len(rows))
    split.__getitem__ = MagicMock(side_effect=lambda i: rows[i])
    return split


def _make_fake_dataset_dict(
    train_rows: list[dict[str, Any]] | None = None,
    validation_rows: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Build a fake dataset mapping with ``train`` and ``validation`` splits.

    A missing (or empty) row list yields an empty split.
    """
    return {
        "train": _make_fake_split(train_rows or []),
        "validation": _make_fake_split(validation_rows or []),
    }


async def _load_problems(
    validation_rows: list[dict[str, Any]],
    *,
    tasks: list[str],
    train_rows: list[dict[str, Any]] | None = None,
    n_shots: int = 0,
    enable_cot: bool = False,
) -> list[Any]:
    """Run ``HellaSwagBenchmark.load_problems`` against a fake dataset.

    ``load_dataset`` is patched for the duration of the call, and the
    benchmark is constructed inside the patch.

    Args:
        validation_rows: Rows exposed as the ``validation`` split.
        tasks: Task selectors forwarded to ``load_problems``.
        train_rows: Rows exposed as the ``train`` split. When ``None``
            the validation rows are reused for ``train`` as well.
        n_shots: Forwarded to ``load_problems``.
        enable_cot: Forwarded to ``load_problems``.

    Returns:
        Whatever ``load_problems`` returns.
    """
    dataset = _make_fake_dataset_dict(
        train_rows=validation_rows if train_rows is None else train_rows,
        validation_rows=validation_rows,
    )
    with patch(_LOAD_DATASET_TARGET, return_value=dataset):
        bench = HellaSwagBenchmark(run=_make_run())
        return await bench.load_problems(
            tasks=tasks, n_shots=n_shots, enable_cot=enable_cot
        )


class TestDefaultsMatchDeepEval:
    """Defaults mirror ``deepeval.benchmarks.HellaSwag``."""

    def test_default_n_shots_is_10(self) -> None:
        """The default few-shot count is 10."""
        assert DEFAULT_N_SHOTS == 10

    def test_max_n_shots_is_15(self) -> None:
        """DeepEval asserts ``n_shots <= 15``."""
        assert MAX_N_SHOTS == 15

    def test_default_generation_size_is_5(self) -> None:
        """A bare A/B/C/D answer fits in a few tokens."""
        assert DEFAULT_GENERATION_SIZE == 5


class TestUniqueActivityLabelShotsSet:
    """``_build_unique_activity_label_shots_set`` mirrors DeepEval's
    ``shots_dataset`` construction (one row per unique activity_label,
    in first-seen order)."""

    def test_picks_first_row_per_unique_label(self) -> None:
        """Only the first row of each label survives, in input order."""
        # Rows 1 and 3 repeat label "A" and must be dropped.
        labels = ["A", "A", "B", "A", "C"]
        train = [
            _make_row(activity_label=label, ctx=f"row{index}")
            for index, label in enumerate(labels)
        ]
        shots = _build_unique_activity_label_shots_set(train)
        assert [s["ctx"] for s in shots] == ["row0", "row2", "row4"]

    def test_empty_train_returns_empty(self) -> None:
        """An empty train set yields an empty shots set."""
        assert _build_unique_activity_label_shots_set([]) == []


class TestResolveTasks:
    """``_resolve_tasks`` accepts None / 'all' / activity_label values
    / upper-snake-case enum names. Unknowns raise."""

    def test_none_returns_all_tasks(self) -> None:
        """``None`` selects every task."""
        result = _resolve_tasks(None)
        # DeepEval's HellaSwagTask has ~190 entries.
        assert len(result) > 100

    def test_all_returns_all_tasks(self) -> None:
        """The ``"all"`` sentinel selects every task."""
        result = _resolve_tasks(["all"])
        assert len(result) > 100

    @pytest.mark.parametrize(
        "name",
        [
            param("all", id="lowercase"),
            param("ALL", id="uppercase"),
            param("All", id="titlecase"),
            param("aLl", id="mixed_case"),
        ],
    )  # fmt: skip
    def test_all_alone_is_case_insensitive(self, name: str) -> None:
        """Any casing of the bare ``"all"`` sentinel selects every task."""
        result = _resolve_tasks([name])
        assert len(result) > 100

    @pytest.mark.parametrize(
        "tasks",
        [
            param(["all", "NOT_A_REAL_TASK"], id="all_with_typo"),
            param(["ALL", "Applying sunscreen"], id="all_with_real_task"),
            param(["Applying sunscreen", "all"], id="all_after_other"),
            param(["all", "all"], id="duplicate_all"),
        ],
    )  # fmt: skip
    def test_all_mixed_with_other_selectors_raises(self, tasks: list[str]) -> None:
        """``"all"`` mixed with any other selector raises rather than
        silently returning every task. Pin the regression: previously
        ``["all", "NOT_A_REAL_TASK"]`` returned all 192 tasks and the
        typo was swallowed."""
        with pytest.raises(ValueError, match="'all' cannot be mixed"):
            _resolve_tasks(tasks)

    def test_activity_label_value_resolves(self) -> None:
        """An exact activity_label value resolves to a single task."""
        result = _resolve_tasks(["Applying sunscreen"])
        assert len(result) == 1
        assert result[0].value == "Applying sunscreen"

    def test_upper_snake_case_enum_name_resolves(self) -> None:
        """An upper-snake-case enum name resolves to the same task."""
        result = _resolve_tasks(["APPLYING_SUNSCREEN"])
        assert len(result) == 1
        assert result[0].value == "Applying sunscreen"

    @pytest.mark.parametrize(
        "name",
        [
            param("applying sunscreen", id="lowercase"),
            param("APPLYING SUNSCREEN", id="uppercase_with_space"),
            param("ApPlYiNg SuNsCrEeN", id="mixed_case"),
            param("Applying sunscreen", id="exact_case_baseline"),
        ],
    )  # fmt: skip
    def test_activity_label_value_case_insensitive(self, name: str) -> None:
        """The docstring promises case-insensitive activity_label
        matching. The previous implementation used ``name in
        valid_values`` (set membership), which is strictly
        case-sensitive and rejected ``"applying sunscreen"``. Pin that
        all reasonable casings resolve to the same enum member."""
        result = _resolve_tasks([name])
        assert len(result) == 1
        assert result[0].value == "Applying sunscreen"

    def test_unknown_task_raises(self) -> None:
        """An unrecognised selector raises ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown HellaSwag task"):
            _resolve_tasks(["NOT_A_REAL_TASK"])


class TestTaskValidationPrecedesDatasetDownload:
    """Pin that an invalid ``--accuracy-tasks`` value fails BEFORE the
    HuggingFace dataset is fetched.

    Previously ``load_problems`` called ``load_dataset()`` before
    ``_resolve_tasks(tasks)``, so a typo in ``--accuracy-tasks`` would
    trigger a multi-MB HellaSwag download (and could fail on a
    network/cache error) just to surface the validation error.
    """

    @pytest.mark.asyncio
    async def test_unknown_task_does_not_call_load_dataset(self) -> None:
        """A bad task name raises without ``load_dataset`` being called."""
        # Kept inline (not via ``_load_problems``) because the assertion
        # needs the mock handle.
        with patch(_LOAD_DATASET_TARGET) as mock_load:
            bench = HellaSwagBenchmark(run=_make_run())
            with pytest.raises(ValueError, match="Unknown HellaSwag task"):
                await bench.load_problems(
                    tasks=["NOT_A_REAL_TASK"], n_shots=0, enable_cot=False
                )
            mock_load.assert_not_called()


class TestPromptByteEqualWithDeepEval:
    """The flat prompt must be byte-equal to what
    ``HellaSwagTemplate.generate_output`` produces — same template, same
    input, same shots, same n_shots."""

    @pytest.mark.asyncio
    async def test_zero_shot_prompt_starts_with_template_header(self) -> None:
        """The zero-shot prompt opens with DeepEval's header sentence."""
        rows = [_make_row(activity_label="Applying sunscreen", label=0)]
        problems = await _load_problems(rows, tasks=["Applying sunscreen"])
        prompt = problems[0].prompt
        # DeepEval's verbatim opening (note the awkward "questions
        # (with answers) are sentence completion" grammar).
        assert prompt.startswith(
            "The following are multiple choice questions (with answers) "
            "are sentence completion problems about Applying sunscreen.\n\n"
        )

    @pytest.mark.asyncio
    async def test_question_format_matches_deepeval(self) -> None:
        """The question block lists ``ctx`` then lettered endings."""
        rows = [
            _make_row(
                activity_label="Applying sunscreen",
                ctx="A man is in the bathroom. He",
                endings=["applies", "watches", "sings", "cooks"],
                label=0,
            )
        ]
        problems = await _load_problems(rows, tasks=["Applying sunscreen"])
        prompt = problems[0].prompt
        # Per DeepEval, the question block formats endings on
        # labelled lines: ``ctx`` then ``A.``, ``B.``, ``C.``, ``D.``,
        # each preceded by a literal newline, then ``Answer:``. The
        # D-line half is built via ``+`` rather than a single string
        # literal so codespell 2.2's dictionary doesn't false-positive
        # on the (newline + D) substring as a typo for "and" (ruff
        # format collapses adjacent literals but leaves binary ``+``
        # intact).
        expected_endings_block = (
            "A. applies\nB. watches\nC. sings\n" + "D. cooks\nAnswer:"
        )
        assert "A man is in the bathroom. He\n" in prompt
        assert expected_endings_block in prompt

    @pytest.mark.asyncio
    async def test_few_shots_drawn_from_train_with_one_per_label(self) -> None:
        """Few-shots come from train, one row per unique activity_label."""
        train = [
            _make_row(activity_label="Applying sunscreen", ctx="train_AS_0"),
            _make_row(activity_label="Applying sunscreen", ctx="train_AS_1"),
            _make_row(activity_label="Sailing", ctx="train_SAIL_0"),
        ]
        val = [_make_row(activity_label="Applying sunscreen", ctx="VAL_0")]
        problems = await _load_problems(
            val, train_rows=train, tasks=["Applying sunscreen"], n_shots=2
        )
        prompt = problems[0].prompt
        # First shot should be train_AS_0 (first unique AS).
        # Second shot should be train_SAIL_0 (first unique SAIL).
        # train_AS_1 must NOT appear (it's the duplicate).
        assert "train_AS_0" in prompt
        assert "train_SAIL_0" in prompt
        assert "train_AS_1" not in prompt


class TestDeepEvalConfinementInstructionAppended:
    """Pin that aiperf appends DeepEval's default ``confinement_instructions``
    string to the rendered prompt.

    DeepEval's ``HellaSwag.predict()`` falls back to appending
    ``"Output 'A', 'B', 'C', or 'D'. Full answer not needed."`` when the
    model doesn't accept ``model.generate(..., schema=MultipleChoiceSchema)``
    — which is the only path aiperf has against OpenAI-compatible
    endpoints. Without the append, ``ExactMatchGrader`` (which mirrors
    ``Scorer.exact_match_score``) under-grades verbose-but-correct
    responses (e.g. ``"The answer is A."`` vs gold ``"A"``).
    """

    def test_constant_matches_deepeval_default(self) -> None:
        """The constant must byte-match DeepEval's hardcoded default —
        sourced from ``HellaSwag.__init__`` when
        ``confinement_instructions=None``."""
        assert (
            DEEPEVAL_CONFINEMENT
            == "Output 'A', 'B', 'C', or 'D'. Full answer not needed."
        )

    @pytest.mark.asyncio
    async def test_prompt_ends_with_confinement(self) -> None:
        """The prompt ends with a blank line plus the confinement text."""
        rows = [_make_row(activity_label="Applying sunscreen", label=0)]
        problems = await _load_problems(rows, tasks=["Applying sunscreen"])
        prompt = problems[0].prompt
        # Confinement is appended with a blank line separator after the
        # template's trailing "Answer:" — matches DeepEval's
        # ``prompt += f"\n\n{self.confinement_instructions}"``.
        assert prompt.endswith(f"\n\n{DEEPEVAL_CONFINEMENT}")

    @pytest.mark.asyncio
    async def test_template_output_is_a_prefix_of_prompt(self) -> None:
        """The HellaSwagTemplate output is preserved byte-for-byte as
        the prefix of the final prompt; only the confinement suffix is
        new. Pins parity with DeepEval's ``predict()`` flow:
        ``template.generate_output()`` then ``prompt += confinement``."""
        from deepeval.benchmarks.hellaswag.task import HellaSwagTask
        from deepeval.benchmarks.hellaswag.template import HellaSwagTemplate

        rows = [
            _make_row(
                activity_label="Applying sunscreen",
                ctx="A man is in the bathroom. He",
                endings=["applies", "watches", "sings", "cooks"],
                label=0,
            )
        ]
        problems = await _load_problems(rows, tasks=["Applying sunscreen"])
        prompt = problems[0].prompt
        expected_template = HellaSwagTemplate.generate_output(
            input=HellaSwagTemplate.format_question(rows[0], include_answer=False),
            train_set=rows,
            task=HellaSwagTask.APPLYING_SUNSCREEN,
            n_shots=0,
        )
        assert prompt == f"{expected_template}\n\n{DEEPEVAL_CONFINEMENT}"


class TestGroundTruthIsBareLetter:
    """``ground_truth`` is the bare answer letter derived from ``label``."""

    @pytest.mark.asyncio
    async def test_ground_truth_is_letter_from_label(self) -> None:
        """Integer labels 0..3 map to ``A``..``D`` in row order."""
        rows = [_make_row(activity_label="Sailing", label=index) for index in range(4)]
        problems = await _load_problems(rows, tasks=["Sailing"])
        assert [p.ground_truth for p in problems] == ["A", "B", "C", "D"]

    @pytest.mark.asyncio
    async def test_string_label_coerced_to_int(self) -> None:
        """A numeric string label (``"2"``) is graded as its letter (``C``)."""
        rows = [_make_row(activity_label="Sailing", label="2")]
        problems = await _load_problems(rows, tasks=["Sailing"])
        assert problems[0].ground_truth == "C"


class TestActivityLabelFiltering:
    """Validation rows whose activity_label doesn't match the
    selected task are excluded — matches DeepEval's
    ``val_set.filter(lambda data: data['activity_label'] == task.value)``.
    """

    @pytest.mark.asyncio
    async def test_validation_rows_filtered_by_activity_label(self) -> None:
        """Only the two ``Sailing`` validation rows become problems."""
        train = [
            _make_row(activity_label="Sailing"),
            _make_row(activity_label="Ballet"),
        ]
        val = [
            _make_row(activity_label="Sailing", ctx="match"),
            _make_row(activity_label="Ballet", ctx="other"),
            _make_row(activity_label="Sailing", ctx="match2"),
        ]
        problems = await _load_problems(val, train_rows=train, tasks=["Sailing"])
        assert len(problems) == 2
        assert all(p.task == "Sailing" for p in problems)


class TestEnableCotIgnored:
    """DeepEval's HellaSwag has no CoT variant. Aiperf accepts the
    parameter for protocol uniformity but ignores it."""

    @pytest.mark.asyncio
    async def test_enable_cot_does_not_affect_prompt(self) -> None:
        """Both ``enable_cot`` values produce the same prompt."""
        rows = [_make_row(activity_label="Sailing", label=0)]
        ds = _make_fake_dataset_dict(train_rows=rows, validation_rows=rows)
        # Kept inline: both calls deliberately reuse one benchmark
        # instance under one patch.
        with patch(_LOAD_DATASET_TARGET, return_value=ds):
            bench = HellaSwagBenchmark(run=_make_run())
            no_cot = await bench.load_problems(
                tasks=["Sailing"], n_shots=0, enable_cot=False
            )
            with_cot = await bench.load_problems(
                tasks=["Sailing"], n_shots=0, enable_cot=True
            )
        assert no_cot[0].prompt == with_cot[0].prompt


class TestNShotsCap:
    """``n_shots`` above the DeepEval cap is rejected."""

    @pytest.mark.asyncio
    async def test_n_shots_above_15_raises(self) -> None:
        """``n_shots=16`` raises ``ValueError`` mentioning "at most 15"."""
        # Patch ``load_dataset`` with an empty fake dataset so this unit
        # test can never reach the network, whatever order the
        # implementation validates and loads in.
        with patch(_LOAD_DATASET_TARGET, return_value=_make_fake_dataset_dict()):
            bench = HellaSwagBenchmark(run=_make_run())
            with pytest.raises(ValueError, match="at most 15"):
                await bench.load_problems(tasks=None, n_shots=16, enable_cot=False)


class TestPathologicalDatasetRows:
    """Edge-case dataset contents: empty splits, unlabeled rows, and the
    shape of the per-problem chat messages."""

    @pytest.mark.asyncio
    async def test_empty_validation_returns_empty(self) -> None:
        """An empty validation split yields no problems."""
        train = [_make_row(activity_label="Sailing")]
        problems = await _load_problems([], train_rows=train, tasks=["Sailing"])
        assert problems == []

    @pytest.mark.asyncio
    async def test_unlabeled_rows_dropped(self) -> None:
        """Rows whose label is ``""`` or ``None`` are skipped."""
        rows = [
            _make_row(activity_label="Sailing", label=label)
            for label in (0, "", None, 2)
        ]
        problems = await _load_problems(rows, tasks=["Sailing"])
        assert len(problems) == 2

    @pytest.mark.asyncio
    async def test_per_problem_chat_message_is_single_user(self) -> None:
        """``raw_messages`` holds exactly one message with role ``user``."""
        rows = [_make_row(activity_label="Sailing", label=0)]
        problems = await _load_problems(rows, tasks=["Sailing"])
        msgs = problems[0].raw_messages
        assert msgs is not None
        assert len(msgs) == 1
        assert msgs[0]["role"] == "user"