From 0d7368200a36549917a73c62045143fc8c8e89e9 Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:08:36 +0530 Subject: [PATCH] refactor: deduplicate _canonical_json into shared _hashing module --- openverifiablellm/_hashing.py | 29 +++ openverifiablellm/environment.py | 9 +- openverifiablellm/eval/__init__.py | 30 +++ openverifiablellm/eval/base.py | 59 +++++ openverifiablellm/eval/benchmarks.py | 117 ++++++++++ openverifiablellm/eval/bias.py | 103 +++++++++ openverifiablellm/eval/perplexity.py | 270 +++++++++++++++++++++++ openverifiablellm/manifest_chain.py | 20 +- tests/test_eval.py | 316 +++++++++++++++++++++++++++ 9 files changed, 927 insertions(+), 26 deletions(-) create mode 100644 openverifiablellm/_hashing.py create mode 100644 openverifiablellm/eval/__init__.py create mode 100644 openverifiablellm/eval/base.py create mode 100644 openverifiablellm/eval/benchmarks.py create mode 100644 openverifiablellm/eval/bias.py create mode 100644 openverifiablellm/eval/perplexity.py create mode 100644 tests/test_eval.py diff --git a/openverifiablellm/_hashing.py b/openverifiablellm/_hashing.py new file mode 100644 index 0000000..ceeda2d --- /dev/null +++ b/openverifiablellm/_hashing.py @@ -0,0 +1,29 @@ +""" +_hashing.py +=========== +Shared low-level hashing utilities used across the openverifiablellm package. + +These are intentionally kept small and dependency-free so they can be safely +imported from any module without risk of circular imports. +""" + +import json +from typing import Any + + +def _canonical_json(obj: Any) -> str: + """ + Serialize object into canonical JSON format. + Ensures stable hashing across runs regardless of key order. 
+ + Parameters + ---------- + obj : Any + JSON-serializable object + + Returns + ------- + str + Canonical JSON string with sorted keys + """ + return json.dumps(obj, sort_keys=True, separators=(",", ":")) diff --git a/openverifiablellm/environment.py b/openverifiablellm/environment.py index 9271e49..d7a8090 100644 --- a/openverifiablellm/environment.py +++ b/openverifiablellm/environment.py @@ -1,17 +1,10 @@ import hashlib -import json import platform import subprocess import sys from typing import Any, Dict - -def _canonical_json(obj: Any) -> str: - """ - Serialize object into canonical JSON format. - Ensures stable hashing across runs. - """ - return json.dumps(obj, sort_keys=True, separators=(",", ":")) +from openverifiablellm._hashing import _canonical_json def compute_object_hash(obj: Any) -> str: diff --git a/openverifiablellm/eval/__init__.py b/openverifiablellm/eval/__init__.py new file mode 100644 index 0000000..bb5c6b6 --- /dev/null +++ b/openverifiablellm/eval/__init__.py @@ -0,0 +1,30 @@ +""" +openverifiablellm.eval +====================== + +Evaluation framework for OpenVerifiableLLM. + +Provides pluggable evaluators for perplexity, bias, and benchmark tasks, +all built on top of the abstract ``BaseEvaluator`` interface. + +Available evaluators +-------------------- +- :class:`~openverifiablellm.eval.perplexity.PerplexityEvaluator` + Measures cross-entropy perplexity on a held-out text corpus. +- :class:`~openverifiablellm.eval.bias.BiasEvaluator` + Bias-testing stub (WinoBias / BBQ — integration pending). +- :class:`~openverifiablellm.eval.benchmarks.BenchmarkEvaluator` + MMLU / factual-accuracy stub (lm-eval-harness — integration pending). 
+""" + +from .base import BaseEvaluator +from .benchmarks import BenchmarkEvaluator +from .bias import BiasEvaluator +from .perplexity import PerplexityEvaluator + +__all__ = [ + "BaseEvaluator", + "PerplexityEvaluator", + "BiasEvaluator", + "BenchmarkEvaluator", +] diff --git a/openverifiablellm/eval/base.py b/openverifiablellm/eval/base.py new file mode 100644 index 0000000..6efad07 --- /dev/null +++ b/openverifiablellm/eval/base.py @@ -0,0 +1,59 @@ +""" +openverifiablellm.eval.base +============================ + +Abstract base class for all evaluation strategies. + +All concrete evaluators must subclass :class:`BaseEvaluator` and implement +:meth:`evaluate`, which receives a model callable and a tokenizer callable +and returns a flat ``dict`` of metric names to scalar values. + +Example +------- +:: + + class MyEvaluator(BaseEvaluator): + def evaluate(self, model, tokenizer): + # ... compute metrics ... + return {"my_metric": 42.0} +""" + +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict + + +class BaseEvaluator(ABC): + """Abstract base class for LLM evaluators. + + Parameters + ---------- + name : str + Human-readable identifier for this evaluator (used in reports). + """ + + def __init__(self, name: str) -> None: + self.name = name + + @abstractmethod + def evaluate( + self, + model: Callable[..., Any], + tokenizer: Callable[..., Any], + ) -> Dict[str, float]: + """Run the evaluation and return a metric dictionary. + + Parameters + ---------- + model : + A callable that accepts token sequences and returns log-probabilities + or logits. The exact signature is determined by the concrete + evaluator subclass. + tokenizer : + A callable that maps a string to a sequence of integer token IDs. + + Returns + ------- + dict + Mapping of metric name → scalar value. All values must be + JSON-serialisable floats. 
+ """ diff --git a/openverifiablellm/eval/benchmarks.py b/openverifiablellm/eval/benchmarks.py new file mode 100644 index 0000000..addb9c5 --- /dev/null +++ b/openverifiablellm/eval/benchmarks.py @@ -0,0 +1,117 @@ +""" +openverifiablellm.eval.benchmarks +==================================== + +Standard LLM benchmark evaluation stub for OpenVerifiableLLM. + +This module provides the :class:`BenchmarkEvaluator` class, which wraps +established NLP benchmarks commonly used to compare language models. + +Planned benchmarks +------------------ +* **MMLU** (Hendrycks et al., 2021) — Massive Multitask Language Understanding. + 57 academic subjects, 4-way multiple-choice. Evaluates broad knowledge and + reasoning across STEM, humanities, social sciences, and more. + +* **TriviaQA** (Joshi et al., 2017) — Factual accuracy benchmark with + trivia-style questions and supporting evidence passages. A random subset + (e.g., 1 000 questions) is used for fast evaluation. + +Integration is pending a stable lm-eval-harness dependency. The class +skeleton is provided now so that downstream code can import and type-check +:class:`BenchmarkEvaluator` without error. + +TODO +---- +* Integrate MMLU via ``lm_eval.tasks``:: + + lm_eval --model hf \ + --model_args pretrained= \ + --tasks mmlu \ + --device cpu \ + --output_path results/ + +* Integrate TriviaQA via ``lm_eval.tasks`` or HuggingFace ``datasets``. +* Cache downloaded datasets locally to avoid redundant network traffic. +* Return metrics: ``mmlu_accuracy``, ``triviaqa_exact_match``, + ``per_subject_accuracy`` dict. +""" + +import logging +from typing import Any, Callable, Dict + +from .base import BaseEvaluator + +logger = logging.getLogger(__name__) + + +class BenchmarkEvaluator(BaseEvaluator): + """Evaluate a language model on standard NLP benchmarks (stub). + + Parameters + ---------- + benchmark : {"mmlu", "triviaqa"} + Which benchmark to run. + n_samples : int or None + Number of examples to evaluate on. 
``None`` means the full benchmark. + Set a small value (e.g., 100) for rapid iteration during development. + name : str + Evaluator name used in reports. + + Notes + ----- + This class is intentionally a stub. Calling :meth:`evaluate` will raise + :class:`NotImplementedError` until the benchmark integration is complete. + See module docstring for the planned implementation. + """ + + SUPPORTED_BENCHMARKS = ("mmlu", "triviaqa") + + def __init__( + self, + benchmark: str = "mmlu", + n_samples: int = None, + name: str = "benchmark", + ) -> None: + super().__init__(name=name) + + if benchmark not in self.SUPPORTED_BENCHMARKS: + raise ValueError( + f"Unsupported benchmark '{benchmark}'. " + f"Choose from: {self.SUPPORTED_BENCHMARKS}" + ) + + if n_samples is not None and n_samples <= 0: + raise ValueError("n_samples must be a positive integer or None") + + self.benchmark = benchmark + self.n_samples = n_samples + + def evaluate( + self, + model: Callable[..., Any], + tokenizer: Callable[..., Any], + ) -> Dict[str, float]: + """Run the benchmark evaluation. + + .. note:: + Not yet implemented. Raises :class:`NotImplementedError`. + + Parameters + ---------- + model : + Language model callable. + tokenizer : + Tokenizer callable. + + Raises + ------ + NotImplementedError + Always, until MMLU/TriviaQA integration is complete. + """ + # TODO: implement MMLU via lm-eval-harness task registry + # TODO: implement TriviaQA via HuggingFace datasets + exact-match scorer + raise NotImplementedError( + f"BenchmarkEvaluator ({self.benchmark}) is not yet implemented. " + "See openverifiablellm/eval/benchmarks.py for the integration plan." + ) diff --git a/openverifiablellm/eval/bias.py b/openverifiablellm/eval/bias.py new file mode 100644 index 0000000..84aacb1 --- /dev/null +++ b/openverifiablellm/eval/bias.py @@ -0,0 +1,103 @@ +""" +openverifiablellm.eval.bias +============================= + +Bias evaluation stub for OpenVerifiableLLM. 
+ +This module provides the :class:`BiasEvaluator` class, which is intended to +measure social bias in a language model using established benchmarks. + +Planned benchmarks +------------------ +* **WinoBias** (Zhao et al., 2018) — coreference-resolution pairs that reveal + occupational gender bias. Each example has a pro-stereotypical and an + anti-stereotypical version; a fair model should perform equally on both. + +* **BBQ** (Parrish et al., 2022) — a question-answering dataset covering nine + social-bias dimensions (age, disability status, gender identity, nationality, + physical appearance, race/ethnicity, religion, SES, sexual orientation). + +Integration is pending a stable lm-eval-harness dependency. The class +skeleton is provided now so that downstream code can import and type-check +:class:`BiasEvaluator` without error. + +TODO +---- +* Integrate WinoBias evaluation via HuggingFace ``datasets``. +* Integrate BBQ via ``lm_eval.tasks`` (lm-eval-harness). +* Implement ``_score_pair()`` helper that forwards pro/anti pairs through the + model and computes the accuracy gap. +* Return bias metrics: ``gender_bias_score``, ``bbq_accuracy``, + ``per_category_bias`` dict. +""" + +import logging +from typing import Any, Callable, Dict + +from .base import BaseEvaluator + +logger = logging.getLogger(__name__) + + +class BiasEvaluator(BaseEvaluator): + """Evaluate social bias in a language model (stub). + + Parameters + ---------- + benchmark : {"winobias", "bbq"} + Which bias benchmark to use. + name : str + Evaluator name used in reports. + + Notes + ----- + This class is intentionally a stub. Calling :meth:`evaluate` will raise + :class:`NotImplementedError` until the benchmark integration is complete. + See module docstring for the planned implementation. 
+ """ + + SUPPORTED_BENCHMARKS = ("winobias", "bbq") + + def __init__( + self, + benchmark: str = "winobias", + name: str = "bias", + ) -> None: + super().__init__(name=name) + + if benchmark not in self.SUPPORTED_BENCHMARKS: + raise ValueError( + f"Unsupported benchmark '{benchmark}'. " + f"Choose from: {self.SUPPORTED_BENCHMARKS}" + ) + + self.benchmark = benchmark + + def evaluate( + self, + model: Callable[..., Any], + tokenizer: Callable[..., Any], + ) -> Dict[str, float]: + """Run bias evaluation. + + .. note:: + Not yet implemented. Raises :class:`NotImplementedError`. + + Parameters + ---------- + model : + Language model callable. + tokenizer : + Tokenizer callable. + + Raises + ------ + NotImplementedError + Always, until WinoBias/BBQ integration is complete. + """ + # TODO: implement WinoBias via HuggingFace datasets + # TODO: implement BBQ via lm-eval-harness task registry + raise NotImplementedError( + f"BiasEvaluator ({self.benchmark}) is not yet implemented. " + "See openverifiablellm/eval/bias.py for the integration plan." + ) diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py new file mode 100644 index 0000000..edfe024 --- /dev/null +++ b/openverifiablellm/eval/perplexity.py @@ -0,0 +1,270 @@ +""" +openverifiablellm.eval.perplexity +=================================== + +Token-level perplexity measurement on a held-out text corpus. + +Perplexity is defined as:: + + PPL = exp( -1/N * sum_i log P(token_i | context_i) ) + +where *N* is the total number of tokens in the evaluation corpus. This is the +standard metric for language-model quality and is directly relevant to Wikipedia +pre-training: a lower perplexity indicates that the model assigns higher +probability to held-out Wikipedia text. 
+ +Usage +----- +:: + + from openverifiablellm.eval.perplexity import PerplexityEvaluator + + evaluator = PerplexityEvaluator(text="The quick brown fox ...", stride=128) + results = evaluator.evaluate(model=my_model, tokenizer=my_tokenizer) + print(results) # {"perplexity": 45.3, "nll_bits_per_byte": 2.1, "n_tokens": 512} + +The ``model`` callable must accept a list of integer token IDs and return a +list of per-token log-probabilities (log P(token | prefix)) for every position. +This signature is intentionally simple so that tiny mock models work in tests +without requiring a GPU or a full transformer stack. +""" + +import logging +import math +from typing import Callable, Dict, List, Sequence + +from .base import BaseEvaluator + +logger = logging.getLogger(__name__) + +# Maximum sequence length forwarded through the model in a single call. +# Keeping this small allows evaluation on CPU with tiny mock models. +DEFAULT_MAX_LENGTH: int = 512 + +# Stride used for the sliding-window approach when the corpus is longer than +# max_length. Tokens in the overlap zone are scored only once (by the earlier +# window); later windows reuse them purely as left context for new tokens. +DEFAULT_STRIDE: int = 256 + + +def _sliding_window_nll( + token_ids: List[int], + model: Callable[[List[int]], List[float]], + max_length: int, + stride: int, +) -> tuple: + """Compute total negative log-likelihood using a sliding window. + + Parameters + ---------- + token_ids : + Full list of token IDs for the evaluation corpus. + model : + Callable mapping a token-ID list to per-token log-probabilities. + ``model(ids)[i]`` is log P(ids[i] | ids[:i]). + max_length : + Maximum number of tokens forwarded to the model at once. + stride : + How many tokens the window advances between calls. + + Returns + ------- + (total_nll, n_scored_tokens) : tuple of (float, int) + Aggregate negative log-likelihood and the number of tokens scored. 
+ """ + if max_length <= 0: + raise ValueError("max_length must be a positive integer") + if stride <= 0: + raise ValueError("stride must be a positive integer") + if stride > max_length: + raise ValueError("stride must not exceed max_length") + + n = len(token_ids) + if n == 0: + return 0.0, 0 + + total_nll = 0.0 + n_scored = 0 + start = 0 + + while start < n: + end = min(start + max_length, n) + window = token_ids[start:end] + + # The first token in the very first window has no context → skip it. + # For subsequent windows the overlap is max_length - stride tokens; + # we only score the *new* tokens (the last stride tokens of the window). + if start == 0: + # Score positions 1 … end-1 (position 0 has no left context) + log_probs: List[float] = model(window) + for pos in range(1, len(window)): + total_nll -= log_probs[pos] + n_scored += 1 + else: + # Only score tokens beyond the overlap + log_probs = model(window) + overlap = max_length - stride + for pos in range(overlap, len(window)): + total_nll -= log_probs[pos] + n_scored += 1 + + if end == n: + break + start += stride + + return total_nll, n_scored + + +class PerplexityEvaluator(BaseEvaluator): + """Evaluate a language model's perplexity on a held-out text corpus. + + Parameters + ---------- + text : str + The held-out text to evaluate on. Typically a few thousand words of + Wikipedia-style prose. + max_length : int + Maximum context window passed to the model per forward call. + Default: 512. + stride : int + Sliding window stride. Must be ≤ ``max_length``. + Default: 256. + name : str + Evaluator name used in reports. 
+ """ + + def __init__( + self, + text: str, + max_length: int = DEFAULT_MAX_LENGTH, + stride: int = DEFAULT_STRIDE, + name: str = "perplexity", + ) -> None: + super().__init__(name=name) + + if not isinstance(text, str) or not text: + raise ValueError("text must be a non-empty string") + if max_length <= 0: + raise ValueError("max_length must be a positive integer") + if stride <= 0: + raise ValueError("stride must be a positive integer") + if stride > max_length: + raise ValueError("stride must not exceed max_length") + + self.text = text + self.max_length = max_length + self.stride = stride + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def evaluate( + self, + model: Callable[[Sequence[int]], Sequence[float]], + tokenizer: Callable[[str], Sequence[int]], + ) -> Dict[str, float]: + """Compute perplexity metrics. + + Parameters + ---------- + model : + ``model(token_ids) → log_probs`` + + Receives a list of integer token IDs and returns a list of the same + length where ``log_probs[i]`` is the natural-log probability of + ``token_ids[i]`` given all preceding tokens. The value at position + 0 is ignored (no left context). + tokenizer : + ``tokenizer(text) → token_ids`` + + Maps a raw string to a list of integer token IDs. + + Returns + ------- + dict with keys: + - ``"perplexity"`` – exp(mean NLL) — lower is better. + - ``"nll_bits_per_byte"`` – NLL in bits per *byte* of input text, + a length-independent measure. + - ``"n_tokens"`` – number of tokens scored. + - ``"n_bytes"`` – length of the input text in UTF-8 bytes. 
+ """ + token_ids: List[int] = list(tokenizer(self.text)) + + if len(token_ids) == 0: + logger.warning("Tokenizer produced empty output; returning infinite perplexity") + return { + "perplexity": float("inf"), + "nll_bits_per_byte": float("inf"), + "n_tokens": 0, + "n_bytes": len(self.text.encode("utf-8")), + } + + total_nll, n_scored = _sliding_window_nll( + token_ids=token_ids, + model=model, + max_length=self.max_length, + stride=self.stride, + ) + + if n_scored == 0: + # Edge case: single-token corpus — nothing to score. + logger.warning("No tokens were scored (corpus too short); returning perplexity=1.0") + return { + "perplexity": 1.0, + "nll_bits_per_byte": 0.0, + "n_tokens": len(token_ids), + "n_bytes": len(self.text.encode("utf-8")), + } + + mean_nll = total_nll / n_scored + perplexity = math.exp(mean_nll) + + n_bytes = len(self.text.encode("utf-8")) + # Convert nats → bits (log2(e) ≈ 1.4427); divide by byte count. + nll_bits_per_byte = (total_nll * math.log2(math.e)) / n_bytes if n_bytes > 0 else 0.0 + + logger.info( + "Perplexity: %.4f (NLL=%.4f, tokens=%d, bytes=%d)", + perplexity, + mean_nll, + n_scored, + n_bytes, + ) + + return { + "perplexity": perplexity, + "nll_bits_per_byte": nll_bits_per_byte, + "n_tokens": n_scored, + "n_bytes": n_bytes, + } + + # ------------------------------------------------------------------ + # Convenience helpers (static, so they are easy to unit-test in isolation) + # ------------------------------------------------------------------ + + @staticmethod + def uniform_model(vocab_size: int) -> Callable[[Sequence[int]], List[float]]: + """Return a trivial model that assigns uniform probability to all tokens. + + Useful for smoke tests. The perplexity of a uniform model on any + corpus should equal ``vocab_size`` exactly. + + Parameters + ---------- + vocab_size : int + Size of the vocabulary. 
+ + Returns + ------- + Callable + """ + if vocab_size <= 0: + raise ValueError("vocab_size must be a positive integer") + + log_prob = math.log(1.0 / vocab_size) + + def _model(token_ids: Sequence[int]) -> List[float]: + return [log_prob] * len(token_ids) + + return _model diff --git a/openverifiablellm/manifest_chain.py b/openverifiablellm/manifest_chain.py index fe9bcc1..e48b9eb 100644 --- a/openverifiablellm/manifest_chain.py +++ b/openverifiablellm/manifest_chain.py @@ -28,25 +28,9 @@ from pathlib import Path from typing import Any, Dict, Optional, Union -logger = logging.getLogger(__name__) - - -def _canonical_json(obj: Any) -> str: - """ - Serialize object into canonical JSON format. - Ensures stable hashing across runs regardless of key order. - - Parameters - ---------- - obj : Any - JSON-serializable object +from openverifiablellm._hashing import _canonical_json - Returns - ------- - str - Canonical JSON string with sorted keys - """ - return json.dumps(obj, sort_keys=True, separators=(",", ":")) +logger = logging.getLogger(__name__) def compute_manifest_hash(manifest: Union[str, Path, Dict[str, Any]]) -> str: diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 0000000..b03415e --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,316 @@ +""" +tests/test_eval.py +================== + +Tests for the openverifiablellm.eval package. + +All tests run on CPU with tiny mock models — no GPU required. + +Run with: + pytest tests/test_eval.py -v +""" + +import math + +import pytest + +from openverifiablellm.eval import ( + BaseEvaluator, + BenchmarkEvaluator, + BiasEvaluator, + PerplexityEvaluator, +) +from openverifiablellm.eval.perplexity import _sliding_window_nll + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_TEXT = ( + "Wikipedia is a free online encyclopedia created by volunteers. 
" + "Anyone can edit an article at any time. " + "It is one of the most visited websites in the world. " +) + +VOCAB_SIZE = 256 # byte-level, conveniently small + + +def byte_tokenizer(text: str): + """Encode text as UTF-8 bytes (each byte is one token).""" + return list(text.encode("utf-8")) + + +def uniform_log_probs(vocab_size: int): + """Return a model that outputs uniform log-probabilities for every token.""" + lp = math.log(1.0 / vocab_size) + + def _model(token_ids): + return [lp] * len(token_ids) + + return _model + + +# --------------------------------------------------------------------------- +# BaseEvaluator — interface contract +# --------------------------------------------------------------------------- + + +class TestBaseEvaluator: + def test_cannot_instantiate_directly(self): + with pytest.raises(TypeError): + BaseEvaluator(name="x") # type: ignore[abstract] + + def test_concrete_subclass_must_implement_evaluate(self): + """A subclass that forgets to implement evaluate() is not instantiable.""" + + class Incomplete(BaseEvaluator): + pass + + with pytest.raises(TypeError): + Incomplete(name="incomplete") # type: ignore[abstract] + + def test_concrete_subclass_is_instantiable(self): + class Minimal(BaseEvaluator): + def evaluate(self, model, tokenizer): + return {"score": 1.0} + + ev = Minimal(name="minimal") + assert ev.name == "minimal" + assert ev.evaluate(None, None) == {"score": 1.0} + + def test_base_evaluator_import_alias(self): + # The package re-export must be the same class object as the base module class. 
+ from openverifiablellm.eval.base import BaseEvaluator as _Direct + + assert BaseEvaluator is _Direct + + +# --------------------------------------------------------------------------- +# PerplexityEvaluator — construction validation +# --------------------------------------------------------------------------- + + +class TestPerplexityEvaluatorConstruction: + def test_valid_construction(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT) + assert ev.text == SAMPLE_TEXT + + def test_empty_text_raises(self): + with pytest.raises(ValueError, match="non-empty"): + PerplexityEvaluator(text="") + + def test_non_string_text_raises(self): + with pytest.raises(ValueError): + PerplexityEvaluator(text=123) # type: ignore[arg-type] + + def test_zero_max_length_raises(self): + with pytest.raises(ValueError, match="max_length"): + PerplexityEvaluator(text=SAMPLE_TEXT, max_length=0) + + def test_zero_stride_raises(self): + with pytest.raises(ValueError, match="stride"): + PerplexityEvaluator(text=SAMPLE_TEXT, stride=0) + + def test_stride_exceeds_max_length_raises(self): + with pytest.raises(ValueError, match="stride"): + PerplexityEvaluator(text=SAMPLE_TEXT, max_length=64, stride=128) + + def test_name_stored(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT, name="test_ppl") + assert ev.name == "test_ppl" + + +# --------------------------------------------------------------------------- +# PerplexityEvaluator — evaluate() correctness +# --------------------------------------------------------------------------- + + +class TestPerplexityEvaluatorEvaluate: + def test_returns_required_keys(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT) + result = ev.evaluate( + model=uniform_log_probs(VOCAB_SIZE), + tokenizer=byte_tokenizer, + ) + for key in ("perplexity", "nll_bits_per_byte", "n_tokens", "n_bytes"): + assert key in result, f"Missing key: {key}" + + def test_uniform_model_perplexity_equals_vocab_size(self): + """For a uniform model, PPL must equal vocab_size (within float 
tolerance).""" + ev = PerplexityEvaluator(text=SAMPLE_TEXT, max_length=512, stride=256) + result = ev.evaluate( + model=uniform_log_probs(VOCAB_SIZE), + tokenizer=byte_tokenizer, + ) + assert abs(result["perplexity"] - VOCAB_SIZE) < 1e-6, ( + f"Expected PPL≈{VOCAB_SIZE}, got {result['perplexity']}" + ) + + def test_n_bytes_matches_utf8_length(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT) + result = ev.evaluate( + model=uniform_log_probs(VOCAB_SIZE), + tokenizer=byte_tokenizer, + ) + assert result["n_bytes"] == len(SAMPLE_TEXT.encode("utf-8")) + + def test_n_tokens_positive(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT, max_length=64, stride=32) + result = ev.evaluate( + model=uniform_log_probs(VOCAB_SIZE), + tokenizer=byte_tokenizer, + ) + assert result["n_tokens"] > 0 + + def test_perplexity_is_finite(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT) + result = ev.evaluate( + model=uniform_log_probs(VOCAB_SIZE), + tokenizer=byte_tokenizer, + ) + assert math.isfinite(result["perplexity"]) + + def test_lower_surprise_gives_lower_perplexity(self): + """A model that perfectly predicts each token must have PPL=1.""" + # A model that returns log(1.0) = 0.0 for every token + perfect_model = lambda ids: [0.0] * len(ids) # noqa: E731 + + ev = PerplexityEvaluator(text=SAMPLE_TEXT) + result = ev.evaluate(model=perfect_model, tokenizer=byte_tokenizer) + assert abs(result["perplexity"] - 1.0) < 1e-9 + + def test_uniform_model_factory(self): + model = PerplexityEvaluator.uniform_model(vocab_size=100) + log_probs = model([0, 1, 2]) + assert len(log_probs) == 3 + expected_lp = math.log(1.0 / 100) + for lp in log_probs: + assert abs(lp - expected_lp) < 1e-12 + + def test_uniform_model_zero_vocab_raises(self): + with pytest.raises(ValueError, match="vocab_size"): + PerplexityEvaluator.uniform_model(vocab_size=0) + + def test_nll_bits_per_byte_non_negative(self): + ev = PerplexityEvaluator(text=SAMPLE_TEXT) + result = ev.evaluate( + model=uniform_log_probs(VOCAB_SIZE), 
+ tokenizer=byte_tokenizer, + ) + assert result["nll_bits_per_byte"] >= 0.0 + + +# --------------------------------------------------------------------------- +# _sliding_window_nll — internal unit tests +# --------------------------------------------------------------------------- + + +class TestSlidingWindowNll: + def _uniform_model(self, vocab_size=VOCAB_SIZE): + lp = math.log(1.0 / vocab_size) + return lambda ids: [lp] * len(ids) + + def test_empty_token_list_returns_zero(self): + nll, n = _sliding_window_nll([], self._uniform_model(), max_length=64, stride=32) + assert nll == 0.0 + assert n == 0 + + def test_single_token_nothing_scored(self): + nll, n = _sliding_window_nll([42], self._uniform_model(), max_length=64, stride=32) + # Position 0 has no left context — not scored + assert n == 0 + + def test_two_tokens_scores_one(self): + nll, n = _sliding_window_nll([1, 2], self._uniform_model(), max_length=64, stride=32) + assert n == 1 + + def test_invalid_max_length_raises(self): + with pytest.raises(ValueError, match="max_length"): + _sliding_window_nll([1, 2, 3], self._uniform_model(), max_length=0, stride=1) + + def test_invalid_stride_raises(self): + with pytest.raises(ValueError, match="stride"): + _sliding_window_nll([1, 2, 3], self._uniform_model(), max_length=4, stride=0) + + def test_stride_exceeds_max_length_raises(self): + with pytest.raises(ValueError, match="stride"): + _sliding_window_nll([1, 2, 3], self._uniform_model(), max_length=2, stride=4) + + def test_nll_proportional_to_n_tokens_for_uniform_model(self): + """For a uniform model NLL = n_tokens * log(vocab_size).""" + text = SAMPLE_TEXT * 3 # ~500 bytes, enough to span multiple windows + token_ids = byte_tokenizer(text) + model = self._uniform_model(VOCAB_SIZE) + nll, n = _sliding_window_nll(token_ids, model, max_length=128, stride=64) + expected_nll_per_token = math.log(VOCAB_SIZE) + assert abs(nll / n - expected_nll_per_token) < 1e-9 + + +# 
--------------------------------------------------------------------------- +# BiasEvaluator — stub contract +# --------------------------------------------------------------------------- + + +class TestBiasEvaluator: + def test_valid_construction_winobias(self): + ev = BiasEvaluator(benchmark="winobias") + assert ev.benchmark == "winobias" + + def test_valid_construction_bbq(self): + ev = BiasEvaluator(benchmark="bbq") + assert ev.benchmark == "bbq" + + def test_invalid_benchmark_raises(self): + with pytest.raises(ValueError, match="Unsupported"): + BiasEvaluator(benchmark="unknown_benchmark") + + def test_evaluate_raises_not_implemented(self): + ev = BiasEvaluator() + with pytest.raises(NotImplementedError): + ev.evaluate(model=None, tokenizer=None) + + def test_is_base_evaluator_subclass(self): + from openverifiablellm.eval.base import BaseEvaluator + + assert issubclass(BiasEvaluator, BaseEvaluator) + + +# --------------------------------------------------------------------------- +# BenchmarkEvaluator — stub contract +# --------------------------------------------------------------------------- + + +class TestBenchmarkEvaluator: + def test_valid_construction_mmlu(self): + ev = BenchmarkEvaluator(benchmark="mmlu") + assert ev.benchmark == "mmlu" + + def test_valid_construction_triviaqa(self): + ev = BenchmarkEvaluator(benchmark="triviaqa") + assert ev.benchmark == "triviaqa" + + def test_invalid_benchmark_raises(self): + with pytest.raises(ValueError, match="Unsupported"): + BenchmarkEvaluator(benchmark="glue") + + def test_n_samples_none_allowed(self): + ev = BenchmarkEvaluator(n_samples=None) + assert ev.n_samples is None + + def test_n_samples_positive_allowed(self): + ev = BenchmarkEvaluator(n_samples=100) + assert ev.n_samples == 100 + + def test_n_samples_zero_raises(self): + with pytest.raises(ValueError, match="n_samples"): + BenchmarkEvaluator(n_samples=0) + + def test_evaluate_raises_not_implemented(self): + ev = BenchmarkEvaluator() + with 
pytest.raises(NotImplementedError): + ev.evaluate(model=None, tokenizer=None) + + def test_is_base_evaluator_subclass(self): + from openverifiablellm.eval.base import BaseEvaluator + + assert issubclass(BenchmarkEvaluator, BaseEvaluator)