From 0d7368200a36549917a73c62045143fc8c8e89e9 Mon Sep 17 00:00:00 2001
From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com>
Date: Sat, 21 Mar 2026 20:08:36 +0530
Subject: [PATCH] refactor: deduplicate _canonical_json into shared _hashing
 module

---
 openverifiablellm/_hashing.py        |  29 +++
 openverifiablellm/environment.py     |   9 +-
 openverifiablellm/eval/__init__.py   |  30 +++
 openverifiablellm/eval/base.py       |  59 +++++
 openverifiablellm/eval/benchmarks.py | 117 ++++++++++
 openverifiablellm/eval/bias.py       | 103 +++++++++
 openverifiablellm/eval/perplexity.py | 270 +++++++++++++++++++++++
 openverifiablellm/manifest_chain.py  |  20 +-
 tests/test_eval.py                   | 316 +++++++++++++++++++++++++++
 9 files changed, 927 insertions(+), 26 deletions(-)
 create mode 100644 openverifiablellm/_hashing.py
 create mode 100644 openverifiablellm/eval/__init__.py
 create mode 100644 openverifiablellm/eval/base.py
 create mode 100644 openverifiablellm/eval/benchmarks.py
 create mode 100644 openverifiablellm/eval/bias.py
 create mode 100644 openverifiablellm/eval/perplexity.py
 create mode 100644 tests/test_eval.py

diff --git a/openverifiablellm/_hashing.py b/openverifiablellm/_hashing.py
new file mode 100644
index 0000000..ceeda2d
--- /dev/null
+++ b/openverifiablellm/_hashing.py
@@ -0,0 +1,29 @@
+"""
+_hashing.py
+===========
+Shared low-level hashing utilities used across the openverifiablellm package.
+
+These are intentionally kept small and dependency-free so they can be safely
+imported from any module without risk of circular imports.
+"""
+
+import json
+from typing import Any
+
+
+def _canonical_json(obj: Any) -> str:
+    """
+    Serialize ``obj`` to compact, key-sorted JSON (a stable input for hashing).
+
+    Parameters
+    ----------
+    obj : Any
+        Value that ``json.dumps`` can serialize.
+
+    Returns
+    -------
+    str
+        JSON text with sorted keys and no whitespace around separators.
+    """
+    compact_separators = (",", ":")
+    return json.dumps(obj, separators=compact_separators, sort_keys=True)
diff --git a/openverifiablellm/environment.py b/openverifiablellm/environment.py
index 9271e49..d7a8090 100644
--- a/openverifiablellm/environment.py
+++ b/openverifiablellm/environment.py
@@ -1,17 +1,10 @@
 import hashlib
-import json
 import platform
 import subprocess
 import sys
 from typing import Any, Dict
 
-
-def _canonical_json(obj: Any) -> str:
-    """
-    Serialize object into canonical JSON format.
-    Ensures stable hashing across runs.
-    """
-    return json.dumps(obj, sort_keys=True, separators=(",", ":"))
+from openverifiablellm._hashing import _canonical_json
 
 
 def compute_object_hash(obj: Any) -> str:
diff --git a/openverifiablellm/eval/__init__.py b/openverifiablellm/eval/__init__.py
new file mode 100644
index 0000000..bb5c6b6
--- /dev/null
+++ b/openverifiablellm/eval/__init__.py
@@ -0,0 +1,30 @@
+"""
+openverifiablellm.eval
+======================
+
+Evaluation framework for OpenVerifiableLLM.
+
+Provides pluggable evaluators for perplexity, bias, and benchmark tasks,
+all built on top of the abstract ``BaseEvaluator`` interface.
+
+Available evaluators
+--------------------
+- :class:`~openverifiablellm.eval.perplexity.PerplexityEvaluator`
+    Measures cross-entropy perplexity on a held-out text corpus.
+- :class:`~openverifiablellm.eval.bias.BiasEvaluator`
+    Bias-testing stub (WinoBias / BBQ — integration pending).
+- :class:`~openverifiablellm.eval.benchmarks.BenchmarkEvaluator`
+    MMLU / factual-accuracy stub (lm-eval-harness — integration pending).
+"""
+
+from .base import BaseEvaluator
+from .benchmarks import BenchmarkEvaluator
+from .bias import BiasEvaluator
+from .perplexity import PerplexityEvaluator
+
+__all__ = [
+    "BaseEvaluator",
+    "PerplexityEvaluator",
+    "BiasEvaluator",
+    "BenchmarkEvaluator",
+]
diff --git a/openverifiablellm/eval/base.py b/openverifiablellm/eval/base.py
new file mode 100644
index 0000000..6efad07
--- /dev/null
+++ b/openverifiablellm/eval/base.py
@@ -0,0 +1,59 @@
+"""
+openverifiablellm.eval.base
+============================
+
+Abstract base class for all evaluation strategies.
+
+All concrete evaluators must subclass :class:`BaseEvaluator` and implement
+:meth:`evaluate`, which receives a model callable and a tokenizer callable
+and returns a flat ``dict`` of metric names to scalar values.
+
+Example
+-------
+::
+
+    class MyEvaluator(BaseEvaluator):
+        def evaluate(self, model, tokenizer):
+            # ... compute metrics ...
+            return {"my_metric": 42.0}
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict
+
+
+class BaseEvaluator(ABC):
+    """Abstract base class for LLM evaluators; subclasses implement ``evaluate``.
+
+    Parameters
+    ----------
+    name : str
+        Human-readable identifier for this evaluator (used in reports).
+    """
+
+    def __init__(self, name: str) -> None:
+        self.name = name
+
+    @abstractmethod
+    def evaluate(
+        self,
+        model: Callable[..., Any],
+        tokenizer: Callable[..., Any],
+    ) -> Dict[str, float]:
+        """Run the evaluation and return a metric dictionary.
+
+        Parameters
+        ----------
+        model :
+            A callable that accepts token sequences and returns log-probabilities
+            or logits.  The exact signature is determined by the concrete
+            evaluator subclass.
+        tokenizer :
+            A callable that maps a string to a sequence of integer token IDs.
+
+        Returns
+        -------
+        dict
+            Mapping of metric name → scalar value.  Values must be
+            JSON-serialisable numbers (floats, or ints for plain counts).
+        """
diff --git a/openverifiablellm/eval/benchmarks.py b/openverifiablellm/eval/benchmarks.py
new file mode 100644
index 0000000..addb9c5
--- /dev/null
+++ b/openverifiablellm/eval/benchmarks.py
@@ -0,0 +1,117 @@
+"""
+openverifiablellm.eval.benchmarks
+====================================
+
+Standard LLM benchmark evaluation stub for OpenVerifiableLLM.
+
+This module provides the :class:`BenchmarkEvaluator` class, which wraps
+established NLP benchmarks commonly used to compare language models.
+
+Planned benchmarks
+------------------
+* **MMLU** (Hendrycks et al., 2021) — Massive Multitask Language Understanding.
+  57 academic subjects, 4-way multiple-choice.  Evaluates broad knowledge and
+  reasoning across STEM, humanities, social sciences, and more.
+
+* **TriviaQA** (Joshi et al., 2017) — Factual accuracy benchmark with
+  trivia-style questions and supporting evidence passages.  A random subset
+  (e.g., 1 000 questions) is used for fast evaluation.
+
+Integration is pending a stable lm-eval-harness dependency.  The class
+skeleton is provided now so that downstream code can import and type-check
+:class:`BenchmarkEvaluator` without error.
+
+TODO
+----
+* Integrate MMLU via ``lm_eval.tasks``::
+
+      lm_eval --model hf \
+              --model_args pretrained=<model_path> \
+              --tasks mmlu \
+              --device cpu \
+              --output_path results/
+
+* Integrate TriviaQA via ``lm_eval.tasks`` or HuggingFace ``datasets``.
+* Cache downloaded datasets locally to avoid redundant network traffic.
+* Return metrics: ``mmlu_accuracy``, ``triviaqa_exact_match``,
+  ``per_subject_accuracy`` dict.
+"""
+
+import logging
+from typing import Any, Callable, Dict, Optional
+
+from .base import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class BenchmarkEvaluator(BaseEvaluator):
+    """Evaluate a language model on standard NLP benchmarks (stub).
+
+    Parameters
+    ----------
+    benchmark : {"mmlu", "triviaqa"}
+        Which benchmark to run.
+    n_samples : int or None
+        Number of examples to evaluate on.  ``None`` means the full benchmark.
+        Set a small value (e.g., 100) for rapid iteration during development.
+        Must be a positive ``int``; ``bool`` and non-integer values are rejected.
+    name : str
+        Evaluator name used in reports.
+
+    Notes
+    -----
+    This class is intentionally a stub.  Calling :meth:`evaluate` will raise
+    :class:`NotImplementedError` until the benchmark integration is complete.
+    See module docstring for the planned implementation.
+    """
+
+    SUPPORTED_BENCHMARKS = ("mmlu", "triviaqa")
+
+    def __init__(
+        self,
+        benchmark: str = "mmlu",
+        n_samples: Optional[int] = None,
+        name: str = "benchmark",
+    ) -> None:
+        super().__init__(name=name)
+
+        if benchmark not in self.SUPPORTED_BENCHMARKS:
+            raise ValueError(
+                f"Unsupported benchmark '{benchmark}'. "
+                f"Choose from: {self.SUPPORTED_BENCHMARKS}"
+            )
+
+        if n_samples is not None:
+            # bool subclasses int, so it has to be excluded explicitly.
+            if isinstance(n_samples, bool) or not isinstance(n_samples, int) or n_samples <= 0:
+                raise ValueError("n_samples must be a positive integer or None")
+
+        self.benchmark = benchmark
+        self.n_samples = n_samples
+
+    def evaluate(
+        self,
+        model: Callable[..., Any],
+        tokenizer: Callable[..., Any],
+    ) -> Dict[str, float]:
+        """Run the benchmark evaluation (not yet implemented; always raises).
+
+        Parameters
+        ----------
+        model :
+            Language model callable.
+        tokenizer :
+            Tokenizer callable.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, until MMLU/TriviaQA integration is complete.
+        """
+        # TODO: implement MMLU via lm-eval-harness task registry
+        # TODO: implement TriviaQA via HuggingFace datasets + exact-match scorer
+        raise NotImplementedError(
+            f"BenchmarkEvaluator ({self.benchmark}) is not yet implemented. "
+            "See openverifiablellm/eval/benchmarks.py for the integration plan."
+        )
diff --git a/openverifiablellm/eval/bias.py b/openverifiablellm/eval/bias.py
new file mode 100644
index 0000000..84aacb1
--- /dev/null
+++ b/openverifiablellm/eval/bias.py
@@ -0,0 +1,103 @@
+"""
+openverifiablellm.eval.bias
+=============================
+
+Bias evaluation stub for OpenVerifiableLLM.
+
+This module provides the :class:`BiasEvaluator` class, which is intended to
+measure social bias in a language model using established benchmarks.
+
+Planned benchmarks
+------------------
+* **WinoBias** (Zhao et al., 2018) — coreference-resolution pairs that reveal
+  occupational gender bias.  Each example has a pro-stereotypical and an
+  anti-stereotypical version; a fair model should perform equally on both.
+
+* **BBQ** (Parrish et al., 2022) — a question-answering dataset covering nine
+  social-bias dimensions (age, disability status, gender identity, nationality,
+  physical appearance, race/ethnicity, religion, SES, sexual orientation).
+
+Integration is pending a stable lm-eval-harness dependency.  The class
+skeleton is provided now so that downstream code can import and type-check
+:class:`BiasEvaluator` without error.
+
+TODO
+----
+* Integrate WinoBias evaluation via HuggingFace ``datasets``.
+* Integrate BBQ via ``lm_eval.tasks`` (lm-eval-harness).
+* Implement ``_score_pair()`` helper that forwards pro/anti pairs through the
+  model and computes the accuracy gap.
+* Return bias metrics: ``gender_bias_score``, ``bbq_accuracy``,
+  ``per_category_bias`` dict.
+"""
+
+import logging
+from typing import Any, Callable, Dict
+
+from .base import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class BiasEvaluator(BaseEvaluator):
+    """Placeholder evaluator for social-bias benchmarks.
+
+    Parameters
+    ----------
+    benchmark : {"winobias", "bbq"}
+        Bias benchmark to select; validated against ``SUPPORTED_BENCHMARKS``.
+    name : str
+        Evaluator name forwarded to :class:`BaseEvaluator`.
+
+    Notes
+    -----
+    Only construction-time validation is implemented.  :meth:`evaluate`
+    unconditionally raises :class:`NotImplementedError`; the module docstring
+    outlines the planned integration.
+    """
+
+    SUPPORTED_BENCHMARKS = ("winobias", "bbq")
+
+    def __init__(
+        self,
+        benchmark: str = "winobias",
+        name: str = "bias",
+    ) -> None:
+        super().__init__(name=name)
+
+        # Fail fast on unknown benchmark names before ``self.benchmark`` is set.
+        supported = self.SUPPORTED_BENCHMARKS
+        if benchmark not in supported:
+            message = f"Unsupported benchmark '{benchmark}'. Choose from: {supported}"
+            raise ValueError(message)
+
+        self.benchmark = benchmark
+
+    def evaluate(
+        self,
+        model: Callable[..., Any],
+        tokenizer: Callable[..., Any],
+    ) -> Dict[str, float]:
+        """Placeholder entry point for the bias evaluation.
+
+        The arguments are accepted for interface compatibility but are not
+        used by the current implementation.
+
+        Parameters
+        ----------
+        model :
+            Language model callable.
+        tokenizer :
+            Tokenizer callable.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, until WinoBias/BBQ integration is complete.
+        """
+        # TODO: implement WinoBias via HuggingFace datasets
+        # TODO: implement BBQ via lm-eval-harness task registry
+        # Surface the selected benchmark so callers can see what was requested.
+        what = f"BiasEvaluator ({self.benchmark}) is not yet implemented. "
+        where = "See openverifiablellm/eval/bias.py for the integration plan."
+        raise NotImplementedError(what + where)
diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py
new file mode 100644
index 0000000..edfe024
--- /dev/null
+++ b/openverifiablellm/eval/perplexity.py
@@ -0,0 +1,270 @@
+"""
+openverifiablellm.eval.perplexity
+===================================
+
+Token-level perplexity measurement on a held-out text corpus.
+
+Perplexity is defined as::
+
+    PPL = exp( -1/N * sum_i log P(token_i | context_i) )
+
+where *N* is the total number of tokens in the evaluation corpus.  This is the
+standard metric for language-model quality and is directly relevant to Wikipedia
+pre-training: a lower perplexity indicates that the model assigns higher
+probability to held-out Wikipedia text.
+
+Usage
+-----
+::
+
+    from openverifiablellm.eval.perplexity import PerplexityEvaluator
+
+    evaluator = PerplexityEvaluator(text="The quick brown fox ...", stride=128)
+    results = evaluator.evaluate(model=my_model, tokenizer=my_tokenizer)
+    print(results)  # {"perplexity": ..., "nll_bits_per_byte": ..., "n_tokens": ..., "n_bytes": ...}
+
+The ``model`` callable must accept a list of integer token IDs and return a
+list of per-token log-probabilities (log P(token | prefix)) for every position.
+This signature is intentionally simple so that tiny mock models work in tests
+without requiring a GPU or a full transformer stack.
+"""
+
+import logging
+import math
+from typing import Callable, Dict, List, Sequence
+
+from .base import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+# Maximum sequence length forwarded through the model in a single call.
+# Keeping this small allows evaluation on CPU with tiny mock models.
+DEFAULT_MAX_LENGTH: int = 512
+
+# Stride used for the sliding-window approach when the corpus is longer than
+# max_length.  Overlap tokens are scored once (by the earlier window) and reused
+# as context by the later one, avoiding inflated perplexity at window boundaries.
+DEFAULT_STRIDE: int = 256
+
+
+def _sliding_window_nll(
+    token_ids: List[int],
+    model: Callable[[List[int]], List[float]],
+    max_length: int,
+    stride: int,
+) -> tuple:
+    """Compute total negative log-likelihood using a sliding window.
+
+    Each token is scored at most once: the first window scores everything
+    after its first position, later windows score only tokens not covered by
+    an earlier window.  Position 0 of a window has no left context, so it is
+    never scored (this only drops tokens when ``stride == max_length``).
+
+    Parameters
+    ----------
+    token_ids :
+        Full list of token IDs for the evaluation corpus.
+    model :
+        Callable mapping a token-ID list to per-token log-probabilities.
+        ``model(ids)[i]`` is log P(ids[i] | ids[:i]).
+    max_length :
+        Maximum number of tokens forwarded to the model at once.
+    stride :
+        How many tokens the window advances between calls.
+
+    Returns
+    -------
+    (total_nll, n_scored_tokens) : tuple of (float, int)
+        Aggregate negative log-likelihood and the number of tokens scored.
+
+    Raises
+    ------
+    ValueError
+        If ``max_length``/``stride`` are invalid, or the model returns fewer
+        values than the window it was given.
+    """
+    if max_length <= 0:
+        raise ValueError("max_length must be a positive integer")
+    if stride <= 0:
+        raise ValueError("stride must be a positive integer")
+    if stride > max_length:
+        raise ValueError("stride must not exceed max_length")
+
+    n = len(token_ids)
+    total_nll = 0.0
+    n_scored = 0
+    # Tokens before this window-relative index were scored by the previous
+    # window; never below 1 because position 0 has no context to condition on.
+    overlap = max(max_length - stride, 1)
+
+    for start in range(0, n, stride):
+        end = min(start + max_length, n)
+        window = token_ids[start:end]
+        log_probs = model(window)
+        if len(log_probs) < len(window):
+            raise ValueError("model must return one log-probability per input token")
+
+        first = 1 if start == 0 else overlap
+        for pos in range(first, len(window)):
+            total_nll -= log_probs[pos]
+            n_scored += 1
+
+        if end == n:
+            break
+
+    return total_nll, n_scored
+
+
+class PerplexityEvaluator(BaseEvaluator):
+    """Evaluate a language model's perplexity on a held-out text corpus.
+
+    Parameters
+    ----------
+    text : str
+        The held-out text to evaluate on.  Typically a few thousand words of
+        Wikipedia-style prose.  Must be a non-empty ``str``.
+    max_length : int
+        Maximum context window passed to the model per forward call.  Default: 512.
+    stride : int
+        Sliding window stride.  Must be ≤ ``max_length``.  Default: 256.
+    name : str
+        Evaluator name used in reports.
+    """
+
+    def __init__(
+        self,
+        text: str,
+        max_length: int = DEFAULT_MAX_LENGTH,
+        stride: int = DEFAULT_STRIDE,
+        name: str = "perplexity",
+    ) -> None:
+        super().__init__(name=name)
+
+        if not isinstance(text, str) or not text:
+            raise ValueError("text must be a non-empty string")
+        if max_length <= 0:
+            raise ValueError("max_length must be a positive integer")
+        if stride <= 0:
+            raise ValueError("stride must be a positive integer")
+        if stride > max_length:
+            raise ValueError("stride must not exceed max_length")
+
+        self.text = text
+        self.max_length = max_length
+        self.stride = stride
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def evaluate(
+        self,
+        model: Callable[[Sequence[int]], Sequence[float]],
+        tokenizer: Callable[[str], Sequence[int]],
+    ) -> Dict[str, float]:
+        """Compute perplexity metrics.
+
+        Parameters
+        ----------
+        model :
+            ``model(token_ids) → log_probs``.  Receives a list of integer token IDs
+            and returns a list of the same length where ``log_probs[i]`` is the
+            natural-log probability of ``token_ids[i]`` given all preceding tokens.
+            The value for the first corpus token is never used (no left context).
+        tokenizer :
+            ``tokenizer(text) → token_ids``.  Maps a raw string to a list of
+            integer token IDs.
+
+        Returns
+        -------
+        dict with keys:
+            - ``"perplexity"`` – exp(mean NLL) — lower is better; ``inf`` when the
+              tokenizer yields no tokens or the exponential overflows a float.
+            - ``"nll_bits_per_byte"`` – NLL in bits per *byte* of input text,
+              a length-independent measure.
+            - ``"n_tokens"`` – number of tokens scored.
+            - ``"n_bytes"`` – length of the input text in UTF-8 bytes.
+        """
+        token_ids: List[int] = list(tokenizer(self.text))
+        n_bytes = len(self.text.encode("utf-8"))
+
+        if not token_ids:
+            logger.warning("Tokenizer produced empty output; returning infinite perplexity")
+            return {
+                "perplexity": float("inf"),
+                "nll_bits_per_byte": float("inf"),
+                "n_tokens": 0,
+                "n_bytes": n_bytes,
+            }
+
+        total_nll, n_scored = _sliding_window_nll(
+            token_ids=token_ids,
+            model=model,
+            max_length=self.max_length,
+            stride=self.stride,
+        )
+
+        if n_scored == 0:
+            # Edge case: single-token corpus — nothing to score.
+            logger.warning("No tokens were scored (corpus too short); returning perplexity=1.0")
+            return {
+                "perplexity": 1.0,
+                "nll_bits_per_byte": 0.0,
+                "n_tokens": n_scored,
+                "n_bytes": n_bytes,
+            }
+
+        mean_nll = total_nll / n_scored
+        try:
+            perplexity = math.exp(mean_nll)
+        except OverflowError:
+            # math.exp raises rather than returning inf for very large inputs.
+            perplexity = float("inf")
+
+        # Convert nats → bits (log2(e) ≈ 1.4427); divide by byte count.
+        nll_bits_per_byte = (total_nll * math.log2(math.e)) / n_bytes if n_bytes > 0 else 0.0
+
+        logger.info(
+            "Perplexity: %.4f  (NLL=%.4f, tokens=%d, bytes=%d)",
+            perplexity,
+            mean_nll,
+            n_scored,
+            n_bytes,
+        )
+
+        return {
+            "perplexity": perplexity,
+            "nll_bits_per_byte": nll_bits_per_byte,
+            "n_tokens": n_scored,
+            "n_bytes": n_bytes,
+        }
+
+    # ------------------------------------------------------------------
+    # Convenience helpers (static, so they are easy to unit-test in isolation)
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def uniform_model(vocab_size: int) -> Callable[[Sequence[int]], List[float]]:
+        """Return a trivial model that assigns uniform probability to all tokens.
+
+        Useful for smoke tests.  The perplexity of a uniform model on any
+        corpus should equal ``vocab_size`` (up to floating-point rounding).
+
+        Parameters
+        ----------
+        vocab_size : int
+            Size of the vocabulary.
+
+        Returns
+        -------
+        Callable
+        """
+        if vocab_size <= 0:
+            raise ValueError("vocab_size must be a positive integer")
+
+        log_prob = math.log(1.0 / vocab_size)
+
+        def _model(token_ids: Sequence[int]) -> List[float]:
+            return [log_prob] * len(token_ids)
+
+        return _model
diff --git a/openverifiablellm/manifest_chain.py b/openverifiablellm/manifest_chain.py
index fe9bcc1..e48b9eb 100644
--- a/openverifiablellm/manifest_chain.py
+++ b/openverifiablellm/manifest_chain.py
@@ -28,25 +28,9 @@
 from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
-logger = logging.getLogger(__name__)
-
-
-def _canonical_json(obj: Any) -> str:
-    """
-    Serialize object into canonical JSON format.
-    Ensures stable hashing across runs regardless of key order.
-
-    Parameters
-    ----------
-    obj : Any
-        JSON-serializable object
+from openverifiablellm._hashing import _canonical_json
 
-    Returns
-    -------
-    str
-        Canonical JSON string with sorted keys
-    """
-    return json.dumps(obj, sort_keys=True, separators=(",", ":"))
+logger = logging.getLogger(__name__)
 
 
 def compute_manifest_hash(manifest: Union[str, Path, Dict[str, Any]]) -> str:
diff --git a/tests/test_eval.py b/tests/test_eval.py
new file mode 100644
index 0000000..b03415e
--- /dev/null
+++ b/tests/test_eval.py
@@ -0,0 +1,316 @@
+"""
+tests/test_eval.py
+==================
+
+Tests for the openverifiablellm.eval package.
+
+All tests run on CPU with tiny mock models — no GPU required.
+
+Run with:
+    pytest tests/test_eval.py -v
+"""
+
+import math
+
+import pytest
+
+from openverifiablellm.eval import (
+    BaseEvaluator,
+    BenchmarkEvaluator,
+    BiasEvaluator,
+    PerplexityEvaluator,
+)
+from openverifiablellm.eval.perplexity import _sliding_window_nll
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+SAMPLE_TEXT = (  # pure ASCII, so byte_tokenizer yields one token per character
+    "Wikipedia is a free online encyclopedia created by volunteers. "
+    "Anyone can edit an article at any time. "
+    "It is one of the most visited websites in the world. "
+)
+
+VOCAB_SIZE = 256  # number of distinct byte values; matches the byte-level tokenizer
+
+
+def byte_tokenizer(text: str):
+    """Tokenize ``text`` into its UTF-8 byte values, one integer token per byte."""
+    return [*text.encode("utf-8")]
+
+
+def uniform_log_probs(vocab_size: int):
+    """Build a mock model that scores every token with log(1 / vocab_size)."""
+    per_token = math.log(1.0 / vocab_size)
+
+    def _model(token_ids):
+        return [per_token for _ in range(len(token_ids))]
+
+    return _model
+
+
+# ---------------------------------------------------------------------------
+# BaseEvaluator — interface contract
+# ---------------------------------------------------------------------------
+
+
+class TestBaseEvaluator:
+    def test_cannot_instantiate_directly(self):
+        with pytest.raises(TypeError):  # evaluate() is abstract on the base class
+            BaseEvaluator(name="x")  # type: ignore[abstract]
+
+    def test_concrete_subclass_must_implement_evaluate(self):
+        """A subclass that forgets to implement evaluate() is not instantiable."""
+
+        class Incomplete(BaseEvaluator):
+            pass
+
+        with pytest.raises(TypeError):
+            Incomplete(name="incomplete")  # type: ignore[abstract]
+
+    def test_concrete_subclass_is_instantiable(self):
+        class Minimal(BaseEvaluator):
+            def evaluate(self, model, tokenizer):
+                return {"score": 1.0}
+
+        ev = Minimal(name="minimal")
+        assert ev.name == "minimal"  # name is stored by BaseEvaluator.__init__
+        assert ev.evaluate(None, None) == {"score": 1.0}
+
+    def test_base_evaluator_import_alias(self):
+        # The name exported by the package must be the very class defined in eval.base.
+        from openverifiablellm.eval.base import BaseEvaluator as _Direct
+
+        assert BaseEvaluator is _Direct
+
+
+# ---------------------------------------------------------------------------
+# PerplexityEvaluator — construction validation
+# ---------------------------------------------------------------------------
+
+
+class TestPerplexityEvaluatorConstruction:  # constructor argument validation
+    def test_valid_construction(self):
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT)
+        assert ev.text == SAMPLE_TEXT
+
+    def test_empty_text_raises(self):
+        with pytest.raises(ValueError, match="non-empty"):
+            PerplexityEvaluator(text="")
+
+    def test_non_string_text_raises(self):
+        with pytest.raises(ValueError):  # wrong type is reported as ValueError too
+            PerplexityEvaluator(text=123)  # type: ignore[arg-type]
+
+    def test_zero_max_length_raises(self):
+        with pytest.raises(ValueError, match="max_length"):
+            PerplexityEvaluator(text=SAMPLE_TEXT, max_length=0)
+
+    def test_zero_stride_raises(self):
+        with pytest.raises(ValueError, match="stride"):
+            PerplexityEvaluator(text=SAMPLE_TEXT, stride=0)
+
+    def test_stride_exceeds_max_length_raises(self):
+        with pytest.raises(ValueError, match="stride"):  # 128 > 64
+            PerplexityEvaluator(text=SAMPLE_TEXT, max_length=64, stride=128)
+
+    def test_name_stored(self):
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT, name="test_ppl")
+        assert ev.name == "test_ppl"
+
+
+# ---------------------------------------------------------------------------
+# PerplexityEvaluator — evaluate() correctness
+# ---------------------------------------------------------------------------
+
+
+class TestPerplexityEvaluatorEvaluate:
+    def test_returns_required_keys(self):
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT)
+        result = ev.evaluate(
+            model=uniform_log_probs(VOCAB_SIZE),
+            tokenizer=byte_tokenizer,
+        )
+        for key in ("perplexity", "nll_bits_per_byte", "n_tokens", "n_bytes"):
+            assert key in result, f"Missing key: {key}"
+
+    def test_uniform_model_perplexity_equals_vocab_size(self):
+        """For a uniform model, PPL must equal vocab_size (within float tolerance)."""
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT, max_length=512, stride=256)
+        result = ev.evaluate(
+            model=uniform_log_probs(VOCAB_SIZE),
+            tokenizer=byte_tokenizer,
+        )
+        assert abs(result["perplexity"] - VOCAB_SIZE) < 1e-6, (
+            f"Expected PPL≈{VOCAB_SIZE}, got {result['perplexity']}"
+        )
+
+    def test_n_bytes_matches_utf8_length(self):
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT)
+        result = ev.evaluate(
+            model=uniform_log_probs(VOCAB_SIZE),
+            tokenizer=byte_tokenizer,
+        )
+        assert result["n_bytes"] == len(SAMPLE_TEXT.encode("utf-8"))
+
+    def test_n_tokens_positive(self):
+        # max_length=64 is shorter than the tokenized text, so several windows are used.
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT, max_length=64, stride=32)
+        result = ev.evaluate(
+            model=uniform_log_probs(VOCAB_SIZE),
+            tokenizer=byte_tokenizer,
+        )
+        assert result["n_tokens"] > 0
+
+    def test_perplexity_is_finite(self):
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT)
+        result = ev.evaluate(
+            model=uniform_log_probs(VOCAB_SIZE),
+            tokenizer=byte_tokenizer,
+        )
+        assert math.isfinite(result["perplexity"])
+
+    def test_lower_surprise_gives_lower_perplexity(self):
+        """A model that perfectly predicts each token must have PPL=1."""
+        # log(1.0) == 0.0 for every token, so mean NLL is 0 and exp(0) == 1.
+        perfect_model = lambda ids: [0.0] * len(ids)  # noqa: E731
+
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT)
+        result = ev.evaluate(model=perfect_model, tokenizer=byte_tokenizer)
+        assert abs(result["perplexity"] - 1.0) < 1e-9
+
+    def test_uniform_model_factory(self):
+        model = PerplexityEvaluator.uniform_model(vocab_size=100)
+        log_probs = model([0, 1, 2])
+        assert len(log_probs) == 3  # one value per input token
+        expected_lp = math.log(1.0 / 100)
+        for lp in log_probs:
+            assert abs(lp - expected_lp) < 1e-12
+
+    def test_uniform_model_zero_vocab_raises(self):
+        with pytest.raises(ValueError, match="vocab_size"):
+            PerplexityEvaluator.uniform_model(vocab_size=0)
+
+    def test_nll_bits_per_byte_non_negative(self):
+        ev = PerplexityEvaluator(text=SAMPLE_TEXT)
+        result = ev.evaluate(
+            model=uniform_log_probs(VOCAB_SIZE),
+            tokenizer=byte_tokenizer,
+        )
+        assert result["nll_bits_per_byte"] >= 0.0
+
+
+# ---------------------------------------------------------------------------
+# _sliding_window_nll — internal unit tests
+# ---------------------------------------------------------------------------
+
+
+class TestSlidingWindowNll:
+    def _uniform_model(self, vocab_size=VOCAB_SIZE):
+        lp = math.log(1.0 / vocab_size)  # log-probability given to every token
+        return lambda ids: [lp] * len(ids)
+
+    def test_empty_token_list_returns_zero(self):
+        nll, n = _sliding_window_nll([], self._uniform_model(), max_length=64, stride=32)
+        assert nll == 0.0
+        assert n == 0
+
+    def test_single_token_nothing_scored(self):
+        nll, n = _sliding_window_nll([42], self._uniform_model(), max_length=64, stride=32)
+        # The only token sits at position 0, which has no left context — not scored
+        assert n == 0
+
+    def test_two_tokens_scores_one(self):
+        nll, n = _sliding_window_nll([1, 2], self._uniform_model(), max_length=64, stride=32)
+        assert n == 1  # only the second token has a preceding token
+
+    def test_invalid_max_length_raises(self):
+        with pytest.raises(ValueError, match="max_length"):
+            _sliding_window_nll([1, 2, 3], self._uniform_model(), max_length=0, stride=1)
+
+    def test_invalid_stride_raises(self):
+        with pytest.raises(ValueError, match="stride"):
+            _sliding_window_nll([1, 2, 3], self._uniform_model(), max_length=4, stride=0)
+
+    def test_stride_exceeds_max_length_raises(self):
+        with pytest.raises(ValueError, match="stride"):
+            _sliding_window_nll([1, 2, 3], self._uniform_model(), max_length=2, stride=4)
+
+    def test_nll_proportional_to_n_tokens_for_uniform_model(self):
+        """For a uniform model NLL = n_tokens * log(vocab_size)."""
+        text = SAMPLE_TEXT * 3  # 468 bytes, enough to span several 128-token windows
+        token_ids = byte_tokenizer(text)
+        model = self._uniform_model(VOCAB_SIZE)
+        nll, n = _sliding_window_nll(token_ids, model, max_length=128, stride=64)
+        expected_nll_per_token = math.log(VOCAB_SIZE)
+        assert abs(nll / n - expected_nll_per_token) < 1e-9  # compares the per-token mean
+
+
+# ---------------------------------------------------------------------------
+# BiasEvaluator — stub contract
+# ---------------------------------------------------------------------------
+
+
+class TestBiasEvaluator:  # the evaluator is a stub: only construction is functional
+    def test_valid_construction_winobias(self):
+        ev = BiasEvaluator(benchmark="winobias")
+        assert ev.benchmark == "winobias"
+
+    def test_valid_construction_bbq(self):
+        ev = BiasEvaluator(benchmark="bbq")
+        assert ev.benchmark == "bbq"
+
+    def test_invalid_benchmark_raises(self):
+        with pytest.raises(ValueError, match="Unsupported"):
+            BiasEvaluator(benchmark="unknown_benchmark")
+
+    def test_evaluate_raises_not_implemented(self):
+        ev = BiasEvaluator()  # default benchmark is "winobias"
+        with pytest.raises(NotImplementedError):
+            ev.evaluate(model=None, tokenizer=None)
+
+    def test_is_base_evaluator_subclass(self):
+        from openverifiablellm.eval.base import BaseEvaluator
+
+        assert issubclass(BiasEvaluator, BaseEvaluator)
+
+
+# ---------------------------------------------------------------------------
+# BenchmarkEvaluator — stub contract
+# ---------------------------------------------------------------------------
+
+
+class TestBenchmarkEvaluator:  # the evaluator is a stub: only construction is functional
+    def test_valid_construction_mmlu(self):
+        ev = BenchmarkEvaluator(benchmark="mmlu")
+        assert ev.benchmark == "mmlu"
+
+    def test_valid_construction_triviaqa(self):
+        ev = BenchmarkEvaluator(benchmark="triviaqa")
+        assert ev.benchmark == "triviaqa"
+
+    def test_invalid_benchmark_raises(self):
+        with pytest.raises(ValueError, match="Unsupported"):
+            BenchmarkEvaluator(benchmark="glue")
+
+    def test_n_samples_none_allowed(self):
+        ev = BenchmarkEvaluator(n_samples=None)
+        assert ev.n_samples is None
+
+    def test_n_samples_positive_allowed(self):
+        ev = BenchmarkEvaluator(n_samples=100)
+        assert ev.n_samples == 100
+
+    def test_n_samples_zero_raises(self):
+        with pytest.raises(ValueError, match="n_samples"):  # zero is not positive
+            BenchmarkEvaluator(n_samples=0)
+
+    def test_evaluate_raises_not_implemented(self):
+        ev = BenchmarkEvaluator()  # default benchmark is "mmlu"
+        with pytest.raises(NotImplementedError):
+            ev.evaluate(model=None, tokenizer=None)
+
+    def test_is_base_evaluator_subclass(self):
+        from openverifiablellm.eval.base import BaseEvaluator
+
+        assert issubclass(BenchmarkEvaluator, BaseEvaluator)