Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions openverifiablellm/_hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
_hashing.py
===========
Shared low-level hashing utilities used across the openverifiablellm package.

These are intentionally kept small and dependency-free so they can be safely
imported from any module without risk of circular imports.
"""

import json
from typing import Any


def _canonical_json(obj: Any) -> str:
"""
Serialize object into canonical JSON format.
Ensures stable hashing across runs regardless of key order.

Parameters
----------
obj : Any
JSON-serializable object

Returns
-------
str
Canonical JSON string with sorted keys
"""
return json.dumps(obj, sort_keys=True, separators=(",", ":"))
9 changes: 1 addition & 8 deletions openverifiablellm/environment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
import hashlib
import json
import platform
import subprocess
import sys
from typing import Any, Dict


def _canonical_json(obj: Any) -> str:
"""
Serialize object into canonical JSON format.
Ensures stable hashing across runs.
"""
return json.dumps(obj, sort_keys=True, separators=(",", ":"))
from openverifiablellm._hashing import _canonical_json


def compute_object_hash(obj: Any) -> str:
Expand Down
30 changes: 30 additions & 0 deletions openverifiablellm/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
openverifiablellm.eval
======================

Evaluation framework for OpenVerifiableLLM.

Provides pluggable evaluators for perplexity, bias, and benchmark tasks,
all built on top of the abstract ``BaseEvaluator`` interface.

Available evaluators
--------------------
- :class:`~openverifiablellm.eval.perplexity.PerplexityEvaluator`
Measures cross-entropy perplexity on a held-out text corpus.
- :class:`~openverifiablellm.eval.bias.BiasEvaluator`
Bias-testing stub (WinoBias / BBQ — integration pending).
- :class:`~openverifiablellm.eval.benchmarks.BenchmarkEvaluator`
MMLU / factual-accuracy stub (lm-eval-harness — integration pending).
"""

from .base import BaseEvaluator
from .benchmarks import BenchmarkEvaluator
from .bias import BiasEvaluator
from .perplexity import PerplexityEvaluator

__all__ = [
"BaseEvaluator",
"PerplexityEvaluator",
"BiasEvaluator",
"BenchmarkEvaluator",
]
59 changes: 59 additions & 0 deletions openverifiablellm/eval/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
openverifiablellm.eval.base
============================

Abstract base class for all evaluation strategies.

All concrete evaluators must subclass :class:`BaseEvaluator` and implement
:meth:`evaluate`, which receives a model callable and a tokenizer callable
and returns a flat ``dict`` of metric names to scalar values.

Example
-------
::

class MyEvaluator(BaseEvaluator):
def evaluate(self, model, tokenizer):
# ... compute metrics ...
return {"my_metric": 42.0}
"""

from abc import ABC, abstractmethod
from typing import Any, Callable, Dict


class BaseEvaluator(ABC):
    """Common interface shared by every evaluator in the package.

    Concrete subclasses implement :meth:`evaluate`, which turns a
    model/tokenizer pair into a flat dictionary of metric values.

    Parameters
    ----------
    name : str
        Human-readable identifier for this evaluator (used in reports).
    """

    def __init__(self, name: str) -> None:
        # Stored verbatim; reporting code uses it as the section label.
        self.name = name

    @abstractmethod
    def evaluate(
        self,
        model: Callable[..., Any],
        tokenizer: Callable[..., Any],
    ) -> Dict[str, float]:
        """Compute and return this evaluator's metrics.

        Parameters
        ----------
        model :
            Callable that accepts token sequences and returns logits or
            log-probabilities; the exact signature is defined by the
            concrete evaluator subclass.
        tokenizer :
            Callable that maps a string to a sequence of integer token IDs.

        Returns
        -------
        dict
            Mapping of metric name → scalar value. All values must be
            JSON-serialisable floats.
        """
117 changes: 117 additions & 0 deletions openverifiablellm/eval/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
openverifiablellm.eval.benchmarks
====================================

Standard LLM benchmark evaluation stub for OpenVerifiableLLM.

This module provides the :class:`BenchmarkEvaluator` class, which wraps
established NLP benchmarks commonly used to compare language models.

Planned benchmarks
------------------
* **MMLU** (Hendrycks et al., 2021) — Massive Multitask Language Understanding.
57 academic subjects, 4-way multiple-choice. Evaluates broad knowledge and
reasoning across STEM, humanities, social sciences, and more.

* **TriviaQA** (Joshi et al., 2017) — Factual accuracy benchmark with
trivia-style questions and supporting evidence passages. A random subset
(e.g., 1 000 questions) is used for fast evaluation.

Integration is pending a stable lm-eval-harness dependency. The class
skeleton is provided now so that downstream code can import and type-check
:class:`BenchmarkEvaluator` without error.

TODO
----
* Integrate MMLU via ``lm_eval.tasks``::

lm_eval --model hf \
--model_args pretrained=<model_path> \
--tasks mmlu \
--device cpu \
--output_path results/

* Integrate TriviaQA via ``lm_eval.tasks`` or HuggingFace ``datasets``.
* Cache downloaded datasets locally to avoid redundant network traffic.
* Return metrics: ``mmlu_accuracy``, ``triviaqa_exact_match``,
``per_subject_accuracy`` dict.
"""

import logging
from typing import Any, Callable, Dict, Optional

from .base import BaseEvaluator

logger = logging.getLogger(__name__)


class BenchmarkEvaluator(BaseEvaluator):
    """Evaluate a language model on standard NLP benchmarks (stub).

    Parameters
    ----------
    benchmark : {"mmlu", "triviaqa"}
        Which benchmark to run.
    n_samples : int or None
        Number of examples to evaluate on. ``None`` means the full benchmark.
        Set a small value (e.g., 100) for rapid iteration during development.
    name : str
        Evaluator name used in reports.

    Notes
    -----
    This class is intentionally a stub. Calling :meth:`evaluate` will raise
    :class:`NotImplementedError` until the benchmark integration is complete.
    See module docstring for the planned implementation.
    """

    # Closed set of accepted benchmark identifiers; validated in __init__.
    SUPPORTED_BENCHMARKS = ("mmlu", "triviaqa")

    def __init__(
        self,
        benchmark: str = "mmlu",
        # Fixed: was annotated `int = None`, an implicit Optional that
        # PEP 484 disallows and type checkers reject.
        n_samples: Optional[int] = None,
        name: str = "benchmark",
    ) -> None:
        super().__init__(name=name)

        # Fail fast on misconfiguration so the error surfaces at
        # construction time rather than deep inside evaluate().
        if benchmark not in self.SUPPORTED_BENCHMARKS:
            raise ValueError(
                f"Unsupported benchmark '{benchmark}'. "
                f"Choose from: {self.SUPPORTED_BENCHMARKS}"
            )

        if n_samples is not None and n_samples <= 0:
            raise ValueError("n_samples must be a positive integer or None")

        self.benchmark = benchmark
        self.n_samples = n_samples

    def evaluate(
        self,
        model: Callable[..., Any],
        tokenizer: Callable[..., Any],
    ) -> Dict[str, float]:
        """Run the benchmark evaluation.

        .. note::
            Not yet implemented. Raises :class:`NotImplementedError`.

        Parameters
        ----------
        model :
            Language model callable.
        tokenizer :
            Tokenizer callable.

        Raises
        ------
        NotImplementedError
            Always, until MMLU/TriviaQA integration is complete.
        """
        # TODO: implement MMLU via lm-eval-harness task registry
        # TODO: implement TriviaQA via HuggingFace datasets + exact-match scorer
        raise NotImplementedError(
            f"BenchmarkEvaluator ({self.benchmark}) is not yet implemented. "
            "See openverifiablellm/eval/benchmarks.py for the integration plan."
        )
103 changes: 103 additions & 0 deletions openverifiablellm/eval/bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
openverifiablellm.eval.bias
=============================

Bias evaluation stub for OpenVerifiableLLM.

This module provides the :class:`BiasEvaluator` class, which is intended to
measure social bias in a language model using established benchmarks.

Planned benchmarks
------------------
* **WinoBias** (Zhao et al., 2018) — coreference-resolution pairs that reveal
occupational gender bias. Each example has a pro-stereotypical and an
anti-stereotypical version; a fair model should perform equally on both.

* **BBQ** (Parrish et al., 2022) — a question-answering dataset covering nine
social-bias dimensions (age, disability status, gender identity, nationality,
physical appearance, race/ethnicity, religion, SES, sexual orientation).

Integration is pending a stable lm-eval-harness dependency. The class
skeleton is provided now so that downstream code can import and type-check
:class:`BiasEvaluator` without error.

TODO
----
* Integrate WinoBias evaluation via HuggingFace ``datasets``.
* Integrate BBQ via ``lm_eval.tasks`` (lm-eval-harness).
* Implement ``_score_pair()`` helper that forwards pro/anti pairs through the
model and computes the accuracy gap.
* Return bias metrics: ``gender_bias_score``, ``bbq_accuracy``,
``per_category_bias`` dict.
"""

import logging
from typing import Any, Callable, Dict

from .base import BaseEvaluator

logger = logging.getLogger(__name__)


class BiasEvaluator(BaseEvaluator):
    """Social-bias evaluation placeholder (WinoBias / BBQ).

    Parameters
    ----------
    benchmark : {"winobias", "bbq"}
        Which bias benchmark to use.
    name : str
        Evaluator name used in reports.

    Notes
    -----
    Stub implementation: :meth:`evaluate` raises
    :class:`NotImplementedError` until the benchmark integration lands.
    The module docstring documents the planned approach.
    """

    # Closed set of accepted benchmark identifiers; validated in __init__.
    SUPPORTED_BENCHMARKS = ("winobias", "bbq")

    def __init__(
        self,
        benchmark: str = "winobias",
        name: str = "bias",
    ) -> None:
        super().__init__(name=name)

        # Reject unknown benchmarks eagerly so misconfiguration surfaces
        # at construction time, not deep inside evaluate().
        if benchmark not in self.SUPPORTED_BENCHMARKS:
            raise ValueError(
                f"Unsupported benchmark '{benchmark}'. "
                f"Choose from: {self.SUPPORTED_BENCHMARKS}"
            )
        self.benchmark = benchmark

    def evaluate(
        self,
        model: Callable[..., Any],
        tokenizer: Callable[..., Any],
    ) -> Dict[str, float]:
        """Run bias evaluation (not yet implemented — always raises).

        Parameters
        ----------
        model :
            Language model callable.
        tokenizer :
            Tokenizer callable.

        Raises
        ------
        NotImplementedError
            Always, until WinoBias/BBQ integration is complete.
        """
        # TODO: implement WinoBias via HuggingFace datasets
        # TODO: implement BBQ via lm-eval-harness task registry
        raise NotImplementedError(
            f"BiasEvaluator ({self.benchmark}) is not yet implemented. "
            "See openverifiablellm/eval/bias.py for the integration plan."
        )
Loading
Loading