Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions openverifiablellm/_hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
_hashing.py
===========
Shared low-level hashing utilities used across the openverifiablellm package.

These are intentionally kept small and dependency-free so they can be safely
imported from any module without risk of circular imports.
"""

import json
from typing import Any


def _canonical_json(obj: Any) -> str:
"""
Serialize object into canonical JSON format.
Ensures stable hashing across runs regardless of key order.

Parameters
----------
obj : Any
JSON-serializable object

Returns
-------
str
Canonical JSON string with sorted keys
"""
return json.dumps(obj, sort_keys=True, separators=(",", ":"))
9 changes: 1 addition & 8 deletions openverifiablellm/environment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
import hashlib
import json
import platform
import subprocess
import sys
from typing import Any, Dict


def _canonical_json(obj: Any) -> str:
"""
Serialize object into canonical JSON format.
Ensures stable hashing across runs.
"""
return json.dumps(obj, sort_keys=True, separators=(",", ":"))
from openverifiablellm._hashing import _canonical_json


def compute_object_hash(obj: Any) -> str:
Expand Down
30 changes: 30 additions & 0 deletions openverifiablellm/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
openverifiablellm.eval
======================

Evaluation framework for OpenVerifiableLLM.

Provides pluggable evaluators for perplexity, bias, and benchmark tasks,
all built on top of the abstract ``BaseEvaluator`` interface.

Available evaluators
--------------------
- :class:`~openverifiablellm.eval.perplexity.PerplexityEvaluator`
Measures cross-entropy perplexity on a held-out text corpus.
- :class:`~openverifiablellm.eval.bias.BiasEvaluator`
Bias-testing stub (WinoBias / BBQ — integration pending).
- :class:`~openverifiablellm.eval.benchmarks.BenchmarkEvaluator`
MMLU / factual-accuracy stub (lm-eval-harness — integration pending).
"""

from .base import BaseEvaluator
from .benchmarks import BenchmarkEvaluator
from .bias import BiasEvaluator
from .perplexity import PerplexityEvaluator

__all__ = [
"BaseEvaluator",
"PerplexityEvaluator",
"BiasEvaluator",
"BenchmarkEvaluator",
]
59 changes: 59 additions & 0 deletions openverifiablellm/eval/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
openverifiablellm.eval.base
============================

Abstract base class for all evaluation strategies.

All concrete evaluators must subclass :class:`BaseEvaluator` and implement
:meth:`evaluate`, which receives a model callable and a tokenizer callable
and returns a flat ``dict`` of metric names to scalar values.

Example
-------
::

class MyEvaluator(BaseEvaluator):
def evaluate(self, model, tokenizer):
# ... compute metrics ...
return {"my_metric": 42.0}
"""

from abc import ABC, abstractmethod
from typing import Any, Callable, Dict


class BaseEvaluator(ABC):
    """Common interface shared by every evaluator in the package.

    Concrete subclasses implement :meth:`evaluate`, which turns a
    model/tokenizer pair into a flat dictionary of metric values.

    Parameters
    ----------
    name : str
        Human-readable identifier for this evaluator (used in reports).
    """

    def __init__(self, name: str) -> None:
        # Stored verbatim; reporting code uses it as the section label.
        self.name = name

    @abstractmethod
    def evaluate(
        self,
        model: Callable[..., Any],
        tokenizer: Callable[..., Any],
    ) -> Dict[str, float]:
        """Compute and return this evaluator's metrics.

        Parameters
        ----------
        model :
            Callable that accepts token sequences and returns logits or
            log-probabilities; the exact signature is defined by the
            concrete evaluator subclass.
        tokenizer :
            Callable that maps a string to a sequence of integer token IDs.

        Returns
        -------
        dict
            Mapping of metric name → scalar value. All values must be
            JSON-serialisable floats.
        """
117 changes: 117 additions & 0 deletions openverifiablellm/eval/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
openverifiablellm.eval.benchmarks
====================================

Standard LLM benchmark evaluation stub for OpenVerifiableLLM.

This module provides the :class:`BenchmarkEvaluator` class, which wraps
established NLP benchmarks commonly used to compare language models.

Planned benchmarks
------------------
* **MMLU** (Hendrycks et al., 2021) — Massive Multitask Language Understanding.
57 academic subjects, 4-way multiple-choice. Evaluates broad knowledge and
reasoning across STEM, humanities, social sciences, and more.

* **TriviaQA** (Joshi et al., 2017) — Factual accuracy benchmark with
trivia-style questions and supporting evidence passages. A random subset
(e.g., 1 000 questions) is used for fast evaluation.

Integration is pending a stable lm-eval-harness dependency. The class
skeleton is provided now so that downstream code can import and type-check
:class:`BenchmarkEvaluator` without error.

TODO
----
* Integrate MMLU via ``lm_eval.tasks``::

lm_eval --model hf \
--model_args pretrained=<model_path> \
--tasks mmlu \
--device cpu \
--output_path results/

* Integrate TriviaQA via ``lm_eval.tasks`` or HuggingFace ``datasets``.
* Cache downloaded datasets locally to avoid redundant network traffic.
* Return metrics: ``mmlu_accuracy``, ``triviaqa_exact_match``,
``per_subject_accuracy`` dict.
"""

import logging
from typing import Any, Callable, Dict, Optional

from .base import BaseEvaluator

logger = logging.getLogger(__name__)


class BenchmarkEvaluator(BaseEvaluator):
    """Evaluate a language model on standard NLP benchmarks (stub).

    Parameters
    ----------
    benchmark : {"mmlu", "triviaqa"}
        Which benchmark to run.
    n_samples : int or None
        Number of examples to evaluate on. ``None`` means the full benchmark.
        Set a small value (e.g., 100) for rapid iteration during development.
    name : str
        Evaluator name used in reports.

    Notes
    -----
    This class is intentionally a stub. Calling :meth:`evaluate` will raise
    :class:`NotImplementedError` until the benchmark integration is complete.
    See module docstring for the planned implementation.
    """

    # Closed set of accepted benchmark identifiers; validated in __init__.
    SUPPORTED_BENCHMARKS = ("mmlu", "triviaqa")

    def __init__(
        self,
        benchmark: str = "mmlu",
        # Fixed: was annotated `int = None`, an implicit Optional that
        # PEP 484 disallows and type checkers reject.
        n_samples: Optional[int] = None,
        name: str = "benchmark",
    ) -> None:
        super().__init__(name=name)

        # Fail fast on misconfiguration so the error surfaces at
        # construction time rather than deep inside evaluate().
        if benchmark not in self.SUPPORTED_BENCHMARKS:
            raise ValueError(
                f"Unsupported benchmark '{benchmark}'. "
                f"Choose from: {self.SUPPORTED_BENCHMARKS}"
            )

        if n_samples is not None and n_samples <= 0:
            raise ValueError("n_samples must be a positive integer or None")

        self.benchmark = benchmark
        self.n_samples = n_samples

    def evaluate(
        self,
        model: Callable[..., Any],
        tokenizer: Callable[..., Any],
    ) -> Dict[str, float]:
        """Run the benchmark evaluation.

        .. note::
            Not yet implemented. Raises :class:`NotImplementedError`.

        Parameters
        ----------
        model :
            Language model callable.
        tokenizer :
            Tokenizer callable.

        Raises
        ------
        NotImplementedError
            Always, until MMLU/TriviaQA integration is complete.
        """
        # TODO: implement MMLU via lm-eval-harness task registry
        # TODO: implement TriviaQA via HuggingFace datasets + exact-match scorer
        raise NotImplementedError(
            f"BenchmarkEvaluator ({self.benchmark}) is not yet implemented. "
            "See openverifiablellm/eval/benchmarks.py for the integration plan."
        )
103 changes: 103 additions & 0 deletions openverifiablellm/eval/bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
openverifiablellm.eval.bias
=============================

Bias evaluation stub for OpenVerifiableLLM.

This module provides the :class:`BiasEvaluator` class, which is intended to
measure social bias in a language model using established benchmarks.

Planned benchmarks
------------------
* **WinoBias** (Zhao et al., 2018) — coreference-resolution pairs that reveal
occupational gender bias. Each example has a pro-stereotypical and an
anti-stereotypical version; a fair model should perform equally on both.

* **BBQ** (Parrish et al., 2022) — a question-answering dataset covering nine
social-bias dimensions (age, disability status, gender identity, nationality,
physical appearance, race/ethnicity, religion, SES, sexual orientation).

Integration is pending a stable lm-eval-harness dependency. The class
skeleton is provided now so that downstream code can import and type-check
:class:`BiasEvaluator` without error.

TODO
----
* Integrate WinoBias evaluation via HuggingFace ``datasets``.
* Integrate BBQ via ``lm_eval.tasks`` (lm-eval-harness).
* Implement ``_score_pair()`` helper that forwards pro/anti pairs through the
model and computes the accuracy gap.
* Return bias metrics: ``gender_bias_score``, ``bbq_accuracy``,
``per_category_bias`` dict.
"""

import logging
from typing import Any, Callable, Dict

from .base import BaseEvaluator

logger = logging.getLogger(__name__)


class BiasEvaluator(BaseEvaluator):
    """Social-bias evaluation placeholder (WinoBias / BBQ).

    Parameters
    ----------
    benchmark : {"winobias", "bbq"}
        Which bias benchmark to use.
    name : str
        Evaluator name used in reports.

    Notes
    -----
    Stub implementation: :meth:`evaluate` raises
    :class:`NotImplementedError` until the benchmark integration lands.
    The module docstring documents the planned approach.
    """

    # Closed set of accepted benchmark identifiers; validated in __init__.
    SUPPORTED_BENCHMARKS = ("winobias", "bbq")

    def __init__(
        self,
        benchmark: str = "winobias",
        name: str = "bias",
    ) -> None:
        super().__init__(name=name)

        # Reject unknown benchmarks eagerly so misconfiguration surfaces
        # at construction time, not deep inside evaluate().
        if benchmark not in self.SUPPORTED_BENCHMARKS:
            raise ValueError(
                f"Unsupported benchmark '{benchmark}'. "
                f"Choose from: {self.SUPPORTED_BENCHMARKS}"
            )
        self.benchmark = benchmark

    def evaluate(
        self,
        model: Callable[..., Any],
        tokenizer: Callable[..., Any],
    ) -> Dict[str, float]:
        """Run bias evaluation (not yet implemented — always raises).

        Parameters
        ----------
        model :
            Language model callable.
        tokenizer :
            Tokenizer callable.

        Raises
        ------
        NotImplementedError
            Always, until WinoBias/BBQ integration is complete.
        """
        # TODO: implement WinoBias via HuggingFace datasets
        # TODO: implement BBQ via lm-eval-harness task registry
        raise NotImplementedError(
            f"BiasEvaluator ({self.benchmark}) is not yet implemented. "
            "See openverifiablellm/eval/bias.py for the integration plan."
        )
Loading
Loading