Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions openverifiablellm/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Public API of the :mod:`openverifiablellm.eval` package.

Re-exports the concrete evaluator classes so callers can write
``from openverifiablellm.eval import PerplexityEvaluator``.
"""

from .bias import WinoBiasEvaluator
from .perplexity import PerplexityEvaluator

__all__ = [
    "WinoBiasEvaluator",
    "PerplexityEvaluator",
]
24 changes: 24 additions & 0 deletions openverifiablellm/eval/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod


class BaseEvaluator(ABC):
    """Common interface that every dataset evaluator must implement."""

    @abstractmethod
    def evaluate(self, model, tokenizer) -> dict:
        """
        Run the benchmark against *model*.

        Parameters
        ----------
        model : callable
            Maps a sequence of token IDs to a 2-D sequence of logits with
            shape ``(len(input_ids), vocab_size)``.
        tokenizer : object
            Provides an ``encode(text: str) -> list[int]`` method.

        Returns
        -------
        dict
            Benchmark-specific evaluation results.
        """
5 changes: 5 additions & 0 deletions openverifiablellm/eval/bias/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Bias evaluators (currently only the WinoBias gender-bias benchmark)."""

from .wino_bias import WinoBiasEvaluator

__all__ = [
    "WinoBiasEvaluator",
]
91 changes: 91 additions & 0 deletions openverifiablellm/eval/bias/wino_bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
openverifiablellm/eval/bias/wino_bias.py

Gender-bias evaluator using the WinoBias benchmark.
"""

import math
from typing import Optional

from ..base import BaseEvaluator
from ..perplexity import PerplexityEvaluator


class WinoBiasEvaluator(BaseEvaluator):
    """
    Evaluates gender bias in a language model using the WinoBias benchmark.

    For each sentence pair (pro-stereotype / anti-stereotype) the model's
    perplexity is computed via the same teacher-forced method used by
    :class:`PerplexityEvaluator`. A lower ``bias_score`` indicates a less
    biased model.

    Parameters
    ----------
    n_samples : int or None
        Maximum number of sentences to load from each WinoBias split.
        ``None`` evaluates the full dataset. Default ``None``.
    """

    def __init__(self, n_samples: Optional[int] = None):
        # None means "no cap": score every sentence in each split.
        self.n_samples = n_samples

    def evaluate(self, model, tokenizer) -> dict:
        """
        Compute stereotype and anti-stereotype perplexity scores.

        Loads ``type1_pro`` (pro-stereotype) and ``type1_anti``
        (anti-stereotype) splits of WinoBias and measures how much more
        easily the model predicts gender-stereotypical sentences than
        counter-stereotypical ones.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``, as described in
            :meth:`PerplexityEvaluator.compute_sentence_perplexity`.
        tokenizer : object
            Object with ``encode(text: str) -> list[int]``.

        Returns
        -------
        dict
            A dictionary with the following keys:

            * **stereotype_score** (*float*) — mean perplexity on
              pro-stereotype sentences.
            * **anti_stereotype_score** (*float*) — mean perplexity on
              anti-stereotype sentences.
            * **bias_score** (*float*) —
              ``abs(stereotype_score - anti_stereotype_score)``;
              lower means less biased.
        """
        import datasets as hf_datasets  # deferred; runtime dep

        pro_ds = hf_datasets.load_dataset("wino_bias", "type1_pro", split="test")
        anti_ds = hf_datasets.load_dataset("wino_bias", "type1_anti", split="test")

        def _score_split(dataset) -> float:
            # Mean perplexity over (up to n_samples) non-empty sentences.
            scores = []
            for i, row in enumerate(dataset):
                if self.n_samples is not None and i >= self.n_samples:
                    break
                tokens = row.get("tokens", [])
                text = " ".join(tokens) if isinstance(tokens, list) else str(tokens)
                if not text.strip():
                    continue
                token_ids = tokenizer.encode(text)
                scores.append(
                    PerplexityEvaluator.compute_sentence_perplexity(model, token_ids)
                )
            # compute_sentence_perplexity returns inf for sequences with < 2
            # tokens; drop non-finite scores so a single degenerate sentence
            # cannot make the entire split mean infinite.
            finite_scores = [s for s in scores if math.isfinite(s)]
            if not finite_scores:
                return float("inf")
            return float(sum(finite_scores) / len(finite_scores))

        stereotype_score = _score_split(pro_ds)
        anti_stereotype_score = _score_split(anti_ds)
        # abs() makes the metric symmetric in which direction the bias runs.
        bias_score = abs(stereotype_score - anti_stereotype_score)

        return {
            "stereotype_score": stereotype_score,
            "anti_stereotype_score": anti_stereotype_score,
            "bias_score": bias_score,
        }
151 changes: 151 additions & 0 deletions openverifiablellm/eval/perplexity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""
openverifiablellm/eval/perplexity.py

Perplexity evaluator for language models.
"""

import math
from typing import List, Optional

from .base import BaseEvaluator


class PerplexityEvaluator(BaseEvaluator):
    """
    Evaluates language-model perplexity on a HuggingFace benchmark dataset.

    Perplexity is computed with a teacher-forced approach: for each token
    position *i* the model receives tokens ``[0 .. i-1]`` and the negative
    log-probability of token ``[i]`` is accumulated. The final perplexity
    is ``exp(mean_NLL)``.

    Parameters
    ----------
    benchmark : str
        HuggingFace dataset identifier. Default ``"wikitext"``.
        NOTE(review): some hub datasets (wikitext included) also require a
        config name — confirm this identifier loads with ``split="test"``.
    n_samples : int or None
        Maximum number of non-empty samples to evaluate. ``None`` means
        evaluate the whole dataset. Default ``50``.
    stride : int
        Window stride used when the sequence exceeds the model's context
        window. Default ``512``.
    """

    def __init__(
        self,
        benchmark: str = "wikitext",
        n_samples: Optional[int] = 50,
        stride: int = 512,
    ):
        self.benchmark = benchmark
        self.n_samples = n_samples
        # NOTE(review): `stride` is stored but never read —
        # compute_sentence_perplexity scores the whole sequence in a single
        # window. Confirm whether sliding-window evaluation is still planned.
        self.stride = stride

    # ------------------------------------------------------------------
    # Mock helpers
    # ------------------------------------------------------------------

    @staticmethod
    def uniform_model(vocab_size: int = 1000):
        """
        Return a mock model that produces uniform (all-zero) logits.

        Useful for unit testing: because all logits are equal, the
        log-softmax is ``-log(vocab_size)`` at every position, giving a
        predictable perplexity of exactly ``vocab_size``.

        Parameters
        ----------
        vocab_size : int
            Vocabulary size of the mock model. Default ``1000``.

        Returns
        -------
        callable
            ``model(input_ids) -> list[list[float]]`` of shape
            ``(len(input_ids), vocab_size)``.
        """

        def _model(input_ids):
            return [[0.0] * vocab_size for _ in input_ids]

        return _model

    # ------------------------------------------------------------------
    # Core computation
    # ------------------------------------------------------------------

    @staticmethod
    def compute_sentence_perplexity(model, token_ids: List[int]) -> float:
        """
        Compute the perplexity of *token_ids* under *model*.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``.
        token_ids : list[int]
            Tokenised sentence.

        Returns
        -------
        float
            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
            shorter than 2 tokens.
        """
        # At least one (input, target) pair is required for a defined NLL.
        if len(token_ids) < 2:
            return float("inf")

        inputs = token_ids[:-1]
        targets = token_ids[1:]

        logits_batch = model(inputs)  # shape: (n-1, vocab_size)

        nll_sum = 0.0
        for logits, target in zip(logits_batch, targets):
            # Numerically-stable log-softmax: shift by the max logit before
            # exponentiating to avoid overflow.
            max_l = max(logits)
            exp_shifted = [math.exp(v - max_l) for v in logits]
            log_sum = math.log(sum(exp_shifted))
            log_prob_target = (logits[target] - max_l) - log_sum
            nll_sum -= log_prob_target

        return math.exp(nll_sum / len(targets))

    # ------------------------------------------------------------------
    # BaseEvaluator interface
    # ------------------------------------------------------------------

    def evaluate(self, model, tokenizer) -> dict:
        """
        Compute mean perplexity on *self.benchmark*.

        Parameters
        ----------
        model : callable
            Callable as described in :meth:`compute_sentence_perplexity`.
        tokenizer : object
            Object with ``encode(text: str) -> list[int]``.

        Returns
        -------
        dict
            ``{"perplexity": float}`` — mean perplexity across evaluated
            sentences (non-finite per-sentence scores are excluded).
        """
        import datasets as hf_datasets  # deferred; runtime dep

        ds = hf_datasets.load_dataset(self.benchmark, split="test", streaming=True)
        scores = []
        for i, row in enumerate(ds):
            if self.n_samples is not None and i >= self.n_samples:
                break
            text = row.get("text", "")
            if not text.strip():
                continue
            token_ids = tokenizer.encode(text)
            scores.append(self.compute_sentence_perplexity(model, token_ids))

        # A non-empty text can still tokenize to < 2 tokens, which yields
        # inf; filter non-finite scores so one degenerate sample cannot make
        # the whole benchmark mean infinite.
        finite_scores = [s for s in scores if math.isfinite(s)]
        mean_ppl = (
            float(sum(finite_scores) / len(finite_scores))
            if finite_scores
            else float("inf")
        )
        return {"perplexity": mean_ppl}
11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@ authors = [
requires-python = ">=3.9"

dependencies = [
"datasets",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]
Comment on lines 14 to 19
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

find . -name "pyproject.toml" -type f

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 88


🏁 Script executed:

cat -n ./pyproject.toml

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1676


Add numpy and torch to required dependencies.

These are core LLM dependencies for this project and should be in the required dependency list, not optional.

Suggested patch
 dependencies = [
     "datasets",
+    "numpy",
+    "torch",
     "defusedxml",
     "sentencepiece",
     "tokenizers==0.15.2"
 ]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
dependencies = [
"datasets",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]
dependencies = [
"datasets",
"numpy",
"torch",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pyproject.toml` around lines 14 - 19, The dependencies list in pyproject.toml
is missing core LLM packages; add "numpy" and "torch" to the existing
dependencies array (alongside "datasets", "defusedxml", "sentencepiece",
"tokenizers==0.15.2") so they are installed as required dependencies; update the
dependencies section to include "numpy" and "torch" entries and ensure versions
are specified if necessary for compatibility.


# Intentionally duplicated from [dependency-groups] below.
# pip uses this section; uv/PEP 735 uses [dependency-groups]. Keep both in sync.
[project.optional-dependencies]
dev = [
"pytest>=7.0",
"ruff>=0.15.4",
]

[tool.setuptools.packages.find]
include = ["openverifiablellm*"]

# Intentionally duplicated from [project.optional-dependencies] above.
# uv/PEP 735 uses this section; pip uses [project.optional-dependencies]. Keep both in sync.
[dependency-groups]
dev = [
"pytest>=7.0",
Expand Down
Loading
Loading