Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -333,4 +333,5 @@ __pycache__/
*.pyd
*.bz2

*.venv
*.venv
venv/
21 changes: 19 additions & 2 deletions openverifiablellm/tokenizer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,30 @@ def __init__(self, vocab_size: int, min_frequency: int):

@abstractmethod
def train(self, text_file: Path, save_path: Path):
    """Train tokenizer on a text corpus and save artifacts to save_path.

    Args:
        text_file: Path to the training text corpus.
        save_path: Directory where tokenizer artifacts are written.
    """
    # Diff scrape left both the old and new docstring lines in place;
    # a def body must carry exactly one docstring.
    pass

@abstractmethod
def encode(self, text: str) -> list[int]:
    """Convert *text* into its sequence of integer token ids."""
    ...

@abstractmethod
def decode(self, ids: list[int]) -> str:
    """Reconstruct the text represented by a sequence of token ids."""
    ...

@abstractmethod
def load(self, tokenizer_dir: Path):
    """Restore a previously trained tokenizer from *tokenizer_dir*."""
    ...
Comment on lines 25 to +37
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "BaseTokenizer abstract method declarations:"
rg -nP '^\s*@abstractmethod\s*$|^\s*def (encode|decode|load)\(' openverifiablellm/tokenizer/base.py -C1

echo
echo "SentencePieceTokenizer implementations for the same methods:"
rg -nP '^\s*def (encode|decode|load)\(' openverifiablellm/tokenizer/sentencepiece_tokenizer.py -C1 || true

echo
echo "Factory path that instantiates SentencePieceTokenizer:"
rg -nP 'SentencePieceTokenizer|tokenizer_type == "sentencepiece"' openverifiablellm/tokenizer/factory.py -C2

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1296


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== SentencePieceTokenizer class definition and all methods ==="
cat -n openverifiablellm/tokenizer/sentencepiece_tokenizer.py

echo
echo "=== Checking parent class(es) ==="
rg -P 'class SentencePieceTokenizer' openverifiablellm/tokenizer/sentencepiece_tokenizer.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1084


SentencePieceTokenizer is missing required abstract method implementations.

SentencePieceTokenizer does not implement three abstract methods from BaseTokenizer: encode, decode, and load. This will cause instantiation to fail at runtime when the factory attempts to create the tokenizer for tokenizer_type == "sentencepiece".

Implement encode, decode, and load in SentencePieceTokenizer, or remove the abstract requirement until the class is fully implemented.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/base.py` around lines 25 - 37, The
SentencePieceTokenizer class is missing concrete implementations for
BaseTokenizer's abstract methods encode, decode, and load; implement
encode(self, text: str) -> list[int] to return token ids (use the
SentencePieceProcessor EncodeAsIds or equivalent on the internal model),
decode(self, ids: list[int]) -> str to return text (use DecodeIds/DecodePieces
on the processor), and load(self, tokenizer_dir: Path) to initialize and load
the SentencePiece model file into the tokenizer instance (e.g., create/assign
self.sp_model and call its Load with the model filepath); update
SentencePieceTokenizer to define these three methods so instantiation for
tokenizer_type == "sentencepiece" succeeds.

pass

@abstractmethod
def get_vocab_path(self, tokenizer_dir: Path) -> Path:
    """Return the location of the vocabulary file under *tokenizer_dir*."""
    ...

@abstractmethod
def get_merges_path(self, tokenizer_dir: Path):
def get_merges_path(self, tokenizer_dir: Path) -> Path | None:
"""Return path to the merges file, or None if not applicable."""
pass
141 changes: 139 additions & 2 deletions openverifiablellm/tokenizer/bpe_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,43 @@


class BPETokenizer(BaseTokenizer):
"""
Byte-level BPE tokenizer implementation.

Wraps HuggingFace's ByteLevelBPETokenizer and implements
the full BaseTokenizer interface including train, encode,
decode, and load.
"""

def __init__(self, vocab_size: int, min_frequency: int):
    """Record BPE training configuration; no model is loaded yet."""
    super().__init__(vocab_size, min_frequency)
    # Underlying HuggingFace tokenizer; populated by train() or load().
    self._tokenizer = None

# ------------------------------------------------------------------
# Training
# ------------------------------------------------------------------

def train(self, text_file: Path, save_path: Path):
"""
Train BPE tokenizer on text corpus and save artifacts.

Args:
text_file: Path to training text corpus.
save_path: Directory to save vocab.json and merges.txt.

Raises:
FileNotFoundError: If text_file does not exist or is not a file.
"""

text_file = Path(text_file)
save_path = Path(save_path)

if not text_file.is_file():
raise FileNotFoundError(
f"Training file not found at {text_file}. Please provide a valid text corpus file."
)

save_path.mkdir(parents=True, exist_ok=True)

tokenizer = ByteLevelBPETokenizer()

Expand All @@ -21,8 +57,109 @@ def train(self, text_file: Path, save_path: Path):

tokenizer.save_model(str(save_path))

self._tokenizer = tokenizer

# ------------------------------------------------------------------
# Encode / Decode
# ------------------------------------------------------------------

def encode(self, text: str) -> list[int]:
    """
    Encode text into a list of token ids.

    Args:
        text: Input string to tokenize.

    Returns:
        List of integer token ids.

    Raises:
        RuntimeError: If tokenizer has not been trained or loaded.
    """
    self._check_loaded()
    encoding = self._tokenizer.encode(text)
    return encoding.ids

def decode(self, ids: list[int]) -> str:
    """
    Decode a list of token ids back into text.

    Args:
        ids: List of integer token ids.

    Returns:
        Decoded string.

    Raises:
        RuntimeError: If tokenizer has not been trained or loaded.
    """
    self._check_loaded()
    # Keep special tokens visible in the round-tripped text.
    text = self._tokenizer.decode(ids, skip_special_tokens=False)
    return text

# ------------------------------------------------------------------
# Load
# ------------------------------------------------------------------

def load(self, tokenizer_dir: Path):
    """
    Load a previously trained BPE tokenizer from disk.

    Args:
        tokenizer_dir: Directory containing vocab.json and merges.txt.

    Raises:
        FileNotFoundError: If vocab.json or merges.txt are not found.
    """
    tokenizer_dir = Path(tokenizer_dir)

    # Single-source the artifact paths via the helper accessors instead of
    # re-deriving "vocab.json"/"merges.txt" literals here.
    vocab_path = self.get_vocab_path(tokenizer_dir)
    merges_path = self.get_merges_path(tokenizer_dir)

    if not vocab_path.is_file():
        raise FileNotFoundError(
            f"vocab.json not found at {vocab_path}. Please train the tokenizer first."
        )

    if not merges_path.is_file():
        raise FileNotFoundError(
            f"merges.txt not found at {merges_path}. Please train the tokenizer first."
        )

    self._tokenizer = ByteLevelBPETokenizer(
        vocab=str(vocab_path),
        merges=str(merges_path),
    )

    # Re-register special tokens after loading: save_model() does not
    # persist them, so they must be added back on every load.
    self._tokenizer.add_special_tokens(SPECIAL_TOKENS)

# ------------------------------------------------------------------
# Artifact paths
# ------------------------------------------------------------------

def get_vocab_path(self, tokenizer_dir: Path) -> Path:
    """Return path to vocab.json file."""
    # Coerce to Path so str arguments also work.
    return Path(tokenizer_dir) / "vocab.json"

def get_merges_path(self, tokenizer_dir: Path) -> Path:
return tokenizer_dir / "merges.txt"
"""Return path to merges.txt file."""
return Path(tokenizer_dir) / "merges.txt"
Comment on lines 146 to +148
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Return type mismatch with base class.

The base class BaseTokenizer.get_merges_path() declares return type Path | None, but this implementation declares Path. While BPE always has merges, the return type should match the base class contract for type safety and LSP compliance.

♻️ Proposed fix
-    def get_merges_path(self, tokenizer_dir: Path) -> Path:
+    def get_merges_path(self, tokenizer_dir: Path) -> Path | None:
         """Return path to merges.txt file."""
         return Path(tokenizer_dir) / "merges.txt"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/bpe_tokenizer.py` around lines 146 - 148, The
method signature for get_merges_path in
openverifiablellm/tokenizer/bpe_tokenizer.py currently returns Path but must
match the base class contract BaseTokenizer.get_merges_path() which declares
Path | None; change the signature of get_merges_path(self, tokenizer_dir: Path)
-> Path | None (or Optional[Path]) and keep returning the Path(tokenizer_dir) /
"merges.txt" so the implementation remains correct while the declared return
type matches the base class; add or adjust the typing import (from typing import
Optional) if needed.


# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------

def _check_loaded(self):
"""
Check that tokenizer is loaded before encode/decode.

Raises:
RuntimeError: If tokenizer has not been trained or loaded.
"""

if self._tokenizer is None:
raise RuntimeError(
"BPE tokenizer is not loaded. Call train() or load() before encode/decode."
)
16 changes: 9 additions & 7 deletions openverifiablellm/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,15 +386,17 @@ def verify_preprocessing(
"environment_hash",
expected=manifest.get("environment_hash"),
actual=current_env["environment_hash"],
detail="Environment fingerprint comparison"
detail="Environment fingerprint comparison",
)
else:
report.add(CheckResult(
name="environment_hash",
status=CheckStatus.SKIP,
detail="Field absent from manifest (older version)"
))

report.add(
CheckResult(
name="environment_hash",
status=CheckStatus.SKIP,
detail="Field absent from manifest (older version)",
)
)

# 4. Re-run preprocessing in an isolated temp directory
tmp_dir = Path(tempfile.mkdtemp(prefix="ovllm_verify_"))
try:
Expand Down
Loading
Loading