Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -333,4 +333,5 @@ __pycache__/
*.pyd
*.bz2

*.venv
venv/
21 changes: 19 additions & 2 deletions openverifiablellm/tokenizer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,30 @@ def __init__(self, vocab_size: int, min_frequency: int):

@abstractmethod
def train(self, text_file: Path, save_path: Path):
    """Train tokenizer on a text corpus and save artifacts to save_path.

    Args:
        text_file: Path to the training text corpus.
        save_path: Directory where tokenizer artifacts are written.
    """
    pass

@abstractmethod
def encode(self, text: str) -> list:
    """Convert *text* into its sequence of integer token ids."""
    ...

@abstractmethod
def decode(self, ids: list) -> str:
    """Reconstruct the original text from a sequence of integer token ids."""
    ...

@abstractmethod
def load(self, tokenizer_dir: Path):
    """Load a previously trained tokenizer from disk.

    Args:
        tokenizer_dir: Directory containing the trained tokenizer artifacts.
    """
    pass

@abstractmethod
def get_vocab_path(self, tokenizer_dir: Path) -> Path:
    """Return the location of the vocabulary file inside *tokenizer_dir*."""
    ...

@abstractmethod
def get_merges_path(self, tokenizer_dir: Path):
    """Return path to the merges file, or None if not applicable."""
    pass
144 changes: 142 additions & 2 deletions openverifiablellm/tokenizer/bpe_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,44 @@


class BPETokenizer(BaseTokenizer):
    """Byte-level BPE tokenizer backed by HuggingFace's ByteLevelBPETokenizer.

    Implements the full BaseTokenizer interface: train, encode,
    decode, load, plus the artifact-path helpers.
    """

    def __init__(self, vocab_size: int, min_frequency: int):
        super().__init__(vocab_size, min_frequency)
        # Wrapped HuggingFace tokenizer; stays None until train() or
        # load() has been called.
        self._tokenizer = None

# ------------------------------------------------------------------
# Training
# ------------------------------------------------------------------

def train(self, text_file: Path, save_path: Path):
    """
    Train BPE tokenizer on text corpus and save artifacts.

    Args:
        text_file: Path to training text corpus.
        save_path: Directory to save vocab.json and merges.txt.

    Raises:
        FileNotFoundError: If text_file does not exist or is not a file.
    """
    text_file = Path(text_file)
    save_path = Path(save_path)

    if not text_file.is_file():
        raise FileNotFoundError(
            f"Training file not found at {text_file}. "
            f"Please provide a valid text corpus file."
        )

    tokenizer = ByteLevelBPETokenizer()

    # NOTE(review): the training-argument lines were collapsed in the
    # diff view; reconstructed from the visible tail and the standard
    # ByteLevelBPETokenizer.train signature — confirm against the repo.
    tokenizer.train(
        files=[str(text_file)],
        vocab_size=self.vocab_size,
        min_frequency=self.min_frequency,
        special_tokens=SPECIAL_TOKENS,
    )

    # Create the output directory exactly once, before save_model()
    # writes vocab.json and merges.txt (previously mkdir ran twice).
    save_path.mkdir(parents=True, exist_ok=True)

    tokenizer.save_model(str(save_path))

    self._tokenizer = tokenizer

# ------------------------------------------------------------------
# Encode / Decode
# ------------------------------------------------------------------

def encode(self, text: str) -> list:
"""
Encode text into a list of token ids.

Args:
text: Input string to tokenize.

Returns:
List of integer token ids.

Raises:
RuntimeError: If tokenizer has not been trained or loaded.
"""

self._check_loaded()
return self._tokenizer.encode(text).ids

def decode(self, ids: list) -> str:
"""
Decode a list of token ids back into text.

Args:
ids: List of integer token ids.

Returns:
Decoded string.

Raises:
RuntimeError: If tokenizer has not been trained or loaded.
"""

self._check_loaded()
return self._tokenizer.decode(ids)

# ------------------------------------------------------------------
# Load
# ------------------------------------------------------------------

def load(self, tokenizer_dir: Path):
"""
Load a previously trained BPE tokenizer from disk.

Args:
tokenizer_dir: Directory containing vocab.json and merges.txt.

Raises:
FileNotFoundError: If vocab.json or merges.txt are not found.
"""

tokenizer_dir = Path(tokenizer_dir)

vocab_path = tokenizer_dir / "vocab.json"
merges_path = tokenizer_dir / "merges.txt"
Comment on lines +117 to +118
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Use path helper methods inside load() to avoid duplication.

Line 117–118 redefines artifact paths that are already centralized in get_vocab_path() / get_merges_path(). Reusing those helpers keeps path logic single-sourced.

♻️ Proposed refactor
-        vocab_path = tokenizer_dir / "vocab.json"
-        merges_path = tokenizer_dir / "merges.txt"
+        vocab_path = self.get_vocab_path(tokenizer_dir)
+        merges_path = self.get_merges_path(tokenizer_dir)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/bpe_tokenizer.py` around lines 117 - 118, In
load(), remove the local redefinition of vocab_path and merges_path and instead
call the centralized helpers get_vocab_path(tokenizer_dir) and
get_merges_path(tokenizer_dir) (or the parameter signatures used in this module)
so path logic is single-sourced; replace uses of the local vocab_path and
merges_path variables with the returned paths from those helper functions and
ensure any downstream code still receives the same Path objects.


if not vocab_path.is_file():
raise FileNotFoundError(
f"vocab.json not found at {vocab_path}. "
f"Please train the tokenizer first."
)

if not merges_path.is_file():
raise FileNotFoundError(
f"merges.txt not found at {merges_path}. "
f"Please train the tokenizer first."
)

self._tokenizer = ByteLevelBPETokenizer(
vocab=str(vocab_path),
merges=str(merges_path),
)

# ------------------------------------------------------------------
# Artifact paths
# ------------------------------------------------------------------

def get_vocab_path(self, tokenizer_dir: Path) -> Path:
    """Return path to vocab.json file."""
    # Coerce to Path so str arguments also work.
    return Path(tokenizer_dir) / "vocab.json"

def get_merges_path(self, tokenizer_dir: Path) -> Path:
    """Return path to merges.txt file."""
    # Coerce to Path so str arguments also work.
    return Path(tokenizer_dir) / "merges.txt"
Comment on lines 146 to +148
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Return type mismatch with base class.

The base class BaseTokenizer.get_merges_path() declares return type Path | None, but this implementation declares Path. While BPE always has merges, the return type should match the base class contract for type safety and LSP compliance.

♻️ Proposed fix
-    def get_merges_path(self, tokenizer_dir: Path) -> Path:
+    def get_merges_path(self, tokenizer_dir: Path) -> Path | None:
         """Return path to merges.txt file."""
         return Path(tokenizer_dir) / "merges.txt"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/bpe_tokenizer.py` around lines 146 - 148, The
method signature for get_merges_path in
openverifiablellm/tokenizer/bpe_tokenizer.py currently returns Path but must
match the base class contract BaseTokenizer.get_merges_path() which declares
Path | None; change the signature of get_merges_path(self, tokenizer_dir: Path)
-> Path | None (or Optional[Path]) and keep returning the Path(tokenizer_dir) /
"merges.txt" so the implementation remains correct while the declared return
type matches the base class; add or adjust the typing import (from typing import
Optional) if needed.


# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------

def _check_loaded(self):
"""
Check that tokenizer is loaded before encode/decode.

Raises:
RuntimeError: If tokenizer has not been trained or loaded.
"""

if self._tokenizer is None:
raise RuntimeError(
"BPE tokenizer is not loaded. "
"Call train() or load() before encode/decode."
)
Loading