2 changes: 2 additions & 0 deletions openverifiablellm/tokenizer/__init__.py
@@ -1,6 +1,8 @@
from .train import hash_tokenizer_config, train_tokenizer
from .tokenize_dataset import tokenize_dataset

__all__ = [
"train_tokenizer",
"hash_tokenizer_config",
"tokenize_dataset",
]
64 changes: 64 additions & 0 deletions openverifiablellm/tokenizer/tokenize_dataset.py
@@ -0,0 +1,64 @@
from pathlib import Path
import numpy as np


def tokenize_dataset(input_file, tokenizer, output_file):

⚠️ Potential issue | 🟠 Major

tokenize_dataset is not compatible with the repository tokenizer classes.

Line 47 assumes tokenizer.encode(...) exists, but openverifiablellm/tokenizer/factory.py returns tokenizer classes (e.g., openverifiablellm/tokenizer/bpe_tokenizer.py::BPETokenizer) that do not expose encode. This will fail at runtime for the package’s own tokenizer instances and break the intended trained-tokenizer workflow.

Suggested hardening (clear failure mode in this function)
 def tokenize_dataset(input_file, tokenizer, output_file):
+    if not hasattr(tokenizer, "encode"):
+        raise TypeError(
+            f"tokenizer must expose encode(text); got {type(tokenizer).__name__}"
+        )

Also applies to: 47-47

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/tokenize_dataset.py` at line 5, tokenize_dataset
currently calls tokenizer.encode(...) which doesn't exist on the repository
tokenizers; update tokenize_dataset to handle the tokenizer interface robustly:
check for methods in order (hasattr(tokenizer, "encode") -> use it; elif
hasattr(tokenizer, "tokenize") -> call tokenizer.tokenize(...) and then convert
tokens to ids via tokenizer.convert_tokens_to_ids or tokenizer.tokens_to_ids if
present; elif hasattr(tokenizer, "encode_batch") use that; otherwise raise a
clear error mentioning tokenize_dataset and the missing methods. Ensure you
reference the tokenizer instance and preserve behavior for batching and special
tokens when available.
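
One way to realize that fallback order, sketched as a small helper. Apart from encode(), every method name used here (tokenize, convert_tokens_to_ids, tokens_to_ids, encode_batch) comes from the prompt above and is an assumption about what the repository tokenizers actually expose:

def _encode_text(tokenizer, text):
    # Preferred path: an encode() method returning ids or an Encoding-like object.
    if hasattr(tokenizer, "encode"):
        return tokenizer.encode(text)
    # Fallback: tokenize() plus a token-to-id converter, if one is available.
    if hasattr(tokenizer, "tokenize"):
        tokens = tokenizer.tokenize(text)
        for converter in ("convert_tokens_to_ids", "tokens_to_ids"):
            if hasattr(tokenizer, converter):
                return getattr(tokenizer, converter)(tokens)
        return tokens
    # Last resort: a batch API called with a single-element batch.
    if hasattr(tokenizer, "encode_batch"):
        return tokenizer.encode_batch([text])[0]
    raise TypeError(
        "tokenize_dataset: tokenizer exposes none of encode/tokenize/encode_batch; "
        f"got {type(tokenizer).__name__}"
    )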

"""
Tokenize a dataset using a trained tokenizer and save tokens to a binary file.

The implementation streams the input line by line, so very large datasets
can be tokenized without loading everything into memory.

Parameters
----------
input_file : str or Path
Path to the cleaned dataset text file.

tokenizer : object
A tokenizer instance with an `encode()` method.

output_file : str or Path
Path where tokenized binary output will be written.

Returns
-------
int
Total number of tokens written.
"""

input_path = Path(input_file)
output_path = Path(output_file)

if not input_path.exists():
raise FileNotFoundError(f"Dataset file not found: {input_path}")

total_tokens = 0

# Stream the input line by line and append encoded tokens to the binary output.
with input_path.open("r", encoding="utf-8") as fin, \
output_path.open("wb") as fout:

for line in fin:
text = line.strip()

if not text:
continue

encoded = tokenizer.encode(text)

if isinstance(encoded, list):
tokens = encoded
elif hasattr(encoded, "ids"):
tokens = encoded.ids
else:
raise TypeError(
f"Tokenizer.encode() returned unsupported type: {type(encoded).__name__}. "
"Expected list or object with 'ids' attribute."
)

tokens_array = np.array(tokens, dtype=np.uint32)
tokens_array.tofile(fout)

total_tokens += len(tokens)

return total_tokens
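
A usage sketch (not part of the diff): because the function writes raw uint32 token ids, the output can be memory-mapped straight back with NumPy. ByteTokenizer and the file names below are placeholders; the only assumption about the real tokenizer is an encode(text) -> list[int] method.

from pathlib import Path
import numpy as np
from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset


class ByteTokenizer:
    # Placeholder: any object exposing encode(text) -> list[int] works here.
    def encode(self, text):
        return list(text.encode("utf-8"))


Path("clean.txt").write_text("hello\nworld\n", encoding="utf-8")
n_tokens = tokenize_dataset("clean.txt", ByteTokenizer(), "tokens.bin")

# The output is a flat stream of uint32 ids, so it can be memory-mapped
# (or read with np.fromfile) when assembling training batches.
tokens = np.memmap("tokens.bin", dtype=np.uint32, mode="r")
assert tokens.shape[0] == n_tokens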
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -14,7 +14,8 @@ requires-python = ">=3.9"
dependencies = [
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
"tokenizers==0.15.2",
"numpy>=1.20"
]

[tool.setuptools.packages.find]
64 changes: 63 additions & 1 deletion tests/test_tokenizer.py
@@ -1,12 +1,14 @@
import json

import pytest

from pathlib import Path
import numpy as np
from openverifiablellm.tokenizer import (
hash_tokenizer_config,
train_tokenizer,
)

from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset

@pytest.fixture
def sample_text_file(tmp_path):
@@ -166,3 +168,63 @@ def test_hash_tokenizer_missing_merges(tmp_path):

with pytest.raises(FileNotFoundError):
hash_tokenizer_config(tokenizer_path)



class DummyTokenizer:
def encode(self, text):
return [ord(c) for c in text]


def test_tokenize_dataset_creates_output(tmp_path):

dataset = tmp_path / "dataset.txt"
dataset.write_text("hello\nworld,\n", encoding="utf-8")

output = tmp_path / "tokens.bin"

tokenizer = DummyTokenizer()

total_tokens = tokenize_dataset(dataset, tokenizer, output)

assert output.exists()
assert total_tokens > 0


def test_tokenize_dataset_deterministic(tmp_path):

dataset = tmp_path / "dataset.txt"
dataset.write_text("test data\nanother line", encoding="utf-8")

output1 = tmp_path / "tokens1.bin"
output2 = tmp_path / "tokens2.bin"

tokenizer = DummyTokenizer()

tokenize_dataset(dataset, tokenizer, output1)
tokenize_dataset(dataset, tokenizer, output2)

data1 = np.fromfile(output1, dtype=np.uint32)
data2 = np.fromfile(output2, dtype=np.uint32)

assert np.array_equal(data1, data2)

def test_tokenize_dataset_missing_file(tmp_path):
tokenizer = DummyTokenizer()
output = tmp_path / "tokens.bin"

with pytest.raises(FileNotFoundError):
tokenize_dataset(tmp_path / "missing.txt", tokenizer, output)


def test_tokenize_dataset_empty_file(tmp_path):
dataset = tmp_path / "empty.txt"
dataset.write_text("", encoding="utf-8")

output = tmp_path / "tokens.bin"

total = tokenize_dataset(dataset, DummyTokenizer(), output)

assert total == 0
assert output.exists()
assert output.stat().st_size == 0
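
A further test, sketched here and not part of the diff, could cover the branch where encode() returns an object carrying an ids attribute (the shape of Encoding objects from the Hugging Face tokenizers library):

class DummyEncoding:
    def __init__(self, ids):
        self.ids = ids


class DummyIdsTokenizer:
    # Mimics tokenizers whose encode() returns an Encoding-like object.
    def encode(self, text):
        return DummyEncoding([ord(c) for c in text])


def test_tokenize_dataset_accepts_encoding_objects(tmp_path):
    dataset = tmp_path / "dataset.txt"
    dataset.write_text("abc", encoding="utf-8")
    output = tmp_path / "tokens.bin"

    total = tokenize_dataset(dataset, DummyIdsTokenizer(), output)

    data = np.fromfile(output, dtype=np.uint32)
    assert total == 3
    assert np.array_equal(data, np.array([97, 98, 99], dtype=np.uint32))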