2 changes: 2 additions & 0 deletions openverifiablellm/tokenizer/__init__.py
@@ -4,3 +4,5 @@
"train_tokenizer",
"hash_tokenizer_config",
]

from .tokenize_dataset import tokenize_dataset
62 changes: 62 additions & 0 deletions openverifiablellm/tokenizer/tokenize_dataset.py
@@ -0,0 +1,62 @@
from pathlib import Path
import numpy as np


def tokenize_dataset(input_file, tokenizer, output_file):
⚠️ Potential issue | 🟠 Major

tokenize_dataset is not compatible with the repository tokenizer classes.

Line 47 assumes tokenizer.encode(...) exists, but openverifiablellm/tokenizer/factory.py returns tokenizer classes (e.g., openverifiablellm/tokenizer/bpe_tokenizer.py::BPETokenizer) that do not expose encode. This will fail at runtime for the package’s own tokenizer instances and breaks the intended trained-tokenizer workflow.

Suggested hardening (clear failure mode in this function)
 def tokenize_dataset(input_file, tokenizer, output_file):
+    if not hasattr(tokenizer, "encode"):
+        raise TypeError(
+            f"tokenizer must expose encode(text); got {type(tokenizer).__name__}"
+        )
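With this guard in place, an incompatible tokenizer fails immediately with a descriptive TypeError at call time, instead of surfacing as an AttributeError on the first non-empty line inside the read loop.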

Also applies to: 47-47

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/tokenize_dataset.py` at line 5, tokenize_dataset
currently calls tokenizer.encode(...) which doesn't exist on the repository
tokenizers; update tokenize_dataset to handle the tokenizer interface robustly:
check for methods in order (hasattr(tokenizer, "encode") -> use it; elif
hasattr(tokenizer, "tokenize") -> call tokenizer.tokenize(...) and then convert
tokens to ids via tokenizer.convert_tokens_to_ids or tokenizer.tokens_to_ids if
present; elif hasattr(tokenizer, "encode_batch") use that; otherwise raise a
clear error mentioning tokenize_dataset and the missing methods. Ensure you
reference the tokenizer instance and preserve behavior for batching and special
tokens when available.
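For illustration only, a minimal sketch of that fallback chain. The helper name and the alternative method names (tokenize, convert_tokens_to_ids, tokens_to_ids, encode_batch) are taken from the prompt above and are not verified against this repository's tokenizer classes, so treat the whole function as hypothetical:

def _resolve_token_ids(tokenizer, text):
    # Hypothetical helper: pick whichever interface the tokenizer exposes.
    if hasattr(tokenizer, "encode"):
        encoded = tokenizer.encode(text)
        return encoded if isinstance(encoded, list) else encoded.ids
    if hasattr(tokenizer, "tokenize"):
        tokens = tokenizer.tokenize(text)
        for converter in ("convert_tokens_to_ids", "tokens_to_ids"):
            if hasattr(tokenizer, converter):
                return getattr(tokenizer, converter)(tokens)
    if hasattr(tokenizer, "encode_batch"):
        # Assumes encode_batch returns Encoding-like objects exposing `.ids`.
        return tokenizer.encode_batch([text])[0].ids
    raise TypeError(
        f"tokenize_dataset: {type(tokenizer).__name__} exposes none of "
        "encode/tokenize/encode_batch"
    )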

"""
Tokenize a dataset using a trained tokenizer and save tokens to a binary file.

This implementation is streaming and memory-efficient, meaning it can handle
very large datasets without loading everything into memory.

Parameters
----------
input_file : str or Path
Path to the cleaned dataset text file.

tokenizer : object
A tokenizer instance with an `encode()` method.

output_file : str or Path
Path where tokenized binary output will be written.

Returns
-------
int
Total number of tokens written.
"""

    input_path = Path(input_file)
    output_path = Path(output_file)

    if not input_path.exists():
        raise FileNotFoundError(f"Dataset file not found: {input_path}")

    total_tokens = 0

    # open dataset for streaming
    with input_path.open("r", encoding="utf-8") as fin, \
         output_path.open("wb") as fout:

        for line in fin:

            text = line.strip()

            if not text:
                continue

            encoded = tokenizer.encode(text)

            if isinstance(encoded, list):
                tokens = encoded
            else:
                # support tokenizers that return an encoding object exposing
                # `.ids` (e.g., the Hugging Face `tokenizers` Encoding)
                tokens = encoded.ids

            tokens_array = np.array(tokens, dtype=np.uint32)

            tokens_array.tofile(fout)

            total_tokens += len(tokens)

    return total_tokens
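For reference, a short usage sketch of the new function and its on-disk format. The stub tokenizer below is a hypothetical stand-in (mirroring the test's DummyTokenizer), not a class from this repository:

from pathlib import Path
import numpy as np
from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset

class StubTokenizer:
    # Stand-in for any object exposing encode(text) -> list[int].
    def encode(self, text):
        return [ord(c) for c in text]

Path("dataset.txt").write_text("hello\nworld", encoding="utf-8")
n_tokens = tokenize_dataset("dataset.txt", StubTokenizer(), "tokens.bin")

# The output is a flat stream of uint32 token ids; numpy reads it back directly.
ids = np.fromfile("tokens.bin", dtype=np.uint32)
assert len(ids) == n_tokens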
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -14,7 +14,8 @@ requires-python = ">=3.9"
dependencies = [
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
"tokenizers==0.15.2",
"numpy"
]

[tool.setuptools.packages.find]
48 changes: 47 additions & 1 deletion tests/test_tokenizer.py
@@ -1,7 +1,9 @@
import json

import pytest

from pathlib import Path
import tempfile
import numpy as np
from openverifiablellm.tokenizer import (
    hash_tokenizer_config,
    train_tokenizer,
@@ -166,3 +168,47 @@ def test_hash_tokenizer_missing_merges(tmp_path):

    with pytest.raises(FileNotFoundError):
        hash_tokenizer_config(tokenizer_path)




from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset


class DummyTokenizer:
    def encode(self, text):
        return [ord(c) for c in text]


def test_tokenize_dataset_creates_output(tmp_path):

    dataset = tmp_path / "dataset.txt"
    dataset.write_text("hello\nworld")

    output = tmp_path / "tokens.bin"

    tokenizer = DummyTokenizer()

    total_tokens = tokenize_dataset(dataset, tokenizer, output)

    assert output.exists()
    assert total_tokens > 0


def test_tokenize_dataset_deterministic(tmp_path):

    dataset = tmp_path / "dataset.txt"
    dataset.write_text("test data\nanother line")

    output1 = tmp_path / "tokens1.bin"
    output2 = tmp_path / "tokens2.bin"

    tokenizer = DummyTokenizer()

    tokenize_dataset(dataset, tokenizer, output1)
    tokenize_dataset(dataset, tokenizer, output2)

    data1 = np.fromfile(output1, dtype=np.uint32)
    data2 = np.fromfile(output2, dtype=np.uint32)

    assert np.array_equal(data1, data2)