diff --git a/openverifiablellm/tokenizer/__init__.py b/openverifiablellm/tokenizer/__init__.py
index 10ea2ae..a4cec6a 100644
--- a/openverifiablellm/tokenizer/__init__.py
+++ b/openverifiablellm/tokenizer/__init__.py
@@ -1,6 +1,8 @@
 from .train import hash_tokenizer_config, train_tokenizer
+from .tokenize_dataset import tokenize_dataset
 
 __all__ = [
     "train_tokenizer",
     "hash_tokenizer_config",
+    "tokenize_dataset",
 ]
diff --git a/openverifiablellm/tokenizer/tokenize_dataset.py b/openverifiablellm/tokenizer/tokenize_dataset.py
new file mode 100644
index 0000000..4896d1c
--- /dev/null
+++ b/openverifiablellm/tokenizer/tokenize_dataset.py
@@ -0,0 +1,64 @@
+from pathlib import Path
+import numpy as np
+
+
+def tokenize_dataset(input_file, tokenizer, output_file):
+    """
+    Tokenize a dataset using a trained tokenizer and save tokens to a binary file.
+
+    This implementation is streaming and memory-efficient, meaning it can handle
+    very large datasets without loading everything into memory.
+
+    Parameters
+    ----------
+    input_file : str or Path
+        Path to the cleaned dataset text file.
+
+    tokenizer : object
+        A tokenizer instance with an `encode()` method.
+
+    output_file : str or Path
+        Path where tokenized binary output will be written.
+
+    Returns
+    -------
+    int
+        Total number of tokens written.
+    """
+
+    input_path = Path(input_file)
+    output_path = Path(output_file)
+
+    if not input_path.exists():
+        raise FileNotFoundError(f"Dataset file not found: {input_path}")
+
+    total_tokens = 0
+
+    # open dataset for streaming
+    with input_path.open("r", encoding="utf-8") as fin, \
+         output_path.open("wb") as fout:
+
+        for line in fin:
+            text = line.strip()
+
+            if not text:
+                continue
+
+            encoded = tokenizer.encode(text)
+
+            if isinstance(encoded, list):
+                tokens = encoded
+            elif hasattr(encoded, "ids"):
+                tokens = encoded.ids
+            else:
+                raise TypeError(
+                    f"Tokenizer.encode() returned unsupported type: {type(encoded).__name__}. "
+                    "Expected list or object with 'ids' attribute."
+                )
+
+            tokens_array = np.array(tokens, dtype=np.uint32)
+            tokens_array.tofile(fout)
+
+            total_tokens += len(tokens)
+
+    return total_tokens
diff --git a/pyproject.toml b/pyproject.toml
index 96523a0..e5b3328 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,8 @@ requires-python = ">=3.9"
 dependencies = [
     "defusedxml",
     "sentencepiece",
-    "tokenizers==0.15.2"
+    "tokenizers==0.15.2",
+    "numpy>=1.20"
 ]
 
 [tool.setuptools.packages.find]
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 1c43f3c..72c7cc7 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,12 +1,14 @@
 import json
 import pytest
-
+from pathlib import Path
+import numpy as np
 
 from openverifiablellm.tokenizer import (
     hash_tokenizer_config,
     train_tokenizer,
 )
+from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset
 
 
 @pytest.fixture
 def sample_text_file(tmp_path):
@@ -166,3 +168,63 @@
 
     with pytest.raises(FileNotFoundError):
         hash_tokenizer_config(tokenizer_path)
+
+
+
+class DummyTokenizer:
+    def encode(self, text):
+        return [ord(c) for c in text]
+
+
+def test_tokenize_dataset_creates_output(tmp_path):
+
+    dataset = tmp_path / "dataset.txt"
+    dataset.write_text("hello\nworld,\n", encoding="utf-8")
+
+    output = tmp_path / "tokens.bin"
+
+    tokenizer = DummyTokenizer()
+
+    total_tokens = tokenize_dataset(dataset, tokenizer, output)
+
+    assert output.exists()
+    assert total_tokens > 0
+
+
+def test_tokenize_dataset_deterministic(tmp_path):
+
+    dataset = tmp_path / "dataset.txt"
+    dataset.write_text("test data\nanother line", encoding="utf-8")
+
+    output1 = tmp_path / "tokens1.bin"
+    output2 = tmp_path / "tokens2.bin"
+
+    tokenizer = DummyTokenizer()
+
+    tokenize_dataset(dataset, tokenizer, output1)
+    tokenize_dataset(dataset, tokenizer, output2)
+
+    data1 = np.fromfile(output1, dtype=np.uint32)
+    data2 = np.fromfile(output2, dtype=np.uint32)
+
+    assert np.array_equal(data1, data2)
+
+def test_tokenize_dataset_missing_file(tmp_path):
+    tokenizer = DummyTokenizer()
+    output = tmp_path / "tokens.bin"
+
+    with pytest.raises(FileNotFoundError):
+        tokenize_dataset(tmp_path / "missing.txt", tokenizer, output)
+
+
+def test_tokenize_dataset_empty_file(tmp_path):
+    dataset = tmp_path / "empty.txt"
+    dataset.write_text("", encoding="utf-8")
+
+    output = tmp_path / "tokens.bin"
+
+    total = tokenize_dataset(dataset, DummyTokenizer(), output)
+
+    assert total == 0
+    assert output.exists()
+    assert output.stat().st_size == 0