2 changes: 2 additions & 0 deletions openverifiablellm/tokenizer/__init__.py
@@ -1,6 +1,8 @@
from .train import hash_tokenizer_config, train_tokenizer
from .tokenize_dataset import tokenize_dataset

__all__ = [
"train_tokenizer",
"hash_tokenizer_config",
"tokenize_dataset",
]
64 changes: 64 additions & 0 deletions openverifiablellm/tokenizer/tokenize_dataset.py
@@ -0,0 +1,64 @@
from pathlib import Path
import numpy as np


def tokenize_dataset(input_file, tokenizer, output_file):

⚠️ Potential issue | 🟠 Major

tokenize_dataset is not compatible with the repository tokenizer classes.

Line 47 assumes tokenizer.encode(...) exists, but openverifiablellm/tokenizer/factory.py returns tokenizer classes (e.g., openverifiablellm/tokenizer/bpe_tokenizer.py::BPETokenizer) that do not expose encode. This will fail at runtime for the package’s own tokenizer instances and break the intended trained-tokenizer workflow.

Suggested hardening (clear failure mode in this function)
 def tokenize_dataset(input_file, tokenizer, output_file):
+    if not hasattr(tokenizer, "encode"):
+        raise TypeError(
+            f"tokenizer must expose encode(text); got {type(tokenizer).__name__}"
+        )

Also applies to: 47-47

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/tokenize_dataset.py` at line 5, tokenize_dataset
currently calls tokenizer.encode(...) which doesn't exist on the repository
tokenizers; update tokenize_dataset to handle the tokenizer interface robustly:
check for methods in order (hasattr(tokenizer, "encode") -> use it; elif
hasattr(tokenizer, "tokenize") -> call tokenizer.tokenize(...) and then convert
tokens to ids via tokenizer.convert_tokens_to_ids or tokenizer.tokens_to_ids if
present; elif hasattr(tokenizer, "encode_batch") use that; otherwise raise a
clear error mentioning tokenize_dataset and the missing methods. Ensure you
reference the tokenizer instance and preserve behavior for batching and special
tokens when available.
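
One way to realize that fallback order, sketched as a small helper. Apart from encode(), every method name used here (tokenize, convert_tokens_to_ids, tokens_to_ids, encode_batch) comes from the prompt above and is an assumption about what the repository tokenizers actually expose:

def _encode_text(tokenizer, text):
    # Preferred path: an encode() method returning ids or an Encoding-like object.
    if hasattr(tokenizer, "encode"):
        return tokenizer.encode(text)
    # Fallback: tokenize() plus a token-to-id converter, if one is available.
    if hasattr(tokenizer, "tokenize"):
        tokens = tokenizer.tokenize(text)
        for converter in ("convert_tokens_to_ids", "tokens_to_ids"):
            if hasattr(tokenizer, converter):
                return getattr(tokenizer, converter)(tokens)
        return tokens
    # Last resort: a batch API called with a single-element batch.
    if hasattr(tokenizer, "encode_batch"):
        return tokenizer.encode_batch([text])[0]
    raise TypeError(
        "tokenize_dataset: tokenizer exposes none of encode/tokenize/encode_batch; "
        f"got {type(tokenizer).__name__}"
    )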

"""
Tokenize a dataset using a trained tokenizer and save tokens to a binary file.

The implementation streams the input line by line, so very large datasets
can be tokenized without loading everything into memory.

Parameters
----------
input_file : str or Path
Path to the cleaned dataset text file.

tokenizer : object
A tokenizer instance with an `encode()` method.

output_file : str or Path
Path where tokenized binary output will be written.

Returns
-------
int
Total number of tokens written.
"""

input_path = Path(input_file)
output_path = Path(output_file)

if not input_path.exists():
raise FileNotFoundError(f"Dataset file not found: {input_path}")

total_tokens = 0

# Stream the input line by line and append encoded tokens to the binary output.
with input_path.open("r", encoding="utf-8") as fin, \
output_path.open("wb") as fout:

for line in fin:
text = line.strip()

if not text:
continue

encoded = tokenizer.encode(text)

if isinstance(encoded, list):
tokens = encoded
elif hasattr(encoded, "ids"):
tokens = encoded.ids
else:
raise TypeError(
f"Tokenizer.encode() returned unsupported type: {type(encoded).__name__}. "
"Expected list or object with 'ids' attribute."
)

tokens_array = np.array(tokens, dtype=np.uint32)
tokens_array.tofile(fout)

total_tokens += len(tokens)

return total_tokens
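
A usage sketch (not part of the diff): because the function writes raw uint32 token ids, the output can be memory-mapped straight back with NumPy. ByteTokenizer and the file names below are placeholders; the only assumption about the real tokenizer is an encode(text) -> list[int] method.

from pathlib import Path
import numpy as np
from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset


class ByteTokenizer:
    # Placeholder: any object exposing encode(text) -> list[int] works here.
    def encode(self, text):
        return list(text.encode("utf-8"))


Path("clean.txt").write_text("hello\nworld\n", encoding="utf-8")
n_tokens = tokenize_dataset("clean.txt", ByteTokenizer(), "tokens.bin")

# The output is a flat stream of uint32 ids, so it can be memory-mapped
# (or read with np.fromfile) when assembling training batches.
tokens = np.memmap("tokens.bin", dtype=np.uint32, mode="r")
assert tokens.shape[0] == n_tokens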
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -14,7 +14,8 @@ requires-python = ">=3.9"
dependencies = [
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
"tokenizers==0.15.2",
"numpy>=1.20"
]

[tool.setuptools.packages.find]
64 changes: 63 additions & 1 deletion tests/test_tokenizer.py
@@ -1,12 +1,14 @@
import json

import pytest

from pathlib import Path
import numpy as np
from openverifiablellm.tokenizer import (
hash_tokenizer_config,
train_tokenizer,
)

from openverifiablellm.tokenizer.tokenize_dataset import tokenize_dataset

@pytest.fixture
def sample_text_file(tmp_path):
@@ -166,3 +168,63 @@ def test_hash_tokenizer_missing_merges(tmp_path):

with pytest.raises(FileNotFoundError):
hash_tokenizer_config(tokenizer_path)



class DummyTokenizer:
def encode(self, text):
return [ord(c) for c in text]


def test_tokenize_dataset_creates_output(tmp_path):

dataset = tmp_path / "dataset.txt"
dataset.write_text("hello\nworld,\n", encoding="utf-8")

output = tmp_path / "tokens.bin"

tokenizer = DummyTokenizer()

total_tokens = tokenize_dataset(dataset, tokenizer, output)

assert output.exists()
assert total_tokens > 0


def test_tokenize_dataset_deterministic(tmp_path):

dataset = tmp_path / "dataset.txt"
dataset.write_text("test data\nanother line", encoding="utf-8")

output1 = tmp_path / "tokens1.bin"
output2 = tmp_path / "tokens2.bin"

tokenizer = DummyTokenizer()

tokenize_dataset(dataset, tokenizer, output1)
tokenize_dataset(dataset, tokenizer, output2)

data1 = np.fromfile(output1, dtype=np.uint32)
data2 = np.fromfile(output2, dtype=np.uint32)

assert np.array_equal(data1, data2)

def test_tokenize_dataset_missing_file(tmp_path):
tokenizer = DummyTokenizer()
output = tmp_path / "tokens.bin"

with pytest.raises(FileNotFoundError):
tokenize_dataset(tmp_path / "missing.txt", tokenizer, output)


def test_tokenize_dataset_empty_file(tmp_path):
dataset = tmp_path / "empty.txt"
dataset.write_text("", encoding="utf-8")

output = tmp_path / "tokens.bin"

total = tokenize_dataset(dataset, DummyTokenizer(), output)

assert total == 0
assert output.exists()
assert output.stat().st_size == 0
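
A further test, sketched here and not part of the diff, could cover the branch where encode() returns an object carrying an ids attribute (the shape of Encoding objects from the Hugging Face tokenizers library):

class DummyEncoding:
    def __init__(self, ids):
        self.ids = ids


class DummyIdsTokenizer:
    # Mimics tokenizers whose encode() returns an Encoding-like object.
    def encode(self, text):
        return DummyEncoding([ord(c) for c in text])


def test_tokenize_dataset_accepts_encoding_objects(tmp_path):
    dataset = tmp_path / "dataset.txt"
    dataset.write_text("abc", encoding="utf-8")
    output = tmp_path / "tokens.bin"

    total = tokenize_dataset(dataset, DummyIdsTokenizer(), output)

    data = np.fromfile(output, dtype=np.uint32)
    assert total == 3
    assert np.array_equal(data, np.array([97, 98, 99], dtype=np.uint32))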