Draft

Changes from 2 commits

Commits (36)
d741214
Add malformed XML edge case test and fix verify.py regression
Shubhamx404 Mar 7, 2026
e923a03
Update openverifiablellm/verify.py
Shubhamx404 Mar 8, 2026
0d0626a
fixed coderabbit issue
Shubhamx404 Mar 8, 2026
573ad01
Merge branch 'main' into fix-malformed-xml-test
Shubhamx404 Mar 9, 2026
9c28b5a
Chain-linking
aniket866 Mar 9, 2026
e442106
Code rabbit follow up
aniket866 Mar 9, 2026
93ba434
Code-rabbit-follow-up
aniket866 Mar 9, 2026
6ac7c7d
Code rabbit follow up
aniket866 Mar 9, 2026
c94d335
testing with UV
aniket866 Mar 11, 2026
ff689a9
resume-preprocessing
aniket866 Mar 11, 2026
2ae4c5a
Merge branch 'main' into chain
aniket866 Mar 12, 2026
0b59f92
fixing-linting-issues
aniket866 Mar 12, 2026
824659f
ruff-format
aniket866 Mar 12, 2026
5cad61c
added-script
aniket866 Mar 12, 2026
8df62f8
added-script
aniket866 Mar 12, 2026
be86883
fixing-merge-conflicts
aniket866 Mar 16, 2026
5fbdff1
fixing-ci-fails
aniket866 Mar 16, 2026
672f071
fixing ruff formatting
aniket866 Mar 16, 2026
6b00bd4
Code rabbit follow up
aniket866 Mar 16, 2026
9fc7545
code rabbit fixes
aniket866 Mar 16, 2026
6286972
fixing ruff formatting
aniket866 Mar 16, 2026
e17ab23
Fix tokenizer tests and improve sentencepiece tokenizer
Shubhamx404 Mar 17, 2026
05c6b3d
Fix test_verify.py to write manifest during setup
Shubhamx404 Mar 17, 2026
3123770
Fix linting issue in sentencepiece_tokenizer.py
Shubhamx404 Mar 17, 2026
6af29e4
Fix duplicate pytest options in pyproject.toml
Shubhamx404 Mar 17, 2026
d5ff5ef
Merge pull request #53 from aniket866/chain
Archit381 Mar 17, 2026
40e2f40
Merge branch 'main' into resume-preprocessing
aniket866 Mar 17, 2026
e2c3017
Code rabbit follow up
aniket866 Mar 17, 2026
e35f8ab
Code rabbit follow-up
aniket866 Mar 17, 2026
a754edd
Code rabbit follow-up
aniket866 Mar 17, 2026
66bbeab
fixing-comput_sha
aniket866 Mar 17, 2026
173cdd5
Merge pull request #68 from aniket866/resume-preprocessing
Archit381 Mar 17, 2026
b8b81af
Merge main into fix-malformed-xml-test and resolve conflicts
Shubhamx404 Mar 17, 2026
c1a4ef7
style: run formatter to fix CI lint issue
Shubhamx404 Mar 17, 2026
578bc79
Merge pull request #49 from Shubhamx404/fix-malformed-xml-test
Archit381 Mar 17, 2026
890d77c
Merge origin/fix-tokenizer-tests and resolve conflicts, keeping tests
Shubhamx404 Mar 18, 2026
2 changes: 2 additions & 0 deletions openverifiablellm/tokenizer/__init__.py
@@ -1,6 +1,8 @@
from .factory import create_tokenizer
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
rg -n -C2 --type=py 'from \.factory import create_tokenizer|from \.sentencepiece_tokenizer import SentencePieceTokenizer|import sentencepiece as spm'

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1413


🏁 Script executed:

cat -n openverifiablellm/tokenizer/factory.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 650


🏁 Script executed:

head -20 openverifiablellm/tokenizer/sentencepiece_tokenizer.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 649


🏁 Script executed:

cat pyproject.toml | head -50

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 738


🏁 Script executed:

cat setup.py 2>/dev/null || echo "setup.py not found"

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 90


🏁 Script executed:

rg -n 'create_tokenizer' --type=py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1406


Decouple optional sentencepiece dependency from package import.

The eager re-export in __init__.py imports factory.py, whose module-level import of SentencePieceTokenizer pulls in the optional sentencepiece library. This breaks import openverifiablellm.tokenizer for users who only need BPE and don't have sentencepiece installed.

The fix requires lazy importing in two places:

  1. In __init__.py: Wrap the create_tokenizer re-export in a lazy import function
  2. In factory.py: Lazy-import SentencePieceTokenizer inside create_tokenizer() only when needed
Suggested fix

In openverifiablellm/tokenizer/__init__.py:

-from .factory import create_tokenizer
+def create_tokenizer(tokenizer_type, vocab_size, min_frequency):
+    from .factory import create_tokenizer as _create_tokenizer
+    return _create_tokenizer(tokenizer_type, vocab_size, min_frequency)

In openverifiablellm/tokenizer/factory.py:

-from .bpe_tokenizer import BPETokenizer
-from .sentencepiece_tokenizer import SentencePieceTokenizer
-
-
 def create_tokenizer(tokenizer_type, vocab_size, min_frequency):
+    from .bpe_tokenizer import BPETokenizer
 
     tokenizer_type = tokenizer_type.lower()
 
     if tokenizer_type == "bpe":
         return BPETokenizer(vocab_size, min_frequency)
 
     if tokenizer_type == "sentencepiece":
+        from .sentencepiece_tokenizer import SentencePieceTokenizer
         return SentencePieceTokenizer(vocab_size, min_frequency)
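
If this lands, the intended behavior can be sanity-checked with a short snippet like the one below (a sketch, assuming no other module-level sentencepiece import remains anywhere in the package):

# Sketch: expected behavior after the lazy-import fix. Assumes no other
# module-level sentencepiece import remains anywhere in the package.
import openverifiablellm.tokenizer as tok  # now succeeds without sentencepiece

bpe = tok.create_tokenizer("bpe", vocab_size=1000, min_frequency=2)  # BPE path needs no sentencepiece

try:
    tok.create_tokenizer("sentencepiece", vocab_size=1000, min_frequency=2)
except ImportError:
    # Raised here, at call time, only if sentencepiece is not installed.
    print("sentencepiece missing; BPE still usable")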
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/__init__.py` at line 1, The package-level import
eagerly re-exports create_tokenizer which causes SentencePieceTokenizer (and the
optional sentencepiece dependency) to be imported at module import time; change
__init__.py to expose create_tokenizer via a lazy wrapper that imports
create_tokenizer only when called, and modify factory.py so
SentencePieceTokenizer (and any import of sentencepiece) is imported lazily
inside create_tokenizer() only when the sentencepiece-backed branch is selected;
update references to the factory-level create_tokenizer and the class name
SentencePieceTokenizer to ensure the dynamic import path is used.

from .train import hash_tokenizer_config, train_tokenizer

__all__ = [
"train_tokenizer",
"hash_tokenizer_config",
"create_tokenizer",
]
8 changes: 7 additions & 1 deletion openverifiablellm/tokenizer/sentencepiece_tokenizer.py
@@ -1,6 +1,12 @@
 from pathlib import Path
 
-import sentencepiece as spm
+import warnings
+
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore", category=DeprecationWarning)
+    # SWIG-generated modules (like sentencepiece on python 3.11+) emit deprecation warnings
+    # scoping the suppression here prevents it from spamming our test output
+    import sentencepiece as spm
 
 from .base import BaseTokenizer
 
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ include = ["openverifiablellm*"]
dev = [
"pytest>=7.0",
"ruff>=0.15.4",
"numpy",
⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
python - <<'PY'
import pathlib, tomllib
data = tomllib.loads(pathlib.Path("pyproject.toml").read_text())
print("project.dependencies =", data.get("project", {}).get("dependencies", []))
print("dependency-groups.dev =", data.get("dependency-groups", {}).get("dev", []))
PY

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 196


Move numpy to core dependencies instead of dev-only.

Per repository policy, numpy must be a core dependency for this LLM-focused project. Keeping it in dependency-groups.dev causes installation and runtime divergence in non-dev environments.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pyproject.toml` at line 26, Move "numpy" out of the development dependency
group and add it to the project's core dependencies: remove "numpy" from
dependency-groups.dev and add it under the main dependencies section (the
project dependencies block in pyproject.toml). Update any existing dependency
listing to ensure only one entry for "numpy" remains (no duplicates) and run the
lock/build step to regenerate the lockfile so runtime installs include numpy.
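
A quick way to confirm the move (a sketch; requires Python 3.11+ for the stdlib tomllib, and the assertion messages are illustrative):

# Sketch: confirm numpy landed in [project].dependencies and left the dev group.
import pathlib
import tomllib

data = tomllib.loads(pathlib.Path("pyproject.toml").read_text())
core = data.get("project", {}).get("dependencies", [])
dev = data.get("dependency-groups", {}).get("dev", [])

assert any(d.strip().startswith("numpy") for d in core), "numpy missing from core deps"
assert not any(d.strip().startswith("numpy") for d in dev), "numpy still in dev group"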

]

[tool.ruff]
@@ -32,3 +33,7 @@ target-version = "py39"
[tool.ruff.lint]
select = ["E", "F", "I"]
ignore = ["E501"]

[tool.pytest.ini_options]
filterwarnings = [
]
26 changes: 26 additions & 0 deletions tests/test_tokenizer.py
@@ -3,9 +3,12 @@
import pytest

from openverifiablellm.tokenizer import (
    create_tokenizer,
    hash_tokenizer_config,
    train_tokenizer,
)
from openverifiablellm.tokenizer.bpe_tokenizer import BPETokenizer
from openverifiablellm.tokenizer.sentencepiece_tokenizer import SentencePieceTokenizer


@pytest.fixture
@@ -166,3 +169,26 @@ def test_hash_tokenizer_missing_merges(tmp_path):

    with pytest.raises(FileNotFoundError):
        hash_tokenizer_config(tokenizer_path)


# ---------------------------------------------------------------------
# create_tokenizer Tests
# ---------------------------------------------------------------------


def test_create_tokenizer_bpe():
"""Test that create_tokenizer returns a BPETokenizer for 'bpe'."""
tokenizer = create_tokenizer("bpe", vocab_size=1000, min_frequency=2)
assert isinstance(tokenizer, BPETokenizer)


def test_create_tokenizer_sentencepiece():
"""Test that create_tokenizer returns a SentencePieceTokenizer for 'sentencepiece'."""
tokenizer = create_tokenizer("sentencepiece", vocab_size=1000, min_frequency=2)
assert isinstance(tokenizer, SentencePieceTokenizer)


def test_create_tokenizer_invalid():
"""Test that create_tokenizer raises a ValueError for invalid types."""
with pytest.raises(ValueError, match="Unsupported tokenizer: invalid"):
create_tokenizer("invalid", vocab_size=1000, min_frequency=2)
53 changes: 53 additions & 0 deletions tests/test_util.py
@@ -236,6 +236,27 @@ def test_merkle_root_empty_file(tmp_path):
    assert root == expected


def test_compute_merkle_root_multi_chunk_hardcoded(tmp_path):
file = tmp_path / "data.txt"
# 3 chunks of 8 bytes each
chunk1 = b"chunk__1"
chunk2 = b"chunk__2"
chunk3 = b"chunk__3"
file.write_bytes(chunk1 + chunk2 + chunk3)

h1 = hashlib.sha256(chunk1).digest()
h2 = hashlib.sha256(chunk2).digest()
h3 = hashlib.sha256(chunk3).digest()

h12 = hashlib.sha256(h1 + h2).digest()
h33 = hashlib.sha256(h3 + h3).digest()

expected_root = hashlib.sha256(h12 + h33).hexdigest()

actual_root = utils.compute_merkle_root(file, chunk_size=8)
assert actual_root == expected_root
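
The expected values encode the usual reduction: hash fixed-size chunks into leaves, pair them left to right, and duplicate the last node on odd-sized levels (hence h33 = sha256(h3 + h3)). A minimal sketch of that logic, assuming utils.compute_merkle_root follows the same convention (the empty-file case, covered by test_merkle_root_empty_file above, is elided here):

# Sketch of the Merkle reduction the hard-coded test implies. Assumption:
# utils.compute_merkle_root hashes fixed-size chunks with SHA-256 and
# duplicates the last node on odd-sized levels. Empty-file handling elided.
import hashlib
from pathlib import Path

def merkle_root_sketch(path: Path, chunk_size: int) -> str:
    data = path.read_bytes()
    level = [
        hashlib.sha256(data[i : i + chunk_size]).digest()
        for i in range(0, len(data), chunk_size)
    ]
    while len(level) > 1:
        if len(level) % 2 == 1:
            level.append(level[-1])  # duplicate odd node: h33 = sha256(h3 + h3)
        level = [
            hashlib.sha256(level[i] + level[i + 1]).digest()
            for i in range(0, len(level), 2)
        ]
    return level[0].hex()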


# --------------- Merkle proof generation ------------------------------------


@@ -283,3 +304,35 @@ def test_export_and_load_merkle_proof(tmp_path):
        chunk_data=chunk,
        expected_root=root,
    )


# --------------- load_merkle_proof tests ------------------------------------


def test_load_merkle_proof_valid_file(tmp_path):
    proof_data = {
        "chunk_index": 1,
        "chunk_size": 8,
        "proof": [["00" * 32, True]],
    }
    proof_file = tmp_path / "proof.json"
    proof_file.write_text(json.dumps(proof_data))

    loaded_proof = utils.load_merkle_proof(proof_file)

    assert loaded_proof == proof_data


def test_load_merkle_proof_missing_file(tmp_path):
proof_file = tmp_path / "missing.json"

with pytest.raises(FileNotFoundError):
utils.load_merkle_proof(proof_file)


def test_load_merkle_proof_invalid_json(tmp_path):
proof_file = tmp_path / "invalid.json"
proof_file.write_text("{invalid json}")

with pytest.raises(json.JSONDecodeError):
utils.load_merkle_proof(proof_file)
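
Taken together, these three tests pin the loader down to plain pass-through JSON loading; a minimal sketch, assuming utils.load_merkle_proof adds no validation of its own:

# Sketch of the contract implied by the tests above. Assumption:
# utils.load_merkle_proof is plain JSON loading that lets FileNotFoundError
# and json.JSONDecodeError propagate unchanged.
import json
from pathlib import Path

def load_merkle_proof_sketch(path: Path) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)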
2 changes: 1 addition & 1 deletion tests/test_verify.py
@@ -57,7 +57,7 @@ def run_preprocessing(tmp_dir: Path, dump: Path) -> None:
     original = os.getcwd()
     os.chdir(tmp_dir)
     try:
-        utils.extract_text_from_xml(dump)
+        utils.extract_text_from_xml(dump, write_manifest=True)
     finally:
         os.chdir(original)
