Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
d741214
Add malformed XML edge case test and fix verify.py regression
Shubhamx404 Mar 7, 2026
e923a03
Update openverifiablellm/verify.py
Shubhamx404 Mar 8, 2026
0d0626a
fixed coderabbit issue
Shubhamx404 Mar 8, 2026
573ad01
Merge branch 'main' into fix-malformed-xml-test
Shubhamx404 Mar 9, 2026
9c28b5a
Chain-linking
aniket866 Mar 9, 2026
e442106
Code rabbit follow up
aniket866 Mar 9, 2026
93ba434
Code-rabbit-follow-up
aniket866 Mar 9, 2026
6ac7c7d
Code rabbit follow up
aniket866 Mar 9, 2026
c94d335
testing with UV
aniket866 Mar 11, 2026
ff689a9
resume-preprocessing
aniket866 Mar 11, 2026
2ae4c5a
Merge branch 'main' into chain
aniket866 Mar 12, 2026
0b59f92
fixing-linting-issues
aniket866 Mar 12, 2026
824659f
ruff-format
aniket866 Mar 12, 2026
5cad61c
added-script
aniket866 Mar 12, 2026
8df62f8
added-script
aniket866 Mar 12, 2026
be86883
fixing-merge-conflicts
aniket866 Mar 16, 2026
5fbdff1
fixing-ci-fails
aniket866 Mar 16, 2026
672f071
fixing ruff formatting
aniket866 Mar 16, 2026
6b00bd4
Code rabbit follow up
aniket866 Mar 16, 2026
9fc7545
code rabbit fixes
aniket866 Mar 16, 2026
6286972
fixing ruff formatting
aniket866 Mar 16, 2026
e17ab23
Fix tokenizer tests and improve sentencepiece tokenizer
Shubhamx404 Mar 17, 2026
05c6b3d
Fix test_verify.py to write manifest during setup
Shubhamx404 Mar 17, 2026
3123770
Fix linting issue in sentencepiece_tokenizer.py
Shubhamx404 Mar 17, 2026
6af29e4
Fix duplicate pytest options in pyproject.toml
Shubhamx404 Mar 17, 2026
d5ff5ef
Merge pull request #53 from aniket866/chain
Archit381 Mar 17, 2026
40e2f40
Merge branch 'main' into resume-preprocessing
aniket866 Mar 17, 2026
e2c3017
Code rabbit follow up
aniket866 Mar 17, 2026
e35f8ab
Code rabbit follow-up
aniket866 Mar 17, 2026
a754edd
Code rabbit follow-up
aniket866 Mar 17, 2026
66bbeab
fixing-comput_sha
aniket866 Mar 17, 2026
173cdd5
Merge pull request #68 from aniket866/resume-preprocessing
Archit381 Mar 17, 2026
b8b81af
Merge main into fix-malformed-xml-test and resolve conflicts
Shubhamx404 Mar 17, 2026
c1a4ef7
style: run formatter to fix CI lint issue
Shubhamx404 Mar 17, 2026
578bc79
Merge pull request #49 from Shubhamx404/fix-malformed-xml-test
Archit381 Mar 17, 2026
890d77c
Merge origin/fix-tokenizer-tests and resolve conflicts, keeping tests
Shubhamx404 Mar 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 288 additions & 0 deletions openverifiablellm/manifest_chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
"""
manifest_chain.py
=================
Cryptographically linked manifest chains for tamper detection.

This module provides utilities to create and verify a chain of dataset manifests,
where each manifest includes the SHA256 hash of its predecessor. This forms a
tamper-evident chain: if any manifest in the sequence is modified, the hash
stored in the next manifest no longer matches, and the tampering is immediately
visible (analogous to a wax seal chain).

Usage
-----
# Generate parent hash before writing a new manifest
parent_hash = get_parent_manifest_hash(manifest_path)
manifest = { ... , "parent_manifest_hash": parent_hash }

# Verify the chain between two consecutive manifests
is_valid = verify_manifest_chain_link(previous_manifest_path, current_manifest)

# Verify entire chain from root
report = verify_manifest_chain(current_manifest_path)
"""

import hashlib
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Union

logger = logging.getLogger(__name__)


def _canonical_json(obj: Any) -> str:
"""
Serialize object into canonical JSON format.
Ensures stable hashing across runs regardless of key order.

Parameters
----------
obj : Any
JSON-serializable object

Returns
-------
str
Canonical JSON string with sorted keys
"""
return json.dumps(obj, sort_keys=True, separators=(",", ":"))


def compute_manifest_hash(manifest: Union[str, Path, Dict[str, Any]]) -> str:
"""
Compute SHA-256 hash of a manifest.

Can accept:
- Dict: manifest data object (will be canonical-JSON serialized)
- str/Path: path to manifest JSON file (will be read and parsed)

Parameters
----------
manifest : Union[str, Path, Dict[str, Any]]
Either a manifest dict or a path to a manifest JSON file

Returns
-------
str
SHA-256 hash in hexadecimal format

Raises
------
FileNotFoundError
If manifest is a path and the file does not exist
ValueError
If manifest JSON is malformed
"""
if isinstance(manifest, dict):
manifest_data = manifest
else:
manifest_path = Path(manifest)
if not manifest_path.exists():
raise FileNotFoundError(f"Manifest not found: {manifest_path}")

with manifest_path.open("r", encoding="utf-8") as f:
try:
manifest_data = json.load(f)
except json.JSONDecodeError as e:
raise ValueError(f"Malformed manifest JSON: {e}")

hashable = manifest_data.copy()
if isinstance(hashable, dict):
hashable.pop("parent_manifest_hash", None)
Comment on lines +90 to +92
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Redundant type check.

At this point in the code flow, hashable is always a dict (either from the input dict or from json.load), making the isinstance(hashable, dict) check unnecessary.

♻️ Simplify by removing redundant check
     hashable = manifest_data.copy()
-    if isinstance(hashable, dict):
-        hashable.pop("parent_manifest_hash", None)
+    hashable.pop("parent_manifest_hash", None)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
hashable = manifest_data.copy()
if isinstance(hashable, dict):
hashable.pop("parent_manifest_hash", None)
hashable = manifest_data.copy()
hashable.pop("parent_manifest_hash", None)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/manifest_chain.py` around lines 90 - 92, The local variable
`hashable` is always a dict (derived from `manifest_data`), so remove the
redundant isinstance check and directly remove the optional key: call
`hashable.pop("parent_manifest_hash", None)` on `hashable` (created from
`manifest_data.copy()`) instead of guarding it with `isinstance`; update any
surrounding comments if present to reflect the simplification in the logic
within the manifest handling code.


canonical = _canonical_json(hashable)
return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


def get_parent_manifest_hash(
    manifest_path: Union[str, Path],
) -> str:
    """
    Return the SHA-256 hash of the manifest at *manifest_path*.

    The returned digest is meant to be stored as ``parent_manifest_hash``
    in the next (replacement) manifest, linking it to its predecessor.
    On the very first run, when no manifest exists yet, an empty string is
    returned so the new manifest becomes the chain root.

    Parameters
    ----------
    manifest_path : Union[str, Path]
        Path to the manifest file

    Returns
    -------
    str
        SHA-256 hash of the existing manifest, or "" if it doesn't exist
    """
    target = Path(manifest_path)

    # First run: nothing to link against yet.
    if not target.exists():
        logger.info("No previous manifest found at %s — parent_manifest_hash will be empty", target)
        return ""

    try:
        digest = compute_manifest_hash(target)
        logger.info("Parent manifest hash computed: %s", digest)
        return digest
    except Exception as exc:
        # Log with traceback, then let the caller decide how to handle it.
        logger.exception("Could not compute parent manifest hash: %s", exc)
        raise


def verify_manifest_chain_link(
    previous_manifest: Union[str, Path, Dict[str, Any]],
    current_manifest: Union[str, Path, Dict[str, Any]],
) -> bool:
    """
    Verify that current_manifest correctly references previous_manifest
    via its parent_manifest_hash field.

    This checks a single link in the chain. If the link is broken, it indicates
    that either:
    - The previous manifest was tampered with/regenerated
    - The current manifest's parent_manifest_hash was modified

    Parameters
    ----------
    previous_manifest : Union[str, Path, Dict[str, Any]]
        The previous manifest (dict or path to file)
    current_manifest : Union[str, Path, Dict[str, Any]]
        The current manifest (dict or path to file)

    Returns
    -------
    bool
        True if parent_manifest_hash in current matches the hash of previous

    Raises
    ------
    FileNotFoundError
        If any required file does not exist
    ValueError
        If manifest JSON is malformed
    """
    # Load current manifest if needed
    if isinstance(current_manifest, dict):
        current_data = current_manifest
    else:
        current_path = Path(current_manifest)
        if not current_path.exists():
            raise FileNotFoundError(f"Current manifest not found: {current_path}")
        with current_path.open("r", encoding="utf-8") as f:
            try:
                current_data = json.load(f)
            except json.JSONDecodeError as e:
                # Chain the cause so the original decode error stays visible
                # in tracebacks (ruff B904).
                raise ValueError(f"Malformed current manifest JSON: {e}") from e

    # Hash stored in the current manifest, pointing at its predecessor.
    stored_parent_hash = current_data.get("parent_manifest_hash", "")

    # Recompute the predecessor's hash independently.
    expected_parent_hash = compute_manifest_hash(previous_manifest)

    # Compare
    match = stored_parent_hash == expected_parent_hash

    if match:
        logger.info("Manifest chain link verified ✓")
    else:
        logger.error(
            "Manifest chain link broken! ✗\n"
            "  stored (in current) : %s\n"
            "  computed (from prev) : %s",
            stored_parent_hash,
            expected_parent_hash,
        )

    return match


def verify_manifest_chain(
    manifest_path: Union[str, Path],
    previous_manifest_path: Optional[Union[str, Path]] = None,
) -> Dict[str, Any]:
    """
    Verify the manifest chain up to the given manifest.

    If previous_manifest_path is provided, checks the link between previous
    and current. If not provided, only the chain metadata of the current
    manifest is inspected:
    - a missing parent_manifest_hash field marks the chain invalid
      (likely a pre-chain manifest)
    - an empty value marks it valid (root of the chain, first run)
    - a non-empty value cannot be verified without the previous manifest

    Parameters
    ----------
    manifest_path : Union[str, Path]
        Path to the manifest to verify
    previous_manifest_path : Optional[Union[str, Path]]
        Path to the previous manifest (optional, for explicit link verification)

    Returns
    -------
    Dict[str, Any]
        Report with keys:
        - "chain_valid": bool - whether the chain is intact
        - "chain_message": str - human-readable message
        - "has_parent_hash_field": bool - whether parent_manifest_hash field exists
        - "parent_hash_value": str - value of parent_manifest_hash (or "")
    """
    manifest_path = Path(manifest_path)

    if not manifest_path.exists():
        return {
            "chain_valid": False,
            "chain_message": f"Manifest not found: {manifest_path}",
            "has_parent_hash_field": False,
            "parent_hash_value": "",
        }

    try:
        with manifest_path.open("r", encoding="utf-8") as f:
            manifest_data = json.load(f)
    except (OSError, ValueError) as e:
        # Narrowed from a bare `except Exception`: OSError covers unreadable
        # files; ValueError covers malformed JSON (json.JSONDecodeError) and
        # bad encodings (UnicodeDecodeError). Anything else is a real bug and
        # should propagate rather than be folded into the report.
        return {
            "chain_valid": False,
            "chain_message": f"Failed to read manifest: {e}",
            "has_parent_hash_field": False,
            "parent_hash_value": "",
        }

    has_field = "parent_manifest_hash" in manifest_data
    parent_hash_value = manifest_data.get("parent_manifest_hash", "")

    # If explicit previous manifest is provided, verify the link
    if previous_manifest_path is not None:
        try:
            link_valid = verify_manifest_chain_link(previous_manifest_path, manifest_data)
            message = (
                "Chain link verified against previous manifest ✓"
                if link_valid
                else "Chain link broken — previous manifest does not match stored hash ✗"
            )
        except (OSError, ValueError) as exc:
            link_valid = False
            message = f"Failed to verify previous manifest: {exc}"
        return {
            "chain_valid": link_valid,
            "chain_message": message,
            "has_parent_hash_field": has_field,
            "parent_hash_value": parent_hash_value,
        }

    # Otherwise, just check that the field exists (indicating awareness of chain concept)
    if not has_field:
        message = "parent_manifest_hash field missing (may be old manifest)"
        chain_valid = False
    elif parent_hash_value == "":
        message = "parent_manifest_hash is empty (first run in chain)"
        chain_valid = True
    else:
        message = "Cannot verify non-root manifest without previous_manifest_path"
        chain_valid = False

    return {
        "chain_valid": chain_valid,
        "chain_message": message,
        "has_parent_hash_field": has_field,
        "parent_hash_value": parent_hash_value,
    }
2 changes: 2 additions & 0 deletions openverifiablellm/tokenizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from .factory import create_tokenizer
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
rg -n -C2 --type=py 'from \.factory import create_tokenizer|from \.sentencepiece_tokenizer import SentencePieceTokenizer|import sentencepiece as spm'

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1413


🏁 Script executed:

cat -n openverifiablellm/tokenizer/factory.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 650


🏁 Script executed:

head -20 openverifiablellm/tokenizer/sentencepiece_tokenizer.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 649


🏁 Script executed:

cat pyproject.toml | head -50

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 738


🏁 Script executed:

cat setup.py 2>/dev/null || echo "setup.py not found"

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 90


🏁 Script executed:

rg -n 'create_tokenizer' --type=py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1406


Decouple optional sentencepiece dependency from package import.

The eager re-export in __init__.py pulls in SentencePieceTokenizer from factory.py, which in turn imports the optional sentencepiece library at module level. This breaks import openverifiablellm.tokenizer for users who only need BPE and don't have sentencepiece installed.

The fix requires lazy importing in two places:

  1. In __init__.py: Wrap the create_tokenizer re-export in a lazy import function
  2. In factory.py: Lazy-import SentencePieceTokenizer inside create_tokenizer() only when needed
Suggested fix

In openverifiablellm/tokenizer/__init__.py:

-from .factory import create_tokenizer
+def create_tokenizer(tokenizer_type, vocab_size, min_frequency):
+    from .factory import create_tokenizer as _create_tokenizer
+    return _create_tokenizer(tokenizer_type, vocab_size, min_frequency)

In openverifiablellm/tokenizer/factory.py:

-from .bpe_tokenizer import BPETokenizer
-from .sentencepiece_tokenizer import SentencePieceTokenizer
-
-
 def create_tokenizer(tokenizer_type, vocab_size, min_frequency):
+    from .bpe_tokenizer import BPETokenizer
 
     tokenizer_type = tokenizer_type.lower()
 
     if tokenizer_type == "bpe":
         return BPETokenizer(vocab_size, min_frequency)
 
     if tokenizer_type == "sentencepiece":
+        from .sentencepiece_tokenizer import SentencePieceTokenizer
         return SentencePieceTokenizer(vocab_size, min_frequency)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/tokenizer/__init__.py` at line 1, The package-level import
eagerly re-exports create_tokenizer which causes SentencePieceTokenizer (and the
optional sentencepiece dependency) to be imported at module import time; change
__init__.py to expose create_tokenizer via a lazy wrapper that imports
create_tokenizer only when called, and modify factory.py so
SentencePieceTokenizer (and any import of sentencepiece) is imported lazily
inside create_tokenizer() only when the sentencepiece-backed branch is selected;
update references to the factory-level create_tokenizer and the class name
SentencePieceTokenizer to ensure the dynamic import path is used.

from .train import hash_tokenizer_config, train_tokenizer

__all__ = [
"train_tokenizer",
"hash_tokenizer_config",
"create_tokenizer",
]
1 change: 0 additions & 1 deletion openverifiablellm/tokenizer/bpe_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

class BPETokenizer(BaseTokenizer):
def train(self, text_file: Path, save_path: Path):

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
Expand Down
1 change: 0 additions & 1 deletion openverifiablellm/tokenizer/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@


def create_tokenizer(tokenizer_type, vocab_size, min_frequency):

tokenizer_type = tokenizer_type.lower()

if tokenizer_type == "bpe":
Expand Down
8 changes: 6 additions & 2 deletions openverifiablellm/tokenizer/sentencepiece_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import warnings
from pathlib import Path

import sentencepiece as spm
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=DeprecationWarning)
# SWIG-generated modules (like sentencepiece on python 3.11+) emit deprecation warnings
# scoping the suppression here prevents it from spamming our test output
import sentencepiece as spm

from .base import BaseTokenizer

Expand All @@ -11,7 +16,6 @@ class SentencePieceTokenizer(BaseTokenizer):
"""

def train(self, text_file: Path, save_path: Path):

model_prefix = save_path / "spm"

spm.SentencePieceTrainer.train(
Expand Down
Loading
Loading