Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 113 additions & 10 deletions openverifiablellm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ def extract_text_from_xml(input_path):
logger.info("Preprocessing complete. Output saved to %s", output_path)
generate_manifest(input_path,output_path)

# Automatically generate ZKP proofs for the processed text chunks
proofs_dir = output_dir / "proofs"
logger.info("Generating ZKP-compatible Merkle proofs for verification pipelines...")
num_proofs = export_all_merkle_proofs(output_path, proofs_dir, MERKLE_CHUNK_SIZE_BYTES)
logger.info("Successfully exported %d ZKP proofs to %s", num_proofs, proofs_dir)

# generate data manifest
def generate_manifest(raw_path, processed_path):
raw_path = Path(raw_path)
Expand Down Expand Up @@ -244,10 +250,12 @@ def export_merkle_proof(
proof: List[Tuple[str, bool]],
chunk_index: int,
chunk_size: int,
merkle_root: str,
output_path: Union[str, Path]
) -> None:
"""
Export Merkle proof to a JSON file for portable verification.
Structured into public_inputs and witness for Zero-Knowledge Proof (ZKP) compatibility.
"""

if chunk_size <= 0:
Expand All @@ -260,9 +268,14 @@ def export_merkle_proof(
raise ValueError("chunk_index must be non-negative")

data = {
"chunk_index": chunk_index,
"chunk_size": chunk_size,
"proof": proof,
"public_inputs": {
"merkle_root": merkle_root,
"chunk_index": chunk_index,
"chunk_size": chunk_size,
},
"witness": {
"sibling_hashes": proof,
}
}

output_path = Path(output_path)
Expand All @@ -280,14 +293,91 @@ def load_merkle_proof(
with proof_path.open("r", encoding="utf-8") as f:
return json.load(f)

def export_all_merkle_proofs(
    file_path: Union[str, Path],
    output_dir: Union[str, Path],
    chunk_size: int = MERKLE_CHUNK_SIZE_BYTES
) -> int:
    """
    Generate and export a Merkle inclusion proof for every chunk of a file.

    The file is read in fixed-size chunks, each chunk is hashed into a leaf,
    and a Merkle tree is built level by level (a level with an odd number of
    nodes is padded by duplicating its last node). One ZKP-compatible JSON
    proof per chunk is written to ``output_dir`` via ``export_merkle_proof``.

    Args:
        file_path: Path of the file to chunk and prove.
        output_dir: Directory that receives one ``<name>_chunk_<i>_proof.json``
            per chunk; created if it does not exist.
        chunk_size: Chunk size in bytes; must be a positive integer.

    Returns:
        The number of proofs generated. An empty file yields 0 and writes
        no proof files.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # Validate before any side effects: previously mkdir ran first, so an
    # invalid chunk_size raised but still left an empty directory behind.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    path = Path(file_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Hash each chunk into a raw-bytes leaf.
    leaves = []
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            leaves.append(bytes.fromhex(compute_sha256(data=chunk)))

    if not leaves:
        return 0

    num_leaves = len(leaves)

    # Build the tree bottom-up. tree[i] stores level i *after* padding, so
    # the sibling lookup below (index ^ 1) never falls off the level's end.
    tree = []
    current_level = list(leaves)

    while len(current_level) > 1:
        if len(current_level) % 2 == 1:
            # Duplicate the last node so every node has a sibling.
            current_level.append(current_level[-1])

        tree.append(list(current_level))

        next_level = []
        for i in range(0, len(current_level), 2):
            combined = current_level[i] + current_level[i + 1]
            next_level.append(bytes.fromhex(compute_sha256(data=combined)))

        current_level = next_level

    tree.append(current_level)  # final level: the single root node

    merkle_root = tree[-1][0].hex()
    prefix = path.name

    # For each leaf, walk its path to the root collecting
    # (sibling_hash_hex, is_left) pairs.
    for chunk_index in range(num_leaves):
        proof = []
        index = chunk_index

        for level in tree[:-1]:
            sibling_index = index ^ 1           # node sharing this parent
            sibling = level[sibling_index]
            is_left = sibling_index < index     # True when sibling is the left child
            proof.append((sibling.hex(), is_left))
            index //= 2

        # Reuse the existing single-proof exporter for a consistent format.
        export_merkle_proof(
            proof=proof,
            chunk_index=chunk_index,
            chunk_size=chunk_size,
            merkle_root=merkle_root,
            output_path=output_dir / f"{prefix}_chunk_{chunk_index}_proof.json"
        )

    return num_leaves
Comment on lines +296 to +370
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check for existing tests covering export_all_merkle_proofs
rg -n "export_all_merkle_proofs" --type py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 255


🏁 Script executed:

#!/bin/bash
# Find test files and check for tests of export_all_merkle_proofs
echo "=== Looking for test files ==="
find . -type f -name "*test*.py" | head -20

echo -e "\n=== Searching for export_all_merkle_proofs in test files ==="
rg -n "export_all_merkle_proofs" --type py tests/ 2>/dev/null || echo "No matches in tests/"

echo -e "\n=== Checking context around line 212 in utils.py ==="
sed -n '205,220p' openverifiablellm/utils.py

echo -e "\n=== Searching for extract_text_from_xml function ==="
rg -n "def extract_text_from_xml" --type py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1098


🏁 Script executed:

#!/bin/bash
# Find which function contains line 212
echo "=== Context showing function definition around line 212 ==="
sed -n '150,225p' openverifiablellm/utils.py | head -80

echo -e "\n=== Looking for function definitions near line 212 ==="
rg -n "^def " openverifiablellm/utils.py | awk -F: '$1 < 212 {prev=$0} $1 > 212 && !shown {print "Previous function:", prev; print "Next function at:", $0; shown=1}'

echo -e "\n=== Check test_util.py for test coverage ==="
wc -l ./tests/test_util.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 2942


🏁 Script executed:

#!/bin/bash
# Check for tests of extract_text_from_xml
echo "=== Searching for tests of extract_text_from_xml ==="
rg -n "extract_text_from_xml" ./tests/test_util.py

echo -e "\n=== Looking for test functions in test_util.py ==="
rg -n "^def test_" ./tests/test_util.py

echo -e "\n=== Show actual test file content to understand coverage ==="
head -50 ./tests/test_util.py

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 3058


🏁 Script executed:

#!/bin/bash
# Check the actual test_extract_text_from_xml tests
echo "=== Test implementation for extract_text_from_xml ==="
sed -n '101,155p' ./tests/test_util.py

echo -e "\n=== Checking if tests mock/patch export_all_merkle_proofs ==="
rg -n "export_all_merkle_proofs" ./tests/test_util.py

echo -e "\n=== Check for any monkeypatch or mock of export_all_merkle_proofs ==="
rg -n -B3 -A3 "monkeypatch" ./tests/test_util.py | head -60

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 3089


Add direct unit tests for export_all_merkle_proofs to verify proof generation and edge cases.

This public function currently lacks explicit test coverage. While it's indirectly exercised through extract_text_from_xml tests, those tests only verify preprocessing output, not proof generation. Direct tests should cover:

  • Empty files
  • Single chunk files
  • Odd chunk counts (tree padding logic)
  • Proof verification against the computed root

The implementation itself is correct and efficient, but explicit tests would validate the merkle tree construction and proof path extraction.

🧰 Tools
🪛 Ruff (0.15.4)

[warning] 311-311: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/utils.py` around lines 296 - 370, Add direct unit tests for
export_all_merkle_proofs that create temporary files and an output directory
(use pytest tmp_path), then assert behavior for: an empty file returns 0 and
produces no proof files; a single-chunk file produces one proof whose exported
JSON exists and whose proof reconstructs the merkle root; and a file with an odd
number of chunks exercises padding (e.g., 3 chunks) and produces correct proofs.
For verification, read each exported proof JSON (from export_merkle_proof
outputs), compute or reconstruct the merkle root by iteratively hashing leaves
using compute_sha256 and the sibling/is_left flags, and assert the reconstructed
root equals the merkle_root field in the JSON; also assert the function returns
the expected num_leaves. Reference export_all_merkle_proofs,
export_merkle_proof, and compute_sha256 to locate code under test and use
tmp_path for isolated file IO.



# Content before line 270 remains unchanged
# Entire function definition from lines 270-314 should be deleted
def verify_merkle_proof_from_file(
proof_file_path: Union[str, Path],
chunk_data: bytes,
expected_root: str
) -> bool:
"""
Verify a Merkle proof from a ZKP-compatible JSON file.
"""
proof_file_path = Path(proof_file_path)

if not proof_file_path.exists():
Expand All @@ -299,14 +389,27 @@ def verify_merkle_proof_from_file(
if not isinstance(data, dict):
raise ValueError("Malformed proof file: expected JSON object")

required_keys = {"chunk_index", "chunk_size", "proof"}
if not required_keys.issubset(data.keys()):
raise ValueError("Malformed proof file: missing required keys")
if "public_inputs" not in data or "witness" not in data:
raise ValueError("Malformed proof file: missing public_inputs or witness")

public_inputs = data["public_inputs"]
witness = data["witness"]

required_public_keys = {"merkle_root", "chunk_index", "chunk_size"}
if not isinstance(public_inputs, dict) or not required_public_keys.issubset(public_inputs.keys()):
raise ValueError("Malformed proof file: missing required keys in public_inputs")

if not isinstance(witness, dict) or "sibling_hashes" not in witness:
raise ValueError("Malformed proof file: missing sibling_hashes in witness")

proof_root = public_inputs["merkle_root"]
if proof_root != expected_root:
return False

proof = data["proof"]
proof = witness["sibling_hashes"]

if not isinstance(proof, list):
raise ValueError("Malformed proof: proof must be a list")
raise ValueError("Malformed proof: sibling_hashes must be a list")

return verify_merkle_proof(chunk_data, proof, expected_root)

Expand Down
10 changes: 9 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ def test_export_and_load_merkle_proof(tmp_path):
proof,
chunk_index=1,
chunk_size=8,
merkle_root=root,
output_path=proof_file
)

Expand All @@ -269,4 +270,11 @@ def test_export_and_load_merkle_proof(tmp_path):
proof_file_path=proof_file,
chunk_data=chunk,
expected_root=root,
)
)

# Test mismatching merkle_root rejection
assert not utils.verify_merkle_proof_from_file(
proof_file_path=proof_file,
chunk_data=chunk,
expected_root="1" * 64, # Different root
)
Loading