diff --git a/experiments/README.md b/experiments/README.md
new file mode 100644
index 0000000..cc1e741
--- /dev/null
+++ b/experiments/README.md
@@ -0,0 +1,100 @@
+## Experiments
+
+This directory contains small reproducible experiments used to validate assumptions behind the **OpenVerifiableLLM deterministic training pipeline**.
+
+The goal of these experiments is to verify that:
+
+- preprocessing produces deterministic outputs
+- dataset tampering can be detected using Merkle roots
+- small reproducible datasets can be used for testing the pipeline
+
+These experiments are **not part of the main pipeline**. They are intended for testing ideas and validating reproducibility guarantees.
+
+---
+
+## Directory Structure
+
+```text
+experiments/
+│
+├── data_subset/
+│   ├── sample_wiki_generate.py
+│   ├── sample_wiki.xml.bz2
+│   └── tampered_sample_wiki.xml.bz2
+│
+├── preprocessing_determinism/
+│   └── test_preprocessing.py
+│
+├── merkle_verification/
+│   └── test_merkle.py
+│
+└── README.md
+```
+
+---
+
+## Included Experiments
+
+### 1. Preprocessing Determinism
+
+Verifies that running the preprocessing pipeline multiple times on the same dataset produces identical outputs.
+
+The experiment compares:
+
+- `processed_sha256`
+- `processed_merkle_root`
+- `environment_hash`
+
+If these values match across runs, the preprocessing step is deterministic.
+
+Run:
+
+```bash
+python -m experiments.preprocessing_determinism.test_preprocessing experiments/data_subset/sample_wiki.xml.bz2
+```
+
+**Expected output:**
+
+```text
+Run 1 hash: ...
+Run 2 hash: ...
+
+Deterministic preprocessing confirmed🎉
+```
+
+### 2. Merkle Root Tamper Detection
+
+Tests whether dataset tampering is detected by comparing Merkle roots.
+
+Two datasets are used:
+
+- `sample_wiki.xml.bz2` (original)
+- `tampered_sample_wiki.xml.bz2` (modified)
+
+The experiment compares:
+
+- `raw_merkle_root`
+- `processed_merkle_root`
+
+If either root differs, the tampering is successfully detected.
+
+Run:
+
+```bash
+python -m experiments.merkle_verification.test_merkle --path1 experiments/data_subset/sample_wiki.xml.bz2 --path2 experiments/data_subset/tampered_sample_wiki.xml.bz2
+```
+
+**Expected output:**
+
+```text
+Run 1 RAW Merkle root: ...
+Run 2 RAW Merkle root: ...
+
+Tampering detected 🎉 (Merkle roots differ)
+```
+
+### 3. Dataset Subset
+
+The `data_subset` directory contains a minimal Wikipedia XML example used for quick experimentation without downloading full dumps.
+
+This allows experiments to run quickly while still exercising the preprocessing pipeline.
diff --git a/experiments/data_subset/sample_wiki_generate.py b/experiments/data_subset/sample_wiki_generate.py
new file mode 100644
index 0000000..89cc32b
--- /dev/null
+++ b/experiments/data_subset/sample_wiki_generate.py
@@ -0,0 +1,30 @@
+"""Generate the tampered sample dataset used by the Merkle tamper-detection experiment."""
+
+import bz2
+from pathlib import Path
+
+# Tampered variant of the sample: the final "e" of "online" is deliberately
+# dropped ("onlin") so the dataset differs from the original sample.
+# NOTE(review): the element tags below are reconstructed from the standard
+# MediaWiki export nesting (mediawiki/page/revision/text); confirm them against
+# the committed sample before regenerating the archive.
+xml_content = """
+<mediawiki>
+  <page>
+    <revision>
+      <text>
+        Hello citation world.
+        This is [[Python|programming language]]
+        {{Wikipedia }}is a free onlin encyclopedia.
+      </text>
+    </revision>
+  </page>
+</mediawiki>
+"""
+
+# Write next to this script so the output location does not depend on the
+# current working directory.
+output_path = Path(__file__).resolve().parent / "tampered_sample_wiki.xml.bz2"
+
+with bz2.open(output_path, "wt", encoding="utf-8") as f:
+    f.write(xml_content)
diff --git a/experiments/merkle_verification/test_merkle.py b/experiments/merkle_verification/test_merkle.py
new file mode 100644
index 0000000..fa35092
--- /dev/null
+++ b/experiments/merkle_verification/test_merkle.py
@@ -0,0 +1,78 @@
+"""
+Experiment: Tamper Detection via Merkle Root Comparison
+
+Run with:
+python -m experiments.merkle_verification.test_merkle --path1 experiments/data_subset/sample_wiki.xml.bz2 --path2 experiments/data_subset/tampered_sample_wiki.xml.bz2
+"""
+
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+
+from openverifiablellm.utils import extract_text_from_xml
+
+logger = logging.getLogger(__name__)
+
+MANIFEST_PATH = Path("data/dataset_manifest.json")
+
+
+def run(path1):
+    """Preprocess one dataset and return its raw and processed Merkle roots.
+
+    Args:
+        path1: Path of the dataset passed to ``extract_text_from_xml``.
+
+    Returns:
+        dict with the ``raw_merkle_root`` and ``processed_merkle_root``
+        values read from the manifest at ``MANIFEST_PATH``.
+
+    Raises:
+        OSError: if the manifest cannot be opened.
+        KeyError: if an expected field is missing from the manifest.
+    """
+    extract_text_from_xml(path1)
+
+    # NOTE(review): presumably extract_text_from_xml rewrites the manifest on
+    # every call; otherwise a stale file would be read here - confirm.
+    with MANIFEST_PATH.open(encoding="utf-8") as f:
+        manifest = json.load(f)
+
+    return {
+        "raw_merkle_root": manifest["raw_merkle_root"],
+        "processed_merkle_root": manifest["processed_merkle_root"],
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Test tamper detection using Merkle root"
+    )
+    parser.add_argument("--path1", required=True, help="Original dataset")
+    parser.add_argument("--path2", required=True, help="Tampered dataset")
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s",
+    )
+
+    root1 = run(args.path1)
+    root2 = run(args.path2)
+
+    print(f"\nRun 1 RAW Merkle root: {root1['raw_merkle_root']}")
+    print(f"Run 2 RAW Merkle root: {root2['raw_merkle_root']}")
+
+    print(f"\nRun 1 processed Merkle root: {root1['processed_merkle_root']}")
+    print(f"Run 2 processed Merkle root: {root2['processed_merkle_root']}")
+
+    if (
+        root1["raw_merkle_root"] != root2["raw_merkle_root"]
+        or root1["processed_merkle_root"] != root2["processed_merkle_root"]
+    ):
+        print("\nTampering detected 🎉 (Merkle roots differ)")
+    else:
+        print("\nUnexpected result ❌ (Merkle roots identical)")
+        # Signal the failed experiment to shells and CI via the exit status.
+        sys.exit(1)
diff --git a/experiments/preprocessing_determinism/test_preprocessing.py b/experiments/preprocessing_determinism/test_preprocessing.py
new file mode 100644
index 0000000..743a23b
--- /dev/null
+++ b/experiments/preprocessing_determinism/test_preprocessing.py
@@ -0,0 +1,73 @@
+"""
+Experiment to test deterministic preprocessing by comparing the hashes generated on 2 runs.
+
+Run with:
+    python -m experiments.preprocessing_determinism.test_preprocessing experiments/data_subset/sample_wiki.xml.bz2
+"""
+
+import json
+import logging
+import sys
+from pathlib import Path
+
+from openverifiablellm.utils import extract_text_from_xml
+
+logger = logging.getLogger(__name__)
+
+MANIFEST_PATH = Path("data/dataset_manifest.json")
+
+
+def run(input_path):
+    """Preprocess one dataset and return the manifest fields compared across runs.
+
+    Args:
+        input_path: Path of the dataset passed to ``extract_text_from_xml``.
+
+    Returns:
+        dict with the ``processed_sha256``, ``processed_merkle_root`` and
+        ``environment_hash`` values read from the manifest at ``MANIFEST_PATH``.
+
+    Raises:
+        OSError: if the manifest cannot be opened.
+        KeyError: if an expected field is missing from the manifest.
+    """
+    extract_text_from_xml(input_path)
+
+    # NOTE(review): presumably extract_text_from_xml rewrites the manifest on
+    # every call; otherwise both runs would read the same stale file - confirm.
+    with MANIFEST_PATH.open(encoding="utf-8") as f:
+        manifest = json.load(f)
+
+    return {
+        "processed_sha256": manifest["processed_sha256"],
+        "processed_merkle_root": manifest["processed_merkle_root"],
+        "environment_hash": manifest["environment_hash"],
+    }
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python -m experiments.preprocessing_determinism.test_preprocessing <input_path>")
+        sys.exit(1)
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s",
+    )
+
+    result1 = run(sys.argv[1])
+    result2 = run(sys.argv[1])
+
+    print(f"\nRun 1 hash: {result1['processed_sha256']}")
+    print(f"Run 2 hash: {result2['processed_sha256']}")
+
+    if (
+        result1["processed_sha256"] == result2["processed_sha256"]
+        and result1["processed_merkle_root"] == result2["processed_merkle_root"]
+        and result1["environment_hash"] == result2["environment_hash"]
+    ):
+        print("\nDeterministic preprocessing confirmed🎉")
+    else:
+        print("Hash didn't match❌")
+        # Signal the failed experiment to shells and CI via the exit status.
+        sys.exit(1)