diff --git a/Cargo.lock b/Cargo.lock index 260a3c1..0dfbec3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,24 +2,71 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "cc" +version = "1.2.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +dependencies = [ + "find-msvc-tools", + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "indoc" version = "2.0.7" @@ -29,12 +76,24 @@ dependencies = [ "rustversion", ] +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + [[package]] name = "libc" version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + [[package]] name = "memoffset" version = "0.9.1" @@ -143,14 +202,100 @@ version = "0.1.8" dependencies = [ "pyo3", "thiserror", + "tree-sitter", + "tree-sitter-bash", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", ] +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + [[package]] name = "syn" version = "2.0.117" @@ -188,6 +333,36 @@ dependencies = [ "syn", ] +[[package]] +name = "tree-sitter" +version = "0.26.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a6592b1aec0109df37b6bafea77eb4e61466e37b0a5a98bef4f89bfb81b7a2" +dependencies = [ + "cc", + "regex", + "regex-syntax", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-bash" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5ec769279cc91b561d3df0d8a5deb26b0ad40d183127f409494d6d8fc53062" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -199,3 +374,9 @@ name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index f600244..890c041 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,10 @@ python = ["pyo3"] name = "rable" crate-type = ["lib", "cdylib"] +[dev-dependencies] +tree-sitter = "0.26" +tree-sitter-bash = "0.25" + [lints.clippy] unwrap_used = "deny" expect_used = "deny" diff --git a/justfile b/justfile index 7b39fc8..4839c74 100644 --- a/justfile +++ b/justfile @@ -89,6 +89,10 @@ fuzz-minimize input: develop _install-parable fuzz-generate-tests: develop .venv/bin/python3 tests/generate_oracle_tests.py +# Compare rable vs tree-sitter-bash accuracy (VERBOSE=1 for details) +compare-tree-sitter: + cargo test compare_parsers -- --nocapture + # Run the oracle test suite (aspirational — does not fail build) test-oracle: cargo test oracle_test_suite -- --nocapture diff --git a/tests/compare_tree_sitter.rs b/tests/compare_tree_sitter.rs new file mode 100644 index 0000000..ca1224d --- /dev/null +++ b/tests/compare_tree_sitter.rs @@ -0,0 +1,377 @@ +use std::fs; +use std::path::Path; + +/// A single test case parsed from a `.tests` file. +#[derive(Debug)] +struct TestCase { + name: String, + input: String, + expected: String, + extglob: bool, +} + +fn parse_test_file(content: &str) -> Vec { + let mut cases = Vec::new(); + let mut extglob = false; + let mut lines = content.lines(); + + while let Some(line) = lines.next() { + if line.starts_with("# @extglob") { + extglob = true; + continue; + } + if let Some(name) = line.strip_prefix("=== ") { + let name = name.trim().to_string(); + let mut input_lines = Vec::new(); + for line in lines.by_ref() { + if line == "---" { + break; + } + input_lines.push(line); + } + let mut expected_lines = Vec::new(); + for line in lines.by_ref() { + if line == "---" { + break; + } + expected_lines.push(line); + } + let mut test_extglob = extglob; + let filtered_input: Vec<_> = input_lines + .iter() + .filter(|l| { + if l.starts_with("# @extglob") { + test_extglob = true; + false + } else { + true + } + }) + .copied() + .collect(); + cases.push(TestCase { + name, + input: filtered_input.join("\n"), + expected: expected_lines.join("\n"), + extglob: test_extglob, + }); + } + } + cases +} + +fn normalize_whitespace(s: &str) -> String { + s.split_whitespace().collect::>().join(" ") +} + +/// Check if a tree-sitter node or any descendant is an ERROR or MISSING node. +fn has_error(node: &tree_sitter::Node) -> bool { + if node.is_error() || node.is_missing() { + return true; + } + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if has_error(&child) { + return true; + } + } + false +} + +/// Results for a single parser on a single test case. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ParseResult { + /// Parsed successfully (for rable: matches expected output) + Pass, + /// Parsed but output doesn't match expected (rable only) + WrongOutput, + /// Parse error / ERROR nodes present + ParseError, + /// Test expects an error and parser produced one + ExpectedError, +} + +fn run_rable(case: &TestCase) -> ParseResult { + match rable::parse(&case.input, case.extglob) { + Ok(nodes) => { + let actual = nodes + .iter() + .map(|n| format!("{n}")) + .filter(|s| !s.is_empty()) + .collect::>() + .join(" "); + + if case.expected == "" { + return ParseResult::WrongOutput; // expected error but got success + } + + if actual.trim() == case.expected.trim() + || normalize_whitespace(&actual) == normalize_whitespace(&case.expected) + { + ParseResult::Pass + } else { + ParseResult::WrongOutput + } + } + Err(_) => { + if case.expected == "" { + ParseResult::ExpectedError + } else { + ParseResult::ParseError + } + } + } +} + +fn run_tree_sitter(case: &TestCase, parser: &mut tree_sitter::Parser) -> ParseResult { + match parser.parse(&case.input, None) { + Some(tree) => { + let root = tree.root_node(); + if case.expected == "" { + // For error-expected cases: tree-sitter "passes" if it also has errors + if has_error(&root) { + return ParseResult::ExpectedError; + } + return ParseResult::WrongOutput; // parsed successfully but should have errored + } + if has_error(&root) { + ParseResult::ParseError + } else { + ParseResult::Pass + } + } + None => { + if case.expected == "" { + ParseResult::ExpectedError + } else { + ParseResult::ParseError + } + } + } +} + +struct ComparisonStats { + file_name: String, + total: usize, + rable_pass: usize, + ts_pass: usize, + both_pass: usize, + rable_only: usize, + ts_only: usize, + both_fail: usize, + error_cases: usize, + details: Vec, +} + +fn compare_file(path: &Path, parser: &mut tree_sitter::Parser, verbose: bool) -> ComparisonStats { + let file_name = path + .file_name() + .unwrap_or_default() + .to_string_lossy() + .to_string(); + let content = fs::read_to_string(path).unwrap_or_default(); + let cases = parse_test_file(&content); + + let mut stats = ComparisonStats { + file_name, + total: 0, + rable_pass: 0, + ts_pass: 0, + both_pass: 0, + rable_only: 0, + ts_only: 0, + both_fail: 0, + error_cases: 0, + details: Vec::new(), + }; + + for case in &cases { + if case.expected == "" { + stats.error_cases += 1; + continue; // skip error cases — not meaningful for accuracy comparison + } + + stats.total += 1; + let rable_result = run_rable(case); + let ts_result = run_tree_sitter(case, parser); + + let rable_ok = rable_result == ParseResult::Pass; + let ts_ok = ts_result == ParseResult::Pass; + + if rable_ok { + stats.rable_pass += 1; + } + if ts_ok { + stats.ts_pass += 1; + } + + match (rable_ok, ts_ok) { + (true, true) => stats.both_pass += 1, + (true, false) => { + stats.rable_only += 1; + if verbose { + stats.details.push(format!( + " RABLE-ONLY :: {} | input: {:?}", + case.name, case.input, + )); + } + } + (false, true) => { + stats.ts_only += 1; + if verbose { + stats.details.push(format!( + " TS-ONLY :: {} | input: {:?}", + case.name, case.input, + )); + } + } + (false, false) => stats.both_fail += 1, + } + } + + stats +} + +fn make_parser() -> Result { + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&tree_sitter_bash::LANGUAGE.into())?; + Ok(parser) +} + +fn collect_test_files(dir: &Path) -> Vec { + if !dir.exists() { + return Vec::new(); + } + let Ok(read_dir) = fs::read_dir(dir) else { + return Vec::new(); + }; + let mut entries: Vec<_> = read_dir + .filter_map(Result::ok) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "tests")) + .map(|e| e.path()) + .collect(); + entries.sort(); + entries +} + +#[allow(clippy::cast_precision_loss)] +fn pct(num: usize, den: usize) -> f64 { + if den > 0 { + (num as f64) / (den as f64) * 100.0 + } else { + 0.0 + } +} + +struct GrandTotals { + total: usize, + rable: usize, + ts: usize, + both: usize, + rable_only: usize, + ts_only: usize, + both_fail: usize, + errors: usize, +} + +impl GrandTotals { + const fn new() -> Self { + Self { + total: 0, + rable: 0, + ts: 0, + both: 0, + rable_only: 0, + ts_only: 0, + both_fail: 0, + errors: 0, + } + } + + const fn add(&mut self, stats: &ComparisonStats) { + self.total += stats.total; + self.rable += stats.rable_pass; + self.ts += stats.ts_pass; + self.both += stats.both_pass; + self.rable_only += stats.rable_only; + self.ts_only += stats.ts_only; + self.both_fail += stats.both_fail; + self.errors += stats.error_cases; + } + + fn print_summary(&self) { + eprintln!("{}", "=".repeat(95)); + eprintln!( + "{:<45} {:>5} {:>7} {:>7} {:>7} {:>7} {:>7}", + "TOTAL", self.total, self.rable, self.ts, self.both, self.rable_only, self.ts_only, + ); + eprintln!(); + eprintln!( + "Rable accuracy: {}/{} ({:.1}%)", + self.rable, + self.total, + pct(self.rable, self.total) + ); + eprintln!( + "Tree-sitter accuracy: {}/{} ({:.1}%)", + self.ts, + self.total, + pct(self.ts, self.total) + ); + eprintln!("Both pass: {}", self.both); + eprintln!("Rable-only pass: {}", self.rable_only); + eprintln!("Tree-sitter-only pass: {}", self.ts_only); + eprintln!("Both fail: {}", self.both_fail); + eprintln!("Skipped (error cases): {}", self.errors); + eprintln!(); + eprintln!("NOTE: Rable 'pass' = exact S-expression match with expected output."); + eprintln!(" Tree-sitter 'pass' = parsed without ERROR/MISSING nodes."); + eprintln!( + " Tree-sitter's bar is lower — it only checks parsability, not correctness." + ); + } +} + +#[test] +fn compare_parsers() { + let verbose = std::env::var("VERBOSE").is_ok(); + let Ok(mut parser) = make_parser() else { + return; + }; + + let base = Path::new(env!("CARGO_MANIFEST_DIR")); + let mut all_files = collect_test_files(&base.join("tests/parable")); + all_files.extend(collect_test_files(&base.join("tests/oracle"))); + + let mut totals = GrandTotals::new(); + + eprintln!(); + eprintln!( + "{:<45} {:>5} {:>7} {:>7} {:>7} {:>7} {:>7}", + "File", "Total", "Rable", "TS", "Both", "R-only", "TS-only", + ); + eprintln!("{}", "-".repeat(95)); + + for path in &all_files { + let stats = compare_file(path, &mut parser, verbose); + if stats.total == 0 { + continue; + } + eprintln!( + "{:<45} {:>5} {:>7} {:>7} {:>7} {:>7} {:>7}", + stats.file_name, + stats.total, + stats.rable_pass, + stats.ts_pass, + stats.both_pass, + stats.rable_only, + stats.ts_only, + ); + for detail in &stats.details { + eprintln!("{detail}"); + } + totals.add(&stats); + } + + totals.print_summary(); +}