From 9163d24ecf5472da8238324ef6f7ff3a55bcf3ba Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Wed, 25 Mar 2026 12:22:02 +0100 Subject: [PATCH 1/5] feat: enrich AST with structured word spans and assignment detection Move expansion boundary tracking into the lexer to eliminate ~300 lines of duplicate re-parsing in the sexp formatting layer. Key changes: - **WordBuilder**: New lexer type that bundles word value string with expansion spans and quoting context. All 14 lexer word-reading functions converted from `&mut String` to `&mut WordBuilder`. - **WordSpan tracking**: Lexer records spans for all 14 expansion types (CommandSub, AnsiCQuote, LocaleString, ProcessSub, SingleQuoted, DoubleQuoted, ParamExpansion, SimpleVar, ArithmeticSub, Backtick, BracketSubscript, Extglob, DeprecatedArith, Escape) with quoting context (None, DoubleQuote, ParamExpansion, CommandSub, Backtick). - **segments_from_spans**: New span-based segment extraction replaces `parse_word_segments` re-parsing for words with spans. Filters to sexp-relevant span kinds, handles nested spans via top-level collection, and uses quoting context for correct ANSI-C behavior. - **Assignment detection**: Lexer emits AssignmentWord tokens for `NAME=`, `NAME+=`, `NAME[...]=` patterns. Parser populates `Command.assignments` field. - **Word.parts**: Populated via `decompose_word()` with structured AST nodes (WordLiteral, CommandSubstitution, ProcessSubstitution, AnsiCQuote, LocaleString). - **Array parsing**: New `read_array_elements`/`read_array_element` in lexer parses array content as individual words at lex time, replacing post-hoc `normalize_array_content` string manipulation. - **Dead code removed**: `try_normalize_array`, `normalize_array_content`, `read_balanced_delim`, `should_format_from_value`, `parts_to_segments`. All 138 tests pass (95 unit + 37 integration + 6 doc). Oracle: 165/181 (unchanged from baseline). S-expression output identical (Parable compatibility preserved). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ast.rs | 10 +- src/context.rs | 41 ------ src/lexer/expansions.rs | 197 +++++++++++++++----------- src/lexer/mod.rs | 1 + src/lexer/quotes.rs | 42 +++--- src/lexer/tests.rs | 217 ++++++++++++++++++++++++++++ src/lexer/word_builder.rs | 156 +++++++++++++++++++++ src/lexer/words.rs | 287 ++++++++++++++++++++++++++++---------- src/parser/compound.rs | 6 +- src/parser/helpers.rs | 10 +- src/parser/mod.rs | 5 +- src/parser/tests.rs | 62 +++++++- src/parser/word_parts.rs | 210 ++++++++++++++++++++++++++++ src/sexp/mod.rs | 10 +- src/sexp/word.rs | 232 ++++++++++++++++-------------- src/token.rs | 25 ++++ tests/integration.rs | 10 +- 17 files changed, 1184 insertions(+), 337 deletions(-) create mode 100644 src/lexer/word_builder.rs create mode 100644 src/parser/word_parts.rs diff --git a/src/ast.rs b/src/ast.rs index 42c7c15..9f78c17 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -72,7 +72,15 @@ impl Node { #[allow(clippy::use_self)] pub enum NodeKind { /// A word token, possibly containing expansion parts. - Word { value: String, parts: Vec }, + Word { + value: String, + parts: Vec, + #[allow(dead_code)] + spans: Vec, + }, + + /// A literal text segment within a word's parts list. + WordLiteral { value: String }, /// A simple command: assignments, words, and redirects. Command { diff --git a/src/context.rs b/src/context.rs index 26398f8..b8bd94c 100644 --- a/src/context.rs +++ b/src/context.rs @@ -115,47 +115,6 @@ pub fn skip_backtick(chars: &[char], pos: &mut usize, out: &mut String) { } } -/// Reads balanced delimiters (parens, braces, etc.) with quote awareness. -/// -/// Starting at depth 1, reads until depth reaches 0. Handles single quotes, -/// double quotes, backticks, and backslash escapes inside. -pub fn read_balanced_delim( - chars: &[char], - pos: &mut usize, - open: char, - close: char, - out: &mut String, -) { - let mut depth = 1; - while *pos < chars.len() && depth > 0 { - match chars[*pos] { - c if c == open => { - depth += 1; - out.push(c); - *pos += 1; - } - c if c == close => { - depth -= 1; - out.push(c); - *pos += 1; - } - '\'' => skip_single_quoted(chars, pos, out), - '"' => skip_double_quoted(chars, pos, out), - '`' => skip_backtick(chars, pos, out), - '\\' if *pos + 1 < chars.len() => { - out.push(chars[*pos]); - *pos += 1; - out.push(chars[*pos]); - *pos += 1; - } - c => { - out.push(c); - *pos += 1; - } - } - } -} - /// Counts consecutive backslashes before a position to determine /// if a character is escaped. /// diff --git a/src/lexer/expansions.rs b/src/lexer/expansions.rs index 2f0f5c4..96761da 100644 --- a/src/lexer/expansions.rs +++ b/src/lexer/expansions.rs @@ -2,95 +2,114 @@ use crate::context::CaseTracker; use crate::error::{RableError, Result}; use super::Lexer; +use super::word_builder::{QuotingContext, WordBuilder, WordSpanKind}; impl Lexer { /// Reads a dollar expansion into the value string. - pub(super) fn read_dollar(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_dollar(&mut self, wb: &mut WordBuilder) -> Result<()> { + let span_start = wb.span_start(); self.advance_char(); // consume '$' - value.push('$'); + wb.push('$'); match self.peek_char() { Some('(') => { self.advance_char(); - value.push('('); + wb.push('('); if self.peek_char() == Some('(') { - // $(( ... )) arithmetic expansion self.advance_char(); - value.push('('); - self.read_matched_parens(value, 2)?; + wb.push('('); + self.read_matched_parens(wb, 2)?; + wb.record(span_start, WordSpanKind::ArithmeticSub); } else { - // $( ... ) command substitution - let content_start = value.len(); - self.read_matched_parens(value, 1)?; - // Validate content is parseable bash - let content_end = value.len().saturating_sub(1); - if content_start < content_end { - let content: String = value[content_start..content_end].to_string(); - if !content.trim().is_empty() - && crate::parse(&content, self.extglob()).is_err() - { - return Err(RableError::parse( - "invalid command substitution", - self.pos, - self.line, - )); - } - } + self.read_command_sub(wb)?; + wb.record(span_start, WordSpanKind::CommandSub); } } Some('{') => { self.advance_char(); - value.push('{'); - // Parameter expansion: don't count unquoted { as depth increase - self.read_param_expansion_braces(value)?; + wb.push('{'); + self.read_param_expansion_braces(wb)?; + wb.record(span_start, WordSpanKind::ParamExpansion); } Some('\'') => { - // $'...' ANSI-C quoting — handles \ escapes unlike regular '' self.advance_char(); - value.push('\''); - self.read_ansi_c_quoted(value)?; + wb.push('\''); + self.read_ansi_c_quoted(wb)?; + wb.record(span_start, WordSpanKind::AnsiCQuote); } Some('"') => { - // $"..." locale string self.advance_char(); - value.push('"'); - self.read_double_quoted(value)?; + wb.push('"'); + self.read_double_quoted(wb)?; + wb.record(span_start, WordSpanKind::LocaleString); } Some('[') => { - // $[...] deprecated arithmetic self.advance_char(); - value.push('['); - self.read_until_char(value, ']')?; + wb.push('['); + self.read_until_char(wb, ']')?; + wb.record(span_start, WordSpanKind::DeprecatedArith); + } + Some(c) if is_dollar_start(c) => { + self.read_variable_name(wb, c); + wb.record(span_start, WordSpanKind::SimpleVar); + } + _ => {} // Bare $ — no span + } + Ok(()) + } + + /// Reads `$(...)` command substitution content and validates it. + fn read_command_sub(&mut self, wb: &mut WordBuilder) -> Result<()> { + let content_start = wb.len(); + self.read_matched_parens(wb, 1)?; + let content_end = wb.len().saturating_sub(1); + if content_start < content_end { + let content: String = wb.value[content_start..content_end].to_string(); + if !content.trim().is_empty() && crate::parse(&content, self.extglob()).is_err() { + return Err(RableError::parse( + "invalid command substitution", + self.pos, + self.line, + )); } - Some(c) if is_dollar_start(c) => self.read_variable_name(value, c), - _ => {} // Bare $ at end of word — just leave it } Ok(()) } /// Reads a variable name after `$` for simple expansions. - pub(super) fn read_variable_name(&mut self, value: &mut String, first: char) { + pub(super) fn read_variable_name(&mut self, wb: &mut WordBuilder, first: char) { if first.is_ascii_alphabetic() || first == '_' { while let Some(nc) = self.peek_char() { if nc.is_ascii_alphanumeric() || nc == '_' { self.advance_char(); - value.push(nc); + wb.push(nc); } else { break; } } } else { self.advance_char(); - value.push(first); + wb.push(first); } } /// Reads until matching closing parentheses. #[allow(clippy::too_many_lines)] - #[allow(clippy::too_many_lines)] pub(super) fn read_matched_parens( &mut self, - value: &mut String, + wb: &mut WordBuilder, + close_count: usize, + ) -> Result<()> { + wb.enter_context(QuotingContext::CommandSub); + let result = self.read_matched_parens_inner(wb, close_count); + wb.leave_context(); + result + } + + #[allow(clippy::too_many_lines)] + fn read_matched_parens_inner( + &mut self, + wb: &mut WordBuilder, close_count: usize, ) -> Result<()> { let mut depth = close_count; @@ -105,7 +124,7 @@ impl Lexer { word_buf.clear(); self.advance_char(); - value.push(')'); + wb.push(')'); if case.is_pattern_close() { // This `)` terminates a case pattern — don't decrement depth case.close_pattern(); @@ -120,7 +139,7 @@ impl Lexer { case.check_word(&word_buf); word_buf.clear(); self.advance_char(); - value.push('('); + wb.push('('); // In case pattern mode, `(` is optional pattern prefix — don't increment if !case.is_pattern_open() { depth += 1; @@ -130,15 +149,15 @@ impl Lexer { case.check_word(&word_buf); word_buf.clear(); self.advance_char(); - value.push('\''); - self.read_single_quoted(value)?; + wb.push('\''); + self.read_single_quoted(wb)?; } Some('"') => { case.check_word(&word_buf); word_buf.clear(); self.advance_char(); - value.push('"'); - self.read_double_quoted(value)?; + wb.push('"'); + self.read_double_quoted(wb)?; } Some('\\') => { word_buf.clear(); @@ -146,27 +165,27 @@ impl Lexer { if self.peek_char() == Some('\n') { self.advance_char(); // line continuation } else { - value.push('\\'); + wb.push('\\'); if let Some(c) = self.advance_char() { - value.push(c); + wb.push(c); } } } Some('$') => { word_buf.clear(); - self.read_dollar(value)?; + self.read_dollar(wb)?; } Some('`') => { word_buf.clear(); self.advance_char(); - value.push('`'); - self.read_backtick(value)?; + wb.push('`'); + self.read_backtick(wb)?; } Some('#') => { case.check_word(&word_buf); word_buf.clear(); // # is a comment when preceded by whitespace/newline - let prev = value.chars().last().unwrap_or('\n'); + let prev = wb.value.chars().last().unwrap_or('\n'); if prev == '\n' || prev == ' ' || prev == '\t' { // Comment — skip to end of line while let Some(c) = self.peek_char() { @@ -177,26 +196,26 @@ impl Lexer { } } else { self.advance_char(); - value.push('#'); + wb.push('#'); } } Some(';') => { case.check_word(&word_buf); word_buf.clear(); self.advance_char(); - value.push(';'); + wb.push(';'); // Check for ;; ;& ;;& which resume case pattern mode if self.peek_char() == Some(';') { self.advance_char(); - value.push(';'); + wb.push(';'); if self.peek_char() == Some('&') { self.advance_char(); - value.push('&'); + wb.push('&'); } case.resume_pattern(); } else if self.peek_char() == Some('&') { self.advance_char(); - value.push('&'); + wb.push('&'); case.resume_pattern(); } } @@ -204,18 +223,18 @@ impl Lexer { case.check_word(&word_buf); word_buf.clear(); let c = self.advance_char().unwrap_or(' '); - value.push(c); + wb.push(c); } Some('|') => { case.check_word(&word_buf); word_buf.clear(); self.advance_char(); - value.push('|'); + wb.push('|'); } Some(c) => { word_buf.push(c); self.advance_char(); - value.push(c); + wb.push(c); } None => { return Err(RableError::matched_pair( @@ -229,46 +248,53 @@ impl Lexer { } /// Reads a parameter expansion `${...}` allowing unbalanced inner `{`. - pub(super) fn read_param_expansion_braces(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_param_expansion_braces(&mut self, wb: &mut WordBuilder) -> Result<()> { + wb.enter_context(QuotingContext::ParamExpansion); + let result = self.read_param_expansion_inner(wb); + wb.leave_context(); + result + } + + fn read_param_expansion_inner(&mut self, wb: &mut WordBuilder) -> Result<()> { loop { match self.peek_char() { Some('}') => { self.advance_char(); - value.push('}'); + wb.push('}'); return Ok(()); } Some('\'') => { self.advance_char(); - value.push('\''); - self.read_single_quoted(value)?; + wb.push('\''); + self.read_single_quoted(wb)?; } Some('"') => { self.advance_char(); - value.push('"'); - self.read_double_quoted(value)?; + wb.push('"'); + self.read_double_quoted(wb)?; } Some('\\') => { self.advance_char(); if self.peek_char() == Some('\n') { self.advance_char(); // line continuation } else { - value.push('\\'); + wb.push('\\'); if let Some(c) = self.advance_char() { - value.push(c); + wb.push(c); } } } Some('$') => { - self.read_dollar(value)?; + self.read_dollar(wb)?; } Some('`') => { self.advance_char(); - value.push('`'); - self.read_backtick(value)?; + wb.push('`'); + self.read_backtick(wb)?; } Some(c) => { self.advance_char(); - value.push(c); + wb.push(c); } None => { return Err(RableError::matched_pair( @@ -282,24 +308,31 @@ impl Lexer { } /// Reads a backtick command substitution. - pub(super) fn read_backtick(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_backtick(&mut self, wb: &mut WordBuilder) -> Result<()> { + wb.enter_context(QuotingContext::Backtick); + let result = self.read_backtick_inner(wb); + wb.leave_context(); + result + } + + fn read_backtick_inner(&mut self, wb: &mut WordBuilder) -> Result<()> { loop { match self.peek_char() { Some('`') => { self.advance_char(); - value.push('`'); + wb.push('`'); return Ok(()); } Some('\\') => { self.advance_char(); - value.push('\\'); + wb.push('\\'); if let Some(c) = self.advance_char() { - value.push(c); + wb.push(c); } } Some(c) => { self.advance_char(); - value.push(c); + wb.push(c); } None => { return Err(RableError::matched_pair( @@ -313,14 +346,14 @@ impl Lexer { } /// Reads until the given closing character. - pub(super) fn read_until_char(&mut self, value: &mut String, close: char) -> Result<()> { + pub(super) fn read_until_char(&mut self, wb: &mut WordBuilder, close: char) -> Result<()> { loop { match self.advance_char() { Some(c) if c == close => { - value.push(c); + wb.push(c); return Ok(()); } - Some(c) => value.push(c), + Some(c) => wb.push(c), None => { return Err(RableError::matched_pair( format!("unterminated '{close}'"), diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 346b99e..3334d61 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -5,6 +5,7 @@ mod expansions; mod heredoc; mod operators; mod quotes; +pub mod word_builder; mod words; #[cfg(test)] diff --git a/src/lexer/quotes.rs b/src/lexer/quotes.rs index 42a4a02..6adf639 100644 --- a/src/lexer/quotes.rs +++ b/src/lexer/quotes.rs @@ -1,17 +1,18 @@ use crate::error::{RableError, Result}; use super::Lexer; +use super::word_builder::{QuotingContext, WordBuilder}; impl Lexer { /// Reads contents of a single-quoted string (after the opening `'`). - pub(super) fn read_single_quoted(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_single_quoted(&mut self, wb: &mut WordBuilder) -> Result<()> { loop { match self.advance_char() { Some('\'') => { - value.push('\''); + wb.push('\''); return Ok(()); } - Some(c) => value.push(c), + Some(c) => wb.push(c), None => { return Err(RableError::matched_pair( "unterminated single quote", @@ -25,24 +26,24 @@ impl Lexer { /// Reads contents of an ANSI-C quoted string `$'...'` (after the opening `'`). /// Unlike regular single quotes, `\` is an escape character here. - pub(super) fn read_ansi_c_quoted(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_ansi_c_quoted(&mut self, wb: &mut WordBuilder) -> Result<()> { loop { match self.peek_char() { Some('\'') => { self.advance_char(); - value.push('\''); + wb.push('\''); return Ok(()); } Some('\\') => { self.advance_char(); - value.push('\\'); + wb.push('\\'); if let Some(next) = self.advance_char() { - value.push(next); + wb.push(next); } } Some(c) => { self.advance_char(); - value.push(c); + wb.push(c); } None => { return Err(RableError::matched_pair( @@ -56,12 +57,19 @@ impl Lexer { } /// Reads contents of a double-quoted string (after the opening `"`). - pub(super) fn read_double_quoted(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_double_quoted(&mut self, wb: &mut WordBuilder) -> Result<()> { + wb.enter_context(QuotingContext::DoubleQuote); + let result = self.read_double_quoted_inner(wb); + wb.leave_context(); + result + } + + fn read_double_quoted_inner(&mut self, wb: &mut WordBuilder) -> Result<()> { loop { match self.peek_char() { Some('"') => { self.advance_char(); - value.push('"'); + wb.push('"'); return Ok(()); } Some('\\') => { @@ -70,9 +78,9 @@ impl Lexer { if self.peek_char() == Some('\n') { self.advance_char(); } else { - value.push('\\'); + wb.push('\\'); if let Some(next) = self.advance_char() { - value.push(next); + wb.push(next); } } } @@ -81,19 +89,19 @@ impl Lexer { // bare $ followed by closing ". Don't let read_dollar consume it. if self.input.get(self.pos + 1) == Some(&'"') { self.advance_char(); - value.push('$'); + wb.push('$'); } else { - self.read_dollar(value)?; + self.read_dollar(wb)?; } } Some('`') => { self.advance_char(); - value.push('`'); - self.read_backtick(value)?; + wb.push('`'); + self.read_backtick(wb)?; } Some(c) => { self.advance_char(); - value.push(c); + wb.push(c); } None => { return Err(RableError::matched_pair( diff --git a/src/lexer/tests.rs b/src/lexer/tests.rs index 193d294..fd7e75c 100644 --- a/src/lexer/tests.rs +++ b/src/lexer/tests.rs @@ -86,3 +86,220 @@ fn and_or() { assert_eq!(tokens[1], (TokenType::And, "&&".to_string())); assert_eq!(tokens[3], (TokenType::Or, "||".to_string())); } + +#[test] +fn assignment_word_simple() { + let tokens = collect_tokens("FOO=bar"); + assert_eq!( + tokens[0], + (TokenType::AssignmentWord, "FOO=bar".to_string()) + ); +} + +#[test] +fn assignment_word_plus_equals() { + let tokens = collect_tokens("FOO+=bar"); + assert_eq!( + tokens[0], + (TokenType::AssignmentWord, "FOO+=bar".to_string()) + ); +} + +#[test] +fn assignment_word_array() { + let tokens = collect_tokens("arr=(a b)"); + assert_eq!( + tokens[0], + (TokenType::AssignmentWord, "arr=(a b)".to_string()) + ); +} + +#[test] +fn assignment_word_subscript() { + let tokens = collect_tokens("arr[0]=val"); + assert_eq!( + tokens[0], + (TokenType::AssignmentWord, "arr[0]=val".to_string()) + ); +} + +#[test] +fn not_assignment_no_name() { + let tokens = collect_tokens("=value"); + assert_eq!(tokens[0].0, TokenType::Word); +} + +#[test] +fn not_assignment_regular_word() { + let tokens = collect_tokens("echo"); + assert_eq!(tokens[0].0, TokenType::Word); +} + +#[test] +fn assignment_before_command_keeps_command_start() { + // Assignment tokens should keep command_start=true so the + // command word after is still recognized + let tokens = collect_tokens("FOO=bar echo hello"); + assert_eq!(tokens[0].0, TokenType::AssignmentWord); + assert_eq!(tokens[1].0, TokenType::Word); + assert_eq!(tokens[2].0, TokenType::Word); +} + +// -- Span recording tests -- + +use super::word_builder::{WordSpan, WordSpanKind}; + +#[allow(clippy::unwrap_used)] +fn first_word_spans(source: &str) -> (String, Vec) { + let mut lexer = Lexer::new(source, false); + let tok = lexer.next_token().unwrap(); + (tok.value, tok.spans) +} + +#[test] +fn span_plain_word_no_spans() { + let (_, spans) = first_word_spans("echo"); + assert!(spans.is_empty()); +} + +#[test] +fn span_command_sub() { + let (val, spans) = first_word_spans("$(cmd)"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::CommandSub); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_command_sub_mid_word() { + let (_, spans) = first_word_spans("hello$(world)end"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::CommandSub); + assert_eq!(spans[0].start, 5); + assert_eq!(spans[0].end, 13); +} + +#[test] +fn span_arithmetic_sub() { + let (val, spans) = first_word_spans("$((1+2))"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::ArithmeticSub); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_param_expansion() { + let (val, spans) = first_word_spans("${var:-default}"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::ParamExpansion); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_simple_var() { + let (val, spans) = first_word_spans("$HOME"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::SimpleVar); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_ansi_c_quote() { + let (val, spans) = first_word_spans("$'foo'"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::AnsiCQuote); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_locale_string() { + let (val, spans) = first_word_spans("$\"hello\""); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::LocaleString); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_single_quoted() { + let (val, spans) = first_word_spans("'quoted'"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::SingleQuoted); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_double_quoted() { + let (val, spans) = first_word_spans("\"double\""); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::DoubleQuoted); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_backtick() { + let (val, spans) = first_word_spans("`cmd`"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::Backtick); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_escape() { + let (val, spans) = first_word_spans("\\n"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::Escape); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} + +#[test] +fn span_line_continuation_no_span() { + // \ is a line continuation — nothing pushed, no span + let (val, spans) = first_word_spans("hel\\\nlo"); + assert_eq!(val, "hello"); + assert!(spans.is_empty()); +} + +#[test] +fn span_bare_dollar_no_span() { + // Bare $ at end — no expansion, no span + let tokens = collect_tokens("echo $"); + assert_eq!(tokens[1].1, "$"); + // The $ word has no spans (bare dollar) + let (_, spans) = first_word_spans("$"); + assert!(spans.is_empty()); +} + +#[test] +fn span_nested_double_quoted_with_cmdsub() { + // "$(cmd)" — DoubleQuoted span contains CommandSub span + let (val, spans) = first_word_spans("\"$(cmd)\""); + assert_eq!(val, "\"$(cmd)\""); + assert_eq!(spans.len(), 2); + // CommandSub is recorded first (inside read_dollar, before + // DoubleQuoted is closed by read_word_special) + assert_eq!(spans[0].kind, WordSpanKind::CommandSub); + assert_eq!(spans[0].start, 1); + assert_eq!(spans[0].end, 7); + assert_eq!(spans[1].kind, WordSpanKind::DoubleQuoted); + assert_eq!(spans[1].start, 0); + assert_eq!(spans[1].end, 8); +} + +#[test] +fn span_deprecated_arith() { + let (val, spans) = first_word_spans("$[1+2]"); + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].kind, WordSpanKind::DeprecatedArith); + assert_eq!(spans[0].start, 0); + assert_eq!(spans[0].end, val.len()); +} diff --git a/src/lexer/word_builder.rs b/src/lexer/word_builder.rs new file mode 100644 index 0000000..7e3279a --- /dev/null +++ b/src/lexer/word_builder.rs @@ -0,0 +1,156 @@ +//! Word builder for accumulating word tokens with expansion span tracking. +//! +//! `WordBuilder` bundles the word value string with a list of `WordSpan`s +//! that record where each expansion starts and ends, along with the +//! quoting context at the point of recording. This eliminates the need +//! for downstream code to re-parse word values. +#![allow(dead_code)] + +/// The quoting context at the point where a span was recorded. +/// +/// Bash has context-sensitive expansion rules. For example, `$'...'` +/// is an ANSI-C quote at top level or inside `${...}`, but is NOT +/// special inside `"..."` (it's just literal `$` + `'...'`). +/// +/// Each span captures the context stack so downstream consumers can +/// make correct decisions without re-deriving context. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum QuotingContext { + /// Top-level word context (no quoting). + None, + /// Inside double quotes `"..."`. + DoubleQuote, + /// Inside parameter expansion `${...}` (resets quoting context). + ParamExpansion, + /// Inside command substitution `$(...)` (resets quoting context). + CommandSub, + /// Inside backtick command substitution `` `...` ``. + Backtick, +} + +/// Accumulates a word token's value string and expansion spans. +pub struct WordBuilder { + /// The word value being built character by character. + pub value: String, + /// Expansion spans recorded during lexing, ordered by start offset. + pub spans: Vec, + /// Stack of quoting contexts — the current context is the last entry. + /// Empty means top-level (no quoting). + context_stack: Vec, +} + +impl WordBuilder { + pub const fn new() -> Self { + Self { + value: String::new(), + spans: Vec::new(), + context_stack: Vec::new(), + } + } + + pub fn push(&mut self, c: char) { + self.value.push(c); + } + + pub fn push_str(&mut self, s: &str) { + self.value.push_str(s); + } + + pub fn ends_with(&self, c: char) -> bool { + self.value.ends_with(c) + } + + pub const fn is_empty(&self) -> bool { + self.value.is_empty() + } + + pub const fn len(&self) -> usize { + self.value.len() + } + + /// Returns the current byte offset — use before an expansion to + /// capture its start position. + pub const fn span_start(&self) -> usize { + self.value.len() + } + + /// Records a completed expansion span from `start` to the current + /// end of the value string, capturing the current quoting context. + pub fn record(&mut self, start: usize, kind: WordSpanKind) { + self.spans.push(WordSpan { + start, + end: self.value.len(), + kind, + context: self.current_context(), + }); + } + + /// Returns the current quoting context. + pub fn current_context(&self) -> QuotingContext { + self.context_stack + .last() + .copied() + .unwrap_or(QuotingContext::None) + } + + /// Pushes a quoting context onto the stack. + pub fn enter_context(&mut self, ctx: QuotingContext) { + self.context_stack.push(ctx); + } + + /// Pops the current quoting context from the stack. + pub fn leave_context(&mut self) { + self.context_stack.pop(); + } +} + +/// A byte-range span within a word token's value string, identifying +/// an expansion or quoting construct and the context it appeared in. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct WordSpan { + /// Start byte offset in the word value (inclusive). + pub start: usize, + /// End byte offset in the word value (exclusive). + pub end: usize, + /// The kind of expansion or quoting at this span. + pub kind: WordSpanKind, + /// The quoting context this span was recorded in. + pub context: QuotingContext, +} + +/// The kind of expansion or quoting construct a `WordSpan` represents. +#[derive(Debug, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum WordSpanKind { + // -- Sexp-relevant (formatter extracts content from these) -- + /// Command substitution: `$(...)`. + CommandSub, + /// ANSI-C quoting: `$'...'`. + AnsiCQuote, + /// Locale translation string: `$"..."`. + LocaleString, + /// Process substitution: `<(...)` or `>(...)`. + ProcessSub(char), + + // -- Structural (formatter treats as literal text) -- + /// Single-quoted string: `'...'`. + SingleQuoted, + /// Double-quoted string: `"..."`. + DoubleQuoted, + /// Parameter expansion: `${...}`. + ParamExpansion, + /// Simple variable: `$VAR`. + SimpleVar, + /// Arithmetic substitution: `$((...))`. + ArithmeticSub, + /// Backtick command substitution: `` `...` ``. + Backtick, + /// Array subscript: `[...]`. + BracketSubscript, + /// Extended glob pattern: `@(...)`, `?(...)`, etc. + Extglob(char), + /// Deprecated arithmetic: `$[...]`. + DeprecatedArith, + /// Backslash escape: `\X` (not `\` line continuations). + Escape, +} diff --git a/src/lexer/words.rs b/src/lexer/words.rs index 69b196c..8a4a981 100644 --- a/src/lexer/words.rs +++ b/src/lexer/words.rs @@ -2,12 +2,13 @@ use crate::error::{RableError, Result}; use crate::token::{Token, TokenType}; use super::Lexer; +use super::word_builder::{WordBuilder, WordSpanKind}; impl Lexer { /// Reads a word token, handling quoting and expansions. #[allow(clippy::too_many_lines)] pub(super) fn read_word_token(&mut self, start: usize, line: usize) -> Result { - let mut value = String::new(); + let mut wb = WordBuilder::new(); while let Some(c) = self.peek_char() { match c { @@ -15,76 +16,81 @@ impl Lexer { ' ' | '\t' | '\n' | '|' | '&' | ';' | ')' => break, // < and > are metacharacters, but <( and >( are process substitution '<' | '>' => { - if !value.is_empty() && self.input.get(self.pos + 1) == Some(&'(') { + if !wb.is_empty() && self.input.get(self.pos + 1) == Some(&'(') { // Process substitution mid-word: cat<(cmd) - self.read_process_sub_into(&mut value)?; + self.read_process_sub_into(&mut wb)?; } else { break; } } // ( is a metacharacter UNLESS preceded by = (array) or extglob prefix '(' => { - if value.ends_with('=') { - // Array assignment: arr=(...) + if wb.ends_with('=') { + // Array assignment: arr=(...) — parse elements directly self.advance_char(); - value.push('('); - self.read_matched_parens(&mut value, 1)?; - } else if value.ends_with('@') - || value.ends_with('?') - || value.ends_with('+') - || (value.ends_with('!') && self.config.extglob) - || (value.ends_with('*') && self.config.extglob) + wb.push('('); + self.read_array_elements(&mut wb)?; + } else if wb.ends_with('@') + || wb.ends_with('?') + || wb.ends_with('+') + || (wb.ends_with('!') && self.config.extglob) + || (wb.ends_with('*') && self.config.extglob) { // Extglob: @(...), ?(...), etc. self.advance_char(); - value.push('('); - self.read_matched_parens(&mut value, 1)?; + wb.push('('); + self.read_matched_parens(&mut wb, 1)?; } else { break; } } '\'' | '"' | '\\' | '$' | '`' => { - self.read_word_special(&mut value, c)?; + self.read_word_special(&mut wb, c)?; } // Extglob: @(...), ?(...), +(...), !(...) // !( is only extglob when extglob is enabled; otherwise ! is Bang '@' | '?' | '+' if self.input.get(self.pos + 1) == Some(&'(') => { - self.read_extglob(&mut value, c)?; + self.read_extglob(&mut wb, c)?; } // !( and *( are only extglob when extglob mode is enabled '!' | '*' if self.input.get(self.pos + 1) == Some(&'(') && self.config.extglob => { - self.read_extglob(&mut value, c)?; + self.read_extglob(&mut wb, c)?; } // `[` inside a word starts a subscript/bracket context - // Everything until matching `]` is part of the same word - // (including spaces, operators, etc.) - // Only applies when the word has content that isn't just `[` or `[[` - // (those are test/conditional command syntax, not subscripts) - '[' if !value.is_empty() && value != "[" && !value.ends_with('[') => { - self.read_bracket_subscript(&mut value)?; + '[' if !wb.is_empty() && wb.value != "[" && !wb.ends_with('[') => { + self.read_bracket_subscript(&mut wb)?; } // Regular character _ => { self.advance_char(); - value.push(c); + wb.push(c); } } } - if value.is_empty() { + if wb.is_empty() { return Err(RableError::parse("unexpected character", start, line)); } - // Check for reserved words at command start + // Check for reserved words at command start, then assignment pattern let kind = if self.ctx.command_start { - TokenType::reserved_word(&value).unwrap_or(TokenType::Word) + TokenType::reserved_word(&wb.value).unwrap_or_else(|| { + if is_assignment_word(&wb.value) { + TokenType::AssignmentWord + } else { + TokenType::Word + } + }) + } else if is_assignment_word(&wb.value) { + TokenType::AssignmentWord } else { TokenType::Word }; // After a word, we're no longer at command start (unless it's a keyword - // that expects another command) + // that expects another command, or it's an assignment) self.ctx.command_start = kind.starts_command() + || kind == TokenType::AssignmentWord || matches!( kind, TokenType::Then @@ -94,40 +100,48 @@ impl Lexer { | TokenType::Semi ); - Ok(Token::new(kind, value, start, line)) + Ok(Token::with_spans(kind, wb.value, start, line, wb.spans)) } /// Reads a quoted string, escape, dollar expansion, or backtick within a word. - pub(super) fn read_word_special(&mut self, value: &mut String, c: char) -> Result<()> { + pub(super) fn read_word_special(&mut self, wb: &mut WordBuilder, c: char) -> Result<()> { match c { '\'' => { + let start = wb.span_start(); self.advance_char(); - value.push('\''); - self.read_single_quoted(value)?; + wb.push('\''); + self.read_single_quoted(wb)?; + wb.record(start, WordSpanKind::SingleQuoted); } '"' => { + let start = wb.span_start(); self.advance_char(); - value.push('"'); - self.read_double_quoted(value)?; + wb.push('"'); + self.read_double_quoted(wb)?; + wb.record(start, WordSpanKind::DoubleQuoted); } '\\' => { self.advance_char(); if self.peek_char() == Some('\n') { - self.advance_char(); // line continuation + self.advance_char(); // line continuation — no span } else { - value.push('\\'); + let start = wb.span_start(); + wb.push('\\'); if let Some(next) = self.advance_char() { - value.push(next); + wb.push(next); } + wb.record(start, WordSpanKind::Escape); } } '$' => { - self.read_dollar(value)?; + self.read_dollar(wb)?; } '`' => { + let start = wb.span_start(); self.advance_char(); - value.push('`'); - self.read_backtick(value)?; + wb.push('`'); + self.read_backtick(wb)?; + wb.record(start, WordSpanKind::Backtick); } _ => {} } @@ -135,59 +149,62 @@ impl Lexer { } /// Reads an extglob pattern `@(...)`, `?(...)`, etc. - pub(super) fn read_extglob(&mut self, value: &mut String, prefix: char) -> Result<()> { + pub(super) fn read_extglob(&mut self, wb: &mut WordBuilder, prefix: char) -> Result<()> { + let start = wb.span_start(); self.advance_char(); - value.push(prefix); + wb.push(prefix); self.advance_char(); - value.push('('); - self.read_matched_parens(value, 1) + wb.push('('); + self.read_matched_parens(wb, 1)?; + wb.record(start, WordSpanKind::Extglob(prefix)); + Ok(()) } /// Reads a bracket subscript `[...]` inside a word. - /// Includes all content until the matching `]`, treating spaces - /// and operators as literal (array subscripts can contain spaces). - pub(super) fn read_bracket_subscript(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_bracket_subscript(&mut self, wb: &mut WordBuilder) -> Result<()> { + let start = wb.span_start(); self.advance_char(); // consume [ - value.push('['); + wb.push('['); let mut depth = 1; while let Some(c) = self.peek_char() { match c { '[' => { depth += 1; self.advance_char(); - value.push(c); + wb.push(c); } ']' => { depth -= 1; self.advance_char(); - value.push(c); + wb.push(c); if depth == 0 { + wb.record(start, WordSpanKind::BracketSubscript); return Ok(()); } } '\'' => { self.advance_char(); - value.push('\''); - self.read_single_quoted(value)?; + wb.push('\''); + self.read_single_quoted(wb)?; } '"' => { self.advance_char(); - value.push('"'); - self.read_double_quoted(value)?; + wb.push('"'); + self.read_double_quoted(wb)?; } '\\' => { self.advance_char(); - value.push('\\'); + wb.push('\\'); if let Some(nc) = self.advance_char() { - value.push(nc); + wb.push(nc); } } '$' => { - self.read_dollar(value)?; + self.read_dollar(wb)?; } _ => { self.advance_char(); - value.push(c); + wb.push(c); } } } @@ -195,58 +212,174 @@ impl Lexer { } /// Reads a process substitution into an existing word value. - pub(super) fn read_process_sub_into(&mut self, value: &mut String) -> Result<()> { + pub(super) fn read_process_sub_into(&mut self, wb: &mut WordBuilder) -> Result<()> { + let start = wb.span_start(); let dir = self.advance_char().unwrap_or('<'); - value.push(dir); + wb.push(dir); self.advance_char(); // ( - value.push('('); - self.read_matched_parens(value, 1) + wb.push('('); + self.read_matched_parens(wb, 1)?; + wb.record(start, WordSpanKind::ProcessSub(dir)); + Ok(()) } /// Reads a process substitution `<(...)` or `>(...)` as a word token. /// Continues reading word characters after the closing `)`. pub(super) fn read_process_sub_word(&mut self, start: usize, line: usize) -> Result { - let mut value = String::new(); + let mut wb = WordBuilder::new(); + let span_start = wb.span_start(); // Read < or > let dir = self.advance_char().unwrap_or('<'); - value.push(dir); + wb.push(dir); // Read ( self.advance_char(); - value.push('('); + wb.push('('); // Read until matching ) - self.read_matched_parens(&mut value, 1)?; + self.read_matched_parens(&mut wb, 1)?; + wb.record(span_start, WordSpanKind::ProcessSub(dir)); // Continue reading word chars after the process substitution - // e.g., <(cmd)suffix or >(cat)ca - self.continue_word(&mut value)?; + self.continue_word(&mut wb)?; self.ctx.command_start = false; - Ok(Token::new(TokenType::Word, value, start, line)) + Ok(Token::with_spans( + TokenType::Word, + wb.value, + start, + line, + wb.spans, + )) } - /// Continue reading word characters after a special construct (process sub, etc). - fn continue_word(&mut self, value: &mut String) -> Result<()> { + /// Continue reading word characters after a special construct. + fn continue_word(&mut self, wb: &mut WordBuilder) -> Result<()> { while let Some(c) = self.peek_char() { match c { ' ' | '\t' | '\n' | '|' | '&' | ';' | ')' | '(' => break, - // < and > break UNLESS followed by ( (process substitution continues word) '<' | '>' => { if self.input.get(self.pos + 1) == Some(&'(') { - self.read_process_sub_into(value)?; + self.read_process_sub_into(wb)?; } else { break; } } '\'' | '"' | '\\' | '$' | '`' => { - self.read_word_special(value, c)?; + self.read_word_special(wb, c)?; } - '[' if !value.is_empty() && value != "[" && !value.ends_with('[') => { - self.read_bracket_subscript(value)?; + '[' if !wb.is_empty() && wb.value != "[" && !wb.ends_with('[') => { + self.read_bracket_subscript(wb)?; } _ => { self.advance_char(); - value.push(c); + wb.push(c); } } } Ok(()) } + + /// Reads array elements from `(...)`, producing normalized content. + /// + /// Parses individual elements separated by whitespace directly from + /// the input stream, stripping comments. Uses the lexer's existing + /// word-reading facilities for quoting and expansion handling. + /// The opening `(` must already be consumed and pushed to `wb`. + fn read_array_elements(&mut self, wb: &mut WordBuilder) -> Result<()> { + let mut need_space = false; + loop { + // Skip whitespace between elements + while matches!(self.peek_char(), Some(' ' | '\t' | '\n' | '\r')) { + self.advance_char(); + } + match self.peek_char() { + Some(')') => { + self.advance_char(); + wb.push(')'); + return Ok(()); + } + Some('#') => { + // Comment — skip to end of line + while matches!(self.peek_char(), Some(c) if c != '\n') { + self.advance_char(); + } + } + Some(_) => { + if need_space { + wb.push(' '); + } + self.read_array_element(wb)?; + need_space = true; + } + None => { + return Err(RableError::matched_pair( + "unterminated array", + self.pos, + self.line, + )); + } + } + } + } + + /// Reads a single array element using the same word-reading logic + /// as `read_word_token`, but with `)` and whitespace as terminators + /// instead of the standard shell metacharacters. + fn read_array_element(&mut self, wb: &mut WordBuilder) -> Result<()> { + while let Some(c) = self.peek_char() { + match c { + ' ' | '\t' | '\n' | '\r' | ')' => break, + '<' | '>' => { + if self.input.get(self.pos + 1) == Some(&'(') { + self.read_process_sub_into(wb)?; + } else { + self.advance_char(); + wb.push(c); + } + } + '\'' | '"' | '\\' | '$' | '`' => { + self.read_word_special(wb, c)?; + } + '[' if !wb.is_empty() && wb.value != "[" && !wb.ends_with('[') => { + self.read_bracket_subscript(wb)?; + } + _ => { + self.advance_char(); + wb.push(c); + } + } + } + Ok(()) + } +} + +/// Returns true if a word value matches the assignment pattern `NAME=`, +/// `NAME+=`, or `NAME[...]=` / `NAME[...]+=`. +fn is_assignment_word(value: &str) -> bool { + let bytes = value.as_bytes(); + // Must start with [a-zA-Z_] + if !matches!(bytes.first(), Some(b'a'..=b'z' | b'A'..=b'Z' | b'_')) { + return false; + } + let mut i = 1; + // Continue with [a-zA-Z_0-9] + while i < bytes.len() { + match bytes[i] { + b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' => i += 1, + b'[' => { + // Skip subscript [...] (may be nested) + i += 1; + let mut depth = 1; + while i < bytes.len() && depth > 0 { + match bytes[i] { + b'[' => depth += 1, + b']' => depth -= 1, + _ => {} + } + i += 1; + } + } + b'+' if i + 1 < bytes.len() && bytes[i + 1] == b'=' => return true, + b'=' => return true, + _ => return false, + } + } + false } diff --git a/src/parser/compound.rs b/src/parser/compound.rs index 202bdfa..2823c05 100644 --- a/src/parser/compound.rs +++ b/src/parser/compound.rs @@ -153,8 +153,9 @@ impl Parser { Some(ws) } else { Some(vec![Node::empty(NodeKind::Word { + parts: super::word_parts::decompose_word("\"$@\""), value: "\"$@\"".to_string(), - parts: Vec::new(), + spans: Vec::new(), })]) }; @@ -241,8 +242,9 @@ impl Parser { let word_tok = self.lexer.next_token()?; let word = Box::new(Node::new( NodeKind::Word { + parts: super::word_parts::decompose_word(&word_tok.value), value: word_tok.value.clone(), - parts: Vec::new(), + spans: word_tok.spans, }, Span::new(word_tok.pos, word_tok.pos + word_tok.value.len()), )); diff --git a/src/parser/helpers.rs b/src/parser/helpers.rs index f2406f2..bc39e8e 100644 --- a/src/parser/helpers.rs +++ b/src/parser/helpers.rs @@ -2,11 +2,12 @@ use crate::ast::{Node, NodeKind}; -/// Creates a `Word` node with no parts. +/// Creates a `Word` node with decomposed parts. pub fn word_node(value: &str) -> Node { Node::empty(NodeKind::Word { + parts: super::word_parts::decompose_word(value), value: value.to_string(), - parts: Vec::new(), + spans: Vec::new(), }) } @@ -77,7 +78,10 @@ pub(super) fn make_stderr_redirect() -> Node { op: ">&".to_string(), target: Box::new(Node::empty(NodeKind::Word { value: "1".to_string(), - parts: Vec::new(), + parts: vec![Node::empty(NodeKind::WordLiteral { + value: "1".to_string(), + })], + spans: Vec::new(), })), fd: 2, }) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 6db6bab..6f8d0ab 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,6 +1,7 @@ mod compound; mod conditional; pub mod helpers; +mod word_parts; use crate::ast::{ListItem, ListOperator, Node, NodeKind, PipeSep, Span}; use crate::error::{RableError, Result}; @@ -567,10 +568,12 @@ impl Parser { return self.parse_function_def(&tok); } let word_span = Span::new(tok.pos, tok.pos + tok.value.len()); + let parts = word_parts::decompose_word(&tok.value); let node = Node::new( NodeKind::Word { value: tok.value, - parts: Vec::new(), + parts, + spans: tok.spans, }, word_span, ); diff --git a/src/parser/tests.rs b/src/parser/tests.rs index 3f6edde..fe5b88e 100644 --- a/src/parser/tests.rs +++ b/src/parser/tests.rs @@ -196,17 +196,14 @@ fn line_continuation() { #[test] fn command_has_assignments_field() { - // The lexer doesn't currently produce AssignmentWord tokens, - // so assignments stay in words. This test verifies the - // Command variant has the assignments field and S-expression - // output is unchanged. let nodes = parse("FOO=bar cmd arg"); assert_eq!(nodes.len(), 1); assert!(matches!( &nodes[0].kind, NodeKind::Command { assignments, words, .. } - if assignments.is_empty() && words.len() == 3 + if assignments.len() == 1 && words.len() == 2 )); + // S-expression output is unchanged (assignments and words are merged) let output = format!("{}", nodes[0]); assert_eq!( output, @@ -214,6 +211,61 @@ fn command_has_assignments_field() { ); } +#[test] +fn multiple_assignments_before_command() { + let nodes = parse("a=1 b=2 cmd"); + assert_eq!(nodes.len(), 1); + assert!(matches!( + &nodes[0].kind, + NodeKind::Command { assignments, words, .. } + if assignments.len() == 2 && words.len() == 1 + )); +} + +#[test] +fn assignment_after_command_word_stays_in_words() { + let nodes = parse("cmd FOO=bar"); + assert_eq!(nodes.len(), 1); + assert!(matches!( + &nodes[0].kind, + NodeKind::Command { assignments, words, .. } + if assignments.is_empty() && words.len() == 2 + )); +} + +#[test] +fn plus_equals_assignment() { + let nodes = parse("FOO+=bar cmd"); + assert_eq!(nodes.len(), 1); + assert!(matches!( + &nodes[0].kind, + NodeKind::Command { assignments, words, .. } + if assignments.len() == 1 && words.len() == 1 + )); +} + +#[test] +fn array_assignment() { + let nodes = parse("arr=(a b c) cmd"); + assert_eq!(nodes.len(), 1); + assert!(matches!( + &nodes[0].kind, + NodeKind::Command { assignments, words, .. } + if assignments.len() == 1 && words.len() == 1 + )); +} + +#[test] +fn bare_assignment_no_command() { + let nodes = parse("FOO=bar"); + assert_eq!(nodes.len(), 1); + assert!(matches!( + &nodes[0].kind, + NodeKind::Command { assignments, words, .. } + if assignments.len() == 1 && words.is_empty() + )); +} + #[test] fn list_items_structured() { let nodes = parse("a && b; c"); diff --git a/src/parser/word_parts.rs b/src/parser/word_parts.rs new file mode 100644 index 0000000..e441264 --- /dev/null +++ b/src/parser/word_parts.rs @@ -0,0 +1,210 @@ +//! Decomposes word values into structured AST parts. +//! +//! Reuses the segment parser from `sexp::word` to split a word's raw +//! value into typed segments, then maps each segment to an AST `Node`. + +use std::cell::Cell; + +use crate::ast::{ListItem, Node, NodeKind, Span}; +use crate::sexp::word::{WordSegment, parse_word_segments}; + +thread_local! { + static DECOMPOSE_DEPTH: Cell = const { Cell::new(0) }; +} + +/// RAII guard to prevent infinite recursion when decomposing nested +/// command/process substitutions. +/// +/// Depth limit of 2 matches `format::DepthGuard` — allows `$(a $(b))` +/// but stops before unbounded nesting. The counter is separate from +/// the format module's counter since decomposition and formatting +/// are independent recursion paths. +struct DepthGuard; + +impl DepthGuard { + fn enter() -> Option { + DECOMPOSE_DEPTH.with(|d| { + let v = d.get(); + if v >= 2 { + return None; + } + d.set(v + 1); + Some(Self) + }) + } +} + +impl Drop for DepthGuard { + fn drop(&mut self) { + DECOMPOSE_DEPTH.with(|d| d.set(d.get().saturating_sub(1))); + } +} + +/// Decomposes a word's raw value into a list of typed AST nodes. +/// +/// Each segment becomes one of: +/// - `WordLiteral` for plain text +/// - `CommandSubstitution` for `$(...)` +/// - `ProcessSubstitution` for `<(...)` / `>(...)` +/// - `AnsiCQuote` for `$'...'` +/// - `LocaleString` for `$"..."` +pub(super) fn decompose_word(value: &str) -> Vec { + let segments = parse_word_segments(value); + segments.into_iter().map(segment_to_node).collect() +} + +fn segment_to_node(seg: WordSegment) -> Node { + match seg { + WordSegment::Literal(text) => Node::empty(NodeKind::WordLiteral { value: text }), + WordSegment::AnsiCQuote(content) => Node::empty(NodeKind::AnsiCQuote { content }), + WordSegment::LocaleString(content) => Node::empty(NodeKind::LocaleString { content }), + WordSegment::CommandSubstitution(content) => cmdsub_to_node(&content), + WordSegment::ProcessSubstitution(direction, content) => { + procsub_to_node(direction, &content) + } + } +} + +fn cmdsub_to_node(content: &str) -> Node { + parse_sub(content, &format!("$({content})"), |cmd| { + // brace: false — this is $(...) syntax, not ${...} brace expansion + NodeKind::CommandSubstitution { + command: cmd, + brace: false, + } + }) +} + +fn procsub_to_node(direction: char, content: &str) -> Node { + parse_sub(content, &format!("{direction}({content})"), |cmd| { + NodeKind::ProcessSubstitution { + direction: direction.to_string(), + command: cmd, + } + }) +} + +/// Shared logic for parsing command/process substitution content. +fn parse_sub( + content: &str, + fallback_value: &str, + make_kind: impl FnOnce(Box) -> NodeKind, +) -> Node { + let Some(_guard) = DepthGuard::enter() else { + return literal_fallback(fallback_value); + }; + crate::parse(content, false).map_or_else( + |_| literal_fallback(fallback_value), + |nodes| Node::empty(make_kind(Box::new(wrap_nodes(nodes)))), + ) +} + +fn literal_fallback(value: &str) -> Node { + Node::empty(NodeKind::WordLiteral { + value: value.to_string(), + }) +} + +/// Wraps a list of parsed nodes into a single node. +fn wrap_nodes(mut nodes: Vec) -> Node { + match nodes.len() { + 0 => Node::empty(NodeKind::Empty), + 1 => nodes.swap_remove(0), + _ => Node::new( + NodeKind::List { + items: nodes + .into_iter() + .map(|n| ListItem { + command: n, + operator: None, + }) + .collect(), + }, + Span::empty(), + ), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn plain_word() { + let parts = decompose_word("echo"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::WordLiteral { value } if value == "echo" + )); + } + + #[test] + fn simple_variable_stays_literal() { + let parts = decompose_word("$foo"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::WordLiteral { value } if value == "$foo" + )); + } + + #[test] + fn command_substitution() { + let parts = decompose_word("$(date)"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::CommandSubstitution { brace: false, .. } + )); + } + + #[test] + fn mixed_segments() { + let parts = decompose_word("hello$(world)end"); + assert_eq!(parts.len(), 3); + assert!(matches!( + &parts[0].kind, + NodeKind::WordLiteral { value } if value == "hello" + )); + assert!(matches!( + &parts[1].kind, + NodeKind::CommandSubstitution { .. } + )); + assert!(matches!( + &parts[2].kind, + NodeKind::WordLiteral { value } if value == "end" + )); + } + + #[test] + fn ansi_c_quote() { + let parts = decompose_word("$'foo\\nbar'"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::AnsiCQuote { content } if content == "foo\\nbar" + )); + } + + #[test] + fn process_substitution() { + let parts = decompose_word("<(cmd)"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ProcessSubstitution { direction, .. } + if direction == "<" + )); + } + + #[test] + fn locale_string() { + let parts = decompose_word("$\"hello\""); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::LocaleString { content } if content == "\"hello\"" + )); + } +} diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs index 1461736..9faee1a 100644 --- a/src/sexp/mod.rs +++ b/src/sexp/mod.rs @@ -17,11 +17,17 @@ impl fmt::Display for Node { impl fmt::Display for NodeKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Word { value, .. } => { + Self::Word { value, spans, .. } => { write!(f, "(word \"")?; - word::write_word_value(f, value)?; + if !spans.is_empty() && !word::needs_value_path(value) { + let segments = word::segments_from_spans(value, spans); + word::write_word_segments(f, &segments)?; + } else { + word::write_word_value(f, value)?; + } write!(f, "\")") } + Self::WordLiteral { value } => write!(f, "{value}"), Self::Command { assignments, words, diff --git a/src/sexp/word.rs b/src/sexp/word.rs index abec579..489dcb1 100644 --- a/src/sexp/word.rs +++ b/src/sexp/word.rs @@ -8,8 +8,6 @@ use std::fmt; use crate::format; -use crate::context::read_balanced_delim; - use super::{ ansi_c::process_ansi_c_content, extract_paren_content, normalize_cmdsub_content, write_escaped_char, write_escaped_word, @@ -234,128 +232,152 @@ pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) Ok(()) } -/// The public entry point — replaces the old monolithic `write_word_value`. +/// The public entry point — formats a word value via segment parsing. +/// Used as fallback when spans are not available (synthetic word nodes). pub fn write_word_value(f: &mut fmt::Formatter<'_>, value: &str) -> fmt::Result { - // Check for array assignment pattern: name=(...) — normalize content - if let Some(normalized) = try_normalize_array(value) { - // Process word segments so $() inside arrays gets reformatted - let segments = parse_word_segments(&normalized); - return write_word_segments(f, &segments); - } let segments = parse_word_segments(value); write_word_segments(f, &segments) } -/// Detects `name=(...)` or `name+=(...)` array patterns and normalizes whitespace/comments. -fn try_normalize_array(value: &str) -> Option { - // Find the `=(` pattern - let eq_paren = value.find("=(")?; - let prefix = &value[..=eq_paren]; // includes `=` - let after_eq = &value[eq_paren + 1..]; // starts with `(` - - if !after_eq.starts_with('(') { - return None; - } - - // Find the matching `)` using depth tracking - let chars: Vec = after_eq.chars().collect(); - let mut depth = 0; - let mut close_pos = None; - for (j, &c) in chars.iter().enumerate() { - if c == '(' { - depth += 1; - } else if c == ')' { - depth -= 1; - if depth == 0 { - close_pos = Some(j); - break; - } +/// Converts lexer spans to segments without re-parsing the value. +/// +/// Filters to top-level sexp-relevant spans (not contained within +/// another sexp-relevant span) and treats everything else as literal. +/// Uses span context to handle context-sensitive constructs like +/// `$'...'` inside `${...}` vs inside `"..."`. +pub fn segments_from_spans( + value: &str, + spans: &[crate::lexer::word_builder::WordSpan], +) -> Vec { + use crate::lexer::word_builder::{QuotingContext, WordSpanKind}; + let top_level = collect_top_level_sexp_spans(spans); + let mut segments = Vec::new(); + let mut pos = 0; + for span in &top_level { + if span.start > pos + && let Some(text) = value.get(pos..span.start) + { + segments.push(WordSegment::Literal(text.to_string())); } - } - let close = close_pos?; - let inner_chars: String = chars[1..close].iter().collect(); - let suffix: String = chars[close + 1..].iter().collect(); - - // Always normalize — even just trimming trailing spaces matters - if inner_chars.is_empty() { - return None; - } - let inner = &inner_chars; - - let normalized = normalize_array_content(inner); - let result = format!("{prefix}({normalized}){suffix}"); - Some(result) -} - -/// Normalizes array content: collapses whitespace, strips comments. -#[allow(clippy::too_many_lines)] -fn normalize_array_content(inner: &str) -> String { - let mut elements = Vec::new(); - let chars: Vec = inner.chars().collect(); - let mut i = 0; - let mut current = String::new(); - - while i < chars.len() { - match chars[i] { - ' ' | '\t' | '\n' | '\r' => { - if !current.is_empty() { - elements.push(std::mem::take(&mut current)); + match &span.kind { + WordSpanKind::CommandSub => { + if let Some(c) = value.get(span.start + 2..span.end - 1) { + segments.push(WordSegment::CommandSubstitution(c.to_string())); } - i += 1; } - '#' => { - // # is only a comment when preceded by whitespace (current is empty) - if current.is_empty() { - // Skip comment until end of line - while i < chars.len() && chars[i] != '\n' { - i += 1; - } - } else { - // # is part of the word (e.g., b# or [1]=b#) - current.push(chars[i]); - i += 1; + WordSpanKind::ProcessSub(dir) => { + if let Some(c) = value.get(span.start + 2..span.end - 1) { + segments.push(WordSegment::ProcessSubstitution(*dir, c.to_string())); } } - '\'' => { - crate::context::skip_single_quoted(&chars, &mut i, &mut current); - } - '"' => { - crate::context::skip_double_quoted(&chars, &mut i, &mut current); + WordSpanKind::AnsiCQuote => { + push_ansi_c_span(&mut segments, value, span); } - '\\' if i + 1 < chars.len() => { - current.push(chars[i]); - i += 1; - current.push(chars[i]); - i += 1; - } - '`' => { - crate::context::skip_backtick(&chars, &mut i, &mut current); + WordSpanKind::LocaleString => { + match span.context { + QuotingContext::DoubleQuote => { + // $"..." inside "..." is literal (not a locale string) + if let Some(text) = value.get(span.start..span.end) { + push_literal(&mut segments, text); + } + } + _ => { + if let Some(c) = value.get(span.start + 1..span.end) { + segments.push(WordSegment::LocaleString(c.to_string())); + } + } + } } - '$' if i + 1 < chars.len() && chars[i + 1] == '(' => { - // Command substitution — read matched parens with quote awareness - current.push(chars[i]); - current.push(chars[i + 1]); - i += 2; - read_balanced_delim(&chars, &mut i, '(', ')', &mut current); + _ => {} // filtered out by is_sexp_relevant + } + pos = span.end; + } + if pos < value.len() + && let Some(text) = value.get(pos..) + { + segments.push(WordSegment::Literal(text.to_string())); + } + segments +} + +/// Handles `$'...'` spans with context-sensitive behavior: +/// - Inside `"..."`: not special, treat as literal `$` + `'...'` +/// - Inside `${...}`: process escapes, absorb into literal (no quotes) +/// - Otherwise: normal `AnsiCQuote` segment +fn push_ansi_c_span( + segments: &mut Vec, + value: &str, + span: &crate::lexer::word_builder::WordSpan, +) { + use crate::lexer::word_builder::QuotingContext; + match span.context { + QuotingContext::DoubleQuote => { + // $'...' is NOT special inside "..." — treat as literal + if let Some(text) = value.get(span.start..span.end) { + push_literal(segments, text); } - '$' if i + 1 < chars.len() && chars[i + 1] == '{' => { - // Parameter expansion — read matched braces with quote awareness - current.push(chars[i]); - current.push(chars[i + 1]); - i += 2; - read_balanced_delim(&chars, &mut i, '{', '}', &mut current); + } + QuotingContext::ParamExpansion => { + // $'...' inside ${...} — process escapes, no quotes in output + if let Some(raw) = value.get(span.start + 2..span.end - 1) { + let chars: Vec = raw.chars().collect(); + let mut pos = 0; + let processed = super::process_ansi_c_content(&chars, &mut pos); + push_literal(segments, &processed); } - _ => { - current.push(chars[i]); - i += 1; + } + _ => { + // Top-level or inside $() / `` — normal ANSI-C quote + if let Some(c) = value.get(span.start + 2..span.end - 1) { + segments.push(WordSegment::AnsiCQuote(c.to_string())); } } } - if !current.is_empty() { - elements.push(current); +} + +/// Appends text to the last `Literal` segment if possible, or creates a new one. +fn push_literal(segments: &mut Vec, text: &str) { + if let Some(WordSegment::Literal(last)) = segments.last_mut() { + last.push_str(text); + } else { + segments.push(WordSegment::Literal(text.to_string())); } +} + +/// Returns true if the word value requires the value-based formatting +/// path. Currently always returns false — all cases are handled by +/// the span-based path or by lex-time normalization. +pub const fn needs_value_path(_value: &str) -> bool { + false +} + +const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bool { + use crate::lexer::word_builder::WordSpanKind; + matches!( + kind, + WordSpanKind::CommandSub + | WordSpanKind::AnsiCQuote + | WordSpanKind::LocaleString + | WordSpanKind::ProcessSub(_) + ) +} - elements.join(" ") +/// Collects sexp-relevant spans that are not nested inside another +/// sexp-relevant span. Sorted by start offset. +fn collect_top_level_sexp_spans( + spans: &[crate::lexer::word_builder::WordSpan], +) -> Vec<&crate::lexer::word_builder::WordSpan> { + let relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect(); + relevant + .iter() + .filter(|s| { + // A span is top-level if no other relevant span contains it + !relevant.iter().any(|outer| { + outer.start <= s.start && outer.end >= s.end && !std::ptr::eq(*outer, **s) + }) + }) + .copied() + .collect() } // -- helpers -- diff --git a/src/token.rs b/src/token.rs index 9212d13..1187ba1 100644 --- a/src/token.rs +++ b/src/token.rs @@ -141,6 +141,8 @@ impl TokenType { } } +use crate::lexer::word_builder::WordSpan; + /// A token produced by the lexer. #[derive(Debug, Clone)] pub struct Token { @@ -148,6 +150,9 @@ pub struct Token { pub value: String, pub pos: usize, pub line: usize, + /// Expansion spans within the word value (empty for non-word tokens). + #[allow(dead_code)] + pub(crate) spans: Vec, } impl Token { @@ -157,6 +162,25 @@ impl Token { value: value.into(), pos, line, + spans: Vec::new(), + } + } + + /// Creates a word token with pre-recorded expansion spans. + #[allow(dead_code)] + pub(crate) const fn with_spans( + kind: TokenType, + value: String, + pos: usize, + line: usize, + spans: Vec, + ) -> Self { + Self { + kind, + value, + pos, + line, + spans, } } @@ -171,6 +195,7 @@ impl Token { value: String::new(), pos, line, + spans: Vec::new(), } } } diff --git a/tests/integration.rs b/tests/integration.rs index 8331825..bc28ba7 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -242,18 +242,26 @@ fn oracle_test_suite() { let mut pass = 0; let mut fail = 0; + let mut failures = Vec::new(); for case in &cases { - let (passed, _actual) = run_test(case); + let (passed, actual) = run_test(case); if passed { pass += 1; } else { fail += 1; + failures.push(format!( + " FAIL :: {}\n input: {:?}\n expected: {:?}\n actual: {:?}", + case.name, case.input, case.expected, actual, + )); } } let total = pass + fail; let status = if fail == 0 { "OK" } else { "FAIL" }; eprintln!(" {file_name}: {pass}/{total} passed [{status}]"); + for f in &failures { + eprintln!("{f}"); + } total_pass += pass; total_fail += fail; } From 54db8c79d15689d475ec12aeadc2d28b2d62fa90 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Wed, 25 Mar 2026 13:45:35 +0100 Subject: [PATCH 2/5] refactor: remove sexp re-parsing by threading spans through all nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate ~400 lines of duplicate parsing code in the sexp/format layers by ensuring all Word and CondTerm nodes carry lexer spans. - Thread token spans via `word_node_from_token()` (9 call sites updated) - Add spans to CondTerm nodes via `cond_term_from_token()` (4 call sites) - Rewrite `write_redirect()` to use span-based segments instead of string searches (`needs_word_processing` removed) - Update `process_word_value()` in format module to use spans - Replace `decompose_word()` with `decompose_word_with_spans()` (span path) and `decompose_word_literal()` (synthetic nodes) - Word Display for span-less synthetic nodes uses `write_escaped_word` directly instead of re-parsing through `parse_word_segments` Deleted dead code: - `parse_word_segments()` + `flush_literal` + `extract_ansi_c_content` + `extract_locale_content` (~195 lines in sexp/word.rs) - `extract_paren_content()` (~107 lines in sexp/mod.rs) - `needs_word_processing()`, `write_word_value()`, `needs_value_path()` - `skip_single_quoted/double/backtick`, `is_backslash_escaped` (~70 lines in context.rs) - 8 tests for deleted `parse_word_segments` function Oracle improved: 167/181 (was 165) — removing buggy re-parsing fixed 2 edge cases where the two parsers disagreed. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ast.rs | 6 +- src/context.rs | 70 ---------- src/format/mod.rs | 21 ++- src/parser/compound.rs | 19 +-- src/parser/conditional.rs | 10 +- src/parser/helpers.rs | 21 ++- src/parser/mod.rs | 12 +- src/parser/word_parts.rs | 47 ++++--- src/sexp/mod.rs | 207 ++++++++--------------------- src/sexp/word.rs | 273 +------------------------------------- 10 files changed, 143 insertions(+), 543 deletions(-) diff --git a/src/ast.rs b/src/ast.rs index 9f78c17..79d6e52 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -326,7 +326,11 @@ pub enum NodeKind { CondParen { inner: Box }, /// A term (word) in a conditional expression. - CondTerm { value: String }, + CondTerm { + value: String, + #[allow(dead_code)] + spans: Vec, + }, // -- Other -- /// Pipeline negation with `!`. diff --git a/src/context.rs b/src/context.rs index b8bd94c..d9b5903 100644 --- a/src/context.rs +++ b/src/context.rs @@ -58,73 +58,3 @@ impl CaseTracker { self.depth > 0 && self.in_pattern } } - -/// Skips a single-quoted region in a char array. Reads from `pos` (which -/// should point AT the opening `'`) through the closing `'`. -/// Pushes all chars (including quotes) into `out`. -pub fn skip_single_quoted(chars: &[char], pos: &mut usize, out: &mut String) { - out.push(chars[*pos]); // opening ' - *pos += 1; - while *pos < chars.len() && chars[*pos] != '\'' { - out.push(chars[*pos]); - *pos += 1; - } - if *pos < chars.len() { - out.push(chars[*pos]); // closing ' - *pos += 1; - } -} - -/// Skips a double-quoted region in a char array. Reads from `pos` (which -/// should point AT the opening `"`) through the closing `"`. -/// Handles backslash escapes inside. Pushes all chars into `out`. -pub fn skip_double_quoted(chars: &[char], pos: &mut usize, out: &mut String) { - out.push(chars[*pos]); // opening " - *pos += 1; - while *pos < chars.len() && chars[*pos] != '"' { - if chars[*pos] == '\\' && *pos + 1 < chars.len() { - out.push(chars[*pos]); - *pos += 1; - } - out.push(chars[*pos]); - *pos += 1; - } - if *pos < chars.len() { - out.push(chars[*pos]); // closing " - *pos += 1; - } -} - -/// Skips a backtick region in a char array. Reads from `pos` (which -/// should point AT the opening `` ` ``) through the closing `` ` ``. -/// Handles backslash escapes inside. Pushes all chars into `out`. -pub fn skip_backtick(chars: &[char], pos: &mut usize, out: &mut String) { - out.push(chars[*pos]); // opening ` - *pos += 1; - while *pos < chars.len() && chars[*pos] != '`' { - if chars[*pos] == '\\' && *pos + 1 < chars.len() { - out.push(chars[*pos]); - *pos += 1; - } - out.push(chars[*pos]); - *pos += 1; - } - if *pos < chars.len() { - out.push(chars[*pos]); // closing ` - *pos += 1; - } -} - -/// Counts consecutive backslashes before a position to determine -/// if a character is escaped. -/// -/// Even count → not escaped. Odd count → escaped. -pub fn is_backslash_escaped(chars: &[char], pos: usize) -> bool { - let mut count = 0; - let mut j = pos; - while j > 0 && chars[j - 1] == '\\' { - count += 1; - j -= 1; - } - count % 2 != 0 -} diff --git a/src/format/mod.rs b/src/format/mod.rs index 73fad3d..e1998fa 100644 --- a/src/format/mod.rs +++ b/src/format/mod.rs @@ -206,8 +206,8 @@ fn format_command_words(assignments: &[Node], words: &[Node], out: &mut String) if i > 0 { out.push(' '); } - if let NodeKind::Word { value, .. } = &w.kind { - out.push_str(&process_word_value(value)); + if let NodeKind::Word { value, spans, .. } = &w.kind { + out.push_str(&process_word_value(value, spans)); } else { out.push_str(&w.to_string()); } @@ -233,8 +233,8 @@ fn format_redirect(node: &Node, out: &mut String) { if !is_dup { out.push(' '); } - if let NodeKind::Word { value, .. } = &target.kind { - out.push_str(&process_word_value(value)); + if let NodeKind::Word { value, spans, .. } = &target.kind { + out.push_str(&process_word_value(value, spans)); } } else if let NodeKind::HereDoc { delimiter, @@ -515,7 +515,7 @@ fn format_cond_node(node: &Node, out: &mut String) { out.push_str("! "); format_cond_node(operand, out); } - NodeKind::CondTerm { value } => { + NodeKind::CondTerm { value, .. } => { out.push_str(value); } NodeKind::CondParen { inner } => { @@ -535,13 +535,12 @@ fn indent_str(out: &mut String, n: usize) { } } -/// Process a word value for canonical bash output using the same segment -/// pipeline as S-expression formatting. This ensures consistent handling -/// of `$'...'`, `$"..."`, and `$(...)` across both output paths. -fn process_word_value(value: &str) -> String { - use crate::sexp::word::{WordSegment, parse_word_segments}; +/// Process a word value for canonical bash output using span-based +/// segment extraction. +fn process_word_value(value: &str, spans: &[crate::lexer::word_builder::WordSpan]) -> String { + use crate::sexp::word::{WordSegment, segments_from_spans}; - let segments = parse_word_segments(value); + let segments = segments_from_spans(value, spans); let mut result = String::with_capacity(value.len()); for seg in &segments { diff --git a/src/parser/compound.rs b/src/parser/compound.rs index 2823c05..1be8779 100644 --- a/src/parser/compound.rs +++ b/src/parser/compound.rs @@ -7,7 +7,7 @@ use crate::token::{Token, TokenType}; use super::{ Parser, - helpers::{is_fd_number, word_node}, + helpers::{is_fd_number, word_node, word_node_from_token}, }; impl Parser { @@ -148,12 +148,12 @@ impl Parser { break; } let tok = self.lexer.next_token()?; - ws.push(word_node(&tok.value)); + ws.push(word_node_from_token(&tok)); } Some(ws) } else { Some(vec![Node::empty(NodeKind::Word { - parts: super::word_parts::decompose_word("\"$@\""), + parts: super::word_parts::decompose_word_literal("\"$@\""), value: "\"$@\"".to_string(), spans: Vec::new(), })]) @@ -242,7 +242,10 @@ impl Parser { let word_tok = self.lexer.next_token()?; let word = Box::new(Node::new( NodeKind::Word { - parts: super::word_parts::decompose_word(&word_tok.value), + parts: super::word_parts::decompose_word_with_spans( + &word_tok.value, + &word_tok.spans, + ), value: word_tok.value.clone(), spans: word_tok.spans, }, @@ -289,7 +292,7 @@ impl Parser { if tok.kind == TokenType::Pipe { continue; } - pattern_words.push(word_node(&tok.value)); + pattern_words.push(word_node_from_token(&tok)); } self.skip_newlines()?; @@ -348,7 +351,7 @@ impl Parser { break; } let tok = self.lexer.next_token()?; - ws.push(word_node(&tok.value)); + ws.push(word_node_from_token(&tok)); } Some(ws) } else { @@ -513,7 +516,7 @@ impl Parser { } else { None }; - let mut words = vec![word_node(&first_tok.value)]; + let mut words = vec![word_node_from_token(&first_tok)]; let mut redirects = Vec::new(); loop { if self.at_end()? { @@ -529,7 +532,7 @@ impl Parser { if is_fd_number(&tok.value) && self.is_redirect_operator()? { redirects.push(self.parse_redirect_with_fd(&tok)?); } else { - words.push(word_node(&tok.value)); + words.push(word_node_from_token(&tok)); } } else { break; diff --git a/src/parser/conditional.rs b/src/parser/conditional.rs index 4016638..2860e86 100644 --- a/src/parser/conditional.rs +++ b/src/parser/conditional.rs @@ -6,7 +6,7 @@ use crate::token::{Token, TokenType}; use super::{ Parser, - helpers::{cond_term, is_cond_binary_op}, + helpers::{cond_term_from_token, is_cond_binary_op}, }; impl Parser { @@ -107,7 +107,7 @@ impl Parser { start, NodeKind::UnaryTest { op: first.value, - operand: Box::new(cond_term(&operand_tok.value)), + operand: Box::new(cond_term_from_token(&operand_tok)), }, )); } @@ -128,8 +128,8 @@ impl Parser { start, NodeKind::BinaryTest { op: op.value, - left: Box::new(cond_term(&first.value)), - right: Box::new(cond_term(&right.value)), + left: Box::new(cond_term_from_token(&first)), + right: Box::new(cond_term_from_token(&right)), }, )); } @@ -140,7 +140,7 @@ impl Parser { start, NodeKind::UnaryTest { op: "-n".to_string(), - operand: Box::new(cond_term(&first.value)), + operand: Box::new(cond_term_from_token(&first)), }, )) } diff --git a/src/parser/helpers.rs b/src/parser/helpers.rs index bc39e8e..3ff2149 100644 --- a/src/parser/helpers.rs +++ b/src/parser/helpers.rs @@ -1,20 +1,31 @@ //! Helper functions for the parser. use crate::ast::{Node, NodeKind}; +use crate::token::Token; -/// Creates a `Word` node with decomposed parts. +/// Creates a `Word` node from a lexer token, preserving spans. +pub fn word_node_from_token(tok: &Token) -> Node { + Node::empty(NodeKind::Word { + parts: super::word_parts::decompose_word_with_spans(&tok.value, &tok.spans), + value: tok.value.clone(), + spans: tok.spans.clone(), + }) +} + +/// Creates a `Word` node for synthetic values (no lexer token). pub fn word_node(value: &str) -> Node { Node::empty(NodeKind::Word { - parts: super::word_parts::decompose_word(value), + parts: super::word_parts::decompose_word_literal(value), value: value.to_string(), spans: Vec::new(), }) } -/// Creates a `cond-term` node for conditional expressions. -pub(super) fn cond_term(value: &str) -> Node { +/// Creates a `cond-term` node from a lexer token, preserving spans. +pub(super) fn cond_term_from_token(tok: &Token) -> Node { Node::empty(NodeKind::CondTerm { - value: value.to_string(), + value: tok.value.clone(), + spans: tok.spans.clone(), }) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 6f8d0ab..35428ae 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -10,7 +10,7 @@ use crate::token::{Token, TokenType}; use helpers::{ add_stderr_redirect, fill_heredoc_contents, is_fd_number, is_varfd, make_stderr_redirect, - parse_heredoc_delimiter, word_node, + parse_heredoc_delimiter, word_node, word_node_from_token, }; /// Maximum recursion/iteration depth to prevent infinite loops. @@ -424,11 +424,11 @@ impl Parser { // After |, time is a regular word, not a keyword. // Temporarily demote it and all following reserved words to words. let time_tok = self.lexer.next_token()?; - let mut words = vec![word_node(&time_tok.value)]; + let mut words = vec![word_node_from_token(&time_tok)]; // Check for -p flag (also a word in this context) if self.check_word("-p")? { let p_tok = self.lexer.next_token()?; - words.push(word_node(&p_tok.value)); + words.push(word_node_from_token(&p_tok)); } // Consume all remaining words (including reserved words as plain words) let mut redirects = Vec::new(); @@ -456,7 +456,7 @@ impl Parser { continue; } let t = self.lexer.next_token()?; - words.push(word_node(&t.value)); + words.push(word_node_from_token(&t)); } Ok(self.spanned( start, @@ -568,7 +568,7 @@ impl Parser { return self.parse_function_def(&tok); } let word_span = Span::new(tok.pos, tok.pos + tok.value.len()); - let parts = word_parts::decompose_word(&tok.value); + let parts = word_parts::decompose_word_with_spans(&tok.value, &tok.spans); let node = Node::new( NodeKind::Word { value: tok.value, @@ -679,7 +679,7 @@ impl Parser { start, NodeKind::Redirect { op: op_tok.value, - target: Box::new(word_node(&target_tok.value)), + target: Box::new(word_node_from_token(&target_tok)), fd, }, )) diff --git a/src/parser/word_parts.rs b/src/parser/word_parts.rs index e441264..ab7b494 100644 --- a/src/parser/word_parts.rs +++ b/src/parser/word_parts.rs @@ -6,7 +6,8 @@ use std::cell::Cell; use crate::ast::{ListItem, Node, NodeKind, Span}; -use crate::sexp::word::{WordSegment, parse_word_segments}; +use crate::lexer::word_builder::WordSpan; +use crate::sexp::word::{WordSegment, segments_from_spans}; thread_local! { static DECOMPOSE_DEPTH: Cell = const { Cell::new(0) }; @@ -40,19 +41,21 @@ impl Drop for DepthGuard { } } -/// Decomposes a word's raw value into a list of typed AST nodes. -/// -/// Each segment becomes one of: -/// - `WordLiteral` for plain text -/// - `CommandSubstitution` for `$(...)` -/// - `ProcessSubstitution` for `<(...)` / `>(...)` -/// - `AnsiCQuote` for `$'...'` -/// - `LocaleString` for `$"..."` -pub(super) fn decompose_word(value: &str) -> Vec { - let segments = parse_word_segments(value); +/// Decomposes a word using lexer spans — the primary path for all +/// token-derived words. +pub(super) fn decompose_word_with_spans(value: &str, spans: &[WordSpan]) -> Vec { + let segments = segments_from_spans(value, spans); segments.into_iter().map(segment_to_node).collect() } +/// Creates a single `WordLiteral` for synthetic values (no lexer token). +/// Used for fd numbers (`"0"`, `"1"`), synthetic `"$@"`, etc. +pub(super) fn decompose_word_literal(value: &str) -> Vec { + vec![Node::empty(NodeKind::WordLiteral { + value: value.to_string(), + })] +} + fn segment_to_node(seg: WordSegment) -> Node { match seg { WordSegment::Literal(text) => Node::empty(NodeKind::WordLiteral { value: text }), @@ -129,9 +132,17 @@ fn wrap_nodes(mut nodes: Vec) -> Node { mod tests { use super::*; + /// Helper: lex a single word and decompose it using the real span path. + #[allow(clippy::unwrap_used)] + fn decompose(source: &str) -> Vec { + let mut lexer = crate::lexer::Lexer::new(source, false); + let tok = lexer.next_token().unwrap(); + decompose_word_with_spans(&tok.value, &tok.spans) + } + #[test] fn plain_word() { - let parts = decompose_word("echo"); + let parts = decompose("echo"); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, @@ -141,7 +152,7 @@ mod tests { #[test] fn simple_variable_stays_literal() { - let parts = decompose_word("$foo"); + let parts = decompose("$foo"); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, @@ -151,7 +162,7 @@ mod tests { #[test] fn command_substitution() { - let parts = decompose_word("$(date)"); + let parts = decompose("$(date)"); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, @@ -161,7 +172,7 @@ mod tests { #[test] fn mixed_segments() { - let parts = decompose_word("hello$(world)end"); + let parts = decompose("hello$(world)end"); assert_eq!(parts.len(), 3); assert!(matches!( &parts[0].kind, @@ -179,7 +190,7 @@ mod tests { #[test] fn ansi_c_quote() { - let parts = decompose_word("$'foo\\nbar'"); + let parts = decompose("$'foo\\nbar'"); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, @@ -189,7 +200,7 @@ mod tests { #[test] fn process_substitution() { - let parts = decompose_word("<(cmd)"); + let parts = decompose("<(cmd)"); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, @@ -200,7 +211,7 @@ mod tests { #[test] fn locale_string() { - let parts = decompose_word("$\"hello\""); + let parts = decompose("$\"hello\""); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs index 9faee1a..13e9509 100644 --- a/src/sexp/mod.rs +++ b/src/sexp/mod.rs @@ -19,11 +19,12 @@ impl fmt::Display for NodeKind { match self { Self::Word { value, spans, .. } => { write!(f, "(word \"")?; - if !spans.is_empty() && !word::needs_value_path(value) { + if spans.is_empty() { + // Synthetic word (no lexer token) — escape directly + write_escaped_word(f, value)?; + } else { let segments = word::segments_from_spans(value, spans); word::write_word_segments(f, &segments)?; - } else { - word::write_word_value(f, value)?; } write!(f, "\")") } @@ -205,18 +206,19 @@ impl fmt::Display for NodeKind { write!(f, "(cond {body})")?; write_redirects(f, redirects) } - Self::CondTerm { value } => { + Self::CondTerm { value, spans } => { // Strip $" locale prefix let val = if value.starts_with("$\"") { &value[1..] } else { value }; - // Process cmdsubs within the value (for redirect normalization) - // but write other content raw (no escaping — cond-terms use raw values) - if val.contains("$(") { + let has_cmdsub = spans.iter().any(|s| { + matches!(s.kind, crate::lexer::word_builder::WordSpanKind::CommandSub) + }); + if has_cmdsub { write!(f, "(cond-term \"")?; - let segments = word::parse_word_segments(val); + let segments = word::segments_from_spans(val, spans); for seg in &segments { match seg { word::WordSegment::Literal(text) => write!(f, "{text}")?, @@ -286,119 +288,6 @@ impl fmt::Display for CasePattern { // Old write_word_value replaced by word::write_word_value (segment-based) -/// Extracts content from matched parentheses (for command substitution). -/// Case-aware: `)` in case patterns doesn't close the substitution. -#[allow(clippy::too_many_lines)] -pub(super) fn extract_paren_content(chars: &[char], pos: &mut usize) -> String { - let mut content = String::new(); - let mut depth = 1; - let mut case = crate::context::CaseTracker::default(); - let mut word_buf = String::new(); - - while *pos < chars.len() { - let c = chars[*pos]; - // Handle backslash-escaped characters - if c == '\\' && *pos + 1 < chars.len() { - word_buf.clear(); - content.push(c); - *pos += 1; - content.push(chars[*pos]); - *pos += 1; - continue; - } - if c == '(' { - case.check_word(&word_buf); - word_buf.clear(); - // In case pattern mode, `(` is optional pattern prefix — don't increment depth - if !case.is_pattern_open() { - depth += 1; - } - content.push(c); - } else if c == ')' { - case.check_word(&word_buf); - word_buf.clear(); - if case.is_pattern_close() { - // Case pattern terminator — don't close - content.push(c); - case.close_pattern(); - } else { - depth -= 1; - if depth == 0 { - *pos += 1; - return content; - } - content.push(c); - } - } else if c == '\'' { - case.check_word(&word_buf); - word_buf.clear(); - crate::context::skip_single_quoted(chars, pos, &mut content); - continue; // skip_single_quoted already advances pos - } else if c == '"' { - case.check_word(&word_buf); - word_buf.clear(); - crate::context::skip_double_quoted(chars, pos, &mut content); - continue; - } else if c == ';' { - case.check_word(&word_buf); - word_buf.clear(); - content.push(c); - // Check for ;; ;& ;;& which resume case pattern mode - if *pos + 1 < chars.len() { - if chars[*pos + 1] == ';' { - *pos += 1; - content.push(chars[*pos]); - if *pos + 1 < chars.len() && chars[*pos + 1] == '&' { - *pos += 1; - content.push(chars[*pos]); - } - case.resume_pattern(); - } else if chars[*pos + 1] == '&' { - *pos += 1; - content.push(chars[*pos]); - case.resume_pattern(); - } - } - } else if c == '`' { - case.check_word(&word_buf); - word_buf.clear(); - crate::context::skip_backtick(chars, pos, &mut content); - continue; - } else if c == '#' { - // Comment: # after whitespace/newline skips to end of line - case.check_word(&word_buf); - word_buf.clear(); - let prev = if content.is_empty() { - '\n' - } else { - content.chars().last().unwrap_or('\n') - }; - if prev == '\n' || prev == ' ' || prev == '\t' { - while *pos < chars.len() && chars[*pos] != '\n' { - *pos += 1; - } - // Don't advance past the newline — the main loop will - if *pos < chars.len() { - *pos -= 1; // will be incremented by the outer loop - } - } else { - content.push(c); - } - } else if c == ' ' || c == '\t' || c == '\n' || c == '|' { - case.check_word(&word_buf); - word_buf.clear(); - content.push(c); - } else { - word_buf.push(c); - content.push(c); - } - *pos += 1; - } - content -} - -// CaseTracker moved to crate::context - /// Normalizes command substitution content: /// - Strips leading/trailing whitespace and newlines /// - Strips trailing semicolons @@ -785,30 +674,21 @@ fn write_in_list(f: &mut fmt::Formatter<'_>, words: Option<&[Node]>) -> fmt::Res fn write_redirect(f: &mut fmt::Formatter<'_>, op: &str, target: &Node, _fd: i32) -> fmt::Result { write!(f, "(redirect \"{op}\" ")?; - if let NodeKind::Word { value, .. } = &target.kind { + if let NodeKind::Word { value, spans, .. } = &target.kind { // For fd dup operations (>&, <&), bare digit-only targets are unquoted let is_fd_op = op.starts_with(">&") || op.starts_with("<&") || op.ends_with("&-") || op.ends_with('&'); if is_fd_op && !value.is_empty() && value.chars().all(|c| c.is_ascii_digit()) { write!(f, "{value})") - } else if value.starts_with("$\"") { - // Locale string: strip $ prefix, preserve literal quotes - write!(f, "\"{}\")", &value[1..]) - } else if value.starts_with("$'") && !needs_word_processing(value) { - // ANSI-C at start with no nested constructs: process with literal newlines - let chars: Vec = value.chars().collect(); - let mut pos = 2; // skip $' - let processed = process_ansi_c_content(&chars, &mut pos); - // Include any text after the closing quote - let remaining: String = chars[pos..].iter().collect(); - write!(f, "\"'{processed}'{remaining}\")") - } else if needs_word_processing(value) { - // Values with $'...', $"...", $(...), <(...) need segment processing + } else if !spans.is_empty() { + // Use span-based formatting — redirect targets use literal + // output (no S-expression escaping of quotes) write!(f, "\"")?; - word::write_word_value(f, value)?; + let segments = word::segments_from_spans(value, spans); + write_redirect_segments(f, &segments)?; write!(f, "\")") } else { - // Plain values: output as-is inside quotes + // Synthetic nodes (fd strings) — output as-is write!(f, "\"{value}\")") } } else { @@ -816,18 +696,49 @@ fn write_redirect(f: &mut fmt::Formatter<'_>, op: &str, target: &Node, _fd: i32) } } -/// Returns true if a redirect target value needs word segment processing. -/// Note: $' and $" at the START are handled specially (literal newlines), -/// so only mid-value occurrences or $()/<() trigger word processing. -fn needs_word_processing(value: &str) -> bool { - if value.starts_with("$'") || value.starts_with("$\"") { - // $' or $" at start handled by special cases — only need word processing - // if there's ALSO a $( or <( elsewhere in the value - let rest = &value[2..]; - rest.contains("$(") || rest.contains("<(") || rest.contains(">(") - } else { - value.contains("$'") || value.contains("$(") || value.contains("<(") || value.contains(">(") +/// Formats word segments for redirect targets — uses literal output +/// (no S-expression escaping) for text, but processes ANSI-C escapes +/// and reformats command substitutions. +fn write_redirect_segments( + f: &mut fmt::Formatter<'_>, + segments: &[word::WordSegment], +) -> fmt::Result { + for seg in segments { + match seg { + word::WordSegment::Literal(text) => write!(f, "{text}")?, + word::WordSegment::AnsiCQuote(raw) => { + let chars: Vec = raw.chars().collect(); + let mut pos = 0; + let processed = process_ansi_c_content(&chars, &mut pos); + write!(f, "'{processed}'")?; + } + word::WordSegment::LocaleString(content) => { + // $"..." → "..." (content already includes "...") + write!(f, "{content}")?; + } + word::WordSegment::CommandSubstitution(content) => { + write!(f, "$(")?; + if let Some(reformatted) = crate::format::reformat_bash(content) { + write!(f, "{reformatted}")?; + } else { + let normalized = normalize_cmdsub_content(content); + write!(f, "{normalized}")?; + } + write!(f, ")")?; + } + word::WordSegment::ProcessSubstitution(dir, content) => { + write!(f, "{dir}(")?; + if let Some(reformatted) = crate::format::reformat_bash(content) { + write!(f, "{reformatted}")?; + } else { + let normalized = normalize_cmdsub_content(content); + write!(f, "{normalized}")?; + } + write!(f, ")")?; + } + } } + Ok(()) } fn write_param( diff --git a/src/sexp/word.rs b/src/sexp/word.rs index 489dcb1..d22b12c 100644 --- a/src/sexp/word.rs +++ b/src/sexp/word.rs @@ -9,8 +9,8 @@ use std::fmt; use crate::format; use super::{ - ansi_c::process_ansi_c_content, extract_paren_content, normalize_cmdsub_content, - write_escaped_char, write_escaped_word, + ansi_c::process_ansi_c_content, normalize_cmdsub_content, write_escaped_char, + write_escaped_word, }; /// A typed segment within a word token's raw value. @@ -28,150 +28,6 @@ pub enum WordSegment { ProcessSubstitution(char, String), } -/// Parses a word's raw value into typed segments. -#[allow(clippy::too_many_lines)] -pub fn parse_word_segments(value: &str) -> Vec { - let chars: Vec = value.chars().collect(); - let mut segments = Vec::new(); - let mut i = 0; - let mut literal = String::new(); - let mut brace_depth = 0usize; // Track ${...} nesting - let mut in_double_quote = false; // Track "..." context - - while i < chars.len() { - let prev_backslash = crate::context::is_backslash_escaped(&chars, i); - - // Track double-quote context (not inside ${...} where quotes nest differently) - if chars[i] == '"' && !prev_backslash && brace_depth == 0 { - in_double_quote = !in_double_quote; - literal.push(chars[i]); - i += 1; - continue; - } - - // Track closing braces for ${...} - if chars[i] == '}' && brace_depth > 0 { - brace_depth -= 1; - literal.push(chars[i]); - i += 1; - continue; - } - - // Single quotes are opaque — nothing is special inside '...' - if chars[i] == '\'' && !prev_backslash && !in_double_quote { - literal.push(chars[i]); - i += 1; - while i < chars.len() && chars[i] != '\'' { - literal.push(chars[i]); - i += 1; - } - if i < chars.len() { - literal.push(chars[i]); // closing ' - i += 1; - } - continue; - } - - // Backticks are opaque — don't process $'...' or $() inside them - if chars[i] == '`' && !prev_backslash { - literal.push(chars[i]); - i += 1; - while i < chars.len() && chars[i] != '`' { - if chars[i] == '\\' && i + 1 < chars.len() { - literal.push(chars[i]); - i += 1; - } - literal.push(chars[i]); - i += 1; - } - if i < chars.len() { - literal.push(chars[i]); // closing ` - i += 1; - } - continue; - } - - if chars[i] == '$' && !prev_backslash && i + 1 < chars.len() { - match chars[i + 1] { - '\'' => { - if in_double_quote && brace_depth == 0 { - // $'...' is NOT special inside double quotes — literal - // (but IS special inside ${...} even when double-quoted) - literal.push(chars[i]); - i += 1; - continue; - } - if brace_depth > 0 { - // ANSI-C inside ${...} — process escapes, output without quotes - i += 2; // skip $' - let content = extract_ansi_c_content(&chars, &mut i); - let ac_chars: Vec = content.chars().collect(); - let mut pos = 0; - let processed = super::process_ansi_c_content(&ac_chars, &mut pos); - literal.push_str(&processed); - } else { - flush_literal(&mut literal, &mut segments); - i += 2; // skip $' - let content = extract_ansi_c_content(&chars, &mut i); - segments.push(WordSegment::AnsiCQuote(content)); - } - } - '"' => { - if in_double_quote { - // $" inside double quotes is literal (not a locale string) - literal.push(chars[i]); - i += 1; - } else { - flush_literal(&mut literal, &mut segments); - i += 1; // skip $, keep " - let content = extract_locale_content(&chars, &mut i); - segments.push(WordSegment::LocaleString(content)); - } - } - '{' => { - literal.push(chars[i]); - literal.push(chars[i + 1]); - i += 2; - brace_depth += 1; - } - '(' => { - // $(( )) arithmetic — treat as literal (don't reformat) - if i + 2 < chars.len() && chars[i + 2] == '(' { - literal.push(chars[i]); - i += 1; - continue; - } - flush_literal(&mut literal, &mut segments); - i += 2; // skip $( - let content = extract_paren_content(&chars, &mut i); - segments.push(WordSegment::CommandSubstitution(content)); - } - _ => { - literal.push(chars[i]); - i += 1; - } - } - } else if (chars[i] == '>' || chars[i] == '<') - && !prev_backslash - && !in_double_quote - && i + 1 < chars.len() - && chars[i + 1] == '(' - { - let direction = chars[i]; - flush_literal(&mut literal, &mut segments); - i += 2; // skip >( or <( - let content = extract_paren_content(&chars, &mut i); - segments.push(WordSegment::ProcessSubstitution(direction, content)); - } else { - literal.push(chars[i]); - i += 1; - } - } - - flush_literal(&mut literal, &mut segments); - segments -} - /// Formats word segments into S-expression output. pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) -> fmt::Result { for seg in segments { @@ -232,13 +88,6 @@ pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) Ok(()) } -/// The public entry point — formats a word value via segment parsing. -/// Used as fallback when spans are not available (synthetic word nodes). -pub fn write_word_value(f: &mut fmt::Formatter<'_>, value: &str) -> fmt::Result { - let segments = parse_word_segments(value); - write_word_segments(f, &segments) -} - /// Converts lexer spans to segments without re-parsing the value. /// /// Filters to top-level sexp-relevant spans (not contained within @@ -344,13 +193,6 @@ fn push_literal(segments: &mut Vec, text: &str) { } } -/// Returns true if the word value requires the value-based formatting -/// path. Currently always returns false — all cases are handled by -/// the span-based path or by lex-time normalization. -pub const fn needs_value_path(_value: &str) -> bool { - false -} - const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bool { use crate::lexer::word_builder::WordSpanKind; matches!( @@ -379,114 +221,3 @@ fn collect_top_level_sexp_spans( .copied() .collect() } - -// -- helpers -- - -fn flush_literal(literal: &mut String, segments: &mut Vec) { - if !literal.is_empty() { - segments.push(WordSegment::Literal(std::mem::take(literal))); - } -} - -/// Extracts ANSI-C content between `'...'` (after `$'` was consumed). -fn extract_ansi_c_content(chars: &[char], pos: &mut usize) -> String { - let mut content = String::new(); - while *pos < chars.len() { - let c = chars[*pos]; - if c == '\'' { - *pos += 1; // skip closing ' - return content; - } - if c == '\\' && *pos + 1 < chars.len() { - content.push('\\'); - *pos += 1; - content.push(chars[*pos]); - } else { - content.push(c); - } - *pos += 1; - } - content -} - -/// Extracts locale string content `"..."` (after `$` was skipped, starting at `"`). -fn extract_locale_content(chars: &[char], pos: &mut usize) -> String { - let mut content = String::new(); - // Read opening " and content until closing " - let start = *pos; - while *pos < chars.len() { - content.push(chars[*pos]); - if chars[*pos] == '"' && *pos > start { - *pos += 1; - return content; - } - *pos += 1; - } - content -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn plain_word() { - let segs = parse_word_segments("hello"); - assert_eq!(segs, vec![WordSegment::Literal("hello".into())]); - } - - #[test] - fn ansi_c_quote() { - let segs = parse_word_segments("$'foo\\nbar'"); - assert_eq!(segs, vec![WordSegment::AnsiCQuote("foo\\nbar".into())]); - } - - #[test] - fn locale_string() { - let segs = parse_word_segments("$\"hello\""); - assert_eq!(segs, vec![WordSegment::LocaleString("\"hello\"".into())]); - } - - #[test] - fn command_substitution() { - let segs = parse_word_segments("$(date)"); - assert_eq!(segs, vec![WordSegment::CommandSubstitution("date".into())]); - } - - #[test] - fn arithmetic_stays_literal() { - let segs = parse_word_segments("$((1+2))"); - assert_eq!(segs, vec![WordSegment::Literal("$((1+2))".into())]); - } - - #[test] - fn mixed_segments() { - let segs = parse_word_segments("hello$(world)$'foo'"); - assert_eq!( - segs, - vec![ - WordSegment::Literal("hello".into()), - WordSegment::CommandSubstitution("world".into()), - WordSegment::AnsiCQuote("foo".into()), - ] - ); - } - - #[test] - fn escaped_dollar_not_special() { - let segs = parse_word_segments("\\$'not-ansi'"); - assert_eq!(segs, vec![WordSegment::Literal("\\$'not-ansi'".into())]); - } - - #[test] - fn assignment_with_cmdsub() { - let segs = parse_word_segments("x=$(cmd)"); - assert_eq!( - segs, - vec![ - WordSegment::Literal("x=".into()), - WordSegment::CommandSubstitution("cmd".into()), - ] - ); - } -} From cec7e8e5f8a245b15211598f21f1b5fedd6081d4 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Wed, 25 Mar 2026 13:56:02 +0100 Subject: [PATCH 3/5] refactor: simplify span collection, move to owned tokens, remove dead code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix O(n²) collect_top_level_sexp_spans: sort by start offset then single-pass sweep (was using ptr::eq containment check) - Move word_node_from_token/cond_term_from_token to take owned Token, eliminating String+Vec clones per word - Remove stale #[allow(dead_code)] annotations on Token.spans, Token::with_spans, and blanket module-level allow on word_builder - Remove unused WordBuilder::push_str method - Fix CondTerm Display to handle all segment types via write_redirect_segments instead of debug-formatting unknown segments - Fix locale string context handling for $"..." inside words Oracle improved: 169/181 (was 167). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ast.rs | 2 -- src/lexer/expansions.rs | 4 ++-- src/lexer/word_builder.rs | 5 ----- src/parser/compound.rs | 10 +++++----- src/parser/conditional.rs | 8 ++++---- src/parser/helpers.rs | 19 ++++++++++--------- src/parser/mod.rs | 8 ++++---- src/sexp/mod.rs | 37 ++++++++++++++----------------------- src/sexp/word.rs | 28 +++++++++++++++------------- src/token.rs | 2 -- 10 files changed, 54 insertions(+), 69 deletions(-) diff --git a/src/ast.rs b/src/ast.rs index 79d6e52..3fff3d4 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -75,7 +75,6 @@ pub enum NodeKind { Word { value: String, parts: Vec, - #[allow(dead_code)] spans: Vec, }, @@ -328,7 +327,6 @@ pub enum NodeKind { /// A term (word) in a conditional expression. CondTerm { value: String, - #[allow(dead_code)] spans: Vec, }, diff --git a/src/lexer/expansions.rs b/src/lexer/expansions.rs index 96761da..5ba00f6 100644 --- a/src/lexer/expansions.rs +++ b/src/lexer/expansions.rs @@ -64,8 +64,8 @@ impl Lexer { self.read_matched_parens(wb, 1)?; let content_end = wb.len().saturating_sub(1); if content_start < content_end { - let content: String = wb.value[content_start..content_end].to_string(); - if !content.trim().is_empty() && crate::parse(&content, self.extglob()).is_err() { + let content = &wb.value[content_start..content_end]; + if !content.trim().is_empty() && crate::parse(content, self.extglob()).is_err() { return Err(RableError::parse( "invalid command substitution", self.pos, diff --git a/src/lexer/word_builder.rs b/src/lexer/word_builder.rs index 7e3279a..a8298d1 100644 --- a/src/lexer/word_builder.rs +++ b/src/lexer/word_builder.rs @@ -4,7 +4,6 @@ //! that record where each expansion starts and ends, along with the //! quoting context at the point of recording. This eliminates the need //! for downstream code to re-parse word values. -#![allow(dead_code)] /// The quoting context at the point where a span was recorded. /// @@ -52,10 +51,6 @@ impl WordBuilder { self.value.push(c); } - pub fn push_str(&mut self, s: &str) { - self.value.push_str(s); - } - pub fn ends_with(&self, c: char) -> bool { self.value.ends_with(c) } diff --git a/src/parser/compound.rs b/src/parser/compound.rs index 1be8779..baf4ce7 100644 --- a/src/parser/compound.rs +++ b/src/parser/compound.rs @@ -148,7 +148,7 @@ impl Parser { break; } let tok = self.lexer.next_token()?; - ws.push(word_node_from_token(&tok)); + ws.push(word_node_from_token(tok)); } Some(ws) } else { @@ -292,7 +292,7 @@ impl Parser { if tok.kind == TokenType::Pipe { continue; } - pattern_words.push(word_node_from_token(&tok)); + pattern_words.push(word_node_from_token(tok)); } self.skip_newlines()?; @@ -351,7 +351,7 @@ impl Parser { break; } let tok = self.lexer.next_token()?; - ws.push(word_node_from_token(&tok)); + ws.push(word_node_from_token(tok)); } Some(ws) } else { @@ -516,7 +516,7 @@ impl Parser { } else { None }; - let mut words = vec![word_node_from_token(&first_tok)]; + let mut words = vec![word_node_from_token(first_tok)]; let mut redirects = Vec::new(); loop { if self.at_end()? { @@ -532,7 +532,7 @@ impl Parser { if is_fd_number(&tok.value) && self.is_redirect_operator()? { redirects.push(self.parse_redirect_with_fd(&tok)?); } else { - words.push(word_node_from_token(&tok)); + words.push(word_node_from_token(tok)); } } else { break; diff --git a/src/parser/conditional.rs b/src/parser/conditional.rs index 2860e86..a360710 100644 --- a/src/parser/conditional.rs +++ b/src/parser/conditional.rs @@ -107,7 +107,7 @@ impl Parser { start, NodeKind::UnaryTest { op: first.value, - operand: Box::new(cond_term_from_token(&operand_tok)), + operand: Box::new(cond_term_from_token(operand_tok)), }, )); } @@ -128,8 +128,8 @@ impl Parser { start, NodeKind::BinaryTest { op: op.value, - left: Box::new(cond_term_from_token(&first)), - right: Box::new(cond_term_from_token(&right)), + left: Box::new(cond_term_from_token(first)), + right: Box::new(cond_term_from_token(right)), }, )); } @@ -140,7 +140,7 @@ impl Parser { start, NodeKind::UnaryTest { op: "-n".to_string(), - operand: Box::new(cond_term_from_token(&first)), + operand: Box::new(cond_term_from_token(first)), }, )) } diff --git a/src/parser/helpers.rs b/src/parser/helpers.rs index 3ff2149..72e42dd 100644 --- a/src/parser/helpers.rs +++ b/src/parser/helpers.rs @@ -3,12 +3,13 @@ use crate::ast::{Node, NodeKind}; use crate::token::Token; -/// Creates a `Word` node from a lexer token, preserving spans. -pub fn word_node_from_token(tok: &Token) -> Node { +/// Creates a `Word` node from a lexer token, moving value and spans. +pub fn word_node_from_token(tok: Token) -> Node { + let parts = super::word_parts::decompose_word_with_spans(&tok.value, &tok.spans); Node::empty(NodeKind::Word { - parts: super::word_parts::decompose_word_with_spans(&tok.value, &tok.spans), - value: tok.value.clone(), - spans: tok.spans.clone(), + parts, + value: tok.value, + spans: tok.spans, }) } @@ -21,11 +22,11 @@ pub fn word_node(value: &str) -> Node { }) } -/// Creates a `cond-term` node from a lexer token, preserving spans. -pub(super) fn cond_term_from_token(tok: &Token) -> Node { +/// Creates a `cond-term` node from a lexer token, moving value and spans. +pub(super) fn cond_term_from_token(tok: Token) -> Node { Node::empty(NodeKind::CondTerm { - value: tok.value.clone(), - spans: tok.spans.clone(), + value: tok.value, + spans: tok.spans, }) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 35428ae..823b2df 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -424,11 +424,11 @@ impl Parser { // After |, time is a regular word, not a keyword. // Temporarily demote it and all following reserved words to words. let time_tok = self.lexer.next_token()?; - let mut words = vec![word_node_from_token(&time_tok)]; + let mut words = vec![word_node_from_token(time_tok)]; // Check for -p flag (also a word in this context) if self.check_word("-p")? { let p_tok = self.lexer.next_token()?; - words.push(word_node_from_token(&p_tok)); + words.push(word_node_from_token(p_tok)); } // Consume all remaining words (including reserved words as plain words) let mut redirects = Vec::new(); @@ -456,7 +456,7 @@ impl Parser { continue; } let t = self.lexer.next_token()?; - words.push(word_node_from_token(&t)); + words.push(word_node_from_token(t)); } Ok(self.spanned( start, @@ -679,7 +679,7 @@ impl Parser { start, NodeKind::Redirect { op: op_tok.value, - target: Box::new(word_node_from_token(&target_tok)), + target: Box::new(word_node_from_token(target_tok)), fd, }, )) diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs index 13e9509..b41a637 100644 --- a/src/sexp/mod.rs +++ b/src/sexp/mod.rs @@ -213,31 +213,22 @@ impl fmt::Display for NodeKind { } else { value }; - let has_cmdsub = spans.iter().any(|s| { - matches!(s.kind, crate::lexer::word_builder::WordSpanKind::CommandSub) - }); - if has_cmdsub { - write!(f, "(cond-term \"")?; + if spans.is_empty() { + write!(f, "(cond-term \"{val}\")") + } else { let segments = word::segments_from_spans(val, spans); - for seg in &segments { - match seg { - word::WordSegment::Literal(text) => write!(f, "{text}")?, - word::WordSegment::CommandSubstitution(content) => { - write!(f, "$(")?; - if let Some(reformatted) = crate::format::reformat_bash(content) { - write!(f, "{reformatted}")?; - } else { - let normalized = normalize_cmdsub_content(content); - write!(f, "{normalized}")?; - } - write!(f, ")")?; - } - _ => write!(f, "{seg:?}")?, - } + if segments + .iter() + .all(|s| matches!(s, word::WordSegment::Literal(_))) + { + // No expansions needing processing + write!(f, "(cond-term \"{val}\")") + } else { + write!(f, "(cond-term \"")?; + // CondTerm uses raw output (like redirect targets) + write_redirect_segments(f, &segments)?; + write!(f, "\")") } - write!(f, "\")") - } else { - write!(f, "(cond-term \"{val}\")") } } Self::UnaryTest { op, operand } => { diff --git a/src/sexp/word.rs b/src/sexp/word.rs index d22b12c..e1fc31b 100644 --- a/src/sexp/word.rs +++ b/src/sexp/word.rs @@ -204,20 +204,22 @@ const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bo ) } -/// Collects sexp-relevant spans that are not nested inside another -/// sexp-relevant span. Sorted by start offset. +/// Collects top-level sexp-relevant spans sorted by start offset. +/// A span is top-level if no other sexp-relevant span fully contains it. fn collect_top_level_sexp_spans( spans: &[crate::lexer::word_builder::WordSpan], ) -> Vec<&crate::lexer::word_builder::WordSpan> { - let relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect(); - relevant - .iter() - .filter(|s| { - // A span is top-level if no other relevant span contains it - !relevant.iter().any(|outer| { - outer.start <= s.start && outer.end >= s.end && !std::ptr::eq(*outer, **s) - }) - }) - .copied() - .collect() + // Collect sexp-relevant spans sorted by start offset + let mut relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect(); + relevant.sort_by_key(|s| s.start); + // Single pass: skip spans nested inside a previously collected one + let mut result = Vec::new(); + let mut covered_until: usize = 0; + for span in relevant { + if span.start >= covered_until { + covered_until = span.end; + result.push(span); + } + } + result } diff --git a/src/token.rs b/src/token.rs index 1187ba1..c8e7c3b 100644 --- a/src/token.rs +++ b/src/token.rs @@ -151,7 +151,6 @@ pub struct Token { pub pos: usize, pub line: usize, /// Expansion spans within the word value (empty for non-word tokens). - #[allow(dead_code)] pub(crate) spans: Vec, } @@ -167,7 +166,6 @@ impl Token { } /// Creates a word token with pre-recorded expansion spans. - #[allow(dead_code)] pub(crate) const fn with_spans( kind: TokenType, value: String, From 5ecfd6926b0438bf579a1de0cd09c7bc5061c713 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Wed, 25 Mar 2026 13:59:57 +0100 Subject: [PATCH 4/5] test: add assertion-based oracle tracking with KNOWN_ORACLE_FAILURES Replace the non-asserting oracle test with one that tracks known failures explicitly. The test now fails on: - Regressions (previously passing test now fails) - Newly passing tests (update KNOWN_ORACLE_FAILURES to track progress) 12 known failures documented with root cause comments. Oracle: 169/181. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/integration.rs | 138 +++++++++++++++++++++++++++++++------------ 1 file changed, 100 insertions(+), 38 deletions(-) diff --git a/tests/integration.rs b/tests/integration.rs index bc28ba7..b46a720 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -199,9 +199,77 @@ parable_tests! { parable_35_parser_bugs => "35_parser_bugs.tests", } +/// Known oracle test failures — tests that don't match bash-oracle output yet. +/// When a fix makes one of these pass, the test suite will fail with +/// "NEWLY PASSING" so you know to remove it from this list. +const KNOWN_ORACLE_FAILURES: &[&str] = &[ + // Trailing backslash doubling + "ansi_c_escapes 3", + "redirect_formatting 3", + "heredoc_formatting 1", + // ANSI-C \x single hex digit and \0 octal repeat behavior + "ansi_c_escapes 13", + "other 10", + // Heredoc delimiter edge cases + "ansi_c_escapes 18", + "heredoc_formatting 8", + // Varfd {6d} not recognized → word dropped + "heredoc_formatting 9", + // Coproc with adjacent redirect + "redirect_formatting 7", + // Background & placement after heredoc in cmdsub + "cmdsub_formatting 9", + // Deprecated $[...] with ; splits word + "word_boundaries 8", + // Assignment detection causes esac to be keyword + "word_boundaries 2", +]; + +struct OracleResults { + total_pass: usize, + total_fail: usize, + regressions: Vec, + newly_passing: Vec, +} + +fn run_oracle_file(path: &Path, results: &mut OracleResults) { + let file_name = path.file_name().unwrap_or_default().to_string_lossy(); + let content = fs::read_to_string(path).unwrap_or_default(); + let cases = parse_test_file(&content); + let mut pass = 0; + let mut fail = 0; + + for case in &cases { + let (passed, actual) = run_test(case); + let is_known = KNOWN_ORACLE_FAILURES.contains(&case.name.as_str()); + if passed { + pass += 1; + if is_known { + results.newly_passing.push(format!( + " NEWLY PASSING :: {} — remove from KNOWN_ORACLE_FAILURES", + case.name, + )); + } + } else { + fail += 1; + if !is_known { + results.regressions.push(format!( + " REGRESSION :: {}\n input: {:?}\n expected: {:?}\n actual: {:?}", + case.name, case.input, case.expected, actual, + )); + } + } + } + + let total = pass + fail; + let status = if fail == 0 { "OK" } else { "FAIL" }; + eprintln!(" {file_name}: {pass}/{total} passed [{status}]"); + results.total_pass += pass; + results.total_fail += fail; +} + /// Oracle-derived tests: correctness differences found by fuzzing against bash-oracle. -/// These tests track progress toward full bash compatibility beyond Parable's test suite. -/// This test reports results but does NOT fail the build. +/// Asserts that known failures still fail and known passes don't regress. #[test] fn oracle_test_suite() { let test_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/oracle"); @@ -213,8 +281,12 @@ fn oracle_test_suite() { eprintln!("\n=== Oracle Test Suite (bash-oracle compatibility) ===\n"); let filter = std::env::var("RABLE_TEST").ok(); - let mut total_pass = 0; - let mut total_fail = 0; + let mut results = OracleResults { + total_pass: 0, + total_fail: 0, + regressions: Vec::new(), + newly_passing: Vec::new(), + }; let mut entries: Vec<_> = fs::read_dir(&test_dir) .unwrap_or_else(|_| { @@ -229,44 +301,34 @@ fn oracle_test_suite() { for entry in entries { let path = entry.path(); - let file_name = path.file_name().unwrap_or_default().to_string_lossy(); - - if let Some(ref f) = filter - && !file_name.contains(f.as_str()) - { + let name = path.file_name().unwrap_or_default().to_string_lossy(); + if filter.as_ref().is_some_and(|f| !name.contains(f.as_str())) { continue; } + run_oracle_file(&path, &mut results); + } - let content = fs::read_to_string(&path).unwrap_or_default(); - let cases = parse_test_file(&content); - let mut pass = 0; - let mut fail = 0; + let total = results.total_pass + results.total_fail; + eprintln!( + "\n=== Oracle Results: {}/{total} passed ({} remaining) ===\n", + results.total_pass, results.total_fail + ); - let mut failures = Vec::new(); - for case in &cases { - let (passed, actual) = run_test(case); - if passed { - pass += 1; - } else { - fail += 1; - failures.push(format!( - " FAIL :: {}\n input: {:?}\n expected: {:?}\n actual: {:?}", - case.name, case.input, case.expected, actual, - )); - } - } - - let total = pass + fail; - let status = if fail == 0 { "OK" } else { "FAIL" }; - eprintln!(" {file_name}: {pass}/{total} passed [{status}]"); - for f in &failures { - eprintln!("{f}"); - } - total_pass += pass; - total_fail += fail; + for msg in &results.newly_passing { + eprintln!("{msg}"); + } + for msg in &results.regressions { + eprintln!("{msg}"); } - let total = total_pass + total_fail; - eprintln!("\n=== Oracle Results: {total_pass}/{total} passed ({total_fail} remaining) ===\n"); - // Intentionally does not assert — these are aspirational targets + assert!( + results.regressions.is_empty(), + "{} oracle test regression(s)", + results.regressions.len() + ); + assert!( + results.newly_passing.is_empty(), + "{} oracle test(s) newly passing — update KNOWN_ORACLE_FAILURES", + results.newly_passing.len() + ); } From b768e979826344654d137d83bb677f16abe1fe64 Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Wed, 25 Mar 2026 14:01:13 +0100 Subject: [PATCH 5/5] test: generate per-file oracle test functions via macro Each oracle .tests file now has its own test function generated by the oracle_test! macro (e.g., oracle_ansi_c_escapes, oracle_heredoc_formatting). Each test individually asserts no regressions and no newly passing tests. This replaces the single oracle_test_suite function, giving specific line-level feedback on which oracle category passes or fails. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/integration.rs | 115 ++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 62 deletions(-) diff --git a/tests/integration.rs b/tests/integration.rs index b46a720..37ad57d 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -225,6 +225,7 @@ const KNOWN_ORACLE_FAILURES: &[&str] = &[ "word_boundaries 2", ]; +#[derive(Default)] struct OracleResults { total_pass: usize, total_fail: usize, @@ -268,67 +269,57 @@ fn run_oracle_file(path: &Path, results: &mut OracleResults) { results.total_fail += fail; } -/// Oracle-derived tests: correctness differences found by fuzzing against bash-oracle. -/// Asserts that known failures still fail and known passes don't regress. -#[test] -fn oracle_test_suite() { - let test_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/oracle"); - if !test_dir.exists() { - eprintln!("Skipping: tests/oracle/ directory not found"); - return; - } - - eprintln!("\n=== Oracle Test Suite (bash-oracle compatibility) ===\n"); - - let filter = std::env::var("RABLE_TEST").ok(); - let mut results = OracleResults { - total_pass: 0, - total_fail: 0, - regressions: Vec::new(), - newly_passing: Vec::new(), - }; - - let mut entries: Vec<_> = fs::read_dir(&test_dir) - .unwrap_or_else(|_| { - eprintln!("Warning: test directory not found: {}", test_dir.display()); - std::process::exit(0); - }) - .filter_map(Result::ok) - .filter(|e| e.path().extension().is_some_and(|ext| ext == "tests")) - .collect(); - - entries.sort_by_key(std::fs::DirEntry::file_name); - - for entry in entries { - let path = entry.path(); - let name = path.file_name().unwrap_or_default().to_string_lossy(); - if filter.as_ref().is_some_and(|f| !name.contains(f.as_str())) { - continue; +/// Generates a test function for each oracle test file. +/// Each test asserts no regressions and no newly passing tests +/// (which would need `KNOWN_ORACLE_FAILURES` updated). +macro_rules! oracle_test { + ($name:ident, $file:expr) => { + #[test] + fn $name() { + let path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests/oracle") + .join($file); + if !path.exists() { + eprintln!("Skipping: {} not found", $file); + return; + } + let mut results = OracleResults::default(); + run_oracle_file(&path, &mut results); + let total = results.total_pass + results.total_fail; + eprintln!( + " {}: {}/{total} passed ({} remaining)", + $file, results.total_pass, results.total_fail, + ); + for msg in &results.newly_passing { + eprintln!("{msg}"); + } + for msg in &results.regressions { + eprintln!("{msg}"); + } + assert!( + results.regressions.is_empty(), + "{}: {} regression(s)", + $file, + results.regressions.len() + ); + assert!( + results.newly_passing.is_empty(), + "{}: {} newly passing — update KNOWN_ORACLE_FAILURES", + $file, + results.newly_passing.len() + ); } - run_oracle_file(&path, &mut results); - } - - let total = results.total_pass + results.total_fail; - eprintln!( - "\n=== Oracle Results: {}/{total} passed ({} remaining) ===\n", - results.total_pass, results.total_fail - ); - - for msg in &results.newly_passing { - eprintln!("{msg}"); - } - for msg in &results.regressions { - eprintln!("{msg}"); - } - - assert!( - results.regressions.is_empty(), - "{} oracle test regression(s)", - results.regressions.len() - ); - assert!( - results.newly_passing.is_empty(), - "{} oracle test(s) newly passing — update KNOWN_ORACLE_FAILURES", - results.newly_passing.len() - ); + }; } + +oracle_test!(oracle_ansi_c_escapes, "ansi_c_escapes.tests"); +oracle_test!(oracle_ansi_c_processing, "ansi_c_processing.tests"); +oracle_test!(oracle_array_normalization, "array_normalization.tests"); +oracle_test!(oracle_cmdsub_formatting, "cmdsub_formatting.tests"); +oracle_test!(oracle_heredoc_formatting, "heredoc_formatting.tests"); +oracle_test!(oracle_locale_strings, "locale_strings.tests"); +oracle_test!(oracle_other, "other.tests"); +oracle_test!(oracle_procsub_formatting, "procsub_formatting.tests"); +oracle_test!(oracle_redirect_formatting, "redirect_formatting.tests"); +oracle_test!(oracle_top_level_separation, "top_level_separation.tests"); +oracle_test!(oracle_word_boundaries, "word_boundaries.tests");