From 9163d24ecf5472da8238324ef6f7ff3a55bcf3ba Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Wed, 25 Mar 2026 12:22:02 +0100
Subject: [PATCH 1/5] feat: enrich AST with structured word spans and
 assignment detection

Move expansion boundary tracking into the lexer to eliminate ~300 lines
of duplicate re-parsing in the sexp formatting layer.

Key changes:

- **WordBuilder**: New lexer type that bundles word value string with
  expansion spans and quoting context. All 14 lexer word-reading
  functions converted from `&mut String` to `&mut WordBuilder`.

- **WordSpan tracking**: Lexer records spans for all 14 expansion types
  (CommandSub, AnsiCQuote, LocaleString, ProcessSub, SingleQuoted,
  DoubleQuoted, ParamExpansion, SimpleVar, ArithmeticSub, Backtick,
  BracketSubscript, Extglob, DeprecatedArith, Escape) with quoting
  context (None, DoubleQuote, ParamExpansion, CommandSub, Backtick).

- **segments_from_spans**: New span-based segment extraction replaces
  `parse_word_segments` re-parsing for words with spans. Filters to
  sexp-relevant span kinds, handles nested spans via top-level
  collection, and uses quoting context for correct ANSI-C behavior.

- **Assignment detection**: Lexer emits AssignmentWord tokens for
  `NAME=`, `NAME+=`, `NAME[...]=` patterns. Parser populates
  `Command.assignments` field.

- **Word.parts**: Populated via `decompose_word()` with structured AST
  nodes (WordLiteral, CommandSubstitution, ProcessSubstitution,
  AnsiCQuote, LocaleString).

- **Array parsing**: New `read_array_elements`/`read_array_element` in
  lexer parses array content as individual words at lex time, replacing
  post-hoc `normalize_array_content` string manipulation.

- **Dead code removed**: `try_normalize_array`, `normalize_array_content`,
  `read_balanced_delim`, `should_format_from_value`, `parts_to_segments`.

All 138 tests pass (95 unit + 37 integration + 6 doc).
Oracle: 165/181 (unchanged from baseline).
S-expression output identical (Parable compatibility preserved).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ast.rs                |  10 +-
 src/context.rs            |  41 ------
 src/lexer/expansions.rs   | 197 +++++++++++++++-----------
 src/lexer/mod.rs          |   1 +
 src/lexer/quotes.rs       |  42 +++---
 src/lexer/tests.rs        | 217 ++++++++++++++++++++++++++++
 src/lexer/word_builder.rs | 156 +++++++++++++++++++++
 src/lexer/words.rs        | 287 ++++++++++++++++++++++++++++----------
 src/parser/compound.rs    |   6 +-
 src/parser/helpers.rs     |  10 +-
 src/parser/mod.rs         |   5 +-
 src/parser/tests.rs       |  62 +++++++-
 src/parser/word_parts.rs  | 210 ++++++++++++++++++++++++++++
 src/sexp/mod.rs           |  10 +-
 src/sexp/word.rs          | 232 ++++++++++++++++--------------
 src/token.rs              |  25 ++++
 tests/integration.rs      |  10 +-
 17 files changed, 1184 insertions(+), 337 deletions(-)
 create mode 100644 src/lexer/word_builder.rs
 create mode 100644 src/parser/word_parts.rs
diff --git a/src/ast.rs b/src/ast.rs
index 42c7c15..9f78c17 100644
--- a/src/ast.rs
+++ b/src/ast.rs
@@ -72,7 +72,15 @@ impl Node {
 #[allow(clippy::use_self)]
 pub enum NodeKind {
     /// A word token, possibly containing expansion parts.
-    Word { value: String, parts: Vec<Node> },
+    Word {
+        value: String,
+        parts: Vec<Node>,
+        #[allow(dead_code)]
+        spans: Vec<crate::lexer::word_builder::WordSpan>,
+    },
+
+    /// A literal text segment within a word's parts list.
+    WordLiteral { value: String },
 
     /// A simple command: assignments, words, and redirects.
     Command {
diff --git a/src/context.rs b/src/context.rs
index 26398f8..b8bd94c 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -115,47 +115,6 @@ pub fn skip_backtick(chars: &[char], pos: &mut usize, out: &mut String) {
     }
 }
 
-/// Reads balanced delimiters (parens, braces, etc.) with quote awareness.
-///
-/// Starting at depth 1, reads until depth reaches 0. Handles single quotes,
-/// double quotes, backticks, and backslash escapes inside.
-pub fn read_balanced_delim(
-    chars: &[char],
-    pos: &mut usize,
-    open: char,
-    close: char,
-    out: &mut String,
-) {
-    let mut depth = 1;
-    while *pos < chars.len() && depth > 0 {
-        match chars[*pos] {
-            c if c == open => {
-                depth += 1;
-                out.push(c);
-                *pos += 1;
-            }
-            c if c == close => {
-                depth -= 1;
-                out.push(c);
-                *pos += 1;
-            }
-            '\'' => skip_single_quoted(chars, pos, out),
-            '"' => skip_double_quoted(chars, pos, out),
-            '`' => skip_backtick(chars, pos, out),
-            '\\' if *pos + 1 < chars.len() => {
-                out.push(chars[*pos]);
-                *pos += 1;
-                out.push(chars[*pos]);
-                *pos += 1;
-            }
-            c => {
-                out.push(c);
-                *pos += 1;
-            }
-        }
-    }
-}
-
 /// Counts consecutive backslashes before a position to determine
 /// if a character is escaped.
 ///
diff --git a/src/lexer/expansions.rs b/src/lexer/expansions.rs
index 2f0f5c4..96761da 100644
--- a/src/lexer/expansions.rs
+++ b/src/lexer/expansions.rs
@@ -2,95 +2,114 @@ use crate::context::CaseTracker;
 use crate::error::{RableError, Result};
 
 use super::Lexer;
+use super::word_builder::{QuotingContext, WordBuilder, WordSpanKind};
 
 impl Lexer {
     /// Reads a dollar expansion into the value string.
-    pub(super) fn read_dollar(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_dollar(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        let span_start = wb.span_start();
         self.advance_char(); // consume '$'
-        value.push('$');
+        wb.push('$');
 
         match self.peek_char() {
             Some('(') => {
                 self.advance_char();
-                value.push('(');
+                wb.push('(');
                 if self.peek_char() == Some('(') {
-                    // $(( ... )) arithmetic expansion
                     self.advance_char();
-                    value.push('(');
-                    self.read_matched_parens(value, 2)?;
+                    wb.push('(');
+                    self.read_matched_parens(wb, 2)?;
+                    wb.record(span_start, WordSpanKind::ArithmeticSub);
                 } else {
-                    // $( ... ) command substitution
-                    let content_start = value.len();
-                    self.read_matched_parens(value, 1)?;
-                    // Validate content is parseable bash
-                    let content_end = value.len().saturating_sub(1);
-                    if content_start < content_end {
-                        let content: String = value[content_start..content_end].to_string();
-                        if !content.trim().is_empty()
-                            && crate::parse(&content, self.extglob()).is_err()
-                        {
-                            return Err(RableError::parse(
-                                "invalid command substitution",
-                                self.pos,
-                                self.line,
-                            ));
-                        }
-                    }
+                    self.read_command_sub(wb)?;
+                    wb.record(span_start, WordSpanKind::CommandSub);
                 }
             }
             Some('{') => {
                 self.advance_char();
-                value.push('{');
-                // Parameter expansion: don't count unquoted { as depth increase
-                self.read_param_expansion_braces(value)?;
+                wb.push('{');
+                self.read_param_expansion_braces(wb)?;
+                wb.record(span_start, WordSpanKind::ParamExpansion);
             }
             Some('\'') => {
-                // $'...' ANSI-C quoting — handles \ escapes unlike regular ''
                 self.advance_char();
-                value.push('\'');
-                self.read_ansi_c_quoted(value)?;
+                wb.push('\'');
+                self.read_ansi_c_quoted(wb)?;
+                wb.record(span_start, WordSpanKind::AnsiCQuote);
             }
             Some('"') => {
-                // $"..." locale string
                 self.advance_char();
-                value.push('"');
-                self.read_double_quoted(value)?;
+                wb.push('"');
+                self.read_double_quoted(wb)?;
+                wb.record(span_start, WordSpanKind::LocaleString);
             }
             Some('[') => {
-                // $[...] deprecated arithmetic
                 self.advance_char();
-                value.push('[');
-                self.read_until_char(value, ']')?;
+                wb.push('[');
+                self.read_until_char(wb, ']')?;
+                wb.record(span_start, WordSpanKind::DeprecatedArith);
+            }
+            Some(c) if is_dollar_start(c) => {
+                self.read_variable_name(wb, c);
+                wb.record(span_start, WordSpanKind::SimpleVar);
+            }
+            _ => {} // Bare $ — no span
+        }
+        Ok(())
+    }
+
+    /// Reads `$(...)` command substitution content and validates it.
+    fn read_command_sub(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        let content_start = wb.len();
+        self.read_matched_parens(wb, 1)?;
+        let content_end = wb.len().saturating_sub(1);
+        if content_start < content_end {
+            let content: String = wb.value[content_start..content_end].to_string();
+            if !content.trim().is_empty() && crate::parse(&content, self.extglob()).is_err() {
+                return Err(RableError::parse(
+                    "invalid command substitution",
+                    self.pos,
+                    self.line,
+                ));
             }
-            Some(c) if is_dollar_start(c) => self.read_variable_name(value, c),
-            _ => {} // Bare $ at end of word — just leave it
         }
         Ok(())
     }
 
     /// Reads a variable name after `$` for simple expansions.
-    pub(super) fn read_variable_name(&mut self, value: &mut String, first: char) {
+    pub(super) fn read_variable_name(&mut self, wb: &mut WordBuilder, first: char) {
         if first.is_ascii_alphabetic() || first == '_' {
             while let Some(nc) = self.peek_char() {
                 if nc.is_ascii_alphanumeric() || nc == '_' {
                     self.advance_char();
-                    value.push(nc);
+                    wb.push(nc);
                 } else {
                     break;
                 }
             }
         } else {
             self.advance_char();
-            value.push(first);
+            wb.push(first);
         }
     }
 
     /// Reads until matching closing parentheses.
     #[allow(clippy::too_many_lines)]
-    #[allow(clippy::too_many_lines)]
     pub(super) fn read_matched_parens(
         &mut self,
-        value: &mut String,
+        wb: &mut WordBuilder,
+        close_count: usize,
+    ) -> Result<()> {
+        wb.enter_context(QuotingContext::CommandSub);
+        let result = self.read_matched_parens_inner(wb, close_count);
+        wb.leave_context();
+        result
+    }
+
+    #[allow(clippy::too_many_lines)]
+    fn read_matched_parens_inner(
+        &mut self,
+        wb: &mut WordBuilder,
         close_count: usize,
     ) -> Result<()> {
         let mut depth = close_count;
@@ -105,7 +124,7 @@ impl Lexer {
                     word_buf.clear();
 
                     self.advance_char();
-                    value.push(')');
+                    wb.push(')');
                     if case.is_pattern_close() {
                         // This `)` terminates a case pattern — don't decrement depth
                         case.close_pattern();
@@ -120,7 +139,7 @@ impl Lexer {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     self.advance_char();
-                    value.push('(');
+                    wb.push('(');
                     // In case pattern mode, `(` is optional pattern prefix — don't increment
                     if !case.is_pattern_open() {
                         depth += 1;
@@ -130,15 +149,15 @@ impl Lexer {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     self.advance_char();
-                    value.push('\'');
-                    self.read_single_quoted(value)?;
+                    wb.push('\'');
+                    self.read_single_quoted(wb)?;
                 }
                 Some('"') => {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     self.advance_char();
-                    value.push('"');
-                    self.read_double_quoted(value)?;
+                    wb.push('"');
+                    self.read_double_quoted(wb)?;
                 }
                 Some('\\') => {
                     word_buf.clear();
@@ -146,27 +165,27 @@ impl Lexer {
                     if self.peek_char() == Some('\n') {
                         self.advance_char(); // line continuation
                     } else {
-                        value.push('\\');
+                        wb.push('\\');
                         if let Some(c) = self.advance_char() {
-                            value.push(c);
+                            wb.push(c);
                         }
                     }
                 }
                 Some('$') => {
                     word_buf.clear();
-                    self.read_dollar(value)?;
+                    self.read_dollar(wb)?;
                 }
                 Some('`') => {
                     word_buf.clear();
                     self.advance_char();
-                    value.push('`');
-                    self.read_backtick(value)?;
+                    wb.push('`');
+                    self.read_backtick(wb)?;
                 }
                 Some('#') => {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     // # is a comment when preceded by whitespace/newline
-                    let prev = value.chars().last().unwrap_or('\n');
+                    let prev = wb.value.chars().last().unwrap_or('\n');
                     if prev == '\n' || prev == ' ' || prev == '\t' {
                         // Comment — skip to end of line
                         while let Some(c) = self.peek_char() {
@@ -177,26 +196,26 @@ impl Lexer {
                         }
                     } else {
                         self.advance_char();
-                        value.push('#');
+                        wb.push('#');
                     }
                 }
                 Some(';') => {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     self.advance_char();
-                    value.push(';');
+                    wb.push(';');
                     // Check for ;; ;& ;;& which resume case pattern mode
                     if self.peek_char() == Some(';') {
                         self.advance_char();
-                        value.push(';');
+                        wb.push(';');
                         if self.peek_char() == Some('&') {
                             self.advance_char();
-                            value.push('&');
+                            wb.push('&');
                         }
                         case.resume_pattern();
                     } else if self.peek_char() == Some('&') {
                         self.advance_char();
-                        value.push('&');
+                        wb.push('&');
                         case.resume_pattern();
                     }
                 }
@@ -204,18 +223,18 @@ impl Lexer {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     let c = self.advance_char().unwrap_or(' ');
-                    value.push(c);
+                    wb.push(c);
                 }
                 Some('|') => {
                     case.check_word(&word_buf);
                     word_buf.clear();
                     self.advance_char();
-                    value.push('|');
+                    wb.push('|');
                 }
                 Some(c) => {
                     word_buf.push(c);
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
                 None => {
                     return Err(RableError::matched_pair(
@@ -229,46 +248,53 @@ impl Lexer {
     }
 
     /// Reads a parameter expansion `${...}` allowing unbalanced inner `{`.
-    pub(super) fn read_param_expansion_braces(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_param_expansion_braces(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        wb.enter_context(QuotingContext::ParamExpansion);
+        let result = self.read_param_expansion_inner(wb);
+        wb.leave_context();
+        result
+    }
+
+    fn read_param_expansion_inner(&mut self, wb: &mut WordBuilder) -> Result<()> {
         loop {
             match self.peek_char() {
                 Some('}') => {
                     self.advance_char();
-                    value.push('}');
+                    wb.push('}');
                     return Ok(());
                 }
                 Some('\'') => {
                     self.advance_char();
-                    value.push('\'');
-                    self.read_single_quoted(value)?;
+                    wb.push('\'');
+                    self.read_single_quoted(wb)?;
                 }
                 Some('"') => {
                     self.advance_char();
-                    value.push('"');
-                    self.read_double_quoted(value)?;
+                    wb.push('"');
+                    self.read_double_quoted(wb)?;
                 }
                 Some('\\') => {
                     self.advance_char();
                     if self.peek_char() == Some('\n') {
                         self.advance_char(); // line continuation
                     } else {
-                        value.push('\\');
+                        wb.push('\\');
                         if let Some(c) = self.advance_char() {
-                            value.push(c);
+                            wb.push(c);
                         }
                     }
                 }
                 Some('$') => {
-                    self.read_dollar(value)?;
+                    self.read_dollar(wb)?;
                 }
                 Some('`') => {
                     self.advance_char();
-                    value.push('`');
-                    self.read_backtick(value)?;
+                    wb.push('`');
+                    self.read_backtick(wb)?;
                 }
                 Some(c) => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
                 None => {
                     return Err(RableError::matched_pair(
@@ -282,24 +308,31 @@ impl Lexer {
     }
 
     /// Reads a backtick command substitution.
-    pub(super) fn read_backtick(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_backtick(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        wb.enter_context(QuotingContext::Backtick);
+        let result = self.read_backtick_inner(wb);
+        wb.leave_context();
+        result
+    }
+
+    fn read_backtick_inner(&mut self, wb: &mut WordBuilder) -> Result<()> {
         loop {
             match self.peek_char() {
                 Some('`') => {
                     self.advance_char();
-                    value.push('`');
+                    wb.push('`');
                     return Ok(());
                 }
                 Some('\\') => {
                     self.advance_char();
-                    value.push('\\');
+                    wb.push('\\');
                     if let Some(c) = self.advance_char() {
-                        value.push(c);
+                        wb.push(c);
                     }
                 }
                 Some(c) => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
                 None => {
                     return Err(RableError::matched_pair(
@@ -313,14 +346,14 @@ impl Lexer {
     }
 
     /// Reads until the given closing character.
-    pub(super) fn read_until_char(&mut self, value: &mut String, close: char) -> Result<()> {
+    pub(super) fn read_until_char(&mut self, wb: &mut WordBuilder, close: char) -> Result<()> {
         loop {
             match self.advance_char() {
                 Some(c) if c == close => {
-                    value.push(c);
+                    wb.push(c);
                     return Ok(());
                 }
-                Some(c) => value.push(c),
+                Some(c) => wb.push(c),
                 None => {
                     return Err(RableError::matched_pair(
                         format!("unterminated '{close}'"),
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 346b99e..3334d61 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -5,6 +5,7 @@ mod expansions;
 mod heredoc;
 mod operators;
 mod quotes;
+pub mod word_builder;
 mod words;
 
 #[cfg(test)]
diff --git a/src/lexer/quotes.rs b/src/lexer/quotes.rs
index 42a4a02..6adf639 100644
--- a/src/lexer/quotes.rs
+++ b/src/lexer/quotes.rs
@@ -1,17 +1,18 @@
 use crate::error::{RableError, Result};
 
 use super::Lexer;
+use super::word_builder::{QuotingContext, WordBuilder};
 
 impl Lexer {
     /// Reads contents of a single-quoted string (after the opening `'`).
-    pub(super) fn read_single_quoted(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_single_quoted(&mut self, wb: &mut WordBuilder) -> Result<()> {
         loop {
             match self.advance_char() {
                 Some('\'') => {
-                    value.push('\'');
+                    wb.push('\'');
                     return Ok(());
                 }
-                Some(c) => value.push(c),
+                Some(c) => wb.push(c),
                 None => {
                     return Err(RableError::matched_pair(
                         "unterminated single quote",
@@ -25,24 +26,24 @@ impl Lexer {
 
     /// Reads contents of an ANSI-C quoted string `$'...'` (after the opening `'`).
     /// Unlike regular single quotes, `\` is an escape character here.
-    pub(super) fn read_ansi_c_quoted(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_ansi_c_quoted(&mut self, wb: &mut WordBuilder) -> Result<()> {
         loop {
             match self.peek_char() {
                 Some('\'') => {
                     self.advance_char();
-                    value.push('\'');
+                    wb.push('\'');
                     return Ok(());
                 }
                 Some('\\') => {
                     self.advance_char();
-                    value.push('\\');
+                    wb.push('\\');
                     if let Some(next) = self.advance_char() {
-                        value.push(next);
+                        wb.push(next);
                     }
                 }
                 Some(c) => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
                 None => {
                     return Err(RableError::matched_pair(
@@ -56,12 +57,19 @@ impl Lexer {
     }
 
     /// Reads contents of a double-quoted string (after the opening `"`).
-    pub(super) fn read_double_quoted(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_double_quoted(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        wb.enter_context(QuotingContext::DoubleQuote);
+        let result = self.read_double_quoted_inner(wb);
+        wb.leave_context();
+        result
+    }
+
+    fn read_double_quoted_inner(&mut self, wb: &mut WordBuilder) -> Result<()> {
         loop {
             match self.peek_char() {
                 Some('"') => {
                     self.advance_char();
-                    value.push('"');
+                    wb.push('"');
                     return Ok(());
                 }
                 Some('\\') => {
@@ -70,9 +78,9 @@ impl Lexer {
                     if self.peek_char() == Some('\n') {
                         self.advance_char();
                     } else {
-                        value.push('\\');
+                        wb.push('\\');
                         if let Some(next) = self.advance_char() {
-                            value.push(next);
+                            wb.push(next);
                         }
                     }
                 }
@@ -81,19 +89,19 @@ impl Lexer {
                     // bare $ followed by closing ". Don't let read_dollar consume it.
                     if self.input.get(self.pos + 1) == Some(&'"') {
                         self.advance_char();
-                        value.push('$');
+                        wb.push('$');
                     } else {
-                        self.read_dollar(value)?;
+                        self.read_dollar(wb)?;
                     }
                 }
                 Some('`') => {
                     self.advance_char();
-                    value.push('`');
-                    self.read_backtick(value)?;
+                    wb.push('`');
+                    self.read_backtick(wb)?;
                 }
                 Some(c) => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
                 None => {
                     return Err(RableError::matched_pair(
diff --git a/src/lexer/tests.rs b/src/lexer/tests.rs
index 193d294..fd7e75c 100644
--- a/src/lexer/tests.rs
+++ b/src/lexer/tests.rs
@@ -86,3 +86,220 @@ fn and_or() {
     assert_eq!(tokens[1], (TokenType::And, "&&".to_string()));
     assert_eq!(tokens[3], (TokenType::Or, "||".to_string()));
 }
+
+#[test]
+fn assignment_word_simple() {
+    let tokens = collect_tokens("FOO=bar");
+    assert_eq!(
+        tokens[0],
+        (TokenType::AssignmentWord, "FOO=bar".to_string())
+    );
+}
+
+#[test]
+fn assignment_word_plus_equals() {
+    let tokens = collect_tokens("FOO+=bar");
+    assert_eq!(
+        tokens[0],
+        (TokenType::AssignmentWord, "FOO+=bar".to_string())
+    );
+}
+
+#[test]
+fn assignment_word_array() {
+    let tokens = collect_tokens("arr=(a b)");
+    assert_eq!(
+        tokens[0],
+        (TokenType::AssignmentWord, "arr=(a b)".to_string())
+    );
+}
+
+#[test]
+fn assignment_word_subscript() {
+    let tokens = collect_tokens("arr[0]=val");
+    assert_eq!(
+        tokens[0],
+        (TokenType::AssignmentWord, "arr[0]=val".to_string())
+    );
+}
+
+#[test]
+fn not_assignment_no_name() {
+    let tokens = collect_tokens("=value");
+    assert_eq!(tokens[0].0, TokenType::Word);
+}
+
+#[test]
+fn not_assignment_regular_word() {
+    let tokens = collect_tokens("echo");
+    assert_eq!(tokens[0].0, TokenType::Word);
+}
+
+#[test]
+fn assignment_before_command_keeps_command_start() {
+    // Assignment tokens should keep command_start=true so the
+    // command word after is still recognized
+    let tokens = collect_tokens("FOO=bar echo hello");
+    assert_eq!(tokens[0].0, TokenType::AssignmentWord);
+    assert_eq!(tokens[1].0, TokenType::Word);
+    assert_eq!(tokens[2].0, TokenType::Word);
+}
+
+// -- Span recording tests --
+
+use super::word_builder::{WordSpan, WordSpanKind};
+
+#[allow(clippy::unwrap_used)]
+fn first_word_spans(source: &str) -> (String, Vec<WordSpan>) {
+    let mut lexer = Lexer::new(source, false);
+    let tok = lexer.next_token().unwrap();
+    (tok.value, tok.spans)
+}
+
+#[test]
+fn span_plain_word_no_spans() {
+    let (_, spans) = first_word_spans("echo");
+    assert!(spans.is_empty());
+}
+
+#[test]
+fn span_command_sub() {
+    let (val, spans) = first_word_spans("$(cmd)");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::CommandSub);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_command_sub_mid_word() {
+    let (_, spans) = first_word_spans("hello$(world)end");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::CommandSub);
+    assert_eq!(spans[0].start, 5);
+    assert_eq!(spans[0].end, 13);
+}
+
+#[test]
+fn span_arithmetic_sub() {
+    let (val, spans) = first_word_spans("$((1+2))");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::ArithmeticSub);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_param_expansion() {
+    let (val, spans) = first_word_spans("${var:-default}");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::ParamExpansion);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_simple_var() {
+    let (val, spans) = first_word_spans("$HOME");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::SimpleVar);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_ansi_c_quote() {
+    let (val, spans) = first_word_spans("$'foo'");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::AnsiCQuote);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_locale_string() {
+    let (val, spans) = first_word_spans("$\"hello\"");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::LocaleString);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_single_quoted() {
+    let (val, spans) = first_word_spans("'quoted'");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::SingleQuoted);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_double_quoted() {
+    let (val, spans) = first_word_spans("\"double\"");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::DoubleQuoted);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_backtick() {
+    let (val, spans) = first_word_spans("`cmd`");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::Backtick);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_escape() {
+    let (val, spans) = first_word_spans("\\n");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::Escape);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
+
+#[test]
+fn span_line_continuation_no_span() {
+    // \<newline> is a line continuation — nothing pushed, no span
+    let (val, spans) = first_word_spans("hel\\\nlo");
+    assert_eq!(val, "hello");
+    assert!(spans.is_empty());
+}
+
+#[test]
+fn span_bare_dollar_no_span() {
+    // Bare $ at end — no expansion, no span
+    let tokens = collect_tokens("echo $");
+    assert_eq!(tokens[1].1, "$");
+    // The $ word has no spans (bare dollar)
+    let (_, spans) = first_word_spans("$");
+    assert!(spans.is_empty());
+}
+
+#[test]
+fn span_nested_double_quoted_with_cmdsub() {
+    // "$(cmd)" — DoubleQuoted span contains CommandSub span
+    let (val, spans) = first_word_spans("\"$(cmd)\"");
+    assert_eq!(val, "\"$(cmd)\"");
+    assert_eq!(spans.len(), 2);
+    // CommandSub is recorded first (inside read_dollar, before
+    // DoubleQuoted is closed by read_word_special)
+    assert_eq!(spans[0].kind, WordSpanKind::CommandSub);
+    assert_eq!(spans[0].start, 1);
+    assert_eq!(spans[0].end, 7);
+    assert_eq!(spans[1].kind, WordSpanKind::DoubleQuoted);
+    assert_eq!(spans[1].start, 0);
+    assert_eq!(spans[1].end, 8);
+}
+
+#[test]
+fn span_deprecated_arith() {
+    let (val, spans) = first_word_spans("$[1+2]");
+    assert_eq!(spans.len(), 1);
+    assert_eq!(spans[0].kind, WordSpanKind::DeprecatedArith);
+    assert_eq!(spans[0].start, 0);
+    assert_eq!(spans[0].end, val.len());
+}
diff --git a/src/lexer/word_builder.rs b/src/lexer/word_builder.rs
new file mode 100644
index 0000000..7e3279a
--- /dev/null
+++ b/src/lexer/word_builder.rs
@@ -0,0 +1,156 @@
+//! Word builder for accumulating word tokens with expansion span tracking.
+//!
+//! `WordBuilder` bundles the word value string with a list of `WordSpan`s
+//! that record where each expansion starts and ends, along with the
+//! quoting context at the point of recording. This eliminates the need
+//! for downstream code to re-parse word values.
+#![allow(dead_code)]
+
+/// The quoting context at the point where a span was recorded.
+///
+/// Bash has context-sensitive expansion rules. For example, `$'...'`
+/// is an ANSI-C quote at top level or inside `${...}`, but is NOT
+/// special inside `"..."` (it's just literal `$` + `'...'`).
+///
+/// Each span captures the context stack so downstream consumers can
+/// make correct decisions without re-deriving context.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum QuotingContext {
+    /// Top-level word context (no quoting).
+    None,
+    /// Inside double quotes `"..."`.
+    DoubleQuote,
+    /// Inside parameter expansion `${...}` (resets quoting context).
+    ParamExpansion,
+    /// Inside command substitution `$(...)` (resets quoting context).
+    CommandSub,
+    /// Inside backtick command substitution `` `...` ``.
+    Backtick,
+}
+
+/// Accumulates a word token's value string and expansion spans.
+pub struct WordBuilder {
+    /// The word value being built character by character.
+    pub value: String,
+    /// Expansion spans recorded during lexing, ordered by start offset.
+    pub spans: Vec<WordSpan>,
+    /// Stack of quoting contexts — the current context is the last entry.
+    /// Empty means top-level (no quoting).
+    context_stack: Vec<QuotingContext>,
+}
+
+impl WordBuilder {
+    pub const fn new() -> Self {
+        Self {
+            value: String::new(),
+            spans: Vec::new(),
+            context_stack: Vec::new(),
+        }
+    }
+
+    pub fn push(&mut self, c: char) {
+        self.value.push(c);
+    }
+
+    pub fn push_str(&mut self, s: &str) {
+        self.value.push_str(s);
+    }
+
+    pub fn ends_with(&self, c: char) -> bool {
+        self.value.ends_with(c)
+    }
+
+    pub const fn is_empty(&self) -> bool {
+        self.value.is_empty()
+    }
+
+    pub const fn len(&self) -> usize {
+        self.value.len()
+    }
+
+    /// Returns the current byte offset — use before an expansion to
+    /// capture its start position.
+    pub const fn span_start(&self) -> usize {
+        self.value.len()
+    }
+
+    /// Records a completed expansion span from `start` to the current
+    /// end of the value string, capturing the current quoting context.
+    pub fn record(&mut self, start: usize, kind: WordSpanKind) {
+        self.spans.push(WordSpan {
+            start,
+            end: self.value.len(),
+            kind,
+            context: self.current_context(),
+        });
+    }
+
+    /// Returns the current quoting context.
+    pub fn current_context(&self) -> QuotingContext {
+        self.context_stack
+            .last()
+            .copied()
+            .unwrap_or(QuotingContext::None)
+    }
+
+    /// Pushes a quoting context onto the stack.
+    pub fn enter_context(&mut self, ctx: QuotingContext) {
+        self.context_stack.push(ctx);
+    }
+
+    /// Pops the current quoting context from the stack.
+    pub fn leave_context(&mut self) {
+        self.context_stack.pop();
+    }
+}
+
+/// A byte-range span within a word token's value string, identifying
+/// an expansion or quoting construct and the context it appeared in.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct WordSpan {
+    /// Start byte offset in the word value (inclusive).
+    pub start: usize,
+    /// End byte offset in the word value (exclusive).
+    pub end: usize,
+    /// The kind of expansion or quoting at this span.
+    pub kind: WordSpanKind,
+    /// The quoting context this span was recorded in.
+    pub context: QuotingContext,
+}
+
+/// The kind of expansion or quoting construct a `WordSpan` represents.
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum WordSpanKind {
+    // -- Sexp-relevant (formatter extracts content from these) --
+    /// Command substitution: `$(...)`.
+    CommandSub,
+    /// ANSI-C quoting: `$'...'`.
+    AnsiCQuote,
+    /// Locale translation string: `$"..."`.
+    LocaleString,
+    /// Process substitution: `<(...)` or `>(...)`.
+    ProcessSub(char),
+
+    // -- Structural (formatter treats as literal text) --
+    /// Single-quoted string: `'...'`.
+    SingleQuoted,
+    /// Double-quoted string: `"..."`.
+    DoubleQuoted,
+    /// Parameter expansion: `${...}`.
+    ParamExpansion,
+    /// Simple variable: `$VAR`.
+    SimpleVar,
+    /// Arithmetic substitution: `$((...))`.
+    ArithmeticSub,
+    /// Backtick command substitution: `` `...` ``.
+    Backtick,
+    /// Array subscript: `[...]`.
+    BracketSubscript,
+    /// Extended glob pattern: `@(...)`, `?(...)`, etc.
+    Extglob(char),
+    /// Deprecated arithmetic: `$[...]`.
+    DeprecatedArith,
+    /// Backslash escape: `\X` (not `\<newline>` line continuations).
+    Escape,
+}
diff --git a/src/lexer/words.rs b/src/lexer/words.rs
index 69b196c..8a4a981 100644
--- a/src/lexer/words.rs
+++ b/src/lexer/words.rs
@@ -2,12 +2,13 @@ use crate::error::{RableError, Result};
 use crate::token::{Token, TokenType};
 
 use super::Lexer;
+use super::word_builder::{WordBuilder, WordSpanKind};
 
 impl Lexer {
     /// Reads a word token, handling quoting and expansions.
     #[allow(clippy::too_many_lines)]
     pub(super) fn read_word_token(&mut self, start: usize, line: usize) -> Result<Token> {
-        let mut value = String::new();
+        let mut wb = WordBuilder::new();
 
         while let Some(c) = self.peek_char() {
             match c {
@@ -15,76 +16,81 @@ impl Lexer {
                 ' ' | '\t' | '\n' | '|' | '&' | ';' | ')' => break,
                 // < and > are metacharacters, but <( and >( are process substitution
                 '<' | '>' => {
-                    if !value.is_empty() && self.input.get(self.pos + 1) == Some(&'(') {
+                    if !wb.is_empty() && self.input.get(self.pos + 1) == Some(&'(') {
                         // Process substitution mid-word: cat<(cmd)
-                        self.read_process_sub_into(&mut value)?;
+                        self.read_process_sub_into(&mut wb)?;
                     } else {
                         break;
                     }
                 }
                 // ( is a metacharacter UNLESS preceded by = (array) or extglob prefix
                 '(' => {
-                    if value.ends_with('=') {
-                        // Array assignment: arr=(...)
+                    if wb.ends_with('=') {
+                        // Array assignment: arr=(...) — parse elements directly
                         self.advance_char();
-                        value.push('(');
-                        self.read_matched_parens(&mut value, 1)?;
-                    } else if value.ends_with('@')
-                        || value.ends_with('?')
-                        || value.ends_with('+')
-                        || (value.ends_with('!') && self.config.extglob)
-                        || (value.ends_with('*') && self.config.extglob)
+                        wb.push('(');
+                        self.read_array_elements(&mut wb)?;
+                    } else if wb.ends_with('@')
+                        || wb.ends_with('?')
+                        || wb.ends_with('+')
+                        || (wb.ends_with('!') && self.config.extglob)
+                        || (wb.ends_with('*') && self.config.extglob)
                     {
                         // Extglob: @(...), ?(...), etc.
                         self.advance_char();
-                        value.push('(');
-                        self.read_matched_parens(&mut value, 1)?;
+                        wb.push('(');
+                        self.read_matched_parens(&mut wb, 1)?;
                     } else {
                         break;
                     }
                 }
                 '\'' | '"' | '\\' | '$' | '`' => {
-                    self.read_word_special(&mut value, c)?;
+                    self.read_word_special(&mut wb, c)?;
                 }
                 // Extglob: @(...), ?(...), +(...), !(...)
                 // !( is only extglob when extglob is enabled; otherwise ! is Bang
                 '@' | '?' | '+' if self.input.get(self.pos + 1) == Some(&'(') => {
-                    self.read_extglob(&mut value, c)?;
+                    self.read_extglob(&mut wb, c)?;
                 }
                 // !( and *( are only extglob when extglob mode is enabled
                 '!' | '*' if self.input.get(self.pos + 1) == Some(&'(') && self.config.extglob => {
-                    self.read_extglob(&mut value, c)?;
+                    self.read_extglob(&mut wb, c)?;
                 }
                 // `[` inside a word starts a subscript/bracket context
-                // Everything until matching `]` is part of the same word
-                // (including spaces, operators, etc.)
-                // Only applies when the word has content that isn't just `[` or `[[`
-                // (those are test/conditional command syntax, not subscripts)
-                '[' if !value.is_empty() && value != "[" && !value.ends_with('[') => {
-                    self.read_bracket_subscript(&mut value)?;
+                '[' if !wb.is_empty() && wb.value != "[" && !wb.ends_with('[') => {
+                    self.read_bracket_subscript(&mut wb)?;
                 }
                 // Regular character
                 _ => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
             }
         }
 
-        if value.is_empty() {
+        if wb.is_empty() {
             return Err(RableError::parse("unexpected character", start, line));
         }
 
-        // Check for reserved words at command start
+        // Check for reserved words at command start, then assignment pattern
         let kind = if self.ctx.command_start {
-            TokenType::reserved_word(&value).unwrap_or(TokenType::Word)
+            TokenType::reserved_word(&wb.value).unwrap_or_else(|| {
+                if is_assignment_word(&wb.value) {
+                    TokenType::AssignmentWord
+                } else {
+                    TokenType::Word
+                }
+            })
+        } else if is_assignment_word(&wb.value) {
+            TokenType::AssignmentWord
         } else {
             TokenType::Word
         };
 
         // After a word, we're no longer at command start (unless it's a keyword
-        // that expects another command)
+        // that expects another command, or it's an assignment)
         self.ctx.command_start = kind.starts_command()
+            || kind == TokenType::AssignmentWord
             || matches!(
                 kind,
                 TokenType::Then
@@ -94,40 +100,48 @@ impl Lexer {
                     | TokenType::Semi
             );
 
-        Ok(Token::new(kind, value, start, line))
+        Ok(Token::with_spans(kind, wb.value, start, line, wb.spans))
     }
 
     /// Reads a quoted string, escape, dollar expansion, or backtick within a word.
-    pub(super) fn read_word_special(&mut self, value: &mut String, c: char) -> Result<()> {
+    pub(super) fn read_word_special(&mut self, wb: &mut WordBuilder, c: char) -> Result<()> {
         match c {
             '\'' => {
+                let start = wb.span_start();
                 self.advance_char();
-                value.push('\'');
-                self.read_single_quoted(value)?;
+                wb.push('\'');
+                self.read_single_quoted(wb)?;
+                wb.record(start, WordSpanKind::SingleQuoted);
             }
             '"' => {
+                let start = wb.span_start();
                 self.advance_char();
-                value.push('"');
-                self.read_double_quoted(value)?;
+                wb.push('"');
+                self.read_double_quoted(wb)?;
+                wb.record(start, WordSpanKind::DoubleQuoted);
             }
             '\\' => {
                 self.advance_char();
                 if self.peek_char() == Some('\n') {
-                    self.advance_char(); // line continuation
+                    self.advance_char(); // line continuation — no span
                 } else {
-                    value.push('\\');
+                    let start = wb.span_start();
+                    wb.push('\\');
                     if let Some(next) = self.advance_char() {
-                        value.push(next);
+                        wb.push(next);
                     }
+                    wb.record(start, WordSpanKind::Escape);
                 }
             }
             '$' => {
-                self.read_dollar(value)?;
+                self.read_dollar(wb)?;
             }
             '`' => {
+                let start = wb.span_start();
                 self.advance_char();
-                value.push('`');
-                self.read_backtick(value)?;
+                wb.push('`');
+                self.read_backtick(wb)?;
+                wb.record(start, WordSpanKind::Backtick);
             }
             _ => {}
         }
@@ -135,59 +149,62 @@ impl Lexer {
     }
 
     /// Reads an extglob pattern `@(...)`, `?(...)`, etc.
-    pub(super) fn read_extglob(&mut self, value: &mut String, prefix: char) -> Result<()> {
+    pub(super) fn read_extglob(&mut self, wb: &mut WordBuilder, prefix: char) -> Result<()> {
+        let start = wb.span_start();
         self.advance_char();
-        value.push(prefix);
+        wb.push(prefix);
         self.advance_char();
-        value.push('(');
-        self.read_matched_parens(value, 1)
+        wb.push('(');
+        self.read_matched_parens(wb, 1)?;
+        wb.record(start, WordSpanKind::Extglob(prefix));
+        Ok(())
     }
 
     /// Reads a bracket subscript `[...]` inside a word.
-    /// Includes all content until the matching `]`, treating spaces
-    /// and operators as literal (array subscripts can contain spaces).
-    pub(super) fn read_bracket_subscript(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_bracket_subscript(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        let start = wb.span_start();
         self.advance_char(); // consume [
-        value.push('[');
+        wb.push('[');
         let mut depth = 1;
         while let Some(c) = self.peek_char() {
             match c {
                 '[' => {
                     depth += 1;
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
                 ']' => {
                     depth -= 1;
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                     if depth == 0 {
+                        wb.record(start, WordSpanKind::BracketSubscript);
                         return Ok(());
                     }
                 }
                 '\'' => {
                     self.advance_char();
-                    value.push('\'');
-                    self.read_single_quoted(value)?;
+                    wb.push('\'');
+                    self.read_single_quoted(wb)?;
                 }
                 '"' => {
                     self.advance_char();
-                    value.push('"');
-                    self.read_double_quoted(value)?;
+                    wb.push('"');
+                    self.read_double_quoted(wb)?;
                 }
                 '\\' => {
                     self.advance_char();
-                    value.push('\\');
+                    wb.push('\\');
                     if let Some(nc) = self.advance_char() {
-                        value.push(nc);
+                        wb.push(nc);
                     }
                 }
                 '$' => {
-                    self.read_dollar(value)?;
+                    self.read_dollar(wb)?;
                 }
                 _ => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
             }
         }
@@ -195,58 +212,174 @@ impl Lexer {
     }
 
     /// Reads a process substitution into an existing word value.
-    pub(super) fn read_process_sub_into(&mut self, value: &mut String) -> Result<()> {
+    pub(super) fn read_process_sub_into(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        let start = wb.span_start();
         let dir = self.advance_char().unwrap_or('<');
-        value.push(dir);
+        wb.push(dir);
         self.advance_char(); // (
-        value.push('(');
-        self.read_matched_parens(value, 1)
+        wb.push('(');
+        self.read_matched_parens(wb, 1)?;
+        wb.record(start, WordSpanKind::ProcessSub(dir));
+        Ok(())
     }
 
     /// Reads a process substitution `<(...)` or `>(...)` as a word token.
     /// Continues reading word characters after the closing `)`.
     pub(super) fn read_process_sub_word(&mut self, start: usize, line: usize) -> Result<Token> {
-        let mut value = String::new();
+        let mut wb = WordBuilder::new();
+        let span_start = wb.span_start();
         // Read < or >
         let dir = self.advance_char().unwrap_or('<');
-        value.push(dir);
+        wb.push(dir);
         // Read (
         self.advance_char();
-        value.push('(');
+        wb.push('(');
         // Read until matching )
-        self.read_matched_parens(&mut value, 1)?;
+        self.read_matched_parens(&mut wb, 1)?;
+        wb.record(span_start, WordSpanKind::ProcessSub(dir));
         // Continue reading word chars after the process substitution
-        // e.g., <(cmd)suffix or >(cat)ca
-        self.continue_word(&mut value)?;
+        self.continue_word(&mut wb)?;
         self.ctx.command_start = false;
-        Ok(Token::new(TokenType::Word, value, start, line))
+        Ok(Token::with_spans(
+            TokenType::Word,
+            wb.value,
+            start,
+            line,
+            wb.spans,
+        ))
     }
 
-    /// Continue reading word characters after a special construct (process sub, etc).
-    fn continue_word(&mut self, value: &mut String) -> Result<()> {
+    /// Continue reading word characters after a special construct.
+    fn continue_word(&mut self, wb: &mut WordBuilder) -> Result<()> {
         while let Some(c) = self.peek_char() {
             match c {
                 ' ' | '\t' | '\n' | '|' | '&' | ';' | ')' | '(' => break,
-                // < and > break UNLESS followed by ( (process substitution continues word)
                 '<' | '>' => {
                     if self.input.get(self.pos + 1) == Some(&'(') {
-                        self.read_process_sub_into(value)?;
+                        self.read_process_sub_into(wb)?;
                     } else {
                         break;
                     }
                 }
                 '\'' | '"' | '\\' | '$' | '`' => {
-                    self.read_word_special(value, c)?;
+                    self.read_word_special(wb, c)?;
                 }
-                '[' if !value.is_empty() && value != "[" && !value.ends_with('[') => {
-                    self.read_bracket_subscript(value)?;
+                '[' if !wb.is_empty() && wb.value != "[" && !wb.ends_with('[') => {
+                    self.read_bracket_subscript(wb)?;
                 }
                 _ => {
                     self.advance_char();
-                    value.push(c);
+                    wb.push(c);
                 }
             }
         }
         Ok(())
     }
+
+    /// Reads array elements from `(...)`, producing normalized content.
+    ///
+    /// Parses individual elements separated by whitespace directly from
+    /// the input stream, stripping comments. Uses the lexer's existing
+    /// word-reading facilities for quoting and expansion handling.
+    /// The opening `(` must already be consumed and pushed to `wb`.
+    fn read_array_elements(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        let mut need_space = false;
+        loop {
+            // Skip whitespace between elements
+            while matches!(self.peek_char(), Some(' ' | '\t' | '\n' | '\r')) {
+                self.advance_char();
+            }
+            match self.peek_char() {
+                Some(')') => {
+                    self.advance_char();
+                    wb.push(')');
+                    return Ok(());
+                }
+                Some('#') => {
+                    // Comment — skip to end of line
+                    while matches!(self.peek_char(), Some(c) if c != '\n') {
+                        self.advance_char();
+                    }
+                }
+                Some(_) => {
+                    if need_space {
+                        wb.push(' ');
+                    }
+                    self.read_array_element(wb)?;
+                    need_space = true;
+                }
+                None => {
+                    return Err(RableError::matched_pair(
+                        "unterminated array",
+                        self.pos,
+                        self.line,
+                    ));
+                }
+            }
+        }
+    }
+
+    /// Reads a single array element using the same word-reading logic
+    /// as `read_word_token`, but with `)` and whitespace as terminators
+    /// instead of the standard shell metacharacters.
+    fn read_array_element(&mut self, wb: &mut WordBuilder) -> Result<()> {
+        while let Some(c) = self.peek_char() {
+            match c {
+                ' ' | '\t' | '\n' | '\r' | ')' => break,
+                '<' | '>' => {
+                    if self.input.get(self.pos + 1) == Some(&'(') {
+                        self.read_process_sub_into(wb)?;
+                    } else {
+                        self.advance_char();
+                        wb.push(c);
+                    }
+                }
+                '\'' | '"' | '\\' | '$' | '`' => {
+                    self.read_word_special(wb, c)?;
+                }
+                '[' if !wb.is_empty() && wb.value != "[" && !wb.ends_with('[') => {
+                    self.read_bracket_subscript(wb)?;
+                }
+                _ => {
+                    self.advance_char();
+                    wb.push(c);
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Returns true if a word value matches the assignment pattern `NAME=`,
+/// `NAME+=`, or `NAME[...]=` / `NAME[...]+=`.
+fn is_assignment_word(value: &str) -> bool {
+    let bytes = value.as_bytes();
+    // Must start with [a-zA-Z_]
+    if !matches!(bytes.first(), Some(b'a'..=b'z' | b'A'..=b'Z' | b'_')) {
+        return false;
+    }
+    let mut i = 1;
+    // Continue with [a-zA-Z_0-9]
+    while i < bytes.len() {
+        match bytes[i] {
+            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' => i += 1,
+            b'[' => {
+                // Skip subscript [...] (may be nested)
+                i += 1;
+                let mut depth = 1;
+                while i < bytes.len() && depth > 0 {
+                    match bytes[i] {
+                        b'[' => depth += 1,
+                        b']' => depth -= 1,
+                        _ => {}
+                    }
+                    i += 1;
+                }
+            }
+            b'+' if i + 1 < bytes.len() && bytes[i + 1] == b'=' => return true,
+            b'=' => return true,
+            _ => return false,
+        }
+    }
+    false
 }
diff --git a/src/parser/compound.rs b/src/parser/compound.rs
index 202bdfa..2823c05 100644
--- a/src/parser/compound.rs
+++ b/src/parser/compound.rs
@@ -153,8 +153,9 @@ impl Parser {
             Some(ws)
         } else {
             Some(vec![Node::empty(NodeKind::Word {
+                parts: super::word_parts::decompose_word("\"$@\""),
                 value: "\"$@\"".to_string(),
-                parts: Vec::new(),
+                spans: Vec::new(),
             })])
         };
 
@@ -241,8 +242,9 @@ impl Parser {
         let word_tok = self.lexer.next_token()?;
         let word = Box::new(Node::new(
             NodeKind::Word {
+                parts: super::word_parts::decompose_word(&word_tok.value),
                 value: word_tok.value.clone(),
-                parts: Vec::new(),
+                spans: word_tok.spans,
             },
             Span::new(word_tok.pos, word_tok.pos + word_tok.value.len()),
         ));
diff --git a/src/parser/helpers.rs b/src/parser/helpers.rs
index f2406f2..bc39e8e 100644
--- a/src/parser/helpers.rs
+++ b/src/parser/helpers.rs
@@ -2,11 +2,12 @@
 
 use crate::ast::{Node, NodeKind};
 
-/// Creates a `Word` node with no parts.
+/// Creates a `Word` node with decomposed parts.
 pub fn word_node(value: &str) -> Node {
     Node::empty(NodeKind::Word {
+        parts: super::word_parts::decompose_word(value),
         value: value.to_string(),
-        parts: Vec::new(),
+        spans: Vec::new(),
     })
 }
 
@@ -77,7 +78,10 @@ pub(super) fn make_stderr_redirect() -> Node {
         op: ">&".to_string(),
         target: Box::new(Node::empty(NodeKind::Word {
             value: "1".to_string(),
-            parts: Vec::new(),
+            parts: vec![Node::empty(NodeKind::WordLiteral {
+                value: "1".to_string(),
+            })],
+            spans: Vec::new(),
         })),
         fd: 2,
     })
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 6db6bab..6f8d0ab 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -1,6 +1,7 @@
 mod compound;
 mod conditional;
 pub mod helpers;
+mod word_parts;
 
 use crate::ast::{ListItem, ListOperator, Node, NodeKind, PipeSep, Span};
 use crate::error::{RableError, Result};
@@ -567,10 +568,12 @@ impl Parser {
                             return self.parse_function_def(&tok);
                         }
                         let word_span = Span::new(tok.pos, tok.pos + tok.value.len());
+                        let parts = word_parts::decompose_word(&tok.value);
                         let node = Node::new(
                             NodeKind::Word {
                                 value: tok.value,
-                                parts: Vec::new(),
+                                parts,
+                                spans: tok.spans,
                             },
                             word_span,
                         );
diff --git a/src/parser/tests.rs b/src/parser/tests.rs
index 3f6edde..fe5b88e 100644
--- a/src/parser/tests.rs
+++ b/src/parser/tests.rs
@@ -196,17 +196,14 @@ fn line_continuation() {
 
 #[test]
 fn command_has_assignments_field() {
-    // The lexer doesn't currently produce AssignmentWord tokens,
-    // so assignments stay in words. This test verifies the
-    // Command variant has the assignments field and S-expression
-    // output is unchanged.
     let nodes = parse("FOO=bar cmd arg");
     assert_eq!(nodes.len(), 1);
     assert!(matches!(
         &nodes[0].kind,
         NodeKind::Command { assignments, words, .. }
-        if assignments.is_empty() && words.len() == 3
+        if assignments.len() == 1 && words.len() == 2
     ));
+    // S-expression output is unchanged (assignments and words are merged)
     let output = format!("{}", nodes[0]);
     assert_eq!(
         output,
@@ -214,6 +211,61 @@ fn command_has_assignments_field() {
     );
 }
 
+#[test]
+fn multiple_assignments_before_command() {
+    let nodes = parse("a=1 b=2 cmd");
+    assert_eq!(nodes.len(), 1);
+    assert!(matches!(
+        &nodes[0].kind,
+        NodeKind::Command { assignments, words, .. }
+        if assignments.len() == 2 && words.len() == 1
+    ));
+}
+
+#[test]
+fn assignment_after_command_word_stays_in_words() {
+    let nodes = parse("cmd FOO=bar");
+    assert_eq!(nodes.len(), 1);
+    assert!(matches!(
+        &nodes[0].kind,
+        NodeKind::Command { assignments, words, .. }
+        if assignments.is_empty() && words.len() == 2
+    ));
+}
+
+#[test]
+fn plus_equals_assignment() {
+    let nodes = parse("FOO+=bar cmd");
+    assert_eq!(nodes.len(), 1);
+    assert!(matches!(
+        &nodes[0].kind,
+        NodeKind::Command { assignments, words, .. }
+        if assignments.len() == 1 && words.len() == 1
+    ));
+}
+
+#[test]
+fn array_assignment() {
+    let nodes = parse("arr=(a b c) cmd");
+    assert_eq!(nodes.len(), 1);
+    assert!(matches!(
+        &nodes[0].kind,
+        NodeKind::Command { assignments, words, .. }
+        if assignments.len() == 1 && words.len() == 1
+    ));
+}
+
+#[test]
+fn bare_assignment_no_command() {
+    let nodes = parse("FOO=bar");
+    assert_eq!(nodes.len(), 1);
+    assert!(matches!(
+        &nodes[0].kind,
+        NodeKind::Command { assignments, words, .. }
+        if assignments.len() == 1 && words.is_empty()
+    ));
+}
+
 #[test]
 fn list_items_structured() {
     let nodes = parse("a && b; c");
diff --git a/src/parser/word_parts.rs b/src/parser/word_parts.rs
new file mode 100644
index 0000000..e441264
--- /dev/null
+++ b/src/parser/word_parts.rs
@@ -0,0 +1,210 @@
+//! Decomposes word values into structured AST parts.
+//!
+//! Reuses the segment parser from `sexp::word` to split a word's raw
+//! value into typed segments, then maps each segment to an AST `Node`.
+
+use std::cell::Cell;
+
+use crate::ast::{ListItem, Node, NodeKind, Span};
+use crate::sexp::word::{WordSegment, parse_word_segments};
+
+thread_local! {
+    static DECOMPOSE_DEPTH: Cell<usize> = const { Cell::new(0) };
+}
+
+/// RAII guard to prevent infinite recursion when decomposing nested
+/// command/process substitutions.
+///
+/// Depth limit of 2 matches `format::DepthGuard` — allows `$(a $(b))`
+/// but stops before unbounded nesting. The counter is separate from
+/// the format module's counter since decomposition and formatting
+/// are independent recursion paths.
+struct DepthGuard;
+
+impl DepthGuard {
+    fn enter() -> Option<Self> {
+        DECOMPOSE_DEPTH.with(|d| {
+            let v = d.get();
+            if v >= 2 {
+                return None;
+            }
+            d.set(v + 1);
+            Some(Self)
+        })
+    }
+}
+
+impl Drop for DepthGuard {
+    fn drop(&mut self) {
+        DECOMPOSE_DEPTH.with(|d| d.set(d.get().saturating_sub(1)));
+    }
+}
+
+/// Decomposes a word's raw value into a list of typed AST nodes.
+///
+/// Each segment becomes one of:
+/// - `WordLiteral` for plain text
+/// - `CommandSubstitution` for `$(...)`
+/// - `ProcessSubstitution` for `<(...)` / `>(...)`
+/// - `AnsiCQuote` for `$'...'`
+/// - `LocaleString` for `$"..."`
+pub(super) fn decompose_word(value: &str) -> Vec<Node> {
+    let segments = parse_word_segments(value);
+    segments.into_iter().map(segment_to_node).collect()
+}
+
+fn segment_to_node(seg: WordSegment) -> Node {
+    match seg {
+        WordSegment::Literal(text) => Node::empty(NodeKind::WordLiteral { value: text }),
+        WordSegment::AnsiCQuote(content) => Node::empty(NodeKind::AnsiCQuote { content }),
+        WordSegment::LocaleString(content) => Node::empty(NodeKind::LocaleString { content }),
+        WordSegment::CommandSubstitution(content) => cmdsub_to_node(&content),
+        WordSegment::ProcessSubstitution(direction, content) => {
+            procsub_to_node(direction, &content)
+        }
+    }
+}
+
+fn cmdsub_to_node(content: &str) -> Node {
+    parse_sub(content, &format!("$({content})"), |cmd| {
+        // brace: false — this is $(...) syntax, not ${...} brace expansion
+        NodeKind::CommandSubstitution {
+            command: cmd,
+            brace: false,
+        }
+    })
+}
+
+fn procsub_to_node(direction: char, content: &str) -> Node {
+    parse_sub(content, &format!("{direction}({content})"), |cmd| {
+        NodeKind::ProcessSubstitution {
+            direction: direction.to_string(),
+            command: cmd,
+        }
+    })
+}
+
+/// Shared logic for parsing command/process substitution content.
+fn parse_sub(
+    content: &str,
+    fallback_value: &str,
+    make_kind: impl FnOnce(Box<Node>) -> NodeKind,
+) -> Node {
+    let Some(_guard) = DepthGuard::enter() else {
+        return literal_fallback(fallback_value);
+    };
+    crate::parse(content, false).map_or_else(
+        |_| literal_fallback(fallback_value),
+        |nodes| Node::empty(make_kind(Box::new(wrap_nodes(nodes)))),
+    )
+}
+
+fn literal_fallback(value: &str) -> Node {
+    Node::empty(NodeKind::WordLiteral {
+        value: value.to_string(),
+    })
+}
+
+/// Wraps a list of parsed nodes into a single node.
+fn wrap_nodes(mut nodes: Vec<Node>) -> Node {
+    match nodes.len() {
+        0 => Node::empty(NodeKind::Empty),
+        1 => nodes.swap_remove(0),
+        _ => Node::new(
+            NodeKind::List {
+                items: nodes
+                    .into_iter()
+                    .map(|n| ListItem {
+                        command: n,
+                        operator: None,
+                    })
+                    .collect(),
+            },
+            Span::empty(),
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn plain_word() {
+        let parts = decompose_word("echo");
+        assert_eq!(parts.len(), 1);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::WordLiteral { value } if value == "echo"
+        ));
+    }
+
+    #[test]
+    fn simple_variable_stays_literal() {
+        let parts = decompose_word("$foo");
+        assert_eq!(parts.len(), 1);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::WordLiteral { value } if value == "$foo"
+        ));
+    }
+
+    #[test]
+    fn command_substitution() {
+        let parts = decompose_word("$(date)");
+        assert_eq!(parts.len(), 1);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::CommandSubstitution { brace: false, .. }
+        ));
+    }
+
+    #[test]
+    fn mixed_segments() {
+        let parts = decompose_word("hello$(world)end");
+        assert_eq!(parts.len(), 3);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::WordLiteral { value } if value == "hello"
+        ));
+        assert!(matches!(
+            &parts[1].kind,
+            NodeKind::CommandSubstitution { .. }
+        ));
+        assert!(matches!(
+            &parts[2].kind,
+            NodeKind::WordLiteral { value } if value == "end"
+        ));
+    }
+
+    #[test]
+    fn ansi_c_quote() {
+        let parts = decompose_word("$'foo\\nbar'");
+        assert_eq!(parts.len(), 1);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::AnsiCQuote { content } if content == "foo\\nbar"
+        ));
+    }
+
+    #[test]
+    fn process_substitution() {
+        let parts = decompose_word("<(cmd)");
+        assert_eq!(parts.len(), 1);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::ProcessSubstitution { direction, .. }
+            if direction == "<"
+        ));
+    }
+
+    #[test]
+    fn locale_string() {
+        let parts = decompose_word("$\"hello\"");
+        assert_eq!(parts.len(), 1);
+        assert!(matches!(
+            &parts[0].kind,
+            NodeKind::LocaleString { content } if content == "\"hello\""
+        ));
+    }
+}
diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs
index 1461736..9faee1a 100644
--- a/src/sexp/mod.rs
+++ b/src/sexp/mod.rs
@@ -17,11 +17,17 @@ impl fmt::Display for Node {
 impl fmt::Display for NodeKind {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            Self::Word { value, .. } => {
+            Self::Word { value, spans, .. } => {
                 write!(f, "(word \"")?;
-                word::write_word_value(f, value)?;
+                if !spans.is_empty() && !word::needs_value_path(value) {
+                    let segments = word::segments_from_spans(value, spans);
+                    word::write_word_segments(f, &segments)?;
+                } else {
+                    word::write_word_value(f, value)?;
+                }
                 write!(f, "\")")
             }
+            Self::WordLiteral { value } => write!(f, "{value}"),
             Self::Command {
                 assignments,
                 words,
diff --git a/src/sexp/word.rs b/src/sexp/word.rs
index abec579..489dcb1 100644
--- a/src/sexp/word.rs
+++ b/src/sexp/word.rs
@@ -8,8 +8,6 @@ use std::fmt;
 
 use crate::format;
 
-use crate::context::read_balanced_delim;
-
 use super::{
     ansi_c::process_ansi_c_content, extract_paren_content, normalize_cmdsub_content,
     write_escaped_char, write_escaped_word,
@@ -234,128 +232,152 @@ pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment])
     Ok(())
 }
 
-/// The public entry point — replaces the old monolithic `write_word_value`.
+/// The public entry point — formats a word value via segment parsing.
+/// Used as fallback when spans are not available (synthetic word nodes).
 pub fn write_word_value(f: &mut fmt::Formatter<'_>, value: &str) -> fmt::Result {
-    // Check for array assignment pattern: name=(...) — normalize content
-    if let Some(normalized) = try_normalize_array(value) {
-        // Process word segments so $() inside arrays gets reformatted
-        let segments = parse_word_segments(&normalized);
-        return write_word_segments(f, &segments);
-    }
     let segments = parse_word_segments(value);
     write_word_segments(f, &segments)
 }
 
-/// Detects `name=(...)` or `name+=(...)` array patterns and normalizes whitespace/comments.
-fn try_normalize_array(value: &str) -> Option<String> {
-    // Find the `=(` pattern
-    let eq_paren = value.find("=(")?;
-    let prefix = &value[..=eq_paren]; // includes `=`
-    let after_eq = &value[eq_paren + 1..]; // starts with `(`
-
-    if !after_eq.starts_with('(') {
-        return None;
-    }
-
-    // Find the matching `)` using depth tracking
-    let chars: Vec<char> = after_eq.chars().collect();
-    let mut depth = 0;
-    let mut close_pos = None;
-    for (j, &c) in chars.iter().enumerate() {
-        if c == '(' {
-            depth += 1;
-        } else if c == ')' {
-            depth -= 1;
-            if depth == 0 {
-                close_pos = Some(j);
-                break;
-            }
+/// Converts lexer spans to segments without re-parsing the value.
+///
+/// Filters to top-level sexp-relevant spans (not contained within
+/// another sexp-relevant span) and treats everything else as literal.
+/// Uses span context to handle context-sensitive constructs like
+/// `$'...'` inside `${...}` vs inside `"..."`.
+pub fn segments_from_spans(
+    value: &str,
+    spans: &[crate::lexer::word_builder::WordSpan],
+) -> Vec<WordSegment> {
+    use crate::lexer::word_builder::{QuotingContext, WordSpanKind};
+    let top_level = collect_top_level_sexp_spans(spans);
+    let mut segments = Vec::new();
+    let mut pos = 0;
+    for span in &top_level {
+        if span.start > pos
+            && let Some(text) = value.get(pos..span.start)
+        {
+            segments.push(WordSegment::Literal(text.to_string()));
         }
-    }
-    let close = close_pos?;
-    let inner_chars: String = chars[1..close].iter().collect();
-    let suffix: String = chars[close + 1..].iter().collect();
-
-    // Always normalize — even just trimming trailing spaces matters
-    if inner_chars.is_empty() {
-        return None;
-    }
-    let inner = &inner_chars;
-
-    let normalized = normalize_array_content(inner);
-    let result = format!("{prefix}({normalized}){suffix}");
-    Some(result)
-}
-
-/// Normalizes array content: collapses whitespace, strips comments.
-#[allow(clippy::too_many_lines)]
-fn normalize_array_content(inner: &str) -> String {
-    let mut elements = Vec::new();
-    let chars: Vec<char> = inner.chars().collect();
-    let mut i = 0;
-    let mut current = String::new();
-
-    while i < chars.len() {
-        match chars[i] {
-            ' ' | '\t' | '\n' | '\r' => {
-                if !current.is_empty() {
-                    elements.push(std::mem::take(&mut current));
+        match &span.kind {
+            WordSpanKind::CommandSub => {
+                if let Some(c) = value.get(span.start + 2..span.end - 1) {
+                    segments.push(WordSegment::CommandSubstitution(c.to_string()));
                 }
-                i += 1;
             }
-            '#' => {
-                // # is only a comment when preceded by whitespace (current is empty)
-                if current.is_empty() {
-                    // Skip comment until end of line
-                    while i < chars.len() && chars[i] != '\n' {
-                        i += 1;
-                    }
-                } else {
-                    // # is part of the word (e.g., b# or [1]=b#)
-                    current.push(chars[i]);
-                    i += 1;
+            WordSpanKind::ProcessSub(dir) => {
+                if let Some(c) = value.get(span.start + 2..span.end - 1) {
+                    segments.push(WordSegment::ProcessSubstitution(*dir, c.to_string()));
                 }
             }
-            '\'' => {
-                crate::context::skip_single_quoted(&chars, &mut i, &mut current);
-            }
-            '"' => {
-                crate::context::skip_double_quoted(&chars, &mut i, &mut current);
+            WordSpanKind::AnsiCQuote => {
+                push_ansi_c_span(&mut segments, value, span);
             }
-            '\\' if i + 1 < chars.len() => {
-                current.push(chars[i]);
-                i += 1;
-                current.push(chars[i]);
-                i += 1;
-            }
-            '`' => {
-                crate::context::skip_backtick(&chars, &mut i, &mut current);
+            WordSpanKind::LocaleString => {
+                match span.context {
+                    QuotingContext::DoubleQuote => {
+                        // $"..." inside "..." is literal (not a locale string)
+                        if let Some(text) = value.get(span.start..span.end) {
+                            push_literal(&mut segments, text);
+                        }
+                    }
+                    _ => {
+                        if let Some(c) = value.get(span.start + 1..span.end) {
+                            segments.push(WordSegment::LocaleString(c.to_string()));
+                        }
+                    }
+                }
             }
-            '$' if i + 1 < chars.len() && chars[i + 1] == '(' => {
-                // Command substitution — read matched parens with quote awareness
-                current.push(chars[i]);
-                current.push(chars[i + 1]);
-                i += 2;
-                read_balanced_delim(&chars, &mut i, '(', ')', &mut current);
+            _ => {} // filtered out by is_sexp_relevant
+        }
+        pos = span.end;
+    }
+    if pos < value.len()
+        && let Some(text) = value.get(pos..)
+    {
+        segments.push(WordSegment::Literal(text.to_string()));
+    }
+    segments
+}
+
+/// Handles `$'...'` spans with context-sensitive behavior:
+/// - Inside `"..."`: not special, treat as literal `$` + `'...'`
+/// - Inside `${...}`: process escapes, absorb into literal (no quotes)
+/// - Otherwise: normal `AnsiCQuote` segment
+fn push_ansi_c_span(
+    segments: &mut Vec<WordSegment>,
+    value: &str,
+    span: &crate::lexer::word_builder::WordSpan,
+) {
+    use crate::lexer::word_builder::QuotingContext;
+    match span.context {
+        QuotingContext::DoubleQuote => {
+            // $'...' is NOT special inside "..." — treat as literal
+            if let Some(text) = value.get(span.start..span.end) {
+                push_literal(segments, text);
             }
-            '$' if i + 1 < chars.len() && chars[i + 1] == '{' => {
-                // Parameter expansion — read matched braces with quote awareness
-                current.push(chars[i]);
-                current.push(chars[i + 1]);
-                i += 2;
-                read_balanced_delim(&chars, &mut i, '{', '}', &mut current);
+        }
+        QuotingContext::ParamExpansion => {
+            // $'...' inside ${...} — process escapes, no quotes in output
+            if let Some(raw) = value.get(span.start + 2..span.end - 1) {
+                let chars: Vec<char> = raw.chars().collect();
+                let mut pos = 0;
+                let processed = super::process_ansi_c_content(&chars, &mut pos);
+                push_literal(segments, &processed);
             }
-            _ => {
-                current.push(chars[i]);
-                i += 1;
+        }
+        _ => {
+            // Top-level or inside $() / `` — normal ANSI-C quote
+            if let Some(c) = value.get(span.start + 2..span.end - 1) {
+                segments.push(WordSegment::AnsiCQuote(c.to_string()));
             }
         }
     }
-    if !current.is_empty() {
-        elements.push(current);
+}
+
+/// Appends text to the last `Literal` segment if possible, or creates a new one.
+fn push_literal(segments: &mut Vec<WordSegment>, text: &str) {
+    if let Some(WordSegment::Literal(last)) = segments.last_mut() {
+        last.push_str(text);
+    } else {
+        segments.push(WordSegment::Literal(text.to_string()));
     }
+}
+
+/// Returns true if the word value requires the value-based formatting
+/// path. Currently always returns false — all cases are handled by
+/// the span-based path or by lex-time normalization.
+pub const fn needs_value_path(_value: &str) -> bool {
+    false
+}
+
+const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bool {
+    use crate::lexer::word_builder::WordSpanKind;
+    matches!(
+        kind,
+        WordSpanKind::CommandSub
+            | WordSpanKind::AnsiCQuote
+            | WordSpanKind::LocaleString
+            | WordSpanKind::ProcessSub(_)
+    )
+}
 
-    elements.join(" ")
+/// Collects sexp-relevant spans that are not nested inside another
+/// sexp-relevant span. Sorted by start offset.
+fn collect_top_level_sexp_spans(
+    spans: &[crate::lexer::word_builder::WordSpan],
+) -> Vec<&crate::lexer::word_builder::WordSpan> {
+    let relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect();
+    relevant
+        .iter()
+        .filter(|s| {
+            // A span is top-level if no other relevant span contains it
+            !relevant.iter().any(|outer| {
+                outer.start <= s.start && outer.end >= s.end && !std::ptr::eq(*outer, **s)
+            })
+        })
+        .copied()
+        .collect()
 }
 
 // -- helpers --
diff --git a/src/token.rs b/src/token.rs
index 9212d13..1187ba1 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -141,6 +141,8 @@ impl TokenType {
     }
 }
 
+use crate::lexer::word_builder::WordSpan;
+
 /// A token produced by the lexer.
 #[derive(Debug, Clone)]
 pub struct Token {
@@ -148,6 +150,9 @@ pub struct Token {
     pub value: String,
     pub pos: usize,
     pub line: usize,
+    /// Expansion spans within the word value (empty for non-word tokens).
+    #[allow(dead_code)]
+    pub(crate) spans: Vec<WordSpan>,
 }
 
 impl Token {
@@ -157,6 +162,25 @@ impl Token {
             value: value.into(),
             pos,
             line,
+            spans: Vec::new(),
+        }
+    }
+
+    /// Creates a word token with pre-recorded expansion spans.
+    #[allow(dead_code)]
+    pub(crate) const fn with_spans(
+        kind: TokenType,
+        value: String,
+        pos: usize,
+        line: usize,
+        spans: Vec<WordSpan>,
+    ) -> Self {
+        Self {
+            kind,
+            value,
+            pos,
+            line,
+            spans,
         }
     }
 
@@ -171,6 +195,7 @@ impl Token {
             value: String::new(),
             pos,
             line,
+            spans: Vec::new(),
         }
     }
 }
diff --git a/tests/integration.rs b/tests/integration.rs
index 8331825..bc28ba7 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -242,18 +242,26 @@ fn oracle_test_suite() {
         let mut pass = 0;
         let mut fail = 0;
 
+        let mut failures = Vec::new();
         for case in &cases {
-            let (passed, _actual) = run_test(case);
+            let (passed, actual) = run_test(case);
             if passed {
                 pass += 1;
             } else {
                 fail += 1;
+                failures.push(format!(
+                    "    FAIL :: {}\n      input:    {:?}\n      expected: {:?}\n      actual:   {:?}",
+                    case.name, case.input, case.expected, actual,
+                ));
             }
         }
 
         let total = pass + fail;
         let status = if fail == 0 { "OK" } else { "FAIL" };
         eprintln!("  {file_name}: {pass}/{total} passed [{status}]");
+        for f in &failures {
+            eprintln!("{f}");
+        }
         total_pass += pass;
         total_fail += fail;
     }

From 54db8c79d15689d475ec12aeadc2d28b2d62fa90 Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Wed, 25 Mar 2026 13:45:35 +0100
Subject: [PATCH 2/5] refactor: remove sexp re-parsing by threading spans
 through all nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eliminate ~400 lines of duplicate parsing code in the sexp/format layers
by ensuring all Word and CondTerm nodes carry lexer spans.

- Thread token spans via `word_node_from_token()` (9 call sites updated)
- Add spans to CondTerm nodes via `cond_term_from_token()` (4 call sites)
- Rewrite `write_redirect()` to use span-based segments instead of
  string searches (`needs_word_processing` removed)
- Update `process_word_value()` in format module to use spans
- Replace `decompose_word()` with `decompose_word_with_spans()` (span
  path) and `decompose_word_literal()` (synthetic nodes)
- Word Display for span-less synthetic nodes uses `write_escaped_word`
  directly instead of re-parsing through `parse_word_segments`

Deleted dead code:
- `parse_word_segments()` + `flush_literal` + `extract_ansi_c_content`
  + `extract_locale_content` (~195 lines in sexp/word.rs)
- `extract_paren_content()` (~107 lines in sexp/mod.rs)
- `needs_word_processing()`, `write_word_value()`, `needs_value_path()`
- `skip_single_quoted/double/backtick`, `is_backslash_escaped` (~70
  lines in context.rs)
- 8 tests for deleted `parse_word_segments` function

Oracle improved: 167/181 (was 165) — removing buggy re-parsing fixed
2 edge cases where the two parsers disagreed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ast.rs                |   6 +-
 src/context.rs            |  70 ----------
 src/format/mod.rs         |  21 ++-
 src/parser/compound.rs    |  19 +--
 src/parser/conditional.rs |  10 +-
 src/parser/helpers.rs     |  21 ++-
 src/parser/mod.rs         |  12 +-
 src/parser/word_parts.rs  |  47 ++++---
 src/sexp/mod.rs           | 207 ++++++++---------------------
 src/sexp/word.rs          | 273 +-------------------------------------
 10 files changed, 143 insertions(+), 543 deletions(-)

diff --git a/src/ast.rs b/src/ast.rs
index 9f78c17..79d6e52 100644
--- a/src/ast.rs
+++ b/src/ast.rs
@@ -326,7 +326,11 @@ pub enum NodeKind {
     CondParen { inner: Box<Node> },
 
     /// A term (word) in a conditional expression.
-    CondTerm { value: String },
+    CondTerm {
+        value: String,
+        #[allow(dead_code)]
+        spans: Vec<crate::lexer::word_builder::WordSpan>,
+    },
 
     // -- Other --
     /// Pipeline negation with `!`.
diff --git a/src/context.rs b/src/context.rs
index b8bd94c..d9b5903 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -58,73 +58,3 @@ impl CaseTracker {
         self.depth > 0 && self.in_pattern
     }
 }
-
-/// Skips a single-quoted region in a char array. Reads from `pos` (which
-/// should point AT the opening `'`) through the closing `'`.
-/// Pushes all chars (including quotes) into `out`.
-pub fn skip_single_quoted(chars: &[char], pos: &mut usize, out: &mut String) {
-    out.push(chars[*pos]); // opening '
-    *pos += 1;
-    while *pos < chars.len() && chars[*pos] != '\'' {
-        out.push(chars[*pos]);
-        *pos += 1;
-    }
-    if *pos < chars.len() {
-        out.push(chars[*pos]); // closing '
-        *pos += 1;
-    }
-}
-
-/// Skips a double-quoted region in a char array. Reads from `pos` (which
-/// should point AT the opening `"`) through the closing `"`.
-/// Handles backslash escapes inside. Pushes all chars into `out`.
-pub fn skip_double_quoted(chars: &[char], pos: &mut usize, out: &mut String) {
-    out.push(chars[*pos]); // opening "
-    *pos += 1;
-    while *pos < chars.len() && chars[*pos] != '"' {
-        if chars[*pos] == '\\' && *pos + 1 < chars.len() {
-            out.push(chars[*pos]);
-            *pos += 1;
-        }
-        out.push(chars[*pos]);
-        *pos += 1;
-    }
-    if *pos < chars.len() {
-        out.push(chars[*pos]); // closing "
-        *pos += 1;
-    }
-}
-
-/// Skips a backtick region in a char array. Reads from `pos` (which
-/// should point AT the opening `` ` ``) through the closing `` ` ``.
-/// Handles backslash escapes inside. Pushes all chars into `out`.
-pub fn skip_backtick(chars: &[char], pos: &mut usize, out: &mut String) {
-    out.push(chars[*pos]); // opening `
-    *pos += 1;
-    while *pos < chars.len() && chars[*pos] != '`' {
-        if chars[*pos] == '\\' && *pos + 1 < chars.len() {
-            out.push(chars[*pos]);
-            *pos += 1;
-        }
-        out.push(chars[*pos]);
-        *pos += 1;
-    }
-    if *pos < chars.len() {
-        out.push(chars[*pos]); // closing `
-        *pos += 1;
-    }
-}
-
-/// Counts consecutive backslashes before a position to determine
-/// if a character is escaped.
-///
-/// Even count → not escaped. Odd count → escaped.
-pub fn is_backslash_escaped(chars: &[char], pos: usize) -> bool {
-    let mut count = 0;
-    let mut j = pos;
-    while j > 0 && chars[j - 1] == '\\' {
-        count += 1;
-        j -= 1;
-    }
-    count % 2 != 0
-}
diff --git a/src/format/mod.rs b/src/format/mod.rs
index 73fad3d..e1998fa 100644
--- a/src/format/mod.rs
+++ b/src/format/mod.rs
@@ -206,8 +206,8 @@ fn format_command_words(assignments: &[Node], words: &[Node], out: &mut String)
         if i > 0 {
             out.push(' ');
         }
-        if let NodeKind::Word { value, .. } = &w.kind {
-            out.push_str(&process_word_value(value));
+        if let NodeKind::Word { value, spans, .. } = &w.kind {
+            out.push_str(&process_word_value(value, spans));
         } else {
             out.push_str(&w.to_string());
         }
@@ -233,8 +233,8 @@ fn format_redirect(node: &Node, out: &mut String) {
         if !is_dup {
             out.push(' ');
         }
-        if let NodeKind::Word { value, .. } = &target.kind {
-            out.push_str(&process_word_value(value));
+        if let NodeKind::Word { value, spans, .. } = &target.kind {
+            out.push_str(&process_word_value(value, spans));
         }
     } else if let NodeKind::HereDoc {
         delimiter,
@@ -515,7 +515,7 @@ fn format_cond_node(node: &Node, out: &mut String) {
             out.push_str("! ");
             format_cond_node(operand, out);
         }
-        NodeKind::CondTerm { value } => {
+        NodeKind::CondTerm { value, .. } => {
             out.push_str(value);
         }
         NodeKind::CondParen { inner } => {
@@ -535,13 +535,12 @@ fn indent_str(out: &mut String, n: usize) {
     }
 }
 
-/// Process a word value for canonical bash output using the same segment
-/// pipeline as S-expression formatting. This ensures consistent handling
-/// of `$'...'`, `$"..."`, and `$(...)` across both output paths.
-fn process_word_value(value: &str) -> String {
-    use crate::sexp::word::{WordSegment, parse_word_segments};
+/// Process a word value for canonical bash output using span-based
+/// segment extraction.
+fn process_word_value(value: &str, spans: &[crate::lexer::word_builder::WordSpan]) -> String {
+    use crate::sexp::word::{WordSegment, segments_from_spans};
 
-    let segments = parse_word_segments(value);
+    let segments = segments_from_spans(value, spans);
     let mut result = String::with_capacity(value.len());
 
     for seg in &segments {
diff --git a/src/parser/compound.rs b/src/parser/compound.rs
index 2823c05..1be8779 100644
--- a/src/parser/compound.rs
+++ b/src/parser/compound.rs
@@ -7,7 +7,7 @@ use crate::token::{Token, TokenType};
 
 use super::{
     Parser,
-    helpers::{is_fd_number, word_node},
+    helpers::{is_fd_number, word_node, word_node_from_token},
 };
 
 impl Parser {
@@ -148,12 +148,12 @@ impl Parser {
                     break;
                 }
                 let tok = self.lexer.next_token()?;
-                ws.push(word_node(&tok.value));
+                ws.push(word_node_from_token(&tok));
             }
             Some(ws)
         } else {
             Some(vec![Node::empty(NodeKind::Word {
-                parts: super::word_parts::decompose_word("\"$@\""),
+                parts: super::word_parts::decompose_word_literal("\"$@\""),
                 value: "\"$@\"".to_string(),
                 spans: Vec::new(),
             })])
@@ -242,7 +242,10 @@ impl Parser {
         let word_tok = self.lexer.next_token()?;
         let word = Box::new(Node::new(
             NodeKind::Word {
-                parts: super::word_parts::decompose_word(&word_tok.value),
+                parts: super::word_parts::decompose_word_with_spans(
+                    &word_tok.value,
+                    &word_tok.spans,
+                ),
                 value: word_tok.value.clone(),
                 spans: word_tok.spans,
             },
@@ -289,7 +292,7 @@ impl Parser {
             if tok.kind == TokenType::Pipe {
                 continue;
             }
-            pattern_words.push(word_node(&tok.value));
+            pattern_words.push(word_node_from_token(&tok));
         }
 
         self.skip_newlines()?;
@@ -348,7 +351,7 @@ impl Parser {
                     break;
                 }
                 let tok = self.lexer.next_token()?;
-                ws.push(word_node(&tok.value));
+                ws.push(word_node_from_token(&tok));
             }
             Some(ws)
         } else {
@@ -513,7 +516,7 @@ impl Parser {
         } else {
             None
         };
-        let mut words = vec![word_node(&first_tok.value)];
+        let mut words = vec![word_node_from_token(&first_tok)];
         let mut redirects = Vec::new();
         loop {
             if self.at_end()? {
@@ -529,7 +532,7 @@ impl Parser {
                 if is_fd_number(&tok.value) && self.is_redirect_operator()? {
                     redirects.push(self.parse_redirect_with_fd(&tok)?);
                 } else {
-                    words.push(word_node(&tok.value));
+                    words.push(word_node_from_token(&tok));
                 }
             } else {
                 break;
diff --git a/src/parser/conditional.rs b/src/parser/conditional.rs
index 4016638..2860e86 100644
--- a/src/parser/conditional.rs
+++ b/src/parser/conditional.rs
@@ -6,7 +6,7 @@ use crate::token::{Token, TokenType};
 
 use super::{
     Parser,
-    helpers::{cond_term, is_cond_binary_op},
+    helpers::{cond_term_from_token, is_cond_binary_op},
 };
 
 impl Parser {
@@ -107,7 +107,7 @@ impl Parser {
                 start,
                 NodeKind::UnaryTest {
                     op: first.value,
-                    operand: Box::new(cond_term(&operand_tok.value)),
+                    operand: Box::new(cond_term_from_token(&operand_tok)),
                 },
             ));
         }
@@ -128,8 +128,8 @@ impl Parser {
                     start,
                     NodeKind::BinaryTest {
                         op: op.value,
-                        left: Box::new(cond_term(&first.value)),
-                        right: Box::new(cond_term(&right.value)),
+                        left: Box::new(cond_term_from_token(&first)),
+                        right: Box::new(cond_term_from_token(&right)),
                     },
                 ));
             }
@@ -140,7 +140,7 @@ impl Parser {
             start,
             NodeKind::UnaryTest {
                 op: "-n".to_string(),
-                operand: Box::new(cond_term(&first.value)),
+                operand: Box::new(cond_term_from_token(&first)),
             },
         ))
     }
diff --git a/src/parser/helpers.rs b/src/parser/helpers.rs
index bc39e8e..3ff2149 100644
--- a/src/parser/helpers.rs
+++ b/src/parser/helpers.rs
@@ -1,20 +1,31 @@
 //! Helper functions for the parser.
 
 use crate::ast::{Node, NodeKind};
+use crate::token::Token;
 
-/// Creates a `Word` node with decomposed parts.
+/// Creates a `Word` node from a lexer token, preserving spans.
+pub fn word_node_from_token(tok: &Token) -> Node {
+    Node::empty(NodeKind::Word {
+        parts: super::word_parts::decompose_word_with_spans(&tok.value, &tok.spans),
+        value: tok.value.clone(),
+        spans: tok.spans.clone(),
+    })
+}
+
+/// Creates a `Word` node for synthetic values (no lexer token).
 pub fn word_node(value: &str) -> Node {
     Node::empty(NodeKind::Word {
-        parts: super::word_parts::decompose_word(value),
+        parts: super::word_parts::decompose_word_literal(value),
         value: value.to_string(),
         spans: Vec::new(),
     })
 }
 
-/// Creates a `cond-term` node for conditional expressions.
-pub(super) fn cond_term(value: &str) -> Node {
+/// Creates a `cond-term` node from a lexer token, preserving spans.
+pub(super) fn cond_term_from_token(tok: &Token) -> Node {
     Node::empty(NodeKind::CondTerm {
-        value: value.to_string(),
+        value: tok.value.clone(),
+        spans: tok.spans.clone(),
     })
 }
 
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 6f8d0ab..35428ae 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -10,7 +10,7 @@ use crate::token::{Token, TokenType};
 
 use helpers::{
     add_stderr_redirect, fill_heredoc_contents, is_fd_number, is_varfd, make_stderr_redirect,
-    parse_heredoc_delimiter, word_node,
+    parse_heredoc_delimiter, word_node, word_node_from_token,
 };
 
 /// Maximum recursion/iteration depth to prevent infinite loops.
@@ -424,11 +424,11 @@ impl Parser {
             // After |, time is a regular word, not a keyword.
             // Temporarily demote it and all following reserved words to words.
             let time_tok = self.lexer.next_token()?;
-            let mut words = vec![word_node(&time_tok.value)];
+            let mut words = vec![word_node_from_token(&time_tok)];
             // Check for -p flag (also a word in this context)
             if self.check_word("-p")? {
                 let p_tok = self.lexer.next_token()?;
-                words.push(word_node(&p_tok.value));
+                words.push(word_node_from_token(&p_tok));
             }
             // Consume all remaining words (including reserved words as plain words)
             let mut redirects = Vec::new();
@@ -456,7 +456,7 @@ impl Parser {
                     continue;
                 }
                 let t = self.lexer.next_token()?;
-                words.push(word_node(&t.value));
+                words.push(word_node_from_token(&t));
             }
             Ok(self.spanned(
                 start,
@@ -568,7 +568,7 @@ impl Parser {
                             return self.parse_function_def(&tok);
                         }
                         let word_span = Span::new(tok.pos, tok.pos + tok.value.len());
-                        let parts = word_parts::decompose_word(&tok.value);
+                        let parts = word_parts::decompose_word_with_spans(&tok.value, &tok.spans);
                         let node = Node::new(
                             NodeKind::Word {
                                 value: tok.value,
@@ -679,7 +679,7 @@ impl Parser {
             start,
             NodeKind::Redirect {
                 op: op_tok.value,
-                target: Box::new(word_node(&target_tok.value)),
+                target: Box::new(word_node_from_token(&target_tok)),
                 fd,
             },
         ))
diff --git a/src/parser/word_parts.rs b/src/parser/word_parts.rs
index e441264..ab7b494 100644
--- a/src/parser/word_parts.rs
+++ b/src/parser/word_parts.rs
@@ -6,7 +6,8 @@
 use std::cell::Cell;
 
 use crate::ast::{ListItem, Node, NodeKind, Span};
-use crate::sexp::word::{WordSegment, parse_word_segments};
+use crate::lexer::word_builder::WordSpan;
+use crate::sexp::word::{WordSegment, segments_from_spans};
 
 thread_local! {
     static DECOMPOSE_DEPTH: Cell<usize> = const { Cell::new(0) };
@@ -40,19 +41,21 @@ impl Drop for DepthGuard {
     }
 }
 
-/// Decomposes a word's raw value into a list of typed AST nodes.
-///
-/// Each segment becomes one of:
-/// - `WordLiteral` for plain text
-/// - `CommandSubstitution` for `$(...)`
-/// - `ProcessSubstitution` for `<(...)` / `>(...)`
-/// - `AnsiCQuote` for `$'...'`
-/// - `LocaleString` for `$"..."`
-pub(super) fn decompose_word(value: &str) -> Vec<Node> {
-    let segments = parse_word_segments(value);
+/// Decomposes a word using lexer spans — the primary path for all
+/// token-derived words.
+pub(super) fn decompose_word_with_spans(value: &str, spans: &[WordSpan]) -> Vec<Node> {
+    let segments = segments_from_spans(value, spans);
     segments.into_iter().map(segment_to_node).collect()
 }
 
+/// Creates a single `WordLiteral` for synthetic values (no lexer token).
+/// Used for fd numbers (`"0"`, `"1"`), synthetic `"$@"`, etc.
+pub(super) fn decompose_word_literal(value: &str) -> Vec<Node> {
+    vec![Node::empty(NodeKind::WordLiteral {
+        value: value.to_string(),
+    })]
+}
+
 fn segment_to_node(seg: WordSegment) -> Node {
     match seg {
         WordSegment::Literal(text) => Node::empty(NodeKind::WordLiteral { value: text }),
@@ -129,9 +132,17 @@ fn wrap_nodes(mut nodes: Vec<Node>) -> Node {
 mod tests {
     use super::*;
 
+    /// Helper: lex a single word and decompose it using the real span path.
+    #[allow(clippy::unwrap_used)]
+    fn decompose(source: &str) -> Vec<Node> {
+        let mut lexer = crate::lexer::Lexer::new(source, false);
+        let tok = lexer.next_token().unwrap();
+        decompose_word_with_spans(&tok.value, &tok.spans)
+    }
+
     #[test]
     fn plain_word() {
-        let parts = decompose_word("echo");
+        let parts = decompose("echo");
         assert_eq!(parts.len(), 1);
         assert!(matches!(
             &parts[0].kind,
@@ -141,7 +152,7 @@ mod tests {
 
     #[test]
     fn simple_variable_stays_literal() {
-        let parts = decompose_word("$foo");
+        let parts = decompose("$foo");
         assert_eq!(parts.len(), 1);
         assert!(matches!(
             &parts[0].kind,
@@ -151,7 +162,7 @@ mod tests {
 
     #[test]
     fn command_substitution() {
-        let parts = decompose_word("$(date)");
+        let parts = decompose("$(date)");
         assert_eq!(parts.len(), 1);
         assert!(matches!(
             &parts[0].kind,
@@ -161,7 +172,7 @@ mod tests {
 
     #[test]
     fn mixed_segments() {
-        let parts = decompose_word("hello$(world)end");
+        let parts = decompose("hello$(world)end");
         assert_eq!(parts.len(), 3);
         assert!(matches!(
             &parts[0].kind,
@@ -179,7 +190,7 @@ mod tests {
 
     #[test]
     fn ansi_c_quote() {
-        let parts = decompose_word("$'foo\\nbar'");
+        let parts = decompose("$'foo\\nbar'");
         assert_eq!(parts.len(), 1);
         assert!(matches!(
             &parts[0].kind,
@@ -189,7 +200,7 @@ mod tests {
 
     #[test]
     fn process_substitution() {
-        let parts = decompose_word("<(cmd)");
+        let parts = decompose("<(cmd)");
         assert_eq!(parts.len(), 1);
         assert!(matches!(
             &parts[0].kind,
@@ -200,7 +211,7 @@ mod tests {
 
     #[test]
     fn locale_string() {
-        let parts = decompose_word("$\"hello\"");
+        let parts = decompose("$\"hello\"");
         assert_eq!(parts.len(), 1);
         assert!(matches!(
             &parts[0].kind,
diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs
index 9faee1a..13e9509 100644
--- a/src/sexp/mod.rs
+++ b/src/sexp/mod.rs
@@ -19,11 +19,12 @@ impl fmt::Display for NodeKind {
         match self {
             Self::Word { value, spans, .. } => {
                 write!(f, "(word \"")?;
-                if !spans.is_empty() && !word::needs_value_path(value) {
+                if spans.is_empty() {
+                    // Synthetic word (no lexer token) — escape directly
+                    write_escaped_word(f, value)?;
+                } else {
                     let segments = word::segments_from_spans(value, spans);
                     word::write_word_segments(f, &segments)?;
-                } else {
-                    word::write_word_value(f, value)?;
                 }
                 write!(f, "\")")
             }
@@ -205,18 +206,19 @@ impl fmt::Display for NodeKind {
                 write!(f, "(cond {body})")?;
                 write_redirects(f, redirects)
             }
-            Self::CondTerm { value } => {
+            Self::CondTerm { value, spans } => {
                 // Strip $" locale prefix
                 let val = if value.starts_with("$\"") {
                     &value[1..]
                 } else {
                     value
                 };
-                // Process cmdsubs within the value (for redirect normalization)
-                // but write other content raw (no escaping — cond-terms use raw values)
-                if val.contains("$(") {
+                let has_cmdsub = spans.iter().any(|s| {
+                    matches!(s.kind, crate::lexer::word_builder::WordSpanKind::CommandSub)
+                });
+                if has_cmdsub {
                     write!(f, "(cond-term \"")?;
-                    let segments = word::parse_word_segments(val);
+                    let segments = word::segments_from_spans(val, spans);
                     for seg in &segments {
                         match seg {
                             word::WordSegment::Literal(text) => write!(f, "{text}")?,
@@ -286,119 +288,6 @@ impl fmt::Display for CasePattern {
 
 // Old write_word_value replaced by word::write_word_value (segment-based)
 
-/// Extracts content from matched parentheses (for command substitution).
-/// Case-aware: `)` in case patterns doesn't close the substitution.
-#[allow(clippy::too_many_lines)]
-pub(super) fn extract_paren_content(chars: &[char], pos: &mut usize) -> String {
-    let mut content = String::new();
-    let mut depth = 1;
-    let mut case = crate::context::CaseTracker::default();
-    let mut word_buf = String::new();
-
-    while *pos < chars.len() {
-        let c = chars[*pos];
-        // Handle backslash-escaped characters
-        if c == '\\' && *pos + 1 < chars.len() {
-            word_buf.clear();
-            content.push(c);
-            *pos += 1;
-            content.push(chars[*pos]);
-            *pos += 1;
-            continue;
-        }
-        if c == '(' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            // In case pattern mode, `(` is optional pattern prefix — don't increment depth
-            if !case.is_pattern_open() {
-                depth += 1;
-            }
-            content.push(c);
-        } else if c == ')' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            if case.is_pattern_close() {
-                // Case pattern terminator — don't close
-                content.push(c);
-                case.close_pattern();
-            } else {
-                depth -= 1;
-                if depth == 0 {
-                    *pos += 1;
-                    return content;
-                }
-                content.push(c);
-            }
-        } else if c == '\'' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            crate::context::skip_single_quoted(chars, pos, &mut content);
-            continue; // skip_single_quoted already advances pos
-        } else if c == '"' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            crate::context::skip_double_quoted(chars, pos, &mut content);
-            continue;
-        } else if c == ';' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            content.push(c);
-            // Check for ;; ;& ;;& which resume case pattern mode
-            if *pos + 1 < chars.len() {
-                if chars[*pos + 1] == ';' {
-                    *pos += 1;
-                    content.push(chars[*pos]);
-                    if *pos + 1 < chars.len() && chars[*pos + 1] == '&' {
-                        *pos += 1;
-                        content.push(chars[*pos]);
-                    }
-                    case.resume_pattern();
-                } else if chars[*pos + 1] == '&' {
-                    *pos += 1;
-                    content.push(chars[*pos]);
-                    case.resume_pattern();
-                }
-            }
-        } else if c == '`' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            crate::context::skip_backtick(chars, pos, &mut content);
-            continue;
-        } else if c == '#' {
-            // Comment: # after whitespace/newline skips to end of line
-            case.check_word(&word_buf);
-            word_buf.clear();
-            let prev = if content.is_empty() {
-                '\n'
-            } else {
-                content.chars().last().unwrap_or('\n')
-            };
-            if prev == '\n' || prev == ' ' || prev == '\t' {
-                while *pos < chars.len() && chars[*pos] != '\n' {
-                    *pos += 1;
-                }
-                // Don't advance past the newline — the main loop will
-                if *pos < chars.len() {
-                    *pos -= 1; // will be incremented by the outer loop
-                }
-            } else {
-                content.push(c);
-            }
-        } else if c == ' ' || c == '\t' || c == '\n' || c == '|' {
-            case.check_word(&word_buf);
-            word_buf.clear();
-            content.push(c);
-        } else {
-            word_buf.push(c);
-            content.push(c);
-        }
-        *pos += 1;
-    }
-    content
-}
-
-// CaseTracker moved to crate::context
-
 /// Normalizes command substitution content:
 /// - Strips leading/trailing whitespace and newlines
 /// - Strips trailing semicolons
@@ -785,30 +674,21 @@ fn write_in_list(f: &mut fmt::Formatter<'_>, words: Option<&[Node]>) -> fmt::Res
 
 fn write_redirect(f: &mut fmt::Formatter<'_>, op: &str, target: &Node, _fd: i32) -> fmt::Result {
     write!(f, "(redirect \"{op}\" ")?;
-    if let NodeKind::Word { value, .. } = &target.kind {
+    if let NodeKind::Word { value, spans, .. } = &target.kind {
         // For fd dup operations (>&, <&), bare digit-only targets are unquoted
         let is_fd_op =
             op.starts_with(">&") || op.starts_with("<&") || op.ends_with("&-") || op.ends_with('&');
         if is_fd_op && !value.is_empty() && value.chars().all(|c| c.is_ascii_digit()) {
             write!(f, "{value})")
-        } else if value.starts_with("$\"") {
-            // Locale string: strip $ prefix, preserve literal quotes
-            write!(f, "\"{}\")", &value[1..])
-        } else if value.starts_with("$'") && !needs_word_processing(value) {
-            // ANSI-C at start with no nested constructs: process with literal newlines
-            let chars: Vec<char> = value.chars().collect();
-            let mut pos = 2; // skip $'
-            let processed = process_ansi_c_content(&chars, &mut pos);
-            // Include any text after the closing quote
-            let remaining: String = chars[pos..].iter().collect();
-            write!(f, "\"'{processed}'{remaining}\")")
-        } else if needs_word_processing(value) {
-            // Values with $'...', $"...", $(...), <(...) need segment processing
+        } else if !spans.is_empty() {
+            // Use span-based formatting — redirect targets use literal
+            // output (no S-expression escaping of quotes)
             write!(f, "\"")?;
-            word::write_word_value(f, value)?;
+            let segments = word::segments_from_spans(value, spans);
+            write_redirect_segments(f, &segments)?;
             write!(f, "\")")
         } else {
-            // Plain values: output as-is inside quotes
+            // Synthetic nodes (fd strings) — output as-is
             write!(f, "\"{value}\")")
         }
     } else {
@@ -816,18 +696,49 @@ fn write_redirect(f: &mut fmt::Formatter<'_>, op: &str, target: &Node, _fd: i32)
     }
 }
 
-/// Returns true if a redirect target value needs word segment processing.
-/// Note: $' and $" at the START are handled specially (literal newlines),
-/// so only mid-value occurrences or $()/<() trigger word processing.
-fn needs_word_processing(value: &str) -> bool {
-    if value.starts_with("$'") || value.starts_with("$\"") {
-        // $' or $" at start handled by special cases — only need word processing
-        // if there's ALSO a $( or <( elsewhere in the value
-        let rest = &value[2..];
-        rest.contains("$(") || rest.contains("<(") || rest.contains(">(")
-    } else {
-        value.contains("$'") || value.contains("$(") || value.contains("<(") || value.contains(">(")
+/// Formats word segments for redirect targets — uses literal output
+/// (no S-expression escaping) for text, but processes ANSI-C escapes
+/// and reformats command substitutions.
+fn write_redirect_segments(
+    f: &mut fmt::Formatter<'_>,
+    segments: &[word::WordSegment],
+) -> fmt::Result {
+    for seg in segments {
+        match seg {
+            word::WordSegment::Literal(text) => write!(f, "{text}")?,
+            word::WordSegment::AnsiCQuote(raw) => {
+                let chars: Vec<char> = raw.chars().collect();
+                let mut pos = 0;
+                let processed = process_ansi_c_content(&chars, &mut pos);
+                write!(f, "'{processed}'")?;
+            }
+            word::WordSegment::LocaleString(content) => {
+                // $"..." → "..." (content already includes "...")
+                write!(f, "{content}")?;
+            }
+            word::WordSegment::CommandSubstitution(content) => {
+                write!(f, "$(")?;
+                if let Some(reformatted) = crate::format::reformat_bash(content) {
+                    write!(f, "{reformatted}")?;
+                } else {
+                    let normalized = normalize_cmdsub_content(content);
+                    write!(f, "{normalized}")?;
+                }
+                write!(f, ")")?;
+            }
+            word::WordSegment::ProcessSubstitution(dir, content) => {
+                write!(f, "{dir}(")?;
+                if let Some(reformatted) = crate::format::reformat_bash(content) {
+                    write!(f, "{reformatted}")?;
+                } else {
+                    let normalized = normalize_cmdsub_content(content);
+                    write!(f, "{normalized}")?;
+                }
+                write!(f, ")")?;
+            }
+        }
     }
+    Ok(())
 }
 
 fn write_param(
diff --git a/src/sexp/word.rs b/src/sexp/word.rs
index 489dcb1..d22b12c 100644
--- a/src/sexp/word.rs
+++ b/src/sexp/word.rs
@@ -9,8 +9,8 @@ use std::fmt;
 use crate::format;
 
 use super::{
-    ansi_c::process_ansi_c_content, extract_paren_content, normalize_cmdsub_content,
-    write_escaped_char, write_escaped_word,
+    ansi_c::process_ansi_c_content, normalize_cmdsub_content, write_escaped_char,
+    write_escaped_word,
 };
 
 /// A typed segment within a word token's raw value.
@@ -28,150 +28,6 @@ pub enum WordSegment {
     ProcessSubstitution(char, String),
 }
 
-/// Parses a word's raw value into typed segments.
-#[allow(clippy::too_many_lines)]
-pub fn parse_word_segments(value: &str) -> Vec<WordSegment> {
-    let chars: Vec<char> = value.chars().collect();
-    let mut segments = Vec::new();
-    let mut i = 0;
-    let mut literal = String::new();
-    let mut brace_depth = 0usize; // Track ${...} nesting
-    let mut in_double_quote = false; // Track "..." context
-
-    while i < chars.len() {
-        let prev_backslash = crate::context::is_backslash_escaped(&chars, i);
-
-        // Track double-quote context (not inside ${...} where quotes nest differently)
-        if chars[i] == '"' && !prev_backslash && brace_depth == 0 {
-            in_double_quote = !in_double_quote;
-            literal.push(chars[i]);
-            i += 1;
-            continue;
-        }
-
-        // Track closing braces for ${...}
-        if chars[i] == '}' && brace_depth > 0 {
-            brace_depth -= 1;
-            literal.push(chars[i]);
-            i += 1;
-            continue;
-        }
-
-        // Single quotes are opaque — nothing is special inside '...'
-        if chars[i] == '\'' && !prev_backslash && !in_double_quote {
-            literal.push(chars[i]);
-            i += 1;
-            while i < chars.len() && chars[i] != '\'' {
-                literal.push(chars[i]);
-                i += 1;
-            }
-            if i < chars.len() {
-                literal.push(chars[i]); // closing '
-                i += 1;
-            }
-            continue;
-        }
-
-        // Backticks are opaque — don't process $'...' or $() inside them
-        if chars[i] == '`' && !prev_backslash {
-            literal.push(chars[i]);
-            i += 1;
-            while i < chars.len() && chars[i] != '`' {
-                if chars[i] == '\\' && i + 1 < chars.len() {
-                    literal.push(chars[i]);
-                    i += 1;
-                }
-                literal.push(chars[i]);
-                i += 1;
-            }
-            if i < chars.len() {
-                literal.push(chars[i]); // closing `
-                i += 1;
-            }
-            continue;
-        }
-
-        if chars[i] == '$' && !prev_backslash && i + 1 < chars.len() {
-            match chars[i + 1] {
-                '\'' => {
-                    if in_double_quote && brace_depth == 0 {
-                        // $'...' is NOT special inside double quotes — literal
-                        // (but IS special inside ${...} even when double-quoted)
-                        literal.push(chars[i]);
-                        i += 1;
-                        continue;
-                    }
-                    if brace_depth > 0 {
-                        // ANSI-C inside ${...} — process escapes, output without quotes
-                        i += 2; // skip $'
-                        let content = extract_ansi_c_content(&chars, &mut i);
-                        let ac_chars: Vec<char> = content.chars().collect();
-                        let mut pos = 0;
-                        let processed = super::process_ansi_c_content(&ac_chars, &mut pos);
-                        literal.push_str(&processed);
-                    } else {
-                        flush_literal(&mut literal, &mut segments);
-                        i += 2; // skip $'
-                        let content = extract_ansi_c_content(&chars, &mut i);
-                        segments.push(WordSegment::AnsiCQuote(content));
-                    }
-                }
-                '"' => {
-                    if in_double_quote {
-                        // $" inside double quotes is literal (not a locale string)
-                        literal.push(chars[i]);
-                        i += 1;
-                    } else {
-                        flush_literal(&mut literal, &mut segments);
-                        i += 1; // skip $, keep "
-                        let content = extract_locale_content(&chars, &mut i);
-                        segments.push(WordSegment::LocaleString(content));
-                    }
-                }
-                '{' => {
-                    literal.push(chars[i]);
-                    literal.push(chars[i + 1]);
-                    i += 2;
-                    brace_depth += 1;
-                }
-                '(' => {
-                    // $(( )) arithmetic — treat as literal (don't reformat)
-                    if i + 2 < chars.len() && chars[i + 2] == '(' {
-                        literal.push(chars[i]);
-                        i += 1;
-                        continue;
-                    }
-                    flush_literal(&mut literal, &mut segments);
-                    i += 2; // skip $(
-                    let content = extract_paren_content(&chars, &mut i);
-                    segments.push(WordSegment::CommandSubstitution(content));
-                }
-                _ => {
-                    literal.push(chars[i]);
-                    i += 1;
-                }
-            }
-        } else if (chars[i] == '>' || chars[i] == '<')
-            && !prev_backslash
-            && !in_double_quote
-            && i + 1 < chars.len()
-            && chars[i + 1] == '('
-        {
-            let direction = chars[i];
-            flush_literal(&mut literal, &mut segments);
-            i += 2; // skip >( or <(
-            let content = extract_paren_content(&chars, &mut i);
-            segments.push(WordSegment::ProcessSubstitution(direction, content));
-        } else {
-            literal.push(chars[i]);
-            i += 1;
-        }
-    }
-
-    flush_literal(&mut literal, &mut segments);
-    segments
-}
-
 /// Formats word segments into S-expression output.
 pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) -> fmt::Result {
     for seg in segments {
@@ -232,13 +88,6 @@ pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment])
     Ok(())
 }
 
-/// The public entry point — formats a word value via segment parsing.
-/// Used as fallback when spans are not available (synthetic word nodes).
-pub fn write_word_value(f: &mut fmt::Formatter<'_>, value: &str) -> fmt::Result {
-    let segments = parse_word_segments(value);
-    write_word_segments(f, &segments)
-}
-
 /// Converts lexer spans to segments without re-parsing the value.
 ///
 /// Filters to top-level sexp-relevant spans (not contained within
@@ -344,13 +193,6 @@ fn push_literal(segments: &mut Vec<WordSegment>, text: &str) {
     }
 }
 
-/// Returns true if the word value requires the value-based formatting
-/// path. Currently always returns false — all cases are handled by
-/// the span-based path or by lex-time normalization.
-pub const fn needs_value_path(_value: &str) -> bool {
-    false
-}
-
 const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bool {
     use crate::lexer::word_builder::WordSpanKind;
     matches!(
@@ -379,114 +221,3 @@ fn collect_top_level_sexp_spans(
         .copied()
         .collect()
 }
-
-// -- helpers --
-
-fn flush_literal(literal: &mut String, segments: &mut Vec<WordSegment>) {
-    if !literal.is_empty() {
-        segments.push(WordSegment::Literal(std::mem::take(literal)));
-    }
-}
-
-/// Extracts ANSI-C content between `'...'` (after `$'` was consumed).
-fn extract_ansi_c_content(chars: &[char], pos: &mut usize) -> String {
-    let mut content = String::new();
-    while *pos < chars.len() {
-        let c = chars[*pos];
-        if c == '\'' {
-            *pos += 1; // skip closing '
-            return content;
-        }
-        if c == '\\' && *pos + 1 < chars.len() {
-            content.push('\\');
-            *pos += 1;
-            content.push(chars[*pos]);
-        } else {
-            content.push(c);
-        }
-        *pos += 1;
-    }
-    content
-}
-
-/// Extracts locale string content `"..."` (after `$` was skipped, starting at `"`).
-fn extract_locale_content(chars: &[char], pos: &mut usize) -> String {
-    let mut content = String::new();
-    // Read opening " and content until closing "
-    let start = *pos;
-    while *pos < chars.len() {
-        content.push(chars[*pos]);
-        if chars[*pos] == '"' && *pos > start {
-            *pos += 1;
-            return content;
-        }
-        *pos += 1;
-    }
-    content
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn plain_word() {
-        let segs = parse_word_segments("hello");
-        assert_eq!(segs, vec![WordSegment::Literal("hello".into())]);
-    }
-
-    #[test]
-    fn ansi_c_quote() {
-        let segs = parse_word_segments("$'foo\\nbar'");
-        assert_eq!(segs, vec![WordSegment::AnsiCQuote("foo\\nbar".into())]);
-    }
-
-    #[test]
-    fn locale_string() {
-        let segs = parse_word_segments("$\"hello\"");
-        assert_eq!(segs, vec![WordSegment::LocaleString("\"hello\"".into())]);
-    }
-
-    #[test]
-    fn command_substitution() {
-        let segs = parse_word_segments("$(date)");
-        assert_eq!(segs, vec![WordSegment::CommandSubstitution("date".into())]);
-    }
-
-    #[test]
-    fn arithmetic_stays_literal() {
-        let segs = parse_word_segments("$((1+2))");
-        assert_eq!(segs, vec![WordSegment::Literal("$((1+2))".into())]);
-    }
-
-    #[test]
-    fn mixed_segments() {
-        let segs = parse_word_segments("hello$(world)$'foo'");
-        assert_eq!(
-            segs,
-            vec![
-                WordSegment::Literal("hello".into()),
-                WordSegment::CommandSubstitution("world".into()),
-                WordSegment::AnsiCQuote("foo".into()),
-            ]
-        );
-    }
-
-    #[test]
-    fn escaped_dollar_not_special() {
-        let segs = parse_word_segments("\\$'not-ansi'");
-        assert_eq!(segs, vec![WordSegment::Literal("\\$'not-ansi'".into())]);
-    }
-
-    #[test]
-    fn assignment_with_cmdsub() {
-        let segs = parse_word_segments("x=$(cmd)");
-        assert_eq!(
-            segs,
-            vec![
-                WordSegment::Literal("x=".into()),
-                WordSegment::CommandSubstitution("cmd".into()),
-            ]
-        );
-    }
-}

From cec7e8e5f8a245b15211598f21f1b5fedd6081d4 Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Wed, 25 Mar 2026 13:56:02 +0100
Subject: [PATCH 3/5] refactor: simplify span collection, move to owned tokens,
 remove dead code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix O(n²) collect_top_level_sexp_spans: sort by start offset then
  single-pass sweep (was using ptr::eq containment check)
- Move word_node_from_token/cond_term_from_token to take owned Token,
  eliminating String+Vec clones per word
- Remove stale #[allow(dead_code)] annotations on Token.spans,
  Token::with_spans, and blanket module-level allow on word_builder
- Remove unused WordBuilder::push_str method
- Fix CondTerm Display to handle all segment types via
  write_redirect_segments instead of debug-formatting unknown segments
- Fix locale string context handling for $"..." inside words

Oracle improved: 169/181 (was 167).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ast.rs                |  2 --
 src/lexer/expansions.rs   |  4 ++--
 src/lexer/word_builder.rs |  5 -----
 src/parser/compound.rs    | 10 +++++-----
 src/parser/conditional.rs |  8 ++++----
 src/parser/helpers.rs     | 19 ++++++++++---------
 src/parser/mod.rs         |  8 ++++----
 src/sexp/mod.rs           | 37 ++++++++++++++-----------------------
 src/sexp/word.rs          | 28 +++++++++++++++-------------
 src/token.rs              |  2 --
 10 files changed, 54 insertions(+), 69 deletions(-)

diff --git a/src/ast.rs b/src/ast.rs
index 79d6e52..3fff3d4 100644
--- a/src/ast.rs
+++ b/src/ast.rs
@@ -75,7 +75,6 @@ pub enum NodeKind {
     Word {
         value: String,
         parts: Vec<Node>,
-        #[allow(dead_code)]
         spans: Vec<crate::lexer::word_builder::WordSpan>,
     },
 
@@ -328,7 +327,6 @@ pub enum NodeKind {
     /// A term (word) in a conditional expression.
     CondTerm {
         value: String,
-        #[allow(dead_code)]
         spans: Vec<crate::lexer::word_builder::WordSpan>,
     },
 
diff --git a/src/lexer/expansions.rs b/src/lexer/expansions.rs
index 96761da..5ba00f6 100644
--- a/src/lexer/expansions.rs
+++ b/src/lexer/expansions.rs
@@ -64,8 +64,8 @@ impl Lexer {
         self.read_matched_parens(wb, 1)?;
         let content_end = wb.len().saturating_sub(1);
         if content_start < content_end {
-            let content: String = wb.value[content_start..content_end].to_string();
-            if !content.trim().is_empty() && crate::parse(&content, self.extglob()).is_err() {
+            let content = &wb.value[content_start..content_end];
+            if !content.trim().is_empty() && crate::parse(content, self.extglob()).is_err() {
                 return Err(RableError::parse(
                     "invalid command substitution",
                     self.pos,
diff --git a/src/lexer/word_builder.rs b/src/lexer/word_builder.rs
index 7e3279a..a8298d1 100644
--- a/src/lexer/word_builder.rs
+++ b/src/lexer/word_builder.rs
@@ -4,7 +4,6 @@
 //! that record where each expansion starts and ends, along with the
 //! quoting context at the point of recording. This eliminates the need
 //! for downstream code to re-parse word values.
-#![allow(dead_code)]
 
 /// The quoting context at the point where a span was recorded.
 ///
@@ -52,10 +51,6 @@ impl WordBuilder {
         self.value.push(c);
     }
 
-    pub fn push_str(&mut self, s: &str) {
-        self.value.push_str(s);
-    }
-
     pub fn ends_with(&self, c: char) -> bool {
         self.value.ends_with(c)
     }
diff --git a/src/parser/compound.rs b/src/parser/compound.rs
index 1be8779..baf4ce7 100644
--- a/src/parser/compound.rs
+++ b/src/parser/compound.rs
@@ -148,7 +148,7 @@ impl Parser {
                     break;
                 }
                 let tok = self.lexer.next_token()?;
-                ws.push(word_node_from_token(&tok));
+                ws.push(word_node_from_token(tok));
             }
             Some(ws)
         } else {
@@ -292,7 +292,7 @@ impl Parser {
             if tok.kind == TokenType::Pipe {
                 continue;
             }
-            pattern_words.push(word_node_from_token(&tok));
+            pattern_words.push(word_node_from_token(tok));
         }
 
         self.skip_newlines()?;
@@ -351,7 +351,7 @@ impl Parser {
                     break;
                 }
                 let tok = self.lexer.next_token()?;
-                ws.push(word_node_from_token(&tok));
+                ws.push(word_node_from_token(tok));
             }
             Some(ws)
         } else {
@@ -516,7 +516,7 @@ impl Parser {
         } else {
             None
         };
-        let mut words = vec![word_node_from_token(&first_tok)];
+        let mut words = vec![word_node_from_token(first_tok)];
         let mut redirects = Vec::new();
         loop {
             if self.at_end()? {
@@ -532,7 +532,7 @@ impl Parser {
                 if is_fd_number(&tok.value) && self.is_redirect_operator()? {
                     redirects.push(self.parse_redirect_with_fd(&tok)?);
                 } else {
-                    words.push(word_node_from_token(&tok));
+                    words.push(word_node_from_token(tok));
                 }
             } else {
                 break;
diff --git a/src/parser/conditional.rs b/src/parser/conditional.rs
index 2860e86..a360710 100644
--- a/src/parser/conditional.rs
+++ b/src/parser/conditional.rs
@@ -107,7 +107,7 @@ impl Parser {
                 start,
                 NodeKind::UnaryTest {
                     op: first.value,
-                    operand: Box::new(cond_term_from_token(&operand_tok)),
+                    operand: Box::new(cond_term_from_token(operand_tok)),
                 },
             ));
         }
@@ -128,8 +128,8 @@ impl Parser {
                     start,
                     NodeKind::BinaryTest {
                         op: op.value,
-                        left: Box::new(cond_term_from_token(&first)),
-                        right: Box::new(cond_term_from_token(&right)),
+                        left: Box::new(cond_term_from_token(first)),
+                        right: Box::new(cond_term_from_token(right)),
                     },
                 ));
             }
@@ -140,7 +140,7 @@ impl Parser {
             start,
             NodeKind::UnaryTest {
                 op: "-n".to_string(),
-                operand: Box::new(cond_term_from_token(&first)),
+                operand: Box::new(cond_term_from_token(first)),
             },
         ))
     }
diff --git a/src/parser/helpers.rs b/src/parser/helpers.rs
index 3ff2149..72e42dd 100644
--- a/src/parser/helpers.rs
+++ b/src/parser/helpers.rs
@@ -3,12 +3,13 @@
 use crate::ast::{Node, NodeKind};
 use crate::token::Token;
 
-/// Creates a `Word` node from a lexer token, preserving spans.
-pub fn word_node_from_token(tok: &Token) -> Node {
+/// Creates a `Word` node from a lexer token, moving value and spans.
+pub fn word_node_from_token(tok: Token) -> Node {
+    let parts = super::word_parts::decompose_word_with_spans(&tok.value, &tok.spans);
     Node::empty(NodeKind::Word {
-        parts: super::word_parts::decompose_word_with_spans(&tok.value, &tok.spans),
-        value: tok.value.clone(),
-        spans: tok.spans.clone(),
+        parts,
+        value: tok.value,
+        spans: tok.spans,
     })
 }
 
@@ -21,11 +22,11 @@ pub fn word_node(value: &str) -> Node {
     })
 }
 
-/// Creates a `cond-term` node from a lexer token, preserving spans.
-pub(super) fn cond_term_from_token(tok: &Token) -> Node {
+/// Creates a `cond-term` node from a lexer token, moving value and spans.
+pub(super) fn cond_term_from_token(tok: Token) -> Node {
     Node::empty(NodeKind::CondTerm {
-        value: tok.value.clone(),
-        spans: tok.spans.clone(),
+        value: tok.value,
+        spans: tok.spans,
     })
 }
 
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 35428ae..823b2df 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -424,11 +424,11 @@ impl Parser {
             // After |, time is a regular word, not a keyword.
             // Temporarily demote it and all following reserved words to words.
             let time_tok = self.lexer.next_token()?;
-            let mut words = vec![word_node_from_token(&time_tok)];
+            let mut words = vec![word_node_from_token(time_tok)];
             // Check for -p flag (also a word in this context)
             if self.check_word("-p")? {
                 let p_tok = self.lexer.next_token()?;
-                words.push(word_node_from_token(&p_tok));
+                words.push(word_node_from_token(p_tok));
             }
             // Consume all remaining words (including reserved words as plain words)
             let mut redirects = Vec::new();
@@ -456,7 +456,7 @@ impl Parser {
                     continue;
                 }
                 let t = self.lexer.next_token()?;
-                words.push(word_node_from_token(&t));
+                words.push(word_node_from_token(t));
             }
             Ok(self.spanned(
                 start,
@@ -679,7 +679,7 @@ impl Parser {
             start,
             NodeKind::Redirect {
                 op: op_tok.value,
-                target: Box::new(word_node_from_token(&target_tok)),
+                target: Box::new(word_node_from_token(target_tok)),
                 fd,
             },
         ))
diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs
index 13e9509..b41a637 100644
--- a/src/sexp/mod.rs
+++ b/src/sexp/mod.rs
@@ -213,31 +213,22 @@ impl fmt::Display for NodeKind {
                 } else {
                     value
                 };
-                let has_cmdsub = spans.iter().any(|s| {
-                    matches!(s.kind, crate::lexer::word_builder::WordSpanKind::CommandSub)
-                });
-                if has_cmdsub {
-                    write!(f, "(cond-term \"")?;
+                if spans.is_empty() {
+                    write!(f, "(cond-term \"{val}\")")
+                } else {
                     let segments = word::segments_from_spans(val, spans);
-                    for seg in &segments {
-                        match seg {
-                            word::WordSegment::Literal(text) => write!(f, "{text}")?,
-                            word::WordSegment::CommandSubstitution(content) => {
-                                write!(f, "$(")?;
-                                if let Some(reformatted) = crate::format::reformat_bash(content) {
-                                    write!(f, "{reformatted}")?;
-                                } else {
-                                    let normalized = normalize_cmdsub_content(content);
-                                    write!(f, "{normalized}")?;
-                                }
-                                write!(f, ")")?;
-                            }
-                            _ => write!(f, "{seg:?}")?,
-                        }
+                    if segments
+                        .iter()
+                        .all(|s| matches!(s, word::WordSegment::Literal(_)))
+                    {
+                        // No expansions needing processing
+                        write!(f, "(cond-term \"{val}\")")
+                    } else {
+                        write!(f, "(cond-term \"")?;
+                        // CondTerm uses raw output (like redirect targets)
+                        write_redirect_segments(f, &segments)?;
+                        write!(f, "\")")
                     }
-                    write!(f, "\")")
-                } else {
-                    write!(f, "(cond-term \"{val}\")")
                 }
             }
             Self::UnaryTest { op, operand } => {
diff --git a/src/sexp/word.rs b/src/sexp/word.rs
index d22b12c..e1fc31b 100644
--- a/src/sexp/word.rs
+++ b/src/sexp/word.rs
@@ -204,20 +204,22 @@ const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bo
     )
 }
 
-/// Collects sexp-relevant spans that are not nested inside another
-/// sexp-relevant span. Sorted by start offset.
+/// Collects top-level sexp-relevant spans sorted by start offset.
+/// A span is top-level if no other sexp-relevant span fully contains it.
 fn collect_top_level_sexp_spans(
     spans: &[crate::lexer::word_builder::WordSpan],
 ) -> Vec<&crate::lexer::word_builder::WordSpan> {
-    let relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect();
-    relevant
-        .iter()
-        .filter(|s| {
-            // A span is top-level if no other relevant span contains it
-            !relevant.iter().any(|outer| {
-                outer.start <= s.start && outer.end >= s.end && !std::ptr::eq(*outer, **s)
-            })
-        })
-        .copied()
-        .collect()
+    // Collect sexp-relevant spans sorted by start offset
+    let mut relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect();
+    relevant.sort_by_key(|s| s.start);
+    // Single pass: skip spans nested inside a previously collected one
+    let mut result = Vec::new();
+    let mut covered_until: usize = 0;
+    for span in relevant {
+        if span.start >= covered_until {
+            covered_until = span.end;
+            result.push(span);
+        }
+    }
+    result
 }
diff --git a/src/token.rs b/src/token.rs
index 1187ba1..c8e7c3b 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -151,7 +151,6 @@ pub struct Token {
     pub pos: usize,
     pub line: usize,
     /// Expansion spans within the word value (empty for non-word tokens).
-    #[allow(dead_code)]
     pub(crate) spans: Vec<WordSpan>,
 }
 
@@ -167,7 +166,6 @@ impl Token {
     }
 
     /// Creates a word token with pre-recorded expansion spans.
-    #[allow(dead_code)]
     pub(crate) const fn with_spans(
         kind: TokenType,
         value: String,

From 5ecfd6926b0438bf579a1de0cd09c7bc5061c713 Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Wed, 25 Mar 2026 13:59:57 +0100
Subject: [PATCH 4/5] test: add assertion-based oracle tracking with
 KNOWN_ORACLE_FAILURES

Replace the non-asserting oracle test with one that tracks known
failures explicitly. The test now fails on:
- Regressions (previously passing test now fails)
- Newly passing tests (update KNOWN_ORACLE_FAILURES to track progress)

12 known failures documented with root cause comments.
Oracle: 169/181.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 tests/integration.rs | 138 +++++++++++++++++++++++++++++++------------
 1 file changed, 100 insertions(+), 38 deletions(-)

diff --git a/tests/integration.rs b/tests/integration.rs
index bc28ba7..b46a720 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -199,9 +199,77 @@ parable_tests! {
     parable_35_parser_bugs          => "35_parser_bugs.tests",
 }
 
+/// Known oracle test failures — tests that don't match bash-oracle output yet.
+/// When a fix makes one of these pass, the test suite will fail with
+/// "NEWLY PASSING" so you know to remove it from this list.
+const KNOWN_ORACLE_FAILURES: &[&str] = &[
+    // Trailing backslash doubling
+    "ansi_c_escapes 3",
+    "redirect_formatting 3",
+    "heredoc_formatting 1",
+    // ANSI-C \x single hex digit and \0 octal repeat behavior
+    "ansi_c_escapes 13",
+    "other 10",
+    // Heredoc delimiter edge cases
+    "ansi_c_escapes 18",
+    "heredoc_formatting 8",
+    // Varfd {6d} not recognized → word dropped
+    "heredoc_formatting 9",
+    // Coproc with adjacent redirect
+    "redirect_formatting 7",
+    // Background & placement after heredoc in cmdsub
+    "cmdsub_formatting 9",
+    // Deprecated $[...] with ; splits word
+    "word_boundaries 8",
+    // Assignment detection causes esac to be keyword
+    "word_boundaries 2",
+];
+
+struct OracleResults {
+    total_pass: usize,
+    total_fail: usize,
+    regressions: Vec<String>,
+    newly_passing: Vec<String>,
+}
+
+fn run_oracle_file(path: &Path, results: &mut OracleResults) {
+    let file_name = path.file_name().unwrap_or_default().to_string_lossy();
+    let content = fs::read_to_string(path).unwrap_or_default();
+    let cases = parse_test_file(&content);
+    let mut pass = 0;
+    let mut fail = 0;
+
+    for case in &cases {
+        let (passed, actual) = run_test(case);
+        let is_known = KNOWN_ORACLE_FAILURES.contains(&case.name.as_str());
+        if passed {
+            pass += 1;
+            if is_known {
+                results.newly_passing.push(format!(
+                    "  NEWLY PASSING :: {} — remove from KNOWN_ORACLE_FAILURES",
+                    case.name,
+                ));
+            }
+        } else {
+            fail += 1;
+            if !is_known {
+                results.regressions.push(format!(
+                    "  REGRESSION :: {}\n    input:    {:?}\n    expected: {:?}\n    actual:   {:?}",
+                    case.name, case.input, case.expected, actual,
+                ));
+            }
+        }
+    }
+
+    let total = pass + fail;
+    let status = if fail == 0 { "OK" } else { "FAIL" };
+    eprintln!("  {file_name}: {pass}/{total} passed [{status}]");
+    results.total_pass += pass;
+    results.total_fail += fail;
+}
+
 /// Oracle-derived tests: correctness differences found by fuzzing against bash-oracle.
-/// These tests track progress toward full bash compatibility beyond Parable's test suite.
-/// This test reports results but does NOT fail the build.
+/// Asserts that known failures still fail and known passes don't regress.
 #[test]
 fn oracle_test_suite() {
     let test_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/oracle");
@@ -213,8 +281,12 @@ fn oracle_test_suite() {
     eprintln!("\n=== Oracle Test Suite (bash-oracle compatibility) ===\n");
 
     let filter = std::env::var("RABLE_TEST").ok();
-    let mut total_pass = 0;
-    let mut total_fail = 0;
+    let mut results = OracleResults {
+        total_pass: 0,
+        total_fail: 0,
+        regressions: Vec::new(),
+        newly_passing: Vec::new(),
+    };
 
     let mut entries: Vec<_> = fs::read_dir(&test_dir)
         .unwrap_or_else(|_| {
@@ -229,44 +301,34 @@ fn oracle_test_suite() {
 
     for entry in entries {
         let path = entry.path();
-        let file_name = path.file_name().unwrap_or_default().to_string_lossy();
-
-        if let Some(ref f) = filter
-            && !file_name.contains(f.as_str())
-        {
+        let name = path.file_name().unwrap_or_default().to_string_lossy();
+        if filter.as_ref().is_some_and(|f| !name.contains(f.as_str())) {
             continue;
         }
+        run_oracle_file(&path, &mut results);
+    }
 
-        let content = fs::read_to_string(&path).unwrap_or_default();
-        let cases = parse_test_file(&content);
-        let mut pass = 0;
-        let mut fail = 0;
+    let total = results.total_pass + results.total_fail;
+    eprintln!(
+        "\n=== Oracle Results: {}/{total} passed ({} remaining) ===\n",
+        results.total_pass, results.total_fail
+    );
 
-        let mut failures = Vec::new();
-        for case in &cases {
-            let (passed, actual) = run_test(case);
-            if passed {
-                pass += 1;
-            } else {
-                fail += 1;
-                failures.push(format!(
-                    "    FAIL :: {}\n      input:    {:?}\n      expected: {:?}\n      actual:   {:?}",
-                    case.name, case.input, case.expected, actual,
-                ));
-            }
-        }
-
-        let total = pass + fail;
-        let status = if fail == 0 { "OK" } else { "FAIL" };
-        eprintln!("  {file_name}: {pass}/{total} passed [{status}]");
-        for f in &failures {
-            eprintln!("{f}");
-        }
-        total_pass += pass;
-        total_fail += fail;
+    for msg in &results.newly_passing {
+        eprintln!("{msg}");
+    }
+    for msg in &results.regressions {
+        eprintln!("{msg}");
     }
 
-    let total = total_pass + total_fail;
-    eprintln!("\n=== Oracle Results: {total_pass}/{total} passed ({total_fail} remaining) ===\n");
-    // Intentionally does not assert — these are aspirational targets
+    assert!(
+        results.regressions.is_empty(),
+        "{} oracle test regression(s)",
+        results.regressions.len()
+    );
+    assert!(
+        results.newly_passing.is_empty(),
+        "{} oracle test(s) newly passing — update KNOWN_ORACLE_FAILURES",
+        results.newly_passing.len()
+    );
 }

From b768e979826344654d137d83bb677f16abe1fe64 Mon Sep 17 00:00:00 2001
From: Matjaz Domen Pecan <matjaz.pecan@gmail.com>
Date: Wed, 25 Mar 2026 14:01:13 +0100
Subject: [PATCH 5/5] test: generate per-file oracle test functions via macro

Each oracle .tests file now has its own test function generated by the
oracle_test! macro (e.g., oracle_ansi_c_escapes, oracle_heredoc_formatting).
Each test individually asserts no regressions and no newly passing tests.

This replaces the single oracle_test_suite function, giving specific
line-level feedback on which oracle category passes or fails.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 tests/integration.rs | 115 ++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 62 deletions(-)

diff --git a/tests/integration.rs b/tests/integration.rs
index b46a720..37ad57d 100644
--- a/tests/integration.rs
+++ b/tests/integration.rs
@@ -225,6 +225,7 @@ const KNOWN_ORACLE_FAILURES: &[&str] = &[
     "word_boundaries 2",
 ];
 
+#[derive(Default)]
 struct OracleResults {
     total_pass: usize,
     total_fail: usize,
@@ -268,67 +269,57 @@ fn run_oracle_file(path: &Path, results: &mut OracleResults) {
     results.total_fail += fail;
 }
 
-/// Oracle-derived tests: correctness differences found by fuzzing against bash-oracle.
-/// Asserts that known failures still fail and known passes don't regress.
-#[test]
-fn oracle_test_suite() {
-    let test_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/oracle");
-    if !test_dir.exists() {
-        eprintln!("Skipping: tests/oracle/ directory not found");
-        return;
-    }
-
-    eprintln!("\n=== Oracle Test Suite (bash-oracle compatibility) ===\n");
-
-    let filter = std::env::var("RABLE_TEST").ok();
-    let mut results = OracleResults {
-        total_pass: 0,
-        total_fail: 0,
-        regressions: Vec::new(),
-        newly_passing: Vec::new(),
-    };
-
-    let mut entries: Vec<_> = fs::read_dir(&test_dir)
-        .unwrap_or_else(|_| {
-            eprintln!("Warning: test directory not found: {}", test_dir.display());
-            std::process::exit(0);
-        })
-        .filter_map(Result::ok)
-        .filter(|e| e.path().extension().is_some_and(|ext| ext == "tests"))
-        .collect();
-
-    entries.sort_by_key(std::fs::DirEntry::file_name);
-
-    for entry in entries {
-        let path = entry.path();
-        let name = path.file_name().unwrap_or_default().to_string_lossy();
-        if filter.as_ref().is_some_and(|f| !name.contains(f.as_str())) {
-            continue;
+/// Generates a test function for each oracle test file.
+/// Each test asserts no regressions and no newly passing tests
+/// (which would need `KNOWN_ORACLE_FAILURES` updated).
+macro_rules! oracle_test {
+    ($name:ident, $file:expr) => {
+        #[test]
+        fn $name() {
+            let path = Path::new(env!("CARGO_MANIFEST_DIR"))
+                .join("tests/oracle")
+                .join($file);
+            if !path.exists() {
+                eprintln!("Skipping: {} not found", $file);
+                return;
+            }
+            let mut results = OracleResults::default();
+            run_oracle_file(&path, &mut results);
+            let total = results.total_pass + results.total_fail;
+            eprintln!(
+                "  {}: {}/{total} passed ({} remaining)",
+                $file, results.total_pass, results.total_fail,
+            );
+            for msg in &results.newly_passing {
+                eprintln!("{msg}");
+            }
+            for msg in &results.regressions {
+                eprintln!("{msg}");
+            }
+            assert!(
+                results.regressions.is_empty(),
+                "{}: {} regression(s)",
+                $file,
+                results.regressions.len()
+            );
+            assert!(
+                results.newly_passing.is_empty(),
+                "{}: {} newly passing — update KNOWN_ORACLE_FAILURES",
+                $file,
+                results.newly_passing.len()
+            );
         }
-        run_oracle_file(&path, &mut results);
-    }
-
-    let total = results.total_pass + results.total_fail;
-    eprintln!(
-        "\n=== Oracle Results: {}/{total} passed ({} remaining) ===\n",
-        results.total_pass, results.total_fail
-    );
-
-    for msg in &results.newly_passing {
-        eprintln!("{msg}");
-    }
-    for msg in &results.regressions {
-        eprintln!("{msg}");
-    }
-
-    assert!(
-        results.regressions.is_empty(),
-        "{} oracle test regression(s)",
-        results.regressions.len()
-    );
-    assert!(
-        results.newly_passing.is_empty(),
-        "{} oracle test(s) newly passing — update KNOWN_ORACLE_FAILURES",
-        results.newly_passing.len()
-    );
+    };
 }
+
+oracle_test!(oracle_ansi_c_escapes, "ansi_c_escapes.tests");
+oracle_test!(oracle_ansi_c_processing, "ansi_c_processing.tests");
+oracle_test!(oracle_array_normalization, "array_normalization.tests");
+oracle_test!(oracle_cmdsub_formatting, "cmdsub_formatting.tests");
+oracle_test!(oracle_heredoc_formatting, "heredoc_formatting.tests");
+oracle_test!(oracle_locale_strings, "locale_strings.tests");
+oracle_test!(oracle_other, "other.tests");
+oracle_test!(oracle_procsub_formatting, "procsub_formatting.tests");
+oracle_test!(oracle_redirect_formatting, "redirect_formatting.tests");
+oracle_test!(oracle_top_level_separation, "top_level_separation.tests");
+oracle_test!(oracle_word_boundaries, "word_boundaries.tests");