diff --git a/src/bin/doodle/format/text.rs b/src/bin/doodle/format/text.rs index 3c2fb2f9..fd913757 100644 --- a/src/bin/doodle/format/text.rs +++ b/src/bin/doodle/format/text.rs @@ -132,15 +132,17 @@ pub fn main(module: &mut FormatModule, base: &BaseModule) -> FormatRef { ), ); - let ascii_str = module.define_format("text.string.ascii", repeat1(base.ascii_char_strict())); - let utf8_str = module.define_format("text.string.utf8", repeat(utf8_char.call())); + let ascii_char = module.define_format( + "text.char.ascii", + Format::Map( + Box::new(base.ascii_char_strict()), + Expr::Lambda("byte".into(), Box::new(Expr::AsChar(Box::new(var("byte"))))), + ), + ); module.define_format( "text.string", - Format::UnionNondet(vec![ - ("ascii".into(), ascii_str.call()), - ("utf8".into(), utf8_str.call()), - ]), + Format::RepeatFallback(Box::new(ascii_char.call()), Box::new(utf8_char.call())), ) } diff --git a/src/decoder.rs b/src/decoder.rs index 4ee93fc0..c4e027c2 100644 --- a/src/decoder.rs +++ b/src/decoder.rs @@ -23,6 +23,7 @@ pub enum Value { Mapped(Box, Box), Branch(usize, Box), Format(Box), + Fallback(bool, Box), } impl Value { @@ -431,6 +432,7 @@ enum Decoder { Record(Vec<(Cow<'static, str>, Decoder)>), While(MatchTree, Box), Until(MatchTree, Box), + RepeatFallback(MatchTree, Box, Box), RepeatCount(Expr, Box), RepeatUntilLast(Expr, Box), RepeatUntilSeq(Expr, Box), @@ -858,6 +860,38 @@ impl Decoder { Err(format!("cannot build match tree for {:?}", format)) } } + Format::RepeatFallback(narrow, wide) => { + if narrow.is_nullable(compiler.module) || wide.is_nullable(compiler.module) { + return Err(format!( + "Cannot repeat nullable format: Repeat({narrow:?} ⊂ {wide:?})" + )); + } + + let dnarrow = Box::new(Decoder::compile_next( + compiler, + narrow, + Rc::new(Next::Repeat(narrow, next.clone())), + )?); + + let dwide = Box::new(Decoder::compile_next( + compiler, + wide, + Rc::new(Next::Repeat(wide, next.clone())), + )?); + + // Under the precondition that narrow is a subset of wide, the union of the two matchtrees is just the + // matchtree for wide + + let wide_star = Format::Repeat(wide.clone()); + let f_wide = Format::Tuple(vec![(**wide).clone(), wide_star]); + let f_empty = Format::EMPTY; + + if let Some(tree) = MatchTree::build(compiler.module, &[f_wide, f_empty], next) { + Ok(Decoder::RepeatFallback(tree, dnarrow, dwide)) + } else { + Err(format!("Cannot build match tree for {format:?}")) + } + } Format::Repeat1(a) => { if a.is_nullable(compiler.module) { return Err(format!("cannot repeat nullable format: {a:?}")); @@ -1079,6 +1113,34 @@ impl Decoder { } Ok((Value::Seq(v), input)) } + Decoder::RepeatFallback(tree, subset, superset) => { + let mut input = input; + let mut v = Vec::new(); + let mut decoder = subset; + let mut fellback = false; + + while tree.matches(input).ok_or(ParseError::NoValidBranch { + offset: input.offset, + })? == 0 + { + match decoder.parse(program, scope, input) { + Ok((va, next_input)) => { + input = next_input; + v.push(va); + } + err @ Err(_) => { + if fellback { + return err; + } else { + decoder = superset; + fellback = true; + } + } + } + } + + Ok((Value::Fallback(fellback, Box::new(Value::Seq(v))), input)) + } Decoder::RepeatCount(expr, a) => { let mut input = input; let count = expr.eval_value(scope).unwrap_usize(); diff --git a/src/lib.rs b/src/lib.rs index 2aba0187..27f43119 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -552,6 +552,8 @@ pub enum Format { RepeatUntilLast(Expr, Box), /// Repeat a format until a condition is satisfied by the sequence RepeatUntilSeq(Expr, Box), + /// Repeat an eager narrow format, but continue with a broader item if necessary upon recoverable failure + RepeatFallback(Box, Box), /// Parse a format without advancing the stream position afterwards Peek(Box), /// Attempt to parse a format and fail if it succeeds @@ -630,7 +632,7 @@ impl Format { .map(|(_, f)| f.match_bounds(module)) .reduce(Bounds::add) .unwrap_or(Bounds::exact(0)), - Format::Repeat(_) => Bounds::new(0, None), + Format::Repeat(_) | Format::RepeatFallback(_, _) => Bounds::new(0, None), Format::Repeat1(f) => f.match_bounds(module) * Bounds::new(1, None), Format::RepeatCount(expr, f) => f.match_bounds(module) * expr.bounds(), Format::RepeatUntilLast(_, f) => f.match_bounds(module) * Bounds::new(1, None), @@ -677,7 +679,7 @@ impl Format { Format::Union(branches) => Format::iso_union_depends_on_next(branches, module), Format::Tuple(fields) => fields.iter().any(|f| f.depends_on_next(module)), Format::Record(fields) => fields.iter().any(|(_, f)| f.depends_on_next(module)), - Format::Repeat(_) => true, + Format::Repeat(_) | Format::RepeatFallback(_, _) => true, Format::Repeat1(_) => true, Format::RepeatCount(_, _f) => false, Format::RepeatUntilLast(_, _f) => false, @@ -734,6 +736,16 @@ impl Format { } } + pub fn is_char_format(&self, module: &FormatModule) -> bool { + match self { + // NOTE - currently only true for named formats matching `/.*char.*/` + Format::ItemVar(level, _args) => module.get_name(*level).contains("char"), + _ => false, + } + } + + + /// Returns `true` if values associated to this format should be handled as multi-character ASCII strings pub fn is_ascii_string_format(&self, module: &FormatModule) -> bool { match self { @@ -900,6 +912,11 @@ impl FormatModule { let t = self.infer_format_type(scope, a)?; Ok(ValueType::Seq(Box::new(t))) } + Format::RepeatFallback(narrow, wide) => { + let mut t = self.infer_format_type(scope, narrow)?; + t = t.unify(&self.infer_format_type(scope, wide)?)?; + Ok(ValueType::Seq(Box::new(t))) + } Format::Peek(a) => self.infer_format_type(scope, a), Format::PeekNot(_a) => Ok(ValueType::Tuple(vec![])), Format::Slice(_expr, a) => self.infer_format_type(scope, a), @@ -1265,6 +1282,19 @@ impl<'a> MatchTreeStep<'a> { Format::RepeatUntilSeq(_expr, _a) => { Self::accept() // FIXME } + Format::RepeatFallback(narrow, wide) => { + let tree = Self::add_next(module, next.clone()); + tree.union(Self::add( + module, + narrow, + Rc::new(Next::Repeat(narrow, next.clone())), + )) + .union(Self::add( + module, + wide, + Rc::new(Next::Repeat(wide, next.clone())), + )) + } Format::Peek(a) => { let tree = Self::add_next(module, next.clone()); let peek = Self::add(module, a, Rc::new(Next::Empty)); diff --git a/src/output/flat.rs b/src/output/flat.rs index bc6191c3..4ee9aba9 100644 --- a/src/output/flat.rs +++ b/src/output/flat.rs @@ -162,6 +162,10 @@ fn check_covered( | Format::RepeatUntilSeq(_, format) => { check_covered(module, path, format)?; } + Format::RepeatFallback(narrow, wide) => { + check_covered(module, path, narrow)?; + check_covered(module, path, wide)?; + } Format::Peek(_) => {} // FIXME Format::PeekNot(_) => {} // FIXME Format::Slice(_, format) => { @@ -279,6 +283,15 @@ impl<'module, W: io::Write> Context<'module, W> { } _ => panic!("expected sequence, found {value:?}"), }, + Format::RepeatFallback(_narrow, _wide) => match value { + Value::Seq(values) => { + for _v in values { + (); // FIXME + } + Ok(()) + } + _ => panic!("expected sequence"), + }, Format::Peek(format) => self.write_flat(scope, value, format), Format::PeekNot(format) => self.write_flat(scope, value, format), Format::Slice(_, format) => self.write_flat(scope, value, format), diff --git a/src/output/tree.rs b/src/output/tree.rs index 2ea33e07..ae851038 100644 --- a/src/output/tree.rs +++ b/src/output/tree.rs @@ -124,6 +124,13 @@ impl<'module> MonoidalPrinter<'module> { Value::Char(_) => true, Value::Bool(_) => true, Value::U8(_) | Value::U16(_) | Value::U32(_) => true, + Value::Fallback(is_fallback, v) => match format { + Some(Format::RepeatFallback(a, b)) => { + let format = if *is_fallback { b } else { a }; + self.is_atomic_value(v, Some(format)) + } + _ => self.is_atomic_value(v, None), + }, Value::Tuple(values) => values.is_empty(), Value::Record(fields) => fields.is_empty(), Value::Seq(values) => values.is_empty(), @@ -305,6 +312,30 @@ impl<'module> MonoidalPrinter<'module> { } _ => panic!("expected sequence, found {value:?}"), }, + Format::RepeatFallback(narrow, wide) => match value { + Value::Fallback(is_wide, v) => match v.deref() { + Value::Seq(values) => { + let format = if *is_wide { wide } else { narrow }; + if self.flags.tables_for_record_sequences + && self.is_record_with_atomic_fields(format).is_some() + { + self.compile_seq_records(values, format) + } else if self.flags.pretty_ascii_strings + && format.is_ascii_char_format(self.module) + { + self.compile_ascii_seq(values) + } else if self.flags.pretty_utf8_strings + && format.is_char_format(self.module) + { + self.compile_char_seq(values) + } else { + self.compile_seq(scope, values, Some(format)) + } + } + _ => panic!("expected sequence, found {v:?}"), + }, + _ => panic!("expected Fallback, found {value:?}"), + }, Format::Peek(format) => self.compile_decoded_value(scope, value, format), Format::PeekNot(_format) => self.compile_value(scope, value), Format::Slice(_, format) => self.compile_decoded_value(scope, value, format), @@ -375,6 +406,7 @@ impl<'module> MonoidalPrinter<'module> { Value::Seq(vals) => self.compile_seq(scope, vals, None), Value::Record(fields) => self.compile_record(scope, fields, None), Value::Variant(label, value) => self.compile_variant(scope, label, value, None), + Value::Fallback(_, value) => self.compile_value(scope, value), Value::Mapped(orig, value) => { if self.flags.collapse_mapped_values { self.compile_value(scope, value) @@ -1068,6 +1100,15 @@ impl<'module> MonoidalPrinter<'module> { prec, Precedence::FORMAT_COMPOUND, ), + Format::RepeatFallback(narrow, wide) => { + let wide_frag = self.compile_format(wide, Precedence::FORMAT_ATOM); + + cond_paren( + self.compile_nested_format("repeat_fallback", Some(&[wide_frag]), narrow, prec), + prec, + Precedence::FORMAT_COMPOUND, + ) + } Format::Repeat(format) => cond_paren( self.compile_nested_format("repeat", None, format, prec), prec, diff --git a/tests/expected/decode/test.txt.stdout b/tests/expected/decode/test.txt.stdout index 1ce16353..2ff6625a 100644 --- a/tests/expected/decode/test.txt.stdout +++ b/tests/expected/decode/test.txt.stdout @@ -1,2 +1,3 @@ -├── data <- _ |...| _ := { text := { ascii := "GIF89a is a popular format\n" } } +├── data <- _ |...| _ := +│ └── text <- text.string := "GIF89a is a popular format\n" └── end <- end-of-input diff --git a/tests/expected/decode/test.utf8.stdout b/tests/expected/decode/test.utf8.stdout index a1238639..bdec14c9 100644 --- a/tests/expected/decode/test.utf8.stdout +++ b/tests/expected/decode/test.utf8.stdout @@ -1,4 +1,3 @@ ├── data <- _ |...| _ := -│ └── text <- text.string := -│ └── utf8 <- text.string.utf8 := "この🦀は擂り身ではなく、本物のカニです。\n" +│ └── text <- text.string := "この🦀は擂り身ではなく、本物のカニです。\n" └── end <- end-of-input