From ba035acb61cb8b2b4081569bd46f7b39e1255eb9 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 24 Sep 2023 20:54:56 +0500 Subject: [PATCH 01/22] Implement DTD parsing in a new crate quick-dtd --- quick-dtd/Cargo.toml | 27 + quick-dtd/LICENSE-MIT.md | 23 + quick-dtd/src/comment.rs | 148 ++++++ quick-dtd/src/dtd.rs | 952 ++++++++++++++++++++++++++++++++++++ quick-dtd/src/lib.rs | 29 ++ quick-dtd/src/pi.rs | 92 ++++ quick-dtd/src/quoted.rs | 104 ++++ quick-dtd/tests/example.dtd | 54 ++ 8 files changed, 1429 insertions(+) create mode 100644 quick-dtd/Cargo.toml create mode 100644 quick-dtd/LICENSE-MIT.md create mode 100644 quick-dtd/src/comment.rs create mode 100644 quick-dtd/src/dtd.rs create mode 100644 quick-dtd/src/lib.rs create mode 100644 quick-dtd/src/pi.rs create mode 100644 quick-dtd/src/quoted.rs create mode 100644 quick-dtd/tests/example.dtd diff --git a/quick-dtd/Cargo.toml b/quick-dtd/Cargo.toml new file mode 100644 index 00000000..1ff05e06 --- /dev/null +++ b/quick-dtd/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "quick-dtd" +version = "0.1.0" +edition = "2021" + +description = "High performance DTD reader for quick-xml" + +documentation = "https://docs.rs/quick-dtd" +repository = "https://github.com/tafia/quick-xml" + +keywords = ["dtd", "parser", "xml"] +categories = ["parsing", "parser-implementations", "no-std"] +license = "MIT" +rust-version = "1.56" +include = ["src/*", "LICENSE-MIT.md", "README.md"] + +[dependencies] +document-features = { version = "0.2", optional = true } + +[dev-dependencies] +pretty_assertions = "1.4" + +[features] +default = ["std"] + +## Enables support of Rust standard library +std = [] \ No newline at end of file diff --git a/quick-dtd/LICENSE-MIT.md b/quick-dtd/LICENSE-MIT.md new file mode 100644 index 00000000..3329c509 --- /dev/null +++ b/quick-dtd/LICENSE-MIT.md @@ -0,0 +1,23 @@ +The MIT License (MIT) + +Copyright (c) 2023 Mingun + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/quick-dtd/src/comment.rs b/quick-dtd/src/comment.rs new file mode 100644 index 00000000..6c52233b --- /dev/null +++ b/quick-dtd/src/comment.rs @@ -0,0 +1,148 @@ +//! Contains a parser for an XML comment. + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum State { + /// The parser does not yet seen any dashes at the end of previous slice. + Seen0, + /// The parser already seen one dash on the end of previous slice. + Seen1, + /// The parser already seen two dashes on the end of previous slice. 
+ Seen2, +} + +impl Default for State { + fn default() -> Self { + Self::Seen0 + } +} + +/// A parser that search a `-->` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position where +/// comment is ended (the position after `-->`). If search was unsuccessful, +/// a [`None`] will be returned. You typically would expect positive result of +/// search, so that you should feed new data until yo'll get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// # Example +/// +/// ``` +/// # use quick_dtd::CommentParser; +/// # use pretty_assertions::assert_eq; +/// let mut parser = CommentParser::default(); +/// +/// // Parse `and the text follow...` +/// // splitted into three chunks +/// assert_eq!(parser.feed(b"and the text follow..."), Some(12)); +/// // ^ ^ +/// // 0 11 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct CommentParser(State); + +impl CommentParser { + /// Determines the end position of an XML comment in the provided slice. + /// Comments is a pieces of text enclosed in `` braces. + /// Comment ends on the first occurrence of `-->` which cannot be escaped. + /// + /// # Parameters + /// - `bytes`: a slice to search end of comment. Should contain text in + /// ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, _)) = it.find(|(_, &b)| b == b'>') { + // --|> + if i == 0 && self.0 == State::Seen2 { + // +1 for `>` which should be included in event + return Some(1); + } + // x-|-> + // --|-> + if i == 1 && bytes[0] == b'-' && matches!(self.0, State::Seen1 | State::Seen2) { + // +1 for `>` which should be included in event + return Some(2); + } + if bytes[..i].ends_with(b"--") { + // +1 for `>` which should be included in event + return Some(i + 1); + } + } + if bytes.ends_with(b"--") { + self.0 = State::Seen2; + } else { + self.next_state(bytes.last().copied()); + } + None + } + + #[inline] + fn next_state(&mut self, last: Option) { + match (self.0, last) { + (State::Seen0, Some(b'-')) => self.0 = State::Seen1, + + (State::Seen1, Some(b'-')) => self.0 = State::Seen2, + (State::Seen1, Some(_)) => self.0 = State::Seen0, + + (State::Seen2, Some(b'-')) => {} + (State::Seen2, Some(_)) => self.0 = State::Seen0, + + _ => {} + } + } +} + +#[test] +fn test() { + use pretty_assertions::assert_eq; + use State::*; + + fn parse_comment(bytes: &[u8], initial: State) -> Result { + let mut parser = CommentParser(initial); + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser.0), + } + } + + assert_eq!(parse_comment(b"", Seen0), Err(Seen0)); // xx| + assert_eq!(parse_comment(b"", Seen1), Err(Seen1)); // x-| + assert_eq!(parse_comment(b"", Seen2), Err(Seen2)); // --| + + assert_eq!(parse_comment(b"-", Seen0), Err(Seen1)); // xx|- + assert_eq!(parse_comment(b"-", Seen1), Err(Seen2)); // x-|- + assert_eq!(parse_comment(b"-", Seen2), Err(Seen2)); // --|- + + assert_eq!(parse_comment(b">", Seen0), Err(Seen0)); // xx|> + assert_eq!(parse_comment(b">", Seen1), Err(Seen0)); // x-|> + assert_eq!(parse_comment(b">", Seen2), Ok(1)); // --|> + + assert_eq!(parse_comment(b"--", Seen0), Err(Seen2)); // xx|-- + assert_eq!(parse_comment(b"--", Seen1), Err(Seen2)); // x-|-- + 
assert_eq!(parse_comment(b"--", Seen2), Err(Seen2)); // --|-- + + assert_eq!(parse_comment(b"->", Seen0), Err(Seen0)); // xx|-> + assert_eq!(parse_comment(b"->", Seen1), Ok(2)); // x-|-> + assert_eq!(parse_comment(b"->", Seen2), Ok(2)); // --|-> + + assert_eq!(parse_comment(b"-->", Seen0), Ok(3)); // xx|--> + assert_eq!(parse_comment(b"-->", Seen1), Ok(3)); // x-|--> + assert_eq!(parse_comment(b"-->", Seen2), Ok(3)); // --|--> + + assert_eq!(parse_comment(b">-->", Seen0), Ok(4)); // xx|>--> + assert_eq!(parse_comment(b">-->", Seen1), Ok(4)); // x-|>--> + assert_eq!(parse_comment(b">-->", Seen2), Ok(1)); // --|>--> + + assert_eq!(parse_comment(b"->-->", Seen0), Ok(5)); // xx|->--> + assert_eq!(parse_comment(b"->-->", Seen1), Ok(2)); // x-|->--> + assert_eq!(parse_comment(b"->-->", Seen2), Ok(2)); // --|->--> +} diff --git a/quick-dtd/src/dtd.rs b/quick-dtd/src/dtd.rs new file mode 100644 index 00000000..b6201888 --- /dev/null +++ b/quick-dtd/src/dtd.rs @@ -0,0 +1,952 @@ +//! Contains the Document Type Definition pull-based parser. + +use crate::{CommentParser, PiParser, QuotedParser}; +use core::iter::Iterator; + +/// An internal state of a parser. Used to preserve information about currently +/// parsed event between calls to [`DtdParser::feed()`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum State { + /// Initial state used to begin parsing DTD events. + Start, + /// A `<` was seen, but nothing else. + Markup, + /// A ` Self { + Self::Start + } +} + +/// A result of feeding data into [`DtdParser`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum FeedResult { + /// All fed bytes should be consumed, new portion should be feed. + NeedData, + /// The specified count of bytes should be consumed from the input. + EmitElement(usize), + /// The specified count of bytes should be consumed from the input. + EmitAttList(usize), + /// The specified count of bytes should be consumed from the input. + EmitEntity(usize), + /// The specified count of bytes should be consumed from the input. + EmitNotation(usize), + /// The specified count of bytes should be consumed from the input. + EmitPI(usize), + /// The specified count of bytes should be consumed from the input. + EmitComment(usize), + + /// Unexpected byte (`u8`) at the specified offset (`usize`) from begin of + /// chunk that was pushed to [`DtdParser::feed()`]. + /// + /// After getting this error the parser returned to the initial state and + /// you can start parsing another DTD event by feeding data. You should, + /// however, skip all unparsed data until `<` byte which is indication of + /// start of a new DTD event. + Unexpected(usize, u8), +} + +/// A parser of Document Type Definition (DTD) schemas. The parser operates on +/// user-provided buffers with content of DTD. The content can be in any ASCII-compatible +/// encoding. +/// +/// # Example +/// +/// ``` +/// # use pretty_assertions::assert_eq; +/// use quick_dtd::{DtdParser, FeedResult}; +/// +/// let mut parser = DtdParser::default(); +/// let mut result = Vec::new(); +/// let mut buf = Vec::new(); +/// // Suppose that you read `chunk` chunks from network, for example +/// 'outer: for chunk in &[ +/// "garbage\n'>", +/// ] { +/// let mut input = chunk.as_bytes(); +/// loop { +/// let consumed = match parser.feed(input) { +/// // All data in `input` was read and parser state didn't changed +/// // You should provide another chunk of data. 
The `input` should +/// // considered as fully consumed +/// FeedResult::NeedData => { +/// // Store all input to buffer for current event, request the +/// // new data from reader +/// buf.extend_from_slice(input); +/// continue 'outer; +/// } +/// FeedResult::Unexpected(offset, byte) => { +/// match input[offset..].iter().position(|b| *b == b'<') { +/// // Skip all garbage until start of new event +/// Some(end) => { +/// assert_eq!(&input[offset..end], b"garbage\n"); +/// offset + end +/// } +/// None => input.len(), +/// } +/// } +/// +/// FeedResult::EmitElement(offset) | +/// FeedResult::EmitAttList(offset) | +/// FeedResult::EmitEntity(offset) | +/// FeedResult::EmitNotation(offset) | +/// FeedResult::EmitPI(offset) | +/// FeedResult::EmitComment(offset) => { +/// // Store consumed input to buffer for current event +/// buf.extend_from_slice(&input[..offset]); +/// // ..process `buf` with data of events here +/// result.push(String::from_utf8(buf).unwrap()); +/// // Prepare buffer for new data +/// buf = Vec::new(); +/// offset +/// } +/// }; +/// // Skip consumed input, feed the rest on next iteration +/// input = &input[consumed..]; +/// } +/// } +/// +/// assert_eq!(result, [ +/// "", +/// "'>", +/// ]); +/// ``` +#[derive(Copy, Clone, Default, Debug, Eq, PartialEq)] +pub struct DtdParser(State); +impl DtdParser { + /// Provides new portion of data to the parser to parse. When this method + /// returns [`FeedResult::NeedData`], the whole buffer was analyzed and no + pub fn feed(&mut self, bytes: &[u8]) -> FeedResult { + for (offset, &byte) in bytes.iter().enumerate() { + let start = offset + 1; + let rest = &bytes[start..]; + self.0 = match self.0 { + State::Start => match byte { + b'<' => State::Markup, + // Skip spaces defined by XML standard + b' ' | b'\t' | b'\r' | b'\n' => continue, + b => return FeedResult::Unexpected(offset, b), + }, + State::Markup => match byte { + b'!' => State::MarkupBang, + b'?' 
=> return self.parse_pi(rest, start, PiParser::default()), + b => return FeedResult::Unexpected(offset, b), + }, + State::MarkupBang => match byte { + b'E' => State::MaybeElementOrEntity, + b'A' => State::MaybeAttList1, + b'N' => State::MaybeNotation1, + b'-' => State::MaybeComment, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElementOrEntity => match byte { + b'L' => State::MaybeElement1, + b'N' => State::MaybeEntity1, + b => return FeedResult::Unexpected(offset, b), + }, + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeComment => match byte { + b'-' => return self.parse_comment(rest, start, CommentParser::default()), + b => return FeedResult::Unexpected(offset, b), + }, + State::Comment(parser) => return self.parse_comment(bytes, offset, parser), + State::PI(parser) => return self.parse_pi(bytes, offset, parser), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeElement1 => match byte { + b'E' => State::MaybeElement2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement2 => match byte { + b'M' => State::MaybeElement3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement3 => match byte { + b'E' => State::MaybeElement4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement4 => match byte { + b'N' => State::MaybeElement5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement5 => match byte { + b'T' => State::MaybeElement6, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement6 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => return self.parse_element(rest, start), + b => return FeedResult::Unexpected(offset, b), + }, + State::Element => return self.parse_element(bytes, offset), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeEntity1 => match byte { + b'T' => State::MaybeEntity2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity2 => match byte { + b'I' => State::MaybeEntity3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity3 => match byte { + b'T' => State::MaybeEntity4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity4 => match byte { + b'Y' => State::MaybeEntity5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity5 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + return self.parse_entity(rest, start, QuotedParser::Outside) + } + b => return FeedResult::Unexpected(offset, b), + }, + State::Entity(parser) => return self.parse_entity(bytes, offset, parser), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeAttList1 => match byte { + b'T' => State::MaybeAttList2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList2 => match byte { + b'T' => State::MaybeAttList3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList3 => match byte { + b'L' => State::MaybeAttList4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList4 
=> match byte { + b'I' => State::MaybeAttList5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList5 => match byte { + b'S' => State::MaybeAttList6, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList6 => match byte { + b'T' => State::MaybeAttList7, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList7 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + return self.parse_attlist(rest, start, QuotedParser::Outside) + } + b => return FeedResult::Unexpected(offset, b), + }, + State::AttList(parser) => return self.parse_attlist(bytes, offset, parser), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeNotation1 => match byte { + b'O' => State::MaybeNotation2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation2 => match byte { + b'T' => State::MaybeNotation3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation3 => match byte { + b'A' => State::MaybeNotation4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation4 => match byte { + b'T' => State::MaybeNotation5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation5 => match byte { + b'I' => State::MaybeNotation6, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation6 => match byte { + b'O' => State::MaybeNotation7, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation7 => match byte { + b'N' => State::MaybeNotation8, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation8 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + return self.parse_notation(rest, start, QuotedParser::Outside); + } + b => return FeedResult::Unexpected(offset, b), + }, + State::Notation(parser) => return self.parse_notation(bytes, offset, parser), + }; + } + FeedResult::NeedData + } + + /// `` cannot contain `>` inside, so we emit it as soon as we found `>` + fn parse_element(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + match bytes.iter().position(|&b| b == b'>') { + Some(i) => { + self.0 = State::Start; + // +1 for `>` which should be included in event + FeedResult::EmitElement(offset + i + 1) + } + None => { + self.0 = State::Element; + FeedResult::NeedData + } + } + } + + /// `` can contain `>` inside, but all those symbols either in single or double quotes + fn parse_entity( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + ) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Start; + // +1 for `>` which should be included in event + FeedResult::EmitEntity(offset + i + 1) + } + None => { + self.0 = State::Entity(parser); + FeedResult::NeedData + } + } + } + + /// `` can contain `>` inside, but all those symbols either in single or double quotes + fn parse_attlist( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + ) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Start; + // +1 for `>` which should be included in event + FeedResult::EmitAttList(offset + i + 1) + } + None => { + self.0 = State::AttList(parser); + FeedResult::NeedData + } + } + } + + /// `` can contain `>` inside, but all those symbols either in single or double quotes + fn parse_notation( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + ) -> FeedResult { + match 
parser.feed(bytes) { + Some(i) => { + self.0 = State::Start; + // +1 for `>` which should be included in event + FeedResult::EmitNotation(offset + i + 1) + } + None => { + self.0 = State::Notation(parser); + FeedResult::NeedData + } + } + } + + /// Determines the end position of a processing instruction in the provided slice. + /// Processing instruction ends on the first occurrence of `?>` which cannot be + /// escaped. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a PI target (at least, should) + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `has_mark`: a flag that indicates was the previous fed data ended with `?` + fn parse_pi(&mut self, bytes: &[u8], offset: usize, mut parser: PiParser) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Start; + FeedResult::EmitPI(offset + i) + } + None => { + self.0 = State::PI(parser); + FeedResult::NeedData + } + } + } + + /// Determines the end position of a comment in the provided slice. + /// Comment ends on the first occurrence of `-->` which cannot be escaped. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a comment content (at least, should) + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `parser`: the state of comment parser saved after consuming the previous chunk of data + fn parse_comment( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: CommentParser, + ) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Start; + FeedResult::EmitComment(offset + i) + } + None => { + self.0 = State::Comment(parser); + FeedResult::NeedData + } + } + } + + /// Convert this parser to an iterator producing [`FeedResult`]s from specified + /// bytes. + pub fn into_iter<'a>(self, bytes: &'a [u8]) -> DtdIter<'a> { + DtdIter { + chunk: bytes, + parser: self, + } + } +} + +/// This struct is created by the [`into_iter`] method of [`DtdParser`]. +/// +/// [`into_iter`]: DtdParser::into_iter +pub struct DtdIter<'a> { + chunk: &'a [u8], + parser: DtdParser, +} +impl<'a> DtdIter<'a> { + /// Replaces current chunk of the iterator with nee one. All not-consumed + /// data would be loss, so call it only when you get `FeedResult::NeedData` + /// from the iterator. 
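+    ///
+    /// # Example
+    ///
+    /// A small illustration of this protocol; the inputs and the exact offsets
+    /// below are only an example of how the iterator could be driven:
+    ///
+    /// ```
+    /// # use quick_dtd::{DtdParser, FeedResult};
+    /// # use pretty_assertions::assert_eq;
+    /// let mut iter = DtdParser::default().into_iter(b"<!--ok-->");
+    /// // The whole comment fits into the chunk, so it is emitted immediately
+    /// assert_eq!(iter.next(), Some(FeedResult::EmitComment(9)));
+    /// // `None` here corresponds to `FeedResult::NeedData`: feed a new chunk
+    /// assert_eq!(iter.next(), None);
+    ///
+    /// iter.feed(b"<!ELEMENT a (#PCDATA)>");
+    /// assert_eq!(iter.next(), Some(FeedResult::EmitElement(22)));
+    /// assert_eq!(iter.next(), None);
+    /// ```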
+ pub fn feed(&mut self, chunk: &'a [u8]) { + self.chunk = chunk; + } +} +impl<'a> Iterator for DtdIter<'a> { + type Item = FeedResult; + + fn next(&mut self) -> Option { + if self.chunk.is_empty() { + return None; + } + let result = self.parser.feed(self.chunk); + match result { + FeedResult::NeedData => { + // All data consumed, so replace it empty data + self.chunk = b""; + None + } + FeedResult::EmitPI(off) + | FeedResult::EmitEntity(off) + | FeedResult::EmitAttList(off) + | FeedResult::EmitComment(off) + | FeedResult::EmitElement(off) + | FeedResult::EmitNotation(off) + | FeedResult::Unexpected(off, _) => { + self.chunk = &self.chunk[off..]; + Some(result) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::FeedResult::*; + use super::*; + use pretty_assertions::assert_eq; + + fn check(chunk_size: usize, bytes: &[u8]) { + let mut iter = DtdParser::default().into_iter(b""); + for (i, chunk) in bytes.chunks(chunk_size).enumerate() { + iter.feed(chunk); + while let Some(event) = iter.next() { + assert!( + !matches!(event, FeedResult::Unexpected(..)), + "#{}: {:?} => {:?}\n{:?}", + i * chunk_size, + iter.parser.0, + event, + core::str::from_utf8(chunk).unwrap(), + ); + } + } + } + + mod by_chunks { + use super::*; + + const BYTES: &[u8] = include_bytes!("../tests/example.dtd"); + + #[test] + fn _1() { + check(1, BYTES); + } + + #[test] + fn _2() { + check(2, BYTES); + } + + #[test] + fn _3() { + check(3, BYTES); + } + + #[test] + fn _5() { + check(5, BYTES); + } + + #[test] + fn _7() { + check(7, BYTES); + } + + #[test] + fn _11() { + check(11, BYTES); + } + + #[test] + fn _13() { + check(13, BYTES); + } + + #[test] + fn _17() { + check(17, BYTES); + } + + #[test] + fn _19() { + check(19, BYTES); + } + + #[test] + fn _23() { + check(23, BYTES); + } + + #[test] + fn _29() { + check(29, BYTES); + } + + #[test] + fn _31() { + check(31, BYTES); + } + + #[test] + fn _37() { + check(37, BYTES); + } + + #[test] + fn _41() { + check(41, BYTES); + } + + #[test] + fn _43() { + check(43, BYTES); + } + + #[test] + fn _47() { + check(47, BYTES); + } + } + + #[test] + fn element() { + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::Element); + + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::Element); + + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b">"), EmitElement(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b">a"), EmitElement(1)); + assert_eq!(parser.0, State::Start); + } + + #[test] + fn attlist() { + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), 
EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">a"), EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'>\"'>"), EmitAttList(5)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\"'>\">"), EmitAttList(5)); + assert_eq!(parser.0, State::Start); + } + + #[test] + fn entity() { + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">a"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'>\"'>"), EmitEntity(5)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\"'>\">"), EmitEntity(5)); + assert_eq!(parser.0, State::Start); + } + + #[test] + fn notation() { + let mut parser = 
DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">a"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'>\"'>"), EmitNotation(5)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\"'>\">"), EmitNotation(5)); + assert_eq!(parser.0, State::Start); + } + + /*#[test] + fn pi() { + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::PI(true)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"aa"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"aa"), NeedData); + assert_eq!(parser.0, State::PI(false)); + + //---------------------------------------------------------------------- + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?a"), 
NeedData); + assert_eq!(parser.0, State::PI(false)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + + //---------------------------------------------------------------------- + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b">"), EmitPI(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b">a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b">a"), EmitPI(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a>"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a>"), NeedData); + assert_eq!(parser.0, State::PI(false)); + + //---------------------------------------------------------------------- + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?>"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?>"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?>a"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?>a"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a?>"), EmitPI(3)); + assert_eq!(parser.0, State::Start); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a?>"), EmitPI(3)); + assert_eq!(parser.0, State::Start); + }*/ +} diff --git a/quick-dtd/src/lib.rs b/quick-dtd/src/lib.rs new file mode 100644 index 00000000..2550a057 --- /dev/null +++ b/quick-dtd/src/lib.rs @@ -0,0 +1,29 @@ +//! High performant Document Type Definition (DTD) parser. +//! +//! # Features +//! +//! `quick-dtd` supports the following features: +#![cfg_attr( + feature = "document-features", + cfg_attr(doc, doc = ::document_features::document_features!( + // Replicates the default format, but adds an anchor to the feature + feature_label = "{feature}" + )) +)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] +// Enable feature requirements in the docs from 1.57 +// See https://stackoverflow.com/questions/61417452 +#![cfg_attr(docs_rs, feature(doc_auto_cfg))] +#![cfg_attr(not(feature = "std"), no_std)] + +mod dtd; +// Helper reusable parsers +mod comment; +mod pi; +mod quoted; + +pub use comment::CommentParser; +pub use dtd::{DtdIter, DtdParser, FeedResult}; +pub use pi::PiParser; +pub use quoted::{QuotedParser, OneOf}; diff --git a/quick-dtd/src/pi.rs b/quick-dtd/src/pi.rs new file mode 100644 index 00000000..e8957f7b --- /dev/null +++ b/quick-dtd/src/pi.rs @@ -0,0 +1,92 @@ +//! Contains a parser for an XML processing instruction. + +/// A parser that search a `?>` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. 
+/// After successful search the parser will return [`Some`] with position where +/// processing instruction is ended (the position after `?>`). If search was +/// unsuccessful, a [`None`] will be returned. You typically would expect positive +/// result of search, so that you should feed new data until yo'll get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// # Example +/// +/// ``` +/// # use quick_dtd::PiParser; +/// # use pretty_assertions::assert_eq; +/// let mut parser = PiParser::default(); +/// +/// // Parse `and the text follow...` +/// // splitted into three chunks +/// assert_eq!(parser.feed(b" and ?"), None); +/// // ...get another chunk of data +/// assert_eq!(parser.feed(b"inside?>and the text follow..."), Some(8)); +/// // ^ ^ +/// // 0 7 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct PiParser( + /// A flag that indicates was the `bytes` in the previous attempt to find the + /// end ended with `?`. + bool, +); + +impl PiParser { + /// Determines the end position of a processing instruction in the provided slice. + /// Processing instruction ends on the first occurrence of `?>` which cannot be + /// escaped. + /// + /// # Parameters + /// - `bytes`: a slice to find the end of a processing instruction. + /// Should contain text in ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, _)) = it.find(|(_, &b)| b == b'>') { + match i { + // +1 for `>` which should be included in event + 0 if self.0 => return Some(1), + // If the previous byte is `?`, then we found `?>` + // +1 for `>` which should be included in event + i if i > 0 && bytes[i - 1] == b'?' => return Some(i + 1), + _ => {} + } + } + self.0 = bytes.last().copied() == Some(b'?'); + None + } +} + +#[test] +fn pi() { + use pretty_assertions::assert_eq; + + fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result { + let mut parser = PiParser(had_question_mark); + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser.0), + } + } + + assert_eq!(parse_pi(b"", false), Err(false)); // x| + assert_eq!(parse_pi(b"", true), Err(false)); // ?| + + assert_eq!(parse_pi(b"?", false), Err(true)); // x|? + assert_eq!(parse_pi(b"?", true), Err(true)); // ?|? + + assert_eq!(parse_pi(b">", false), Err(false)); // x|> + assert_eq!(parse_pi(b">", true), Ok(1)); // ?|> + + assert_eq!(parse_pi(b"?>", false), Ok(2)); // x|?> + assert_eq!(parse_pi(b"?>", true), Ok(2)); // ?|?> + + assert_eq!(parse_pi(b">?>", false), Ok(3)); // x|>?> + assert_eq!(parse_pi(b">?>", true), Ok(1)); // ?|>?> +} diff --git a/quick-dtd/src/quoted.rs b/quick-dtd/src/quoted.rs new file mode 100644 index 00000000..aca431a1 --- /dev/null +++ b/quick-dtd/src/quoted.rs @@ -0,0 +1,104 @@ +/// Represents the result of [`QuotedParser::one_of`] operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum OneOf { + /// The open angle bracket (`<`) was found as specified position. + /// + /// The open angle bracket could only be part of a tag inside DTD + /// if DTD is correctly formed. + Open(usize), + /// The close angle bracket (`>`) was found as specified position. + Close(usize), + /// Nothing was found in the provided slice. + None, +} + +/// A parser that search a `>` symbol in the slice outside of quoted regions. 
+/// +/// The parser considers two quoted regions: a double-quoted (`"..."`) and +/// a single-quoted (`'...'`) region. Matches found inside those regions are not +/// considered, as results. Each region starts and ends by its quote symbol, +/// which cannot be escaped (but can be encoded as XML character entity or named +/// entity. Anyway, that encoding does not contain literal quotes). +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position of +/// found symbol. If search is unsuccessful, a [`None`] will be returned. You +/// typically would expect positive result of search, so that you should feed +/// new data until yo'll get it. +/// +/// # Example +/// +/// ``` +/// # use quick_dtd::QuotedParser; +/// # use pretty_assertions::assert_eq; +/// let mut parser = QuotedParser::default(); +/// +/// // Parse `and the text follow...` +/// // splitted into three chunks +/// assert_eq!(parser.feed(b"and the text follow..."), Some(8)); +/// // ^ ^ +/// // 0 8 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum QuotedParser { + /// The initial state (inside element, but outside of attribute value). + Outside, + /// Inside a single-quoted region. + SingleQ, + /// Inside a double-quoted region. + DoubleQ, +} +impl QuotedParser { + /// Returns number of consumed bytes or `None` if `>` was not found in `bytes`. + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, &byte)) = it.find(|(_, &b)| matches!(b, b'>' | b'\'' | b'"')) { + match (*self, byte) { + // only allowed to match `>` while we are in state `Outside` + (Self::Outside, b'>') => return Some(i), + (Self::Outside, b'\'') => *self = Self::SingleQ, + (Self::Outside, b'\"') => *self = Self::DoubleQ, + + // the only end_byte that gets us out if the same character + (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => *self = Self::Outside, + + // all other bytes: no state change + _ => {} + } + } + None + } + + /// Returns number of consumed bytes or `None` if `<` or `>` was not found in `bytes`. 
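+    ///
+    /// A short sketch of the behavior on quoted content; the byte strings and
+    /// the reported position are purely illustrative:
+    ///
+    /// ```
+    /// # use quick_dtd::{OneOf, QuotedParser};
+    /// # use pretty_assertions::assert_eq;
+    /// let mut parser = QuotedParser::default();
+    ///
+    /// // `<` and `>` inside quoted regions are skipped...
+    /// assert_eq!(parser.one_of(b"'<>' more"), OneOf::None);
+    /// // ...and the first angle bracket outside of quotes is reported
+    /// assert_eq!(parser.one_of(b"'>' <tag"), OneOf::Open(4));
+    /// ```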
+ pub fn one_of(&mut self, bytes: &[u8]) -> OneOf { + let mut it = bytes.iter().enumerate(); + while let Some((i, &byte)) = it.find(|(_, &b)| matches!(b, b'<' | b'>' | b'\'' | b'"')) { + match (*self, byte) { + // only allowed to match `>` while we are in state `Outside` + (Self::Outside, b'<') => return OneOf::Open(i), + (Self::Outside, b'>') => return OneOf::Close(i), + (Self::Outside, b'\'') => *self = Self::SingleQ, + (Self::Outside, b'\"') => *self = Self::DoubleQ, + + // the only end_byte that gets us out if the same character + (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => *self = Self::Outside, + + // all other bytes: no state change + _ => {} + } + } + OneOf::None + } +} + +impl Default for QuotedParser { + fn default() -> Self { + Self::Outside + } +} diff --git a/quick-dtd/tests/example.dtd b/quick-dtd/tests/example.dtd new file mode 100644 index 00000000..8192eec7 --- /dev/null +++ b/quick-dtd/tests/example.dtd @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + +"> + + + + + + + + + + + +"> +'> +"> +' NDATA n-data> +'> +' NDATA n-data> + +"> +'> +"> +'> +'> +'> + + +"> +'> +'> +'> + + + + + +?> + +--> \ No newline at end of file From f3ccb26170ce0d49cc4cf4fa8088b2474417f377 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 21 Sep 2023 00:05:50 +0500 Subject: [PATCH 02/22] Implement new XML parser New parser decouples reading bytes from parsing --- Cargo.toml | 2 + Changelog.md | 2 + src/lib.rs | 1 + src/parser/cdata.rs | 126 ++++++ src/parser/mod.rs | 941 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1072 insertions(+) create mode 100644 src/parser/cdata.rs create mode 100644 src/parser/mod.rs diff --git a/Cargo.toml b/Cargo.toml index eb2f794d..4fc26aac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,11 +14,13 @@ rust-version = "1.56" include = ["src/*", "LICENSE-MIT.md", "README.md"] [dependencies] +aquamarine = { version = "0.3", optional = true } document-features = { version = "0.2", optional = true } encoding_rs = { version = "0.8", optional = true } serde = { version = ">=1.0.139", optional = true } tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] } memchr = "2.1" +quick-dtd = { path = "quick-dtd", version = "0.1" } arbitrary = { version = "1", features = ["derive"], optional = true } [dev-dependencies] diff --git a/Changelog.md b/Changelog.md index 5c51eec3..60e5d268 100644 --- a/Changelog.md +++ b/Changelog.md @@ -29,6 +29,8 @@ to get an offset of the error position. For `SyntaxError`s the range - [#362]: Added `escape::minimal_escape()` which escapes only `&` and `<`. - [#362]: Added `BytesCData::minimal_escape()` which escapes only `&` and `<`. - [#362]: Added `Serializer::set_quote_level()` which allow to set desired level of escaping. +- [#690]: Added a low-level hight-performant XML parser in `quick_xml::parser` module. + For advanced use. ### Bug Fixes diff --git a/src/lib.rs b/src/lib.rs index db164e21..a5d3768a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,6 +64,7 @@ pub mod escape { } pub mod events; pub mod name; +pub mod parser; pub mod reader; #[cfg(feature = "serialize")] pub mod se; diff --git a/src/parser/cdata.rs b/src/parser/cdata.rs new file mode 100644 index 00000000..af6d1efe --- /dev/null +++ b/src/parser/cdata.rs @@ -0,0 +1,126 @@ +//! Contains a parser for an XML CDATA content. + +/// A parser that search a `]]>` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. 
+/// After successful search the parser will return [`Some`] with position where +/// comment is ended (the position after `]]>`). If search was unsuccessful, +/// a [`None`] will be returned. You typically would expect positive result of +/// search, so that you should feed new data until yo'll get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// [`feed`]: Self::feed() +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum CDataParser { + /// The parser does not yet seen any braces at the end of previous slice. + Seen0, + /// The parser already seen one brace on the end of previous slice. + Seen1, + /// The parser already seen two braces on the end of previous slice. + Seen2, +} + +impl CDataParser { + /// Determines the end position of an XML character data in the provided slice. + /// Character data (CDATA) is a pieces of text enclosed in `` braces. + /// Character data ends on the first occurrence of `]]>` which cannot be escaped. + /// + /// # Parameters + /// - `bytes`: a slice to search end of CDATA. Should contain text in + /// ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, _)) = it.find(|(_, &b)| b == b'>') { + // ]]|> + if i == 0 && *self == Self::Seen2 { + // +1 for `>` which should be included in event + return Some(1); + } + // x]|]> + // ]]|]> + if i == 1 && bytes[0] == b']' && matches!(self, Self::Seen1 | Self::Seen2) { + // +1 for `>` which should be included in event + return Some(2); + } + if bytes[..i].ends_with(b"]]") { + // +1 for `>` which should be included in event + return Some(i + 1); + } + } + if bytes.ends_with(b"]]") { + *self = Self::Seen2; + } else { + *self = self.next_state(bytes.last().copied()); + } + None + } + + #[inline] + fn next_state(self, last: Option) -> Self { + match (self, last) { + (Self::Seen0, Some(b']')) => Self::Seen1, + + (Self::Seen1, Some(b']')) => Self::Seen2, + (Self::Seen1, Some(_)) => Self::Seen0, + + (Self::Seen2, Some(b']')) => self, + (Self::Seen2, Some(_)) => Self::Seen0, + + _ => self, + } + } +} + +impl Default for CDataParser { + fn default() -> Self { + Self::Seen0 + } +} + +#[test] +fn test() { + use pretty_assertions::assert_eq; + use CDataParser::*; + + fn parse_cdata(bytes: &[u8], mut parser: CDataParser) -> Result { + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser), + } + } + + assert_eq!(parse_cdata(b"", Seen0), Err(Seen0)); // xx| + assert_eq!(parse_cdata(b"", Seen1), Err(Seen1)); // x]| + assert_eq!(parse_cdata(b"", Seen2), Err(Seen2)); // ]]| + + assert_eq!(parse_cdata(b"]", Seen0), Err(Seen1)); // xx|] + assert_eq!(parse_cdata(b"]", Seen1), Err(Seen2)); // x]|] + assert_eq!(parse_cdata(b"]", Seen2), Err(Seen2)); // ]]|] + + assert_eq!(parse_cdata(b">", Seen0), Err(Seen0)); // xx|> + assert_eq!(parse_cdata(b">", Seen1), Err(Seen0)); // x]|> + assert_eq!(parse_cdata(b">", Seen2), Ok(1)); // ]]|> + + assert_eq!(parse_cdata(b"]]", Seen0), Err(Seen2)); // xx|]] + assert_eq!(parse_cdata(b"]]", Seen1), Err(Seen2)); // x]|]] + assert_eq!(parse_cdata(b"]]", Seen2), Err(Seen2)); // ]]|]] + + assert_eq!(parse_cdata(b"]>", Seen0), Err(Seen0)); // xx|]> + assert_eq!(parse_cdata(b"]>", Seen1), Ok(2)); // x]|]> + assert_eq!(parse_cdata(b"]>", Seen2), Ok(2)); // ]]|]> + + assert_eq!(parse_cdata(b"]]>", Seen0), Ok(3)); // xx|]]> + assert_eq!(parse_cdata(b"]]>", Seen1), 
Ok(3)); // x]|]]> + assert_eq!(parse_cdata(b"]]>", Seen2), Ok(3)); // ]]|]]> + + assert_eq!(parse_cdata(b">]]>", Seen0), Ok(4)); // xx|>]]> + assert_eq!(parse_cdata(b">]]>", Seen1), Ok(4)); // x]|>]]> + assert_eq!(parse_cdata(b">]]>", Seen2), Ok(1)); // ]]|>]]> + + assert_eq!(parse_cdata(b"]>]]>", Seen0), Ok(5)); // xx|]>]]> + assert_eq!(parse_cdata(b"]>]]>", Seen1), Ok(2)); // x]|]>]]> + assert_eq!(parse_cdata(b"]>]]>", Seen2), Ok(2)); // ]]|]>]]> +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs new file mode 100644 index 00000000..84813d1f --- /dev/null +++ b/src/parser/mod.rs @@ -0,0 +1,941 @@ +//! A low-level XML parser. For advanced use. It is very low-level and you +//! typically should not use it. Use a [`Reader`] instead. +//! +//! To use a parser create an instance of [`Parser`] and [`feed`] data into it. +//! After successful search the parser will return [`FeedResult`] with position +//! where match was found and returned variant will represent what exactly was +//! found. In case if the provided data is not enough to made any decision, a +//! [`FeedResult::NeedData`] is returned. Finally, if parser encounters a byte +//! that should not be there, a [`SyntaxError`] is returned. +//! +//! To fully parse a document you should pass unconsumed data to [`feed`] in a +//! loop, that means `&bytes[offset..]` for `Emit*` cases and a completely new +//! slice for a `NeedData` case: +//! +//! ``` +//! # use quick_xml::parser::Parser; +//! use quick_xml::parser::FeedResult::*; +//! let mut parser = Parser::default(); +//! // Buffer for data of one event +//! let mut buf = Vec::new(); +//! // Feed data by 3 bytes at once +//! for (i, mut chunk) in b"".chunks(3).enumerate() { +//! loop { +//! match parser.feed(chunk).unwrap() { +//! // Return to the outer loop to request new chunk +//! NeedData => break, +//! +//! EmitText(offset) | +//! EmitCData(offset) | +//! EmitComment(offset) | +//! EmitDoctype(offset) | +//! EmitPI(offset) | +//! EmitEmptyTag(offset) | +//! EmitStartTag(offset) | +//! EmitEndTag(offset) => { +//! // Append data of an event to the buffer +//! buf.extend_from_slice(&chunk[..offset]); +//! +//! // Consume already read data +//! chunk = &chunk[offset..]; +//! +//! // Emit new event using `buf` +//! // ... +//! +//! // If content of buffer is not required anymore, it can be cleared +//! buf.clear(); +//! } +//! } +//! } +//! } +//! ``` +//! +//! [`Reader`]: crate::Reader +//! [`feed`]: Parser::feed() + +use crate::errors::SyntaxError; +use cdata::CDataParser; +use quick_dtd::{CommentParser, DtdParser, PiParser, QuotedParser, OneOf}; + +mod cdata; + +/// An internal state of a parser. Used to preserve information about currently +/// parsed event between calls to [`Parser::feed()`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum State { + /// Initial state used to begin parse XML events. + Text, + + /// A `<` was seen, but nothing else. + Markup, + /// A ``. + Doctype(QuotedParser), + /// We are inside of `[]` of `` definition. + Dtd(DtdParser), + /// We are after `]` of `` definition, looking for `>`. + DoctypeFinish, + + /// A `` was not. Parser expect more data to close a tag + /// and emit [`FeedResult::EmitEmptyTag`]. + EndTag, + /// A `<*` was seen, but nothing else where `*` is an any byte, except `!`, `?`, or `/`. + /// It is unable to understand right now what data follow. + StartOrEmptyTag(QuotedParser, bool), +} + +impl Default for State { + fn default() -> Self { + Self::Text + } +} + +/// A result of feeding data into [`Parser`]. 
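+///
+/// The `usize` payload of the `Emit*` variants is an offset from the start of
+/// the chunk that was passed to [`Parser::feed()`], not from the start of the
+/// document, so the caller is expected to consume that many bytes of the chunk
+/// before feeding the rest of the data.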
+#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum FeedResult { + /// All fed bytes should be consumed, new portion should be feed + NeedData, + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Text`] should be emitted. + /// + /// [`Event::Text`]: crate::events::Event::Text + EmitText(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::CData`] should be emitted. + /// + /// [`Event::CData`]: crate::events::Event::CData + EmitCData(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Comment`] should be emitted. + /// + /// [`Event::Comment`]: crate::events::Event::Comment + EmitComment(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::DocType`] should be emitted. + /// + /// [`Event::DocType`]: crate::events::Event::DocType + EmitDoctype(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::PI`] should be emitted. + /// + /// [`Event::PI`]: crate::events::Event::PI + EmitPI(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Empty`] should be emitted. + /// + /// [`Event::Empty`]: crate::events::Event::Empty + EmitEmptyTag(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Start`] should be emitted. + /// + /// [`Event::Start`]: crate::events::Event::Start + EmitStartTag(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::End`] should be emitted. + /// + /// [`Event::End`]: crate::events::Event::End + EmitEndTag(usize), +} + +// convert `mermaid` block to a diagram +#[cfg_attr(doc, aquamarine::aquamarine)] +/// A low-level XML parser that searches a boundaries of various kinds of XML +/// events in the provided slice. 
+/// +/// The parser represents a state machine with following states: +/// +/// ```mermaid +/// flowchart TD +/// Text -->|<| Markup +/// Text -->|*| Text +/// +/// Markup --> |!| CommentOrCDataOrDoctype +/// Markup --->|?| PIParser1 +/// Markup --->|/| EndTagParser +/// Markup --> |*| StartOrEmptyTag +/// +/// CommentOrCDataOrDoctype -->|-| CommentParser +/// CommentOrCDataOrDoctype -->|D| DoctypeParser1 +/// CommentOrCDataOrDoctype -->|d| DoctypeParser1 +/// CommentOrCDataOrDoctype -->|"["| CDataParser1 +/// CommentOrCDataOrDoctype -->|*| Error +/// +/// subgraph comment +/// CommentParser -->|-| CommentContent1 +/// CommentParser ----->|*| CommentError +/// +/// CommentContent1 -->|-| CommentContent2 +/// CommentContent1 -->|*| CommentContent1 +/// +/// CommentContent2 -->|-| CommentContent3 +/// CommentContent2 -->|*| CommentContent1 +/// +/// CommentContent3 -->|>| Comment +/// CommentContent3 -->|*| CommentContent1 +/// end +/// subgraph doctype +/// DoctypeParser1 -->|O| DoctypeParser2 +/// DoctypeParser1 -->|o| DoctypeParser2 +/// DoctypeParser1 ---->|*| DoctypeError +/// +/// DoctypeParser2 -->|C| DoctypeParser3 +/// DoctypeParser2 -->|c| DoctypeParser3 +/// DoctypeParser2 ---->|*| DoctypeError +/// +/// DoctypeParser3 -->|T| DoctypeParser4 +/// DoctypeParser3 -->|t| DoctypeParser4 +/// DoctypeParser3 ---->|*| DoctypeError +/// +/// DoctypeParser4 -->|Y| DoctypeParser5 +/// DoctypeParser4 -->|y| DoctypeParser5 +/// DoctypeParser4 ---->|*| DoctypeError +/// +/// DoctypeParser5 -->|P| DoctypeParser6 +/// DoctypeParser5 -->|p| DoctypeParser6 +/// DoctypeParser5 ---->|*| DoctypeError +/// +/// DoctypeParser6 -->|E| DoctypeContent1 +/// DoctypeParser6 -->|e| DoctypeContent1 +/// DoctypeParser6 ---->|*| DoctypeError +/// +/// DoctypeContent1 -->|!| DoctypeContent2 +/// DoctypeContent1 -->|*| DoctypeContent1 +/// +/// DoctypeContent2 -->|>| Doctype +/// DoctypeContent2 -->|*| DoctypeContent1 +/// end +/// subgraph cdata +/// CDataParser1 -->|C| CDataParser2 +/// CDataParser1 ----->|*| CDataError +/// CDataParser2 -->|D| CDataParser3 +/// CDataParser2 ----->|*| CDataError +/// CDataParser3 -->|A| CDataParser4 +/// CDataParser3 ----->|*| CDataError +/// CDataParser4 -->|T| CDataParser5 +/// CDataParser4 ----->|*| CDataError +/// CDataParser5 -->|A| CDataParser6 +/// CDataParser5 ----->|*| CDataError +/// CDataParser6 -->|"["| CDataContent1 +/// CDataParser6 ----->|*| CDataError +/// +/// CDataContent1 -->|"]"| CDataContent2 +/// CDataContent1 -->|*| CDataContent1 +/// +/// CDataContent2 -->|"]"| CDataContent3 +/// CDataContent2 -->|*| CDataContent1 +/// +/// CDataContent3 -->|>| CData +/// CDataContent3 -->|*| CDataContent1 +/// end +/// +/// subgraph pi_parser +/// PIParser1 -->|?| PIParser2 +/// PIParser1 -->|*| PIParser1 +/// +/// PIParser2 -->|>| PI +/// PIParser2 -->|*| PIError +/// end +/// +/// subgraph end_tag +/// EndTagParser -->|>| EndTag +/// EndTagParser -->|*| EndTagError +/// end +/// +/// StartOrEmptyTag --> |/| EmptyTagParser +/// StartOrEmptyTag --->|>| StartTag +/// StartOrEmptyTag --> |*| StartOrEmptyTag +/// +/// subgraph empty_tag +/// EmptyTagParser -->|>| EmptyTag +/// EmptyTagParser -->|*| EmptyTagError +/// end +/// ``` +/// +/// Every arrow on that diagram is marked with a byte that initiates that transition. +/// Transition marked with asterisks (`*`) represents any byte except explicitly +/// mentioned in other transitions from that state. +/// +/// Each `Error` state on that diagram represents a [`SyntaxError`]. 
+/// Every successful match (`Emit*`) returns the parser to its initial state `Text`. +#[derive(Copy, Clone, Default, Debug, Eq, PartialEq)] +pub struct Parser(State); +impl Parser { + /// Performs parsing of the provided byte slice and returns the outcome. + /// See [`Parser`] for more info. + /// + /// # Parameters + /// - `bytes`: a slice to search a new XML event. Should contain text in + /// ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Result { + for (offset, &byte) in bytes.iter().enumerate() { + let trail = &bytes[offset..]; + let start = offset + 1; + let rest = &bytes[start..]; + self.0 = match self.0 { + State::Text => match byte { + b'<' => State::Markup, + _ => return Ok(self.parse_text(trail, offset)), + }, + State::Markup => match byte { + b'!' => State::MaybeCommentOrCDataOrDoctype, + b'?' => return Ok(self.parse_pi(rest, start, PiParser::default())), + b'/' => return Ok(self.parse_end(rest, start)), + _ => { + return Ok(self.parse_start_or_empty( + trail, + offset, + QuotedParser::Outside, + false, + )) + } + }, + State::MaybeCommentOrCDataOrDoctype => match byte { + b'-' => State::MaybeComment, + b'[' => State::MaybeCData1, + b'D' | b'd' => State::MaybeDoctype1, + _ => return Err(SyntaxError::InvalidBangMarkup), + }, + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeComment => match byte { + b'-' => return Ok(self.parse_comment(rest, start, CommentParser::default())), + _ => return Err(SyntaxError::UnclosedComment), + }, + State::Comment(parser) => { + return Ok(self.parse_comment(trail, offset, parser)); + } + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeCData1 => match byte { + b'C' => State::MaybeCData2, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData2 => match byte { + b'D' => State::MaybeCData3, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData3 => match byte { + b'A' => State::MaybeCData4, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData4 => match byte { + b'T' => State::MaybeCData5, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData5 => match byte { + b'A' => State::MaybeCData6, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData6 => match byte { + b'[' => return Ok(self.parse_cdata(rest, start, CDataParser::default())), + _ => return Err(SyntaxError::UnclosedCData), + }, + State::CData(parser) => return Ok(self.parse_cdata(trail, offset, parser)), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeDoctype1 => match byte { + b'O' | b'o' => State::MaybeDoctype2, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype2 => match byte { + b'C' | b'c' => State::MaybeDoctype3, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype3 => match byte { + b'T' | b't' => State::MaybeDoctype4, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype4 => match byte { + b'Y' | b'y' => State::MaybeDoctype5, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype5 => match byte { + b'P' | b'p' => State::MaybeDoctype6, + _ => return 
Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype6 => match byte { + b'E' | b'e' => return self.parse_doctype(rest, start, QuotedParser::Outside), + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::Doctype(parser) => return self.parse_doctype(trail, offset, parser), + State::Dtd(parser) => return self.parse_dtd(trail, offset, parser), + State::DoctypeFinish => return Ok(self.parse_doctype_finish(trail, offset)), + + State::PI(parser) => return Ok(self.parse_pi(trail, offset, parser)), + State::EndTag => return Ok(self.parse_end(trail, offset)), + State::StartOrEmptyTag(parser, has_slash) => { + return Ok(self.parse_start_or_empty(trail, offset, parser, has_slash)); + } + } + } + Ok(FeedResult::NeedData) + } + + /// This method should be called when all data was feed into parser. + /// + /// If parser in intermediate state it will return a corresponding syntax + /// error, otherwise it returns successfully. + // rustfmt tend to move pipes to the begin of a line which ruins the nice look + #[rustfmt::skip] + pub fn finish(self) -> Result<(), SyntaxError> { + match self.0 { + State::Markup | + State::StartOrEmptyTag(..) | + State::EndTag => Err(SyntaxError::UnclosedTag), + + State::MaybeCommentOrCDataOrDoctype => Err(SyntaxError::InvalidBangMarkup), + + State::MaybeComment | + State::Comment(_) => Err(SyntaxError::UnclosedComment), + + State::MaybeCData1 | + State::MaybeCData2 | + State::MaybeCData3 | + State::MaybeCData4 | + State::MaybeCData5 | + State::MaybeCData6 | + State::CData(_) => Err(SyntaxError::UnclosedCData), + + State::MaybeDoctype1 | + State::MaybeDoctype2 | + State::MaybeDoctype3 | + State::MaybeDoctype4 | + State::MaybeDoctype5 | + State::MaybeDoctype6 | + State::Doctype(_) | + State::Dtd(_) | + State::DoctypeFinish => Err(SyntaxError::UnclosedDoctype), + + State::PI(_) => Err(SyntaxError::UnclosedPIOrXmlDecl), + State::Text => Ok(()), + } + } + + /// Check if parser currently parses text + #[inline] + pub fn is_text_parsing(&self) -> bool { + self.0 == State::Text + } + + /// Text cannot contain `<` inside, so we emit it as soon as we find `<`. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a text content + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + #[inline] + fn parse_text(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + match bytes.iter().position(|&b| b == b'<') { + Some(i) => FeedResult::EmitText(offset + i), + None => FeedResult::NeedData, + } + } + + /// Determines the end position of a comment in the provided slice. + /// Comment ends on the first occurrence of `-->` which cannot be escaped. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a comment content + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `dashes_left`: count of dashes that wasn't seen yet in the end of previous data chunk + fn parse_comment( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: CommentParser, + ) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Text; + FeedResult::EmitComment(offset + i) + } + None => { + self.0 = State::Comment(parser); + FeedResult::NeedData + } + } + } + + /// Determines the end position of a CDATA block in the provided slice. 
+ /// CDATA block ends on the first occurrence of `]]>` which cannot be escaped. + /// + /// `` can contain `>` inside. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a CDATA content + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `braces_left`: count of braces that wasn't seen yet in the end of previous data chunk + fn parse_cdata(&mut self, bytes: &[u8], offset: usize, mut parser: CDataParser) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Text; + FeedResult::EmitCData(offset + i) + } + None => { + self.0 = State::CData(parser); + FeedResult::NeedData + } + } + } + + fn parse_doctype( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + ) -> Result { + // Search `[` (start of DTD definitions) or `>` (end of tag) + match parser.one_of(bytes) { + OneOf::Open(i) => self.parse_dtd(&bytes[i..], offset + i, DtdParser::default()), + OneOf::Close(i) => { + self.0 = State::Text; + // +1 for `>` which should be included in event + Ok(FeedResult::EmitDoctype(offset + i + 1)) + } + OneOf::None => { + self.0 = State::Doctype(parser); + Ok(FeedResult::NeedData) + } + } + } + + /// Skips DTD representation, correctly following DTD grammar. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on a byte that would represent first byte of DTD event + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `parser`: the DTD parser persisted between `feed()` calls + fn parse_dtd( + &mut self, + mut bytes: &[u8], + mut offset: usize, + mut parser: DtdParser, + ) -> Result { + loop { + let result = match parser.feed(bytes) { + // Skip recognized DTD structure + // TODO: Emit DTD events while parsing + quick_dtd::FeedResult::EmitPI(off) + | quick_dtd::FeedResult::EmitAttList(off) + | quick_dtd::FeedResult::EmitComment(off) + | quick_dtd::FeedResult::EmitElement(off) + | quick_dtd::FeedResult::EmitEntity(off) + | quick_dtd::FeedResult::EmitNotation(off) => { + bytes = &bytes[off..]; + offset += off; + continue; + } + + // `]` finishes DOCTYPE subsets: + // After that we should find the close `>` + quick_dtd::FeedResult::Unexpected(off, b']') => { + return Ok(self.parse_doctype_finish(&bytes[off..], offset + off)) + } + // Other bytes not expected, so return error + quick_dtd::FeedResult::Unexpected(..) => Err(SyntaxError::UnclosedDoctype), + quick_dtd::FeedResult::NeedData => Ok(FeedResult::NeedData), + }; + self.0 = State::Dtd(parser); + return result; + } + } + + fn parse_doctype_finish(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + match bytes.iter().position(|&b| b == b'>') { + Some(i) => { + self.0 = State::Text; + // +1 for `>` which should be included in event + FeedResult::EmitDoctype(offset + i + 1) + } + None => { + self.0 = State::DoctypeFinish; + FeedResult::NeedData + } + } + } + + /// Determines the end position of a processing instruction in the provided slice. + /// Processing instruction ends on the first occurrence of `?>` which cannot be + /// escaped. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. 
+ /// That sub-slice begins on the byte that represents a PI target + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `has_mark`: a flag that indicates was the previous fed data ended with `?` + fn parse_pi(&mut self, bytes: &[u8], offset: usize, mut parser: PiParser) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Text; + FeedResult::EmitPI(offset + i) + } + None => { + self.0 = State::PI(parser); + FeedResult::NeedData + } + } + } + + /// Determines the end position of an end tag in the provided slice. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a tag name + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + fn parse_end(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + match bytes.iter().position(|&b| b == b'>') { + Some(i) => { + self.0 = State::Text; + // +1 for `>` which should be included in event + FeedResult::EmitEndTag(offset + i + 1) + } + None => { + self.0 = State::EndTag; + FeedResult::NeedData + } + } + } + + /// Determines the end position of a start or empty tag in the provided slice. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a second byte of + /// a tag name + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `parser`: the state of a quotes used to skip `>` inside attribute values + /// - `has_slash`: a flag that indicates was the previous fed data ended with `/` + fn parse_start_or_empty( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + has_slash: bool, + ) -> FeedResult { + match parser.feed(bytes) { + Some(0) if has_slash => { + self.0 = State::Text; + // +1 for `>` which should be included in event + FeedResult::EmitEmptyTag(offset + 1) + } + Some(i) => { + self.0 = State::Text; + // This slash cannot follow immediately after `<`, because otherwise + // we would be in a `parse_end` and not here + if i > 0 && bytes[i - 1] == b'/' { + // +1 for `>` which should be included in event + FeedResult::EmitEmptyTag(offset + i + 1) + } else { + // +1 for `>` which should be included in event + FeedResult::EmitStartTag(offset + i + 1) + } + } + None => { + self.0 = State::StartOrEmptyTag(parser, bytes.last().copied() == Some(b'/')); + FeedResult::NeedData + } + } + } +} + +#[cfg(test)] +mod tests { + use super::FeedResult::*; + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn text() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b"text with > symbol"), Ok(NeedData)); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b"text with < symbol"), Ok(EmitText(10))); + // ^^^^^^^^^^ + assert_eq!(parser.0, State::Text); + } + + #[test] + fn cdata() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitCData(1))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitCData(2))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitCData(1))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b" ]]>"), + // 0 ^ = 40 + Ok(EmitCData(41)) + ); + assert_eq!(parser.0, State::Text); + } + + 
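The tests in this module feed the parser chunk by chunk and check the reported offsets. Seen from the caller's side, the same pull loop looks roughly like the sketch below; it assumes `Parser` and `FeedResult` are reachable as `quick_xml::parser::{Parser, FeedResult}` (the paths used by the module docs), and `split_events` is a hypothetical helper, not an API of the crate:

```rust
use quick_xml::parser::{FeedResult, Parser};

/// Splits a complete document into raw event slices by repeatedly feeding
/// the remaining input and consuming as many bytes as the parser reports.
fn split_events(mut input: &[u8]) -> Vec<&[u8]> {
    let mut parser = Parser::default();
    let mut events = Vec::new();
    loop {
        match parser.feed(input).expect("syntax error") {
            // No complete event in the rest of the input: check that we are
            // not in the middle of markup, then treat the tail as text.
            FeedResult::NeedData => {
                parser.finish().expect("truncated document");
                if !input.is_empty() {
                    events.push(input);
                }
                return events;
            }
            // Every `Emit*` variant carries the length of the event in bytes.
            FeedResult::EmitText(len)
            | FeedResult::EmitCData(len)
            | FeedResult::EmitComment(len)
            | FeedResult::EmitDoctype(len)
            | FeedResult::EmitPI(len)
            | FeedResult::EmitEmptyTag(len)
            | FeedResult::EmitStartTag(len)
            | FeedResult::EmitEndTag(len) => {
                let (event, rest) = input.split_at(len);
                events.push(event);
                input = rest;
            }
        }
    }
}

fn main() {
    let events = split_events(b"<root>text</root>");
    assert_eq!(events, vec![&b"<root>"[..], &b"text"[..], &b"</root>"[..]]);
}
```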
#[test] + fn comment() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(NeedData)); + assert!(matches!(parser.0, State::Comment(_))); + assert_eq!(parser.feed(b"-->"), Ok(EmitComment(3))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b""), + // 0 ^ = 31 + Ok(EmitComment(32)) + ); + assert_eq!(parser.0, State::Text); + } + + mod doctype { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn only_name() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitDoctype(15))); + // 0 ^ = 14 + assert_eq!(parser.0, State::Text); + } + + #[test] + fn with_external_id() { + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"']\">"), + // 0 ^ = 28 + Ok(EmitDoctype(29)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"\"]'>"), + // 0 ^ = 28 + Ok(EmitDoctype(29)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"\"]'>"), + // 0 ^ = 32 + Ok(EmitDoctype(33)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"']\">"), + // 0 ^ = 31 + Ok(EmitDoctype(32)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn with_subset() { + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"'>]>"), + // 0 ^ = 33 + Ok(EmitDoctype(34)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"'\" []>"), + // 0 ^ = 29 + Ok(EmitDoctype(30)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"\"' []>"), + // 0 ^ = 29 + Ok(EmitDoctype(30)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"\"' []>"), + // 0 ^ = 33 + Ok(EmitDoctype(34)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"'\" []>"), + // 0 ^ = 32 + Ok(EmitDoctype(33)) + ); + assert_eq!(parser.0, State::Text); + } + } + + #[test] + fn pi() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitPI(4))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitPI(10))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b"?>"), Ok(EmitPI(5))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitPI(5))); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn empty() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitEmptyTag(8))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"\" two='\"/>'/>"), + Ok(EmitEmptyTag(28)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn start() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b"<>"), Ok(EmitStartTag(2))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitStartTag(7))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!( + parser.feed(b"\" two='\">'>"), + Ok(EmitStartTag(25)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn end() { + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitEndTag(6))); + 
assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitEndTag(7))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::default(); + assert_eq!(parser.feed(b""), Ok(EmitEndTag(3))); + assert_eq!(parser.0, State::Text); + } +} From 327d46e098395ec7ce8cb050d8de2ebdd801798e Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 26 Sep 2023 21:39:01 +0500 Subject: [PATCH 03/22] Add support for encoding detection to a parser --- src/parser/bom.rs | 148 ++++++++++++++++++++++++++++++++++++++++++++++ src/parser/mod.rs | 146 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 258 insertions(+), 36 deletions(-) create mode 100644 src/parser/bom.rs diff --git a/src/parser/bom.rs b/src/parser/bom.rs new file mode 100644 index 00000000..992abb58 --- /dev/null +++ b/src/parser/bom.rs @@ -0,0 +1,148 @@ +//! A parser for encoding detection using BOM and heuristics. + +/// A result of feeding data into [`BomParser`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum FeedResult { + /// All fed bytes should be consumed, new portion should be feed. + NeedData, + /// Encoding detected as UTF-16 Big-Endian based on the first 4 bytes of content. + /// Nothing should be consumed. + Utf16Be, + /// Encoding detected as UTF-16 Little-Endian based on the first 4 bytes of content. + /// Nothing should be consumed. + Utf16Le, + /// Encoding detected as UTF-8 on the first 4 bytes of content. + /// Nothing should be consumed. + Utf8, + /// Encoding detected as UTF-16 Big-Endian based on the first 4 bytes of content. + /// The 2 bytes of BOM should be consumed. + Utf16BeBom, + /// Encoding detected as UTF-16 Little-Endian based on the first 4 bytes of content. + /// The 2 bytes of BOM should be consumed. + Utf16LeBom, + /// Encoding detected as UTF-8 based on the first 3 bytes of content. + /// The 3 bytes of BOM should be consumed. + Utf8Bom, + /// Encoding was not recognized. Nothing should be consumed. + Unknown, +} + +/// Implements automatic encoding detection of XML using the +/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing). +/// +/// IF encoding was not recognized, [`FeedResult::Unknown`] is returned, otherwise +/// `Utf*` variant is returned. +/// +/// Because the [`encoding_rs`] crate supports only subset of those encodings, only +/// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. 
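The detection table below can also be read as a plain `match` over the first bytes of the input. The sketch that follows restates it using the optional `encoding_rs` dependency, which is what the reader layer ultimately selects from these results; `guess_encoding` is a hypothetical helper used only for illustration:

```rust
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};

/// Restates the table: given the first bytes of a document, pick an initial
/// encoding and report how many BOM bytes to skip. `None` means "not
/// recognized"; callers then fall back to UTF-8 and refine the choice from
/// the XML declaration.
fn guess_encoding(start: &[u8]) -> Option<(&'static Encoding, usize)> {
    match start {
        // BOM present: skip it
        [0xFE, 0xFF, ..] => Some((UTF_16BE, 2)),
        [0xFF, 0xFE, ..] => Some((UTF_16LE, 2)),
        [0xEF, 0xBB, 0xBF, ..] => Some((UTF_8, 3)),
        // No BOM: guess from the start of the XML declaration, skip nothing
        [0x00, b'<', 0x00, b'?', ..] => Some((UTF_16BE, 0)),
        [b'<', 0x00, b'?', 0x00, ..] => Some((UTF_16LE, 0)),
        [b'<', b'?', b'x', b'm', ..] => Some((UTF_8, 0)),
        _ => None,
    }
}

fn main() {
    assert_eq!(guess_encoding(b"\xFF\xFE<\0?\0"), Some((UTF_16LE, 2)));
    assert_eq!(guess_encoding(b"<?xml version='1.0'?>"), Some((UTF_8, 0)));
    assert_eq!(guess_encoding(b"<root/>"), None);
}
```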
+///
+/// The algorithm suggests examining up to the first 4 bytes to determine encoding
+/// according to the following table:
+///
+/// | Bytes       |Detected encoding
+/// |-------------|------------------------------------------
+/// | **BOM**
+/// |`FE FF ## ##`|UTF-16, big-endian
+/// |`FF FE ## ##`|UTF-16, little-endian
+/// |`EF BB BF`   |UTF-8
+/// | **No BOM**
+/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
+/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
+/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[allow(non_camel_case_types)]
+pub enum BomParser {
+    X00,
+    X00_3C,
+    X00_3C_00,
+
+    X3C,
+    X3C_00,
+    X3C_00_3F,
+
+    X3C_3F,
+    X3C_3F_78, // <?x
+
+    XFE,
+    XFF,
+
+    XEF,
+    XEF_BB,
+}
+
+impl BomParser {
+    pub fn feed(&mut self, bytes: &[u8]) -> FeedResult {
+        for &byte in bytes.iter() {
+            *self = match self {
+                //----------------------------------------------------------------------------------
+                // UTF-16 BE without BOM 00 < 00 ?
+                //----------------------------------------------------------------------------------
+                Self::X00 => match byte {
+                    b'<' => Self::X00_3C,
+                    _ => return FeedResult::Unknown,
+                },
+                Self::X00_3C => match byte {
+                    0x00 => Self::X00_3C_00,
+                    _ => return FeedResult::Unknown,
+                },
+                Self::X00_3C_00 => match byte {
+                    b'?' => return FeedResult::Utf16Be,
+                    _ => return FeedResult::Unknown,
+                },
+                //----------------------------------------------------------------------------------
+                // UTF-16 LE without BOM < 00 ? 00
+                //----------------------------------------------------------------------------------
+                Self::X3C => match byte {
+                    0x00 => Self::X3C_00,
+                    b'?' => Self::X3C_3F,
+                    _ => return FeedResult::Unknown,
+                },
+                Self::X3C_00 => match byte {
+                    b'?' => Self::X3C_00_3F,
+                    _ => return FeedResult::Unknown,
+                },
+                Self::X3C_00_3F => match byte {
+                    0x00 => return FeedResult::Utf16Le,
+                    _ => return FeedResult::Unknown,
+                },
+                //----------------------------------------------------------------------------------
+                // UTF-8-like without BOM < ?
x m + //---------------------------------------------------------------------------------- + Self::X3C_3F => match byte { + b'x' => Self::X3C_3F_78, + _ => return FeedResult::Unknown, + }, + Self::X3C_3F_78 => match byte { + b'm' => return FeedResult::Utf8, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-16 BE with BOM FE FF + //---------------------------------------------------------------------------------- + Self::XFE => match byte { + 0xFF => return FeedResult::Utf16BeBom, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-16 LE with BOM FF FE + //---------------------------------------------------------------------------------- + Self::XFF => match byte { + 0xFE => return FeedResult::Utf16LeBom, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-8 with BOM EF BB + //---------------------------------------------------------------------------------- + Self::XEF => match byte { + 0xBB => Self::XEF_BB, + _ => return FeedResult::Unknown, + }, + Self::XEF_BB => match byte { + 0xBF => return FeedResult::Utf8Bom, + _ => return FeedResult::Unknown, + }, + } + } + FeedResult::NeedData + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 84813d1f..e6a6daee 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -15,6 +15,8 @@ //! ``` //! # use quick_xml::parser::Parser; //! use quick_xml::parser::FeedResult::*; +//! // Use `without_encoding_detection` instead if you don't want +//! // automatic encoding detection //! let mut parser = Parser::default(); //! // Buffer for data of one event //! let mut buf = Vec::new(); @@ -25,6 +27,12 @@ //! // Return to the outer loop to request new chunk //! NeedData => break, //! +//! EncodingUtf8Like(offset) | +//! EncodingUtf16BeLike(offset) | +//! EncodingUtf16LeLike(offset) => { +//! // Consume BOM, but do not add it to the data +//! chunk = &chunk[offset..]; +//! } //! EmitText(offset) | //! EmitCData(offset) | //! EmitComment(offset) | @@ -54,9 +62,11 @@ //! [`feed`]: Parser::feed() use crate::errors::SyntaxError; +use bom::BomParser; use cdata::CDataParser; use quick_dtd::{CommentParser, DtdParser, PiParser, QuotedParser, OneOf}; +mod bom; mod cdata; /// An internal state of a parser. Used to preserve information about currently @@ -64,6 +74,8 @@ mod cdata; #[derive(Copy, Clone, Debug, Eq, PartialEq)] enum State { /// Initial state used to begin parse XML events. + Start, + Bom(BomParser), Text, /// A `<` was seen, but nothing else. @@ -128,7 +140,7 @@ enum State { impl Default for State { fn default() -> Self { - Self::Text + Self::Start } } @@ -137,6 +149,20 @@ impl Default for State { pub enum FeedResult { /// All fed bytes should be consumed, new portion should be feed NeedData, + + /// The specified amount of bytes should be consumed from the input and + /// encoding of the document set to the UTF-8 compatible. + /// The encoding should be refined after reading XML declaration. + EncodingUtf8Like(usize), + /// The specified amount of bytes should be consumed from the input and + /// encoding of the document set to the UTF-16 Big-Endian compatible. + /// The encoding should be refined after reading XML declaration. 
+ EncodingUtf16BeLike(usize), + /// The specified amount of bytes should be consumed from the input and + /// encoding of the document set to the UTF-16 Little-Endian compatible. + /// The encoding should be refined after reading XML declaration. + EncodingUtf16LeLike(usize), + /// The specified amount of bytes should be consumed from the input and /// [`Event::Text`] should be emitted. /// @@ -301,10 +327,17 @@ pub enum FeedResult { /// mentioned in other transitions from that state. /// /// Each `Error` state on that diagram represents a [`SyntaxError`]. -/// Every successful match (`Emit*`) returns the parser to its initial state `Text`. +/// Every successful match (`Emit*`) returns the parser to state `Text`. #[derive(Copy, Clone, Default, Debug, Eq, PartialEq)] pub struct Parser(State); impl Parser { + /// Creates a parser that would not try to guess encoding from the input text. + /// This is useful when you already knows the encoding and parses a part of document. + #[inline] + pub fn without_encoding_detection() -> Self { + Self(State::Text) + } + /// Performs parsing of the provided byte slice and returns the outcome. /// See [`Parser`] for more info. /// @@ -317,6 +350,28 @@ impl Parser { let start = offset + 1; let rest = &bytes[start..]; self.0 = match self.0 { + State::Start => match byte { + 0x00 => State::Bom(BomParser::X00), + b'<' => State::Bom(BomParser::X3C), + 0xEF => State::Bom(BomParser::XEF), + 0xFE => State::Bom(BomParser::XFE), + 0xFF => State::Bom(BomParser::XFF), + _ => return Ok(self.parse_text(trail, offset)), + }, + State::Bom(ref mut parser) => { + let encoding = match parser.feed(trail) { + bom::FeedResult::Unknown => FeedResult::EncodingUtf8Like(0), + bom::FeedResult::Utf8 => FeedResult::EncodingUtf8Like(0), + bom::FeedResult::Utf16Be => FeedResult::EncodingUtf16BeLike(0), + bom::FeedResult::Utf16Le => FeedResult::EncodingUtf16LeLike(0), + bom::FeedResult::Utf8Bom => FeedResult::EncodingUtf8Like(3), + bom::FeedResult::Utf16BeBom => FeedResult::EncodingUtf16BeLike(2), + bom::FeedResult::Utf16LeBom => FeedResult::EncodingUtf16LeLike(2), + bom::FeedResult::NeedData => return Ok(FeedResult::NeedData), + }; + self.0 = State::Text; + return Ok(encoding); + } State::Text => match byte { b'<' => State::Markup, _ => return Ok(self.parse_text(trail, offset)), @@ -430,6 +485,25 @@ impl Parser { #[rustfmt::skip] pub fn finish(self) -> Result<(), SyntaxError> { match self.0 { + // If nothing was fed into parser, document is empty. + // We allow empty documents, at least for now + State::Start | + State::Text => Ok(()), + + // We need data when we tried to determine document encoding + // < + State::Bom(BomParser::X00_3C) | + State::Bom(BomParser::X00_3C_00) | + State::Bom(BomParser::X3C) | + State::Bom(BomParser::X3C_00) => Err(SyntaxError::UnclosedTag), + // Err(SyntaxError::UnclosedPIOrXmlDecl), + // Threat unrecognized BOMs as text + State::Bom(_) => Ok(()), + State::Markup | State::StartOrEmptyTag(..) 
| State::EndTag => Err(SyntaxError::UnclosedTag), @@ -458,7 +532,6 @@ impl Parser { State::DoctypeFinish => Err(SyntaxError::UnclosedDoctype), State::PI(_) => Err(SyntaxError::UnclosedPIOrXmlDecl), - State::Text => Ok(()), } } @@ -476,6 +549,7 @@ impl Parser { /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` #[inline] fn parse_text(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + self.0 = State::Text; match bytes.iter().position(|&b| b == b'<') { Some(i) => FeedResult::EmitText(offset + i), None => FeedResult::NeedData, @@ -700,11 +774,11 @@ mod tests { #[test] fn text() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b"text with > symbol"), Ok(NeedData)); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b"text with < symbol"), Ok(EmitText(10))); // ^^^^^^^^^^ assert_eq!(parser.0, State::Text); @@ -712,7 +786,7 @@ mod tests { #[test] fn cdata() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitCData(1))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitCData(2))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitCData(1))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b" ]]>"), // 0 ^ = 40 @@ -745,7 +819,7 @@ mod tests { #[test] fn comment() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(NeedData)); assert!(matches!(parser.0, State::Comment(_))); assert_eq!(parser.feed(b"-->"), Ok(EmitComment(3))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b""), // 0 ^ = 31 @@ -788,7 +862,7 @@ mod tests { #[test] fn only_name() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitDoctype(15))); // 0 ^ = 14 assert_eq!(parser.0, State::Text); @@ -796,7 +870,7 @@ mod tests { #[test] fn with_external_id() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"']\">"), // 0 ^ = 28 @@ -804,7 +878,7 @@ mod tests { ); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"\"]'>"), // 0 ^ = 28 @@ -812,7 +886,7 @@ mod tests { ); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"\"]'>"), // 0 ^ = 32 @@ -820,7 +894,7 @@ mod tests { ); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"']\">"), // 0 ^ = 31 @@ -831,7 +905,7 @@ mod tests { #[test] fn with_subset() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"'>]>"), // 0 ^ = 33 @@ -839,7 +913,7 @@ mod tests { ); 
assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"'\" []>"), // 0 ^ = 29 @@ -847,7 +921,7 @@ mod tests { ); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"\"' []>"), // 0 ^ = 29 @@ -855,7 +929,7 @@ mod tests { ); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"\"' []>"), // 0 ^ = 33 @@ -863,7 +937,7 @@ mod tests { ); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"'\" []>"), // 0 ^ = 32 @@ -875,30 +949,30 @@ mod tests { #[test] fn pi() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitPI(4))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitPI(10))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b"?>"), Ok(EmitPI(5))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitPI(5))); assert_eq!(parser.0, State::Text); } #[test] fn empty() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitEmptyTag(8))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"\" two='\"/>'/>"), Ok(EmitEmptyTag(28)) @@ -908,15 +982,15 @@ mod tests { #[test] fn start() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b"<>"), Ok(EmitStartTag(2))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitStartTag(7))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!( parser.feed(b"\" two='\">'>"), Ok(EmitStartTag(25)) @@ -926,15 +1000,15 @@ mod tests { #[test] fn end() { - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitEndTag(6))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitEndTag(7))); assert_eq!(parser.0, State::Text); - let mut parser = Parser::default(); + let mut parser = Parser::without_encoding_detection(); assert_eq!(parser.feed(b""), Ok(EmitEndTag(3))); assert_eq!(parser.0, State::Text); } From 8456be3d036050307ea896f321ad87f873929f8b Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 25 Sep 2023 22:21:37 +0500 Subject: [PATCH 04/22] Remove dependency from XmlSource from NsReader XmlSource will be removed soon, so cleaning up in advance --- src/reader/ns_reader.rs | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 
d5b79e78..3a83a5be 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -13,7 +13,7 @@ use std::path::Path; use crate::errors::Result; use crate::events::Event; use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; -use crate::reader::{Config, Reader, Span, XmlSource}; +use crate::reader::{Config, Reader, Span}; /// A low level encoding-agnostic XML event reader that performs namespace resolution. /// @@ -25,7 +25,7 @@ pub struct NsReader { ns_resolver: NamespaceResolver, /// We cannot pop data from the namespace stack until returned `Empty` or `End` /// event will be processed by the user, so we only mark that we should that - /// in the next [`Self::read_event_impl()`] call. + /// in the next [`Self::read_event()`] call. pending_pop: bool, } @@ -61,15 +61,6 @@ impl NsReader { } } - fn read_event_impl<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.pop(); - let event = self.reader.read_event_impl(buf); - self.process_event(event) - } - pub(super) fn pop(&mut self) { if self.pending_pop { self.ns_resolver.pop(); @@ -85,13 +76,13 @@ impl NsReader { } Ok(Event::Empty(e)) => { self.ns_resolver.push(&e)?; - // notify next `read_event_impl()` invocation that it needs to pop this + // notify next `read_event*()` invocation that it needs to pop this // namespace scope self.pending_pop = true; Ok(Event::Empty(e)) } Ok(Event::End(e)) => { - // notify next `read_event_impl()` invocation that it needs to pop this + // notify next `read_event*()` invocation that it needs to pop this // namespace scope self.pending_pop = true; Ok(Event::End(e)) @@ -351,7 +342,9 @@ impl NsReader { /// [`read_resolved_event_into()`]: Self::read_resolved_event_into #[inline] pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) + self.pop(); + let event = self.reader.read_event_into(buf); + self.process_event(event) } /// Reads the next event into given buffer and resolves its namespace (if applicable). 
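These hunks only reroute the internal calls; resolving namespaces from user code is unchanged. A short usage sketch against the existing public `NsReader` API:

```rust
use quick_xml::events::Event;
use quick_xml::name::{Namespace, ResolveResult};
use quick_xml::reader::NsReader;

fn main() -> Result<(), quick_xml::Error> {
    let mut reader = NsReader::from_str(r#"<x:tag xmlns:x="urn:example"/>"#);
    // `read_resolved_event()` pairs every event with the namespace of its name.
    match reader.read_resolved_event()? {
        (ResolveResult::Bound(Namespace(ns)), Event::Empty(e)) => {
            assert_eq!(ns, b"urn:example");
            assert_eq!(e.local_name().as_ref(), b"tag");
        }
        other => panic!("unexpected event: {:?}", other),
    }
    Ok(())
}
```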
@@ -415,7 +408,9 @@ impl NsReader { &mut self, buf: &'b mut Vec, ) -> Result<(ResolveResult, Event<'b>)> { - let event = self.read_event_impl(buf); + self.pop(); + let event = self.reader.read_event_into(buf); + let event = self.process_event(event); self.resolve_event(event) } @@ -595,7 +590,9 @@ impl<'i> NsReader<&'i [u8]> { /// [`read_resolved_event()`]: Self::read_resolved_event #[inline] pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) + self.pop(); + let event = self.reader.read_event(); + self.process_event(event) } /// Reads the next event, borrow its content from the input buffer, and resolves @@ -659,7 +656,9 @@ impl<'i> NsReader<&'i [u8]> { /// [`read_event()`]: Self::read_event #[inline] pub fn read_resolved_event(&mut self) -> Result<(ResolveResult, Event<'i>)> { - let event = self.read_event_impl(()); + self.pop(); + let event = self.reader.read_event(); + let event = self.process_event(event); self.resolve_event(event) } From 9b66833b984fe5cae739a4e189b0fbb8fd3561fb Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 25 Sep 2023 23:30:15 +0500 Subject: [PATCH 05/22] Remove tests of internals that would be removed soon --- src/reader/async_tokio.rs | 3 - src/reader/buffered_reader.rs | 8 - src/reader/mod.rs | 675 ---------------------------------- src/reader/slice_reader.rs | 8 - 4 files changed, 694 deletions(-) diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 1cdab220..702bb6d6 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -369,14 +369,11 @@ impl NsReader { #[cfg(test)] mod test { - use super::TokioAdapter; use crate::reader::test::{check, small_buffers}; check!( #[tokio::test] read_event_into_async, - read_until_close_async, - TokioAdapter, &mut Vec::new(), async, await ); diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 84f65875..f538527b 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -402,18 +402,10 @@ impl Reader> { #[cfg(test)] mod test { use crate::reader::test::{check, small_buffers}; - use crate::reader::XmlSource; - - /// Default buffer constructor just pass the byte array from the test - fn identity(input: T) -> T { - input - } check!( #[test] read_event_impl, - read_until_close, - identity, &mut Vec::new() ); diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 6ccbdf54..5bea28f3 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1019,685 +1019,10 @@ mod test { ( #[$test:meta] $read_event:ident, - $read_until_close:ident, - // constructor of the XML source on which internal functions will be called - $source:path, // constructor of the buffer to which read data will stored $buf:expr $(, $async:ident, $await:ident)? ) => { - mod read_bytes_until { - use super::*; - // Use Bytes for printing bytes as strings for ASCII range - use crate::utils::Bytes; - use pretty_assertions::assert_eq; - - /// Checks that search in the empty buffer returns `None` - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 0; - let mut input = b"".as_ref(); - // ^= 0 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), false) - ); - assert_eq!(position, 0); - } - - /// Checks that search in the buffer non-existent value returns entire buffer - /// as a result and set `position` to `len()` - #[$test] - $($async)? 
fn non_existent() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef".as_ref(); - // ^= 6 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), false) - ); - assert_eq!(position, 6); - } - - /// Checks that search in the buffer an element that is located in the front of - /// buffer returns empty slice as a result and set `position` to one symbol - /// after match (`1`) - #[$test] - $($async)? fn at_the_start() { - let buf = $buf; - let mut position = 0; - let mut input = b"*abcdef".as_ref(); - // ^= 1 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), true) - ); - assert_eq!(position, 1); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the middle of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match - #[$test] - $($async)? fn inside() { - let buf = $buf; - let mut position = 0; - let mut input = b"abc*def".as_ref(); - // ^= 4 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abc"), true) - ); - assert_eq!(position, 4); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the end of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match (`len()`) - #[$test] - $($async)? fn in_the_end() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef*".as_ref(); - // ^= 7 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), true) - ); - assert_eq!(position, 7); // position after the symbol matched - } - } - - mod read_bang_element { - use super::*; - use crate::errors::{Error, SyntaxError}; - use crate::reader::BangType; - use crate::utils::Bytes; - - /// Checks that reading CDATA content works correctly - mod cdata { - use super::*; - use pretty_assertions::assert_eq; - - /// Checks that if input begins like CDATA element, but CDATA start sequence - /// is not finished, parsing ends with an error - #[$test] - #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"![]]>other content".as_ref(); - // ^= 1 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedCData)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 1); - } - - /// Checks that if CDATA startup sequence was matched, but an end sequence - /// is not found, parsing ends with an error - #[$test] - $($async)? fn not_closed() { - let buf = $buf; - let mut position = 1; - let mut input = b"![CDATA[other content".as_ref(); - // ^= 1 ^= 22 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? 
{ - Err(Error::Syntax(SyntaxError::UnclosedCData)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 22); - } - - /// Checks that CDATA element without content inside parsed successfully - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"![CDATA[]]>other content".as_ref(); - // ^= 1 ^= 12 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::CData, Bytes(b"![CDATA[]]")) - ); - assert_eq!(position, 12); - } - - /// Checks that CDATA element with content parsed successfully. - /// Additionally checks that sequences inside CDATA that may look like - /// a CDATA end sequence do not interrupt CDATA parsing - #[$test] - $($async)? fn with_content() { - let buf = $buf; - let mut position = 1; - let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); - // ^= 1 ^= 29 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")) - ); - assert_eq!(position, 29); - } - } - - /// Checks that reading XML comments works correctly. According to the [specification], - /// comment data can contain any sequence except `--`: - /// - /// ```peg - /// comment = '<--' (!'--' char)* '-->'; - /// char = [#x1-#x2C] - /// / [#x2E-#xD7FF] - /// / [#xE000-#xFFFD] - /// / [#x10000-#x10FFFF] - /// ``` - /// - /// The presence of this limitation, however, is simply a poorly designed specification - /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for - /// presence of these sequences by default. This tests allow such content. - /// - /// [specification]: https://www.w3.org/TR/xml11/#dt-comment - mod comment { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"!- -->other content".as_ref(); - // ^= 1 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 1); - } - - #[$test] - $($async)? fn not_properly_end() { - let buf = $buf; - let mut position = 1; - let mut input = b"!->other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn not_closed1() { - let buf = $buf; - let mut position = 1; - let mut input = b"!--other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn not_closed2() { - let buf = $buf; - let mut position = 1; - let mut input = b"!-->other content".as_ref(); - // ^= 1 ^= 18 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? 
{ - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 18); - } - - #[$test] - $($async)? fn not_closed3() { - let buf = $buf; - let mut position = 1; - let mut input = b"!--->other content".as_ref(); - // ^= 1 ^= 19 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 19); - } - - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"!---->other content".as_ref(); - // ^= 1 ^= 7 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::Comment, Bytes(b"!----")) - ); - assert_eq!(position, 7); - } - - #[$test] - $($async)? fn with_content() { - let buf = $buf; - let mut position = 1; - let mut input = b"!--->comment<--->other content".as_ref(); - // ^= 1 ^= 18 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::Comment, Bytes(b"!--->comment<---")) - ); - assert_eq!(position, 18); - } - } - - /// Checks that reading DOCTYPE definition works correctly - mod doctype { - use super::*; - - mod uppercase { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"!D other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn without_space() { - let buf = $buf; - let mut position = 1; - let mut input = b"!DOCTYPEother content".as_ref(); - // ^= 1 ^= 22 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 22); - } - - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"!DOCTYPE>other content".as_ref(); - // ^= 1 ^= 10 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::DocType, Bytes(b"!DOCTYPE")) - ); - assert_eq!(position, 10); - } - - #[$test] - $($async)? fn not_closed() { - let buf = $buf; - let mut position = 1; - let mut input = b"!DOCTYPE other content".as_ref(); - // ^= 1 ^23 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 23); - } - } - - mod lowercase { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"!d other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? 
{ - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn without_space() { - let buf = $buf; - let mut position = 1; - let mut input = b"!doctypeother content".as_ref(); - // ^= 1 ^= 22 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 22); - } - - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"!doctype>other content".as_ref(); - // ^= 1 ^= 10 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::DocType, Bytes(b"!doctype")) - ); - assert_eq!(position, 10); - } - - #[$test] - $($async)? fn not_closed() { - let buf = $buf; - let mut position = 1; - let mut input = b"!doctype other content".as_ref(); - // ^= 1 ^= 23 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 23); - } - } - } - } - - mod read_element { - use super::*; - use crate::errors::{Error, SyntaxError}; - use crate::utils::Bytes; - use pretty_assertions::assert_eq; - - /// Checks that nothing was read from empty buffer - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"".as_ref(); - // ^= 1 - - match $source(&mut input).read_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedTag)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 1); - } - - mod open { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b">".as_ref(); - // ^= 2 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") - ); - assert_eq!(position, 2); - } - - #[$test] - $($async)? fn normal() { - let buf = $buf; - let mut position = 1; - let mut input = b"tag>".as_ref(); - // ^= 5 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"tag") - ); - assert_eq!(position, 5); - } - - #[$test] - $($async)? fn empty_ns_empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b":>".as_ref(); - // ^= 3 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":") - ); - assert_eq!(position, 3); - } - - #[$test] - $($async)? fn empty_ns() { - let buf = $buf; - let mut position = 1; - let mut input = b":tag>".as_ref(); - // ^= 6 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":tag") - ); - assert_eq!(position, 6); - } - - #[$test] - $($async)? fn with_attributes() { - let buf = $buf; - let mut position = 1; - let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); - // ^= 39 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? 
.unwrap()), - Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#) - ); - assert_eq!(position, 39); - } - } - - mod self_closed { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b"/>".as_ref(); - // ^= 3 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"/") - ); - assert_eq!(position, 3); - } - - #[$test] - $($async)? fn normal() { - let buf = $buf; - let mut position = 1; - let mut input = b"tag/>".as_ref(); - // ^= 6 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"tag/") - ); - assert_eq!(position, 6); - } - - #[$test] - $($async)? fn empty_ns_empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b":/>".as_ref(); - // ^= 4 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":/") - ); - assert_eq!(position, 4); - } - - #[$test] - $($async)? fn empty_ns() { - let buf = $buf; - let mut position = 1; - let mut input = b":tag/>".as_ref(); - // ^= 7 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":tag/") - ); - assert_eq!(position, 7); - } - - #[$test] - $($async)? fn with_attributes() { - let buf = $buf; - let mut position = 1; - let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); - // ^= 42 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#) - ); - assert_eq!(position, 42); - } - } - } - /// Ensures, that no empty `Text` events are generated mod $read_event { use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 5e807e26..8ff87ea2 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -342,18 +342,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { #[cfg(test)] mod test { use crate::reader::test::check; - use crate::reader::XmlSource; - - /// Default buffer constructor just pass the byte array from the test - fn identity(input: T) -> T { - input - } check!( #[test] read_event_impl, - read_until_close, - identity, () ); From e53ce41bfd1a40955fe6a9eb5a2ce1137ea4ff41 Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 22 Sep 2023 00:52:20 +0500 Subject: [PATCH 06/22] Use new parser for async reads failures (30): syntax::cdata::unclosed03::async_tokio syntax::cdata::unclosed04::async_tokio syntax::cdata::unclosed06::async_tokio syntax::cdata::unclosed07::async_tokio syntax::cdata::unclosed09::async_tokio syntax::cdata::unclosed10::async_tokio syntax::cdata::unclosed12::async_tokio syntax::cdata::unclosed13::async_tokio syntax::cdata::unclosed15::async_tokio syntax::cdata::unclosed16::async_tokio syntax::cdata::unclosed18::async_tokio syntax::cdata::unclosed19::async_tokio syntax::comment::unclosed03::async_tokio syntax::comment::unclosed04::async_tokio syntax::doctype::unclosed03::async_tokio syntax::doctype::unclosed04::async_tokio syntax::doctype::unclosed06::async_tokio syntax::doctype::unclosed07::async_tokio syntax::doctype::unclosed09::async_tokio syntax::doctype::unclosed10::async_tokio syntax::doctype::unclosed12::async_tokio syntax::doctype::unclosed13::async_tokio syntax::doctype::unclosed15::async_tokio syntax::doctype::unclosed16::async_tokio syntax::doctype::unclosed18::async_tokio 
syntax::doctype::unclosed19::async_tokio syntax::unclosed_bang1::async_tokio syntax::unclosed_bang2::async_tokio syntax::unclosed_bang3::async_tokio syntax::unclosed_bang4::async_tokio --- src/events/mod.rs | 4 +- src/reader/async_tokio.rs | 42 ++------ src/reader/mod.rs | 50 +++++++++- src/reader/state.rs | 197 +++++++++++++++++++++++++++++++++++++- 4 files changed, 253 insertions(+), 40 deletions(-) diff --git a/src/events/mod.rs b/src/events/mod.rs index 546ad392..bf03d5e6 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -1076,7 +1076,7 @@ fn str_cow_to_bytes<'a, C: Into>>(content: C) -> Cow<'a, [u8]> { /// Returns a byte slice with leading XML whitespace bytes removed. /// /// 'Whitespace' refers to the definition used by [`is_whitespace`]. -const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { +pub(crate) const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { // Note: A pattern matching based approach (instead of indexing) allows // making the function const. while let [first, rest @ ..] = bytes { @@ -1092,7 +1092,7 @@ const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { /// Returns a byte slice with trailing XML whitespace bytes removed. /// /// 'Whitespace' refers to the definition used by [`is_whitespace`]. -const fn trim_xml_end(mut bytes: &[u8]) -> &[u8] { +pub(crate) const fn trim_xml_end(mut bytes: &[u8]) -> &[u8] { // Note: A pattern matching based approach (instead of indexing) allows // making the function const. while let [rest @ .., last] = bytes { diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 702bb6d6..0a5aa3bf 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -4,24 +4,14 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt}; -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, Result}; +use crate::events::{BytesText, Event}; use crate::name::{QName, ResolveResult}; -use crate::reader::buffered_reader::impl_buffered_source; +use crate::reader::state::ParseOutcome; use crate::reader::{ - is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span, + NsReader, Reader, Span, }; -/// A struct for read XML asynchronously from an [`AsyncBufRead`]. -/// -/// Having own struct allows us to implement anything without risk of name conflicts -/// and does not suffer from the impossibility of having `async` in traits. -struct TokioAdapter<'a, R>(&'a mut R); - -impl<'a, R: AsyncBufRead + Unpin> TokioAdapter<'a, R> { - impl_buffered_source!('b, 0, async, await); -} - //////////////////////////////////////////////////////////////////////////////////////////////////// impl Reader { @@ -72,13 +62,10 @@ impl Reader { /// [`read_event_into()`]: Reader::read_event_into pub async fn read_event_into_async<'b>( &mut self, - mut buf: &'b mut Vec, + buf: &'b mut Vec, ) -> Result> { read_event_impl!( self, buf, - TokioAdapter(&mut self.reader), - read_until_open_async, - read_until_close_async, await ) } @@ -134,29 +121,12 @@ impl Reader { /// [`Start`]: Event::Start pub async fn read_to_end_into_async<'n>( &mut self, - // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033` + // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033 end: QName<'n>, buf: &mut Vec, ) -> Result { Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)) } - - /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. 
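[Editor's note: the hunk above rewires `read_event_into_async` onto the shared `read_event_impl!` arm. For orientation, a usage sketch of that public entry point (not part of the diff; assumes the `async-tokio` feature and tokio's `AsyncBufRead` impl for `&[u8]`):]

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

#[tokio::main]
async fn main() -> Result<(), quick_xml::Error> {
    // `&[u8]` implements `tokio::io::AsyncBufRead`, so a byte slice can be
    // driven through the same incremental fill_buf/consume path as any
    // other async reader.
    let mut reader = Reader::from_reader(b"<root>hello</root>".as_ref());
    let mut buf = Vec::new();
    let mut seen = Vec::new();
    loop {
        match reader.read_event_into_async(&mut buf).await? {
            Event::Eof => break,
            Event::Start(_) => seen.push("start"),
            Event::Text(_) => seen.push("text"),
            Event::End(_) => seen.push("end"),
            _ => (),
        }
        buf.clear();
    }
    assert_eq!(seen, ["start", "text", "end"]);
    Ok(())
}
```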
- /// - /// Returns inner `Ok` if the loop should be broken and an event returned. - /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular. - async fn read_until_open_async<'b>( - &mut self, - buf: &'b mut Vec, - ) -> Result, &'b mut Vec>> { - read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await) - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - async fn read_until_close_async<'b>(&mut self, buf: &'b mut Vec) -> Result> { - read_until_close!(self, buf, TokioAdapter(&mut self.reader), await) - } } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 5bea28f3..3be8ee1b 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -258,6 +258,54 @@ macro_rules! read_event_impl { } event }}; + ( + $self:ident, $buf:ident + $(, $await:ident)? + ) => {{ + if let Some(end) = $self.state.pending_end() { + return Ok(end); + } + // Content in buffer before call is not a part of next event + let start = $buf.len(); + let offset = $self.state.offset; + loop { + break match $self.reader.fill_buf() $(.$await)? { + Ok(bytes) if bytes.is_empty() => { + let content = &$buf[start..]; + if content.is_empty() { + Ok(Event::Eof) + } else + if let Err(error) = $self.state.parser.finish() { + $self.state.last_error_offset = offset; + Err(Error::Syntax(error)) + } else { + // Content already trimmed, because we do not put whitespaces + // to the buffer at all if they should be trimmed + Ok(Event::Text(BytesText::wrap(content, $self.decoder()))) + } + } + Ok(bytes) => match $self.state.parse_into(bytes, $buf)? { + ParseOutcome::Consume(offset, result) => { + $self.reader.consume(offset); + $self.state.make_event(result, &$buf[start..]) + } + ParseOutcome::ConsumeAndEmitText(offset) => { + $self.reader.consume(offset); + Ok(Event::Text(BytesText::wrap(&$buf[start..], $self.decoder()))) + } + ParseOutcome::ConsumeAndContinue(offset) => { + $self.reader.consume(offset); + continue; + } + }, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + $self.state.last_error_offset = $self.state.offset; + Err(Error::Io(e.into())) + } + }; + } + }}; } /// Read bytes up to `<` and skip it. If current byte (after skipping all space @@ -456,7 +504,7 @@ pub type Span = Range; /// Empty -- End --> ClosedTag /// _ -. Eof .-> Exit /// ``` -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] enum ParseState { /// Initial state in which reader stay after creation. Transition from that /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next diff --git a/src/reader/state.rs b/src/reader/state.rs index d579b767..ed0a6420 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -1,20 +1,39 @@ #[cfg(feature = "encoding")] -use encoding_rs::UTF_8; +use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8}; use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, Result, SyntaxError}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; +use crate::parser::{FeedResult, Parser}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; use crate::reader::{is_whitespace, BangType, Config, ParseState}; +use crate::utils::Bytes; use memchr; +/// Result of a [`ReaderState::parse_into`] method. 
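[Editor's note: the driving loop above keeps consuming and looping until the parser has accumulated a complete event. That is easiest to see from the outside with an input source that hands out one byte at a time; the `OneByte` helper below is a toy type invented only for this illustration, not something from the crate:]

```rust
use std::io::{self, BufRead, Read};

use quick_xml::events::Event;
use quick_xml::reader::Reader;

/// Toy `BufRead` that exposes its input one byte at a time, forcing the
/// reader to assemble every event across many fill_buf/consume calls.
struct OneByte<'a>(&'a [u8]);

impl<'a> Read for OneByte<'a> {
    fn read(&mut self, out: &mut [u8]) -> io::Result<usize> {
        let n = self.0.len().min(out.len()).min(1);
        out[..n].copy_from_slice(&self.0[..n]);
        self.0 = &self.0[n..];
        Ok(n)
    }
}

impl<'a> BufRead for OneByte<'a> {
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        Ok(&self.0[..self.0.len().min(1)])
    }
    fn consume(&mut self, amt: usize) {
        self.0 = &self.0[amt..];
    }
}

let mut reader = Reader::from_reader(OneByte(b"<root>hi</root>"));
let mut buf = Vec::new();
let mut events = 0;
loop {
    match reader.read_event_into(&mut buf).unwrap() {
        Event::Eof => break,
        _ => events += 1,
    }
    buf.clear();
}
assert_eq!(events, 3); // Start, Text("hi"), End
```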
+#[derive(Debug)] +pub enum ParseOutcome { + /// The specified amount of data should be consumed. The parser result should + /// be converted to an [`Event`] using previously accumulated data and newly + /// consumed data. + Consume(usize, FeedResult), + /// The specified amount of data should be consumed. All accumulated data + /// and newly consumed data should be converted to an [`Event::Text`]. + ConsumeAndEmitText(usize), + /// The specified amount of data should be consumed, but no event should be + /// generated. Used to skip whitespaces and BOM. + ConsumeAndContinue(usize), +} + /// A struct that holds a current reader state and a parser configuration. /// It is independent on a way of reading data: the reader feed data into it and /// get back produced [`Event`]s. #[derive(Clone, Debug)] pub(super) struct ReaderState { + /// Current parsing state + pub parser: Parser, /// Number of bytes read from the source of data since the reader was created pub offset: usize, /// A snapshot of an `offset` of the last error returned. It can be less than @@ -26,6 +45,14 @@ pub(super) struct ReaderState { pub state: ParseState, /// User-defined settings that affect parsing pub config: Config, + /// When text trimming from start is enabled, we need to track is we seen + /// a non-space symbol between getting chunks from the reader, because we + /// trim each chunk individually. If such symbol was seen, trim is not + /// required until current text event would be emitted. + /// + /// Used only together with buffering readers, because borrowing reader + /// already have all data available. + can_trim_start: bool, /// All currently Started elements which didn't have a matching /// End element yet. /// @@ -312,15 +339,183 @@ impl ReaderState { encoding: self.encoding.encoding(), } } + + /// Parses `bytes`, appending data to a `buf`. 
Used in buffered readers + pub fn parse_into<'a, 'b>( + &mut self, + bytes: &'a [u8], + buf: &'b mut Vec, + ) -> Result { + let result = self.parser.feed(bytes)?; + match result { + FeedResult::NeedData => { + let mut content = bytes; + if self.config.trim_text_start + && self.can_trim_start + && self.parser.is_text_parsing() + { + content = crate::events::trim_xml_start(bytes); + // if we got some data while parsing text, we shouldn't to + // trim text anymore, because this is spaces inside text content + self.can_trim_start = content.is_empty(); + } + buf.extend_from_slice(content); + let len = bytes.len(); + self.offset += len; + Ok(ParseOutcome::ConsumeAndContinue(len)) + } + + FeedResult::EncodingUtf8Like(offset) => { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + self.encoding = EncodingRef::BomDetected(UTF_8); + } + self.offset += offset; + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } + FeedResult::EncodingUtf16BeLike(offset) => { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + self.encoding = EncodingRef::BomDetected(UTF_16BE); + } + self.offset += offset; + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } + FeedResult::EncodingUtf16LeLike(offset) => { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + self.encoding = EncodingRef::BomDetected(UTF_16LE); + } + self.offset += offset; + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } + + FeedResult::EmitText(offset) => { + let mut content = &bytes[..offset]; + if self.config.trim_text_start && self.can_trim_start { + content = crate::events::trim_xml_start(content); + } + // Reset ability to trim start + self.can_trim_start = true; + if self.config.trim_text_end { + content = crate::events::trim_xml_end(content); + } + buf.extend_from_slice(content); + self.offset += offset; + if buf.is_empty() { + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } else { + Ok(ParseOutcome::ConsumeAndEmitText(offset)) + } + } + FeedResult::EmitComment(offset) + | FeedResult::EmitCData(offset) + | FeedResult::EmitDoctype(offset) + | FeedResult::EmitPI(offset) + | FeedResult::EmitEmptyTag(offset) + | FeedResult::EmitStartTag(offset) + | FeedResult::EmitEndTag(offset) => { + buf.extend_from_slice(&bytes[..offset]); + self.offset += offset; + Ok(ParseOutcome::Consume(offset, result)) + } + } + } + + /// Converts result from a parser to reader's event. 
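[Editor's note: the conversion described here receives the raw markup including its delimiters and slices them off before wrapping the bytes in an event, as the match arms that follow make explicit. From the caller's side that means `Comment`, `CData` and `DocType` events carry only the inner content; a sketch against the public API, not part of the diff:]

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

let mut reader = Reader::from_str("<!--note--><![CDATA[1 < 2]]><!DOCTYPE html>");

// `<!--` / `-->` are stripped from comments...
match reader.read_event().unwrap() {
    Event::Comment(t) => assert_eq!(&*t, b"note"),
    e => panic!("unexpected event {:?}", e),
}
// ...`<![CDATA[` / `]]>` from CDATA sections (the content stays unescaped)...
match reader.read_event().unwrap() {
    Event::CData(t) => assert_eq!(&*t, b"1 < 2"),
    e => panic!("unexpected event {:?}", e),
}
// ...and `<!DOCTYPE` plus the following whitespace from doctype declarations.
match reader.read_event().unwrap() {
    Event::DocType(t) => assert_eq!(&*t, b"html"),
    e => panic!("unexpected event {:?}", e),
}
```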
+ /// + /// # Parameters + /// - `result`: a result from [`Parser::feed()`] + /// - `content`: a buffer with event data + /// + /// [`Parser::feed()`]: crate::parser::Parser::feed() + pub fn make_event<'a>(&mut self, result: FeedResult, content: &'a [u8]) -> Result> { + debug_assert_ne!(self.state, ParseState::Empty); + + match result { + FeedResult::EmitText(_) | FeedResult::NeedData => { + Ok(Event::Text(BytesText::wrap(content, self.decoder()))) + } + FeedResult::EmitCData(_) => { + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); + + Ok(Event::CData(BytesCData::wrap( + &content[9..content.len() - 3], + self.decoder(), + ))) + } + FeedResult::EmitComment(_) => { + // `--` from start and end should not be overlapped + debug_assert!(content.len() >= 4 + 3, "{:?}", Bytes(content)); + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); + + self.emit_bang(BangType::Comment, &content[1..content.len() - 1]) + } + FeedResult::EmitDoctype(_) => { + debug_assert!(content.len() > 9, "{:?}", Bytes(content)); + debug_assert!( + content[0..9].eq_ignore_ascii_case(b""), "{:?}", Bytes(content)); + + self.emit_bang(BangType::DocType, &content[1..content.len() - 1]) + } + FeedResult::EmitPI(_) => { + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); + + self.emit_question_mark(&content[1..content.len() - 1]) + } + FeedResult::EmitEmptyTag(_) => { + debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); + debug_assert!(content.ends_with(b"/>"), "{:?}", Bytes(content)); + + self.emit_start(&content[1..content.len() - 1]) + } + FeedResult::EmitStartTag(_) => { + debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); + debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); + + self.emit_start(&content[1..content.len() - 1]) + } + FeedResult::EmitEndTag(_) => { + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); + + self.emit_end(&content[1..content.len() - 1]) + } + FeedResult::EncodingUtf8Like(_) + | FeedResult::EncodingUtf16BeLike(_) + | FeedResult::EncodingUtf16LeLike(_) => unreachable!("processed outside"), + } + } + + /// Get the pending event if the last returned event was a synthetic `Start` + /// event due to [`Config::expand_empty_elements`] setting. + /// + /// If this method returns something, the read next event should return this + /// event. 
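[Editor's note: `pending_end` is what backs `Config::expand_empty_elements`: when the option is enabled, a self-closing tag is reported as a `Start` event and the matching synthetic `End` is parked here until the next read. A usage sketch (assumes the `Config` fields are reachable through `config_mut()`, as elsewhere in this series):]

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

let mut reader = Reader::from_str("<empty/>");
reader.config_mut().expand_empty_elements = true;

// The self-closing tag comes back as a `Start`...
assert!(matches!(reader.read_event().unwrap(), Event::Start(_)));
// ...and the next read returns the synthetic `End` that was kept pending.
assert!(matches!(reader.read_event().unwrap(), Event::End(_)));
assert!(matches!(reader.read_event().unwrap(), Event::Eof));
```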
+ pub fn pending_end(&mut self) -> Option> { + if let ParseState::Empty = self.state { + return Some(self.close_expanded_empty().unwrap()); + } + None + } } impl Default for ReaderState { fn default() -> Self { Self { + parser: Parser::default(), offset: 0, last_error_offset: 0, state: ParseState::Init, config: Config::default(), + can_trim_start: true, opened_buffer: Vec::new(), opened_starts: Vec::new(), From 60aa437ed045970ad56b534289c4912e6d736f31 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 27 Sep 2023 23:44:41 +0500 Subject: [PATCH 07/22] Run `cargo fmt` --- src/reader/async_tokio.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 0a5aa3bf..7d0bb6ee 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -8,9 +8,7 @@ use crate::errors::{Error, Result}; use crate::events::{BytesText, Event}; use crate::name::{QName, ResolveResult}; use crate::reader::state::ParseOutcome; -use crate::reader::{ - NsReader, Reader, Span, -}; +use crate::reader::{NsReader, Reader, Span}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -60,14 +58,8 @@ impl Reader { /// ``` /// /// [`read_event_into()`]: Reader::read_event_into - pub async fn read_event_into_async<'b>( - &mut self, - buf: &'b mut Vec, - ) -> Result> { - read_event_impl!( - self, buf, - await - ) + pub async fn read_event_into_async<'b>(&mut self, buf: &'b mut Vec) -> Result> { + read_event_impl!(self, buf, await) } /// An asynchronous version of [`read_to_end_into()`]. From d23c1d77388fcf666bf082578b0297e619902dbc Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 25 Sep 2023 22:42:44 +0500 Subject: [PATCH 08/22] Use new parser for buffered reader failures (60): syntax::cdata::unclosed03::async_tokio syntax::cdata::unclosed03::buffered syntax::cdata::unclosed04::async_tokio syntax::cdata::unclosed04::buffered syntax::cdata::unclosed06::async_tokio syntax::cdata::unclosed06::buffered syntax::cdata::unclosed07::async_tokio syntax::cdata::unclosed07::buffered syntax::cdata::unclosed09::async_tokio syntax::cdata::unclosed09::buffered syntax::cdata::unclosed10::async_tokio syntax::cdata::unclosed10::buffered syntax::cdata::unclosed12::async_tokio syntax::cdata::unclosed12::buffered syntax::cdata::unclosed13::async_tokio syntax::cdata::unclosed13::buffered syntax::cdata::unclosed15::async_tokio syntax::cdata::unclosed15::buffered syntax::cdata::unclosed16::async_tokio syntax::cdata::unclosed16::buffered syntax::cdata::unclosed18::async_tokio syntax::cdata::unclosed18::buffered syntax::cdata::unclosed19::async_tokio syntax::cdata::unclosed19::buffered syntax::comment::unclosed03::async_tokio syntax::comment::unclosed03::buffered syntax::comment::unclosed04::async_tokio syntax::comment::unclosed04::buffered syntax::doctype::unclosed03::async_tokio syntax::doctype::unclosed03::buffered syntax::doctype::unclosed04::async_tokio syntax::doctype::unclosed04::buffered syntax::doctype::unclosed06::async_tokio syntax::doctype::unclosed06::buffered syntax::doctype::unclosed07::async_tokio syntax::doctype::unclosed07::buffered syntax::doctype::unclosed09::async_tokio syntax::doctype::unclosed09::buffered syntax::doctype::unclosed10::async_tokio syntax::doctype::unclosed10::buffered syntax::doctype::unclosed12::async_tokio syntax::doctype::unclosed12::buffered syntax::doctype::unclosed13::async_tokio syntax::doctype::unclosed13::buffered syntax::doctype::unclosed15::async_tokio 
syntax::doctype::unclosed15::buffered syntax::doctype::unclosed16::async_tokio syntax::doctype::unclosed16::buffered syntax::doctype::unclosed18::async_tokio syntax::doctype::unclosed18::buffered syntax::doctype::unclosed19::async_tokio syntax::doctype::unclosed19::buffered syntax::unclosed_bang1::async_tokio syntax::unclosed_bang1::buffered syntax::unclosed_bang2::async_tokio syntax::unclosed_bang2::buffered syntax::unclosed_bang3::async_tokio syntax::unclosed_bang3::buffered syntax::unclosed_bang4::async_tokio syntax::unclosed_bang4::buffered --- src/reader/buffered_reader.rs | 245 +--------------------------------- 1 file changed, 7 insertions(+), 238 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index f538527b..8d48dffd 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -5,242 +5,11 @@ use std::fs::File; use std::io::{self, BufRead, BufReader}; use std::path::Path; -use memchr; - -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, Result}; +use crate::events::{BytesText, Event}; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; - -macro_rules! impl_buffered_source { - ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { - #[cfg(not(feature = "encoding"))] - $($async)? fn remove_utf8_bom(&mut self) -> Result<()> { - use crate::encoding::UTF8_BOM; - - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => { - if n.starts_with(UTF8_BOM) { - self $(.$reader)? .consume(UTF8_BOM.len()); - } - Ok(()) - }, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - - #[cfg(feature = "encoding")] - $($async)? fn detect_encoding(&mut self) -> Result> { - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) { - self $(.$reader)? .consume(bom_len); - Ok(Some(enc)) - } else { - Ok(None) - }, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - - #[inline] - $($async)? fn read_bytes_until $(<$lf>)? ( - &mut self, - byte: u8, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<(&'b [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - let mut read = 0; - let mut done = false; - let start = buf.len(); - while !done { - let used = { - let available = match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - done = true; - i + 1 - } - None => { - buf.extend_from_slice(available); - available.len() - } - } - }; - self $(.$reader)? .consume(used); - read += used; - } - *position += read; - - Ok((&buf[start..], done)) - } - - $($async)? fn read_bang_element $(<$lf>)? ( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<(BangType, &'b [u8])> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. - let start = buf.len(); - let mut read = 1; - buf.push(b'!'); - self $(.$reader)? .consume(1); - - let bang_type = BangType::new(self.peek_one() $(.$await)? 
?)?; - - loop { - match self $(.$reader)? .fill_buf() $(.$await)? { - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Ok(n) if n.is_empty() => break, - Ok(available) => { - // We only parse from start because we don't want to consider - // whatever is in the buffer before the bang element - if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) { - buf.extend_from_slice(consumed); - - self $(.$reader)? .consume(used); - read += used; - - *position += read; - return Ok((bang_type, &buf[start..])); - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - } - } - - *position += read; - Err(bang_type.to_err()) - } - - #[inline] - $($async)? fn read_element $(<$lf>)? ( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<&'b [u8]> { - let mut state = ReadElementState::Elem; - let mut read = 0; - - let start = buf.len(); - loop { - match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); - - self $(.$reader)? .consume(used); - read += used; - - // Position now just after the `>` symbol - *position += read; - return Ok(&buf[start..]); - } else { - // The `>` symbol not yet found, continue reading - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - } - - *position += read; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - - $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => { - let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); - if count > 0 { - self $(.$reader)? .consume(count); - *position += count; - continue; - } else { - Ok(()) - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - - $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - match self.peek_one() $(.$await)? ? { - Some(b) if b == byte => { - *position += 1; - self $(.$reader)? .consume(1); - Ok(true) - } - _ => Ok(false), - } - } - - $($async)? fn peek_one(&mut self) -> Result> { - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - }; -} - -// Make it public for use in async implementations -pub(super) use impl_buffered_source; - -/// Implementation of `XmlSource` for any `BufRead` reader using a user-given -/// `Vec` as buffer that will be borrowed by events. 
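[Editor's note: with the hand-rolled `read_bytes_until`/`read_bang_element` helpers removed here, the sync buffered reader goes through the same chunk-feeding path as the async one, including the chunk-wise text trimming handled by `parse_into`. The user-visible behavior of the trim options is meant to stay as before; a sketch, again assuming the `Config` fields exposed via `config_mut()`:]

```rust
use std::io::BufReader;

use quick_xml::events::Event;
use quick_xml::reader::Reader;

let xml = "<root>  hello  </root>\n";
// Wrap in `BufReader` to force the buffered (non zero-copy) code path.
let mut reader = Reader::from_reader(BufReader::new(xml.as_bytes()));
let config = reader.config_mut();
config.trim_text_start = true;
config.trim_text_end = true;

let mut buf = Vec::new();
let mut texts = Vec::new();
loop {
    match reader.read_event_into(&mut buf).unwrap() {
        Event::Eof => break,
        Event::Text(t) => texts.push(t.unescape().unwrap().into_owned()),
        _ => (),
    }
    buf.clear();
}
// Surrounding whitespace is trimmed, and the whitespace-only node after
// `</root>` produces no `Text` event at all.
assert_eq!(texts, ["hello"]);
```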
-impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { - impl_buffered_source!(); -} +use crate::reader::state::ParseOutcome; +use crate::reader::{Reader, Span}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -292,7 +61,7 @@ impl Reader { /// ``` #[inline] pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) + read_event_impl!(self, buf) } /// Reads until end element is found using provided buffer as intermediate @@ -384,7 +153,7 @@ impl Reader { /// [`check_end_names`]: crate::reader::Config::check_end_names /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result { - Ok(read_to_end!(self, end, buf, read_event_impl, { + Ok(read_to_end!(self, end, buf, read_event_into, { buf.clear(); })) } @@ -405,7 +174,7 @@ mod test { check!( #[test] - read_event_impl, + read_event_into, &mut Vec::new() ); From bf58185002afc6c762a41668a4b2cf3a38259665 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 25 Sep 2023 22:43:16 +0500 Subject: [PATCH 09/22] Use new parser for borrowed reader failures (90): syntax::cdata::unclosed03::async_tokio syntax::cdata::unclosed03::borrowed syntax::cdata::unclosed03::buffered syntax::cdata::unclosed04::async_tokio syntax::cdata::unclosed04::borrowed syntax::cdata::unclosed04::buffered syntax::cdata::unclosed06::async_tokio syntax::cdata::unclosed06::borrowed syntax::cdata::unclosed06::buffered syntax::cdata::unclosed07::async_tokio syntax::cdata::unclosed07::borrowed syntax::cdata::unclosed07::buffered syntax::cdata::unclosed09::async_tokio syntax::cdata::unclosed09::borrowed syntax::cdata::unclosed09::buffered syntax::cdata::unclosed10::async_tokio syntax::cdata::unclosed10::borrowed syntax::cdata::unclosed10::buffered syntax::cdata::unclosed12::async_tokio syntax::cdata::unclosed12::borrowed syntax::cdata::unclosed12::buffered syntax::cdata::unclosed13::async_tokio syntax::cdata::unclosed13::borrowed syntax::cdata::unclosed13::buffered syntax::cdata::unclosed15::async_tokio syntax::cdata::unclosed15::borrowed syntax::cdata::unclosed15::buffered syntax::cdata::unclosed16::async_tokio syntax::cdata::unclosed16::borrowed syntax::cdata::unclosed16::buffered syntax::cdata::unclosed18::async_tokio syntax::cdata::unclosed18::borrowed syntax::cdata::unclosed18::buffered syntax::cdata::unclosed19::async_tokio syntax::cdata::unclosed19::borrowed syntax::cdata::unclosed19::buffered syntax::comment::unclosed03::async_tokio syntax::comment::unclosed03::borrowed syntax::comment::unclosed03::buffered syntax::comment::unclosed04::async_tokio syntax::comment::unclosed04::borrowed syntax::comment::unclosed04::buffered syntax::doctype::unclosed03::async_tokio syntax::doctype::unclosed03::borrowed syntax::doctype::unclosed03::buffered syntax::doctype::unclosed04::async_tokio syntax::doctype::unclosed04::borrowed syntax::doctype::unclosed04::buffered syntax::doctype::unclosed06::async_tokio syntax::doctype::unclosed06::borrowed syntax::doctype::unclosed06::buffered syntax::doctype::unclosed07::async_tokio syntax::doctype::unclosed07::borrowed syntax::doctype::unclosed07::buffered syntax::doctype::unclosed09::async_tokio syntax::doctype::unclosed09::borrowed syntax::doctype::unclosed09::buffered syntax::doctype::unclosed10::async_tokio syntax::doctype::unclosed10::borrowed syntax::doctype::unclosed10::buffered syntax::doctype::unclosed12::async_tokio syntax::doctype::unclosed12::borrowed 
syntax::doctype::unclosed12::buffered syntax::doctype::unclosed13::async_tokio syntax::doctype::unclosed13::borrowed syntax::doctype::unclosed13::buffered syntax::doctype::unclosed15::async_tokio syntax::doctype::unclosed15::borrowed syntax::doctype::unclosed15::buffered syntax::doctype::unclosed16::async_tokio syntax::doctype::unclosed16::borrowed syntax::doctype::unclosed16::buffered syntax::doctype::unclosed18::async_tokio syntax::doctype::unclosed18::borrowed syntax::doctype::unclosed18::buffered syntax::doctype::unclosed19::async_tokio syntax::doctype::unclosed19::borrowed syntax::doctype::unclosed19::buffered syntax::unclosed_bang1::async_tokio syntax::unclosed_bang1::borrowed syntax::unclosed_bang1::buffered syntax::unclosed_bang2::async_tokio syntax::unclosed_bang2::borrowed syntax::unclosed_bang2::buffered syntax::unclosed_bang3::async_tokio syntax::unclosed_bang3::borrowed syntax::unclosed_bang3::buffered syntax::unclosed_bang4::async_tokio syntax::unclosed_bang4::borrowed syntax::unclosed_bang4::buffered --- src/reader/mod.rs | 228 +------------------------------------ src/reader/slice_reader.rs | 103 ++++++++++++++++- src/reader/state.rs | 27 ----- 3 files changed, 105 insertions(+), 253 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 3be8ee1b..194d4a9e 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -6,7 +6,6 @@ use std::ops::Range; use crate::encoding::Decoder; use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; use crate::reader::state::ReaderState; use memchr; @@ -204,60 +203,6 @@ impl Default for Config { //////////////////////////////////////////////////////////////////////////////////////////////////// macro_rules! read_event_impl { - ( - $self:ident, $buf:ident, - $reader:expr, - $read_until_open:ident, - $read_until_close:ident - $(, $await:ident)? - ) => {{ - let event = loop { - match $self.state.state { - ParseState::Init => { // Go to OpenedTag state - // If encoding set explicitly, we not need to detect it. For example, - // explicit UTF-8 set automatically if Reader was created using `from_str`. - // But we still need to remove BOM for consistency with no encoding - // feature enabled path - #[cfg(feature = "encoding")] - if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? { - if $self.state.encoding.can_be_refined() { - $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding); - } - } - - // Removes UTF-8 BOM if it is present - #[cfg(not(feature = "encoding"))] - $reader.remove_utf8_bom() $(.$await)? ?; - - // Go to OpenedTag state - match $self.$read_until_open($buf) $(.$await)? { - Ok(Ok(ev)) => break Ok(ev), - Ok(Err(b)) => $buf = b, - Err(err) => break Err(err), - } - }, - ParseState::ClosedTag => { // Go to OpenedTag state - match $self.$read_until_open($buf) $(.$await)? { - Ok(Ok(ev)) => break Ok(ev), - Ok(Err(b)) => $buf = b, - Err(err) => break Err(err), - } - }, - // Go to ClosedTag state in next two arms - ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?, - ParseState::Empty => break $self.state.close_expanded_empty(), - ParseState::Exit => break Ok(Event::Eof), - }; - }; - match event { - // #513: In case of ill-formed errors we already consume the wrong data - // and change the state. We can continue parsing if we wish - Err(Error::IllFormed(_)) => {} - Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Exit, - _ => {} - } - event - }}; ( $self:ident, $buf:ident $(, $await:ident)? @@ -308,141 +253,6 @@ macro_rules! 
read_event_impl { }}; } -/// Read bytes up to `<` and skip it. If current byte (after skipping all space -/// characters if [`Config::trim_text_start`] is `true`) is already `<`, then -/// returns the next event, otherwise stay at position just after the `<` symbol. -/// -/// Moves parser to the `OpenedTag` state. -/// -/// This code is executed in two cases: -/// - after start of parsing just after skipping BOM if it is present -/// - after parsing `` or `` -macro_rules! read_until_open { - ( - $self:ident, $buf:ident, - $reader:expr, - $read_event:ident - $(, $await:ident)? - ) => {{ - if $self.state.config.trim_text_start { - $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if $reader.skip_one(b'<', &mut $self.state.offset) $(.$await)? ? { - $self.state.state = ParseState::OpenedTag; - // Pass $buf to the next next iteration of parsing loop - return Ok(Err($buf)); - } - - match $reader - .read_bytes_until(b'<', $buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bytes, found)) => { - if found { - $self.state.state = ParseState::OpenedTag; - } - // Return Text event with `bytes` content or Eof if bytes is empty - $self.state.emit_text(bytes).map(Ok) - } - Err(e) => Err(e), - } - }}; -} - -/// Read bytes up to the `>` and skip it. This method is expected to be called -/// after seeing the `<` symbol and skipping it. Inspects the next (current) -/// symbol and returns an appropriate [`Event`]: -/// -/// |Symbol |Event -/// |-------|------------------------------------- -/// |`!` |[`Comment`], [`CData`] or [`DocType`] -/// |`/` |[`End`] -/// |`?` |[`PI`] -/// |_other_|[`Start`] or [`Empty`] -/// -/// Moves parser to the `ClosedTag` state. -/// -/// [`Comment`]: Event::Comment -/// [`CData`]: Event::CData -/// [`DocType`]: Event::DocType -/// [`End`]: Event::End -/// [`PI`]: Event::PI -/// [`Start`]: Event::Start -/// [`Empty`]: Event::Empty -macro_rules! read_until_close { - ( - $self:ident, $buf:ident, - $reader:expr - $(, $await:ident)? - ) => {{ - $self.state.state = ParseState::ClosedTag; - - let start = $self.state.offset; - match $reader.peek_one() $(.$await)? { - // ` match $reader - .read_bang_element($buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes), - Err(e) => { - // match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bytes, true)) => $self.state.emit_end(bytes), - Ok((_, false)) => { - // We want to report error at `<`, but offset was increased, - // so return it back (-1 for `<`) - $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - Err(e) => Err(e), - }, - // ` match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bytes, true)) => $self.state.emit_question_mark(bytes), - Ok((_, false)) => { - // We want to report error at `<`, but offset was increased, - // so return it back (-1 for `<`) - $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) - } - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match $reader - .read_element($buf, &mut $self.state.offset) - $(.$await)? 
- { - Ok(bytes) => $self.state.emit_start(bytes), - Err(e) => Err(e), - }, - // `<` - syntax error, tag not closed - Ok(None) => { - // We want to report error at `<`, but offset was increased, - // so return it back (-1 for `<`) - $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - Err(e) => Err(e), - } - }}; -} - /// Generalization of `read_to_end` method for buffered and borrowed readers macro_rules! read_to_end { ( @@ -520,6 +330,8 @@ enum ParseState { /// State in which reader searches the `<` symbol of a markup. All bytes before /// that symbol will be returned in the [`Event::Text`] event. After that /// the reader moves to the `OpenedTag` state. + /// + /// [`Event::Text`]: crate::events::Event::Text ClosedTag, /// This state is used only if option [`expand_empty_elements`] is set to `true`. /// Reader enters to this state when it is in a `ClosedTag` state and emits an @@ -527,6 +339,8 @@ enum ParseState { /// after which reader returned to the `ClosedTag` state. /// /// [`expand_empty_elements`]: Config::expand_empty_elements + /// [`Event::Start`]: crate::events::Event::Start + /// [`Event::End`]: crate::events::Event::End Empty, /// Reader enters this state when `Eof` event generated or an error occurred. /// This is the last state, the reader stay in it forever. @@ -635,6 +449,7 @@ impl EncodingRef { /// } /// ``` /// +/// [`Event`]: crate::events::Event /// [`NsReader`]: crate::reader::NsReader #[derive(Clone)] pub struct Reader { @@ -775,39 +590,6 @@ impl Reader { } } -/// Private sync reading methods -impl Reader { - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. - fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - read_event_impl!(self, buf, self.reader, read_until_open, read_until_close) - } - - /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. - /// - /// Returns inner `Ok` if the loop should be broken and an event returned. - /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular. - fn read_until_open<'i, B>(&mut self, buf: B) -> Result, B>> - where - R: XmlSource<'i, B>, - { - read_until_open!(self, buf, self.reader, read_event_impl) - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - read_until_close!(self, buf, self.reader) - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////// /// Represents an input for a reader that can return borrowed data. 
diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 8ff87ea2..fd8267c4 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -7,11 +7,12 @@ use std::borrow::Cow; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_8}; +use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::events::{BytesText, Event}; use crate::name::QName; +use crate::parser::FeedResult; use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; use memchr; @@ -71,7 +72,73 @@ impl<'a> Reader<&'a [u8]> { /// ``` #[inline] pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) + if let Some(end) = self.state.pending_end() { + return Ok(end); + } + loop { + if self.reader.is_empty() { + return Ok(Event::Eof); + } + let result = self.state.parser.feed(self.reader)?; + return match result { + FeedResult::NeedData => { + let offset = self.reader.len(); + if let Err(error) = self.state.parser.finish() { + // We need return Event::Eof after error + self.consume(offset); + Err(Error::Syntax(error)) + } else { + match self.make_text(offset) { + Some(event) => Ok(event), + None => continue, + } + } + } + + FeedResult::EncodingUtf8Like(offset) => { + self.consume(offset); + #[cfg(feature = "encoding")] + if self.state.encoding.can_be_refined() { + self.state.encoding = EncodingRef::BomDetected(UTF_8); + } + continue; + } + FeedResult::EncodingUtf16BeLike(offset) => { + self.consume(offset); + #[cfg(feature = "encoding")] + if self.state.encoding.can_be_refined() { + self.state.encoding = EncodingRef::BomDetected(UTF_16BE); + } + continue; + } + FeedResult::EncodingUtf16LeLike(offset) => { + self.consume(offset); + #[cfg(feature = "encoding")] + if self.state.encoding.can_be_refined() { + self.state.encoding = EncodingRef::BomDetected(UTF_16LE); + } + continue; + } + + FeedResult::EmitText(offset) => match self.make_text(offset) { + Some(event) => Ok(event), + None => continue, + }, + FeedResult::EmitComment(offset) + | FeedResult::EmitCData(offset) + | FeedResult::EmitDoctype(offset) + | FeedResult::EmitPI(offset) + | FeedResult::EmitEmptyTag(offset) + | FeedResult::EmitStartTag(offset) + | FeedResult::EmitEndTag(offset) => { + let (content, source) = self.reader.split_at(offset); + self.reader = source; + + self.state.offset += offset; + self.state.make_event(result, content) + } + }; + } } /// Reads until end element is found. This function is supposed to be called @@ -157,6 +224,11 @@ impl<'a> Reader<&'a [u8]> { pub fn read_to_end(&mut self, end: QName) -> Result { Ok(read_to_end!(self, end, (), read_event_impl, {})) } + /// Tranpoline for a `read_to_end!` macro + #[inline] + fn read_event_impl(&mut self, _: ()) -> Result> { + self.read_event() + } /// Reads content between start and end tags, including any markup. This /// function is supposed to be called after you already read a [`Start`] event. @@ -231,6 +303,31 @@ impl<'a> Reader<&'a [u8]> { self.decoder().decode(&buffer[0..span.len()]) } + + #[inline] + fn consume(&mut self, count: usize) { + self.reader = &self.reader[count..]; + self.state.offset += count; + } + /// Returns [`Event::Text`] with the content of reader up to `offset` or + /// `None` if no event should be generated because of trimming and getting + /// empty text. + /// + /// Consumes data up to `offset`. 
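[Editor's note: the small `read_event_impl` trampoline added above keeps the `read_to_end!` macro working unchanged for the borrowed reader, so skipping a subtree still looks the same from the outside. A usage sketch, not part of the diff:]

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

let mut reader = Reader::from_str("<outer>text<nested/></outer><next/>");
match reader.read_event().unwrap() {
    Event::Start(e) => {
        // Skips everything up to and including the matching `</outer>`;
        // the returned `Span` covers the skipped bytes of the input.
        let _span = reader.read_to_end(e.name()).unwrap();
    }
    e => panic!("unexpected event {:?}", e),
}
// The reader resumes right after `</outer>`.
assert!(matches!(reader.read_event().unwrap(), Event::Empty(_)));
assert!(matches!(reader.read_event().unwrap(), Event::Eof));
```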
+ fn make_text(&mut self, offset: usize) -> Option> { + let (content, source) = self.reader.split_at(offset); + self.reader = source; + self.state.offset += offset; + + let mut event = BytesText::wrap(content, self.decoder()); + if self.state.config.trim_text_start && event.inplace_trim_start() { + return None; + } + if self.state.config.trim_text_end && event.inplace_trim_end() { + return None; + } + Some(Event::Text(event)) + } } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/reader/state.rs b/src/reader/state.rs index ed0a6420..3d4b9618 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -81,33 +81,6 @@ pub(super) struct ReaderState { } impl ReaderState { - /// Trims end whitespaces from `bytes`, if required, and returns a [`Text`] - /// event or an [`Eof`] event, if text after trimming is empty. - /// - /// # Parameters - /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` - /// - /// [`Text`]: Event::Text - /// [`Eof`]: Event::Eof - pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result> { - let mut content = bytes; - - if self.config.trim_text_end { - // Skip the ending '<' - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - content = &bytes[..len]; - } - - if content.is_empty() { - Ok(Event::Eof) - } else { - Ok(Event::Text(BytesText::wrap(content, self.decoder()))) - } - } - /// reads `BytesElement` starting with a `!`, /// return `Comment`, `CData` or `DocType` event pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { From f063917ee6e0ffa9fe72db6a37cb3749d145e6cf Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 25 Sep 2023 23:18:33 +0500 Subject: [PATCH 10/22] Cleanup unused code --- src/reader/mod.rs | 207 +------------------------------------ src/reader/slice_reader.rs | 112 +------------------- src/reader/state.rs | 18 ++-- 3 files changed, 14 insertions(+), 323 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 194d4a9e..a5966a04 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -298,55 +298,6 @@ pub type Span = Range; //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Possible reader states. The state transition diagram (`true` and `false` shows -/// value of [`Config::expand_empty_elements`] option): -/// -/// ```mermaid -/// flowchart LR -/// subgraph _ -/// direction LR -/// -/// Init -- "(no event)"\n --> OpenedTag -/// OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag -/// ClosedTag -- "#lt;false#gt;\n(no event)"\nText --> OpenedTag -/// end -/// ClosedTag -- "#lt;true#gt;"\nStart --> Empty -/// Empty -- End --> ClosedTag -/// _ -. Eof .-> Exit -/// ``` -#[derive(Clone, Debug, PartialEq, Eq)] -enum ParseState { - /// Initial state in which reader stay after creation. Transition from that - /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next - /// state is always `OpenedTag`. The reader will never return to this state. The - /// event emitted during transition to `OpenedTag` is a `StartEvent` if the - /// first symbol not `<`, otherwise no event are emitted. - Init, - /// State after seeing the `<` symbol. Depending on the next symbol all other - /// events could be generated. - /// - /// After generating one event the reader moves to the `ClosedTag` state. - OpenedTag, - /// State in which reader searches the `<` symbol of a markup. 
All bytes before - /// that symbol will be returned in the [`Event::Text`] event. After that - /// the reader moves to the `OpenedTag` state. - /// - /// [`Event::Text`]: crate::events::Event::Text - ClosedTag, - /// This state is used only if option [`expand_empty_elements`] is set to `true`. - /// Reader enters to this state when it is in a `ClosedTag` state and emits an - /// [`Event::Start`] event. The next event emitted will be an [`Event::End`], - /// after which reader returned to the `ClosedTag` state. - /// - /// [`expand_empty_elements`]: Config::expand_empty_elements - /// [`Event::Start`]: crate::events::Event::Start - /// [`Event::End`]: crate::events::Event::End - Empty, - /// Reader enters this state when `Eof` event generated or an error occurred. - /// This is the last state, the reader stay in it forever. - Exit, -} - /// A reference to an encoding together with information about how it was retrieved. /// /// The state transition diagram: @@ -551,13 +502,7 @@ impl Reader { /// Gets the current byte position in the input data. pub fn buffer_position(&self) -> usize { - // when internal state is OpenedTag, we have actually read until '<', - // which we don't want to show - if let ParseState::OpenedTag = self.state.state { - self.state.offset - 1 - } else { - self.state.offset - } + self.state.offset } /// Gets the last error byte position in the input data. If there is no errors @@ -592,122 +537,6 @@ impl Reader { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Represents an input for a reader that can return borrowed data. -/// -/// There are two implementors of this trait: generic one that read data from -/// `Self`, copies some part of it into a provided buffer of type `B` and then -/// returns data that borrow from that buffer. -/// -/// The other implementor is for `&[u8]` and instead of copying data returns -/// borrowed data from `Self` instead. This implementation allows zero-copy -/// deserialization. -/// -/// # Parameters -/// - `'r`: lifetime of a buffer from which events will borrow -/// - `B`: a type of a buffer that can be used to store data read from `Self` and -/// from which events can borrow -trait XmlSource<'r, B> { - /// Removes UTF-8 BOM if it is present - #[cfg(not(feature = "encoding"))] - fn remove_utf8_bom(&mut self) -> Result<()>; - - /// Determines encoding from the start of input and removes BOM if it is present - #[cfg(feature = "encoding")] - fn detect_encoding(&mut self) -> Result>; - - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte` (exclusive), - /// and a flag noting whether `byte` was found in the input or not. - /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// (b"abc".as_ref(), true) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut usize, - ) -> Result<(&'r [u8], bool)>; - - /// Read input until comment, CDATA or processing instruction is finished. 
- /// - /// This method expect that `<` already was read. - /// - /// Returns a slice of data read up to end of comment, CDATA or processing - /// instruction (`>`), which does not include into result. - /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>; - - /// Read input until XML element is closed by approaching a `>` symbol. - /// Returns a buffer that contains a data between `<` and `>` or - /// [`SyntaxError::UnclosedTag`] if end-of-input was reached before reading `>`. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>; - - /// Consume and discard all the whitespace until the next non-whitespace - /// character or EOF. - /// - /// # Parameters - /// - `position`: Will be increased by amount of bytes consumed - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; - - /// Consume and discard one character if it matches the given byte. Return - /// `true` if it matched. - /// - /// # Parameters - /// - `position`: Will be increased by 1 if byte is matched - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; - - /// Return one character without consuming it, so that future `read_*` calls - /// will still include it. On EOF, return `None`. 
- fn peek_one(&mut self) -> Result>; -} - /// Possible elements started with `` - /// and a position after that symbol or `None` if such symbol was not found - #[inline(always)] - fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) { - *self = match (*self, chunk[i]) { - // only allowed to match `>` while we are in state `Elem` - (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)), - (Self::Elem, b'\'') => Self::SingleQ, - (Self::Elem, b'\"') => Self::DoubleQ, - - // the only end_byte that gets us out if the same character - (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem, - - // all other bytes: no state change - _ => *self, - }; - } - None - } -} - /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) const fn is_whitespace(b: u8) -> bool { diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index fd8267c4..173b0c27 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -7,15 +7,13 @@ use std::borrow::Cow; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; +use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8}; -use crate::errors::{Error, Result, SyntaxError}; +use crate::errors::{Error, Result}; use crate::events::{BytesText, Event}; use crate::name::QName; use crate::parser::FeedResult; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; - -use memchr; +use crate::reader::{Reader, Span}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. /// This implementation supports not using an intermediate buffer as the byte slice @@ -332,110 +330,6 @@ impl<'a> Reader<&'a [u8]> { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer -/// that will be borrowed by events. This implementation provides a zero-copy deserialization -impl<'a> XmlSource<'a, ()> for &'a [u8] { - #[cfg(not(feature = "encoding"))] - fn remove_utf8_bom(&mut self) -> Result<()> { - if self.starts_with(crate::encoding::UTF8_BOM) { - *self = &self[crate::encoding::UTF8_BOM.len()..]; - } - Ok(()) - } - - #[cfg(feature = "encoding")] - fn detect_encoding(&mut self) -> Result> { - if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) { - *self = &self[bom_len..]; - return Ok(Some(enc)); - } - Ok(None) - } - - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut usize, - ) -> Result<(&'a [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - if let Some(i) = memchr::memchr(byte, self) { - *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - Ok((bytes, true)) - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - Ok((bytes, false)) - } - } - - fn read_bang_element( - &mut self, - _buf: (), - position: &mut usize, - ) -> Result<(BangType, &'a [u8])> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. 
- debug_assert_eq!(self[0], b'!'); - - let bang_type = BangType::new(self[1..].first().copied())?; - - if let Some((bytes, i)) = bang_type.parse(&[], self) { - *position += i; - *self = &self[i..]; - return Ok((bang_type, bytes)); - } - - *position += self.len(); - Err(bang_type.to_err()) - } - - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut state = ReadElementState::Elem; - - if let Some((bytes, i)) = state.change(self) { - // Position now just after the `>` symbol - *position += i; - *self = &self[i..]; - return Ok(bytes); - } - - *position += self.len(); - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - let whitespaces = self - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or(self.len()); - *position += whitespaces; - *self = &self[whitespaces..]; - Ok(()) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - if self.first() == Some(&byte) { - *self = &self[1..]; - *position += 1; - Ok(true) - } else { - Ok(false) - } - } - - fn peek_one(&mut self) -> Result> { - Ok(self.first().copied()) - } -} - #[cfg(test)] mod test { use crate::reader::test::check; diff --git a/src/reader/state.rs b/src/reader/state.rs index 3d4b9618..63d0f5fc 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -7,7 +7,7 @@ use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Even use crate::parser::{FeedResult, Parser}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; -use crate::reader::{is_whitespace, BangType, Config, ParseState}; +use crate::reader::{is_whitespace, BangType, Config}; use crate::utils::Bytes; use memchr; @@ -41,8 +41,6 @@ pub(super) struct ReaderState { /// and changing `offset` is not possible, because `Error::IllFormed` errors /// are recoverable. pub last_error_offset: usize, - /// Defines how to process next byte - pub state: ParseState, /// User-defined settings that affect parsing pub config: Config, /// When text trimming from start is enabled, we need to track is we seen @@ -53,6 +51,10 @@ pub(super) struct ReaderState { /// Used only together with buffering readers, because borrowing reader /// already have all data available. can_trim_start: bool, + /// If case of [`Config::expand_empty_elements`] is true, this field will + /// be `true` if synthetic end event should be emitted on next call to read + /// event. + pending: bool, /// All currently Started elements which didn't have a matching /// End element yet. 
/// @@ -271,7 +273,7 @@ impl ReaderState { let event = BytesStart::wrap(&content[..len - 1], name_len); if self.config.expand_empty_elements { - self.state = ParseState::Empty; + self.pending = true; self.opened_starts.push(self.opened_buffer.len()); self.opened_buffer.extend(&content[..name_len]); Ok(Event::Start(event)) @@ -290,7 +292,7 @@ impl ReaderState { #[inline] pub fn close_expanded_empty(&mut self) -> Result> { - self.state = ParseState::ClosedTag; + self.pending = false; let name = self .opened_buffer .split_off(self.opened_starts.pop().unwrap()); @@ -403,7 +405,7 @@ impl ReaderState { /// /// [`Parser::feed()`]: crate::parser::Parser::feed() pub fn make_event<'a>(&mut self, result: FeedResult, content: &'a [u8]) -> Result> { - debug_assert_ne!(self.state, ParseState::Empty); + debug_assert!(!self.pending, "synthetic end event won't be emitted"); match result { FeedResult::EmitText(_) | FeedResult::NeedData => { @@ -473,7 +475,7 @@ impl ReaderState { /// If this method returns something, the read next event should return this /// event. pub fn pending_end(&mut self) -> Option> { - if let ParseState::Empty = self.state { + if self.pending { return Some(self.close_expanded_empty().unwrap()); } None @@ -486,9 +488,9 @@ impl Default for ReaderState { parser: Parser::default(), offset: 0, last_error_offset: 0, - state: ParseState::Init, config: Config::default(), can_trim_start: true, + pending: false, opened_buffer: Vec::new(), opened_starts: Vec::new(), From 40c1cfce7048d41be7ed1675ffbc1b76bfba1ae4 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 26 Sep 2023 00:15:07 +0500 Subject: [PATCH 11/22] Inline ReaderState::close_expanded_empty --- src/reader/state.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index 63d0f5fc..e8778d61 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -290,15 +290,6 @@ impl ReaderState { } } - #[inline] - pub fn close_expanded_empty(&mut self) -> Result> { - self.pending = false; - let name = self - .opened_buffer - .split_off(self.opened_starts.pop().unwrap()); - Ok(Event::End(BytesEnd::wrap(name.into()))) - } - /// Get the decoder, used to decode bytes, read by this reader, to the strings. /// /// If [`encoding`] feature is enabled, the used encoding may change after @@ -476,7 +467,11 @@ impl ReaderState { /// event. 
pub fn pending_end(&mut self) -> Option> { if self.pending { - return Some(self.close_expanded_empty().unwrap()); + self.pending = false; + let name = self + .opened_buffer + .split_off(self.opened_starts.pop().unwrap()); + return Some(Event::End(BytesEnd::wrap(name.into()))); } None } From 584406e714abeaf7d46e22c18a2dfe48b924cee0 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Nov 2023 21:12:37 +0500 Subject: [PATCH 12/22] Inline ReaderState::emit_bang with obvious dead code elimination (removed unused arms in match) --- src/reader/state.rs | 164 ++++++++++++++++++++++---------------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index e8778d61..5e5cfce0 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -83,86 +83,6 @@ pub(super) struct ReaderState { } impl ReaderState { - /// reads `BytesElement` starting with a `!`, - /// return `Comment`, `CData` or `DocType` event - pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { - let uncased_starts_with = |string: &[u8], prefix: &[u8]| { - string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) - }; - - let len = buf.len(); - match bang_type { - BangType::Comment if buf.starts_with(b"!--") => { - debug_assert!(buf.ends_with(b"--")); - if self.config.check_comments { - // search if '--' not in comments - let mut haystack = &buf[3..len - 2]; - let mut off = 0; - while let Some(p) = memchr::memchr(b'-', haystack) { - off += p + 1; - // if next byte after `-` is also `-`, return an error - if buf[3 + off] == b'-' { - // Explanation of the magic: - // - // - `self.offset`` just after `>`, - // - `buf` contains `!-- con--tent --` - // - `p` is counted from byte after `: - // ~~~~~~~~~~~~~~~~ : - buf - // : =========== : - zone of search (possible values of `p`) - // : |---p : - p is counted from | (| is 0) - // : : : ^ - self.offset - // ^ : : - self.offset - len - // ^ : - self.offset - len + 2 - // ^ - self.offset - len + 2 + p - self.last_error_offset = self.offset - len + 2 + p; - return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); - } - // Continue search after single `-` (+1 to skip it) - haystack = &haystack[p + 1..]; - } - } - Ok(Event::Comment(BytesText::wrap( - // Cut of `!--` and `--` from start and end - &buf[3..len - 2], - self.decoder(), - ))) - } - BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - debug_assert!(buf.ends_with(b"]]")); - Ok(Event::CData(BytesCData::wrap( - // Cut of `![CDATA[` and `]]` from start and end - &buf[8..len - 2], - self.decoder(), - ))) - } - BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { - match buf[8..].iter().position(|&b| !is_whitespace(b)) { - Some(start) => Ok(Event::DocType(BytesText::wrap( - // Cut of `!DOCTYPE` and any number of spaces from start - &buf[8 + start..], - self.decoder(), - ))), - None => { - // Because we here, we at least read `` and offset after `>`. - // We want report error at place where name is expected - this is just - // before `>` - self.last_error_offset = self.offset - 1; - return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); - } - } - } - _ => { - // - // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. - // ^------- We report error at that position, so we need to subtract 2 and buf len - self.last_error_offset = self.offset - len - 2; - Err(bang_type.to_err()) - } - } - } - /// Wraps content of `buf` into the [`Event::End`] event. 
Does the check that /// end name matches the last opened start name if `self.config.check_end_names` is set. pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result> { @@ -417,7 +337,57 @@ impl ReaderState { debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - self.emit_bang(BangType::Comment, &content[1..content.len() - 1]) + let bang_type = BangType::Comment; + let buf = &content[1..content.len() - 1]; + let uncased_starts_with = |string: &[u8], prefix: &[u8]| { + string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) + }; + + let len = buf.len(); + match bang_type { + BangType::Comment if buf.starts_with(b"!--") => { + debug_assert!(buf.ends_with(b"--")); + if self.config.check_comments { + // search if '--' not in comments + let mut haystack = &buf[3..len - 2]; + let mut off = 0; + while let Some(p) = memchr::memchr(b'-', haystack) { + off += p + 1; + // if next byte after `-` is also `-`, return an error + if buf[3 + off] == b'-' { + // Explanation of the magic: + // + // - `self.offset`` just after `>`, + // - `buf` contains `!-- con--tent --` + // - `p` is counted from byte after `: + // ~~~~~~~~~~~~~~~~ : - buf + // : =========== : - zone of search (possible values of `p`) + // : |---p : - p is counted from | (| is 0) + // : : : ^ - self.offset + // ^ : : - self.offset - len + // ^ : - self.offset - len + 2 + // ^ - self.offset - len + 2 + p + self.last_error_offset = self.offset - len + 2 + p; + return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); + } + haystack = &haystack[p + 1..]; + } + } + Ok(Event::Comment(BytesText::wrap( + &buf[3..len - 2], + self.decoder(), + ))) + } + _ => { + // + // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. + // ^------- We report error at that position, so we need to subtract 2 and buf len + self.last_error_offset = self.offset - len - 2; + Err(bang_type.to_err()) + } + } } FeedResult::EmitDoctype(_) => { debug_assert!(content.len() > 9, "{:?}", Bytes(content)); @@ -428,7 +398,37 @@ impl ReaderState { ); debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); - self.emit_bang(BangType::DocType, &content[1..content.len() - 1]) + let bang_type = BangType::DocType; + let buf = &content[1..content.len() - 1]; + let uncased_starts_with = |string: &[u8], prefix: &[u8]| { + string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) + }; + + let len = buf.len(); + match bang_type { + BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { + match buf[8..].iter().position(|&b| !is_whitespace(b)) { + Some(start) => Ok(Event::DocType(BytesText::wrap( + &buf[8 + start..], + self.decoder(), + ))), + None => { + // Because we here, we at least read `` and offset after `>`. + // We want report error at place where name is expected - this is just + // before `>` + self.last_error_offset = self.offset - 1; + return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); + } + } + } + _ => { + // + // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. 
+ // ^------- We report error at that position, so we need to subtract 2 and buf len + self.last_error_offset = self.offset - len - 2; + Err(bang_type.to_err()) + } + } } FeedResult::EmitPI(_) => { debug_assert!(content.starts_with(b" Date: Sun, 19 Nov 2023 21:32:22 +0500 Subject: [PATCH 13/22] Remove BangType (review with with whitespace ignored mode) --- src/reader/mod.rs | 95 --------------------------------- src/reader/state.rs | 124 ++++++++++++++++++++------------------------ 2 files changed, 56 insertions(+), 163 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index a5966a04..9062a148 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -5,11 +5,8 @@ use encoding_rs::Encoding; use std::ops::Range; use crate::encoding::Decoder; -use crate::errors::{Error, Result, SyntaxError}; use crate::reader::state::ReaderState; -use memchr; - /// A struct that holds a parser configuration. /// /// Current parser configuration can be retrieved by calling [`Reader::config()`] @@ -537,98 +534,6 @@ impl Reader { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Possible elements started with ` - CData, - /// - Comment, - /// - DocType, -} -impl BangType { - #[inline(always)] - fn new(byte: Option) -> Result { - Ok(match byte { - Some(b'[') => Self::CData, - Some(b'-') => Self::Comment, - Some(b'D') | Some(b'd') => Self::DocType, - _ => return Err(Error::Syntax(SyntaxError::InvalidBangMarkup)), - }) - } - - /// If element is finished, returns its content up to `>` symbol and - /// an index of this symbol, otherwise returns `None` - /// - /// # Parameters - /// - `buf`: buffer with data consumed on previous iterations - /// - `chunk`: data read on current iteration and not yet consumed from reader - #[inline(always)] - fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr_iter(b'>', chunk) { - match self { - // Need to read at least 6 symbols (`!---->`) for properly finished comment - // - XML comment - // 012345 - i - Self::Comment if buf.len() + i > 4 => { - if chunk[..i].ends_with(b"--") { - // We cannot strip last `--` from the buffer because we need it in case of - // check_comments enabled option. 
XML standard requires that comment - // will not end with `--->` sequence because this is a special case of - // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `-|->` was splitted at | - // buf --/ \-- chunk - if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `--|>` was splitted at | - // buf --/ \-- chunk - if i == 0 && buf.ends_with(b"--") { - return Some((&[], i + 1)); // +1 for `>` - } - } - Self::Comment => {} - Self::CData => { - if chunk[..i].ends_with(b"]]") { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `]|]>` was splitted at | - // buf --/ \-- chunk - if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `]]|>` was splitted at | - // buf --/ \-- chunk - if i == 0 && buf.ends_with(b"]]") { - return Some((&[], i + 1)); // +1 for `>` - } - } - Self::DocType => { - let content = &chunk[..i]; - let balance = memchr::memchr2_iter(b'<', b'>', content) - .map(|p| if content[p] == b'<' { 1i32 } else { -1 }) - .sum::(); - if balance == 0 { - return Some((content, i + 1)); // +1 for `>` - } - } - } - } - None - } - #[inline] - fn to_err(&self) -> Error { - match self { - Self::CData => Error::Syntax(SyntaxError::UnclosedCData), - Self::Comment => Error::Syntax(SyntaxError::UnclosedComment), - Self::DocType => Error::Syntax(SyntaxError::UnclosedDoctype), - } - } -} - /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) const fn is_whitespace(b: u8) -> bool { diff --git a/src/reader/state.rs b/src/reader/state.rs index 5e5cfce0..ac4bfebc 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -7,7 +7,7 @@ use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Even use crate::parser::{FeedResult, Parser}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; -use crate::reader::{is_whitespace, BangType, Config}; +use crate::reader::{is_whitespace, Config}; use crate::utils::Bytes; use memchr; @@ -337,56 +337,49 @@ impl ReaderState { debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - let bang_type = BangType::Comment; let buf = &content[1..content.len() - 1]; - let uncased_starts_with = |string: &[u8], prefix: &[u8]| { - string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) - }; let len = buf.len(); - match bang_type { - BangType::Comment if buf.starts_with(b"!--") => { - debug_assert!(buf.ends_with(b"--")); - if self.config.check_comments { - // search if '--' not in comments - let mut haystack = &buf[3..len - 2]; - let mut off = 0; - while let Some(p) = memchr::memchr(b'-', haystack) { - off += p + 1; - // if next byte after `-` is also `-`, return an error - if buf[3 + off] == b'-' { - // Explanation of the magic: - // - // - `self.offset`` just after `>`, - // - `buf` contains `!-- con--tent --` - // - `p` is counted from byte after `: - // ~~~~~~~~~~~~~~~~ : - buf - // : =========== : - zone of search (possible values of `p`) - // : |---p : - p is counted from | (| is 0) - // : : : ^ - self.offset - // ^ : : - self.offset - len - // ^ : - self.offset - len + 2 - // ^ - self.offset - len + 2 + p - self.last_error_offset = self.offset - len + 2 + p; - return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); - } - haystack = &haystack[p + 1..]; + if 
buf.starts_with(b"!--") { + debug_assert!(buf.ends_with(b"--")); + if self.config.check_comments { + // search if '--' not in comments + let mut haystack = &buf[3..len - 2]; + let mut off = 0; + while let Some(p) = memchr::memchr(b'-', haystack) { + off += p + 1; + // if next byte after `-` is also `-`, return an error + if buf[3 + off] == b'-' { + // Explanation of the magic: + // + // - `self.offset`` just after `>`, + // - `buf` contains `!-- con--tent --` + // - `p` is counted from byte after `: + // ~~~~~~~~~~~~~~~~ : - buf + // : =========== : - zone of search (possible values of `p`) + // : |---p : - p is counted from | (| is 0) + // : : : ^ - self.offset + // ^ : : - self.offset - len + // ^ : - self.offset - len + 2 + // ^ - self.offset - len + 2 + p + self.last_error_offset = self.offset - len + 2 + p; + return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); } + haystack = &haystack[p + 1..]; } - Ok(Event::Comment(BytesText::wrap( - &buf[3..len - 2], - self.decoder(), - ))) - } - _ => { - // - // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. - // ^------- We report error at that position, so we need to subtract 2 and buf len - self.last_error_offset = self.offset - len - 2; - Err(bang_type.to_err()) } + Ok(Event::Comment(BytesText::wrap( + &buf[3..len - 2], + self.decoder(), + ))) + } else { + // + // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. + // ^------- We report error at that position, so we need to subtract 2 and buf len + self.last_error_offset = self.offset - len - 2; + Err(Error::Syntax(SyntaxError::UnclosedComment)) } } FeedResult::EmitDoctype(_) => { @@ -398,36 +391,31 @@ impl ReaderState { ); debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); - let bang_type = BangType::DocType; let buf = &content[1..content.len() - 1]; let uncased_starts_with = |string: &[u8], prefix: &[u8]| { string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) }; - let len = buf.len(); - match bang_type { - BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { - match buf[8..].iter().position(|&b| !is_whitespace(b)) { - Some(start) => Ok(Event::DocType(BytesText::wrap( - &buf[8 + start..], - self.decoder(), - ))), - None => { - // Because we here, we at least read `` and offset after `>`. - // We want report error at place where name is expected - this is just - // before `>` - self.last_error_offset = self.offset - 1; - return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); - } + if uncased_starts_with(buf, b"!DOCTYPE") { + match buf[8..].iter().position(|&b| !is_whitespace(b)) { + Some(start) => Ok(Event::DocType(BytesText::wrap( + &buf[8 + start..], + self.decoder(), + ))), + None => { + // Because we here, we at least read `` and offset after `>`. + // We want report error at place where name is expected - this is just + // before `>` + self.last_error_offset = self.offset - 1; + return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); } } - _ => { - // - // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. - // ^------- We report error at that position, so we need to subtract 2 and buf len - self.last_error_offset = self.offset - len - 2; - Err(bang_type.to_err()) - } + } else { + // + // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. 
+ // ^------- We report error at that position, so we need to subtract 2 and buf len + self.last_error_offset = self.offset - len - 2; + Err(Error::Syntax(SyntaxError::UnclosedDoctype)) } } FeedResult::EmitPI(_) => { From 9f117a3fd77786184ebd035119c20dbb5b93fbf6 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Nov 2023 22:02:06 +0500 Subject: [PATCH 14/22] Remove dead code - these checks already performed by a parser (review with with whitespace ignored mode) --- src/reader/state.rs | 102 +++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 62 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index ac4bfebc..c2ce7b30 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -338,49 +338,39 @@ impl ReaderState { debug_assert!(content.ends_with(b"-->"), "{:?}", Bytes(content)); let buf = &content[1..content.len() - 1]; - let len = buf.len(); - if buf.starts_with(b"!--") { - debug_assert!(buf.ends_with(b"--")); - if self.config.check_comments { - // search if '--' not in comments - let mut haystack = &buf[3..len - 2]; - let mut off = 0; - while let Some(p) = memchr::memchr(b'-', haystack) { - off += p + 1; - // if next byte after `-` is also `-`, return an error - if buf[3 + off] == b'-' { - // Explanation of the magic: - // - // - `self.offset`` just after `>`, - // - `buf` contains `!-- con--tent --` - // - `p` is counted from byte after `: - // ~~~~~~~~~~~~~~~~ : - buf - // : =========== : - zone of search (possible values of `p`) - // : |---p : - p is counted from | (| is 0) - // : : : ^ - self.offset - // ^ : : - self.offset - len - // ^ : - self.offset - len + 2 - // ^ - self.offset - len + 2 + p - self.last_error_offset = self.offset - len + 2 + p; - return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); - } - haystack = &haystack[p + 1..]; + if self.config.check_comments { + // search if '--' not in comments + let mut haystack = &buf[3..len - 2]; + let mut off = 0; + while let Some(p) = memchr::memchr(b'-', haystack) { + off += p + 1; + // if next byte after `-` is also `-`, return an error + if buf[3 + off] == b'-' { + // Explanation of the magic: + // + // - `self.offset`` just after `>`, + // - `buf` contains `!-- con--tent --` + // - `p` is counted from byte after `: + // ~~~~~~~~~~~~~~~~ : - buf + // : =========== : - zone of search (possible values of `p`) + // : |---p : - p is counted from | (| is 0) + // : : : ^ - self.offset + // ^ : : - self.offset - len + // ^ : - self.offset - len + 2 + // ^ - self.offset - len + 2 + p + self.last_error_offset = self.offset - len + 2 + p; + return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); } + haystack = &haystack[p + 1..]; } - Ok(Event::Comment(BytesText::wrap( - &buf[3..len - 2], - self.decoder(), - ))) - } else { - // - // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. 
- // ^------- We report error at that position, so we need to subtract 2 and buf len - self.last_error_offset = self.offset - len - 2; - Err(Error::Syntax(SyntaxError::UnclosedComment)) } + Ok(Event::Comment(BytesText::wrap( + &buf[3..len - 2], + self.decoder(), + ))) } FeedResult::EmitDoctype(_) => { debug_assert!(content.len() > 9, "{:?}", Bytes(content)); @@ -392,30 +382,18 @@ impl ReaderState { debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); let buf = &content[1..content.len() - 1]; - let uncased_starts_with = |string: &[u8], prefix: &[u8]| { - string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) - }; - - if uncased_starts_with(buf, b"!DOCTYPE") { - match buf[8..].iter().position(|&b| !is_whitespace(b)) { - Some(start) => Ok(Event::DocType(BytesText::wrap( - &buf[8 + start..], - self.decoder(), - ))), - None => { - // Because we here, we at least read `` and offset after `>`. - // We want report error at place where name is expected - this is just - // before `>` - self.last_error_offset = self.offset - 1; - return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); - } + match buf[8..].iter().position(|&b| !is_whitespace(b)) { + Some(start) => Ok(Event::DocType(BytesText::wrap( + &buf[8 + start..], + self.decoder(), + ))), + None => { + // Because we here, we at least read `` and offset after `>`. + // We want report error at place where name is expected - this is just + // before `>` + self.last_error_offset = self.offset - 1; + return Err(Error::IllFormed(IllFormedError::MissingDoctypeName)); } - } else { - // - // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. - // ^------- We report error at that position, so we need to subtract 2 and buf len - self.last_error_offset = self.offset - len - 2; - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) } } FeedResult::EmitPI(_) => { From d71f3f5c4a35c1430f0e6b28440c9243007d0664 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Nov 2023 22:07:50 +0500 Subject: [PATCH 15/22] Remove intermediate slices in Doctype and comment checks --- src/reader/state.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index c2ce7b30..d4bdbdb6 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -337,16 +337,16 @@ impl ReaderState { debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - let buf = &content[1..content.len() - 1]; - let len = buf.len(); + let len = content.len(); if self.config.check_comments { // search if '--' not in comments - let mut haystack = &buf[3..len - 2]; + // Skip `` + let mut haystack = &content[4..len - 3]; let mut off = 0; while let Some(p) = memchr::memchr(b'-', haystack) { off += p + 1; // if next byte after `-` is also `-`, return an error - if buf[3 + off] == b'-' { + if content[4 + off] == b'-' { // Explanation of the magic: // // - `self.offset`` just after `>`, @@ -354,21 +354,21 @@ impl ReaderState { // - `p` is counted from byte after `: - // ~~~~~~~~~~~~~~~~ : - buf + // ~~~~~~~~~~~~~~~~~~: - buf // : =========== : - zone of search (possible values of `p`) // : |---p : - p is counted from | (| is 0) // : : : ^ - self.offset - // ^ : : - self.offset - len - // ^ : - self.offset - len + 2 - // ^ - self.offset - len + 2 + p - self.last_error_offset = self.offset - len + 2 + p; + // ^ : : - self.offset - len + // ^ : - self.offset - len + 4 + // ^ - self.offset - len + 4 + p + self.last_error_offset = self.offset - len + 4 + p; return 
Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); } haystack = &haystack[p + 1..]; } } Ok(Event::Comment(BytesText::wrap( - &buf[3..len - 2], + &content[4..len - 3], self.decoder(), ))) } @@ -381,10 +381,10 @@ impl ReaderState { ); debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); - let buf = &content[1..content.len() - 1]; - match buf[8..].iter().position(|&b| !is_whitespace(b)) { + let buf = &content[9..content.len() - 1]; + match buf.iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( - &buf[8 + start..], + &buf[start..], self.decoder(), ))), None => { From adccd213653e64fcc95edc7e71035dc580ad1399 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Nov 2023 22:23:12 +0500 Subject: [PATCH 16/22] Simplify check in Doctype and start search of a name after first space --- src/reader/state.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/reader/state.rs b/src/reader/state.rs index d4bdbdb6..e42ab140 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -381,8 +381,12 @@ impl ReaderState { ); debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); + // Skip `` let buf = &content[9..content.len() - 1]; match buf.iter().position(|&b| !is_whitespace(b)) { + // Found the first non-space symbol after ` Ok(Event::DocType(BytesText::wrap( &buf[start..], self.decoder(), From 0302010805e0b10aa80b4644b137f91e70d54e38 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 26 Sep 2023 00:22:47 +0500 Subject: [PATCH 17/22] Inline ReaderState::emit_question_mark --- src/reader/state.rs | 75 ++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index e42ab140..0e2bae4f 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -137,46 +137,6 @@ impl ReaderState { Ok(Event::End(BytesEnd::wrap(name.into()))) } - /// `buf` contains data between `<` and `>` and the first byte is `?`. - /// `self.offset` already after the `>` - /// - /// Returns `Decl` or `PI` event - pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result> { - debug_assert!(buf.len() > 0); - debug_assert_eq!(buf[0], b'?'); - - let len = buf.len(); - // We accept at least - // ~~ - len = 2 - if len > 1 && buf[len - 1] == b'?' { - // Cut of `?` and `?` from start and end - let content = &buf[1..len - 1]; - let len = content.len(); - - if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { - let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); - - // Try getting encoding from the declaration event - #[cfg(feature = "encoding")] - if self.encoding.can_be_refined() { - if let Some(encoding) = event.encoder() { - self.encoding = EncodingRef::XmlDetected(encoding); - } - } - - Ok(Event::Decl(event)) - } else { - Ok(Event::PI(BytesText::wrap(content, self.decoder()))) - } - } else { - // `) - self.last_error_offset = self.offset - len - 2; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) - } - } - /// Converts content of a tag to a `Start` or an `Empty` event /// /// # Parameters @@ -404,7 +364,40 @@ impl ReaderState { debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - self.emit_question_mark(&content[1..content.len() - 1]) + let buf = &content[1..content.len() - 1]; + debug_assert!(buf.len() > 0); + debug_assert_eq!(buf[0], b'?'); + + let len = buf.len(); + // We accept at least + // ~~ - len = 2 + if len > 1 && buf[len - 1] == b'?' 
{ + // Cut of `?` and `?` from start and end + let content = &buf[1..len - 1]; + let len = content.len(); + + if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { + let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); + + // Try getting encoding from the declaration event + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + if let Some(encoding) = event.encoder() { + self.encoding = EncodingRef::XmlDetected(encoding); + } + } + + Ok(Event::Decl(event)) + } else { + Ok(Event::PI(BytesText::wrap(content, self.decoder()))) + } + } else { + // `) + self.last_error_offset = self.offset - len - 2; + Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + } } FeedResult::EmitEmptyTag(_) => { debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); From d5d3a9929afb305838c844771f32d93bea614733 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Nov 2023 22:55:11 +0500 Subject: [PATCH 18/22] Remove dead code - these checks already performed by a parser (review with with whitespace ignored mode) --- src/reader/state.rs | 47 +++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index 0e2bae4f..a99213d6 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -2,7 +2,7 @@ use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8}; use crate::encoding::Decoder; -use crate::errors::{Error, IllFormedError, Result, SyntaxError}; +use crate::errors::{Error, IllFormedError, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; use crate::parser::{FeedResult, Parser}; #[cfg(feature = "encoding")] @@ -364,39 +364,24 @@ impl ReaderState { debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - let buf = &content[1..content.len() - 1]; - debug_assert!(buf.len() > 0); - debug_assert_eq!(buf[0], b'?'); - - let len = buf.len(); - // We accept at least - // ~~ - len = 2 - if len > 1 && buf[len - 1] == b'?' 
{ - // Cut of `?` and `?` from start and end - let content = &buf[1..len - 1]; - let len = content.len(); - - if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { - let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); - - // Try getting encoding from the declaration event - #[cfg(feature = "encoding")] - if self.encoding.can_be_refined() { - if let Some(encoding) = event.encoder() { - self.encoding = EncodingRef::XmlDetected(encoding); - } - } + // Cut of `` from start and end + let content = &content[2..content.len() - 2]; + let len = content.len(); + + if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { + let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); - Ok(Event::Decl(event)) - } else { - Ok(Event::PI(BytesText::wrap(content, self.decoder()))) + // Try getting encoding from the declaration event + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + if let Some(encoding) = event.encoder() { + self.encoding = EncodingRef::XmlDetected(encoding); + } } + + Ok(Event::Decl(event)) } else { - // `) - self.last_error_offset = self.offset - len - 2; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + Ok(Event::PI(BytesText::wrap(content, self.decoder()))) } } FeedResult::EmitEmptyTag(_) => { From 8771d656f5a49546a9e29cad66c3a41685f08afb Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Nov 2023 22:58:33 +0500 Subject: [PATCH 19/22] Inline ReaderState::emit_start with obvious dead code elimination --- src/reader/state.rs | 65 +++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index a99213d6..20c13986 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -137,39 +137,6 @@ impl ReaderState { Ok(Event::End(BytesEnd::wrap(name.into()))) } - /// Converts content of a tag to a `Start` or an `Empty` event - /// - /// # Parameters - /// - `content`: Content of a tag between `<` and `>` - pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result> { - let len = content.len(); - let name_end = content - .iter() - .position(|&b| is_whitespace(b)) - .unwrap_or(len); - if let Some(&b'/') = content.last() { - // This is self-closed tag `` - let name_len = if name_end < len { name_end } else { len - 1 }; - let event = BytesStart::wrap(&content[..len - 1], name_len); - - if self.config.expand_empty_elements { - self.pending = true; - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&content[..name_len]); - Ok(Event::Start(event)) - } else { - Ok(Event::Empty(event)) - } - } else { - // #514: Always store names event when .check_end_names == false, - // because checks can be temporary disabled and when they would be - // enabled, we should have that information - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&content[..name_end]); - Ok(Event::Start(BytesStart::wrap(content, name_end))) - } - } - /// Get the decoder, used to decode bytes, read by this reader, to the strings. 
/// /// If [`encoding`] feature is enabled, the used encoding may change after @@ -388,13 +355,41 @@ impl ReaderState { debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); debug_assert!(content.ends_with(b"/>"), "{:?}", Bytes(content)); - self.emit_start(&content[1..content.len() - 1]) + let content = &content[1..content.len() - 1]; + let len = content.len(); + let name_end = content + .iter() + .position(|&b| is_whitespace(b)) + .unwrap_or(len); + // This is self-closed tag `` + let name_len = if name_end < len { name_end } else { len - 1 }; + let event = BytesStart::wrap(&content[..len - 1], name_len); + + if self.config.expand_empty_elements { + self.pending = true; + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(&content[..name_len]); + Ok(Event::Start(event)) + } else { + Ok(Event::Empty(event)) + } } FeedResult::EmitStartTag(_) => { debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); - self.emit_start(&content[1..content.len() - 1]) + let content = &content[1..content.len() - 1]; + let len = content.len(); + let name_end = content + .iter() + .position(|&b| is_whitespace(b)) + .unwrap_or(len); + // #514: Always store names event when .check_end_names == false, + // because checks can be temporary disabled and when they would be + // enabled, we should have that information + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(&content[..name_end]); + Ok(Event::Start(BytesStart::wrap(content, name_end))) } FeedResult::EmitEndTag(_) => { debug_assert!(content.starts_with(b" Date: Sun, 19 Nov 2023 23:26:40 +0500 Subject: [PATCH 20/22] Move name length calculation to BytesStart::wrap It also will be used in a new BytesPI event that will be introduced in a separate PR --- src/events/mod.rs | 8 ++++++-- src/reader/state.rs | 31 ++++++++++--------------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/events/mod.rs b/src/events/mod.rs index bf03d5e6..3ee52721 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -76,8 +76,12 @@ pub struct BytesStart<'a> { impl<'a> BytesStart<'a> { /// Internal constructor, used by `Reader`. 
Supplies data in reader's encoding #[inline] - pub(crate) fn wrap(content: &'a [u8], name_len: usize) -> Self { - BytesStart { + pub(crate) fn wrap(content: &'a [u8]) -> Self { + let name_len = content + .iter() + .position(|&b| is_whitespace(b)) + .unwrap_or(content.len()); + Self { buf: Cow::Borrowed(content), name_len, } diff --git a/src/reader/state.rs b/src/reader/state.rs index 20c13986..4e492055 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -4,6 +4,7 @@ use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8}; use crate::encoding::Decoder; use crate::errors::{Error, IllFormedError, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; +use crate::name::QName; use crate::parser::{FeedResult, Parser}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; @@ -333,10 +334,10 @@ impl ReaderState { // Cut of `` from start and end let content = &content[2..content.len() - 2]; - let len = content.len(); + let event = BytesStart::wrap(content); - if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { - let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); + if event.name() == QName(b"xml") { + let event = BytesDecl::from_start(event); // Try getting encoding from the declaration event #[cfg(feature = "encoding")] @@ -355,20 +356,12 @@ impl ReaderState { debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); debug_assert!(content.ends_with(b"/>"), "{:?}", Bytes(content)); - let content = &content[1..content.len() - 1]; - let len = content.len(); - let name_end = content - .iter() - .position(|&b| is_whitespace(b)) - .unwrap_or(len); - // This is self-closed tag `` - let name_len = if name_end < len { name_end } else { len - 1 }; - let event = BytesStart::wrap(&content[..len - 1], name_len); + let event = BytesStart::wrap(&content[1..content.len() - 2]); if self.config.expand_empty_elements { self.pending = true; self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&content[..name_len]); + self.opened_buffer.extend(event.name().as_ref()); Ok(Event::Start(event)) } else { Ok(Event::Empty(event)) @@ -378,18 +371,14 @@ impl ReaderState { debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); - let content = &content[1..content.len() - 1]; - let len = content.len(); - let name_end = content - .iter() - .position(|&b| is_whitespace(b)) - .unwrap_or(len); + let event = BytesStart::wrap(&content[1..content.len() - 1]); + // #514: Always store names event when .check_end_names == false, // because checks can be temporary disabled and when they would be // enabled, we should have that information self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&content[..name_end]); - Ok(Event::Start(BytesStart::wrap(content, name_end))) + self.opened_buffer.extend(event.name().as_ref()); + Ok(Event::Start(event)) } FeedResult::EmitEndTag(_) => { debug_assert!(content.starts_with(b" Date: Tue, 26 Sep 2023 19:34:25 +0500 Subject: [PATCH 21/22] Inline ReaderState::emit_end --- src/reader/state.rs | 105 +++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 55 deletions(-) diff --git a/src/reader/state.rs b/src/reader/state.rs index 4e492055..b7a59c03 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -84,60 +84,6 @@ pub(super) struct ReaderState { } impl ReaderState { - /// Wraps content of `buf` into the [`Event::End`] event. 
Does the check that - /// end name matches the last opened start name if `self.config.check_end_names` is set. - pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result> { - // Strip the `/` character. `content` contains data between `` - let content = &buf[1..]; - // XML standard permits whitespaces after the markup name in closing tags. - // Let's strip them from the buffer before comparing tag names. - let name = if self.config.trim_markup_names_in_closing_tags { - if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) { - &content[..pos_end_name + 1] - } else { - content - } - } else { - content - }; - - let decoder = self.decoder(); - - // Get the index in self.opened_buffer of the name of the last opened tag - match self.opened_starts.pop() { - Some(start) => { - if self.config.check_end_names { - let expected = &self.opened_buffer[start..]; - if name != expected { - let expected = decoder.decode(expected).unwrap_or_default().into_owned(); - // #513: In order to allow error recovery we should drop content of the buffer - self.opened_buffer.truncate(start); - - // Report error at start of the end tag at `<` character - // -2 for `<` and `>` - self.last_error_offset = self.offset - buf.len() - 2; - return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { - expected, - found: decoder.decode(name).unwrap_or_default().into_owned(), - })); - } - } - - self.opened_buffer.truncate(start); - } - None => { - // Report error at start of the end tag at `<` character - // -2 for `<` and `>` - self.last_error_offset = self.offset - buf.len() - 2; - return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( - decoder.decode(name).unwrap_or_default().into_owned(), - ))); - } - } - - Ok(Event::End(BytesEnd::wrap(name.into()))) - } - /// Get the decoder, used to decode bytes, read by this reader, to the strings. /// /// If [`encoding`] feature is enabled, the used encoding may change after @@ -384,7 +330,56 @@ impl ReaderState { debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - self.emit_end(&content[1..content.len() - 1]) + let buf = &content[1..content.len() - 1]; + // Strip the `/` character. `content` contains data between `` + let content = &buf[1..]; + // XML standard permits whitespaces after the markup name in closing tags. + // Let's strip them from the buffer before comparing tag names. 
+ let name = if self.config.trim_markup_names_in_closing_tags { + if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) { + &content[..pos_end_name + 1] + } else { + content + } + } else { + content + }; + + let decoder = self.decoder(); + + // Get the index in self.opened_buffer of the name of the last opened tag + match self.opened_starts.pop() { + Some(start) => { + if self.config.check_end_names { + let expected = &self.opened_buffer[start..]; + if name != expected { + let expected = decoder.decode(expected).unwrap_or_default().into_owned(); + // #513: In order to allow error recovery we should drop content of the buffer + self.opened_buffer.truncate(start); + + // Report error at start of the end tag at `<` character + // -2 for `<` and `>` + self.last_error_offset = self.offset - buf.len() - 2; + return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { + expected, + found: decoder.decode(name).unwrap_or_default().into_owned(), + })); + } + } + + self.opened_buffer.truncate(start); + } + None => { + // Report error at start of the end tag at `<` character + // -2 for `<` and `>` + self.last_error_offset = self.offset - buf.len() - 2; + return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( + decoder.decode(name).unwrap_or_default().into_owned(), + ))); + } + } + + Ok(Event::End(BytesEnd::wrap(name.into()))) } FeedResult::EncodingUtf8Like(_) | FeedResult::EncodingUtf16BeLike(_) From 273c8a008fce553697d0f772fb6057a95b962f4c Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 28 Sep 2023 00:49:39 +0500 Subject: [PATCH 22/22] dbg! in xml --- src/parser/mod.rs | 23 +++++++++++++++++------ src/reader/mod.rs | 8 +++++--- src/reader/slice_reader.rs | 3 ++- src/reader/state.rs | 3 ++- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index e6a6daee..46a14007 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -345,10 +345,12 @@ impl Parser { /// - `bytes`: a slice to search a new XML event. 
Should contain text in /// ASCII-compatible encoding pub fn feed(&mut self, bytes: &[u8]) -> Result { + dbg!((self.0, crate::utils::Bytes(bytes))); for (offset, &byte) in bytes.iter().enumerate() { let trail = &bytes[offset..]; let start = offset + 1; let rest = &bytes[start..]; + dbg!((self.0, offset, byte as char, crate::utils::Bytes(trail), crate::utils::Bytes(rest))); self.0 = match self.0 { State::Start => match byte { 0x00 => State::Bom(BomParser::X00), @@ -549,6 +551,7 @@ impl Parser { /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` #[inline] fn parse_text(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes))); self.0 = State::Text; match bytes.iter().position(|&b| b == b'<') { Some(i) => FeedResult::EmitText(offset + i), @@ -570,6 +573,7 @@ impl Parser { offset: usize, mut parser: CommentParser, ) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); match parser.feed(bytes) { Some(i) => { self.0 = State::Text; @@ -593,6 +597,7 @@ impl Parser { /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` /// - `braces_left`: count of braces that wasn't seen yet in the end of previous data chunk fn parse_cdata(&mut self, bytes: &[u8], offset: usize, mut parser: CDataParser) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); match parser.feed(bytes) { Some(i) => { self.0 = State::Text; @@ -611,8 +616,9 @@ impl Parser { offset: usize, mut parser: QuotedParser, ) -> Result { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); // Search `[` (start of DTD definitions) or `>` (end of tag) - match parser.one_of(bytes) { + match dbg!(parser.one_of(bytes)) { OneOf::Open(i) => self.parse_dtd(&bytes[i..], offset + i, DtdParser::default()), OneOf::Close(i) => { self.0 = State::Text; @@ -639,8 +645,9 @@ impl Parser { mut offset: usize, mut parser: DtdParser, ) -> Result { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); loop { - let result = match parser.feed(bytes) { + let result = match dbg!(parser.feed(bytes)) { // Skip recognized DTD structure // TODO: Emit DTD events while parsing quick_dtd::FeedResult::EmitPI(off) @@ -669,7 +676,8 @@ impl Parser { } fn parse_doctype_finish(&mut self, bytes: &[u8], offset: usize) -> FeedResult { - match bytes.iter().position(|&b| b == b'>') { + dbg!((self.0, offset, crate::utils::Bytes(bytes))); + match dbg!(bytes.iter().position(|&b| b == b'>')) { Some(i) => { self.0 = State::Text; // +1 for `>` which should be included in event @@ -692,7 +700,8 @@ impl Parser { /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` /// - `has_mark`: a flag that indicates was the previous fed data ended with `?` fn parse_pi(&mut self, bytes: &[u8], offset: usize, mut parser: PiParser) -> FeedResult { - match parser.feed(bytes) { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); + match dbg!(parser.feed(bytes)) { Some(i) => { self.0 = State::Text; FeedResult::EmitPI(offset + i) @@ -711,7 +720,8 @@ impl Parser { /// That sub-slice begins on the byte that represents a tag name /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` fn parse_end(&mut self, bytes: &[u8], offset: usize) -> FeedResult { - match bytes.iter().position(|&b| b == b'>') { + dbg!((self.0, offset, crate::utils::Bytes(bytes))); + match dbg!(bytes.iter().position(|&b| b == b'>')) { Some(i) => { self.0 = State::Text; // +1 for `>` which should 
be included in event @@ -740,7 +750,8 @@ impl Parser { mut parser: QuotedParser, has_slash: bool, ) -> FeedResult { - match parser.feed(bytes) { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser, has_slash)); + match dbg!(parser.feed(bytes)) { Some(0) if has_slash => { self.0 = State::Text; // +1 for `>` which should be included in event diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 9062a148..9c5b3dfc 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -204,6 +204,7 @@ macro_rules! read_event_impl { $self:ident, $buf:ident $(, $await:ident)? ) => {{ + dbg!("==============================================================="); if let Some(end) = $self.state.pending_end() { return Ok(end); } @@ -211,13 +212,14 @@ macro_rules! read_event_impl { let start = $buf.len(); let offset = $self.state.offset; loop { - break match $self.reader.fill_buf() $(.$await)? { + dbg!("--------------------------------"); + break match dbg!($self.reader.fill_buf() $(.$await)?) { Ok(bytes) if bytes.is_empty() => { let content = &$buf[start..]; if content.is_empty() { Ok(Event::Eof) } else - if let Err(error) = $self.state.parser.finish() { + if let Err(error) = dbg!($self.state.parser.finish()) { $self.state.last_error_offset = offset; Err(Error::Syntax(error)) } else { @@ -226,7 +228,7 @@ macro_rules! read_event_impl { Ok(Event::Text(BytesText::wrap(content, $self.decoder()))) } } - Ok(bytes) => match $self.state.parse_into(bytes, $buf)? { + Ok(bytes) => match dbg!($self.state.parse_into(bytes, $buf))? { ParseOutcome::Consume(offset, result) => { $self.reader.consume(offset); $self.state.make_event(result, &$buf[start..]) diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 173b0c27..a975af21 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -70,6 +70,7 @@ impl<'a> Reader<&'a [u8]> { /// ``` #[inline] pub fn read_event(&mut self) -> Result> { + dbg!(self.state.parser); if let Some(end) = self.state.pending_end() { return Ok(end); } @@ -77,7 +78,7 @@ impl<'a> Reader<&'a [u8]> { if self.reader.is_empty() { return Ok(Event::Eof); } - let result = self.state.parser.feed(self.reader)?; + let result = dbg!(self.state.parser.feed(self.reader))?; return match result { FeedResult::NeedData => { let offset = self.reader.len(); diff --git a/src/reader/state.rs b/src/reader/state.rs index b7a59c03..57b19590 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -106,7 +106,8 @@ impl ReaderState { bytes: &'a [u8], buf: &'b mut Vec, ) -> Result { - let result = self.parser.feed(bytes)?; + dbg!(&self); + let result = dbg!(self.parser.feed(bytes))?; match result { FeedResult::NeedData => { let mut content = bytes;
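
Note (editorial, outside the patches above): the recurring idea in patches 10-21 is that the reader no longer tracks a `ParseState` enum; instead a single `pending` flag remembers that a self-closing tag was expanded into a `Start` event and that a synthetic `End` must be returned on the next read. The following is a minimal standalone sketch of that pattern only — the `Expander` and `Event` types here are illustrative and are not quick-xml's actual API; they merely mirror the shape of `pending`, `opened_buffer`/`opened_starts`, and `pending_end()` in the diffs above.

```rust
// Standalone sketch of the "pending synthetic End" pattern (illustrative
// names, not quick-xml's real types): when an empty tag is expanded, emit
// `Start` immediately and remember the name so the next read yields `End`.

#[derive(Debug, PartialEq)]
enum Event {
    Start(String),
    End(String),
    Empty(String),
}

struct Expander {
    expand_empty_elements: bool,
    /// Names of started elements that have not been closed yet.
    opened: Vec<String>,
    /// `true` if a synthetic `End` must be returned before reading further.
    pending: bool,
}

impl Expander {
    /// Called with the name of a self-closing tag such as `<foo/>`.
    fn empty_tag(&mut self, name: &str) -> Event {
        if self.expand_empty_elements {
            self.pending = true;
            self.opened.push(name.to_string());
            Event::Start(name.to_string())
        } else {
            Event::Empty(name.to_string())
        }
    }

    /// Checked before parsing more input; plays the role of `pending_end()`.
    fn pending_end(&mut self) -> Option<Event> {
        if self.pending {
            self.pending = false;
            return Some(Event::End(self.opened.pop().unwrap()));
        }
        None
    }
}

fn main() {
    let mut e = Expander {
        expand_empty_elements: true,
        opened: Vec::new(),
        pending: false,
    };
    assert_eq!(e.empty_tag("item"), Event::Start("item".into()));
    assert_eq!(e.pending_end(), Some(Event::End("item".into())));
    assert_eq!(e.pending_end(), None);
}
```

The boolean replaces the old enum because, once `emit_bang`, `emit_question_mark`, `emit_start` and `emit_end` are inlined into `make_event`, the only state that has to survive between two `read_event` calls is "an expanded empty element still needs its closing event".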