diff --git a/Cargo.toml b/Cargo.toml index eb2f794d..4fc26aac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,11 +14,13 @@ rust-version = "1.56" include = ["src/*", "LICENSE-MIT.md", "README.md"] [dependencies] +aquamarine = { version = "0.3", optional = true } document-features = { version = "0.2", optional = true } encoding_rs = { version = "0.8", optional = true } serde = { version = ">=1.0.139", optional = true } tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] } memchr = "2.1" +quick-dtd = { path = "quick-dtd", version = "0.1" } arbitrary = { version = "1", features = ["derive"], optional = true } [dev-dependencies] diff --git a/Changelog.md b/Changelog.md index 5c51eec3..60e5d268 100644 --- a/Changelog.md +++ b/Changelog.md @@ -29,6 +29,8 @@ to get an offset of the error position. For `SyntaxError`s the range - [#362]: Added `escape::minimal_escape()` which escapes only `&` and `<`. - [#362]: Added `BytesCData::minimal_escape()` which escapes only `&` and `<`. - [#362]: Added `Serializer::set_quote_level()` which allow to set desired level of escaping. +- [#690]: Added a low-level hight-performant XML parser in `quick_xml::parser` module. + For advanced use. ### Bug Fixes diff --git a/quick-dtd/Cargo.toml b/quick-dtd/Cargo.toml new file mode 100644 index 00000000..1ff05e06 --- /dev/null +++ b/quick-dtd/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "quick-dtd" +version = "0.1.0" +edition = "2021" + +description = "High performance DTD reader for quick-xml" + +documentation = "https://docs.rs/quick-dtd" +repository = "https://github.com/tafia/quick-xml" + +keywords = ["dtd", "parser", "xml"] +categories = ["parsing", "parser-implementations", "no-std"] +license = "MIT" +rust-version = "1.56" +include = ["src/*", "LICENSE-MIT.md", "README.md"] + +[dependencies] +document-features = { version = "0.2", optional = true } + +[dev-dependencies] +pretty_assertions = "1.4" + +[features] +default = ["std"] + +## Enables support of Rust standard library +std = [] \ No newline at end of file diff --git a/quick-dtd/LICENSE-MIT.md b/quick-dtd/LICENSE-MIT.md new file mode 100644 index 00000000..3329c509 --- /dev/null +++ b/quick-dtd/LICENSE-MIT.md @@ -0,0 +1,23 @@ +The MIT License (MIT) + +Copyright (c) 2023 Mingun + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/quick-dtd/src/comment.rs b/quick-dtd/src/comment.rs new file mode 100644 index 00000000..6c52233b --- /dev/null +++ b/quick-dtd/src/comment.rs @@ -0,0 +1,148 @@ +//! Contains a parser for an XML comment. 
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum State {
+    /// The parser has not yet seen any dashes at the end of the previous slice.
+    Seen0,
+    /// The parser has already seen one dash at the end of the previous slice.
+    Seen1,
+    /// The parser has already seen two dashes at the end of the previous slice.
+    Seen2,
+}
+
+impl Default for State {
+    fn default() -> Self {
+        Self::Seen0
+    }
+}
+
+/// A parser that searches for a `-->` sequence in a slice.
+///
+/// To use the parser, create an instance and [`feed`] data into it.
+/// After a successful search the parser returns [`Some`] with the position
+/// where the comment ends (the position after `-->`). If the search was
+/// unsuccessful, [`None`] is returned. You would typically expect the search
+/// to succeed eventually, so you should feed new data until you get a match.
+///
+/// NOTE: after a successful match the parser does not return to the initial
+/// state and should not be used anymore. Create a new parser if you want to
+/// perform a new search.
+///
+/// # Example
+///
+/// ```
+/// # use quick_dtd::CommentParser;
+/// # use pretty_assertions::assert_eq;
+/// let mut parser = CommentParser::default();
+///
+/// // Parse `<!--a comment-->and the text follow...`,
+/// // starting after `<!--` and split into two chunks
+/// assert_eq!(parser.feed(b"a com"), None);
+/// // ...get another chunk of data
+/// assert_eq!(parser.feed(b"ment-->and the text follow..."), Some(7));
+/// //                       ^      ^
+/// //                       0      6
+/// ```
+///
+/// [`feed`]: Self::feed()
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct CommentParser(State);
+
+impl CommentParser {
+    /// Determines the end position of an XML comment in the provided slice.
+    /// A comment is a piece of text enclosed in `<!--` and `-->` braces.
+    /// A comment ends on the first occurrence of `-->`, which cannot be escaped.
+    ///
+    /// # Parameters
+    /// - `bytes`: a slice in which to search for the end of the comment.
Should contain text in + /// ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, _)) = it.find(|(_, &b)| b == b'>') { + // --|> + if i == 0 && self.0 == State::Seen2 { + // +1 for `>` which should be included in event + return Some(1); + } + // x-|-> + // --|-> + if i == 1 && bytes[0] == b'-' && matches!(self.0, State::Seen1 | State::Seen2) { + // +1 for `>` which should be included in event + return Some(2); + } + if bytes[..i].ends_with(b"--") { + // +1 for `>` which should be included in event + return Some(i + 1); + } + } + if bytes.ends_with(b"--") { + self.0 = State::Seen2; + } else { + self.next_state(bytes.last().copied()); + } + None + } + + #[inline] + fn next_state(&mut self, last: Option) { + match (self.0, last) { + (State::Seen0, Some(b'-')) => self.0 = State::Seen1, + + (State::Seen1, Some(b'-')) => self.0 = State::Seen2, + (State::Seen1, Some(_)) => self.0 = State::Seen0, + + (State::Seen2, Some(b'-')) => {} + (State::Seen2, Some(_)) => self.0 = State::Seen0, + + _ => {} + } + } +} + +#[test] +fn test() { + use pretty_assertions::assert_eq; + use State::*; + + fn parse_comment(bytes: &[u8], initial: State) -> Result { + let mut parser = CommentParser(initial); + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser.0), + } + } + + assert_eq!(parse_comment(b"", Seen0), Err(Seen0)); // xx| + assert_eq!(parse_comment(b"", Seen1), Err(Seen1)); // x-| + assert_eq!(parse_comment(b"", Seen2), Err(Seen2)); // --| + + assert_eq!(parse_comment(b"-", Seen0), Err(Seen1)); // xx|- + assert_eq!(parse_comment(b"-", Seen1), Err(Seen2)); // x-|- + assert_eq!(parse_comment(b"-", Seen2), Err(Seen2)); // --|- + + assert_eq!(parse_comment(b">", Seen0), Err(Seen0)); // xx|> + assert_eq!(parse_comment(b">", Seen1), Err(Seen0)); // x-|> + assert_eq!(parse_comment(b">", Seen2), Ok(1)); // --|> + + assert_eq!(parse_comment(b"--", Seen0), Err(Seen2)); // xx|-- + assert_eq!(parse_comment(b"--", Seen1), Err(Seen2)); // x-|-- + assert_eq!(parse_comment(b"--", Seen2), Err(Seen2)); // --|-- + + assert_eq!(parse_comment(b"->", Seen0), Err(Seen0)); // xx|-> + assert_eq!(parse_comment(b"->", Seen1), Ok(2)); // x-|-> + assert_eq!(parse_comment(b"->", Seen2), Ok(2)); // --|-> + + assert_eq!(parse_comment(b"-->", Seen0), Ok(3)); // xx|--> + assert_eq!(parse_comment(b"-->", Seen1), Ok(3)); // x-|--> + assert_eq!(parse_comment(b"-->", Seen2), Ok(3)); // --|--> + + assert_eq!(parse_comment(b">-->", Seen0), Ok(4)); // xx|>--> + assert_eq!(parse_comment(b">-->", Seen1), Ok(4)); // x-|>--> + assert_eq!(parse_comment(b">-->", Seen2), Ok(1)); // --|>--> + + assert_eq!(parse_comment(b"->-->", Seen0), Ok(5)); // xx|->--> + assert_eq!(parse_comment(b"->-->", Seen1), Ok(2)); // x-|->--> + assert_eq!(parse_comment(b"->-->", Seen2), Ok(2)); // --|->--> +} diff --git a/quick-dtd/src/dtd.rs b/quick-dtd/src/dtd.rs new file mode 100644 index 00000000..b6201888 --- /dev/null +++ b/quick-dtd/src/dtd.rs @@ -0,0 +1,952 @@ +//! Contains the Document Type Definition pull-based parser. + +use crate::{CommentParser, PiParser, QuotedParser}; +use core::iter::Iterator; + +/// An internal state of a parser. Used to preserve information about currently +/// parsed event between calls to [`DtdParser::feed()`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum State { + /// Initial state used to begin parsing DTD events. + Start, + /// A `<` was seen, but nothing else. 
+ Markup, + /// A ` Self { + Self::Start + } +} + +/// A result of feeding data into [`DtdParser`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum FeedResult { + /// All fed bytes should be consumed, new portion should be feed. + NeedData, + /// The specified count of bytes should be consumed from the input. + EmitElement(usize), + /// The specified count of bytes should be consumed from the input. + EmitAttList(usize), + /// The specified count of bytes should be consumed from the input. + EmitEntity(usize), + /// The specified count of bytes should be consumed from the input. + EmitNotation(usize), + /// The specified count of bytes should be consumed from the input. + EmitPI(usize), + /// The specified count of bytes should be consumed from the input. + EmitComment(usize), + + /// Unexpected byte (`u8`) at the specified offset (`usize`) from begin of + /// chunk that was pushed to [`DtdParser::feed()`]. + /// + /// After getting this error the parser returned to the initial state and + /// you can start parsing another DTD event by feeding data. You should, + /// however, skip all unparsed data until `<` byte which is indication of + /// start of a new DTD event. + Unexpected(usize, u8), +} + +/// A parser of Document Type Definition (DTD) schemas. The parser operates on +/// user-provided buffers with content of DTD. The content can be in any ASCII-compatible +/// encoding. +/// +/// # Example +/// +/// ``` +/// # use pretty_assertions::assert_eq; +/// use quick_dtd::{DtdParser, FeedResult}; +/// +/// let mut parser = DtdParser::default(); +/// let mut result = Vec::new(); +/// let mut buf = Vec::new(); +/// // Suppose that you read `chunk` chunks from network, for example +/// 'outer: for chunk in &[ +/// "garbage\n'>", +/// ] { +/// let mut input = chunk.as_bytes(); +/// loop { +/// let consumed = match parser.feed(input) { +/// // All data in `input` was read and parser state didn't changed +/// // You should provide another chunk of data. The `input` should +/// // considered as fully consumed +/// FeedResult::NeedData => { +/// // Store all input to buffer for current event, request the +/// // new data from reader +/// buf.extend_from_slice(input); +/// continue 'outer; +/// } +/// FeedResult::Unexpected(offset, byte) => { +/// match input[offset..].iter().position(|b| *b == b'<') { +/// // Skip all garbage until start of new event +/// Some(end) => { +/// assert_eq!(&input[offset..end], b"garbage\n"); +/// offset + end +/// } +/// None => input.len(), +/// } +/// } +/// +/// FeedResult::EmitElement(offset) | +/// FeedResult::EmitAttList(offset) | +/// FeedResult::EmitEntity(offset) | +/// FeedResult::EmitNotation(offset) | +/// FeedResult::EmitPI(offset) | +/// FeedResult::EmitComment(offset) => { +/// // Store consumed input to buffer for current event +/// buf.extend_from_slice(&input[..offset]); +/// // ..process `buf` with data of events here +/// result.push(String::from_utf8(buf).unwrap()); +/// // Prepare buffer for new data +/// buf = Vec::new(); +/// offset +/// } +/// }; +/// // Skip consumed input, feed the rest on next iteration +/// input = &input[consumed..]; +/// } +/// } +/// +/// assert_eq!(result, [ +/// "", +/// "'>", +/// ]); +/// ``` +#[derive(Copy, Clone, Default, Debug, Eq, PartialEq)] +pub struct DtdParser(State); +impl DtdParser { + /// Provides new portion of data to the parser to parse. 
When this method + /// returns [`FeedResult::NeedData`], the whole buffer was analyzed and no + pub fn feed(&mut self, bytes: &[u8]) -> FeedResult { + for (offset, &byte) in bytes.iter().enumerate() { + let start = offset + 1; + let rest = &bytes[start..]; + self.0 = match self.0 { + State::Start => match byte { + b'<' => State::Markup, + // Skip spaces defined by XML standard + b' ' | b'\t' | b'\r' | b'\n' => continue, + b => return FeedResult::Unexpected(offset, b), + }, + State::Markup => match byte { + b'!' => State::MarkupBang, + b'?' => return self.parse_pi(rest, start, PiParser::default()), + b => return FeedResult::Unexpected(offset, b), + }, + State::MarkupBang => match byte { + b'E' => State::MaybeElementOrEntity, + b'A' => State::MaybeAttList1, + b'N' => State::MaybeNotation1, + b'-' => State::MaybeComment, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElementOrEntity => match byte { + b'L' => State::MaybeElement1, + b'N' => State::MaybeEntity1, + b => return FeedResult::Unexpected(offset, b), + }, + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeComment => match byte { + b'-' => return self.parse_comment(rest, start, CommentParser::default()), + b => return FeedResult::Unexpected(offset, b), + }, + State::Comment(parser) => return self.parse_comment(bytes, offset, parser), + State::PI(parser) => return self.parse_pi(bytes, offset, parser), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeElement1 => match byte { + b'E' => State::MaybeElement2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement2 => match byte { + b'M' => State::MaybeElement3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement3 => match byte { + b'E' => State::MaybeElement4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement4 => match byte { + b'N' => State::MaybeElement5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement5 => match byte { + b'T' => State::MaybeElement6, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeElement6 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => return self.parse_element(rest, start), + b => return FeedResult::Unexpected(offset, b), + }, + State::Element => return self.parse_element(bytes, offset), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeEntity1 => match byte { + b'T' => State::MaybeEntity2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity2 => match byte { + b'I' => State::MaybeEntity3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity3 => match byte { + b'T' => State::MaybeEntity4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity4 => match byte { + b'Y' => State::MaybeEntity5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeEntity5 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + return self.parse_entity(rest, start, QuotedParser::Outside) + } + b => return FeedResult::Unexpected(offset, b), + }, + State::Entity(parser) => return self.parse_entity(bytes, offset, parser), + + 
//---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeAttList1 => match byte { + b'T' => State::MaybeAttList2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList2 => match byte { + b'T' => State::MaybeAttList3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList3 => match byte { + b'L' => State::MaybeAttList4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList4 => match byte { + b'I' => State::MaybeAttList5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList5 => match byte { + b'S' => State::MaybeAttList6, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList6 => match byte { + b'T' => State::MaybeAttList7, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeAttList7 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + return self.parse_attlist(rest, start, QuotedParser::Outside) + } + b => return FeedResult::Unexpected(offset, b), + }, + State::AttList(parser) => return self.parse_attlist(bytes, offset, parser), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeNotation1 => match byte { + b'O' => State::MaybeNotation2, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation2 => match byte { + b'T' => State::MaybeNotation3, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation3 => match byte { + b'A' => State::MaybeNotation4, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation4 => match byte { + b'T' => State::MaybeNotation5, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation5 => match byte { + b'I' => State::MaybeNotation6, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation6 => match byte { + b'O' => State::MaybeNotation7, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation7 => match byte { + b'N' => State::MaybeNotation8, + b => return FeedResult::Unexpected(offset, b), + }, + State::MaybeNotation8 => match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + return self.parse_notation(rest, start, QuotedParser::Outside); + } + b => return FeedResult::Unexpected(offset, b), + }, + State::Notation(parser) => return self.parse_notation(bytes, offset, parser), + }; + } + FeedResult::NeedData + } + + /// `` cannot contain `>` inside, so we emit it as soon as we found `>` + fn parse_element(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + match bytes.iter().position(|&b| b == b'>') { + Some(i) => { + self.0 = State::Start; + // +1 for `>` which should be included in event + FeedResult::EmitElement(offset + i + 1) + } + None => { + self.0 = State::Element; + FeedResult::NeedData + } + } + } + + /// `` can contain `>` inside, but all those symbols either in single or double quotes + fn parse_entity( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + ) -> FeedResult { + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Start; + // +1 for `>` which should be included in event + FeedResult::EmitEntity(offset + i + 1) + } + None => { + self.0 = State::Entity(parser); + FeedResult::NeedData + } + } + } + + /// `` can contain `>` inside, but all those symbols either in single or double quotes + fn 
parse_attlist(
+        &mut self,
+        bytes: &[u8],
+        offset: usize,
+        mut parser: QuotedParser,
+    ) -> FeedResult {
+        match parser.feed(bytes) {
+            Some(i) => {
+                self.0 = State::Start;
+                // +1 for `>` which should be included in event
+                FeedResult::EmitAttList(offset + i + 1)
+            }
+            None => {
+                self.0 = State::AttList(parser);
+                FeedResult::NeedData
+            }
+        }
+    }
+
+    /// `<!NOTATION>` can contain `>` inside, but all those symbols are either in single or double quotes
+    fn parse_notation(
+        &mut self,
+        bytes: &[u8],
+        offset: usize,
+        mut parser: QuotedParser,
+    ) -> FeedResult {
+        match parser.feed(bytes) {
+            Some(i) => {
+                self.0 = State::Start;
+                // +1 for `>` which should be included in event
+                FeedResult::EmitNotation(offset + i + 1)
+            }
+            None => {
+                self.0 = State::Notation(parser);
+                FeedResult::NeedData
+            }
+        }
+    }
+
+    /// Determines the end position of a processing instruction in the provided slice.
+    /// A processing instruction ends on the first occurrence of `?>`, which cannot be
+    /// escaped.
+    ///
+    /// # Parameters
+    /// - `bytes`: a sub-slice of the original slice that was passed to `feed()`.
+    ///   That sub-slice begins at the byte that represents the PI target (at least, it should)
+    /// - `offset`: the position of the `bytes` sub-slice in the one that was passed to `feed()`
+    /// - `parser`: the state of the PI parser saved after consuming the previous chunk of data
+    fn parse_pi(&mut self, bytes: &[u8], offset: usize, mut parser: PiParser) -> FeedResult {
+        match parser.feed(bytes) {
+            Some(i) => {
+                self.0 = State::Start;
+                FeedResult::EmitPI(offset + i)
+            }
+            None => {
+                self.0 = State::PI(parser);
+                FeedResult::NeedData
+            }
+        }
+    }
+
+    /// Determines the end position of a comment in the provided slice.
+    /// A comment ends on the first occurrence of `-->`, which cannot be escaped.
+    ///
+    /// # Parameters
+    /// - `bytes`: a sub-slice of the original slice that was passed to `feed()`.
+    ///   That sub-slice begins at the byte that represents the comment content (at least, it should)
+    /// - `offset`: the position of the `bytes` sub-slice in the one that was passed to `feed()`
+    /// - `parser`: the state of the comment parser saved after consuming the previous chunk of data
+    fn parse_comment(
+        &mut self,
+        bytes: &[u8],
+        offset: usize,
+        mut parser: CommentParser,
+    ) -> FeedResult {
+        match parser.feed(bytes) {
+            Some(i) => {
+                self.0 = State::Start;
+                FeedResult::EmitComment(offset + i)
+            }
+            None => {
+                self.0 = State::Comment(parser);
+                FeedResult::NeedData
+            }
+        }
+    }
+
+    /// Converts this parser into an iterator producing [`FeedResult`]s from the
+    /// specified bytes.
+    pub fn into_iter<'a>(self, bytes: &'a [u8]) -> DtdIter<'a> {
+        DtdIter {
+            chunk: bytes,
+            parser: self,
+        }
+    }
+}
+
+/// This struct is created by the [`into_iter`] method of [`DtdParser`].
+///
+/// [`into_iter`]: DtdParser::into_iter
+pub struct DtdIter<'a> {
+    chunk: &'a [u8],
+    parser: DtdParser,
+}
+impl<'a> DtdIter<'a> {
+    /// Replaces the current chunk of the iterator with a new one. Any unconsumed
+    /// data will be lost, so call this only when you get `FeedResult::NeedData`
+    /// from the iterator.
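+    ///
+    /// A minimal usage sketch (the chunk contents are illustrative; the offsets
+    /// are the values produced by [`DtdParser::feed()`] for these chunks):
+    ///
+    /// ```
+    /// # use quick_dtd::{DtdParser, FeedResult};
+    /// let mut iter = DtdParser::default().into_iter(b"<!ELEMENT br");
+    /// // no `>` was seen yet, so the iterator is exhausted and needs more data
+    /// assert_eq!(iter.next(), None);
+    /// iter.feed(b" EMPTY><!-- done -->");
+    /// assert_eq!(iter.next(), Some(FeedResult::EmitElement(7)));
+    /// assert_eq!(iter.next(), Some(FeedResult::EmitComment(13)));
+    /// assert_eq!(iter.next(), None);
+    /// ```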
+ pub fn feed(&mut self, chunk: &'a [u8]) { + self.chunk = chunk; + } +} +impl<'a> Iterator for DtdIter<'a> { + type Item = FeedResult; + + fn next(&mut self) -> Option { + if self.chunk.is_empty() { + return None; + } + let result = self.parser.feed(self.chunk); + match result { + FeedResult::NeedData => { + // All data consumed, so replace it empty data + self.chunk = b""; + None + } + FeedResult::EmitPI(off) + | FeedResult::EmitEntity(off) + | FeedResult::EmitAttList(off) + | FeedResult::EmitComment(off) + | FeedResult::EmitElement(off) + | FeedResult::EmitNotation(off) + | FeedResult::Unexpected(off, _) => { + self.chunk = &self.chunk[off..]; + Some(result) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::FeedResult::*; + use super::*; + use pretty_assertions::assert_eq; + + fn check(chunk_size: usize, bytes: &[u8]) { + let mut iter = DtdParser::default().into_iter(b""); + for (i, chunk) in bytes.chunks(chunk_size).enumerate() { + iter.feed(chunk); + while let Some(event) = iter.next() { + assert!( + !matches!(event, FeedResult::Unexpected(..)), + "#{}: {:?} => {:?}\n{:?}", + i * chunk_size, + iter.parser.0, + event, + core::str::from_utf8(chunk).unwrap(), + ); + } + } + } + + mod by_chunks { + use super::*; + + const BYTES: &[u8] = include_bytes!("../tests/example.dtd"); + + #[test] + fn _1() { + check(1, BYTES); + } + + #[test] + fn _2() { + check(2, BYTES); + } + + #[test] + fn _3() { + check(3, BYTES); + } + + #[test] + fn _5() { + check(5, BYTES); + } + + #[test] + fn _7() { + check(7, BYTES); + } + + #[test] + fn _11() { + check(11, BYTES); + } + + #[test] + fn _13() { + check(13, BYTES); + } + + #[test] + fn _17() { + check(17, BYTES); + } + + #[test] + fn _19() { + check(19, BYTES); + } + + #[test] + fn _23() { + check(23, BYTES); + } + + #[test] + fn _29() { + check(29, BYTES); + } + + #[test] + fn _31() { + check(31, BYTES); + } + + #[test] + fn _37() { + check(37, BYTES); + } + + #[test] + fn _41() { + check(41, BYTES); + } + + #[test] + fn _43() { + check(43, BYTES); + } + + #[test] + fn _47() { + check(47, BYTES); + } + } + + #[test] + fn element() { + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::Element); + + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::Element); + + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b">"), EmitElement(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Element); + assert_eq!(parser.feed(b">a"), EmitElement(1)); + assert_eq!(parser.0, State::Start); + } + + #[test] + fn attlist() { + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), 
EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b">a"), EmitAttList(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'>\"'>"), EmitAttList(5)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::AttList(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\"'>\">"), EmitAttList(5)); + assert_eq!(parser.0, State::Start); + } + + #[test] + fn entity() { + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b">a"), EmitEntity(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'>\"'>"), EmitEntity(5)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Entity(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\"'>\">"), EmitEntity(5)); + assert_eq!(parser.0, State::Start); + } + + #[test] + fn notation() { + let mut parser = 
DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::SingleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"'"), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::DoubleQ)); + assert_eq!(parser.feed(b"\""), NeedData); + assert_eq!(parser.0, State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b">a"), EmitNotation(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"'>\"'>"), EmitNotation(5)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::Notation(QuotedParser::Outside)); + assert_eq!(parser.feed(b"\"'>\">"), EmitNotation(5)); + assert_eq!(parser.0, State::Start); + } + + /*#[test] + fn pi() { + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b""), NeedData); + assert_eq!(parser.0, State::PI(true)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"aa"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"aa"), NeedData); + assert_eq!(parser.0, State::PI(false)); + + //---------------------------------------------------------------------- + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?a"), 
NeedData); + assert_eq!(parser.0, State::PI(false)); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a?"), NeedData); + assert_eq!(parser.0, State::PI(true)); + + //---------------------------------------------------------------------- + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b">"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b">"), EmitPI(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b">a"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b">a"), EmitPI(1)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a>"), NeedData); + assert_eq!(parser.0, State::PI(false)); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a>"), NeedData); + assert_eq!(parser.0, State::PI(false)); + + //---------------------------------------------------------------------- + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?>"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?>"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"?>a"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"?>a"), EmitPI(2)); + assert_eq!(parser.0, State::Start); + + let mut parser = DtdParser(State::PI(false)); + assert_eq!(parser.feed(b"a?>"), EmitPI(3)); + assert_eq!(parser.0, State::Start); + let mut parser = DtdParser(State::PI(true)); + assert_eq!(parser.feed(b"a?>"), EmitPI(3)); + assert_eq!(parser.0, State::Start); + }*/ +} diff --git a/quick-dtd/src/lib.rs b/quick-dtd/src/lib.rs new file mode 100644 index 00000000..2550a057 --- /dev/null +++ b/quick-dtd/src/lib.rs @@ -0,0 +1,29 @@ +//! High performant Document Type Definition (DTD) parser. +//! +//! # Features +//! +//! `quick-dtd` supports the following features: +#![cfg_attr( + feature = "document-features", + cfg_attr(doc, doc = ::document_features::document_features!( + // Replicates the default format, but adds an anchor to the feature + feature_label = "{feature}" + )) +)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] +// Enable feature requirements in the docs from 1.57 +// See https://stackoverflow.com/questions/61417452 +#![cfg_attr(docs_rs, feature(doc_auto_cfg))] +#![cfg_attr(not(feature = "std"), no_std)] + +mod dtd; +// Helper reusable parsers +mod comment; +mod pi; +mod quoted; + +pub use comment::CommentParser; +pub use dtd::{DtdIter, DtdParser, FeedResult}; +pub use pi::PiParser; +pub use quoted::{QuotedParser, OneOf}; diff --git a/quick-dtd/src/pi.rs b/quick-dtd/src/pi.rs new file mode 100644 index 00000000..e8957f7b --- /dev/null +++ b/quick-dtd/src/pi.rs @@ -0,0 +1,92 @@ +//! Contains a parser for an XML processing instruction. + +/// A parser that search a `?>` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. 
+/// After successful search the parser will return [`Some`] with position where +/// processing instruction is ended (the position after `?>`). If search was +/// unsuccessful, a [`None`] will be returned. You typically would expect positive +/// result of search, so that you should feed new data until yo'll get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// # Example +/// +/// ``` +/// # use quick_dtd::PiParser; +/// # use pretty_assertions::assert_eq; +/// let mut parser = PiParser::default(); +/// +/// // Parse `and the text follow...` +/// // splitted into three chunks +/// assert_eq!(parser.feed(b" and ?"), None); +/// // ...get another chunk of data +/// assert_eq!(parser.feed(b"inside?>and the text follow..."), Some(8)); +/// // ^ ^ +/// // 0 7 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct PiParser( + /// A flag that indicates was the `bytes` in the previous attempt to find the + /// end ended with `?`. + bool, +); + +impl PiParser { + /// Determines the end position of a processing instruction in the provided slice. + /// Processing instruction ends on the first occurrence of `?>` which cannot be + /// escaped. + /// + /// # Parameters + /// - `bytes`: a slice to find the end of a processing instruction. + /// Should contain text in ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, _)) = it.find(|(_, &b)| b == b'>') { + match i { + // +1 for `>` which should be included in event + 0 if self.0 => return Some(1), + // If the previous byte is `?`, then we found `?>` + // +1 for `>` which should be included in event + i if i > 0 && bytes[i - 1] == b'?' => return Some(i + 1), + _ => {} + } + } + self.0 = bytes.last().copied() == Some(b'?'); + None + } +} + +#[test] +fn pi() { + use pretty_assertions::assert_eq; + + fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result { + let mut parser = PiParser(had_question_mark); + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser.0), + } + } + + assert_eq!(parse_pi(b"", false), Err(false)); // x| + assert_eq!(parse_pi(b"", true), Err(false)); // ?| + + assert_eq!(parse_pi(b"?", false), Err(true)); // x|? + assert_eq!(parse_pi(b"?", true), Err(true)); // ?|? + + assert_eq!(parse_pi(b">", false), Err(false)); // x|> + assert_eq!(parse_pi(b">", true), Ok(1)); // ?|> + + assert_eq!(parse_pi(b"?>", false), Ok(2)); // x|?> + assert_eq!(parse_pi(b"?>", true), Ok(2)); // ?|?> + + assert_eq!(parse_pi(b">?>", false), Ok(3)); // x|>?> + assert_eq!(parse_pi(b">?>", true), Ok(1)); // ?|>?> +} diff --git a/quick-dtd/src/quoted.rs b/quick-dtd/src/quoted.rs new file mode 100644 index 00000000..aca431a1 --- /dev/null +++ b/quick-dtd/src/quoted.rs @@ -0,0 +1,104 @@ +/// Represents the result of [`QuotedParser::one_of`] operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum OneOf { + /// The open angle bracket (`<`) was found as specified position. + /// + /// The open angle bracket could only be part of a tag inside DTD + /// if DTD is correctly formed. + Open(usize), + /// The close angle bracket (`>`) was found as specified position. + Close(usize), + /// Nothing was found in the provided slice. + None, +} + +/// A parser that search a `>` symbol in the slice outside of quoted regions. 
+///
+/// The parser considers two quoted regions: a double-quoted (`"..."`) and
+/// a single-quoted (`'...'`) region. Matches found inside those regions are
+/// not considered as results. Each region starts and ends with its quote
+/// symbol, which cannot be escaped (although it can be encoded as an XML
+/// character entity or a named entity; such an encoding does not contain
+/// literal quotes).
+///
+/// To use the parser, create an instance and [`feed`] data into it.
+/// After a successful search the parser returns [`Some`] with the position of
+/// the found symbol. If the search is unsuccessful, [`None`] is returned. You
+/// would typically expect the search to succeed eventually, so you should feed
+/// new data until you get a match.
+///
+/// # Example
+///
+/// ```
+/// # use quick_dtd::QuotedParser;
+/// # use pretty_assertions::assert_eq;
+/// let mut parser = QuotedParser::default();
+///
+/// // Parse `<!ENTITY gt '>'>and the text follow...`,
+/// // starting after `<!ENTITY` and split into two chunks
+/// assert_eq!(parser.feed(b" gt '>"), None);
+/// // ...get another chunk of data
+/// assert_eq!(parser.feed(b"'>and the text follow..."), Some(1));
+/// //                       ^^
+/// //                       01
+/// ```
+///
+/// [`feed`]: Self::feed()
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum QuotedParser {
+    /// The initial state (inside an element, but outside of an attribute value).
+    Outside,
+    /// Inside a single-quoted region.
+    SingleQ,
+    /// Inside a double-quoted region.
+    DoubleQ,
+}
+impl QuotedParser {
+    /// Returns the position of `>` outside of quoted regions, or `None` if it
+    /// was not found in `bytes`.
+    pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
+        let mut it = bytes.iter().enumerate();
+        while let Some((i, &byte)) = it.find(|(_, &b)| matches!(b, b'>' | b'\'' | b'"')) {
+            match (*self, byte) {
+                // only allowed to match `>` while we are in state `Outside`
+                (Self::Outside, b'>') => return Some(i),
+                (Self::Outside, b'\'') => *self = Self::SingleQ,
+                (Self::Outside, b'\"') => *self = Self::DoubleQ,
+
+                // the only end byte that gets us out is the same quote character
+                (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => *self = Self::Outside,
+
+                // all other bytes: no state change
+                _ => {}
+            }
+        }
+        None
+    }
+
+    /// Returns the position of the first `<` or `>` found outside of quoted
+    /// regions, or [`OneOf::None`] if neither was found in `bytes`.
+ pub fn one_of(&mut self, bytes: &[u8]) -> OneOf { + let mut it = bytes.iter().enumerate(); + while let Some((i, &byte)) = it.find(|(_, &b)| matches!(b, b'<' | b'>' | b'\'' | b'"')) { + match (*self, byte) { + // only allowed to match `>` while we are in state `Outside` + (Self::Outside, b'<') => return OneOf::Open(i), + (Self::Outside, b'>') => return OneOf::Close(i), + (Self::Outside, b'\'') => *self = Self::SingleQ, + (Self::Outside, b'\"') => *self = Self::DoubleQ, + + // the only end_byte that gets us out if the same character + (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => *self = Self::Outside, + + // all other bytes: no state change + _ => {} + } + } + OneOf::None + } +} + +impl Default for QuotedParser { + fn default() -> Self { + Self::Outside + } +} diff --git a/quick-dtd/tests/example.dtd b/quick-dtd/tests/example.dtd new file mode 100644 index 00000000..8192eec7 --- /dev/null +++ b/quick-dtd/tests/example.dtd @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + +"> + + + + + + + + + + + +"> +'> +"> +' NDATA n-data> +'> +' NDATA n-data> + +"> +'> +"> +'> +'> +'> + + +"> +'> +'> +'> + + + + + +?> + +--> \ No newline at end of file diff --git a/src/events/mod.rs b/src/events/mod.rs index 546ad392..3ee52721 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -76,8 +76,12 @@ pub struct BytesStart<'a> { impl<'a> BytesStart<'a> { /// Internal constructor, used by `Reader`. Supplies data in reader's encoding #[inline] - pub(crate) fn wrap(content: &'a [u8], name_len: usize) -> Self { - BytesStart { + pub(crate) fn wrap(content: &'a [u8]) -> Self { + let name_len = content + .iter() + .position(|&b| is_whitespace(b)) + .unwrap_or(content.len()); + Self { buf: Cow::Borrowed(content), name_len, } @@ -1076,7 +1080,7 @@ fn str_cow_to_bytes<'a, C: Into>>(content: C) -> Cow<'a, [u8]> { /// Returns a byte slice with leading XML whitespace bytes removed. /// /// 'Whitespace' refers to the definition used by [`is_whitespace`]. -const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { +pub(crate) const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { // Note: A pattern matching based approach (instead of indexing) allows // making the function const. while let [first, rest @ ..] = bytes { @@ -1092,7 +1096,7 @@ const fn trim_xml_start(mut bytes: &[u8]) -> &[u8] { /// Returns a byte slice with trailing XML whitespace bytes removed. /// /// 'Whitespace' refers to the definition used by [`is_whitespace`]. -const fn trim_xml_end(mut bytes: &[u8]) -> &[u8] { +pub(crate) const fn trim_xml_end(mut bytes: &[u8]) -> &[u8] { // Note: A pattern matching based approach (instead of indexing) allows // making the function const. while let [rest @ .., last] = bytes { diff --git a/src/lib.rs b/src/lib.rs index db164e21..a5d3768a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,6 +64,7 @@ pub mod escape { } pub mod events; pub mod name; +pub mod parser; pub mod reader; #[cfg(feature = "serialize")] pub mod se; diff --git a/src/parser/bom.rs b/src/parser/bom.rs new file mode 100644 index 00000000..992abb58 --- /dev/null +++ b/src/parser/bom.rs @@ -0,0 +1,148 @@ +//! A parser for encoding detection using BOM and heuristics. + +/// A result of feeding data into [`BomParser`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum FeedResult { + /// All fed bytes should be consumed, new portion should be feed. + NeedData, + /// Encoding detected as UTF-16 Big-Endian based on the first 4 bytes of content. + /// Nothing should be consumed. 
+ Utf16Be, + /// Encoding detected as UTF-16 Little-Endian based on the first 4 bytes of content. + /// Nothing should be consumed. + Utf16Le, + /// Encoding detected as UTF-8 on the first 4 bytes of content. + /// Nothing should be consumed. + Utf8, + /// Encoding detected as UTF-16 Big-Endian based on the first 4 bytes of content. + /// The 2 bytes of BOM should be consumed. + Utf16BeBom, + /// Encoding detected as UTF-16 Little-Endian based on the first 4 bytes of content. + /// The 2 bytes of BOM should be consumed. + Utf16LeBom, + /// Encoding detected as UTF-8 based on the first 3 bytes of content. + /// The 3 bytes of BOM should be consumed. + Utf8Bom, + /// Encoding was not recognized. Nothing should be consumed. + Unknown, +} + +/// Implements automatic encoding detection of XML using the +/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing). +/// +/// IF encoding was not recognized, [`FeedResult::Unknown`] is returned, otherwise +/// `Utf*` variant is returned. +/// +/// Because the [`encoding_rs`] crate supports only subset of those encodings, only +/// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. +/// +/// The algorithm suggests examine up to the first 4 bytes to determine encoding +/// according to the following table: +/// +/// | Bytes |Detected encoding +/// |-------------|------------------------------------------ +/// | **BOM** +/// |`FE_FF_##_##`|UTF-16, big-endian +/// |`FF FE ## ##`|UTF-16, little-endian +/// |`EF BB BF` |UTF-8 +/// | **No BOM** +/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) +/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) +/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[allow(non_camel_case_types)] +pub enum BomParser { + X00, + X00_3C, + X00_3C_00, + + X3C, + X3C_00, + X3C_00_3F, + + X3C_3F, + X3C_3F_78, // FeedResult { + for &byte in bytes.iter() { + *self = match self { + //---------------------------------------------------------------------------------- + // UTF-16 BE without BOM 00 < 00 ? + //---------------------------------------------------------------------------------- + Self::X00 => match byte { + b'<' => Self::X00_3C, + _ => return FeedResult::Unknown, + }, + Self::X00_3C => match byte { + 0x00 => Self::X00_3C_00, + _ => return FeedResult::Unknown, + }, + Self::X00_3C_00 => match byte { + b'?' => return FeedResult::Utf16Be, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-16 LE without BOM < 00 ? 00 + //---------------------------------------------------------------------------------- + Self::X3C => match byte { + 0x00 => Self::X3C_00, + b'?' => Self::X3C_3F, + _ => return FeedResult::Unknown, + }, + Self::X3C_00 => match byte { + b'?' 
=> Self::X3C_00_3F, + _ => return FeedResult::Unknown, + }, + Self::X3C_00_3F => match byte { + 0x00 => return FeedResult::Utf16Le, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-8-like without BOM < ? x m + //---------------------------------------------------------------------------------- + Self::X3C_3F => match byte { + b'x' => Self::X3C_3F_78, + _ => return FeedResult::Unknown, + }, + Self::X3C_3F_78 => match byte { + b'm' => return FeedResult::Utf8, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-16 BE with BOM FE FF + //---------------------------------------------------------------------------------- + Self::XFE => match byte { + 0xFF => return FeedResult::Utf16BeBom, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-16 LE with BOM FF FE + //---------------------------------------------------------------------------------- + Self::XFF => match byte { + 0xFE => return FeedResult::Utf16LeBom, + _ => return FeedResult::Unknown, + }, + //---------------------------------------------------------------------------------- + // UTF-8 with BOM EF BB + //---------------------------------------------------------------------------------- + Self::XEF => match byte { + 0xBB => Self::XEF_BB, + _ => return FeedResult::Unknown, + }, + Self::XEF_BB => match byte { + 0xBF => return FeedResult::Utf8Bom, + _ => return FeedResult::Unknown, + }, + } + } + FeedResult::NeedData + } +} diff --git a/src/parser/cdata.rs b/src/parser/cdata.rs new file mode 100644 index 00000000..af6d1efe --- /dev/null +++ b/src/parser/cdata.rs @@ -0,0 +1,126 @@ +//! Contains a parser for an XML CDATA content. + +/// A parser that search a `]]>` sequence in the slice. +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position where +/// comment is ended (the position after `]]>`). If search was unsuccessful, +/// a [`None`] will be returned. You typically would expect positive result of +/// search, so that you should feed new data until yo'll get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// [`feed`]: Self::feed() +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum CDataParser { + /// The parser does not yet seen any braces at the end of previous slice. + Seen0, + /// The parser already seen one brace on the end of previous slice. + Seen1, + /// The parser already seen two braces on the end of previous slice. + Seen2, +} + +impl CDataParser { + /// Determines the end position of an XML character data in the provided slice. + /// Character data (CDATA) is a pieces of text enclosed in `` braces. + /// Character data ends on the first occurrence of `]]>` which cannot be escaped. + /// + /// # Parameters + /// - `bytes`: a slice to search end of CDATA. 
Should contain text in + /// ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Option { + let mut it = bytes.iter().enumerate(); + while let Some((i, _)) = it.find(|(_, &b)| b == b'>') { + // ]]|> + if i == 0 && *self == Self::Seen2 { + // +1 for `>` which should be included in event + return Some(1); + } + // x]|]> + // ]]|]> + if i == 1 && bytes[0] == b']' && matches!(self, Self::Seen1 | Self::Seen2) { + // +1 for `>` which should be included in event + return Some(2); + } + if bytes[..i].ends_with(b"]]") { + // +1 for `>` which should be included in event + return Some(i + 1); + } + } + if bytes.ends_with(b"]]") { + *self = Self::Seen2; + } else { + *self = self.next_state(bytes.last().copied()); + } + None + } + + #[inline] + fn next_state(self, last: Option) -> Self { + match (self, last) { + (Self::Seen0, Some(b']')) => Self::Seen1, + + (Self::Seen1, Some(b']')) => Self::Seen2, + (Self::Seen1, Some(_)) => Self::Seen0, + + (Self::Seen2, Some(b']')) => self, + (Self::Seen2, Some(_)) => Self::Seen0, + + _ => self, + } + } +} + +impl Default for CDataParser { + fn default() -> Self { + Self::Seen0 + } +} + +#[test] +fn test() { + use pretty_assertions::assert_eq; + use CDataParser::*; + + fn parse_cdata(bytes: &[u8], mut parser: CDataParser) -> Result { + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser), + } + } + + assert_eq!(parse_cdata(b"", Seen0), Err(Seen0)); // xx| + assert_eq!(parse_cdata(b"", Seen1), Err(Seen1)); // x]| + assert_eq!(parse_cdata(b"", Seen2), Err(Seen2)); // ]]| + + assert_eq!(parse_cdata(b"]", Seen0), Err(Seen1)); // xx|] + assert_eq!(parse_cdata(b"]", Seen1), Err(Seen2)); // x]|] + assert_eq!(parse_cdata(b"]", Seen2), Err(Seen2)); // ]]|] + + assert_eq!(parse_cdata(b">", Seen0), Err(Seen0)); // xx|> + assert_eq!(parse_cdata(b">", Seen1), Err(Seen0)); // x]|> + assert_eq!(parse_cdata(b">", Seen2), Ok(1)); // ]]|> + + assert_eq!(parse_cdata(b"]]", Seen0), Err(Seen2)); // xx|]] + assert_eq!(parse_cdata(b"]]", Seen1), Err(Seen2)); // x]|]] + assert_eq!(parse_cdata(b"]]", Seen2), Err(Seen2)); // ]]|]] + + assert_eq!(parse_cdata(b"]>", Seen0), Err(Seen0)); // xx|]> + assert_eq!(parse_cdata(b"]>", Seen1), Ok(2)); // x]|]> + assert_eq!(parse_cdata(b"]>", Seen2), Ok(2)); // ]]|]> + + assert_eq!(parse_cdata(b"]]>", Seen0), Ok(3)); // xx|]]> + assert_eq!(parse_cdata(b"]]>", Seen1), Ok(3)); // x]|]]> + assert_eq!(parse_cdata(b"]]>", Seen2), Ok(3)); // ]]|]]> + + assert_eq!(parse_cdata(b">]]>", Seen0), Ok(4)); // xx|>]]> + assert_eq!(parse_cdata(b">]]>", Seen1), Ok(4)); // x]|>]]> + assert_eq!(parse_cdata(b">]]>", Seen2), Ok(1)); // ]]|>]]> + + assert_eq!(parse_cdata(b"]>]]>", Seen0), Ok(5)); // xx|]>]]> + assert_eq!(parse_cdata(b"]>]]>", Seen1), Ok(2)); // x]|]>]]> + assert_eq!(parse_cdata(b"]>]]>", Seen2), Ok(2)); // ]]|]>]]> +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs new file mode 100644 index 00000000..46a14007 --- /dev/null +++ b/src/parser/mod.rs @@ -0,0 +1,1026 @@ +//! A low-level XML parser. For advanced use. It is very low-level and you +//! typically should not use it. Use a [`Reader`] instead. +//! +//! To use a parser create an instance of [`Parser`] and [`feed`] data into it. +//! After successful search the parser will return [`FeedResult`] with position +//! where match was found and returned variant will represent what exactly was +//! found. In case if the provided data is not enough to made any decision, a +//! [`FeedResult::NeedData`] is returned. Finally, if parser encounters a byte +//! 
that should not be there, a [`SyntaxError`] is returned. +//! +//! To fully parse a document you should pass unconsumed data to [`feed`] in a +//! loop, that means `&bytes[offset..]` for `Emit*` cases and a completely new +//! slice for a `NeedData` case: +//! +//! ``` +//! # use quick_xml::parser::Parser; +//! use quick_xml::parser::FeedResult::*; +//! // Use `without_encoding_detection` instead if you don't want +//! // automatic encoding detection +//! let mut parser = Parser::default(); +//! // Buffer for data of one event +//! let mut buf = Vec::new(); +//! // Feed data by 3 bytes at once +//! for (i, mut chunk) in b"".chunks(3).enumerate() { +//! loop { +//! match parser.feed(chunk).unwrap() { +//! // Return to the outer loop to request new chunk +//! NeedData => break, +//! +//! EncodingUtf8Like(offset) | +//! EncodingUtf16BeLike(offset) | +//! EncodingUtf16LeLike(offset) => { +//! // Consume BOM, but do not add it to the data +//! chunk = &chunk[offset..]; +//! } +//! EmitText(offset) | +//! EmitCData(offset) | +//! EmitComment(offset) | +//! EmitDoctype(offset) | +//! EmitPI(offset) | +//! EmitEmptyTag(offset) | +//! EmitStartTag(offset) | +//! EmitEndTag(offset) => { +//! // Append data of an event to the buffer +//! buf.extend_from_slice(&chunk[..offset]); +//! +//! // Consume already read data +//! chunk = &chunk[offset..]; +//! +//! // Emit new event using `buf` +//! // ... +//! +//! // If content of buffer is not required anymore, it can be cleared +//! buf.clear(); +//! } +//! } +//! } +//! } +//! ``` +//! +//! [`Reader`]: crate::Reader +//! [`feed`]: Parser::feed() + +use crate::errors::SyntaxError; +use bom::BomParser; +use cdata::CDataParser; +use quick_dtd::{CommentParser, DtdParser, PiParser, QuotedParser, OneOf}; + +mod bom; +mod cdata; + +/// An internal state of a parser. Used to preserve information about currently +/// parsed event between calls to [`Parser::feed()`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum State { + /// Initial state used to begin parse XML events. + Start, + Bom(BomParser), + Text, + + /// A `<` was seen, but nothing else. + Markup, + /// A ``. + Doctype(QuotedParser), + /// We are inside of `[]` of `` definition. + Dtd(DtdParser), + /// We are after `]` of `` definition, looking for `>`. + DoctypeFinish, + + /// A `` was not. Parser expect more data to close a tag + /// and emit [`FeedResult::EmitEmptyTag`]. + EndTag, + /// A `<*` was seen, but nothing else where `*` is an any byte, except `!`, `?`, or `/`. + /// It is unable to understand right now what data follow. + StartOrEmptyTag(QuotedParser, bool), +} + +impl Default for State { + fn default() -> Self { + Self::Start + } +} + +/// A result of feeding data into [`Parser`]. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum FeedResult { + /// All fed bytes should be consumed, new portion should be feed + NeedData, + + /// The specified amount of bytes should be consumed from the input and + /// encoding of the document set to the UTF-8 compatible. + /// The encoding should be refined after reading XML declaration. + EncodingUtf8Like(usize), + /// The specified amount of bytes should be consumed from the input and + /// encoding of the document set to the UTF-16 Big-Endian compatible. + /// The encoding should be refined after reading XML declaration. + EncodingUtf16BeLike(usize), + /// The specified amount of bytes should be consumed from the input and + /// encoding of the document set to the UTF-16 Little-Endian compatible. 
+ /// The encoding should be refined after reading XML declaration. + EncodingUtf16LeLike(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Text`] should be emitted. + /// + /// [`Event::Text`]: crate::events::Event::Text + EmitText(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::CData`] should be emitted. + /// + /// [`Event::CData`]: crate::events::Event::CData + EmitCData(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Comment`] should be emitted. + /// + /// [`Event::Comment`]: crate::events::Event::Comment + EmitComment(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::DocType`] should be emitted. + /// + /// [`Event::DocType`]: crate::events::Event::DocType + EmitDoctype(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::PI`] should be emitted. + /// + /// [`Event::PI`]: crate::events::Event::PI + EmitPI(usize), + + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Empty`] should be emitted. + /// + /// [`Event::Empty`]: crate::events::Event::Empty + EmitEmptyTag(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::Start`] should be emitted. + /// + /// [`Event::Start`]: crate::events::Event::Start + EmitStartTag(usize), + /// The specified amount of bytes should be consumed from the input and + /// [`Event::End`] should be emitted. + /// + /// [`Event::End`]: crate::events::Event::End + EmitEndTag(usize), +} + +// convert `mermaid` block to a diagram +#[cfg_attr(doc, aquamarine::aquamarine)] +/// A low-level XML parser that searches a boundaries of various kinds of XML +/// events in the provided slice. 
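+///
+/// A minimal usage sketch (the offsets follow the same convention as the unit
+/// tests at the end of this module; the behaviour of the `quick_dtd`
+/// sub-parsers is assumed to match them):
+///
+/// ```
+/// # use quick_xml::parser::{FeedResult, Parser};
+/// let mut parser = Parser::without_encoding_detection();
+/// // The `>` inside the quoted attribute value does not end the tag;
+/// // 15 is the length of `<a href='x>y'/>`, including the final `>`
+/// assert_eq!(parser.feed(b"<a href='x>y'/>"), Ok(FeedResult::EmitEmptyTag(15)));
+/// ```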
+/// +/// The parser represents a state machine with following states: +/// +/// ```mermaid +/// flowchart TD +/// Text -->|<| Markup +/// Text -->|*| Text +/// +/// Markup --> |!| CommentOrCDataOrDoctype +/// Markup --->|?| PIParser1 +/// Markup --->|/| EndTagParser +/// Markup --> |*| StartOrEmptyTag +/// +/// CommentOrCDataOrDoctype -->|-| CommentParser +/// CommentOrCDataOrDoctype -->|D| DoctypeParser1 +/// CommentOrCDataOrDoctype -->|d| DoctypeParser1 +/// CommentOrCDataOrDoctype -->|"["| CDataParser1 +/// CommentOrCDataOrDoctype -->|*| Error +/// +/// subgraph comment +/// CommentParser -->|-| CommentContent1 +/// CommentParser ----->|*| CommentError +/// +/// CommentContent1 -->|-| CommentContent2 +/// CommentContent1 -->|*| CommentContent1 +/// +/// CommentContent2 -->|-| CommentContent3 +/// CommentContent2 -->|*| CommentContent1 +/// +/// CommentContent3 -->|>| Comment +/// CommentContent3 -->|*| CommentContent1 +/// end +/// subgraph doctype +/// DoctypeParser1 -->|O| DoctypeParser2 +/// DoctypeParser1 -->|o| DoctypeParser2 +/// DoctypeParser1 ---->|*| DoctypeError +/// +/// DoctypeParser2 -->|C| DoctypeParser3 +/// DoctypeParser2 -->|c| DoctypeParser3 +/// DoctypeParser2 ---->|*| DoctypeError +/// +/// DoctypeParser3 -->|T| DoctypeParser4 +/// DoctypeParser3 -->|t| DoctypeParser4 +/// DoctypeParser3 ---->|*| DoctypeError +/// +/// DoctypeParser4 -->|Y| DoctypeParser5 +/// DoctypeParser4 -->|y| DoctypeParser5 +/// DoctypeParser4 ---->|*| DoctypeError +/// +/// DoctypeParser5 -->|P| DoctypeParser6 +/// DoctypeParser5 -->|p| DoctypeParser6 +/// DoctypeParser5 ---->|*| DoctypeError +/// +/// DoctypeParser6 -->|E| DoctypeContent1 +/// DoctypeParser6 -->|e| DoctypeContent1 +/// DoctypeParser6 ---->|*| DoctypeError +/// +/// DoctypeContent1 -->|!| DoctypeContent2 +/// DoctypeContent1 -->|*| DoctypeContent1 +/// +/// DoctypeContent2 -->|>| Doctype +/// DoctypeContent2 -->|*| DoctypeContent1 +/// end +/// subgraph cdata +/// CDataParser1 -->|C| CDataParser2 +/// CDataParser1 ----->|*| CDataError +/// CDataParser2 -->|D| CDataParser3 +/// CDataParser2 ----->|*| CDataError +/// CDataParser3 -->|A| CDataParser4 +/// CDataParser3 ----->|*| CDataError +/// CDataParser4 -->|T| CDataParser5 +/// CDataParser4 ----->|*| CDataError +/// CDataParser5 -->|A| CDataParser6 +/// CDataParser5 ----->|*| CDataError +/// CDataParser6 -->|"["| CDataContent1 +/// CDataParser6 ----->|*| CDataError +/// +/// CDataContent1 -->|"]"| CDataContent2 +/// CDataContent1 -->|*| CDataContent1 +/// +/// CDataContent2 -->|"]"| CDataContent3 +/// CDataContent2 -->|*| CDataContent1 +/// +/// CDataContent3 -->|>| CData +/// CDataContent3 -->|*| CDataContent1 +/// end +/// +/// subgraph pi_parser +/// PIParser1 -->|?| PIParser2 +/// PIParser1 -->|*| PIParser1 +/// +/// PIParser2 -->|>| PI +/// PIParser2 -->|*| PIError +/// end +/// +/// subgraph end_tag +/// EndTagParser -->|>| EndTag +/// EndTagParser -->|*| EndTagError +/// end +/// +/// StartOrEmptyTag --> |/| EmptyTagParser +/// StartOrEmptyTag --->|>| StartTag +/// StartOrEmptyTag --> |*| StartOrEmptyTag +/// +/// subgraph empty_tag +/// EmptyTagParser -->|>| EmptyTag +/// EmptyTagParser -->|*| EmptyTagError +/// end +/// ``` +/// +/// Every arrow on that diagram is marked with a byte that initiates that transition. +/// Transition marked with asterisks (`*`) represents any byte except explicitly +/// mentioned in other transitions from that state. +/// +/// Each `Error` state on that diagram represents a [`SyntaxError`]. 
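+///
+/// For markup events the offset carried by an `Emit*` result includes the
+/// terminating `>`, so consuming exactly that many bytes leaves the input
+/// positioned on the following content. A small sketch of that convention,
+/// consistent with the tests below:
+///
+/// ```
+/// # use quick_xml::parser::{FeedResult, Parser};
+/// let mut parser = Parser::without_encoding_detection();
+/// // 7 bytes: `</done>`, the trailing `>` included
+/// assert_eq!(parser.feed(b"</done>tail"), Ok(FeedResult::EmitEndTag(7)));
+/// ```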
+/// Every successful match (`Emit*`) returns the parser to state `Text`. +#[derive(Copy, Clone, Default, Debug, Eq, PartialEq)] +pub struct Parser(State); +impl Parser { + /// Creates a parser that would not try to guess encoding from the input text. + /// This is useful when you already knows the encoding and parses a part of document. + #[inline] + pub fn without_encoding_detection() -> Self { + Self(State::Text) + } + + /// Performs parsing of the provided byte slice and returns the outcome. + /// See [`Parser`] for more info. + /// + /// # Parameters + /// - `bytes`: a slice to search a new XML event. Should contain text in + /// ASCII-compatible encoding + pub fn feed(&mut self, bytes: &[u8]) -> Result { + dbg!((self.0, crate::utils::Bytes(bytes))); + for (offset, &byte) in bytes.iter().enumerate() { + let trail = &bytes[offset..]; + let start = offset + 1; + let rest = &bytes[start..]; + dbg!((self.0, offset, byte as char, crate::utils::Bytes(trail), crate::utils::Bytes(rest))); + self.0 = match self.0 { + State::Start => match byte { + 0x00 => State::Bom(BomParser::X00), + b'<' => State::Bom(BomParser::X3C), + 0xEF => State::Bom(BomParser::XEF), + 0xFE => State::Bom(BomParser::XFE), + 0xFF => State::Bom(BomParser::XFF), + _ => return Ok(self.parse_text(trail, offset)), + }, + State::Bom(ref mut parser) => { + let encoding = match parser.feed(trail) { + bom::FeedResult::Unknown => FeedResult::EncodingUtf8Like(0), + bom::FeedResult::Utf8 => FeedResult::EncodingUtf8Like(0), + bom::FeedResult::Utf16Be => FeedResult::EncodingUtf16BeLike(0), + bom::FeedResult::Utf16Le => FeedResult::EncodingUtf16LeLike(0), + bom::FeedResult::Utf8Bom => FeedResult::EncodingUtf8Like(3), + bom::FeedResult::Utf16BeBom => FeedResult::EncodingUtf16BeLike(2), + bom::FeedResult::Utf16LeBom => FeedResult::EncodingUtf16LeLike(2), + bom::FeedResult::NeedData => return Ok(FeedResult::NeedData), + }; + self.0 = State::Text; + return Ok(encoding); + } + State::Text => match byte { + b'<' => State::Markup, + _ => return Ok(self.parse_text(trail, offset)), + }, + State::Markup => match byte { + b'!' => State::MaybeCommentOrCDataOrDoctype, + b'?' 
=> return Ok(self.parse_pi(rest, start, PiParser::default())), + b'/' => return Ok(self.parse_end(rest, start)), + _ => { + return Ok(self.parse_start_or_empty( + trail, + offset, + QuotedParser::Outside, + false, + )) + } + }, + State::MaybeCommentOrCDataOrDoctype => match byte { + b'-' => State::MaybeComment, + b'[' => State::MaybeCData1, + b'D' | b'd' => State::MaybeDoctype1, + _ => return Err(SyntaxError::InvalidBangMarkup), + }, + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeComment => match byte { + b'-' => return Ok(self.parse_comment(rest, start, CommentParser::default())), + _ => return Err(SyntaxError::UnclosedComment), + }, + State::Comment(parser) => { + return Ok(self.parse_comment(trail, offset, parser)); + } + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeCData1 => match byte { + b'C' => State::MaybeCData2, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData2 => match byte { + b'D' => State::MaybeCData3, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData3 => match byte { + b'A' => State::MaybeCData4, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData4 => match byte { + b'T' => State::MaybeCData5, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData5 => match byte { + b'A' => State::MaybeCData6, + _ => return Err(SyntaxError::UnclosedCData), + }, + State::MaybeCData6 => match byte { + b'[' => return Ok(self.parse_cdata(rest, start, CDataParser::default())), + _ => return Err(SyntaxError::UnclosedCData), + }, + State::CData(parser) => return Ok(self.parse_cdata(trail, offset, parser)), + + //---------------------------------------------------------------------------------- + // + //---------------------------------------------------------------------------------- + State::MaybeDoctype1 => match byte { + b'O' | b'o' => State::MaybeDoctype2, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype2 => match byte { + b'C' | b'c' => State::MaybeDoctype3, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype3 => match byte { + b'T' | b't' => State::MaybeDoctype4, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype4 => match byte { + b'Y' | b'y' => State::MaybeDoctype5, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype5 => match byte { + b'P' | b'p' => State::MaybeDoctype6, + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::MaybeDoctype6 => match byte { + b'E' | b'e' => return self.parse_doctype(rest, start, QuotedParser::Outside), + _ => return Err(SyntaxError::UnclosedDoctype), + }, + State::Doctype(parser) => return self.parse_doctype(trail, offset, parser), + State::Dtd(parser) => return self.parse_dtd(trail, offset, parser), + State::DoctypeFinish => return Ok(self.parse_doctype_finish(trail, offset)), + + State::PI(parser) => return Ok(self.parse_pi(trail, offset, parser)), + State::EndTag => return Ok(self.parse_end(trail, offset)), + State::StartOrEmptyTag(parser, has_slash) => { + return Ok(self.parse_start_or_empty(trail, offset, parser, has_slash)); + } + } + } + Ok(FeedResult::NeedData) + } + + /// This method should be called when all data was feed into parser. 
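+    ///
+    /// A minimal sketch of the intended use (assuming, as the comment test in
+    /// this module shows, that an unterminated comment only asks for more data):
+    ///
+    /// ```
+    /// # use quick_xml::parser::{FeedResult, Parser};
+    /// let mut parser = Parser::without_encoding_detection();
+    /// // The comment is never closed, so the parser still waits for input...
+    /// assert_eq!(parser.feed(b"<!-- never closed"), Ok(FeedResult::NeedData));
+    /// // ...and finishing in that intermediate state reports a syntax error
+    /// assert!(parser.finish().is_err());
+    /// ```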
+ /// + /// If parser in intermediate state it will return a corresponding syntax + /// error, otherwise it returns successfully. + // rustfmt tend to move pipes to the begin of a line which ruins the nice look + #[rustfmt::skip] + pub fn finish(self) -> Result<(), SyntaxError> { + match self.0 { + // If nothing was fed into parser, document is empty. + // We allow empty documents, at least for now + State::Start | + State::Text => Ok(()), + + // We need data when we tried to determine document encoding + // < + State::Bom(BomParser::X00_3C) | + State::Bom(BomParser::X00_3C_00) | + State::Bom(BomParser::X3C) | + State::Bom(BomParser::X3C_00) => Err(SyntaxError::UnclosedTag), + // Err(SyntaxError::UnclosedPIOrXmlDecl), + // Threat unrecognized BOMs as text + State::Bom(_) => Ok(()), + + State::Markup | + State::StartOrEmptyTag(..) | + State::EndTag => Err(SyntaxError::UnclosedTag), + + State::MaybeCommentOrCDataOrDoctype => Err(SyntaxError::InvalidBangMarkup), + + State::MaybeComment | + State::Comment(_) => Err(SyntaxError::UnclosedComment), + + State::MaybeCData1 | + State::MaybeCData2 | + State::MaybeCData3 | + State::MaybeCData4 | + State::MaybeCData5 | + State::MaybeCData6 | + State::CData(_) => Err(SyntaxError::UnclosedCData), + + State::MaybeDoctype1 | + State::MaybeDoctype2 | + State::MaybeDoctype3 | + State::MaybeDoctype4 | + State::MaybeDoctype5 | + State::MaybeDoctype6 | + State::Doctype(_) | + State::Dtd(_) | + State::DoctypeFinish => Err(SyntaxError::UnclosedDoctype), + + State::PI(_) => Err(SyntaxError::UnclosedPIOrXmlDecl), + } + } + + /// Check if parser currently parses text + #[inline] + pub fn is_text_parsing(&self) -> bool { + self.0 == State::Text + } + + /// Text cannot contain `<` inside, so we emit it as soon as we find `<`. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a text content + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + #[inline] + fn parse_text(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes))); + self.0 = State::Text; + match bytes.iter().position(|&b| b == b'<') { + Some(i) => FeedResult::EmitText(offset + i), + None => FeedResult::NeedData, + } + } + + /// Determines the end position of a comment in the provided slice. + /// Comment ends on the first occurrence of `-->` which cannot be escaped. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a comment content + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `dashes_left`: count of dashes that wasn't seen yet in the end of previous data chunk + fn parse_comment( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: CommentParser, + ) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Text; + FeedResult::EmitComment(offset + i) + } + None => { + self.0 = State::Comment(parser); + FeedResult::NeedData + } + } + } + + /// Determines the end position of a CDATA block in the provided slice. + /// CDATA block ends on the first occurrence of `]]>` which cannot be escaped. + /// + /// `` can contain `>` inside. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. 
+ /// That sub-slice begins on the byte that represents a CDATA content + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `braces_left`: count of braces that wasn't seen yet in the end of previous data chunk + fn parse_cdata(&mut self, bytes: &[u8], offset: usize, mut parser: CDataParser) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); + match parser.feed(bytes) { + Some(i) => { + self.0 = State::Text; + FeedResult::EmitCData(offset + i) + } + None => { + self.0 = State::CData(parser); + FeedResult::NeedData + } + } + } + + fn parse_doctype( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + ) -> Result { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); + // Search `[` (start of DTD definitions) or `>` (end of tag) + match dbg!(parser.one_of(bytes)) { + OneOf::Open(i) => self.parse_dtd(&bytes[i..], offset + i, DtdParser::default()), + OneOf::Close(i) => { + self.0 = State::Text; + // +1 for `>` which should be included in event + Ok(FeedResult::EmitDoctype(offset + i + 1)) + } + OneOf::None => { + self.0 = State::Doctype(parser); + Ok(FeedResult::NeedData) + } + } + } + + /// Skips DTD representation, correctly following DTD grammar. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on a byte that would represent first byte of DTD event + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `parser`: the DTD parser persisted between `feed()` calls + fn parse_dtd( + &mut self, + mut bytes: &[u8], + mut offset: usize, + mut parser: DtdParser, + ) -> Result { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); + loop { + let result = match dbg!(parser.feed(bytes)) { + // Skip recognized DTD structure + // TODO: Emit DTD events while parsing + quick_dtd::FeedResult::EmitPI(off) + | quick_dtd::FeedResult::EmitAttList(off) + | quick_dtd::FeedResult::EmitComment(off) + | quick_dtd::FeedResult::EmitElement(off) + | quick_dtd::FeedResult::EmitEntity(off) + | quick_dtd::FeedResult::EmitNotation(off) => { + bytes = &bytes[off..]; + offset += off; + continue; + } + + // `]` finishes DOCTYPE subsets: + // After that we should find the close `>` + quick_dtd::FeedResult::Unexpected(off, b']') => { + return Ok(self.parse_doctype_finish(&bytes[off..], offset + off)) + } + // Other bytes not expected, so return error + quick_dtd::FeedResult::Unexpected(..) => Err(SyntaxError::UnclosedDoctype), + quick_dtd::FeedResult::NeedData => Ok(FeedResult::NeedData), + }; + self.0 = State::Dtd(parser); + return result; + } + } + + fn parse_doctype_finish(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes))); + match dbg!(bytes.iter().position(|&b| b == b'>')) { + Some(i) => { + self.0 = State::Text; + // +1 for `>` which should be included in event + FeedResult::EmitDoctype(offset + i + 1) + } + None => { + self.0 = State::DoctypeFinish; + FeedResult::NeedData + } + } + } + + /// Determines the end position of a processing instruction in the provided slice. + /// Processing instruction ends on the first occurrence of `?>` which cannot be + /// escaped. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. 
+ /// That sub-slice begins on the byte that represents a PI target + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `has_mark`: a flag that indicates was the previous fed data ended with `?` + fn parse_pi(&mut self, bytes: &[u8], offset: usize, mut parser: PiParser) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser)); + match dbg!(parser.feed(bytes)) { + Some(i) => { + self.0 = State::Text; + FeedResult::EmitPI(offset + i) + } + None => { + self.0 = State::PI(parser); + FeedResult::NeedData + } + } + } + + /// Determines the end position of an end tag in the provided slice. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a tag name + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + fn parse_end(&mut self, bytes: &[u8], offset: usize) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes))); + match dbg!(bytes.iter().position(|&b| b == b'>')) { + Some(i) => { + self.0 = State::Text; + // +1 for `>` which should be included in event + FeedResult::EmitEndTag(offset + i + 1) + } + None => { + self.0 = State::EndTag; + FeedResult::NeedData + } + } + } + + /// Determines the end position of a start or empty tag in the provided slice. + /// + /// # Parameters + /// - `bytes`: sub-slice to the original slice that was passed to `feed()`. + /// That sub-slice begins on the byte that represents a second byte of + /// a tag name + /// - `offset`: a position of `bytes` sub-slice in the one that was passed to `feed()` + /// - `parser`: the state of a quotes used to skip `>` inside attribute values + /// - `has_slash`: a flag that indicates was the previous fed data ended with `/` + fn parse_start_or_empty( + &mut self, + bytes: &[u8], + offset: usize, + mut parser: QuotedParser, + has_slash: bool, + ) -> FeedResult { + dbg!((self.0, offset, crate::utils::Bytes(bytes), parser, has_slash)); + match dbg!(parser.feed(bytes)) { + Some(0) if has_slash => { + self.0 = State::Text; + // +1 for `>` which should be included in event + FeedResult::EmitEmptyTag(offset + 1) + } + Some(i) => { + self.0 = State::Text; + // This slash cannot follow immediately after `<`, because otherwise + // we would be in a `parse_end` and not here + if i > 0 && bytes[i - 1] == b'/' { + // +1 for `>` which should be included in event + FeedResult::EmitEmptyTag(offset + i + 1) + } else { + // +1 for `>` which should be included in event + FeedResult::EmitStartTag(offset + i + 1) + } + } + None => { + self.0 = State::StartOrEmptyTag(parser, bytes.last().copied() == Some(b'/')); + FeedResult::NeedData + } + } + } +} + +#[cfg(test)] +mod tests { + use super::FeedResult::*; + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn text() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b"text with > symbol"), Ok(NeedData)); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b"text with < symbol"), Ok(EmitText(10))); + // ^^^^^^^^^^ + assert_eq!(parser.0, State::Text); + } + + #[test] + fn cdata() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitCData(1))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitCData(2))); + assert_eq!(parser.0, State::Text); + + let 
mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitCData(1))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b" ]]>"), + // 0 ^ = 40 + Ok(EmitCData(41)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn comment() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(NeedData)); + assert!(matches!(parser.0, State::Comment(_))); + assert_eq!(parser.feed(b"-->"), Ok(EmitComment(3))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b""), + // 0 ^ = 31 + Ok(EmitComment(32)) + ); + assert_eq!(parser.0, State::Text); + } + + mod doctype { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn only_name() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitDoctype(15))); + // 0 ^ = 14 + assert_eq!(parser.0, State::Text); + } + + #[test] + fn with_external_id() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"']\">"), + // 0 ^ = 28 + Ok(EmitDoctype(29)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"\"]'>"), + // 0 ^ = 28 + Ok(EmitDoctype(29)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"\"]'>"), + // 0 ^ = 32 + Ok(EmitDoctype(33)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"']\">"), + // 0 ^ = 31 + Ok(EmitDoctype(32)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn with_subset() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"'>]>"), + // 0 ^ = 33 + Ok(EmitDoctype(34)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"'\" []>"), + // 0 ^ = 29 + Ok(EmitDoctype(30)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"\"' []>"), + // 0 ^ = 29 + Ok(EmitDoctype(30)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"\"' []>"), + // 0 ^ = 33 + Ok(EmitDoctype(34)) + ); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"'\" []>"), + // 0 ^ = 32 + Ok(EmitDoctype(33)) + ); + assert_eq!(parser.0, State::Text); + } + } + + #[test] + fn pi() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitPI(4))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitPI(10))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b"?>"), Ok(EmitPI(5))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitPI(5))); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn empty() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitEmptyTag(8))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + 
parser.feed(b"\" two='\"/>'/>"), + Ok(EmitEmptyTag(28)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn start() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b"<>"), Ok(EmitStartTag(2))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitStartTag(7))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!( + parser.feed(b"\" two='\">'>"), + Ok(EmitStartTag(25)) + ); + assert_eq!(parser.0, State::Text); + } + + #[test] + fn end() { + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitEndTag(6))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitEndTag(7))); + assert_eq!(parser.0, State::Text); + + let mut parser = Parser::without_encoding_detection(); + assert_eq!(parser.feed(b""), Ok(EmitEndTag(3))); + assert_eq!(parser.0, State::Text); + } +} diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 1cdab220..7d0bb6ee 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -4,23 +4,11 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt}; -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, Result}; +use crate::events::{BytesText, Event}; use crate::name::{QName, ResolveResult}; -use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{ - is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span, -}; - -/// A struct for read XML asynchronously from an [`AsyncBufRead`]. -/// -/// Having own struct allows us to implement anything without risk of name conflicts -/// and does not suffer from the impossibility of having `async` in traits. -struct TokioAdapter<'a, R>(&'a mut R); - -impl<'a, R: AsyncBufRead + Unpin> TokioAdapter<'a, R> { - impl_buffered_source!('b, 0, async, await); -} +use crate::reader::state::ParseOutcome; +use crate::reader::{NsReader, Reader, Span}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -70,17 +58,8 @@ impl Reader { /// ``` /// /// [`read_event_into()`]: Reader::read_event_into - pub async fn read_event_into_async<'b>( - &mut self, - mut buf: &'b mut Vec, - ) -> Result> { - read_event_impl!( - self, buf, - TokioAdapter(&mut self.reader), - read_until_open_async, - read_until_close_async, - await - ) + pub async fn read_event_into_async<'b>(&mut self, buf: &'b mut Vec) -> Result> { + read_event_impl!(self, buf, await) } /// An asynchronous version of [`read_to_end_into()`]. @@ -134,29 +113,12 @@ impl Reader { /// [`Start`]: Event::Start pub async fn read_to_end_into_async<'n>( &mut self, - // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033` + // We should name that lifetime due to https://github.com/rust-lang/rust/issues/63033 end: QName<'n>, buf: &mut Vec, ) -> Result { Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)) } - - /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. - /// - /// Returns inner `Ok` if the loop should be broken and an event returned. - /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular. 
- async fn read_until_open_async<'b>( - &mut self, - buf: &'b mut Vec, - ) -> Result, &'b mut Vec>> { - read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await) - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - async fn read_until_close_async<'b>(&mut self, buf: &'b mut Vec) -> Result> { - read_until_close!(self, buf, TokioAdapter(&mut self.reader), await) - } } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -369,14 +331,11 @@ impl NsReader { #[cfg(test)] mod test { - use super::TokioAdapter; use crate::reader::test::{check, small_buffers}; check!( #[tokio::test] read_event_into_async, - read_until_close_async, - TokioAdapter, &mut Vec::new(), async, await ); diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 84f65875..8d48dffd 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -5,242 +5,11 @@ use std::fs::File; use std::io::{self, BufRead, BufReader}; use std::path::Path; -use memchr; - -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, Result}; +use crate::events::{BytesText, Event}; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; - -macro_rules! impl_buffered_source { - ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { - #[cfg(not(feature = "encoding"))] - $($async)? fn remove_utf8_bom(&mut self) -> Result<()> { - use crate::encoding::UTF8_BOM; - - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => { - if n.starts_with(UTF8_BOM) { - self $(.$reader)? .consume(UTF8_BOM.len()); - } - Ok(()) - }, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - - #[cfg(feature = "encoding")] - $($async)? fn detect_encoding(&mut self) -> Result> { - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) { - self $(.$reader)? .consume(bom_len); - Ok(Some(enc)) - } else { - Ok(None) - }, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - - #[inline] - $($async)? fn read_bytes_until $(<$lf>)? ( - &mut self, - byte: u8, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<(&'b [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - let mut read = 0; - let mut done = false; - let start = buf.len(); - while !done { - let used = { - let available = match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - done = true; - i + 1 - } - None => { - buf.extend_from_slice(available); - available.len() - } - } - }; - self $(.$reader)? .consume(used); - read += used; - } - *position += read; - - Ok((&buf[start..], done)) - } - - $($async)? fn read_bang_element $(<$lf>)? ( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<(BangType, &'b [u8])> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. 
- let start = buf.len(); - let mut read = 1; - buf.push(b'!'); - self $(.$reader)? .consume(1); - - let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?; - - loop { - match self $(.$reader)? .fill_buf() $(.$await)? { - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Ok(n) if n.is_empty() => break, - Ok(available) => { - // We only parse from start because we don't want to consider - // whatever is in the buffer before the bang element - if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) { - buf.extend_from_slice(consumed); - - self $(.$reader)? .consume(used); - read += used; - - *position += read; - return Ok((bang_type, &buf[start..])); - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - } - } - - *position += read; - Err(bang_type.to_err()) - } - - #[inline] - $($async)? fn read_element $(<$lf>)? ( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<&'b [u8]> { - let mut state = ReadElementState::Elem; - let mut read = 0; - - let start = buf.len(); - loop { - match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); - - self $(.$reader)? .consume(used); - read += used; - - // Position now just after the `>` symbol - *position += read; - return Ok(&buf[start..]); - } else { - // The `>` symbol not yet found, continue reading - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - } - - *position += read; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - - $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) => { - let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); - if count > 0 { - self $(.$reader)? .consume(count); - *position += count; - continue; - } else { - Ok(()) - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - - $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - match self.peek_one() $(.$await)? ? { - Some(b) if b == byte => { - *position += 1; - self $(.$reader)? .consume(1); - Ok(true) - } - _ => Ok(false), - } - } - - $($async)? fn peek_one(&mut self) -> Result> { - loop { - break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e.into())), - }; - } - } - }; -} - -// Make it public for use in async implementations -pub(super) use impl_buffered_source; - -/// Implementation of `XmlSource` for any `BufRead` reader using a user-given -/// `Vec` as buffer that will be borrowed by events. 
-impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { - impl_buffered_source!(); -} +use crate::reader::state::ParseOutcome; +use crate::reader::{Reader, Span}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -292,7 +61,7 @@ impl Reader { /// ``` #[inline] pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) + read_event_impl!(self, buf) } /// Reads until end element is found using provided buffer as intermediate @@ -384,7 +153,7 @@ impl Reader { /// [`check_end_names`]: crate::reader::Config::check_end_names /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result { - Ok(read_to_end!(self, end, buf, read_event_impl, { + Ok(read_to_end!(self, end, buf, read_event_into, { buf.clear(); })) } @@ -402,18 +171,10 @@ impl Reader> { #[cfg(test)] mod test { use crate::reader::test::{check, small_buffers}; - use crate::reader::XmlSource; - - /// Default buffer constructor just pass the byte array from the test - fn identity(input: T) -> T { - input - } check!( #[test] - read_event_impl, - read_until_close, - identity, + read_event_into, &mut Vec::new() ); diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 6ccbdf54..9c5b3dfc 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -5,12 +5,8 @@ use encoding_rs::Encoding; use std::ops::Range; use crate::encoding::Decoder; -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; use crate::reader::state::ReaderState; -use memchr; - /// A struct that holds a parser configuration. /// /// Current parser configuration can be retrieved by calling [`Reader::config()`] @@ -205,192 +201,53 @@ impl Default for Config { macro_rules! read_event_impl { ( - $self:ident, $buf:ident, - $reader:expr, - $read_until_open:ident, - $read_until_close:ident + $self:ident, $buf:ident $(, $await:ident)? ) => {{ - let event = loop { - match $self.state.state { - ParseState::Init => { // Go to OpenedTag state - // If encoding set explicitly, we not need to detect it. For example, - // explicit UTF-8 set automatically if Reader was created using `from_str`. - // But we still need to remove BOM for consistency with no encoding - // feature enabled path - #[cfg(feature = "encoding")] - if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? { - if $self.state.encoding.can_be_refined() { - $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding); - } - } - - // Removes UTF-8 BOM if it is present - #[cfg(not(feature = "encoding"))] - $reader.remove_utf8_bom() $(.$await)? ?; - - // Go to OpenedTag state - match $self.$read_until_open($buf) $(.$await)? { - Ok(Ok(ev)) => break Ok(ev), - Ok(Err(b)) => $buf = b, - Err(err) => break Err(err), - } - }, - ParseState::ClosedTag => { // Go to OpenedTag state - match $self.$read_until_open($buf) $(.$await)? { - Ok(Ok(ev)) => break Ok(ev), - Ok(Err(b)) => $buf = b, - Err(err) => break Err(err), + dbg!("==============================================================="); + if let Some(end) = $self.state.pending_end() { + return Ok(end); + } + // Content in buffer before call is not a part of next event + let start = $buf.len(); + let offset = $self.state.offset; + loop { + dbg!("--------------------------------"); + break match dbg!($self.reader.fill_buf() $(.$await)?) 
{ + Ok(bytes) if bytes.is_empty() => { + let content = &$buf[start..]; + if content.is_empty() { + Ok(Event::Eof) + } else + if let Err(error) = dbg!($self.state.parser.finish()) { + $self.state.last_error_offset = offset; + Err(Error::Syntax(error)) + } else { + // Content already trimmed, because we do not put whitespaces + // to the buffer at all if they should be trimmed + Ok(Event::Text(BytesText::wrap(content, $self.decoder()))) + } + } + Ok(bytes) => match dbg!($self.state.parse_into(bytes, $buf))? { + ParseOutcome::Consume(offset, result) => { + $self.reader.consume(offset); + $self.state.make_event(result, &$buf[start..]) + } + ParseOutcome::ConsumeAndEmitText(offset) => { + $self.reader.consume(offset); + Ok(Event::Text(BytesText::wrap(&$buf[start..], $self.decoder()))) + } + ParseOutcome::ConsumeAndContinue(offset) => { + $self.reader.consume(offset); + continue; } }, - // Go to ClosedTag state in next two arms - ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?, - ParseState::Empty => break $self.state.close_expanded_empty(), - ParseState::Exit => break Ok(Event::Eof), - }; - }; - match event { - // #513: In case of ill-formed errors we already consume the wrong data - // and change the state. We can continue parsing if we wish - Err(Error::IllFormed(_)) => {} - Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Exit, - _ => {} - } - event - }}; -} - -/// Read bytes up to `<` and skip it. If current byte (after skipping all space -/// characters if [`Config::trim_text_start`] is `true`) is already `<`, then -/// returns the next event, otherwise stay at position just after the `<` symbol. -/// -/// Moves parser to the `OpenedTag` state. -/// -/// This code is executed in two cases: -/// - after start of parsing just after skipping BOM if it is present -/// - after parsing `` or `` -macro_rules! read_until_open { - ( - $self:ident, $buf:ident, - $reader:expr, - $read_event:ident - $(, $await:ident)? - ) => {{ - if $self.state.config.trim_text_start { - $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if $reader.skip_one(b'<', &mut $self.state.offset) $(.$await)? ? { - $self.state.state = ParseState::OpenedTag; - // Pass $buf to the next next iteration of parsing loop - return Ok(Err($buf)); - } - - match $reader - .read_bytes_until(b'<', $buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bytes, found)) => { - if found { - $self.state.state = ParseState::OpenedTag; - } - // Return Text event with `bytes` content or Eof if bytes is empty - $self.state.emit_text(bytes).map(Ok) - } - Err(e) => Err(e), - } - }}; -} - -/// Read bytes up to the `>` and skip it. This method is expected to be called -/// after seeing the `<` symbol and skipping it. Inspects the next (current) -/// symbol and returns an appropriate [`Event`]: -/// -/// |Symbol |Event -/// |-------|------------------------------------- -/// |`!` |[`Comment`], [`CData`] or [`DocType`] -/// |`/` |[`End`] -/// |`?` |[`PI`] -/// |_other_|[`Start`] or [`Empty`] -/// -/// Moves parser to the `ClosedTag` state. -/// -/// [`Comment`]: Event::Comment -/// [`CData`]: Event::CData -/// [`DocType`]: Event::DocType -/// [`End`]: Event::End -/// [`PI`]: Event::PI -/// [`Start`]: Event::Start -/// [`Empty`]: Event::Empty -macro_rules! read_until_close { - ( - $self:ident, $buf:ident, - $reader:expr - $(, $await:ident)? 
- ) => {{ - $self.state.state = ParseState::ClosedTag; - - let start = $self.state.offset; - match $reader.peek_one() $(.$await)? { - // ` match $reader - .read_bang_element($buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => { - // match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bytes, true)) => $self.state.emit_end(bytes), - Ok((_, false)) => { - // We want to report error at `<`, but offset was increased, - // so return it back (-1 for `<`) - $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedTag)) + $self.state.last_error_offset = $self.state.offset; + Err(Error::Io(e.into())) } - Err(e) => Err(e), - }, - // ` match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) - $(.$await)? - { - Ok((bytes, true)) => $self.state.emit_question_mark(bytes), - Ok((_, false)) => { - // We want to report error at `<`, but offset was increased, - // so return it back (-1 for `<`) - $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) - } - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match $reader - .read_element($buf, &mut $self.state.offset) - $(.$await)? - { - Ok(bytes) => $self.state.emit_start(bytes), - Err(e) => Err(e), - }, - // `<` - syntax error, tag not closed - Ok(None) => { - // We want to report error at `<`, but offset was increased, - // so return it back (-1 for `<`) - $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - Err(e) => Err(e), + }; } }}; } @@ -440,51 +297,6 @@ pub type Span = Range; //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Possible reader states. The state transition diagram (`true` and `false` shows -/// value of [`Config::expand_empty_elements`] option): -/// -/// ```mermaid -/// flowchart LR -/// subgraph _ -/// direction LR -/// -/// Init -- "(no event)"\n --> OpenedTag -/// OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag -/// ClosedTag -- "#lt;false#gt;\n(no event)"\nText --> OpenedTag -/// end -/// ClosedTag -- "#lt;true#gt;"\nStart --> Empty -/// Empty -- End --> ClosedTag -/// _ -. Eof .-> Exit -/// ``` -#[derive(Clone, Debug)] -enum ParseState { - /// Initial state in which reader stay after creation. Transition from that - /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next - /// state is always `OpenedTag`. The reader will never return to this state. The - /// event emitted during transition to `OpenedTag` is a `StartEvent` if the - /// first symbol not `<`, otherwise no event are emitted. - Init, - /// State after seeing the `<` symbol. Depending on the next symbol all other - /// events could be generated. - /// - /// After generating one event the reader moves to the `ClosedTag` state. - OpenedTag, - /// State in which reader searches the `<` symbol of a markup. All bytes before - /// that symbol will be returned in the [`Event::Text`] event. After that - /// the reader moves to the `OpenedTag` state. - ClosedTag, - /// This state is used only if option [`expand_empty_elements`] is set to `true`. - /// Reader enters to this state when it is in a `ClosedTag` state and emits an - /// [`Event::Start`] event. 
The next event emitted will be an [`Event::End`], - /// after which reader returned to the `ClosedTag` state. - /// - /// [`expand_empty_elements`]: Config::expand_empty_elements - Empty, - /// Reader enters this state when `Eof` event generated or an error occurred. - /// This is the last state, the reader stay in it forever. - Exit, -} - /// A reference to an encoding together with information about how it was retrieved. /// /// The state transition diagram: @@ -587,6 +399,7 @@ impl EncodingRef { /// } /// ``` /// +/// [`Event`]: crate::events::Event /// [`NsReader`]: crate::reader::NsReader #[derive(Clone)] pub struct Reader { @@ -688,13 +501,7 @@ impl Reader { /// Gets the current byte position in the input data. pub fn buffer_position(&self) -> usize { - // when internal state is OpenedTag, we have actually read until '<', - // which we don't want to show - if let ParseState::OpenedTag = self.state.state { - self.state.offset - 1 - } else { - self.state.offset - } + self.state.offset } /// Gets the last error byte position in the input data. If there is no errors @@ -727,283 +534,8 @@ impl Reader { } } -/// Private sync reading methods -impl Reader { - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. - fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - read_event_impl!(self, buf, self.reader, read_until_open, read_until_close) - } - - /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. - /// - /// Returns inner `Ok` if the loop should be broken and an event returned. - /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular. - fn read_until_open<'i, B>(&mut self, buf: B) -> Result, B>> - where - R: XmlSource<'i, B>, - { - read_until_open!(self, buf, self.reader, read_event_impl) - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - read_until_close!(self, buf, self.reader) - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Represents an input for a reader that can return borrowed data. -/// -/// There are two implementors of this trait: generic one that read data from -/// `Self`, copies some part of it into a provided buffer of type `B` and then -/// returns data that borrow from that buffer. -/// -/// The other implementor is for `&[u8]` and instead of copying data returns -/// borrowed data from `Self` instead. This implementation allows zero-copy -/// deserialization. -/// -/// # Parameters -/// - `'r`: lifetime of a buffer from which events will borrow -/// - `B`: a type of a buffer that can be used to store data read from `Self` and -/// from which events can borrow -trait XmlSource<'r, B> { - /// Removes UTF-8 BOM if it is present - #[cfg(not(feature = "encoding"))] - fn remove_utf8_bom(&mut self) -> Result<()>; - - /// Determines encoding from the start of input and removes BOM if it is present - #[cfg(feature = "encoding")] - fn detect_encoding(&mut self) -> Result>; - - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte` (exclusive), - /// and a flag noting whether `byte` was found in the input or not. 
- /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// (b"abc".as_ref(), true) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut usize, - ) -> Result<(&'r [u8], bool)>; - - /// Read input until comment, CDATA or processing instruction is finished. - /// - /// This method expect that `<` already was read. - /// - /// Returns a slice of data read up to end of comment, CDATA or processing - /// instruction (`>`), which does not include into result. - /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>; - - /// Read input until XML element is closed by approaching a `>` symbol. - /// Returns a buffer that contains a data between `<` and `>` or - /// [`SyntaxError::UnclosedTag`] if end-of-input was reached before reading `>`. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>; - - /// Consume and discard all the whitespace until the next non-whitespace - /// character or EOF. - /// - /// # Parameters - /// - `position`: Will be increased by amount of bytes consumed - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; - - /// Consume and discard one character if it matches the given byte. Return - /// `true` if it matched. - /// - /// # Parameters - /// - `position`: Will be increased by 1 if byte is matched - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; - - /// Return one character without consuming it, so that future `read_*` calls - /// will still include it. On EOF, return `None`. 
- fn peek_one(&mut self) -> Result>; -} - -/// Possible elements started with ` - CData, - /// - Comment, - /// - DocType, -} -impl BangType { - #[inline(always)] - fn new(byte: Option) -> Result { - Ok(match byte { - Some(b'[') => Self::CData, - Some(b'-') => Self::Comment, - Some(b'D') | Some(b'd') => Self::DocType, - _ => return Err(Error::Syntax(SyntaxError::InvalidBangMarkup)), - }) - } - - /// If element is finished, returns its content up to `>` symbol and - /// an index of this symbol, otherwise returns `None` - /// - /// # Parameters - /// - `buf`: buffer with data consumed on previous iterations - /// - `chunk`: data read on current iteration and not yet consumed from reader - #[inline(always)] - fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr_iter(b'>', chunk) { - match self { - // Need to read at least 6 symbols (`!---->`) for properly finished comment - // - XML comment - // 012345 - i - Self::Comment if buf.len() + i > 4 => { - if chunk[..i].ends_with(b"--") { - // We cannot strip last `--` from the buffer because we need it in case of - // check_comments enabled option. XML standard requires that comment - // will not end with `--->` sequence because this is a special case of - // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `-|->` was splitted at | - // buf --/ \-- chunk - if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `--|>` was splitted at | - // buf --/ \-- chunk - if i == 0 && buf.ends_with(b"--") { - return Some((&[], i + 1)); // +1 for `>` - } - } - Self::Comment => {} - Self::CData => { - if chunk[..i].ends_with(b"]]") { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `]|]>` was splitted at | - // buf --/ \-- chunk - if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `]]|>` was splitted at | - // buf --/ \-- chunk - if i == 0 && buf.ends_with(b"]]") { - return Some((&[], i + 1)); // +1 for `>` - } - } - Self::DocType => { - let content = &chunk[..i]; - let balance = memchr::memchr2_iter(b'<', b'>', content) - .map(|p| if content[p] == b'<' { 1i32 } else { -1 }) - .sum::(); - if balance == 0 { - return Some((content, i + 1)); // +1 for `>` - } - } - } - } - None - } - #[inline] - fn to_err(&self) -> Error { - match self { - Self::CData => Error::Syntax(SyntaxError::UnclosedCData), - Self::Comment => Error::Syntax(SyntaxError::UnclosedComment), - Self::DocType => Error::Syntax(SyntaxError::UnclosedDoctype), - } - } -} - -/// State machine for the [`XmlSource::read_element`] -#[derive(Clone, Copy)] -enum ReadElementState { - /// The initial state (inside element, but outside of attribute value) - Elem, - /// Inside a single-quoted attribute value - SingleQ, - /// Inside a double-quoted attribute value - DoubleQ, -} -impl ReadElementState { - /// Changes state by analyzing part of input. 
- /// Returns a tuple with part of chunk up to element closing symbol `>` - /// and a position after that symbol or `None` if such symbol was not found - #[inline(always)] - fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) { - *self = match (*self, chunk[i]) { - // only allowed to match `>` while we are in state `Elem` - (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)), - (Self::Elem, b'\'') => Self::SingleQ, - (Self::Elem, b'\"') => Self::DoubleQ, - - // the only end_byte that gets us out if the same character - (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem, - - // all other bytes: no state change - _ => *self, - }; - } - None - } -} - /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) const fn is_whitespace(b: u8) -> bool { @@ -1019,685 +551,10 @@ mod test { ( #[$test:meta] $read_event:ident, - $read_until_close:ident, - // constructor of the XML source on which internal functions will be called - $source:path, // constructor of the buffer to which read data will stored $buf:expr $(, $async:ident, $await:ident)? ) => { - mod read_bytes_until { - use super::*; - // Use Bytes for printing bytes as strings for ASCII range - use crate::utils::Bytes; - use pretty_assertions::assert_eq; - - /// Checks that search in the empty buffer returns `None` - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 0; - let mut input = b"".as_ref(); - // ^= 0 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), false) - ); - assert_eq!(position, 0); - } - - /// Checks that search in the buffer non-existent value returns entire buffer - /// as a result and set `position` to `len()` - #[$test] - $($async)? fn non_existent() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef".as_ref(); - // ^= 6 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), false) - ); - assert_eq!(position, 6); - } - - /// Checks that search in the buffer an element that is located in the front of - /// buffer returns empty slice as a result and set `position` to one symbol - /// after match (`1`) - #[$test] - $($async)? fn at_the_start() { - let buf = $buf; - let mut position = 0; - let mut input = b"*abcdef".as_ref(); - // ^= 1 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), true) - ); - assert_eq!(position, 1); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the middle of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match - #[$test] - $($async)? fn inside() { - let buf = $buf; - let mut position = 0; - let mut input = b"abc*def".as_ref(); - // ^= 4 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? 
- .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abc"), true) - ); - assert_eq!(position, 4); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the end of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match (`len()`) - #[$test] - $($async)? fn in_the_end() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef*".as_ref(); - // ^= 7 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), true) - ); - assert_eq!(position, 7); // position after the symbol matched - } - } - - mod read_bang_element { - use super::*; - use crate::errors::{Error, SyntaxError}; - use crate::reader::BangType; - use crate::utils::Bytes; - - /// Checks that reading CDATA content works correctly - mod cdata { - use super::*; - use pretty_assertions::assert_eq; - - /// Checks that if input begins like CDATA element, but CDATA start sequence - /// is not finished, parsing ends with an error - #[$test] - #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"![]]>other content".as_ref(); - // ^= 1 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedCData)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 1); - } - - /// Checks that if CDATA startup sequence was matched, but an end sequence - /// is not found, parsing ends with an error - #[$test] - $($async)? fn not_closed() { - let buf = $buf; - let mut position = 1; - let mut input = b"![CDATA[other content".as_ref(); - // ^= 1 ^= 22 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedCData)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 22); - } - - /// Checks that CDATA element without content inside parsed successfully - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"![CDATA[]]>other content".as_ref(); - // ^= 1 ^= 12 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::CData, Bytes(b"![CDATA[]]")) - ); - assert_eq!(position, 12); - } - - /// Checks that CDATA element with content parsed successfully. - /// Additionally checks that sequences inside CDATA that may look like - /// a CDATA end sequence do not interrupt CDATA parsing - #[$test] - $($async)? fn with_content() { - let buf = $buf; - let mut position = 1; - let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); - // ^= 1 ^= 29 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")) - ); - assert_eq!(position, 29); - } - } - - /// Checks that reading XML comments works correctly. 
According to the [specification], - /// comment data can contain any sequence except `--`: - /// - /// ```peg - /// comment = '<--' (!'--' char)* '-->'; - /// char = [#x1-#x2C] - /// / [#x2E-#xD7FF] - /// / [#xE000-#xFFFD] - /// / [#x10000-#x10FFFF] - /// ``` - /// - /// The presence of this limitation, however, is simply a poorly designed specification - /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for - /// presence of these sequences by default. This tests allow such content. - /// - /// [specification]: https://www.w3.org/TR/xml11/#dt-comment - mod comment { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"!- -->other content".as_ref(); - // ^= 1 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 1); - } - - #[$test] - $($async)? fn not_properly_end() { - let buf = $buf; - let mut position = 1; - let mut input = b"!->other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn not_closed1() { - let buf = $buf; - let mut position = 1; - let mut input = b"!--other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn not_closed2() { - let buf = $buf; - let mut position = 1; - let mut input = b"!-->other content".as_ref(); - // ^= 1 ^= 18 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 18); - } - - #[$test] - $($async)? fn not_closed3() { - let buf = $buf; - let mut position = 1; - let mut input = b"!--->other content".as_ref(); - // ^= 1 ^= 19 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 19); - } - - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"!---->other content".as_ref(); - // ^= 1 ^= 7 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::Comment, Bytes(b"!----")) - ); - assert_eq!(position, 7); - } - - #[$test] - $($async)? fn with_content() { - let buf = $buf; - let mut position = 1; - let mut input = b"!--->comment<--->other content".as_ref(); - // ^= 1 ^= 18 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? 
- .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::Comment, Bytes(b"!--->comment<---")) - ); - assert_eq!(position, 18); - } - } - - /// Checks that reading DOCTYPE definition works correctly - mod doctype { - use super::*; - - mod uppercase { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"!D other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn without_space() { - let buf = $buf; - let mut position = 1; - let mut input = b"!DOCTYPEother content".as_ref(); - // ^= 1 ^= 22 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 22); - } - - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"!DOCTYPE>other content".as_ref(); - // ^= 1 ^= 10 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::DocType, Bytes(b"!DOCTYPE")) - ); - assert_eq!(position, 10); - } - - #[$test] - $($async)? fn not_closed() { - let buf = $buf; - let mut position = 1; - let mut input = b"!DOCTYPE other content".as_ref(); - // ^= 1 ^23 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 23); - } - } - - mod lowercase { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn not_properly_start() { - let buf = $buf; - let mut position = 1; - let mut input = b"!d other content".as_ref(); - // ^= 1 ^= 17 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 17); - } - - #[$test] - $($async)? fn without_space() { - let buf = $buf; - let mut position = 1; - let mut input = b"!doctypeother content".as_ref(); - // ^= 1 ^= 22 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 22); - } - - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"!doctype>other content".as_ref(); - // ^= 1 ^= 10 - - let (ty, bytes) = $source(&mut input) - .read_bang_element(buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (ty, Bytes(bytes)), - (BangType::DocType, Bytes(b"!doctype")) - ); - assert_eq!(position, 10); - } - - #[$test] - $($async)? fn not_closed() { - let buf = $buf; - let mut position = 1; - let mut input = b"!doctype other content".as_ref(); - // ^= 1 ^= 23 - - match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? 
{ - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 23); - } - } - } - } - - mod read_element { - use super::*; - use crate::errors::{Error, SyntaxError}; - use crate::utils::Bytes; - use pretty_assertions::assert_eq; - - /// Checks that nothing was read from empty buffer - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 1; - let mut input = b"".as_ref(); - // ^= 1 - - match $source(&mut input).read_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedTag)) => {} - x => panic!( - "Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`", - x - ), - } - assert_eq!(position, 1); - } - - mod open { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b">".as_ref(); - // ^= 2 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"") - ); - assert_eq!(position, 2); - } - - #[$test] - $($async)? fn normal() { - let buf = $buf; - let mut position = 1; - let mut input = b"tag>".as_ref(); - // ^= 5 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"tag") - ); - assert_eq!(position, 5); - } - - #[$test] - $($async)? fn empty_ns_empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b":>".as_ref(); - // ^= 3 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":") - ); - assert_eq!(position, 3); - } - - #[$test] - $($async)? fn empty_ns() { - let buf = $buf; - let mut position = 1; - let mut input = b":tag>".as_ref(); - // ^= 6 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":tag") - ); - assert_eq!(position, 6); - } - - #[$test] - $($async)? fn with_attributes() { - let buf = $buf; - let mut position = 1; - let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); - // ^= 39 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#) - ); - assert_eq!(position, 39); - } - } - - mod self_closed { - use super::*; - use pretty_assertions::assert_eq; - - #[$test] - $($async)? fn empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b"/>".as_ref(); - // ^= 3 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"/") - ); - assert_eq!(position, 3); - } - - #[$test] - $($async)? fn normal() { - let buf = $buf; - let mut position = 1; - let mut input = b"tag/>".as_ref(); - // ^= 6 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b"tag/") - ); - assert_eq!(position, 6); - } - - #[$test] - $($async)? fn empty_ns_empty_tag() { - let buf = $buf; - let mut position = 1; - let mut input = b":/>".as_ref(); - // ^= 4 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(b":/") - ); - assert_eq!(position, 4); - } - - #[$test] - $($async)? fn empty_ns() { - let buf = $buf; - let mut position = 1; - let mut input = b":tag/>".as_ref(); - // ^= 7 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? 
.unwrap()), - Bytes(b":tag/") - ); - assert_eq!(position, 7); - } - - #[$test] - $($async)? fn with_attributes() { - let buf = $buf; - let mut position = 1; - let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); - // ^= 42 - - assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), - Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#) - ); - assert_eq!(position, 42); - } - } - } - /// Ensures, that no empty `Text` events are generated mod $read_event { use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index d5b79e78..3a83a5be 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -13,7 +13,7 @@ use std::path::Path; use crate::errors::Result; use crate::events::Event; use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; -use crate::reader::{Config, Reader, Span, XmlSource}; +use crate::reader::{Config, Reader, Span}; /// A low level encoding-agnostic XML event reader that performs namespace resolution. /// @@ -25,7 +25,7 @@ pub struct NsReader { ns_resolver: NamespaceResolver, /// We cannot pop data from the namespace stack until returned `Empty` or `End` /// event will be processed by the user, so we only mark that we should that - /// in the next [`Self::read_event_impl()`] call. + /// in the next [`Self::read_event()`] call. pending_pop: bool, } @@ -61,15 +61,6 @@ impl NsReader { } } - fn read_event_impl<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.pop(); - let event = self.reader.read_event_impl(buf); - self.process_event(event) - } - pub(super) fn pop(&mut self) { if self.pending_pop { self.ns_resolver.pop(); @@ -85,13 +76,13 @@ impl NsReader { } Ok(Event::Empty(e)) => { self.ns_resolver.push(&e)?; - // notify next `read_event_impl()` invocation that it needs to pop this + // notify next `read_event*()` invocation that it needs to pop this // namespace scope self.pending_pop = true; Ok(Event::Empty(e)) } Ok(Event::End(e)) => { - // notify next `read_event_impl()` invocation that it needs to pop this + // notify next `read_event*()` invocation that it needs to pop this // namespace scope self.pending_pop = true; Ok(Event::End(e)) @@ -351,7 +342,9 @@ impl NsReader { /// [`read_resolved_event_into()`]: Self::read_resolved_event_into #[inline] pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) + self.pop(); + let event = self.reader.read_event_into(buf); + self.process_event(event) } /// Reads the next event into given buffer and resolves its namespace (if applicable). 
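With `read_event_impl` gone from `NsReader`, each public entry point now performs the pop → read → `process_event` sequence inline, but the observable behaviour is unchanged. A typical resolved-event loop over the existing public API still looks like this (a minimal sketch using only released `NsReader` methods):

```rust
use quick_xml::events::Event;
use quick_xml::name::ResolveResult;
use quick_xml::reader::NsReader;

fn main() -> Result<(), quick_xml::Error> {
    let mut reader = NsReader::from_str(r#"<x:tag xmlns:x="urn:example"/>"#);
    loop {
        // The namespace scope is pushed before this event is returned and
        // popped on the next call, exactly as before the refactoring.
        match reader.read_resolved_event()? {
            (ResolveResult::Bound(ns), Event::Empty(e)) => {
                println!("{:?} -> {:?}", ns, e.name());
            }
            (_, Event::Eof) => break,
            _ => (),
        }
    }
    Ok(())
}
```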
@@ -415,7 +408,9 @@ impl NsReader { &mut self, buf: &'b mut Vec, ) -> Result<(ResolveResult, Event<'b>)> { - let event = self.read_event_impl(buf); + self.pop(); + let event = self.reader.read_event_into(buf); + let event = self.process_event(event); self.resolve_event(event) } @@ -595,7 +590,9 @@ impl<'i> NsReader<&'i [u8]> { /// [`read_resolved_event()`]: Self::read_resolved_event #[inline] pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) + self.pop(); + let event = self.reader.read_event(); + self.process_event(event) } /// Reads the next event, borrow its content from the input buffer, and resolves @@ -659,7 +656,9 @@ impl<'i> NsReader<&'i [u8]> { /// [`read_event()`]: Self::read_event #[inline] pub fn read_resolved_event(&mut self) -> Result<(ResolveResult, Event<'i>)> { - let event = self.read_event_impl(()); + self.pop(); + let event = self.reader.read_event(); + let event = self.process_event(event); self.resolve_event(event) } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 5e807e26..a975af21 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -7,14 +7,13 @@ use std::borrow::Cow; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_8}; +use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8}; -use crate::errors::{Error, Result, SyntaxError}; -use crate::events::Event; +use crate::errors::{Error, Result}; +use crate::events::{BytesText, Event}; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; - -use memchr; +use crate::parser::FeedResult; +use crate::reader::{Reader, Span}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. 
/// This implementation supports not using an intermediate buffer as the byte slice @@ -71,7 +70,74 @@ impl<'a> Reader<&'a [u8]> { /// ``` #[inline] pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) + dbg!(self.state.parser); + if let Some(end) = self.state.pending_end() { + return Ok(end); + } + loop { + if self.reader.is_empty() { + return Ok(Event::Eof); + } + let result = dbg!(self.state.parser.feed(self.reader))?; + return match result { + FeedResult::NeedData => { + let offset = self.reader.len(); + if let Err(error) = self.state.parser.finish() { + // We need return Event::Eof after error + self.consume(offset); + Err(Error::Syntax(error)) + } else { + match self.make_text(offset) { + Some(event) => Ok(event), + None => continue, + } + } + } + + FeedResult::EncodingUtf8Like(offset) => { + self.consume(offset); + #[cfg(feature = "encoding")] + if self.state.encoding.can_be_refined() { + self.state.encoding = EncodingRef::BomDetected(UTF_8); + } + continue; + } + FeedResult::EncodingUtf16BeLike(offset) => { + self.consume(offset); + #[cfg(feature = "encoding")] + if self.state.encoding.can_be_refined() { + self.state.encoding = EncodingRef::BomDetected(UTF_16BE); + } + continue; + } + FeedResult::EncodingUtf16LeLike(offset) => { + self.consume(offset); + #[cfg(feature = "encoding")] + if self.state.encoding.can_be_refined() { + self.state.encoding = EncodingRef::BomDetected(UTF_16LE); + } + continue; + } + + FeedResult::EmitText(offset) => match self.make_text(offset) { + Some(event) => Ok(event), + None => continue, + }, + FeedResult::EmitComment(offset) + | FeedResult::EmitCData(offset) + | FeedResult::EmitDoctype(offset) + | FeedResult::EmitPI(offset) + | FeedResult::EmitEmptyTag(offset) + | FeedResult::EmitStartTag(offset) + | FeedResult::EmitEndTag(offset) => { + let (content, source) = self.reader.split_at(offset); + self.reader = source; + + self.state.offset += offset; + self.state.make_event(result, content) + } + }; + } } /// Reads until end element is found. This function is supposed to be called @@ -157,6 +223,11 @@ impl<'a> Reader<&'a [u8]> { pub fn read_to_end(&mut self, end: QName) -> Result { Ok(read_to_end!(self, end, (), read_event_impl, {})) } + /// Tranpoline for a `read_to_end!` macro + #[inline] + fn read_event_impl(&mut self, _: ()) -> Result> { + self.read_event() + } /// Reads content between start and end tags, including any markup. This /// function is supposed to be called after you already read a [`Start`] event. @@ -231,129 +302,42 @@ impl<'a> Reader<&'a [u8]> { self.decoder().decode(&buffer[0..span.len()]) } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer -/// that will be borrowed by events. 
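The rewritten slice `read_event` above drives the new push parser directly over the remaining input, but the events it hands out still borrow from the original byte slice, so zero-copy callers are unaffected. A short sketch of that unchanged caller-side contract, using only the public `Reader` API:

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

fn main() -> Result<(), quick_xml::Error> {
    // Events borrow directly from this input slice; no intermediate buffer is used.
    let mut reader = Reader::from_str("<root>zero-copy</root>");
    loop {
        match reader.read_event()? {
            Event::Start(e) => assert_eq!(e.name().as_ref(), b"root"),
            Event::Text(t) => assert_eq!(t.unescape()?.as_ref(), "zero-copy"),
            Event::Eof => break,
            _ => (),
        }
    }
    Ok(())
}
```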
This implementation provides a zero-copy deserialization -impl<'a> XmlSource<'a, ()> for &'a [u8] { - #[cfg(not(feature = "encoding"))] - fn remove_utf8_bom(&mut self) -> Result<()> { - if self.starts_with(crate::encoding::UTF8_BOM) { - *self = &self[crate::encoding::UTF8_BOM.len()..]; - } - Ok(()) - } - - #[cfg(feature = "encoding")] - fn detect_encoding(&mut self) -> Result> { - if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) { - *self = &self[bom_len..]; - return Ok(Some(enc)); - } - Ok(None) - } - - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut usize, - ) -> Result<(&'a [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - if let Some(i) = memchr::memchr(byte, self) { - *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - Ok((bytes, true)) - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - Ok((bytes, false)) - } - } - fn read_bang_element( - &mut self, - _buf: (), - position: &mut usize, - ) -> Result<(BangType, &'a [u8])> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. - debug_assert_eq!(self[0], b'!'); - - let bang_type = BangType::new(self[1..].first().copied())?; - - if let Some((bytes, i)) = bang_type.parse(&[], self) { - *position += i; - *self = &self[i..]; - return Ok((bang_type, bytes)); - } - - *position += self.len(); - Err(bang_type.to_err()) + #[inline] + fn consume(&mut self, count: usize) { + self.reader = &self.reader[count..]; + self.state.offset += count; } - - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut state = ReadElementState::Elem; - - if let Some((bytes, i)) = state.change(self) { - // Position now just after the `>` symbol - *position += i; - *self = &self[i..]; - return Ok(bytes); + /// Returns [`Event::Text`] with the content of reader up to `offset` or + /// `None` if no event should be generated because of trimming and getting + /// empty text. + /// + /// Consumes data up to `offset`. 
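`make_text` is what keeps whitespace-only content from surfacing as empty `Text` events: if trimming removes everything, no event is generated at all. The observable effect from the caller side is sketched below; the `trim_text_start`/`trim_text_end` field names are the ones used in this diff, and the `config_mut()` accessor is assumed from current releases:

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

fn main() -> Result<(), quick_xml::Error> {
    let mut reader = Reader::from_str("<a>  \n  <b/>  </a>");
    // The same flags `make_text` consults; whitespace-only text is dropped entirely.
    let config = reader.config_mut();
    config.trim_text_start = true;
    config.trim_text_end = true;

    let mut count = 0;
    loop {
        match reader.read_event()? {
            Event::Eof => break,
            _ => count += 1,
        }
    }
    // Only Start(a), Empty(b) and End(a) are produced; no empty Text events appear.
    assert_eq!(count, 3);
    Ok(())
}
```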
+ fn make_text(&mut self, offset: usize) -> Option> { + let (content, source) = self.reader.split_at(offset); + self.reader = source; + self.state.offset += offset; + + let mut event = BytesText::wrap(content, self.decoder()); + if self.state.config.trim_text_start && event.inplace_trim_start() { + return None; } - - *position += self.len(); - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - let whitespaces = self - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or(self.len()); - *position += whitespaces; - *self = &self[whitespaces..]; - Ok(()) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - if self.first() == Some(&byte) { - *self = &self[1..]; - *position += 1; - Ok(true) - } else { - Ok(false) + if self.state.config.trim_text_end && event.inplace_trim_end() { + return None; } - } - - fn peek_one(&mut self) -> Result> { - Ok(self.first().copied()) + Some(Event::Text(event)) } } +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod test { use crate::reader::test::check; - use crate::reader::XmlSource; - - /// Default buffer constructor just pass the byte array from the test - fn identity(input: T) -> T { - input - } check!( #[test] read_event_impl, - read_until_close, - identity, () ); diff --git a/src/reader/state.rs b/src/reader/state.rs index d579b767..57b19590 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -1,20 +1,40 @@ #[cfg(feature = "encoding")] -use encoding_rs::UTF_8; +use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8}; use crate::encoding::Decoder; -use crate::errors::{Error, IllFormedError, Result, SyntaxError}; +use crate::errors::{Error, IllFormedError, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; +use crate::name::QName; +use crate::parser::{FeedResult, Parser}; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; -use crate::reader::{is_whitespace, BangType, Config, ParseState}; +use crate::reader::{is_whitespace, Config}; +use crate::utils::Bytes; use memchr; +/// Result of a [`ReaderState::parse_into`] method. +#[derive(Debug)] +pub enum ParseOutcome { + /// The specified amount of data should be consumed. The parser result should + /// be converted to an [`Event`] using previously accumulated data and newly + /// consumed data. + Consume(usize, FeedResult), + /// The specified amount of data should be consumed. All accumulated data + /// and newly consumed data should be converted to an [`Event::Text`]. + ConsumeAndEmitText(usize), + /// The specified amount of data should be consumed, but no event should be + /// generated. Used to skip whitespaces and BOM. + ConsumeAndContinue(usize), +} + /// A struct that holds a current reader state and a parser configuration. /// It is independent on a way of reading data: the reader feed data into it and /// get back produced [`Event`]s. #[derive(Clone, Debug)] pub(super) struct ReaderState { + /// Current parsing state + pub parser: Parser, /// Number of bytes read from the source of data since the reader was created pub offset: usize, /// A snapshot of an `offset` of the last error returned. It can be less than @@ -22,10 +42,20 @@ pub(super) struct ReaderState { /// and changing `offset` is not possible, because `Error::IllFormed` errors /// are recoverable. 
pub last_error_offset: usize, - /// Defines how to process next byte - pub state: ParseState, /// User-defined settings that affect parsing pub config: Config, + /// When text trimming from start is enabled, we need to track is we seen + /// a non-space symbol between getting chunks from the reader, because we + /// trim each chunk individually. If such symbol was seen, trim is not + /// required until current text event would be emitted. + /// + /// Used only together with buffering readers, because borrowing reader + /// already have all data available. + can_trim_start: bool, + /// If case of [`Config::expand_empty_elements`] is true, this field will + /// be `true` if synthetic end event should be emitted on next call to read + /// event. + pending: bool, /// All currently Started elements which didn't have a matching /// End element yet. /// @@ -54,52 +84,144 @@ pub(super) struct ReaderState { } impl ReaderState { - /// Trims end whitespaces from `bytes`, if required, and returns a [`Text`] - /// event or an [`Eof`] event, if text after trimming is empty. + /// Get the decoder, used to decode bytes, read by this reader, to the strings. /// - /// # Parameters - /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` + /// If [`encoding`] feature is enabled, the used encoding may change after + /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. /// - /// [`Text`]: Event::Text - /// [`Eof`]: Event::Eof - pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result> { - let mut content = bytes; - - if self.config.trim_text_end { - // Skip the ending '<' - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - content = &bytes[..len]; + /// If [`encoding`] feature is enabled and no encoding is specified in declaration, + /// defaults to UTF-8. + /// + /// [`encoding`]: ../../index.html#encoding + pub fn decoder(&self) -> Decoder { + Decoder { + #[cfg(feature = "encoding")] + encoding: self.encoding.encoding(), } + } + + /// Parses `bytes`, appending data to a `buf`. 
Used in buffered readers + pub fn parse_into<'a, 'b>( + &mut self, + bytes: &'a [u8], + buf: &'b mut Vec, + ) -> Result { + dbg!(&self); + let result = dbg!(self.parser.feed(bytes))?; + match result { + FeedResult::NeedData => { + let mut content = bytes; + if self.config.trim_text_start + && self.can_trim_start + && self.parser.is_text_parsing() + { + content = crate::events::trim_xml_start(bytes); + // if we got some data while parsing text, we shouldn't to + // trim text anymore, because this is spaces inside text content + self.can_trim_start = content.is_empty(); + } + buf.extend_from_slice(content); + let len = bytes.len(); + self.offset += len; + Ok(ParseOutcome::ConsumeAndContinue(len)) + } + + FeedResult::EncodingUtf8Like(offset) => { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + self.encoding = EncodingRef::BomDetected(UTF_8); + } + self.offset += offset; + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } + FeedResult::EncodingUtf16BeLike(offset) => { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + self.encoding = EncodingRef::BomDetected(UTF_16BE); + } + self.offset += offset; + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } + FeedResult::EncodingUtf16LeLike(offset) => { + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + self.encoding = EncodingRef::BomDetected(UTF_16LE); + } + self.offset += offset; + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } - if content.is_empty() { - Ok(Event::Eof) - } else { - Ok(Event::Text(BytesText::wrap(content, self.decoder()))) + FeedResult::EmitText(offset) => { + let mut content = &bytes[..offset]; + if self.config.trim_text_start && self.can_trim_start { + content = crate::events::trim_xml_start(content); + } + // Reset ability to trim start + self.can_trim_start = true; + if self.config.trim_text_end { + content = crate::events::trim_xml_end(content); + } + buf.extend_from_slice(content); + self.offset += offset; + if buf.is_empty() { + Ok(ParseOutcome::ConsumeAndContinue(offset)) + } else { + Ok(ParseOutcome::ConsumeAndEmitText(offset)) + } + } + FeedResult::EmitComment(offset) + | FeedResult::EmitCData(offset) + | FeedResult::EmitDoctype(offset) + | FeedResult::EmitPI(offset) + | FeedResult::EmitEmptyTag(offset) + | FeedResult::EmitStartTag(offset) + | FeedResult::EmitEndTag(offset) => { + buf.extend_from_slice(&bytes[..offset]); + self.offset += offset; + Ok(ParseOutcome::Consume(offset, result)) + } } } - /// reads `BytesElement` starting with a `!`, - /// return `Comment`, `CData` or `DocType` event - pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { - let uncased_starts_with = |string: &[u8], prefix: &[u8]| { - string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) - }; + /// Converts result from a parser to reader's event. 
+ /// + /// # Parameters + /// - `result`: a result from [`Parser::feed()`] + /// - `content`: a buffer with event data + /// + /// [`Parser::feed()`]: crate::parser::Parser::feed() + pub fn make_event<'a>(&mut self, result: FeedResult, content: &'a [u8]) -> Result> { + debug_assert!(!self.pending, "synthetic end event won't be emitted"); - let len = buf.len(); - match bang_type { - BangType::Comment if buf.starts_with(b"!--") => { - debug_assert!(buf.ends_with(b"--")); + match result { + FeedResult::EmitText(_) | FeedResult::NeedData => { + Ok(Event::Text(BytesText::wrap(content, self.decoder()))) + } + FeedResult::EmitCData(_) => { + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); + + Ok(Event::CData(BytesCData::wrap( + &content[9..content.len() - 3], + self.decoder(), + ))) + } + FeedResult::EmitComment(_) => { + // `--` from start and end should not be overlapped + debug_assert!(content.len() >= 4 + 3, "{:?}", Bytes(content)); + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); + + let len = content.len(); if self.config.check_comments { // search if '--' not in comments - let mut haystack = &buf[3..len - 2]; + // Skip `` + let mut haystack = &content[4..len - 3]; let mut off = 0; while let Some(p) = memchr::memchr(b'-', haystack) { off += p + 1; // if next byte after `-` is also `-`, return an error - if buf[3 + off] == b'-' { + if content[4 + off] == b'-' { // Explanation of the magic: // // - `self.offset`` just after `>`, @@ -107,39 +229,41 @@ impl ReaderState { // - `p` is counted from byte after `: - // ~~~~~~~~~~~~~~~~ : - buf + // ~~~~~~~~~~~~~~~~~~: - buf // : =========== : - zone of search (possible values of `p`) // : |---p : - p is counted from | (| is 0) // : : : ^ - self.offset - // ^ : : - self.offset - len - // ^ : - self.offset - len + 2 - // ^ - self.offset - len + 2 + p - self.last_error_offset = self.offset - len + 2 + p; + // ^ : : - self.offset - len + // ^ : - self.offset - len + 4 + // ^ - self.offset - len + 4 + p + self.last_error_offset = self.offset - len + 4 + p; return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)); } - // Continue search after single `-` (+1 to skip it) haystack = &haystack[p + 1..]; } } Ok(Event::Comment(BytesText::wrap( - // Cut of `!--` and `--` from start and end - &buf[3..len - 2], + &content[4..len - 3], self.decoder(), ))) } - BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - debug_assert!(buf.ends_with(b"]]")); - Ok(Event::CData(BytesCData::wrap( - // Cut of `![CDATA[` and `]]` from start and end - &buf[8..len - 2], - self.decoder(), - ))) - } - BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { - match buf[8..].iter().position(|&b| !is_whitespace(b)) { + FeedResult::EmitDoctype(_) => { + debug_assert!(content.len() > 9, "{:?}", Bytes(content)); + debug_assert!( + content[0..9].eq_ignore_ascii_case(b""), "{:?}", Bytes(content)); + + // Skip `` + let buf = &content[9..content.len() - 1]; + match buf.iter().position(|&b| !is_whitespace(b)) { + // Found the first non-space symbol after ` Ok(Event::DocType(BytesText::wrap( - // Cut of `!DOCTYPE` and any number of spaces from start - &buf[8 + start..], + &buf[start..], self.decoder(), ))), None => { @@ -151,176 +275,145 @@ impl ReaderState { } } } - _ => { - // - // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`. 
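The `--` scan runs only when `check_comments` is enabled, and the error-offset arithmetic changes from `len + 2` to `len + 4` because `content` now carries the full `<!--`/`-->` markers instead of the old `!--…--` slice. The observable behaviour is unchanged and can be exercised through the public API; a minimal sketch (flag and error names are the ones this diff uses):

```rust
use quick_xml::errors::{Error, IllFormedError};
use quick_xml::reader::Reader;

fn main() {
    let mut reader = Reader::from_str("<!-- double -- hyphen -->");
    // The same flag `make_event` consults before scanning the comment body
    reader.config_mut().check_comments = true;

    match reader.read_event() {
        Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment)) => {}
        other => panic!("expected DoubleHyphenInComment, got {:?}", other),
    }
}
```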
- // ^------- We report error at that position, so we need to subtract 2 and buf len - self.last_error_offset = self.offset - len - 2; - Err(bang_type.to_err()) - } - } - } + FeedResult::EmitPI(_) => { + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - /// Wraps content of `buf` into the [`Event::End`] event. Does the check that - /// end name matches the last opened start name if `self.config.check_end_names` is set. - pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result> { - // Strip the `/` character. `content` contains data between `` - let content = &buf[1..]; - // XML standard permits whitespaces after the markup name in closing tags. - // Let's strip them from the buffer before comparing tag names. - let name = if self.config.trim_markup_names_in_closing_tags { - if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) { - &content[..pos_end_name + 1] - } else { - content - } - } else { - content - }; + // Cut of `` from start and end + let content = &content[2..content.len() - 2]; + let event = BytesStart::wrap(content); - let decoder = self.decoder(); + if event.name() == QName(b"xml") { + let event = BytesDecl::from_start(event); - // Get the index in self.opened_buffer of the name of the last opened tag - match self.opened_starts.pop() { - Some(start) => { - if self.config.check_end_names { - let expected = &self.opened_buffer[start..]; - if name != expected { - let expected = decoder.decode(expected).unwrap_or_default().into_owned(); - // #513: In order to allow error recovery we should drop content of the buffer - self.opened_buffer.truncate(start); - - // Report error at start of the end tag at `<` character - // -2 for `<` and `>` - self.last_error_offset = self.offset - buf.len() - 2; - return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { - expected, - found: decoder.decode(name).unwrap_or_default().into_owned(), - })); + // Try getting encoding from the declaration event + #[cfg(feature = "encoding")] + if self.encoding.can_be_refined() { + if let Some(encoding) = event.encoder() { + self.encoding = EncodingRef::XmlDetected(encoding); + } } + + Ok(Event::Decl(event)) + } else { + Ok(Event::PI(BytesText::wrap(content, self.decoder()))) } + } + FeedResult::EmitEmptyTag(_) => { + debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); + debug_assert!(content.ends_with(b"/>"), "{:?}", Bytes(content)); - self.opened_buffer.truncate(start); + let event = BytesStart::wrap(&content[1..content.len() - 2]); + + if self.config.expand_empty_elements { + self.pending = true; + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(event.name().as_ref()); + Ok(Event::Start(event)) + } else { + Ok(Event::Empty(event)) + } } - None => { - // Report error at start of the end tag at `<` character - // -2 for `<` and `>` - self.last_error_offset = self.offset - buf.len() - 2; - return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( - decoder.decode(name).unwrap_or_default().into_owned(), - ))); + FeedResult::EmitStartTag(_) => { + debug_assert!(content.starts_with(b"<"), "{:?}", Bytes(content)); + debug_assert!(content.ends_with(b">"), "{:?}", Bytes(content)); + + let event = BytesStart::wrap(&content[1..content.len() - 1]); + + // #514: Always store names event when .check_end_names == false, + // because checks can be temporary disabled and when they would be + // enabled, we should have that information + self.opened_starts.push(self.opened_buffer.len()); + 
self.opened_buffer.extend(event.name().as_ref()); + Ok(Event::Start(event)) } - } + FeedResult::EmitEndTag(_) => { + debug_assert!(content.starts_with(b""), "{:?}", Bytes(content)); - Ok(Event::End(BytesEnd::wrap(name.into()))) - } + let buf = &content[1..content.len() - 1]; + // Strip the `/` character. `content` contains data between `` + let content = &buf[1..]; + // XML standard permits whitespaces after the markup name in closing tags. + // Let's strip them from the buffer before comparing tag names. + let name = if self.config.trim_markup_names_in_closing_tags { + if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) { + &content[..pos_end_name + 1] + } else { + content + } + } else { + content + }; - /// `buf` contains data between `<` and `>` and the first byte is `?`. - /// `self.offset` already after the `>` - /// - /// Returns `Decl` or `PI` event - pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result> { - debug_assert!(buf.len() > 0); - debug_assert_eq!(buf[0], b'?'); + let decoder = self.decoder(); - let len = buf.len(); - // We accept at least - // ~~ - len = 2 - if len > 1 && buf[len - 1] == b'?' { - // Cut of `?` and `?` from start and end - let content = &buf[1..len - 1]; - let len = content.len(); + // Get the index in self.opened_buffer of the name of the last opened tag + match self.opened_starts.pop() { + Some(start) => { + if self.config.check_end_names { + let expected = &self.opened_buffer[start..]; + if name != expected { + let expected = decoder.decode(expected).unwrap_or_default().into_owned(); + // #513: In order to allow error recovery we should drop content of the buffer + self.opened_buffer.truncate(start); - if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) { - let event = BytesDecl::from_start(BytesStart::wrap(content, 3)); + // Report error at start of the end tag at `<` character + // -2 for `<` and `>` + self.last_error_offset = self.offset - buf.len() - 2; + return Err(Error::IllFormed(IllFormedError::MismatchedEndTag { + expected, + found: decoder.decode(name).unwrap_or_default().into_owned(), + })); + } + } - // Try getting encoding from the declaration event - #[cfg(feature = "encoding")] - if self.encoding.can_be_refined() { - if let Some(encoding) = event.encoder() { - self.encoding = EncodingRef::XmlDetected(encoding); + self.opened_buffer.truncate(start); + } + None => { + // Report error at start of the end tag at `<` character + // -2 for `<` and `>` + self.last_error_offset = self.offset - buf.len() - 2; + return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag( + decoder.decode(name).unwrap_or_default().into_owned(), + ))); } } - Ok(Event::Decl(event)) - } else { - Ok(Event::PI(BytesText::wrap(content, self.decoder()))) + Ok(Event::End(BytesEnd::wrap(name.into()))) } - } else { - // `) - self.last_error_offset = self.offset - len - 2; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + FeedResult::EncodingUtf8Like(_) + | FeedResult::EncodingUtf16BeLike(_) + | FeedResult::EncodingUtf16LeLike(_) => unreachable!("processed outside"), } } - /// Converts content of a tag to a `Start` or an `Empty` event - /// - /// # Parameters - /// - `content`: Content of a tag between `<` and `>` - pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result> { - let len = content.len(); - let name_end = content - .iter() - .position(|&b| is_whitespace(b)) - .unwrap_or(len); - if let Some(&b'/') = content.last() { - // This is self-closed tag `` - let name_len = if name_end < len { name_end } 
else { len - 1 }; - let event = BytesStart::wrap(&content[..len - 1], name_len); - - if self.config.expand_empty_elements { - self.state = ParseState::Empty; - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&content[..name_len]); - Ok(Event::Start(event)) - } else { - Ok(Event::Empty(event)) - } - } else { - // #514: Always store names event when .check_end_names == false, - // because checks can be temporary disabled and when they would be - // enabled, we should have that information - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&content[..name_end]); - Ok(Event::Start(BytesStart::wrap(content, name_end))) - } - } - - #[inline] - pub fn close_expanded_empty(&mut self) -> Result> { - self.state = ParseState::ClosedTag; - let name = self - .opened_buffer - .split_off(self.opened_starts.pop().unwrap()); - Ok(Event::End(BytesEnd::wrap(name.into()))) - } - - /// Get the decoder, used to decode bytes, read by this reader, to the strings. + /// Get the pending event if the last returned event was a synthetic `Start` + /// event due to [`Config::expand_empty_elements`] setting. /// - /// If [`encoding`] feature is enabled, the used encoding may change after - /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. - /// - /// If [`encoding`] feature is enabled and no encoding is specified in declaration, - /// defaults to UTF-8. - /// - /// [`encoding`]: ../../index.html#encoding - pub fn decoder(&self) -> Decoder { - Decoder { - #[cfg(feature = "encoding")] - encoding: self.encoding.encoding(), + /// If this method returns something, the read next event should return this + /// event. + pub fn pending_end(&mut self) -> Option> { + if self.pending { + self.pending = false; + let name = self + .opened_buffer + .split_off(self.opened_starts.pop().unwrap()); + return Some(Event::End(BytesEnd::wrap(name.into()))); } + None } } impl Default for ReaderState { fn default() -> Self { Self { + parser: Parser::default(), offset: 0, last_error_offset: 0, - state: ParseState::Init, config: Config::default(), + can_trim_start: true, + pending: false, opened_buffer: Vec::new(), opened_starts: Vec::new(),
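`pending_end` replaces the old `ParseState::Empty`/`close_expanded_empty` pair: when `expand_empty_elements` is set, `make_event` returns a `Start`, records the name, and marks `pending`, and the next read call emits the stored synthetic `End` before feeding the parser again. From the caller's perspective the expansion looks exactly as before; a minimal sketch over the public API (the field name is the one used in this diff):

```rust
use quick_xml::events::Event;
use quick_xml::reader::Reader;

fn main() -> Result<(), quick_xml::Error> {
    let mut reader = Reader::from_str("<empty/>");
    // With this flag, `<empty/>` is reported as a Start/End pair instead of Empty
    reader.config_mut().expand_empty_elements = true;

    assert!(matches!(reader.read_event()?, Event::Start(_)));
    // The name stored by `make_event` is split back off and emitted here
    assert!(matches!(reader.read_event()?, Event::End(_)));
    assert!(matches!(reader.read_event()?, Event::Eof));
    Ok(())
}
```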