Skip to content

Commit dab8065

Browse files
committed
Remove BOM from first-emitted text event
1 parent b302b6f commit dab8065

File tree

5 files changed

+64
-17
lines changed

5 files changed

+64
-17
lines changed

Changelog.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
- [#450]: Added support for asynchronous [tokio](https://tokio.rs/) readers
4141
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
4242
- [#455]: Added `Reader::read_text` method to return the raw content (including markup) between tags
43-
- [#459]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document.
43+
- [#458]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document.
4444

4545
### Bug Fixes
4646

@@ -180,7 +180,8 @@
180180
- [#456]: Reader and writer stuff grouped under `reader` and `writer` modules.
181181
You can still use re-exported definitions from the crate root
182182

183-
- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
183+
- [#458]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
184+
- [#458]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use `Writer::write_bom()`.
184185

185186
### New Tests
186187

src/encoding.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],
152152
}
153153

154154
#[cfg(feature = "encoding")]
155-
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
155+
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
156156
let (_, bytes) = split_at_bom(bytes, encoding);
157157
bytes
158158
}

src/reader/parser.rs

+20-14
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
#[cfg(feature = "encoding")]
22
use encoding_rs::UTF_8;
33

4-
#[cfg(feature = "encoding")]
5-
use crate::encoding::detect_encoding;
6-
use crate::encoding::Decoder;
4+
use crate::encoding::{self, Decoder};
75
use crate::errors::{Error, Result};
86
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
97
#[cfg(feature = "encoding")]
@@ -68,23 +66,31 @@ impl Parser {
6866
///
6967
/// [`Text`]: Event::Text
7068
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
71-
#[cfg(feature = "encoding")]
72-
if first && self.encoding.can_be_refined() {
73-
if let Some(encoding) = detect_encoding(bytes) {
74-
self.encoding = EncodingRef::BomDetected(encoding);
75-
}
76-
}
69+
let mut content = bytes;
7770

78-
let content = if self.trim_text_end {
71+
if self.trim_text_end {
7972
// Skip the ending '<'
8073
let len = bytes
8174
.iter()
8275
.rposition(|&b| !is_whitespace(b))
8376
.map_or_else(|| bytes.len(), |p| p + 1);
84-
&bytes[..len]
85-
} else {
86-
bytes
87-
};
77+
content = &bytes[..len];
78+
}
79+
80+
if first {
81+
#[cfg(feature = "encoding")]
82+
if self.encoding.can_be_refined() {
83+
if let Some(encoding) = encoding::detect_encoding(bytes) {
84+
self.encoding = EncodingRef::BomDetected(encoding);
85+
content = encoding::remove_bom(content, encoding);
86+
}
87+
}
88+
#[cfg(not(feature = "encoding"))]
89+
if bytes.starts_with(encoding::UTF8_BOM) {
90+
content = &bytes[encoding::UTF8_BOM.len()..];
91+
}
92+
}
93+
8894
Ok(Event::Text(BytesText::wrap(content, self.decoder())))
8995
}
9096

tests/encodings.rs

+2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#[allow(dead_code)]
12
use quick_xml::events::Event;
3+
#[allow(dead_code)]
24
use quick_xml::Reader;
35

46
#[cfg(feature = "encoding")]

tests/xmlrs_reader_tests.rs

+38
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,44 @@ fn html5() {
5151
);
5252
}
5353

54+
#[test]
55+
fn bom_removed_from_initial_text() {
56+
let expected = r#"
57+
|Characters(asdf)
58+
|StartElement(paired [attr1="value1", attr2="value2"])
59+
|Characters(text)
60+
|EndElement(paired)
61+
|EndDocument
62+
"#;
63+
64+
// BOM right up against the text
65+
test(
66+
"\u{FEFF}asdf<paired attr1=\"value1\" attr2=\"value2\">text</paired>",
67+
expected,
68+
true,
69+
);
70+
71+
// Whitespace between the BOM and the text: trimming should ignore the BOM
72+
test(
73+
"\u{FEFF} asdf<paired attr1=\"value1\" attr2=\"value2\">text</paired>",
74+
expected,
75+
true,
76+
);
77+
78+
// BOM immediately followed by markup: no leading `Characters` event is expected
79+
test(
80+
"\u{FEFF}<paired attr1=\"value1\" attr2=\"value2\">text</paired>",
81+
r#"
82+
|StartElement(paired [attr1="value1", attr2="value2"])
83+
|Characters(text)
84+
|EndElement(paired)
85+
|EndDocument
86+
"#,
87+
true,
88+
);
89+
}
90+
91+
5492
#[test]
5593
fn escaped_characters() {
5694
test(

0 commit comments

Comments
 (0)