Skip to content

Commit b27d52f

Browse files
committed
Remove BOM from first-emitted text event
1 parent 08d4a3a commit b27d52f

File tree

5 files changed

+42
-15
lines changed

5 files changed

+42
-15
lines changed

Changelog.md

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@
181181
You can still use re-exported definitions from the crate root
182182

183183
- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
184+
- [#459]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use `Writer::write_bom()`.
184185

185186
### New Tests
186187

src/encoding.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],
155155
}
156156

157157
#[cfg(feature = "encoding")]
158-
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
158+
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
159159
let (_, bytes) = split_at_bom(bytes, encoding);
160160
bytes
161161
}

src/reader/parser.rs

+20-14
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
#[cfg(feature = "encoding")]
22
use encoding_rs::UTF_8;
33

4-
#[cfg(feature = "encoding")]
5-
use crate::encoding::detect_encoding;
6-
use crate::encoding::Decoder;
4+
use crate::encoding::{self, Decoder};
75
use crate::errors::{Error, Result};
86
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
97
#[cfg(feature = "encoding")]
@@ -70,23 +68,31 @@ impl Parser {
7068
///
7169
/// [`Text`]: Event::Text
7270
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
73-
#[cfg(feature = "encoding")]
74-
if first && self.encoding.can_be_refined() {
75-
if let Some(encoding) = detect_encoding(bytes) {
76-
self.encoding = EncodingRef::BomDetected(encoding);
77-
}
78-
}
71+
let mut content = bytes;
7972

80-
let content = if self.trim_text_end {
73+
if self.trim_text_end {
8174
// Skip the ending '<'
8275
let len = bytes
8376
.iter()
8477
.rposition(|&b| !is_whitespace(b))
8578
.map_or_else(|| bytes.len(), |p| p + 1);
86-
&bytes[..len]
87-
} else {
88-
bytes
89-
};
79+
content = &bytes[..len];
80+
}
81+
82+
if first {
83+
#[cfg(feature = "encoding")]
84+
if self.encoding.can_be_refined() {
85+
if let Some(encoding) = encoding::detect_encoding(bytes) {
86+
self.encoding = EncodingRef::BomDetected(encoding);
87+
content = encoding::remove_bom(content, encoding);
88+
}
89+
}
90+
#[cfg(not(feature = "encoding"))]
91+
if bytes.starts_with(encoding::UTF8_BOM) {
92+
content = &bytes[encoding::UTF8_BOM.len()..];
93+
}
94+
}
95+
9096
Ok(Event::Text(BytesText::wrap(content, self.decoder())))
9197
}
9298

tests/encodings.rs

+2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#[allow(unused_imports)]
12
use quick_xml::events::Event;
3+
#[allow(unused_imports)]
24
use quick_xml::Reader;
35

46
#[cfg(feature = "encoding")]

tests/xmlrs_reader_tests.rs

+18
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,24 @@ fn html5() {
5151
);
5252
}
5353

54+
#[test]
55+
fn bom_removed_from_initial_text() {
56+
let expected = r#"
57+
|Characters(asdf)
58+
|StartElement(paired [attr1="value1", attr2="value2"])
59+
|Characters(text)
60+
|EndElement(paired)
61+
|EndDocument
62+
"#;
63+
64+
// BOM right up against the text
65+
test(
66+
"\u{FEFF}asdf<paired attr1=\"value1\" attr2=\"value2\">text</paired>",
67+
expected,
68+
true,
69+
);
70+
}
71+
5472
#[test]
5573
fn escaped_characters() {
5674
test(

0 commit comments

Comments
 (0)