Skip to content

Commit 08d4a3a

Browse files
committed
Add a write_bom() method to the Writer
1 parent b1a9670 commit 08d4a3a

File tree

3 files changed

+63
-12
lines changed

3 files changed

+63
-12
lines changed

Changelog.md

+5-3
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
4141
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
4242
- [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags
43-
43+
- [#459]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document.
4444

4545
### Bug Fixes
4646

@@ -175,11 +175,13 @@
175175
- [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
176176
array cannot guarantee borrowing due to possible copying while decoding.
177177

178-
- [#455]: Removed `Reader::read_text_into` which is only not a better wrapper over match on `Event::Text`
178+
- [#455]: Removed `Reader::read_text_into` which is just a thin wrapper over match on `Event::Text`
179179

180180
- [#456]: Reader and writer stuff grouped under `reader` and `writer` modules.
181181
You can still use the re-exported definitions from the crate root
182182

183+
- [#459]: Made the `Writer::write()` method non-public, as writing arbitrary bytes to a document is not generally useful or desirable.
184+
183185
### New Tests
184186

185187
- [#9]: Added tests for incorrect nested tags in input
@@ -223,7 +225,7 @@
223225
[#450]: https://github.com/tafia/quick-xml/pull/450
224226
[#455]: https://github.com/tafia/quick-xml/pull/455
225227
[#456]: https://github.com/tafia/quick-xml/pull/456
226-
228+
[#459]: https://github.com/tafia/quick-xml/pull/459
227229

228230
## 0.23.0 -- 2022-05-08
229231

src/encoding.rs

+19-7
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,18 @@ use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
99
use crate::Error;
1010
use crate::Result;
1111

12+
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
13+
/// See <https://unicode.org/faq/utf_bom.html#bom1>
14+
pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
15+
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian byte order.
16+
/// See <https://unicode.org/faq/utf_bom.html#bom1>
17+
#[cfg(feature = "encoding")]
18+
pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
19+
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian byte order.
20+
/// See <https://unicode.org/faq/utf_bom.html#bom1>
21+
#[cfg(feature = "encoding")]
22+
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
23+
1224
/// Decoder of byte slices into strings.
1325
///
1426
/// If feature `encoding` is enabled, this encoding is taken from the `"encoding"`
@@ -62,7 +74,7 @@ impl Decoder {
6274
///
6375
/// If you instead want to use the encoding declared in the XML, use the `encoding` feature
6476
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
65-
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
77+
let bytes = if bytes.starts_with(UTF8_BOM) {
6678
&bytes[3..]
6779
} else {
6880
bytes
@@ -131,11 +143,11 @@ pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
131143

132144
#[cfg(feature = "encoding")]
133145
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
134-
if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
146+
if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
135147
bytes.split_at(3)
136-
} else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
148+
} else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
137149
bytes.split_at(2)
138-
} else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
150+
} else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
139151
bytes.split_at(2)
140152
} else {
141153
(&[], bytes)
@@ -172,9 +184,9 @@ fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
172184
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
173185
match bytes {
174186
// with BOM
175-
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
176-
_ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
177-
_ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
187+
_ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
188+
_ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
189+
_ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
178190

179191
// without BOM
180192
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2

src/writer.rs

+39-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
//! Contains high-level interface for an events-based XML emitter.
22
3+
use std::io::Write;
4+
5+
use crate::encoding::UTF8_BOM;
36
use crate::errors::{Error, Result};
47
use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
5-
use std::io::Write;
68

79
/// XML writer.
810
///
@@ -86,6 +88,40 @@ impl<W: Write> Writer<W> {
8688
&mut self.writer
8789
}
8890

91+
/// Writes a UTF-8 encoded [Byte-Order-Mark] character to the document.
92+
///
93+
/// # Example
94+
///
95+
/// ```rust
96+
/// # use quick_xml::Result;
97+
/// # fn main() -> Result<()> {
98+
/// use quick_xml::events::{BytesStart, BytesText, Event};
99+
/// use quick_xml::writer::Writer;
100+
/// use quick_xml::Error;
101+
/// use std::io::Cursor;
102+
///
103+
/// let mut buffer = Vec::new();
104+
/// let mut writer = Writer::new_with_indent(&mut buffer, b' ', 4);
105+
///
106+
/// writer.write_bom()?;
107+
/// writer
108+
/// .create_element("empty")
109+
/// .with_attribute(("attr1", "value1"))
110+
/// .write_empty()
111+
/// .expect("failure");
112+
///
113+
/// assert_eq!(
114+
/// std::str::from_utf8(&buffer).unwrap(),
115+
/// "\u{FEFF}<empty attr1=\"value1\"/>"
116+
/// );
117+
/// # Ok(())
118+
/// # }
119+
/// ```
120+
/// [Byte-Order-Mark]: https://unicode.org/faq/utf_bom.html#BOM
121+
pub fn write_bom(&mut self) -> Result<()> {
122+
self.write(UTF8_BOM)
123+
}
124+
89125
/// Writes the given event to the underlying writer.
90126
pub fn write_event<'a, E: AsRef<Event<'a>>>(&mut self, event: E) -> Result<()> {
91127
let mut next_should_line_break = true;
@@ -128,7 +164,7 @@ impl<W: Write> Writer<W> {
128164

129165
/// Writes the given bytes to the underlying writer, mapping any I/O failure to `Error::Io`
130166
#[inline]
131-
pub fn write(&mut self, value: &[u8]) -> Result<()> {
167+
pub(crate) fn write(&mut self, value: &[u8]) -> Result<()> {
132168
self.writer.write_all(value).map_err(Error::Io)
133169
}
134170

@@ -502,6 +538,7 @@ mod indentation {
502538
</paired>"#
503539
);
504540
}
541+
505542
#[test]
506543
fn element_writer_empty() {
507544
let mut buffer = Vec::new();

0 commit comments

Comments
 (0)