Add a write_bom() method to the Writer

dralley · dralley · commit b302b6fca95d · 2022-08-16T23:08:08.000-04:00
diff --git a/Changelog.md b/Changelog.md
@@ -40,7 +40,7 @@
 - [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
 - [#455]: Change return type of all `read_to_end*` methods to return a span between tags
 - [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags
-
+- [#459]: Added a `Writer::write_bom()` method for inserting a Byte-Order-Mark into the document.
 
 ### Bug Fixes
 
@@ -175,11 +175,13 @@
 - [#440]: Removed `Deserializer::from_slice` and `quick_xml::de::from_slice` methods because deserializing from a byte
   array cannot guarantee borrowing due to possible copying while decoding.
 
-- [#455]: Removed `Reader::read_text_into` which is only not a better wrapper over match on `Event::Text`
+- [#455]: Removed `Reader::read_text_into` which is just a thin wrapper over match on `Event::Text`
 
 - [#456]: Reader and writer stuff grouped under `reader` and `writer` modules.
   You still can use re-exported definitions from a crate root
 
+- [#459]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
+
 ### New Tests
 
 - [#9]: Added tests for incorrect nested tags in input
@@ -223,7 +225,7 @@
 [#450]: https://github.com/tafia/quick-xml/pull/450
 [#455]: https://github.com/tafia/quick-xml/pull/455
 [#456]: https://github.com/tafia/quick-xml/pull/456
-
+[#458]: https://github.com/tafia/quick-xml/pull/458
 
 ## 0.23.0 -- 2022-05-08
 
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -9,6 +9,15 @@ use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
 use crate::Error;
 use crate::Result;
 
+/// Unicode "byte order mark" encoded as UTF-8
+pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
+/// Unicode "byte order mark" encoded as UTF-16 with little-endian byte order
+#[allow(dead_code)]
+pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
+/// Unicode "byte order mark" encoded as UTF-16 with big-endian byte order
+#[allow(dead_code)]
+pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
+
 /// Decoder of byte slices into strings.
 ///
 /// If feature `encoding` is enabled, this encoding taken from the `"encoding"`
@@ -62,7 +71,7 @@ impl Decoder {
     ///
     /// If you instead want to use XML declared encoding, use the `encoding` feature
     pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
-        let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
+        let bytes = if bytes.starts_with(UTF8_BOM) {
             &bytes[3..]
         } else {
             bytes
@@ -131,11 +140,11 @@ pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
 
 #[cfg(feature = "encoding")]
 fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
-    if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
+    if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
         bytes.split_at(3)
-    } else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
+    } else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
         bytes.split_at(2)
-    } else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
+    } else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
         bytes.split_at(2)
     } else {
         (&[], bytes)
@@ -172,9 +181,9 @@ fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
 pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
     match bytes {
         // with BOM
-        _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
-        _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE),
-        _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8),
+        _ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
+        _ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
+        _ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
 
         // without BOM
         _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
diff --git a/src/writer.rs b/src/writer.rs
@@ -1,8 +1,10 @@
 //! Contains high-level interface for an events-based XML emitter.
 
+use std::io::Write;
+
+use crate::encoding::UTF8_BOM;
 use crate::errors::{Error, Result};
 use crate::events::{attributes::Attribute, BytesCData, BytesStart, BytesText, Event};
-use std::io::Write;
 
 /// XML writer.
 ///
@@ -86,6 +88,39 @@ impl<W: Write> Writer<W> {
         &mut self.writer
     }
 
+    /// Write a Byte-Order-Mark character to the document.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// # use quick_xml::Result;
+    /// # fn main() -> Result<()> {
+    /// use quick_xml::events::{BytesStart, BytesText, Event};
+    /// use quick_xml::writer::Writer;
+    /// use quick_xml::Error;
+    /// use std::io::Cursor;
+    ///
+    /// let mut buffer = Vec::new();
+    /// let mut writer = Writer::new_with_indent(&mut buffer, b' ', 4);
+    ///
+    /// writer.write_bom()?;
+    /// writer
+    ///     .create_element("empty")
+    ///     .with_attribute(("attr1", "value1"))
+    ///     .write_empty()
+    ///     .expect("failure");
+    ///
+    /// assert_eq!(
+    ///     std::str::from_utf8(&buffer).unwrap(),
+    ///     "\u{FEFF}<empty attr1=\"value1\"/>"
+    /// );
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn write_bom(&mut self) -> Result<()> {
+        self.write(UTF8_BOM)
+    }
+
     /// Writes the given event to the underlying writer.
     pub fn write_event<'a, E: AsRef<Event<'a>>>(&mut self, event: E) -> Result<()> {
         let mut next_should_line_break = true;
@@ -128,7 +163,7 @@ impl<W: Write> Writer<W> {
 
     /// Writes bytes
     #[inline]
-    pub fn write(&mut self, value: &[u8]) -> Result<()> {
+    pub(crate) fn write(&mut self, value: &[u8]) -> Result<()> {
         self.writer.write_all(value).map_err(Error::Io)
     }
 
@@ -502,6 +537,7 @@ mod indentation {
 </paired>"#
         );
     }
+
     #[test]
     fn element_writer_empty() {
         let mut buffer = Vec::new();