|
1 | 1 | //! A module for wrappers that encode / decode data.
|
2 | 2 |
|
3 | 3 | use std::borrow::Cow;
|
| 4 | +use std::io::BufRead; |
4 | 5 |
|
5 | 6 | #[cfg(feature = "encoding")]
|
6 |
| -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; |
| 7 | +use encoding_rs::{Decoder, Encoding, UTF_16BE, UTF_16LE, UTF_8, CoderResult}; |
7 | 8 |
|
8 | 9 | use crate::{Error, Result};
|
9 | 10 |
|
@@ -184,4 +185,158 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
|
184 | 185 | }
|
185 | 186 | }
|
186 | 187 |
|
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, whichever way it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(encoding)
            | Self::Explicit(encoding)
            | Self::BomDetected(encoding)
            | Self::XmlDetected(encoding) => encoding,
        }
    }

    /// Returns `true` while the encoding is still provisional (`Implicit` or
    /// `BomDetected`) and `false` once it is final (`Explicit` or `XmlDetected`).
    #[inline]
    fn can_be_refined(&self) -> bool {
        // Kept exhaustive on purpose: adding a variant must force a decision here.
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
| 234 | + |
/// A [`BufRead`] adapter that decodes the bytes of an inner reader into UTF-8.
///
/// Invariant: `current_pos <= decoded_buffer.len()`. The bytes in
/// `decoded_buffer[current_pos..]` have been decoded but not yet consumed.
#[cfg(feature = "encoding")]
struct DecodingBufReader<R> {
    /// Source of the still-encoded bytes.
    inner: R,
    /// Decoded UTF-8 output.
    decoded_buffer: Vec<u8>,
    /// Offset in `decoded_buffer` of the first byte that was not consumed yet.
    current_pos: usize,

    /// Streaming decoder; keeps the state of sequences split across input chunks.
    decoder: Decoder,
    /// The encoding in use and how it was chosen.
    encoding: EncodingRef,
    /// Set once the decoder received its final call or reported malformed input.
    /// The decoder must not be driven again after that point.
    finished: bool,
}

#[cfg(feature = "encoding")]
impl<R: BufRead> std::io::Read for DecodingBufReader<R> {
    /// Copies already decoded bytes into `buf`, decoding more input if necessary.
    ///
    /// `Read` is a supertrait of `BufRead`, so it has to be implemented as well.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let available = self.fill_buf()?;
        let amount = available.len().min(buf.len());
        buf[..amount].copy_from_slice(&available[..amount]);
        self.consume(amount);
        Ok(amount)
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingBufReader<R> {
    /// Returns the decoded bytes that were not consumed yet.
    ///
    /// More input is pulled from the inner reader only when all decoded bytes were
    /// consumed. An empty slice is returned only at the end of the stream.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors of the inner reader and returns
    /// [`std::io::ErrorKind::InvalidData`] when the input is malformed. After a
    /// decoding error the reader is finished and yields no more data.
    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
        // A chunk may decode to nothing (for example only a BOM or the first half of
        // a multi-byte sequence). Loop so that an empty slice is never returned before
        // the inner reader is really exhausted.
        while self.current_pos >= self.decoded_buffer.len() && !self.finished {
            self.decoded_buffer.clear();
            self.current_pos = 0;

            let data = self.inner.fill_buf()?;
            // An empty chunk means EOF: the decoder must flush its pending state
            let last = data.is_empty();
            let outcome =
                Self::decode_into(&mut self.decoder, &mut self.decoded_buffer, data, last);

            match outcome {
                Ok(read) => {
                    self.finished = last;
                    self.inner.consume(read);
                }
                Err(()) => {
                    self.finished = true;
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        "malformed byte sequence for the current encoding",
                    ));
                }
            }
        }
        Ok(&self.decoded_buffer[self.current_pos..])
    }

    /// Marks `amt` decoded bytes as consumed. Values past the end of the decoded
    /// data are clamped.
    fn consume(&mut self, amt: usize) {
        self.current_pos = self
            .current_pos
            .saturating_add(amt)
            .min(self.decoded_buffer.len());
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> DecodingBufReader<R> {
    /// Creates a reader over `inner` that assumes UTF-8 until told otherwise.
    fn new(inner: R) -> Self {
        DecodingBufReader {
            inner,
            decoded_buffer: Vec::new(),
            current_pos: 0,

            decoder: UTF_8.new_decoder(),
            encoding: EncodingRef::Implicit(UTF_8),
            finished: false,
        }
    }

    /// Returns the not yet decoded bytes buffered by the inner reader.
    fn get_raw_buffer(&mut self) -> std::io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Move unconsumed data to the front of the buffer and reset the read position
    fn shuffle(&mut self) {
        if self.current_pos == 0 {
            return;
        }

        // `drain` drops the consumed prefix and shifts the remainder to the front
        self.decoded_buffer.drain(..self.current_pos);
        self.current_pos = 0;
    }

    /// Releases unused capacity, keeping room for at least `size` bytes and for all
    /// unconsumed data.
    fn shrink_buffer(&mut self, size: usize) {
        self.shuffle();
        self.decoded_buffer.shrink_to(size);
    }

    /// Explicitly selects `encoding` for all input decoded from now on.
    ///
    /// The decoder is replaced, so the state of a partially decoded sequence is
    /// discarded. A BOM of `encoding` is skipped but never switches the encoding.
    fn set_encoding(&mut self, encoding: &'static Encoding) {
        self.encoding = EncodingRef::Explicit(encoding);
        self.decoder = encoding.new_decoder_with_bom_removal();
    }

    /// Decodes `data` and appends the result to the decoded buffer.
    ///
    /// An empty `data` signals the end of the stream. Returns the number of bytes
    /// of `data` that were processed, or `Err(())` if the input is malformed.
    /// Once the stream has ended or failed, further calls do nothing and return `Ok(0)`.
    fn feed(&mut self, data: &[u8]) -> std::result::Result<usize, ()> {
        if self.finished {
            return Ok(0);
        }

        let last = data.is_empty();
        let outcome = Self::decode_into(&mut self.decoder, &mut self.decoded_buffer, data, last);
        if last || outcome.is_err() {
            self.finished = true;
        }
        outcome
    }

    /// Decodes `data` with `decoder`, appending the UTF-8 output to `out`.
    ///
    /// Takes the fields separately so it can be called while the inner reader's
    /// buffer is still borrowed. On malformed input `out` is restored to its
    /// previous length and `Err(())` is returned.
    fn decode_into(
        decoder: &mut Decoder,
        out: &mut Vec<u8>,
        data: &[u8],
        last: bool,
    ) -> std::result::Result<usize, ()> {
        let start = out.len();
        // Worst-case output size, so the decoder never runs out of space.
        // `None` means the size does not fit into `usize`.
        let end = decoder
            .max_utf8_buffer_length(data.len())
            .and_then(|needed| start.checked_add(needed))
            .ok_or(())?;
        out.resize(end, 0);

        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(data, &mut out[start..], last);
        debug_assert!(
            matches!(result, CoderResult::InputEmpty),
            "worst-case sized output buffer must fit all decoded data"
        );

        if had_errors {
            // Do not expose output that contains replacement characters
            out.truncate(start);
            return Err(());
        }
        out.truncate(start + written);
        Ok(read)
    }
}
| 334 | + |
#[cfg(test)]
mod tests {
    /// Checks of the `EncodingRef` state predicates. Built only with the `encoding`
    /// feature, because the type does not exist without it.
    #[cfg(feature = "encoding")]
    mod encoding_ref {
        use super::super::EncodingRef;
        use encoding_rs::{UTF_16LE, UTF_8};

        #[test]
        fn provisional_states_can_be_refined() {
            assert!(EncodingRef::Implicit(UTF_8).can_be_refined());
            assert!(EncodingRef::BomDetected(UTF_16LE).can_be_refined());
        }

        #[test]
        fn final_states_cannot_be_refined() {
            assert!(!EncodingRef::Explicit(UTF_8).can_be_refined());
            assert!(!EncodingRef::XmlDetected(UTF_16LE).can_be_refined());
        }

        #[test]
        fn encoding_returns_wrapped_value() {
            assert_eq!(EncodingRef::Implicit(UTF_8).encoding(), UTF_8);
            assert_eq!(EncodingRef::Explicit(UTF_16LE).encoding(), UTF_16LE);
            assert_eq!(EncodingRef::BomDetected(UTF_16LE).encoding(), UTF_16LE);
            assert_eq!(EncodingRef::XmlDetected(UTF_8).encoding(), UTF_8);
        }
    }
}
| 339 | + |
| 340 | + |
| 341 | + |
187 | 342 | // TODO: add some tests for functions
|
0 commit comments