Skip to content

Commit 8b57c07

Browse files
committed
temp
1 parent 6d883b5 commit 8b57c07

File tree

2 files changed

+157
-49
lines changed

2 files changed

+157
-49
lines changed

src/encoding.rs

+156-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
//! A module for wrappers that encode / decode data.
22
33
use std::borrow::Cow;
4+
use std::io::BufRead;
45

56
#[cfg(feature = "encoding")]
6-
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
7+
use encoding_rs::{Decoder, Encoding, UTF_16BE, UTF_16LE, UTF_8, CoderResult};
78

89
use crate::{Error, Result};
910

@@ -184,4 +185,158 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
184185
}
185186
}
186187

188+
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
///
/// The type is `pub(crate)` because it is defined in the `encoding` module but
/// imported by the `reader` module.
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
pub(crate) enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`).
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It cannot be changed
    /// either by a BOM or by parsing the XML declaration (`<?xml encoding=... ?>`).
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event
    /// (`<?xml encoding=... ?>`).
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change.
    XmlDetected(&'static Encoding),
}

#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    pub(crate) fn encoding(&self) -> &'static Encoding {
        // The enum is `Copy`, so matching on `*self` binds the `'static`
        // reference by value instead of a reference to it.
        match *self {
            Self::Implicit(e)
            | Self::Explicit(e)
            | Self::BomDetected(e)
            | Self::XmlDetected(e) => e,
        }
    }

    /// Returns `true` if a later source of information (a BOM or the XML
    /// declaration) is still allowed to replace this encoding.
    #[inline]
    pub(crate) fn can_be_refined(&self) -> bool {
        // Deliberately exhaustive: adding a variant must force a decision here.
        match self {
            Self::Implicit(_) | Self::BomDetected(_) => true,
            Self::Explicit(_) | Self::XmlDetected(_) => false,
        }
    }
}
234+
235+
/// A buffered reader that decodes the bytes of an inner reader to UTF-8.
///
/// Raw bytes are pulled from `inner`, pushed through an `encoding_rs`
/// streaming decoder and exposed as UTF-8 through the [`BufRead`] interface.
/// Malformed input is reported as an [`std::io::ErrorKind::InvalidData`] error.
#[cfg(feature = "encoding")]
struct DecodingBufReader<R> {
    /// Source of the raw, still encoded bytes.
    inner: R,
    /// Decoded UTF-8 output. Its length is the number of valid decoded bytes.
    decoded_buffer: Vec<u8>,
    /// Offset of the first byte of `decoded_buffer` that was not consumed yet.
    /// Invariant: `current_pos <= decoded_buffer.len()`.
    current_pos: usize,

    /// Streaming decoder. Named by its full path because the `encoding` module
    /// also exposes its own `Decoder` type to the reader.
    decoder: encoding_rs::Decoder,
    /// The encoding in use together with how it was chosen.
    encoding: EncodingRef,
    /// Set once the end of `inner` was signalled to `decoder`. A finished
    /// `encoding_rs` decoder must not be fed again.
    finished: bool,
}

// `BufRead` has `Read` as a supertrait, so it has to be implemented as well.
#[cfg(feature = "encoding")]
impl<R: BufRead> std::io::Read for DecodingBufReader<R> {
    /// Copies as many decoded bytes as fit into `buf` and consumes them.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let available = self.fill_buf()?;
        let amt = std::cmp::min(available.len(), buf.len());
        buf[..amt].copy_from_slice(&available[..amt]);
        self.consume(amt);
        Ok(amt)
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingBufReader<R> {
    /// Returns the decoded bytes that were not consumed yet, decoding more input
    /// from the inner reader only when the decoded buffer is exhausted.
    ///
    /// An empty slice means the inner reader reached its end.
    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
        if self.current_pos >= self.decoded_buffer.len() && !self.finished {
            self.decoded_buffer.clear();
            self.current_pos = 0;

            // A chunk may end in the middle of a multi-byte sequence and decode
            // to nothing, so keep reading until there is output or the input ends.
            while self.decoded_buffer.is_empty() && !self.finished {
                let raw = self.inner.fill_buf()?;
                let at_eof = raw.is_empty();
                // The fields are borrowed separately because `raw` keeps
                // `self.inner` borrowed while decoding.
                let read = Self::decode_into(
                    &mut self.decoder,
                    &mut self.decoded_buffer,
                    raw,
                    at_eof,
                )?;
                self.inner.consume(read);
                self.finished = at_eof;
            }
        }
        Ok(&self.decoded_buffer[self.current_pos..])
    }

    /// Marks `amt` decoded bytes as consumed, clamped to the decoded data available.
    fn consume(&mut self, amt: usize) {
        self.current_pos = std::cmp::min(
            self.current_pos.saturating_add(amt),
            self.decoded_buffer.len(),
        );
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> DecodingBufReader<R> {
    /// Creates a reader over `inner` that assumes UTF-8 until told otherwise.
    fn new(inner: R) -> Self {
        DecodingBufReader {
            inner,
            decoded_buffer: Vec::new(),
            current_pos: 0,

            decoder: UTF_8.new_decoder(),
            encoding: EncodingRef::Implicit(UTF_8),
            finished: false,
        }
    }

    /// Returns the raw (still encoded) bytes buffered by the inner reader.
    ///
    /// Takes `&mut self` because `BufRead::fill_buf` needs mutable access.
    fn get_raw_buffer(&mut self) -> std::io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Moves unconsumed data to the front of the buffer and resets the read position.
    fn shuffle(&mut self) {
        if self.current_pos == 0 {
            return;
        }

        // Drop the consumed prefix; the remaining bytes are shifted to the front.
        self.decoded_buffer.drain(..self.current_pos);
        self.current_pos = 0;
    }

    /// Shrinks the buffer's capacity towards `size`; unconsumed data is always kept.
    fn shrink_buffer(&mut self, size: usize) {
        self.shuffle();
        self.decoded_buffer.shrink_to(size);
    }

    /// Explicitly sets the encoding and replaces the decoder to match it.
    ///
    /// NOTE(review): bytes pending inside the previous decoder are discarded,
    /// so this is presumably meant to be called before decoding starts.
    fn set_encoding(&mut self, encoding: &'static Encoding) {
        self.encoding = EncodingRef::Explicit(encoding);
        // An explicit encoding must not be overridden by BOM sniffing, so a BOM
        // for this encoding is only stripped.
        self.decoder = encoding.new_decoder_with_bom_removal();
    }

    /// Decodes `data` and appends the result to the decoded buffer.
    ///
    /// An empty `data` signals the end of the input. Returns the number of
    /// bytes of `data` that were read by the decoder.
    ///
    /// # Errors
    ///
    /// Returns [`std::io::ErrorKind::InvalidData`] if `data` is malformed.
    fn feed(&mut self, data: &[u8]) -> std::io::Result<usize> {
        if self.finished {
            // The decoder has already seen the end of the stream.
            return Ok(0);
        }
        let last = data.is_empty();
        let read = Self::decode_into(&mut self.decoder, &mut self.decoded_buffer, data, last)?;
        self.finished = last;
        Ok(read)
    }

    /// Decodes `data` with `decoder`, appending the UTF-8 output to `out`.
    ///
    /// On malformed input `out` is restored to its previous length and an
    /// error is returned.
    fn decode_into(
        decoder: &mut encoding_rs::Decoder,
        out: &mut Vec<u8>,
        data: &[u8],
        last: bool,
    ) -> std::io::Result<usize> {
        let start = out.len();
        // Make room for the worst case so the decoder cannot run out of space.
        // `reserve` alone is not enough: the decoder writes into a slice, so
        // the space has to be part of the vector's length.
        let end = decoder
            .max_utf8_buffer_length(data.len())
            .and_then(|needed| needed.checked_add(start))
            .ok_or_else(|| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "input is too large to be decoded",
                )
            })?;
        out.resize(end, 0);

        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(data, &mut out[start..], last);

        if had_errors {
            out.truncate(start);
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                "malformed input for the current encoding",
            ));
        }
        // Cut off the part of the worst-case reservation that was not used.
        out.truncate(start + written);

        match result {
            encoding_rs::CoderResult::InputEmpty => Ok(read),
            encoding_rs::CoderResult::OutputFull => {
                unreachable!("This shouldn't happen, we reserved space")
            }
        }
    }
}
334+
335+
// Unit-test module for this file; it is currently empty and contains no test cases.
#[cfg(test)]
336+
mod tests {
337+
338+
}
339+
340+
341+
187342
// TODO: add some tests for functions

src/reader/mod.rs

+1-48
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::str::from_utf8;
66
use encoding_rs::{Encoding, UTF_8};
77

88
#[cfg(feature = "encoding")]
9-
use crate::encoding::detect_encoding;
9+
use crate::encoding::{detect_encoding, EncodingRef};
1010
use crate::encoding::Decoder;
1111
use crate::errors::{Error, Result};
1212
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
@@ -179,53 +179,6 @@ enum TagState {
179179
Exit,
180180
}
181181

182-
/// A reference to an encoding together with information about how it was retrieved.
183-
///
184-
/// The state transition diagram:
185-
///
186-
/// ```mermaid
187-
/// flowchart LR
188-
/// Implicit -- from_str --> Explicit
189-
/// Implicit -- BOM --> BomDetected
190-
/// Implicit -- "encoding=..." --> XmlDetected
191-
/// BomDetected -- "encoding=..." --> XmlDetected
192-
/// ```
193-
#[cfg(feature = "encoding")]
194-
#[derive(Clone, Copy)]
195-
enum EncodingRef {
196-
/// Encoding was implicitly assumed to have a specified value. It can be refined
197-
/// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
198-
Implicit(&'static Encoding),
199-
/// Encoding was explicitly set to the desired value. It cannot be changed
200-
/// nor by BOM, nor by parsing XML declaration (`<?xml encoding=... ?>`)
201-
Explicit(&'static Encoding),
202-
/// Encoding was detected from a byte order mark (BOM) or by the first bytes
203-
/// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
204-
BomDetected(&'static Encoding),
205-
/// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
206-
/// It can no longer change
207-
XmlDetected(&'static Encoding),
208-
}
209-
#[cfg(feature = "encoding")]
210-
impl EncodingRef {
211-
#[inline]
212-
fn encoding(&self) -> &'static Encoding {
213-
match self {
214-
Self::Implicit(e) => e,
215-
Self::Explicit(e) => e,
216-
Self::BomDetected(e) => e,
217-
Self::XmlDetected(e) => e,
218-
}
219-
}
220-
#[inline]
221-
fn can_be_refined(&self) -> bool {
222-
match self {
223-
Self::Implicit(_) | Self::BomDetected(_) => true,
224-
Self::Explicit(_) | Self::XmlDetected(_) => false,
225-
}
226-
}
227-
}
228-
229182
////////////////////////////////////////////////////////////////////////////////////////////////////
230183

231184
/// A low level encoding-agnostic XML event reader.

0 commit comments

Comments
 (0)