Skip to content

Commit 11e483a

Browse files
committed
Split off most encoding-related tests to a separate file
Write a few new ones. Fix up descriptions on some decoding functions.
1 parent 687942e commit 11e483a

File tree

5 files changed

+150
-73
lines changed

5 files changed

+150
-73
lines changed

README.md

-1
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,6 @@ Note that despite not focusing on performance (there are several unnecessary cop
271271
Benchmarking is hard and the results depend on your input file and your machine.
272272

273273
Here on my particular file, quick-xml is around **50 times faster** than [xml-rs](https://crates.io/crates/xml-rs) crate.
274-
_(measurements was done while this crate named quick-xml)_
275274

276275
```
277276
// quick-xml benches

src/encoding.rs

+3-6
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,9 @@ impl Decoder {
105105
}
106106
}
107107

108-
/// Decodes the provided bytes using the specified encoding, ignoring the BOM
109-
/// if it is present in the `bytes`.
108+
/// Decodes the provided bytes using the specified encoding.
110109
///
111-
/// Returns an error in case of malformed sequences in the `bytes`.
110+
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
112111
#[cfg(feature = "encoding")]
113112
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
114113
encoding
@@ -119,7 +118,7 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b
119118
/// Decodes a slice with an unknown encoding, removing the BOM if it is present
120119
/// in the bytes.
121120
///
122-
/// Returns an error in case of malformed sequences in the `bytes`.
121+
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
123122
#[cfg(feature = "encoding")]
124123
pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
125124
if let Some(encoding) = detect_encoding(bytes) {
@@ -185,5 +184,3 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
185184
_ => None,
186185
}
187186
}
188-
189-
// TODO: add some tests for functions

tests/documents/utf8.xml

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<?xml version="1.0"?>
2+
<project name="project-name">
3+
</project>

tests/encodings.rs

+143
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
use quick_xml::events::Event;
2+
use quick_xml::Reader;
3+
4+
#[cfg(feature = "encoding")]
5+
mod decode {
6+
use encoding_rs::{UTF_16BE, UTF_16LE, UTF_8};
7+
use quick_xml::encoding::*;
8+
use std::borrow::Cow;
9+
10+
static UTF16BE_TEXT_WITH_BOM: &[u8] = include_bytes!("./documents/utf16be.xml");
11+
static UTF16LE_TEXT_WITH_BOM: &[u8] = include_bytes!("./documents/utf16le.xml");
12+
static UTF8_TEXT_WITH_BOM: &[u8] = include_bytes!("./documents/utf8.xml");
13+
14+
static UTF8_TEXT: &str = r#"<?xml version="1.0"?>
15+
<project name="project-name">
16+
</project>
17+
"#;
18+
19+
#[test]
20+
fn test_removes_bom() {
21+
// No BOM
22+
assert_eq!(
23+
decode_with_bom_removal(UTF8_TEXT.as_bytes()).unwrap(),
24+
Cow::Borrowed(UTF8_TEXT)
25+
);
26+
// BOM
27+
assert_eq!(
28+
decode_with_bom_removal(UTF8_TEXT_WITH_BOM).unwrap(),
29+
Cow::Borrowed(UTF8_TEXT)
30+
);
31+
assert_eq!(
32+
decode_with_bom_removal(UTF16BE_TEXT_WITH_BOM).unwrap(),
33+
Cow::Borrowed(UTF8_TEXT).into_owned()
34+
);
35+
assert_eq!(
36+
decode_with_bom_removal(UTF16LE_TEXT_WITH_BOM).unwrap(),
37+
Cow::Borrowed(UTF8_TEXT).into_owned()
38+
);
39+
}
40+
41+
#[test]
42+
fn test_detect_encoding() {
43+
// No BOM
44+
assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some(UTF_8));
45+
// BOM
46+
assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some(UTF_8));
47+
assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some(UTF_16BE));
48+
assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some(UTF_16LE));
49+
}
50+
51+
#[test]
52+
fn test_decode_with_bom_removal() {
53+
// No BOM
54+
assert_eq!(
55+
decode_with_bom_removal(UTF8_TEXT.as_bytes()).unwrap(),
56+
UTF8_TEXT
57+
);
58+
// BOM
59+
assert_eq!(
60+
decode_with_bom_removal(UTF8_TEXT_WITH_BOM).unwrap(),
61+
UTF8_TEXT
62+
);
63+
assert_eq!(
64+
decode_with_bom_removal(UTF16BE_TEXT_WITH_BOM).unwrap(),
65+
UTF8_TEXT
66+
);
67+
assert_eq!(
68+
decode_with_bom_removal(UTF16LE_TEXT_WITH_BOM).unwrap(),
69+
UTF8_TEXT
70+
);
71+
}
72+
}
73+
74+
#[test]
75+
#[cfg(feature = "encoding")]
76+
fn test_koi8_r_encoding() {
77+
let src = include_bytes!("documents/opennews_all.rss").as_ref();
78+
let mut buf = vec![];
79+
let mut r = Reader::from_reader(src);
80+
r.trim_text(true).expand_empty_elements(false);
81+
loop {
82+
match r.read_event_into(&mut buf) {
83+
Ok(Event::Text(e)) => {
84+
e.unescape().unwrap();
85+
}
86+
Ok(Event::Eof) => break,
87+
_ => (),
88+
}
89+
}
90+
}
91+
92+
#[test]
93+
#[cfg(feature = "encoding")]
94+
fn fuzz_53() {
95+
use std::io::Cursor;
96+
97+
let data: &[u8] = b"\xe9\x00\x00\x00\x00\x00\x00\x00\x00\
98+
\x00\x00\x00\x00\n(\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\
99+
\x00<>\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00<<\x00\x00\x00";
100+
let cursor = Cursor::new(data);
101+
let mut reader = Reader::from_reader(cursor);
102+
let mut buf = vec![];
103+
loop {
104+
match reader.read_event_into(&mut buf) {
105+
Ok(Event::Eof) | Err(..) => break,
106+
_ => buf.clear(),
107+
}
108+
}
109+
}
110+
111+
#[test]
112+
#[cfg(feature = "encoding")]
113+
fn fuzz_101() {
114+
use std::io::Cursor;
115+
116+
let data: &[u8] = b"\x00\x00<\x00\x00\x0a>&#44444444401?#\x0a413518\
117+
#\x0a\x0a\x0a;<:<)(<:\x0a\x0a\x0a\x0a;<:\x0a\x0a\
118+
<:\x0a\x0a\x0a\x0a\x0a<\x00*\x00\x00\x00\x00";
119+
let cursor = Cursor::new(data);
120+
let mut reader = Reader::from_reader(cursor);
121+
let mut buf = vec![];
122+
loop {
123+
match reader.read_event_into(&mut buf) {
124+
Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
125+
for a in e.attributes() {
126+
if a.ok()
127+
.map_or(true, |a| a.decode_and_unescape_value(&reader).is_err())
128+
{
129+
break;
130+
}
131+
}
132+
}
133+
Ok(Event::Text(e)) => {
134+
if e.unescape().is_err() {
135+
break;
136+
}
137+
}
138+
Ok(Event::Eof) | Err(..) => break,
139+
_ => (),
140+
}
141+
buf.clear();
142+
}
143+
}

tests/test.rs

+1-66
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use quick_xml::events::Event::*;
33
use quick_xml::name::QName;
44
use quick_xml::reader::Reader;
55
use quick_xml::Error;
6-
use std::{borrow::Cow, io::Cursor};
6+
use std::borrow::Cow;
77

88
#[cfg(feature = "serialize")]
99
use serde::{Deserialize, Serialize};
@@ -92,40 +92,6 @@ fn test_comment_starting_with_gt() {
9292
}
9393
}
9494

95-
#[test]
96-
#[cfg(feature = "encoding")]
97-
fn test_koi8_r_encoding() {
98-
let src = include_bytes!("documents/opennews_all.rss").as_ref();
99-
let mut buf = vec![];
100-
let mut r = Reader::from_reader(src);
101-
r.trim_text(true).expand_empty_elements(false);
102-
loop {
103-
match r.read_event_into(&mut buf) {
104-
Ok(Text(e)) => {
105-
e.unescape().unwrap();
106-
}
107-
Ok(Eof) => break,
108-
_ => (),
109-
}
110-
}
111-
}
112-
113-
#[test]
114-
fn fuzz_53() {
115-
let data: &[u8] = b"\xe9\x00\x00\x00\x00\x00\x00\x00\x00\
116-
\x00\x00\x00\x00\n(\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\
117-
\x00<>\x00\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00<<\x00\x00\x00";
118-
let cursor = Cursor::new(data);
119-
let mut reader = Reader::from_reader(cursor);
120-
let mut buf = vec![];
121-
loop {
122-
match reader.read_event_into(&mut buf) {
123-
Ok(Eof) | Err(..) => break,
124-
_ => buf.clear(),
125-
}
126-
}
127-
}
128-
12995
#[test]
13096
fn test_issue94() {
13197
let data = br#"<Run>
@@ -141,37 +107,6 @@ fn test_issue94() {
141107
}
142108
}
143109

144-
#[test]
145-
fn fuzz_101() {
146-
let data: &[u8] = b"\x00\x00<\x00\x00\x0a>&#44444444401?#\x0a413518\
147-
#\x0a\x0a\x0a;<:<)(<:\x0a\x0a\x0a\x0a;<:\x0a\x0a\
148-
<:\x0a\x0a\x0a\x0a\x0a<\x00*\x00\x00\x00\x00";
149-
let cursor = Cursor::new(data);
150-
let mut reader = Reader::from_reader(cursor);
151-
let mut buf = vec![];
152-
loop {
153-
match reader.read_event_into(&mut buf) {
154-
Ok(Start(e)) | Ok(Empty(e)) => {
155-
for a in e.attributes() {
156-
if a.ok()
157-
.map_or(true, |a| a.decode_and_unescape_value(&reader).is_err())
158-
{
159-
break;
160-
}
161-
}
162-
}
163-
Ok(Text(e)) => {
164-
if e.unescape().is_err() {
165-
break;
166-
}
167-
}
168-
Ok(Eof) | Err(..) => break,
169-
_ => (),
170-
}
171-
buf.clear();
172-
}
173-
}
174-
175110
#[test]
176111
fn test_no_trim() {
177112
let mut reader = Reader::from_str(" <tag> text </tag> ");

0 commit comments

Comments
 (0)