Skip to content

Commit 64a0a79

Browse files
committed
Extract invalid UTF-8 iteration for reuse
Benchmark shows that performance is unaffected (`x bench library/alloc --stage 0 --test-args from_utf8_lossy`).
1 parent 8247948 commit 64a0a79

File tree

1 file changed

+82
-77
lines changed

1 file changed

+82
-77
lines changed

library/core/src/str/lossy.rs

+82-77
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
use super::from_utf8_unchecked;
22
use super::validations::utf8_char_width;
3-
use crate::fmt;
43
use crate::fmt::{Formatter, Write};
54
use crate::iter::FusedIterator;
5+
use crate::{fmt, slice};
66

77
impl [u8] {
88
/// Creates an iterator over the contiguous valid UTF-8 ranges of this
@@ -152,7 +152,7 @@ impl fmt::Debug for Debug<'_> {
152152
///
153153
/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
154154
///
155-
/// [byteslice]: slice
155+
/// [byteslice]: prim@slice
156156
/// [`from_utf8`]: super::from_utf8
157157
///
158158
/// # Examples
@@ -197,86 +197,29 @@ impl<'a> Iterator for Utf8Chunks<'a> {
197197
return None;
198198
}
199199

200-
const TAG_CONT_U8: u8 = 128;
201-
fn safe_get(xs: &[u8], i: usize) -> u8 {
202-
*xs.get(i).unwrap_or(&0)
203-
}
204-
205-
let mut i = 0;
206-
let mut valid_up_to = 0;
207-
while i < self.source.len() {
208-
// SAFETY: `i < self.source.len()` per previous line.
209-
// For some reason the following are both significantly slower:
210-
// while let Some(&byte) = self.source.get(i) {
211-
// while let Some(byte) = self.source.get(i).copied() {
212-
let byte = unsafe { *self.source.get_unchecked(i) };
213-
i += 1;
214-
215-
if byte < 128 {
216-
// This could be a `1 => ...` case in the match below, but for
217-
// the common case of all-ASCII inputs, we bypass loading the
218-
// sizeable UTF8_CHAR_WIDTH table into cache.
219-
} else {
220-
let w = utf8_char_width(byte);
221-
222-
match w {
223-
2 => {
224-
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
225-
break;
226-
}
227-
i += 1;
228-
}
229-
3 => {
230-
match (byte, safe_get(self.source, i)) {
231-
(0xE0, 0xA0..=0xBF) => (),
232-
(0xE1..=0xEC, 0x80..=0xBF) => (),
233-
(0xED, 0x80..=0x9F) => (),
234-
(0xEE..=0xEF, 0x80..=0xBF) => (),
235-
_ => break,
236-
}
237-
i += 1;
238-
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
239-
break;
240-
}
241-
i += 1;
242-
}
243-
4 => {
244-
match (byte, safe_get(self.source, i)) {
245-
(0xF0, 0x90..=0xBF) => (),
246-
(0xF1..=0xF3, 0x80..=0xBF) => (),
247-
(0xF4, 0x80..=0x8F) => (),
248-
_ => break,
249-
}
250-
i += 1;
251-
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
252-
break;
253-
}
254-
i += 1;
255-
if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
256-
break;
257-
}
258-
i += 1;
259-
}
260-
_ => break,
261-
}
200+
let mut iter = self.source.iter();
201+
let mut len_after_valid = iter.len();
202+
while !iter.is_empty() {
203+
if !advance_utf8(&mut iter) {
204+
// Stop at the first invalid sequence.
205+
break;
262206
}
263-
264-
valid_up_to = i;
207+
len_after_valid = iter.len();
265208
}
209+
let valid_up_to = self.source.len() - len_after_valid;
210+
let inspected_len = self.source.len() - iter.len();
266211

267-
// SAFETY: `i <= self.source.len()` because it is only ever incremented
268-
// via `i += 1` and in between every single one of those increments, `i`
269-
// is compared against `self.source.len()`. That happens either
270-
// literally by `i < self.source.len()` in the while-loop's condition,
271-
// or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
272-
// loop is terminated as soon as the latest `i += 1` has made `i` no
273-
// longer less than `self.source.len()`, which means it'll be at most
274-
// equal to `self.source.len()`.
275-
let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
212+
// SAFETY: The length of the remaining bytes in `iter` only decreases,
213+
// so `iter.len() <= self.source.len()`. The length of inspected bytes,
214+
// `self.source.len() - iter.len()`, then only increases and can be at
215+
// most `self.source.len()`.
216+
let (inspected, remaining) = unsafe { self.source.split_at_unchecked(inspected_len) };
276217
self.source = remaining;
277218

278-
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
279-
// `valid_up_to = i` and `i` only increases.
219+
// SAFETY: Since `iter.len()` only decreases and `len_after_valid` is
220+
// the value of `iter.len()` from the previous iteration, it follows
221+
// that `len_after_valid >= iter.len()`, which is equivalent to
222+
// `valid_up_to <= inspected_len` by simple substitution.
280223
let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
281224

282225
Some(Utf8Chunk {
@@ -296,3 +239,65 @@ impl fmt::Debug for Utf8Chunks<'_> {
296239
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
297240
}
298241
}
242+
243+
/// Advances the byte iterator by one UTF-8 scalar value, allowing invalid UTF-8
244+
/// sequences. When the current sequence is invalid, the maximal prefix of a
245+
/// valid UTF-8 code unit sequence is consumed. Returns whether the sequence is
246+
/// a valid Unicode scalar value.
247+
#[inline]
248+
fn advance_utf8(bytes: &mut slice::Iter<'_, u8>) -> bool {
249+
const TAG_CONT_U8: u8 = 128;
250+
#[inline]
251+
fn peek(bytes: &slice::Iter<'_, u8>) -> u8 {
252+
*bytes.clone().next().unwrap_or(&0)
253+
}
254+
255+
let Some(&byte) = bytes.next() else { return false };
256+
if byte < 128 {
257+
// This could be a `1 => ...` case in the match below, but for the
258+
// common case of all-ASCII inputs, we bypass loading the sizeable
259+
// UTF8_CHAR_WIDTH table into cache.
260+
} else {
261+
match utf8_char_width(byte) {
262+
2 => {
263+
if peek(bytes) & 192 != TAG_CONT_U8 {
264+
return false;
265+
}
266+
bytes.next();
267+
}
268+
3 => {
269+
match (byte, peek(bytes)) {
270+
(0xE0, 0xA0..=0xBF) => {}
271+
(0xE1..=0xEC, 0x80..=0xBF) => {}
272+
(0xED, 0x80..=0x9F) => {}
273+
(0xEE..=0xEF, 0x80..=0xBF) => {}
274+
_ => return false,
275+
}
276+
bytes.next();
277+
if peek(bytes) & 192 != TAG_CONT_U8 {
278+
return false;
279+
}
280+
bytes.next();
281+
}
282+
4 => {
283+
match (byte, peek(bytes)) {
284+
(0xF0, 0x90..=0xBF) => {}
285+
(0xF1..=0xF3, 0x80..=0xBF) => {}
286+
(0xF4, 0x80..=0x8F) => {}
287+
_ => return false,
288+
}
289+
bytes.next();
290+
if peek(bytes) & 192 != TAG_CONT_U8 {
291+
return false;
292+
}
293+
bytes.next();
294+
if peek(bytes) & 192 != TAG_CONT_U8 {
295+
return false;
296+
}
297+
bytes.next();
298+
}
299+
_ => return false,
300+
}
301+
}
302+
true
303+
}

0 commit comments

Comments
 (0)