1
1
use super :: from_utf8_unchecked;
2
2
use super :: validations:: utf8_char_width;
3
- use crate :: fmt;
4
3
use crate :: fmt:: { Formatter , Write } ;
5
4
use crate :: iter:: FusedIterator ;
5
+ use crate :: { fmt, slice} ;
6
6
7
7
impl [ u8 ] {
8
8
/// Creates an iterator over the contiguous valid UTF-8 ranges of this
@@ -152,7 +152,7 @@ impl fmt::Debug for Debug<'_> {
152
152
///
153
153
/// See the [`Utf8Chunk`] type for documentation of the items yielded by this iterator.
154
154
///
155
- /// [byteslice]: slice
155
+ /// [byteslice]: prim@ slice
156
156
/// [`from_utf8`]: super::from_utf8
157
157
///
158
158
/// # Examples
@@ -197,86 +197,29 @@ impl<'a> Iterator for Utf8Chunks<'a> {
197
197
return None ;
198
198
}
199
199
200
- const TAG_CONT_U8 : u8 = 128 ;
201
- fn safe_get ( xs : & [ u8 ] , i : usize ) -> u8 {
202
- * xs. get ( i) . unwrap_or ( & 0 )
203
- }
204
-
205
- let mut i = 0 ;
206
- let mut valid_up_to = 0 ;
207
- while i < self . source . len ( ) {
208
- // SAFETY: `i < self.source.len()` per previous line.
209
- // For some reason the following are both significantly slower:
210
- // while let Some(&byte) = self.source.get(i) {
211
- // while let Some(byte) = self.source.get(i).copied() {
212
- let byte = unsafe { * self . source . get_unchecked ( i) } ;
213
- i += 1 ;
214
-
215
- if byte < 128 {
216
- // This could be a `1 => ...` case in the match below, but for
217
- // the common case of all-ASCII inputs, we bypass loading the
218
- // sizeable UTF8_CHAR_WIDTH table into cache.
219
- } else {
220
- let w = utf8_char_width ( byte) ;
221
-
222
- match w {
223
- 2 => {
224
- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
225
- break ;
226
- }
227
- i += 1 ;
228
- }
229
- 3 => {
230
- match ( byte, safe_get ( self . source , i) ) {
231
- ( 0xE0 , 0xA0 ..=0xBF ) => ( ) ,
232
- ( 0xE1 ..=0xEC , 0x80 ..=0xBF ) => ( ) ,
233
- ( 0xED , 0x80 ..=0x9F ) => ( ) ,
234
- ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => ( ) ,
235
- _ => break ,
236
- }
237
- i += 1 ;
238
- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
239
- break ;
240
- }
241
- i += 1 ;
242
- }
243
- 4 => {
244
- match ( byte, safe_get ( self . source , i) ) {
245
- ( 0xF0 , 0x90 ..=0xBF ) => ( ) ,
246
- ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) => ( ) ,
247
- ( 0xF4 , 0x80 ..=0x8F ) => ( ) ,
248
- _ => break ,
249
- }
250
- i += 1 ;
251
- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
252
- break ;
253
- }
254
- i += 1 ;
255
- if safe_get ( self . source , i) & 192 != TAG_CONT_U8 {
256
- break ;
257
- }
258
- i += 1 ;
259
- }
260
- _ => break ,
261
- }
200
+ let mut iter = self . source . iter ( ) ;
201
+ let mut len_after_valid = iter. len ( ) ;
202
+ while !iter. is_empty ( ) {
203
+ if !advance_utf8 ( & mut iter) {
204
+ // Stop at the first invalid sequence.
205
+ break ;
262
206
}
263
-
264
- valid_up_to = i;
207
+ len_after_valid = iter. len ( ) ;
265
208
}
209
+ let valid_up_to = self . source . len ( ) - len_after_valid;
210
+ let inspected_len = self . source . len ( ) - iter. len ( ) ;
266
211
267
- // SAFETY: `i <= self.source.len()` because it is only ever incremented
268
- // via `i += 1` and in between every single one of those increments, `i`
269
- // is compared against `self.source.len()`. That happens either
270
- // literally by `i < self.source.len()` in the while-loop's condition,
271
- // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
272
- // loop is terminated as soon as the latest `i += 1` has made `i` no
273
- // longer less than `self.source.len()`, which means it'll be at most
274
- // equal to `self.source.len()`.
275
- let ( inspected, remaining) = unsafe { self . source . split_at_unchecked ( i) } ;
212
+ // SAFETY: The length of the remaining bytes in `iter` only decreases,
213
+ // so `iter.len() <= self.source.len()`. The length of inspected bytes,
214
+ // `self.source.len() - iter.len()`, then only increases and can be at
215
+ // most `self.source.len()`.
216
+ let ( inspected, remaining) = unsafe { self . source . split_at_unchecked ( inspected_len) } ;
276
217
self . source = remaining;
277
218
278
- // SAFETY: `valid_up_to <= i` because it is only ever assigned via
279
- // `valid_up_to = i` and `i` only increases.
219
+ // SAFETY: Since `iter.len()` only decreases and `len_after_valid` is
220
+ // the value of `iter.len()` from the previous iteration, it follows
221
+ // that `len_after_valid >= iter.len()`, which is equivalent to
222
+ // `valid_up_to <= inspected_len` by simple substitution.
280
223
let ( valid, invalid) = unsafe { inspected. split_at_unchecked ( valid_up_to) } ;
281
224
282
225
Some ( Utf8Chunk {
@@ -296,3 +239,65 @@ impl fmt::Debug for Utf8Chunks<'_> {
296
239
f. debug_struct ( "Utf8Chunks" ) . field ( "source" , & self . debug ( ) ) . finish ( )
297
240
}
298
241
}
242
+
243
+ /// Advances the byte iterator by one UTF-8 scalar value, allowing invalid UTF-8
244
+ /// sequences. When the current sequence is invalid, the maximal prefix of a
245
+ /// valid UTF-8 code unit sequence is consumed. Returns whether the sequence is
246
+ /// a valid Unicode scalar value.
247
+ #[ inline]
248
+ fn advance_utf8 ( bytes : & mut slice:: Iter < ' _ , u8 > ) -> bool {
249
+ const TAG_CONT_U8 : u8 = 128 ;
250
+ #[ inline]
251
+ fn peek ( bytes : & slice:: Iter < ' _ , u8 > ) -> u8 {
252
+ * bytes. clone ( ) . next ( ) . unwrap_or ( & 0 )
253
+ }
254
+
255
+ let Some ( & byte) = bytes. next ( ) else { return false } ;
256
+ if byte < 128 {
257
+ // This could be a `1 => ...` case in the match below, but for the
258
+ // common case of all-ASCII inputs, we bypass loading the sizeable
259
+ // UTF8_CHAR_WIDTH table into cache.
260
+ } else {
261
+ match utf8_char_width ( byte) {
262
+ 2 => {
263
+ if peek ( bytes) & 192 != TAG_CONT_U8 {
264
+ return false ;
265
+ }
266
+ bytes. next ( ) ;
267
+ }
268
+ 3 => {
269
+ match ( byte, peek ( bytes) ) {
270
+ ( 0xE0 , 0xA0 ..=0xBF ) => { }
271
+ ( 0xE1 ..=0xEC , 0x80 ..=0xBF ) => { }
272
+ ( 0xED , 0x80 ..=0x9F ) => { }
273
+ ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
274
+ _ => return false ,
275
+ }
276
+ bytes. next ( ) ;
277
+ if peek ( bytes) & 192 != TAG_CONT_U8 {
278
+ return false ;
279
+ }
280
+ bytes. next ( ) ;
281
+ }
282
+ 4 => {
283
+ match ( byte, peek ( bytes) ) {
284
+ ( 0xF0 , 0x90 ..=0xBF ) => { }
285
+ ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) => { }
286
+ ( 0xF4 , 0x80 ..=0x8F ) => { }
287
+ _ => return false ,
288
+ }
289
+ bytes. next ( ) ;
290
+ if peek ( bytes) & 192 != TAG_CONT_U8 {
291
+ return false ;
292
+ }
293
+ bytes. next ( ) ;
294
+ if peek ( bytes) & 192 != TAG_CONT_U8 {
295
+ return false ;
296
+ }
297
+ bytes. next ( ) ;
298
+ }
299
+ _ => return false ,
300
+ }
301
+ }
302
+ true
303
+ }
0 commit comments