@@ -234,25 +234,27 @@ const unsafe fn resolve_error_location(st: u32, bytes: &[u8], i: usize) -> Utf8E
234
234
}
235
235
236
236
// The simpler but slower algorithm to run DFA with error handling.
237
+ // Returns the final state after execution on the whole slice.
237
238
//
238
239
// # Safety
239
240
// The caller must ensure `bytes[..i]` is a valid UTF-8 prefix and `st` is the DFA state after
240
241
// executing on `bytes[..i]`.
//
// Processing resumes at index `i` and continues byte by byte to the end of `bytes`.
// Old signature (`-` lines): the state is updated in place through `st: &mut u32` and success
// is reported as `Ok(())`. New signature (`+` lines): `st` is taken by value and the final
// state is handed back inside `Ok`.
// On the first byte whose transition is `ST_ERROR` (under `STATE_MASK`), the returned
// `Utf8Error` is built by `resolve_error_location` from the state *before* that byte and the
// index of that byte.
242
+ #[ inline]
241
243
const unsafe fn run_with_error_handling (
242
- st : & mut u32 ,
244
+ mut st : u32 ,
243
245
bytes : & [ u8 ] ,
244
246
mut i : usize ,
245
- ) -> Result < ( ) , Utf8Error > {
247
+ ) -> Result < u32 , Utf8Error > {
246
248
while i < bytes. len ( ) {
// Compute the transition into a temporary first: `st` must still hold the pre-transition
// state when an error is reported below, so it is only overwritten after the check.
247
- let new_st = next_state ( * st, bytes[ i] ) ;
249
+ let new_st = next_state ( st, bytes[ i] ) ;
248
250
if new_st & STATE_MASK == ST_ERROR {
249
251
// SAFETY: Guaranteed by the caller.
250
- return Err ( unsafe { resolve_error_location ( * st, bytes, i) } ) ;
252
+ return Err ( unsafe { resolve_error_location ( st, bytes, i) } ) ;
251
253
}
// No error: commit the new state and advance to the next byte.
252
- * st = new_st;
254
+ st = new_st;
253
255
i += 1 ;
254
256
}
// Every byte was consumed without reaching `ST_ERROR`; the final state may still be a
// non-ACCEPT one, which callers have to check themselves.
255
- Ok ( ( ) )
257
+ Ok ( st )
256
258
}
257
259
258
260
/// Walks through `v` checking that it's a valid UTF-8 sequence,
@@ -265,19 +267,15 @@ pub(super) const fn run_utf8_validation(bytes: &[u8]) -> Result<(), Utf8Error> {
265
267
266
268
// Const-evaluable validation path: runs `run_with_error_handling` over the whole slice starting
// from `ST_ACCEPT` at index 0, then inspects the final DFA state.
// Returns `Ok(())` only if the final state (under `STATE_MASK`) is `ST_ACCEPT`. Any other final
// state is turned into a `Utf8Error` located via `resolve_error_location` at `bytes.len()`, with
// `error_len` overwritten to `Utf8ErrorLen::Eof` (presumably signalling input that ended in the
// middle of a sequence; confirm against the `Utf8ErrorLen` definition).
#[ inline]
267
269
const fn run_utf8_validation_const ( bytes : & [ u8 ] ) -> Result < ( ) , Utf8Error > {
268
- let mut st = ST_ACCEPT ;
269
270
// SAFETY: Start at empty string with valid state ACCEPT.
270
- match unsafe { run_with_error_handling ( & mut st , bytes, 0 ) } {
271
+ match unsafe { run_with_error_handling ( ST_ACCEPT , bytes, 0 ) } {
271
272
// An error found while scanning is forwarded unchanged.
Err ( err) => Err ( err) ,
272
- Ok ( ( ) ) => {
273
- if st & STATE_MASK == ST_ACCEPT {
274
- Ok ( ( ) )
275
- } else {
276
- // SAFETY: `st` is the last state after execution without encountering any error.
277
- let mut err = unsafe { resolve_error_location ( st, bytes, bytes. len ( ) ) } ;
278
- err. error_len = Utf8ErrorLen :: Eof ;
279
- Err ( err)
280
- }
// New form: the final state now arrives in `Ok(st)`, so a match guard replaces the nested
// `if`/`else` that previously read the local `st` written through `&mut`.
273
+ Ok ( st) if st & STATE_MASK == ST_ACCEPT => Ok ( ( ) ) ,
274
+ Ok ( st) => {
275
+ // SAFETY: `st` is the last state after execution without encountering any error.
276
+ let mut err = unsafe { resolve_error_location ( st, bytes, bytes. len ( ) ) } ;
277
+ err. error_len = Utf8ErrorLen :: Eof ;
278
+ Err ( err)
281
279
}
282
280
}
283
281
}
@@ -288,10 +286,9 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
288
286
const ASCII_CHUNK_SIZE : usize = 16 ;
289
287
const { assert ! ( ASCII_CHUNK_SIZE % MAIN_CHUNK_SIZE == 0 ) } ;
290
288
291
- let mut st = ST_ACCEPT ;
292
289
let mut i = bytes. len ( ) % MAIN_CHUNK_SIZE ;
293
290
// SAFETY: Start at initial state ACCEPT.
294
- unsafe { run_with_error_handling ( & mut st , & bytes[ ..i] , 0 ) ? } ;
291
+ let mut st = unsafe { run_with_error_handling ( ST_ACCEPT , & bytes[ ..i] , 0 ) ? } ;
295
292
296
293
while i + MAIN_CHUNK_SIZE <= bytes. len ( ) {
297
294
// Fast path: if the current state is ACCEPT, we can skip to the next non-ASCII chunk.
@@ -326,7 +323,8 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
326
323
}
327
324
if new_st & STATE_MASK == ST_ERROR {
328
325
// SAFETY: `st` is the last state after executing `bytes[..i]` without encountering any error.
329
- return unsafe { run_with_error_handling ( & mut st, bytes, i) } ;
326
+ // And we know the next chunk must fail the validation.
327
+ return Err ( unsafe { run_with_error_handling ( st, bytes, i) . unwrap_err_unchecked ( ) } ) ;
330
328
}
331
329
332
330
st = new_st;
0 commit comments