@@ -135,7 +135,11 @@ where
135
135
// and it becomes free on modern ISAs, including x86, x86_64 and ARM.
136
136
//
137
137
// ```
138
- // // shrx state, qword ptr [table_addr + 8 * byte], state # On x86-64-v3
138
+ // // On x86-64-v3: (more instructions on ordinary x86_64 but with the same cycles-per-byte)
139
+ // // shrx state, qword ptr [TRANS_TABLE + 4 * byte], state
140
+ // // On aarch64/ARMv8:
141
+ // // ldr temp, [TRANS_TABLE, byte, lsl 2]
142
+ // // lsr state, temp, state
139
143
// state = TRANS_TABLE[byte].wrapping_shr(state);
140
144
// ```
141
145
//
@@ -290,27 +294,28 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
290
294
// SAFETY: Start at initial state ACCEPT.
291
295
let mut st = unsafe { run_with_error_handling ( ST_ACCEPT , & bytes[ ..i] , 0 ) ? } ;
292
296
293
- while i + MAIN_CHUNK_SIZE <= bytes. len ( ) {
297
+ while i < bytes. len ( ) {
294
298
// Fast path: if the current state is ACCEPT, we can skip to the next non-ASCII chunk.
295
299
// We also did a quick inspection on the first byte to avoid getting into this path at all
296
300
// when handling strings with almost no ASCII, e.g. Chinese scripts.
297
301
// SAFETY: `i` is in bounds.
298
- if st == ST_ACCEPT && unsafe { * bytes. get_unchecked ( i) } < 0x80 {
302
+ if st == ST_ACCEPT && unsafe { bytes. get_unchecked ( i) . is_ascii ( ) } {
299
303
// SAFETY: `i` is in bounds.
300
304
let rest = unsafe { bytes. get_unchecked ( i..) } ;
301
305
let mut ascii_chunks = rest. array_chunks :: < ASCII_CHUNK_SIZE > ( ) ;
302
306
let ascii_rest_chunk_cnt = ascii_chunks. len ( ) ;
303
307
let pos = ascii_chunks
304
308
. position ( |chunk| {
305
- // NB. Always traverse the whole chunk to enable vectorization, instead of `.any()`.
306
- // LLVM will be fear of memory traps and fallback if loop has short-circuit.
309
+ // NB. Always traverse the whole chunk instead of `.all()`, to persuade LLVM to
310
+ // vectorize this check.
311
+ // We also do not use `<[u8]>::is_ascii` which is unnecessarily complex here.
307
312
#[ expect( clippy:: unnecessary_fold) ]
308
- let has_non_ascii = chunk. iter ( ) . fold ( false , |acc, & b| acc || ( b >= 0x80 ) ) ;
309
- has_non_ascii
313
+ let all_ascii = chunk. iter ( ) . fold ( true , |acc, b| acc && b . is_ascii ( ) ) ;
314
+ !all_ascii
310
315
} )
311
316
. unwrap_or ( ascii_rest_chunk_cnt) ;
312
317
i += pos * ASCII_CHUNK_SIZE ;
313
- if i + MAIN_CHUNK_SIZE > bytes. len ( ) {
318
+ if i >= bytes. len ( ) {
314
319
break ;
315
320
}
316
321
}
0 commit comments