Skip to content

Commit 44ccc3b

Browse files
committed
Add is_ascii function optimized for x86-64 for [u8]
The new `is_ascii` function is optimized to use the `pmovmskb` vector instruction, which extracts the high bit of every byte lane. A set high bit is exactly what makes a byte non-ASCII, so many bytes can be checked for ASCII validity at a time. This instruction does not exist on other platforms, where this code structure is likely to regress performance, so the new function is gated to `all(target_arch = "x86_64", target_feature = "sse2")`. Remove the `crate::mem` import for functions which now exist in the prelude. Add a codegen test.
1 parent 2ef16f6 commit 44ccc3b

File tree

2 files changed

+73
-12
lines changed

2 files changed

+73
-12
lines changed

library/core/src/slice/ascii.rs

+57-12
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use core::ascii::EscapeDefault;
44

55
use crate::fmt::{self, Write};
6-
use crate::{ascii, iter, mem, ops};
6+
use crate::{ascii, iter, ops};
77

88
#[cfg(not(test))]
99
impl [u8] {
@@ -297,14 +297,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
297297
}
298298
}
299299

300-
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
301-
/// from `../str/mod.rs`, which does something similar for utf8 validation.
302-
#[inline]
303-
const fn contains_nonascii(v: usize) -> bool {
304-
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
305-
(NONASCII_MASK & v) != 0
306-
}
307-
308300
/// ASCII test *without* the chunk-at-a-time optimizations.
309301
///
310302
/// This is carefully structured to produce nice small code -- it's smaller in
@@ -335,9 +327,17 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
335327
///
336328
/// If any of these loads produces something for which `contains_nonascii`
337329
/// (nested inside `is_ascii` below) returns true, then we know the answer is false.
330+
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
338331
#[inline]
339332
const fn is_ascii(s: &[u8]) -> bool {
340-
const USIZE_SIZE: usize = mem::size_of::<usize>();
333+
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
334+
/// from `../str/mod.rs`, which does something similar for utf8 validation.
335+
const fn contains_nonascii(v: usize) -> bool {
336+
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
337+
(NONASCII_MASK & v) != 0
338+
}
339+
340+
const USIZE_SIZE: usize = size_of::<usize>();
341341

342342
let len = s.len();
343343
let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
@@ -347,7 +347,7 @@ const fn is_ascii(s: &[u8]) -> bool {
347347
//
348348
// We also do this for architectures where `size_of::<usize>()` isn't
349349
// sufficient alignment for `usize`, because it's a weird edge case.
350-
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
350+
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
351351
return is_ascii_simple(s);
352352
}
353353

@@ -381,7 +381,7 @@ const fn is_ascii(s: &[u8]) -> bool {
381381
// have alignment information it should have given a `usize::MAX` for
382382
// `align_offset` earlier, sending things through the scalar path instead of
383383
// this one, so this check should pass if it's reachable.
384-
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
384+
debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
385385

386386
// Read subsequent words until the last aligned word, excluding the last
387387
// aligned word by itself to be done in tail check later, to ensure that
@@ -420,3 +420,48 @@ const fn is_ascii(s: &[u8]) -> bool {
420420

421421
!contains_nonascii(last_word)
422422
}
423+
424+
/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
425+
/// platforms. Returns `true` only if every byte of `bytes` is ASCII (< 128).
426+
///
427+
/// Other platforms are not likely to benefit from this code structure, so they
428+
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
429+
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
430+
#[inline]
431+
const fn is_ascii(bytes: &[u8]) -> bool {
432+
// Process chunks of 32 bytes at a time in the fast path to enable
433+
// auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
434+
// can be OR'd together and then the resulting vector can be tested for
435+
// non-ASCII bytes.
436+
const CHUNK_SIZE: usize = 32;
437+
438+
let mut i = 0;
439+
440+
while i + CHUNK_SIZE <= bytes.len() {
441+
let chunk_end = i + CHUNK_SIZE;
442+
443+
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
444+
// creates a mask from the most significant bit of each byte.
445+
// ASCII bytes are less than 128 (0x80), so their most significant
446+
// bit is unset.
447+
let mut count = 0;
448+
while i < chunk_end {
449+
count += bytes[i].is_ascii() as u8;
450+
i += 1;
451+
}
452+
453+
// Each ASCII byte adds 1 to `count`, so an all-ASCII chunk yields exactly `CHUNK_SIZE` (32, which fits in `u8`); anything less means a non-ASCII byte was seen.
454+
if count != CHUNK_SIZE as u8 {
455+
return false;
456+
}
457+
}
458+
459+
// Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
460+
let mut is_ascii = true;
461+
while i < bytes.len() {
462+
is_ascii &= bytes[i].is_ascii();
463+
i += 1;
464+
}
465+
466+
is_ascii
467+
}

tests/codegen/slice-is-ascii.rs

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
//@ only-x86_64
2+
//@ compile-flags: -C opt-level=3
3+
#![crate_type = "lib"]
4+
5+
/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
6+
/// Platforms lacking an equivalent instruction use other techniques for
7+
/// optimizing `is_ascii`. The FileCheck directives below match the vectorized IR shape (a 32-byte load, a signed less-than compare, a bitcast of the per-lane result, and an `i32` equality test) rather than the machine instruction itself.
8+
// CHECK-LABEL: @is_ascii_autovectorized
9+
#[no_mangle]
10+
pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
11+
// CHECK: load <32 x i8>
12+
// CHECK-NEXT: icmp slt <32 x i8>
13+
// CHECK-NEXT: bitcast <32 x i1>
14+
// CHECK-NEXT: icmp eq i32
15+
s.is_ascii()
16+
}

0 commit comments

Comments
 (0)