Skip to content

Commit bc78676

Browse files
committed
Optimize is_ascii for str and [u8] further
Replace the existing optimized function with one that enables the use of vector instructions. This is especially beneficial on x86-64, where `pmovmskb` can be emitted with careful structuring of the code. That instruction can detect non-ASCII bytes one vector-register width at a time, instead of the current `usize`-at-a-time check. The result is a completely safe implementation. Remove the previous implementation's alignment test.
1 parent 8caa7d6 commit bc78676

File tree

2 files changed

+29
-152
lines changed

2 files changed

+29
-152
lines changed

library/core/src/slice/ascii.rs

+29-98
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use core::ascii::EscapeDefault;
44

55
use crate::fmt::{self, Write};
6-
use crate::{ascii, iter, mem, ops};
6+
use crate::{ascii, iter, ops};
77

88
#[cfg(not(test))]
99
impl [u8] {
@@ -297,14 +297,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
297297
}
298298
}
299299

300-
/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
301-
/// from `../str/mod.rs`, which does something similar for utf8 validation.
302-
#[inline]
303-
const fn contains_nonascii(v: usize) -> bool {
304-
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
305-
(NONASCII_MASK & v) != 0
306-
}
307-
308300
/// ASCII test *without* the chunk-at-a-time optimizations.
309301
///
310302
/// This is carefully structured to produce nice small code -- it's smaller in
@@ -323,100 +315,39 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
323315
bytes.is_empty()
324316
}
325317

326-
/// Optimized ASCII test that will use usize-at-a-time operations instead of
327-
/// byte-at-a-time operations (when possible).
328-
///
329-
/// The algorithm we use here is pretty simple. If `s` is too short, we just
330-
/// check each byte and be done with it. Otherwise:
331-
///
332-
/// - Read the first word with an unaligned load.
333-
/// - Align the pointer, read subsequent words until end with aligned loads.
334-
/// - Read the last `usize` from `s` with an unaligned load.
335-
///
336-
/// If any of these loads produces something for which `contains_nonascii`
337-
/// (above) returns true, then we know the answer is false.
338318
#[inline]
339-
const fn is_ascii(s: &[u8]) -> bool {
340-
const USIZE_SIZE: usize = mem::size_of::<usize>();
341-
342-
let len = s.len();
343-
let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
344-
345-
// If we wouldn't gain anything from the word-at-a-time implementation, fall
346-
// back to a scalar loop.
347-
//
348-
// We also do this for architectures where `size_of::<usize>()` isn't
349-
// sufficient alignment for `usize`, because it's a weird edge case.
350-
if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
351-
return is_ascii_simple(s);
352-
}
353-
354-
// We always read the first word unaligned, which means `align_offset` is
355-
// 0, we'd read the same value again for the aligned read.
356-
let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };
357-
358-
let start = s.as_ptr();
359-
// SAFETY: We verified `len >= USIZE_SIZE` above (shorter slices took the scalar path).
360-
let first_word = unsafe { (start as *const usize).read_unaligned() };
361-
362-
if contains_nonascii(first_word) {
363-
return false;
364-
}
365-
// We checked this above, somewhat implicitly. Note that `offset_to_aligned`
366-
// is either `align_offset` or `USIZE_SIZE`, both of which are explicitly checked
367-
// above.
368-
debug_assert!(offset_to_aligned <= len);
369-
370-
// SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the
371-
// middle chunk of the slice.
372-
let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };
373-
374-
// `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
375-
let mut byte_pos = offset_to_aligned;
376-
377-
// Paranoia check about alignment, since we're about to do a bunch of
378-
// aligned loads. In practice this should be impossible barring a bug in
379-
// `align_offset` though.
380-
// While this method is allowed to spuriously fail in CTFE, if it doesn't
381-
// have alignment information it should have given a `usize::MAX` for
382-
// `align_offset` earlier, sending things through the scalar path instead of
383-
// this one, so this check should pass if it's reachable.
384-
debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
385-
386-
// Read subsequent words until the last aligned word, excluding the last
387-
// aligned word by itself to be done in tail check later, to ensure that
388-
// tail is always one `usize` at most, avoiding an extra branch for `byte_pos == len`.
389-
while byte_pos < len - USIZE_SIZE {
390-
// Sanity check that the read is in bounds
391-
debug_assert!(byte_pos + USIZE_SIZE <= len);
392-
// And that our assumptions about `byte_pos` hold.
393-
debug_assert!(matches!(
394-
word_ptr.cast::<u8>().guaranteed_eq(start.wrapping_add(byte_pos)),
395-
// These are from the same allocation, so will hopefully always be
396-
// known to match even in CTFE, but if it refuses to compare them
397-
// that's ok since it's just a debug check anyway.
398-
None | Some(true),
399-
));
319+
const fn is_ascii(bytes: &[u8]) -> bool {
320+
// Constant chosen to enable `pmovmskb` instruction on x86-64
321+
const N: usize = 32;
322+
323+
let mut i = 0;
324+
325+
while i + N <= bytes.len() {
326+
let chunk_end = i + N;
327+
328+
// Get LLVM to produce a `pmovmskb` instruction on x86-64 which
329+
// creates a mask from the most significant bit of each byte.
330+
// ASCII bytes are less than 128 (0x80), so their most significant
331+
// bit is unset. Thus, detecting non-ASCII bytes can be done in one
332+
// instruction.
333+
let mut count = 0;
334+
while i < chunk_end {
335+
count += (bytes[i] <= 127) as u8;
336+
i += 1;
337+
}
400338

401-
// SAFETY: We know `word_ptr` is properly aligned (because of
402-
// `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
403-
let word = unsafe { word_ptr.read() };
404-
if contains_nonascii(word) {
339+
// All bytes should be <= 127 so count is equal to chunk size.
340+
if count != N as u8 {
405341
return false;
406342
}
407-
408-
byte_pos += USIZE_SIZE;
409-
// SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
410-
// after this `add`, `word_ptr` will be at most one-past-the-end.
411-
word_ptr = unsafe { word_ptr.add(1) };
412343
}
413344

414-
// Sanity check to ensure there really is only one `usize` left. This should
415-
// be guaranteed by our loop condition.
416-
debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
417-
418-
// SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
419-
let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };
345+
// Process the remaining `bytes.len() % N` bytes.
346+
let mut is_ascii = true;
347+
while i < bytes.len() {
348+
is_ascii &= bytes[i] <= 127;
349+
i += 1;
350+
}
420351

421-
!contains_nonascii(last_word)
352+
is_ascii
422353
}

library/core/tests/ascii.rs

-54
Original file line numberDiff line numberDiff line change
@@ -361,60 +361,6 @@ fn test_is_ascii_control() {
361361
);
362362
}
363363

364-
// `is_ascii` does a good amount of pointer manipulation and has
365-
// alignment-dependent computation. This is all sanity-checked via
366-
// `debug_assert!`s, so we test various sizes/alignments thoroughly versus an
367-
// "obviously correct" baseline function.
368-
#[test]
369-
fn test_is_ascii_align_size_thoroughly() {
370-
// The "obviously-correct" baseline mentioned above.
371-
fn is_ascii_baseline(s: &[u8]) -> bool {
372-
s.iter().all(|b| b.is_ascii())
373-
}
374-
375-
// Helper to repeat `l` copies of `b0` followed by `l` copies of `b1`.
376-
fn repeat_concat(b0: u8, b1: u8, l: usize) -> Vec<u8> {
377-
use core::iter::repeat;
378-
repeat(b0).take(l).chain(repeat(b1).take(l)).collect()
379-
}
380-
381-
// Miri is too slow
382-
let iter = if cfg!(miri) { 0..20 } else { 0..100 };
383-
384-
for i in iter {
385-
#[cfg(not(miri))]
386-
let cases = &[
387-
b"a".repeat(i),
388-
b"\0".repeat(i),
389-
b"\x7f".repeat(i),
390-
b"\x80".repeat(i),
391-
b"\xff".repeat(i),
392-
repeat_concat(b'a', 0x80u8, i),
393-
repeat_concat(0x80u8, b'a', i),
394-
];
395-
396-
#[cfg(miri)]
397-
let cases = &[b"a".repeat(i), b"\x80".repeat(i), repeat_concat(b'a', 0x80u8, i)];
398-
399-
for case in cases {
400-
for pos in 0..=case.len() {
401-
// Potentially misaligned head
402-
let prefix = &case[pos..];
403-
assert_eq!(is_ascii_baseline(prefix), prefix.is_ascii(),);
404-
405-
// Potentially misaligned tail
406-
let suffix = &case[..case.len() - pos];
407-
408-
assert_eq!(is_ascii_baseline(suffix), suffix.is_ascii(),);
409-
410-
// Both head and tail are potentially misaligned
411-
let mid = &case[(pos / 2)..(case.len() - (pos / 2))];
412-
assert_eq!(is_ascii_baseline(mid), mid.is_ascii(),);
413-
}
414-
}
415-
}
416-
}
417-
418364
#[test]
419365
fn ascii_const() {
420366
// test that the `is_ascii` methods of `char` and `u8` are usable in a const context

0 commit comments

Comments
 (0)