
Commit 6a70556

Auto merge of #94412 - scottmcm:cfg-out-miri-from-swap, r=oli-obk

For MIRI, cfg out the swap vectorization logic from #94212

Because of #69488, the swap logic from #94212 doesn't currently work in MIRI. Copying in smaller pieces is probably much worse for its performance anyway, so it'd probably rather just use the simple path regardless.

Part of #94371, though another PR will be needed for the CTFE aspect.

r? `@oli-obk`
cc `@RalfJung`

2 parents: 3b1fe7e + b582bd3
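
The gating relies on the `miri` cfg, which is set when code is compiled for execution under Miri. A minimal standalone sketch of detecting that flag (not part of this commit; the program below is purely illustrative):

fn main() {
    // `cfg!(miri)` is true only when the crate is built for Miri
    // (e.g. via `cargo miri run`), which compiles with `--cfg miri`.
    if cfg!(miri) {
        println!("built for Miri: the chunked fast paths are compiled out");
    } else {
        println!("normal build: the vectorized swap paths are available");
    }
}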

File tree

3 files changed: +64 -13 lines


library/core/src/mem/mod.rs (+22 -5)
@@ -708,7 +708,10 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
     // understanding `mem::replace`, `Option::take`, etc. - a better overall
     // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
     // a backend can choose to implement using the block optimization, or not.
-    #[cfg(not(target_arch = "spirv"))]
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(any(target_arch = "spirv", miri)))]
     {
         // For types that are larger multiples of their alignment, the simple way
         // tends to copy the whole thing to stack rather than doing it one part
@@ -737,12 +740,26 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 #[inline]
 pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
+    // We arrange for this to typically be called with small types,
+    // so this reads-and-writes approach is actually better than using
+    // copy_nonoverlapping as it easily puts things in LLVM registers
+    // directly and doesn't end up inlining allocas.
+    // And LLVM actually optimizes it to 3×memcpy if called with
+    // a type larger than it's willing to keep in a register.
+    // Having typed reads and writes in MIR here is also good as
+    // it lets MIRI and CTFE understand them better, including things
+    // like enforcing type validity for them.
+    // Importantly, read+copy_nonoverlapping+write introduces confusing
+    // asymmetry to the behaviour where one value went through read+write
+    // whereas the other was copied over by the intrinsic (see #94371).
+
     // SAFETY: exclusive references are always valid to read/write,
-    // are non-overlapping, and nothing here panics so it's drop-safe.
+    // including being aligned, and nothing here panics so it's drop-safe.
     unsafe {
-        let z = ptr::read(x);
-        ptr::copy_nonoverlapping(y, x, 1);
-        ptr::write(y, z);
+        let a = ptr::read(x);
+        let b = ptr::read(y);
+        ptr::write(x, b);
+        ptr::write(y, a);
     }
 }
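
The comments added to `swap_simple` explain why two typed reads followed by two typed writes are preferable to read + copy_nonoverlapping + write. A self-contained sketch of that symmetric shape, assuming only `std::ptr` (an illustration, not the library code itself):

use std::ptr;

fn swap_symmetric<T>(x: &mut T, y: &mut T) {
    // SAFETY: exclusive references are valid, aligned, and non-overlapping,
    // and nothing between the reads and writes can panic, so neither value
    // is dropped twice or leaked.
    unsafe {
        let a = ptr::read(x);
        let b = ptr::read(y);
        ptr::write(x, b);
        ptr::write(y, a);
    }
}

fn main() {
    let (mut a, mut b) = (1u64, 2u64);
    swap_symmetric(&mut a, &mut b);
    assert_eq!((a, b), (2, 1));
}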

library/core/src/ptr/mod.rs (+15 -8)
@@ -419,6 +419,7 @@ pub const unsafe fn swap<T>(x: *mut T, y: *mut T) {
 #[stable(feature = "swap_nonoverlapping", since = "1.27.0")]
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    #[allow(unused)]
     macro_rules! attempt_swap_as_chunks {
         ($ChunkTy:ty) => {
             if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
@@ -437,15 +438,21 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
         };
     }

-    // Split up the slice into small power-of-two-sized chunks that LLVM is able
-    // to vectorize (unless it's a special type with more-than-pointer alignment,
-    // because we don't want to pessimize things like slices of SIMD vectors.)
-    if mem::align_of::<T>() <= mem::size_of::<usize>()
-        && (!mem::size_of::<T>().is_power_of_two()
-            || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(miri))]
     {
-        attempt_swap_as_chunks!(usize);
-        attempt_swap_as_chunks!(u8);
+        // Split up the slice into small power-of-two-sized chunks that LLVM is able
+        // to vectorize (unless it's a special type with more-than-pointer alignment,
+        // because we don't want to pessimize things like slices of SIMD vectors.)
+        if mem::align_of::<T>() <= mem::size_of::<usize>()
+            && (!mem::size_of::<T>().is_power_of_two()
+                || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+        {
+            attempt_swap_as_chunks!(usize);
+            attempt_swap_as_chunks!(u8);
+        }
     }

     // SAFETY: Same preconditions as this function
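
The `if` that now lives inside the `#[cfg(not(miri))]` block is what decides whether chunked swapping is attempted at all. A small standalone sketch that mirrors that gate (the helper name `takes_chunked_path` is made up for illustration):

use std::mem::{align_of, size_of};

// True when the chunked (LLVM-vectorizable) path would be attempted:
// the alignment must fit in a usize, and the size must either not be a
// power of two or exceed two usizes.
fn takes_chunked_path<T>() -> bool {
    align_of::<T>() <= size_of::<usize>()
        && (!size_of::<T>().is_power_of_two()
            || size_of::<T>() > size_of::<usize>() * 2)
}

fn main() {
    // A 1 KiB byte buffer qualifies; a plain u64 stays on the simple path.
    println!("[u8; 1024]: {}", takes_chunked_path::<[u8; 1024]>());
    println!("u64:        {}", takes_chunked_path::<u64>());
}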

src/test/codegen/swap-large-types.rs (+27)
@@ -39,6 +39,9 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     swap(x, y)
 }

+// Verify that types with usize alignment are swapped via vectored usizes,
+// not falling back to byte-level code.
+
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
@@ -50,6 +53,8 @@ pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     }
 }

+// But for a large align-1 type, vectorized byte copying is what we want.
+
 type OneKilobyteBuffer = [u8; 1024];

 // CHECK-LABEL: @swap_1kb_slices
@@ -62,3 +67,25 @@ pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer])
         x.swap_with_slice(y);
     }
 }
+
+// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
+// for an unusual type like this. It's not clear whether we should do anything
+// smarter in Rust for these, so for now it's fine to leave these up to the backend.
+// That's not as bad as it might seem, as for example, LLVM will lower the
+// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
+// Eventually we'll be able to pass `align_of::<T>` to a const generic and
+// thus pick a smarter chunk size ourselves without huge code duplication.
+
+#[repr(align(64))]
+pub struct BigButHighlyAligned([u8; 64 * 3]);
+
+// CHECK-LABEL: @swap_big_aligned
+#[no_mangle]
+pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
+    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: call void @llvm.memcpy
+    swap(x, y)
+}
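
Independent of the FileCheck assertions, the scenario the new test covers can be exercised directly. A minimal usage sketch mirroring the test's type (not part of the test suite):

use std::mem::swap;

#[repr(align(64))]
pub struct BigButHighlyAligned([u8; 64 * 3]);

fn main() {
    let mut a = BigButHighlyAligned([1u8; 192]);
    let mut b = BigButHighlyAligned([2u8; 192]);
    // The whole 192-byte, 64-byte-aligned payload is exchanged; the codegen
    // test above checks that this lowers to three aligned memcpys.
    swap(&mut a, &mut b);
    assert_eq!(a.0[0], 2);
    assert_eq!(b.0[0], 1);
}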
