
Commit 6964320

Ensure swap_nonoverlapping is really always untyped

1 parent a18bd8a

6 files changed: +182 -79 lines changed

library/core/src/ptr/mod.rs
+75 -42

@@ -398,6 +398,7 @@ use crate::cmp::Ordering;
 use crate::intrinsics::const_eval_select;
 use crate::marker::FnPtr;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
+use crate::num::NonZero;
 use crate::{fmt, hash, intrinsics, ub_checks};
 
 mod alignment;
@@ -1094,49 +1095,22 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
             // are pointers inside `T` we will copy them in one go rather than trying to copy a part
             // of a pointer (which would not work).
             // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
+            unsafe { swap_nonoverlapping_const(x, y, count) }
         } else {
-            macro_rules! attempt_swap_as_chunks {
-                ($ChunkTy:ty) => {
-                    if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
-                        && mem::size_of::<T>() % mem::size_of::<$ChunkTy>() == 0
-                    {
-                        let x: *mut $ChunkTy = x.cast();
-                        let y: *mut $ChunkTy = y.cast();
-                        let count = count * (mem::size_of::<T>() / mem::size_of::<$ChunkTy>());
-                        // SAFETY: these are the same bytes that the caller promised were
-                        // ok, just typed as `MaybeUninit<ChunkTy>`s instead of as `T`s.
-                        // The `if` condition above ensures that we're not violating
-                        // alignment requirements, and that the division is exact so
-                        // that we don't lose any bytes off the end.
-                        return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) };
-                    }
-                };
+            // SAFETY: To exist as a memory range its size in bytes can't overflow.
+            let bytes = unsafe { size_of::<T>().unchecked_mul(count) };
+            if let Some(bytes) = NonZero::new(bytes) {
+                // SAFETY: These are the same ranges, just expressed in a different
+                // type, so they're still non-overlapping.
+                unsafe { swap_nonoverlapping_bytes(x.cast(), y.cast(), bytes) };
             }
-
-            // Split up the slice into small power-of-two-sized chunks that LLVM is able
-            // to vectorize (unless it's a special type with more-than-pointer alignment,
-            // because we don't want to pessimize things like slices of SIMD vectors.)
-            if mem::align_of::<T>() <= mem::size_of::<usize>()
-                && (!mem::size_of::<T>().is_power_of_two()
-                    || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
-            {
-                attempt_swap_as_chunks!(usize);
-                attempt_swap_as_chunks!(u8);
-            }
-
-            // SAFETY: Same preconditions as this function
-            unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
         }
     )
 }
 
 /// Same behavior and safety conditions as [`swap_nonoverlapping`]
-///
-/// LLVM can vectorize this (at least it can for the power-of-two-sized types
-/// `swap_nonoverlapping` tries to use) so no need to manually SIMD it.
 #[inline]
-const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, count: usize) {
+const unsafe fn swap_nonoverlapping_const<T>(x: *mut T, y: *mut T, count: usize) {
     let x = x.cast::<MaybeUninit<T>>();
     let y = y.cast::<MaybeUninit<T>>();
     let mut i = 0;
@@ -1147,13 +1121,6 @@ const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, coun
         // and it's distinct from `x` since the ranges are non-overlapping
         let y = unsafe { y.add(i) };
 
-        // If we end up here, it's because we're using a simple type -- like
-        // a small power-of-two-sized thing -- or a special type with particularly
-        // large alignment, particularly SIMD types.
-        // Thus, we're fine just reading-and-writing it, as either it's small
-        // and that works well anyway or it's special and the type's author
-        // presumably wanted things to be done in the larger chunk.
-
         // SAFETY: we're only ever given pointers that are valid to read/write,
         // including being aligned, and nothing here panics so it's drop-safe.
         unsafe {
@@ -1167,6 +1134,72 @@ const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, coun
     }
 }
 
+// Don't let MIR inline this, because we really want it to keep its noalias metadata
+#[rustc_no_mir_inline]
+#[inline]
+fn swap_chunk<const N: usize>(x: &mut MaybeUninit<[u8; N]>, y: &mut MaybeUninit<[u8; N]>) {
+    let a = *x;
+    let b = *y;
+    *x = b;
+    *y = a;
+}
+
+#[inline]
+unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, bytes: NonZero<usize>) {
+    // Same as `swap_nonoverlapping::<[u8; N]>`.
+    #[inline]
+    unsafe fn swap_nonoverlapping_chunks<const N: usize>(
+        x: *mut MaybeUninit<[u8; N]>,
+        y: *mut MaybeUninit<[u8; N]>,
+        chunks: NonZero<usize>,
+    ) {
+        let chunks = chunks.get();
+        for i in 0..chunks {
+            // SAFETY: i is in [0, chunks) so the adds and dereferences are in-bounds.
+            unsafe { swap_chunk(&mut *x.add(i), &mut *y.add(i)) };
+        }
+    }
+
+    // Same as `swap_nonoverlapping_bytes`, but accepts at most 1+2+4=7 bytes
+    #[inline]
+    unsafe fn swap_nonoverlapping_short(x: *mut u8, y: *mut u8, bytes: NonZero<usize>) {
+        let bytes = bytes.get();
+        let mut i = 0;
+        macro_rules! swap_prefix {
+            ($($n:literal)+) => {$(
+                if (bytes & $n) != 0 {
+                    // SAFETY: `i` can only have the same bits set as those in bytes,
+                    // so these `add`s are in-bounds of `bytes`. But the bit for
+                    // `$n` hasn't been set yet, so the `$n` bytes that `swap_chunk`
+                    // will read and write are within the usable range.
+                    unsafe { swap_chunk::<$n>(&mut*x.add(i).cast(), &mut*y.add(i).cast()) };
+                    i |= $n;
+                }
+            )+};
+        }
+        swap_prefix!(4 2 1);
+        debug_assert_eq!(i, bytes);
+    }
+
+    const CHUNK_SIZE: usize = size_of::<*const ()>();
+    let bytes = bytes.get();
+
+    let chunks = bytes / CHUNK_SIZE;
+    let tail = bytes % CHUNK_SIZE;
+    if let Some(chunks) = NonZero::new(chunks) {
+        // SAFETY: this is bytes/CHUNK_SIZE*CHUNK_SIZE bytes, which is <= bytes,
+        // so it's within the range of our non-overlapping bytes.
+        unsafe { swap_nonoverlapping_chunks::<CHUNK_SIZE>(x.cast(), y.cast(), chunks) };
+    }
+    if let Some(tail) = NonZero::new(tail) {
+        const { assert!(CHUNK_SIZE <= 8) };
+        let delta = chunks * CHUNK_SIZE;
+        // SAFETY: the tail length is below CHUNK SIZE because of the remainder,
+        // and CHUNK_SIZE is at most 8 by the const assert, so tail <= 7
+        unsafe { swap_nonoverlapping_short(x.add(delta), y.add(delta), tail) };
+    }
+}
+
 /// Moves `src` into the pointed `dst`, returning the previous `dst` value.
 ///
 /// Neither value is dropped.
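
For context, a minimal usage sketch of the public `std::ptr::swap_nonoverlapping` API whose internals change above (my own example, not part of the commit). With a 6-byte swap like this one (three u16s), a 64-bit build now goes entirely through the new byte-based path: zero pointer-sized chunks, then a 4-byte and a 2-byte tail swap.

use std::ptr;

fn main() {
    let mut a = [1u16, 2, 3]; // 3 x u16 = 6 bytes total
    let mut b = [9u16, 8, 7];
    // SAFETY: the two arrays are distinct locals, so the ranges cannot overlap,
    // and both pointers are valid and aligned for 3 elements.
    unsafe { ptr::swap_nonoverlapping(a.as_mut_ptr(), b.as_mut_ptr(), 3) };
    assert_eq!(a, [9, 8, 7]);
    assert_eq!(b, [1, 2, 3]);
}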

tests/assembly/x86_64-typed-swap.rs
+28

@@ -51,3 +51,31 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
     // CHECK-NEXT: retq
     swap(x, y)
 }
+
+// CHECK-LABEL: swap_string:
+#[no_mangle]
+pub fn swap_string(x: &mut String, y: &mut String) {
+    // CHECK-NOT: mov
+    // CHECK-COUNT-4: movups
+    // CHECK-NOT: mov
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: mov
+    swap(x, y)
+}
+
+// CHECK-LABEL: swap_44_bytes:
+#[no_mangle]
+pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
+    // Ensure we do better than a long run of byte copies,
+    // see <https://github.com/rust-lang/rust/issues/134946>
+
+    // CHECK-NOT: movb
+    // CHECK-COUNT-8: movups{{.+}}xmm
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movq
+    // CHECK-NOT: movb
+    // CHECK-COUNT-4: movl
+    // CHECK-NOT: movb
+    // CHECK: retq
+    swap(x, y)
+}
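
A rough sanity check of the counts above (my own arithmetic, not stated in the commit): on x86_64 the new implementation splits 44 bytes into five 8-byte chunks plus a 4-byte tail, and LLVM merges the chunk loop into 16-byte `movups` pairs with one `movq` left over, which is why the test expects 8 movups, 4 movq, and 4 movl.

// Hypothetical breakdown of the 44-byte swap checked above, assuming the
// 8-byte CHUNK_SIZE that swap_nonoverlapping_bytes uses on x86_64.
fn main() {
    let bytes = 44usize;
    let chunk_size = 8usize; // size_of::<*const ()>() on x86_64
    let chunks = bytes / chunk_size; // 5 chunks of 8 bytes = 40 bytes
    let tail = bytes % chunk_size; // 4 bytes, swapped as one 4-byte piece
    assert_eq!((chunks, tail), (5, 4));
    // Each buffer is read as 2 x 16-byte movups plus 1 x 8-byte movq (40 bytes)
    // and written back the same way; the 4-byte tail is a movl on each side.
    assert_eq!(2 * 16 + 8 + 4, bytes);
}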

tests/codegen/simd/swap-simd-types.rs
+4 -4

@@ -23,8 +23,8 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
 #[no_mangle]
 pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
     // CHECK-NOT: alloca
-    // CHECK: load <8 x float>{{.+}}align 32
-    // CHECK: store <8 x float>{{.+}}align 32
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 32
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 32
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
@@ -34,7 +34,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
 #[no_mangle]
 pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <32 x i8>{{.+}}align 1
-    // CHECK: store <32 x i8>{{.+}}align 1
+    // CHECK-COUNT-2: load <4 x i64>{{.+}}align 1
+    // CHECK-COUNT-2: store <4 x i64>{{.+}}align 1
     swap(x, y)
 }

tests/codegen/swap-large-types.rs
+37 -18

@@ -38,9 +38,6 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     swap(x, y)
 }
 
-// Verify that types with usize alignment are swapped via vectored usizes,
-// not falling back to byte-level code.
-
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
@@ -52,39 +49,61 @@ pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     }
 }
 
-// But for a large align-1 type, vectorized byte copying is what we want.
-
 type OneKilobyteBuffer = [u8; 1024];
 
 // CHECK-LABEL: @swap_1kb_slices
 #[no_mangle]
 pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK: load <{{[0-9]+}} x i64>{{.+}}align 1,
+    // CHECK: store <{{[0-9]+}} x i64>{{.+}}align 1,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
-// for an unusual type like this. It's not clear whether we should do anything
-// smarter in Rust for these, so for now it's fine to leave these up to the backend.
-// That's not as bad as it might seem, as for example, LLVM will lower the
-// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
-// Eventually we'll be able to pass `align_of::<T>` to a const generic and
-// thus pick a smarter chunk size ourselves without huge code duplication.
-
 #[repr(align(64))]
 pub struct BigButHighlyAligned([u8; 64 * 3]);
 
 // CHECK-LABEL: @swap_big_aligned
 #[no_mangle]
 pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
     // CHECK-NOT: call void @llvm.memcpy
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
-    // CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
+    // CHECK-COUNT-2: load <{{[0-9]+}} x i64>{{.+}}align 64,
+    // CHECK-COUNT-2: store <{{[0-9]+}} x i64>{{.+}}align 64,
+
+    // CHECK-COUNT-2: load <{{[0-9]+}} x i64>{{.+}}align 32,
+    // CHECK-COUNT-2: store <{{[0-9]+}} x i64>{{.+}}align 32,
+
+    // CHECK-NOT: load i32
+    // CHECK-NOT: store i32
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
     // CHECK-NOT: call void @llvm.memcpy
     swap(x, y)
 }
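
One more piece of arithmetic behind the `CHECK-NOT` lines (my own sketch, not part of the commit): `BigButHighlyAligned` is 64 * 3 = 192 bytes, which divides evenly into pointer-sized chunks, so the tail path never runs and no `i32`/`i16`/`i8` accesses should survive; vectorizing the remaining chunk loop into the `<{{[0-9]+}} x i64>` operations is left to LLVM.

// Sketch of the chunk/tail split for the 192-byte type above, assuming the
// 8-byte CHUNK_SIZE that swap_nonoverlapping_bytes uses on 64-bit targets.
fn main() {
    let bytes = 64 * 3; // size_of::<BigButHighlyAligned>()
    let chunk_size = 8;
    assert_eq!(bytes / chunk_size, 24); // whole chunks only, vectorized by LLVM
    assert_eq!(bytes % chunk_size, 0); // empty tail, hence the CHECK-NOTs above
}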

tests/codegen/swap-small-types.rs
+37 -14

@@ -27,13 +27,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
 pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
     // CHECK-NOT: alloca
 
-    // Whether `i8` is the best for this is unclear, but
-    // might as well record what's actually happening right now.
-
-    // CHECK: load i8
-    // CHECK: load i8
-    // CHECK: store i8
-    // CHECK: store i8
+    // Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
+    // and is closer to the assembly it generates anyway.
+
+    // CHECK-NOT: load{{ }}
+    // CHECK: load i32{{.+}}align 2
+    // CHECK-NEXT: load i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK-NEXT: store i32{{.+}}align 2
+    // CHECK: load i16{{.+}}align 2
+    // CHECK-NEXT: load i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NEXT: store i16{{.+}}align 2
+    // CHECK-NOT: store{{ }}
     swap(x, y)
 }
 
@@ -76,30 +82,47 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
     swap(x, y)
 }
 
-// LLVM doesn't vectorize a loop over 3-byte elements,
-// so we chunk it down to bytes and loop over those instead.
 type RGB24 = [u8; 3];
 
 // CHECK-LABEL: @swap_rgb24_slices
 #[no_mangle]
 pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i8>
-    // CHECK: store <{{[0-9]+}} x i8>
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+    // CHECK-COUNT-2: load i16
+    // CHECK-COUNT-2: store i16
+    // CHECK-COUNT-2: load i8
+    // CHECK-COUNT-2: store i8
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
 }
 
-// This one has a power-of-two size, so we iterate over it directly
 type RGBA32 = [u8; 4];
 
 // CHECK-LABEL: @swap_rgba32_slices
 #[no_mangle]
 pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
     // CHECK-NOT: alloca
-    // CHECK: load <{{[0-9]+}} x i32>
-    // CHECK: store <{{[0-9]+}} x i32>
+
+    // Because the size in bytes in a multiple of 4, we can skip the smallest sizes.
+
+    // CHECK: load <{{[0-9]+}} x i64>
+    // CHECK: store <{{[0-9]+}} x i64>
+
+    // CHECK-COUNT-2: load i32
+    // CHECK-COUNT-2: store i32
+
+    // CHECK-NOT: load i16
+    // CHECK-NOT: store i16
+    // CHECK-NOT: load i8
+    // CHECK-NOT: store i8
+
     if x.len() == y.len() {
         x.swap_with_slice(y);
     }
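
A worked example for the checks in this file (a sketch based on the new 4/2/1 tail handling, not part of the commit): `RGB48` is 6 bytes, so on a 64-bit target there is no full pointer-sized chunk and the tail path swaps an `i32` followed by an `i16`. For the `RGBA32` slices, the total byte count is a multiple of 4, so the tail is always 0 or 4 bytes and the `i16`/`i8` cases can never fire, while 3-byte `RGB24` elements can leave any remainder.

fn main() {
    let chunk_size = 8usize; // pointer size assumed for the 64-bit targets of these tests
    let bytes = 6usize; // size_of::<RGB48>(), i.e. [u16; 3]
    let chunks = bytes / chunk_size;
    let tail = bytes % chunk_size;
    // The tail is swapped by testing bits 4, 2, 1 in turn, as in swap_prefix!(4 2 1).
    let pieces: Vec<usize> = [4, 2, 1].into_iter().filter(|&n| (tail & n) != 0).collect();
    assert_eq!(chunks, 0); // no whole 8-byte chunk
    assert_eq!(pieces, [4, 2]); // one i32 swap, then one i16 swap, as checked above
}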

tests/ui/consts/missing_span_in_backtrace.stderr
+1 -1

@@ -5,7 +5,7 @@ error[E0080]: evaluation of constant value failed
 |
 note: inside `std::ptr::read::<MaybeUninit<MaybeUninit<u8>>>`
 --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
-note: inside `std::ptr::swap_nonoverlapping_simple_untyped::<MaybeUninit<u8>>`
+note: inside `std::ptr::swap_nonoverlapping_const::<MaybeUninit<u8>>`
 --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
 note: inside `swap_nonoverlapping::compiletime::<MaybeUninit<u8>>`
 --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
0 commit comments