Skip to content

Commit c2239bc

Browse files
committed
Auto merge of rust-lang#123185 - scottmcm:more-typed-copy, r=compiler-errors
Remove my `scalar_copy_backend_type` optimization attempt I added this back in rust-lang#111999, but I no longer think it's a good idea - It had to get scaled back to only power-of-two things to not break a bunch of targets - LLVM seems to be getting better at memcpy removal anyway - Introducing vector instructions has seemed to sometimes (rust-lang#115515 (comment)) make autovectorization worse So this removes it from the codegen crates entirely, and instead just tries to use <https://doc.rust-lang.org/nightly/nightly-rustc/rustc_codegen_ssa/traits/builder/trait.BuilderMethods.html#method.typed_place_copy> instead of direct `memcpy` so things will still use load/store when a type isn't `OperandValue::Ref`.
2 parents 5974fe8 + 593e900 commit c2239bc

File tree

11 files changed

+91
-165
lines changed

11 files changed

+91
-165
lines changed

compiler/rustc_codegen_llvm/src/type_.rs

-3
Original file line numberDiff line numberDiff line change
@@ -281,9 +281,6 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
281281
fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
282282
ty.llvm_type(self)
283283
}
284-
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
285-
layout.scalar_copy_llvm_type(self)
286-
}
287284
}
288285

289286
impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {

compiler/rustc_codegen_llvm/src/type_of.rs

-42
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ use rustc_middle::bug;
55
use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
66
use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
77
use rustc_middle::ty::{self, Ty, TypeVisitableExt};
8-
use rustc_target::abi::HasDataLayout;
98
use rustc_target::abi::{Abi, Align, FieldsShape};
109
use rustc_target::abi::{Int, Pointer, F128, F16, F32, F64};
1110
use rustc_target::abi::{Scalar, Size, Variants};
@@ -166,7 +165,6 @@ pub trait LayoutLlvmExt<'tcx> {
166165
index: usize,
167166
immediate: bool,
168167
) -> &'a Type;
169-
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
170168
}
171169

172170
impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
@@ -308,44 +306,4 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
308306

309307
self.scalar_llvm_type_at(cx, scalar)
310308
}
311-
312-
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
313-
debug_assert!(self.is_sized());
314-
315-
// FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
316-
// (matching the 128-bit SIMD types proposal) and 256 bits on x64
317-
// (like AVX2 registers) seems at least like a tolerable starting point.
318-
let threshold = cx.data_layout().pointer_size * 4;
319-
if self.layout.size() > threshold {
320-
return None;
321-
}
322-
323-
// Vectors, even for non-power-of-two sizes, have the same layout as
324-
// arrays but don't count as aggregate types
325-
// While LLVM theoretically supports non-power-of-two sizes, and they
326-
// often work fine, sometimes x86-isel deals with them horribly
327-
// (see #115212) so for now only use power-of-two ones.
328-
if let FieldsShape::Array { count, .. } = self.layout.fields()
329-
&& count.is_power_of_two()
330-
&& let element = self.field(cx, 0)
331-
&& element.ty.is_integral()
332-
{
333-
// `cx.type_ix(bits)` is tempting here, but while that works great
334-
// for things that *stay* as memory-to-memory copies, it also ends
335-
// up suppressing vectorization as it introduces shifts when it
336-
// extracts all the individual values.
337-
338-
let ety = element.llvm_type(cx);
339-
if *count == 1 {
340-
// Emitting `<1 x T>` would be silly; just use the scalar.
341-
return Some(ety);
342-
} else {
343-
return Some(cx.type_vector(ety, *count));
344-
}
345-
}
346-
347-
// FIXME: The above only handled integer arrays; surely more things
348-
// would also be possible. Be careful about provenance, though!
349-
None
350-
}
351309
}

compiler/rustc_codegen_ssa/src/base.rs

+3-35
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use crate::mir;
1212
use crate::mir::operand::OperandValue;
1313
use crate::mir::place::PlaceRef;
1414
use crate::traits::*;
15-
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, MemFlags, ModuleCodegen, ModuleKind};
15+
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind};
1616

1717
use rustc_ast::expand::allocator::{global_fn_name, AllocatorKind, ALLOCATOR_METHODS};
1818
use rustc_attr as attr;
@@ -37,7 +37,7 @@ use rustc_session::config::{self, CrateType, EntryFnType, OptLevel, OutputType};
3737
use rustc_session::Session;
3838
use rustc_span::symbol::sym;
3939
use rustc_span::Symbol;
40-
use rustc_target::abi::{Align, FIRST_VARIANT};
40+
use rustc_target::abi::FIRST_VARIANT;
4141

4242
use std::cmp;
4343
use std::collections::BTreeSet;
@@ -282,15 +282,7 @@ pub fn coerce_unsized_into<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
282282
}
283283

284284
if src_f.layout.ty == dst_f.layout.ty {
285-
memcpy_ty(
286-
bx,
287-
dst_f.llval,
288-
dst_f.align,
289-
src_f.llval,
290-
src_f.align,
291-
src_f.layout,
292-
MemFlags::empty(),
293-
);
285+
bx.typed_place_copy(dst_f, src_f);
294286
} else {
295287
coerce_unsized_into(bx, src_f, dst_f);
296288
}
@@ -382,30 +374,6 @@ pub fn wants_new_eh_instructions(sess: &Session) -> bool {
382374
wants_wasm_eh(sess) || wants_msvc_seh(sess)
383375
}
384376

385-
pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
386-
bx: &mut Bx,
387-
dst: Bx::Value,
388-
dst_align: Align,
389-
src: Bx::Value,
390-
src_align: Align,
391-
layout: TyAndLayout<'tcx>,
392-
flags: MemFlags,
393-
) {
394-
let size = layout.size.bytes();
395-
if size == 0 {
396-
return;
397-
}
398-
399-
if flags == MemFlags::empty()
400-
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
401-
{
402-
let temp = bx.load(bty, src, src_align);
403-
bx.store(temp, dst, dst_align);
404-
} else {
405-
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
406-
}
407-
}
408-
409377
pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
410378
cx: &'a Bx::CodegenCx,
411379
instance: Instance<'tcx>,

compiler/rustc_codegen_ssa/src/mir/block.rs

+3-10
Original file line numberDiff line numberDiff line change
@@ -1459,7 +1459,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
14591459
}
14601460
_ => (op.immediate_or_packed_pair(bx), arg.layout.align.abi, false),
14611461
},
1462-
Ref(llval, _, align) => match arg.mode {
1462+
Ref(llval, llextra, align) => match arg.mode {
14631463
PassMode::Indirect { attrs, .. } => {
14641464
let required_align = match attrs.pointee_align {
14651465
Some(pointee_align) => cmp::max(pointee_align, arg.layout.align.abi),
@@ -1470,15 +1470,8 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
14701470
// alignment requirements may be higher than the type's alignment, so copy
14711471
// to a higher-aligned alloca.
14721472
let scratch = PlaceRef::alloca_aligned(bx, arg.layout, required_align);
1473-
base::memcpy_ty(
1474-
bx,
1475-
scratch.llval,
1476-
scratch.align,
1477-
llval,
1478-
align,
1479-
op.layout,
1480-
MemFlags::empty(),
1481-
);
1473+
let op_place = PlaceRef { llval, llextra, layout: op.layout, align };
1474+
bx.typed_place_copy(scratch, op_place);
14821475
(scratch.llval, scratch.align, true)
14831476
} else {
14841477
(llval, align, true)

compiler/rustc_codegen_ssa/src/mir/operand.rs

+5-11
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use super::place::PlaceRef;
22
use super::{FunctionCx, LocalRef};
33

4-
use crate::base;
54
use crate::size_of_val;
65
use crate::traits::*;
76
use crate::MemFlags;
@@ -398,7 +397,7 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
398397
self.store_with_flags(bx, dest, MemFlags::NONTEMPORAL);
399398
}
400399

401-
fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
400+
pub(crate) fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
402401
self,
403402
bx: &mut Bx,
404403
dest: PlaceRef<'tcx, V>,
@@ -410,16 +409,11 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
410409
// Avoid generating stores of zero-sized values, because the only way to have a zero-sized
411410
// value is through `undef`/`poison`, and the store itself is useless.
412411
}
413-
OperandValue::Ref(r, None, source_align) => {
412+
OperandValue::Ref(llval, llextra @ None, source_align) => {
414413
assert!(dest.layout.is_sized(), "cannot directly store unsized values");
415-
if flags.contains(MemFlags::NONTEMPORAL) {
416-
// HACK(nox): This is inefficient but there is no nontemporal memcpy.
417-
let ty = bx.backend_type(dest.layout);
418-
let val = bx.load(ty, r, source_align);
419-
bx.store_with_flags(val, dest.llval, dest.align, flags);
420-
return;
421-
}
422-
base::memcpy_ty(bx, dest.llval, dest.align, r, source_align, dest.layout, flags)
414+
let source_place =
415+
PlaceRef { llval, llextra, align: source_align, layout: dest.layout };
416+
bx.typed_place_copy_with_flags(dest, source_place, flags);
423417
}
424418
OperandValue::Ref(_, Some(_), _) => {
425419
bug!("cannot directly store unsized values");

compiler/rustc_codegen_ssa/src/traits/builder.rs

+20-5
Original file line numberDiff line numberDiff line change
@@ -281,17 +281,32 @@ pub trait BuilderMethods<'a, 'tcx>:
281281
dst: PlaceRef<'tcx, Self::Value>,
282282
src: PlaceRef<'tcx, Self::Value>,
283283
) {
284-
debug_assert!(src.llextra.is_none());
285-
debug_assert!(dst.llextra.is_none());
284+
self.typed_place_copy_with_flags(dst, src, MemFlags::empty());
285+
}
286+
287+
fn typed_place_copy_with_flags(
288+
&mut self,
289+
dst: PlaceRef<'tcx, Self::Value>,
290+
src: PlaceRef<'tcx, Self::Value>,
291+
flags: MemFlags,
292+
) {
293+
debug_assert!(src.llextra.is_none(), "cannot directly copy from unsized values");
294+
debug_assert!(dst.llextra.is_none(), "cannot directly copy into unsized values");
286295
debug_assert_eq!(dst.layout.size, src.layout.size);
287-
if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout) {
296+
if flags.contains(MemFlags::NONTEMPORAL) {
297+
// HACK(nox): This is inefficient but there is no nontemporal memcpy.
298+
let ty = self.backend_type(dst.layout);
299+
let val = self.load(ty, src.llval, src.align);
300+
self.store_with_flags(val, dst.llval, dst.align, flags);
301+
} else if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout)
302+
{
288303
// If we're not optimizing, the aliasing information from `memcpy`
289304
// isn't useful, so just load-store the value for smaller code.
290305
let temp = self.load_operand(src);
291-
temp.val.store(self, dst);
306+
temp.val.store_with_flags(self, dst, flags);
292307
} else if !dst.layout.is_zst() {
293308
let bytes = self.const_usize(dst.layout.size.bytes());
294-
self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, MemFlags::empty());
309+
self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags);
295310
}
296311
}
297312

compiler/rustc_codegen_ssa/src/traits/type_.rs

-22
Original file line numberDiff line numberDiff line change
@@ -133,28 +133,6 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
133133
|| self.is_backend_immediate(layout)
134134
|| self.is_backend_scalar_pair(layout))
135135
}
136-
137-
/// A type that can be used in a [`super::BuilderMethods::load`] +
138-
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
139-
/// such as a MIR `*_0 = *_1`.
140-
///
141-
/// It's always legal to return `None` here, as the provided impl does,
142-
/// in which case callers should use [`super::BuilderMethods::memcpy`]
143-
/// instead of the `load`+`store` pair.
144-
///
145-
/// This can be helpful for things like arrays, where the LLVM backend type
146-
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
147-
/// instead be copied via an `i48` that stays as the single `load`+`store`.
148-
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
149-
/// cases, due to `poison` handling, but in codegen we have more information
150-
/// about the type invariants, so can emit something better instead.)
151-
///
152-
/// This *should* return `None` for particularly-large types, where leaving
153-
/// the `memcpy` may well be important to avoid code size explosion.
154-
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
155-
let _ = layout;
156-
None
157-
}
158136
}
159137

160138
// For backends that support CFI using type membership (i.e., testing whether a given pointer is

tests/codegen/array-codegen.rs

+25-19
Original file line numberDiff line numberDiff line change
@@ -5,52 +5,58 @@
55
// CHECK-LABEL: @array_load
66
#[no_mangle]
77
pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
8-
// CHECK: %_0 = alloca [4 x i8], align 1
9-
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
10-
// CHECK: store <4 x i8> %[[TEMP1]], ptr %_0, align 1
11-
// CHECK: %[[TEMP2:.+]] = load i32, ptr %_0, align 1
12-
// CHECK: ret i32 %[[TEMP2]]
8+
// CHECK-NOT: alloca
9+
// CHECK: %[[ALLOCA:.+]] = alloca [4 x i8], align 1
10+
// CHECK-NOT: alloca
11+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[ALLOCA]], ptr align 1 %a, {{.+}} 4, i1 false)
12+
// CHECK: %[[TEMP:.+]] = load i32, ptr %[[ALLOCA]], align 1
13+
// CHECK: ret i32 %[[TEMP]]
1314
*a
1415
}
1516

1617
// CHECK-LABEL: @array_store
1718
#[no_mangle]
1819
pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
20+
// CHECK-NOT: alloca
21+
// CHECK: %[[TEMP:.+]] = alloca i32, [[TEMPALIGN:align [0-9]+]]
22+
// CHECK-NOT: alloca
1923
// CHECK: %a = alloca [4 x i8]
20-
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
21-
// CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
24+
// CHECK-NOT: alloca
25+
// store i32 %0, ptr %[[TEMP]]
26+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %a, ptr [[TEMPALIGN]] %[[TEMP]], {{.+}} 4, i1 false)
27+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %a, {{.+}} 4, i1 false)
2228
*p = a;
2329
}
2430

2531
// CHECK-LABEL: @array_copy
2632
#[no_mangle]
2733
pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
34+
// CHECK-NOT: alloca
2835
// CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
29-
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
30-
// CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
31-
// CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
32-
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
36+
// CHECK-NOT: alloca
37+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 4, i1 false)
38+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 4, i1 false)
3339
*p = *a;
3440
}
3541

3642
// CHECK-LABEL: @array_copy_1_element
3743
#[no_mangle]
3844
pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
45+
// CHECK-NOT: alloca
3946
// CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
40-
// CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
41-
// CHECK: store i8 %[[TEMP1]], ptr %[[LOCAL]], align 1
42-
// CHECK: %[[TEMP2:.+]] = load i8, ptr %[[LOCAL]], align 1
43-
// CHECK: store i8 %[[TEMP2]], ptr %p, align 1
47+
// CHECK-NOT: alloca
48+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 1, i1 false)
49+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 1, i1 false)
4450
*p = *a;
4551
}
4652

4753
// CHECK-LABEL: @array_copy_2_elements
4854
#[no_mangle]
4955
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
56+
// CHECK-NOT: alloca
5057
// CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
51-
// CHECK: %[[TEMP1:.+]] = load <2 x i8>, ptr %a, align 1
52-
// CHECK: store <2 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
53-
// CHECK: %[[TEMP2:.+]] = load <2 x i8>, ptr %[[LOCAL]], align 1
54-
// CHECK: store <2 x i8> %[[TEMP2]], ptr %p, align 1
58+
// CHECK-NOT: alloca
59+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 2, i1 false)
60+
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 2, i1 false)
5561
*p = *a;
5662
}

tests/codegen/array-optimized.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
1616
#[no_mangle]
1717
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
1818
// CHECK-NOT: alloca
19-
// CHECK: %[[TEMP:.+]] = load <2 x i8>, ptr %a, align 1
20-
// CHECK: store <2 x i8> %[[TEMP]], ptr %p, align 1
19+
// CHECK: %[[TEMP:.+]] = load i16, ptr %a, align 1
20+
// CHECK: store i16 %[[TEMP]], ptr %p, align 1
2121
// CHECK: ret
2222
*p = *a;
2323
}
@@ -26,8 +26,8 @@ pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
2626
#[no_mangle]
2727
pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
2828
// CHECK-NOT: alloca
29-
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
30-
// CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
29+
// CHECK: %[[TEMP:.+]] = load i32, ptr %a, align 1
30+
// CHECK: store i32 %[[TEMP]], ptr %p, align 1
3131
// CHECK: ret
3232
*p = *a;
3333
}

0 commit comments

Comments
 (0)