Skip to content

Commit 7e0a135

Browse files
committed
Avoid alloca for fully static sizes
1 parent 035b01b commit 7e0a135

5 files changed

Lines changed: 63 additions & 25 deletions

File tree

compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@ use std::ffi::CString;
22

33
use llvm::Linkage::*;
44
use rustc_abi::Align;
5+
use rustc_codegen_ssa::MemFlags;
56
use rustc_codegen_ssa::common::TypeKind;
67
use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
78
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
89
use rustc_middle::bug;
9-
use rustc_middle::ty::offload_meta::OffloadMetadata;
10+
use rustc_middle::ty::offload_meta::{OffloadMetadata, OffloadSize};
1011

1112
use crate::builder::Builder;
1213
use crate::common::CodegenCx;
@@ -427,7 +428,15 @@ pub(crate) fn gen_define_handling<'ll>(
427428
let (sizes, transfer): (Vec<_>, Vec<_>) =
428429
metadata.iter().map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();
429430

430-
let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes);
431+
let actual_sizes = sizes
432+
.iter()
433+
.map(|s| match s {
434+
OffloadSize::Static(sz) => *sz,
435+
OffloadSize::Dynamic => 0,
436+
})
437+
.collect::<Vec<_>>();
438+
let offload_sizes =
439+
add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &actual_sizes);
431440
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
432441
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
433442
// A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
@@ -470,10 +479,6 @@ pub(crate) fn gen_define_handling<'ll>(
470479
cx.add_compiler_used_global(offload_entry);
471480

472481
let result = OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id };
473-
474-
// FIXME(Sa4dUs): use this global for constant offload sizes
475-
cx.add_compiler_used_global(result.offload_sizes);
476-
477482
cx.offload_kernel_cache.borrow_mut().insert(symbol, result);
478483

479484
result
@@ -507,6 +512,15 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
507512
}
508513
}
509514

515+
fn get_runtime_size<'ll, 'tcx>(
516+
_cx: &CodegenCx<'ll, 'tcx>,
517+
_val: &'ll Value,
518+
_meta: &OffloadMetadata,
519+
) -> &'ll Value {
520+
// FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
521+
bug!("offload does not support dynamic sizes yet");
522+
}
523+
510524
// For each kernel *call*, we now use some of our previously declared globals to move data to and from
511525
// the gpu. For now, we only handle the data transfer part of it.
512526
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
@@ -535,10 +549,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
535549
offload_dims: &OffloadKernelDims<'ll>,
536550
) {
537551
let cx = builder.cx;
538-
let OffloadKernelGlobals { memtransfer_types, region_id, .. } = offload_data;
552+
let OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id } = offload_data;
539553
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
540554
offload_dims;
541555

556+
let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
557+
542558
let tgt_decl = offload_globals.launcher_fn;
543559
let tgt_target_kernel_ty = offload_globals.launcher_ty;
544560

@@ -562,7 +578,24 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
562578
let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
563579
// These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
564580
let ty2 = cx.type_array(cx.type_i64(), num_args);
565-
let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
581+
582+
let a4 = if has_dynamic {
583+
let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
584+
585+
builder.memcpy(
586+
alloc,
587+
Align::EIGHT,
588+
offload_sizes,
589+
Align::EIGHT,
590+
cx.get_const_i64(8 * args.len() as u64),
591+
MemFlags::empty(),
592+
None,
593+
);
594+
595+
alloc
596+
} else {
597+
offload_sizes
598+
};
566599

567600
//%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
568601
let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
@@ -620,9 +653,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
620653
builder.store(vals[i as usize], gep1, Align::EIGHT);
621654
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
622655
builder.store(geps[i as usize], gep2, Align::EIGHT);
623-
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
624-
// FIXME(offload): write an offload frontend and handle arbitrary types.
625-
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
656+
657+
if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
658+
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
659+
let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
660+
builder.store(size_val, gep3, Align::EIGHT);
661+
}
626662
}
627663

628664
// For now we have a very simplistic indexing scheme into our

compiler/rustc_middle/src/ty/offload_meta.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,16 @@ use bitflags::bitflags;
33
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
44

55
pub struct OffloadMetadata {
6-
pub payload_size: u64,
6+
pub payload_size: OffloadSize,
77
pub mode: MappingFlags,
88
}
99

10+
#[derive(Debug, Copy, Clone)]
11+
pub enum OffloadSize {
12+
Dynamic,
13+
Static(u64),
14+
}
15+
1016
bitflags! {
1117
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
1218
#[derive(Debug, Copy, Clone)]
@@ -59,17 +65,18 @@ impl OffloadMetadata {
5965
}
6066

6167
// FIXME(Sa4dUs): implement solid logic to determine the payload size
62-
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
68+
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
6369
match ty.kind() {
6470
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
65-
_ => tcx
66-
.layout_of(PseudoCanonicalInput {
71+
_ => OffloadSize::Static(
72+
tcx.layout_of(PseudoCanonicalInput {
6773
typing_env: TypingEnv::fully_monomorphized(),
6874
value: ty,
6975
})
7076
.unwrap()
7177
.size
7278
.bytes(),
79+
),
7380
}
7481
}
7582

tests/codegen-llvm/gpu_offload/control_flow.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,13 @@
1414
// CHECK-NOT: define
1515
// CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
1616
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
17-
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
1817
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
1918
// CHECK: br label %bb3
2019
// CHECK-NOT: define
2120
// CHECK: bb3
22-
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo, ptr null, ptr null)
21+
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.{{.*}}, ptr nonnull @.offload_maptypes.{{.*}}, ptr null, ptr null)
2322
// CHECK: %10 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
24-
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo, ptr null, ptr null)
23+
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.{{.*}}, ptr nonnull @.offload_maptypes.{{.*}}, ptr null, ptr null)
2524
#[unsafe(no_mangle)]
2625
unsafe fn main() {
2726
let A = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0];

tests/codegen-llvm/gpu_offload/gpu_host.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,11 @@ pub fn _kernel_1(x: &mut [f32; 256]) {
5353
// CHECK-NEXT: %x = alloca [1024 x i8], align 16
5454
// CHECK-NEXT: %.offload_baseptrs = alloca [1 x ptr], align 8
5555
// CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
56-
// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
5756
// CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
5857
// CHECK: call void @__tgt_init_all_rtls()
5958
// CHECK-NEXT: store ptr %x, ptr %.offload_baseptrs, align 8
6059
// CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
61-
// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
62-
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]], ptr null, ptr null)
60+
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]], ptr null, ptr null)
6361
// CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
6462
// CHECK-NEXT: [[P4:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
6563
// CHECK-NEXT: store i32 1, ptr [[P4]], align 4
@@ -68,7 +66,7 @@ pub fn _kernel_1(x: &mut [f32; 256]) {
6866
// CHECK-NEXT: [[P16:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
6967
// CHECK-NEXT: store ptr %.offload_ptrs, ptr [[P16]], align 8
7068
// CHECK-NEXT: [[P24:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
71-
// CHECK-NEXT: store ptr %.offload_sizes, ptr [[P24]], align 8
69+
// CHECK-NEXT: store ptr @.offload_sizes.[[K]], ptr [[P24]], align 8
7270
// CHECK-NEXT: [[P32:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
7371
// CHECK-NEXT: store ptr @.offload_maptypes.[[K]], ptr [[P32]], align 8
7472
// CHECK-NEXT: [[P40:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
@@ -82,7 +80,7 @@ pub fn _kernel_1(x: &mut [f32; 256]) {
8280
// CHECK-NEXT: [[P96:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
8381
// CHECK-NEXT: store i32 0, ptr [[P96]], align 8
8482
// CHECK-NEXT: {{%[^ ]+}} = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.[[K]].region_id, ptr nonnull %kernel_args)
85-
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]], ptr null, ptr null)
83+
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]], ptr null, ptr null)
8684
// CHECK: ret void
8785
// CHECK-NEXT: }
8886

tests/codegen-llvm/gpu_offload/scalar_host.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020
// CHECK-NEXT: store double %_0.i, ptr %1, align 8
2121
// CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
2222
// CHECK-NEXT: store ptr %addr, ptr %2, align 8
23-
// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
24-
// CHECK-NEXT: store i64 4, ptr %3, align 8
2523
// CHECK-NEXT: call void @__tgt_target_data_begin_mapper
2624

2725
#[unsafe(no_mangle)]

0 commit comments

Comments
 (0)