57 changes: 47 additions & 10 deletions compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -3,11 +3,12 @@ use std::ffi::CString;
 use bitflags::Flags;
 use llvm::Linkage::*;
 use rustc_abi::Align;
+use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::common::TypeKind;
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
 use rustc_middle::bug;
-use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata};
+use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize};

 use crate::builder::Builder;
 use crate::common::CodegenCx;
@@ -450,7 +451,15 @@ pub(crate) fn gen_define_handling<'ll>(
     // FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
     let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];

-    let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes);
+    let actual_sizes = sizes
+        .iter()
+        .map(|s| match s {
+            OffloadSize::Static(sz) => *sz,
+            OffloadSize::Dynamic => 0,
+        })
+        .collect::<Vec<_>>();
+    let offload_sizes =
+        add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &actual_sizes);
     let memtransfer_begin =
         add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.begin"), &transfer_to);
     let memtransfer_kernel =
Expand Down Expand Up @@ -499,9 +508,6 @@ pub(crate) fn gen_define_handling<'ll>(
region_id,
};

// FIXME(Sa4dUs): use this global for constant offload sizes
cx.add_compiler_used_global(result.offload_sizes);

cx.offload_kernel_cache.borrow_mut().insert(symbol, result);

result
Expand Down Expand Up @@ -535,6 +541,15 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
}
}

fn get_runtime_size<'ll, 'tcx>(
_cx: &CodegenCx<'ll, 'tcx>,
_val: &'ll Value,
_meta: &OffloadMetadata,
) -> &'ll Value {
// FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
bug!("offload does not support dynamic sizes yet");
}

// For each kernel *call*, we now use some of our previous declared globals to move data to and from
// the gpu. For now, we only handle the data transfer part of it.
// If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
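Review note: `get_runtime_size` is intentionally a `bug!` stub in this PR. For intuition, the value it will eventually need to emit for a slice argument is just the element count times the element size, computed at the call site. A minimal standalone model (plain Rust, purely illustrative, not compiler code):

```rust
// Standalone model of the value `get_runtime_size` must eventually emit
// for a slice argument: the payload size in bytes, known only at runtime.
fn runtime_payload_size<T>(payload: &[T]) -> u64 {
    (payload.len() * std::mem::size_of::<T>()) as u64
}

fn main() {
    let xs = vec![0f32; 300];
    // 300 elements * 4 bytes: the i64 the builder would store into the
    // `.offload_sizes` slot for this argument.
    assert_eq!(runtime_payload_size(&xs), 1200);
}
```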
Expand Down Expand Up @@ -564,15 +579,17 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
) {
let cx = builder.cx;
let OffloadKernelGlobals {
offload_sizes,
memtransfer_begin,
memtransfer_kernel,
memtransfer_end,
region_id,
..
} = offload_data;
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
offload_dims;

let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));

let tgt_decl = offload_globals.launcher_fn;
let tgt_target_kernel_ty = offload_globals.launcher_ty;

@@ -596,7 +613,24 @@
     let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
     // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
     let ty2 = cx.type_array(cx.type_i64(), num_args);
-    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+
+    let a4 = if has_dynamic {
+        let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+
+        builder.memcpy(
+            alloc,
+            Align::EIGHT,
+            offload_sizes,
+            Align::EIGHT,
+            cx.get_const_i64(8 * args.len() as u64),
+            MemFlags::empty(),
+            None,
+        );
+
+        alloc
+    } else {
+        offload_sizes
+    };

     //%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
     let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
Expand Down Expand Up @@ -648,9 +682,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
builder.store(vals[i as usize], gep1, Align::EIGHT);
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
builder.store(geps[i as usize], gep2, Align::EIGHT);
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
// FIXME(offload): write an offload frontend and handle arbitrary types.
builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);

if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
builder.store(size_val, gep3, Align::EIGHT);
}
}

// For now we have a very simplistic indexing scheme into our
Expand Down
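Taken together, the hunks above split size handling into two paths: statically known sizes are emitted once into the private global `.offload_sizes.<symbol>` (with a 0 placeholder per dynamic entry), and a per-call `.offload_sizes` alloca is only materialized, seeded by `memcpy` from that global and then patched per dynamic argument, when `has_dynamic` holds. A standalone sketch of that decision logic (plain Rust model mirroring the diff, not the builder API):

```rust
// `OffloadSize` mirrors the enum added in offload_meta.rs; the helpers
// model the codegen decisions above and are illustrative only.
#[derive(Debug, Copy, Clone)]
enum OffloadSize {
    Dynamic,
    Static(u64),
}

/// Constants emitted into the private `.offload_sizes.<symbol>` global:
/// static sizes verbatim, a 0 placeholder per dynamic entry.
fn lower_size_array(sizes: &[OffloadSize]) -> Vec<u64> {
    sizes
        .iter()
        .map(|s| match s {
            OffloadSize::Static(sz) => *sz,
            OffloadSize::Dynamic => 0, // patched at the call site
        })
        .collect()
}

/// A call site needs its own mutable copy (the alloca + memcpy path)
/// only if at least one argument's size is runtime-dependent.
fn needs_stack_copy(sizes: &[OffloadSize]) -> bool {
    sizes.iter().any(|s| matches!(s, OffloadSize::Dynamic))
}

fn main() {
    let sizes = [OffloadSize::Static(1024), OffloadSize::Dynamic];
    assert_eq!(lower_size_array(&sizes), vec![1024, 0]);
    assert!(needs_stack_copy(&sizes));
    assert!(!needs_stack_copy(&[OffloadSize::Static(1024)]));
}
```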
15 changes: 11 additions & 4 deletions compiler/rustc_middle/src/ty/offload_meta.rs
@@ -3,10 +3,16 @@ use bitflags::bitflags;
 use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};

 pub struct OffloadMetadata {
-    pub payload_size: u64,
+    pub payload_size: OffloadSize,
     pub mode: MappingFlags,
 }

+#[derive(Debug, Copy, Clone)]
+pub enum OffloadSize {
+    Dynamic,
+    Static(u64),
+}
+
 bitflags! {
     /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
     #[derive(Debug, Copy, Clone)]
@@ -59,17 +65,18 @@ impl OffloadMetadata {
 }

 // FIXME(Sa4dUs): implement a solid logic to determine the payload size
-fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
+fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
     match ty.kind() {
         ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
-        _ => tcx
-            .layout_of(PseudoCanonicalInput {
+        _ => OffloadSize::Static(
+            tcx.layout_of(PseudoCanonicalInput {
                 typing_env: TypingEnv::fully_monomorphized(),
                 value: ty,
             })
             .unwrap()
             .size
             .bytes(),
+        ),
     }
 }

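Note that nothing returns `OffloadSize::Dynamic` yet: every non-pointer pointee still falls through to `layout_of`. A hypothetical follow-up arm for slices, not part of this PR, might look like the sketch below (`ty::Slice`/`ty::Str` are existing `TyKind` variants; pairing this with a real `get_runtime_size` is the open FIXME):

```rust
// Hypothetical sketch, not in this PR: a slice/str arm that would
// actually produce `OffloadSize::Dynamic`.
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
    match ty.kind() {
        ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
        // No static layout: the byte size is `len * size_of::<elem>()`,
        // known only at the call site.
        ty::Slice(_) | ty::Str => OffloadSize::Dynamic,
        _ => OffloadSize::Static(
            tcx.layout_of(PseudoCanonicalInput {
                typing_env: TypingEnv::fully_monomorphized(),
                value: ty,
            })
            .unwrap()
            .size
            .bytes(),
        ),
    }
}
```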
5 changes: 2 additions & 3 deletions tests/codegen-llvm/gpu_offload/control_flow.rs
@@ -14,14 +14,13 @@
 // CHECK-NOT: define
 // CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
-// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
 // CHECK: br label %bb3
 // CHECK-NOT define
 // CHECK: bb3
-// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
+// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
 // CHECK: %10 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
-// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
 #[unsafe(no_mangle)]
 unsafe fn main() {
     let A = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
10 changes: 3 additions & 7 deletions tests/codegen-llvm/gpu_offload/gpu_host.rs
@@ -58,18 +58,14 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
 // CHECK-NEXT: %x = alloca [1024 x i8], align 16
 // CHECK-NEXT: %.offload_baseptrs = alloca [2 x ptr], align 8
 // CHECK-NEXT: %.offload_ptrs = alloca [2 x ptr], align 8
-// CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
 // CHECK: store ptr %x, ptr %.offload_baseptrs, align 8
 // CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
-// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
 // CHECK-NEXT: [[BPTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_baseptrs, i64 8
 // CHECK-NEXT: store ptr %y, ptr [[BPTRS_1]], align 8
 // CHECK-NEXT: [[PTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
 // CHECK-NEXT: store ptr %y, ptr [[PTRS_1]], align 8
-// CHECK-NEXT: [[SIZES_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
-// CHECK-NEXT: store i64 1024, ptr [[SIZES_1]], align 8
-// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
 // CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
 // CHECK-NEXT: [[P4:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
 // CHECK-NEXT: store i32 2, ptr [[P4]], align 4
@@ -78,7 +74,7 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
 // CHECK-NEXT: [[P16:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
 // CHECK-NEXT: store ptr %.offload_ptrs, ptr [[P16]], align 8
 // CHECK-NEXT: [[P24:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
-// CHECK-NEXT: store ptr %.offload_sizes, ptr [[P24]], align 8
+// CHECK-NEXT: store ptr @.offload_sizes.[[K]], ptr [[P24]], align 8
 // CHECK-NEXT: [[P32:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
 // CHECK-NEXT: store ptr @.offload_maptypes.[[K]].kernel, ptr [[P32]], align 8
 // CHECK-NEXT: [[P40:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
Expand All @@ -92,7 +88,7 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
// CHECK-NEXT: [[P96:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
// CHECK-NEXT: store i32 0, ptr [[P96]], align 8
// CHECK-NEXT: [[TGT_RET:%.*]] = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.[[K]].region_id, ptr nonnull %kernel_args)
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
// CHECK: ret void
// CHECK-NEXT: }

Expand Down
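The constants checked above follow from the kernel signature in the hunk headers: each `[f32; 256]` argument of `_kernel_1` occupies 1024 bytes, which is why the per-call `store i64 1024` instructions disappear and the size pair now lives in the `@.offload_sizes.[[K]]` global. A quick sanity check of the arithmetic:

```rust
// Each `[f32; 256]` argument is 256 * 4 = 1024 bytes; these constants
// are now baked into the `.offload_sizes` global instead of being
// stored into a per-call alloca.
fn main() {
    assert_eq!(std::mem::size_of::<[f32; 256]>(), 1024);
}
```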
2 changes: 0 additions & 2 deletions tests/codegen-llvm/gpu_offload/scalar_host.rs
@@ -20,8 +20,6 @@
 // CHECK-NEXT: store double %_0.i, ptr %1, align 8
 // CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
 // CHECK-NEXT: store ptr %addr, ptr %2, align 8
-// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
-// CHECK-NEXT: store i64 4, ptr %3, align 8
 // CHECK-NEXT: call void @__tgt_target_data_begin_mapper

 #[unsafe(no_mangle)]