5 changes: 2 additions & 3 deletions compiler/rustc_arena/src/lib.rs
@@ -13,7 +13,6 @@
#![cfg_attr(test, feature(test))]
#![deny(unsafe_op_in_unsafe_fn)]
#![doc(test(no_crate_inject, attr(deny(warnings), allow(internal_features))))]
#![feature(core_intrinsics)]
#![feature(decl_macro)]
#![feature(dropck_eyepatch)]
#![feature(never_type)]
@@ -26,7 +25,7 @@ use std::cell::{Cell, RefCell};
use std::marker::PhantomData;
use std::mem::{self, MaybeUninit};
use std::ptr::{self, NonNull};
use std::{cmp, intrinsics, slice};
use std::{cmp, hint, slice};

use smallvec::SmallVec;

@@ -452,7 +451,7 @@ impl DroplessArena {
let bytes = align_up(layout.size(), DROPLESS_ALIGNMENT);

// Tell LLVM that `end` is aligned to DROPLESS_ALIGNMENT.
unsafe { intrinsics::assume(end == align_down(end, DROPLESS_ALIGNMENT)) };
unsafe { hint::assert_unchecked(end == align_down(end, DROPLESS_ALIGNMENT)) };

if let Some(sub) = end.checked_sub(bytes) {
let new_end = align_down(sub, layout.align());
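For readers unfamiliar with the API swap above: `std::hint::assert_unchecked` is the stable-path replacement for the perma-unstable `core_intrinsics::assume`, with the same UB-if-false contract. A minimal standalone sketch of the pattern, not the arena code itself (the constant and helper below are stand-ins):

use std::hint;

const DROPLESS_ALIGNMENT: usize = 8; // stand-in value for this sketch

fn align_down(val: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    val & !(align - 1)
}

fn assume_aligned_end(end: usize) -> usize {
    // SAFETY: the caller must guarantee `end` is already DROPLESS_ALIGNMENT-aligned;
    // if the condition is false this is undefined behavior, exactly like the old
    // `intrinsics::assume(..)` call it replaces.
    unsafe { hint::assert_unchecked(end == align_down(end, DROPLESS_ALIGNMENT)) };
    end
}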
38 changes: 38 additions & 0 deletions compiler/rustc_attr_parsing/src/attributes/test_attrs.rs
@@ -190,3 +190,41 @@ impl<S: Stage> SingleAttributeParser<S> for RustcAbiParser {
Some(AttributeKind::RustcAbi { attr_span: cx.attr_span, kind })
}
}

pub(crate) struct RustcDelayedBugFromInsideQueryParser;

impl<S: Stage> NoArgsAttributeParser<S> for RustcDelayedBugFromInsideQueryParser {
const PATH: &[Symbol] = &[sym::rustc_delayed_bug_from_inside_query];
const ON_DUPLICATE: OnDuplicate<S> = OnDuplicate::Warn;
const ALLOWED_TARGETS: AllowedTargets = AllowedTargets::AllowList(&[Allow(Target::Fn)]);
const CREATE: fn(Span) -> AttributeKind = |_| AttributeKind::RustcDelayedBugFromInsideQuery;
}

pub(crate) struct RustcEvaluateWhereClausesParser;

impl<S: Stage> NoArgsAttributeParser<S> for RustcEvaluateWhereClausesParser {
const PATH: &[Symbol] = &[sym::rustc_evaluate_where_clauses];
const ON_DUPLICATE: OnDuplicate<S> = OnDuplicate::Warn;
const ALLOWED_TARGETS: AllowedTargets = AllowedTargets::AllowList(&[
Allow(Target::Fn),
Allow(Target::Method(MethodKind::Inherent)),
Allow(Target::Method(MethodKind::Trait { body: true })),
Allow(Target::Method(MethodKind::TraitImpl)),
Allow(Target::Method(MethodKind::Trait { body: false })),
]);
const CREATE: fn(Span) -> AttributeKind = |_| AttributeKind::RustcEvaluateWhereClauses;
}

pub(crate) struct RustcOutlivesParser;

impl<S: Stage> NoArgsAttributeParser<S> for RustcOutlivesParser {
const PATH: &[Symbol] = &[sym::rustc_outlives];
const ON_DUPLICATE: OnDuplicate<S> = OnDuplicate::Warn;
const ALLOWED_TARGETS: AllowedTargets = AllowedTargets::AllowList(&[
Allow(Target::Struct),
Allow(Target::Enum),
Allow(Target::Union),
Allow(Target::TyAlias),
]);
const CREATE: fn(Span) -> AttributeKind = |_| AttributeKind::RustcOutlives;
}
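For context, the three parsers added above correspond to internal, test-only attributes available under `#![feature(rustc_attrs)]`. A rough usage sketch following the allow-lists declared above (the inline notes about what the compiler emits describe pre-existing behavior as I understand it and are not defined by this diff):

#![feature(rustc_attrs)]

#[rustc_outlives] // structs, enums, unions, type aliases
struct RefHolder<'a, T> {
    field: &'a T, // the compiler reports the inferred outlives predicate, e.g. `T: 'a`
}

#[rustc_evaluate_where_clauses] // free fns and methods
fn constrained<T: Clone>(value: T) -> T {
    value.clone()
}

#[rustc_delayed_bug_from_inside_query] // fns only; exercises delayed-bug handling in queries
fn trigger_delayed_bug() {}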
3 changes: 3 additions & 0 deletions compiler/rustc_attr_parsing/src/context.rs
@@ -262,12 +262,14 @@ attribute_parsers!(
Single<WithoutArgs<RustcAllocatorZeroedParser>>,
Single<WithoutArgs<RustcCoherenceIsCoreParser>>,
Single<WithoutArgs<RustcDeallocatorParser>>,
Single<WithoutArgs<RustcDelayedBugFromInsideQueryParser>>,
Single<WithoutArgs<RustcDumpDefParentsParser>>,
Single<WithoutArgs<RustcDumpItemBoundsParser>>,
Single<WithoutArgs<RustcDumpPredicatesParser>>,
Single<WithoutArgs<RustcDumpUserArgsParser>>,
Single<WithoutArgs<RustcDumpVtableParser>>,
Single<WithoutArgs<RustcEffectiveVisibilityParser>>,
Single<WithoutArgs<RustcEvaluateWhereClausesParser>>,
Single<WithoutArgs<RustcHasIncoherentInherentImplsParser>>,
Single<WithoutArgs<RustcHiddenTypeOfOpaquesParser>>,
Single<WithoutArgs<RustcIntrinsicConstStableIndirectParser>>,
@@ -281,6 +283,7 @@
Single<WithoutArgs<RustcNonConstTraitMethodParser>>,
Single<WithoutArgs<RustcNounwindParser>>,
Single<WithoutArgs<RustcOffloadKernelParser>>,
Single<WithoutArgs<RustcOutlivesParser>>,
Single<WithoutArgs<RustcPassIndirectlyInNonRusticAbisParser>>,
Single<WithoutArgs<RustcPreserveUbChecksParser>>,
Single<WithoutArgs<RustcReallocatorParser>>,
88 changes: 58 additions & 30 deletions compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use std::ffi::CString;

use bitflags::Flags;
use llvm::Linkage::*;
use rustc_abi::Align;
use rustc_codegen_ssa::common::TypeKind;
use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
use rustc_middle::bug;
use rustc_middle::ty::offload_meta::OffloadMetadata;
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata};

use crate::builder::Builder;
use crate::common::CodegenCx;
@@ -28,10 +29,6 @@ pub(crate) struct OffloadGlobals<'ll> {
pub mapper_fn_ty: &'ll llvm::Type,

pub ident_t_global: &'ll llvm::Value,

// FIXME(offload): Drop this once we have fully automated our offload compilation pipeline,
// since LLVM will initialize the runtimes for us if it sees gpu kernels being registered.
pub init_rtls: &'ll llvm::Value,
}

impl<'ll> OffloadGlobals<'ll> {
@@ -42,9 +39,6 @@ impl<'ll> OffloadGlobals<'ll> {
let (begin_mapper, _, end_mapper, mapper_fn_ty) = gen_tgt_data_mappers(cx);
let ident_t_global = generate_at_one(cx);

let init_ty = cx.type_func(&[], cx.type_void());
let init_rtls = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);

// We want LLVM's openmp-opt pass to pick up and optimize this module, since it covers both
// openmp and offload optimizations.
llvm::add_module_flag_u32(cx.llmod(), llvm::ModuleFlagMergeBehavior::Max, "openmp", 51);
@@ -58,7 +52,6 @@ impl<'ll> OffloadGlobals<'ll> {
end_mapper,
mapper_fn_ty,
ident_t_global,
init_rtls,
}
}
}
@@ -91,6 +84,11 @@ pub(crate) fn register_offload<'ll>(cx: &CodegenCx<'ll, '_>) {
let atexit = cx.type_func(&[cx.type_ptr()], cx.type_i32());
let atexit_fn = declare_offload_fn(cx, "atexit", atexit);

// FIXME(offload): Drop this once we have fully automated our offload compilation pipeline,
// since LLVM will initialize the runtimes for us if it sees gpu kernels being registered.
let init_ty = cx.type_func(&[], cx.type_void());
let init_rtls = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);

let desc_ty = cx.type_func(&[], cx.type_void());
let reg_name = ".omp_offloading.descriptor_reg";
let unreg_name = ".omp_offloading.descriptor_unreg";
@@ -104,12 +102,14 @@ pub(crate) fn register_offload<'ll>(cx: &CodegenCx<'ll, '_>) {
// define internal void @.omp_offloading.descriptor_reg() section ".text.startup" {
// entry:
// call void @__tgt_register_lib(ptr @.omp_offloading.descriptor)
// call void @__tgt_init_all_rtls()
// %0 = call i32 @atexit(ptr @.omp_offloading.descriptor_unreg)
// ret void
// }
let bb = Builder::append_block(cx, desc_reg_fn, "entry");
let mut a = Builder::build(cx, bb);
a.call(reg_lib_decl, None, None, register_lib, &[omp_descriptor], None, None);
a.call(init_ty, None, None, init_rtls, &[], None, None);
a.call(atexit, None, None, atexit_fn, &[desc_unreg_fn], None, None);
a.ret_void();

@@ -345,7 +345,9 @@ impl KernelArgsTy {
#[derive(Copy, Clone)]
pub(crate) struct OffloadKernelGlobals<'ll> {
pub offload_sizes: &'ll llvm::Value,
pub memtransfer_types: &'ll llvm::Value,
pub memtransfer_begin: &'ll llvm::Value,
pub memtransfer_kernel: &'ll llvm::Value,
pub memtransfer_end: &'ll llvm::Value,
pub region_id: &'ll llvm::Value,
}

@@ -423,18 +425,38 @@ pub(crate) fn gen_define_handling<'ll>(

let offload_entry_ty = offload_globals.offload_entry_ty;

// FIXME(Sa4dUs): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
let (sizes, transfer): (Vec<_>, Vec<_>) =
metadata.iter().map(|m| (m.payload_size, m.mode.bits() | 0x20)).unzip();
metadata.iter().map(|m| (m.payload_size, m.mode)).unzip();
// Our begin mapper should only see simplified information about which args have to be
// transferred to the device, and the end mapper only about which args should be transferred back.
// Any information beyond that makes it harder for LLVM's opt pass to evaluate whether it can
// safely move (=optimize) the LLVM-IR location of this data transfer. Only the mapping types
// mentioned below are handled, so make sure that we don't generate any other ones.
let handled_mappings = MappingFlags::TO
| MappingFlags::FROM
| MappingFlags::TARGET_PARAM
| MappingFlags::LITERAL
| MappingFlags::IMPLICIT;
for arg in &transfer {
debug_assert!(!arg.contains_unknown_bits());
debug_assert!(handled_mappings.contains(*arg));
}

let valid_begin_mappings = MappingFlags::TO | MappingFlags::LITERAL | MappingFlags::IMPLICIT;
let transfer_to: Vec<u64> =
transfer.iter().map(|m| m.intersection(valid_begin_mappings).bits()).collect();
let transfer_from: Vec<u64> =
transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect();
// FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];

let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes);
// Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
// or both to and from the gpu (=3). Other values shouldn't affect us for now.
// A non-mutable reference or pointer will be 1, an array that's not read, but fully overwritten
// will be 2. For now, everything is 3, until we have our frontend set up.
// 1+2+32: 1 (MapTo), 2 (MapFrom), 32 (Add one extra input ptr per function, to be used later).
let memtransfer_types =
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}"), &transfer);
let memtransfer_begin =
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.begin"), &transfer_to);
let memtransfer_kernel =
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.kernel"), &transfer_kernel);
let memtransfer_end =
add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.end"), &transfer_from);

// Next: For each function, generate these three entries. A weak constant,
// the llvm.rodata entry name, and the llvm_offload_entries value
@@ -469,7 +491,13 @@ pub(crate) fn gen_define_handling<'ll>(

cx.add_compiler_used_global(offload_entry);

let result = OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id };
let result = OffloadKernelGlobals {
offload_sizes,
memtransfer_begin,
memtransfer_kernel,
memtransfer_end,
region_id,
};

// FIXME(Sa4dUs): use this global for constant offload sizes
cx.add_compiler_used_global(result.offload_sizes);
@@ -535,7 +563,13 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
offload_dims: &OffloadKernelDims<'ll>,
) {
let cx = builder.cx;
let OffloadKernelGlobals { memtransfer_types, region_id, .. } = offload_data;
let OffloadKernelGlobals {
memtransfer_begin,
memtransfer_kernel,
memtransfer_end,
region_id,
..
} = offload_data;
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
offload_dims;

@@ -608,12 +642,6 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
geps.push(gep);
}

let init_ty = cx.type_func(&[], cx.type_void());
let init_rtls_decl = offload_globals.init_rtls;

// call void @__tgt_init_all_rtls()
builder.call(init_ty, None, None, init_rtls_decl, &[], None, None);

for i in 0..num_args {
let idx = cx.get_const_i32(i);
let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
@@ -668,14 +696,14 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
generate_mapper_call(
builder,
geps,
memtransfer_types,
memtransfer_begin,
begin_mapper_decl,
fn_ty,
num_args,
s_ident_t,
);
let values =
KernelArgsTy::new(&cx, num_args, memtransfer_types, geps, workgroup_dims, thread_dims);
KernelArgsTy::new(&cx, num_args, memtransfer_kernel, geps, workgroup_dims, thread_dims);

// Step 3)
// Here we fill the KernelArgsTy, see the documentation above
@@ -701,7 +729,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
generate_mapper_call(
builder,
geps,
memtransfer_types,
memtransfer_end,
end_mapper_decl,
fn_ty,
num_args,
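The central change in gpu_offload.rs is that the single `.offload_maptypes.{symbol}` array is split into three phase-specific arrays: `.begin` keeps only to-device flags, `.kernel` carries only the TARGET_PARAM marker, and `.end` keeps only from-device flags, so LLVM's openmp-opt pass sees exactly what each mapper call moves. A condensed sketch of that splitting with a stand-in flags type (the real `MappingFlags` lives in `rustc_middle::ty::offload_meta`; the bit values below follow the usual OpenMP offload constants but are assumptions here):

use bitflags::bitflags;

bitflags! {
    #[derive(Clone, Copy)]
    struct MappingFlags: u64 {
        const TO = 0x1;
        const FROM = 0x2;
        const TARGET_PARAM = 0x20;
        const LITERAL = 0x100;
        const IMPLICIT = 0x200;
    }
}

// Mirrors the begin/kernel/end split performed in gen_define_handling above.
fn split_maptypes(transfer: &[MappingFlags]) -> (Vec<u64>, Vec<u64>, Vec<u64>) {
    let begin_mask = MappingFlags::TO | MappingFlags::LITERAL | MappingFlags::IMPLICIT;
    let begin = transfer.iter().map(|m| m.intersection(begin_mask).bits()).collect();
    let end = transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect();
    let kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer.len()];
    (begin, kernel, end)
}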
2 changes: 1 addition & 1 deletion compiler/rustc_data_structures/src/lib.rs
@@ -10,6 +10,7 @@
#![allow(internal_features)]
#![allow(rustc::default_hash_types)]
#![allow(rustc::potential_query_instability)]
#![cfg_attr(bootstrap, feature(cold_path))]
#![deny(unsafe_op_in_unsafe_fn)]
#![feature(allocator_api)]
#![feature(ascii_char)]
@@ -19,7 +20,6 @@
#![feature(cfg_select)]
#![feature(const_default)]
#![feature(const_trait_impl)]
#![feature(core_intrinsics)]
#![feature(dropck_eyepatch)]
#![feature(extend_one)]
#![feature(file_buffered)]
9 changes: 5 additions & 4 deletions compiler/rustc_data_structures/src/profiling.rs
@@ -85,12 +85,11 @@ use std::borrow::Borrow;
use std::collections::hash_map::Entry;
use std::error::Error;
use std::fmt::Display;
use std::intrinsics::unlikely;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::{Duration, Instant};
use std::{fs, process};
use std::{fs, hint, process};

pub use measureme::EventId;
use measureme::{EventIdBuilder, Profiler, SerializableString, StringId};
@@ -427,7 +426,8 @@ impl SelfProfilerRef {
.unwrap()
.increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
}
if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS) {
hint::cold_path();
profiler_ref.instant_query_event(
|profiler| profiler.query_cache_hit_event_kind,
query_invocation_id,
@@ -437,7 +437,8 @@

// We check both kinds of query cache hit events at once, to reduce overhead in the
// common case (with self-profile disabled).
if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
if self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED) {
hint::cold_path();
cold_call(self, query_invocation_id);
}
}
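The profiling change above, like the freeze.rs and lock.rs changes below, replaces the `core_intrinsics`-gated `unlikely(cond)`/`likely(cond)` wrappers with an ordinary condition plus `std::hint::cold_path()` in the rarely-taken arm. A standalone sketch of the pattern (nightly-only while `cold_path` is feature-gated; the condition and functions are made up for illustration):

#![feature(cold_path)]
use std::hint;

fn record_event(enabled: bool) {
    // Before: `if std::intrinsics::unlikely(enabled) { ... }`
    // (which required the internal #![feature(core_intrinsics)] gate).
    if enabled {
        // The optimizer hint now lives inside the cold arm rather than
        // wrapping the condition itself.
        hint::cold_path();
        emit_profile_event();
    }
}

#[inline(never)]
fn emit_profile_event() {
    // stand-in for the real profiling work
}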
5 changes: 3 additions & 2 deletions compiler/rustc_data_structures/src/sync/freeze.rs
@@ -1,5 +1,5 @@
use std::cell::UnsafeCell;
use std::intrinsics::likely;
use std::hint;
use std::marker::PhantomData;
use std::ops::{Deref, DerefMut};
use std::ptr::NonNull;
@@ -60,10 +60,11 @@ impl<T> FreezeLock<T> {
/// Get the inner value if frozen.
#[inline]
pub fn get(&self) -> Option<&T> {
if likely(self.frozen.load(Ordering::Acquire)) {
if self.frozen.load(Ordering::Acquire) {
// SAFETY: This is frozen so the data cannot be modified.
unsafe { Some(&*self.data.get()) }
} else {
hint::cold_path();
None
}
}
9 changes: 5 additions & 4 deletions compiler/rustc_data_structures/src/sync/lock.rs
@@ -1,7 +1,7 @@
//! This module implements a lock which only uses synchronization if `might_be_dyn_thread_safe` is true.
//! It implements `DynSend` and `DynSync` instead of the typical `Send` and `Sync` traits.

use std::fmt;
use std::{fmt, hint};

#[derive(Clone, Copy, PartialEq)]
pub enum Mode {
@@ -10,7 +10,6 @@ pub enum Mode {
}

use std::cell::{Cell, UnsafeCell};
use std::intrinsics::unlikely;
use std::marker::PhantomData;
use std::mem::ManuallyDrop;
use std::ops::{Deref, DerefMut};
@@ -92,7 +91,8 @@ pub struct Lock<T> {
impl<T> Lock<T> {
#[inline(always)]
pub fn new(inner: T) -> Self {
let (mode, mode_union) = if unlikely(mode::might_be_dyn_thread_safe()) {
let (mode, mode_union) = if mode::might_be_dyn_thread_safe() {
hint::cold_path();
// Create the lock with synchronization enabled using the `RawMutex` type.
(Mode::Sync, ModeUnion { sync: ManuallyDrop::new(RawMutex::INIT) })
} else {
@@ -150,7 +150,8 @@ impl<T> Lock<T> {
unsafe {
match mode {
Mode::NoSync => {
if unlikely(self.mode_union.no_sync.replace(LOCKED) == LOCKED) {
if self.mode_union.no_sync.replace(LOCKED) == LOCKED {
hint::cold_path();
lock_held()
}
}