Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3b6b7c6
RC-01: fix cluster::sync() to use cluster user barrier (-3) instead o…
AviralGoelAMD May 23, 2026
3891e96
RC-02: fix mfma323232 const C aliasing violation when D != C
AviralGoelAMD May 23, 2026
c02694f
RC-03: add assert to init_barrier validating count range
AviralGoelAMD May 23, 2026
b8dcb10
RC-04: fix packed_shfl_down width=64 on wave-32 — use WARP_THREADS
AviralGoelAMD May 23, 2026
4d9afaf
SR-01: add M/N/K alignment asserts in grid() to catch silent truncation
AviralGoelAMD May 23, 2026
524771f
SR-02: add shared_allocator overflow check via assert
AviralGoelAMD May 23, 2026
dc39999
SR-03: add per-segment watermarks to shared_allocator
AviralGoelAMD May 23, 2026
2b9406a
SR-04: document load_b32 as unpadded-only
AviralGoelAMD May 23, 2026
63eed26
SR-05: add static_assert for 16-byte pad alignment in load_b128
AviralGoelAMD May 23, 2026
d1b2099
SR-06: document that g2s load functions require tile-aligned dimensions
AviralGoelAMD May 23, 2026
5364ced
SR-07: move laneid()==0 guard into async_barrier_arrive
AviralGoelAMD May 23, 2026
a17ff7b
SR-08: assert barrier address fits in D# 16-bit field
AviralGoelAMD May 23, 2026
3dbae36
SR-09: static_assert pad interval is power-of-2 for D# encoding
AviralGoelAMD May 23, 2026
0754181
SR-10: fix chiplet_transform_chunked off-by-one (> to >=)
AviralGoelAMD May 23, 2026
3c45296
SR-11/SR-12: document padding derivation and write/read consistency
AviralGoelAMD May 23, 2026
756fbd0
SR-13/SR-14: segment size static_assert and wait_barrier hang warning
AviralGoelAMD May 23, 2026
fcca9b8
IV-01: skip last-iteration sync/wait_async in all 7 ladder kernels
AviralGoelAMD May 23, 2026
a1e99e1
IV-02: document why expert mode scope is intentionally wide
AviralGoelAMD May 23, 2026
8d011ed
IV-03: move arrive() before async loads in split_bar, segment, expert
AviralGoelAMD May 23, 2026
c069417
CQ-01: replace -w with -Wall and targeted suppressions
AviralGoelAMD May 23, 2026
94a463a
CQ-01/CQ-02/CQ-18: Makefile warning flags, header deps, all target
AviralGoelAMD May 23, 2026
ddddb1a
CQ-03/CQ-05: fix kernel name printf and add hipGetLastError after launch
AviralGoelAMD May 23, 2026
e749779
CQ-07/CQ-11: fix MASK_ALL for wave-32 and remove dead static_asserts
AviralGoelAMD May 23, 2026
2088056
CQ-13: remove unused bar_bytes variable in TDM dispatch
AviralGoelAMD May 23, 2026
52bf9fe
CQ-23: combine paired ds_load_b128 into single asm volatile block
AviralGoelAMD May 23, 2026
d1d3153
CQ-04/CQ-06: fix CPU reference precision and scale tolerance with K
AviralGoelAMD May 23, 2026
aedc8b9
CQ-14: add hipFree and hipEventDestroy cleanup in harness
AviralGoelAMD May 23, 2026
1802e7b
CQ-19: add isa target to Makefile for GPU assembly dump
AviralGoelAMD May 23, 2026
ac71d74
CQ-12: document fence() scope — drains loadcnt + dscnt only
AviralGoelAMD May 23, 2026
50d19fa
CQ-16: extract gfx1250_lane_offset helper for shared_to_register
AviralGoelAMD May 23, 2026
17d0275
CQ-17: static_assert subtile dims are power-of-2 in subtile_flat
AviralGoelAMD May 23, 2026
16e5e1a
CQ-10/CQ-20/CQ-21: auto-match padded shapes, padding name clarity, se…
AviralGoelAMD May 23, 2026
9777baa
MF-02: add SCHED_MODE bit[4] DISABLE_VALU_STALL
AviralGoelAMD May 23, 2026
9b58ced
MF-08: add s_wakeup and s_sleep_var wrappers
AviralGoelAMD May 23, 2026
12cdbab
MF-09: document WMMA co-execution opportunity
AviralGoelAMD May 23, 2026
aae0401
MF-01: wrap named barriers (IDs 1-16) for subset-of-waves sync
AviralGoelAMD May 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 29 additions & 17 deletions include/common/util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ __host__ __device__ inline int ceil_div(int a, int b) {
int limit = (num_workgroups / block) * block;

// If pid beyond the last full block, leave unchanged
if (workgroup_id > limit) return workgroup_id;
if (workgroup_id >= limit) return workgroup_id;

// Local PID (within round-robin assignment)
int local_pid = workgroup_id / num_xcds;
Expand Down Expand Up @@ -174,7 +174,11 @@ struct default_type {};
/**
* @brief Mask constant for all active threads in a warp.
*/
static constexpr uint64_t MASK_ALL = 0xFFFFFFFFFFFFFFFF;
#ifdef KITTENS_UDNA1
static constexpr uint64_t MASK_ALL = 0x00000000FFFFFFFF; // wave-32
#else
static constexpr uint64_t MASK_ALL = 0xFFFFFFFFFFFFFFFF; // wave-64
#endif

/**
* @brief Perform a shuffle down operation on a packed type synchronously across a warp.
Expand All @@ -201,7 +205,7 @@ __device__ static inline T packed_shfl_down(uint64_t mask, const T &f, int delta
*reinterpret_cast<const __hip_bfloat16*>(&f)};
}

u.ui = __shfl_down_sync<unsigned long long, unsigned int>(mask, u.ui, delta, 64);
u.ui = __shfl_down_sync<unsigned long long, unsigned int>(mask, u.ui, delta, WARP_THREADS);
if constexpr (std::is_same_v<T, bf16>) {
return *reinterpret_cast<const T*>(&u.bf162.x); // Extract single bf16 from the .x component
} else {
Expand Down Expand Up @@ -332,8 +336,9 @@ template<typename T> concept all = is_segment<T>::value;
template<int default_alignment=16>
struct shared_allocator {
int *ptr;
#ifdef KITTENS_UDNA1
int *base;
#ifdef KITTENS_UDNA1
int *seg_ptr[LDS_NUM_SEGMENTS];
#endif

private:
Expand Down Expand Up @@ -366,24 +371,26 @@ struct shared_allocator {
* @brief Construct a new shared allocator using a pointer to extern shared memory.
* @param[in] _ptr Pointer to the start of the extern shared memory.
*/
__device__ shared_allocator(int *_ptr): ptr(_ptr), base(_ptr) {
#ifdef KITTENS_UDNA1
__device__ shared_allocator(int *_ptr): ptr(_ptr), base(_ptr) {}
#else
__device__ shared_allocator(int *_ptr): ptr(_ptr) {}
for (int i = 0; i < LDS_NUM_SEGMENTS; i++)
seg_ptr[i] = base + i * (LDS_SEGMENT_BYTES / (int)sizeof(int));
#endif
}
/**
* @brief Allocate shared memory for a single instance or N-dimensional array of type A.
* @tparam A The type of the object to allocate.
* @tparam dims... A list of dimensions for the N-dimensional array.
* @return Reference to the allocated object.
*/
template<typename A, size_t... dims>
template<typename A, size_t... dims>
__device__ inline variadic_array_t<A, dims...>& allocate() {
// static_assert(sizeof(A) % default_alignment == 0, "Type is not aligned properly for array allocation");

align_ptr<default_alignment>();
using at = variadic_array_t<A, dims...>;
at*p = reinterpret_cast<at*>(ptr);
ptr += sizeof(at)/sizeof(int);
assert(ptr <= base + MAX_SHARED_MEMORY / sizeof(int));
return *p;
}
/**
Expand All @@ -393,13 +400,14 @@ struct shared_allocator {
* @tparam dims... A list of dimensions for the N-dimensional array.
* @return Reference to the allocated object.
*/
template<int alignment, typename A, size_t... dims>
template<int alignment, typename A, size_t... dims>
__device__ inline variadic_array_t<A, dims...>& allocate() {
// static_assert(sizeof(A) % alignment == 0, "Type is not aligned properly for array allocation");

align_ptr<alignment>();
using at = variadic_array_t<A, dims...>;
at*p = reinterpret_cast<at*>(ptr);
ptr += sizeof(at)/sizeof(int);
assert(ptr <= base + MAX_SHARED_MEMORY / sizeof(int));
return *p;
}

Expand All @@ -419,13 +427,17 @@ struct shared_allocator {
template<typename SEG, typename A, size_t... dims>
requires ducks::segment_tag::all<SEG>
__device__ inline variadic_array_t<A, dims...>& allocate_in() {
int* target = base + (SEG::byte_offset / sizeof(int));
// If we've already allocated past the requested segment, keep
// packing where we are; otherwise jump forward to the segment.
if (ptr < target) ptr = target;
constexpr int idx = SEG::index;
if constexpr (default_alignment > 0) {
uint64_t p = reinterpret_cast<uint64_t>(seg_ptr[idx]);
if (p % default_alignment != 0)
seg_ptr[idx] = (int*)(p + (default_alignment - (p % default_alignment)));
}
using at = variadic_array_t<A, dims...>;
at* p = reinterpret_cast<at*>(ptr);
ptr += sizeof(at) / sizeof(int);
at* p = reinterpret_cast<at*>(seg_ptr[idx]);
seg_ptr[idx] += sizeof(at) / sizeof(int);
constexpr int seg_end = (idx + 1) * LDS_SEGMENT_BYTES / sizeof(int);
assert(seg_ptr[idx] <= base + seg_end);
return *p;
}
#endif // KITTENS_UDNA1
Expand Down
10 changes: 5 additions & 5 deletions include/ops/warp/cluster/cluster.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ __device__ __host__ __forceinline__ constexpr uint32_t mask(
}

/**
* @brief Cluster-wide split barrier.
* @brief Cluster-wide barrier (signal + wait on cluster user barrier -3).
*
* Outside a CGA cluster this lowers to a workgroup-wide `sync::sync()`. Inside
* a cluster the same `s_barrier_signal -1 / s_barrier_wait -1` pair extends to
* every workgroup in the cluster by hardware-managed forwarding.
* Barrier -3 syncs across all workgroups in a CGA cluster.
* Outside a cluster, use `sync::sync()` (barrier -1, workgroup-only).
*/
__device__ __forceinline__ void sync() {
::kittens::sync::sync();
__builtin_amdgcn_s_barrier_signal(-3);
__builtin_amdgcn_s_barrier_wait(-3);
}

} // namespace cluster
Expand Down
25 changes: 25 additions & 0 deletions include/ops/warp/memory/tile/global_to_shared.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@

namespace kittens {

// CDNA global→shared load overloads (4 variants):
// 1. load(ST, GL, coord) — typed ST, computes swizzle inline
// 2. load(ST, GL, coord, precomp) — typed ST, takes precomputed swizzle offsets
// 3. load_async(ST, GL, coord) — async DMA path (CDNA2+)
// 4. store(GL, ST, coord) — shared→global (reverse direction)
// For gfx1250, see the g2s:: namespace below for register-mediated and TDM paths.

template<int axis, bool assume_aligned,
ducks::st::all ST, ducks::gl::all GL,
ducks::coord::tile COORD = coord<ST>,
Expand Down Expand Up @@ -461,6 +468,12 @@ struct lds_nopad {
};

/// @brief Default LDS padding for bf16 GEMMs on gfx1250.
/// Derivation: gfx1250 LDS has 32 banks, 4 bytes wide. A 16x32 bf16 subtile
/// row is 32 * 2 = 64 bytes = 16 banks. Two consecutive rows hit the same 16
/// banks → 2-way conflict on ds_load_b128. Padding by 8 bf16 (16 bytes = 4
/// banks) shifts each row's bank mapping, eliminating conflicts.
/// Interval 128 = one subtile row (128 bf16 = 256 bytes). Must be power-of-2
/// for D# encoding (see SR-09 static_assert).
using lds_pad_default = lds_padded<128, 8>;

namespace g2s {
Expand All @@ -480,6 +493,10 @@ using i32x4_lvec = int __attribute__((__vector_size__(16))) __attribute__((addr
*/
template<int ROWS, int COLS, int SUB_ROWS, int SUB_COLS>
__device__ __forceinline__ int subtile_flat(int flat) {
static_assert((SUB_ROWS * SUB_COLS & (SUB_ROWS * SUB_COLS - 1)) == 0,
"sub_elems must be power-of-2 to avoid integer division");
static_assert((SUB_COLS & (SUB_COLS - 1)) == 0,
"SUB_COLS must be power-of-2 to avoid integer division");
constexpr int sub_elems = SUB_ROWS * SUB_COLS;
constexpr int subs_per_row = COLS / SUB_COLS;
const int subtile_id = flat / sub_elems;
Expand All @@ -501,6 +518,8 @@ __device__ __forceinline__ int subtile_flat(int flat) {
* Plain `global_load` -> VGPR -> `ds_store` path. Use this when no async
* intrinsic is available or for correctness baselines. The `Pad` parameter
* controls the per-element LDS placement; pass `lds_nopad` for flat layouts.
*
* Caller must ensure matrix dimensions are multiples of ROWS/COLS (no bounds clamping).
*/
template<typename Pad = lds_nopad, int ROWS = 0, int COLS = 0, int N_THREADS = WARP_THREADS,
typename T, ducks::gl::all GL, ducks::coord::tile COORD = coord<>>
Expand Down Expand Up @@ -536,6 +555,8 @@ __device__ inline void load(T* __restrict__ lds_dst, const GL& src, const COORD&
* issues one 16-byte transfer; the warp covers `8 * N_THREADS` elements per
* iteration. Drain with `kittens::sync::wait_async()` before consuming.
*
* Caller must ensure matrix dimensions are multiples of ROWS/COLS (no bounds clamping).
*
* @tparam Pad LDS padding descriptor.
* @tparam ROWS,COLS Tile shape (elements).
* @tparam N_THREADS Number of threads participating in the load.
Expand Down Expand Up @@ -645,6 +666,9 @@ __device__ __forceinline__ void build_tdm_d_2d(
: (sizeof(T) == 4) ? 2
: 3;
constexpr uint32_t pad_enable = (Pad::interval > 0) ? 1u : 0u;
static_assert(Pad::interval == 0 ||
__builtin_popcount(Pad::interval * sizeof(T) / 4) == 1,
"Pad interval in DWords must be a power of 2 for D# encoding");
constexpr uint32_t pad_int_enc = (Pad::interval > 0)
? ( __builtin_ctz(Pad::interval * sizeof(T) / 4) ) : 0;
constexpr uint32_t pad_amt_enc = (Pad::amount > 0)
Expand All @@ -669,6 +693,7 @@ __device__ __forceinline__ void build_tdm_d_2d(
const uint32_t tiledim1 = static_cast<uint32_t>(ROWS);

// barrier_addr occupies w1[15:0]; tensor_dim0 lo16 occupies w1[31:16].
assert(bar_lds_addr == 0 || bar_lds_addr < 0x10000u);
uint32_t w1 = (bar_lds_addr & 0xFFFFu) | (tdim0 << 16);
uint32_t w2 = (tdim0 >> 16) | (tdim1 << 16);
uint32_t w3 = (tdim1 >> 16) | (tiledim0 << 16);
Expand Down
26 changes: 13 additions & 13 deletions include/ops/warp/memory/tile/shared_to_register.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,10 @@ namespace detail {
inline constexpr int GFX1250_SUB_ROWS = 16;
inline constexpr int GFX1250_SUB_COLS = 32;
inline constexpr int GFX1250_SUB_ELEMS = GFX1250_SUB_ROWS * GFX1250_SUB_COLS;

__device__ __forceinline__ int gfx1250_lane_offset(int sub_id, int row, int half) {
return sub_id * GFX1250_SUB_ELEMS + row * GFX1250_SUB_COLS + half * 16;
}
} // namespace detail

/**
Expand All @@ -729,6 +733,8 @@ __device__ inline void load_b128(
rt_bf<WARP_M, WARP_K, ducks::rt_layout::row, ducks::rt_shape::rt_16x32>& dst,
const bf16* __restrict__ warp_lds_base)
{
static_assert(Pad::amount == 0 || Pad::amount * sizeof(bf16) % 16 == 0,
"Pad amount must be a multiple of 16 bytes for ds_load_b128 alignment");
constexpr int height = WARP_M / detail::GFX1250_SUB_ROWS;
constexpr int width = WARP_K / detail::GFX1250_SUB_COLS;
constexpr int subs_per_row = WARP_K / detail::GFX1250_SUB_COLS;
Expand All @@ -742,19 +748,15 @@ __device__ inline void load_b128(
#pragma unroll
for (int tj = 0; tj < width; tj++) {
const int sub_id = ti * subs_per_row + tj;
const int base_flat = sub_id * detail::GFX1250_SUB_ELEMS
+ row * detail::GFX1250_SUB_COLS
+ half * 16;
const int padded_off = Pad::padded(base_flat);
const int padded_off = Pad::padded(detail::gfx1250_lane_offset(sub_id, row, half));

const uint32_t addr = static_cast<uint32_t>(
reinterpret_cast<uintptr_t>(warp_lds_base + padded_off));

float4 lo, hi;
asm volatile("ds_load_b128 %0, %1 offset:0\n"
: "=v"(lo) : "v"(addr) : "memory");
asm volatile("ds_load_b128 %0, %1 offset:16\n"
: "=v"(hi) : "v"(addr) : "memory");
asm volatile("ds_load_b128 %0, %2 offset:0\n"
"ds_load_b128 %1, %2 offset:16\n"
: "=v"(lo), "=v"(hi) : "v"(addr) : "memory");

bf16_2* lo_p = reinterpret_cast<bf16_2*>(&lo);
bf16_2* hi_p = reinterpret_cast<bf16_2*>(&hi);
Expand Down Expand Up @@ -783,6 +785,7 @@ __device__ inline void load_b32(
rt_bf<WARP_M, WARP_K, ducks::rt_layout::row, ducks::rt_shape::rt_16x32>& dst,
const bf16* __restrict__ warp_lds_base)
{
// Unpadded only — use load_b128<Pad> for padded LDS layouts.
constexpr int height = WARP_M / detail::GFX1250_SUB_ROWS;
constexpr int width = WARP_K / detail::GFX1250_SUB_COLS;
constexpr int subs_per_row = WARP_K / detail::GFX1250_SUB_COLS;
Expand All @@ -795,13 +798,10 @@ __device__ inline void load_b32(
for (int ti = 0; ti < height; ti++) {
#pragma unroll
for (int tj = 0; tj < width; tj++) {
const int sub_id = ti * subs_per_row + tj;
const int base_flat = sub_id * detail::GFX1250_SUB_ELEMS
+ row * detail::GFX1250_SUB_COLS
+ half * 16;
const int sub_id = ti * subs_per_row + tj;

const bf16_2* lds_p = reinterpret_cast<const bf16_2*>(
warp_lds_base + base_flat);
warp_lds_base + detail::gfx1250_lane_offset(sub_id, row, half));

#pragma unroll
for (int k = 0; k < 8; k++) {
Expand Down
21 changes: 17 additions & 4 deletions include/ops/warp/register/tile/mma.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ __device__ static inline void mfma323232( float2 (&D)[8],
typedef __attribute__((__vector_size__(8 * sizeof(__bf16)))) __bf16 bf16x8_t;
typedef __attribute__((__vector_size__(16 * sizeof(float)))) float floatx16_t;

*(floatx16_t*)C = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
floatx16_t acc = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
*(bf16x8_t*)A,
*(bf16x8_t*)B,
*(floatx16_t*)C,
Expand All @@ -134,7 +134,7 @@ __device__ static inline void mfma323232( float2 (&D)[8],
*(floatx16_t*)D = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
*(bf16x8_t*)(A + 4),
*(bf16x8_t*)(B + 4),
*(floatx16_t*)C,
acc,
0, 0, 0
);
}
Expand Down Expand Up @@ -220,19 +220,32 @@ __device__ static inline void mma_AB_base(rt_base<float, ducks::rt_layout::col,
constexpr int B_stride = B_shape::stride;
static_assert(A_stride == B_stride, "A and B must have the same stride");

if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_16x16> &&
#ifdef KITTENS_UDNA1
// gfx1250 WMMA always computes A × B_input^T. For mma_AB, B is col-major,
// so B_input^T = B_row = the non-transposed view. Same WMMA instruction.
if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_16x16> &&
A_rows == 16 && A_cols == 32 &&
B_rows == 32 && B_cols == 16 &&
std::is_same_v<C_shape, typename ducks::rt_shape::rt_16x16>) {
wmma161632<false, false>(d.data, a.data, b.data, c.data);
} else {
static_assert(false, "Unsupported shape combination for gfx1250 mma_AB_base");
}
#else
if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_16x16> &&
A_rows == 16 && A_cols == 32 &&
B_rows == 32 && B_cols == 16 &&
std::is_same_v<C_shape, typename ducks::rt_shape::rt_16x16>) {
mfma161632(d.data, a.data, b.data, c.data);
} else if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_32x32> &&
} else if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_32x32> &&
A_rows == 32 && A_cols == 16 &&
B_rows == 16 && B_cols == 32 &&
std::is_same_v<C_shape, typename ducks::rt_shape::rt_32x32>) {
mfma323216(d.data, a.data, b.data, c.data);
} else {
static_assert(false, "Unsupported shape combination");
}
#endif
}

/**
Expand Down
38 changes: 32 additions & 6 deletions include/ops/warp/sched/sched.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,15 @@ namespace sched {
* as experimental / unsafe by default.
*/
enum class mode : int {
normal = 0,
full = 1,
limited = 2,
normal = 0,
full = 1,
limited = 2,
limited_nostall = 2 | (1 << 4), // limited + DISABLE_VALU_STALL (bit[4])
};

// `s_setreg_b32 hwreg(MODE_REG=1, offset=4, size=2), value`
// Encoded simm16 = 1 | (4 << 6) | ((2-1) << 11) = 2305.
constexpr int SCHED_MODE_HWREG_SIMM16 = 1 | (4 << 6) | (1 << 11);
// `s_setreg_b32 hwreg(MODE_REG=1, offset=4, size=5), value`
// 5 bits to cover bits [4:0] of SCHED_MODE (including DISABLE_VALU_STALL at bit[4]).
constexpr int SCHED_MODE_HWREG_SIMM16 = 1 | (4 << 6) | ((5 - 1) << 11);

/**
* @brief Set the wave's SCHED_MODE to `m`.
Expand Down Expand Up @@ -104,12 +105,37 @@ __device__ __forceinline__ void sleep() {
__builtin_amdgcn_s_sleep(N);
}

/**
* @brief Wake all sleeping waves in this workgroup.
*
* Lowers to `s_wakeup`. Use after a producer finishes work to wake consumer
* waves that are polling in `wait_barrier` via `s_sleep`.
*/
__device__ __forceinline__ void wakeup() {
asm volatile("s_wakeup" ::: "memory");
}

/**
* @brief Sleep the wave for a runtime-variable number of cycles.
*
* Lowers to `s_sleep_var`. Duration = SGPR[6:0] * 64 cycles.
*/
__device__ __forceinline__ void sleep_var(unsigned cycles_div64) {
asm volatile("s_sleep_var %0" :: "s"(cycles_div64));
}

/**
* @brief Compiler-only scheduling fence.
*
* Tells the LLVM scheduler not to reorder instructions across this point
* but emits no hardware op. Useful when constraining the compiler's WMMA
* burst grouping without paying a runtime barrier.
*
* WMMA co-execution note: each WMMA takes 16 cycles. During this time, the
* SIMD can issue up to 8 independent VALU ops for free (1 per 2 cycles).
* Place address computation, format conversion, or accumulator scaling
* between WMMA instructions to exploit this. The compiler does this
* automatically when independent work is available in the same basic block.
*/
__device__ __forceinline__ void compiler_fence() {
__builtin_amdgcn_sched_barrier(0);
Expand Down
Loading