HazyResearch · AviralGoelAMD · May 23, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/include/common/util.cuh b/include/common/util.cuh
@@ -109,7 +109,7 @@ __host__ __device__ inline int ceil_div(int a, int b) {
     int limit = (num_workgroups / block) * block;
 
     // If pid beyond the last full block, leave unchanged
-    if (workgroup_id > limit) return workgroup_id;
+    if (workgroup_id >= limit) return workgroup_id;
 
     // Local PID (within round-robin assignment)
     int local_pid    = workgroup_id / num_xcds;
@@ -174,7 +174,11 @@ struct default_type {};
 /**
  * @brief Mask constant for all active threads in a warp.
  */
-static constexpr uint64_t MASK_ALL = 0xFFFFFFFFFFFFFFFF;
+#ifdef KITTENS_UDNA1
+static constexpr uint64_t MASK_ALL = 0x00000000FFFFFFFF; // wave-32
+#else
+static constexpr uint64_t MASK_ALL = 0xFFFFFFFFFFFFFFFF; // wave-64
+#endif
 
 /**
  * @brief Perform a shuffle down operation on a packed type synchronously across a warp.
@@ -201,7 +205,7 @@ __device__ static inline T packed_shfl_down(uint64_t mask, const T &f, int delta
                                        *reinterpret_cast<const __hip_bfloat16*>(&f)};
         }
 
-        u.ui = __shfl_down_sync<unsigned long long, unsigned int>(mask, u.ui, delta, 64);
+        u.ui = __shfl_down_sync<unsigned long long, unsigned int>(mask, u.ui, delta, WARP_THREADS);
         if constexpr (std::is_same_v<T, bf16>) {
             return *reinterpret_cast<const T*>(&u.bf162.x);  // Extract single bf16 from the .x component
         } else {
@@ -332,8 +336,9 @@ template<typename T> concept all = is_segment<T>::value;
 template<int default_alignment=16> 
 struct shared_allocator {
     int *ptr;
-#ifdef KITTENS_UDNA1
     int *base;
+#ifdef KITTENS_UDNA1
+    int *seg_ptr[LDS_NUM_SEGMENTS];
 #endif
 
     private:
@@ -366,24 +371,26 @@ struct shared_allocator {
         * @brief Construct a new shared allocator using a pointer to extern shared memory.
         * @param[in] _ptr Pointer to the start of the extern shared memory.
         */
+        __device__ shared_allocator(int *_ptr): ptr(_ptr), base(_ptr) {
 #ifdef KITTENS_UDNA1
-        __device__ shared_allocator(int *_ptr): ptr(_ptr), base(_ptr) {}
-#else
-        __device__ shared_allocator(int *_ptr): ptr(_ptr) {}
+            for (int i = 0; i < LDS_NUM_SEGMENTS; i++)
+                seg_ptr[i] = base + i * (LDS_SEGMENT_BYTES / (int)sizeof(int));
 #endif
+        }
         /**
         * @brief Allocate shared memory for a single instance or N-dimensional array of type A.
         * @tparam A The type of the object to allocate.
         * @tparam dims... A list of dimensions for the N-dimensional array.
         * @return Reference to the allocated object.
         */
-        template<typename A, size_t... dims> 
+        template<typename A, size_t... dims>
         __device__ inline variadic_array_t<A, dims...>& allocate() {
-            // static_assert(sizeof(A) % default_alignment == 0, "Type is not aligned properly for array allocation");
+
             align_ptr<default_alignment>();
             using at = variadic_array_t<A, dims...>;
             at*p = reinterpret_cast<at*>(ptr);
             ptr += sizeof(at)/sizeof(int);
+            assert(ptr <= base + MAX_SHARED_MEMORY / sizeof(int));
             return *p;
         }
         /**
@@ -393,13 +400,14 @@ struct shared_allocator {
         * @tparam dims... A list of dimensions for the N-dimensional array.
         * @return Reference to the allocated object.
         */
-        template<int alignment, typename A, size_t... dims> 
+        template<int alignment, typename A, size_t... dims>
         __device__ inline variadic_array_t<A, dims...>& allocate() {
-            // static_assert(sizeof(A) % alignment == 0, "Type is not aligned properly for array allocation");
+
             align_ptr<alignment>();
             using at = variadic_array_t<A, dims...>;
             at*p = reinterpret_cast<at*>(ptr);
             ptr += sizeof(at)/sizeof(int);
+            assert(ptr <= base + MAX_SHARED_MEMORY / sizeof(int));
             return *p;
         }
 
@@ -419,13 +427,17 @@ struct shared_allocator {
         template<typename SEG, typename A, size_t... dims>
             requires ducks::segment_tag::all<SEG>
         __device__ inline variadic_array_t<A, dims...>& allocate_in() {
-            int* target = base + (SEG::byte_offset / sizeof(int));
-            // If we've already allocated past the requested segment, keep
-            // packing where we are; otherwise jump forward to the segment.
-            if (ptr < target) ptr = target;
+            constexpr int idx = SEG::index;
+            if constexpr (default_alignment > 0) {
+                uint64_t p = reinterpret_cast<uint64_t>(seg_ptr[idx]);
+                if (p % default_alignment != 0)
+                    seg_ptr[idx] = (int*)(p + (default_alignment - (p % default_alignment)));
+            }
             using at = variadic_array_t<A, dims...>;
-            at* p = reinterpret_cast<at*>(ptr);
-            ptr += sizeof(at) / sizeof(int);
+            at* p = reinterpret_cast<at*>(seg_ptr[idx]);
+            seg_ptr[idx] += sizeof(at) / sizeof(int);
+            constexpr int seg_end = (idx + 1) * LDS_SEGMENT_BYTES / sizeof(int);
+            assert(seg_ptr[idx] <= base + seg_end);
             return *p;
         }
 #endif // KITTENS_UDNA1

diff --git a/include/ops/warp/cluster/cluster.cuh b/include/ops/warp/cluster/cluster.cuh
@@ -44,14 +44,14 @@ __device__ __host__ __forceinline__ constexpr uint32_t mask(
 }
 
 /**
- * @brief Cluster-wide split barrier.
+ * @brief Cluster-wide barrier (signal + wait on cluster user barrier -3).
  *
- * Outside a CGA cluster this lowers to a workgroup-wide `sync::sync()`. Inside
- * a cluster the same `s_barrier_signal -1 / s_barrier_wait -1` pair extends to
- * every workgroup in the cluster by hardware-managed forwarding.
+ * Barrier -3 syncs across all workgroups in a CGA cluster.
+ * Outside a cluster, use `sync::sync()` (barrier -1, workgroup-only).
  */
 __device__ __forceinline__ void sync() {
-    ::kittens::sync::sync();
+    __builtin_amdgcn_s_barrier_signal(-3);
+    __builtin_amdgcn_s_barrier_wait(-3);
 }
 
 } // namespace cluster

diff --git a/include/ops/warp/memory/tile/global_to_shared.cuh b/include/ops/warp/memory/tile/global_to_shared.cuh
@@ -10,6 +10,13 @@
 
 namespace kittens {
 
+// CDNA global→shared load overloads (4 variants):
+// 1. load(ST, GL, coord)             — typed ST, computes swizzle inline
+// 2. load(ST, GL, coord, precomp)    — typed ST, takes precomputed swizzle offsets
+// 3. load_async(ST, GL, coord)       — async DMA path (CDNA2+)
+// 4. store(GL, ST, coord)            — shared→global (reverse direction)
+// For gfx1250, see the g2s:: namespace below for register-mediated and TDM paths.
+
 template<int axis, bool assume_aligned,
          ducks::st::all ST, ducks::gl::all GL,
          ducks::coord::tile COORD = coord<ST>,
@@ -461,6 +468,12 @@ struct lds_nopad {
 };
 
 /// @brief Default LDS padding for bf16 GEMMs on gfx1250.
+/// Derivation: gfx1250 LDS has 32 banks, 4 bytes wide. A 16x32 bf16 subtile
+/// row is 32 * 2 = 64 bytes = 16 banks. Two consecutive rows hit the same 16
+/// banks → 2-way conflict on ds_load_b128. Padding by 8 bf16 (16 bytes = 4
+/// banks) shifts each row's bank mapping, eliminating conflicts.
+/// Interval 128 = one subtile row (128 bf16 = 256 bytes). Must be power-of-2
+/// for D# encoding (see SR-09 static_assert).
 using lds_pad_default = lds_padded<128, 8>;
 
 namespace g2s {
@@ -480,6 +493,10 @@ using i32x4_lvec  = int __attribute__((__vector_size__(16))) __attribute__((addr
  */
 template<int ROWS, int COLS, int SUB_ROWS, int SUB_COLS>
 __device__ __forceinline__ int subtile_flat(int flat) {
+    static_assert((SUB_ROWS * SUB_COLS & (SUB_ROWS * SUB_COLS - 1)) == 0,
+                  "sub_elems must be power-of-2 to avoid integer division");
+    static_assert((SUB_COLS & (SUB_COLS - 1)) == 0,
+                  "SUB_COLS must be power-of-2 to avoid integer division");
     constexpr int sub_elems    = SUB_ROWS * SUB_COLS;
     constexpr int subs_per_row = COLS / SUB_COLS;
     const int subtile_id = flat / sub_elems;
@@ -501,6 +518,8 @@ __device__ __forceinline__ int subtile_flat(int flat) {
  * Plain `global_load` -> VGPR -> `ds_store` path. Use this when no async
  * intrinsic is available or for correctness baselines. The `Pad` parameter
  * controls the per-element LDS placement; pass `lds_nopad` for flat layouts.
+ *
+ * Caller must ensure matrix dimensions are multiples of ROWS/COLS (no bounds clamping).
  */
 template<typename Pad = lds_nopad, int ROWS = 0, int COLS = 0, int N_THREADS = WARP_THREADS,
          typename T, ducks::gl::all GL, ducks::coord::tile COORD = coord<>>
@@ -536,6 +555,8 @@ __device__ inline void load(T* __restrict__ lds_dst, const GL& src, const COORD&
  * issues one 16-byte transfer; the warp covers `8 * N_THREADS` elements per
  * iteration. Drain with `kittens::sync::wait_async()` before consuming.
  *
+ * Caller must ensure matrix dimensions are multiples of ROWS/COLS (no bounds clamping).
+ *
  * @tparam Pad      LDS padding descriptor.
  * @tparam ROWS,COLS  Tile shape (elements).
  * @tparam N_THREADS  Number of threads participating in the load.
@@ -645,6 +666,9 @@ __device__ __forceinline__ void build_tdm_d_2d(
                                      : (sizeof(T) == 4) ? 2
                                      : 3;
     constexpr uint32_t pad_enable   = (Pad::interval > 0) ? 1u : 0u;
+    static_assert(Pad::interval == 0 ||
+                  __builtin_popcount(Pad::interval * sizeof(T) / 4) == 1,
+                  "Pad interval in DWords must be a power of 2 for D# encoding");
     constexpr uint32_t pad_int_enc  = (Pad::interval > 0)
         ? ( __builtin_ctz(Pad::interval * sizeof(T) / 4) ) : 0;
     constexpr uint32_t pad_amt_enc  = (Pad::amount > 0)
@@ -669,6 +693,7 @@ __device__ __forceinline__ void build_tdm_d_2d(
     const uint32_t tiledim1 = static_cast<uint32_t>(ROWS);
 
     // barrier_addr occupies w1[15:0]; tensor_dim0 lo16 occupies w1[31:16].
+    assert(bar_lds_addr == 0 || bar_lds_addr < 0x10000u);
     uint32_t w1 = (bar_lds_addr & 0xFFFFu) | (tdim0 << 16);
     uint32_t w2 = (tdim0 >> 16) | (tdim1 << 16);
     uint32_t w3 = (tdim1 >> 16) | (tiledim0 << 16);

diff --git a/include/ops/warp/memory/tile/shared_to_register.cuh b/include/ops/warp/memory/tile/shared_to_register.cuh
@@ -709,6 +709,10 @@ namespace detail {
 inline constexpr int GFX1250_SUB_ROWS  = 16;
 inline constexpr int GFX1250_SUB_COLS  = 32;
 inline constexpr int GFX1250_SUB_ELEMS = GFX1250_SUB_ROWS * GFX1250_SUB_COLS;
+
+__device__ __forceinline__ int gfx1250_lane_offset(int sub_id, int row, int half) {
+    return sub_id * GFX1250_SUB_ELEMS + row * GFX1250_SUB_COLS + half * 16;
+}
 } // namespace detail
 
 /**
@@ -729,6 +733,8 @@ __device__ inline void load_b128(
     rt_bf<WARP_M, WARP_K, ducks::rt_layout::row, ducks::rt_shape::rt_16x32>& dst,
     const bf16* __restrict__ warp_lds_base)
 {
+    static_assert(Pad::amount == 0 || Pad::amount * sizeof(bf16) % 16 == 0,
+                  "Pad amount must be a multiple of 16 bytes for ds_load_b128 alignment");
     constexpr int height       = WARP_M / detail::GFX1250_SUB_ROWS;
     constexpr int width        = WARP_K / detail::GFX1250_SUB_COLS;
     constexpr int subs_per_row = WARP_K / detail::GFX1250_SUB_COLS;
@@ -742,19 +748,15 @@ __device__ inline void load_b128(
         #pragma unroll
         for (int tj = 0; tj < width; tj++) {
             const int sub_id     = ti * subs_per_row + tj;
-            const int base_flat  = sub_id * detail::GFX1250_SUB_ELEMS
-                                 + row * detail::GFX1250_SUB_COLS
-                                 + half * 16;
-            const int padded_off = Pad::padded(base_flat);
+            const int padded_off = Pad::padded(detail::gfx1250_lane_offset(sub_id, row, half));
 
             const uint32_t addr = static_cast<uint32_t>(
                 reinterpret_cast<uintptr_t>(warp_lds_base + padded_off));
 
             float4 lo, hi;
-            asm volatile("ds_load_b128 %0, %1 offset:0\n"
-                : "=v"(lo) : "v"(addr) : "memory");
-            asm volatile("ds_load_b128 %0, %1 offset:16\n"
-                : "=v"(hi) : "v"(addr) : "memory");
+            asm volatile("ds_load_b128 %0, %2 offset:0\n"
+                         "ds_load_b128 %1, %2 offset:16\n"
+                : "=v"(lo), "=v"(hi) : "v"(addr) : "memory");
 
             bf16_2* lo_p = reinterpret_cast<bf16_2*>(&lo);
             bf16_2* hi_p = reinterpret_cast<bf16_2*>(&hi);
@@ -783,6 +785,7 @@ __device__ inline void load_b32(
     rt_bf<WARP_M, WARP_K, ducks::rt_layout::row, ducks::rt_shape::rt_16x32>& dst,
     const bf16* __restrict__ warp_lds_base)
 {
+    // Unpadded only — use load_b128<Pad> for padded LDS layouts.
     constexpr int height       = WARP_M / detail::GFX1250_SUB_ROWS;
     constexpr int width        = WARP_K / detail::GFX1250_SUB_COLS;
     constexpr int subs_per_row = WARP_K / detail::GFX1250_SUB_COLS;
@@ -795,13 +798,10 @@ __device__ inline void load_b32(
     for (int ti = 0; ti < height; ti++) {
         #pragma unroll
         for (int tj = 0; tj < width; tj++) {
-            const int sub_id    = ti * subs_per_row + tj;
-            const int base_flat = sub_id * detail::GFX1250_SUB_ELEMS
-                                + row * detail::GFX1250_SUB_COLS
-                                + half * 16;
+            const int sub_id = ti * subs_per_row + tj;
 
             const bf16_2* lds_p = reinterpret_cast<const bf16_2*>(
-                warp_lds_base + base_flat);
+                warp_lds_base + detail::gfx1250_lane_offset(sub_id, row, half));
 
             #pragma unroll
             for (int k = 0; k < 8; k++) {

diff --git a/include/ops/warp/register/tile/mma.cuh b/include/ops/warp/register/tile/mma.cuh
@@ -124,7 +124,7 @@ __device__ static inline void mfma323232(      float2 (&D)[8],
     typedef __attribute__((__vector_size__(8 * sizeof(__bf16)))) __bf16 bf16x8_t;
     typedef __attribute__((__vector_size__(16 * sizeof(float)))) float floatx16_t;
 
-    *(floatx16_t*)C = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
+    floatx16_t acc = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
         *(bf16x8_t*)A,
         *(bf16x8_t*)B,
         *(floatx16_t*)C,
@@ -134,7 +134,7 @@ __device__ static inline void mfma323232(      float2 (&D)[8],
     *(floatx16_t*)D = __builtin_amdgcn_mfma_f32_32x32x16_bf16(
         *(bf16x8_t*)(A + 4),
         *(bf16x8_t*)(B + 4),
-        *(floatx16_t*)C,
+        acc,
         0, 0, 0
     );
 }
@@ -220,19 +220,32 @@ __device__ static inline void mma_AB_base(rt_base<float, ducks::rt_layout::col,
     constexpr int B_stride = B_shape::stride;
     static_assert(A_stride == B_stride, "A and B must have the same stride");
 
-    if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_16x16> && 
+#ifdef KITTENS_UDNA1
+    // gfx1250 WMMA always computes A × B_input^T. For mma_AB, B is col-major,
+    // so B_input^T = B_row = the non-transposed view. Same WMMA instruction.
+    if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_16x16> &&
+                  A_rows == 16 && A_cols == 32 &&
+                  B_rows == 32 && B_cols == 16 &&
+                  std::is_same_v<C_shape, typename ducks::rt_shape::rt_16x16>) {
+        wmma161632<false, false>(d.data, a.data, b.data, c.data);
+    } else {
+        static_assert(false, "Unsupported shape combination for gfx1250 mma_AB_base");
+    }
+#else
+    if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_16x16> &&
                   A_rows == 16 && A_cols == 32 &&
                   B_rows == 32 && B_cols == 16 &&
                   std::is_same_v<C_shape, typename ducks::rt_shape::rt_16x16>) {
         mfma161632(d.data, a.data, b.data, c.data);
-    } else if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_32x32> && 
+    } else if constexpr (std::is_same_v<D_shape, typename ducks::rt_shape::rt_32x32> &&
                   A_rows == 32 && A_cols == 16 &&
                   B_rows == 16 && B_cols == 32 &&
                   std::is_same_v<C_shape, typename ducks::rt_shape::rt_32x32>) {
         mfma323216(d.data, a.data, b.data, c.data);
     } else {
         static_assert(false, "Unsupported shape combination");
     }
+#endif
 }
 
 /**

diff --git a/include/ops/warp/sched/sched.cuh b/include/ops/warp/sched/sched.cuh
@@ -28,14 +28,15 @@ namespace sched {
  *                as experimental / unsafe by default.
  */
 enum class mode : int {
-    normal  = 0,
-    full    = 1,
-    limited = 2,
+    normal          = 0,
+    full            = 1,
+    limited         = 2,
+    limited_nostall = 2 | (1 << 4), // limited + DISABLE_VALU_STALL (bit[4])
 };
 
-// `s_setreg_b32 hwreg(MODE_REG=1, offset=4, size=2), value`
-// Encoded simm16 = 1 | (4 << 6) | ((2-1) << 11) = 2305.
-constexpr int SCHED_MODE_HWREG_SIMM16 = 1 | (4 << 6) | (1 << 11);
+// `s_setreg_b32 hwreg(MODE_REG=1, offset=4, size=5), value`
+// 5 bits to cover bits [4:0] of SCHED_MODE (including DISABLE_VALU_STALL at bit[4]).
+constexpr int SCHED_MODE_HWREG_SIMM16 = 1 | (4 << 6) | ((5 - 1) << 11);
 
 /**
  * @brief Set the wave's SCHED_MODE to `m`.
@@ -104,12 +105,37 @@ __device__ __forceinline__ void sleep() {
     __builtin_amdgcn_s_sleep(N);
 }
 
+/**
+ * @brief Wake all sleeping waves in this workgroup.
+ *
+ * Lowers to `s_wakeup`. Use after a producer finishes work to wake consumer
+ * waves that are polling in `wait_barrier` via `s_sleep`.
+ */
+__device__ __forceinline__ void wakeup() {
+    asm volatile("s_wakeup" ::: "memory");
+}
+
+/**
+ * @brief Sleep the wave for a runtime-variable number of cycles.
+ *
+ * Lowers to `s_sleep_var`. Duration = SGPR[6:0] * 64 cycles.
+ */
+__device__ __forceinline__ void sleep_var(unsigned cycles_div64) {
+    asm volatile("s_sleep_var %0" :: "s"(cycles_div64));
+}
+
 /**
  * @brief Compiler-only scheduling fence.
  *
  * Tells the LLVM scheduler not to reorder instructions across this point
  * but emits no hardware op. Useful when constraining the compiler's WMMA
  * burst grouping without paying a runtime barrier.
+ *
+ * WMMA co-execution note: each WMMA takes 16 cycles. During this time, the
+ * SIMD can issue up to 8 independent VALU ops for free (1 per 2 cycles).
+ * Place address computation, format conversion, or accumulator scaling
+ * between WMMA instructions to exploit this. The compiler does this
+ * automatically when independent work is available in the same basic block.
  */
 __device__ __forceinline__ void compiler_fence() {
     __builtin_amdgcn_sched_barrier(0);