Skip to content

Commit 783f6ac

Browse files
committed
MMA vs WMMA distinction
1 parent 7b6238e commit 783f6ac

29 files changed

+1102
-347
lines changed

crates/cuda_std/src/warp/matrix/intrinsics.rs

Lines changed: 6 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -244,92 +244,12 @@ mod m16n16k16 {
244244
}
245245

246246
// ============= 16x8x16 shape =============
247+
// NOTE: m16n8k16 only supports MMA operations, not WMMA load/store
247248
mod m16n8k16 {
248-
// Load operations
249-
pub(crate) mod load {
250-
extern "C" {
251-
// f16 loads
252-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.row.stride.f16"]
253-
pub(crate) fn wmma_load_a_f16_row_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
254-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.col.stride.f16"]
255-
pub(crate) fn wmma_load_a_f16_col_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
256-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.row.stride.f16"]
257-
pub(crate) fn wmma_load_b_f16_row_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
258-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.col.stride.f16"]
259-
pub(crate) fn wmma_load_b_f16_col_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
260-
261-
// bf16 loads
262-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.row.stride.bf16"]
263-
pub(crate) fn wmma_load_a_bf16_row_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
264-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.col.stride.bf16"]
265-
pub(crate) fn wmma_load_a_bf16_col_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
266-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.row.stride.bf16"]
267-
pub(crate) fn wmma_load_b_bf16_row_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
268-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.col.stride.bf16"]
269-
pub(crate) fn wmma_load_b_bf16_col_m16n8k16(ptr: *const u8, stride: i32) -> [i16; 8];
270-
271-
// i8/u8 loads
272-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.row.stride.s8"]
273-
pub(crate) fn wmma_load_a_s8_row_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
274-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.col.stride.s8"]
275-
pub(crate) fn wmma_load_a_s8_col_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
276-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.row.stride.u8"]
277-
pub(crate) fn wmma_load_a_u8_row_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
278-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.a.sync.col.stride.u8"]
279-
pub(crate) fn wmma_load_a_u8_col_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
280-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.row.stride.s8"]
281-
pub(crate) fn wmma_load_b_s8_row_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
282-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.col.stride.s8"]
283-
pub(crate) fn wmma_load_b_s8_col_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
284-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.row.stride.u8"]
285-
pub(crate) fn wmma_load_b_u8_row_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
286-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.b.sync.col.stride.u8"]
287-
pub(crate) fn wmma_load_b_u8_col_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 2];
288-
289-
// Accumulator loads
290-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.c.sync.row.stride.f32"]
291-
pub(crate) fn wmma_load_c_f32_row_m16n8k16(ptr: *const u8, stride: i32) -> [f32; 4];
292-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.c.sync.col.stride.f32"]
293-
pub(crate) fn wmma_load_c_f32_col_m16n8k16(ptr: *const u8, stride: i32) -> [f32; 4];
294-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.c.sync.row.stride.s32"]
295-
pub(crate) fn wmma_load_c_s32_row_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 4];
296-
#[link_name = "llvm.nvvm.wmma.m16n8k16.load.c.sync.col.stride.s32"]
297-
pub(crate) fn wmma_load_c_s32_col_m16n8k16(ptr: *const u8, stride: i32) -> [i32; 4];
298-
}
299-
}
300-
301-
// Store operations
302-
pub(crate) mod store {
303-
extern "C" {
304-
// f32 stores
305-
#[link_name = "llvm.nvvm.wmma.m16n8k16.store.d.sync.row.stride.f32"]
306-
pub(crate) fn wmma_store_d_f32_row_m16n8k16(
307-
ptr: *mut u8,
308-
d0: f32, d1: f32, d2: f32, d3: f32,
309-
stride: i32,
310-
);
311-
#[link_name = "llvm.nvvm.wmma.m16n8k16.store.d.sync.col.stride.f32"]
312-
pub(crate) fn wmma_store_d_f32_col_m16n8k16(
313-
ptr: *mut u8,
314-
d0: f32, d1: f32, d2: f32, d3: f32,
315-
stride: i32,
316-
);
317-
318-
// i32 stores
319-
#[link_name = "llvm.nvvm.wmma.m16n8k16.store.d.sync.row.stride.s32"]
320-
pub(crate) fn wmma_store_d_s32_row_m16n8k16(
321-
ptr: *mut u8,
322-
d0: i32, d1: i32, d2: i32, d3: i32,
323-
stride: i32,
324-
);
325-
#[link_name = "llvm.nvvm.wmma.m16n8k16.store.d.sync.col.stride.s32"]
326-
pub(crate) fn wmma_store_d_s32_col_m16n8k16(
327-
ptr: *mut u8,
328-
d0: i32, d1: i32, d2: i32, d3: i32,
329-
stride: i32,
330-
);
331-
}
332-
}
249+
// No WMMA load operations - this shape only supports MMA
250+
// LLVM's NVVM intrinsic definitions (IntrinsicsNVVM.td) provide only MMA intrinsics for the m16n8k16 shape; no WMMA load/store intrinsics exist for it
251+
252+
// No WMMA store operations - this shape only supports MMA
333253

334254
// MMA operations
335255
pub(crate) mod mma {
@@ -784,8 +704,7 @@ pub(crate) use m16n16k16::load::*;
784704
pub(crate) use m16n16k16::store::*;
785705
pub(crate) use m16n16k16::mma::*;
786706

787-
pub(crate) use m16n8k16::load::*;
788-
pub(crate) use m16n8k16::store::*;
707+
// m16n8k16 has no load/store operations (MMA-only)
789708
pub(crate) use m16n8k16::mma::*;
790709

791710
pub(crate) use m32n8k16::load::*;

crates/cuda_std/src/warp/matrix/mod.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,24 @@ pub trait TensorCoreShape: sealed::Sealed {
4545
const K: usize;
4646
}
4747

48+
/// Shapes that support MMA compute operations
49+
#[diagnostic::on_unimplemented(
50+
message = "`{Self}` does not support MMA compute operations",
51+
label = "MMA compute not available for this shape",
52+
note = "This shape is not a valid tensor core shape for MMA operations"
53+
)]
54+
pub trait MmaShape: TensorCoreShape {}
55+
56+
/// Shapes that support full WMMA operations (load, store, in addition to compute)
57+
/// This trait extends MmaShape since WMMA shapes can do everything MMA shapes can do
58+
#[diagnostic::on_unimplemented(
59+
message = "`{Self}` does not support WMMA load/store operations",
60+
label = "WMMA load/store not available for this shape",
61+
note = "This shape only supports MMA compute operations, not WMMA load/store",
62+
note = "See LLVM source: https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/IR/IntrinsicsNVVM.td#L419-L1067"
63+
)]
64+
pub trait WmmaShape: MmaShape {}
65+
4866
// Only these exact combinations are valid for tensor cores
4967
impl TensorCoreShape for dims::Shape<16, 16, 16> {
5068
const M: usize = 16;
@@ -94,6 +112,28 @@ impl TensorCoreShape for dims::Shape<8, 8, 4> {
94112
const K: usize = 4;
95113
}
96114

115+
// All tensor core shapes support MMA compute operations
116+
impl MmaShape for dims::Shape<16, 16, 16> {}
117+
impl MmaShape for dims::Shape<32, 8, 16> {}
118+
impl MmaShape for dims::Shape<8, 32, 16> {}
119+
impl MmaShape for dims::Shape<16, 8, 16> {} // MMA-only (no WMMA load/store)
120+
impl MmaShape for dims::Shape<16, 16, 8> {}
121+
impl MmaShape for dims::Shape<8, 8, 32> {}
122+
impl MmaShape for dims::Shape<8, 8, 128> {}
123+
impl MmaShape for dims::Shape<8, 8, 4> {}
124+
125+
// Shapes that ALSO support WMMA load/store (in addition to MMA compute)
126+
impl WmmaShape for dims::Shape<16, 16, 16> {}
127+
impl WmmaShape for dims::Shape<32, 8, 16> {}
128+
impl WmmaShape for dims::Shape<8, 32, 16> {}
129+
// Note: 16x8x16 does NOT implement WmmaShape - it's MMA-only
130+
131+
// Other shapes support WMMA for their respective data types
132+
impl WmmaShape for dims::Shape<16, 16, 8> {} // TF32
133+
impl WmmaShape for dims::Shape<8, 8, 32> {} // i8/u8
134+
impl WmmaShape for dims::Shape<8, 8, 128> {} // i4/u4
135+
impl WmmaShape for dims::Shape<8, 8, 4> {} // f64
136+
97137
// ============================================================================
98138
// Layout Types
99139
// ============================================================================
@@ -306,6 +346,7 @@ where
306346
#[gpu_only]
307347
pub unsafe fn load<const STRIDE: usize>(&mut self, ptr: *const T)
308348
where
349+
Shape: WmmaShape, // Require WmmaShape for load operations
309350
StrideValidator<T, STRIDE>: ValidStride,
310351
T: ops::LoadMatrixA<Shape, L>,
311352
{
@@ -343,6 +384,7 @@ where
343384
#[gpu_only]
344385
pub unsafe fn load<const STRIDE: usize>(&mut self, ptr: *const T)
345386
where
387+
Shape: WmmaShape, // Require WmmaShape for load operations
346388
StrideValidator<T, STRIDE>: ValidStride,
347389
T: ops::LoadMatrixB<Shape, L>,
348390
{
@@ -379,6 +421,7 @@ where
379421
#[gpu_only]
380422
pub unsafe fn load<L, const STRIDE: usize>(&mut self, ptr: *const T)
381423
where
424+
Shape: WmmaShape, // Require WmmaShape for load operations
382425
L: Layout,
383426
StrideValidator<T, STRIDE>: ValidStride,
384427
T: ops::LoadMatrixC<Shape, L>,
@@ -398,6 +441,7 @@ where
398441
#[gpu_only]
399442
pub unsafe fn store<L, const STRIDE: usize>(&self, ptr: *mut T)
400443
where
444+
Shape: WmmaShape, // Require WmmaShape for store operations
401445
L: Layout,
402446
StrideValidator<T, STRIDE>: ValidStride,
403447
T: ops::StoreMatrixD<Shape, L>,

crates/cuda_std/src/warp/matrix/multi.rs

Lines changed: 1 addition & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ impl<T, Shape, L, const N: usize, const STRIDE: usize> BatchLoad<STRIDE>
247247
for FragmentArray<MatrixA<T, Shape, L>, N>
248248
where
249249
T: MatrixElement,
250-
Shape: TensorCoreShape,
250+
Shape: TensorCoreShape + crate::warp::matrix::WmmaShape, // Require WmmaShape for load
251251
L: Layout,
252252
crate::warp::matrix::StrideValidator<T, STRIDE>: crate::warp::matrix::ValidStride,
253253
T: crate::warp::matrix::ops::LoadMatrixA<Shape, L>,
@@ -360,63 +360,3 @@ pub type FragmentRow<T, const N: usize> = FragmentArray<T, N>;
360360

361361
/// Single column of fragments
362362
pub type FragmentCol<T, const N: usize> = FragmentArray<T, N>;
363-
364-
// ============================================================================
365-
// Example Usage
366-
// ============================================================================
367-
368-
#[cfg(test)]
369-
mod examples {
370-
use super::*;
371-
use crate::warp::matrix::dims;
372-
use half::f16;
373-
374-
/// Example: Flash Attention with idiomatic Rust patterns
375-
fn flash_attention_example() {
376-
// Use const generics for compile-time guarantees
377-
const TILE_M: usize = 2;
378-
const TILE_N: usize = 8;
379-
380-
type Shape = dims::Shape<16, 8, 16>;
381-
382-
// Create tiled fragments with type inference
383-
let mut q_tiles: FragmentGrid<MatrixA<f16, Shape, Row>, TILE_M, TILE_N> =
384-
Default::default();
385-
let mut k_tiles: FragmentGrid<MatrixB<f16, Shape, Row>, TILE_M, TILE_N> =
386-
Default::default();
387-
let mut acc_tiles: FragmentGrid<Accumulator<f32, Shape>, TILE_M, TILE_N> =
388-
Default::default();
389-
390-
// Load with iterator pattern
391-
for (i, tile) in q_tiles.iter_mut().enumerate() {
392-
// tile.load(ptr.offset(i * stride), stride);
393-
}
394-
395-
// Use indexing for specific tiles
396-
let tile_0_0 = &q_tiles[(0, 0)];
397-
398-
// Map operations are zero-cost
399-
let scaled_tiles = acc_tiles.map(|tile| {
400-
// Scale each tile
401-
tile
402-
});
403-
}
404-
405-
/// Example: Using the builder pattern
406-
fn builder_example() {
407-
type Shape = dims::Shape<16, 16, 16>;
408-
409-
// Type inference makes this clean
410-
let a_tiles = FragmentBuilder::<f16, Shape>::array::<4>();
411-
let b_tiles = FragmentBuilder::<f16, Shape>::grid::<2, 2>();
412-
}
413-
414-
/// Example: Pattern matching for optimization
415-
fn pattern_matching_example() {
416-
// Compiler can optimize based on const values
417-
const PATTERN: TilePattern<2, 4, 2> = TilePattern;
418-
419-
type Shape = dims::Shape<16, 16, 16>;
420-
let (a, b, c) = PATTERN.create_fragments::<f16, f32, Shape>();
421-
}
422-
}

0 commit comments

Comments (0)