
Commit 658f34f

Checkpoint after trying to do loading
1 parent 783f6ac commit 658f34f

File tree

62 files changed: +2714 -398 lines changed


crates/cuda_std/src/warp/matrix/intrinsics.rs

Lines changed: 43 additions & 0 deletions
@@ -723,3 +723,46 @@

```rust
pub(crate) use m16n16k8::convert::*;
pub(crate) use m16n16k8::load::*;
pub(crate) use m16n16k8::store::*;
pub(crate) use m16n16k8::mma::*;

// ============= ldmatrix intrinsics =============
// These load matrix fragments from shared memory for MMA operations.
pub(crate) mod ldmatrix {
    #[allow(dead_code)]
    extern "C" {
        // 8x8 matrix with 16-bit elements (bf16/f16)
        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16"]
        pub(crate) fn ldmatrix_m8n8_x1_b16(ptr: *const u8) -> i32;

        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.b16"]
        pub(crate) fn ldmatrix_m8n8_x2_b16(ptr: *const u8) -> [i32; 2];

        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.b16"]
        pub(crate) fn ldmatrix_m8n8_x4_b16(ptr: *const u8) -> [i32; 4];

        // With transpose
        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.trans.b16"]
        pub(crate) fn ldmatrix_m8n8_x1_trans_b16(ptr: *const u8) -> i32;

        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x2.trans.b16"]
        pub(crate) fn ldmatrix_m8n8_x2_trans_b16(ptr: *const u8) -> [i32; 2];

        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m8n8.x4.trans.b16"]
        pub(crate) fn ldmatrix_m8n8_x4_trans_b16(ptr: *const u8) -> [i32; 4];

        // 16x16 matrix with 8-bit elements
        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m16n16.x1.b8"]
        pub(crate) fn ldmatrix_m16n16_x1_b8(ptr: *const u8) -> [i32; 2];

        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m16n16.x2.b8"]
        pub(crate) fn ldmatrix_m16n16_x2_b8(ptr: *const u8) -> [i32; 4];

        // 16x16 with transpose (mandatory for 16x16)
        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m16n16.x1.trans.b8"]
        pub(crate) fn ldmatrix_m16n16_x1_trans_b8(ptr: *const u8) -> [i32; 2];

        #[link_name = "llvm.nvvm.ldmatrix.sync.aligned.m16n16.x2.trans.b8"]
        pub(crate) fn ldmatrix_m16n16_x2_trans_b8(ptr: *const u8) -> [i32; 4];
    }
}

pub(crate) use ldmatrix::*;
```
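For orientation (not part of this commit): a minimal caller sketch for the x2 variant. The wrapper name is hypothetical, and the per-lane addressing is only alluded to; ldmatrix requires each participating lane to supply the shared-memory address of one matrix row.

```rust
// Hypothetical wrapper, for illustration only: load two 8x8 b16 tiles.
// `smem_row_ptr` must be a shared-memory address chosen per the ldmatrix
// per-lane addressing contract. Each returned i32 packs two 16-bit elements.
#[inline(always)]
pub(crate) unsafe fn load_two_b16_tiles(smem_row_ptr: *const u8) -> [i32; 2] {
    ldmatrix_m8n8_x2_b16(smem_row_ptr)
}
```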
Lines changed: 113 additions & 0 deletions

@@ -0,0 +1,113 @@

# LdMatrix Integration Design

## Overview

`ldmatrix` provides a way to load matrix data from shared memory into registers formatted for MMA operations. This enables shared-memory loading for MMA-only shapes such as Shape<16, 8, 16>.
## Trait Hierarchy

```rust
// Base trait - all shapes support MMA compute
pub trait MmaShape: TensorCoreShape {
    // MMA compute operations
}

// Shapes that support loading from shared memory
pub trait LdMatrixShape: MmaShape {
    const LDMATRIX_SHAPE: LdMatrixShapeType;
    const LDMATRIX_NUM: LdMatrixNum;
}

// Shapes that support WMMA load/store from global memory
pub trait WmmaShape: MmaShape {
    // WMMA global memory operations
}

// Some shapes implement both!
impl LdMatrixShape for Shape<16, 16, 16> { ... }
impl WmmaShape for Shape<16, 16, 16> { ... }

// Shape<16, 8, 16> only supports ldmatrix (not WMMA)
impl LdMatrixShape for Shape<16, 8, 16> {
    const LDMATRIX_SHAPE: LdMatrixShapeType = LdMatrixShapeType::M8N8;
    const LDMATRIX_NUM: LdMatrixNum = LdMatrixNum::X2; // Need 2 matrices for 16x8
}
```
## API Methods

```rust
impl<T, Shape, L> MatrixA<T, Shape, L>
where
    T: MatrixElement,
    Shape: MmaShape + FragmentSize<T>,
    L: Layout,
{
    /// Available for all MMA shapes - register initialization
    pub fn from_array(values: [T; Shape::A_REGISTERS]) -> Self { ... }
    pub fn splat(value: T) -> Self { ... }
}

impl<T, Shape, L> MatrixA<T, Shape, L>
where
    T: MatrixElement,
    Shape: LdMatrixShape + FragmentSize<T>,
    L: Layout,
{
    /// Load from shared memory using ldmatrix
    pub unsafe fn load_shared(&mut self, ptr: *const T, stride: usize) {
        // Use the ldmatrix intrinsic
    }
}

impl<T, Shape, L> MatrixA<T, Shape, L>
where
    T: MatrixElement,
    Shape: WmmaShape + FragmentSize<T>,
    L: Layout,
{
    /// Load from global memory using WMMA
    pub unsafe fn load<const STRIDE: usize>(&mut self, ptr: *const T) {
        // Use the WMMA load intrinsic
    }
}
```
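One plausible body for `load_shared`, sketched for the Shape<16, 8, 16> (X2, b16) case only: `regs` is an assumed field holding the fragment's two packed registers, and the lane-to-row mapping is illustrative rather than a verified ldmatrix address scheme.

```rust
// Sketch for the LdMatrixShape impl above, specialized to X2 / b16.
// Assumptions: `self.regs: [i32; 2]`, `stride` counts elements of T, and
// lanes 0..16 each supply one 8-element row (two 8x8 tiles = 16 rows).
pub unsafe fn load_shared(&mut self, ptr: *const T, stride: usize) {
    let lane = (cuda_std::thread::thread_idx_x() % 32) as usize;
    let row = lane % 16;
    // Each participating lane passes the shared-memory address of one row.
    let row_ptr = ptr.add(row * stride) as *const u8;
    self.regs = intrinsics::ldmatrix_m8n8_x2_b16(row_ptr);
}
```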
## Usage Examples

```rust
// Shape<16, 8, 16> - MMA + ldmatrix (no WMMA)
type Shape = dims::Shape<16, 8, 16>;

// Can use the register API
let a = MatrixA::<bf16, Shape, Row>::from_array([bf16::ZERO; 8]);

// Can use the shared memory load
// (shared_array! is cuda_std's shared-memory declaration macro)
let smem = cuda_std::shared_array![bf16; 1024];
let mut b = MatrixA::<bf16, Shape, Row>::new();
b.load_shared(smem, 16); // OK - has LdMatrixShape

// Cannot use the global memory load
// b.load::<16>(global_ptr); // ERROR: Shape doesn't implement WmmaShape

// Shape<16, 16, 16> supports all three APIs
// (aliased as FullShape to avoid clashing with the WmmaShape trait name)
type FullShape = dims::Shape<16, 16, 16>;

let mut c = MatrixA::<bf16, FullShape, Row>::new();
c.load_shared(smem, 16); // OK - has LdMatrixShape
c.load::<16>(global_ptr); // OK - has WmmaShape
```
## Implementation Strategy

1. Define ldmatrix intrinsics for each shape/type combination
2. Implement the LdMatrixShape trait for applicable shapes (sketched below)
3. Add load_shared methods conditionally based on LdMatrixShape
4. Update documentation to explain the memory hierarchy
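A sketch of step 2 for the 16x16x16 shape. The constant values are assumptions that follow the pattern of the Shape<16, 8, 16> impl above: a 16x16 tile decomposes into four 8x8 matrices.

```rust
// Assumed values, not yet in the codebase: 16x16 = four 8x8 tiles => X4.
impl LdMatrixShape for Shape<16, 16, 16> {
    const LDMATRIX_SHAPE: LdMatrixShapeType = LdMatrixShapeType::M8N8;
    const LDMATRIX_NUM: LdMatrixNum = LdMatrixNum::X4;
}
```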
## Benefits

1. **Type-safe**: Compile-time enforcement of what each shape supports
2. **Ergonomic**: Same API style, with different method names for different memory sources
3. **Zero-cost**: All resolved at compile time
4. **Clear semantics**: Method names indicate the memory source
5. **Flexible**: Shapes can support multiple loading strategies
Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@

# MMA-Only Shape API Design

## Problem

Shape<16, 8, 16> supports MMA compute but not WMMA load/store. We need an ergonomic way to:

1. Get data into fragments for MMA-only shapes
2. Keep the API consistent with WMMA shapes where possible
3. Make it compile-time safe with zero runtime cost

## Proposed Solution
### 1. Register-based construction (works for ALL shapes)

```rust
impl<T, Shape, L> MatrixA<T, Shape, L>
where
    T: MatrixElement,
    Shape: MmaShape, // Note: only requires MmaShape, not WmmaShape
    L: Layout,
{
    /// Create from register values directly.
    /// This works for both WMMA and MMA-only shapes.
    pub fn from_registers(values: &[T]) -> Self {
        // Implementation details
    }

    /// Fill with a single value (broadcast)
    pub fn splat(value: T) -> Self {
        // Implementation details
    }
}
```
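The bodies are left as stubs here; one plausible implementation, as a sketch only, assuming the fragment stores its values in an array field `regs` sized by a `Shape::A_REGISTERS` constant (both names are assumptions, reused from the companion LdMatrix design notes):

```rust
// Sketch only; the `regs` field and `Shape::A_REGISTERS` are assumed names.
pub fn from_registers(values: &[T]) -> Self {
    assert_eq!(values.len(), Shape::A_REGISTERS);
    let mut regs = [values[0]; Shape::A_REGISTERS];
    regs.copy_from_slice(values);
    Self { regs, _layout: core::marker::PhantomData }
}

/// Broadcast one value into every register slot.
pub fn splat(value: T) -> Self {
    Self { regs: [value; Shape::A_REGISTERS], _layout: core::marker::PhantomData }
}
```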
### 2. Conditional load method (only available for WMMA shapes)

```rust
impl<T, Shape, L> MatrixA<T, Shape, L>
where
    Shape: WmmaShape, // Only for WMMA-capable shapes
{
    pub unsafe fn load<const STRIDE: usize>(&mut self, ptr: *const T) { ... }
}
```
### 3. Builder pattern for complex initialization

```rust
impl<T, Shape, L> MatrixA<T, Shape, L>
where
    Shape: MmaShape,
{
    pub fn builder() -> MatrixBuilder<T, Shape, L> { ... }
}

pub struct MatrixBuilder<T, Shape, L> { ... }

impl<T, Shape, L> MatrixBuilder<T, Shape, L> {
    pub fn set_lane(self, lane: usize, value: T) -> Self { ... }
    pub fn set_row(self, row: usize, values: &[T]) -> Self { ... }
    pub fn build(self) -> MatrixA<T, Shape, L> { ... }
}
```
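A possible shape for the builder's internals (a sketch; the `values` buffer, the `Shape::A_REGISTERS` size constant, and the bounds are assumptions, and `set_row` plus the `builder()` constructor are elided):

```rust
// Sketch only: accumulate values locally, construct the fragment on build.
pub struct MatrixBuilder<T: MatrixElement, Shape: MmaShape + FragmentSize<T>, L> {
    values: [T; Shape::A_REGISTERS],
    _marker: core::marker::PhantomData<(Shape, L)>,
}

impl<T: MatrixElement, Shape: MmaShape + FragmentSize<T>, L: Layout> MatrixBuilder<T, Shape, L> {
    pub fn set_lane(mut self, lane: usize, value: T) -> Self {
        self.values[lane] = value;
        self
    }

    pub fn build(self) -> MatrixA<T, Shape, L> {
        MatrixA::from_registers(&self.values)
    }
}
```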
### 4. Associated types for fragment size

```rust
pub trait MmaShape: TensorCoreShape {
    type FragmentA<T: MatrixElement>: AsRef<[T::Storage]>;
    type FragmentB<T: MatrixElement>: AsRef<[T::Storage]>;
    type FragmentC<T: AccumulatorElement>: AsRef<[T::Storage]>;
}
```
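A concrete impl could then pin the fragment arrays to fixed per-lane lengths. The lengths below are assumptions, chosen to match the 8-element A-fragment used in the examples in this doc:

```rust
// Sketch: per-lane fragment sizes for Shape<16, 8, 16> (assumed values).
impl MmaShape for Shape<16, 8, 16> {
    type FragmentA<T: MatrixElement> = [T::Storage; 8];
    type FragmentB<T: MatrixElement> = [T::Storage; 4];
    type FragmentC<T: AccumulatorElement> = [T::Storage; 4];
}
```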
## Usage Examples

### For WMMA shapes (unchanged):

```rust
// Traditional WMMA load still works
let mut a_frag = tc.matrix_a();
a_frag.load::<16>(data_ptr);
```

### For MMA-only shapes:

```rust
// Shape<16, 8, 16> - MMA only
type Shape = dims::Shape<16, 8, 16>;
let tc = TensorCore::<bf16, Shape>::new();

// Option 1: From registers
let values = [bf16::from_f32(1.0); 8];
let a_frag = MatrixA::from_registers(&values);

// Option 2: Splat
let b_frag = MatrixB::splat(bf16::from_f32(2.0));

// Option 3: Builder
let c_frag = Accumulator::builder()
    .set_lane(0, 1.0)
    .set_lane(1, 2.0)
    .build();

// MMA operations work the same way
let result = c_frag.mma(&a_frag, &b_frag);
```
## Benefits

1. **Ergonomic**: Similar API for both WMMA and MMA-only shapes
2. **Type-safe**: Compile-time errors if you try to `load` on MMA-only shapes
3. **Zero-cost**: All resolved at compile time
4. **Intuitive**: Methods clearly indicate the data source (registers vs. memory)
5. **Flexible**: Multiple ways to construct fragments based on needs
