hw-native-sys · HecreReed · May 11, 2026 · May 11, 2026 · gemini-code-assist · May 11, 2026
diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp
@@ -137,6 +137,8 @@ static LogicalResult verifyMatTileOperandsA2A3(Operation *op, Type lhsTy,
                                                Type rhsTy, Type dstTy);
 static LogicalResult verifyMatTileOperandsA5(Operation *op, Type lhsTy,
                                              Type rhsTy, Type dstTy);
+static LogicalResult verifyMadTileLayoutsA5(Operation *op, Type lhsTy,
+                                            Type rhsTy, Type dstTy);
 static LogicalResult verifyGemvTileOperands(Operation *op, Type lhsTy, Type rhsTy,
                                             Type dstTy);
 static LogicalResult verifyGemvTileOperandsA2A3(Operation *op, Type lhsTy,
@@ -3131,6 +3133,11 @@ static LogicalResult verifyMatTileOperandsA5(Operation *op, Type lhsTy,
   if (failed(verifyMatTileOperandsA2A3(op, lhsTy, rhsTy, dstTy)))
     return failure();
 
+  return verifyMadTileLayoutsA5(op, lhsTy, rhsTy, dstTy);
+}
+
+static LogicalResult verifyMadTileLayoutsA5(Operation *op, Type lhsTy,
+                                            Type rhsTy, Type dstTy) {
   auto lhsTb = mlir::dyn_cast<pto::TileBufType>(lhsTy);
   auto rhsTb = mlir::dyn_cast<pto::TileBufType>(rhsTy);
   auto dstTb = mlir::dyn_cast<pto::TileBufType>(dstTy);
@@ -3204,7 +3211,7 @@ static LogicalResult verifyGemvTileOperandsA5(Operation *op, Type lhsTy,
                                               Type rhsTy, Type dstTy) {
   if (failed(verifyGemvTileOperandsA2A3(op, lhsTy, rhsTy, dstTy)))
     return failure();
-  return verifyMatTileOperandsA5(op, lhsTy, rhsTy, dstTy);
+  return verifyMadTileLayoutsA5(op, lhsTy, rhsTy, dstTy);
 }
 
 static LogicalResult verifyGemvTileOperands(Operation *op, Type lhsTy, Type rhsTy,

diff --git a/test/lit/pto/issue226_remove_redundant_pipe_pair.pto b/test/lit/pto/issue226_remove_redundant_pipe_pair.pto
@@ -15,15 +15,15 @@
 
 module {
   func.func @remove_redundant_pipe_pair(
-      %arg0: memref<64x1xf16, strided<[1, 1]>, #pto.address_space<vec>>) {
+      %arg0: !pto.ptr<f16>) {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
     %c64 = arith.constant 64 : index
-    %vbuf0 = pto.bind_tile %arg0, %c64, %c1
-      {config = #pto.tile_buf_config<blayout=1 : i32, slayout=2 : i32, s_fractal_size=512, pad=0 : i32>}
-      : memref<64x1xf16, strided<[1, 1]>, #pto.address_space<vec>>
-     -> memref<64x1xf16, strided<[1, 1], offset: ?>, #pto.address_space<vec>>
+    %vview = pto.make_tensor_view %arg0, shape = [%c64, %c1], strides = [%c1, %c1]
+      : !pto.tensor_view<64x1xf16>
+    %vbuf0 = pto.partition_view %vview, offsets = [%c0, %c0], sizes = [%c64, %c1]
+      : !pto.tensor_view<64x1xf16> -> !pto.partition_tensor_view<64x1xf16>
 
     pto.section.cube {
       %mat_a = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f16, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=512, pad=0>
@@ -33,7 +33,7 @@ module {
       %acc = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
 
       scf.for %i = %c0 to %c2 step %c1 {
-        pto.tload ins(%vbuf0 : memref<64x1xf16, strided<[1, 1], offset: ?>, #pto.address_space<vec>>)
+        pto.tload ins(%vbuf0 : !pto.partition_tensor_view<64x1xf16>)
             outs(%mat_a : !pto.tile_buf<loc=mat, dtype=f16, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
         pto.tmov ins(%mat_a : !pto.tile_buf<loc=mat, dtype=f16, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
             outs(%left : !pto.tile_buf<loc=left, dtype=f16, rows=32, cols=32, v_row=32, v_col=32, blayout=row_major, slayout=row_major, fractal=512, pad=0>)

diff --git a/test/lit/pto/tgemv_a5_aligned_acc_rows.pto b/test/lit/pto/tgemv_a5_aligned_acc_rows.pto
@@ -0,0 +1,24 @@
+// RUN: ptoas --pto-arch=a5 %s | FileCheck %s
+// Allocates left/right/bias/acc tiles, then issues pto.tgemv, pto.tgemv.acc and pto.tgemv.bias on them.
+module {
+  func.func @tgemv_a5_aligned_acc_rows() {
+    %lhs = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f16, rows=1, cols=64, v_row=1, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %rhs = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f16, rows=64, cols=80, v_row=64, v_col=80, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    %bias = pto.alloc_tile : !pto.tile_buf<loc=bias, dtype=f32, rows=1, cols=80, v_row=1, v_col=80, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %acc_in = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %dst0 = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %dst1 = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %dst2 = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    // Every acc tile is rows=80 with v_row=1 while lhs is rows=1; presumably this row mismatch is the case under test (TODO confirm).
+    pto.tgemv ins(%lhs, %rhs : !pto.tile_buf<loc=left, dtype=f16, rows=1, cols=64, v_row=1, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f16, rows=64, cols=80, v_row=64, v_col=80, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%dst0 : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tgemv.acc ins(%acc_in, %lhs, %rhs : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>, !pto.tile_buf<loc=left, dtype=f16, rows=1, cols=64, v_row=1, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f16, rows=64, cols=80, v_row=64, v_col=80, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%dst1 : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tgemv.bias ins(%lhs, %rhs, %bias : !pto.tile_buf<loc=left, dtype=f16, rows=1, cols=64, v_row=1, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f16, rows=64, cols=80, v_row=64, v_col=80, blayout=row_major, slayout=col_major, fractal=512, pad=0>, !pto.tile_buf<loc=bias, dtype=f32, rows=1, cols=80, v_row=1, v_col=80, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%dst2 : !pto.tile_buf<loc=acc, dtype=f32, rows=80, cols=80, v_row=1, v_col=80, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+
+    return
+  }
+}
+
+// CHECK-LABEL: __global__ AICORE void tgemv_a5_aligned_acc_rows()
+// CHECK: TGEMV(
+// CHECK: TGEMV_ACC(
+// CHECK: TGEMV_BIAS(