huawei-csl · MirkoDeVita98 · Apr 13, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 26, 2026
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -16,8 +16,8 @@ RUN pip install --no-cache-dir \
     ipython jupyterlab matplotlib pandas
 
 # certain operations need latest isa header, not CANN 8.5.0 default
-# header on 2026/03/16
-ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613
+# header on 2026/03/24
+ARG PTOISA_COMMIT=febd8a15a9dc03f87b6aa293c3ab66a67b6e80af
 WORKDIR /sources
 RUN git clone https://gitcode.com/cann/pto-isa.git \
     && cd pto-isa && git checkout $PTOISA_COMMIT
@@ -29,10 +29,10 @@ ARG CACHE_BURST=1
 
 # ARG ARCH=x86_64
 ARG ARCH=aarch64
-ARG RELEASE_REPO=zhangstevenunity/PTOAS
-ARG RELEASE_VER=0.9
-ARG RELEASE_TAG=v${RELEASE_VER}
-ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl
+ARG RELEASE_REPO=huawei-csl/PTOAS
+ARG RELEASE_VER=20260327
+ARG RELEASE_TAG=${RELEASE_VER}
+ARG WHEEL_NAME=ptoas-0.18-cp311-none-manylinux_2_34_${ARCH}.whl
 ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz
 
 WORKDIR /installers/

diff --git a/examples/aot/tpushpop/.gitignore b/examples/aot/tpushpop/.gitignore
@@ -0,0 +1 @@
+msprof_res/
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md
@@ -0,0 +1,17 @@
+# Bidirectional `TPUSH`/`TPOP` MLIR Example
+
+This example mirrors the `mix-kernel_cpp` flow, but starts from
+[`bidirectional_example.mlir`](/home/fskogh/pto-dsl/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir).
+
+The pipeline is:
+
+1. run `ptoas --pto-arch=a3 bidirectional_example.mlir > build_artifacts/bidirectional_example.cpp`
+2. compile the generated C++ together with `caller.cpp`
+3. build `./tpushpop_mlir_lib.so`
+4. launch the generated `pto.entry` kernel from Python
+
+## Run
+
+```bash
+python run_bidirectional_example.py
+```
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
@@ -0,0 +1,108 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+
+def meta_data():
+    dtype = pto.float32
+    ptr_ty = pto.PtrType(dtype)
+    i32 = pto.int32
+    tensor_ty = pto.TensorType(rank=2, dtype=dtype)
+    tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
+    x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
+    x_left_ty = pto.TileBufType(
+        shape=[16, 16],
+        dtype=dtype,
+        memory_space="LEFT",
+        config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"),
+    )
+    x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
+    acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
+    recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
+    return locals()
+
+
+@to_ir_module(meta_data=meta_data, module=True)
+def module():
+    @pto.func(kernel="cube")
+    def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        c2v_import = pto.import_reserved_buffer(
+            name="c2v_fifo",
+            peer_func="@vector_kernel",
+        )
+
+        pto.aic_initialize_pipe(
+            dir_mask=1,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_import,
+            v2c_consumer_buf=c0_i32,
+        )
+
+        x_mat_tile = pto.alloc_tile(x_mat_ty)
+        x_left_tile = pto.alloc_tile(x_left_ty)
+        x_right_tile = pto.alloc_tile(x_right_ty)
+        acc_tile = pto.alloc_tile(acc_ty)
+
+        gm_x_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_x,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        pto.load(gm_x_tile_view, x_mat_tile)
+        tile.mov(x_mat_tile, x_left_tile)
+        tile.mov(x_mat_tile, x_right_tile)
+        tile.matmul(x_left_tile, x_right_tile, acc_tile)
+        pto.tpush_to_aiv(acc_tile, 0)
+
+    @pto.func(kernel="vector")
+    def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC")
+
+        pto.aiv_initialize_pipe(
+            dir_mask=1,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_local,
+            v2c_consumer_buf=c0_i32,
+        )
+
+        gm_y_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_y,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        pto.store(pto.tpop_from_aic(recv_ty, 0), gm_y_tile_view)
+        pto.tfree_from_aic(0)
+
+    @pto.func(entry=True)
+    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.call(cube_kernel, gm_slot_buffer, gm_x)
+        pto.call(vector_kernel, gm_slot_buffer, gm_y)
+
+
+if __name__ == "__main__":
+    print(module)
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
@@ -0,0 +1,89 @@
+// Bidirectional pipe example.
+//
+// This reduced version only uses the C2V pipe:
+// - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel`
+//
+// `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import
+// ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe`
+// binds those FIFO endpoints to the shared GM slot buffer plus each side's local
+// consumer buffer.
+//
+// End-to-end data flow:
+// - Cube loads one input matrix `X` from GM.
+// - Cube computes `Y = X @ X`.
+// - Cube sends that accumulator tile to vector over `c2v_fifo`.
+// - Vector pops the tile and stores it to GM as output matrix `Y`.
+//
+// What is transferred:
+// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile `Y = X @ X`
+//   sent with `pto.tpush_to_aiv` using `split = 0` (no split). Vector receives
+//   that same logical `16 x 16` tile with `pto.tpop_from_aic` in a vector tile
+//   type/layout, then stores it to the GM output buffer.
+//
+// Shape summary:
+// - All transferred tiles are `rows=16, cols=16, dtype=f32`
+// - Cube-produced C2V tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
+// - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box`
+module {
+
+  func.func @call_both(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>) attributes {pto.entry} {
+    func.call @cube_kernel(%gm_slot_buffer, %gm_x) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
+    func.call @vector_kernel(%gm_slot_buffer, %gm_y) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
+    return
+  }
+
+  func.func @cube_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c2v_import = pto.import_reserved_buffer {
+      name = "c2v_fifo",
+      peer_func = @vector_kernel
+    } -> i32
+    %c0_i32 = arith.constant 0 : i32
+    pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
+       c2v_consumer_buf = %c2v_import : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %x_mat_tile = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %x_left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %x_right_tile = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %gm_x_view = pto.make_tensor_view %gm_x, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %gm_x_tile_view = pto.partition_view %gm_x_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    pto.tload ins(%gm_x_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_right_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    pto.tmatmul ins(%x_left_tile, %x_right_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    return
+  }
+
+  func.func @vector_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c2v_local = pto.reserve_buffer {
+      name = "c2v_fifo",
+      size = 4096,
+      location = #pto.address_space<vec>,
+      auto = true
+    } -> i32
+    %c0_i32 = arith.constant 0 : i32
+    pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
+       c2v_consumer_buf = %c2v_local : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %gm_y_view = pto.make_tensor_view %gm_y, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %gm_y_tile_view = pto.partition_view %gm_y_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %recv_tile = pto.tpop_from_aic {split = 0}
+      -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tstore ins(%recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gm_y_tile_view : !pto.partition_tensor_view<16x16xf32>)
+    pto.tfree_from_aic {split = 0}
+    return
+  }
+
+}
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
@@ -0,0 +1,17 @@
+#ifndef KERNEL_CPP
+#error "KERNEL_CPP must be defined at compile time."
+#endif
+
+#include <cstdint>
+
+#include KERNEL_CPP
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *gmSlotBuffer,
+    uint8_t *x,
+    uint8_t *y)
+{
+    call_both<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer, (__gm__ float *)x, (__gm__ float *)y);
+}
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
+MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir"
+GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp"
+LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
+
+mkdir -p "${ARTIFACT_DIR}"
+rm -f "${GENERATED_CPP}" "${LIB_PATH}"
+
+MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir"
+python bidirectional_builder.py > bidir_gen.mlir
+ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
+
+#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
+
+
+
+bisheng \
+    -I/sources/pto-isa/include/ \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 -g \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"${GENERATED_CPP}\"" \
+    "${SCRIPT_DIR}/caller.cpp" \
+    -o "${LIB_PATH}"
+
+echo "Generated ${GENERATED_CPP}."
+echo "Built ${LIB_PATH}."