huawei-csl · MirkoDeVita98 · Apr 13, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 26, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -50,7 +50,7 @@ jobs:
       RELEASE_VER: 0.24
       RELEASE_TAG: v0.24
       CLI_DIR: /installers/ptoas-cli
-      PTOISA_COMMIT: 2ee948ef636863ed149f176d5327d9db5f349bb6
+      PTOISA_COMMIT: a8c3fbf42a2f4a0f609f64e138dda62deefddb8e
 
     steps:
       - name: Install system packages

diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@ __pycache__
 extra-info
 
 *.ptodsl_jit
+
+msprof_res/
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -16,8 +16,8 @@ RUN pip install --no-cache-dir \
     ipython jupyterlab matplotlib pandas
 
 # certain operations need latest isa header, not CANN 8.5.0 default
-# header on 2026/04/01
-ARG PTOISA_COMMIT=2ee948ef636863ed149f176d5327d9db5f349bb6
+# header on 2026/04/08
+ARG PTOISA_COMMIT=a8c3fbf42a2f4a0f609f64e138dda62deefddb8e
 WORKDIR /sources
 RUN git clone https://gitcode.com/cann/pto-isa.git \
     && cd pto-isa && git checkout $PTOISA_COMMIT

diff --git a/examples/aot/tpushpop/.gitignore b/examples/aot/tpushpop/.gitignore
@@ -0,0 +1 @@
+build_artifacts/
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md
@@ -0,0 +1,80 @@
+# TPush / TPop mixed-kernel examples
+
+Small examples of tile FIFO communication between Cube (`AIC`) and Vector (`AIV`).
+
+```bash
+python run.py c2v
+python run.py v2c
+python run.py bidi
+```
+
+`python run.py` defaults to `c2v`.
+
+Files:
+
+- `kernels/` has the Python builders.
+- `build_artifacts/` gets generated MLIR, generated C++, and the `.so`.
+- `gm_slot_buffer` is the GM backing store for the pipe.
+- `caller.cpp` sets the FFTS base before launching the generated kernel.
+
+Core idea:
+
+- `aic_initialize_pipe` / `aiv_initialize_pipe` lower to matching `TPipe<...>` objects.
+- `gm_slot_buffer` is the shared GM slot memory used by that `TPipe`.
+- `tpush_to_aiv` / `tpush_to_aic` lower to `TPUSH(pipe, tile)`.
+- `tpop_from_aic` / `tpop_from_aiv` lower to `TPOP(pipe, tile)`.
+- `tfree_from_aic` / `tfree_from_aiv` lower to `TFREE(pipe)` and release the consumed slot.
+
+## C2V
+
+Cube sends. Vector receives.
+
+This example computes `X @ X` on Cube, sends the accumulator tile to Vector, then Vector stores it to GM.
+
+```text
+Cube:   load X -> matmul -> tpush_to_aiv
+Vector: tpop_from_aic -> store Y -> tfree_from_aic
+```
+
+Pipe wiring:
+
+- Vector owns the consumer buffer: `reserve_buffer("c2v_fifo", location="VEC")`
+- Cube imports it: `import_reserved_buffer("c2v_fifo", peer_func="@vector_kernel")`
+- Both sides initialize with `dir_mask = 1`
+
+## V2C
+
+Vector sends. Cube receives.
+
+This example loads `X` on Vector, sends that tile to Cube, then Cube stores it to GM.
+
+```text
+Vector: load X -> tpush_to_aic
+Cube:   tpop_from_aiv -> store Y -> tfree_from_aiv
+```
+
+Pipe wiring:
+
+- Cube owns the consumer buffer: `reserve_buffer("v2c_fifo", location="MAT")`
+- Vector imports it: `import_reserved_buffer("v2c_fifo", peer_func="@cube_kernel")`
+- Both sides initialize with `dir_mask = 2`
+
+## BIDI
+
+Both directions are enabled.
+
+This example sends `X @ X` from Cube to Vector. Vector doubles it and sends it back. Cube receives the returned tile and stores it to GM.
+
+```text
+Cube:   matmul -> tpush_to_aiv
+Vector: tpop_from_aic -> add -> tpush_to_aic -> tfree_from_aic
+Cube:   tpop_from_aiv -> store Y -> tfree_from_aiv
+```
+
+Pipe wiring:
+
+- Vector reserves `c2v_fifo`; Cube imports it
+- Cube reserves `v2c_fifo`; Vector imports it
+- Both sides initialize with `dir_mask = 3`
+
+For `dir_mask = 3`, allocate FIFO backing for both directions. `run.py` uses `8 KiB`.
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v.mlir b/examples/aot/tpushpop/mix-kernel_mlir/c2v.mlir
@@ -0,0 +1,89 @@
+// Bidirectional pipe example.
+//
+// This reduced version only uses the C2V pipe:
+// - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel`
+//
+// `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import
+// ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe`
+// binds those FIFO endpoints to the shared GM slot buffer plus each side's local
+// consumer buffer.
+//
+// End-to-end data flow:
+// - Cube loads one input matrix `X` from GM.
+// - Cube computes `Y = X @ X`.
+// - Cube sends that accumulator tile to vector over `c2v_fifo`.
+// - Vector pops the tile and stores it to GM as output matrix `Y`.
+//
+// What is transferred:
+// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile `Y = X @ X`
+//   sent with `pto.tpush_to_aiv` using `split = 0` (no split). Vector receives
+//   that same logical `16 x 16` tile with `pto.tpop_from_aic` in a vector tile
+//   type/layout, then stores it to the GM output buffer.
+//
+// Shape summary:
+// - All transferred tiles are `rows=16, cols=16, dtype=f32`
+// - Cube-produced C2V tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
+// - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box`
+module {
+
+  func.func @call_both(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>) attributes {pto.entry} {
+    func.call @cube_kernel(%gm_slot_buffer, %gm_x) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
+    func.call @vector_kernel(%gm_slot_buffer, %gm_y) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
+    return
+  }
+
+  func.func @cube_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c2v_import = pto.import_reserved_buffer {
+      name = "c2v_fifo",
+      peer_func = @vector_kernel
+    } -> i32
+    %c0_i32 = arith.constant 0 : i32
+    pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
+       c2v_consumer_buf = %c2v_import : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %x_mat_tile = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %x_left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %x_right_tile = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>
+    %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %gm_x_view = pto.make_tensor_view %gm_x, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %gm_x_tile_view = pto.partition_view %gm_x_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    pto.tload ins(%gm_x_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_right_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    pto.tmatmul ins(%x_left_tile, %x_right_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+    pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    return
+  }
+
+  func.func @vector_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c2v_local = pto.reserve_buffer {
+      name = "c2v_fifo",
+      size = 4096,
+      location = #pto.address_space<vec>,
+      auto = true
+    } -> i32
+    %c0_i32 = arith.constant 0 : i32
+    pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
+      (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
+       c2v_consumer_buf = %c2v_local : i32,
+       v2c_consumer_buf = %c0_i32 : i32)
+
+    %gm_y_view = pto.make_tensor_view %gm_y, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %gm_y_tile_view = pto.partition_view %gm_y_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    %recv_tile = pto.tpop_from_aic {split = 0}
+      -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tstore ins(%recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gm_y_tile_view : !pto.partition_tensor_view<16x16xf32>)
+    pto.tfree_from_aic {split = 0}
+    return
+  }
+
+}
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
@@ -0,0 +1,28 @@
+#ifndef KERNEL_CPP
+#error "KERNEL_CPP must be defined at compile time."
+#endif
+
+#include <cstdint>
+
+extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen);
+
+#include KERNEL_CPP
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *gmSlotBuffer,
+    uint8_t *x,
+    uint8_t *y)
+{
+    void *fftsAddr = nullptr;
+    uint32_t fftsLen = 0;
+    (void)rtGetC2cCtrlAddr(reinterpret_cast<uint64_t *>(&fftsAddr), &fftsLen);
+    (void)fftsLen;
+
+    call_both<<<blockDim, nullptr, stream>>>(
+        (__gm__ int64_t *)fftsAddr,
+        (__gm__ float *)gmSlotBuffer,
+        (__gm__ float *)x,
+        (__gm__ float *)y);
+}
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
+MODE="${TPUSHPOP_MODE:-c2v}"
+BUILDER_PATH="${SCRIPT_DIR}/kernels/${MODE}_builder.py"
+MLIR_GEN_PATH="${ARTIFACT_DIR}/${MODE}_gen.mlir"
+GENERATED_CPP="${ARTIFACT_DIR}/${MODE}.cpp"
+LIB_PATH="${ARTIFACT_DIR}/tpushpop_mlir_lib.so"
+
+case "${MODE}" in
+  c2v|c2v_add|v2c|bidi) ;;
+  *)
+    echo "Unknown TPUSHPOP_MODE: ${MODE}" >&2
+    exit 2
+    ;;
+esac
+
+mkdir -p "${ARTIFACT_DIR}"
+rm -f "${GENERATED_CPP}" "${LIB_PATH}"
+
+python "${BUILDER_PATH}" > "${MLIR_GEN_PATH}"
+ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
+# add extern "C" to function so kernel name is not mangled
+perl -0pi -e 's/\b__global__ AICORE void call_both\(/extern "C" __global__ AICORE void call_both(/' "${GENERATED_CPP}"
+
+bisheng \
+    -I/sources/pto-isa/include/ \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 -g \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"${GENERATED_CPP}\"" \
+    "${SCRIPT_DIR}/caller.cpp" \
+    -o "${LIB_PATH}"
+
+echo "Generated ${GENERATED_CPP}."
+echo "Built ${LIB_PATH}."
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,3 +6,5 @@ __pycache__
		extra-info

		*.ptodsl_jit

		msprof_res/