From b269e2bacd86716a40c3216bb16db1c46c41193a Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 24 Mar 2026 15:58:08 +0000
Subject: [PATCH 01/38] feat: init

---
 .../cube_to_vector_matmul_add/README.md       |  31 ++
 .../cube_to_vector_matmul_add/caller.cpp      |  27 ++
 .../cube_to_vector_matmul_add/compile.sh      |  46 +++
 .../run_tpushpop_cv.py                        | 150 +++++++++
 refs/tpushpop_cv.cpp                          | 290 ++++++++++++++++++
 refs/tpushpop_vc.cpp                          | 236 ++++++++++++++
 6 files changed, 780 insertions(+)
 create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/README.md
 create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp
 create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh
 create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py
 create mode 100644 refs/tpushpop_cv.cpp
 create mode 100644 refs/tpushpop_vc.cpp
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md b/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md
new file mode 100644
index 00000000..28ae0c39
--- /dev/null
+++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md
@@ -0,0 +1,31 @@
+# Cube To Vector `TPUSH`/`TPOP` Example
+
+This example wraps `refs/tpushpop_cv.cpp` into the same `compile.sh` + Python runner flow used by the AOT examples.
+
+The kernel does:
+
+- cube-side `TMATMUL`
+- `TPUSH` from cube to vector
+- vector-side `TPOP`
+- vector-side bias add
+
+## Run
+
+```bash
+python run_tpushpop_cv.py
+```
+
+That will:
+
+1. call `compile.sh`
+2. build `./tpushpop_cv_lib.so`
+3. launch the kernel on NPU
+4. compare against `A @ B + bias`
+
+The wrapper fetches the runtime FFTS/control address inside `caller.cpp` with `rtGetC2cCtrlAddr(...)`, so the Python side only needs to provide the kernel inputs, output, and FIFO backing memory.
+
+If the PTO headers are not under the default `/sources/pto-isa/include/`, point `PTO_INCLUDE_PATH` at them:
+
+```bash
+PTO_INCLUDE_PATH=/sources/pto-isa/include python run_tpushpop_cv.py
+```
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp b/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp
new file mode 100644
index 00000000..fbe697f4
--- /dev/null
+++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp
@@ -0,0 +1,27 @@
+#ifndef KERNEL_CPP
+#error "KERNEL_CPP must be defined at compile time."
+#endif
+
+#include <cstdint>
+
+extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen);
+
+#include KERNEL_CPP
+
+extern "C" void call_kernel(  // C-linkage entry point: fetch the C2C control address, then launch the kernel
+    uint32_t blockDim,  // ignored (see the (void) cast below)
+    void *stream,  // forwarded unchanged to LaunchTPushPopMatmulAdd
+    uint8_t *out,
+    uint8_t *srcA,
+    uint8_t *srcB,
+    uint8_t *bias,
+    uint8_t *fifoMem)
+{
+    void *fftsAddr = nullptr;
+    uint32_t fftsLen = 0;
+    (void)blockDim;
+    (void)rtGetC2cCtrlAddr(reinterpret_cast<uint64_t *>(&fftsAddr), &fftsLen);  // NOTE(review): return code is discarded; fftsAddr stays nullptr unless the call writes it. The cast assumes void* is 64 bits wide.
+    (void)fftsLen;  // length is requested but never used
+
+    LaunchTPushPopMatmulAdd(reinterpret_cast<uint8_t *>(fftsAddr), out, srcA, srcB, bias, fifoMem, stream);
+}
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh b/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh
new file mode 100644
index 00000000..4df82d4c
--- /dev/null
+++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
+ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
+LIB_PATH="${SCRIPT_DIR}/tpushpop_cv_lib.so"
+EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS:-}"
+
+if [[ "${TPUSHPOP_SANITY_ONLY:-}" =~ ^(1|true|TRUE|yes|YES|on|ON)$ ]]; then
+    EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS} -DTPUSHPOP_SANITY_ONLY"
+fi
+
+PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}"
+if [[ ! -d "${PTO_INCLUDE_PATH}" ]]; then
+    if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then
+        PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include"
+    elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then
+        PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include"
+    else
+        echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." >&2
+        exit 1
+    fi
+fi
+
+mkdir -p "${ARTIFACT_DIR}"
+rm -f "${LIB_PATH}"
+
+bisheng \
+    -I"${PTO_INCLUDE_PATH}" \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    ${EXTRA_BISHENG_FLAGS} \
+    -DKERNEL_CPP="\"${REPO_ROOT}/refs/tpushpop_cv.cpp\"" \
+    "${SCRIPT_DIR}/caller.cpp" \
+    -o "${LIB_PATH}"
+
+echo "Built ${LIB_PATH}."
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py b/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py
new file mode 100644
index 00000000..011fc1d2
--- /dev/null
+++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py
@@ -0,0 +1,150 @@
+import ctypes
+import os
+import subprocess
+
+import numpy as np
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_cv_lib.so")
+DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
+DEFAULT_FIFO_BYTES = 4 * 1024
+TOTAL_M = 16
+K = 32
+N = 32
+INPUT_DTYPE = torch.float16
+SEED = 0
+ATOL = 5e-2
+RTOL = 5e-2
+SANITY_ONLY = False
+
+
+def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p:
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def compile_example(compile_script: str) -> None:
+    subprocess.run(
+        ["bash", compile_script],
+        check=True,
+        cwd=THIS_DIR,
+    )
+
+
+def load_lib(lib_path: str) -> ctypes.CDLL:
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+    ]
+    lib.call_kernel.restype = None
+    return lib
+
+
+def make_buffers(
+    *,
+    total_m: int,
+    k: int,
+    n: int,
+    input_dtype: torch.dtype,
+    device: str,
+    fifo_bytes: int,
+):
+    src_a = torch.randn((total_m, k), dtype=input_dtype, device=device)
+    src_b = torch.randn((k, n), dtype=input_dtype, device=device)
+    bias = torch.randn((total_m, n), dtype=torch.float32, device=device)
+    out = torch.zeros((total_m, n), dtype=torch.float32, device=device)
+
+    fifo_elems = max(1, (fifo_bytes + 3) // 4)
+    fifo_mem = torch.zeros((fifo_elems,), dtype=torch.float32, device=device)
+    return out, src_a, src_b, bias, fifo_mem
+
+
+def run_kernel(
+    lib: ctypes.CDLL,
+    *,
+    out: torch.Tensor,
+    src_a: torch.Tensor,
+    src_b: torch.Tensor,
+    bias: torch.Tensor,
+    fifo_mem: torch.Tensor,
+) -> torch.Tensor:
+    stream_ptr = torch.npu.current_stream()._as_parameter_
+    lib.call_kernel(
+        1,
+        stream_ptr,
+        torch_to_ctypes(out),
+        torch_to_ctypes(src_a),
+        torch_to_ctypes(src_b),
+        torch_to_ctypes(bias),
+        torch_to_ctypes(fifo_mem),
+    )
+    torch.npu.synchronize()
+    return out
+
+
+def reference_result(src_a: torch.Tensor, src_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
+    ref = torch.matmul(src_a.float().cpu(), src_b.float().cpu())
+    if not SANITY_ONLY:
+        ref = ref + bias.cpu()
+    return ref.to(torch.float32)
+
+
+def main() -> None:
+    compile_example(DEFAULT_COMPILE_SCRIPT)
+
+    device = get_test_device()
+    torch.npu.set_device(device)
+    torch.manual_seed(SEED)
+    np.random.seed(SEED)
+
+    lib = load_lib(DEFAULT_LIB_PATH)
+    out, src_a, src_b, bias, fifo_mem = make_buffers(
+        total_m=TOTAL_M,
+        k=K,
+        n=N,
+        input_dtype=INPUT_DTYPE,
+        device=device,
+        fifo_bytes=DEFAULT_FIFO_BYTES,
+    )
+
+    out = run_kernel(
+        lib,
+        out=out,
+        src_a=src_a,
+        src_b=src_b,
+        bias=bias,
+        fifo_mem=fifo_mem,
+    )
+    ref = reference_result(src_a, src_b, bias)
+    out_cpu = out.cpu()
+
+    max_abs = float(torch.max(torch.abs(out_cpu - ref)).item())
+    mean_abs = float(torch.mean(torch.abs(out_cpu - ref)).item())
+    ok = bool(torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL))
+
+    print(
+        f"mode={'sanity_matmul' if SANITY_ONLY else 'tpushpop_cv'} "
+        f"shape=({TOTAL_M}, {K}, {N}) dtype={INPUT_DTYPE} "
+        f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}"
+    )
+
+    if not ok:
+        raise SystemExit(
+            f"Validation failed with atol={ATOL} rtol={RTOL}. "
+            f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}"
+        )
+
+    print(f"Validation passed using {DEFAULT_LIB_PATH}.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/refs/tpushpop_cv.cpp b/refs/tpushpop_cv.cpp
new file mode 100644
index 00000000..324ade79
--- /dev/null
+++ b/refs/tpushpop_cv.cpp
@@ -0,0 +1,290 @@
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+
+using namespace pto;
+
+#define VEC_CORES 2
+
+using ExampleInT = half;
+using ExampleOutT = float;
+constexpr uint32_t EXAMPLE_TOTAL_M = 16;
+constexpr uint32_t EXAMPLE_CASE_TILE_M = 16;
+constexpr uint32_t EXAMPLE_TILE_K = 32;
+constexpr uint32_t EXAMPLE_TILE_N = 32;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2)  // round num_1 up to a multiple of num_2
+{
+    if (num_2 == 0) {
+        return 0;  // avoid dividing by zero: a zero alignment yields 0
+    }
+    return (num_1 + num_2 - 1) / num_2 * num_2;  // for integral T the division truncates, giving the round-up
+}
+
+#ifdef TPUSHPOP_SANITY_ONLY
+__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB)
+{
+    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
+    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_TOTAL_M, 16);
+    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
+    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
+
+    using GlobalA =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
+                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
+    using GlobalB =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
+                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+    using GlobalOut =
+        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
+                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_TOTAL_M,
+                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
+                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
+    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>;
+    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
+    using AccTile = TileAcc<ExampleOutT, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>;
+
+    if constexpr (DAV_CUBE) {
+        TileMatA aMatTile;
+        TileMatB bMatTile;
+        LeftTile aTile;
+        RightTile bTile;
+        AccTile accTile;
+        TASSIGN(aMatTile, 0x0);
+        TASSIGN(bMatTile, 0x20000);
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(accTile, 0x0);
+
+        GlobalA globalA(srcA);
+        GlobalB globalB(srcB);
+        GlobalOut globalOut(out);
+
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(aMatTile, globalA);
+        TLOAD(bMatTile, globalB);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        TMATMUL(accTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        TSTORE<AccTile, GlobalOut>(globalOut, accTile);
+
+        pipe_barrier(PIPE_ALL);
+    }
+}
+#else
+__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out,
+                                            __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB,
+                                            __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem)
+{
+    set_ffts_base_addr((uint64_t)ffts_addr);
+    constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M;
+    constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES;
+
+    constexpr uint16_t FLAG_ID = 0;
+    constexpr uint8_t FIFO_DEPTH = 2;
+    constexpr uint8_t FIFO_PERIOD = 1;
+    // Local FIFO base used by the vector-side TPOP (vecTileHalf).
+    constexpr uint32_t localFiFoBase = 0x0;
+
+    using AccTile = TileAcc<ExampleOutT, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N>;
+    using VecTileHalf =
+        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
+    using BiasTile =
+        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
+    using OutTile =
+        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
+
+    using MatPipe = TPipe<FLAG_ID, Direction::DIR_C2V,
+                          EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_N * sizeof(ExampleOutT), FIFO_DEPTH>;
+    MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase);
+
+    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
+    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_CASE_TILE_M, 16);
+    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
+    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
+
+    using GlobalA =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
+                                 EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
+    using GlobalB =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
+                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+    using GlobalBias =
+        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
+                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+    using GlobalOut =
+        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
+                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_CASE_TILE_M,
+                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
+                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
+    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>;
+    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
+
+    if constexpr (DAV_CUBE) {
+        TileMatA aMatTile;
+        TileMatB bMatTile;
+        TASSIGN(aMatTile, 0x0);
+        TASSIGN(bMatTile, 0x20000);
+
+        LeftTile aTile;
+        RightTile bTile;
+        AccTile accTile;
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(accTile, 0x0);
+
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
+            GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K);
+            GlobalB globalB(srcB);
+
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+            TLOAD(aMatTile, globalA);
+            TLOAD(bMatTile, globalB);
+
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+            TMOV(aTile, aMatTile);
+            TMOV(bTile, bMatTile);
+
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+            wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+
+            TMATMUL(accTile, aTile, bTile);
+
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+            set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+            wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+            TPUSH<MatPipe, AccTile, TileSplitAxis::TILE_UP_DOWN>(mPipe, accTile);
+
+            set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        }
+
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+
+    if constexpr (DAV_VEC) {
+        VecTileHalf vecTileHalf;
+        BiasTile biasTile;
+        OutTile outTile;
+        TASSIGN(biasTile, 0x10000);
+        TASSIGN(outTile, 0x20000);
+
+        uint32_t subBlockIdx = get_subblockid();
+
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+            TPOP<MatPipe, VecTileHalf, TileSplitAxis::TILE_UP_DOWN>(mPipe, vecTileHalf);
+
+            size_t biasOffset =
+                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
+            GlobalBias globalBias(bias + biasOffset);
+
+            TLOAD(biasTile, globalBias);
+
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+            TADD(outTile, vecTileHalf, biasTile);
+
+            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+
+            size_t outOffset =
+                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
+            GlobalOut globalOut(out + outOffset);
+            TSTORE(globalOut, outTile);
+
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        }
+
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+}
+#endif
+
+void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem,
+                             void *stream)
+{  // Host-side launcher: casts the byte pointers to the kernel's element types and launches on `stream`.
+#ifdef TPUSHPOP_SANITY_ONLY  // sanity build: plain matmul kernel that takes only out, srcA and srcB
+    (void)ffts;  // unused in the sanity build
+    (void)bias;  // unused in the sanity build
+    (void)fifoMem;  // unused in the sanity build
+    runSanityMatmul<<<1, nullptr, stream>>>(
+        reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA), reinterpret_cast<ExampleInT *>(srcB));
+#else  // default build: TPUSH/TPOP matmul + bias-add kernel
+    runTPushPopMatmulAdd<<<1, nullptr, stream>>>(
+        reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA),
+        reinterpret_cast<ExampleInT *>(srcB), reinterpret_cast<ExampleOutT *>(bias), reinterpret_cast<ExampleOutT *>(fifoMem));
+#endif
+}
diff --git a/refs/tpushpop_vc.cpp b/refs/tpushpop_vc.cpp
new file mode 100644
index 00000000..69672e57
--- /dev/null
+++ b/refs/tpushpop_vc.cpp
@@ -0,0 +1,236 @@
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+
+using namespace pto;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2)  // round num_1 up to a multiple of num_2
+{
+    if (num_2 == 0) {
+        return 0;  // avoid dividing by zero: a zero alignment yields 0
+    }
+    return (num_1 + num_2 - 1) / num_2 * num_2;  // for integral T the division truncates, giving the round-up
+}
+
+template <typename QuantT, typename InT, typename OutT, int TOTAL_M, int TOTAL_K, int N, int CASE_TILE_K>
+__global__ AICORE void runTPushPopVCMatmul(__gm__ uint64_t *ffts_addr, __gm__ OutT *out, __gm__ InT *srcA,
+                                           __gm__ QuantT *quantB, __gm__ OutT *scale, __gm__ OutT *offset,
+                                           __gm__ OutT *fifoMem)
+{
+    set_ffts_base_addr((uint64_t)ffts_addr);
+    constexpr uint32_t TILE_K = CASE_TILE_K;
+    constexpr uint32_t HALF_TILE_K = TILE_K / 2;
+    constexpr uint32_t TILE_N = N;
+    constexpr uint32_t NUM_K_TILES = TOTAL_K / CASE_TILE_K;
+
+    constexpr uint16_t FLAG_ID = 0;
+    constexpr uint8_t FIFO_DEPTH = 2;
+    constexpr uint8_t FIFO_PERIOD = 1;
+    // Local FIFO base used by the cube-side TPOP (bMatTile).
+    constexpr uint32_t localFiFoBase = 0x20000;
+
+    using VecTileProd = Tile<TileType::Vec, OutT, HALF_TILE_K, TILE_N, BLayout::RowMajor, HALF_TILE_K, TILE_N>;
+    using MatTileCons =
+        Tile<TileType::Mat, OutT, TILE_K, TILE_N, BLayout::ColMajor, TILE_K, TILE_N, SLayout::RowMajor, 512>;
+
+    using MatPipe = TPipe<FLAG_ID, Direction::DIR_V2C, TILE_K * TILE_N * sizeof(OutT), FIFO_DEPTH>;
+    MatPipe mPipe((__gm__ void *)fifoMem, 0x0, localFiFoBase);
+
+    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(InT);
+    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(TOTAL_M, 16);
+    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(TILE_K, blockAlign);
+    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(TILE_N, blockAlign);
+
+    using GlobalA = GlobalTensor<InT, pto::Shape<1, 1, 1, TOTAL_M, TILE_K>,
+                                 pto::Stride<TOTAL_M * TOTAL_K, TOTAL_M * TOTAL_K, TOTAL_M * TOTAL_K, TOTAL_K, 1>>;
+    using GlobalOut = GlobalTensor<OutT, pto::Shape<1, 1, 1, TOTAL_M, TILE_N>,
+                                   pto::Stride<TOTAL_M * TILE_N, TOTAL_M * TILE_N, TOTAL_M * TILE_N, TILE_N, 1>>;
+
+    using TileMatA =
+        Tile<TileType::Mat, InT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, TOTAL_M, TILE_K, SLayout::RowMajor, 512>;
+    using LeftTile = TileLeft<InT, ALIGNED_M, ALIGNED_K, TOTAL_M, TILE_K>;
+    using PopTile =
+        Tile<TileType::Mat, OutT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, TILE_K, TILE_N, SLayout::RowMajor, 512>;
+    using RightTile = TileRight<OutT, ALIGNED_K, ALIGNED_N, TILE_K, TILE_N>;
+    using AccTile = TileAcc<OutT, TOTAL_M, TILE_N, TOTAL_M, TILE_N>;
+
+    using QuantTile = Tile<TileType::Vec, QuantT, HALF_TILE_K, TILE_N, BLayout::RowMajor, HALF_TILE_K, TILE_N>;
+    using ScaleTile = Tile<TileType::Vec, OutT, HALF_TILE_K, 8, BLayout::RowMajor, -1, -1>;
+    using OffsetTile = Tile<TileType::Vec, OutT, HALF_TILE_K, 8, BLayout::RowMajor, -1, -1>;
+
+    if constexpr (DAV_VEC) {
+        QuantTile quantTile;
+        VecTileProd dequantTile;
+        ScaleTile scaleTile(HALF_TILE_K, 1);
+        OffsetTile offsetTile(HALF_TILE_K, 1);
+        TASSIGN(quantTile, 0x0);
+        TASSIGN(dequantTile, 0x10000);
+        TASSIGN(scaleTile, 0x20000);
+        TASSIGN(offsetTile, 0x28000);
+
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+        using GlobalQuantB =
+            GlobalTensor<QuantT, pto::Shape<1, 1, 1, HALF_TILE_K, TILE_N>,
+                         pto::Stride<TOTAL_K * TILE_N, TOTAL_K * TILE_N, HALF_TILE_K * TILE_N, TILE_N, 1>>;
+        using GlobalScaleOffset =
+            GlobalTensor<OutT, pto::Shape<1, 1, 1, HALF_TILE_K, 1>, pto::Stride<TOTAL_K, TOTAL_K, HALF_TILE_K, 1, 1>>;
+
+        uint32_t subBlockIdx = get_subblockid();
+
+        for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) {
+            GlobalQuantB globalQuantB(quantB + k_tile * TILE_K * TILE_N + subBlockIdx * HALF_TILE_K * TILE_N);
+            GlobalScaleOffset globalScale(scale + k_tile * TILE_K + subBlockIdx * HALF_TILE_K);
+            GlobalScaleOffset globalOffset(offset + k_tile * TILE_K + subBlockIdx * HALF_TILE_K);
+
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+            TLOAD(quantTile, globalQuantB);
+            TLOAD(scaleTile, globalScale);
+            TLOAD(offsetTile, globalOffset);
+
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+            TDEQUANT(dequantTile, quantTile, scaleTile, offsetTile);
+
+            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+
+            TPUSH<MatPipe, VecTileProd, TileSplitAxis::TILE_UP_DOWN>(mPipe, dequantTile);
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        }
+
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+
+    if constexpr (DAV_CUBE) {
+        TileMatA aMatTile;
+        PopTile bMatTile;
+        TASSIGN(aMatTile, 0x0);
+
+        LeftTile aTile;
+        RightTile bTile;
+        AccTile accTile;
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(accTile, 0x0);
+
+        typename MatPipe::Consumer cons;
+
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+        for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) {
+            GlobalA globalA(srcA + k_tile * TILE_K);
+
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+            TLOAD(aMatTile, globalA);
+
+            TPOP<MatPipe, PopTile, TileSplitAxis::TILE_UP_DOWN>(mPipe, bMatTile);
+
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+            TMOV(aTile, aMatTile);
+            TMOV(bTile, bMatTile);
+
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+            if (k_tile == 0) {
+                TMATMUL(accTile, aTile, bTile);
+            } else {
+                TMATMUL_ACC(accTile, accTile, aTile, bTile);
+            }
+
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        }
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+        GlobalOut globalOut(out);
+        TSTORE<AccTile, GlobalOut>(globalOut, accTile);
+
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+}
+
+template <int32_t tilingKey>
+void LaunchTPushPopVCMatmul(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                            uint8_t *offset, uint8_t *fifoMem, void *stream)
+{
+    if constexpr (tilingKey == 1) {
+        runTPushPopVCMatmul<int8_t, float, float, 16, 64, 32, 64><<<1, nullptr, stream>>>(
+            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
+            reinterpret_cast<int8_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
+            reinterpret_cast<float *>(fifoMem));
+    } else if constexpr (tilingKey == 2) {
+        runTPushPopVCMatmul<int8_t, float, float, 16, 128, 32, 64><<<1, nullptr, stream>>>(
+            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
+            reinterpret_cast<int8_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
+            reinterpret_cast<float *>(fifoMem));
+    } else if constexpr (tilingKey == 3) {
+        runTPushPopVCMatmul<int8_t, float, float, 16, 256, 32, 64><<<1, nullptr, stream>>>(
+            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
+            reinterpret_cast<int8_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
+            reinterpret_cast<float *>(fifoMem));
+    } else if constexpr (tilingKey == 4) {
+        runTPushPopVCMatmul<int16_t, float, float, 16, 64, 32, 64><<<1, nullptr, stream>>>(
+            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
+            reinterpret_cast<int16_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
+            reinterpret_cast<float *>(fifoMem));
+    } else if constexpr (tilingKey == 5) {
+        runTPushPopVCMatmul<int16_t, float, float, 16, 128, 32, 64><<<1, nullptr, stream>>>(
+            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
+            reinterpret_cast<int16_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
+            reinterpret_cast<float *>(fifoMem));
+    } else if constexpr (tilingKey == 6) {
+        runTPushPopVCMatmul<int16_t, float, float, 16, 256, 32, 64><<<1, nullptr, stream>>>(
+            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
+            reinterpret_cast<int16_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
+            reinterpret_cast<float *>(fifoMem));
+    }
+}
+
+template void LaunchTPushPopVCMatmul<1>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
+template void LaunchTPushPopVCMatmul<2>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
+template void LaunchTPushPopVCMatmul<3>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
+template void LaunchTPushPopVCMatmul<4>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
+template void LaunchTPushPopVCMatmul<5>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
+template void LaunchTPushPopVCMatmul<6>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
+                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
\ No newline at end of file

From e821d3da63c414e2d85247c8d9a2480d99699209 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 24 Mar 2026 16:40:29 +0000
Subject: [PATCH 02/38] feat: bump pto-isa and PTOAS versions, needed for mix-kernels

---
 docker/Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 7eda8bc5..d04eba64 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -16,8 +16,8 @@ RUN pip install --no-cache-dir \
     ipython jupyterlab matplotlib pandas
 
 # certain operations need latest isa header, not CANN 8.5.0 default
-# header on 2026/03/16
-ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613
+# header on 2026/03/24
+ARG PTOISA_COMMIT=febd8a15a9dc03f87b6aa293c3ab66a67b6e80af
 WORKDIR /sources
 RUN git clone https://gitcode.com/cann/pto-isa.git \
     && cd pto-isa && git checkout $PTOISA_COMMIT
@@ -30,7 +30,7 @@ ARG CACHE_BURST=1
 # ARG ARCH=x86_64
 ARG ARCH=aarch64
 ARG RELEASE_REPO=zhangstevenunity/PTOAS
-ARG RELEASE_VER=0.9
+ARG RELEASE_VER=0.15
 ARG RELEASE_TAG=v${RELEASE_VER}
 ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl
 ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz

From a83b06d23020a60f1da49704a664f8115ee26b01 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 26 Mar 2026 16:59:31 +0000
Subject: [PATCH 03/38] feat: add simpler cpp case

---
 .../tpushpop/mix-kernel_cpp_simple/README.md  |  15 ++
 .../tpushpop/mix-kernel_cpp_simple/caller.cpp |  27 +++
 .../tpushpop/mix-kernel_cpp_simple/compile.sh |  36 ++++
 .../tpushpop/mix-kernel_cpp_simple/kernel.cpp | 156 ++++++++++++++++++
 .../aot/tpushpop/mix-kernel_cpp_simple/run.py |  72 ++++++++
 5 files changed, 306 insertions(+)
 create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/README.md
 create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp
 create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh
 create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp
 create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/run.py

diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md b/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md
new file mode 100644
index 00000000..45adb9fc
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md
@@ -0,0 +1,15 @@
+# Simple Cube To Vector `TPUSH`/`TPOP` Example
+
+This is a stripped-down sibling of `mix-kernel_cpp`.
+
+The kernel is fixed to a single `16x32 @ 32x32` matmul, followed by a bias add on the vector side:
+
+- no tile loop
+- no sanity mode
+- no extra runner configuration
+
+Run it with:
+
+```bash
+python run.py
+```
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp
new file mode 100644
index 00000000..fbe697f4
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp
@@ -0,0 +1,27 @@
+#ifndef KERNEL_CPP
+#error "KERNEL_CPP must be defined at compile time."
+#endif
+
+#include <cstdint>
+
+extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen);
+
+#include KERNEL_CPP
+
+extern "C" void call_kernel(  // C-linkage wrapper: looks up the FFTS control address, then launches the kernel
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *out,
+    uint8_t *srcA,
+    uint8_t *srcB,
+    uint8_t *bias,
+    uint8_t *fifoMem)
+{
+    void *fftsAddr = nullptr;
+    uint32_t fftsLen = 0;
+    (void)blockDim;  // accepted but not used by this wrapper
+    (void)rtGetC2cCtrlAddr(reinterpret_cast<uint64_t *>(&fftsAddr), &fftsLen);  // status discarded; fftsAddr stays nullptr unless written
+    (void)fftsLen;  // the reported length is not needed
+
+    LaunchTPushPopMatmulAdd(reinterpret_cast<uint8_t *>(fftsAddr), out, srcA, srcB, bias, fifoMem, stream);
+}
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh
new file mode 100644
index 00000000..0d8d8eb7
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}"  # default is used only when PTO_INCLUDE_PATH is unset or empty
+LIB_PATH="${SCRIPT_DIR}/lib.so"
+
+if [[ ! -d "${PTO_INCLUDE_PATH}" ]]; then  # header root missing: try PTO_LIB_PATH/include, then ASCEND_TOOLKIT_HOME/include
+    if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then
+        PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include"
+    elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then
+        PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include"
+    else
+        echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." >&2
+        exit 1
+    fi
+fi
+
+rm -f "${LIB_PATH}"  # remove any previous library before rebuilding
+
+bisheng \
+    -I"${PTO_INCLUDE_PATH}" \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -DKERNEL_CPP="\"${SCRIPT_DIR}/kernel.cpp\"" \
+    "${SCRIPT_DIR}/caller.cpp" \
+    -o "${LIB_PATH}"
+
+echo "Built ${LIB_PATH}."
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp
new file mode 100644
index 00000000..02336d41
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp
@@ -0,0 +1,156 @@
+/*
+Flow:
+1. Cube loads A and B from GM through GlobalTensor views.
+2. Cube copies those GM-backed matrix tiles into local matrix tiles:
+   `aMat` at `0x0`, `bMat` at `0x20000`, then converts them to matmul inputs
+   `aTile` and `bTile` and runs one `TMATMUL` into `acc`.
+3. Cube `TPUSH`es the full `16x32` accumulator tile to the C2V pipe.
+4. Vector `TPOP`s its `8x32` half-tile from that pushed accumulator, loads the
+   matching `8x32` bias tile from GM, does `TADD`, and stores the result to GM.
+
+Allocation summary:
+- `GlobalTensor` objects are just GM views over `srcA`, `srcB`, `bias`, and `out`.
+  They do not allocate local on-core memory themselves.
+- The C2V FIFO is also explicit GM memory in this example: `fifoMem` is the GM slot
+  buffer passed into `TPipe`, so cube writes the pushed accumulator tile into GM and
+  vector reads it back from that same GM-backed FIFO.
+- Cube local tiles:
+  `aMat @ 0x0`, `bMat @ 0x20000`, `aTile @ 0x0`, `bTile @ 0x0`, `acc @ 0x0`.
+- Vector local tiles:
+  `biasTile @ 0x10000`, `outTile @ 0x20000`.
+- The cross-core transfer is the matmul result: one full `AccTile<float, 16, 32>`
+  produced on cube and split `up/down` so each vector subcore receives one `8x32`
+  row half via `TPOP`.
+*/
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+
+using namespace pto;
+
+using In = half;
+using Out = float;
+
+constexpr uint32_t M = 16;
+constexpr uint32_t K = 32;
+constexpr uint32_t N = 32;
+constexpr uint32_t VEC_CORES = 2;
+constexpr uint32_t VEC_M = M / VEC_CORES;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts, __gm__ Out *out, __gm__ In *srcA, __gm__ In *srcB,
+                                            __gm__ Out *bias, __gm__ Out *fifoMem)
+{
+    set_ffts_base_addr((uint64_t)ffts);  // FFTS base address supplied by the host wrapper
+
+    using GlobalA = GlobalTensor<In, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<In, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
+    using GlobalBias = GlobalTensor<Out, Shape<1, 1, 1, VEC_M, N>, Stride<M * N, M * N, VEC_M * N, N, 1>>;  // VEC_M x N view
+    using GlobalOut = GlobalTensor<Out, Shape<1, 1, 1, VEC_M, N>, Stride<M * N, M * N, VEC_M * N, N, 1>>;  // VEC_M x N view
+
+    using TileMatA = Tile<TileType::Mat, In, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, In, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+    using LeftTile = TileLeft<In, M, K, M, K>;
+    using RightTile = TileRight<In, K, N, K, N>;
+    using AccTile = TileAcc<Out, M, N, M, N>;
+    using VecTile = Tile<TileType::Vec, Out, VEC_M, N, BLayout::RowMajor, VEC_M, N>;
+
+    using Pipe = TPipe<0, Direction::DIR_C2V, M * N * sizeof(Out), 2>;  // slot bytes = one M x N Out tile; trailing 2 is presumably the slot count - TODO confirm
+    Pipe pipe((__gm__ void *)(uint64_t)fifoMem, 0x0, 0x0);
+
+    if constexpr (DAV_CUBE) {  // cube side: load A/B, one TMATMUL, TPUSH the accumulator
+        TileMatA aMat;
+        TileMatB bMat;
+        LeftTile aTile;
+        RightTile bTile;
+        AccTile acc;
+        TASSIGN(aMat, 0x0);
+        TASSIGN(bMat, 0x20000);
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(acc, 0x0);
+
+        GlobalA globalA(srcA);
+        GlobalB globalB(srcB);
+
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(aMat, globalA);
+        TLOAD(bMat, globalB);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TMOV(aTile, aMat);
+        TMOV(bTile, bMat);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        TMATMUL(acc, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        TPUSH<Pipe, AccTile, TileSplitAxis::TILE_UP_DOWN>(pipe, acc);
+
+        pipe_barrier(PIPE_ALL);
+    }
+
+    if constexpr (DAV_VEC) {  // vector side: TPOP a half-tile, add bias, store to GM
+        VecTile popped;  // not TASSIGNed in this function; filled by TPOP below
+        VecTile biasTile;
+        VecTile outTile;
+        TASSIGN(biasTile, 0x10000);
+        TASSIGN(outTile, 0x20000);
+
+        uint32_t subBlock = get_subblockid();
+        uint32_t offset = subBlock * VEC_M * N;  // element offset of this subcore's VEC_M x N slice in bias/out
+        GlobalBias globalBias(bias + offset);
+        GlobalOut globalOut(out + offset);
+
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        TPOP<Pipe, VecTile, TileSplitAxis::TILE_UP_DOWN>(pipe, popped);
+        TLOAD(biasTile, globalBias);
+
+        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        TADD(outTile, popped, biasTile);
+
+        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+        TSTORE(globalOut, outTile);
+
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+}
+
+void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem,
+                             void *stream)
+{
+    runTPushPopMatmulAdd<<<1, nullptr, stream>>>(  // host launcher: casts the byte pointers to the kernel's element types
+        reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<Out *>(out), reinterpret_cast<In *>(srcA),
+        reinterpret_cast<In *>(srcB), reinterpret_cast<Out *>(bias), reinterpret_cast<Out *>(fifoMem));
+}
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py b/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py
new file mode 100644
index 00000000..e098ddeb
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py
@@ -0,0 +1,72 @@
+import ctypes
+import os
+import subprocess
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+LIB_PATH = os.path.join(THIS_DIR, "lib.so")
+M = 16
+K = 32
+N = 32
+FIFO_ELEMS = 1024
+ATOL = 5e-2
+RTOL = 5e-2
+
+
+def ptr(tensor: torch.Tensor) -> ctypes.c_void_p:  # wrap the tensor's raw data address for ctypes
+    return ctypes.c_void_p(tensor.data_ptr())
+
+
+def main() -> None:
+    subprocess.run(["bash", "compile.sh"], check=True, cwd=THIS_DIR)  # rebuild on every run; raises if compile.sh fails
+
+    device = get_test_device()
+    torch.npu.set_device(device)
+    torch.manual_seed(0)
+
+    lib = ctypes.CDLL(LIB_PATH)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+    ]
+    lib.call_kernel.restype = None
+
+    a = torch.randn((M, K), dtype=torch.float16, device=device)
+    b = torch.randn((K, N), dtype=torch.float16, device=device)
+    bias = torch.randn((M, N), dtype=torch.float32, device=device)
+    out = torch.zeros((M, N), dtype=torch.float32, device=device)
+    fifo = torch.zeros((FIFO_ELEMS,), dtype=torch.float32, device=device)  # backing memory passed as the kernel's FIFO argument
+
+    lib.call_kernel(
+        1,
+        torch.npu.current_stream()._as_parameter_,
+        ptr(out),
+        ptr(a),
+        ptr(b),
+        ptr(bias),
+        ptr(fifo),
+    )
+    torch.npu.synchronize()  # wait for device work before reading `out`
+
+    ref = a.float().cpu() @ b.float().cpu() + bias.cpu()  # float32 CPU reference: A @ B + bias
+    out_cpu = out.cpu()
+    max_abs = float((out_cpu - ref).abs().max().item())
+    print(f"max_abs={max_abs:.6f}")
+
+    if not torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL):
+        raise SystemExit("validation failed")
+
+    print("validation passed")
+
+if __name__ == "__main__":
+    main()

From 0ad3ceebe34690de857c7159ab7f31a5610ff2a1 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 26 Mar 2026 17:00:04 +0000
Subject: [PATCH 04/38] feat: rename

---
 .../README.md                                 |   2 +-
 .../caller.cpp                                |   0
 .../compile.sh                                |   4 +-
 .../run_tpushpop_cv.py                        |  10 +-
 .../tpushpop/mix-kernel_cpp/tpushpop_cv.cpp   | 297 ++++++++++++++++++
 5 files changed, 309 insertions(+), 4 deletions(-)
 rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/README.md (79%)
 rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/caller.cpp (100%)
 rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/compile.sh (93%)
 rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/run_tpushpop_cv.py (91%)
 create mode 100644 examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp

diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md b/examples/aot/tpushpop/mix-kernel_cpp/README.md
similarity index 79%
rename from examples/aot/tpushpop/cube_to_vector_matmul_add/README.md
rename to examples/aot/tpushpop/mix-kernel_cpp/README.md
index 28ae0c39..672e71f1 100644
--- a/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md
+++ b/examples/aot/tpushpop/mix-kernel_cpp/README.md
@@ -1,6 +1,6 @@
 # Cube To Vector `TPUSH`/`TPOP` Example
 
-This example wraps `refs/tpushpop_cv.cpp` into the same `compile.sh` + Python runner flow used by the AOT examples.
+This example keeps the kernel source in the same directory as the wrapper, using `./tpushpop_cv.cpp` with the same `compile.sh` + Python runner flow used by the AOT examples.
 
 The kernel does:
 
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp
similarity index 100%
rename from examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp
rename to examples/aot/tpushpop/mix-kernel_cpp/caller.cpp
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp/compile.sh
similarity index 93%
rename from examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh
rename to examples/aot/tpushpop/mix-kernel_cpp/compile.sh
index 4df82d4c..df924539 100644
--- a/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_cpp/compile.sh
@@ -2,9 +2,9 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
 ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
 LIB_PATH="${SCRIPT_DIR}/tpushpop_cv_lib.so"
+KERNEL_CPP_PATH="${KERNEL_CPP_PATH:-${SCRIPT_DIR}/tpushpop_cv.cpp}"
 EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS:-}"
 
 if [[ "${TPUSHPOP_SANITY_ONLY:-}" =~ ^(1|true|TRUE|yes|YES|on|ON)$ ]]; then
@@ -39,7 +39,7 @@ bisheng \
     --npu-arch=dav-2201 -DMEMORY_BASE \
     -std=gnu++17 \
     ${EXTRA_BISHENG_FLAGS} \
-    -DKERNEL_CPP="\"${REPO_ROOT}/refs/tpushpop_cv.cpp\"" \
+    -DKERNEL_CPP="\"${KERNEL_CPP_PATH}\"" \
     "${SCRIPT_DIR}/caller.cpp" \
     -o "${LIB_PATH}"
 
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py b/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py
similarity index 91%
rename from examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py
rename to examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py
index 011fc1d2..4e2d468d 100644
--- a/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py
+++ b/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py
@@ -11,8 +11,9 @@
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_cv_lib.so")
 DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
+DEFAULT_KERNEL_CPP = os.path.join(THIS_DIR, "tpushpop_cv.cpp")
 DEFAULT_FIFO_BYTES = 4 * 1024
-TOTAL_M = 16
+TOTAL_M = 128
 K = 32
 N = 32
 INPUT_DTYPE = torch.float16
@@ -27,10 +28,13 @@ def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p:
 
 
 def compile_example(compile_script: str) -> None:
+    env = os.environ.copy()
+    env["KERNEL_CPP_PATH"] = DEFAULT_KERNEL_CPP
     subprocess.run(
         ["bash", compile_script],
         check=True,
         cwd=THIS_DIR,
+        env=env,
     )
 
 
@@ -126,6 +130,10 @@ def main() -> None:
     )
     ref = reference_result(src_a, src_b, bias)
     out_cpu = out.cpu()
+    assert ref.device == out_cpu.device
+    torch.npu.synchronize()
+    torch.set_printoptions(precision=1, sci_mode=False, linewidth=250, threshold=5000)
+    print(ref-out_cpu)
 
     max_abs = float(torch.max(torch.abs(out_cpu - ref)).item())
     mean_abs = float(torch.mean(torch.abs(out_cpu - ref)).item())
diff --git a/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp b/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp
new file mode 100644
index 00000000..3f4c42b2
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp
@@ -0,0 +1,297 @@
+#include <pto/pto-inst.hpp>
+#include <pto/common/fifo.hpp>
+
+using namespace pto;
+
+#define VEC_CORES 2
+
+using ExampleInT = half;
+using ExampleOutT = float;
+constexpr uint32_t EXAMPLE_TOTAL_M = 128;
+constexpr uint32_t EXAMPLE_CASE_TILE_M = 16;
+constexpr uint32_t EXAMPLE_TILE_K = 32;
+constexpr uint32_t EXAMPLE_TILE_N = 32;
+
+#ifdef __DAV_CUBE__
+constexpr bool DAV_CUBE = true;
+#else
+constexpr bool DAV_CUBE = false;
+#endif
+
+#ifdef __DAV_VEC__
+constexpr bool DAV_VEC = true;
+#else
+constexpr bool DAV_VEC = false;
+#endif
+
+template <typename T>
+AICORE constexpr inline T CeilAlign(T num_1, T num_2)
+{
+    // Round num_1 up to the next multiple of num_2; a zero alignment yields 0.
+    const auto chunks = (num_2 == 0) ? 0 : (num_1 + num_2 - 1) / num_2;
+    // chunks is 0 when num_2 == 0, so the product below is 0 in that case.
+    return chunks * num_2;
+}
+
+#ifdef TPUSHPOP_SANITY_ONLY
+__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB)
+{
+    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
+    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_TOTAL_M, 16);  // whole M extent, rounded up to a multiple of 16
+    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
+    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
+
+    using GlobalA =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
+                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
+    using GlobalB =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
+                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+    using GlobalOut =
+        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
+                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_TOTAL_M,
+                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
+                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
+    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>;
+    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
+    using AccTile = TileAcc<ExampleOutT, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>;
+
+    if constexpr (DAV_CUBE) {  // only the cube build has a body; no vector-side work in the sanity kernel
+        TileMatA aMatTile;
+        TileMatB bMatTile;
+        LeftTile aTile;
+        RightTile bTile;
+        AccTile accTile;
+        TASSIGN(aMatTile, 0x0);
+        TASSIGN(bMatTile, 0x20000);
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(accTile, 0x0);
+
+        GlobalA globalA(srcA);
+        GlobalB globalB(srcB);
+        GlobalOut globalOut(out);
+
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(aMatTile, globalA);
+        TLOAD(bMatTile, globalB);
+
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TMOV(aTile, aMatTile);
+        TMOV(bTile, bMatTile);
+
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        TMATMUL(accTile, aTile, bTile);
+
+        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+        TSTORE<AccTile, GlobalOut>(globalOut, accTile);  // accumulator goes straight to GM; this path uses no TPUSH/TPOP
+
+        pipe_barrier(PIPE_ALL);
+    }
+}
+#else
+__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out,
+                                            __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB,
+                                            __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem)
+{
+    // Mixed cube/vector kernel: per M-tile the cube does TMATMUL + TPUSH, the vector does TPOP + bias TADD + TSTORE.
+    set_ffts_base_addr((uint64_t)ffts_addr);  // FFTS flag storage used by the TPUSH/TPOP cross-core handshakes
+    constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M;
+    constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES;
+
+    constexpr uint16_t FLAG_ID = 0;
+    constexpr uint8_t FIFO_DEPTH = 2;
+    constexpr uint8_t FIFO_PERIOD = 1;  // not referenced anywhere in this kernel
+    // Local ring-buffer base used by vector-side TPOP to place each popped half-tile before vector compute uses it.
+    constexpr uint32_t localFiFoBase = 0x0;
+
+    using AccTile = TileAcc<ExampleOutT, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N>;
+    using VecTileHalf =
+        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
+    using BiasTile =
+        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
+    using OutTile =
+        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
+
+    // Cube-to-vector FIFO: each GM slot stores one full AccTile, and vector TPOP reads it back as two row halves.
+    using MatPipe = TPipe<FLAG_ID, Direction::DIR_C2V,
+                          EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_N * sizeof(ExampleOutT), FIFO_DEPTH>;
+    // Bind the FIFO protocol to GM slot storage and the vector-side local staging buffer used by TPOP.
+    MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase);
+
+    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
+    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_CASE_TILE_M, 16);
+    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
+    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
+
+    using GlobalA =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
+                                 EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
+    using GlobalB =
+        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
+                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+    using GlobalBias =
+        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
+                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+    using GlobalOut =
+        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
+                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
+                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
+
+    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_CASE_TILE_M,
+                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
+                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
+    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>;
+    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
+
+    if constexpr (DAV_CUBE) {
+        TileMatA aMatTile;
+        TileMatB bMatTile;
+        TASSIGN(aMatTile, 0x0);
+        TASSIGN(bMatTile, 0x20000);
+
+        LeftTile aTile;
+        RightTile bTile;
+        AccTile accTile;
+        TASSIGN(aTile, 0x0);
+        TASSIGN(bTile, 0x0);
+        TASSIGN(accTile, 0x0);
+
+        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
+            GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K);
+            GlobalB globalB(srcB);
+
+            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+            TLOAD(aMatTile, globalA);
+            TLOAD(bMatTile, globalB);
+
+            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+
+            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+            TMOV(aTile, aMatTile);
+            TMOV(bTile, bMatTile);
+
+            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+
+            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+
+            wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+
+            TMATMUL(accTile, aTile, bTile);
+
+            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+
+            set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+            wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+
+            // Push the full accumulator tile into the next GM FIFO slot and signal vector that one split-up-down tile is ready.
+            TPUSH<MatPipe, AccTile, TileSplitAxis::TILE_UP_DOWN>(mPipe, accTile);
+
+            set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+        }
+
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+
+    if constexpr (DAV_VEC) {
+        VecTileHalf vecTileHalf;
+        BiasTile biasTile;
+        OutTile outTile;
+        TASSIGN(biasTile, 0x10000);
+        TASSIGN(outTile, 0x20000);
+
+        uint32_t subBlockIdx = get_subblockid();
+
+        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
+            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+            // Pop this subcore's half-tile from the next ready FIFO slot into local vector memory based on get_subblockid().
+            // TILE_UP_DOWN means split MxN tile into-> [M/2xN, M/2xN].
+            TPOP<MatPipe, VecTileHalf, TileSplitAxis::TILE_UP_DOWN>(mPipe, vecTileHalf);
+
+            size_t biasOffset =
+                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
+            GlobalBias globalBias(bias + biasOffset);
+
+            TLOAD(biasTile, globalBias);
+
+            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+            TADD(outTile, vecTileHalf, biasTile);
+
+            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+
+            size_t outOffset =
+                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
+            GlobalOut globalOut(out + outOffset);
+            // Store this vector subcore's output half-tile from local vector memory back to its GM output slice.
+            TSTORE(globalOut, outTile);
+
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+        }
+
+        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
+        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
+
+        pipe_barrier(PIPE_ALL);
+    }
+}
+#endif
+
+void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem,
+                             void *stream)
+{
+#ifdef TPUSHPOP_SANITY_ONLY
+    (void)ffts;  // runSanityMatmul takes only out/srcA/srcB, so the remaining arguments are unused in this build
+    (void)bias;
+    (void)fifoMem;
+    runSanityMatmul<<<1, nullptr, stream>>>(
+        reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA), reinterpret_cast<ExampleInT *>(srcB));
+#else
+    runTPushPopMatmulAdd<<<1, nullptr, stream>>>(  // default build: full TPUSH/TPOP kernel with FFTS, bias and FIFO memory
+        reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA),
+        reinterpret_cast<ExampleInT *>(srcB), reinterpret_cast<ExampleOutT *>(bias), reinterpret_cast<ExampleOutT *>(fifoMem));
+#endif
+}

From 4e034359eded6af2ad30603a8e4d035146958efb Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 26 Mar 2026 17:01:02 +0000
Subject: [PATCH 05/38] feat: add mlir example[ WIP ]

---
 .../aot/tpushpop/mix-kernel_mlir/README.md    |  17 +
 .../bidirectional_example.mlir                | 133 +++
 .../aot/tpushpop/mix-kernel_mlir/caller.cpp   |  15 +
 .../aot/tpushpop/mix-kernel_mlir/compile.sh   |  32 +
 .../aot/tpushpop/mix-kernel_mlir/pto_docs.md  | 822 ++++++++++++++++++
 .../run_bidirectional_example.py              |  71 ++
 6 files changed, 1090 insertions(+)
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/README.md
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/compile.sh
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md
new file mode 100644
index 00000000..a898a57b
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/README.md
@@ -0,0 +1,17 @@
+# Bidirectional `TPUSH`/`TPOP` MLIR Example
+
+This example mirrors the `mix-kernel_cpp` flow, but starts from
+[`bidirectional_example.mlir`](bidirectional_example.mlir).
+
+The pipeline is:
+
+1. run `ptoas --pto-arch=a3 bidirectional_example.mlir > build_artifacts/bidirectional_example.cpp`
+2. compile the generated C++ together with `caller.cpp`
+3. build `./tpushpop_mlir_lib.so`
+4. launch the generated `pto.entry` kernel from Python
+
+## Run
+
+```bash
+python run_bidirectional_example.py
+```
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
new file mode 100644
index 00000000..01052493
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
@@ -0,0 +1,133 @@
+module {
+  func.func @cube_kernel(%gm_slot_buffer: i32)
+      attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+    %v2c_local = pto.reserve_buffer {
+      name = "v2c_fifo",
+      size = 4096,
+      location = #pto.address_space<mat>,
+      auto = true
+    } -> i32
+    %c2v_import = pto.import_reserved_buffer {
+      name = "c2v_fifo",
+      peer_func = @vector_kernel
+    } -> i32
+    pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024}
+      (gm_slot_buffer = %gm_slot_buffer : i32,
+       c2v_consumer_buf = %c2v_import : i32,
+       v2c_consumer_buf = %v2c_local : i32)
+
+    %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+
+    %mat_tile = pto.tpop_from_aiv {split = 1}
+      -> !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>
+    pto.tfree_from_aiv {split = 2}
+    return
+  }
+
+  func.func @vector_kernel(%gm_slot_buffer: i32)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c2v_local = pto.reserve_buffer {
+      name = "c2v_fifo",
+      size = 4096,
+      location = #pto.address_space<vec>,
+      auto = true
+    } -> i32
+    %v2c_import = pto.import_reserved_buffer {
+      name = "v2c_fifo",
+      peer_func = @cube_kernel
+    } -> i32
+    pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024}
+      (gm_slot_buffer = %gm_slot_buffer : i32,
+       c2v_consumer_buf = %c2v_local : i32,
+       v2c_consumer_buf = %v2c_import : i32)
+
+    %vec_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tpush_to_aic(%vec_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
+
+    %recv_tile = pto.tpop_from_aic {split = 1}
+      -> !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tfree_from_aic {split = 2}
+    return
+  }
+
+  func.func @cube_kernel_nested(%gm_slot_buffer: i32)
+      attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+    %true = arith.constant true
+    scf.if %true {
+      %v2c_local = pto.reserve_buffer {
+        name = "v2c_fifo_nested",
+        size = 4096,
+        location = #pto.address_space<mat>,
+        auto = true
+      } -> i32
+      %c2v_import = pto.import_reserved_buffer {
+        name = "c2v_fifo_nested",
+        peer_func = @vector_kernel_nested
+      } -> i32
+      pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024}
+        (gm_slot_buffer = %gm_slot_buffer : i32,
+         c2v_consumer_buf = %c2v_import : i32,
+         v2c_consumer_buf = %v2c_local : i32)
+
+      %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+      pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+
+      %recv_tile = pto.tpop_from_aiv {split = 1}
+        -> !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>
+      pto.tfree_from_aiv {split = 2}
+    }
+    return
+  }
+
+  func.func @vector_kernel_nested(%gm_slot_buffer: i32)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %true = arith.constant true
+    scf.if %true {
+      %c2v_local = pto.reserve_buffer {
+        name = "c2v_fifo_nested",
+        size = 4096,
+        location = #pto.address_space<vec>,
+        auto = true
+      } -> i32
+      %v2c_import = pto.import_reserved_buffer {
+        name = "v2c_fifo_nested",
+        peer_func = @cube_kernel_nested
+      } -> i32
+      pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024}
+        (gm_slot_buffer = %gm_slot_buffer : i32,
+         c2v_consumer_buf = %c2v_local : i32,
+         v2c_consumer_buf = %v2c_import : i32)
+
+      %vec_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tpush_to_aic(%vec_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
+
+      %recv_tile = pto.tpop_from_aic {split = 1}
+        -> !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+      pto.tfree_from_aic {split = 2}
+    }
+    return
+  }
+}
+
+// A3-LABEL: AICORE void cube_kernel(
+// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>(
+// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>(
+// A3: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
+// A3: TPOP<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
+// A3: TFREE<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
+
+// A3-LABEL: AICORE void vector_kernel(
+// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>(
+// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>(
+// A3: TPUSH<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
+// A3: TPOP<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
+// A3: TFREE<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
+
+// A3-LABEL: AICORE void cube_kernel_nested(
+// A3: if (
+// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>(
+// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>(
+// A3: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
+// A3: TPOP<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
+// A3: TFREE<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
new file mode 100644
index 00000000..5926c256
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
@@ -0,0 +1,15 @@
+#ifndef KERNEL_CPP
+#error "KERNEL_CPP must be defined at compile time."
+#endif
+
+#include <cstdint>
+
+#include KERNEL_CPP
+
+extern "C" void call_kernel(
+    uint32_t blockDim,
+    void *stream,
+    uint8_t *gmSlotBuffer)
+{
+    bidirectional_example<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer);
+}
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
new file mode 100644
index 00000000..ee6376fb
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
+MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir"
+GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp"
+LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
+
+mkdir -p "${ARTIFACT_DIR}"
+rm -f "${GENERATED_CPP}" "${LIB_PATH}"
+
+ptoas --pto-arch=a3 "${MLIR_PATH}" > "${GENERATED_CPP}"
+
+bisheng \
+    -I/sources/pto-isa/include/ \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
+    -xcce -Xhost-start -Xhost-end \
+    -mllvm -cce-aicore-stack-size=0x8000 \
+    -mllvm -cce-aicore-function-stack-size=0x8000 \
+    -mllvm -cce-aicore-record-overflow=true \
+    -mllvm -cce-aicore-addr-transform \
+    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
+    --npu-arch=dav-2201 -DMEMORY_BASE \
+    -std=gnu++17 \
+    -DKERNEL_CPP="\"${GENERATED_CPP}\"" \
+    "${SCRIPT_DIR}/caller.cpp" \
+    -o "${LIB_PATH}"
+
+echo "Generated ${GENERATED_CPP}."
+echo "Built ${LIB_PATH}."
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md b/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md
new file mode 100644
index 00000000..394b2798
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md
@@ -0,0 +1,822 @@
+# TPUSH/TPOP 前端接口与 PTOAS 实现设计
+
+## 1. 文档范围
+
+本文定义 PTOAS TPUSH/TPOP 前端 IR 接口，以及其在 PTOAS 内部的 lowering、地址传播、flag 分配和 EmitC 映射规则。
+
+本文覆盖两层接口：
+
+- 前端接口
+  - `pto.aic_initialize_pipe`
+  - `pto.aiv_initialize_pipe`
+  - `pto.tpush_to_aiv`
+  - `pto.tpush_to_aic`
+  - `pto.tpop_from_aic`
+  - `pto.tpop_from_aiv`
+  - `pto.tfree_from_aic`
+  - `pto.tfree_from_aiv`
+  - `pto.reserve_buffer`
+  - `pto.import_reserved_buffer`
+- PTOAS 内部统一接口
+  - `pto.initialize_l2g2l_pipe`
+  - `pto.initialize_l2l_pipe`
+  - `pto.tpush`
+  - `pto.declare_tile`
+  - `pto.tpop`
+  - `pto.tfree`
+
+本文只描述接口契约与编译流程，不展开具体 C++ 模板实现细节。
+
+## 2. 设计目标
+
+本设计的目标如下：
+
+- 对前端提供 `*_initialize_pipe` / `tpush_to_*` / `tpop_from_*` / `tfree_from_*` IR 接口。
+- 在 PTOAS 内部统一为 pipe/tpush/tpop/tfree 指令，便于复用已有 pass。
+- 支持 A2/A3 与 A5 两个平台使用同一套前端接口。
+- 定义consumer slot buffer的分配地址与producer之间的匹配关系，并传播。
+
+## 3. 前端 IR 接口定义
+
+### 3.1 `pto.aic_initialize_pipe`
+
+#### 语义
+
+由 Cube kernel 在函数启动时调用，初始化该函数涉及的通信 pipe。
+
+#### 语法
+
+```mlir
+pto.aic_initialize_pipe(
+    DIR_MASK,
+    SLOT_SIZE,
+    GM_SLOT_BUFFER,
+    C2V_CONSUMER_BUF,
+    V2C_CONSUMER_BUF)
+```
+
+#### 参数
+
+| 参数 | 类型 | 说明 |
+|---|---|---|
+| `DIR_MASK` | 编译期整数常量 | `1`、`2` 或 `3` |
+| `SLOT_SIZE` | 编译期整数常量 | 单 slot 字节数，定义为切分前完整 tile 字节数 |
+| `GM_SLOT_BUFFER` | GM 地址或空值 | A2/A3 路径使用，A5 路径为空 |
+| `C2V_CONSUMER_BUF` | `i32` | C2V 方向 consumer 的 local slot buffer 基址 |
+| `V2C_CONSUMER_BUF` | `i32` | V2C 方向 consumer 的 local slot buffer 基址 |
+
+### 3.2 `pto.aiv_initialize_pipe`
+
+#### 语义
+
+由 Vector kernel 在函数启动时调用，初始化该函数涉及的通信 pipe。
+
+#### 语法
+
+```mlir
+pto.aiv_initialize_pipe(
+    DIR_MASK,
+    SLOT_SIZE,
+    GM_SLOT_BUFFER,
+    C2V_CONSUMER_BUF,
+    V2C_CONSUMER_BUF)
+```
+
+参数语义与 `pto.aic_initialize_pipe` 相同。
+
+### 3.3 前端数据传输接口
+
+#### `pto.tpush_to_aiv`
+
+```mlir
+pto.tpush_to_aiv(%tile) { split = 0 }
+```
+
+- 仅出现在 Cube kernel 中
+- 表示 C2V 方向 producer push
+
+#### `pto.tpush_to_aic`
+
+```mlir
+pto.tpush_to_aic(%tile) { split = 0 }
+```
+
+- 仅出现在 Vector kernel 中
+- 表示 V2C 方向 producer push
+
+#### `pto.tpop_from_aic`
+
+```mlir
+%tile = pto.tpop_from_aic { split = 0 } -> !pto.tile_buf<...>
+```
+
+- 仅出现在 Vector kernel 中
+- 表示 C2V 方向 consumer pop
+
+#### `pto.tpop_from_aiv`
+
+```mlir
+%tile = pto.tpop_from_aiv { split = 0 } -> !pto.tile_buf<...>
+```
+
+- 仅出现在 Cube kernel 中
+- 表示 V2C 方向 consumer pop
+
+#### `pto.tfree_from_aic`
+
+```mlir
+pto.tfree_from_aic { split = 0 }
+```
+
+- 仅出现在 Vector kernel 中
+- 表示 C2V 方向 consumer free
+
+#### `pto.tfree_from_aiv`
+
+```mlir
+pto.tfree_from_aiv { split = 0 }
+```
+
+- 仅出现在 Cube kernel 中
+- 表示 V2C 方向 consumer free
+
+以上前端数据传输接口中的 `split` 均为编译期常量属性，不是运行时 SSA operand。
+
+- 取值使用 `TileSplitAxis` 枚举语义：`0/1/2` 分别对应 `TILE_NO_SPLIT`、`TILE_UP_DOWN`、`TILE_LEFT_RIGHT`
+- lowering 到 PTOAS 内部 IR 时，`split` 继续以属性形式保留
+
+### 3.4 地址提示接口
+
+#### `pto.reserve_buffer`
+
+用于在当前函数内声明一块 consumer slot buffer 预留空间。其合法写法由
+当前编译流程是否启用 local address planning 决定。
+
+```mlir
+%buf = pto.reserve_buffer {
+    name = "c2v_slot_buffer",
+    size = 2048,
+    location = #pto.address_space<vec>,
+    auto = true
+} -> i32
+```
+
+或使用显式地址：
+
+```mlir
+%buf = pto.reserve_buffer {
+    name = "c2v_slot_buffer",
+    size = 2048,
+    location = #pto.address_space<vec>,
+    auto = false,
+    base = 4096
+} -> i32
+```
+
+#### 参数
+
+| 参数 | 类型 | 说明 |
+|---|---|---|
+| `name` | 字符串属性 | 本函数内唯一的预留段名字 |
+| `size` | 整数属性 | 预留字节数 |
+| `location` | 地址空间属性 | 预留空间所在 local 地址空间 |
+| `auto` | `bool` 属性 | 地址解析路径标志；`true` 表示地址由 PTOAS 地址规划路径分配，`false` 表示地址已在输入 IR 中显式给定 |
+| `base` | 可选整数属性 | 显式起始地址；仅 manual 路径使用 |
+
+#### 结果
+
+- 结果类型为 `i32`
+- 结果值表示该 buffer 当前可用的基址
+- 当前可用基址可来自显式 `base`，也可来自 plan memory 回填后的解析地址
+- 在当前约束下，每个函数最多一条 `reserve_buffer`
+- 编译路径与 `auto` 的合法组合只有两种：
+  - 启用 local address planning：`auto = true`，且不带 `base`
+  - 跳过 local address planning：`auto = false`，且显式提供 `base`
+
+#### `pto.import_reserved_buffer`
+
+用于引用 peer function 中已经定义的 `reserve_buffer` 结果。
+
+```mlir
+%buf = pto.import_reserved_buffer {
+    name = "c2v_slot_buffer",
+    peer_func = @vector_kernel
+} -> i32
+```
+
+#### 参数
+
+| 参数 | 类型 | 说明 |
+|---|---|---|
+| `name` | 字符串属性 | peer 侧 `reserve_buffer` 的名字 |
+| `peer_func` | symbol ref | peer 函数符号 |
+
+#### 结果
+
+- 结果类型为 `i32`
+- 结果值表示从 peer `reserve_buffer` 导入的已解析基址
+
+### 3.5 前端层约束
+
+前端 IR 需满足以下约束：
+
+- 每个 Cube function 最多一条 `pto.aic_initialize_pipe`
+- 每个 Vector function 最多一条 `pto.aiv_initialize_pipe`
+- 每个函数内最多一条 C2V 逻辑 pipe 和一条 V2C 逻辑 pipe
+- 每个函数最多一条 `reserve_buffer`
+- 每个函数最多一条 `import_reserved_buffer`
+- `DIR_MASK` 只允许 `1`、`2`、`3`
+- `SLOT_SIZE > 0`
+- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM`
+- C2V consumer 的 `reserve_buffer.location` 必须是 `VEC`
+- V2C consumer 的 `reserve_buffer.location` 必须是 `MAT`
+- `reserve_buffer.name` 在本函数内必须唯一
+- op 级约束：`reserve_buffer.auto = false` 时必须提供 `base`
+- op 级约束：`reserve_buffer.auto = true` 时必须不提供 `base`
+- 启用 local address planning 的编译流程：`reserve_buffer` 只允许 `auto = true`
+- 跳过 local address planning 的编译流程：`reserve_buffer` 只允许 `auto = false` 且显式提供 `base`
+- `import_reserved_buffer` 必须能在 `peer_func` 中找到同名 `reserve_buffer`
+
+## 4. 核心约定
+
+### 4.1 逻辑 pipe
+
+本文中的“逻辑 pipe”指一条单向通信通道。
+
+- C2V：Cube producer -> Vector consumer
+- V2C：Vector producer -> Cube consumer
+
+`DIR_MASK=3` 表示前端一个同时包含 C2V 和 V2C 的初始化请求，在 PTOAS lowering 后拆成两条单向逻辑 pipe：
+
+- 一条 `dir_mask = 1` 的 C2V pipe
+- 一条 `dir_mask = 2` 的 V2C pipe
+
+### 4.2 `split` 的角色
+
+`split` 使用 `TileSplitAxis` 枚举表达：
+
+- `TILE_NO_SPLIT`
+- `TILE_UP_DOWN`
+- `TILE_LEFT_RIGHT`
+
+在 PTOAS 设计中，`split` 的角色定义为：
+
+- `split` 是 `tpush/tpop/tfree` 的逐指令执行模式
+- `split` 在 IR 中表示为编译期常量属性，不是运行时 SSA operand
+- `split` 不参与 pipe 初始化
+- `split` 不参与 plan memory、地址传播、flag 分配
+- PTOAS 将 `split` 作为透明的编译期参数向 EmitC 和底层 pto-isa 透传
+
+因此：
+
+- 同一条逻辑 pipe 上可以出现不同 `split` 的 `tpush/tpop/tfree`
+- PTOAS 不要求同一逻辑 pipe 内所有指令使用同一个 `split`
+- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证；PTOAS 仅校验 `split` 枚举合法并向下透传
+
+### 4.3 `SLOT_SIZE` 的定义
+
+`SLOT_SIZE` 的定义固定为：
+
+- 切分前完整 tile 的字节数
+
+即使 `split` 为 `TILE_UP_DOWN` 或 `TILE_LEFT_RIGHT`，`SLOT_SIZE` 仍然表示未切分前的逻辑 tile 总字节数。
+
+`split` 只影响底层 `TPUSH/TPOP/TFREE` 的执行方式，不影响 `SLOT_SIZE` 的含义。
+
+### 4.4 `SLOT_NUM` 规则
+
+`SLOT_NUM` 由 `DIR_MASK` 固定决定：
+
+- `DIR_MASK = 1` 或 `2`：`SLOT_NUM = 8`
+- `DIR_MASK = 3`：拆成两条单向 pipe，且每条 `SLOT_NUM = 4`
+
+`SLOT_NUM` 不由 `split` 决定。
+
+## 5. PTOAS 内部 IR 接口定义
+
+### 5.1 `!pto.pipe`
+
+本文设计的内部 `!pto.pipe` 为不透明 handle。
+
+`!pto.pipe` 的协议信息由其定义 op 上的属性承载，而不是由 type 参数承载。
+
+底层 `pto-isa` 若对 `TPUSH/TPOP` 的模板形态继续演进，不反向约束 `!pto.pipe` 的 type 设计；内部 `!pto.pipe` 仍保持 opaque handle。
+
+### 5.2 `pto.initialize_l2g2l_pipe`
+
+用于 A2/A3 路径。
+
+```mlir
+%pipe = pto.initialize_l2g2l_pipe {
+    dir_mask = 1,
+    slot_size = 512,
+    slot_num = 8,
+    local_slot_num = 8
+}(%gm_addr, %local_addr) -> !pto.pipe
+```
+
+#### 必需属性
+
+- `dir_mask`
+- `slot_size`
+- `slot_num`
+
+#### 可选属性
+
+- `local_slot_num`
+  - 仅 `initialize_l2g2l_pipe` 承载
+  - 表示 GM 路径下 consumer 侧 local slot buffer 的槽数
+  - 仅在通过 GM 传递时对底层 `TPipe` 模板参数有意义，不改变 GM FIFO 的 `slot_num`
+  - 缺省值等于该内部单向 pipe 的 `slot_num`
+  - 因此当前固定规则下：
+    - `DIR_MASK=1/2` 直接 lowering 时，`local_slot_num = 8`
+    - `DIR_MASK=3` 拆成两条单向 pipe 后，每条 `local_slot_num = 4`
+- `flag_base`
+  - 由 PTOAS flag 分配阶段填写
+  - frontend lowering 阶段可以缺省
+  - EmitC 前必须已经解析为显式常量
+
+#### 操作数
+
+- `gm_addr`
+- `local_addr`
+
+### 5.3 `pto.initialize_l2l_pipe`
+
+用于 A5 路径。
+
+```mlir
+%pipe = pto.initialize_l2l_pipe {
+    dir_mask = 1,
+    slot_size = 512,
+    slot_num = 8
+}(%local_addr) -> !pto.pipe
+```
+
+#### 必需属性
+
+- `dir_mask`
+- `slot_size`
+- `slot_num`
+
+#### 可选属性
+
+- `flag_base`
+  - 由 PTOAS flag 分配阶段填写
+  - frontend lowering 阶段可以缺省
+  - EmitC 前必须已经解析为显式常量
+
+#### 操作数
+
+- `local_addr`
+
+### 5.4 `pto.tpush`
+
+```mlir
+pto.tpush(%tile, %pipe) { split = 0 }
+```
+
+### 5.5 `pto.declare_tile`
+
+```mlir
+%tile = pto.declare_tile -> !pto.tile_buf<...>
+```
+
+### 5.6 `pto.tpop`
+
+```mlir
+pto.tpop(%tile, %pipe) { split = 0 }
+```
+
+### 5.7 `pto.tfree`
+
+```mlir
+pto.tfree(%pipe) { split = 0 }
+```
+
+`split` 在内部 IR 中必须以编译期常量属性形式保留，不能在 lowering 时擦除或降为运行时 operand。
+
+## 6. 前端到内部 IR 的 lowering 规则
+
+### 6.1 初始化接口 lowering
+
+#### A2/A3
+
+- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2g2l_pipe`
+- 若前端未提供更具体信息，lowering 默认补上 `local_slot_num = slot_num`
+
+#### A5
+
+- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2l_pipe`
+
+### 6.2 `DIR_MASK=1/2`
+
+- 只生成一条内部 pipe
+- `slot_num = 8`
+- 对 `initialize_l2g2l_pipe`，`local_slot_num = 8`
+
+### 6.3 `DIR_MASK=3`
+
+前端一个 init op 固定拆成两条内部 pipe：
+
+- `%pipe_c2v`：`dir_mask = 1`，`slot_num = 4`
+- `%pipe_v2c`：`dir_mask = 2`，`slot_num = 4`
+
+若 lowering 为 `initialize_l2g2l_pipe`，则两条内部 pipe 还满足：
+
+- `%pipe_c2v`：`local_slot_num = 4`
+- `%pipe_v2c`：`local_slot_num = 4`
+
+地址选择规则：
+
+- `%pipe_c2v` 使用 `C2V_CONSUMER_BUF`
+- `%pipe_v2c` 使用 `V2C_CONSUMER_BUF`
+
+### 6.4 前端数据传输 op 与内部 pipe 的绑定
+
+绑定规则固定如下：
+
+| 前端 op | 所在函数 | 方向 | 使用的内部 pipe |
+|---|---|---|---|
+| `tpush_to_aiv` | Cube | C2V | `dir_mask = 1` |
+| `tpop_from_aic` | Vector | C2V | `dir_mask = 1` |
+| `tfree_from_aic` | Vector | C2V | `dir_mask = 1` |
+| `tpush_to_aic` | Vector | V2C | `dir_mask = 2` |
+| `tpop_from_aiv` | Cube | V2C | `dir_mask = 2` |
+| `tfree_from_aiv` | Cube | V2C | `dir_mask = 2` |
+
+### 6.5 数据传输 op lowering
+
+#### `tpush_to_aiv` / `tpush_to_aic`
+
+lower 为：
+
+```mlir
+pto.tpush(%tile, %pipe) { split = 0 }
+```
+
+#### `tpop_from_aic` / `tpop_from_aiv`
+
+lower 为：
+
+```mlir
+%decl = pto.declare_tile -> !pto.tile_buf<...>
+pto.tpop(%decl, %pipe) { split = 0 }
+```
+
+即：
+
+- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv` 是返回 tile 结果值的接口
+- PTOAS 内部 `pto.tpop` 才是 destination-style 形式，显式接收一个 `pto.declare_tile` 结果作为入参
+
+#### `tfree_from_aic` / `tfree_from_aiv`
+
+lower 为：
+
+```mlir
+pto.tfree(%pipe) { split = 0 }
+```
+
+## 7. `reserve_buffer` 与地址传播
+
+### 7.1 设计原则
+
+- `reserve_buffer` 只表示本函数 consumer slot buffer 的本地预留
+- `import_reserved_buffer` 只表示对 peer 预留段地址的引用
+- `reserve_buffer` 用属性描述“如何得到地址”，用结果值统一承载“当前可用地址”
+- 当前编译流程是否启用 local address planning 与 `reserve_buffer.auto` 共同决定地址处理路径
+- 启用 local address planning：`reserve_buffer` 必须使用 `auto = true`，由 `PlanMemory` 分配地址
+- 跳过 local address planning：`reserve_buffer` 必须使用 `auto = false` 且显式提供 `base`，不再进入 `PlanMemory` 分配路径
+- PTOAS 复用现有 `PlanMemory` pass 实现 `reserve_buffer` 地址确定，不额外增加独立的预分配 pass
+- PTOAS 新增独立地址传播 pass，专门处理 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐
+- 地址传播 pass 在 EmitC 之前运行；启用规划时位于 plan memory 之后，跳过规划时直接消费前端已给定地址
+
+### 7.2 使用规则
+
+#### C2V
+
+- consumer 是 Vector
+- Vector function 需要 `reserve_buffer(location = VEC)`
+- Cube function 需要 `import_reserved_buffer(peer_func = @vector_kernel)`
+
+#### V2C
+
+- consumer 是 Cube
+- Cube function 需要 `reserve_buffer(location = MAT)`
+- Vector function 需要 `import_reserved_buffer(peer_func = @cube_kernel)`
+
+### 7.3 编译路径与地址处理路径
+
+对包含 `reserve_buffer` 的函数，PTOAS 按当前编译流程是否启用 local address planning 以及 `auto` 的组合选择地址处理路径：
+
+- 启用 local address planning + `auto = true`
+  - 进入 auto 路径
+  - 由 `PlanMemory` 为 `reserve_buffer` 分配 `base`
+  - 随后由 `pto-resolve-reserved-buffers` 传播地址并完成 peer `flag_base` 对齐
+- 跳过 local address planning + `auto = false` + 显式 `base`
+  - 进入 manual 路径
+  - 跳过 `PlanMemory`
+  - 由 `pto-resolve-reserved-buffers` 直接传播已给定地址并完成 peer `flag_base` 对齐
+
+以下组合均非法：
+
+- 启用 local address planning + `auto = false`
+- 跳过 local address planning + `auto = true`
+
+若函数内不存在 `reserve_buffer`，则保持现有编译流程对 `PlanMemory` 的原始控制行为，不引入额外语义。
+
+### 7.4 启用 local address planning 的 auto 路径
+
+在启用 local address planning 的编译流程中，`reserve_buffer` 必须使用 `auto = true`，并由 plan memory 负责地址分配。
+
+若函数中存在 `reserve_buffer`，则对其 `location` 对应的地址空间执行：
+
+1. 先按现有逻辑完成普通 local buffer 的 `MemPlan`
+2. 再收集该地址空间内已经分配完成的 local 区间
+3. 在剩余空洞中按地址空间对齐要求寻找一段可容纳 `reserve_buffer.size` 的连续区间
+4. 将该区间起始地址回填为这条唯一 `reserve_buffer` 的 `base`
+
+即：
+
+- 普通 `memref.alloc` / tile buffer 等 local 内存仍先由既有 `MemPlan` 按原逻辑分配
+- `reserve_buffer` 不参与普通 local buffer 的 inplace / reuse 规划
+- `reserve_buffer` 在普通 local buffer 分配完成后，再作为独立的一段连续 local 区间进行 hole 分配
+- `reserve_buffer` 不保证位于地址空间起始地址，也不保证形成预留前缀；其语义仅为“在该地址空间中为 consumer slot buffer 找到一段对齐且连续的可用地址”
+- 若整体容量足够但 `MemPlan` 结果将空间打散，导致不存在满足大小和对齐要求的连续空洞，则 `reserve_buffer` 分配失败并报错
+
+### 7.5 跳过 local address planning 的 manual 路径
+
+在跳过 local address planning 的编译流程中：
+
+- 每个 `reserve_buffer` 必须显式提供 `base`
+- PTOAS 只校验 `base` 的基本合法性
+- `PlanMemory` 不参与该函数的 local 地址分配
+- 因此该函数中其他 local buffer 地址也必须已由前端或更前阶段整体确定
+- 地址传播 pass 不做地址分配，只将显式 `base` 传播到 `import_reserved_buffer`
+
+该 manual 路径的目标是：
+
+- 保持前端或外部地址规划结果不被 PTOAS 改写
+- 避免 `reserve_buffer` 显式地址与 PTOAS 自动规划结果相互覆盖
+
+### 7.6 `import_reserved_buffer` 规则
+
+- 不做地址分配
+
+### 7.7 地址传播 pass 规则
+
+对每个 `import_reserved_buffer`：
+
+1. 通过 `peer_func` 找到 peer 函数
+2. 在 peer 函数内查找同名 `reserve_buffer`
+3. 读取对方已经解析出的 `base` 或其等价结果值
+4. 用该常量地址替换 `import_reserved_buffer` 的结果
+
+地址传播完成后：
+
+- producer 与 consumer 对同一逻辑 pipe 使用同一个 local buffer 地址
+- EmitC 只处理解析后的常量地址，不处理 `import_reserved_buffer`
+
+#### 7.7.1 pass 落点
+
+- PTOAS 增加独立 `ModulePass`：`pto-resolve-reserved-buffers`
+- 该 pass 固定运行在 EmitC lowering 之前
+- 启用规划时：运行在 `pto-plan-memory` 之后
+- 跳过规划时：不经过 `pto-plan-memory`，但该 pass 仍会运行
+- 该 pass 不负责地址分配，只消费前一阶段已经确定的 `reserve_buffer.base`
+
+#### 7.7.2 输入假设
+
+- 启用规划时，`reserve_buffer.auto = true`，其 `base` 已由 `PlanMemory` 回填
+- 跳过规划时，`reserve_buffer.auto = false`，其 `base` 已由前端显式给定
+- `import_reserved_buffer.peer_func` 已能解析到合法 peer function
+- `import_reserved_buffer.name` 已能在 peer function 中找到唯一匹配的 `reserve_buffer`
+
+#### 7.7.3 实现流程
+
+pass 在模块级按两步执行：
+
+1. 先建立 peer 对应关系
+2. 再将 `reserve_buffer` / `import_reserved_buffer` 物化为显式常量地址
+
+其中第一步的实现方式是：
+
+- 遍历模块内所有 `pto.initialize_l2l_pipe` / `pto.initialize_l2g2l_pipe`
+- 若其 `local_addr` 来自 `reserve_buffer`，则以“当前函数 + reserve 名字 + dir_mask”识别逻辑 pipe
+- 若其 `local_addr` 来自 `import_reserved_buffer`，则以“peer_func + reserve 名字 + dir_mask”识别逻辑 pipe
+- 将 peer 两侧引用到同一逻辑 pipe 的内部 init op 归并到同一组
+- 若某条 init 未显式提供 `flag_base`，则其 `local_addr` 必须来自 `reserve_buffer` 或 `import_reserved_buffer`
+- 对每个逻辑 pipe 分组，要求必须形成完整 peer init pair：恰好两条 init，且分别来自 peer 两侧函数；若 peer 信息不完整则直接报错
+- 在同一组内，若任一侧已显式提供 `flag_base`，则该值作为该组最终值；若两侧显式值冲突则报错
+- 若同组两侧都未显式提供 `flag_base`，则按默认规则回填：
+  - 单向场景：`flag_base = 0`
+  - 双向场景：C2V 组 `flag_base = 0`，V2C 组 `flag_base = 2`
+- 所谓“双向场景”，是指同一对 peer 函数之间同时存在 `dir_mask = 1` 和 `dir_mask = 2` 两个逻辑 pipe 分组
+- 完成分组决策后，将最终 `flag_base` 回填到该组内所有尚未显式填写的 init op，保证 peer 两侧一致
+
+第二步的实现方式是：
+
+- 对每个 `reserve_buffer`，读取其已解析 `base`
+- 在该 op 位置插入 `arith.constant`
+- 用该常量替换 `reserve_buffer` 结果值的全部 uses
+- 对每个 `import_reserved_buffer`，通过 `peer_func + name` 找到 peer `reserve_buffer`
+- 读取对方已解析 `base`
+- 在当前 op 位置插入同值 `arith.constant`
+- 用该常量替换 `import_reserved_buffer` 结果值的全部 uses
+- 常量替换完成后，删除 `reserve_buffer` / `import_reserved_buffer`
+
+#### 7.7.4 结果 IR 形态
+
+地址传播 pass 之后：
+
+- IR 中不再保留 `reserve_buffer` / `import_reserved_buffer`
+- 内部 pipe init op 的 `local_addr` 只再引用普通 SSA 常量地址
+- 因而后续 EmitC 无需理解 frontend 预留地址语义，只需透传解析后的地址值
+
+#### 7.7.5 失败条件
+
+若出现以下情况，pass 直接报错：
+
+- `reserve_buffer.base` 在 pass 运行时仍未解析
+- 启用规划的编译流程却出现 `reserve_buffer.auto = false`
+- 跳过规划的编译流程却出现 `reserve_buffer.auto = true`
+- `peer_func` 无法解析到函数
+- 在 peer function 中找不到同名 `reserve_buffer`
+- 某条未显式提供 `flag_base` 的内部 init，其 `local_addr` 不来自 `reserve_buffer` / `import_reserved_buffer`
+- 基于 `reserve_buffer` / `import_reserved_buffer` 建立的某个逻辑 pipe 分组，未形成完整 peer init pair
+- peer `flag_base` 已显式给定但两侧取值冲突
+
+## 8. flag 分配规则
+
+### 8.1 总原则
+
+- `flag_base` 由 PTOAS flag 分配阶段在内部 init op 上填写
+- 在 flag 分配完成前，内部 init op 可以暂时不携带 `flag_base`
+- peer 两侧同一逻辑 pipe 必须使用同一个 `flag_base`
+
+### 8.2 单向场景
+
+当前规划中，当 `DIR_MASK = 1` 或 `2` 且函数内仅有该唯一逻辑 pipe 时，可采用：
+
+- 该方向唯一逻辑 pipe 的 `flag_base = 0`
+- 该 pipe 占用逻辑 flag 对：`0` 和 `1`
+
+### 8.3 双向场景
+
+当前规划中，当 `DIR_MASK = 3` 时，可采用：
+
+- C2V pipe：`flag_base = 0`
+- V2C pipe：`flag_base = 2`
+
+因此双向固定占用两组逻辑 flag：
+
+- C2V：`0` / `1`
+- V2C：`2` / `3`
+
+### 8.4 与地址传播的关系
+
+地址传播 pass 在识别出 `import_reserved_buffer` 与 `reserve_buffer` 的 peer 对应关系后，同时可以完成 peer pipe 的 `flag_base` 对齐。
+
+即：
+
+- 基于同一 FIFO 通信的两条 peer init op，必须拿到相同的 `flag_base`
+
+## 9. verifier 规则
+
+### 9.1 前端 verifier
+
+前端 verifier 负责检查：
+
+- 每个函数 init op 数量是否合法
+- 每个函数 `reserve_buffer` / `import_reserved_buffer` 数量是否合法
+- `DIR_MASK` 取值是否合法
+- `SLOT_SIZE > 0`
+- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM`
+- `reserve_buffer.location` 与 consumer 函数类型匹配
+- `reserve_buffer.name` 在函数内唯一
+- `reserve_buffer.auto = false` 时必须带 `base`
+- `reserve_buffer.auto = true` 时必须不带 `base`
+- driver / pipeline 级约束：启用规划的编译流程只接受 `auto = true`
+- driver / pipeline 级约束：跳过规划的编译流程只接受 `auto = false` 且显式 `base`
+- `import_reserved_buffer` 能在 `peer_func` 中找到同名 `reserve_buffer`
+- 方向相关 op 只能出现在合法 kernel 中
+- 前端数据传输 op 的 `split` 必须是合法的编译期常量属性
+
+### 9.2 内部 IR verifier
+
+内部 verifier 负责检查：
+
+- `slot_size > 0`
+- `slot_num` 只允许 `8` 或 `4`
+- `slot_num` 必须与 lowering 规则一致：前端 `DIR_MASK=1/2` 直接 lowering 得到的 pipe 为 `8`，由前端 `DIR_MASK=3` 拆分得到的 pipe 为 `4`
+- `local_slot_num` 若出现，只允许出现在 `pto.initialize_l2g2l_pipe` 上，且必须大于 `0` 且不大于 `slot_num`
+- `flag_base` 若出现，必须满足基本合法性；是否已填写以及具体分配值由 flag 分配保证
+- `pto.initialize_l2g2l_pipe` 必须提供 `gm_addr` 和 `local_addr`
+- `pto.initialize_l2l_pipe` 必须提供 `local_addr`
+- `dir_mask = 1` 的 pipe 只能被 C2V 方向 lowering 使用
+- `dir_mask = 2` 的 pipe 只能被 V2C 方向 lowering 使用
+- `tpush/tpop/tfree` 的 `split` 必须是合法的编译期常量属性
+
+### 9.3 关于 `split` 的校验边界
+
+PTOAS 对 `split` 的处理边界如下：
+
+- PTOAS 验证 `split` 是合法枚举值
+- PTOAS 要求 `split` 以编译期常量属性形式出现
+- PTOAS 不验证同一逻辑 pipe 上多个 `tpush/tpop/tfree` 的 `split` 是否一致
+- PTOAS 不根据 `split` 改变地址分配、flag 分配或 pipe 配对
+
+因此：
+
+- `split` 混用是否语义正确，不是 PTOAS 静态保证项
+- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证
+- PTOAS 只负责校验 `split` 枚举值合法，并将其透传到底层
+
+## 10. EmitC 与 pto-isa 映射
+
+### 10.1 初始化 op
+
+在进入 EmitC 前：
+
+- 前端 `pto.aic_initialize_pipe` / `pto.aiv_initialize_pipe`
+- 前端 `pto.tpush_to_aiv` / `pto.tpush_to_aic`
+- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv`
+- 前端 `pto.tfree_from_aic` / `pto.tfree_from_aiv`
+- `pto.reserve_buffer` / `pto.import_reserved_buffer`
+
+都必须已经被前序 pass 消除。
+
+EmitC 只处理 PTOAS 内部统一 IR，不直接理解前端 pipe 接口或地址提示接口。
+
+EmitC 将以下内部 init op 映射到底层 `TPipe`：
+
+- `pto.initialize_l2l_pipe`
+- `pto.initialize_l2g2l_pipe`
+
+映射时需要使用以下信息：
+
+- `dir_mask`
+- `slot_size`
+- `slot_num`
+- `local_slot_num`
+- `flag_base`
+- `gm_addr`
+- `local_addr`
+
+其中：
+
+- 若 `flag_base` 尚未在 EmitC 前完成填写，PTOAS 应报错。
+
+### 10.2 数据传输 op
+
+EmitC 将以下内部数据传输 op 映射到底层：
+
+- `pto.tpush` -> `TPUSH`
+- `pto.tpop` -> `TPOP`
+- `pto.tfree` -> `TFREE`
+
+映射时需要使用以下信息：
+
+- `tile`
+- `split`
+- `pipe`
+
+其中：
+
+- `split` 不在 PTOAS 内部解释
+- `split` 作为底层 `TPUSH/TPOP/TFREE` 的编译期模板实参透传
+
+### 10.3 InsertSync
+
+`split` 不影响 PTOAS 中的 pipeline derivation 与 InsertSync 规则。
+
+InsertSync 只依赖：
+
+- op 种类
+- init op 形态
+- `dir_mask`
+- 目标架构
+
+而不依赖 `split`。
+
+## 11. 编译流程总览
+
+完整流程如下：
+
+```text
+前端 IR 接口
+  -> lowering pass
+  -> PTOAS 内部统一 IR
+  -> plan memory
+  -> 地址传播 pass
+  -> EmitC
+  -> pto-isa C++ 代码
+```
+
+其中：
+
+- lowering pass 负责拆分 `DIR_MASK=3`、绑定方向与 pipe
+- 启用规划的编译流程中，plan memory 先按既有逻辑规划普通 local buffer，再为 `reserve_buffer` 在目标地址空间中分配 hole
+- 跳过规划的编译流程中，不运行 plan memory；`reserve_buffer.base` 必须已由前端给定
+- 地址传播 pass 负责 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐
+- EmitC 只负责将内部 `initialize_l2l_pipe` / `initialize_l2g2l_pipe` / `tpush` / `tpop` / `tfree` 及其属性透传到底层
\ No newline at end of file
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
new file mode 100644
index 00000000..b5db9833
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
@@ -0,0 +1,71 @@
+import ctypes
+import os
+import subprocess
+
+import torch
+import torch_npu  # noqa: F401
+
+from ptodsl.test_util import get_test_device
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so")
+DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
+DEFAULT_FIFO_BYTES = 4 * 1024
+
+
+def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p:
+    return ctypes.c_void_p(tensor.data_ptr())  # the tensor's data pointer wrapped as a C void* for ctypes calls
+
+
+def compile_example(compile_script: str) -> None:
+    subprocess.run(
+        ["bash", compile_script],
+        check=True,
+        cwd=THIS_DIR,
+    )
+
+
+def load_lib(lib_path: str) -> ctypes.CDLL:
+    lib = ctypes.CDLL(lib_path)
+    lib.call_kernel.argtypes = [
+        ctypes.c_uint32,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+    ]
+    lib.call_kernel.restype = None
+    return lib
+
+
+def make_gm_slot_buffer(*, fifo_bytes: int, device: str) -> torch.Tensor:
+    fifo_elems = max(1, (fifo_bytes + 3) // 4)
+    return torch.zeros((fifo_elems,), dtype=torch.float32, device=device)
+
+
+def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor) -> None:
+    stream_ptr = torch.npu.current_stream()._as_parameter_
+    lib.call_kernel(
+        1,
+        stream_ptr,
+        torch_to_ctypes(gm_slot_buffer),
+    )
+    torch.npu.synchronize()
+
+
+def main() -> None:
+    compile_example(DEFAULT_COMPILE_SCRIPT)
+
+    device = get_test_device()
+    torch.npu.set_device(device)
+
+    lib = load_lib(DEFAULT_LIB_PATH)
+    gm_slot_buffer = make_gm_slot_buffer(
+        fifo_bytes=DEFAULT_FIFO_BYTES,
+        device=device,
+    )
+
+    run_kernel(lib, gm_slot_buffer=gm_slot_buffer)
+    print(f"Launched bidirectional_example using {DEFAULT_LIB_PATH}.")
+
+
+if __name__ == "__main__":
+    main()

From da15c6d80429a83e4dcdb46c1b63591a70472f1a Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 26 Mar 2026 17:01:55 +0000
Subject: [PATCH 06/38] feat: add gitignore

---
 examples/aot/tpushpop/.gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 examples/aot/tpushpop/.gitignore

diff --git a/examples/aot/tpushpop/.gitignore b/examples/aot/tpushpop/.gitignore
new file mode 100644
index 00000000..ab5698d1
--- /dev/null
+++ b/examples/aot/tpushpop/.gitignore
@@ -0,0 +1 @@
+msprof_res/
\ No newline at end of file

From 041625f818a1720b0b7c3790c92e7007fe98d477 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 10:37:20 +0000
Subject: [PATCH 07/38] feat: simple bidirectional transfer working in mlir

---
 .../bidirectional_example.mlir                | 140 ++++++------------
 .../aot/tpushpop/mix-kernel_mlir/caller.cpp   |   2 +-
 2 files changed, 49 insertions(+), 93 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
index 01052493..849f8d37 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
@@ -1,6 +1,37 @@
+// Bidirectional pipe example.
+//
+// There are two logical FIFO pipes:
+// - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel`
+// - `v2c_fifo`: vector/kernel `@vector_kernel` pushes to cube/kernel `@cube_kernel`
+//
+// `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import
+// ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe`
+// binds those FIFO endpoints to the shared GM slot buffer plus each side's local
+// consumer buffer.
+//
+// What is transferred:
+// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile via `pto.tpush_to_aiv`
+//   with `split = 0` (no split). Vector receives that same logical `16 x 16` tile
+//   with `pto.tpop_from_aic`, but in a vector tile type/layout.
+// - Vector -> Cube: one full `16 x 16` `f32` vector tile via `pto.tpush_to_aic`
+//   with `split = 0` (no split). Cube receives that same logical `16 x 16` tile
+//   with `pto.tpop_from_aiv`, but in a matrix tile type/layout.
+//
+// Shape summary:
+// - All transferred tiles are `rows=16, cols=16, dtype=f32`
+// - Cube-produced tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
+// - Vector-produced tile: `loc=vec`, `blayout=row_major`, `slayout=none_box`
+// - Cube-consumed tile after V2C pop: `loc=mat`, `blayout=col_major`, `slayout=row_major`
+// - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box`
 module {
-  func.func @cube_kernel(%gm_slot_buffer: i32)
-      attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+
+  func.func @call_both(%gm_slot_buffer: !pto.ptr<f32>) attributes {pto.entry} {
+    func.call @cube_kernel(%gm_slot_buffer) : (!pto.ptr<f32>) -> ()
+    func.call @vector_kernel(%gm_slot_buffer) : (!pto.ptr<f32>) -> ()
+    return
+  }
+
+  func.func @cube_kernel(%gm_slot_buffer: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
     %v2c_local = pto.reserve_buffer {
       name = "v2c_fifo",
       size = 4096,
@@ -12,20 +43,22 @@ module {
       peer_func = @vector_kernel
     } -> i32
     pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024}
-      (gm_slot_buffer = %gm_slot_buffer : i32,
+      (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
        c2v_consumer_buf = %c2v_import : i32,
        v2c_consumer_buf = %v2c_local : i32)
 
     %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
     pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
 
-    %mat_tile = pto.tpop_from_aiv {split = 1}
-      -> !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>
-    pto.tfree_from_aiv {split = 2}
+    %mat_tile = pto.tpop_from_aiv {split = 0}
+      -> !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tfree_from_aiv {split = 0}
     return
   }
 
-  func.func @vector_kernel(%gm_slot_buffer: i32)
+  func.func @vector_kernel(%gm_slot_buffer: !pto.ptr<f32>)
       attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
     %c2v_local = pto.reserve_buffer {
       name = "c2v_fifo",
@@ -38,96 +71,19 @@ module {
       peer_func = @cube_kernel
     } -> i32
     pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024}
-      (gm_slot_buffer = %gm_slot_buffer : i32,
+      (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
        c2v_consumer_buf = %c2v_local : i32,
        v2c_consumer_buf = %v2c_import : i32)
 
-    %vec_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tpush_to_aic(%vec_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
-
-    %recv_tile = pto.tpop_from_aic {split = 1}
-      -> !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tfree_from_aic {split = 2}
-    return
-  }
-
-  func.func @cube_kernel_nested(%gm_slot_buffer: i32)
-      attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-    %true = arith.constant true
-    scf.if %true {
-      %v2c_local = pto.reserve_buffer {
-        name = "v2c_fifo_nested",
-        size = 4096,
-        location = #pto.address_space<mat>,
-        auto = true
-      } -> i32
-      %c2v_import = pto.import_reserved_buffer {
-        name = "c2v_fifo_nested",
-        peer_func = @vector_kernel_nested
-      } -> i32
-      pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024}
-        (gm_slot_buffer = %gm_slot_buffer : i32,
-         c2v_consumer_buf = %c2v_import : i32,
-         v2c_consumer_buf = %v2c_local : i32)
-
-      %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
-      pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
+    %vec_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tpush_to_aic(%vec_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
 
-      %recv_tile = pto.tpop_from_aiv {split = 1}
-        -> !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=1024, pad=0>
-      pto.tfree_from_aiv {split = 2}
-    }
+    %recv_tile = pto.tpop_from_aic {split = 0}
+      -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    %neg_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tneg ins(%recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%neg_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tfree_from_aic {split = 0}
     return
   }
 
-  func.func @vector_kernel_nested(%gm_slot_buffer: i32)
-      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
-    %true = arith.constant true
-    scf.if %true {
-      %c2v_local = pto.reserve_buffer {
-        name = "c2v_fifo_nested",
-        size = 4096,
-        location = #pto.address_space<vec>,
-        auto = true
-      } -> i32
-      %v2c_import = pto.import_reserved_buffer {
-        name = "v2c_fifo_nested",
-        peer_func = @cube_kernel_nested
-      } -> i32
-      pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024}
-        (gm_slot_buffer = %gm_slot_buffer : i32,
-         c2v_consumer_buf = %c2v_local : i32,
-         v2c_consumer_buf = %v2c_import : i32)
-
-      %vec_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tpush_to_aic(%vec_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
-
-      %recv_tile = pto.tpop_from_aic {split = 1}
-        -> !pto.tile_buf<loc=vec, dtype=f32, rows=8, cols=16, v_row=8, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-      pto.tfree_from_aic {split = 2}
-    }
-    return
-  }
 }
-
-// A3-LABEL: AICORE void cube_kernel(
-// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>(
-// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>(
-// A3: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
-// A3: TPOP<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
-// A3: TFREE<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
-
-// A3-LABEL: AICORE void vector_kernel(
-// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>(
-// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>(
-// A3: TPUSH<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
-// A3: TPOP<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
-// A3: TFREE<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
-
-// A3-LABEL: AICORE void cube_kernel_nested(
-// A3: if (
-// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>(
-// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>(
-// A3: TPUSH<TPipe<0, Direction::DIR_C2V, 1024, 4, 4>
-// A3: TPOP<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
-// A3: TFREE<TPipe<2, Direction::DIR_V2C, 1024, 4, 4>
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
index 5926c256..8406b32d 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
+++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
@@ -11,5 +11,5 @@ extern "C" void call_kernel(
     void *stream,
     uint8_t *gmSlotBuffer)
 {
-    bidirectional_example<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer);
+    call_both<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer);
 }

From 5137291c30671bf27bd341bbea12b626a21b45a5 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 10:49:21 +0000
Subject: [PATCH 08/38] feat: now does simple add

---
 .../mix-kernel_mlir/bidirectional_example.mlir  | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
index 849f8d37..5f209333 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
@@ -13,14 +13,15 @@
 // - Cube -> Vector: one full `16 x 16` `f32` accumulator tile via `pto.tpush_to_aiv`
 //   with `split = 0` (no split). Vector receives that same logical `16 x 16` tile
 //   with `pto.tpop_from_aic`, but in a vector tile type/layout.
-// - Vector -> Cube: one full `16 x 16` `f32` vector tile via `pto.tpush_to_aic`
-//   with `split = 0` (no split). Cube receives that same logical `16 x 16` tile
-//   with `pto.tpop_from_aiv`, but in a matrix tile type/layout.
+// - Vector -> Cube: the doubled version of that received tile. Vector computes
+//   `recv_tile + recv_tile` with `pto.tadd`, then sends that full `16 x 16` `f32`
+//   result back with `pto.tpush_to_aic`. Cube receives it with `pto.tpop_from_aiv`
+//   in a matrix tile type/layout.
 //
 // Shape summary:
 // - All transferred tiles are `rows=16, cols=16, dtype=f32`
 // - Cube-produced tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
-// - Vector-produced tile: `loc=vec`, `blayout=row_major`, `slayout=none_box`
+// - Vector-produced return tile: `loc=vec`, `blayout=row_major`, `slayout=none_box`
 // - Cube-consumed tile after V2C pop: `loc=mat`, `blayout=col_major`, `slayout=row_major`
 // - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box`
 module {
@@ -75,14 +76,12 @@ module {
        c2v_consumer_buf = %c2v_local : i32,
        v2c_consumer_buf = %v2c_import : i32)
 
-    %vec_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tpush_to_aic(%vec_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
-
     %recv_tile = pto.tpop_from_aic {split = 0}
       -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %neg_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tneg ins(%recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%neg_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    %sum_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.tadd ins(%recv_tile, %recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sum_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
     pto.tfree_from_aic {split = 0}
+    pto.tpush_to_aic(%sum_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
     return
   }
 

From 6210864e2f09b941d7b0f7cf0d7a08498cbbfc33 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 14:02:32 +0000
Subject: [PATCH 09/38] feat: clean, simple working version (C2V-only matmul with host-side validation)

---
 .../bidirectional_example.mlir                | 81 ++++++++++---------
 .../aot/tpushpop/mix-kernel_mlir/caller.cpp   |  6 +-
 .../aot/tpushpop/mix-kernel_mlir/compile.sh   |  8 +-
 .../run_bidirectional_example.py              | 33 +++++++-
 4 files changed, 79 insertions(+), 49 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
index 5f209333..d5eab7fb 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
@@ -1,87 +1,88 @@
 // Bidirectional pipe example.
 //
-// There are two logical FIFO pipes:
+// This reduced version only uses the C2V pipe:
 // - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel`
-// - `v2c_fifo`: vector/kernel `@vector_kernel` pushes to cube/kernel `@cube_kernel`
 //
 // `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import
 // ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe`
 // binds those FIFO endpoints to the shared GM slot buffer plus each side's local
 // consumer buffer.
 //
+// End-to-end data flow:
+// - Cube loads one input matrix `X` from GM.
+// - Cube computes `Y = X @ X`.
+// - Cube sends that accumulator tile to vector over `c2v_fifo`.
+// - Vector pops the tile and stores it to GM as output matrix `Y`.
+//
 // What is transferred:
-// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile via `pto.tpush_to_aiv`
-//   with `split = 0` (no split). Vector receives that same logical `16 x 16` tile
-//   with `pto.tpop_from_aic`, but in a vector tile type/layout.
-// - Vector -> Cube: the doubled version of that received tile. Vector computes
-//   `recv_tile + recv_tile` with `pto.tadd`, then sends that full `16 x 16` `f32`
-//   result back with `pto.tpush_to_aic`. Cube receives it with `pto.tpop_from_aiv`
-//   in a matrix tile type/layout.
+// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile `Y = X @ X`
+//   sent with `pto.tpush_to_aiv` using `split = 0` (no split). Vector receives
+//   that same logical `16 x 16` tile with `pto.tpop_from_aic` in a vector tile
+//   type/layout, then stores it to the GM output buffer.
 //
 // Shape summary:
 // - All transferred tiles are `rows=16, cols=16, dtype=f32`
-// - Cube-produced tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
-// - Vector-produced return tile: `loc=vec`, `blayout=row_major`, `slayout=none_box`
-// - Cube-consumed tile after V2C pop: `loc=mat`, `blayout=col_major`, `slayout=row_major`
+// - Cube-produced C2V tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
 // - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box`
 module {
 
-  func.func @call_both(%gm_slot_buffer: !pto.ptr<f32>) attributes {pto.entry} {
-    func.call @cube_kernel(%gm_slot_buffer) : (!pto.ptr<f32>) -> ()
-    func.call @vector_kernel(%gm_slot_buffer) : (!pto.ptr<f32>) -> ()
+  func.func @call_both(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>) attributes {pto.entry} {
+    func.call @cube_kernel(%gm_slot_buffer, %gm_x) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
+    func.call @vector_kernel(%gm_slot_buffer, %gm_y) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
     return
   }
 
-  func.func @cube_kernel(%gm_slot_buffer: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
-    %v2c_local = pto.reserve_buffer {
-      name = "v2c_fifo",
-      size = 4096,
-      location = #pto.address_space<mat>,
-      auto = true
-    } -> i32
+  func.func @cube_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
     %c2v_import = pto.import_reserved_buffer {
       name = "c2v_fifo",
       peer_func = @vector_kernel
     } -> i32
-    pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024}
+    %c0_i32 = arith.constant 0 : i32
+    pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
       (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
        c2v_consumer_buf = %c2v_import : i32,
-       v2c_consumer_buf = %v2c_local : i32)
+       v2c_consumer_buf = %c0_i32 : i32)
 
+    %x_mat_tile = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %x_left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %x_right_tile = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>
     %acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %gm_x_view = pto.make_tensor_view %gm_x, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %gm_x_tile_view = pto.partition_view %gm_x_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+    pto.tload ins(%gm_x_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+    pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_right_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
+    pto.tmatmul ins(%x_left_tile, %x_right_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
     pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
-
-    %mat_tile = pto.tpop_from_aiv {split = 0}
-      -> !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
-    %left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>
-    pto.tmov ins(%mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=row_major, fractal=512, pad=0>)
-    pto.tfree_from_aiv {split = 0}
     return
   }
 
-  func.func @vector_kernel(%gm_slot_buffer: !pto.ptr<f32>)
+  func.func @vector_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>)
       attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
     %c2v_local = pto.reserve_buffer {
       name = "c2v_fifo",
       size = 4096,
       location = #pto.address_space<vec>,
       auto = true
     } -> i32
-    %v2c_import = pto.import_reserved_buffer {
-      name = "v2c_fifo",
-      peer_func = @cube_kernel
-    } -> i32
-    pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024}
+    %c0_i32 = arith.constant 0 : i32
+    pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
       (gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
        c2v_consumer_buf = %c2v_local : i32,
-       v2c_consumer_buf = %v2c_import : i32)
+       v2c_consumer_buf = %c0_i32 : i32)
 
+    %gm_y_view = pto.make_tensor_view %gm_y, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %gm_y_tile_view = pto.partition_view %gm_y_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
     %recv_tile = pto.tpop_from_aic {split = 0}
       -> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    %sum_tile = pto.alloc_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
-    pto.tadd ins(%recv_tile, %recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%sum_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.tstore ins(%recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gm_y_tile_view : !pto.partition_tensor_view<16x16xf32>)
     pto.tfree_from_aic {split = 0}
-    pto.tpush_to_aic(%sum_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) {split = 0}
     return
   }
 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
index 8406b32d..e558e69d 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
+++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
@@ -9,7 +9,9 @@
 extern "C" void call_kernel(
     uint32_t blockDim,
     void *stream,
-    uint8_t *gmSlotBuffer)
+    uint8_t *gmSlotBuffer,
+    uint8_t *x,
+    uint8_t *y)
 {
-    call_both<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer);
+    call_both<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer, (__gm__ float *)x, (__gm__ float *)y);
 }
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index ee6376fb..62c64573 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -7,14 +7,14 @@ MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir"
 GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp"
 LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
 
-mkdir -p "${ARTIFACT_DIR}"
-rm -f "${GENERATED_CPP}" "${LIB_PATH}"
+#mkdir -p "${ARTIFACT_DIR}"
+#rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
-ptoas --pto-arch=a3 "${MLIR_PATH}" > "${GENERATED_CPP}"
+#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
 
 bisheng \
     -I/sources/pto-isa/include/ \
-    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
+    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 -g \
     -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
     -xcce -Xhost-start -Xhost-end \
     -mllvm -cce-aicore-stack-size=0x8000 \
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
index b5db9833..0854a6a1 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
@@ -11,6 +11,10 @@
 DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so")
 DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
 DEFAULT_FIFO_BYTES = 4 * 1024
+M = 16
+N = 16
+ATOL = 1e-4
+RTOL = 1e-4
 
 
 def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p:
@@ -31,6 +35,8 @@ def load_lib(lib_path: str) -> ctypes.CDLL:
         ctypes.c_uint32,
         ctypes.c_void_p,
         ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
     ]
     lib.call_kernel.restype = None
     return lib
@@ -41,12 +47,20 @@ def make_gm_slot_buffer(*, fifo_bytes: int, device: str) -> torch.Tensor:
     return torch.zeros((fifo_elems,), dtype=torch.float32, device=device)
 
 
-def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor) -> None:
+def make_io_tensors(*, device: str) -> tuple[torch.Tensor, torch.Tensor]:
+    x = torch.rand((M, N), dtype=torch.float32, device=device) -0.5
+    y = torch.zeros((M, N), dtype=torch.float32, device=device)
+    return x, y
+
+
+def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> None:
     stream_ptr = torch.npu.current_stream()._as_parameter_
     lib.call_kernel(
         1,
         stream_ptr,
         torch_to_ctypes(gm_slot_buffer),
+        torch_to_ctypes(x),
+        torch_to_ctypes(y),
     )
     torch.npu.synchronize()
 
@@ -62,9 +76,22 @@ def main() -> None:
         fifo_bytes=DEFAULT_FIFO_BYTES,
         device=device,
     )
+    x, y = make_io_tensors(device=device)
+
+    print(y)
+    run_kernel(lib, gm_slot_buffer=gm_slot_buffer, x=x, y=y)
+    print(y)
+
+    y_ref = x.cpu() @ x.cpu()
+    y_cpu = y.cpu()
+    max_abs = float(torch.max(torch.abs(y_cpu - y_ref)).item())
+    ok = bool(torch.allclose(y_cpu, y_ref, atol=ATOL, rtol=RTOL))
+
+    print(f"shape=({M}, {N}) max_abs={max_abs:.6f}")
+    if not ok:
+        raise SystemExit(f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}")
 
-    run_kernel(lib, gm_slot_buffer=gm_slot_buffer)
-    print(f"Launched bidirectional_example using {DEFAULT_LIB_PATH}.")
+    print(f"Validation passed using {DEFAULT_LIB_PATH}.")
 
 
 if __name__ == "__main__":

From f9d8812a50f2c43b543f7098723892682a03f18c Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 14:26:13 +0000
Subject: [PATCH 10/38] feat: clean, simple working version (re-enable ptoas codegen in compile.sh, print validation diff)

---
 examples/aot/tpushpop/mix-kernel_mlir/compile.sh            | 6 +++---
 .../tpushpop/mix-kernel_mlir/run_bidirectional_example.py   | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index 62c64573..8978eb9e 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -7,10 +7,10 @@ MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir"
 GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp"
 LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
 
-#mkdir -p "${ARTIFACT_DIR}"
-#rm -f "${GENERATED_CPP}" "${LIB_PATH}"
+mkdir -p "${ARTIFACT_DIR}"
+rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
-#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
+ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
 
 bisheng \
     -I/sources/pto-isa/include/ \
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
index 0854a6a1..1b619869 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
@@ -76,6 +76,7 @@ def main() -> None:
         fifo_bytes=DEFAULT_FIFO_BYTES,
         device=device,
     )
+    torch.set_printoptions(precision=1, threshold=2000, linewidth=250, sci_mode=False)
     x, y = make_io_tensors(device=device)
 
     print(y)
@@ -84,6 +85,8 @@ def main() -> None:
 
     y_ref = x.cpu() @ x.cpu()
     y_cpu = y.cpu()
+
+    print(y_ref-y_cpu)
     max_abs = float(torch.max(torch.abs(y_cpu - y_ref)).item())
     ok = bool(torch.allclose(y_cpu, y_ref, atol=ATOL, rtol=RTOL))
 

From d2bf0ca18cfe34cf125ca98ae251422132ac04aa Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 15:07:25 +0000
Subject: [PATCH 11/38] wip: add transfer ops to dsl

---
 ptodsl/api/pto.py         | 20 +++++++++
 ptodsl/api/pto_general.py | 90 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py
index f2e2d0ac..0cdf4bdb 100644
--- a/ptodsl/api/pto.py
+++ b/ptodsl/api/pto.py
@@ -2,15 +2,25 @@
 from .scalar import Value, wrap_value
 from .pto_general import (
     alloc_tile,
+    aic_initialize_pipe,
+    aiv_initialize_pipe,
     as_tensor,
     cube_section,
     get_block_idx,
     get_block_num,
     get_subblock_idx,
     get_subblock_num,
+    import_reserved_buffer,
     load,
+    reserve_buffer,
     slice_view,
     store,
+    tfree_from_aic,
+    tfree_from_aiv,
+    tpop_from_aic,
+    tpop_from_aiv,
+    tpush_to_aic,
+    tpush_to_aiv,
     vector_section,
     print,
 )
@@ -49,9 +59,19 @@
     "range",
     "if_context",
     "cond",
+    "reserve_buffer",
+    "import_reserved_buffer",
+    "aic_initialize_pipe",
+    "aiv_initialize_pipe",
     "alloc_tile",
     "load",
     "store",
+    "tpush_to_aiv",
+    "tpush_to_aic",
+    "tpop_from_aic",
+    "tpop_from_aiv",
+    "tfree_from_aic",
+    "tfree_from_aiv",
     "print",
     "record_event",
     "wait_event",
diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index c8f649ea..958c8459 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -1,7 +1,7 @@
 from contextlib import contextmanager
 
 from mlir.dialects import pto as _pto
-from mlir.ir import InsertionPoint
+from mlir.ir import FlatSymbolRefAttr, InsertionPoint
 
 from .scalar import Value, _unwrap
 
@@ -30,6 +30,12 @@ def _resolve_layout_attr(layout):
     return layout
 
 
+def _resolve_address_space_attr(location):
+    if isinstance(location, str):
+        return _pto.AddressSpaceAttr.get(getattr(_pto.AddressSpace, location.upper()))
+    return location
+
+
 def as_tensor(tensor_type, *, ptr, shape, strides, layout=None):
     shape_vals = [_unwrap(v) for v in shape]
     stride_vals = [_unwrap(v) for v in strides]
@@ -77,6 +83,78 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None):
     return _pto.AllocTileOp(tile_type, **kwargs).result
 
 
+# %c2v_local = pto.reserve_buffer {
+#     name = "c2v_fifo",
+#     size = 4096,
+#     location = #pto.address_space<vec>,
+#     auto = true
+# } -> i32
+def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
+    # TODO: should return be wrapped in Value class?
+    # All params are compile time attributes
+    # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
+    pass
+
+
+# %c2v_import = pto.import_reserved_buffer {
+#     name = "c2v_fifo",
+#     peer_func = @vector_kernel
+# } -> i32
+def import_reserved_buffer(*, name, peer_func):
+    # wrap import_reserved_buffer(name, peer_func, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
+    pass
+
+
+def aic_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf):
+    # wrap
+    # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    pass
+
+
+# pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (
+#    gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
+#    c2v_consumer_buf = %c2v_local : i32,
+#    v2c_consumer_buf = %c0_i32 : i32
+# )
+def aiv_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf):
+    # wrap
+    # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    pass
+
+
+# pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, ..., pad=0>) {split = 0}
+def tpush_to_aiv(tile, split):
+    # wrap tpush_to_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    _pto.tpush_to_aiv(tile, split)
+
+
+def tpush_to_aic(tile, split):
+    # wrap: tpush_to_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    _pto.tpush_to_aic(tile, split)
+
+
+# %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, ... fractal=512, pad=0>
+def tpop_from_aic(tile_type, split):
+    # wrap tpop_from_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
+    return _pto.tpop_from_aic(tile_type, split)
+
+
+def tpop_from_aiv(tile_type, split):
+    # wraps tpop_from_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
+    return _pto.tpop_from_aiv(tile_type, split)
+
+
+# pto.tfree_from_aic {split = 0}
+def tfree_from_aic(split):
+    # wrap tfree_from_aic(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    _pto.tfree_from_aic(split)
+
+
+def tfree_from_aiv(split):
+    # wrap tfree_from_aiv(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    _pto.tfree_from_aiv(split)
+
+
 def load(source, dest):
     _pto.TLoadOp(None, source, dest)
 
@@ -111,7 +189,17 @@ def print(format, scalar):
     "vector_section",
     "cube_section",
     "alloc_tile",
+    "reserve_buffer",
+    "import_reserved_buffer",
+    "aic_initialize_pipe",
+    "aiv_initialize_pipe",
     "load",
     "store",
+    "tpush_to_aiv",
+    "tpush_to_aic",
+    "tpop_from_aic",
+    "tpop_from_aiv",
+    "tfree_from_aic",
+    "tfree_from_aiv",
     "print",
 ]

From 6055f694a74e0360f1d4b26ab35b4423b4cf5e35 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 16:08:20 +0000
Subject: [PATCH 12/38] feat: docker add compiled cpp and bindings

---
 docker/Dockerfile         |  8 ++---
 ptodsl/api/pto_general.py | 64 ++++++++++++++++++++++++++++++++-------
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index d04eba64..331ca0d1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -29,10 +29,10 @@ ARG CACHE_BURST=1
 
 # ARG ARCH=x86_64
 ARG ARCH=aarch64
-ARG RELEASE_REPO=zhangstevenunity/PTOAS
-ARG RELEASE_VER=0.15
-ARG RELEASE_TAG=v${RELEASE_VER}
-ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl
+ARG RELEASE_REPO=huawei-csl/PTOAS
+ARG RELEASE_VER=20260327
+ARG RELEASE_TAG=${RELEASE_VER}
+ARG WHEEL_NAME=ptoas-0.18-cp311-none-manylinux_2_34_${ARCH}.whl
 ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz
 
 WORKDIR /installers/
diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index 958c8459..ef238787 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -36,6 +36,12 @@ def _resolve_address_space_attr(location):
     return location
 
 
+def _resolve_peer_func_attr(peer_func):
+    if isinstance(peer_func, str):
+        return FlatSymbolRefAttr.get(peer_func.removeprefix("@"))
+    return peer_func
+
+
 def as_tensor(tensor_type, *, ptr, shape, strides, layout=None):
     shape_vals = [_unwrap(v) for v in shape]
     stride_vals = [_unwrap(v) for v in strides]
@@ -90,10 +96,14 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None):
 #     auto = true
 # } -> i32
 def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
-    # TODO: should return be wrapped in Value class?
     # All params are compile time attributes
     # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
-    pass
+    kwargs = {}
+    if base is not None:
+        kwargs["base"] = base
+    return _pto.reserve_buffer(
+        name, size, _resolve_address_space_attr(location), auto_alloc, **kwargs
+    )
 
 
 # %c2v_import = pto.import_reserved_buffer {
@@ -102,13 +112,29 @@ def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
 # } -> i32
 def import_reserved_buffer(*, name, peer_func):
     # wrap import_reserved_buffer(name, peer_func, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
-    pass
+    return _pto.import_reserved_buffer(name, _resolve_peer_func_attr(peer_func))
 
 
-def aic_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf):
+def aic_initialize_pipe(
+    *,
+    dir_mask,
+    slot_size,
+    gm_slot_buffer=None, # only needed on a2/a3?
+    c2v_consumer_buf,
+    v2c_consumer_buf,
+):
     # wrap
     # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    pass
+    kwargs = {}
+    if gm_slot_buffer is not None:
+        kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer)
+    return _pto.aic_initialize_pipe(
+        dir_mask,
+        slot_size,
+        _unwrap(c2v_consumer_buf),
+        _unwrap(v2c_consumer_buf),
+        **kwargs,
+    )
 
 
 # pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (
@@ -116,21 +142,37 @@ def aic_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consume
 #    c2v_consumer_buf = %c2v_local : i32,
 #    v2c_consumer_buf = %c0_i32 : i32
 # )
-def aiv_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf):
+def aiv_initialize_pipe(
+    *,
+    dir_mask,
+    slot_size,
+    gm_slot_buffer=None,
+    c2v_consumer_buf,
+    v2c_consumer_buf,
+):
     # wrap
     # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    pass
+    kwargs = {}
+    if gm_slot_buffer is not None:
+        kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer)
+    return _pto.aiv_initialize_pipe(
+        dir_mask,
+        slot_size,
+        _unwrap(c2v_consumer_buf),
+        _unwrap(v2c_consumer_buf),
+        **kwargs,
+    )
 
 
 # pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, ..., pad=0>) {split = 0}
 def tpush_to_aiv(tile, split):
     # wrap tpush_to_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    _pto.tpush_to_aiv(tile, split)
+    return _pto.tpush_to_aiv(_unwrap(tile), split)
 
 
 def tpush_to_aic(tile, split):
     # wrap: tpush_to_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    _pto.tpush_to_aic(tile, split)
+    return _pto.tpush_to_aic(_unwrap(tile), split)
 
 
 # %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, ... fractal=512, pad=0>
@@ -147,12 +189,12 @@ def tpop_from_aiv(tile_type, split):
 # pto.tfree_from_aic {split = 0}
 def tfree_from_aic(split):
     # wrap tfree_from_aic(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    _pto.tfree_from_aic(split)
+    return _pto.tfree_from_aic(split)
 
 
 def tfree_from_aiv(split):
     # wrap tfree_from_aiv(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    _pto.tfree_from_aiv(split)
+    return _pto.tfree_from_aiv(split)
 
 
 def load(source, dest):

From 8ea9459ebf0f804211de23c9d0a1f3287178614b Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Fri, 27 Mar 2026 16:08:51 +0000
Subject: [PATCH 13/38] feat: use classes instead

---
 ptodsl/api/pto_general.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index ef238787..27fff68f 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -101,9 +101,9 @@ def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
     kwargs = {}
     if base is not None:
         kwargs["base"] = base
-    return _pto.reserve_buffer(
+    return _pto.ReserveBufferOp(
         name, size, _resolve_address_space_attr(location), auto_alloc, **kwargs
-    )
+    ).result
 
 
 # %c2v_import = pto.import_reserved_buffer {
@@ -112,7 +112,7 @@ def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
 # } -> i32
 def import_reserved_buffer(*, name, peer_func):
     # wrap import_reserved_buffer(name, peer_func, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
-    return _pto.import_reserved_buffer(name, _resolve_peer_func_attr(peer_func))
+    return _pto.ImportReservedBufferOp(name, _resolve_peer_func_attr(peer_func)).result
 
 
 def aic_initialize_pipe(
@@ -128,11 +128,11 @@ def aic_initialize_pipe(
     kwargs = {}
     if gm_slot_buffer is not None:
         kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer)
-    return _pto.aic_initialize_pipe(
+    return _pto.AicInitializePipeOp(
         dir_mask,
         slot_size,
-        _unwrap(c2v_consumer_buf),
-        _unwrap(v2c_consumer_buf),
+        c2v_consumer_buf=_unwrap(c2v_consumer_buf),
+        v2c_consumer_buf=_unwrap(v2c_consumer_buf),
         **kwargs,
     )
 
@@ -146,7 +146,7 @@ def aiv_initialize_pipe(
     *,
     dir_mask,
     slot_size,
-    gm_slot_buffer=None,
+    gm_slot_buffer=None, # only needed on a2/a3
     c2v_consumer_buf,
     v2c_consumer_buf,
 ):
@@ -155,11 +155,11 @@ def aiv_initialize_pipe(
     kwargs = {}
     if gm_slot_buffer is not None:
         kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer)
-    return _pto.aiv_initialize_pipe(
+    return _pto.AivInitializePipeOp(
         dir_mask,
         slot_size,
-        _unwrap(c2v_consumer_buf),
-        _unwrap(v2c_consumer_buf),
+        c2v_consumer_buf=_unwrap(c2v_consumer_buf),
+        v2c_consumer_buf=_unwrap(v2c_consumer_buf),
         **kwargs,
     )
 
@@ -167,34 +167,34 @@ def aiv_initialize_pipe(
 # pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, ..., pad=0>) {split = 0}
 def tpush_to_aiv(tile, split):
     # wrap tpush_to_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    return _pto.tpush_to_aiv(_unwrap(tile), split)
+    return _pto.TPushToAivOp(_unwrap(tile), split)
 
 
 def tpush_to_aic(tile, split):
     # wrap: tpush_to_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    return _pto.tpush_to_aic(_unwrap(tile), split)
+    return _pto.TPushToAicOp(_unwrap(tile), split)
 
 
 # %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf<loc=vec, ... fractal=512, pad=0>
 def tpop_from_aic(tile_type, split):
     # wrap tpop_from_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
-    return _pto.tpop_from_aic(tile_type, split)
+    return _pto.TPopFromAicOp(tile_type, split).result
 
 
 def tpop_from_aiv(tile_type, split):
     # wraps tpop_from_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
-    return _pto.tpop_from_aiv(tile_type, split)
+    return _pto.TPopFromAivOp(tile_type, split).result
 
 
 # pto.tfree_from_aic {split = 0}
 def tfree_from_aic(split):
     # wrap tfree_from_aic(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    return _pto.tfree_from_aic(split)
+    return _pto.TFreeFromAicOp(split)
 
 
 def tfree_from_aiv(split):
     # wrap tfree_from_aiv(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    return _pto.tfree_from_aiv(split)
+    return _pto.TFreeFromAivOp(split)
 
 
 def load(source, dest):

From 3c7dbd9d06e973b64cadbe56f99b78efc57ab584 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 08:47:24 +0000
Subject: [PATCH 14/38] WIP: add builder with multiple funcs

---
 .../mix-kernel_mlir/bidirectional_builder.py  | 165 ++++++++++++++++++
 1 file changed, 165 insertions(+)
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
new file mode 100644
index 00000000..fbf68e6b
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
@@ -0,0 +1,165 @@
+from mlir.dialects import arith, func, pto as _pto
+from mlir.ir import (
+    Attribute,
+    Context,
+    FlatSymbolRefAttr,
+    InsertionPoint,
+    Location,
+    Module,
+    Operation,
+    UnitAttr,
+)
+
+from ptodsl import pto, tile
+from ptodsl import scalar as s
+
+const = s.const
+
+
+def _call(name, *args):
+    return Operation.create(
+        "func.call",
+        operands=list(args),
+        attributes={"callee": FlatSymbolRefAttr.get(name)},
+    )
+
+
+def _kernel(fn, kind):
+    fn.operation.attributes["pto.kernel_kind"] = Attribute.parse(
+        f"#pto.kernel_kind<{kind}>"
+    )
+
+
+def build_module():
+    with Context() as ctx, Location.unknown():
+        _pto.register_dialect(ctx, load=True)
+        module = Module.create()
+
+        dtype = pto.float32
+        ptr_ty = pto.PtrType(dtype)
+        i32 = pto.int32
+        tensor_ty = pto.TensorType(rank=2, dtype=dtype)
+        tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
+        left_cfg = pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor")
+        x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
+        x_left_ty = pto.TileBufType(
+            shape=[16, 16],
+            dtype=dtype,
+            memory_space="LEFT",
+            config=left_cfg,
+        )
+        x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
+        acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
+        recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
+        call_both_ty = func.FunctionType.get([ptr_ty, ptr_ty, ptr_ty], [])
+        two_ptr_ty = func.FunctionType.get([ptr_ty, ptr_ty], [])
+
+        with InsertionPoint(module.body):
+            call_both = func.FuncOp("call_both", call_both_ty)
+            cube_kernel = func.FuncOp("cube_kernel", two_ptr_ty)
+            vector_kernel = func.FuncOp("vector_kernel", two_ptr_ty)
+
+        call_both.operation.attributes["pto.entry"] = UnitAttr.get(ctx)
+        _kernel(cube_kernel, "cube")
+        _kernel(vector_kernel, "vector")
+
+        call_both_entry = call_both.add_entry_block()
+        with InsertionPoint(call_both_entry):
+            gm_slot_buffer, gm_x, gm_y = call_both_entry.arguments
+            _call("cube_kernel", gm_slot_buffer, gm_x)
+            _call("vector_kernel", gm_slot_buffer, gm_y)
+            func.ReturnOp([])
+
+        cube_entry = cube_kernel.add_entry_block()
+        with InsertionPoint(cube_entry):
+            gm_slot_buffer, gm_x = cube_entry.arguments
+            c0 = const(0)
+            c1 = const(1)
+            c16 = const(16)
+            c0_i32 = arith.ConstantOp(i32, 0).result
+            c2v_import = pto.import_reserved_buffer(
+                name="c2v_fifo",
+                peer_func="@vector_kernel",
+            )
+
+            pto.aic_initialize_pipe(
+                dir_mask=1,
+                slot_size=1024,
+                gm_slot_buffer=gm_slot_buffer,
+                c2v_consumer_buf=c2v_import,
+                v2c_consumer_buf=c0_i32,
+            )
+
+            x_mat_tile = pto.alloc_tile(x_mat_ty)
+            x_left_tile = pto.alloc_tile(x_left_ty)
+            x_right_tile = pto.alloc_tile(x_right_ty)
+            acc_tile = pto.alloc_tile(acc_ty)
+
+            gm_x_view = pto.as_tensor(
+                tensor_ty,
+                ptr=gm_x,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            )
+            gm_x_tile_view = pto.slice_view(
+                tile_view_ty,
+                source=gm_x_view,
+                offsets=[c0, c0],
+                sizes=[c16, c16],
+            )
+
+            pto.load(gm_x_tile_view, x_mat_tile)
+            tile.mov(x_mat_tile, x_left_tile)
+            tile.mov(x_mat_tile, x_right_tile)
+            tile.matmul(x_left_tile, x_right_tile, acc_tile)
+            pto.tpush_to_aiv(acc_tile, 0)
+            func.ReturnOp([])
+
+        vector_entry = vector_kernel.add_entry_block()
+        with InsertionPoint(vector_entry):
+            gm_slot_buffer, gm_y = vector_entry.arguments
+            c0 = const(0)
+            c1 = const(1)
+            c16 = const(16)
+            c0_i32 = arith.ConstantOp(i32, 0).result
+            c2v_local = pto.reserve_buffer(
+                name="c2v_fifo",
+                size=4096,
+                location="VEC",
+            )
+
+            pto.aiv_initialize_pipe(
+                dir_mask=1,
+                slot_size=1024,
+                gm_slot_buffer=gm_slot_buffer,
+                c2v_consumer_buf=c2v_local,
+                v2c_consumer_buf=c0_i32,
+            )
+
+            gm_y_view = pto.as_tensor(
+                tensor_ty,
+                ptr=gm_y,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            )
+            gm_y_tile_view = pto.slice_view(
+                tile_view_ty,
+                source=gm_y_view,
+                offsets=[c0, c0],
+                sizes=[c16, c16],
+            )
+
+            recv_tile = pto.tpop_from_aic(recv_ty, 0)
+            pto.store(recv_tile, gm_y_tile_view)
+            pto.tfree_from_aic(0)
+            func.ReturnOp([])
+
+        module.operation.verify()
+        return module
+
+
+module = build_module()
+
+
+if __name__ == "__main__":
+    print(module)

From 834ca8f73949468da2a49f6c0c6b8e660299f43b Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 09:08:51 +0000
Subject: [PATCH 15/38] feat: add type arg to const() api

---
 ptodsl/api/scalar.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ptodsl/api/scalar.py b/ptodsl/api/scalar.py
index 90938daa..73ad8a1b 100644
--- a/ptodsl/api/scalar.py
+++ b/ptodsl/api/scalar.py
@@ -98,8 +98,10 @@ def __getattr__(name):
     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
 
-def const(value):
-    return Value(arith.ConstantOp(IndexType.get(), value).result)
+def const(value, type=None):
+    if type is None:
+        type = IndexType.get()
+    return Value(arith.ConstantOp(type, value).result)
 
 
 def index_cast(value, index_type=IndexType):

From 422f5f23235727059a44c9dec542f65448bc4e0d Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 09:20:17 +0000
Subject: [PATCH 16/38] WIP: allow multiple functions inside a decorated module builder

---
 ptodsl/api/pto.py                   |   4 +
 ptodsl/api/pto_general.py           |  13 ++-
 ptodsl/compiler/ir.py               | 126 ++++++++++++++++++++++------
 tests/frontend/test_multifunc_ir.py |  27 ++++++
 4 files changed, 143 insertions(+), 27 deletions(-)
 create mode 100644 tests/frontend/test_multifunc_ir.py

diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py
index 0cdf4bdb..f1f3012f 100644
--- a/ptodsl/api/pto.py
+++ b/ptodsl/api/pto.py
@@ -1,3 +1,4 @@
+from ..compiler.ir import ir_func as func
 from .control_flow import cond, range, if_context
 from .scalar import Value, wrap_value
 from .pto_general import (
@@ -5,6 +6,7 @@
     aic_initialize_pipe,
     aiv_initialize_pipe,
     as_tensor,
+    call,
     cube_section,
     get_block_idx,
     get_block_num,
@@ -48,10 +50,12 @@
     "SubTensorType",
     "TileBufConfig",
     "TileBufType",
+    "func",
     "get_block_idx",
     "get_subblock_idx",
     "get_subblock_num",
     "get_block_num",
+    "call",
     "as_tensor",
     "slice_view",
     "vector_section",
diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index 27fff68f..1ea2a5c8 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -1,7 +1,7 @@
 from contextlib import contextmanager
 
 from mlir.dialects import pto as _pto
-from mlir.ir import FlatSymbolRefAttr, InsertionPoint
+from mlir.ir import FlatSymbolRefAttr, InsertionPoint, Operation
 
 from .scalar import Value, _unwrap
 
@@ -37,11 +37,21 @@ def _resolve_address_space_attr(location):
 
 
 def _resolve_peer_func_attr(peer_func):
+    if hasattr(peer_func, "sym_name"):
+        peer_func = peer_func.sym_name
     if isinstance(peer_func, str):
         return FlatSymbolRefAttr.get(peer_func.removeprefix("@"))
     return peer_func
 
 
+def call(callee, *args):
+    return Operation.create(
+        "func.call",
+        operands=[_unwrap(arg) for arg in args],
+        attributes={"callee": _resolve_peer_func_attr(callee)},
+    )
+
+
 def as_tensor(tensor_type, *, ptr, shape, strides, layout=None):
     shape_vals = [_unwrap(v) for v in shape]
     stride_vals = [_unwrap(v) for v in strides]
@@ -226,6 +236,7 @@ def print(format, scalar):
     "get_subblock_idx",
     "get_subblock_num",
     "get_block_num",
+    "call",
     "as_tensor",
     "slice_view",
     "vector_section",
diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py
index b32730ef..edeb5c24 100644
--- a/ptodsl/compiler/ir.py
+++ b/ptodsl/compiler/ir.py
@@ -1,11 +1,26 @@
 import inspect
 
 from mlir.dialects import func, pto as _pto
-from mlir.ir import Context, InsertionPoint, Location, Module
+from mlir.ir import Attribute, Context, InsertionPoint, Location, Module, UnitAttr
 
 from ..api.scalar import wrap_value
 
 
+_MODULE_STACK = []
+
+
+class FuncRef:
+    def __init__(self, sym_name):
+        self.sym_name = sym_name
+
+
+class _ModuleState:
+    def __init__(self, *, ctx, module, meta_map):
+        self.ctx = ctx
+        self.module = module
+        self.meta_map = meta_map
+
+
 def _resolve_meta(meta_fn):
     values = meta_fn()
     if not isinstance(values, dict):
@@ -72,38 +87,97 @@ def _restore_globals(fn, old, injected_names):
             fn.__globals__[name] = old[name]
 
 
-def to_ir_module(*, meta_data):
+def _build_func_body(ir_func, fn, ret_types, meta_map):
+    entry = ir_func.add_entry_block()
+    with InsertionPoint(entry):
+        wrapped_args = [wrap_value(arg) for arg in entry.arguments]
+        injected = set(meta_map.keys())
+        old_globals = _inject_globals(fn, meta_map)
+        try:
+            fn(*wrapped_args)
+        finally:
+            _restore_globals(fn, old_globals, injected)
+
+        if not ret_types and not _has_func_return(entry):
+            func.ReturnOp([])
+
+
+def _current_module_state():
+    if not _MODULE_STACK:
+        raise RuntimeError(
+            "`pto.func(...)` can only be used inside `@to_ir_module(..., module=True)`."
+        )
+    return _MODULE_STACK[-1]
+
+
+def ir_func(*, name=None, entry=False, kernel=None):
     def decorator(fn):
+        state = _current_module_state()
         sig = inspect.signature(fn)
+        arg_types = _resolve_arg_types(sig, state.meta_map)
+        ret_types = _resolve_ret_types(sig, state.meta_map)
+        fn_name = name or fn.__name__
+        fn_ty = func.FunctionType.get(arg_types, ret_types)
+
+        with InsertionPoint(state.module.body):
+            ir_op = func.FuncOp(fn_name, fn_ty)
+
+        if entry:
+            ir_op.operation.attributes["pto.entry"] = UnitAttr.get(state.ctx)
+        if kernel is not None:
+            ir_op.operation.attributes["pto.kernel_kind"] = Attribute.parse(
+                f"#pto.kernel_kind<{kernel}>"
+            )
 
+        _build_func_body(ir_op, fn, ret_types, state.meta_map)
+        return FuncRef(fn_name)
+
+    return decorator
+
+
+def _build_single_func_module(fn, meta_map):
+    sig = inspect.signature(fn)
+    arg_types = _resolve_arg_types(sig, meta_map)
+    ret_types = _resolve_ret_types(sig, meta_map)
+    module = Module.create()
+    fn_ty = func.FunctionType.get(arg_types, ret_types)
+
+    with InsertionPoint(module.body):
+        ir_op = func.FuncOp(fn.__name__, fn_ty)
+
+    _build_func_body(ir_op, fn, ret_types, meta_map)
+    return module
+
+
+def _build_multi_func_module(fn, meta_map, ctx):
+    if inspect.signature(fn).parameters:
+        raise ValueError("`module=True` expects a zero-argument builder function.")
+
+    module = Module.create()
+    injected = set(meta_map.keys())
+    old_globals = _inject_globals(fn, meta_map)
+    _MODULE_STACK.append(_ModuleState(ctx=ctx, module=module, meta_map=meta_map))
+    try:
+        fn()
+    finally:
+        _MODULE_STACK.pop()
+        _restore_globals(fn, old_globals, injected)
+    return module
+
+
+def to_ir_module(*, meta_data, module=False):
+    def decorator(fn):
         with Context() as ctx, Location.unknown():
             _pto.register_dialect(ctx, load=True)
             meta_map = _resolve_meta(meta_data)
-            arg_types = _resolve_arg_types(sig, meta_map)
-            ret_types = _resolve_ret_types(sig, meta_map)
-            module = Module.create()
-            fn_ty = func.FunctionType.get(arg_types, ret_types)
-
-            with InsertionPoint(module.body):
-                ir_func = func.FuncOp(fn.__name__, fn_ty)
-                entry = ir_func.add_entry_block()
-
-            with InsertionPoint(entry):
-                wrapped_args = [wrap_value(arg) for arg in entry.arguments]
-                injected = set(meta_map.keys())
-                old_globals = _inject_globals(fn, meta_map)
-                try:
-                    fn(*wrapped_args)
-                finally:
-                    _restore_globals(fn, old_globals, injected)
-
-                if not ret_types and not _has_func_return(entry):
-                    func.ReturnOp([])
-
-            module.operation.verify()
-            return module
+            if module:
+                ir_module = _build_multi_func_module(fn, meta_map, ctx)
+            else:
+                ir_module = _build_single_func_module(fn, meta_map)
+            ir_module.operation.verify()
+            return ir_module
 
     return decorator
 
 
-__all__ = ["to_ir_module"]
+__all__ = ["FuncRef", "ir_func", "to_ir_module"]
diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py
new file mode 100644
index 00000000..90fb29f7
--- /dev/null
+++ b/tests/frontend/test_multifunc_ir.py
@@ -0,0 +1,27 @@
+from ptodsl import pto, to_ir_module
+
+
+def meta_data():
+    dtype = pto.float32
+    ptr_ty = pto.PtrType(dtype)
+    return {"ptr_ty": ptr_ty}
+
+
+@to_ir_module(meta_data=meta_data, module=True)
+def build_module():
+    @pto.func(kernel="vector")
+    def worker(arg0: "ptr_ty") -> None:
+        pass
+
+    @pto.func(entry=True)
+    def entry(arg0: "ptr_ty") -> None:
+        pto.call(worker, arg0)
+
+
+def test_multifunc_builder_shapes_module():
+    text = str(build_module)
+    assert "func.func @worker" in text
+    assert "pto.kernel_kind = #pto.kernel_kind<vector>" in text
+    assert "func.func @entry" in text
+    assert "attributes {pto.entry}" in text
+    assert "func.call @worker" in text

From 2362f7ee3212bc1a71b3ec4b1d65e255dcafc1b3 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 09:50:41 +0000
Subject: [PATCH 17/38] WIP: simplify ir.py

---
 ptodsl/compiler/ir.py | 139 ++++++++++++++++++------------------------
 1 file changed, 60 insertions(+), 79 deletions(-)

diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py
index edeb5c24..c2cef082 100644
--- a/ptodsl/compiler/ir.py
+++ b/ptodsl/compiler/ir.py
@@ -6,7 +6,9 @@
 from ..api.scalar import wrap_value
 
 
-_MODULE_STACK = []
+# For the inner decorators to be clean for the user visible API `pto.func(kernel='cube')`
+# with no reference to module, we need this:
+_CURRENT = None
 
 
 class FuncRef:
@@ -14,13 +16,6 @@ def __init__(self, sym_name):
         self.sym_name = sym_name
 
 
-class _ModuleState:
-    def __init__(self, *, ctx, module, meta_map):
-        self.ctx = ctx
-        self.module = module
-        self.meta_map = meta_map
-
-
 def _resolve_meta(meta_fn):
     values = meta_fn()
     if not isinstance(values, dict):
@@ -56,10 +51,7 @@ def _resolve_ret_types(signature, meta_map):
     if isinstance(ret_annot, (list, tuple)):
         out = []
         for elem in ret_annot:
-            if isinstance(elem, str):
-                out.append(meta_map[elem])
-            else:
-                out.append(elem)
+            out.append(meta_map[elem] if isinstance(elem, str) else elem)
         return out
     return [ret_annot]
 
@@ -79,101 +71,90 @@ def _inject_globals(fn, values):
     return old
 
 
-def _restore_globals(fn, old, injected_names):
-    for name in injected_names:
+def _restore_globals(fn, old, names):
+    for name in names:
         if old[name] is None and name in fn.__globals__:
             del fn.__globals__[name]
         else:
             fn.__globals__[name] = old[name]
 
 
-def _build_func_body(ir_func, fn, ret_types, meta_map):
-    entry = ir_func.add_entry_block()
-    with InsertionPoint(entry):
-        wrapped_args = [wrap_value(arg) for arg in entry.arguments]
-        injected = set(meta_map.keys())
-        old_globals = _inject_globals(fn, meta_map)
+def _define(module, ctx, meta_map, fn, *, name=None, entry=False, kernel=None):
+    sig = inspect.signature(fn)
+    arg_types = _resolve_arg_types(sig, meta_map)
+    ret_types = _resolve_ret_types(sig, meta_map)
+    fn_name = name or fn.__name__
+    fn_ty = func.FunctionType.get(arg_types, ret_types)
+
+    with InsertionPoint(module.body):
+        ir_func = func.FuncOp(fn_name, fn_ty)
+
+    if entry:
+        ir_func.operation.attributes["pto.entry"] = UnitAttr.get(ctx)
+    if kernel is not None:
+        ir_func.operation.attributes["pto.kernel_kind"] = Attribute.parse(
+            f"#pto.kernel_kind<{kernel}>"
+        )
+
+    block = ir_func.add_entry_block()
+    with InsertionPoint(block):
+        wrapped_args = [wrap_value(arg) for arg in block.arguments]
+        old = _inject_globals(fn, meta_map)
         try:
             fn(*wrapped_args)
         finally:
-            _restore_globals(fn, old_globals, injected)
+            _restore_globals(fn, old, meta_map.keys())
 
-        if not ret_types and not _has_func_return(entry):
+        if not ret_types and not _has_func_return(block):
             func.ReturnOp([])
 
-
-def _current_module_state():
-    if not _MODULE_STACK:
-        raise RuntimeError(
-            "`pto.func(...)` can only be used inside `@to_ir_module(..., module=True)`."
-        )
-    return _MODULE_STACK[-1]
+    return FuncRef(fn_name)
 
 
 def ir_func(*, name=None, entry=False, kernel=None):
     def decorator(fn):
-        state = _current_module_state()
-        sig = inspect.signature(fn)
-        arg_types = _resolve_arg_types(sig, state.meta_map)
-        ret_types = _resolve_ret_types(sig, state.meta_map)
-        fn_name = name or fn.__name__
-        fn_ty = func.FunctionType.get(arg_types, ret_types)
-
-        with InsertionPoint(state.module.body):
-            ir_op = func.FuncOp(fn_name, fn_ty)
-
-        if entry:
-            ir_op.operation.attributes["pto.entry"] = UnitAttr.get(state.ctx)
-        if kernel is not None:
-            ir_op.operation.attributes["pto.kernel_kind"] = Attribute.parse(
-                f"#pto.kernel_kind<{kernel}>"
+        if _CURRENT is None:
+            raise RuntimeError(
+                "`pto.func(...)` can only be used inside `@to_ir_module(..., module=True)`."
             )
-
-        _build_func_body(ir_op, fn, ret_types, state.meta_map)
-        return FuncRef(fn_name)
+        return _define(
+            _CURRENT["module"],
+            _CURRENT["ctx"],
+            _CURRENT["meta_map"],
+            fn,
+            name=name,
+            entry=entry,
+            kernel=kernel,
+        )
 
     return decorator
 
 
-def _build_single_func_module(fn, meta_map):
-    sig = inspect.signature(fn)
-    arg_types = _resolve_arg_types(sig, meta_map)
-    ret_types = _resolve_ret_types(sig, meta_map)
-    module = Module.create()
-    fn_ty = func.FunctionType.get(arg_types, ret_types)
-
-    with InsertionPoint(module.body):
-        ir_op = func.FuncOp(fn.__name__, fn_ty)
-
-    _build_func_body(ir_op, fn, ret_types, meta_map)
-    return module
-
-
-def _build_multi_func_module(fn, meta_map, ctx):
-    if inspect.signature(fn).parameters:
-        raise ValueError("`module=True` expects a zero-argument builder function.")
-
-    module = Module.create()
-    injected = set(meta_map.keys())
-    old_globals = _inject_globals(fn, meta_map)
-    _MODULE_STACK.append(_ModuleState(ctx=ctx, module=module, meta_map=meta_map))
-    try:
-        fn()
-    finally:
-        _MODULE_STACK.pop()
-        _restore_globals(fn, old_globals, injected)
-    return module
-
-
 def to_ir_module(*, meta_data, module=False):
     def decorator(fn):
+        global _CURRENT
+
         with Context() as ctx, Location.unknown():
             _pto.register_dialect(ctx, load=True)
             meta_map = _resolve_meta(meta_data)
+            ir_module = Module.create()
+
             if module:
-                ir_module = _build_multi_func_module(fn, meta_map, ctx)
+                if inspect.signature(fn).parameters:
+                    raise ValueError(
+                        "`module=True` expects a zero-argument builder function."
+                    )
+                old = _inject_globals(fn, meta_map)
+                prev = _CURRENT
+                _CURRENT = {"ctx": ctx, "module": ir_module, "meta_map": meta_map}
+                try:
+                    fn()
+                finally:
+                    _CURRENT = prev
+                    _restore_globals(fn, old, meta_map.keys())
             else:
-                ir_module = _build_single_func_module(fn, meta_map)
+                _define(ir_module, ctx, meta_map, fn)
+
             ir_module.operation.verify()
             return ir_module
 

From c7b31f4160179b09c93838e4ca6081f7f2ef51cc Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 10:00:54 +0000
Subject: [PATCH 18/38] use new ptodsl api for builder

---
 .../mix-kernel_mlir/bidirectional_builder.py  | 229 +++++++-----------
 .../aot/tpushpop/mix-kernel_mlir/compile.sh   |   8 +-
 2 files changed, 94 insertions(+), 143 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
index fbf68e6b..9535b318 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
@@ -1,164 +1,109 @@
-from mlir.dialects import arith, func, pto as _pto
-from mlir.ir import (
-    Attribute,
-    Context,
-    FlatSymbolRefAttr,
-    InsertionPoint,
-    Location,
-    Module,
-    Operation,
-    UnitAttr,
-)
-
-from ptodsl import pto, tile
+from mlir.dialects import arith
+
+from ptodsl import pto, tile, to_ir_module
 from ptodsl import scalar as s
 
 const = s.const
 
 
-def _call(name, *args):
-    return Operation.create(
-        "func.call",
-        operands=list(args),
-        attributes={"callee": FlatSymbolRefAttr.get(name)},
+def meta_data():
+    dtype = pto.float32
+    ptr_ty = pto.PtrType(dtype)
+    i32 = pto.int32
+    tensor_ty = pto.TensorType(rank=2, dtype=dtype)
+    tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
+    x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
+    x_left_ty = pto.TileBufType(
+        shape=[16, 16],
+        dtype=dtype,
+        memory_space="LEFT",
+        config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"),
     )
+    x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
+    acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
+    recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
+    return locals()
+
+
+@to_ir_module(meta_data=meta_data, module=True)
+def module():
+    @pto.func(kernel="cube")
+    def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        c2v_import = pto.import_reserved_buffer(
+            name="c2v_fifo",
+            peer_func="@vector_kernel",
+        )
 
+        pto.aic_initialize_pipe(
+            dir_mask=1,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_import,
+            v2c_consumer_buf=c0_i32,
+        )
 
-def _kernel(fn, kind):
-    fn.operation.attributes["pto.kernel_kind"] = Attribute.parse(
-        f"#pto.kernel_kind<{kind}>"
-    )
-
+        x_mat_tile = pto.alloc_tile(x_mat_ty)
+        x_left_tile = pto.alloc_tile(x_left_ty)
+        x_right_tile = pto.alloc_tile(x_right_ty)
+        acc_tile = pto.alloc_tile(acc_ty)
 
-def build_module():
-    with Context() as ctx, Location.unknown():
-        _pto.register_dialect(ctx, load=True)
-        module = Module.create()
-
-        dtype = pto.float32
-        ptr_ty = pto.PtrType(dtype)
-        i32 = pto.int32
-        tensor_ty = pto.TensorType(rank=2, dtype=dtype)
-        tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
-        left_cfg = pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor")
-        x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
-        x_left_ty = pto.TileBufType(
-            shape=[16, 16],
-            dtype=dtype,
-            memory_space="LEFT",
-            config=left_cfg,
-        )
-        x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
-        acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
-        recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
-        call_both_ty = func.FunctionType.get([ptr_ty, ptr_ty, ptr_ty], [])
-        two_ptr_ty = func.FunctionType.get([ptr_ty, ptr_ty], [])
-
-        with InsertionPoint(module.body):
-            call_both = func.FuncOp("call_both", call_both_ty)
-            cube_kernel = func.FuncOp("cube_kernel", two_ptr_ty)
-            vector_kernel = func.FuncOp("vector_kernel", two_ptr_ty)
-
-        call_both.operation.attributes["pto.entry"] = UnitAttr.get(ctx)
-        _kernel(cube_kernel, "cube")
-        _kernel(vector_kernel, "vector")
-
-        call_both_entry = call_both.add_entry_block()
-        with InsertionPoint(call_both_entry):
-            gm_slot_buffer, gm_x, gm_y = call_both_entry.arguments
-            _call("cube_kernel", gm_slot_buffer, gm_x)
-            _call("vector_kernel", gm_slot_buffer, gm_y)
-            func.ReturnOp([])
-
-        cube_entry = cube_kernel.add_entry_block()
-        with InsertionPoint(cube_entry):
-            gm_slot_buffer, gm_x = cube_entry.arguments
-            c0 = const(0)
-            c1 = const(1)
-            c16 = const(16)
-            c0_i32 = arith.ConstantOp(i32, 0).result
-            c2v_import = pto.import_reserved_buffer(
-                name="c2v_fifo",
-                peer_func="@vector_kernel",
-            )
-
-            pto.aic_initialize_pipe(
-                dir_mask=1,
-                slot_size=1024,
-                gm_slot_buffer=gm_slot_buffer,
-                c2v_consumer_buf=c2v_import,
-                v2c_consumer_buf=c0_i32,
-            )
-
-            x_mat_tile = pto.alloc_tile(x_mat_ty)
-            x_left_tile = pto.alloc_tile(x_left_ty)
-            x_right_tile = pto.alloc_tile(x_right_ty)
-            acc_tile = pto.alloc_tile(acc_ty)
-
-            gm_x_view = pto.as_tensor(
+        gm_x_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
                 tensor_ty,
                 ptr=gm_x,
                 shape=[c16, c16],
                 strides=[c16, c1],
-            )
-            gm_x_tile_view = pto.slice_view(
-                tile_view_ty,
-                source=gm_x_view,
-                offsets=[c0, c0],
-                sizes=[c16, c16],
-            )
-
-            pto.load(gm_x_tile_view, x_mat_tile)
-            tile.mov(x_mat_tile, x_left_tile)
-            tile.mov(x_mat_tile, x_right_tile)
-            tile.matmul(x_left_tile, x_right_tile, acc_tile)
-            pto.tpush_to_aiv(acc_tile, 0)
-            func.ReturnOp([])
-
-        vector_entry = vector_kernel.add_entry_block()
-        with InsertionPoint(vector_entry):
-            gm_slot_buffer, gm_y = vector_entry.arguments
-            c0 = const(0)
-            c1 = const(1)
-            c16 = const(16)
-            c0_i32 = arith.ConstantOp(i32, 0).result
-            c2v_local = pto.reserve_buffer(
-                name="c2v_fifo",
-                size=4096,
-                location="VEC",
-            )
-
-            pto.aiv_initialize_pipe(
-                dir_mask=1,
-                slot_size=1024,
-                gm_slot_buffer=gm_slot_buffer,
-                c2v_consumer_buf=c2v_local,
-                v2c_consumer_buf=c0_i32,
-            )
-
-            gm_y_view = pto.as_tensor(
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        pto.load(gm_x_tile_view, x_mat_tile)
+        tile.mov(x_mat_tile, x_left_tile)
+        tile.mov(x_mat_tile, x_right_tile)
+        tile.matmul(x_left_tile, x_right_tile, acc_tile)
+        pto.tpush_to_aiv(acc_tile, 0)
+
+    @pto.func(kernel="vector")
+    def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC")
+
+        pto.aiv_initialize_pipe(
+            dir_mask=1,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_local,
+            v2c_consumer_buf=c0_i32,
+        )
+
+        gm_y_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
                 tensor_ty,
                 ptr=gm_y,
                 shape=[c16, c16],
                 strides=[c16, c1],
-            )
-            gm_y_tile_view = pto.slice_view(
-                tile_view_ty,
-                source=gm_y_view,
-                offsets=[c0, c0],
-                sizes=[c16, c16],
-            )
-
-            recv_tile = pto.tpop_from_aic(recv_ty, 0)
-            pto.store(recv_tile, gm_y_tile_view)
-            pto.tfree_from_aic(0)
-            func.ReturnOp([])
-
-        module.operation.verify()
-        return module
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
 
+        pto.store(pto.tpop_from_aic(recv_ty, 0), gm_y_tile_view)
+        pto.tfree_from_aic(0)
 
-module = build_module()
+    @pto.func(entry=True)
+    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.call(cube_kernel, gm_slot_buffer, gm_x)
+        pto.call(vector_kernel, gm_slot_buffer, gm_y)
 
 
 if __name__ == "__main__":
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index 8978eb9e..7169a980 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -10,7 +10,13 @@ LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
 mkdir -p "${ARTIFACT_DIR}"
 rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
-ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
+# Regenerate the MLIR from the Python builder; SCRIPT_DIR-anchored paths make this cwd-independent.
+MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir"
+python "${SCRIPT_DIR}/bidirectional_builder.py" > "${MLIR_GEN_PATH}"
+ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
+
+# Previous invocation, which lowered ${MLIR_PATH} directly (kept for reference):
+#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
 
 bisheng \
     -I/sources/pto-isa/include/ \

From 0597c0a34ab4a97f3536e4e1b27a93e8ccac4e8b Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 10:09:04 +0000
Subject: [PATCH 19/38] chore: remove C++ tpushpop examples and refs/ sources

---
 .../aot/tpushpop/mix-kernel_cpp/README.md     |  31 --
 .../aot/tpushpop/mix-kernel_cpp/caller.cpp    |  27 --
 .../aot/tpushpop/mix-kernel_cpp/compile.sh    |  46 ---
 .../mix-kernel_cpp/run_tpushpop_cv.py         | 158 ----------
 .../tpushpop/mix-kernel_cpp/tpushpop_cv.cpp   | 297 ------------------
 .../tpushpop/mix-kernel_cpp_simple/README.md  |  15 -
 .../tpushpop/mix-kernel_cpp_simple/caller.cpp |  27 --
 .../tpushpop/mix-kernel_cpp_simple/compile.sh |  36 ---
 .../tpushpop/mix-kernel_cpp_simple/kernel.cpp | 156 ---------
 .../aot/tpushpop/mix-kernel_cpp_simple/run.py |  72 -----
 refs/tpushpop_cv.cpp                          | 290 -----------------
 refs/tpushpop_vc.cpp                          | 236 --------------
 12 files changed, 1391 deletions(-)
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/README.md
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/caller.cpp
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/compile.sh
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/README.md
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp
 delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/run.py
 delete mode 100644 refs/tpushpop_cv.cpp
 delete mode 100644 refs/tpushpop_vc.cpp

diff --git a/examples/aot/tpushpop/mix-kernel_cpp/README.md b/examples/aot/tpushpop/mix-kernel_cpp/README.md
deleted file mode 100644
index 672e71f1..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# Cube To Vector `TPUSH`/`TPOP` Example
-
-This example keeps the kernel source in the same directory as the wrapper, using `./tpushpop_cv.cpp` with the same `compile.sh` + Python runner flow used by the AOT examples.
-
-The kernel does:
-
-- cube-side `TMATMUL`
-- `TPUSH` from cube to vector
-- vector-side `TPOP`
-- vector-side bias add
-
-## Run
-
-```bash
-python run_tpushpop_cv.py
-```
-
-That will:
-
-1. call `compile.sh`
-2. build `./tpushpop_cv_lib.so`
-3. launch the kernel on NPU
-4. compare against `A @ B + bias`
-
-The wrapper fetches the runtime FFTS/control address inside `caller.cpp` with `rtGetC2cCtrlAddr(...)`, so the Python side only needs to provide the kernel inputs, output, and FIFO backing memory.
-
-If your environment needs different PTO include roots:
-
-```bash
-PTO_INCLUDE_PATH=/sources/pto-isa/include python run_tpushpop_cv.py
-```
diff --git a/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp
deleted file mode 100644
index fbe697f4..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef KERNEL_CPP
-#error "KERNEL_CPP must be defined at compile time."
-#endif
-
-#include <cstdint>
-
-extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen);
-
-#include KERNEL_CPP
-
-extern "C" void call_kernel(
-    uint32_t blockDim,
-    void *stream,
-    uint8_t *out,
-    uint8_t *srcA,
-    uint8_t *srcB,
-    uint8_t *bias,
-    uint8_t *fifoMem)
-{
-    void *fftsAddr = nullptr;
-    uint32_t fftsLen = 0;
-    (void)blockDim;
-    (void)rtGetC2cCtrlAddr(reinterpret_cast<uint64_t *>(&fftsAddr), &fftsLen);
-    (void)fftsLen;
-
-    LaunchTPushPopMatmulAdd(reinterpret_cast<uint8_t *>(fftsAddr), out, srcA, srcB, bias, fifoMem, stream);
-}
diff --git a/examples/aot/tpushpop/mix-kernel_cpp/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp/compile.sh
deleted file mode 100644
index df924539..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp/compile.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
-LIB_PATH="${SCRIPT_DIR}/tpushpop_cv_lib.so"
-KERNEL_CPP_PATH="${KERNEL_CPP_PATH:-${SCRIPT_DIR}/tpushpop_cv.cpp}"
-EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS:-}"
-
-if [[ "${TPUSHPOP_SANITY_ONLY:-}" =~ ^(1|true|TRUE|yes|YES|on|ON)$ ]]; then
-    EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS} -DTPUSHPOP_SANITY_ONLY"
-fi
-
-PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}"
-if [[ ! -d "${PTO_INCLUDE_PATH}" ]]; then
-    if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then
-        PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include"
-    elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then
-        PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include"
-    else
-        echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." >&2
-        exit 1
-    fi
-fi
-
-mkdir -p "${ARTIFACT_DIR}"
-rm -f "${LIB_PATH}"
-
-bisheng \
-    -I"${PTO_INCLUDE_PATH}" \
-    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
-    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
-    -xcce -Xhost-start -Xhost-end \
-    -mllvm -cce-aicore-stack-size=0x8000 \
-    -mllvm -cce-aicore-function-stack-size=0x8000 \
-    -mllvm -cce-aicore-record-overflow=true \
-    -mllvm -cce-aicore-addr-transform \
-    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
-    --npu-arch=dav-2201 -DMEMORY_BASE \
-    -std=gnu++17 \
-    ${EXTRA_BISHENG_FLAGS} \
-    -DKERNEL_CPP="\"${KERNEL_CPP_PATH}\"" \
-    "${SCRIPT_DIR}/caller.cpp" \
-    -o "${LIB_PATH}"
-
-echo "Built ${LIB_PATH}."
diff --git a/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py b/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py
deleted file mode 100644
index 4e2d468d..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py
+++ /dev/null
@@ -1,158 +0,0 @@
-import ctypes
-import os
-import subprocess
-
-import numpy as np
-import torch
-import torch_npu  # noqa: F401
-
-from ptodsl.test_util import get_test_device
-
-THIS_DIR = os.path.dirname(os.path.abspath(__file__))
-DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_cv_lib.so")
-DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
-DEFAULT_KERNEL_CPP = os.path.join(THIS_DIR, "tpushpop_cv.cpp")
-DEFAULT_FIFO_BYTES = 4 * 1024
-TOTAL_M = 128
-K = 32
-N = 32
-INPUT_DTYPE = torch.float16
-SEED = 0
-ATOL = 5e-2
-RTOL = 5e-2
-SANITY_ONLY = False
-
-
-def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p:
-    return ctypes.c_void_p(tensor.data_ptr())
-
-
-def compile_example(compile_script: str) -> None:
-    env = os.environ.copy()
-    env["KERNEL_CPP_PATH"] = DEFAULT_KERNEL_CPP
-    subprocess.run(
-        ["bash", compile_script],
-        check=True,
-        cwd=THIS_DIR,
-        env=env,
-    )
-
-
-def load_lib(lib_path: str) -> ctypes.CDLL:
-    lib = ctypes.CDLL(lib_path)
-    lib.call_kernel.argtypes = [
-        ctypes.c_uint32,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-    ]
-    lib.call_kernel.restype = None
-    return lib
-
-
-def make_buffers(
-    *,
-    total_m: int,
-    k: int,
-    n: int,
-    input_dtype: torch.dtype,
-    device: str,
-    fifo_bytes: int,
-):
-    src_a = torch.randn((total_m, k), dtype=input_dtype, device=device)
-    src_b = torch.randn((k, n), dtype=input_dtype, device=device)
-    bias = torch.randn((total_m, n), dtype=torch.float32, device=device)
-    out = torch.zeros((total_m, n), dtype=torch.float32, device=device)
-
-    fifo_elems = max(1, (fifo_bytes + 3) // 4)
-    fifo_mem = torch.zeros((fifo_elems,), dtype=torch.float32, device=device)
-    return out, src_a, src_b, bias, fifo_mem
-
-
-def run_kernel(
-    lib: ctypes.CDLL,
-    *,
-    out: torch.Tensor,
-    src_a: torch.Tensor,
-    src_b: torch.Tensor,
-    bias: torch.Tensor,
-    fifo_mem: torch.Tensor,
-) -> torch.Tensor:
-    stream_ptr = torch.npu.current_stream()._as_parameter_
-    lib.call_kernel(
-        1,
-        stream_ptr,
-        torch_to_ctypes(out),
-        torch_to_ctypes(src_a),
-        torch_to_ctypes(src_b),
-        torch_to_ctypes(bias),
-        torch_to_ctypes(fifo_mem),
-    )
-    torch.npu.synchronize()
-    return out
-
-
-def reference_result(src_a: torch.Tensor, src_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
-    ref = torch.matmul(src_a.float().cpu(), src_b.float().cpu())
-    if not SANITY_ONLY:
-        ref = ref + bias.cpu()
-    return ref.to(torch.float32)
-
-
-def main() -> None:
-    compile_example(DEFAULT_COMPILE_SCRIPT)
-
-    device = get_test_device()
-    torch.npu.set_device(device)
-    torch.manual_seed(SEED)
-    np.random.seed(SEED)
-
-    lib = load_lib(DEFAULT_LIB_PATH)
-    out, src_a, src_b, bias, fifo_mem = make_buffers(
-        total_m=TOTAL_M,
-        k=K,
-        n=N,
-        input_dtype=INPUT_DTYPE,
-        device=device,
-        fifo_bytes=DEFAULT_FIFO_BYTES,
-    )
-
-    out = run_kernel(
-        lib,
-        out=out,
-        src_a=src_a,
-        src_b=src_b,
-        bias=bias,
-        fifo_mem=fifo_mem,
-    )
-    ref = reference_result(src_a, src_b, bias)
-    out_cpu = out.cpu()
-    assert ref.device == out_cpu.device
-    torch.npu.synchronize()
-    torch.set_printoptions(precision=1, sci_mode=False, linewidth=250, threshold=5000)
-    print(ref-out_cpu)
-
-    max_abs = float(torch.max(torch.abs(out_cpu - ref)).item())
-    mean_abs = float(torch.mean(torch.abs(out_cpu - ref)).item())
-    ok = bool(torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL))
-
-    print(
-        f"mode={'sanity_matmul' if SANITY_ONLY else 'tpushpop_cv'} "
-        f"shape=({TOTAL_M}, {K}, {N}) dtype={INPUT_DTYPE} "
-        f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}"
-    )
-
-    if not ok:
-        raise SystemExit(
-            f"Validation failed with atol={ATOL} rtol={RTOL}. "
-            f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}"
-        )
-
-    print(f"Validation passed using {DEFAULT_LIB_PATH}.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp b/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp
deleted file mode 100644
index 3f4c42b2..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-#include <pto/pto-inst.hpp>
-#include <pto/common/fifo.hpp>
-
-using namespace pto;
-
-#define VEC_CORES 2
-
-using ExampleInT = half;
-using ExampleOutT = float;
-constexpr uint32_t EXAMPLE_TOTAL_M = 128;
-constexpr uint32_t EXAMPLE_CASE_TILE_M = 16;
-constexpr uint32_t EXAMPLE_TILE_K = 32;
-constexpr uint32_t EXAMPLE_TILE_N = 32;
-
-#ifdef __DAV_CUBE__
-constexpr bool DAV_CUBE = true;
-#else
-constexpr bool DAV_CUBE = false;
-#endif
-
-#ifdef __DAV_VEC__
-constexpr bool DAV_VEC = true;
-#else
-constexpr bool DAV_VEC = false;
-#endif
-
-template <typename T>
-AICORE constexpr inline T CeilAlign(T num_1, T num_2)
-{
-    if (num_2 == 0) {
-        return 0;
-    }
-    return (num_1 + num_2 - 1) / num_2 * num_2;
-}
-
-#ifdef TPUSHPOP_SANITY_ONLY
-__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB)
-{
-    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
-    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_TOTAL_M, 16);
-    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
-    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
-
-    using GlobalA =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
-                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
-    using GlobalB =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
-                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-    using GlobalOut =
-        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
-                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-
-    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_TOTAL_M,
-                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
-                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
-    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>;
-    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
-    using AccTile = TileAcc<ExampleOutT, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>;
-
-    if constexpr (DAV_CUBE) {
-        TileMatA aMatTile;
-        TileMatB bMatTile;
-        LeftTile aTile;
-        RightTile bTile;
-        AccTile accTile;
-        TASSIGN(aMatTile, 0x0);
-        TASSIGN(bMatTile, 0x20000);
-        TASSIGN(aTile, 0x0);
-        TASSIGN(bTile, 0x0);
-        TASSIGN(accTile, 0x0);
-
-        GlobalA globalA(srcA);
-        GlobalB globalB(srcB);
-        GlobalOut globalOut(out);
-
-        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        TLOAD(aMatTile, globalA);
-        TLOAD(bMatTile, globalB);
-
-        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        TMOV(aTile, aMatTile);
-        TMOV(bTile, bMatTile);
-
-        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        TMATMUL(accTile, aTile, bTile);
-
-        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        TSTORE<AccTile, GlobalOut>(globalOut, accTile);
-
-        pipe_barrier(PIPE_ALL);
-    }
-}
-#else
-__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out,
-                                            __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB,
-                                            __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem)
-{
-    // Point the cross-core FIFO signaling ops at the FFTS flag storage used by TPUSH/TPOP handshakes.
-    //t_ffts_base_addr((uint64_t)ffts_addr);
-    constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M;
-    constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES;
-
-    constexpr uint16_t FLAG_ID = 0;
-    constexpr uint8_t FIFO_DEPTH = 2;
-    constexpr uint8_t FIFO_PERIOD = 1;
-    // Local ring-buffer base used by vector-side TPOP to place each popped half-tile before vector compute uses it.
-    constexpr uint32_t localFiFoBase = 0x0;
-
-    using AccTile = TileAcc<ExampleOutT, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N>;
-    using VecTileHalf =
-        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
-    using BiasTile =
-        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
-    using OutTile =
-        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
-
-    // Cube-to-vector FIFO: each GM slot stores one full AccTile, and vector TPOP reads it back as two row halves.
-    using MatPipe = TPipe<FLAG_ID, Direction::DIR_C2V,
-                          EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_N * sizeof(ExampleOutT), FIFO_DEPTH>;
-    // Bind the FIFO protocol to GM slot storage and the vector-side local staging buffer used by TPOP.
-    MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase);
-
-    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
-    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_CASE_TILE_M, 16);
-    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
-    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
-
-    using GlobalA =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
-                                 EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
-    using GlobalB =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
-                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-    using GlobalBias =
-        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
-                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-    using GlobalOut =
-        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
-                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-
-    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_CASE_TILE_M,
-                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
-                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
-    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>;
-    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
-
-    if constexpr (DAV_CUBE) {
-        TileMatA aMatTile;
-        TileMatB bMatTile;
-        TASSIGN(aMatTile, 0x0);
-        TASSIGN(bMatTile, 0x20000);
-
-        LeftTile aTile;
-        RightTile bTile;
-        AccTile accTile;
-        TASSIGN(aTile, 0x0);
-        TASSIGN(bTile, 0x0);
-        TASSIGN(accTile, 0x0);
-
-        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
-            GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K);
-            GlobalB globalB(srcB);
-
-            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-            TLOAD(aMatTile, globalA);
-            TLOAD(bMatTile, globalB);
-
-            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-            TMOV(aTile, aMatTile);
-            TMOV(bTile, bMatTile);
-
-            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-            wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-
-            TMATMUL(accTile, aTile, bTile);
-
-            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-            set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-            wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-            // Push the full accumulator tile into the next GM FIFO slot and signal vector that one split-up-down tile is ready.
-            TPUSH<MatPipe, AccTile, TileSplitAxis::TILE_UP_DOWN>(mPipe, accTile);
-
-            set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        }
-
-        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-
-    if constexpr (DAV_VEC) {
-        VecTileHalf vecTileHalf;
-        BiasTile biasTile;
-        OutTile outTile;
-        TASSIGN(biasTile, 0x10000);
-        TASSIGN(outTile, 0x20000);
-
-        uint32_t subBlockIdx = get_subblockid();
-
-        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
-            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-            // Pop this subcore's half-tile from the next ready FIFO slot into local vector memory based on get_subblockid().
-            // TILE_UP_DOWN means split MxN tile into-> [M/2xN, M/2xN].
-            TPOP<MatPipe, VecTileHalf, TileSplitAxis::TILE_UP_DOWN>(mPipe, vecTileHalf);
-
-            size_t biasOffset =
-                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
-            GlobalBias globalBias(bias + biasOffset);
-
-            TLOAD(biasTile, globalBias);
-
-            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-            TADD(outTile, vecTileHalf, biasTile);
-
-            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-
-            size_t outOffset =
-                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
-            GlobalOut globalOut(out + outOffset);
-            // Store this vector subcore's output half-tile from local vector memory back to its GM output slice.
-            TSTORE(globalOut, outTile);
-
-            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        }
-
-        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-}
-#endif
-
-void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem,
-                             void *stream)
-{
-#ifdef TPUSHPOP_SANITY_ONLY
-    (void)ffts;
-    (void)bias;
-    (void)fifoMem;
-    runSanityMatmul<<<1, nullptr, stream>>>(
-        reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA), reinterpret_cast<ExampleInT *>(srcB));
-#else
-    runTPushPopMatmulAdd<<<1, nullptr, stream>>>(
-        reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA),
-        reinterpret_cast<ExampleInT *>(srcB), reinterpret_cast<ExampleOutT *>(bias), reinterpret_cast<ExampleOutT *>(fifoMem));
-#endif
-}
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md b/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md
deleted file mode 100644
index 45adb9fc..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Simple Cube To Vector `TPUSH`/`TPOP` Example
-
-This is a stripped-down sibling of `mix-kernel_cpp`.
-
-The kernel is fixed to a single `16x32 @ 32x32` matmul, followed by a bias add on the vector side:
-
-- no tile loop
-- no sanity mode
-- no extra runner configuration
-
-Run it with:
-
-```bash
-python run.py
-```
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp
deleted file mode 100644
index fbe697f4..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef KERNEL_CPP
-#error "KERNEL_CPP must be defined at compile time."
-#endif
-
-#include <cstdint>
-
-extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen);
-
-#include KERNEL_CPP
-
-extern "C" void call_kernel(
-    uint32_t blockDim,
-    void *stream,
-    uint8_t *out,
-    uint8_t *srcA,
-    uint8_t *srcB,
-    uint8_t *bias,
-    uint8_t *fifoMem)
-{
-    void *fftsAddr = nullptr;
-    uint32_t fftsLen = 0;
-    (void)blockDim;
-    (void)rtGetC2cCtrlAddr(reinterpret_cast<uint64_t *>(&fftsAddr), &fftsLen);
-    (void)fftsLen;
-
-    LaunchTPushPopMatmulAdd(reinterpret_cast<uint8_t *>(fftsAddr), out, srcA, srcB, bias, fifoMem, stream);
-}
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh
deleted file mode 100644
index 0d8d8eb7..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}"
-LIB_PATH="${SCRIPT_DIR}/lib.so"
-
-if [[ ! -d "${PTO_INCLUDE_PATH}" ]]; then
-    if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then
-        PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include"
-    elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then
-        PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include"
-    else
-        echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." >&2
-        exit 1
-    fi
-fi
-
-rm -f "${LIB_PATH}"
-
-bisheng \
-    -I"${PTO_INCLUDE_PATH}" \
-    -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \
-    -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
-    -xcce -Xhost-start -Xhost-end \
-    -mllvm -cce-aicore-stack-size=0x8000 \
-    -mllvm -cce-aicore-function-stack-size=0x8000 \
-    -mllvm -cce-aicore-record-overflow=true \
-    -mllvm -cce-aicore-addr-transform \
-    -mllvm -cce-aicore-dcci-insert-for-scalar=false \
-    --npu-arch=dav-2201 -DMEMORY_BASE \
-    -DKERNEL_CPP="\"${SCRIPT_DIR}/kernel.cpp\"" \
-    "${SCRIPT_DIR}/caller.cpp" \
-    -o "${LIB_PATH}"
-
-echo "Built ${LIB_PATH}."
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp
deleted file mode 100644
index 02336d41..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
-Flow:
-1. Cube loads A and B from GM through GlobalTensor views.
-2. Cube copies those GM-backed matrix tiles into local matrix tiles:
-   `aMat` at `0x0`, `bMat` at `0x20000`, then converts them to matmul inputs
-   `aTile` and `bTile` and runs one `TMATMUL` into `acc`.
-3. Cube `TPUSH`es the full `16x32` accumulator tile to the C2V pipe.
-4. Vector `TPOP`s its `8x32` half-tile from that pushed accumulator, loads the
-   matching `8x32` bias tile from GM, does `TADD`, and stores the result to GM.
-
-Allocation summary:
-- `GlobalTensor` objects are just GM views over `srcA`, `srcB`, `bias`, and `out`.
-  They do not allocate local on-core memory themselves.
-- The C2V FIFO is also explicit GM memory in this example: `fifoMem` is the GM slot
-  buffer passed into `TPipe`, so cube writes the pushed accumulator tile into GM and
-  vector reads it back from that same GM-backed FIFO.
-- Cube local tiles:
-  `aMat @ 0x0`, `bMat @ 0x20000`, `aTile @ 0x0`, `bTile @ 0x0`, `acc @ 0x0`.
-- Vector local tiles:
-  `biasTile @ 0x10000`, `outTile @ 0x20000`.
-- The cross-core transfer is the matmul result: one full `AccTile<float, 16, 32>`
-  produced on cube and split `up/down` so each vector subcore receives one `8x32`
-  row half via `TPOP`.
-*/
-#include <pto/pto-inst.hpp>
-#include <pto/common/fifo.hpp>
-
-using namespace pto;
-
-using In = half;
-using Out = float;
-
-constexpr uint32_t M = 16;
-constexpr uint32_t K = 32;
-constexpr uint32_t N = 32;
-constexpr uint32_t VEC_CORES = 2;
-constexpr uint32_t VEC_M = M / VEC_CORES;
-
-#ifdef __DAV_CUBE__
-constexpr bool DAV_CUBE = true;
-#else
-constexpr bool DAV_CUBE = false;
-#endif
-
-#ifdef __DAV_VEC__
-constexpr bool DAV_VEC = true;
-#else
-constexpr bool DAV_VEC = false;
-#endif
-
-__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts, __gm__ Out *out, __gm__ In *srcA, __gm__ In *srcB,
-                                            __gm__ Out *bias, __gm__ Out *fifoMem)
-{
-    set_ffts_base_addr((uint64_t)ffts);
-
-    using GlobalA = GlobalTensor<In, Shape<1, 1, 1, M, K>, Stride<M * K, M * K, M * K, K, 1>>;
-    using GlobalB = GlobalTensor<In, Shape<1, 1, 1, K, N>, Stride<K * N, K * N, K * N, N, 1>>;
-    using GlobalBias = GlobalTensor<Out, Shape<1, 1, 1, VEC_M, N>, Stride<M * N, M * N, VEC_M * N, N, 1>>;
-    using GlobalOut = GlobalTensor<Out, Shape<1, 1, 1, VEC_M, N>, Stride<M * N, M * N, VEC_M * N, N, 1>>;
-
-    using TileMatA = Tile<TileType::Mat, In, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, In, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
-    using LeftTile = TileLeft<In, M, K, M, K>;
-    using RightTile = TileRight<In, K, N, K, N>;
-    using AccTile = TileAcc<Out, M, N, M, N>;
-    using VecTile = Tile<TileType::Vec, Out, VEC_M, N, BLayout::RowMajor, VEC_M, N>;
-
-    using Pipe = TPipe<0, Direction::DIR_C2V, M * N * sizeof(Out), 2>;
-    Pipe pipe((__gm__ void *)(uint64_t)fifoMem, 0x0, 0x0);
-
-    if constexpr (DAV_CUBE) {
-        TileMatA aMat;
-        TileMatB bMat;
-        LeftTile aTile;
-        RightTile bTile;
-        AccTile acc;
-        TASSIGN(aMat, 0x0);
-        TASSIGN(bMat, 0x20000);
-        TASSIGN(aTile, 0x0);
-        TASSIGN(bTile, 0x0);
-        TASSIGN(acc, 0x0);
-
-        GlobalA globalA(srcA);
-        GlobalB globalB(srcB);
-
-        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        TLOAD(aMat, globalA);
-        TLOAD(bMat, globalB);
-
-        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        TMOV(aTile, aMat);
-        TMOV(bTile, bMat);
-
-        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        TMATMUL(acc, aTile, bTile);
-
-        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        TPUSH<Pipe, AccTile, TileSplitAxis::TILE_UP_DOWN>(pipe, acc);
-
-        pipe_barrier(PIPE_ALL);
-    }
-
-    if constexpr (DAV_VEC) {
-        VecTile popped;
-        VecTile biasTile;
-        VecTile outTile;
-        TASSIGN(biasTile, 0x10000);
-        TASSIGN(outTile, 0x20000);
-
-        uint32_t subBlock = get_subblockid();
-        uint32_t offset = subBlock * VEC_M * N;
-        GlobalBias globalBias(bias + offset);
-        GlobalOut globalOut(out + offset);
-
-        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-        TPOP<Pipe, VecTile, TileSplitAxis::TILE_UP_DOWN>(pipe, popped);
-        TLOAD(biasTile, globalBias);
-
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        TADD(outTile, popped, biasTile);
-
-        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        TSTORE(globalOut, outTile);
-
-        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-}
-
-void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem,
-                             void *stream)
-{
-    runTPushPopMatmulAdd<<<1, nullptr, stream>>>(
-        reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<Out *>(out), reinterpret_cast<In *>(srcA),
-        reinterpret_cast<In *>(srcB), reinterpret_cast<Out *>(bias), reinterpret_cast<Out *>(fifoMem));
-}
diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py b/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py
deleted file mode 100644
index e098ddeb..00000000
--- a/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import ctypes
-import os
-import subprocess
-
-import torch
-import torch_npu  # noqa: F401
-
-from ptodsl.test_util import get_test_device
-
-THIS_DIR = os.path.dirname(os.path.abspath(__file__))
-LIB_PATH = os.path.join(THIS_DIR, "lib.so")
-M = 16
-K = 32
-N = 32
-FIFO_ELEMS = 1024
-ATOL = 5e-2
-RTOL = 5e-2
-
-
-def ptr(tensor: torch.Tensor) -> ctypes.c_void_p:
-    return ctypes.c_void_p(tensor.data_ptr())
-
-
-def main() -> None:
-    subprocess.run(["bash", "compile.sh"], check=True, cwd=THIS_DIR)
-
-    device = get_test_device()
-    torch.npu.set_device(device)
-    torch.manual_seed(0)
-
-    lib = ctypes.CDLL(LIB_PATH)
-    lib.call_kernel.argtypes = [
-        ctypes.c_uint32,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-        ctypes.c_void_p,
-    ]
-    lib.call_kernel.restype = None
-
-    a = torch.randn((M, K), dtype=torch.float16, device=device)
-    b = torch.randn((K, N), dtype=torch.float16, device=device)
-    bias = torch.randn((M, N), dtype=torch.float32, device=device)
-    out = torch.zeros((M, N), dtype=torch.float32, device=device)
-    fifo = torch.zeros((FIFO_ELEMS,), dtype=torch.float32, device=device)
-
-    lib.call_kernel(
-        1,
-        torch.npu.current_stream()._as_parameter_,
-        ptr(out),
-        ptr(a),
-        ptr(b),
-        ptr(bias),
-        ptr(fifo),
-    )
-    torch.npu.synchronize()
-
-    ref = a.float().cpu() @ b.float().cpu() + bias.cpu()
-    out_cpu = out.cpu()
-    max_abs = float((out_cpu - ref).abs().max().item())
-    print(f"max_abs={max_abs:.6f}")
-
-    if not torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL):
-        raise SystemExit("validation failed")
-
-    print("validation passed")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/refs/tpushpop_cv.cpp b/refs/tpushpop_cv.cpp
deleted file mode 100644
index 324ade79..00000000
--- a/refs/tpushpop_cv.cpp
+++ /dev/null
@@ -1,290 +0,0 @@
-#include <pto/pto-inst.hpp>
-#include <pto/common/fifo.hpp>
-
-using namespace pto;
-
-#define VEC_CORES 2
-
-using ExampleInT = half;
-using ExampleOutT = float;
-constexpr uint32_t EXAMPLE_TOTAL_M = 16;
-constexpr uint32_t EXAMPLE_CASE_TILE_M = 16;
-constexpr uint32_t EXAMPLE_TILE_K = 32;
-constexpr uint32_t EXAMPLE_TILE_N = 32;
-
-#ifdef __DAV_CUBE__
-constexpr bool DAV_CUBE = true;
-#else
-constexpr bool DAV_CUBE = false;
-#endif
-
-#ifdef __DAV_VEC__
-constexpr bool DAV_VEC = true;
-#else
-constexpr bool DAV_VEC = false;
-#endif
-
-template <typename T>
-AICORE constexpr inline T CeilAlign(T num_1, T num_2)
-{
-    if (num_2 == 0) {
-        return 0;
-    }
-    return (num_1 + num_2 - 1) / num_2 * num_2;
-}
-
-#ifdef TPUSHPOP_SANITY_ONLY
-__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB)
-{
-    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
-    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_TOTAL_M, 16);
-    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
-    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
-
-    using GlobalA =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
-                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
-    using GlobalB =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
-                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-    using GlobalOut =
-        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
-                                 EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-
-    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_TOTAL_M,
-                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
-                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
-    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_TOTAL_M, EXAMPLE_TILE_K>;
-    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
-    using AccTile = TileAcc<ExampleOutT, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N, EXAMPLE_TOTAL_M, EXAMPLE_TILE_N>;
-
-    if constexpr (DAV_CUBE) {
-        TileMatA aMatTile;
-        TileMatB bMatTile;
-        LeftTile aTile;
-        RightTile bTile;
-        AccTile accTile;
-        TASSIGN(aMatTile, 0x0);
-        TASSIGN(bMatTile, 0x20000);
-        TASSIGN(aTile, 0x0);
-        TASSIGN(bTile, 0x0);
-        TASSIGN(accTile, 0x0);
-
-        GlobalA globalA(srcA);
-        GlobalB globalB(srcB);
-        GlobalOut globalOut(out);
-
-        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        TLOAD(aMatTile, globalA);
-        TLOAD(bMatTile, globalB);
-
-        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        TMOV(aTile, aMatTile);
-        TMOV(bTile, bMatTile);
-
-        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        TMATMUL(accTile, aTile, bTile);
-
-        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        TSTORE<AccTile, GlobalOut>(globalOut, accTile);
-
-        pipe_barrier(PIPE_ALL);
-    }
-}
-#else
-__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out,
-                                            __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB,
-                                            __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem)
-{
-    set_ffts_base_addr((uint64_t)ffts_addr);
-    constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M;
-    constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES;
-
-    constexpr uint16_t FLAG_ID = 0;
-    constexpr uint8_t FIFO_DEPTH = 2;
-    constexpr uint8_t FIFO_PERIOD = 1;
-    // local fifo base used for TPOP of vector side(vecTileHalf)
-    constexpr uint32_t localFiFoBase = 0x0;
-
-    using AccTile = TileAcc<ExampleOutT, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_N>;
-    using VecTileHalf =
-        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
-    using BiasTile =
-        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
-    using OutTile =
-        Tile<TileType::Vec, ExampleOutT, VEC_M, EXAMPLE_TILE_N, BLayout::RowMajor, VEC_M, EXAMPLE_TILE_N>;
-
-    using MatPipe = TPipe<FLAG_ID, Direction::DIR_C2V,
-                          EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_N * sizeof(ExampleOutT), FIFO_DEPTH>;
-    MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase);
-
-    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT);
-    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(EXAMPLE_CASE_TILE_M, 16);
-    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(EXAMPLE_TILE_K, blockAlign);
-    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(EXAMPLE_TILE_N, blockAlign);
-
-    using GlobalA =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_K, EXAMPLE_TOTAL_M * EXAMPLE_TILE_K,
-                                 EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K, EXAMPLE_TILE_K, 1>>;
-    using GlobalB =
-        GlobalTensor<ExampleInT, pto::Shape<1, 1, 1, EXAMPLE_TILE_K, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_K * EXAMPLE_TILE_N,
-                                 EXAMPLE_TILE_K * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-    using GlobalBias =
-        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
-                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-    using GlobalOut =
-        GlobalTensor<ExampleOutT, pto::Shape<1, 1, 1, VEC_M, EXAMPLE_TILE_N>,
-                     pto::Stride<EXAMPLE_TOTAL_M * EXAMPLE_TILE_N, EXAMPLE_TOTAL_M * EXAMPLE_TILE_N,
-                                 VEC_M * EXAMPLE_TILE_N, EXAMPLE_TILE_N, 1>>;
-
-    using TileMatA = Tile<TileType::Mat, ExampleInT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, EXAMPLE_CASE_TILE_M,
-                          EXAMPLE_TILE_K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, ExampleInT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, EXAMPLE_TILE_K,
-                          EXAMPLE_TILE_N, SLayout::RowMajor, 512>;
-    using LeftTile = TileLeft<ExampleInT, ALIGNED_M, ALIGNED_K, EXAMPLE_CASE_TILE_M, EXAMPLE_TILE_K>;
-    using RightTile = TileRight<ExampleInT, ALIGNED_K, ALIGNED_N, EXAMPLE_TILE_K, EXAMPLE_TILE_N>;
-
-    if constexpr (DAV_CUBE) {
-        TileMatA aMatTile;
-        TileMatB bMatTile;
-        TASSIGN(aMatTile, 0x0);
-        TASSIGN(bMatTile, 0x20000);
-
-        LeftTile aTile;
-        RightTile bTile;
-        AccTile accTile;
-        TASSIGN(aTile, 0x0);
-        TASSIGN(bTile, 0x0);
-        TASSIGN(accTile, 0x0);
-
-        set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
-            GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K);
-            GlobalB globalB(srcB);
-
-            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-            TLOAD(aMatTile, globalA);
-            TLOAD(bMatTile, globalB);
-
-            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-            TMOV(aTile, aMatTile);
-            TMOV(bTile, bMatTile);
-
-            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-            wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-
-            TMATMUL(accTile, aTile, bTile);
-
-            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-            set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-            wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-            TPUSH<MatPipe, AccTile, TileSplitAxis::TILE_UP_DOWN>(mPipe, accTile);
-
-            set_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-        }
-
-        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-
-    if constexpr (DAV_VEC) {
-        VecTileHalf vecTileHalf;
-        BiasTile biasTile;
-        OutTile outTile;
-        TASSIGN(biasTile, 0x10000);
-        TASSIGN(outTile, 0x20000);
-
-        uint32_t subBlockIdx = get_subblockid();
-
-        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-        for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) {
-            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-            TPOP<MatPipe, VecTileHalf, TileSplitAxis::TILE_UP_DOWN>(mPipe, vecTileHalf);
-
-            size_t biasOffset =
-                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
-            GlobalBias globalBias(bias + biasOffset);
-
-            TLOAD(biasTile, globalBias);
-
-            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-            TADD(outTile, vecTileHalf, biasTile);
-
-            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-
-            size_t outOffset =
-                static_cast<size_t>(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N;
-            GlobalOut globalOut(out + outOffset);
-            TSTORE(globalOut, outTile);
-
-            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        }
-
-        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-}
-#endif
-
-void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem,
-                             void *stream)
-{
-#ifdef TPUSHPOP_SANITY_ONLY
-    (void)ffts;
-    (void)bias;
-    (void)fifoMem;
-    runSanityMatmul<<<1, nullptr, stream>>>(
-        reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA), reinterpret_cast<ExampleInT *>(srcB));
-#else
-    runTPushPopMatmulAdd<<<1, nullptr, stream>>>(
-        reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<ExampleOutT *>(out), reinterpret_cast<ExampleInT *>(srcA),
-        reinterpret_cast<ExampleInT *>(srcB), reinterpret_cast<ExampleOutT *>(bias), reinterpret_cast<ExampleOutT *>(fifoMem));
-#endif
-}
diff --git a/refs/tpushpop_vc.cpp b/refs/tpushpop_vc.cpp
deleted file mode 100644
index 69672e57..00000000
--- a/refs/tpushpop_vc.cpp
+++ /dev/null
@@ -1,236 +0,0 @@
-#include <pto/pto-inst.hpp>
-#include <pto/common/fifo.hpp>
-
-using namespace pto;
-
-#ifdef __DAV_CUBE__
-constexpr bool DAV_CUBE = true;
-#else
-constexpr bool DAV_CUBE = false;
-#endif
-
-#ifdef __DAV_VEC__
-constexpr bool DAV_VEC = true;
-#else
-constexpr bool DAV_VEC = false;
-#endif
-
-template <typename T>
-AICORE constexpr inline T CeilAlign(T num_1, T num_2)
-{
-    if (num_2 == 0) {
-        return 0;
-    }
-    return (num_1 + num_2 - 1) / num_2 * num_2;
-}
-
-template <typename QuantT, typename InT, typename OutT, int TOTAL_M, int TOTAL_K, int N, int CASE_TILE_K>
-__global__ AICORE void runTPushPopVCMatmul(__gm__ uint64_t *ffts_addr, __gm__ OutT *out, __gm__ InT *srcA,
-                                           __gm__ QuantT *quantB, __gm__ OutT *scale, __gm__ OutT *offset,
-                                           __gm__ OutT *fifoMem)
-{
-    set_ffts_base_addr((uint64_t)ffts_addr);
-    constexpr uint32_t TILE_K = CASE_TILE_K;
-    constexpr uint32_t HALF_TILE_K = TILE_K / 2;
-    constexpr uint32_t TILE_N = N;
-    constexpr uint32_t NUM_K_TILES = TOTAL_K / CASE_TILE_K;
-
-    constexpr uint16_t FLAG_ID = 0;
-    constexpr uint8_t FIFO_DEPTH = 2;
-    constexpr uint8_t FIFO_PERIOD = 1;
-    // fifo base used for TPOP of cube side (bMatTile)
-    constexpr uint32_t localFiFoBase = 0x20000;
-
-    using VecTileProd = Tile<TileType::Vec, OutT, HALF_TILE_K, TILE_N, BLayout::RowMajor, HALF_TILE_K, TILE_N>;
-    using MatTileCons =
-        Tile<TileType::Mat, OutT, TILE_K, TILE_N, BLayout::ColMajor, TILE_K, TILE_N, SLayout::RowMajor, 512>;
-
-    using MatPipe = TPipe<FLAG_ID, Direction::DIR_V2C, TILE_K * TILE_N * sizeof(OutT), FIFO_DEPTH>;
-    MatPipe mPipe((__gm__ void *)fifoMem, 0x0, localFiFoBase);
-
-    constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(InT);
-    constexpr uint32_t ALIGNED_M = CeilAlign<uint32_t>(TOTAL_M, 16);
-    constexpr uint32_t ALIGNED_K = CeilAlign<uint32_t>(TILE_K, blockAlign);
-    constexpr uint32_t ALIGNED_N = CeilAlign<uint32_t>(TILE_N, blockAlign);
-
-    using GlobalA = GlobalTensor<InT, pto::Shape<1, 1, 1, TOTAL_M, TILE_K>,
-                                 pto::Stride<TOTAL_M * TOTAL_K, TOTAL_M * TOTAL_K, TOTAL_M * TOTAL_K, TOTAL_K, 1>>;
-    using GlobalOut = GlobalTensor<OutT, pto::Shape<1, 1, 1, TOTAL_M, TILE_N>,
-                                   pto::Stride<TOTAL_M * TILE_N, TOTAL_M * TILE_N, TOTAL_M * TILE_N, TILE_N, 1>>;
-
-    using TileMatA =
-        Tile<TileType::Mat, InT, ALIGNED_M, ALIGNED_K, BLayout::ColMajor, TOTAL_M, TILE_K, SLayout::RowMajor, 512>;
-    using LeftTile = TileLeft<InT, ALIGNED_M, ALIGNED_K, TOTAL_M, TILE_K>;
-    using PopTile =
-        Tile<TileType::Mat, OutT, ALIGNED_K, ALIGNED_N, BLayout::ColMajor, TILE_K, TILE_N, SLayout::RowMajor, 512>;
-    using RightTile = TileRight<OutT, ALIGNED_K, ALIGNED_N, TILE_K, TILE_N>;
-    using AccTile = TileAcc<OutT, TOTAL_M, TILE_N, TOTAL_M, TILE_N>;
-
-    using QuantTile = Tile<TileType::Vec, QuantT, HALF_TILE_K, TILE_N, BLayout::RowMajor, HALF_TILE_K, TILE_N>;
-    using ScaleTile = Tile<TileType::Vec, OutT, HALF_TILE_K, 8, BLayout::RowMajor, -1, -1>;
-    using OffsetTile = Tile<TileType::Vec, OutT, HALF_TILE_K, 8, BLayout::RowMajor, -1, -1>;
-
-    if constexpr (DAV_VEC) {
-        QuantTile quantTile;
-        VecTileProd dequantTile;
-        ScaleTile scaleTile(HALF_TILE_K, 1);
-        OffsetTile offsetTile(HALF_TILE_K, 1);
-        TASSIGN(quantTile, 0x0);
-        TASSIGN(dequantTile, 0x10000);
-        TASSIGN(scaleTile, 0x20000);
-        TASSIGN(offsetTile, 0x28000);
-
-        set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-        set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-        using GlobalQuantB =
-            GlobalTensor<QuantT, pto::Shape<1, 1, 1, HALF_TILE_K, TILE_N>,
-                         pto::Stride<TOTAL_K * TILE_N, TOTAL_K * TILE_N, HALF_TILE_K * TILE_N, TILE_N, 1>>;
-        using GlobalScaleOffset =
-            GlobalTensor<OutT, pto::Shape<1, 1, 1, HALF_TILE_K, 1>, pto::Stride<TOTAL_K, TOTAL_K, HALF_TILE_K, 1, 1>>;
-
-        uint32_t subBlockIdx = get_subblockid();
-
-        for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) {
-            GlobalQuantB globalQuantB(quantB + k_tile * TILE_K * TILE_N + subBlockIdx * HALF_TILE_K * TILE_N);
-            GlobalScaleOffset globalScale(scale + k_tile * TILE_K + subBlockIdx * HALF_TILE_K);
-            GlobalScaleOffset globalOffset(offset + k_tile * TILE_K + subBlockIdx * HALF_TILE_K);
-
-            wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-            TLOAD(quantTile, globalQuantB);
-            TLOAD(scaleTile, globalScale);
-            TLOAD(offsetTile, globalOffset);
-
-            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-            TDEQUANT(dequantTile, quantTile, scaleTile, offsetTile);
-
-            set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-
-            TPUSH<MatPipe, VecTileProd, TileSplitAxis::TILE_UP_DOWN>(mPipe, dequantTile);
-            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-        }
-
-        wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
-        wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-
-    if constexpr (DAV_CUBE) {
-        TileMatA aMatTile;
-        PopTile bMatTile;
-        TASSIGN(aMatTile, 0x0);
-
-        LeftTile aTile;
-        RightTile bTile;
-        AccTile accTile;
-        TASSIGN(aTile, 0x0);
-        TASSIGN(bTile, 0x0);
-        TASSIGN(accTile, 0x0);
-
-        typename MatPipe::Consumer cons;
-
-        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-        for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) {
-            GlobalA globalA(srcA + k_tile * TILE_K);
-
-            wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-            TLOAD(aMatTile, globalA);
-
-            TPOP<MatPipe, PopTile, TileSplitAxis::TILE_UP_DOWN>(mPipe, bMatTile);
-
-            set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-            wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-            wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-            TMOV(aTile, aMatTile);
-            TMOV(bTile, bMatTile);
-
-            set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-            wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-            set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-
-            if (k_tile == 0) {
-                TMATMUL(accTile, aTile, bTile);
-            } else {
-                TMATMUL_ACC(accTile, accTile, aTile, bTile);
-            }
-
-            set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-        }
-
-        set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-        wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-        GlobalOut globalOut(out);
-        TSTORE<AccTile, GlobalOut>(globalOut, accTile);
-
-        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
-        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
-
-        pipe_barrier(PIPE_ALL);
-    }
-}
-
-template <int32_t tilingKey>
-void LaunchTPushPopVCMatmul(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                            uint8_t *offset, uint8_t *fifoMem, void *stream)
-{
-    if constexpr (tilingKey == 1) {
-        runTPushPopVCMatmul<int8_t, float, float, 16, 64, 32, 64><<<1, nullptr, stream>>>(
-            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
-            reinterpret_cast<int8_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
-            reinterpret_cast<float *>(fifoMem));
-    } else if constexpr (tilingKey == 2) {
-        runTPushPopVCMatmul<int8_t, float, float, 16, 128, 32, 64><<<1, nullptr, stream>>>(
-            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
-            reinterpret_cast<int8_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
-            reinterpret_cast<float *>(fifoMem));
-    } else if constexpr (tilingKey == 3) {
-        runTPushPopVCMatmul<int8_t, float, float, 16, 256, 32, 64><<<1, nullptr, stream>>>(
-            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
-            reinterpret_cast<int8_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
-            reinterpret_cast<float *>(fifoMem));
-    } else if constexpr (tilingKey == 4) {
-        runTPushPopVCMatmul<int16_t, float, float, 16, 64, 32, 64><<<1, nullptr, stream>>>(
-            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
-            reinterpret_cast<int16_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
-            reinterpret_cast<float *>(fifoMem));
-    } else if constexpr (tilingKey == 5) {
-        runTPushPopVCMatmul<int16_t, float, float, 16, 128, 32, 64><<<1, nullptr, stream>>>(
-            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
-            reinterpret_cast<int16_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
-            reinterpret_cast<float *>(fifoMem));
-    } else if constexpr (tilingKey == 6) {
-        runTPushPopVCMatmul<int16_t, float, float, 16, 256, 32, 64><<<1, nullptr, stream>>>(
-            reinterpret_cast<uint64_t *>(ffts), reinterpret_cast<float *>(out), reinterpret_cast<float *>(srcA),
-            reinterpret_cast<int16_t *>(quantB), reinterpret_cast<float *>(scale), reinterpret_cast<float *>(offset),
-            reinterpret_cast<float *>(fifoMem));
-    }
-}
-
-template void LaunchTPushPopVCMatmul<1>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
-template void LaunchTPushPopVCMatmul<2>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
-template void LaunchTPushPopVCMatmul<3>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
-template void LaunchTPushPopVCMatmul<4>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
-template void LaunchTPushPopVCMatmul<5>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
-template void LaunchTPushPopVCMatmul<6>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale,
-                                        uint8_t *offset, uint8_t *fifoMem, void *stream);
\ No newline at end of file

From 57e30c0d313a031bf752ae28c5e667abd81dbdf5 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:22:08 +0000
Subject: [PATCH 20/38] test: cover single-function and multi-function builders

---
 tests/frontend/test_multifunc_ir.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py
index 90fb29f7..c6533ef0 100644
--- a/tests/frontend/test_multifunc_ir.py
+++ b/tests/frontend/test_multifunc_ir.py
@@ -7,6 +7,11 @@ def meta_data():
     return {"ptr_ty": ptr_ty}
 
 
+@to_ir_module(meta_data=meta_data)
+def single_kernel(arg0: "ptr_ty") -> None:
+    pass
+
+
 @to_ir_module(meta_data=meta_data, module=True)
 def build_module():
     @pto.func(kernel="vector")
@@ -18,10 +23,17 @@ def entry(arg0: "ptr_ty") -> None:
         pto.call(worker, arg0)
 
 
-def test_multifunc_builder_shapes_module():
+def test_old_single_function_builder():
+    text = str(single_kernel)
+    assert "func.func @single_kernel" in text
+    assert text.count("func.func @") == 1
+    assert "func.call" not in text
+
+
+def test_new_multi_function_builder():
     text = str(build_module)
     assert "func.func @worker" in text
     assert "pto.kernel_kind = #pto.kernel_kind<vector>" in text
     assert "func.func @entry" in text
     assert "attributes {pto.entry}" in text
-    assert "func.call @worker" in text
+    assert "call @worker" in text

From 7182e8aba82b3b86bab150a680b0208d12c9b175 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:22:54 +0000
Subject: [PATCH 21/38] docs: remove TPUSH/TPOP design doc (pto_docs.md)

---
 .../aot/tpushpop/mix-kernel_mlir/pto_docs.md  | 822 ------------------
 1 file changed, 822 deletions(-)
 delete mode 100644 examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md b/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md
deleted file mode 100644
index 394b2798..00000000
--- a/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md
+++ /dev/null
@@ -1,822 +0,0 @@
-# TPUSH/TPOP 前端接口与 PTOAS 实现设计
-
-## 1. 文档范围
-
-本文定义PTOAS TPUSH/TPOP 前端IR接口，以及其在 PTOAS 内部的 lowering、地址传播、flag 分配和 EmitC 映射规则。
-
-本文覆盖两层接口：
-
-- 前端接口
-  - `pto.aic_initialize_pipe`
-  - `pto.aiv_initialize_pipe`
-  - `pto.tpush_to_aiv`
-  - `pto.tpush_to_aic`
-  - `pto.tpop_from_aic`
-  - `pto.tpop_from_aiv`
-  - `pto.tfree_from_aic`
-  - `pto.tfree_from_aiv`
-  - `pto.reserve_buffer`
-  - `pto.import_reserved_buffer`
-- PTOAS 内部统一接口
-  - `pto.initialize_l2g2l_pipe`
-  - `pto.initialize_l2l_pipe`
-  - `pto.tpush`
-  - `pto.declare_tile`
-  - `pto.tpop`
-  - `pto.tfree`
-
-本文只描述接口契约与编译流程，不展开具体 C++ 模板实现细节。
-
-## 2. 设计目标
-
-本设计的目标如下：
-
-- 对前端提供\*\_initialize_pipe/tpush_to_\*/tpop_from_\*/tfree_from_\*IR接口。
-- 在 PTOAS 内部统一为 pipe/tpush/tpop/tfree 指令，便于复用已有 pass。
-- 支持 A2/A3 与 A5 两个平台使用同一套前端接口。
-- 定义consumer slot buffer的分配地址与producer之间的匹配关系，并传播。
-
-## 3. 前端 IR 接口定义
-
-### 3.1 `pto.aic_initialize_pipe`
-
-#### 语义
-
-由 Cube kernel 在函数启动时调用，初始化该函数涉及的通信 pipe。
-
-#### 语法
-
-```mlir
-pto.aic_initialize_pipe(
-    DIR_MASK,
-    SLOT_SIZE,
-    GM_SLOT_BUFFER,
-    C2V_CONSUMER_BUF,
-    V2C_CONSUMER_BUF)
-```
-
-#### 参数
-
-| 参数 | 类型 | 说明 |
-|---|---|---|
-| `DIR_MASK` | 编译期整数常量 | `1`、`2` 或 `3` |
-| `SLOT_SIZE` | 编译期整数常量 | 单 slot 字节数，定义为切分前完整 tile 字节数 |
-| `GM_SLOT_BUFFER` | GM 地址或空值 | A2/A3 路径使用，A5 路径为空 |
-| `C2V_CONSUMER_BUF` | `i32` | C2V 方向 consumer 的 local slot buffer 基址 |
-| `V2C_CONSUMER_BUF` | `i32` | V2C 方向 consumer 的 local slot buffer 基址 |
-
-### 3.2 `pto.aiv_initialize_pipe`
-
-#### 语义
-
-由 Vector kernel 在函数启动时调用，初始化该函数涉及的通信 pipe。
-
-#### 语法
-
-```mlir
-pto.aiv_initialize_pipe(
-    DIR_MASK,
-    SLOT_SIZE,
-    GM_SLOT_BUFFER,
-    C2V_CONSUMER_BUF,
-    V2C_CONSUMER_BUF)
-```
-
-参数语义与 `pto.aic_initialize_pipe` 相同。
-
-### 3.3 前端数据传输接口
-
-#### `pto.tpush_to_aiv`
-
-```mlir
-pto.tpush_to_aiv(%tile) { split = 0 }
-```
-
-- 仅出现在 Cube kernel 中
-- 表示 C2V 方向 producer push
-
-#### `pto.tpush_to_aic`
-
-```mlir
-pto.tpush_to_aic(%tile) { split = 0 }
-```
-
-- 仅出现在 Vector kernel 中
-- 表示 V2C 方向 producer push
-
-#### `pto.tpop_from_aic`
-
-```mlir
-%tile = pto.tpop_from_aic { split = 0 } -> !pto.tile_buf<...>
-```
-
-- 仅出现在 Vector kernel 中
-- 表示 C2V 方向 consumer pop
-
-#### `pto.tpop_from_aiv`
-
-```mlir
-%tile = pto.tpop_from_aiv { split = 0 } -> !pto.tile_buf<...>
-```
-
-- 仅出现在 Cube kernel 中
-- 表示 V2C 方向 consumer pop
-
-#### `pto.tfree_from_aic`
-
-```mlir
-pto.tfree_from_aic { split = 0 }
-```
-
-- 仅出现在 Vector kernel 中
-- 表示 C2V 方向 consumer free
-
-#### `pto.tfree_from_aiv`
-
-```mlir
-pto.tfree_from_aiv { split = 0 }
-```
-
-- 仅出现在 Cube kernel 中
-- 表示 V2C 方向 consumer free
-
-以上前端数据传输接口中的 `split` 均为编译期常量属性，不是运行时 SSA operand。
-
-- 取值使用 `TileSplitAxis` 枚举语义：`0/1/2` 分别对应 `TILE_NO_SPLIT`、`TILE_UP_DOWN`、`TILE_LEFT_RIGHT`
-- lowering 到 PTOAS 内部 IR 时，`split` 继续以属性形式保留
-
-### 3.4 地址提示接口
-
-#### `pto.reserve_buffer`
-
-用于在当前函数内声明一块 consumer slot buffer 预留空间。其合法写法由
-当前编译流程是否启用 local address planning 决定。
-
-```mlir
-%buf = pto.reserve_buffer {
-    name = "c2v_slot_buffer",
-    size = 2048,
-    location = #pto.address_space<vec>,
-    auto = true
-} -> i32
-```
-
-或使用显式地址：
-
-```mlir
-%buf = pto.reserve_buffer {
-    name = "c2v_slot_buffer",
-    size = 2048,
-    location = #pto.address_space<vec>,
-    auto = false,
-    base = 4096
-} -> i32
-```
-
-#### 参数
-
-| 参数 | 类型 | 说明 |
-|---|---|---|
-| `name` | 字符串属性 | 本函数内唯一的预留段名字 |
-| `size` | 整数属性 | 预留字节数 |
-| `location` | 地址空间属性 | 预留空间所在 local 地址空间 |
-| `auto` | `bool` 属性 | 地址解析路径标志；`true` 表示地址由 PTOAS 地址规划路径分配，`false` 表示地址已在输入 IR 中显式给定 |
-| `base` | 可选整数属性 | 显式起始地址；仅 manual 路径使用 |
-
-#### 结果
-
-- 结果类型为 `i32`
-- 结果值表示该 buffer 当前可用的基址
-- 当前可用基址可来自显式 `base`，也可来自 plan memory 回填后的解析地址
-- 在当前约束下，每个函数最多一条 `reserve_buffer`
-- 编译路径与 `auto` 的合法组合只有两种：
-  - 启用 local address planning：`auto = true`，且不带 `base`
-  - 跳过 local address planning：`auto = false`，且显式提供 `base`
-
-#### `pto.import_reserved_buffer`
-
-用于引用 peer function 中已经定义的 `reserve_buffer` 结果。
-
-```mlir
-%buf = pto.import_reserved_buffer {
-    name = "c2v_slot_buffer",
-    peer_func = @vector_kernel
-} -> i32
-```
-
-#### 参数
-
-| 参数 | 类型 | 说明 |
-|---|---|---|
-| `name` | 字符串属性 | peer 侧 `reserve_buffer` 的名字 |
-| `peer_func` | symbol ref | peer 函数符号 |
-
-#### 结果
-
-- 结果类型为 `i32`
-- 结果值表示从 peer `reserve_buffer` 导入的已解析基址
-
-### 3.5 前端层约束
-
-前端 IR 需满足以下约束：
-
-- 每个 Cube function 最多一条 `pto.aic_initialize_pipe`
-- 每个 Vector function 最多一条 `pto.aiv_initialize_pipe`
-- 每个函数内最多一条 C2V 逻辑 pipe 和一条 V2C 逻辑 pipe
-- 每个函数最多一条 `reserve_buffer`
-- 每个函数最多一条 `import_reserved_buffer`
-- `DIR_MASK` 只允许 `1`、`2`、`3`
-- `SLOT_SIZE > 0`
-- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM`
-- C2V consumer 的 `reserve_buffer.location` 必须是 `VEC`
-- V2C consumer 的 `reserve_buffer.location` 必须是 `MAT`
-- `reserve_buffer.name` 在本函数内必须唯一
-- op 级约束：`reserve_buffer.auto = false` 时必须提供 `base`
-- op 级约束：`reserve_buffer.auto = true` 时必须不提供 `base`
-- 启用 local address planning 的编译流程：`reserve_buffer` 只允许 `auto = true`
-- 跳过 local address planning 的编译流程：`reserve_buffer` 只允许 `auto = false` 且显式提供 `base`
-- `import_reserved_buffer` 必须能在 `peer_func` 中找到同名 `reserve_buffer`
-
-## 4. 核心约定
-
-### 4.1 逻辑 pipe
-
-本文中的“逻辑 pipe”指一条单向通信通道。
-
-- C2V：Cube producer -> Vector consumer
-- V2C：Vector producer -> Cube consumer
-
-`DIR_MASK=3` 表示前端一个同时包含 C2V 和 V2C 的初始化请求，在 PTOAS lowering 后拆成两条单向逻辑 pipe：
-
-- 一条 `dir_mask = 1` 的 C2V pipe
-- 一条 `dir_mask = 2` 的 V2C pipe
-
-### 4.2 `split` 的角色
-
-`split` 使用 `TileSplitAxis` 枚举表达：
-
-- `TILE_NO_SPLIT`
-- `TILE_UP_DOWN`
-- `TILE_LEFT_RIGHT`
-
-在 PTOAS 设计中，`split` 的角色定义为：
-
-- `split` 是 `tpush/tpop/tfree` 的逐指令执行模式
-- `split` 在 IR 中表示为编译期常量属性，不是运行时 SSA operand
-- `split` 不参与pipe 初始化
-- `split` 不参与 plan memory、地址传播、flag 分配
-- PTOAS 将 `split` 作为透明的编译期参数向 EmitC 和底层 pto-isa 透传
-
-因此：
-
-- 同一条逻辑 pipe 上可以出现不同 `split` 的 `tpush/tpop/tfree`
-- PTOAS 不要求同一逻辑 pipe 内所有指令使用同一个 `split`
-- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证；PTOAS 仅校验 `split` 枚举合法并向下透传
-
-### 4.3 `SLOT_SIZE` 的定义
-
-`SLOT_SIZE` 的定义固定为：
-
-- 切分前完整 tile 的字节数
-
-即使 `split` 为 `TILE_UP_DOWN` 或 `TILE_LEFT_RIGHT`，`SLOT_SIZE` 仍然表示未切分前的逻辑 tile 总字节数。
-
-`split` 只影响底层 `TPUSH/TPOP/TFREE` 的执行方式，不影响 `SLOT_SIZE` 的含义。
-
-### 4.4 `SLOT_NUM` 规则
-
-`SLOT_NUM` 由 `DIR_MASK` 固定决定：
-
-- `DIR_MASK = 1` 或 `2`：`SLOT_NUM = 8`
-- `DIR_MASK = 3`：拆成两条单向 pipe，且每条 `SLOT_NUM = 4`
-
-`SLOT_NUM` 不由 `split` 决定。
-
-## 5. PTOAS 内部 IR 接口定义
-
-### 5.1 `!pto.pipe`
-
-本文设计的内部 `!pto.pipe` 为不透明 handle。
-
-`!pto.pipe` 的协议信息由其定义 op 上的属性承载，而不是由 type 参数承载。
-
-底层 `pto-isa` 若对 `TPUSH/TPOP` 的模板形态继续演进，不反向约束 `!pto.pipe` 的 type 设计；内部 `!pto.pipe` 仍保持 opaque handle。
-
-### 5.2 `pto.initialize_l2g2l_pipe`
-
-用于 A2/A3 路径。
-
-```mlir
-%pipe = pto.initialize_l2g2l_pipe {
-    dir_mask = 1,
-    slot_size = 512,
-    slot_num = 8,
-    local_slot_num = 8
-}(%gm_addr, %local_addr) -> !pto.pipe
-```
-
-#### 必需属性
-
-- `dir_mask`
-- `slot_size`
-- `slot_num`
-
-#### 可选属性
-
-- `local_slot_num`
-  - 仅 `initialize_l2g2l_pipe` 承载
-  - 表示 GM 路径下 consumer 侧 local slot buffer 的槽数
-  - 仅在通过 GM 传递时对底层 `TPipe` 模板参数有意义，不改变 GM FIFO 的 `slot_num`
-  - 缺省值等于该内部单向 pipe 的 `slot_num`
-  - 因此当前固定规则下：
-    - `DIR_MASK=1/2` 直接 lowering 时，`local_slot_num = 8`
-    - `DIR_MASK=3` 拆成两条单向 pipe 后，每条 `local_slot_num = 4`
-- `flag_base`
-  - 由 PTOAS flag 分配阶段填写
-  - frontend lowering 阶段可以缺省
-  - EmitC 前必须已经解析为显式常量
-
-#### 操作数
-
-- `gm_addr`
-- `local_addr`
-
-### 5.3 `pto.initialize_l2l_pipe`
-
-用于 A5 路径。
-
-```mlir
-%pipe = pto.initialize_l2l_pipe {
-    dir_mask = 1,
-    slot_size = 512,
-    slot_num = 8
-}(%local_addr) -> !pto.pipe
-```
-
-#### 必需属性
-
-- `dir_mask`
-- `slot_size`
-- `slot_num`
-
-#### 可选属性
-
-- `flag_base`
-  - 由 PTOAS flag 分配阶段填写
-  - frontend lowering 阶段可以缺省
-  - EmitC 前必须已经解析为显式常量
-
-#### 操作数
-
-- `local_addr`
-
-### 5.4 `pto.tpush`
-
-```mlir
-pto.tpush(%tile, %pipe) { split = 0 }
-```
-
-### 5.5 `pto.declare_tile`
-
-```mlir
-%tile = pto.declare_tile -> !pto.tile_buf<...>
-```
-
-### 5.6 `pto.tpop`
-
-```mlir
-pto.tpop(%tile, %pipe) { split = 0 }
-```
-
-### 5.7 `pto.tfree`
-
-```mlir
-pto.tfree(%pipe) { split = 0 }
-```
-
-`split` 在内部 IR 中必须以编译期常量属性形式保留，不能在 lowering 时擦除或降为运行时 operand。
-
-## 6. 前端到内部 IR 的 lowering 规则
-
-### 6.1 初始化接口 lowering
-
-#### A2/A3
-
-- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2g2l_pipe`
-- 若前端未提供更具体信息，lowering 默认补上 `local_slot_num = slot_num`
-
-#### A5
-
-- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2l_pipe`
-
-### 6.2 `DIR_MASK=1/2`
-
-- 只生成一条内部 pipe
-- `slot_num = 8`
-- 对 `initialize_l2g2l_pipe`，`local_slot_num = 8`
-
-### 6.3 `DIR_MASK=3`
-
-前端一个 init op 固定拆成两条内部 pipe：
-
-- `%pipe_c2v`：`dir_mask = 1`，`slot_num = 4`
-- `%pipe_v2c`：`dir_mask = 2`，`slot_num = 4`
-
-若 lowering 为 `initialize_l2g2l_pipe`，则两条内部 pipe 还满足：
-
-- `%pipe_c2v`：`local_slot_num = 4`
-- `%pipe_v2c`：`local_slot_num = 4`
-
-地址选择规则：
-
-- `%pipe_c2v` 使用 `C2V_CONSUMER_BUF`
-- `%pipe_v2c` 使用 `V2C_CONSUMER_BUF`
-
-### 6.4 前端数据传输 op 与内部 pipe 的绑定
-
-绑定规则固定如下：
-
-| 前端 op | 所在函数 | 方向 | 使用的内部 pipe |
-|---|---|---|---|
-| `tpush_to_aiv` | Cube | C2V | `dir_mask = 1` |
-| `tpop_from_aic` | Vector | C2V | `dir_mask = 1` |
-| `tfree_from_aic` | Vector | C2V | `dir_mask = 1` |
-| `tpush_to_aic` | Vector | V2C | `dir_mask = 2` |
-| `tpop_from_aiv` | Cube | V2C | `dir_mask = 2` |
-| `tfree_from_aiv` | Cube | V2C | `dir_mask = 2` |
-
-### 6.5 数据传输 op lowering
-
-#### `tpush_to_aiv` / `tpush_to_aic`
-
-lower 为：
-
-```mlir
-pto.tpush(%tile, %pipe) { split = 0 }
-```
-
-#### `tpop_from_aic` / `tpop_from_aiv`
-
-lower 为：
-
-```mlir
-%decl = pto.declare_tile -> !pto.tile_buf<...>
-pto.tpop(%decl, %pipe) { split = 0 }
-```
-
-即：
-
-- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv` 是返回 tile 结果值的接口
-- PTOAS 内部 `pto.tpop` 才是 destination-style 形式，显式接收一个 `pto.declare_tile` 结果作为入参
-
-#### `tfree_from_aic` / `tfree_from_aiv`
-
-lower 为：
-
-```mlir
-pto.tfree(%pipe) { split = 0 }
-```
-
-## 7. `reserve_buffer` 与地址传播
-
-### 7.1 设计原则
-
-- `reserve_buffer` 只表示本函数 consumer slot buffer 的本地预留
-- `import_reserved_buffer` 只表示对 peer 预留段地址的引用
-- `reserve_buffer` 用属性描述“如何得到地址”，用结果值统一承载“当前可用地址”
-- 当前编译流程是否启用 local address planning 与 `reserve_buffer.auto` 共同决定地址处理路径
-- 启用 local address planning：`reserve_buffer` 必须使用 `auto = true`，由 `PlanMemory` 分配地址
-- 跳过 local address planning：`reserve_buffer` 必须使用 `auto = false` 且显式提供 `base`，不再进入 `PlanMemory` 分配路径
-- PTOAS 复用现有 `PlanMemory` pass 实现 `reserve_buffer` 地址确定，不额外增加独立的预分配 pass
-- PTOAS 新增独立地址传播 pass，专门处理 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐
-- 地址传播 pass 在 EmitC 之前运行；启用规划时位于 plan memory 之后，跳过规划时直接消费前端已给定地址
-
-### 7.2 使用规则
-
-#### C2V
-
-- consumer 是 Vector
-- Vector function 需要 `reserve_buffer(location = VEC)`
-- Cube function 需要 `import_reserved_buffer(peer_func = @vector_kernel)`
-
-#### V2C
-
-- consumer 是 Cube
-- Cube function 需要 `reserve_buffer(location = MAT)`
-- Vector function 需要 `import_reserved_buffer(peer_func = @cube_kernel)`
-
-### 7.3 编译路径与地址处理路径
-
-对包含 `reserve_buffer` 的函数，PTOAS 按当前编译流程是否启用 local address planning 以及 `auto` 的组合选择地址处理路径：
-
-- 启用 local address planning + `auto = true`
-  - 进入 auto 路径
-  - 由 `PlanMemory` 为 `reserve_buffer` 分配 `base`
-  - 随后由 `pto-resolve-reserved-buffers` 传播地址并完成 peer `flag_base` 对齐
-- 跳过 local address planning + `auto = false` + 显式 `base`
-  - 进入 manual 路径
-  - 跳过 `PlanMemory`
-  - 由 `pto-resolve-reserved-buffers` 直接传播已给定地址并完成 peer `flag_base` 对齐
-
-以下组合均非法：
-
-- 启用 local address planning + `auto = false`
-- 跳过 local address planning + `auto = true`
-
-若函数内不存在 `reserve_buffer`，则保持现有编译流程对 `PlanMemory` 的原始控制行为，不引入额外语义。
-
-### 7.4 启用 local address planning 的 auto 路径
-
-在启用 local address planning 的编译流程中，`reserve_buffer` 必须使用 `auto = true`，并由 plan memory 负责地址分配。
-
-若函数中存在 `reserve_buffer`，则对其 `location` 对应的地址空间执行：
-
-1. 先按现有逻辑完成普通 local buffer 的 `MemPlan`
-2. 再收集该地址空间内已经分配完成的 local 区间
-3. 在剩余空洞中按地址空间对齐要求寻找一段可容纳 `reserve_buffer.size` 的连续区间
-4. 将该区间起始地址回填为这条唯一 `reserve_buffer` 的 `base`
-
-即：
-
-- 普通 `memref.alloc` / tile buffer 等 local 内存仍先由既有 `MemPlan` 按原逻辑分配
-- `reserve_buffer` 不参与普通 local buffer 的 inplace / reuse 规划
-- `reserve_buffer` 在普通 local buffer 分配完成后，再作为独立的一段连续 local 区间进行 hole 分配
-- `reserve_buffer` 不保证位于地址空间起始地址，也不保证形成预留前缀；其语义仅为“在该地址空间中为 consumer slot buffer 找到一段对齐且连续的可用地址”
-- 若整体容量足够但 `MemPlan` 结果将空间打散，导致不存在满足大小和对齐要求的连续空洞，则 `reserve_buffer` 分配失败并报错
-
-### 7.5 跳过 local address planning 的 manual 路径
-
-在跳过 local address planning 的编译流程中：
-
-- 每个 `reserve_buffer` 必须显式提供 `base`
-- PTOAS 只校验 `base` 的基本合法性
-- `PlanMemory` 不参与该函数的 local 地址分配
-- 因此该函数中其他 local buffer 地址也必须已由前端或更前阶段整体确定
-- 地址传播 pass 不做地址分配，只将显式 `base` 传播到 `import_reserved_buffer`
-
-该 manual 路径的目标是：
-
-- 保持前端或外部地址规划结果不被 PTOAS 改写
-- 避免 `reserve_buffer` 显式地址与 PTOAS 自动规划结果相互覆盖
-
-### 7.6 `import_reserved_buffer` 规则
-
-- 不做地址分配
-
-### 7.7 地址传播 pass 规则
-
-对每个 `import_reserved_buffer`：
-
-1. 通过 `peer_func` 找到 peer 函数
-2. 在 peer 函数内查找同名 `reserve_buffer`
-3. 读取对方已经解析出的 `base` 或其等价结果值
-4. 用该常量地址替换 `import_reserved_buffer` 的结果
-
-地址传播完成后：
-
-- producer 与 consumer 对同一逻辑 pipe 使用同一个 local buffer 地址
-- EmitC 只处理解析后的常量地址，不处理 `import_reserved_buffer`
-
-#### 7.7.1 pass 落点
-
-- PTOAS 增加独立 `ModulePass`：`pto-resolve-reserved-buffers`
-- 该 pass 固定运行在 EmitC lowering 之前
-- 启用规划时：运行在 `pto-plan-memory` 之后
-- 跳过规划时：不经过 `pto-plan-memory`，但该 pass 仍会运行
-- 该 pass 不负责地址分配，只消费前一阶段已经确定的 `reserve_buffer.base`
-
-#### 7.7.2 输入假设
-
-- 启用规划时，`reserve_buffer.auto = true`，其 `base` 已由 `PlanMemory` 回填
-- 跳过规划时，`reserve_buffer.auto = false`，其 `base` 已由前端显式给定
-- `import_reserved_buffer.peer_func` 已能解析到合法 peer function
-- `import_reserved_buffer.name` 已能在 peer function 中找到唯一匹配的 `reserve_buffer`
-
-#### 7.7.3 实现流程
-
-pass 在模块级按两步执行：
-
-1. 先建立 peer 对应关系
-2. 再将 `reserve_buffer` / `import_reserved_buffer` 物化为显式常量地址
-
-其中第一步的实现方式是：
-
-- 遍历模块内所有 `pto.initialize_l2l_pipe` / `pto.initialize_l2g2l_pipe`
-- 若其 `local_addr` 来自 `reserve_buffer`，则以“当前函数 + reserve 名字 + dir_mask”识别逻辑 pipe
-- 若其 `local_addr` 来自 `import_reserved_buffer`，则以“peer_func + reserve 名字 + dir_mask”识别逻辑 pipe
-- 将 peer 两侧引用到同一逻辑 pipe 的内部 init op 归并到同一组
-- 若某条 init 未显式提供 `flag_base`，则其 `local_addr` 必须来自 `reserve_buffer` 或 `import_reserved_buffer`
-- 对每个逻辑 pipe 分组，要求必须形成完整 peer init pair：恰好两条 init，且分别来自 peer 两侧函数；若 peer 信息不完整则直接报错
-- 在同一组内，若任一侧已显式提供 `flag_base`，则该值作为该组最终值；若两侧显式值冲突则报错
-- 若同组两侧都未显式提供 `flag_base`，则按默认规则回填：
-  - 单向场景：`flag_base = 0`
-  - 双向场景：C2V 组 `flag_base = 0`，V2C 组 `flag_base = 2`
-- 所谓“双向场景”，是指同一对 peer 函数之间同时存在 `dir_mask = 1` 和 `dir_mask = 2` 两个逻辑 pipe 分组
-- 完成分组决策后，将最终 `flag_base` 回填到该组内所有尚未显式填写的 init op，保证 peer 两侧一致
-
-第二步的实现方式是：
-
-- 对每个 `reserve_buffer`，读取其已解析 `base`
-- 在该 op 位置插入 `arith.constant`
-- 用该常量替换 `reserve_buffer` 结果值的全部 uses
-- 对每个 `import_reserved_buffer`，通过 `peer_func + name` 找到 peer `reserve_buffer`
-- 读取对方已解析 `base`
-- 在当前 op 位置插入同值 `arith.constant`
-- 用该常量替换 `import_reserved_buffer` 结果值的全部 uses
-- 常量替换完成后，删除 `reserve_buffer` / `import_reserved_buffer`
-
-#### 7.7.4 结果 IR 形态
-
-地址传播 pass 之后：
-
-- IR 中不再保留 `reserve_buffer` / `import_reserved_buffer`
-- 内部 pipe init op 的 `local_addr` 只再引用普通 SSA 常量地址
-- 因而后续 EmitC 无需理解 frontend 预留地址语义，只需透传解析后的地址值
-
-#### 7.7.5 失败条件
-
-若出现以下情况，pass 直接报错：
-
-- `reserve_buffer.base` 在 pass 运行时仍未解析
-- 启用规划的编译流程却出现 `reserve_buffer.auto = false`
-- 跳过规划的编译流程却出现 `reserve_buffer.auto = true`
-- `peer_func` 无法解析到函数
-- 在 peer function 中找不到同名 `reserve_buffer`
-- 某条未显式提供 `flag_base` 的内部 init，其 `local_addr` 不来自 `reserve_buffer` / `import_reserved_buffer`
-- 基于 `reserve_buffer` / `import_reserved_buffer` 建立的某个逻辑 pipe 分组，未形成完整 peer init pair
-- peer `flag_base` 已显式给定但两侧取值冲突
-
-## 8. flag 分配规则
-
-### 8.1 总原则
-
-- `flag_base` 由 PTOAS flag 分配阶段在内部 init op 上填写
-- 在 flag 分配完成前，内部 init op 可以暂时不携带 `flag_base`
-- peer 两侧同一逻辑 pipe 必须使用同一个 `flag_base`
-
-### 8.2 单向场景
-
-当前规划中，当 `DIR_MASK = 1` 或 `2` 且函数内仅有该唯一逻辑 pipe 时，可采用：
-
-- 该方向唯一逻辑 pipe 的 `flag_base = 0`
-- 该 pipe 占用逻辑 flag 对：`0` 和 `1`
-
-### 8.3 双向场景
-
-当前规划中，当 `DIR_MASK = 3` 时，可采用：
-
-- C2V pipe：`flag_base = 0`
-- V2C pipe：`flag_base = 2`
-
-因此双向固定占用两组逻辑 flag：
-
-- C2V：`0` / `1`
-- V2C：`2` / `3`
-
-### 8.4 与地址传播的关系
-
-地址传播 pass 在识别出 `import_reserved_buffer` 与 `reserve_buffer` 的 peer 对应关系后，同时可以完成 peer pipe 的 `flag_base` 对齐。
-
-即：
-
-- 基于同一 FIFO 通信的两条 peer init op，必须拿到相同的 `flag_base`
-
-## 9. verifier 规则
-
-### 9.1 前端 verifier
-
-前端 verifier 负责检查：
-
-- 每个函数 init op 数量是否合法
-- 每个函数 `reserve_buffer` / `import_reserved_buffer` 数量是否合法
-- `DIR_MASK` 取值是否合法
-- `SLOT_SIZE > 0`
-- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM`
-- `reserve_buffer.location` 与 consumer 函数类型匹配
-- `reserve_buffer.name` 在函数内唯一
-- `reserve_buffer.auto = false` 时必须带 `base`
-- `reserve_buffer.auto = true` 时必须不带 `base`
-- driver / pipeline 级约束：启用规划的编译流程只接受 `auto = true`
-- driver / pipeline 级约束：跳过规划的编译流程只接受 `auto = false` 且显式 `base`
-- `import_reserved_buffer` 能在 `peer_func` 中找到同名 `reserve_buffer`
-- 方向相关 op 只能出现在合法 kernel 中
-- 前端数据传输 op 的 `split` 必须是合法的编译期常量属性
-
-### 9.2 内部 IR verifier
-
-内部 verifier 负责检查：
-
-- `slot_size > 0`
-- `slot_num` 只允许 `8` 或 `4`
-- `DIR_MASK=1/2` 时，`slot_num` 必须与单向/双向 lowering 规则一致
-- `local_slot_num` 若出现，只允许出现在 `pto.initialize_l2g2l_pipe` 上，且必须大于 `0` 且不大于 `slot_num`
-- `flag_base` 若出现，必须满足基本合法性；是否已填写以及具体分配值由 flag 分配保证
-- `pto.initialize_l2g2l_pipe` 必须提供 `gm_addr` 和 `local_addr`
-- `pto.initialize_l2l_pipe` 必须提供 `local_addr`
-- `dir_mask = 1` 的 pipe 只能被 C2V 方向 lowering 使用
-- `dir_mask = 2` 的 pipe 只能被 V2C 方向 lowering 使用
-- `tpush/tpop/tfree` 的 `split` 必须是合法的编译期常量属性
-
-### 9.3 关于 `split` 的校验边界
-
-PTOAS 对 `split` 的处理边界如下：
-
-- PTOAS 验证 `split` 是合法枚举值
-- PTOAS 要求 `split` 以编译期常量属性形式出现
-- PTOAS 不验证同一逻辑 pipe 上多个 `tpush/tpop/tfree` 的 `split` 是否一致
-- PTOAS 不根据 `split` 改变地址分配、flag 分配或 pipe 配对
-
-因此：
-
-- `split` 混用是否语义正确，不是 PTOAS 静态保证项
-- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证
-- PTOAS 只负责校验 `split` 枚举值合法，并将其透传到底层
-
-## 10. EmitC 与 pto-isa 映射
-
-### 10.1 初始化 op
-
-在进入 EmitC 前：
-
-- 前端 `pto.aic_initialize_pipe` / `pto.aiv_initialize_pipe`
-- 前端 `pto.tpush_to_aiv` / `pto.tpush_to_aic`
-- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv`
-- 前端 `pto.tfree_from_aic` / `pto.tfree_from_aiv`
-- `pto.reserve_buffer` / `pto.import_reserved_buffer`
-
-都必须已经被前序 pass 消除。
-
-EmitC 只处理 PTOAS 内部统一 IR，不直接理解前端 pipe 接口或地址提示接口。
-
-EmitC 将以下内部 init op 映射到底层 `TPipe`：
-
-- `pto.initialize_l2l_pipe`
-- `pto.initialize_l2g2l_pipe`
-
-映射时需要使用以下信息：
-
-- `dir_mask`
-- `slot_size`
-- `slot_num`
-- `local_slot_num`
-- `flag_base`
-- `gm_addr`
-- `local_addr`
-
-其中：
-
-- 若 `flag_base` 尚未在 EmitC 前完成填写，PTOAS 应报错。
-
-### 10.2 数据传输 op
-
-EmitC 将以下内部数据传输 op 映射到底层：
-
-- `pto.tpush` -> `TPUSH`
-- `pto.tpop` -> `TPOP`
-- `pto.tfree` -> `TFREE`
-
-映射时需要使用以下信息：
-
-- `tile`
-- `split`
-- `pipe`
-
-其中：
-
-- `split` 不在 PTOAS 内部解释
-- `split` 作为底层 `TPUSH/TPOP/TFREE` 的编译期模板实参透传
-
-### 10.3 InsertSync
-
-`split` 不影响 PTOAS 中的 pipeline derivation 与 InsertSync 规则。
-
-InsertSync 只依赖：
-
-- op 种类
-- init op 形态
-- `dir_mask`
-- 目标架构
-
-而不依赖 `split`。
-
-## 11. 编译流程总览
-
-完整流程如下：
-
-```text
-前端 IR 接口
-  -> lowering pass
-  -> PTOAS 内部统一 IR
-  -> plan memory
-  -> 地址传播 pass
-  -> EmitC
-  -> pto-isa C++ 代码
-```
-
-其中：
-
-- lowering pass 负责拆分 `DIR_MASK=3`、绑定方向与 pipe
-- 启用规划的编译流程中，plan memory 先按既有逻辑规划普通 local buffer，再为 `reserve_buffer` 在目标地址空间中分配 hole
-- 跳过规划的编译流程中，不运行 plan memory；`reserve_buffer.base` 必须已由前端给定
-- 地址传播 pass 负责 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐
-- EmitC 只负责将内部 `initialize_l2l_pipe` / `initialize_l2g2l_pipe` / `tpush` / `tpop` / `tfree` 及其属性透传到底层
\ No newline at end of file

From 48b1eb343d2b10080de9efe8381ce43228ca4698 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 11:24:44 +0000
Subject: [PATCH 22/38] fix: arith import in builder

---
 examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
index 9535b318..0c0cbdf3 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
@@ -1,5 +1,3 @@
-from mlir.dialects import arith
-
 from ptodsl import pto, tile, to_ir_module
 from ptodsl import scalar as s
 

From 0c865a187265eb0710c32eb825607371a9fba36b Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Tue, 7 Apr 2026 14:22:05 +0000
Subject: [PATCH 23/38] test: compare to MLIR pybindings

---
 tests/frontend/test_multifunc_ir.py | 81 ++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 13 deletions(-)

diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py
index c6533ef0..1e0a1668 100644
--- a/tests/frontend/test_multifunc_ir.py
+++ b/tests/frontend/test_multifunc_ir.py
@@ -1,3 +1,15 @@
+from mlir.dialects import func, pto as _pto
+from mlir.ir import (
+    Attribute,
+    Context,
+    FlatSymbolRefAttr,
+    InsertionPoint,
+    Location,
+    Module,
+    Operation,
+    UnitAttr,
+)
+
 from ptodsl import pto, to_ir_module
 
 
@@ -13,7 +25,7 @@ def single_kernel(arg0: "ptr_ty") -> None:
 
 
 @to_ir_module(meta_data=meta_data, module=True)
-def build_module():
+def multi_kernel_module():
     @pto.func(kernel="vector")
     def worker(arg0: "ptr_ty") -> None:
         pass
@@ -23,17 +35,60 @@ def entry(arg0: "ptr_ty") -> None:
         pto.call(worker, arg0)
 
 
-def test_old_single_function_builder():
-    text = str(single_kernel)
-    assert "func.func @single_kernel" in text
-    assert text.count("func.func @") == 1
-    assert "func.call" not in text
+def build_single_verbose():
+    with Context() as ctx, Location.unknown():
+        _pto.register_dialect(ctx, load=True)
+        module = Module.create()
+        ptr_ty = _pto.PtrType.get(pto.float32)
+        fn_ty = func.FunctionType.get([ptr_ty], [])
+
+        with InsertionPoint(module.body):
+            fn = func.FuncOp("single_kernel", fn_ty)
+            entry = fn.add_entry_block()
+
+        with InsertionPoint(entry):
+            func.ReturnOp([])
+
+        module.operation.verify()
+        return module
+
+
+def build_multi_verbose():
+    with Context() as ctx, Location.unknown():
+        _pto.register_dialect(ctx, load=True)
+        module = Module.create()
+        ptr_ty = _pto.PtrType.get(pto.float32)
+        fn_ty = func.FunctionType.get([ptr_ty], [])
+
+        with InsertionPoint(module.body):
+            worker = func.FuncOp("worker", fn_ty)
+            entry = func.FuncOp("entry", fn_ty)
+
+        worker.operation.attributes["pto.kernel_kind"] = Attribute.parse(
+            "#pto.kernel_kind<vector>"
+        )
+        entry.operation.attributes["pto.entry"] = UnitAttr.get(ctx)
+
+        with InsertionPoint(worker.add_entry_block()):
+            func.ReturnOp([])
+
+        entry_block = entry.add_entry_block()
+        with InsertionPoint(entry_block):
+            arg0 = entry_block.arguments[0]
+            Operation.create(
+                "func.call",
+                operands=[arg0],
+                attributes={"callee": FlatSymbolRefAttr.get("worker")},
+            )
+            func.ReturnOp([])
+
+        module.operation.verify()
+        return module
+
+
+def test_old_single_function_builder_matches_raw_mlir():
+    assert str(single_kernel) == str(build_single_verbose())
 
 
-def test_new_multi_function_builder():
-    text = str(build_module)
-    assert "func.func @worker" in text
-    assert "pto.kernel_kind = #pto.kernel_kind<vector>" in text
-    assert "func.func @entry" in text
-    assert "attributes {pto.entry}" in text
-    assert "call @worker" in text
+def test_new_multi_function_builder_matches_raw_mlir():
+    assert str(multi_kernel_module) == str(build_multi_verbose())

From ea019ed351b218540ab25945743c0efa27fe1e2e Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Wed, 8 Apr 2026 07:40:50 +0000
Subject: [PATCH 24/38] fix: names

---
 .../{bidirectional_example.mlir => c2v.mlir}             | 0
 .../{bidirectional_builder.py => c2v_builder.py}         | 0
 examples/aot/tpushpop/mix-kernel_mlir/compile.sh         | 9 +++------
 .../{run_bidirectional_example.py => run.py}             | 0
 4 files changed, 3 insertions(+), 6 deletions(-)
 rename examples/aot/tpushpop/mix-kernel_mlir/{bidirectional_example.mlir => c2v.mlir} (100%)
 rename examples/aot/tpushpop/mix-kernel_mlir/{bidirectional_builder.py => c2v_builder.py} (100%)
 rename examples/aot/tpushpop/mix-kernel_mlir/{run_bidirectional_example.py => run.py} (100%)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/c2v.mlir
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
rename to examples/aot/tpushpop/mix-kernel_mlir/c2v.mlir
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
rename to examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index 7169a980..41183489 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -3,20 +3,17 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
-MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir"
-GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp"
+MLIR_PATH="${SCRIPT_DIR}/c2v.mlir"
+GENERATED_CPP="${ARTIFACT_DIR}/c2v.cpp"
 LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
 
 mkdir -p "${ARTIFACT_DIR}"
 rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
 MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir"
-python bidirectional_builder.py > bidir_gen.mlir
+python c2v_builder.py > c2v_gen.mlir
 ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
 
-#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"
-
-
 
 bisheng \
     -I/sources/pto-isa/include/ \
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py
rename to examples/aot/tpushpop/mix-kernel_mlir/run.py

From f9281639a755d5576dd6494a8abb353f47b454a1 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Wed, 8 Apr 2026 07:47:51 +0000
Subject: [PATCH 25/38] fix: naming

---
 examples/aot/tpushpop/mix-kernel_mlir/compile.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index 41183489..dc7ba7c4 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -10,7 +10,7 @@ LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
 mkdir -p "${ARTIFACT_DIR}"
 rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
-MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir"
+MLIR_GEN_PATH="${SCRIPT_DIR}/c2v_gen.mlir"
 python c2v_builder.py > c2v_gen.mlir
 ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
 

From 8ae863510269b9123fb47b9bb2dc15afc3cc41b9 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Wed, 8 Apr 2026 08:03:20 +0000
Subject: [PATCH 26/38] feat: deuglify the wrappers

---
 ptodsl/api/pto_general.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index 1ea2a5c8..00637871 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -106,14 +106,17 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None):
 #     auto = true
 # } -> i32
 def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
+    """
+        - At most one `pto.reserve_buffer` is expected in one function
+        - `location` must be a supported local address space
+        - Op-level verification requires:
+            - `auto = false` must provide `base`
+            - `auto = true` must not provide `base`
+    """
     # All params are compile time attributes
     # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
-    kwargs = {}
-    if base is not None:
-        kwargs["base"] = base
-    return _pto.ReserveBufferOp(
-        name, size, _resolve_address_space_attr(location), auto_alloc, **kwargs
-    ).result
+
+    return _pto.ReserveBufferOp(name, size, _resolve_address_space_attr(location), auto_alloc, base=base).result
 
 
 # %c2v_import = pto.import_reserved_buffer {
@@ -135,15 +138,12 @@ def aic_initialize_pipe(
 ):
     # wrap
     # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    kwargs = {}
-    if gm_slot_buffer is not None:
-        kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer)
     return _pto.AicInitializePipeOp(
         dir_mask,
         slot_size,
         c2v_consumer_buf=_unwrap(c2v_consumer_buf),
         v2c_consumer_buf=_unwrap(v2c_consumer_buf),
-        **kwargs,
+        gm_slot_buffer=_unwrap(gm_slot_buffer),
     )
 
 
@@ -162,15 +162,12 @@ def aiv_initialize_pipe(
 ):
     # wrap
     # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
-    kwargs = {}
-    if gm_slot_buffer is not None:
-        kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer)
     return _pto.AivInitializePipeOp(
         dir_mask,
         slot_size,
         c2v_consumer_buf=_unwrap(c2v_consumer_buf),
         v2c_consumer_buf=_unwrap(v2c_consumer_buf),
-        **kwargs,
+        gm_slot_buffer=_unwrap(gm_slot_buffer),
     )
 
 

From 62db3644d0f2b790638ef80c3962836739704e42 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Wed, 8 Apr 2026 09:45:34 +0000
Subject: [PATCH 27/38] feat: add more examples (v2c, c2v_add)

---
 .../aot/tpushpop/mix-kernel_mlir/README.md    |  45 +++++--
 .../mix-kernel_mlir/c2v_add_builder.py        | 113 ++++++++++++++++++
 .../aot/tpushpop/mix-kernel_mlir/compile.sh   |  17 ++-
 examples/aot/tpushpop/mix-kernel_mlir/run.py  |  28 ++++-
 .../tpushpop/mix-kernel_mlir/v2c_builder.py   | 102 ++++++++++++++++
 5 files changed, 285 insertions(+), 20 deletions(-)
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md
index a898a57b..cb86533a 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/README.md
+++ b/examples/aot/tpushpop/mix-kernel_mlir/README.md
@@ -1,17 +1,38 @@
-# Bidirectional `TPUSH`/`TPOP` MLIR Example
-
-This example mirrors the `mix-kernel_cpp` flow, but starts from
-[`bidirectional_example.mlir`](/home/fskogh/pto-dsl/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir).
-
-The pipeline is:
-
-1. run `ptoas --pto-arch=a3 bidirectional_example.mlir > build_artifacts/bidirectional_example.cpp`
-2. compile the generated C++ together with `caller.cpp`
-3. build `./tpushpop_mlir_lib.so`
-4. launch the generated `pto.entry` kernel from Python
+# Cross-core communication with `pto.tpush_to_aiv` example
 
 ## Run
 
 ```bash
-python run_bidirectional_example.py
+python run.py c2v
+python run.py bidi
 ```
+
+`c2v` is the default, so `python run.py` is the same as `python run.py c2v`. The runner also accepts the `c2v_add` and `v2c` modes.
+
+## How C2V Communication Works
+
+This example sends one `16x16 f32` tile from the Cube kernel to the Vector kernel.
+
+- The host allocates one shared `gm_slot_buffer` and passes it to both kernels.
+- The Vector kernel owns the C2V consumer buffer with `pto.reserve_buffer(name = "c2v_fifo")`.
+- The Cube kernel refers to that same buffer with `pto.import_reserved_buffer(name = "c2v_fifo")`.
+- Both sides call `*_initialize_pipe` with `dir_mask = 1`, which means `C2V`.
+- Cube sends with `pto.tpush_to_aiv(...)`.
+- Vector receives with `pto.tpop_from_aic(...)` and releases the consumed slot with `pto.tfree_from_aic`.
+
+In the generated C++, this becomes the same `TPipe<..., Direction::DIR_C2V, ...>` on both sides:
+
+- Cube: `TPUSH(pipe, acc_tile)`
+- Vector: `TPOP(pipe, vec_tile)` then `TFREE(pipe)`
+
+The important mental model is: `TPUSH`/`TPOP` are the real cross-core handoff, while `gm_slot_buffer` is the shared backing storage that makes the FIFO work.
+
+## How Bidirectional Works
+
+`bidi` starts the same way as `c2v`, but adds a return path:
+
+- Cube computes `x @ x` and sends it to vector over C2V.
+- Vector pops that tile, computes `tile + tile`, and pushes the doubled result back over V2C.
+- Cube pops the returned tile and writes it to GM.
+
+The important difference is that both sides initialize with `dir_mask = 3`, so the same mixed-kernel launch can use both directions of the pipe.
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
new file mode 100644
index 00000000..dde0f705
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
@@ -0,0 +1,113 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+
+def meta_data():
+    dtype = pto.float32
+    ptr_ty = pto.PtrType(dtype)
+    i32 = pto.int32
+    tensor_ty = pto.TensorType(rank=2, dtype=dtype)
+    tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
+    x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
+    x_left_ty = pto.TileBufType(
+        shape=[16, 16],
+        dtype=dtype,
+        memory_space="LEFT",
+        config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"),
+    )
+    x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
+    acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
+    vec_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
+    return locals()
+
+
+@to_ir_module(meta_data=meta_data, module=True)
+def module():
+    @pto.func(kernel="cube")
+    def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        c2v_import = pto.import_reserved_buffer(
+            name="c2v_fifo",
+            peer_func="@vector_kernel",
+        )
+
+        pto.aic_initialize_pipe(
+            dir_mask=1,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_import,
+            v2c_consumer_buf=c0_i32,
+        )
+
+        x_mat_tile = pto.alloc_tile(x_mat_ty)
+        x_left_tile = pto.alloc_tile(x_left_ty)
+        x_right_tile = pto.alloc_tile(x_right_ty)
+        acc_tile = pto.alloc_tile(acc_ty)
+
+        gm_x_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_x,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        pto.load(gm_x_tile_view, x_mat_tile)
+        tile.mov(x_mat_tile, x_left_tile)
+        tile.mov(x_mat_tile, x_right_tile)
+        tile.matmul(x_left_tile, x_right_tile, acc_tile)
+        # Debug step: only send cube's result to vector.
+        pto.tpush_to_aiv(acc_tile, 0)
+
+    @pto.func(kernel="vector")
+    def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC")
+
+        pto.aiv_initialize_pipe(
+            dir_mask=1,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_local,
+            v2c_consumer_buf=c0_i32,
+        )
+
+        gm_y_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_y,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        doubled_tile = pto.alloc_tile(vec_ty)
+        recv_tile = pto.tpop_from_aic(vec_ty, 0)
+        # First isolate the vector-side path: pop, double, store from vector.
+        tile.add(recv_tile, recv_tile, doubled_tile)
+        pto.store(doubled_tile, gm_y_tile_view)
+        pto.tfree_from_aic(0)
+
+    @pto.func(entry=True)
+    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.call(cube_kernel, gm_slot_buffer, gm_x)
+        pto.call(vector_kernel, gm_slot_buffer, gm_y)
+
+
+if __name__ == "__main__":
+    print(module)
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index dc7ba7c4..3761f044 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -3,15 +3,24 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
-MLIR_PATH="${SCRIPT_DIR}/c2v.mlir"
-GENERATED_CPP="${ARTIFACT_DIR}/c2v.cpp"
+MODE="${TPUSHPOP_MODE:-c2v}"
+BUILDER_PATH="${SCRIPT_DIR}/${MODE}_builder.py"
+MLIR_GEN_PATH="${SCRIPT_DIR}/${MODE}_gen.mlir"
+GENERATED_CPP="${ARTIFACT_DIR}/${MODE}.cpp"
 LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
 
+case "${MODE}" in
+  c2v|c2v_add|v2c|bidi) ;;
+  *)
+    echo "Unknown TPUSHPOP_MODE: ${MODE}" >&2
+    exit 2
+    ;;
+esac
+
 mkdir -p "${ARTIFACT_DIR}"
 rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
-MLIR_GEN_PATH="${SCRIPT_DIR}/c2v_gen.mlir"
-python c2v_builder.py > c2v_gen.mlir
+python "${BUILDER_PATH}" > "${MLIR_GEN_PATH}"
 ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
 
 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py
index 1b619869..a59b527c 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/run.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py
@@ -1,3 +1,4 @@
+import argparse
 import ctypes
 import os
 import subprocess
@@ -15,17 +16,20 @@
 N = 16
 ATOL = 1e-4
 RTOL = 1e-4
+MODES = ("c2v", "c2v_add", "v2c", "bidi")
 
 
 def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p:
     return ctypes.c_void_p(tensor.data_ptr())
 
 
-def compile_example(compile_script: str) -> None:
+def compile_example(compile_script: str, mode: str) -> None:
+    env = dict(os.environ, TPUSHPOP_MODE=mode)
     subprocess.run(
         ["bash", compile_script],
         check=True,
         cwd=THIS_DIR,
+        env=env,
     )
 
 
@@ -65,8 +69,24 @@ def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tenso
     torch.npu.synchronize()
 
 
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("mode", nargs="?", choices=MODES, default="c2v")
+    return parser.parse_args()
+
+
+def reference(mode: str, x: torch.Tensor) -> torch.Tensor:
+    y = x.cpu() @ x.cpu()
+    if mode == "c2v":
+        return y
+    if mode == "v2c":
+        return x.cpu()
+    return 2 * y
+
+
 def main() -> None:
-    compile_example(DEFAULT_COMPILE_SCRIPT)
+    args = parse_args()
+    compile_example(DEFAULT_COMPILE_SCRIPT, args.mode)
 
     device = get_test_device()
     torch.npu.set_device(device)
@@ -83,7 +103,7 @@ def main() -> None:
     run_kernel(lib, gm_slot_buffer=gm_slot_buffer, x=x, y=y)
     print(y)
 
-    y_ref = x.cpu() @ x.cpu()
+    y_ref = reference(args.mode, x)
     y_cpu = y.cpu()
 
     print(y_ref-y_cpu)
@@ -94,7 +114,7 @@ def main() -> None:
     if not ok:
         raise SystemExit(f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}")
 
-    print(f"Validation passed using {DEFAULT_LIB_PATH}.")
+    print(f"Validation passed for mode={args.mode} using {DEFAULT_LIB_PATH}.")
 
 
 if __name__ == "__main__":
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
new file mode 100644
index 00000000..782e5c5c
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
@@ -0,0 +1,102 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+
+def meta_data():
+    dtype = pto.float32
+    ptr_ty = pto.PtrType(dtype)
+    i32 = pto.int32
+    tensor_ty = pto.TensorType(rank=2, dtype=dtype)
+    tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
+    vec_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
+    recv_ty = pto.TileBufType(
+        shape=[16, 16],
+        dtype=dtype,
+        memory_space="MAT",
+        config=pto.TileBufConfig(
+            blayout="RowMajor",
+            slayout="NoneBox",
+            s_fractal_size=512,
+        ),
+    )
+    return locals()
+
+
+@to_ir_module(meta_data=meta_data, module=True)
+def module():
+    @pto.func(kernel="cube")
+    def cube_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        v2c_local = pto.reserve_buffer(name="v2c_fifo", size=4096, location="MAT")
+
+        pto.aic_initialize_pipe(
+            dir_mask=2,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c0_i32,
+            v2c_consumer_buf=v2c_local,
+        )
+
+        gm_y_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_y,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        pto.store(pto.tpop_from_aiv(recv_ty, 0), gm_y_tile_view)
+        pto.tfree_from_aiv(0)
+
+    @pto.func(kernel="vector")
+    def vector_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c0_i32 = const(0, type=i32)
+        v2c_import = pto.import_reserved_buffer(
+            name="v2c_fifo",
+            peer_func="@cube_kernel",
+        )
+
+        pto.aiv_initialize_pipe(
+            dir_mask=2,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c0_i32,
+            v2c_consumer_buf=v2c_import,
+        )
+
+        gm_x_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_x,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        send_tile = pto.alloc_tile(vec_ty)
+        pto.load(gm_x_tile_view, send_tile)
+        pto.tpush_to_aic(send_tile, 0)
+
+    @pto.func(entry=True)
+    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.call(cube_kernel, gm_slot_buffer, gm_y)
+        pto.call(vector_kernel, gm_slot_buffer, gm_x)
+
+
+if __name__ == "__main__":
+    print(module)

From 1d831241b93ab17d3f825d90662b477c71854358 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 08:18:49 +0000
Subject: [PATCH 28/38] feat: add ffts address (needed for bidir comm)

---
 .../aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py |  4 +++-
 .../aot/tpushpop/mix-kernel_mlir/c2v_builder.py     |  4 +++-
 examples/aot/tpushpop/mix-kernel_mlir/caller.cpp    | 13 ++++++++++++-
 examples/aot/tpushpop/mix-kernel_mlir/run.py        |  7 ++++++-
 .../aot/tpushpop/mix-kernel_mlir/v2c_builder.py     |  4 +++-
 5 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
index dde0f705..84fe47e8 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
@@ -5,6 +5,7 @@
 
 
 def meta_data():
+    ffts_ty = pto.ffts_type
     dtype = pto.float32
     ptr_ty = pto.PtrType(dtype)
     i32 = pto.int32
@@ -104,7 +105,8 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
         pto.tfree_from_aic(0)
 
     @pto.func(entry=True)
-    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_x)
         pto.call(vector_kernel, gm_slot_buffer, gm_y)
 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
index 0c0cbdf3..b46aa886 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
@@ -5,6 +5,7 @@
 
 
 def meta_data():
+    ffts_ty = pto.ffts_type
     dtype = pto.float32
     ptr_ty = pto.PtrType(dtype)
     i32 = pto.int32
@@ -99,7 +100,8 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
         pto.tfree_from_aic(0)
 
     @pto.func(entry=True)
-    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_x)
         pto.call(vector_kernel, gm_slot_buffer, gm_y)
 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
index e558e69d..b8a9e8b2 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
+++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
@@ -4,6 +4,8 @@
 
 #include <cstdint>
 
+extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen);
+
 #include KERNEL_CPP
 
 extern "C" void call_kernel(
@@ -13,5 +15,14 @@ extern "C" void call_kernel(
     uint8_t *x,
     uint8_t *y)
 {
-    call_both<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer, (__gm__ float *)x, (__gm__ float *)y);
+    void *fftsAddr = nullptr;
+    uint32_t fftsLen = 0;
+    (void)rtGetC2cCtrlAddr(reinterpret_cast<uint64_t *>(&fftsAddr), &fftsLen);
+    (void)fftsLen;
+
+    call_both<<<blockDim, nullptr, stream>>>(
+        (__gm__ int64_t *)fftsAddr,
+        (__gm__ float *)gmSlotBuffer,
+        (__gm__ float *)x,
+        (__gm__ float *)y);
 }
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py
index a59b527c..79735812 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/run.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py
@@ -12,6 +12,7 @@
 DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so")
 DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
 DEFAULT_FIFO_BYTES = 4 * 1024
+DEFAULT_FIFO_BYTES_BOTH = 8 * 1024
 M = 16
 N = 16
 ATOL = 1e-4
@@ -57,6 +58,10 @@ def make_io_tensors(*, device: str) -> tuple[torch.Tensor, torch.Tensor]:
     return x, y
 
 
+def fifo_bytes_for_mode(mode: str) -> int:
+    return DEFAULT_FIFO_BYTES_BOTH if mode in ("v2c", "bidi") else DEFAULT_FIFO_BYTES
+
+
 def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> None:
     stream_ptr = torch.npu.current_stream()._as_parameter_
     lib.call_kernel(
@@ -93,7 +98,7 @@ def main() -> None:
 
     lib = load_lib(DEFAULT_LIB_PATH)
     gm_slot_buffer = make_gm_slot_buffer(
-        fifo_bytes=DEFAULT_FIFO_BYTES,
+        fifo_bytes=fifo_bytes_for_mode(args.mode),
         device=device,
     )
     torch.set_printoptions(precision=1, threshold=2000, linewidth=250, sci_mode=False)
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
index 782e5c5c..9e385002 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
@@ -5,6 +5,7 @@
 
 
 def meta_data():
+    ffts_ty = pto.ffts_type
     dtype = pto.float32
     ptr_ty = pto.PtrType(dtype)
     i32 = pto.int32
@@ -93,7 +94,8 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
         pto.tpush_to_aic(send_tile, 0)
 
     @pto.func(entry=True)
-    def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_y)
         pto.call(vector_kernel, gm_slot_buffer, gm_x)
 

From e32afba095bce046eff566cee50cac70752fde6c Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 08:39:24 +0000
Subject: [PATCH 29/38] feat: unmangle kernel name

---
 examples/aot/tpushpop/mix-kernel_mlir/compile.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index 3761f044..b3981545 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -22,7 +22,8 @@ rm -f "${GENERATED_CPP}" "${LIB_PATH}"
 
 python "${BUILDER_PATH}" > "${MLIR_GEN_PATH}"
 ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"
-
+# Declare call_both as extern "C" so the kernel symbol name is not C++-mangled.
+perl -0pi -e 's/\b__global__ AICORE void call_both\(/extern "C" __global__ AICORE void call_both(/' "${GENERATED_CPP}"
 
 bisheng \
     -I/sources/pto-isa/include/ \

From 80295996bcd755a4480bf99cb96c7c3d44314929 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 08:40:29 +0000
Subject: [PATCH 30/38] feat: add ffts functionality to api

---
 ptodsl/api/pto.py         | 3 +++
 ptodsl/api/pto_general.py | 5 +++++
 ptodsl/api/type_def.py    | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py
index f1f3012f..caf5cda8 100644
--- a/ptodsl/api/pto.py
+++ b/ptodsl/api/pto.py
@@ -7,6 +7,7 @@
     aiv_initialize_pipe,
     as_tensor,
     call,
+    set_ffts,
     cube_section,
     get_block_idx,
     get_block_num,
@@ -45,6 +46,7 @@
     "float32",
     "int16",
     "int32",
+    "ffts_type",
     "PtrType",
     "TensorType",
     "SubTensorType",
@@ -56,6 +58,7 @@
     "get_subblock_num",
     "get_block_num",
     "call",
+    "set_ffts",
     "as_tensor",
     "slice_view",
     "vector_section",
diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index 00637871..63606d57 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -52,6 +52,10 @@ def call(callee, *args):
     )
 
 
+def set_ffts(ffts):
+    return _pto.SetFFTsOp(_unwrap(ffts))
+
+
 def as_tensor(tensor_type, *, ptr, shape, strides, layout=None):
     shape_vals = [_unwrap(v) for v in shape]
     stride_vals = [_unwrap(v) for v in strides]
@@ -234,6 +238,7 @@ def print(format, scalar):
     "get_subblock_num",
     "get_block_num",
     "call",
+    "set_ffts",
     "as_tensor",
     "slice_view",
     "vector_section",
diff --git a/ptodsl/api/type_def.py b/ptodsl/api/type_def.py
index 4f66eebb..8e7909b8 100644
--- a/ptodsl/api/type_def.py
+++ b/ptodsl/api/type_def.py
@@ -1,4 +1,5 @@
 from mlir.dialects import pto as _pto
+from mlir.ir import IntegerType, MemRefType
 
 from . import scalar
 
@@ -8,6 +9,8 @@ def __getattr__(name):
     # and resolve them only when user code accesses them inside PTO/MLIR setup.
     if name in {"bool", "float16", "float32", "int16", "int32"}:
         return getattr(scalar, name)
+    if name == "ffts_type":
+        return MemRefType.get([256], IntegerType.get_signless(64))
     raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
 
@@ -108,4 +111,5 @@ def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None):
     "float32",
     "int16",
     "int32",
+    "ffts_type",
 ]

From 10bfb1b2fab45c0cd32ffe3ed45462f63d2cf8e4 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 08:40:59 +0000
Subject: [PATCH 31/38] feat: add bidir example

---
 .../tpushpop/mix-kernel_mlir/bidi_builder.py  | 128 ++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py
new file mode 100644
index 00000000..511db7e2
--- /dev/null
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py
@@ -0,0 +1,128 @@
+from ptodsl import pto, tile, to_ir_module
+from ptodsl import scalar as s
+
+const = s.const
+
+
+def meta_data():
+    ffts_ty = pto.ffts_type
+    dtype = pto.float32
+    ptr_ty = pto.PtrType(dtype)
+    i32 = pto.int32
+    tensor_ty = pto.TensorType(rank=2, dtype=dtype)
+    tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
+    x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
+    x_left_ty = pto.TileBufType(
+        shape=[16, 16],
+        dtype=dtype,
+        memory_space="LEFT",
+        config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"),
+    )
+    x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
+    acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
+    vec_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
+    # Direct GM writeback from cube needs a row-major NoneBox tile.
+    cube_recv_ty = pto.TileBufType(
+        shape=[16, 16],
+        dtype=dtype,
+        memory_space="MAT",
+        config=pto.TileBufConfig(
+            blayout="RowMajor",
+            slayout="NoneBox",
+            s_fractal_size=512,
+        ),
+    )
+    return locals()
+
+
+@to_ir_module(meta_data=meta_data, module=True)
+def module():
+    @pto.func(kernel="cube")
+    def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        c0 = const(0)
+        c1 = const(1)
+        c16 = const(16)
+        c2v_import = pto.import_reserved_buffer(
+            name="c2v_fifo",
+            peer_func="@vector_kernel",
+        )
+        v2c_local = pto.reserve_buffer(name="v2c_fifo", size=4096, location="MAT")
+
+        # One DIR_BOTH pipe handles both legs of the round trip.
+        pto.aic_initialize_pipe(
+            dir_mask=3,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_import,
+            v2c_consumer_buf=v2c_local,
+        )
+
+        x_mat_tile = pto.alloc_tile(x_mat_ty)
+        x_left_tile = pto.alloc_tile(x_left_ty)
+        x_right_tile = pto.alloc_tile(x_right_ty)
+        acc_tile = pto.alloc_tile(acc_ty)
+
+        gm_x_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_x,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+        gm_y_tile_view = pto.slice_view(
+            tile_view_ty,
+            source=pto.as_tensor(
+                tensor_ty,
+                ptr=gm_y,
+                shape=[c16, c16],
+                strides=[c16, c1],
+            ),
+            offsets=[c0, c0],
+            sizes=[c16, c16],
+        )
+
+        pto.load(gm_x_tile_view, x_mat_tile)
+        tile.mov(x_mat_tile, x_left_tile)
+        tile.mov(x_mat_tile, x_right_tile)
+        tile.matmul(x_left_tile, x_right_tile, acc_tile)
+        pto.tpush_to_aiv(acc_tile, 0)
+        returned_tile = pto.tpop_from_aiv(cube_recv_ty, 0)
+        pto.store(returned_tile, gm_y_tile_view)
+        pto.tfree_from_aiv(0)
+
+    @pto.func(kernel="vector")
+    def vector_kernel(gm_slot_buffer: "ptr_ty") -> None:
+        c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC")
+        v2c_import = pto.import_reserved_buffer(
+            name="v2c_fifo",
+            peer_func="@cube_kernel",
+        )
+
+        # Vector pops cube's tile, doubles it, then pushes the result back.
+        pto.aiv_initialize_pipe(
+            dir_mask=3,
+            slot_size=1024,
+            gm_slot_buffer=gm_slot_buffer,
+            c2v_consumer_buf=c2v_local,
+            v2c_consumer_buf=v2c_import,
+        )
+
+        doubled_tile = pto.alloc_tile(vec_ty)
+        recv_tile = pto.tpop_from_aic(vec_ty, 0)
+        tile.add(recv_tile, recv_tile, doubled_tile)
+        pto.tpush_to_aic(doubled_tile, 0)
+        pto.tfree_from_aic(0)
+
+    @pto.func(entry=True)
+    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+        pto.set_ffts(ffts_addr)
+        pto.call(cube_kernel, gm_slot_buffer, gm_x, gm_y)
+        pto.call(vector_kernel, gm_slot_buffer)
+
+
+if __name__ == "__main__":
+    print(module)

From 56527f4f9621ac604403a3d8f846827f17c10036 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 09:09:52 +0000
Subject: [PATCH 32/38] chore: docker ptoas ver and pto-isa

---
 docker/Dockerfile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 331ca0d1..1c149ef1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -17,7 +17,7 @@ RUN pip install --no-cache-dir \
 
 # certain operations need latest isa header, not CANN 8.5.0 default
 # header on 2026/03/24
-ARG PTOISA_COMMIT=febd8a15a9dc03f87b6aa293c3ab66a67b6e80af
+ARG PTOISA_COMMIT=a8c3fbf42a2f4a0f609f64e138dda62deefddb8e
 WORKDIR /sources
 RUN git clone https://gitcode.com/cann/pto-isa.git \
     && cd pto-isa && git checkout $PTOISA_COMMIT
@@ -29,10 +29,10 @@ ARG CACHE_BURST=1
 
 # ARG ARCH=x86_64
 ARG ARCH=aarch64
-ARG RELEASE_REPO=huawei-csl/PTOAS
-ARG RELEASE_VER=20260327
-ARG RELEASE_TAG=${RELEASE_VER}
-ARG WHEEL_NAME=ptoas-0.18-cp311-none-manylinux_2_34_${ARCH}.whl
+ARG RELEASE_REPO=zhangstevenunity/PTOAS
+ARG RELEASE_VER=0.24
+ARG RELEASE_TAG=v${RELEASE_VER}
+ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl
 ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz
 
 WORKDIR /installers/

From d40da05daa23518f26951909a67799e74cad95e9 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 09:19:42 +0000
Subject: [PATCH 33/38] chore: black

---
 .../aot/tpushpop/mix-kernel_mlir/bidi_builder.py     |  4 +++-
 .../aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py  |  4 +++-
 examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py |  4 +++-
 examples/aot/tpushpop/mix-kernel_mlir/run.py         | 12 ++++++++----
 examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py |  4 +++-
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py
index 511db7e2..8ea9ad03 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py
@@ -118,7 +118,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty") -> None:
         pto.tfree_from_aic(0)
 
     @pto.func(entry=True)
-    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(
+        ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty"
+    ) -> None:
         pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_x, gm_y)
         pto.call(vector_kernel, gm_slot_buffer)
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
index 84fe47e8..d0aef4ae 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
@@ -105,7 +105,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
         pto.tfree_from_aic(0)
 
     @pto.func(entry=True)
-    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(
+        ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty"
+    ) -> None:
         pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_x)
         pto.call(vector_kernel, gm_slot_buffer, gm_y)
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
index b46aa886..51312f36 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
@@ -100,7 +100,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
         pto.tfree_from_aic(0)
 
     @pto.func(entry=True)
-    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(
+        ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty"
+    ) -> None:
         pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_x)
         pto.call(vector_kernel, gm_slot_buffer, gm_y)
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py
index 79735812..de46b663 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/run.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py
@@ -53,7 +53,7 @@ def make_gm_slot_buffer(*, fifo_bytes: int, device: str) -> torch.Tensor:
 
 
 def make_io_tensors(*, device: str) -> tuple[torch.Tensor, torch.Tensor]:
-    x = torch.rand((M, N), dtype=torch.float32, device=device) -0.5
+    x = torch.rand((M, N), dtype=torch.float32, device=device) - 0.5
     y = torch.zeros((M, N), dtype=torch.float32, device=device)
     return x, y
 
@@ -62,7 +62,9 @@ def fifo_bytes_for_mode(mode: str) -> int:
     return DEFAULT_FIFO_BYTES_BOTH if mode in ("v2c", "bidi") else DEFAULT_FIFO_BYTES
 
 
-def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> None:
+def run_kernel(
+    lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor
+) -> None:
     stream_ptr = torch.npu.current_stream()._as_parameter_
     lib.call_kernel(
         1,
@@ -111,13 +113,15 @@ def main() -> None:
     y_ref = reference(args.mode, x)
     y_cpu = y.cpu()
 
-    print(y_ref-y_cpu)
+    print(y_ref - y_cpu)
     max_abs = float(torch.max(torch.abs(y_cpu - y_ref)).item())
     ok = bool(torch.allclose(y_cpu, y_ref, atol=ATOL, rtol=RTOL))
 
     print(f"shape=({M}, {N}) max_abs={max_abs:.6f}")
     if not ok:
-        raise SystemExit(f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}")
+        raise SystemExit(
+            f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}"
+        )
 
     print(f"Validation passed for mode={args.mode} using {DEFAULT_LIB_PATH}.")
 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
index 9e385002..96ba943e 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
@@ -94,7 +94,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
         pto.tpush_to_aic(send_tile, 0)
 
     @pto.func(entry=True)
-    def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
+    def call_both(
+        ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty"
+    ) -> None:
         pto.set_ffts(ffts_addr)
         pto.call(cube_kernel, gm_slot_buffer, gm_y)
         pto.call(vector_kernel, gm_slot_buffer, gm_x)

From 7c2a4a03d0628bb5325e7d4fdfe3cbd40623fed6 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 09:24:04 +0000
Subject: [PATCH 34/38] chore: black

---
 ptodsl/api/pto_general.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py
index 63606d57..9187d25d 100644
--- a/ptodsl/api/pto_general.py
+++ b/ptodsl/api/pto_general.py
@@ -111,16 +111,18 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None):
 # } -> i32
 def reserve_buffer(*, name, size, location, auto_alloc=True, base=None):
     """
-        - At most one `pto.reserve_buffer` is expected in one function
-        - `location` must be a supported local address space
-        - Op-level verification requires:
-        - `auto = false` must provide `base`
-        - `auto = true` must not provide `base`
+    - At most one `pto.reserve_buffer` is expected in one function
+    - `location` must be a supported local address space
+    - Op-level verification requires:
+    - `auto = false` must provide `base`
+    - `auto = true` must not provide `base`
     """
     # All params are compile time attributes
     # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value
 
-    return _pto.ReserveBufferOp(name, size, _resolve_address_space_attr(location), auto_alloc, base=base).result
+    return _pto.ReserveBufferOp(
+        name, size, _resolve_address_space_attr(location), auto_alloc, base=base
+    ).result
 
 
 # %c2v_import = pto.import_reserved_buffer {
@@ -136,12 +138,11 @@ def aic_initialize_pipe(
     *,
     dir_mask,
     slot_size,
-    gm_slot_buffer=None, # only needed on a2/a3?
+    gm_slot_buffer=None,  # only needed on a2/a3?
     c2v_consumer_buf,
     v2c_consumer_buf,
 ):
-    # wrap
-    # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    # wrap aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
     return _pto.AicInitializePipeOp(
         dir_mask,
         slot_size,
@@ -160,12 +161,11 @@ def aiv_initialize_pipe(
     *,
     dir_mask,
     slot_size,
-    gm_slot_buffer=None, # only needed on a2/a3
+    gm_slot_buffer=None,  # only needed on a2/a3
     c2v_consumer_buf,
     v2c_consumer_buf,
 ):
-    # wrap
-    # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
+    # wrap aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation
     return _pto.AivInitializePipeOp(
         dir_mask,
         slot_size,

From 2779e56e32bb5b0a99170bdde1495093e6cd4dce Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 09:31:29 +0000
Subject: [PATCH 35/38] feat: gitignore

---
 .gitignore                       | 2 ++
 examples/aot/tpushpop/.gitignore | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 09d8265b..1ccc35d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ __pycache__
 extra-info
 
 *.ptodsl_jit
+
+msprof_res/
diff --git a/examples/aot/tpushpop/.gitignore b/examples/aot/tpushpop/.gitignore
index ab5698d1..b0d498be 100644
--- a/examples/aot/tpushpop/.gitignore
+++ b/examples/aot/tpushpop/.gitignore
@@ -1 +1 @@
-msprof_res/
\ No newline at end of file
+build_artifacts/

From 49eae78555971a0173fee77bb66ca02e81d92a95 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 11:19:28 +0000
Subject: [PATCH 36/38] feat: move files and cleanup

---
 .../aot/tpushpop/mix-kernel_mlir/README.md    | 84 ++++++++++++++-----
 .../aot/tpushpop/mix-kernel_mlir/compile.sh   |  6 +-
 .../{ => kernels}/bidi_builder.py             |  0
 .../{ => kernels}/c2v_add_builder.py          |  0
 .../{ => kernels}/c2v_builder.py              |  0
 .../{ => kernels}/v2c_builder.py              |  0
 examples/aot/tpushpop/mix-kernel_mlir/run.py  |  2 +-
 7 files changed, 67 insertions(+), 25 deletions(-)
 rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/bidi_builder.py (100%)
 rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/c2v_add_builder.py (100%)
 rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/c2v_builder.py (100%)
 rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/v2c_builder.py (100%)

diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md
index cb86533a..886a8c2c 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/README.md
+++ b/examples/aot/tpushpop/mix-kernel_mlir/README.md
@@ -1,38 +1,80 @@
-# Cross core communication with `pto.push_to_aiv` example
+# TPush / TPop mixed-kernel examples
 
-## Run
+Small examples of tile FIFO communication between Cube (`AIC`) and Vector (`AIV`).
 
 ```bash
 python run.py c2v
+python run.py v2c
 python run.py bidi
 ```
 
-`c2v` is the default, so `python run.py` is the same as `python run.py c2v`.
+`python run.py` defaults to `c2v`.
 
-## How C2V Communication Works
+Files:
 
-This example sends one `16x16 f32` tile from the Cube kernel to the Vector kernel.
+- `kernels/` has the Python builders.
+- `build_artifacts/` gets generated MLIR, generated C++, and the `.so`.
+- `gm_slot_buffer` is the GM backing store for the pipe.
+- `caller.cpp` fetches the FFTS control address with `rtGetC2cCtrlAddr` and passes it to the generated kernel, whose entry function applies it via `pto.set_ffts`.
 
-- The host allocates one shared `gm_slot_buffer` and passes it to both kernels.
-- The Vector kernel owns the C2V consumer buffer with `pto.reserve_buffer(name = "c2v_fifo")`.
-- The Cube kernel refers to that same buffer with `pto.import_reserved_buffer(name = "c2v_fifo")`.
-- Both sides call `*_initialize_pipe` with `dir_mask = 1`, which means `C2V`.
-- Cube sends with `pto.tpush_to_aiv(...)`.
-- Vector receives with `pto.tpop_from_aic(...)` and releases the consumed slot with `pto.tfree_from_aic`.
+Core idea:
 
-In the generated C++, this becomes the same `TPipe<..., Direction::DIR_C2V, ...>` on both sides:
+- `aic_initialize_pipe` / `aiv_initialize_pipe` lower to matching `TPipe<...>` objects.
+- `gm_slot_buffer` is the shared GM slot memory used by that `TPipe`.
+- `tpush_to_aiv` / `tpush_to_aic` lower to `TPUSH(pipe, tile)`.
+- `tpop_from_aic` / `tpop_from_aiv` lower to `TPOP(pipe, tile)`.
+- `tfree_from_aic` / `tfree_from_aiv` lower to `TFREE(pipe)` and release the consumed slot.
 
-- Cube: `TPUSH(pipe, acc_tile)`
-- Vector: `TPOP(pipe, vec_tile)` then `TFREE(pipe)`
+## C2V
 
-The important mental model is: `TPUSH`/`TPOP` are the real cross-core handoff, while `gm_slot_buffer` is the shared backing storage that makes the FIFO work.
+Cube sends. Vector receives.
 
-## How Bidirectional Works
+This example computes `X @ X` on Cube, sends the accumulator tile to Vector, then Vector stores it to GM.
 
-`bidi` starts the same way as `c2v`, but adds a return path:
+```text
+Cube:   load X -> matmul -> tpush_to_aiv
+Vector: tpop_from_aic -> store Y -> tfree_from_aic
+```
+
+Pipe wiring:
+
+- Vector owns the consumer buffer: `reserve_buffer("c2v_fifo", location="VEC")`
+- Cube imports it: `import_reserved_buffer("c2v_fifo", peer_func="@vector_kernel")`
+- Both sides initialize with `dir_mask = 1`
+
+## V2C
+
+Vector sends. Cube receives.
+
+This example loads `X` on Vector, sends that tile to Cube, then Cube stores it to GM.
+
+```text
+Vector: load X -> tpush_to_aic
+Cube:   tpop_from_aiv -> store Y -> tfree_from_aiv
+```
+
+Pipe wiring:
+
+- Cube owns the consumer buffer: `reserve_buffer("v2c_fifo", location="MAT")`
+- Vector imports it: `import_reserved_buffer("v2c_fifo", peer_func="@cube_kernel")`
+- Both sides initialize with `dir_mask = 2`
+
+## BIDI
+
+Both directions are enabled.
+
+This example sends `X @ X` from Cube to Vector. Vector doubles it and sends it back. Cube receives the returned tile and stores it to GM.
+
+```text
+Cube:   matmul -> tpush_to_aiv
+Vector: tpop_from_aic -> add -> tpush_to_aic -> tfree_from_aic
+Cube:   tpop_from_aiv -> store Y -> tfree_from_aiv
+```
+
+Pipe wiring:
 
-- Cube computes `x @ x` and sends it to vector over C2V.
-- Vector pops that tile, computes `tile + tile`, and pushes the doubled result back over V2C.
-- Cube pops the returned tile and writes it to GM.
+- Vector reserves `c2v_fifo`; Cube imports it
+- Cube reserves `v2c_fifo`; Vector imports it
+- Both sides initialize with `dir_mask = 3`
 
-The important difference is that both sides initialize with `dir_mask = 3`, so the same mixed-kernel launch can use both directions of the pipe.
+For `dir_mask = 3`, allocate FIFO backing for both directions. `run.py` allocates `8 KiB` for `bidi` (and also for `v2c`); the other modes get `4 KiB`.
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
index b3981545..6b7df346 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
+++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh
@@ -4,10 +4,10 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
 MODE="${TPUSHPOP_MODE:-c2v}"
-BUILDER_PATH="${SCRIPT_DIR}/${MODE}_builder.py"
-MLIR_GEN_PATH="${SCRIPT_DIR}/${MODE}_gen.mlir"
+BUILDER_PATH="${SCRIPT_DIR}/kernels/${MODE}_builder.py"
+MLIR_GEN_PATH="${ARTIFACT_DIR}/${MODE}_gen.mlir"
 GENERATED_CPP="${ARTIFACT_DIR}/${MODE}.cpp"
-LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"
+LIB_PATH="${ARTIFACT_DIR}/tpushpop_mlir_lib.so"
 
 case "${MODE}" in
   c2v|c2v_add|v2c|bidi) ;;
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/bidi_builder.py
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py
rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/bidi_builder.py
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_add_builder.py
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py
rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_add_builder.py
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_builder.py
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py
rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_builder.py
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/v2c_builder.py
similarity index 100%
rename from examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py
rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/v2c_builder.py
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py
index de46b663..f749e378 100644
--- a/examples/aot/tpushpop/mix-kernel_mlir/run.py
+++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py
@@ -9,7 +9,7 @@
 from ptodsl.test_util import get_test_device
 
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
-DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so")
+DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "build_artifacts", "tpushpop_mlir_lib.so")
 DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh")
 DEFAULT_FIFO_BYTES = 4 * 1024
 DEFAULT_FIFO_BYTES_BOTH = 8 * 1024

From 8ca2c8f3b3baad3d4352f0d622a10c7bdadc8346 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Thu, 9 Apr 2026 11:51:12 +0000
Subject: [PATCH 37/38] test: add ptoas test

---
 tests/frontend/test_multifunc_ir.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py
index 1e0a1668..0b4343e4 100644
--- a/tests/frontend/test_multifunc_ir.py
+++ b/tests/frontend/test_multifunc_ir.py
@@ -1,3 +1,5 @@
+import subprocess
+
 from mlir.dialects import func, pto as _pto
 from mlir.ir import (
     Attribute,
@@ -92,3 +94,20 @@ def test_old_single_function_builder_matches_raw_mlir():
 
 def test_new_multi_function_builder_matches_raw_mlir():
     assert str(multi_kernel_module) == str(build_multi_verbose())
+
+
+def test_multi_function_module_compiles_with_ptoas(tmp_path):
+    pto_path = tmp_path / "multi_kernel_module.pto"
+    cpp_path = tmp_path / "multi_kernel_module.cpp"
+    pto_path.write_text(str(multi_kernel_module), encoding="utf-8")
+
+    subprocess.run(
+        [
+            "ptoas",
+            "--enable-insert-sync",
+            str(pto_path),
+            "-o",
+            str(cpp_path),
+        ],
+        check=True,
+    )

From d04895870937124108f5aa2ff9928d21e584a9e2 Mon Sep 17 00:00:00 2001
From: fiskrt <43207511+fiskrt@users.noreply.github.com>
Date: Mon, 13 Apr 2026 09:38:52 +0000
Subject: [PATCH 38/38] chore: update pto-isa version in ci

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cf8f26cb..068abedb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -50,7 +50,7 @@ jobs:
       RELEASE_VER: 0.24
       RELEASE_TAG: v0.24
       CLI_DIR: /installers/ptoas-cli
-      PTOISA_COMMIT: 2ee948ef636863ed149f176d5327d9db5f349bb6
+      PTOISA_COMMIT: a8c3fbf42a2f4a0f609f64e138dda62deefddb8e
 
     steps:
       - name: Install system packages