From b269e2bacd86716a40c3216bb16db1c46c41193a Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:58:08 +0000 Subject: [PATCH 01/38] feat: init --- .../cube_to_vector_matmul_add/README.md | 31 ++ .../cube_to_vector_matmul_add/caller.cpp | 27 ++ .../cube_to_vector_matmul_add/compile.sh | 46 +++ .../run_tpushpop_cv.py | 150 +++++++++ refs/tpushpop_cv.cpp | 290 ++++++++++++++++++ refs/tpushpop_vc.cpp | 236 ++++++++++++++ 6 files changed, 780 insertions(+) create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/README.md create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh create mode 100644 examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py create mode 100644 refs/tpushpop_cv.cpp create mode 100644 refs/tpushpop_vc.cpp diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md b/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md new file mode 100644 index 00000000..28ae0c39 --- /dev/null +++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md @@ -0,0 +1,31 @@ +# Cube To Vector `TPUSH`/`TPOP` Example + +This example wraps `refs/tpushpop_cv.cpp` into the same `compile.sh` + Python runner flow used by the AOT examples. + +The kernel does: + +- cube-side `TMATMUL` +- `TPUSH` from cube to vector +- vector-side `TPOP` +- vector-side bias add + +## Run + +```bash +python run_tpushpop_cv.py +``` + +That will: + +1. call `compile.sh` +2. build `./tpushpop_cv_lib.so` +3. launch the kernel on NPU +4. compare against `A @ B + bias` + +The wrapper fetches the runtime FFTS/control address inside `caller.cpp` with `rtGetC2cCtrlAddr(...)`, so the Python side only needs to provide the kernel inputs, output, and FIFO backing memory. 
+ +If your environment needs different PTO include roots: + +```bash +PTO_INCLUDE_PATH=/sources/pto-isa/include python run_tpushpop_cv.py +``` diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp b/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp new file mode 100644 index 00000000..fbe697f4 --- /dev/null +++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp @@ -0,0 +1,27 @@ +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." +#endif + +#include + +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *out, + uint8_t *srcA, + uint8_t *srcB, + uint8_t *bias, + uint8_t *fifoMem) +{ + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)blockDim; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + LaunchTPushPopMatmulAdd(reinterpret_cast(fftsAddr), out, srcA, srcB, bias, fifoMem, stream); +} diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh b/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh new file mode 100644 index 00000000..4df82d4c --- /dev/null +++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" +ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" +LIB_PATH="${SCRIPT_DIR}/tpushpop_cv_lib.so" +EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS:-}" + +if [[ "${TPUSHPOP_SANITY_ONLY:-}" =~ ^(1|true|TRUE|yes|YES|on|ON)$ ]]; then + EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS} -DTPUSHPOP_SANITY_ONLY" +fi + +PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}" +if [[ ! 
-d "${PTO_INCLUDE_PATH}" ]]; then + if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then + PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include" + elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then + PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include" + else + echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." >&2 + exit 1 + fi +fi + +mkdir -p "${ARTIFACT_DIR}" +rm -f "${LIB_PATH}" + +bisheng \ + -I"${PTO_INCLUDE_PATH}" \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + ${EXTRA_BISHENG_FLAGS} \ + -DKERNEL_CPP="\"${REPO_ROOT}/refs/tpushpop_cv.cpp\"" \ + "${SCRIPT_DIR}/caller.cpp" \ + -o "${LIB_PATH}" + +echo "Built ${LIB_PATH}." 
diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py b/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py new file mode 100644 index 00000000..011fc1d2 --- /dev/null +++ b/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py @@ -0,0 +1,150 @@ +import ctypes +import os +import subprocess + +import numpy as np +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_cv_lib.so") +DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") +DEFAULT_FIFO_BYTES = 4 * 1024 +TOTAL_M = 16 +K = 32 +N = 32 +INPUT_DTYPE = torch.float16 +SEED = 0 +ATOL = 5e-2 +RTOL = 5e-2 +SANITY_ONLY = False + + +def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p: + return ctypes.c_void_p(tensor.data_ptr()) + + +def compile_example(compile_script: str) -> None: + subprocess.run( + ["bash", compile_script], + check=True, + cwd=THIS_DIR, + ) + + +def load_lib(lib_path: str) -> ctypes.CDLL: + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ] + lib.call_kernel.restype = None + return lib + + +def make_buffers( + *, + total_m: int, + k: int, + n: int, + input_dtype: torch.dtype, + device: str, + fifo_bytes: int, +): + src_a = torch.randn((total_m, k), dtype=input_dtype, device=device) + src_b = torch.randn((k, n), dtype=input_dtype, device=device) + bias = torch.randn((total_m, n), dtype=torch.float32, device=device) + out = torch.zeros((total_m, n), dtype=torch.float32, device=device) + + fifo_elems = max(1, (fifo_bytes + 3) // 4) + fifo_mem = torch.zeros((fifo_elems,), dtype=torch.float32, device=device) + return out, src_a, src_b, bias, fifo_mem + + +def run_kernel( + lib: ctypes.CDLL, + *, + out: torch.Tensor, + src_a: torch.Tensor, + 
src_b: torch.Tensor, + bias: torch.Tensor, + fifo_mem: torch.Tensor, +) -> torch.Tensor: + stream_ptr = torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + 1, + stream_ptr, + torch_to_ctypes(out), + torch_to_ctypes(src_a), + torch_to_ctypes(src_b), + torch_to_ctypes(bias), + torch_to_ctypes(fifo_mem), + ) + torch.npu.synchronize() + return out + + +def reference_result(src_a: torch.Tensor, src_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor: + ref = torch.matmul(src_a.float().cpu(), src_b.float().cpu()) + if not SANITY_ONLY: + ref = ref + bias.cpu() + return ref.to(torch.float32) + + +def main() -> None: + compile_example(DEFAULT_COMPILE_SCRIPT) + + device = get_test_device() + torch.npu.set_device(device) + torch.manual_seed(SEED) + np.random.seed(SEED) + + lib = load_lib(DEFAULT_LIB_PATH) + out, src_a, src_b, bias, fifo_mem = make_buffers( + total_m=TOTAL_M, + k=K, + n=N, + input_dtype=INPUT_DTYPE, + device=device, + fifo_bytes=DEFAULT_FIFO_BYTES, + ) + + out = run_kernel( + lib, + out=out, + src_a=src_a, + src_b=src_b, + bias=bias, + fifo_mem=fifo_mem, + ) + ref = reference_result(src_a, src_b, bias) + out_cpu = out.cpu() + + max_abs = float(torch.max(torch.abs(out_cpu - ref)).item()) + mean_abs = float(torch.mean(torch.abs(out_cpu - ref)).item()) + ok = bool(torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL)) + + print( + f"mode={'sanity_matmul' if SANITY_ONLY else 'tpushpop_cv'} " + f"shape=({TOTAL_M}, {K}, {N}) dtype={INPUT_DTYPE} " + f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}" + ) + + if not ok: + raise SystemExit( + f"Validation failed with atol={ATOL} rtol={RTOL}. 
" + f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}" + ) + + print(f"Validation passed using {DEFAULT_LIB_PATH}.") + + +if __name__ == "__main__": + main() diff --git a/refs/tpushpop_cv.cpp b/refs/tpushpop_cv.cpp new file mode 100644 index 00000000..324ade79 --- /dev/null +++ b/refs/tpushpop_cv.cpp @@ -0,0 +1,290 @@ +#include +#include + +using namespace pto; + +#define VEC_CORES 2 + +using ExampleInT = half; +using ExampleOutT = float; +constexpr uint32_t EXAMPLE_TOTAL_M = 16; +constexpr uint32_t EXAMPLE_CASE_TILE_M = 16; +constexpr uint32_t EXAMPLE_TILE_K = 32; +constexpr uint32_t EXAMPLE_TILE_N = 32; + +#ifdef __DAV_CUBE__ +constexpr bool DAV_CUBE = true; +#else +constexpr bool DAV_CUBE = false; +#endif + +#ifdef __DAV_VEC__ +constexpr bool DAV_VEC = true; +#else +constexpr bool DAV_VEC = false; +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) +{ + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +#ifdef TPUSHPOP_SANITY_ONLY +__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB) +{ + constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); + constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_TOTAL_M, 16); + constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); + constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); + + using GlobalA = + GlobalTensor, + pto::Stride>; + using GlobalB = + GlobalTensor, + pto::Stride>; + using GlobalOut = + GlobalTensor, + pto::Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + if constexpr (DAV_CUBE) { + TileMatA aMatTile; + TileMatB bMatTile; + LeftTile aTile; + RightTile bTile; + AccTile accTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(accTile, 0x0); + + GlobalA globalA(srcA); + GlobalB globalB(srcB); 
+ GlobalOut globalOut(out); + + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(aMatTile, globalA); + TLOAD(bMatTile, globalB); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + TMATMUL(accTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(globalOut, accTile); + + pipe_barrier(PIPE_ALL); + } +} +#else +__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out, + __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB, + __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem) +{ + set_ffts_base_addr((uint64_t)ffts_addr); + constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M; + constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES; + + constexpr uint16_t FLAG_ID = 0; + constexpr uint8_t FIFO_DEPTH = 2; + constexpr uint8_t FIFO_PERIOD = 1; + // local fifo base used for TPOP of vector side(vecTileHalf) + constexpr uint32_t localFiFoBase = 0x0; + + using AccTile = TileAcc; + using VecTileHalf = + Tile; + using BiasTile = + Tile; + using OutTile = + Tile; + + using MatPipe = TPipe; + MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase); + + constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); + constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_CASE_TILE_M, 16); + constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); + constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); + + using GlobalA = + GlobalTensor, + pto::Stride>; + using GlobalB = + GlobalTensor, + pto::Stride>; + using GlobalBias = + 
GlobalTensor, + pto::Stride>; + using GlobalOut = + GlobalTensor, + pto::Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + using LeftTile = TileLeft; + using RightTile = TileRight; + + if constexpr (DAV_CUBE) { + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile accTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(accTile, 0x0); + + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { + GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K); + GlobalB globalB(srcB); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + TLOAD(aMatTile, globalA); + TLOAD(bMatTile, globalB); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + + TMATMUL(accTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TPUSH(mPipe, accTile); + + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + } + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } + + if constexpr (DAV_VEC) { + VecTileHalf vecTileHalf; + BiasTile biasTile; + OutTile outTile; + TASSIGN(biasTile, 0x10000); + TASSIGN(outTile, 0x20000); + + uint32_t subBlockIdx = get_subblockid(); + + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { + wait_flag(PIPE_V, PIPE_MTE2, 
EVENT_ID1); + + TPOP(mPipe, vecTileHalf); + + size_t biasOffset = + static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; + GlobalBias globalBias(bias + biasOffset); + + TLOAD(biasTile, globalBias); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + TADD(outTile, vecTileHalf, biasTile); + + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + size_t outOffset = + static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; + GlobalOut globalOut(out + outOffset); + TSTORE(globalOut, outTile); + + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + } + + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } +} +#endif + +void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem, + void *stream) +{ +#ifdef TPUSHPOP_SANITY_ONLY + (void)ffts; + (void)bias; + (void)fifoMem; + runSanityMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(out), reinterpret_cast(srcA), reinterpret_cast(srcB)); +#else + runTPushPopMatmulAdd<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(srcB), reinterpret_cast(bias), reinterpret_cast(fifoMem)); +#endif +} diff --git a/refs/tpushpop_vc.cpp b/refs/tpushpop_vc.cpp new file mode 100644 index 00000000..69672e57 --- /dev/null +++ b/refs/tpushpop_vc.cpp @@ -0,0 +1,236 @@ +#include +#include + +using namespace pto; + +#ifdef __DAV_CUBE__ +constexpr bool DAV_CUBE = true; +#else +constexpr bool DAV_CUBE = false; +#endif + +#ifdef __DAV_VEC__ +constexpr bool DAV_VEC = true; +#else +constexpr bool DAV_VEC = false; +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) +{ + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * 
num_2; +} + +template +__global__ AICORE void runTPushPopVCMatmul(__gm__ uint64_t *ffts_addr, __gm__ OutT *out, __gm__ InT *srcA, + __gm__ QuantT *quantB, __gm__ OutT *scale, __gm__ OutT *offset, + __gm__ OutT *fifoMem) +{ + set_ffts_base_addr((uint64_t)ffts_addr); + constexpr uint32_t TILE_K = CASE_TILE_K; + constexpr uint32_t HALF_TILE_K = TILE_K / 2; + constexpr uint32_t TILE_N = N; + constexpr uint32_t NUM_K_TILES = TOTAL_K / CASE_TILE_K; + + constexpr uint16_t FLAG_ID = 0; + constexpr uint8_t FIFO_DEPTH = 2; + constexpr uint8_t FIFO_PERIOD = 1; + // fifo base used for TPOP of cube side (bMatTile) + constexpr uint32_t localFiFoBase = 0x20000; + + using VecTileProd = Tile; + using MatTileCons = + Tile; + + using MatPipe = TPipe; + MatPipe mPipe((__gm__ void *)fifoMem, 0x0, localFiFoBase); + + constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(InT); + constexpr uint32_t ALIGNED_M = CeilAlign(TOTAL_M, 16); + constexpr uint32_t ALIGNED_K = CeilAlign(TILE_K, blockAlign); + constexpr uint32_t ALIGNED_N = CeilAlign(TILE_N, blockAlign); + + using GlobalA = GlobalTensor, + pto::Stride>; + using GlobalOut = GlobalTensor, + pto::Stride>; + + using TileMatA = + Tile; + using LeftTile = TileLeft; + using PopTile = + Tile; + using RightTile = TileRight; + using AccTile = TileAcc; + + using QuantTile = Tile; + using ScaleTile = Tile; + using OffsetTile = Tile; + + if constexpr (DAV_VEC) { + QuantTile quantTile; + VecTileProd dequantTile; + ScaleTile scaleTile(HALF_TILE_K, 1); + OffsetTile offsetTile(HALF_TILE_K, 1); + TASSIGN(quantTile, 0x0); + TASSIGN(dequantTile, 0x10000); + TASSIGN(scaleTile, 0x20000); + TASSIGN(offsetTile, 0x28000); + + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + using GlobalQuantB = + GlobalTensor, + pto::Stride>; + using GlobalScaleOffset = + GlobalTensor, pto::Stride>; + + uint32_t subBlockIdx = get_subblockid(); + + for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) { + GlobalQuantB globalQuantB(quantB + 
k_tile * TILE_K * TILE_N + subBlockIdx * HALF_TILE_K * TILE_N); + GlobalScaleOffset globalScale(scale + k_tile * TILE_K + subBlockIdx * HALF_TILE_K); + GlobalScaleOffset globalOffset(offset + k_tile * TILE_K + subBlockIdx * HALF_TILE_K); + + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + TLOAD(quantTile, globalQuantB); + TLOAD(scaleTile, globalScale); + TLOAD(offsetTile, globalOffset); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + TDEQUANT(dequantTile, quantTile, scaleTile, offsetTile); + + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + TPUSH(mPipe, dequantTile); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + } + + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } + + if constexpr (DAV_CUBE) { + TileMatA aMatTile; + PopTile bMatTile; + TASSIGN(aMatTile, 0x0); + + LeftTile aTile; + RightTile bTile; + AccTile accTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(accTile, 0x0); + + typename MatPipe::Consumer cons; + + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) { + GlobalA globalA(srcA + k_tile * TILE_K); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + TLOAD(aMatTile, globalA); + + TPOP(mPipe, bMatTile); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + if (k_tile == 0) { + TMATMUL(accTile, aTile, bTile); + } else { + TMATMUL_ACC(accTile, accTile, aTile, bTile); + } + + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + } + + set_flag(PIPE_M, PIPE_FIX, 
EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + GlobalOut globalOut(out); + TSTORE(globalOut, accTile); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } +} + +template +void LaunchTPushPopVCMatmul(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream) +{ + if constexpr (tilingKey == 1) { + runTPushPopVCMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), + reinterpret_cast(fifoMem)); + } else if constexpr (tilingKey == 2) { + runTPushPopVCMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), + reinterpret_cast(fifoMem)); + } else if constexpr (tilingKey == 3) { + runTPushPopVCMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), + reinterpret_cast(fifoMem)); + } else if constexpr (tilingKey == 4) { + runTPushPopVCMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), + reinterpret_cast(fifoMem)); + } else if constexpr (tilingKey == 5) { + runTPushPopVCMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), + reinterpret_cast(fifoMem)); + } else if constexpr (tilingKey == 6) { + runTPushPopVCMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), + 
reinterpret_cast(fifoMem)); + } +} + +template void LaunchTPushPopVCMatmul<1>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream); +template void LaunchTPushPopVCMatmul<2>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream); +template void LaunchTPushPopVCMatmul<3>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream); +template void LaunchTPushPopVCMatmul<4>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream); +template void LaunchTPushPopVCMatmul<5>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream); +template void LaunchTPushPopVCMatmul<6>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, + uint8_t *offset, uint8_t *fifoMem, void *stream); \ No newline at end of file From e821d3da63c414e2d85247c8d9a2480d99699209 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 24 Mar 2026 16:40:29 +0000 Subject: [PATCH 02/38] feat: up isa version, need etc.. 
for mix-kernels --- docker/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7eda8bc5..d04eba64 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,8 +16,8 @@ RUN pip install --no-cache-dir \ ipython jupyterlab matplotlib pandas # certain operations need latest isa header, not CANN 8.5.0 default -# header on 2026/03/16 -ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613 +# header on 2026/03/24 +ARG PTOISA_COMMIT=febd8a15a9dc03f87b6aa293c3ab66a67b6e80af WORKDIR /sources RUN git clone https://gitcode.com/cann/pto-isa.git \ && cd pto-isa && git checkout $PTOISA_COMMIT @@ -30,7 +30,7 @@ ARG CACHE_BURST=1 # ARG ARCH=x86_64 ARG ARCH=aarch64 ARG RELEASE_REPO=zhangstevenunity/PTOAS -ARG RELEASE_VER=0.9 +ARG RELEASE_VER=0.15 ARG RELEASE_TAG=v${RELEASE_VER} ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz From a83b06d23020a60f1da49704a664f8115ee26b01 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:59:31 +0000 Subject: [PATCH 03/38] feat: add simpler cpp case --- .../tpushpop/mix-kernel_cpp_simple/README.md | 15 ++ .../tpushpop/mix-kernel_cpp_simple/caller.cpp | 27 +++ .../tpushpop/mix-kernel_cpp_simple/compile.sh | 36 ++++ .../tpushpop/mix-kernel_cpp_simple/kernel.cpp | 156 ++++++++++++++++++ .../aot/tpushpop/mix-kernel_cpp_simple/run.py | 72 ++++++++ 5 files changed, 306 insertions(+) create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/README.md create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp create mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/run.py diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md 
b/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md new file mode 100644 index 00000000..45adb9fc --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md @@ -0,0 +1,15 @@ +# Simple Cube To Vector `TPUSH`/`TPOP` Example + +This is a stripped-down sibling of `mix-kernel_cpp`. + +The kernel is fixed to a single `16x32 @ 32x32` matmul, followed by a bias add on the vector side: + +- no tile loop +- no sanity mode +- no extra runner configuration + +Run it with: + +```bash +python run.py +``` diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp new file mode 100644 index 00000000..fbe697f4 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp @@ -0,0 +1,27 @@ +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." +#endif + +#include + +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *out, + uint8_t *srcA, + uint8_t *srcB, + uint8_t *bias, + uint8_t *fifoMem) +{ + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)blockDim; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + LaunchTPushPopMatmulAdd(reinterpret_cast(fftsAddr), out, srcA, srcB, bias, fifoMem, stream); +} diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh new file mode 100644 index 00000000..0d8d8eb7 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}" +LIB_PATH="${SCRIPT_DIR}/lib.so" + +if [[ ! 
-d "${PTO_INCLUDE_PATH}" ]]; then + if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then + PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include" + elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then + PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include" + else + echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." >&2 + exit 1 + fi +fi + +rm -f "${LIB_PATH}" + +bisheng \ + -I"${PTO_INCLUDE_PATH}" \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -DKERNEL_CPP="\"${SCRIPT_DIR}/kernel.cpp\"" \ + "${SCRIPT_DIR}/caller.cpp" \ + -o "${LIB_PATH}" + +echo "Built ${LIB_PATH}." diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp new file mode 100644 index 00000000..02336d41 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp @@ -0,0 +1,156 @@ +/* +Flow: +1. Cube loads A and B from GM through GlobalTensor views. +2. Cube copies those GM-backed matrix tiles into local matrix tiles: + `aMat` at `0x0`, `bMat` at `0x20000`, then converts them to matmul inputs + `aTile` and `bTile` and runs one `TMATMUL` into `acc`. +3. Cube `TPUSH`es the full `16x32` accumulator tile to the C2V pipe. +4. Vector `TPOP`s its `8x32` half-tile from that pushed accumulator, loads the + matching `8x32` bias tile from GM, does `TADD`, and stores the result to GM. + +Allocation summary: +- `GlobalTensor` objects are just GM views over `srcA`, `srcB`, `bias`, and `out`. + They do not allocate local on-core memory themselves. 
+- The C2V FIFO is also explicit GM memory in this example: `fifoMem` is the GM slot + buffer passed into `TPipe`, so cube writes the pushed accumulator tile into GM and + vector reads it back from that same GM-backed FIFO. +- Cube local tiles: + `aMat @ 0x0`, `bMat @ 0x20000`, `aTile @ 0x0`, `bTile @ 0x0`, `acc @ 0x0`. +- Vector local tiles: + `biasTile @ 0x10000`, `outTile @ 0x20000`. +- The cross-core transfer is the matmul result: one full `AccTile` + produced on cube and split `up/down` so each vector subcore receives one `8x32` + row half via `TPOP`. +*/ +#include +#include + +using namespace pto; + +using In = half; +using Out = float; + +constexpr uint32_t M = 16; +constexpr uint32_t K = 32; +constexpr uint32_t N = 32; +constexpr uint32_t VEC_CORES = 2; +constexpr uint32_t VEC_M = M / VEC_CORES; + +#ifdef __DAV_CUBE__ +constexpr bool DAV_CUBE = true; +#else +constexpr bool DAV_CUBE = false; +#endif + +#ifdef __DAV_VEC__ +constexpr bool DAV_VEC = true; +#else +constexpr bool DAV_VEC = false; +#endif + +__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts, __gm__ Out *out, __gm__ In *srcA, __gm__ In *srcB, + __gm__ Out *bias, __gm__ Out *fifoMem) +{ + set_ffts_base_addr((uint64_t)ffts); + + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride>; + using GlobalBias = GlobalTensor, Stride>; + using GlobalOut = GlobalTensor, Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + using VecTile = Tile; + + using Pipe = TPipe<0, Direction::DIR_C2V, M * N * sizeof(Out), 2>; + Pipe pipe((__gm__ void *)(uint64_t)fifoMem, 0x0, 0x0); + + if constexpr (DAV_CUBE) { + TileMatA aMat; + TileMatB bMat; + LeftTile aTile; + RightTile bTile; + AccTile acc; + TASSIGN(aMat, 0x0); + TASSIGN(bMat, 0x20000); + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(acc, 0x0); + + GlobalA globalA(srcA); + GlobalB globalB(srcB); + + set_flag(PIPE_FIX, 
PIPE_M, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(aMat, globalA); + TLOAD(bMat, globalB); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TMOV(aTile, aMat); + TMOV(bTile, bMat); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + TMATMUL(acc, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TPUSH(pipe, acc); + + pipe_barrier(PIPE_ALL); + } + + if constexpr (DAV_VEC) { + VecTile popped; + VecTile biasTile; + VecTile outTile; + TASSIGN(biasTile, 0x10000); + TASSIGN(outTile, 0x20000); + + uint32_t subBlock = get_subblockid(); + uint32_t offset = subBlock * VEC_M * N; + GlobalBias globalBias(bias + offset); + GlobalOut globalOut(out + offset); + + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TPOP(pipe, popped); + TLOAD(biasTile, globalBias); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TADD(outTile, popped, biasTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(globalOut, outTile); + + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } +} + +void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem, + void *stream) +{ + runTPushPopMatmulAdd<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(srcB), reinterpret_cast(bias), reinterpret_cast(fifoMem)); +} diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py 
b/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py new file mode 100644 index 00000000..e098ddeb --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py @@ -0,0 +1,72 @@ +import ctypes +import os +import subprocess + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_PATH = os.path.join(THIS_DIR, "lib.so") +M = 16 +K = 32 +N = 32 +FIFO_ELEMS = 1024 +ATOL = 5e-2 +RTOL = 5e-2 + + +def ptr(tensor: torch.Tensor) -> ctypes.c_void_p: + return ctypes.c_void_p(tensor.data_ptr()) + + +def main() -> None: + subprocess.run(["bash", "compile.sh"], check=True, cwd=THIS_DIR) + + device = get_test_device() + torch.npu.set_device(device) + torch.manual_seed(0) + + lib = ctypes.CDLL(LIB_PATH) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ] + lib.call_kernel.restype = None + + a = torch.randn((M, K), dtype=torch.float16, device=device) + b = torch.randn((K, N), dtype=torch.float16, device=device) + bias = torch.randn((M, N), dtype=torch.float32, device=device) + out = torch.zeros((M, N), dtype=torch.float32, device=device) + fifo = torch.zeros((FIFO_ELEMS,), dtype=torch.float32, device=device) + + lib.call_kernel( + 1, + torch.npu.current_stream()._as_parameter_, + ptr(out), + ptr(a), + ptr(b), + ptr(bias), + ptr(fifo), + ) + torch.npu.synchronize() + + ref = a.float().cpu() @ b.float().cpu() + bias.cpu() + out_cpu = out.cpu() + max_abs = float((out_cpu - ref).abs().max().item()) + print(f"max_abs={max_abs:.6f}") + + if not torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL): + raise SystemExit("validation failed") + + print("validation passed") + + +if __name__ == "__main__": + main() From 0ad3ceebe34690de857c7159ab7f31a5610ff2a1 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 26 Mar 2026 17:00:04 +0000 
Subject: [PATCH 04/38] feat: rename --- .../README.md | 2 +- .../caller.cpp | 0 .../compile.sh | 4 +- .../run_tpushpop_cv.py | 10 +- .../tpushpop/mix-kernel_cpp/tpushpop_cv.cpp | 297 ++++++++++++++++++ 5 files changed, 309 insertions(+), 4 deletions(-) rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/README.md (79%) rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/caller.cpp (100%) rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/compile.sh (93%) rename examples/aot/tpushpop/{cube_to_vector_matmul_add => mix-kernel_cpp}/run_tpushpop_cv.py (91%) create mode 100644 examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md b/examples/aot/tpushpop/mix-kernel_cpp/README.md similarity index 79% rename from examples/aot/tpushpop/cube_to_vector_matmul_add/README.md rename to examples/aot/tpushpop/mix-kernel_cpp/README.md index 28ae0c39..672e71f1 100644 --- a/examples/aot/tpushpop/cube_to_vector_matmul_add/README.md +++ b/examples/aot/tpushpop/mix-kernel_cpp/README.md @@ -1,6 +1,6 @@ # Cube To Vector `TPUSH`/`TPOP` Example -This example wraps `refs/tpushpop_cv.cpp` into the same `compile.sh` + Python runner flow used by the AOT examples. +This example keeps the kernel source in the same directory as the wrapper, using `./tpushpop_cv.cpp` with the same `compile.sh` + Python runner flow used by the AOT examples. 
The kernel does: diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp similarity index 100% rename from examples/aot/tpushpop/cube_to_vector_matmul_add/caller.cpp rename to examples/aot/tpushpop/mix-kernel_cpp/caller.cpp diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp/compile.sh similarity index 93% rename from examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh rename to examples/aot/tpushpop/mix-kernel_cpp/compile.sh index 4df82d4c..df924539 100644 --- a/examples/aot/tpushpop/cube_to_vector_matmul_add/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_cpp/compile.sh @@ -2,9 +2,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" LIB_PATH="${SCRIPT_DIR}/tpushpop_cv_lib.so" +KERNEL_CPP_PATH="${KERNEL_CPP_PATH:-${SCRIPT_DIR}/tpushpop_cv.cpp}" EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS:-}" if [[ "${TPUSHPOP_SANITY_ONLY:-}" =~ ^(1|true|TRUE|yes|YES|on|ON)$ ]]; then @@ -39,7 +39,7 @@ bisheng \ --npu-arch=dav-2201 -DMEMORY_BASE \ -std=gnu++17 \ ${EXTRA_BISHENG_FLAGS} \ - -DKERNEL_CPP="\"${REPO_ROOT}/refs/tpushpop_cv.cpp\"" \ + -DKERNEL_CPP="\"${KERNEL_CPP_PATH}\"" \ "${SCRIPT_DIR}/caller.cpp" \ -o "${LIB_PATH}" diff --git a/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py b/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py similarity index 91% rename from examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py rename to examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py index 011fc1d2..4e2d468d 100644 --- a/examples/aot/tpushpop/cube_to_vector_matmul_add/run_tpushpop_cv.py +++ b/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py @@ -11,8 +11,9 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_cv_lib.so") 
DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") +DEFAULT_KERNEL_CPP = os.path.join(THIS_DIR, "tpushpop_cv.cpp") DEFAULT_FIFO_BYTES = 4 * 1024 -TOTAL_M = 16 +TOTAL_M = 128 K = 32 N = 32 INPUT_DTYPE = torch.float16 @@ -27,10 +28,13 @@ def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p: def compile_example(compile_script: str) -> None: + env = os.environ.copy() + env["KERNEL_CPP_PATH"] = DEFAULT_KERNEL_CPP subprocess.run( ["bash", compile_script], check=True, cwd=THIS_DIR, + env=env, ) @@ -126,6 +130,10 @@ def main() -> None: ) ref = reference_result(src_a, src_b, bias) out_cpu = out.cpu() + assert ref.device == out_cpu.device + torch.npu.synchronize() + torch.set_printoptions(precision=1, sci_mode=False, linewidth=250, threshold=5000) + print(ref-out_cpu) max_abs = float(torch.max(torch.abs(out_cpu - ref)).item()) mean_abs = float(torch.mean(torch.abs(out_cpu - ref)).item()) diff --git a/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp b/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp new file mode 100644 index 00000000..3f4c42b2 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp @@ -0,0 +1,297 @@ +#include +#include + +using namespace pto; + +#define VEC_CORES 2 + +using ExampleInT = half; +using ExampleOutT = float; +constexpr uint32_t EXAMPLE_TOTAL_M = 128; +constexpr uint32_t EXAMPLE_CASE_TILE_M = 16; +constexpr uint32_t EXAMPLE_TILE_K = 32; +constexpr uint32_t EXAMPLE_TILE_N = 32; + +#ifdef __DAV_CUBE__ +constexpr bool DAV_CUBE = true; +#else +constexpr bool DAV_CUBE = false; +#endif + +#ifdef __DAV_VEC__ +constexpr bool DAV_VEC = true; +#else +constexpr bool DAV_VEC = false; +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) +{ + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +#ifdef TPUSHPOP_SANITY_ONLY +__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB) +{ + constexpr uint32_t 
blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); + constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_TOTAL_M, 16); + constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); + constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); + + using GlobalA = + GlobalTensor, + pto::Stride>; + using GlobalB = + GlobalTensor, + pto::Stride>; + using GlobalOut = + GlobalTensor, + pto::Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + if constexpr (DAV_CUBE) { + TileMatA aMatTile; + TileMatB bMatTile; + LeftTile aTile; + RightTile bTile; + AccTile accTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(accTile, 0x0); + + GlobalA globalA(srcA); + GlobalB globalB(srcB); + GlobalOut globalOut(out); + + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(aMatTile, globalA); + TLOAD(bMatTile, globalB); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + TMATMUL(accTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(globalOut, accTile); + + pipe_barrier(PIPE_ALL); + } +} +#else +__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out, + __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB, + __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem) +{ + // Point the cross-core FIFO signaling ops at the FFTS flag storage used by TPUSH/TPOP handshakes. 
+ // NOTE: the FFTS base-address setup is currently disabled (commented out): set_ffts_base_addr((uint64_t)ffts_addr); + constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M; + constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES; + + constexpr uint16_t FLAG_ID = 0; + constexpr uint8_t FIFO_DEPTH = 2; + constexpr uint8_t FIFO_PERIOD = 1; + // Local ring-buffer base used by vector-side TPOP to place each popped half-tile before vector compute uses it. + constexpr uint32_t localFiFoBase = 0x0; + + using AccTile = TileAcc; + using VecTileHalf = + Tile; + using BiasTile = + Tile; + using OutTile = + Tile; + + // Cube-to-vector FIFO: each GM slot stores one full AccTile, and vector TPOP reads it back as two row halves. + using MatPipe = TPipe; + // Bind the FIFO protocol to GM slot storage and the vector-side local staging buffer used by TPOP. + MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase); + + constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); + constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_CASE_TILE_M, 16); + constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); + constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); + + using GlobalA = + GlobalTensor, + pto::Stride>; + using GlobalB = + GlobalTensor, + pto::Stride>; + using GlobalBias = + GlobalTensor, + pto::Stride>; + using GlobalOut = + GlobalTensor, + pto::Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + using LeftTile = TileLeft; + using RightTile = TileRight; + + if constexpr (DAV_CUBE) { + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile accTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(accTile, 0x0); + + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { + GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K); +
GlobalB globalB(srcB); + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + TLOAD(aMatTile, globalA); + TLOAD(bMatTile, globalB); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + + TMATMUL(accTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + // Push the full accumulator tile into the next GM FIFO slot and signal vector that one split-up-down tile is ready. + TPUSH(mPipe, accTile); + + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + } + + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } + + if constexpr (DAV_VEC) { + VecTileHalf vecTileHalf; + BiasTile biasTile; + OutTile outTile; + TASSIGN(biasTile, 0x10000); + TASSIGN(outTile, 0x20000); + + uint32_t subBlockIdx = get_subblockid(); + + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + // Pop this subcore's half-tile from the next ready FIFO slot into local vector memory based on get_subblockid(). + // TILE_UP_DOWN means split MxN tile into-> [M/2xN, M/2xN]. 
+ TPOP(mPipe, vecTileHalf); + + size_t biasOffset = + static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; + GlobalBias globalBias(bias + biasOffset); + + TLOAD(biasTile, globalBias); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + TADD(outTile, vecTileHalf, biasTile); + + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + size_t outOffset = + static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; + GlobalOut globalOut(out + outOffset); + // Store this vector subcore's output half-tile from local vector memory back to its GM output slice. + TSTORE(globalOut, outTile); + + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + } + + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + pipe_barrier(PIPE_ALL); + } +} +#endif + +void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem, + void *stream) +{ +#ifdef TPUSHPOP_SANITY_ONLY + (void)ffts; + (void)bias; + (void)fifoMem; + runSanityMatmul<<<1, nullptr, stream>>>( + reinterpret_cast(out), reinterpret_cast(srcA), reinterpret_cast(srcB)); +#else + runTPushPopMatmulAdd<<<1, nullptr, stream>>>( + reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), + reinterpret_cast(srcB), reinterpret_cast(bias), reinterpret_cast(fifoMem)); +#endif +} From 4e034359eded6af2ad30603a8e4d035146958efb Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 26 Mar 2026 17:01:02 +0000 Subject: [PATCH 05/38] feat: add mlir example[ WIP ] --- .../aot/tpushpop/mix-kernel_mlir/README.md | 17 + .../bidirectional_example.mlir | 133 +++ .../aot/tpushpop/mix-kernel_mlir/caller.cpp | 15 + .../aot/tpushpop/mix-kernel_mlir/compile.sh | 32 + .../aot/tpushpop/mix-kernel_mlir/pto_docs.md | 822 
++++++++++++++++++ .../run_bidirectional_example.py | 71 ++ 6 files changed, 1090 insertions(+) create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/README.md create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/caller.cpp create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/compile.sh create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md new file mode 100644 index 00000000..a898a57b --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/README.md @@ -0,0 +1,17 @@ +# Bidirectional `TPUSH`/`TPOP` MLIR Example + +This example mirrors the `mix-kernel_cpp` flow, but starts from +[`bidirectional_example.mlir`](./bidirectional_example.mlir). + +The pipeline is: + +1. run `ptoas --pto-arch=a3 bidirectional_example.mlir > build_artifacts/bidirectional_example.cpp` +2. compile the generated C++ together with `caller.cpp` +3. build `./tpushpop_mlir_lib.so` +4.
launch the generated `pto.entry` kernel from Python + +## Run + +```bash +python run_bidirectional_example.py +``` diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir new file mode 100644 index 00000000..01052493 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir @@ -0,0 +1,133 @@ +module { + func.func @cube_kernel(%gm_slot_buffer: i32) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %v2c_local = pto.reserve_buffer { + name = "v2c_fifo", + size = 4096, + location = #pto.address_space, + auto = true + } -> i32 + %c2v_import = pto.import_reserved_buffer { + name = "c2v_fifo", + peer_func = @vector_kernel + } -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} + (gm_slot_buffer = %gm_slot_buffer : i32, + c2v_consumer_buf = %c2v_import : i32, + v2c_consumer_buf = %v2c_local : i32) + + %acc_tile = pto.alloc_tile : !pto.tile_buf + pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} + + %mat_tile = pto.tpop_from_aiv {split = 1} + -> !pto.tile_buf + pto.tfree_from_aiv {split = 2} + return + } + + func.func @vector_kernel(%gm_slot_buffer: i32) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c2v_local = pto.reserve_buffer { + name = "c2v_fifo", + size = 4096, + location = #pto.address_space, + auto = true + } -> i32 + %v2c_import = pto.import_reserved_buffer { + name = "v2c_fifo", + peer_func = @cube_kernel + } -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} + (gm_slot_buffer = %gm_slot_buffer : i32, + c2v_consumer_buf = %c2v_local : i32, + v2c_consumer_buf = %v2c_import : i32) + + %vec_tile = pto.alloc_tile : !pto.tile_buf + pto.tpush_to_aic(%vec_tile : !pto.tile_buf) {split = 0} + + %recv_tile = pto.tpop_from_aic {split = 1} + -> !pto.tile_buf + pto.tfree_from_aic {split = 2} + return + } + + func.func @cube_kernel_nested(%gm_slot_buffer: i32) + attributes {pto.kernel_kind = #pto.kernel_kind} { + 
%true = arith.constant true + scf.if %true { + %v2c_local = pto.reserve_buffer { + name = "v2c_fifo_nested", + size = 4096, + location = #pto.address_space, + auto = true + } -> i32 + %c2v_import = pto.import_reserved_buffer { + name = "c2v_fifo_nested", + peer_func = @vector_kernel_nested + } -> i32 + pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} + (gm_slot_buffer = %gm_slot_buffer : i32, + c2v_consumer_buf = %c2v_import : i32, + v2c_consumer_buf = %v2c_local : i32) + + %acc_tile = pto.alloc_tile : !pto.tile_buf + pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} + + %recv_tile = pto.tpop_from_aiv {split = 1} + -> !pto.tile_buf + pto.tfree_from_aiv {split = 2} + } + return + } + + func.func @vector_kernel_nested(%gm_slot_buffer: i32) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %true = arith.constant true + scf.if %true { + %c2v_local = pto.reserve_buffer { + name = "c2v_fifo_nested", + size = 4096, + location = #pto.address_space, + auto = true + } -> i32 + %v2c_import = pto.import_reserved_buffer { + name = "v2c_fifo_nested", + peer_func = @cube_kernel_nested + } -> i32 + pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} + (gm_slot_buffer = %gm_slot_buffer : i32, + c2v_consumer_buf = %c2v_local : i32, + v2c_consumer_buf = %v2c_import : i32) + + %vec_tile = pto.alloc_tile : !pto.tile_buf + pto.tpush_to_aic(%vec_tile : !pto.tile_buf) {split = 0} + + %recv_tile = pto.tpop_from_aic {split = 1} + -> !pto.tile_buf + pto.tfree_from_aic {split = 2} + } + return + } +} + +// A3-LABEL: AICORE void cube_kernel( +// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>( +// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>( +// A3: TPUSH +// A3: TPOP +// A3: TFREE + +// A3-LABEL: AICORE void vector_kernel( +// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>( +// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>( +// A3: TPUSH +// A3: TPOP +// A3: TFREE + +// A3-LABEL: AICORE void 
cube_kernel_nested( +// A3: if ( +// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>( +// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>( +// A3: TPUSH +// A3: TPOP +// A3: TFREE diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp new file mode 100644 index 00000000..5926c256 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp @@ -0,0 +1,15 @@ +#ifndef KERNEL_CPP +#error "KERNEL_CPP must be defined at compile time." +#endif + +#include + +#include KERNEL_CPP + +extern "C" void call_kernel( + uint32_t blockDim, + void *stream, + uint8_t *gmSlotBuffer) +{ + bidirectional_example<<>>((__gm__ float *)gmSlotBuffer); +} diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh new file mode 100644 index 00000000..ee6376fb --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" +MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir" +GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp" +LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" + +mkdir -p "${ARTIFACT_DIR}" +rm -f "${GENERATED_CPP}" "${LIB_PATH}" + +ptoas --pto-arch=a3 "${MLIR_PATH}" > "${GENERATED_CPP}" + +bisheng \ + -I/sources/pto-isa/include/ \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ + -xcce -Xhost-start -Xhost-end \ + -mllvm -cce-aicore-stack-size=0x8000 \ + -mllvm -cce-aicore-function-stack-size=0x8000 \ + -mllvm -cce-aicore-record-overflow=true \ + -mllvm -cce-aicore-addr-transform \ + -mllvm -cce-aicore-dcci-insert-for-scalar=false \ + --npu-arch=dav-2201 -DMEMORY_BASE \ + -std=gnu++17 \ + -DKERNEL_CPP="\"${GENERATED_CPP}\"" \ + "${SCRIPT_DIR}/caller.cpp" \ + -o "${LIB_PATH}" + +echo 
"Generated ${GENERATED_CPP}." +echo "Built ${LIB_PATH}." diff --git a/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md b/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md new file mode 100644 index 00000000..394b2798 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md @@ -0,0 +1,822 @@ +# TPUSH/TPOP 前端接口与 PTOAS 实现设计 + +## 1. 文档范围 + +本文定义PTOAS TPUSH/TPOP 前端IR接口,以及其在 PTOAS 内部的 lowering、地址传播、flag 分配和 EmitC 映射规则。 + +本文覆盖两层接口: + +- 前端接口 + - `pto.aic_initialize_pipe` + - `pto.aiv_initialize_pipe` + - `pto.tpush_to_aiv` + - `pto.tpush_to_aic` + - `pto.tpop_from_aic` + - `pto.tpop_from_aiv` + - `pto.tfree_from_aic` + - `pto.tfree_from_aiv` + - `pto.reserve_buffer` + - `pto.import_reserved_buffer` +- PTOAS 内部统一接口 + - `pto.initialize_l2g2l_pipe` + - `pto.initialize_l2l_pipe` + - `pto.tpush` + - `pto.declare_tile` + - `pto.tpop` + - `pto.tfree` + +本文只描述接口契约与编译流程,不展开具体 C++ 模板实现细节。 + +## 2. 设计目标 + +本设计的目标如下: + +- 对前端提供\*\_initialize_pipe/tpush_to_\*/tpop_from_\*/tfree_from_\*IR接口。 +- 在 PTOAS 内部统一为 pipe/tpush/tpop/tfree 指令,便于复用已有 pass。 +- 支持 A2/A3 与 A5 两个平台使用同一套前端接口。 +- 定义consumer slot buffer的分配地址与producer之间的匹配关系,并传播。 + +## 3. 
前端 IR 接口定义 + +### 3.1 `pto.aic_initialize_pipe` + +#### 语义 + +由 Cube kernel 在函数启动时调用,初始化该函数涉及的通信 pipe。 + +#### 语法 + +```mlir +pto.aic_initialize_pipe( + DIR_MASK, + SLOT_SIZE, + GM_SLOT_BUFFER, + C2V_CONSUMER_BUF, + V2C_CONSUMER_BUF) +``` + +#### 参数 + +| 参数 | 类型 | 说明 | +|---|---|---| +| `DIR_MASK` | 编译期整数常量 | `1`、`2` 或 `3` | +| `SLOT_SIZE` | 编译期整数常量 | 单 slot 字节数,定义为切分前完整 tile 字节数 | +| `GM_SLOT_BUFFER` | GM 地址或空值 | A2/A3 路径使用,A5 路径为空 | +| `C2V_CONSUMER_BUF` | `i32` | C2V 方向 consumer 的 local slot buffer 基址 | +| `V2C_CONSUMER_BUF` | `i32` | V2C 方向 consumer 的 local slot buffer 基址 | + +### 3.2 `pto.aiv_initialize_pipe` + +#### 语义 + +由 Vector kernel 在函数启动时调用,初始化该函数涉及的通信 pipe。 + +#### 语法 + +```mlir +pto.aiv_initialize_pipe( + DIR_MASK, + SLOT_SIZE, + GM_SLOT_BUFFER, + C2V_CONSUMER_BUF, + V2C_CONSUMER_BUF) +``` + +参数语义与 `pto.aic_initialize_pipe` 相同。 + +### 3.3 前端数据传输接口 + +#### `pto.tpush_to_aiv` + +```mlir +pto.tpush_to_aiv(%tile) { split = 0 } +``` + +- 仅出现在 Cube kernel 中 +- 表示 C2V 方向 producer push + +#### `pto.tpush_to_aic` + +```mlir +pto.tpush_to_aic(%tile) { split = 0 } +``` + +- 仅出现在 Vector kernel 中 +- 表示 V2C 方向 producer push + +#### `pto.tpop_from_aic` + +```mlir +%tile = pto.tpop_from_aic { split = 0 } -> !pto.tile_buf<...> +``` + +- 仅出现在 Vector kernel 中 +- 表示 C2V 方向 consumer pop + +#### `pto.tpop_from_aiv` + +```mlir +%tile = pto.tpop_from_aiv { split = 0 } -> !pto.tile_buf<...> +``` + +- 仅出现在 Cube kernel 中 +- 表示 V2C 方向 consumer pop + +#### `pto.tfree_from_aic` + +```mlir +pto.tfree_from_aic { split = 0 } +``` + +- 仅出现在 Vector kernel 中 +- 表示 C2V 方向 consumer free + +#### `pto.tfree_from_aiv` + +```mlir +pto.tfree_from_aiv { split = 0 } +``` + +- 仅出现在 Cube kernel 中 +- 表示 V2C 方向 consumer free + +以上前端数据传输接口中的 `split` 均为编译期常量属性,不是运行时 SSA operand。 + +- 取值使用 `TileSplitAxis` 枚举语义:`0/1/2` 分别对应 `TILE_NO_SPLIT`、`TILE_UP_DOWN`、`TILE_LEFT_RIGHT` +- lowering 到 PTOAS 内部 IR 时,`split` 继续以属性形式保留 + +### 3.4 地址提示接口 + +#### `pto.reserve_buffer` + +用于在当前函数内声明一块 consumer slot buffer 
预留空间。其合法写法由 +当前编译流程是否启用 local address planning 决定。 + +```mlir +%buf = pto.reserve_buffer { + name = "c2v_slot_buffer", + size = 2048, + location = #pto.address_space, + auto = true +} -> i32 +``` + +或使用显式地址: + +```mlir +%buf = pto.reserve_buffer { + name = "c2v_slot_buffer", + size = 2048, + location = #pto.address_space, + auto = false, + base = 4096 +} -> i32 +``` + +#### 参数 + +| 参数 | 类型 | 说明 | +|---|---|---| +| `name` | 字符串属性 | 本函数内唯一的预留段名字 | +| `size` | 整数属性 | 预留字节数 | +| `location` | 地址空间属性 | 预留空间所在 local 地址空间 | +| `auto` | `bool` 属性 | 地址解析路径标志;`true` 表示地址由 PTOAS 地址规划路径分配,`false` 表示地址已在输入 IR 中显式给定 | +| `base` | 可选整数属性 | 显式起始地址;仅 manual 路径使用 | + +#### 结果 + +- 结果类型为 `i32` +- 结果值表示该 buffer 当前可用的基址 +- 当前可用基址可来自显式 `base`,也可来自 plan memory 回填后的解析地址 +- 在当前约束下,每个函数最多一条 `reserve_buffer` +- 编译路径与 `auto` 的合法组合只有两种: + - 启用 local address planning:`auto = true`,且不带 `base` + - 跳过 local address planning:`auto = false`,且显式提供 `base` + +#### `pto.import_reserved_buffer` + +用于引用 peer function 中已经定义的 `reserve_buffer` 结果。 + +```mlir +%buf = pto.import_reserved_buffer { + name = "c2v_slot_buffer", + peer_func = @vector_kernel +} -> i32 +``` + +#### 参数 + +| 参数 | 类型 | 说明 | +|---|---|---| +| `name` | 字符串属性 | peer 侧 `reserve_buffer` 的名字 | +| `peer_func` | symbol ref | peer 函数符号 | + +#### 结果 + +- 结果类型为 `i32` +- 结果值表示从 peer `reserve_buffer` 导入的已解析基址 + +### 3.5 前端层约束 + +前端 IR 需满足以下约束: + +- 每个 Cube function 最多一条 `pto.aic_initialize_pipe` +- 每个 Vector function 最多一条 `pto.aiv_initialize_pipe` +- 每个函数内最多一条 C2V 逻辑 pipe 和一条 V2C 逻辑 pipe +- 每个函数最多一条 `reserve_buffer` +- 每个函数最多一条 `import_reserved_buffer` +- `DIR_MASK` 只允许 `1`、`2`、`3` +- `SLOT_SIZE > 0` +- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM` +- C2V consumer 的 `reserve_buffer.location` 必须是 `VEC` +- V2C consumer 的 `reserve_buffer.location` 必须是 `MAT` +- `reserve_buffer.name` 在本函数内必须唯一 +- op 级约束:`reserve_buffer.auto = false` 时必须提供 `base` +- op 级约束:`reserve_buffer.auto = true` 时必须不提供 `base` +- 启用 local address planning 的编译流程:`reserve_buffer` 只允许 
`auto = true` +- 跳过 local address planning 的编译流程:`reserve_buffer` 只允许 `auto = false` 且显式提供 `base` +- `import_reserved_buffer` 必须能在 `peer_func` 中找到同名 `reserve_buffer` + +## 4. 核心约定 + +### 4.1 逻辑 pipe + +本文中的“逻辑 pipe”指一条单向通信通道。 + +- C2V:Cube producer -> Vector consumer +- V2C:Vector producer -> Cube consumer + +`DIR_MASK=3` 表示前端一个同时包含 C2V 和 V2C 的初始化请求,在 PTOAS lowering 后拆成两条单向逻辑 pipe: + +- 一条 `dir_mask = 1` 的 C2V pipe +- 一条 `dir_mask = 2` 的 V2C pipe + +### 4.2 `split` 的角色 + +`split` 使用 `TileSplitAxis` 枚举表达: + +- `TILE_NO_SPLIT` +- `TILE_UP_DOWN` +- `TILE_LEFT_RIGHT` + +在 PTOAS 设计中,`split` 的角色定义为: + +- `split` 是 `tpush/tpop/tfree` 的逐指令执行模式 +- `split` 在 IR 中表示为编译期常量属性,不是运行时 SSA operand +- `split` 不参与pipe 初始化 +- `split` 不参与 plan memory、地址传播、flag 分配 +- PTOAS 将 `split` 作为透明的编译期参数向 EmitC 和底层 pto-isa 透传 + +因此: + +- 同一条逻辑 pipe 上可以出现不同 `split` 的 `tpush/tpop/tfree` +- PTOAS 不要求同一逻辑 pipe 内所有指令使用同一个 `split` +- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证;PTOAS 仅校验 `split` 枚举合法并向下透传 + +### 4.3 `SLOT_SIZE` 的定义 + +`SLOT_SIZE` 的定义固定为: + +- 切分前完整 tile 的字节数 + +即使 `split` 为 `TILE_UP_DOWN` 或 `TILE_LEFT_RIGHT`,`SLOT_SIZE` 仍然表示未切分前的逻辑 tile 总字节数。 + +`split` 只影响底层 `TPUSH/TPOP/TFREE` 的执行方式,不影响 `SLOT_SIZE` 的含义。 + +### 4.4 `SLOT_NUM` 规则 + +`SLOT_NUM` 由 `DIR_MASK` 固定决定: + +- `DIR_MASK = 1` 或 `2`:`SLOT_NUM = 8` +- `DIR_MASK = 3`:拆成两条单向 pipe,且每条 `SLOT_NUM = 4` + +`SLOT_NUM` 不由 `split` 决定。 + +## 5. 
PTOAS 内部 IR 接口定义 + +### 5.1 `!pto.pipe` + +本文设计的内部 `!pto.pipe` 为不透明 handle。 + +`!pto.pipe` 的协议信息由其定义 op 上的属性承载,而不是由 type 参数承载。 + +底层 `pto-isa` 若对 `TPUSH/TPOP` 的模板形态继续演进,不反向约束 `!pto.pipe` 的 type 设计;内部 `!pto.pipe` 仍保持 opaque handle。 + +### 5.2 `pto.initialize_l2g2l_pipe` + +用于 A2/A3 路径。 + +```mlir +%pipe = pto.initialize_l2g2l_pipe { + dir_mask = 1, + slot_size = 512, + slot_num = 8, + local_slot_num = 8 +}(%gm_addr, %local_addr) -> !pto.pipe +``` + +#### 必需属性 + +- `dir_mask` +- `slot_size` +- `slot_num` + +#### 可选属性 + +- `local_slot_num` + - 仅 `initialize_l2g2l_pipe` 承载 + - 表示 GM 路径下 consumer 侧 local slot buffer 的槽数 + - 仅在通过 GM 传递时对底层 `TPipe` 模板参数有意义,不改变 GM FIFO 的 `slot_num` + - 缺省值等于该内部单向 pipe 的 `slot_num` + - 因此当前固定规则下: + - `DIR_MASK=1/2` 直接 lowering 时,`local_slot_num = 8` + - `DIR_MASK=3` 拆成两条单向 pipe 后,每条 `local_slot_num = 4` +- `flag_base` + - 由 PTOAS flag 分配阶段填写 + - frontend lowering 阶段可以缺省 + - EmitC 前必须已经解析为显式常量 + +#### 操作数 + +- `gm_addr` +- `local_addr` + +### 5.3 `pto.initialize_l2l_pipe` + +用于 A5 路径。 + +```mlir +%pipe = pto.initialize_l2l_pipe { + dir_mask = 1, + slot_size = 512, + slot_num = 8 +}(%local_addr) -> !pto.pipe +``` + +#### 必需属性 + +- `dir_mask` +- `slot_size` +- `slot_num` + +#### 可选属性 + +- `flag_base` + - 由 PTOAS flag 分配阶段填写 + - frontend lowering 阶段可以缺省 + - EmitC 前必须已经解析为显式常量 + +#### 操作数 + +- `local_addr` + +### 5.4 `pto.tpush` + +```mlir +pto.tpush(%tile, %pipe) { split = 0 } +``` + +### 5.5 `pto.declare_tile` + +```mlir +%tile = pto.declare_tile -> !pto.tile_buf<...> +``` + +### 5.6 `pto.tpop` + +```mlir +pto.tpop(%tile, %pipe) { split = 0 } +``` + +### 5.7 `pto.tfree` + +```mlir +pto.tfree(%pipe) { split = 0 } +``` + +`split` 在内部 IR 中必须以编译期常量属性形式保留,不能在 lowering 时擦除或降为运行时 operand。 + +## 6. 
前端到内部 IR 的 lowering 规则 + +### 6.1 初始化接口 lowering + +#### A2/A3 + +- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2g2l_pipe` +- 若前端未提供更具体信息,lowering 默认补上 `local_slot_num = slot_num` + +#### A5 + +- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2l_pipe` + +### 6.2 `DIR_MASK=1/2` + +- 只生成一条内部 pipe +- `slot_num = 8` +- 对 `initialize_l2g2l_pipe`,`local_slot_num = 8` + +### 6.3 `DIR_MASK=3` + +前端一个 init op 固定拆成两条内部 pipe: + +- `%pipe_c2v`:`dir_mask = 1`,`slot_num = 4` +- `%pipe_v2c`:`dir_mask = 2`,`slot_num = 4` + +若 lowering 为 `initialize_l2g2l_pipe`,则两条内部 pipe 还满足: + +- `%pipe_c2v`:`local_slot_num = 4` +- `%pipe_v2c`:`local_slot_num = 4` + +地址选择规则: + +- `%pipe_c2v` 使用 `C2V_CONSUMER_BUF` +- `%pipe_v2c` 使用 `V2C_CONSUMER_BUF` + +### 6.4 前端数据传输 op 与内部 pipe 的绑定 + +绑定规则固定如下: + +| 前端 op | 所在函数 | 方向 | 使用的内部 pipe | +|---|---|---|---| +| `tpush_to_aiv` | Cube | C2V | `dir_mask = 1` | +| `tpop_from_aic` | Vector | C2V | `dir_mask = 1` | +| `tfree_from_aic` | Vector | C2V | `dir_mask = 1` | +| `tpush_to_aic` | Vector | V2C | `dir_mask = 2` | +| `tpop_from_aiv` | Cube | V2C | `dir_mask = 2` | +| `tfree_from_aiv` | Cube | V2C | `dir_mask = 2` | + +### 6.5 数据传输 op lowering + +#### `tpush_to_aiv` / `tpush_to_aic` + +lower 为: + +```mlir +pto.tpush(%tile, %pipe) { split = 0 } +``` + +#### `tpop_from_aic` / `tpop_from_aiv` + +lower 为: + +```mlir +%decl = pto.declare_tile -> !pto.tile_buf<...> +pto.tpop(%decl, %pipe) { split = 0 } +``` + +即: + +- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv` 是返回 tile 结果值的接口 +- PTOAS 内部 `pto.tpop` 才是 destination-style 形式,显式接收一个 `pto.declare_tile` 结果作为入参 + +#### `tfree_from_aic` / `tfree_from_aiv` + +lower 为: + +```mlir +pto.tfree(%pipe) { split = 0 } +``` + +## 7. 
`reserve_buffer` 与地址传播 + +### 7.1 设计原则 + +- `reserve_buffer` 只表示本函数 consumer slot buffer 的本地预留 +- `import_reserved_buffer` 只表示对 peer 预留段地址的引用 +- `reserve_buffer` 用属性描述“如何得到地址”,用结果值统一承载“当前可用地址” +- 当前编译流程是否启用 local address planning 与 `reserve_buffer.auto` 共同决定地址处理路径 +- 启用 local address planning:`reserve_buffer` 必须使用 `auto = true`,由 `PlanMemory` 分配地址 +- 跳过 local address planning:`reserve_buffer` 必须使用 `auto = false` 且显式提供 `base`,不再进入 `PlanMemory` 分配路径 +- PTOAS 复用现有 `PlanMemory` pass 实现 `reserve_buffer` 地址确定,不额外增加独立的预分配 pass +- PTOAS 新增独立地址传播 pass,专门处理 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐 +- 地址传播 pass 在 EmitC 之前运行;启用规划时位于 plan memory 之后,跳过规划时直接消费前端已给定地址 + +### 7.2 使用规则 + +#### C2V + +- consumer 是 Vector +- Vector function 需要 `reserve_buffer(location = VEC)` +- Cube function 需要 `import_reserved_buffer(peer_func = @vector_kernel)` + +#### V2C + +- consumer 是 Cube +- Cube function 需要 `reserve_buffer(location = MAT)` +- Vector function 需要 `import_reserved_buffer(peer_func = @cube_kernel)` + +### 7.3 编译路径与地址处理路径 + +对包含 `reserve_buffer` 的函数,PTOAS 按当前编译流程是否启用 local address planning 以及 `auto` 的组合选择地址处理路径: + +- 启用 local address planning + `auto = true` + - 进入 auto 路径 + - 由 `PlanMemory` 为 `reserve_buffer` 分配 `base` + - 随后由 `pto-resolve-reserved-buffers` 传播地址并完成 peer `flag_base` 对齐 +- 跳过 local address planning + `auto = false` + 显式 `base` + - 进入 manual 路径 + - 跳过 `PlanMemory` + - 由 `pto-resolve-reserved-buffers` 直接传播已给定地址并完成 peer `flag_base` 对齐 + +以下组合均非法: + +- 启用 local address planning + `auto = false` +- 跳过 local address planning + `auto = true` + +若函数内不存在 `reserve_buffer`,则保持现有编译流程对 `PlanMemory` 的原始控制行为,不引入额外语义。 + +### 7.4 启用 local address planning 的 auto 路径 + +在启用 local address planning 的编译流程中,`reserve_buffer` 必须使用 `auto = true`,并由 plan memory 负责地址分配。 + +若函数中存在 `reserve_buffer`,则对其 `location` 对应的地址空间执行: + +1. 先按现有逻辑完成普通 local buffer 的 `MemPlan` +2. 再收集该地址空间内已经分配完成的 local 区间 +3. 在剩余空洞中按地址空间对齐要求寻找一段可容纳 `reserve_buffer.size` 的连续区间 +4. 
将该区间起始地址回填为这条唯一 `reserve_buffer` 的 `base` + +即: + +- 普通 `memref.alloc` / tile buffer 等 local 内存仍先由既有 `MemPlan` 按原逻辑分配 +- `reserve_buffer` 不参与普通 local buffer 的 inplace / reuse 规划 +- `reserve_buffer` 在普通 local buffer 分配完成后,再作为独立的一段连续 local 区间进行 hole 分配 +- `reserve_buffer` 不保证位于地址空间起始地址,也不保证形成预留前缀;其语义仅为“在该地址空间中为 consumer slot buffer 找到一段对齐且连续的可用地址” +- 若整体容量足够但 `MemPlan` 结果将空间打散,导致不存在满足大小和对齐要求的连续空洞,则 `reserve_buffer` 分配失败并报错 + +### 7.5 跳过 local address planning 的 manual 路径 + +在跳过 local address planning 的编译流程中: + +- 每个 `reserve_buffer` 必须显式提供 `base` +- PTOAS 只校验 `base` 的基本合法性 +- `PlanMemory` 不参与该函数的 local 地址分配 +- 因此该函数中其他 local buffer 地址也必须已由前端或更前阶段整体确定 +- 地址传播 pass 不做地址分配,只将显式 `base` 传播到 `import_reserved_buffer` + +该 manual 路径的目标是: + +- 保持前端或外部地址规划结果不被 PTOAS 改写 +- 避免 `reserve_buffer` 显式地址与 PTOAS 自动规划结果相互覆盖 + +### 7.6 `import_reserved_buffer` 规则 + +- 不做地址分配 + +### 7.7 地址传播 pass 规则 + +对每个 `import_reserved_buffer`: + +1. 通过 `peer_func` 找到 peer 函数 +2. 在 peer 函数内查找同名 `reserve_buffer` +3. 读取对方已经解析出的 `base` 或其等价结果值 +4. 用该常量地址替换 `import_reserved_buffer` 的结果 + +地址传播完成后: + +- producer 与 consumer 对同一逻辑 pipe 使用同一个 local buffer 地址 +- EmitC 只处理解析后的常量地址,不处理 `import_reserved_buffer` + +#### 7.7.1 pass 落点 + +- PTOAS 增加独立 `ModulePass`:`pto-resolve-reserved-buffers` +- 该 pass 固定运行在 EmitC lowering 之前 +- 启用规划时:运行在 `pto-plan-memory` 之后 +- 跳过规划时:不经过 `pto-plan-memory`,但该 pass 仍会运行 +- 该 pass 不负责地址分配,只消费前一阶段已经确定的 `reserve_buffer.base` + +#### 7.7.2 输入假设 + +- 启用规划时,`reserve_buffer.auto = true`,其 `base` 已由 `PlanMemory` 回填 +- 跳过规划时,`reserve_buffer.auto = false`,其 `base` 已由前端显式给定 +- `import_reserved_buffer.peer_func` 已能解析到合法 peer function +- `import_reserved_buffer.name` 已能在 peer function 中找到唯一匹配的 `reserve_buffer` + +#### 7.7.3 实现流程 + +pass 在模块级按两步执行: + +1. 先建立 peer 对应关系 +2. 
再将 `reserve_buffer` / `import_reserved_buffer` 物化为显式常量地址 + +其中第一步的实现方式是: + +- 遍历模块内所有 `pto.initialize_l2l_pipe` / `pto.initialize_l2g2l_pipe` +- 若其 `local_addr` 来自 `reserve_buffer`,则以“当前函数 + reserve 名字 + dir_mask”识别逻辑 pipe +- 若其 `local_addr` 来自 `import_reserved_buffer`,则以“peer_func + reserve 名字 + dir_mask”识别逻辑 pipe +- 将 peer 两侧引用到同一逻辑 pipe 的内部 init op 归并到同一组 +- 若某条 init 未显式提供 `flag_base`,则其 `local_addr` 必须来自 `reserve_buffer` 或 `import_reserved_buffer` +- 对每个逻辑 pipe 分组,要求必须形成完整 peer init pair:恰好两条 init,且分别来自 peer 两侧函数;若 peer 信息不完整则直接报错 +- 在同一组内,若任一侧已显式提供 `flag_base`,则该值作为该组最终值;若两侧显式值冲突则报错 +- 若同组两侧都未显式提供 `flag_base`,则按默认规则回填: + - 单向场景:`flag_base = 0` + - 双向场景:C2V 组 `flag_base = 0`,V2C 组 `flag_base = 2` +- 所谓“双向场景”,是指同一对 peer 函数之间同时存在 `dir_mask = 1` 和 `dir_mask = 2` 两个逻辑 pipe 分组 +- 完成分组决策后,将最终 `flag_base` 回填到该组内所有尚未显式填写的 init op,保证 peer 两侧一致 + +第二步的实现方式是: + +- 对每个 `reserve_buffer`,读取其已解析 `base` +- 在该 op 位置插入 `arith.constant` +- 用该常量替换 `reserve_buffer` 结果值的全部 uses +- 对每个 `import_reserved_buffer`,通过 `peer_func + name` 找到 peer `reserve_buffer` +- 读取对方已解析 `base` +- 在当前 op 位置插入同值 `arith.constant` +- 用该常量替换 `import_reserved_buffer` 结果值的全部 uses +- 常量替换完成后,删除 `reserve_buffer` / `import_reserved_buffer` + +#### 7.7.4 结果 IR 形态 + +地址传播 pass 之后: + +- IR 中不再保留 `reserve_buffer` / `import_reserved_buffer` +- 内部 pipe init op 的 `local_addr` 只再引用普通 SSA 常量地址 +- 因而后续 EmitC 无需理解 frontend 预留地址语义,只需透传解析后的地址值 + +#### 7.7.5 失败条件 + +若出现以下情况,pass 直接报错: + +- `reserve_buffer.base` 在 pass 运行时仍未解析 +- 启用规划的编译流程却出现 `reserve_buffer.auto = false` +- 跳过规划的编译流程却出现 `reserve_buffer.auto = true` +- `peer_func` 无法解析到函数 +- 在 peer function 中找不到同名 `reserve_buffer` +- 某条未显式提供 `flag_base` 的内部 init,其 `local_addr` 不来自 `reserve_buffer` / `import_reserved_buffer` +- 基于 `reserve_buffer` / `import_reserved_buffer` 建立的某个逻辑 pipe 分组,未形成完整 peer init pair +- peer `flag_base` 已显式给定但两侧取值冲突 + +## 8. 
flag 分配规则 + +### 8.1 总原则 + +- `flag_base` 由 PTOAS flag 分配阶段在内部 init op 上填写 +- 在 flag 分配完成前,内部 init op 可以暂时不携带 `flag_base` +- peer 两侧同一逻辑 pipe 必须使用同一个 `flag_base` + +### 8.2 单向场景 + +当前规划中,当 `DIR_MASK = 1` 或 `2` 且函数内仅有该唯一逻辑 pipe 时,可采用: + +- 该方向唯一逻辑 pipe 的 `flag_base = 0` +- 该 pipe 占用逻辑 flag 对:`0` 和 `1` + +### 8.3 双向场景 + +当前规划中,当 `DIR_MASK = 3` 时,可采用: + +- C2V pipe:`flag_base = 0` +- V2C pipe:`flag_base = 2` + +因此双向固定占用两组逻辑 flag: + +- C2V:`0` / `1` +- V2C:`2` / `3` + +### 8.4 与地址传播的关系 + +地址传播 pass 在识别出 `import_reserved_buffer` 与 `reserve_buffer` 的 peer 对应关系后,同时可以完成 peer pipe 的 `flag_base` 对齐。 + +即: + +- 基于同一 FIFO 通信的两条 peer init op,必须拿到相同的 `flag_base` + +## 9. verifier 规则 + +### 9.1 前端 verifier + +前端 verifier 负责检查: + +- 每个函数 init op 数量是否合法 +- 每个函数 `reserve_buffer` / `import_reserved_buffer` 数量是否合法 +- `DIR_MASK` 取值是否合法 +- `SLOT_SIZE > 0` +- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM` +- `reserve_buffer.location` 与 consumer 函数类型匹配 +- `reserve_buffer.name` 在函数内唯一 +- `reserve_buffer.auto = false` 时必须带 `base` +- `reserve_buffer.auto = true` 时必须不带 `base` +- driver / pipeline 级约束:启用规划的编译流程只接受 `auto = true` +- driver / pipeline 级约束:跳过规划的编译流程只接受 `auto = false` 且显式 `base` +- `import_reserved_buffer` 能在 `peer_func` 中找到同名 `reserve_buffer` +- 方向相关 op 只能出现在合法 kernel 中 +- 前端数据传输 op 的 `split` 必须是合法的编译期常量属性 + +### 9.2 内部 IR verifier + +内部 verifier 负责检查: + +- `slot_size > 0` +- `slot_num` 只允许 `8` 或 `4` +- `DIR_MASK=1/2` 时,`slot_num` 必须与单向/双向 lowering 规则一致 +- `local_slot_num` 若出现,只允许出现在 `pto.initialize_l2g2l_pipe` 上,且必须大于 `0` 且不大于 `slot_num` +- `flag_base` 若出现,必须满足基本合法性;是否已填写以及具体分配值由 flag 分配保证 +- `pto.initialize_l2g2l_pipe` 必须提供 `gm_addr` 和 `local_addr` +- `pto.initialize_l2l_pipe` 必须提供 `local_addr` +- `dir_mask = 1` 的 pipe 只能被 C2V 方向 lowering 使用 +- `dir_mask = 2` 的 pipe 只能被 V2C 方向 lowering 使用 +- `tpush/tpop/tfree` 的 `split` 必须是合法的编译期常量属性 + +### 9.3 关于 `split` 的校验边界 + +PTOAS 对 `split` 的处理边界如下: + +- PTOAS 验证 `split` 是合法枚举值 +- PTOAS 要求 `split` 以编译期常量属性形式出现 +- PTOAS 不验证同一逻辑 pipe 上多个 
`tpush/tpop/tfree` 的 `split` 是否一致 +- PTOAS 不根据 `split` 改变地址分配、flag 分配或 pipe 配对 + +因此: + +- `split` 混用是否语义正确,不是 PTOAS 静态保证项 +- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证 +- PTOAS 只负责校验 `split` 枚举值合法,并将其透传到底层 + +## 10. EmitC 与 pto-isa 映射 + +### 10.1 初始化 op + +在进入 EmitC 前: + +- 前端 `pto.aic_initialize_pipe` / `pto.aiv_initialize_pipe` +- 前端 `pto.tpush_to_aiv` / `pto.tpush_to_aic` +- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv` +- 前端 `pto.tfree_from_aic` / `pto.tfree_from_aiv` +- `pto.reserve_buffer` / `pto.import_reserved_buffer` + +都必须已经被前序 pass 消除。 + +EmitC 只处理 PTOAS 内部统一 IR,不直接理解前端 pipe 接口或地址提示接口。 + +EmitC 将以下内部 init op 映射到底层 `TPipe`: + +- `pto.initialize_l2l_pipe` +- `pto.initialize_l2g2l_pipe` + +映射时需要使用以下信息: + +- `dir_mask` +- `slot_size` +- `slot_num` +- `local_slot_num` +- `flag_base` +- `gm_addr` +- `local_addr` + +其中: + +- 若 `flag_base` 尚未在 EmitC 前完成填写,PTOAS 应报错。 + +### 10.2 数据传输 op + +EmitC 将以下内部数据传输 op 映射到底层: + +- `pto.tpush` -> `TPUSH` +- `pto.tpop` -> `TPOP` +- `pto.tfree` -> `TFREE` + +映射时需要使用以下信息: + +- `tile` +- `split` +- `pipe` + +其中: + +- `split` 不在 PTOAS 内部解释 +- `split` 作为底层 `TPUSH/TPOP/TFREE` 的编译期模板实参透传 + +### 10.3 InsertSync + +`split` 不影响 PTOAS 中的 pipeline derivation 与 InsertSync 规则。 + +InsertSync 只依赖: + +- op 种类 +- init op 形态 +- `dir_mask` +- 目标架构 + +而不依赖 `split`。 + +## 11. 
编译流程总览 + +完整流程如下: + +```text +前端 IR 接口 + -> lowering pass + -> PTOAS 内部统一 IR + -> plan memory + -> 地址传播 pass + -> EmitC + -> pto-isa C++ 代码 +``` + +其中: + +- lowering pass 负责拆分 `DIR_MASK=3`、绑定方向与 pipe +- 启用规划的编译流程中,plan memory 先按既有逻辑规划普通 local buffer,再为 `reserve_buffer` 在目标地址空间中分配 hole +- 跳过规划的编译流程中,不运行 plan memory;`reserve_buffer.base` 必须已由前端给定 +- 地址传播 pass 负责 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐 +- EmitC 只负责将内部 `initialize_l2l_pipe` / `initialize_l2g2l_pipe` / `tpush` / `tpop` / `tfree` 及其属性透传到底层 \ No newline at end of file diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py new file mode 100644 index 00000000..b5db9833 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py @@ -0,0 +1,71 @@ +import ctypes +import os +import subprocess + +import torch +import torch_npu # noqa: F401 + +from ptodsl.test_util import get_test_device + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so") +DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") +DEFAULT_FIFO_BYTES = 4 * 1024 + + +def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p: + return ctypes.c_void_p(tensor.data_ptr()) + + +def compile_example(compile_script: str) -> None: + subprocess.run( + ["bash", compile_script], + check=True, + cwd=THIS_DIR, + ) + + +def load_lib(lib_path: str) -> ctypes.CDLL: + lib = ctypes.CDLL(lib_path) + lib.call_kernel.argtypes = [ + ctypes.c_uint32, + ctypes.c_void_p, + ctypes.c_void_p, + ] + lib.call_kernel.restype = None + return lib + + +def make_gm_slot_buffer(*, fifo_bytes: int, device: str) -> torch.Tensor: + fifo_elems = max(1, (fifo_bytes + 3) // 4) + return torch.zeros((fifo_elems,), dtype=torch.float32, device=device) + + +def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor) -> None: + stream_ptr = 
torch.npu.current_stream()._as_parameter_ + lib.call_kernel( + 1, + stream_ptr, + torch_to_ctypes(gm_slot_buffer), + ) + torch.npu.synchronize() + + +def main() -> None: + compile_example(DEFAULT_COMPILE_SCRIPT) + + device = get_test_device() + torch.npu.set_device(device) + + lib = load_lib(DEFAULT_LIB_PATH) + gm_slot_buffer = make_gm_slot_buffer( + fifo_bytes=DEFAULT_FIFO_BYTES, + device=device, + ) + + run_kernel(lib, gm_slot_buffer=gm_slot_buffer) + print(f"Launched bidirectional_example using {DEFAULT_LIB_PATH}.") + + +if __name__ == "__main__": + main() From da15c6d80429a83e4dcdb46c1b63591a70472f1a Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 26 Mar 2026 17:01:55 +0000 Subject: [PATCH 06/38] feat: add gitignore --- examples/aot/tpushpop/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 examples/aot/tpushpop/.gitignore diff --git a/examples/aot/tpushpop/.gitignore b/examples/aot/tpushpop/.gitignore new file mode 100644 index 00000000..ab5698d1 --- /dev/null +++ b/examples/aot/tpushpop/.gitignore @@ -0,0 +1 @@ +msprof_res/ \ No newline at end of file From 041625f818a1720b0b7c3790c92e7007fe98d477 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 10:37:20 +0000 Subject: [PATCH 07/38] feat: simple bidirectional transfer working in mlir --- .../bidirectional_example.mlir | 140 ++++++------------ .../aot/tpushpop/mix-kernel_mlir/caller.cpp | 2 +- 2 files changed, 49 insertions(+), 93 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir index 01052493..849f8d37 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir @@ -1,6 +1,37 @@ +// Bidirectional pipe example. 
+// +// There are two logical FIFO pipes: +// - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel` +// - `v2c_fifo`: vector/kernel `@vector_kernel` pushes to cube/kernel `@cube_kernel` +// +// `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import +// ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe` +// binds those FIFO endpoints to the shared GM slot buffer plus each side's local +// consumer buffer. +// +// What is transferred: +// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile via `pto.tpush_to_aiv` +// with `split = 0` (no split). Vector receives that same logical `16 x 16` tile +// with `pto.tpop_from_aic`, but in a vector tile type/layout. +// - Vector -> Cube: one full `16 x 16` `f32` vector tile via `pto.tpush_to_aic` +// with `split = 0` (no split). Cube receives that same logical `16 x 16` tile +// with `pto.tpop_from_aiv`, but in a matrix tile type/layout. +// +// Shape summary: +// - All transferred tiles are `rows=16, cols=16, dtype=f32` +// - Cube-produced tile: `loc=acc`, `blayout=col_major`, `slayout=row_major` +// - Vector-produced tile: `loc=vec`, `blayout=row_major`, `slayout=none_box` +// - Cube-consumed tile after V2C pop: `loc=mat`, `blayout=col_major`, `slayout=row_major` +// - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box` module { - func.func @cube_kernel(%gm_slot_buffer: i32) - attributes {pto.kernel_kind = #pto.kernel_kind} { + + func.func @call_both(%gm_slot_buffer: !pto.ptr) attributes {pto.entry} { + func.call @cube_kernel(%gm_slot_buffer) : (!pto.ptr) -> () + func.call @vector_kernel(%gm_slot_buffer) : (!pto.ptr) -> () + return + } + + func.func @cube_kernel(%gm_slot_buffer: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { %v2c_local = pto.reserve_buffer { name = "v2c_fifo", size = 4096, @@ -12,20 +43,22 @@ module { peer_func = @vector_kernel } -> i32 pto.aic_initialize_pipe {dir_mask = 3, 
slot_size = 1024} - (gm_slot_buffer = %gm_slot_buffer : i32, + (gm_slot_buffer = %gm_slot_buffer : !pto.ptr, c2v_consumer_buf = %c2v_import : i32, v2c_consumer_buf = %v2c_local : i32) %acc_tile = pto.alloc_tile : !pto.tile_buf pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} - %mat_tile = pto.tpop_from_aiv {split = 1} - -> !pto.tile_buf - pto.tfree_from_aiv {split = 2} + %mat_tile = pto.tpop_from_aiv {split = 0} + -> !pto.tile_buf + %left_tile = pto.alloc_tile : !pto.tile_buf + pto.tmov ins(%mat_tile : !pto.tile_buf) outs(%left_tile : !pto.tile_buf) + pto.tfree_from_aiv {split = 0} return } - func.func @vector_kernel(%gm_slot_buffer: i32) + func.func @vector_kernel(%gm_slot_buffer: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { %c2v_local = pto.reserve_buffer { name = "c2v_fifo", @@ -38,96 +71,19 @@ module { peer_func = @cube_kernel } -> i32 pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} - (gm_slot_buffer = %gm_slot_buffer : i32, + (gm_slot_buffer = %gm_slot_buffer : !pto.ptr, c2v_consumer_buf = %c2v_local : i32, v2c_consumer_buf = %v2c_import : i32) - %vec_tile = pto.alloc_tile : !pto.tile_buf - pto.tpush_to_aic(%vec_tile : !pto.tile_buf) {split = 0} - - %recv_tile = pto.tpop_from_aic {split = 1} - -> !pto.tile_buf - pto.tfree_from_aic {split = 2} - return - } - - func.func @cube_kernel_nested(%gm_slot_buffer: i32) - attributes {pto.kernel_kind = #pto.kernel_kind} { - %true = arith.constant true - scf.if %true { - %v2c_local = pto.reserve_buffer { - name = "v2c_fifo_nested", - size = 4096, - location = #pto.address_space, - auto = true - } -> i32 - %c2v_import = pto.import_reserved_buffer { - name = "c2v_fifo_nested", - peer_func = @vector_kernel_nested - } -> i32 - pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} - (gm_slot_buffer = %gm_slot_buffer : i32, - c2v_consumer_buf = %c2v_import : i32, - v2c_consumer_buf = %v2c_local : i32) - - %acc_tile = pto.alloc_tile : !pto.tile_buf - pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) 
{split = 0} + %vec_tile = pto.alloc_tile : !pto.tile_buf + pto.tpush_to_aic(%vec_tile : !pto.tile_buf) {split = 0} - %recv_tile = pto.tpop_from_aiv {split = 1} - -> !pto.tile_buf - pto.tfree_from_aiv {split = 2} - } + %recv_tile = pto.tpop_from_aic {split = 0} + -> !pto.tile_buf + %neg_tile = pto.alloc_tile : !pto.tile_buf + pto.tneg ins(%recv_tile : !pto.tile_buf) outs(%neg_tile : !pto.tile_buf) + pto.tfree_from_aic {split = 0} return } - func.func @vector_kernel_nested(%gm_slot_buffer: i32) - attributes {pto.kernel_kind = #pto.kernel_kind} { - %true = arith.constant true - scf.if %true { - %c2v_local = pto.reserve_buffer { - name = "c2v_fifo_nested", - size = 4096, - location = #pto.address_space, - auto = true - } -> i32 - %v2c_import = pto.import_reserved_buffer { - name = "v2c_fifo_nested", - peer_func = @cube_kernel_nested - } -> i32 - pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} - (gm_slot_buffer = %gm_slot_buffer : i32, - c2v_consumer_buf = %c2v_local : i32, - v2c_consumer_buf = %v2c_import : i32) - - %vec_tile = pto.alloc_tile : !pto.tile_buf - pto.tpush_to_aic(%vec_tile : !pto.tile_buf) {split = 0} - - %recv_tile = pto.tpop_from_aic {split = 1} - -> !pto.tile_buf - pto.tfree_from_aic {split = 2} - } - return - } } - -// A3-LABEL: AICORE void cube_kernel( -// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>( -// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>( -// A3: TPUSH -// A3: TPOP -// A3: TFREE - -// A3-LABEL: AICORE void vector_kernel( -// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>( -// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>( -// A3: TPUSH -// A3: TPOP -// A3: TFREE - -// A3-LABEL: AICORE void cube_kernel_nested( -// A3: if ( -// A3: auto {{v[0-9]+}} = TPipe<0, Direction::DIR_C2V, 1024, 4, 4>( -// A3: auto {{v[0-9]+}} = TPipe<2, Direction::DIR_V2C, 1024, 4, 4>( -// A3: TPUSH -// A3: TPOP -// A3: TFREE diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp 
b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp index 5926c256..8406b32d 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp +++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp @@ -11,5 +11,5 @@ extern "C" void call_kernel( void *stream, uint8_t *gmSlotBuffer) { - bidirectional_example<<>>((__gm__ float *)gmSlotBuffer); + call_both<<>>((__gm__ float *)gmSlotBuffer); } From 5137291c30671bf27bd341bbea12b626a21b45a5 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 10:49:21 +0000 Subject: [PATCH 08/38] feat: now does simple add --- .../mix-kernel_mlir/bidirectional_example.mlir | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir index 849f8d37..5f209333 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir @@ -13,14 +13,15 @@ // - Cube -> Vector: one full `16 x 16` `f32` accumulator tile via `pto.tpush_to_aiv` // with `split = 0` (no split). Vector receives that same logical `16 x 16` tile // with `pto.tpop_from_aic`, but in a vector tile type/layout. -// - Vector -> Cube: one full `16 x 16` `f32` vector tile via `pto.tpush_to_aic` -// with `split = 0` (no split). Cube receives that same logical `16 x 16` tile -// with `pto.tpop_from_aiv`, but in a matrix tile type/layout. +// - Vector -> Cube: the doubled version of that received tile. Vector computes +// `recv_tile + recv_tile` with `pto.tadd`, then sends that full `16 x 16` `f32` +// result back with `pto.tpush_to_aic`. Cube receives it with `pto.tpop_from_aiv` +// in a matrix tile type/layout. 
// // Shape summary: // - All transferred tiles are `rows=16, cols=16, dtype=f32` // - Cube-produced tile: `loc=acc`, `blayout=col_major`, `slayout=row_major` -// - Vector-produced tile: `loc=vec`, `blayout=row_major`, `slayout=none_box` +// - Vector-produced return tile: `loc=vec`, `blayout=row_major`, `slayout=none_box` // - Cube-consumed tile after V2C pop: `loc=mat`, `blayout=col_major`, `slayout=row_major` // - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box` module { @@ -75,14 +76,12 @@ module { c2v_consumer_buf = %c2v_local : i32, v2c_consumer_buf = %v2c_import : i32) - %vec_tile = pto.alloc_tile : !pto.tile_buf - pto.tpush_to_aic(%vec_tile : !pto.tile_buf) {split = 0} - %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %neg_tile = pto.alloc_tile : !pto.tile_buf - pto.tneg ins(%recv_tile : !pto.tile_buf) outs(%neg_tile : !pto.tile_buf) + %sum_tile = pto.alloc_tile : !pto.tile_buf + pto.tadd ins(%recv_tile, %recv_tile : !pto.tile_buf, !pto.tile_buf) outs(%sum_tile : !pto.tile_buf) pto.tfree_from_aic {split = 0} + pto.tpush_to_aic(%sum_tile : !pto.tile_buf) {split = 0} return } From 6210864e2f09b941d7b0f7cf0d7a08498cbbfc33 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 14:02:32 +0000 Subject: [PATCH 09/38] feat: clean working version simple --- .../bidirectional_example.mlir | 81 ++++++++++--------- .../aot/tpushpop/mix-kernel_mlir/caller.cpp | 6 +- .../aot/tpushpop/mix-kernel_mlir/compile.sh | 8 +- .../run_bidirectional_example.py | 33 +++++++- 4 files changed, 79 insertions(+), 49 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir index 5f209333..d5eab7fb 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir @@ -1,87 +1,88 @@ // Bidirectional pipe 
example. // -// There are two logical FIFO pipes: +// This reduced version only uses the C2V pipe: // - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel` -// - `v2c_fifo`: vector/kernel `@vector_kernel` pushes to cube/kernel `@cube_kernel` // // `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import // ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe` // binds those FIFO endpoints to the shared GM slot buffer plus each side's local // consumer buffer. // +// End-to-end data flow: +// - Cube loads one input matrix `X` from GM. +// - Cube computes `Y = X @ X`. +// - Cube sends that accumulator tile to vector over `c2v_fifo`. +// - Vector pops the tile and stores it to GM as output matrix `Y`. +// // What is transferred: -// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile via `pto.tpush_to_aiv` -// with `split = 0` (no split). Vector receives that same logical `16 x 16` tile -// with `pto.tpop_from_aic`, but in a vector tile type/layout. -// - Vector -> Cube: the doubled version of that received tile. Vector computes -// `recv_tile + recv_tile` with `pto.tadd`, then sends that full `16 x 16` `f32` -// result back with `pto.tpush_to_aic`. Cube receives it with `pto.tpop_from_aiv` -// in a matrix tile type/layout. +// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile `Y = X @ X` +// sent with `pto.tpush_to_aiv` using `split = 0` (no split). Vector receives +// that same logical `16 x 16` tile with `pto.tpop_from_aic` in a vector tile +// type/layout, then stores it to the GM output buffer. 
// // Shape summary: // - All transferred tiles are `rows=16, cols=16, dtype=f32` -// - Cube-produced tile: `loc=acc`, `blayout=col_major`, `slayout=row_major` -// - Vector-produced return tile: `loc=vec`, `blayout=row_major`, `slayout=none_box` -// - Cube-consumed tile after V2C pop: `loc=mat`, `blayout=col_major`, `slayout=row_major` +// - Cube-produced C2V tile: `loc=acc`, `blayout=col_major`, `slayout=row_major` // - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box` module { - func.func @call_both(%gm_slot_buffer: !pto.ptr) attributes {pto.entry} { - func.call @cube_kernel(%gm_slot_buffer) : (!pto.ptr) -> () - func.call @vector_kernel(%gm_slot_buffer) : (!pto.ptr) -> () + func.func @call_both(%gm_slot_buffer: !pto.ptr, %gm_x: !pto.ptr, %gm_y: !pto.ptr) attributes {pto.entry} { + func.call @cube_kernel(%gm_slot_buffer, %gm_x) : (!pto.ptr, !pto.ptr) -> () + func.call @vector_kernel(%gm_slot_buffer, %gm_y) : (!pto.ptr, !pto.ptr) -> () return } - func.func @cube_kernel(%gm_slot_buffer: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { - %v2c_local = pto.reserve_buffer { - name = "v2c_fifo", - size = 4096, - location = #pto.address_space, - auto = true - } -> i32 + func.func @cube_kernel(%gm_slot_buffer: !pto.ptr, %gm_x: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index %c2v_import = pto.import_reserved_buffer { name = "c2v_fifo", peer_func = @vector_kernel } -> i32 - pto.aic_initialize_pipe {dir_mask = 3, slot_size = 1024} + %c0_i32 = arith.constant 0 : i32 + pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024} (gm_slot_buffer = %gm_slot_buffer : !pto.ptr, c2v_consumer_buf = %c2v_import : i32, - v2c_consumer_buf = %v2c_local : i32) + v2c_consumer_buf = %c0_i32 : i32) + %x_mat_tile = pto.alloc_tile : !pto.tile_buf + %x_left_tile = pto.alloc_tile : !pto.tile_buf + %x_right_tile = pto.alloc_tile : 
!pto.tile_buf %acc_tile = pto.alloc_tile : !pto.tile_buf + %gm_x_view = pto.make_tensor_view %gm_x, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view + %gm_x_tile_view = pto.partition_view %gm_x_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32> + pto.tload ins(%gm_x_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%x_mat_tile : !pto.tile_buf) + pto.tmov ins(%x_mat_tile : !pto.tile_buf) outs(%x_left_tile : !pto.tile_buf) + pto.tmov ins(%x_mat_tile : !pto.tile_buf) outs(%x_right_tile : !pto.tile_buf) + pto.tmatmul ins(%x_left_tile, %x_right_tile : !pto.tile_buf, !pto.tile_buf) outs(%acc_tile : !pto.tile_buf) pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} - - %mat_tile = pto.tpop_from_aiv {split = 0} - -> !pto.tile_buf - %left_tile = pto.alloc_tile : !pto.tile_buf - pto.tmov ins(%mat_tile : !pto.tile_buf) outs(%left_tile : !pto.tile_buf) - pto.tfree_from_aiv {split = 0} return } - func.func @vector_kernel(%gm_slot_buffer: !pto.ptr) + func.func @vector_kernel(%gm_slot_buffer: !pto.ptr, %gm_y: !pto.ptr) attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index %c2v_local = pto.reserve_buffer { name = "c2v_fifo", size = 4096, location = #pto.address_space, auto = true } -> i32 - %v2c_import = pto.import_reserved_buffer { - name = "v2c_fifo", - peer_func = @cube_kernel - } -> i32 - pto.aiv_initialize_pipe {dir_mask = 3, slot_size = 1024} + %c0_i32 = arith.constant 0 : i32 + pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} (gm_slot_buffer = %gm_slot_buffer : !pto.ptr, c2v_consumer_buf = %c2v_local : i32, - v2c_consumer_buf = %v2c_import : i32) + v2c_consumer_buf = %c0_i32 : i32) + %gm_y_view = pto.make_tensor_view %gm_y, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view + %gm_y_tile_view = pto.partition_view %gm_y_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : 
!pto.tensor_view -> !pto.partition_tensor_view<16x16xf32> %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf - %sum_tile = pto.alloc_tile : !pto.tile_buf - pto.tadd ins(%recv_tile, %recv_tile : !pto.tile_buf, !pto.tile_buf) outs(%sum_tile : !pto.tile_buf) + pto.tstore ins(%recv_tile : !pto.tile_buf) outs(%gm_y_tile_view : !pto.partition_tensor_view<16x16xf32>) pto.tfree_from_aic {split = 0} - pto.tpush_to_aic(%sum_tile : !pto.tile_buf) {split = 0} return } diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp index 8406b32d..e558e69d 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp +++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp @@ -9,7 +9,9 @@ extern "C" void call_kernel( uint32_t blockDim, void *stream, - uint8_t *gmSlotBuffer) + uint8_t *gmSlotBuffer, + uint8_t *x, + uint8_t *y) { - call_both<<>>((__gm__ float *)gmSlotBuffer); + call_both<<>>((__gm__ float *)gmSlotBuffer, (__gm__ float *)x, (__gm__ float *)y); } diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index ee6376fb..62c64573 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -7,14 +7,14 @@ MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir" GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp" LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" -mkdir -p "${ARTIFACT_DIR}" -rm -f "${GENERATED_CPP}" "${LIB_PATH}" +#mkdir -p "${ARTIFACT_DIR}" +#rm -f "${GENERATED_CPP}" "${LIB_PATH}" -ptoas --pto-arch=a3 "${MLIR_PATH}" > "${GENERATED_CPP}" +#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}" bisheng \ -I/sources/pto-isa/include/ \ - -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ + -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 -g \ -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ -xcce -Xhost-start -Xhost-end \ -mllvm 
-cce-aicore-stack-size=0x8000 \ diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py index b5db9833..0854a6a1 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py @@ -11,6 +11,10 @@ DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so") DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") DEFAULT_FIFO_BYTES = 4 * 1024 +M = 16 +N = 16 +ATOL = 1e-4 +RTOL = 1e-4 def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p: @@ -31,6 +35,8 @@ def load_lib(lib_path: str) -> ctypes.CDLL: ctypes.c_uint32, ctypes.c_void_p, ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, ] lib.call_kernel.restype = None return lib @@ -41,12 +47,20 @@ def make_gm_slot_buffer(*, fifo_bytes: int, device: str) -> torch.Tensor: return torch.zeros((fifo_elems,), dtype=torch.float32, device=device) -def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor) -> None: +def make_io_tensors(*, device: str) -> tuple[torch.Tensor, torch.Tensor]: + x = torch.rand((M, N), dtype=torch.float32, device=device) -0.5 + y = torch.zeros((M, N), dtype=torch.float32, device=device) + return x, y + + +def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( 1, stream_ptr, torch_to_ctypes(gm_slot_buffer), + torch_to_ctypes(x), + torch_to_ctypes(y), ) torch.npu.synchronize() @@ -62,9 +76,22 @@ def main() -> None: fifo_bytes=DEFAULT_FIFO_BYTES, device=device, ) + x, y = make_io_tensors(device=device) + + print(y) + run_kernel(lib, gm_slot_buffer=gm_slot_buffer, x=x, y=y) + print(y) + + y_ref = x.cpu() @ x.cpu() + y_cpu = y.cpu() + max_abs = float(torch.max(torch.abs(y_cpu - y_ref)).item()) + ok = bool(torch.allclose(y_cpu, y_ref, atol=ATOL, rtol=RTOL)) + + 
print(f"shape=({M}, {N}) max_abs={max_abs:.6f}") + if not ok: + raise SystemExit(f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}") - run_kernel(lib, gm_slot_buffer=gm_slot_buffer) - print(f"Launched bidirectional_example using {DEFAULT_LIB_PATH}.") + print(f"Validation passed using {DEFAULT_LIB_PATH}.") if __name__ == "__main__": From f9d8812a50f2c43b543f7098723892682a03f18c Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 14:26:13 +0000 Subject: [PATCH 10/38] feat: clean working version simple --- examples/aot/tpushpop/mix-kernel_mlir/compile.sh | 6 +++--- .../tpushpop/mix-kernel_mlir/run_bidirectional_example.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index 62c64573..8978eb9e 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -7,10 +7,10 @@ MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir" GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp" LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" -#mkdir -p "${ARTIFACT_DIR}" -#rm -f "${GENERATED_CPP}" "${LIB_PATH}" +mkdir -p "${ARTIFACT_DIR}" +rm -f "${GENERATED_CPP}" "${LIB_PATH}" -#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}" +ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}" bisheng \ -I/sources/pto-isa/include/ \ diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py index 0854a6a1..1b619869 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py @@ -76,6 +76,7 @@ def main() -> None: fifo_bytes=DEFAULT_FIFO_BYTES, device=device, ) + torch.set_printoptions(precision=1, threshold=2000, 
linewidth=250, sci_mode=False) x, y = make_io_tensors(device=device) print(y) @@ -84,6 +85,8 @@ def main() -> None: y_ref = x.cpu() @ x.cpu() y_cpu = y.cpu() + + print(y_ref-y_cpu) max_abs = float(torch.max(torch.abs(y_cpu - y_ref)).item()) ok = bool(torch.allclose(y_cpu, y_ref, atol=ATOL, rtol=RTOL)) From d2bf0ca18cfe34cf125ca98ae251422132ac04aa Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:07:25 +0000 Subject: [PATCH 11/38] wip: add transfer ops to dsl --- ptodsl/api/pto.py | 20 +++++++++ ptodsl/api/pto_general.py | 90 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py index f2e2d0ac..0cdf4bdb 100644 --- a/ptodsl/api/pto.py +++ b/ptodsl/api/pto.py @@ -2,15 +2,25 @@ from .scalar import Value, wrap_value from .pto_general import ( alloc_tile, + aic_initialize_pipe, + aiv_initialize_pipe, as_tensor, cube_section, get_block_idx, get_block_num, get_subblock_idx, get_subblock_num, + import_reserved_buffer, load, + reserve_buffer, slice_view, store, + tfree_from_aic, + tfree_from_aiv, + tpop_from_aic, + tpop_from_aiv, + tpush_to_aic, + tpush_to_aiv, vector_section, print, ) @@ -49,9 +59,19 @@ "range", "if_context", "cond", + "reserve_buffer", + "import_reserved_buffer", + "aic_initialize_pipe", + "aiv_initialize_pipe", "alloc_tile", "load", "store", + "tpush_to_aiv", + "tpush_to_aic", + "tpop_from_aic", + "tpop_from_aiv", + "tfree_from_aic", + "tfree_from_aiv", "print", "record_event", "wait_event", diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index c8f649ea..958c8459 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -1,7 +1,7 @@ from contextlib import contextmanager from mlir.dialects import pto as _pto -from mlir.ir import InsertionPoint +from mlir.ir import FlatSymbolRefAttr, InsertionPoint from .scalar import Value, _unwrap @@ -30,6 +30,12 @@ def 
_resolve_layout_attr(layout): return layout +def _resolve_address_space_attr(location): + if isinstance(location, str): + return _pto.AddressSpaceAttr.get(getattr(_pto.AddressSpace, location.upper())) + return location + + def as_tensor(tensor_type, *, ptr, shape, strides, layout=None): shape_vals = [_unwrap(v) for v in shape] stride_vals = [_unwrap(v) for v in strides] @@ -77,6 +83,78 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None): return _pto.AllocTileOp(tile_type, **kwargs).result +# %c2v_local = pto.reserve_buffer { +# name = "c2v_fifo", +# size = 4096, +# location = #pto.address_space, +# auto = true +# } -> i32 +def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): + # TODO: should return be wrapped in Value class? + # All params are compile time attributes + # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value + pass + + +# %c2v_import = pto.import_reserved_buffer { +# name = "c2v_fifo", +# peer_func = @vector_kernel +# } -> i32 +def import_reserved_buffer(*, name, peer_func): + # wrap import_reserved_buffer(name, peer_func, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value + pass + + +def aic_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf): + # wrap + # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + pass + + +# pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} ( +# gm_slot_buffer = %gm_slot_buffer : !pto.ptr, +# c2v_consumer_buf = %c2v_local : i32, +# v2c_consumer_buf = %c0_i32 : i32 +# ) +def aiv_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf): + # wrap + # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + pass + 
+ +# pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} +def tpush_to_aiv(tile, split): + # wrap tpush_to_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + _pto.tpush_to_aiv(tile, split) + + +def tpush_to_aic(tile, split): + # wrap: tpush_to_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + _pto.tpush_to_aic(tile, split) + + +# %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf +def tpop_from_aic(tile_type, split): + # wrap tpop_from_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value + return _pto.tpop_from_aic(tile_type, split) + + +def tpop_from_aiv(tile_type, split): + # wraps tpop_from_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value + return _pto.tpop_from_aiv(tile_type, split) + + +# pto.tfree_from_aic {split = 0} +def tfree_from_aic(split): + # wrap tfree_from_aic(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + _pto.tfree_from_aic(split) + + +def tfree_from_aiv(split): + # wrap tfree_from_aiv(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + _pto.tfree_from_aiv(split) + + def load(source, dest): _pto.TLoadOp(None, source, dest) @@ -111,7 +189,17 @@ def print(format, scalar): "vector_section", "cube_section", "alloc_tile", + "reserve_buffer", + "import_reserved_buffer", + "aic_initialize_pipe", + "aiv_initialize_pipe", "load", "store", + "tpush_to_aiv", + "tpush_to_aic", + "tpop_from_aic", + "tpop_from_aiv", + "tfree_from_aic", + "tfree_from_aiv", "print", ] From 6055f694a74e0360f1d4b26ab35b4423b4cf5e35 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:08:20 +0000 Subject: [PATCH 12/38] feat: docker add compiled cpp and bindings --- docker/Dockerfile | 8 ++--- ptodsl/api/pto_general.py | 64 ++++++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 
d04eba64..331ca0d1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,10 +29,10 @@ ARG CACHE_BURST=1 # ARG ARCH=x86_64 ARG ARCH=aarch64 -ARG RELEASE_REPO=zhangstevenunity/PTOAS -ARG RELEASE_VER=0.15 -ARG RELEASE_TAG=v${RELEASE_VER} -ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl +ARG RELEASE_REPO=huawei-csl/PTOAS +ARG RELEASE_VER=20260327 +ARG RELEASE_TAG=${RELEASE_VER} +ARG WHEEL_NAME=ptoas-0.18-cp311-none-manylinux_2_34_${ARCH}.whl ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz WORKDIR /installers/ diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 958c8459..ef238787 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -36,6 +36,12 @@ def _resolve_address_space_attr(location): return location +def _resolve_peer_func_attr(peer_func): + if isinstance(peer_func, str): + return FlatSymbolRefAttr.get(peer_func.removeprefix("@")) + return peer_func + + def as_tensor(tensor_type, *, ptr, shape, strides, layout=None): shape_vals = [_unwrap(v) for v in shape] stride_vals = [_unwrap(v) for v in strides] @@ -90,10 +96,14 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None): # auto = true # } -> i32 def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): - # TODO: should return be wrapped in Value class? 
# All params are compile time attributes # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - pass + kwargs = {} + if base is not None: + kwargs["base"] = base + return _pto.reserve_buffer( + name, size, _resolve_address_space_attr(location), auto_alloc, **kwargs + ) # %c2v_import = pto.import_reserved_buffer { @@ -102,13 +112,29 @@ def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): # } -> i32 def import_reserved_buffer(*, name, peer_func): # wrap import_reserved_buffer(name, peer_func, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - pass + return _pto.import_reserved_buffer(name, _resolve_peer_func_attr(peer_func)) -def aic_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf): +def aic_initialize_pipe( + *, + dir_mask, + slot_size, + gm_slot_buffer=None, # only needed on a2/a3? + c2v_consumer_buf, + v2c_consumer_buf, +): # wrap # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - pass + kwargs = {} + if gm_slot_buffer is not None: + kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer) + return _pto.aic_initialize_pipe( + dir_mask, + slot_size, + _unwrap(c2v_consumer_buf), + _unwrap(v2c_consumer_buf), + **kwargs, + ) # pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024} ( @@ -116,21 +142,37 @@ def aic_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consume # c2v_consumer_buf = %c2v_local : i32, # v2c_consumer_buf = %c0_i32 : i32 # ) -def aiv_initialize_pipe(*, dir_mask, slot_size, gm_slot_buffer=None, c2v_consumer_buf, v2c_consumer_buf): +def aiv_initialize_pipe( + *, + dir_mask, + slot_size, + gm_slot_buffer=None, + c2v_consumer_buf, + v2c_consumer_buf, +): # wrap # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> 
mlir._mlir_libs._mlir.ir.Operation - pass + kwargs = {} + if gm_slot_buffer is not None: + kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer) + return _pto.aiv_initialize_pipe( + dir_mask, + slot_size, + _unwrap(c2v_consumer_buf), + _unwrap(v2c_consumer_buf), + **kwargs, + ) # pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} def tpush_to_aiv(tile, split): # wrap tpush_to_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - _pto.tpush_to_aiv(tile, split) + return _pto.tpush_to_aiv(_unwrap(tile), split) def tpush_to_aic(tile, split): # wrap: tpush_to_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - _pto.tpush_to_aic(tile, split) + return _pto.tpush_to_aic(_unwrap(tile), split) # %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf @@ -147,12 +189,12 @@ def tpop_from_aiv(tile_type, split): # pto.tfree_from_aic {split = 0} def tfree_from_aic(split): # wrap tfree_from_aic(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - _pto.tfree_from_aic(split) + return _pto.tfree_from_aic(split) def tfree_from_aiv(split): # wrap tfree_from_aiv(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - _pto.tfree_from_aiv(split) + return _pto.tfree_from_aiv(split) def load(source, dest): From 8ea9459ebf0f804211de23c9d0a1f3287178614b Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:08:51 +0000 Subject: [PATCH 13/38] feat: use classes instead --- ptodsl/api/pto_general.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index ef238787..27fff68f 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -101,9 +101,9 @@ def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): kwargs = {} if base is not None: kwargs["base"] = base - return _pto.reserve_buffer( + return _pto.ReserveBufferOp( name, 
size, _resolve_address_space_attr(location), auto_alloc, **kwargs - ) + ).result # %c2v_import = pto.import_reserved_buffer { @@ -112,7 +112,7 @@ def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): # } -> i32 def import_reserved_buffer(*, name, peer_func): # wrap import_reserved_buffer(name, peer_func, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - return _pto.import_reserved_buffer(name, _resolve_peer_func_attr(peer_func)) + return _pto.ImportReservedBufferOp(name, _resolve_peer_func_attr(peer_func)).result def aic_initialize_pipe( @@ -128,11 +128,11 @@ def aic_initialize_pipe( kwargs = {} if gm_slot_buffer is not None: kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer) - return _pto.aic_initialize_pipe( + return _pto.AicInitializePipeOp( dir_mask, slot_size, - _unwrap(c2v_consumer_buf), - _unwrap(v2c_consumer_buf), + c2v_consumer_buf=_unwrap(c2v_consumer_buf), + v2c_consumer_buf=_unwrap(v2c_consumer_buf), **kwargs, ) @@ -146,7 +146,7 @@ def aiv_initialize_pipe( *, dir_mask, slot_size, - gm_slot_buffer=None, + gm_slot_buffer=None, # only needed on a2/a3 c2v_consumer_buf, v2c_consumer_buf, ): @@ -155,11 +155,11 @@ def aiv_initialize_pipe( kwargs = {} if gm_slot_buffer is not None: kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer) - return _pto.aiv_initialize_pipe( + return _pto.AivInitializePipeOp( dir_mask, slot_size, - _unwrap(c2v_consumer_buf), - _unwrap(v2c_consumer_buf), + c2v_consumer_buf=_unwrap(c2v_consumer_buf), + v2c_consumer_buf=_unwrap(v2c_consumer_buf), **kwargs, ) @@ -167,34 +167,34 @@ def aiv_initialize_pipe( # pto.tpush_to_aiv(%acc_tile : !pto.tile_buf) {split = 0} def tpush_to_aiv(tile, split): # wrap tpush_to_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - return _pto.tpush_to_aiv(_unwrap(tile), split) + return _pto.TPushToAivOp(_unwrap(tile), split) def tpush_to_aic(tile, split): # wrap: tpush_to_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - return 
_pto.tpush_to_aic(_unwrap(tile), split) + return _pto.TPushToAicOp(_unwrap(tile), split) # %recv_tile = pto.tpop_from_aic {split = 0} -> !pto.tile_buf def tpop_from_aic(tile_type, split): # wrap tpop_from_aic(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - return _pto.tpop_from_aic(tile_type, split) + return _pto.TPopFromAicOp(tile_type, split).result def tpop_from_aiv(tile_type, split): # wraps tpop_from_aiv(tile, split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - return _pto.tpop_from_aiv(tile_type, split) + return _pto.TPopFromAivOp(tile_type, split).result # pto.tfree_from_aic {split = 0} def tfree_from_aic(split): # wrap tfree_from_aic(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - return _pto.tfree_from_aic(split) + return _pto.TFreeFromAicOp(split) def tfree_from_aiv(split): # wrap tfree_from_aiv(split, *, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - return _pto.tfree_from_aiv(split) + return _pto.TFreeFromAivOp(split) def load(source, dest): From 3c7dbd9d06e973b64cadbe56f99b78efc57ab584 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 08:47:24 +0000 Subject: [PATCH 14/38] WIP: add builder with multiple funcs --- .../mix-kernel_mlir/bidirectional_builder.py | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py new file mode 100644 index 00000000..fbf68e6b --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py @@ -0,0 +1,165 @@ +from mlir.dialects import arith, func, pto as _pto +from mlir.ir import ( + Attribute, + Context, + FlatSymbolRefAttr, + InsertionPoint, + Location, + Module, + Operation, + UnitAttr, +) + +from ptodsl import pto, tile +from ptodsl import scalar as s + +const 
= s.const + + +def _call(name, *args): + return Operation.create( + "func.call", + operands=list(args), + attributes={"callee": FlatSymbolRefAttr.get(name)}, + ) + + +def _kernel(fn, kind): + fn.operation.attributes["pto.kernel_kind"] = Attribute.parse( + f"#pto.kernel_kind<{kind}>" + ) + + +def build_module(): + with Context() as ctx, Location.unknown(): + _pto.register_dialect(ctx, load=True) + module = Module.create() + + dtype = pto.float32 + ptr_ty = pto.PtrType(dtype) + i32 = pto.int32 + tensor_ty = pto.TensorType(rank=2, dtype=dtype) + tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype) + left_cfg = pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor") + x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT") + x_left_ty = pto.TileBufType( + shape=[16, 16], + dtype=dtype, + memory_space="LEFT", + config=left_cfg, + ) + x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT") + acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC") + recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC") + call_both_ty = func.FunctionType.get([ptr_ty, ptr_ty, ptr_ty], []) + two_ptr_ty = func.FunctionType.get([ptr_ty, ptr_ty], []) + + with InsertionPoint(module.body): + call_both = func.FuncOp("call_both", call_both_ty) + cube_kernel = func.FuncOp("cube_kernel", two_ptr_ty) + vector_kernel = func.FuncOp("vector_kernel", two_ptr_ty) + + call_both.operation.attributes["pto.entry"] = UnitAttr.get(ctx) + _kernel(cube_kernel, "cube") + _kernel(vector_kernel, "vector") + + call_both_entry = call_both.add_entry_block() + with InsertionPoint(call_both_entry): + gm_slot_buffer, gm_x, gm_y = call_both_entry.arguments + _call("cube_kernel", gm_slot_buffer, gm_x) + _call("vector_kernel", gm_slot_buffer, gm_y) + func.ReturnOp([]) + + cube_entry = cube_kernel.add_entry_block() + with InsertionPoint(cube_entry): + gm_slot_buffer, gm_x = cube_entry.arguments + c0 = const(0) + c1 = const(1) + c16 = 
const(16) + c0_i32 = arith.ConstantOp(i32, 0).result + c2v_import = pto.import_reserved_buffer( + name="c2v_fifo", + peer_func="@vector_kernel", + ) + + pto.aic_initialize_pipe( + dir_mask=1, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_import, + v2c_consumer_buf=c0_i32, + ) + + x_mat_tile = pto.alloc_tile(x_mat_ty) + x_left_tile = pto.alloc_tile(x_left_ty) + x_right_tile = pto.alloc_tile(x_right_ty) + acc_tile = pto.alloc_tile(acc_ty) + + gm_x_view = pto.as_tensor( + tensor_ty, + ptr=gm_x, + shape=[c16, c16], + strides=[c16, c1], + ) + gm_x_tile_view = pto.slice_view( + tile_view_ty, + source=gm_x_view, + offsets=[c0, c0], + sizes=[c16, c16], + ) + + pto.load(gm_x_tile_view, x_mat_tile) + tile.mov(x_mat_tile, x_left_tile) + tile.mov(x_mat_tile, x_right_tile) + tile.matmul(x_left_tile, x_right_tile, acc_tile) + pto.tpush_to_aiv(acc_tile, 0) + func.ReturnOp([]) + + vector_entry = vector_kernel.add_entry_block() + with InsertionPoint(vector_entry): + gm_slot_buffer, gm_y = vector_entry.arguments + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = arith.ConstantOp(i32, 0).result + c2v_local = pto.reserve_buffer( + name="c2v_fifo", + size=4096, + location="VEC", + ) + + pto.aiv_initialize_pipe( + dir_mask=1, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_local, + v2c_consumer_buf=c0_i32, + ) + + gm_y_view = pto.as_tensor( + tensor_ty, + ptr=gm_y, + shape=[c16, c16], + strides=[c16, c1], + ) + gm_y_tile_view = pto.slice_view( + tile_view_ty, + source=gm_y_view, + offsets=[c0, c0], + sizes=[c16, c16], + ) + + recv_tile = pto.tpop_from_aic(recv_ty, 0) + pto.store(recv_tile, gm_y_tile_view) + pto.tfree_from_aic(0) + func.ReturnOp([]) + + module.operation.verify() + return module + + +module = build_module() + + +if __name__ == "__main__": + print(module) From 834ca8f73949468da2a49f6c0c6b8e660299f43b Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 09:08:51 
+0000 Subject: [PATCH 15/38] feat: add type arg to const() api --- ptodsl/api/scalar.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ptodsl/api/scalar.py b/ptodsl/api/scalar.py index 90938daa..73ad8a1b 100644 --- a/ptodsl/api/scalar.py +++ b/ptodsl/api/scalar.py @@ -98,8 +98,10 @@ def __getattr__(name): raise AttributeError(f"module '{__name__}' has no attribute '{name}'") -def const(value): - return Value(arith.ConstantOp(IndexType.get(), value).result) +def const(value, type=None): + if type is None: + type = IndexType.get() + return Value(arith.ConstantOp(type, value).result) def index_cast(value, index_type=IndexType): From 422f5f23235727059a44c9dec542f65448bc4e0d Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 09:20:17 +0000 Subject: [PATCH 16/38] WIP: in decorated function we allow multiple functions --- ptodsl/api/pto.py | 4 + ptodsl/api/pto_general.py | 13 ++- ptodsl/compiler/ir.py | 126 ++++++++++++++++++++++------ tests/frontend/test_multifunc_ir.py | 27 ++++++ 4 files changed, 143 insertions(+), 27 deletions(-) create mode 100644 tests/frontend/test_multifunc_ir.py diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py index 0cdf4bdb..f1f3012f 100644 --- a/ptodsl/api/pto.py +++ b/ptodsl/api/pto.py @@ -1,3 +1,4 @@ +from ..compiler.ir import ir_func as func from .control_flow import cond, range, if_context from .scalar import Value, wrap_value from .pto_general import ( @@ -5,6 +6,7 @@ aic_initialize_pipe, aiv_initialize_pipe, as_tensor, + call, cube_section, get_block_idx, get_block_num, @@ -48,10 +50,12 @@ "SubTensorType", "TileBufConfig", "TileBufType", + "func", "get_block_idx", "get_subblock_idx", "get_subblock_num", "get_block_num", + "call", "as_tensor", "slice_view", "vector_section", diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 27fff68f..1ea2a5c8 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -1,7 +1,7 @@ from 
contextlib import contextmanager from mlir.dialects import pto as _pto -from mlir.ir import FlatSymbolRefAttr, InsertionPoint +from mlir.ir import FlatSymbolRefAttr, InsertionPoint, Operation from .scalar import Value, _unwrap @@ -37,11 +37,21 @@ def _resolve_address_space_attr(location): def _resolve_peer_func_attr(peer_func): + if hasattr(peer_func, "sym_name"): + peer_func = peer_func.sym_name if isinstance(peer_func, str): return FlatSymbolRefAttr.get(peer_func.removeprefix("@")) return peer_func +def call(callee, *args): + return Operation.create( + "func.call", + operands=[_unwrap(arg) for arg in args], + attributes={"callee": _resolve_peer_func_attr(callee)}, + ) + + def as_tensor(tensor_type, *, ptr, shape, strides, layout=None): shape_vals = [_unwrap(v) for v in shape] stride_vals = [_unwrap(v) for v in strides] @@ -226,6 +236,7 @@ def print(format, scalar): "get_subblock_idx", "get_subblock_num", "get_block_num", + "call", "as_tensor", "slice_view", "vector_section", diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py index b32730ef..edeb5c24 100644 --- a/ptodsl/compiler/ir.py +++ b/ptodsl/compiler/ir.py @@ -1,11 +1,26 @@ import inspect from mlir.dialects import func, pto as _pto -from mlir.ir import Context, InsertionPoint, Location, Module +from mlir.ir import Attribute, Context, InsertionPoint, Location, Module, UnitAttr from ..api.scalar import wrap_value +_MODULE_STACK = [] + + +class FuncRef: + def __init__(self, sym_name): + self.sym_name = sym_name + + +class _ModuleState: + def __init__(self, *, ctx, module, meta_map): + self.ctx = ctx + self.module = module + self.meta_map = meta_map + + def _resolve_meta(meta_fn): values = meta_fn() if not isinstance(values, dict): @@ -72,38 +87,97 @@ def _restore_globals(fn, old, injected_names): fn.__globals__[name] = old[name] -def to_ir_module(*, meta_data): +def _build_func_body(ir_func, fn, ret_types, meta_map): + entry = ir_func.add_entry_block() + with InsertionPoint(entry): + wrapped_args = 
[wrap_value(arg) for arg in entry.arguments] + injected = set(meta_map.keys()) + old_globals = _inject_globals(fn, meta_map) + try: + fn(*wrapped_args) + finally: + _restore_globals(fn, old_globals, injected) + + if not ret_types and not _has_func_return(entry): + func.ReturnOp([]) + + +def _current_module_state(): + if not _MODULE_STACK: + raise RuntimeError( + "`pto.func(...)` can only be used inside `@to_ir_module(..., module=True)`." + ) + return _MODULE_STACK[-1] + + +def ir_func(*, name=None, entry=False, kernel=None): def decorator(fn): + state = _current_module_state() sig = inspect.signature(fn) + arg_types = _resolve_arg_types(sig, state.meta_map) + ret_types = _resolve_ret_types(sig, state.meta_map) + fn_name = name or fn.__name__ + fn_ty = func.FunctionType.get(arg_types, ret_types) + + with InsertionPoint(state.module.body): + ir_op = func.FuncOp(fn_name, fn_ty) + + if entry: + ir_op.operation.attributes["pto.entry"] = UnitAttr.get(state.ctx) + if kernel is not None: + ir_op.operation.attributes["pto.kernel_kind"] = Attribute.parse( + f"#pto.kernel_kind<{kernel}>" + ) + _build_func_body(ir_op, fn, ret_types, state.meta_map) + return FuncRef(fn_name) + + return decorator + + +def _build_single_func_module(fn, meta_map): + sig = inspect.signature(fn) + arg_types = _resolve_arg_types(sig, meta_map) + ret_types = _resolve_ret_types(sig, meta_map) + module = Module.create() + fn_ty = func.FunctionType.get(arg_types, ret_types) + + with InsertionPoint(module.body): + ir_op = func.FuncOp(fn.__name__, fn_ty) + + _build_func_body(ir_op, fn, ret_types, meta_map) + return module + + +def _build_multi_func_module(fn, meta_map, ctx): + if inspect.signature(fn).parameters: + raise ValueError("`module=True` expects a zero-argument builder function.") + + module = Module.create() + injected = set(meta_map.keys()) + old_globals = _inject_globals(fn, meta_map) + _MODULE_STACK.append(_ModuleState(ctx=ctx, module=module, meta_map=meta_map)) + try: + fn() + finally: + 
_MODULE_STACK.pop() + _restore_globals(fn, old_globals, injected) + return module + + +def to_ir_module(*, meta_data, module=False): + def decorator(fn): with Context() as ctx, Location.unknown(): _pto.register_dialect(ctx, load=True) meta_map = _resolve_meta(meta_data) - arg_types = _resolve_arg_types(sig, meta_map) - ret_types = _resolve_ret_types(sig, meta_map) - module = Module.create() - fn_ty = func.FunctionType.get(arg_types, ret_types) - - with InsertionPoint(module.body): - ir_func = func.FuncOp(fn.__name__, fn_ty) - entry = ir_func.add_entry_block() - - with InsertionPoint(entry): - wrapped_args = [wrap_value(arg) for arg in entry.arguments] - injected = set(meta_map.keys()) - old_globals = _inject_globals(fn, meta_map) - try: - fn(*wrapped_args) - finally: - _restore_globals(fn, old_globals, injected) - - if not ret_types and not _has_func_return(entry): - func.ReturnOp([]) - - module.operation.verify() - return module + if module: + ir_module = _build_multi_func_module(fn, meta_map, ctx) + else: + ir_module = _build_single_func_module(fn, meta_map) + ir_module.operation.verify() + return ir_module return decorator -__all__ = ["to_ir_module"] +__all__ = ["FuncRef", "ir_func", "to_ir_module"] diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py new file mode 100644 index 00000000..90fb29f7 --- /dev/null +++ b/tests/frontend/test_multifunc_ir.py @@ -0,0 +1,27 @@ +from ptodsl import pto, to_ir_module + + +def meta_data(): + dtype = pto.float32 + ptr_ty = pto.PtrType(dtype) + return {"ptr_ty": ptr_ty} + + +@to_ir_module(meta_data=meta_data, module=True) +def build_module(): + @pto.func(kernel="vector") + def worker(arg0: "ptr_ty") -> None: + pass + + @pto.func(entry=True) + def entry(arg0: "ptr_ty") -> None: + pto.call(worker, arg0) + + +def test_multifunc_builder_shapes_module(): + text = str(build_module) + assert "func.func @worker" in text + assert "pto.kernel_kind = #pto.kernel_kind" in text + assert "func.func @entry" 
in text + assert "attributes {pto.entry}" in text + assert "func.call @worker" in text From 2362f7ee3212bc1a71b3ec4b1d65e255dcafc1b3 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 09:50:41 +0000 Subject: [PATCH 17/38] WIP: simplify ir.py --- ptodsl/compiler/ir.py | 139 ++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 79 deletions(-) diff --git a/ptodsl/compiler/ir.py b/ptodsl/compiler/ir.py index edeb5c24..c2cef082 100644 --- a/ptodsl/compiler/ir.py +++ b/ptodsl/compiler/ir.py @@ -6,7 +6,9 @@ from ..api.scalar import wrap_value -_MODULE_STACK = [] +# For the inner decorators to be clean for the user visible API `pto.func(kernel='cube')` +# with no reference to module, we need this: +_CURRENT = None class FuncRef: @@ -14,13 +16,6 @@ def __init__(self, sym_name): self.sym_name = sym_name -class _ModuleState: - def __init__(self, *, ctx, module, meta_map): - self.ctx = ctx - self.module = module - self.meta_map = meta_map - - def _resolve_meta(meta_fn): values = meta_fn() if not isinstance(values, dict): @@ -56,10 +51,7 @@ def _resolve_ret_types(signature, meta_map): if isinstance(ret_annot, (list, tuple)): out = [] for elem in ret_annot: - if isinstance(elem, str): - out.append(meta_map[elem]) - else: - out.append(elem) + out.append(meta_map[elem] if isinstance(elem, str) else elem) return out return [ret_annot] @@ -79,101 +71,90 @@ def _inject_globals(fn, values): return old -def _restore_globals(fn, old, injected_names): - for name in injected_names: +def _restore_globals(fn, old, names): + for name in names: if old[name] is None and name in fn.__globals__: del fn.__globals__[name] else: fn.__globals__[name] = old[name] -def _build_func_body(ir_func, fn, ret_types, meta_map): - entry = ir_func.add_entry_block() - with InsertionPoint(entry): - wrapped_args = [wrap_value(arg) for arg in entry.arguments] - injected = set(meta_map.keys()) - old_globals = _inject_globals(fn, meta_map) 
+def _define(module, ctx, meta_map, fn, *, name=None, entry=False, kernel=None): + sig = inspect.signature(fn) + arg_types = _resolve_arg_types(sig, meta_map) + ret_types = _resolve_ret_types(sig, meta_map) + fn_name = name or fn.__name__ + fn_ty = func.FunctionType.get(arg_types, ret_types) + + with InsertionPoint(module.body): + ir_func = func.FuncOp(fn_name, fn_ty) + + if entry: + ir_func.operation.attributes["pto.entry"] = UnitAttr.get(ctx) + if kernel is not None: + ir_func.operation.attributes["pto.kernel_kind"] = Attribute.parse( + f"#pto.kernel_kind<{kernel}>" + ) + + block = ir_func.add_entry_block() + with InsertionPoint(block): + wrapped_args = [wrap_value(arg) for arg in block.arguments] + old = _inject_globals(fn, meta_map) try: fn(*wrapped_args) finally: - _restore_globals(fn, old_globals, injected) + _restore_globals(fn, old, meta_map.keys()) - if not ret_types and not _has_func_return(entry): + if not ret_types and not _has_func_return(block): func.ReturnOp([]) - -def _current_module_state(): - if not _MODULE_STACK: - raise RuntimeError( - "`pto.func(...)` can only be used inside `@to_ir_module(..., module=True)`." - ) - return _MODULE_STACK[-1] + return FuncRef(fn_name) def ir_func(*, name=None, entry=False, kernel=None): def decorator(fn): - state = _current_module_state() - sig = inspect.signature(fn) - arg_types = _resolve_arg_types(sig, state.meta_map) - ret_types = _resolve_ret_types(sig, state.meta_map) - fn_name = name or fn.__name__ - fn_ty = func.FunctionType.get(arg_types, ret_types) - - with InsertionPoint(state.module.body): - ir_op = func.FuncOp(fn_name, fn_ty) - - if entry: - ir_op.operation.attributes["pto.entry"] = UnitAttr.get(state.ctx) - if kernel is not None: - ir_op.operation.attributes["pto.kernel_kind"] = Attribute.parse( - f"#pto.kernel_kind<{kernel}>" + if _CURRENT is None: + raise RuntimeError( + "`pto.func(...)` can only be used inside `@to_ir_module(..., module=True)`." 
) - - _build_func_body(ir_op, fn, ret_types, state.meta_map) - return FuncRef(fn_name) + return _define( + _CURRENT["module"], + _CURRENT["ctx"], + _CURRENT["meta_map"], + fn, + name=name, + entry=entry, + kernel=kernel, + ) return decorator -def _build_single_func_module(fn, meta_map): - sig = inspect.signature(fn) - arg_types = _resolve_arg_types(sig, meta_map) - ret_types = _resolve_ret_types(sig, meta_map) - module = Module.create() - fn_ty = func.FunctionType.get(arg_types, ret_types) - - with InsertionPoint(module.body): - ir_op = func.FuncOp(fn.__name__, fn_ty) - - _build_func_body(ir_op, fn, ret_types, meta_map) - return module - - -def _build_multi_func_module(fn, meta_map, ctx): - if inspect.signature(fn).parameters: - raise ValueError("`module=True` expects a zero-argument builder function.") - - module = Module.create() - injected = set(meta_map.keys()) - old_globals = _inject_globals(fn, meta_map) - _MODULE_STACK.append(_ModuleState(ctx=ctx, module=module, meta_map=meta_map)) - try: - fn() - finally: - _MODULE_STACK.pop() - _restore_globals(fn, old_globals, injected) - return module - - def to_ir_module(*, meta_data, module=False): def decorator(fn): + global _CURRENT + with Context() as ctx, Location.unknown(): _pto.register_dialect(ctx, load=True) meta_map = _resolve_meta(meta_data) + ir_module = Module.create() + if module: - ir_module = _build_multi_func_module(fn, meta_map, ctx) + if inspect.signature(fn).parameters: + raise ValueError( + "`module=True` expects a zero-argument builder function." 
+ ) + old = _inject_globals(fn, meta_map) + prev = _CURRENT + _CURRENT = {"ctx": ctx, "module": ir_module, "meta_map": meta_map} + try: + fn() + finally: + _CURRENT = prev + _restore_globals(fn, old, meta_map.keys()) else: - ir_module = _build_single_func_module(fn, meta_map) + _define(ir_module, ctx, meta_map, fn) + ir_module.operation.verify() return ir_module From c7b31f4160179b09c93838e4ca6081f7f2ef51cc Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 10:00:54 +0000 Subject: [PATCH 18/38] use new ptodsl api for builder --- .../mix-kernel_mlir/bidirectional_builder.py | 229 +++++++----------- .../aot/tpushpop/mix-kernel_mlir/compile.sh | 8 +- 2 files changed, 94 insertions(+), 143 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py index fbf68e6b..9535b318 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py @@ -1,164 +1,109 @@ -from mlir.dialects import arith, func, pto as _pto -from mlir.ir import ( - Attribute, - Context, - FlatSymbolRefAttr, - InsertionPoint, - Location, - Module, - Operation, - UnitAttr, -) - -from ptodsl import pto, tile +from mlir.dialects import arith + +from ptodsl import pto, tile, to_ir_module from ptodsl import scalar as s const = s.const -def _call(name, *args): - return Operation.create( - "func.call", - operands=list(args), - attributes={"callee": FlatSymbolRefAttr.get(name)}, +def meta_data(): + dtype = pto.float32 + ptr_ty = pto.PtrType(dtype) + i32 = pto.int32 + tensor_ty = pto.TensorType(rank=2, dtype=dtype) + tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype) + x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT") + x_left_ty = pto.TileBufType( + shape=[16, 16], + dtype=dtype, + memory_space="LEFT", + config=pto.TileBufConfig(blayout="ColMajor", 
slayout="RowMajor"), ) + x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT") + acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC") + recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC") + return locals() + + +@to_ir_module(meta_data=meta_data, module=True) +def module(): + @pto.func(kernel="cube") + def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = const(0, type=i32) + c2v_import = pto.import_reserved_buffer( + name="c2v_fifo", + peer_func="@vector_kernel", + ) + pto.aic_initialize_pipe( + dir_mask=1, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_import, + v2c_consumer_buf=c0_i32, + ) -def _kernel(fn, kind): - fn.operation.attributes["pto.kernel_kind"] = Attribute.parse( - f"#pto.kernel_kind<{kind}>" - ) - + x_mat_tile = pto.alloc_tile(x_mat_ty) + x_left_tile = pto.alloc_tile(x_left_ty) + x_right_tile = pto.alloc_tile(x_right_ty) + acc_tile = pto.alloc_tile(acc_ty) -def build_module(): - with Context() as ctx, Location.unknown(): - _pto.register_dialect(ctx, load=True) - module = Module.create() - - dtype = pto.float32 - ptr_ty = pto.PtrType(dtype) - i32 = pto.int32 - tensor_ty = pto.TensorType(rank=2, dtype=dtype) - tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype) - left_cfg = pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor") - x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT") - x_left_ty = pto.TileBufType( - shape=[16, 16], - dtype=dtype, - memory_space="LEFT", - config=left_cfg, - ) - x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT") - acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC") - recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC") - call_both_ty = func.FunctionType.get([ptr_ty, ptr_ty, ptr_ty], []) - two_ptr_ty = func.FunctionType.get([ptr_ty, ptr_ty], []) - 
- with InsertionPoint(module.body): - call_both = func.FuncOp("call_both", call_both_ty) - cube_kernel = func.FuncOp("cube_kernel", two_ptr_ty) - vector_kernel = func.FuncOp("vector_kernel", two_ptr_ty) - - call_both.operation.attributes["pto.entry"] = UnitAttr.get(ctx) - _kernel(cube_kernel, "cube") - _kernel(vector_kernel, "vector") - - call_both_entry = call_both.add_entry_block() - with InsertionPoint(call_both_entry): - gm_slot_buffer, gm_x, gm_y = call_both_entry.arguments - _call("cube_kernel", gm_slot_buffer, gm_x) - _call("vector_kernel", gm_slot_buffer, gm_y) - func.ReturnOp([]) - - cube_entry = cube_kernel.add_entry_block() - with InsertionPoint(cube_entry): - gm_slot_buffer, gm_x = cube_entry.arguments - c0 = const(0) - c1 = const(1) - c16 = const(16) - c0_i32 = arith.ConstantOp(i32, 0).result - c2v_import = pto.import_reserved_buffer( - name="c2v_fifo", - peer_func="@vector_kernel", - ) - - pto.aic_initialize_pipe( - dir_mask=1, - slot_size=1024, - gm_slot_buffer=gm_slot_buffer, - c2v_consumer_buf=c2v_import, - v2c_consumer_buf=c0_i32, - ) - - x_mat_tile = pto.alloc_tile(x_mat_ty) - x_left_tile = pto.alloc_tile(x_left_ty) - x_right_tile = pto.alloc_tile(x_right_ty) - acc_tile = pto.alloc_tile(acc_ty) - - gm_x_view = pto.as_tensor( + gm_x_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( tensor_ty, ptr=gm_x, shape=[c16, c16], strides=[c16, c1], - ) - gm_x_tile_view = pto.slice_view( - tile_view_ty, - source=gm_x_view, - offsets=[c0, c0], - sizes=[c16, c16], - ) - - pto.load(gm_x_tile_view, x_mat_tile) - tile.mov(x_mat_tile, x_left_tile) - tile.mov(x_mat_tile, x_right_tile) - tile.matmul(x_left_tile, x_right_tile, acc_tile) - pto.tpush_to_aiv(acc_tile, 0) - func.ReturnOp([]) - - vector_entry = vector_kernel.add_entry_block() - with InsertionPoint(vector_entry): - gm_slot_buffer, gm_y = vector_entry.arguments - c0 = const(0) - c1 = const(1) - c16 = const(16) - c0_i32 = arith.ConstantOp(i32, 0).result - c2v_local = pto.reserve_buffer( - 
name="c2v_fifo", - size=4096, - location="VEC", - ) - - pto.aiv_initialize_pipe( - dir_mask=1, - slot_size=1024, - gm_slot_buffer=gm_slot_buffer, - c2v_consumer_buf=c2v_local, - v2c_consumer_buf=c0_i32, - ) - - gm_y_view = pto.as_tensor( + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + + pto.load(gm_x_tile_view, x_mat_tile) + tile.mov(x_mat_tile, x_left_tile) + tile.mov(x_mat_tile, x_right_tile) + tile.matmul(x_left_tile, x_right_tile, acc_tile) + pto.tpush_to_aiv(acc_tile, 0) + + @pto.func(kernel="vector") + def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = const(0, type=i32) + c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC") + + pto.aiv_initialize_pipe( + dir_mask=1, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_local, + v2c_consumer_buf=c0_i32, + ) + + gm_y_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( tensor_ty, ptr=gm_y, shape=[c16, c16], strides=[c16, c1], - ) - gm_y_tile_view = pto.slice_view( - tile_view_ty, - source=gm_y_view, - offsets=[c0, c0], - sizes=[c16, c16], - ) - - recv_tile = pto.tpop_from_aic(recv_ty, 0) - pto.store(recv_tile, gm_y_tile_view) - pto.tfree_from_aic(0) - func.ReturnOp([]) - - module.operation.verify() - return module + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + pto.store(pto.tpop_from_aic(recv_ty, 0), gm_y_tile_view) + pto.tfree_from_aic(0) -module = build_module() + @pto.func(entry=True) + def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.call(cube_kernel, gm_slot_buffer, gm_x) + pto.call(vector_kernel, gm_slot_buffer, gm_y) if __name__ == "__main__": diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index 8978eb9e..7169a980 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -10,7 +10,13 @@ 
LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" mkdir -p "${ARTIFACT_DIR}" rm -f "${GENERATED_CPP}" "${LIB_PATH}" -ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}" +MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir" +python bidirectional_builder.py > bidir_gen.mlir +ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}" + +#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}" + + bisheng \ -I/sources/pto-isa/include/ \ From 0597c0a34ab4a97f3536e4e1b27a93e8ccac4e8b Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 10:09:04 +0000 Subject: [PATCH 19/38] feat: remove files --- .../aot/tpushpop/mix-kernel_cpp/README.md | 31 -- .../aot/tpushpop/mix-kernel_cpp/caller.cpp | 27 -- .../aot/tpushpop/mix-kernel_cpp/compile.sh | 46 --- .../mix-kernel_cpp/run_tpushpop_cv.py | 158 ---------- .../tpushpop/mix-kernel_cpp/tpushpop_cv.cpp | 297 ------------------ .../tpushpop/mix-kernel_cpp_simple/README.md | 15 - .../tpushpop/mix-kernel_cpp_simple/caller.cpp | 27 -- .../tpushpop/mix-kernel_cpp_simple/compile.sh | 36 --- .../tpushpop/mix-kernel_cpp_simple/kernel.cpp | 156 --------- .../aot/tpushpop/mix-kernel_cpp_simple/run.py | 72 ----- refs/tpushpop_cv.cpp | 290 ----------------- refs/tpushpop_vc.cpp | 236 -------------- 12 files changed, 1391 deletions(-) delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/README.md delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/caller.cpp delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/compile.sh delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/README.md delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh delete mode 100644 
examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp delete mode 100644 examples/aot/tpushpop/mix-kernel_cpp_simple/run.py delete mode 100644 refs/tpushpop_cv.cpp delete mode 100644 refs/tpushpop_vc.cpp diff --git a/examples/aot/tpushpop/mix-kernel_cpp/README.md b/examples/aot/tpushpop/mix-kernel_cpp/README.md deleted file mode 100644 index 672e71f1..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Cube To Vector `TPUSH`/`TPOP` Example - -This example keeps the kernel source in the same directory as the wrapper, using `./tpushpop_cv.cpp` with the same `compile.sh` + Python runner flow used by the AOT examples. - -The kernel does: - -- cube-side `TMATMUL` -- `TPUSH` from cube to vector -- vector-side `TPOP` -- vector-side bias add - -## Run - -```bash -python run_tpushpop_cv.py -``` - -That will: - -1. call `compile.sh` -2. build `./tpushpop_cv_lib.so` -3. launch the kernel on NPU -4. compare against `A @ B + bias` - -The wrapper fetches the runtime FFTS/control address inside `caller.cpp` with `rtGetC2cCtrlAddr(...)`, so the Python side only needs to provide the kernel inputs, output, and FIFO backing memory. - -If your environment needs different PTO include roots: - -```bash -PTO_INCLUDE_PATH=/sources/pto-isa/include python run_tpushpop_cv.py -``` diff --git a/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp deleted file mode 100644 index fbe697f4..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp/caller.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef KERNEL_CPP -#error "KERNEL_CPP must be defined at compile time." 
-#endif - -#include - -extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); - -#include KERNEL_CPP - -extern "C" void call_kernel( - uint32_t blockDim, - void *stream, - uint8_t *out, - uint8_t *srcA, - uint8_t *srcB, - uint8_t *bias, - uint8_t *fifoMem) -{ - void *fftsAddr = nullptr; - uint32_t fftsLen = 0; - (void)blockDim; - (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); - (void)fftsLen; - - LaunchTPushPopMatmulAdd(reinterpret_cast(fftsAddr), out, srcA, srcB, bias, fifoMem, stream); -} diff --git a/examples/aot/tpushpop/mix-kernel_cpp/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp/compile.sh deleted file mode 100644 index df924539..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp/compile.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" -LIB_PATH="${SCRIPT_DIR}/tpushpop_cv_lib.so" -KERNEL_CPP_PATH="${KERNEL_CPP_PATH:-${SCRIPT_DIR}/tpushpop_cv.cpp}" -EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS:-}" - -if [[ "${TPUSHPOP_SANITY_ONLY:-}" =~ ^(1|true|TRUE|yes|YES|on|ON)$ ]]; then - EXTRA_BISHENG_FLAGS="${EXTRA_BISHENG_FLAGS} -DTPUSHPOP_SANITY_ONLY" -fi - -PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}" -if [[ ! -d "${PTO_INCLUDE_PATH}" ]]; then - if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then - PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include" - elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then - PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include" - else - echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." 
>&2 - exit 1 - fi -fi - -mkdir -p "${ARTIFACT_DIR}" -rm -f "${LIB_PATH}" - -bisheng \ - -I"${PTO_INCLUDE_PATH}" \ - -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ - -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ - -xcce -Xhost-start -Xhost-end \ - -mllvm -cce-aicore-stack-size=0x8000 \ - -mllvm -cce-aicore-function-stack-size=0x8000 \ - -mllvm -cce-aicore-record-overflow=true \ - -mllvm -cce-aicore-addr-transform \ - -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -std=gnu++17 \ - ${EXTRA_BISHENG_FLAGS} \ - -DKERNEL_CPP="\"${KERNEL_CPP_PATH}\"" \ - "${SCRIPT_DIR}/caller.cpp" \ - -o "${LIB_PATH}" - -echo "Built ${LIB_PATH}." diff --git a/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py b/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py deleted file mode 100644 index 4e2d468d..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp/run_tpushpop_cv.py +++ /dev/null @@ -1,158 +0,0 @@ -import ctypes -import os -import subprocess - -import numpy as np -import torch -import torch_npu # noqa: F401 - -from ptodsl.test_util import get_test_device - -THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_cv_lib.so") -DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") -DEFAULT_KERNEL_CPP = os.path.join(THIS_DIR, "tpushpop_cv.cpp") -DEFAULT_FIFO_BYTES = 4 * 1024 -TOTAL_M = 128 -K = 32 -N = 32 -INPUT_DTYPE = torch.float16 -SEED = 0 -ATOL = 5e-2 -RTOL = 5e-2 -SANITY_ONLY = False - - -def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p: - return ctypes.c_void_p(tensor.data_ptr()) - - -def compile_example(compile_script: str) -> None: - env = os.environ.copy() - env["KERNEL_CPP_PATH"] = DEFAULT_KERNEL_CPP - subprocess.run( - ["bash", compile_script], - check=True, - cwd=THIS_DIR, - env=env, - ) - - -def load_lib(lib_path: str) -> ctypes.CDLL: - lib = ctypes.CDLL(lib_path) - lib.call_kernel.argtypes = [ - ctypes.c_uint32, - 
ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ] - lib.call_kernel.restype = None - return lib - - -def make_buffers( - *, - total_m: int, - k: int, - n: int, - input_dtype: torch.dtype, - device: str, - fifo_bytes: int, -): - src_a = torch.randn((total_m, k), dtype=input_dtype, device=device) - src_b = torch.randn((k, n), dtype=input_dtype, device=device) - bias = torch.randn((total_m, n), dtype=torch.float32, device=device) - out = torch.zeros((total_m, n), dtype=torch.float32, device=device) - - fifo_elems = max(1, (fifo_bytes + 3) // 4) - fifo_mem = torch.zeros((fifo_elems,), dtype=torch.float32, device=device) - return out, src_a, src_b, bias, fifo_mem - - -def run_kernel( - lib: ctypes.CDLL, - *, - out: torch.Tensor, - src_a: torch.Tensor, - src_b: torch.Tensor, - bias: torch.Tensor, - fifo_mem: torch.Tensor, -) -> torch.Tensor: - stream_ptr = torch.npu.current_stream()._as_parameter_ - lib.call_kernel( - 1, - stream_ptr, - torch_to_ctypes(out), - torch_to_ctypes(src_a), - torch_to_ctypes(src_b), - torch_to_ctypes(bias), - torch_to_ctypes(fifo_mem), - ) - torch.npu.synchronize() - return out - - -def reference_result(src_a: torch.Tensor, src_b: torch.Tensor, bias: torch.Tensor) -> torch.Tensor: - ref = torch.matmul(src_a.float().cpu(), src_b.float().cpu()) - if not SANITY_ONLY: - ref = ref + bias.cpu() - return ref.to(torch.float32) - - -def main() -> None: - compile_example(DEFAULT_COMPILE_SCRIPT) - - device = get_test_device() - torch.npu.set_device(device) - torch.manual_seed(SEED) - np.random.seed(SEED) - - lib = load_lib(DEFAULT_LIB_PATH) - out, src_a, src_b, bias, fifo_mem = make_buffers( - total_m=TOTAL_M, - k=K, - n=N, - input_dtype=INPUT_DTYPE, - device=device, - fifo_bytes=DEFAULT_FIFO_BYTES, - ) - - out = run_kernel( - lib, - out=out, - src_a=src_a, - src_b=src_b, - bias=bias, - fifo_mem=fifo_mem, - ) - ref = reference_result(src_a, src_b, bias) - out_cpu = out.cpu() - assert 
ref.device == out_cpu.device - torch.npu.synchronize() - torch.set_printoptions(precision=1, sci_mode=False, linewidth=250, threshold=5000) - print(ref-out_cpu) - - max_abs = float(torch.max(torch.abs(out_cpu - ref)).item()) - mean_abs = float(torch.mean(torch.abs(out_cpu - ref)).item()) - ok = bool(torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL)) - - print( - f"mode={'sanity_matmul' if SANITY_ONLY else 'tpushpop_cv'} " - f"shape=({TOTAL_M}, {K}, {N}) dtype={INPUT_DTYPE} " - f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}" - ) - - if not ok: - raise SystemExit( - f"Validation failed with atol={ATOL} rtol={RTOL}. " - f"max_abs={max_abs:.6f} mean_abs={mean_abs:.6f}" - ) - - print(f"Validation passed using {DEFAULT_LIB_PATH}.") - - -if __name__ == "__main__": - main() diff --git a/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp b/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp deleted file mode 100644 index 3f4c42b2..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp/tpushpop_cv.cpp +++ /dev/null @@ -1,297 +0,0 @@ -#include -#include - -using namespace pto; - -#define VEC_CORES 2 - -using ExampleInT = half; -using ExampleOutT = float; -constexpr uint32_t EXAMPLE_TOTAL_M = 128; -constexpr uint32_t EXAMPLE_CASE_TILE_M = 16; -constexpr uint32_t EXAMPLE_TILE_K = 32; -constexpr uint32_t EXAMPLE_TILE_N = 32; - -#ifdef __DAV_CUBE__ -constexpr bool DAV_CUBE = true; -#else -constexpr bool DAV_CUBE = false; -#endif - -#ifdef __DAV_VEC__ -constexpr bool DAV_VEC = true; -#else -constexpr bool DAV_VEC = false; -#endif - -template -AICORE constexpr inline T CeilAlign(T num_1, T num_2) -{ - if (num_2 == 0) { - return 0; - } - return (num_1 + num_2 - 1) / num_2 * num_2; -} - -#ifdef TPUSHPOP_SANITY_ONLY -__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB) -{ - constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); - constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_TOTAL_M, 16); - constexpr 
uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); - constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); - - using GlobalA = - GlobalTensor, - pto::Stride>; - using GlobalB = - GlobalTensor, - pto::Stride>; - using GlobalOut = - GlobalTensor, - pto::Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - if constexpr (DAV_CUBE) { - TileMatA aMatTile; - TileMatB bMatTile; - LeftTile aTile; - RightTile bTile; - AccTile accTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(accTile, 0x0); - - GlobalA globalA(srcA); - GlobalB globalB(srcB); - GlobalOut globalOut(out); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - TLOAD(aMatTile, globalA); - TLOAD(bMatTile, globalB); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - TMATMUL(accTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - TSTORE(globalOut, accTile); - - pipe_barrier(PIPE_ALL); - } -} -#else -__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out, - __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB, - __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem) -{ - // Point the cross-core FIFO signaling ops at the FFTS flag storage used by TPUSH/TPOP handshakes. 
- //t_ffts_base_addr((uint64_t)ffts_addr); - constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M; - constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES; - - constexpr uint16_t FLAG_ID = 0; - constexpr uint8_t FIFO_DEPTH = 2; - constexpr uint8_t FIFO_PERIOD = 1; - // Local ring-buffer base used by vector-side TPOP to place each popped half-tile before vector compute uses it. - constexpr uint32_t localFiFoBase = 0x0; - - using AccTile = TileAcc; - using VecTileHalf = - Tile; - using BiasTile = - Tile; - using OutTile = - Tile; - - // Cube-to-vector FIFO: each GM slot stores one full AccTile, and vector TPOP reads it back as two row halves. - using MatPipe = TPipe; - // Bind the FIFO protocol to GM slot storage and the vector-side local staging buffer used by TPOP. - MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase); - - constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); - constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_CASE_TILE_M, 16); - constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); - constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); - - using GlobalA = - GlobalTensor, - pto::Stride>; - using GlobalB = - GlobalTensor, - pto::Stride>; - using GlobalBias = - GlobalTensor, - pto::Stride>; - using GlobalOut = - GlobalTensor, - pto::Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - - if constexpr (DAV_CUBE) { - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile accTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(accTile, 0x0); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { - GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * EXAMPLE_TILE_K); - 
GlobalB globalB(srcB); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - TLOAD(aMatTile, globalA); - TLOAD(bMatTile, globalB); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - - TMATMUL(accTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - // Push the full accumulator tile into the next GM FIFO slot and signal vector that one split-up-down tile is ready. - TPUSH(mPipe, accTile); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - } - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } - - if constexpr (DAV_VEC) { - VecTileHalf vecTileHalf; - BiasTile biasTile; - OutTile outTile; - TASSIGN(biasTile, 0x10000); - TASSIGN(outTile, 0x20000); - - uint32_t subBlockIdx = get_subblockid(); - - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - // Pop this subcore's half-tile from the next ready FIFO slot into local vector memory based on get_subblockid(). - // TILE_UP_DOWN means split MxN tile into-> [M/2xN, M/2xN]. 
- TPOP(mPipe, vecTileHalf); - - size_t biasOffset = - static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; - GlobalBias globalBias(bias + biasOffset); - - TLOAD(biasTile, globalBias); - - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - TADD(outTile, vecTileHalf, biasTile); - - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - - size_t outOffset = - static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; - GlobalOut globalOut(out + outOffset); - // Store this vector subcore's output half-tile from local vector memory back to its GM output slice. - TSTORE(globalOut, outTile); - - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - } - - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } -} -#endif - -void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem, - void *stream) -{ -#ifdef TPUSHPOP_SANITY_ONLY - (void)ffts; - (void)bias; - (void)fifoMem; - runSanityMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(out), reinterpret_cast(srcA), reinterpret_cast(srcB)); -#else - runTPushPopMatmulAdd<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(srcB), reinterpret_cast(bias), reinterpret_cast(fifoMem)); -#endif -} diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md b/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md deleted file mode 100644 index 45adb9fc..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp_simple/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Simple Cube To Vector `TPUSH`/`TPOP` Example - -This is a stripped-down sibling of `mix-kernel_cpp`. 
- -The kernel is fixed to a single `16x32 @ 32x32` matmul, followed by a bias add on the vector side: - -- no tile loop -- no sanity mode -- no extra runner configuration - -Run it with: - -```bash -python run.py -``` diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp deleted file mode 100644 index fbe697f4..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp_simple/caller.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef KERNEL_CPP -#error "KERNEL_CPP must be defined at compile time." -#endif - -#include - -extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); - -#include KERNEL_CPP - -extern "C" void call_kernel( - uint32_t blockDim, - void *stream, - uint8_t *out, - uint8_t *srcA, - uint8_t *srcB, - uint8_t *bias, - uint8_t *fifoMem) -{ - void *fftsAddr = nullptr; - uint32_t fftsLen = 0; - (void)blockDim; - (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); - (void)fftsLen; - - LaunchTPushPopMatmulAdd(reinterpret_cast(fftsAddr), out, srcA, srcB, bias, fifoMem, stream); -} diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh b/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh deleted file mode 100644 index 0d8d8eb7..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp_simple/compile.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PTO_INCLUDE_PATH="${PTO_INCLUDE_PATH:-/sources/pto-isa/include/}" -LIB_PATH="${SCRIPT_DIR}/lib.so" - -if [[ ! -d "${PTO_INCLUDE_PATH}" ]]; then - if [[ -n "${PTO_LIB_PATH:-}" && -d "${PTO_LIB_PATH}/include" ]]; then - PTO_INCLUDE_PATH="${PTO_LIB_PATH}/include" - elif [[ -n "${ASCEND_TOOLKIT_HOME:-}" && -d "${ASCEND_TOOLKIT_HOME}/include" ]]; then - PTO_INCLUDE_PATH="${ASCEND_TOOLKIT_HOME}/include" - else - echo "Could not find PTO headers. Set PTO_INCLUDE_PATH, PTO_LIB_PATH, or ASCEND_TOOLKIT_HOME." 
>&2 - exit 1 - fi -fi - -rm -f "${LIB_PATH}" - -bisheng \ - -I"${PTO_INCLUDE_PATH}" \ - -fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 \ - -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \ - -xcce -Xhost-start -Xhost-end \ - -mllvm -cce-aicore-stack-size=0x8000 \ - -mllvm -cce-aicore-function-stack-size=0x8000 \ - -mllvm -cce-aicore-record-overflow=true \ - -mllvm -cce-aicore-addr-transform \ - -mllvm -cce-aicore-dcci-insert-for-scalar=false \ - --npu-arch=dav-2201 -DMEMORY_BASE \ - -DKERNEL_CPP="\"${SCRIPT_DIR}/kernel.cpp\"" \ - "${SCRIPT_DIR}/caller.cpp" \ - -o "${LIB_PATH}" - -echo "Built ${LIB_PATH}." diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp b/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp deleted file mode 100644 index 02336d41..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp_simple/kernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* -Flow: -1. Cube loads A and B from GM through GlobalTensor views. -2. Cube copies those GM-backed matrix tiles into local matrix tiles: - `aMat` at `0x0`, `bMat` at `0x20000`, then converts them to matmul inputs - `aTile` and `bTile` and runs one `TMATMUL` into `acc`. -3. Cube `TPUSH`es the full `16x32` accumulator tile to the C2V pipe. -4. Vector `TPOP`s its `8x32` half-tile from that pushed accumulator, loads the - matching `8x32` bias tile from GM, does `TADD`, and stores the result to GM. - -Allocation summary: -- `GlobalTensor` objects are just GM views over `srcA`, `srcB`, `bias`, and `out`. - They do not allocate local on-core memory themselves. -- The C2V FIFO is also explicit GM memory in this example: `fifoMem` is the GM slot - buffer passed into `TPipe`, so cube writes the pushed accumulator tile into GM and - vector reads it back from that same GM-backed FIFO. -- Cube local tiles: - `aMat @ 0x0`, `bMat @ 0x20000`, `aTile @ 0x0`, `bTile @ 0x0`, `acc @ 0x0`. -- Vector local tiles: - `biasTile @ 0x10000`, `outTile @ 0x20000`. 
-- The cross-core transfer is the matmul result: one full `AccTile` - produced on cube and split `up/down` so each vector subcore receives one `8x32` - row half via `TPOP`. -*/ -#include -#include - -using namespace pto; - -using In = half; -using Out = float; - -constexpr uint32_t M = 16; -constexpr uint32_t K = 32; -constexpr uint32_t N = 32; -constexpr uint32_t VEC_CORES = 2; -constexpr uint32_t VEC_M = M / VEC_CORES; - -#ifdef __DAV_CUBE__ -constexpr bool DAV_CUBE = true; -#else -constexpr bool DAV_CUBE = false; -#endif - -#ifdef __DAV_VEC__ -constexpr bool DAV_VEC = true; -#else -constexpr bool DAV_VEC = false; -#endif - -__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts, __gm__ Out *out, __gm__ In *srcA, __gm__ In *srcB, - __gm__ Out *bias, __gm__ Out *fifoMem) -{ - set_ffts_base_addr((uint64_t)ffts); - - using GlobalA = GlobalTensor, Stride>; - using GlobalB = GlobalTensor, Stride>; - using GlobalBias = GlobalTensor, Stride>; - using GlobalOut = GlobalTensor, Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - using VecTile = Tile; - - using Pipe = TPipe<0, Direction::DIR_C2V, M * N * sizeof(Out), 2>; - Pipe pipe((__gm__ void *)(uint64_t)fifoMem, 0x0, 0x0); - - if constexpr (DAV_CUBE) { - TileMatA aMat; - TileMatB bMat; - LeftTile aTile; - RightTile bTile; - AccTile acc; - TASSIGN(aMat, 0x0); - TASSIGN(bMat, 0x20000); - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(acc, 0x0); - - GlobalA globalA(srcA); - GlobalB globalB(srcB); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - TLOAD(aMat, globalA); - TLOAD(bMat, globalB); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - TMOV(aTile, aMat); - TMOV(bTile, bMat); - - 
set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - TMATMUL(acc, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - TPUSH(pipe, acc); - - pipe_barrier(PIPE_ALL); - } - - if constexpr (DAV_VEC) { - VecTile popped; - VecTile biasTile; - VecTile outTile; - TASSIGN(biasTile, 0x10000); - TASSIGN(outTile, 0x20000); - - uint32_t subBlock = get_subblockid(); - uint32_t offset = subBlock * VEC_M * N; - GlobalBias globalBias(bias + offset); - GlobalOut globalOut(out + offset); - - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - TPOP(pipe, popped); - TLOAD(biasTile, globalBias); - - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - TADD(outTile, popped, biasTile); - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(globalOut, outTile); - - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } -} - -void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem, - void *stream) -{ - runTPushPopMatmulAdd<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(srcB), reinterpret_cast(bias), reinterpret_cast(fifoMem)); -} diff --git a/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py b/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py deleted file mode 100644 index e098ddeb..00000000 --- a/examples/aot/tpushpop/mix-kernel_cpp_simple/run.py +++ /dev/null @@ -1,72 +0,0 @@ -import ctypes -import os -import subprocess - -import torch -import torch_npu # noqa: F401 - -from ptodsl.test_util import get_test_device - -THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -LIB_PATH = 
os.path.join(THIS_DIR, "lib.so") -M = 16 -K = 32 -N = 32 -FIFO_ELEMS = 1024 -ATOL = 5e-2 -RTOL = 5e-2 - - -def ptr(tensor: torch.Tensor) -> ctypes.c_void_p: - return ctypes.c_void_p(tensor.data_ptr()) - - -def main() -> None: - subprocess.run(["bash", "compile.sh"], check=True, cwd=THIS_DIR) - - device = get_test_device() - torch.npu.set_device(device) - torch.manual_seed(0) - - lib = ctypes.CDLL(LIB_PATH) - lib.call_kernel.argtypes = [ - ctypes.c_uint32, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ] - lib.call_kernel.restype = None - - a = torch.randn((M, K), dtype=torch.float16, device=device) - b = torch.randn((K, N), dtype=torch.float16, device=device) - bias = torch.randn((M, N), dtype=torch.float32, device=device) - out = torch.zeros((M, N), dtype=torch.float32, device=device) - fifo = torch.zeros((FIFO_ELEMS,), dtype=torch.float32, device=device) - - lib.call_kernel( - 1, - torch.npu.current_stream()._as_parameter_, - ptr(out), - ptr(a), - ptr(b), - ptr(bias), - ptr(fifo), - ) - torch.npu.synchronize() - - ref = a.float().cpu() @ b.float().cpu() + bias.cpu() - out_cpu = out.cpu() - max_abs = float((out_cpu - ref).abs().max().item()) - print(f"max_abs={max_abs:.6f}") - - if not torch.allclose(out_cpu, ref, atol=ATOL, rtol=RTOL): - raise SystemExit("validation failed") - - print("validation passed") - - -if __name__ == "__main__": - main() diff --git a/refs/tpushpop_cv.cpp b/refs/tpushpop_cv.cpp deleted file mode 100644 index 324ade79..00000000 --- a/refs/tpushpop_cv.cpp +++ /dev/null @@ -1,290 +0,0 @@ -#include -#include - -using namespace pto; - -#define VEC_CORES 2 - -using ExampleInT = half; -using ExampleOutT = float; -constexpr uint32_t EXAMPLE_TOTAL_M = 16; -constexpr uint32_t EXAMPLE_CASE_TILE_M = 16; -constexpr uint32_t EXAMPLE_TILE_K = 32; -constexpr uint32_t EXAMPLE_TILE_N = 32; - -#ifdef __DAV_CUBE__ -constexpr bool DAV_CUBE = true; -#else -constexpr bool DAV_CUBE = false; 
-#endif - -#ifdef __DAV_VEC__ -constexpr bool DAV_VEC = true; -#else -constexpr bool DAV_VEC = false; -#endif - -template -AICORE constexpr inline T CeilAlign(T num_1, T num_2) -{ - if (num_2 == 0) { - return 0; - } - return (num_1 + num_2 - 1) / num_2 * num_2; -} - -#ifdef TPUSHPOP_SANITY_ONLY -__global__ AICORE void runSanityMatmul(__gm__ ExampleOutT *out, __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB) -{ - constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); - constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_TOTAL_M, 16); - constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); - constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); - - using GlobalA = - GlobalTensor, - pto::Stride>; - using GlobalB = - GlobalTensor, - pto::Stride>; - using GlobalOut = - GlobalTensor, - pto::Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - using AccTile = TileAcc; - - if constexpr (DAV_CUBE) { - TileMatA aMatTile; - TileMatB bMatTile; - LeftTile aTile; - RightTile bTile; - AccTile accTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(accTile, 0x0); - - GlobalA globalA(srcA); - GlobalB globalB(srcB); - GlobalOut globalOut(out); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - TLOAD(aMatTile, globalA); - TLOAD(bMatTile, globalB); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - TMATMUL(accTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - 
TSTORE(globalOut, accTile); - - pipe_barrier(PIPE_ALL); - } -} -#else -__global__ AICORE void runTPushPopMatmulAdd(__gm__ uint64_t *ffts_addr, __gm__ ExampleOutT *out, - __gm__ ExampleInT *srcA, __gm__ ExampleInT *srcB, - __gm__ ExampleOutT *bias, __gm__ ExampleOutT *fifoMem) -{ - set_ffts_base_addr((uint64_t)ffts_addr); - constexpr uint32_t NUM_M_TILES = EXAMPLE_TOTAL_M / EXAMPLE_CASE_TILE_M; - constexpr uint32_t VEC_M = EXAMPLE_CASE_TILE_M / VEC_CORES; - - constexpr uint16_t FLAG_ID = 0; - constexpr uint8_t FIFO_DEPTH = 2; - constexpr uint8_t FIFO_PERIOD = 1; - // local fifo base used for TPOP of vector side(vecTileHalf) - constexpr uint32_t localFiFoBase = 0x0; - - using AccTile = TileAcc; - using VecTileHalf = - Tile; - using BiasTile = - Tile; - using OutTile = - Tile; - - using MatPipe = TPipe; - MatPipe mPipe((__gm__ void *)(uint64_t)fifoMem, 0x0, localFiFoBase); - - constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(ExampleInT); - constexpr uint32_t ALIGNED_M = CeilAlign(EXAMPLE_CASE_TILE_M, 16); - constexpr uint32_t ALIGNED_K = CeilAlign(EXAMPLE_TILE_K, blockAlign); - constexpr uint32_t ALIGNED_N = CeilAlign(EXAMPLE_TILE_N, blockAlign); - - using GlobalA = - GlobalTensor, - pto::Stride>; - using GlobalB = - GlobalTensor, - pto::Stride>; - using GlobalBias = - GlobalTensor, - pto::Stride>; - using GlobalOut = - GlobalTensor, - pto::Stride>; - - using TileMatA = Tile; - using TileMatB = Tile; - using LeftTile = TileLeft; - using RightTile = TileRight; - - if constexpr (DAV_CUBE) { - TileMatA aMatTile; - TileMatB bMatTile; - TASSIGN(aMatTile, 0x0); - TASSIGN(bMatTile, 0x20000); - - LeftTile aTile; - RightTile bTile; - AccTile accTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(accTile, 0x0); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { - GlobalA globalA(srcA + m_tile * EXAMPLE_CASE_TILE_M * 
EXAMPLE_TILE_K); - GlobalB globalB(srcB); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - TLOAD(aMatTile, globalA); - TLOAD(bMatTile, globalB); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - - TMATMUL(accTile, aTile, bTile); - - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - TPUSH(mPipe, accTile); - - set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - } - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } - - if constexpr (DAV_VEC) { - VecTileHalf vecTileHalf; - BiasTile biasTile; - OutTile outTile; - TASSIGN(biasTile, 0x10000); - TASSIGN(outTile, 0x20000); - - uint32_t subBlockIdx = get_subblockid(); - - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - for (int m_tile = 0; m_tile < NUM_M_TILES; m_tile++) { - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - TPOP(mPipe, vecTileHalf); - - size_t biasOffset = - static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; - GlobalBias globalBias(bias + biasOffset); - - TLOAD(biasTile, globalBias); - - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - TADD(outTile, vecTileHalf, biasTile); - - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - - size_t outOffset = - static_cast(m_tile * EXAMPLE_CASE_TILE_M + subBlockIdx * VEC_M) * EXAMPLE_TILE_N; - GlobalOut globalOut(out + outOffset); - TSTORE(globalOut, 
outTile); - - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - } - - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } -} -#endif - -void LaunchTPushPopMatmulAdd(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *srcB, uint8_t *bias, uint8_t *fifoMem, - void *stream) -{ -#ifdef TPUSHPOP_SANITY_ONLY - (void)ffts; - (void)bias; - (void)fifoMem; - runSanityMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(out), reinterpret_cast(srcA), reinterpret_cast(srcB)); -#else - runTPushPopMatmulAdd<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(srcB), reinterpret_cast(bias), reinterpret_cast(fifoMem)); -#endif -} diff --git a/refs/tpushpop_vc.cpp b/refs/tpushpop_vc.cpp deleted file mode 100644 index 69672e57..00000000 --- a/refs/tpushpop_vc.cpp +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include - -using namespace pto; - -#ifdef __DAV_CUBE__ -constexpr bool DAV_CUBE = true; -#else -constexpr bool DAV_CUBE = false; -#endif - -#ifdef __DAV_VEC__ -constexpr bool DAV_VEC = true; -#else -constexpr bool DAV_VEC = false; -#endif - -template -AICORE constexpr inline T CeilAlign(T num_1, T num_2) -{ - if (num_2 == 0) { - return 0; - } - return (num_1 + num_2 - 1) / num_2 * num_2; -} - -template -__global__ AICORE void runTPushPopVCMatmul(__gm__ uint64_t *ffts_addr, __gm__ OutT *out, __gm__ InT *srcA, - __gm__ QuantT *quantB, __gm__ OutT *scale, __gm__ OutT *offset, - __gm__ OutT *fifoMem) -{ - set_ffts_base_addr((uint64_t)ffts_addr); - constexpr uint32_t TILE_K = CASE_TILE_K; - constexpr uint32_t HALF_TILE_K = TILE_K / 2; - constexpr uint32_t TILE_N = N; - constexpr uint32_t NUM_K_TILES = TOTAL_K / CASE_TILE_K; - - constexpr uint16_t FLAG_ID = 0; - constexpr uint8_t FIFO_DEPTH = 2; - constexpr uint8_t FIFO_PERIOD = 1; - // fifo base used for TPOP of cube side (bMatTile) - constexpr uint32_t localFiFoBase = 0x20000; - - using VecTileProd = Tile; - using 
MatTileCons = - Tile; - - using MatPipe = TPipe; - MatPipe mPipe((__gm__ void *)fifoMem, 0x0, localFiFoBase); - - constexpr uint32_t blockAlign = C0_SIZE_BYTE / sizeof(InT); - constexpr uint32_t ALIGNED_M = CeilAlign(TOTAL_M, 16); - constexpr uint32_t ALIGNED_K = CeilAlign(TILE_K, blockAlign); - constexpr uint32_t ALIGNED_N = CeilAlign(TILE_N, blockAlign); - - using GlobalA = GlobalTensor, - pto::Stride>; - using GlobalOut = GlobalTensor, - pto::Stride>; - - using TileMatA = - Tile; - using LeftTile = TileLeft; - using PopTile = - Tile; - using RightTile = TileRight; - using AccTile = TileAcc; - - using QuantTile = Tile; - using ScaleTile = Tile; - using OffsetTile = Tile; - - if constexpr (DAV_VEC) { - QuantTile quantTile; - VecTileProd dequantTile; - ScaleTile scaleTile(HALF_TILE_K, 1); - OffsetTile offsetTile(HALF_TILE_K, 1); - TASSIGN(quantTile, 0x0); - TASSIGN(dequantTile, 0x10000); - TASSIGN(scaleTile, 0x20000); - TASSIGN(offsetTile, 0x28000); - - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - using GlobalQuantB = - GlobalTensor, - pto::Stride>; - using GlobalScaleOffset = - GlobalTensor, pto::Stride>; - - uint32_t subBlockIdx = get_subblockid(); - - for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) { - GlobalQuantB globalQuantB(quantB + k_tile * TILE_K * TILE_N + subBlockIdx * HALF_TILE_K * TILE_N); - GlobalScaleOffset globalScale(scale + k_tile * TILE_K + subBlockIdx * HALF_TILE_K); - GlobalScaleOffset globalOffset(offset + k_tile * TILE_K + subBlockIdx * HALF_TILE_K); - - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - TLOAD(quantTile, globalQuantB); - TLOAD(scaleTile, globalScale); - TLOAD(offsetTile, globalOffset); - - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - TDEQUANT(dequantTile, quantTile, scaleTile, offsetTile); - - set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, 
PIPE_MTE3, EVENT_ID0); - - TPUSH(mPipe, dequantTile); - set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - } - - wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); - wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } - - if constexpr (DAV_CUBE) { - TileMatA aMatTile; - PopTile bMatTile; - TASSIGN(aMatTile, 0x0); - - LeftTile aTile; - RightTile bTile; - AccTile accTile; - TASSIGN(aTile, 0x0); - TASSIGN(bTile, 0x0); - TASSIGN(accTile, 0x0); - - typename MatPipe::Consumer cons; - - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - for (int k_tile = 0; k_tile < NUM_K_TILES; k_tile++) { - GlobalA globalA(srcA + k_tile * TILE_K); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - TLOAD(aMatTile, globalA); - - TPOP(mPipe, bMatTile); - - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - TMOV(aTile, aMatTile); - TMOV(bTile, bMatTile); - - set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); - - set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - - if (k_tile == 0) { - TMATMUL(accTile, aTile, bTile); - } else { - TMATMUL_ACC(accTile, accTile, aTile, bTile); - } - - set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - } - - set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); - - GlobalOut globalOut(out); - TSTORE(globalOut, accTile); - - wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); - wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); - - pipe_barrier(PIPE_ALL); - } -} - -template -void LaunchTPushPopVCMatmul(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t *fifoMem, void *stream) -{ - if constexpr (tilingKey == 1) { - runTPushPopVCMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), - reinterpret_cast(fifoMem)); - } else if constexpr 
(tilingKey == 2) { - runTPushPopVCMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), - reinterpret_cast(fifoMem)); - } else if constexpr (tilingKey == 3) { - runTPushPopVCMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), - reinterpret_cast(fifoMem)); - } else if constexpr (tilingKey == 4) { - runTPushPopVCMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), - reinterpret_cast(fifoMem)); - } else if constexpr (tilingKey == 5) { - runTPushPopVCMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), - reinterpret_cast(fifoMem)); - } else if constexpr (tilingKey == 6) { - runTPushPopVCMatmul<<<1, nullptr, stream>>>( - reinterpret_cast(ffts), reinterpret_cast(out), reinterpret_cast(srcA), - reinterpret_cast(quantB), reinterpret_cast(scale), reinterpret_cast(offset), - reinterpret_cast(fifoMem)); - } -} - -template void LaunchTPushPopVCMatmul<1>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t *fifoMem, void *stream); -template void LaunchTPushPopVCMatmul<2>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t *fifoMem, void *stream); -template void LaunchTPushPopVCMatmul<3>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t *fifoMem, void *stream); -template void LaunchTPushPopVCMatmul<4>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t 
*fifoMem, void *stream); -template void LaunchTPushPopVCMatmul<5>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t *fifoMem, void *stream); -template void LaunchTPushPopVCMatmul<6>(uint8_t *ffts, uint8_t *out, uint8_t *srcA, uint8_t *quantB, uint8_t *scale, - uint8_t *offset, uint8_t *fifoMem, void *stream); \ No newline at end of file From 57e30c0d313a031bf752ae28c5e667abd81dbdf5 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 11:22:08 +0000 Subject: [PATCH 20/38] test: add old and new --- tests/frontend/test_multifunc_ir.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py index 90fb29f7..c6533ef0 100644 --- a/tests/frontend/test_multifunc_ir.py +++ b/tests/frontend/test_multifunc_ir.py @@ -7,6 +7,11 @@ def meta_data(): return {"ptr_ty": ptr_ty} +@to_ir_module(meta_data=meta_data) +def single_kernel(arg0: "ptr_ty") -> None: + pass + + @to_ir_module(meta_data=meta_data, module=True) def build_module(): @pto.func(kernel="vector") @@ -18,10 +23,17 @@ def entry(arg0: "ptr_ty") -> None: pto.call(worker, arg0) -def test_multifunc_builder_shapes_module(): +def test_old_single_function_builder(): + text = str(single_kernel) + assert "func.func @single_kernel" in text + assert text.count("func.func @") == 1 + assert "func.call" not in text + + +def test_new_multi_function_builder(): text = str(build_module) assert "func.func @worker" in text assert "pto.kernel_kind = #pto.kernel_kind" in text assert "func.func @entry" in text assert "attributes {pto.entry}" in text - assert "func.call @worker" in text + assert "call @worker" in text From 7182e8aba82b3b86bab150a680b0208d12c9b175 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 11:22:54 +0000 Subject: [PATCH 21/38] feat: remove docs --- 
.../aot/tpushpop/mix-kernel_mlir/pto_docs.md | 822 ------------------ 1 file changed, 822 deletions(-) delete mode 100644 examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md diff --git a/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md b/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md deleted file mode 100644 index 394b2798..00000000 --- a/examples/aot/tpushpop/mix-kernel_mlir/pto_docs.md +++ /dev/null @@ -1,822 +0,0 @@ -# TPUSH/TPOP 前端接口与 PTOAS 实现设计 - -## 1. 文档范围 - -本文定义PTOAS TPUSH/TPOP 前端IR接口,以及其在 PTOAS 内部的 lowering、地址传播、flag 分配和 EmitC 映射规则。 - -本文覆盖两层接口: - -- 前端接口 - - `pto.aic_initialize_pipe` - - `pto.aiv_initialize_pipe` - - `pto.tpush_to_aiv` - - `pto.tpush_to_aic` - - `pto.tpop_from_aic` - - `pto.tpop_from_aiv` - - `pto.tfree_from_aic` - - `pto.tfree_from_aiv` - - `pto.reserve_buffer` - - `pto.import_reserved_buffer` -- PTOAS 内部统一接口 - - `pto.initialize_l2g2l_pipe` - - `pto.initialize_l2l_pipe` - - `pto.tpush` - - `pto.declare_tile` - - `pto.tpop` - - `pto.tfree` - -本文只描述接口契约与编译流程,不展开具体 C++ 模板实现细节。 - -## 2. 设计目标 - -本设计的目标如下: - -- 对前端提供\*\_initialize_pipe/tpush_to_\*/tpop_from_\*/tfree_from_\*IR接口。 -- 在 PTOAS 内部统一为 pipe/tpush/tpop/tfree 指令,便于复用已有 pass。 -- 支持 A2/A3 与 A5 两个平台使用同一套前端接口。 -- 定义consumer slot buffer的分配地址与producer之间的匹配关系,并传播。 - -## 3. 
前端 IR 接口定义 - -### 3.1 `pto.aic_initialize_pipe` - -#### 语义 - -由 Cube kernel 在函数启动时调用,初始化该函数涉及的通信 pipe。 - -#### 语法 - -```mlir -pto.aic_initialize_pipe( - DIR_MASK, - SLOT_SIZE, - GM_SLOT_BUFFER, - C2V_CONSUMER_BUF, - V2C_CONSUMER_BUF) -``` - -#### 参数 - -| 参数 | 类型 | 说明 | -|---|---|---| -| `DIR_MASK` | 编译期整数常量 | `1`、`2` 或 `3` | -| `SLOT_SIZE` | 编译期整数常量 | 单 slot 字节数,定义为切分前完整 tile 字节数 | -| `GM_SLOT_BUFFER` | GM 地址或空值 | A2/A3 路径使用,A5 路径为空 | -| `C2V_CONSUMER_BUF` | `i32` | C2V 方向 consumer 的 local slot buffer 基址 | -| `V2C_CONSUMER_BUF` | `i32` | V2C 方向 consumer 的 local slot buffer 基址 | - -### 3.2 `pto.aiv_initialize_pipe` - -#### 语义 - -由 Vector kernel 在函数启动时调用,初始化该函数涉及的通信 pipe。 - -#### 语法 - -```mlir -pto.aiv_initialize_pipe( - DIR_MASK, - SLOT_SIZE, - GM_SLOT_BUFFER, - C2V_CONSUMER_BUF, - V2C_CONSUMER_BUF) -``` - -参数语义与 `pto.aic_initialize_pipe` 相同。 - -### 3.3 前端数据传输接口 - -#### `pto.tpush_to_aiv` - -```mlir -pto.tpush_to_aiv(%tile) { split = 0 } -``` - -- 仅出现在 Cube kernel 中 -- 表示 C2V 方向 producer push - -#### `pto.tpush_to_aic` - -```mlir -pto.tpush_to_aic(%tile) { split = 0 } -``` - -- 仅出现在 Vector kernel 中 -- 表示 V2C 方向 producer push - -#### `pto.tpop_from_aic` - -```mlir -%tile = pto.tpop_from_aic { split = 0 } -> !pto.tile_buf<...> -``` - -- 仅出现在 Vector kernel 中 -- 表示 C2V 方向 consumer pop - -#### `pto.tpop_from_aiv` - -```mlir -%tile = pto.tpop_from_aiv { split = 0 } -> !pto.tile_buf<...> -``` - -- 仅出现在 Cube kernel 中 -- 表示 V2C 方向 consumer pop - -#### `pto.tfree_from_aic` - -```mlir -pto.tfree_from_aic { split = 0 } -``` - -- 仅出现在 Vector kernel 中 -- 表示 C2V 方向 consumer free - -#### `pto.tfree_from_aiv` - -```mlir -pto.tfree_from_aiv { split = 0 } -``` - -- 仅出现在 Cube kernel 中 -- 表示 V2C 方向 consumer free - -以上前端数据传输接口中的 `split` 均为编译期常量属性,不是运行时 SSA operand。 - -- 取值使用 `TileSplitAxis` 枚举语义:`0/1/2` 分别对应 `TILE_NO_SPLIT`、`TILE_UP_DOWN`、`TILE_LEFT_RIGHT` -- lowering 到 PTOAS 内部 IR 时,`split` 继续以属性形式保留 - -### 3.4 地址提示接口 - -#### `pto.reserve_buffer` - -用于在当前函数内声明一块 consumer slot buffer 
预留空间。其合法写法由 -当前编译流程是否启用 local address planning 决定。 - -```mlir -%buf = pto.reserve_buffer { - name = "c2v_slot_buffer", - size = 2048, - location = #pto.address_space, - auto = true -} -> i32 -``` - -或使用显式地址: - -```mlir -%buf = pto.reserve_buffer { - name = "c2v_slot_buffer", - size = 2048, - location = #pto.address_space, - auto = false, - base = 4096 -} -> i32 -``` - -#### 参数 - -| 参数 | 类型 | 说明 | -|---|---|---| -| `name` | 字符串属性 | 本函数内唯一的预留段名字 | -| `size` | 整数属性 | 预留字节数 | -| `location` | 地址空间属性 | 预留空间所在 local 地址空间 | -| `auto` | `bool` 属性 | 地址解析路径标志;`true` 表示地址由 PTOAS 地址规划路径分配,`false` 表示地址已在输入 IR 中显式给定 | -| `base` | 可选整数属性 | 显式起始地址;仅 manual 路径使用 | - -#### 结果 - -- 结果类型为 `i32` -- 结果值表示该 buffer 当前可用的基址 -- 当前可用基址可来自显式 `base`,也可来自 plan memory 回填后的解析地址 -- 在当前约束下,每个函数最多一条 `reserve_buffer` -- 编译路径与 `auto` 的合法组合只有两种: - - 启用 local address planning:`auto = true`,且不带 `base` - - 跳过 local address planning:`auto = false`,且显式提供 `base` - -#### `pto.import_reserved_buffer` - -用于引用 peer function 中已经定义的 `reserve_buffer` 结果。 - -```mlir -%buf = pto.import_reserved_buffer { - name = "c2v_slot_buffer", - peer_func = @vector_kernel -} -> i32 -``` - -#### 参数 - -| 参数 | 类型 | 说明 | -|---|---|---| -| `name` | 字符串属性 | peer 侧 `reserve_buffer` 的名字 | -| `peer_func` | symbol ref | peer 函数符号 | - -#### 结果 - -- 结果类型为 `i32` -- 结果值表示从 peer `reserve_buffer` 导入的已解析基址 - -### 3.5 前端层约束 - -前端 IR 需满足以下约束: - -- 每个 Cube function 最多一条 `pto.aic_initialize_pipe` -- 每个 Vector function 最多一条 `pto.aiv_initialize_pipe` -- 每个函数内最多一条 C2V 逻辑 pipe 和一条 V2C 逻辑 pipe -- 每个函数最多一条 `reserve_buffer` -- 每个函数最多一条 `import_reserved_buffer` -- `DIR_MASK` 只允许 `1`、`2`、`3` -- `SLOT_SIZE > 0` -- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM` -- C2V consumer 的 `reserve_buffer.location` 必须是 `VEC` -- V2C consumer 的 `reserve_buffer.location` 必须是 `MAT` -- `reserve_buffer.name` 在本函数内必须唯一 -- op 级约束:`reserve_buffer.auto = false` 时必须提供 `base` -- op 级约束:`reserve_buffer.auto = true` 时必须不提供 `base` -- 启用 local address planning 的编译流程:`reserve_buffer` 只允许 
`auto = true` -- 跳过 local address planning 的编译流程:`reserve_buffer` 只允许 `auto = false` 且显式提供 `base` -- `import_reserved_buffer` 必须能在 `peer_func` 中找到同名 `reserve_buffer` - -## 4. 核心约定 - -### 4.1 逻辑 pipe - -本文中的“逻辑 pipe”指一条单向通信通道。 - -- C2V:Cube producer -> Vector consumer -- V2C:Vector producer -> Cube consumer - -`DIR_MASK=3` 表示前端一个同时包含 C2V 和 V2C 的初始化请求,在 PTOAS lowering 后拆成两条单向逻辑 pipe: - -- 一条 `dir_mask = 1` 的 C2V pipe -- 一条 `dir_mask = 2` 的 V2C pipe - -### 4.2 `split` 的角色 - -`split` 使用 `TileSplitAxis` 枚举表达: - -- `TILE_NO_SPLIT` -- `TILE_UP_DOWN` -- `TILE_LEFT_RIGHT` - -在 PTOAS 设计中,`split` 的角色定义为: - -- `split` 是 `tpush/tpop/tfree` 的逐指令执行模式 -- `split` 在 IR 中表示为编译期常量属性,不是运行时 SSA operand -- `split` 不参与pipe 初始化 -- `split` 不参与 plan memory、地址传播、flag 分配 -- PTOAS 将 `split` 作为透明的编译期参数向 EmitC 和底层 pto-isa 透传 - -因此: - -- 同一条逻辑 pipe 上可以出现不同 `split` 的 `tpush/tpop/tfree` -- PTOAS 不要求同一逻辑 pipe 内所有指令使用同一个 `split` -- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证;PTOAS 仅校验 `split` 枚举合法并向下透传 - -### 4.3 `SLOT_SIZE` 的定义 - -`SLOT_SIZE` 的定义固定为: - -- 切分前完整 tile 的字节数 - -即使 `split` 为 `TILE_UP_DOWN` 或 `TILE_LEFT_RIGHT`,`SLOT_SIZE` 仍然表示未切分前的逻辑 tile 总字节数。 - -`split` 只影响底层 `TPUSH/TPOP/TFREE` 的执行方式,不影响 `SLOT_SIZE` 的含义。 - -### 4.4 `SLOT_NUM` 规则 - -`SLOT_NUM` 由 `DIR_MASK` 固定决定: - -- `DIR_MASK = 1` 或 `2`:`SLOT_NUM = 8` -- `DIR_MASK = 3`:拆成两条单向 pipe,且每条 `SLOT_NUM = 4` - -`SLOT_NUM` 不由 `split` 决定。 - -## 5. 
PTOAS 内部 IR 接口定义 - -### 5.1 `!pto.pipe` - -本文设计的内部 `!pto.pipe` 为不透明 handle。 - -`!pto.pipe` 的协议信息由其定义 op 上的属性承载,而不是由 type 参数承载。 - -底层 `pto-isa` 若对 `TPUSH/TPOP` 的模板形态继续演进,不反向约束 `!pto.pipe` 的 type 设计;内部 `!pto.pipe` 仍保持 opaque handle。 - -### 5.2 `pto.initialize_l2g2l_pipe` - -用于 A2/A3 路径。 - -```mlir -%pipe = pto.initialize_l2g2l_pipe { - dir_mask = 1, - slot_size = 512, - slot_num = 8, - local_slot_num = 8 -}(%gm_addr, %local_addr) -> !pto.pipe -``` - -#### 必需属性 - -- `dir_mask` -- `slot_size` -- `slot_num` - -#### 可选属性 - -- `local_slot_num` - - 仅 `initialize_l2g2l_pipe` 承载 - - 表示 GM 路径下 consumer 侧 local slot buffer 的槽数 - - 仅在通过 GM 传递时对底层 `TPipe` 模板参数有意义,不改变 GM FIFO 的 `slot_num` - - 缺省值等于该内部单向 pipe 的 `slot_num` - - 因此当前固定规则下: - - `DIR_MASK=1/2` 直接 lowering 时,`local_slot_num = 8` - - `DIR_MASK=3` 拆成两条单向 pipe 后,每条 `local_slot_num = 4` -- `flag_base` - - 由 PTOAS flag 分配阶段填写 - - frontend lowering 阶段可以缺省 - - EmitC 前必须已经解析为显式常量 - -#### 操作数 - -- `gm_addr` -- `local_addr` - -### 5.3 `pto.initialize_l2l_pipe` - -用于 A5 路径。 - -```mlir -%pipe = pto.initialize_l2l_pipe { - dir_mask = 1, - slot_size = 512, - slot_num = 8 -}(%local_addr) -> !pto.pipe -``` - -#### 必需属性 - -- `dir_mask` -- `slot_size` -- `slot_num` - -#### 可选属性 - -- `flag_base` - - 由 PTOAS flag 分配阶段填写 - - frontend lowering 阶段可以缺省 - - EmitC 前必须已经解析为显式常量 - -#### 操作数 - -- `local_addr` - -### 5.4 `pto.tpush` - -```mlir -pto.tpush(%tile, %pipe) { split = 0 } -``` - -### 5.5 `pto.declare_tile` - -```mlir -%tile = pto.declare_tile -> !pto.tile_buf<...> -``` - -### 5.6 `pto.tpop` - -```mlir -pto.tpop(%tile, %pipe) { split = 0 } -``` - -### 5.7 `pto.tfree` - -```mlir -pto.tfree(%pipe) { split = 0 } -``` - -`split` 在内部 IR 中必须以编译期常量属性形式保留,不能在 lowering 时擦除或降为运行时 operand。 - -## 6. 
前端到内部 IR 的 lowering 规则 - -### 6.1 初始化接口 lowering - -#### A2/A3 - -- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2g2l_pipe` -- 若前端未提供更具体信息,lowering 默认补上 `local_slot_num = slot_num` - -#### A5 - -- `pto.aic_initialize_pipe` 和 `pto.aiv_initialize_pipe` lower 为 `pto.initialize_l2l_pipe` - -### 6.2 `DIR_MASK=1/2` - -- 只生成一条内部 pipe -- `slot_num = 8` -- 对 `initialize_l2g2l_pipe`,`local_slot_num = 8` - -### 6.3 `DIR_MASK=3` - -前端一个 init op 固定拆成两条内部 pipe: - -- `%pipe_c2v`:`dir_mask = 1`,`slot_num = 4` -- `%pipe_v2c`:`dir_mask = 2`,`slot_num = 4` - -若 lowering 为 `initialize_l2g2l_pipe`,则两条内部 pipe 还满足: - -- `%pipe_c2v`:`local_slot_num = 4` -- `%pipe_v2c`:`local_slot_num = 4` - -地址选择规则: - -- `%pipe_c2v` 使用 `C2V_CONSUMER_BUF` -- `%pipe_v2c` 使用 `V2C_CONSUMER_BUF` - -### 6.4 前端数据传输 op 与内部 pipe 的绑定 - -绑定规则固定如下: - -| 前端 op | 所在函数 | 方向 | 使用的内部 pipe | -|---|---|---|---| -| `tpush_to_aiv` | Cube | C2V | `dir_mask = 1` | -| `tpop_from_aic` | Vector | C2V | `dir_mask = 1` | -| `tfree_from_aic` | Vector | C2V | `dir_mask = 1` | -| `tpush_to_aic` | Vector | V2C | `dir_mask = 2` | -| `tpop_from_aiv` | Cube | V2C | `dir_mask = 2` | -| `tfree_from_aiv` | Cube | V2C | `dir_mask = 2` | - -### 6.5 数据传输 op lowering - -#### `tpush_to_aiv` / `tpush_to_aic` - -lower 为: - -```mlir -pto.tpush(%tile, %pipe) { split = 0 } -``` - -#### `tpop_from_aic` / `tpop_from_aiv` - -lower 为: - -```mlir -%decl = pto.declare_tile -> !pto.tile_buf<...> -pto.tpop(%decl, %pipe) { split = 0 } -``` - -即: - -- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv` 是返回 tile 结果值的接口 -- PTOAS 内部 `pto.tpop` 才是 destination-style 形式,显式接收一个 `pto.declare_tile` 结果作为入参 - -#### `tfree_from_aic` / `tfree_from_aiv` - -lower 为: - -```mlir -pto.tfree(%pipe) { split = 0 } -``` - -## 7. 
`reserve_buffer` 与地址传播 - -### 7.1 设计原则 - -- `reserve_buffer` 只表示本函数 consumer slot buffer 的本地预留 -- `import_reserved_buffer` 只表示对 peer 预留段地址的引用 -- `reserve_buffer` 用属性描述“如何得到地址”,用结果值统一承载“当前可用地址” -- 当前编译流程是否启用 local address planning 与 `reserve_buffer.auto` 共同决定地址处理路径 -- 启用 local address planning:`reserve_buffer` 必须使用 `auto = true`,由 `PlanMemory` 分配地址 -- 跳过 local address planning:`reserve_buffer` 必须使用 `auto = false` 且显式提供 `base`,不再进入 `PlanMemory` 分配路径 -- PTOAS 复用现有 `PlanMemory` pass 实现 `reserve_buffer` 地址确定,不额外增加独立的预分配 pass -- PTOAS 新增独立地址传播 pass,专门处理 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐 -- 地址传播 pass 在 EmitC 之前运行;启用规划时位于 plan memory 之后,跳过规划时直接消费前端已给定地址 - -### 7.2 使用规则 - -#### C2V - -- consumer 是 Vector -- Vector function 需要 `reserve_buffer(location = VEC)` -- Cube function 需要 `import_reserved_buffer(peer_func = @vector_kernel)` - -#### V2C - -- consumer 是 Cube -- Cube function 需要 `reserve_buffer(location = MAT)` -- Vector function 需要 `import_reserved_buffer(peer_func = @cube_kernel)` - -### 7.3 编译路径与地址处理路径 - -对包含 `reserve_buffer` 的函数,PTOAS 按当前编译流程是否启用 local address planning 以及 `auto` 的组合选择地址处理路径: - -- 启用 local address planning + `auto = true` - - 进入 auto 路径 - - 由 `PlanMemory` 为 `reserve_buffer` 分配 `base` - - 随后由 `pto-resolve-reserved-buffers` 传播地址并完成 peer `flag_base` 对齐 -- 跳过 local address planning + `auto = false` + 显式 `base` - - 进入 manual 路径 - - 跳过 `PlanMemory` - - 由 `pto-resolve-reserved-buffers` 直接传播已给定地址并完成 peer `flag_base` 对齐 - -以下组合均非法: - -- 启用 local address planning + `auto = false` -- 跳过 local address planning + `auto = true` - -若函数内不存在 `reserve_buffer`,则保持现有编译流程对 `PlanMemory` 的原始控制行为,不引入额外语义。 - -### 7.4 启用 local address planning 的 auto 路径 - -在启用 local address planning 的编译流程中,`reserve_buffer` 必须使用 `auto = true`,并由 plan memory 负责地址分配。 - -若函数中存在 `reserve_buffer`,则对其 `location` 对应的地址空间执行: - -1. 先按现有逻辑完成普通 local buffer 的 `MemPlan` -2. 再收集该地址空间内已经分配完成的 local 区间 -3. 在剩余空洞中按地址空间对齐要求寻找一段可容纳 `reserve_buffer.size` 的连续区间 -4. 
将该区间起始地址回填为这条唯一 `reserve_buffer` 的 `base` - -即: - -- 普通 `memref.alloc` / tile buffer 等 local 内存仍先由既有 `MemPlan` 按原逻辑分配 -- `reserve_buffer` 不参与普通 local buffer 的 inplace / reuse 规划 -- `reserve_buffer` 在普通 local buffer 分配完成后,再作为独立的一段连续 local 区间进行 hole 分配 -- `reserve_buffer` 不保证位于地址空间起始地址,也不保证形成预留前缀;其语义仅为“在该地址空间中为 consumer slot buffer 找到一段对齐且连续的可用地址” -- 若整体容量足够但 `MemPlan` 结果将空间打散,导致不存在满足大小和对齐要求的连续空洞,则 `reserve_buffer` 分配失败并报错 - -### 7.5 跳过 local address planning 的 manual 路径 - -在跳过 local address planning 的编译流程中: - -- 每个 `reserve_buffer` 必须显式提供 `base` -- PTOAS 只校验 `base` 的基本合法性 -- `PlanMemory` 不参与该函数的 local 地址分配 -- 因此该函数中其他 local buffer 地址也必须已由前端或更前阶段整体确定 -- 地址传播 pass 不做地址分配,只将显式 `base` 传播到 `import_reserved_buffer` - -该 manual 路径的目标是: - -- 保持前端或外部地址规划结果不被 PTOAS 改写 -- 避免 `reserve_buffer` 显式地址与 PTOAS 自动规划结果相互覆盖 - -### 7.6 `import_reserved_buffer` 规则 - -- 不做地址分配 - -### 7.7 地址传播 pass 规则 - -对每个 `import_reserved_buffer`: - -1. 通过 `peer_func` 找到 peer 函数 -2. 在 peer 函数内查找同名 `reserve_buffer` -3. 读取对方已经解析出的 `base` 或其等价结果值 -4. 用该常量地址替换 `import_reserved_buffer` 的结果 - -地址传播完成后: - -- producer 与 consumer 对同一逻辑 pipe 使用同一个 local buffer 地址 -- EmitC 只处理解析后的常量地址,不处理 `import_reserved_buffer` - -#### 7.7.1 pass 落点 - -- PTOAS 增加独立 `ModulePass`:`pto-resolve-reserved-buffers` -- 该 pass 固定运行在 EmitC lowering 之前 -- 启用规划时:运行在 `pto-plan-memory` 之后 -- 跳过规划时:不经过 `pto-plan-memory`,但该 pass 仍会运行 -- 该 pass 不负责地址分配,只消费前一阶段已经确定的 `reserve_buffer.base` - -#### 7.7.2 输入假设 - -- 启用规划时,`reserve_buffer.auto = true`,其 `base` 已由 `PlanMemory` 回填 -- 跳过规划时,`reserve_buffer.auto = false`,其 `base` 已由前端显式给定 -- `import_reserved_buffer.peer_func` 已能解析到合法 peer function -- `import_reserved_buffer.name` 已能在 peer function 中找到唯一匹配的 `reserve_buffer` - -#### 7.7.3 实现流程 - -pass 在模块级按两步执行: - -1. 先建立 peer 对应关系 -2. 
再将 `reserve_buffer` / `import_reserved_buffer` 物化为显式常量地址 - -其中第一步的实现方式是: - -- 遍历模块内所有 `pto.initialize_l2l_pipe` / `pto.initialize_l2g2l_pipe` -- 若其 `local_addr` 来自 `reserve_buffer`,则以“当前函数 + reserve 名字 + dir_mask”识别逻辑 pipe -- 若其 `local_addr` 来自 `import_reserved_buffer`,则以“peer_func + reserve 名字 + dir_mask”识别逻辑 pipe -- 将 peer 两侧引用到同一逻辑 pipe 的内部 init op 归并到同一组 -- 若某条 init 未显式提供 `flag_base`,则其 `local_addr` 必须来自 `reserve_buffer` 或 `import_reserved_buffer` -- 对每个逻辑 pipe 分组,要求必须形成完整 peer init pair:恰好两条 init,且分别来自 peer 两侧函数;若 peer 信息不完整则直接报错 -- 在同一组内,若任一侧已显式提供 `flag_base`,则该值作为该组最终值;若两侧显式值冲突则报错 -- 若同组两侧都未显式提供 `flag_base`,则按默认规则回填: - - 单向场景:`flag_base = 0` - - 双向场景:C2V 组 `flag_base = 0`,V2C 组 `flag_base = 2` -- 所谓“双向场景”,是指同一对 peer 函数之间同时存在 `dir_mask = 1` 和 `dir_mask = 2` 两个逻辑 pipe 分组 -- 完成分组决策后,将最终 `flag_base` 回填到该组内所有尚未显式填写的 init op,保证 peer 两侧一致 - -第二步的实现方式是: - -- 对每个 `reserve_buffer`,读取其已解析 `base` -- 在该 op 位置插入 `arith.constant` -- 用该常量替换 `reserve_buffer` 结果值的全部 uses -- 对每个 `import_reserved_buffer`,通过 `peer_func + name` 找到 peer `reserve_buffer` -- 读取对方已解析 `base` -- 在当前 op 位置插入同值 `arith.constant` -- 用该常量替换 `import_reserved_buffer` 结果值的全部 uses -- 常量替换完成后,删除 `reserve_buffer` / `import_reserved_buffer` - -#### 7.7.4 结果 IR 形态 - -地址传播 pass 之后: - -- IR 中不再保留 `reserve_buffer` / `import_reserved_buffer` -- 内部 pipe init op 的 `local_addr` 只再引用普通 SSA 常量地址 -- 因而后续 EmitC 无需理解 frontend 预留地址语义,只需透传解析后的地址值 - -#### 7.7.5 失败条件 - -若出现以下情况,pass 直接报错: - -- `reserve_buffer.base` 在 pass 运行时仍未解析 -- 启用规划的编译流程却出现 `reserve_buffer.auto = false` -- 跳过规划的编译流程却出现 `reserve_buffer.auto = true` -- `peer_func` 无法解析到函数 -- 在 peer function 中找不到同名 `reserve_buffer` -- 某条未显式提供 `flag_base` 的内部 init,其 `local_addr` 不来自 `reserve_buffer` / `import_reserved_buffer` -- 基于 `reserve_buffer` / `import_reserved_buffer` 建立的某个逻辑 pipe 分组,未形成完整 peer init pair -- peer `flag_base` 已显式给定但两侧取值冲突 - -## 8. 
flag 分配规则 - -### 8.1 总原则 - -- `flag_base` 由 PTOAS flag 分配阶段在内部 init op 上填写 -- 在 flag 分配完成前,内部 init op 可以暂时不携带 `flag_base` -- peer 两侧同一逻辑 pipe 必须使用同一个 `flag_base` - -### 8.2 单向场景 - -当前规划中,当 `DIR_MASK = 1` 或 `2` 且函数内仅有该唯一逻辑 pipe 时,可采用: - -- 该方向唯一逻辑 pipe 的 `flag_base = 0` -- 该 pipe 占用逻辑 flag 对:`0` 和 `1` - -### 8.3 双向场景 - -当前规划中,当 `DIR_MASK = 3` 时,可采用: - -- C2V pipe:`flag_base = 0` -- V2C pipe:`flag_base = 2` - -因此双向固定占用两组逻辑 flag: - -- C2V:`0` / `1` -- V2C:`2` / `3` - -### 8.4 与地址传播的关系 - -地址传播 pass 在识别出 `import_reserved_buffer` 与 `reserve_buffer` 的 peer 对应关系后,同时可以完成 peer pipe 的 `flag_base` 对齐。 - -即: - -- 基于同一 FIFO 通信的两条 peer init op,必须拿到相同的 `flag_base` - -## 9. verifier 规则 - -### 9.1 前端 verifier - -前端 verifier 负责检查: - -- 每个函数 init op 数量是否合法 -- 每个函数 `reserve_buffer` / `import_reserved_buffer` 数量是否合法 -- `DIR_MASK` 取值是否合法 -- `SLOT_SIZE > 0` -- `reserve_buffer.size == SLOT_SIZE * SLOT_NUM` -- `reserve_buffer.location` 与 consumer 函数类型匹配 -- `reserve_buffer.name` 在函数内唯一 -- `reserve_buffer.auto = false` 时必须带 `base` -- `reserve_buffer.auto = true` 时必须不带 `base` -- driver / pipeline 级约束:启用规划的编译流程只接受 `auto = true` -- driver / pipeline 级约束:跳过规划的编译流程只接受 `auto = false` 且显式 `base` -- `import_reserved_buffer` 能在 `peer_func` 中找到同名 `reserve_buffer` -- 方向相关 op 只能出现在合法 kernel 中 -- 前端数据传输 op 的 `split` 必须是合法的编译期常量属性 - -### 9.2 内部 IR verifier - -内部 verifier 负责检查: - -- `slot_size > 0` -- `slot_num` 只允许 `8` 或 `4` -- `DIR_MASK=1/2` 时,`slot_num` 必须与单向/双向 lowering 规则一致 -- `local_slot_num` 若出现,只允许出现在 `pto.initialize_l2g2l_pipe` 上,且必须大于 `0` 且不大于 `slot_num` -- `flag_base` 若出现,必须满足基本合法性;是否已填写以及具体分配值由 flag 分配保证 -- `pto.initialize_l2g2l_pipe` 必须提供 `gm_addr` 和 `local_addr` -- `pto.initialize_l2l_pipe` 必须提供 `local_addr` -- `dir_mask = 1` 的 pipe 只能被 C2V 方向 lowering 使用 -- `dir_mask = 2` 的 pipe 只能被 V2C 方向 lowering 使用 -- `tpush/tpop/tfree` 的 `split` 必须是合法的编译期常量属性 - -### 9.3 关于 `split` 的校验边界 - -PTOAS 对 `split` 的处理边界如下: - -- PTOAS 验证 `split` 是合法枚举值 -- PTOAS 要求 `split` 以编译期常量属性形式出现 -- PTOAS 不验证同一逻辑 pipe 上多个 
`tpush/tpop/tfree` 的 `split` 是否一致 -- PTOAS 不根据 `split` 改变地址分配、flag 分配或 pipe 配对 - -因此: - -- `split` 混用是否语义正确,不是 PTOAS 静态保证项 -- `split` 相关的语义正确性由前端生成逻辑或前端 verifier 保证 -- PTOAS 只负责校验 `split` 枚举值合法,并将其透传到底层 - -## 10. EmitC 与 pto-isa 映射 - -### 10.1 初始化 op - -在进入 EmitC 前: - -- 前端 `pto.aic_initialize_pipe` / `pto.aiv_initialize_pipe` -- 前端 `pto.tpush_to_aiv` / `pto.tpush_to_aic` -- 前端 `pto.tpop_from_aic` / `pto.tpop_from_aiv` -- 前端 `pto.tfree_from_aic` / `pto.tfree_from_aiv` -- `pto.reserve_buffer` / `pto.import_reserved_buffer` - -都必须已经被前序 pass 消除。 - -EmitC 只处理 PTOAS 内部统一 IR,不直接理解前端 pipe 接口或地址提示接口。 - -EmitC 将以下内部 init op 映射到底层 `TPipe`: - -- `pto.initialize_l2l_pipe` -- `pto.initialize_l2g2l_pipe` - -映射时需要使用以下信息: - -- `dir_mask` -- `slot_size` -- `slot_num` -- `local_slot_num` -- `flag_base` -- `gm_addr` -- `local_addr` - -其中: - -- 若 `flag_base` 尚未在 EmitC 前完成填写,PTOAS 应报错。 - -### 10.2 数据传输 op - -EmitC 将以下内部数据传输 op 映射到底层: - -- `pto.tpush` -> `TPUSH` -- `pto.tpop` -> `TPOP` -- `pto.tfree` -> `TFREE` - -映射时需要使用以下信息: - -- `tile` -- `split` -- `pipe` - -其中: - -- `split` 不在 PTOAS 内部解释 -- `split` 作为底层 `TPUSH/TPOP/TFREE` 的编译期模板实参透传 - -### 10.3 InsertSync - -`split` 不影响 PTOAS 中的 pipeline derivation 与 InsertSync 规则。 - -InsertSync 只依赖: - -- op 种类 -- init op 形态 -- `dir_mask` -- 目标架构 - -而不依赖 `split`。 - -## 11. 
编译流程总览 - -完整流程如下: - -```text -前端 IR 接口 - -> lowering pass - -> PTOAS 内部统一 IR - -> plan memory - -> 地址传播 pass - -> EmitC - -> pto-isa C++ 代码 -``` - -其中: - -- lowering pass 负责拆分 `DIR_MASK=3`、绑定方向与 pipe -- 启用规划的编译流程中,plan memory 先按既有逻辑规划普通 local buffer,再为 `reserve_buffer` 在目标地址空间中分配 hole -- 跳过规划的编译流程中,不运行 plan memory;`reserve_buffer.base` 必须已由前端给定 -- 地址传播 pass 负责 `import_reserved_buffer` 常量替换与 peer pipe 的 `flag_base` 对齐 -- EmitC 只负责将内部 `initialize_l2l_pipe` / `initialize_l2g2l_pipe` / `tpush` / `tpop` / `tfree` 及其属性透传到底层 \ No newline at end of file From 48b1eb343d2b10080de9efe8381ce43228ca4698 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 11:24:44 +0000 Subject: [PATCH 22/38] fix: arith import in builder --- examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py index 9535b318..0c0cbdf3 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py @@ -1,5 +1,3 @@ -from mlir.dialects import arith - from ptodsl import pto, tile, to_ir_module from ptodsl import scalar as s From 0c865a187265eb0710c32eb825607371a9fba36b Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Tue, 7 Apr 2026 14:22:05 +0000 Subject: [PATCH 23/38] test: compare to MLIR pybindings --- tests/frontend/test_multifunc_ir.py | 81 ++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py index c6533ef0..1e0a1668 100644 --- a/tests/frontend/test_multifunc_ir.py +++ b/tests/frontend/test_multifunc_ir.py @@ -1,3 +1,15 @@ +from mlir.dialects import func, pto as _pto +from mlir.ir import ( + Attribute, + Context, + FlatSymbolRefAttr, + 
InsertionPoint, + Location, + Module, + Operation, + UnitAttr, +) + from ptodsl import pto, to_ir_module @@ -13,7 +25,7 @@ def single_kernel(arg0: "ptr_ty") -> None: @to_ir_module(meta_data=meta_data, module=True) -def build_module(): +def multi_kernel_module(): @pto.func(kernel="vector") def worker(arg0: "ptr_ty") -> None: pass @@ -23,17 +35,60 @@ def entry(arg0: "ptr_ty") -> None: pto.call(worker, arg0) -def test_old_single_function_builder(): - text = str(single_kernel) - assert "func.func @single_kernel" in text - assert text.count("func.func @") == 1 - assert "func.call" not in text +def build_single_verbose(): + with Context() as ctx, Location.unknown(): + _pto.register_dialect(ctx, load=True) + module = Module.create() + ptr_ty = _pto.PtrType.get(pto.float32) + fn_ty = func.FunctionType.get([ptr_ty], []) + + with InsertionPoint(module.body): + fn = func.FuncOp("single_kernel", fn_ty) + entry = fn.add_entry_block() + + with InsertionPoint(entry): + func.ReturnOp([]) + + module.operation.verify() + return module + + +def build_multi_verbose(): + with Context() as ctx, Location.unknown(): + _pto.register_dialect(ctx, load=True) + module = Module.create() + ptr_ty = _pto.PtrType.get(pto.float32) + fn_ty = func.FunctionType.get([ptr_ty], []) + + with InsertionPoint(module.body): + worker = func.FuncOp("worker", fn_ty) + entry = func.FuncOp("entry", fn_ty) + + worker.operation.attributes["pto.kernel_kind"] = Attribute.parse( + "#pto.kernel_kind" + ) + entry.operation.attributes["pto.entry"] = UnitAttr.get(ctx) + + with InsertionPoint(worker.add_entry_block()): + func.ReturnOp([]) + + entry_block = entry.add_entry_block() + with InsertionPoint(entry_block): + arg0 = entry_block.arguments[0] + Operation.create( + "func.call", + operands=[arg0], + attributes={"callee": FlatSymbolRefAttr.get("worker")}, + ) + func.ReturnOp([]) + + module.operation.verify() + return module + + +def test_old_single_function_builder_matches_raw_mlir(): + assert str(single_kernel) == 
str(build_single_verbose()) -def test_new_multi_function_builder(): - text = str(build_module) - assert "func.func @worker" in text - assert "pto.kernel_kind = #pto.kernel_kind" in text - assert "func.func @entry" in text - assert "attributes {pto.entry}" in text - assert "call @worker" in text +def test_new_multi_function_builder_matches_raw_mlir(): + assert str(multi_kernel_module) == str(build_multi_verbose()) From ea019ed351b218540ab25945743c0efa27fe1e2e Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Wed, 8 Apr 2026 07:40:50 +0000 Subject: [PATCH 24/38] fix: names --- .../{bidirectional_example.mlir => c2v.mlir} | 0 .../{bidirectional_builder.py => c2v_builder.py} | 0 examples/aot/tpushpop/mix-kernel_mlir/compile.sh | 9 +++------ .../{run_bidirectional_example.py => run.py} | 0 4 files changed, 3 insertions(+), 6 deletions(-) rename examples/aot/tpushpop/mix-kernel_mlir/{bidirectional_example.mlir => c2v.mlir} (100%) rename examples/aot/tpushpop/mix-kernel_mlir/{bidirectional_builder.py => c2v_builder.py} (100%) rename examples/aot/tpushpop/mix-kernel_mlir/{run_bidirectional_example.py => run.py} (100%) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir b/examples/aot/tpushpop/mix-kernel_mlir/c2v.mlir similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir rename to examples/aot/tpushpop/mix-kernel_mlir/c2v.mlir diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py rename to examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index 7169a980..41183489 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ 
-3,20 +3,17 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" -MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir" -GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp" +MLIR_PATH="${SCRIPT_DIR}/c2v.mlir" +GENERATED_CPP="${ARTIFACT_DIR}/c2v.cpp" LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" mkdir -p "${ARTIFACT_DIR}" rm -f "${GENERATED_CPP}" "${LIB_PATH}" MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir" -python bidirectional_builder.py > bidir_gen.mlir +python c2v_builder.py > c2v_gen.mlir ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}" -#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}" - - bisheng \ -I/sources/pto-isa/include/ \ diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/run_bidirectional_example.py rename to examples/aot/tpushpop/mix-kernel_mlir/run.py From f9281639a755d5576dd6494a8abb353f47b454a1 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Wed, 8 Apr 2026 07:47:51 +0000 Subject: [PATCH 25/38] fix: naming --- examples/aot/tpushpop/mix-kernel_mlir/compile.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index 41183489..dc7ba7c4 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -10,7 +10,7 @@ LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" mkdir -p "${ARTIFACT_DIR}" rm -f "${GENERATED_CPP}" "${LIB_PATH}" -MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir" +MLIR_GEN_PATH="${SCRIPT_DIR}/c2v_gen.mlir" python c2v_builder.py > c2v_gen.mlir ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}" From 
8ae863510269b9123fb47b9bb2dc15afc3cc41b9 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:03:20 +0000 Subject: [PATCH 26/38] feat: deuglify the wrappers --- ptodsl/api/pto_general.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 1ea2a5c8..00637871 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -106,14 +106,17 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None): # auto = true # } -> i32 def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): + """ + - At most one `pto.reserve_buffer` is expected in one function + - `location` must be a supported local address space + - Op-level verification requires: + - `auto = false` must provide `base` + - `auto = true` must not provide `base` + """ # All params are compile time attributes # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - kwargs = {} - if base is not None: - kwargs["base"] = base - return _pto.ReserveBufferOp( - name, size, _resolve_address_space_attr(location), auto_alloc, **kwargs - ).result + + return _pto.ReserveBufferOp(name, size, _resolve_address_space_attr(location), auto_alloc, base=base).result # %c2v_import = pto.import_reserved_buffer { @@ -135,15 +138,12 @@ def aic_initialize_pipe( ): # wrap # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - kwargs = {} - if gm_slot_buffer is not None: - kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer) return _pto.AicInitializePipeOp( dir_mask, slot_size, c2v_consumer_buf=_unwrap(c2v_consumer_buf), v2c_consumer_buf=_unwrap(v2c_consumer_buf), - **kwargs, + gm_slot_buffer=_unwrap(gm_slot_buffer), ) @@ -162,15 +162,12 @@ def aiv_initialize_pipe( ): # wrap # 
aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation - kwargs = {} - if gm_slot_buffer is not None: - kwargs["gm_slot_buffer"] = _unwrap(gm_slot_buffer) return _pto.AivInitializePipeOp( dir_mask, slot_size, c2v_consumer_buf=_unwrap(c2v_consumer_buf), v2c_consumer_buf=_unwrap(v2c_consumer_buf), - **kwargs, + gm_slot_buffer=_unwrap(gm_slot_buffer), ) From 62db3644d0f2b790638ef80c3962836739704e42 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Wed, 8 Apr 2026 09:45:34 +0000 Subject: [PATCH 27/38] feat: add more examples v2c, c2v, --- .../aot/tpushpop/mix-kernel_mlir/README.md | 45 +++++-- .../mix-kernel_mlir/c2v_add_builder.py | 113 ++++++++++++++++++ .../aot/tpushpop/mix-kernel_mlir/compile.sh | 17 ++- examples/aot/tpushpop/mix-kernel_mlir/run.py | 28 ++++- .../tpushpop/mix-kernel_mlir/v2c_builder.py | 102 ++++++++++++++++ 5 files changed, 285 insertions(+), 20 deletions(-) create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md index a898a57b..cb86533a 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/README.md +++ b/examples/aot/tpushpop/mix-kernel_mlir/README.md @@ -1,17 +1,38 @@ -# Bidirectional `TPUSH`/`TPOP` MLIR Example - -This example mirrors the `mix-kernel_cpp` flow, but starts from -[`bidirectional_example.mlir`](/home/fskogh/pto-dsl/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir). - -The pipeline is: - -1. run `ptoas --pto-arch=a3 bidirectional_example.mlir > build_artifacts/bidirectional_example.cpp` -2. compile the generated C++ together with `caller.cpp` -3. build `./tpushpop_mlir_lib.so` -4. 
launch the generated `pto.entry` kernel from Python +# Cross core communication with `pto.push_to_aiv` example ## Run ```bash -python run_bidirectional_example.py +python run.py c2v +python run.py bidi ``` + +`c2v` is the default, so `python run.py` is the same as `python run.py c2v`. + +## How C2V Communication Works + +This example sends one `16x16 f32` tile from the Cube kernel to the Vector kernel. + +- The host allocates one shared `gm_slot_buffer` and passes it to both kernels. +- The Vector kernel owns the C2V consumer buffer with `pto.reserve_buffer(name = "c2v_fifo")`. +- The Cube kernel refers to that same buffer with `pto.import_reserved_buffer(name = "c2v_fifo")`. +- Both sides call `*_initialize_pipe` with `dir_mask = 1`, which means `C2V`. +- Cube sends with `pto.tpush_to_aiv(...)`. +- Vector receives with `pto.tpop_from_aic(...)` and releases the consumed slot with `pto.tfree_from_aic`. + +In the generated C++, this becomes the same `TPipe<..., Direction::DIR_C2V, ...>` on both sides: + +- Cube: `TPUSH(pipe, acc_tile)` +- Vector: `TPOP(pipe, vec_tile)` then `TFREE(pipe)` + +The important mental model is: `TPUSH`/`TPOP` are the real cross-core handoff, while `gm_slot_buffer` is the shared backing storage that makes the FIFO work. + +## How Bidirectional Works + +`bidi` starts the same way as `c2v`, but adds a return path: + +- Cube computes `x @ x` and sends it to vector over C2V. +- Vector pops that tile, computes `tile + tile`, and pushes the doubled result back over V2C. +- Cube pops the returned tile and writes it to GM. + +The important difference is that both sides initialize with `dir_mask = 3`, so the same mixed-kernel launch can use both directions of the pipe. 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py new file mode 100644 index 00000000..dde0f705 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py @@ -0,0 +1,113 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + + +def meta_data(): + dtype = pto.float32 + ptr_ty = pto.PtrType(dtype) + i32 = pto.int32 + tensor_ty = pto.TensorType(rank=2, dtype=dtype) + tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype) + x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT") + x_left_ty = pto.TileBufType( + shape=[16, 16], + dtype=dtype, + memory_space="LEFT", + config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"), + ) + x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT") + acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC") + vec_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC") + return locals() + + +@to_ir_module(meta_data=meta_data, module=True) +def module(): + @pto.func(kernel="cube") + def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = const(0, type=i32) + c2v_import = pto.import_reserved_buffer( + name="c2v_fifo", + peer_func="@vector_kernel", + ) + + pto.aic_initialize_pipe( + dir_mask=1, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_import, + v2c_consumer_buf=c0_i32, + ) + + x_mat_tile = pto.alloc_tile(x_mat_ty) + x_left_tile = pto.alloc_tile(x_left_ty) + x_right_tile = pto.alloc_tile(x_right_ty) + acc_tile = pto.alloc_tile(acc_ty) + + gm_x_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( + tensor_ty, + ptr=gm_x, + shape=[c16, c16], + strides=[c16, c1], + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + + pto.load(gm_x_tile_view, x_mat_tile) + tile.mov(x_mat_tile, x_left_tile) + 
tile.mov(x_mat_tile, x_right_tile) + tile.matmul(x_left_tile, x_right_tile, acc_tile) + # Debug step: only send cube's result to vector. + pto.tpush_to_aiv(acc_tile, 0) + + @pto.func(kernel="vector") + def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = const(0, type=i32) + c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC") + + pto.aiv_initialize_pipe( + dir_mask=1, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_local, + v2c_consumer_buf=c0_i32, + ) + + gm_y_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( + tensor_ty, + ptr=gm_y, + shape=[c16, c16], + strides=[c16, c1], + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + + doubled_tile = pto.alloc_tile(vec_ty) + recv_tile = pto.tpop_from_aic(vec_ty, 0) + # First isolate the vector-side path: pop, double, store from vector. + tile.add(recv_tile, recv_tile, doubled_tile) + pto.store(doubled_tile, gm_y_tile_view) + pto.tfree_from_aic(0) + + @pto.func(entry=True) + def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.call(cube_kernel, gm_slot_buffer, gm_x) + pto.call(vector_kernel, gm_slot_buffer, gm_y) + + +if __name__ == "__main__": + print(module) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index dc7ba7c4..3761f044 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -3,15 +3,24 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" -MLIR_PATH="${SCRIPT_DIR}/c2v.mlir" -GENERATED_CPP="${ARTIFACT_DIR}/c2v.cpp" +MODE="${TPUSHPOP_MODE:-c2v}" +BUILDER_PATH="${SCRIPT_DIR}/${MODE}_builder.py" +MLIR_GEN_PATH="${SCRIPT_DIR}/${MODE}_gen.mlir" +GENERATED_CPP="${ARTIFACT_DIR}/${MODE}.cpp" LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" +case 
"${MODE}" in + c2v|c2v_add|v2c|bidi) ;; + *) + echo "Unknown TPUSHPOP_MODE: ${MODE}" >&2 + exit 2 + ;; +esac + mkdir -p "${ARTIFACT_DIR}" rm -f "${GENERATED_CPP}" "${LIB_PATH}" -MLIR_GEN_PATH="${SCRIPT_DIR}/c2v_gen.mlir" -python c2v_builder.py > c2v_gen.mlir +python "${BUILDER_PATH}" > "${MLIR_GEN_PATH}" ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}" diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py index 1b619869..a59b527c 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/run.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py @@ -1,3 +1,4 @@ +import argparse import ctypes import os import subprocess @@ -15,17 +16,20 @@ N = 16 ATOL = 1e-4 RTOL = 1e-4 +MODES = ("c2v", "c2v_add", "v2c", "bidi") def torch_to_ctypes(tensor: torch.Tensor) -> ctypes.c_void_p: return ctypes.c_void_p(tensor.data_ptr()) -def compile_example(compile_script: str) -> None: +def compile_example(compile_script: str, mode: str) -> None: + env = dict(os.environ, TPUSHPOP_MODE=mode) subprocess.run( ["bash", compile_script], check=True, cwd=THIS_DIR, + env=env, ) @@ -65,8 +69,24 @@ def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tenso torch.npu.synchronize() +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("mode", nargs="?", choices=MODES, default="c2v") + return parser.parse_args() + + +def reference(mode: str, x: torch.Tensor) -> torch.Tensor: + y = x.cpu() @ x.cpu() + if mode == "c2v": + return y + if mode == "v2c": + return x.cpu() + return 2 * y + + def main() -> None: - compile_example(DEFAULT_COMPILE_SCRIPT) + args = parse_args() + compile_example(DEFAULT_COMPILE_SCRIPT, args.mode) device = get_test_device() torch.npu.set_device(device) @@ -83,7 +103,7 @@ def main() -> None: run_kernel(lib, gm_slot_buffer=gm_slot_buffer, x=x, y=y) print(y) - y_ref = x.cpu() @ x.cpu() + y_ref = reference(args.mode, x) y_cpu = y.cpu() 
print(y_ref-y_cpu) @@ -94,7 +114,7 @@ def main() -> None: if not ok: raise SystemExit(f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}") - print(f"Validation passed using {DEFAULT_LIB_PATH}.") + print(f"Validation passed for mode={args.mode} using {DEFAULT_LIB_PATH}.") if __name__ == "__main__": diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py new file mode 100644 index 00000000..782e5c5c --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py @@ -0,0 +1,102 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + + +def meta_data(): + dtype = pto.float32 + ptr_ty = pto.PtrType(dtype) + i32 = pto.int32 + tensor_ty = pto.TensorType(rank=2, dtype=dtype) + tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype) + vec_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC") + recv_ty = pto.TileBufType( + shape=[16, 16], + dtype=dtype, + memory_space="MAT", + config=pto.TileBufConfig( + blayout="RowMajor", + slayout="NoneBox", + s_fractal_size=512, + ), + ) + return locals() + + +@to_ir_module(meta_data=meta_data, module=True) +def module(): + @pto.func(kernel="cube") + def cube_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = const(0, type=i32) + v2c_local = pto.reserve_buffer(name="v2c_fifo", size=4096, location="MAT") + + pto.aic_initialize_pipe( + dir_mask=2, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c0_i32, + v2c_consumer_buf=v2c_local, + ) + + gm_y_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( + tensor_ty, + ptr=gm_y, + shape=[c16, c16], + strides=[c16, c1], + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + + pto.store(pto.tpop_from_aiv(recv_ty, 0), gm_y_tile_view) + pto.tfree_from_aiv(0) + + @pto.func(kernel="vector") + def vector_kernel(gm_slot_buffer: "ptr_ty", gm_x: 
"ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c0_i32 = const(0, type=i32) + v2c_import = pto.import_reserved_buffer( + name="v2c_fifo", + peer_func="@cube_kernel", + ) + + pto.aiv_initialize_pipe( + dir_mask=2, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c0_i32, + v2c_consumer_buf=v2c_import, + ) + + gm_x_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( + tensor_ty, + ptr=gm_x, + shape=[c16, c16], + strides=[c16, c1], + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + + send_tile = pto.alloc_tile(vec_ty) + pto.load(gm_x_tile_view, send_tile) + pto.tpush_to_aic(send_tile, 0) + + @pto.func(entry=True) + def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.call(cube_kernel, gm_slot_buffer, gm_y) + pto.call(vector_kernel, gm_slot_buffer, gm_x) + + +if __name__ == "__main__": + print(module) From 1d831241b93ab17d3f825d90662b477c71854358 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 08:18:49 +0000 Subject: [PATCH 28/38] feat: add ffts address (needed for bidir comm) --- .../aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py | 4 +++- .../aot/tpushpop/mix-kernel_mlir/c2v_builder.py | 4 +++- examples/aot/tpushpop/mix-kernel_mlir/caller.cpp | 13 ++++++++++++- examples/aot/tpushpop/mix-kernel_mlir/run.py | 7 ++++++- .../aot/tpushpop/mix-kernel_mlir/v2c_builder.py | 4 +++- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py index dde0f705..84fe47e8 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py @@ -5,6 +5,7 @@ def meta_data(): + ffts_ty = pto.ffts_type dtype = pto.float32 ptr_ty = pto.PtrType(dtype) i32 = pto.int32 @@ -104,7 +105,8 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: 
pto.tfree_from_aic(0) @pto.func(entry=True) - def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_x) pto.call(vector_kernel, gm_slot_buffer, gm_y) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py index 0c0cbdf3..b46aa886 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py @@ -5,6 +5,7 @@ def meta_data(): + ffts_ty = pto.ffts_type dtype = pto.float32 ptr_ty = pto.PtrType(dtype) i32 = pto.int32 @@ -99,7 +100,8 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: pto.tfree_from_aic(0) @pto.func(entry=True) - def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_x) pto.call(vector_kernel, gm_slot_buffer, gm_y) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp index e558e69d..b8a9e8b2 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp +++ b/examples/aot/tpushpop/mix-kernel_mlir/caller.cpp @@ -4,6 +4,8 @@ #include +extern "C" int rtGetC2cCtrlAddr(uint64_t *ctrlAddr, uint32_t *ctrlLen); + #include KERNEL_CPP extern "C" void call_kernel( @@ -13,5 +15,14 @@ extern "C" void call_kernel( uint8_t *x, uint8_t *y) { - call_both<<>>((__gm__ float *)gmSlotBuffer, (__gm__ float *)x, (__gm__ float *)y); + void *fftsAddr = nullptr; + uint32_t fftsLen = 0; + (void)rtGetC2cCtrlAddr(reinterpret_cast(&fftsAddr), &fftsLen); + (void)fftsLen; + + call_both<<>>( + (__gm__ int64_t *)fftsAddr, + (__gm__ float *)gmSlotBuffer, + (__gm__ float *)x, + (__gm__ float *)y); 
} diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py index a59b527c..79735812 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/run.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py @@ -12,6 +12,7 @@ DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so") DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") DEFAULT_FIFO_BYTES = 4 * 1024 +DEFAULT_FIFO_BYTES_BOTH = 8 * 1024 M = 16 N = 16 ATOL = 1e-4 @@ -57,6 +58,10 @@ def make_io_tensors(*, device: str) -> tuple[torch.Tensor, torch.Tensor]: return x, y +def fifo_bytes_for_mode(mode: str) -> int: + return DEFAULT_FIFO_BYTES_BOTH if mode in ("v2c", "bidi") else DEFAULT_FIFO_BYTES + + def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( @@ -93,7 +98,7 @@ def main() -> None: lib = load_lib(DEFAULT_LIB_PATH) gm_slot_buffer = make_gm_slot_buffer( - fifo_bytes=DEFAULT_FIFO_BYTES, + fifo_bytes=fifo_bytes_for_mode(args.mode), device=device, ) torch.set_printoptions(precision=1, threshold=2000, linewidth=250, sci_mode=False) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py index 782e5c5c..9e385002 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py @@ -5,6 +5,7 @@ def meta_data(): + ffts_ty = pto.ffts_type dtype = pto.float32 ptr_ty = pto.PtrType(dtype) i32 = pto.int32 @@ -93,7 +94,8 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None: pto.tpush_to_aic(send_tile, 0) @pto.func(entry=True) - def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_y) pto.call(vector_kernel, 
gm_slot_buffer, gm_x) From e32afba095bce046eff566cee50cac70752fde6c Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 08:39:24 +0000 Subject: [PATCH 29/38] feat: unmangle kernel name --- examples/aot/tpushpop/mix-kernel_mlir/compile.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index 3761f044..b3981545 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -22,7 +22,8 @@ rm -f "${GENERATED_CPP}" "${LIB_PATH}" python "${BUILDER_PATH}" > "${MLIR_GEN_PATH}" ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}" - +# add extern "C" to function so kernel name is not mangled +perl -0pi -e 's/\b__global__ AICORE void call_both\(/extern "C" __global__ AICORE void call_both(/' "${GENERATED_CPP}" bisheng \ -I/sources/pto-isa/include/ \ From 80295996bcd755a4480bf99cb96c7c3d44314929 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 08:40:29 +0000 Subject: [PATCH 30/38] feat: add ffts functionality to api --- ptodsl/api/pto.py | 3 +++ ptodsl/api/pto_general.py | 5 +++++ ptodsl/api/type_def.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/ptodsl/api/pto.py b/ptodsl/api/pto.py index f1f3012f..caf5cda8 100644 --- a/ptodsl/api/pto.py +++ b/ptodsl/api/pto.py @@ -7,6 +7,7 @@ aiv_initialize_pipe, as_tensor, call, + set_ffts, cube_section, get_block_idx, get_block_num, @@ -45,6 +46,7 @@ "float32", "int16", "int32", + "ffts_type", "PtrType", "TensorType", "SubTensorType", @@ -56,6 +58,7 @@ "get_subblock_num", "get_block_num", "call", + "set_ffts", "as_tensor", "slice_view", "vector_section", diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 00637871..63606d57 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -52,6 
+52,10 @@ def call(callee, *args): ) +def set_ffts(ffts): + return _pto.SetFFTsOp(_unwrap(ffts)) + + def as_tensor(tensor_type, *, ptr, shape, strides, layout=None): shape_vals = [_unwrap(v) for v in shape] stride_vals = [_unwrap(v) for v in strides] @@ -234,6 +238,7 @@ def print(format, scalar): "get_subblock_num", "get_block_num", "call", + "set_ffts", "as_tensor", "slice_view", "vector_section", diff --git a/ptodsl/api/type_def.py b/ptodsl/api/type_def.py index 4f66eebb..8e7909b8 100644 --- a/ptodsl/api/type_def.py +++ b/ptodsl/api/type_def.py @@ -1,4 +1,5 @@ from mlir.dialects import pto as _pto +from mlir.ir import IntegerType, MemRefType from . import scalar @@ -8,6 +9,8 @@ def __getattr__(name): # and resolve them only when user code accesses them inside PTO/MLIR setup. if name in {"bool", "float16", "float32", "int16", "int32"}: return getattr(scalar, name) + if name == "ffts_type": + return MemRefType.get([256], IntegerType.get_signless(64)) raise AttributeError(f"module '{__name__}' has no attribute '{name}'") @@ -108,4 +111,5 @@ def TileBufType(*, shape, dtype, memory_space, valid_shape=None, config=None): "float32", "int16", "int32", + "ffts_type", ] From 10bfb1b2fab45c0cd32ffe3ed45462f63d2cf8e4 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 08:40:59 +0000 Subject: [PATCH 31/38] feat: add bidir example --- .../tpushpop/mix-kernel_mlir/bidi_builder.py | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py new file mode 100644 index 00000000..511db7e2 --- /dev/null +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py @@ -0,0 +1,128 @@ +from ptodsl import pto, tile, to_ir_module +from ptodsl import scalar as s + +const = s.const + + +def meta_data(): + ffts_ty = pto.ffts_type + dtype = pto.float32 
+ ptr_ty = pto.PtrType(dtype) + i32 = pto.int32 + tensor_ty = pto.TensorType(rank=2, dtype=dtype) + tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype) + x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT") + x_left_ty = pto.TileBufType( + shape=[16, 16], + dtype=dtype, + memory_space="LEFT", + config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"), + ) + x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT") + acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC") + vec_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC") + # Direct GM writeback from cube needs a row-major NoneBox tile. + cube_recv_ty = pto.TileBufType( + shape=[16, 16], + dtype=dtype, + memory_space="MAT", + config=pto.TileBufConfig( + blayout="RowMajor", + slayout="NoneBox", + s_fractal_size=512, + ), + ) + return locals() + + +@to_ir_module(meta_data=meta_data, module=True) +def module(): + @pto.func(kernel="cube") + def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + c0 = const(0) + c1 = const(1) + c16 = const(16) + c2v_import = pto.import_reserved_buffer( + name="c2v_fifo", + peer_func="@vector_kernel", + ) + v2c_local = pto.reserve_buffer(name="v2c_fifo", size=4096, location="MAT") + + # One DIR_BOTH pipe handles both legs of the round trip. 
+ pto.aic_initialize_pipe( + dir_mask=3, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_import, + v2c_consumer_buf=v2c_local, + ) + + x_mat_tile = pto.alloc_tile(x_mat_ty) + x_left_tile = pto.alloc_tile(x_left_ty) + x_right_tile = pto.alloc_tile(x_right_ty) + acc_tile = pto.alloc_tile(acc_ty) + + gm_x_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( + tensor_ty, + ptr=gm_x, + shape=[c16, c16], + strides=[c16, c1], + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + gm_y_tile_view = pto.slice_view( + tile_view_ty, + source=pto.as_tensor( + tensor_ty, + ptr=gm_y, + shape=[c16, c16], + strides=[c16, c1], + ), + offsets=[c0, c0], + sizes=[c16, c16], + ) + + pto.load(gm_x_tile_view, x_mat_tile) + tile.mov(x_mat_tile, x_left_tile) + tile.mov(x_mat_tile, x_right_tile) + tile.matmul(x_left_tile, x_right_tile, acc_tile) + pto.tpush_to_aiv(acc_tile, 0) + returned_tile = pto.tpop_from_aiv(cube_recv_ty, 0) + pto.store(returned_tile, gm_y_tile_view) + pto.tfree_from_aiv(0) + + @pto.func(kernel="vector") + def vector_kernel(gm_slot_buffer: "ptr_ty") -> None: + c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC") + v2c_import = pto.import_reserved_buffer( + name="v2c_fifo", + peer_func="@cube_kernel", + ) + + # Vector pops cube's tile, doubles it, then pushes the result back. 
+ pto.aiv_initialize_pipe( + dir_mask=3, + slot_size=1024, + gm_slot_buffer=gm_slot_buffer, + c2v_consumer_buf=c2v_local, + v2c_consumer_buf=v2c_import, + ) + + doubled_tile = pto.alloc_tile(vec_ty) + recv_tile = pto.tpop_from_aic(vec_ty, 0) + tile.add(recv_tile, recv_tile, doubled_tile) + pto.tpush_to_aic(doubled_tile, 0) + pto.tfree_from_aic(0) + + @pto.func(entry=True) + def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + pto.set_ffts(ffts_addr) + pto.call(cube_kernel, gm_slot_buffer, gm_x, gm_y) + pto.call(vector_kernel, gm_slot_buffer) + + +if __name__ == "__main__": + print(module) From 56527f4f9621ac604403a3d8f846827f17c10036 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 09:09:52 +0000 Subject: [PATCH 32/38] chore: docker ptoas ver and pto-isa --- docker/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 331ca0d1..1c149ef1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,7 +17,7 @@ RUN pip install --no-cache-dir \ # certain operations need latest isa header, not CANN 8.5.0 default # header on 2026/03/24 -ARG PTOISA_COMMIT=febd8a15a9dc03f87b6aa293c3ab66a67b6e80af +ARG PTOISA_COMMIT=a8c3fbf42a2f4a0f609f64e138dda62deefddb8e WORKDIR /sources RUN git clone https://gitcode.com/cann/pto-isa.git \ && cd pto-isa && git checkout $PTOISA_COMMIT @@ -29,10 +29,10 @@ ARG CACHE_BURST=1 # ARG ARCH=x86_64 ARG ARCH=aarch64 -ARG RELEASE_REPO=huawei-csl/PTOAS -ARG RELEASE_VER=20260327 -ARG RELEASE_TAG=${RELEASE_VER} -ARG WHEEL_NAME=ptoas-0.18-cp311-none-manylinux_2_34_${ARCH}.whl +ARG RELEASE_REPO=zhangstevenunity/PTOAS +ARG RELEASE_VER=0.24 +ARG RELEASE_TAG=v${RELEASE_VER} +ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz WORKDIR /installers/ From d40da05daa23518f26951909a67799e74cad95e9 Mon Sep 17 00:00:00 
2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 09:19:42 +0000 Subject: [PATCH 33/38] chore: black --- .../aot/tpushpop/mix-kernel_mlir/bidi_builder.py | 4 +++- .../aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py | 4 +++- examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py | 4 +++- examples/aot/tpushpop/mix-kernel_mlir/run.py | 12 ++++++++---- examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py | 4 +++- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py index 511db7e2..8ea9ad03 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py @@ -118,7 +118,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty") -> None: pto.tfree_from_aic(0) @pto.func(entry=True) - def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both( + ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty" + ) -> None: pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_x, gm_y) pto.call(vector_kernel, gm_slot_buffer) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py index 84fe47e8..d0aef4ae 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py @@ -105,7 +105,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: pto.tfree_from_aic(0) @pto.func(entry=True) - def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both( + ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty" + ) -> None: pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_x) pto.call(vector_kernel, gm_slot_buffer, gm_y) diff --git 
a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py index b46aa886..51312f36 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py @@ -100,7 +100,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None: pto.tfree_from_aic(0) @pto.func(entry=True) - def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both( + ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty" + ) -> None: pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_x) pto.call(vector_kernel, gm_slot_buffer, gm_y) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py index 79735812..de46b663 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/run.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py @@ -53,7 +53,7 @@ def make_gm_slot_buffer(*, fifo_bytes: int, device: str) -> torch.Tensor: def make_io_tensors(*, device: str) -> tuple[torch.Tensor, torch.Tensor]: - x = torch.rand((M, N), dtype=torch.float32, device=device) -0.5 + x = torch.rand((M, N), dtype=torch.float32, device=device) - 0.5 y = torch.zeros((M, N), dtype=torch.float32, device=device) return x, y @@ -62,7 +62,9 @@ def fifo_bytes_for_mode(mode: str) -> int: return DEFAULT_FIFO_BYTES_BOTH if mode in ("v2c", "bidi") else DEFAULT_FIFO_BYTES -def run_kernel(lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> None: +def run_kernel( + lib: ctypes.CDLL, *, gm_slot_buffer: torch.Tensor, x: torch.Tensor, y: torch.Tensor +) -> None: stream_ptr = torch.npu.current_stream()._as_parameter_ lib.call_kernel( 1, @@ -111,13 +113,15 @@ def main() -> None: y_ref = reference(args.mode, x) y_cpu = y.cpu() - print(y_ref-y_cpu) + print(y_ref - y_cpu) max_abs = float(torch.max(torch.abs(y_cpu - y_ref)).item()) ok = 
bool(torch.allclose(y_cpu, y_ref, atol=ATOL, rtol=RTOL)) print(f"shape=({M}, {N}) max_abs={max_abs:.6f}") if not ok: - raise SystemExit(f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}") + raise SystemExit( + f"Validation failed with atol={ATOL} rtol={RTOL}. max_abs={max_abs:.6f}" + ) print(f"Validation passed for mode={args.mode} using {DEFAULT_LIB_PATH}.") diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py index 9e385002..96ba943e 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py @@ -94,7 +94,9 @@ def vector_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None: pto.tpush_to_aic(send_tile, 0) @pto.func(entry=True) - def call_both(ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None: + def call_both( + ffts_addr: "ffts_ty", gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty" + ) -> None: pto.set_ffts(ffts_addr) pto.call(cube_kernel, gm_slot_buffer, gm_y) pto.call(vector_kernel, gm_slot_buffer, gm_x) From 7c2a4a03d0628bb5325e7d4fdfe3cbd40623fed6 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 09:24:04 +0000 Subject: [PATCH 34/38] chore: black --- ptodsl/api/pto_general.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ptodsl/api/pto_general.py b/ptodsl/api/pto_general.py index 63606d57..9187d25d 100644 --- a/ptodsl/api/pto_general.py +++ b/ptodsl/api/pto_general.py @@ -111,16 +111,18 @@ def alloc_tile(tile_type, *, addr=None, valid_row=None, valid_col=None): # } -> i32 def reserve_buffer(*, name, size, location, auto_alloc=True, base=None): """ - - At most one `pto.reserve_buffer` is expected in one function - - `location` must be a supported local address space - - Op-level verification requires: - - `auto = false` must provide `base` - - `auto = true` must 
not provide `base` + - At most one `pto.reserve_buffer` is expected in one function + - `location` must be a supported local address space + - Op-level verification requires: + - `auto = false` must provide `base` + - `auto = true` must not provide `base` """ # All params are compile time attributes # wrap reserve_buffer(name, size, location, auto_alloc, *, base=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Value - return _pto.ReserveBufferOp(name, size, _resolve_address_space_attr(location), auto_alloc, base=base).result + return _pto.ReserveBufferOp( + name, size, _resolve_address_space_attr(location), auto_alloc, base=base + ).result # %c2v_import = pto.import_reserved_buffer { @@ -136,12 +138,11 @@ def aic_initialize_pipe( *, dir_mask, slot_size, - gm_slot_buffer=None, # only needed on a2/a3? + gm_slot_buffer=None, # only needed on a2/a3? c2v_consumer_buf, v2c_consumer_buf, ): - # wrap - # aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + # wrap aic_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation return _pto.AicInitializePipeOp( dir_mask, slot_size, @@ -160,12 +161,11 @@ def aiv_initialize_pipe( *, dir_mask, slot_size, - gm_slot_buffer=None, # only needed on a2/a3 + gm_slot_buffer=None, # only needed on a2/a3 c2v_consumer_buf, v2c_consumer_buf, ): - # wrap - # aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation + # wrap aiv_initialize_pipe(dir_mask, slot_size, c2v_consumer_buf, v2c_consumer_buf, *, gm_slot_buffer=None, loc=None, ip=None) -> mlir._mlir_libs._mlir.ir.Operation return _pto.AivInitializePipeOp( dir_mask, slot_size, From 2779e56e32bb5b0a99170bdde1495093e6cd4dce Mon Sep 17 00:00:00 2001 From: fiskrt 
<43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 09:31:29 +0000 Subject: [PATCH 35/38] feat: gitignore --- .gitignore | 2 ++ examples/aot/tpushpop/.gitignore | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 09d8265b..1ccc35d0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ __pycache__ extra-info *.ptodsl_jit + +msprof_res/ diff --git a/examples/aot/tpushpop/.gitignore b/examples/aot/tpushpop/.gitignore index ab5698d1..b0d498be 100644 --- a/examples/aot/tpushpop/.gitignore +++ b/examples/aot/tpushpop/.gitignore @@ -1 +1 @@ -msprof_res/ \ No newline at end of file +build_artifacts/ From 49eae78555971a0173fee77bb66ca02e81d92a95 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:19:28 +0000 Subject: [PATCH 36/38] feat: move files and cleanup --- .../aot/tpushpop/mix-kernel_mlir/README.md | 84 ++++++++++++++----- .../aot/tpushpop/mix-kernel_mlir/compile.sh | 6 +- .../{ => kernels}/bidi_builder.py | 0 .../{ => kernels}/c2v_add_builder.py | 0 .../{ => kernels}/c2v_builder.py | 0 .../{ => kernels}/v2c_builder.py | 0 examples/aot/tpushpop/mix-kernel_mlir/run.py | 2 +- 7 files changed, 67 insertions(+), 25 deletions(-) rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/bidi_builder.py (100%) rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/c2v_add_builder.py (100%) rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/c2v_builder.py (100%) rename examples/aot/tpushpop/mix-kernel_mlir/{ => kernels}/v2c_builder.py (100%) diff --git a/examples/aot/tpushpop/mix-kernel_mlir/README.md b/examples/aot/tpushpop/mix-kernel_mlir/README.md index cb86533a..886a8c2c 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/README.md +++ b/examples/aot/tpushpop/mix-kernel_mlir/README.md @@ -1,38 +1,80 @@ -# Cross core communication with `pto.push_to_aiv` example +# TPush / TPop mixed-kernel examples -## Run +Small examples of tile FIFO 
communication between Cube (`AIC`) and Vector (`AIV`). ```bash python run.py c2v +python run.py v2c python run.py bidi ``` -`c2v` is the default, so `python run.py` is the same as `python run.py c2v`. +`python run.py` defaults to `c2v`. -## How C2V Communication Works +Files: -This example sends one `16x16 f32` tile from the Cube kernel to the Vector kernel. +- `kernels/` has the Python builders. +- `build_artifacts/` gets generated MLIR, generated C++, and the `.so`. +- `gm_slot_buffer` is the GM backing store for the pipe. +- `caller.cpp` sets the FFTS base before launching the generated kernel. -- The host allocates one shared `gm_slot_buffer` and passes it to both kernels. -- The Vector kernel owns the C2V consumer buffer with `pto.reserve_buffer(name = "c2v_fifo")`. -- The Cube kernel refers to that same buffer with `pto.import_reserved_buffer(name = "c2v_fifo")`. -- Both sides call `*_initialize_pipe` with `dir_mask = 1`, which means `C2V`. -- Cube sends with `pto.tpush_to_aiv(...)`. -- Vector receives with `pto.tpop_from_aic(...)` and releases the consumed slot with `pto.tfree_from_aic`. +Core idea: -In the generated C++, this becomes the same `TPipe<..., Direction::DIR_C2V, ...>` on both sides: +- `aic_initialize_pipe` / `aiv_initialize_pipe` lower to matching `TPipe<...>` objects. +- `gm_slot_buffer` is the shared GM slot memory used by that `TPipe`. +- `tpush_to_aiv` / `tpush_to_aic` lower to `TPUSH(pipe, tile)`. +- `tpop_from_aic` / `tpop_from_aiv` lower to `TPOP(pipe, tile)`. +- `tfree_from_aic` / `tfree_from_aiv` lower to `TFREE(pipe)` and release the consumed slot. -- Cube: `TPUSH(pipe, acc_tile)` -- Vector: `TPOP(pipe, vec_tile)` then `TFREE(pipe)` +## C2V -The important mental model is: `TPUSH`/`TPOP` are the real cross-core handoff, while `gm_slot_buffer` is the shared backing storage that makes the FIFO work. +Cube sends. Vector receives. 
-## How Bidirectional Works +This example computes `X @ X` on Cube, sends the accumulator tile to Vector, then Vector stores it to GM. -`bidi` starts the same way as `c2v`, but adds a return path: +```text +Cube: load X -> matmul -> tpush_to_aiv +Vector: tpop_from_aic -> store Y -> tfree_from_aic +``` + +Pipe wiring: + +- Vector owns the consumer buffer: `reserve_buffer("c2v_fifo", location="VEC")` +- Cube imports it: `import_reserved_buffer("c2v_fifo", peer_func="@vector_kernel")` +- Both sides initialize with `dir_mask = 1` + +## V2C + +Vector sends. Cube receives. + +This example loads `X` on Vector, sends that tile to Cube, then Cube stores it to GM. + +```text +Vector: load X -> tpush_to_aic +Cube: tpop_from_aiv -> store Y -> tfree_from_aiv +``` + +Pipe wiring: + +- Cube owns the consumer buffer: `reserve_buffer("v2c_fifo", location="MAT")` +- Vector imports it: `import_reserved_buffer("v2c_fifo", peer_func="@cube_kernel")` +- Both sides initialize with `dir_mask = 2` + +## BIDI + +Both directions are enabled. + +This example sends `X @ X` from Cube to Vector. Vector doubles it and sends it back. Cube receives the returned tile and stores it to GM. + +```text +Cube: matmul -> tpush_to_aiv +Vector: tpop_from_aic -> add -> tpush_to_aic -> tfree_from_aic +Cube: tpop_from_aiv -> store Y -> tfree_from_aiv +``` + +Pipe wiring: -- Cube computes `x @ x` and sends it to vector over C2V. -- Vector pops that tile, computes `tile + tile`, and pushes the doubled result back over V2C. -- Cube pops the returned tile and writes it to GM. +- Vector reserves `c2v_fifo`; Cube imports it +- Cube reserves `v2c_fifo`; Vector imports it +- Both sides initialize with `dir_mask = 3` -The important difference is that both sides initialize with `dir_mask = 3`, so the same mixed-kernel launch can use both directions of the pipe. +For `dir_mask = 3`, allocate FIFO backing for both directions. `run.py` uses `8 KiB`. 
diff --git a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh index b3981545..6b7df346 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/compile.sh +++ b/examples/aot/tpushpop/mix-kernel_mlir/compile.sh @@ -4,10 +4,10 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts" MODE="${TPUSHPOP_MODE:-c2v}" -BUILDER_PATH="${SCRIPT_DIR}/${MODE}_builder.py" -MLIR_GEN_PATH="${SCRIPT_DIR}/${MODE}_gen.mlir" +BUILDER_PATH="${SCRIPT_DIR}/kernels/${MODE}_builder.py" +MLIR_GEN_PATH="${ARTIFACT_DIR}/${MODE}_gen.mlir" GENERATED_CPP="${ARTIFACT_DIR}/${MODE}.cpp" -LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so" +LIB_PATH="${ARTIFACT_DIR}/tpushpop_mlir_lib.so" case "${MODE}" in c2v|c2v_add|v2c|bidi) ;; diff --git a/examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/bidi_builder.py similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/bidi_builder.py rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/bidi_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_add_builder.py similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/c2v_add_builder.py rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_add_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_builder.py similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/c2v_builder.py rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/c2v_builder.py diff --git a/examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py b/examples/aot/tpushpop/mix-kernel_mlir/kernels/v2c_builder.py similarity index 100% rename from examples/aot/tpushpop/mix-kernel_mlir/v2c_builder.py rename to examples/aot/tpushpop/mix-kernel_mlir/kernels/v2c_builder.py diff --git 
a/examples/aot/tpushpop/mix-kernel_mlir/run.py b/examples/aot/tpushpop/mix-kernel_mlir/run.py index de46b663..f749e378 100644 --- a/examples/aot/tpushpop/mix-kernel_mlir/run.py +++ b/examples/aot/tpushpop/mix-kernel_mlir/run.py @@ -9,7 +9,7 @@ from ptodsl.test_util import get_test_device THIS_DIR = os.path.dirname(os.path.abspath(__file__)) -DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "tpushpop_mlir_lib.so") +DEFAULT_LIB_PATH = os.path.join(THIS_DIR, "build_artifacts", "tpushpop_mlir_lib.so") DEFAULT_COMPILE_SCRIPT = os.path.join(THIS_DIR, "compile.sh") DEFAULT_FIFO_BYTES = 4 * 1024 DEFAULT_FIFO_BYTES_BOTH = 8 * 1024 From 8ca2c8f3b3baad3d4352f0d622a10c7bdadc8346 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:51:12 +0000 Subject: [PATCH 37/38] test: add ptoas test --- tests/frontend/test_multifunc_ir.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/frontend/test_multifunc_ir.py b/tests/frontend/test_multifunc_ir.py index 1e0a1668..0b4343e4 100644 --- a/tests/frontend/test_multifunc_ir.py +++ b/tests/frontend/test_multifunc_ir.py @@ -1,3 +1,5 @@ +import subprocess + from mlir.dialects import func, pto as _pto from mlir.ir import ( Attribute, @@ -92,3 +94,20 @@ def test_old_single_function_builder_matches_raw_mlir(): def test_new_multi_function_builder_matches_raw_mlir(): assert str(multi_kernel_module) == str(build_multi_verbose()) + + +def test_multi_function_module_compiles_with_ptoas(tmp_path): + pto_path = tmp_path / "multi_kernel_module.pto" + cpp_path = tmp_path / "multi_kernel_module.cpp" + pto_path.write_text(str(multi_kernel_module), encoding="utf-8") + + subprocess.run( + [ + "ptoas", + "--enable-insert-sync", + str(pto_path), + "-o", + str(cpp_path), + ], + check=True, + ) From d04895870937124108f5aa2ff9928d21e584a9e2 Mon Sep 17 00:00:00 2001 From: fiskrt <43207511+fiskrt@users.noreply.github.com> Date: Mon, 13 Apr 2026 09:38:52 +0000 Subject: [PATCH 38/38] chore: 
update pto-isa version in ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cf8f26cb..068abedb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,7 +50,7 @@ jobs: RELEASE_VER: 0.24 RELEASE_TAG: v0.24 CLI_DIR: /installers/ptoas-cli - PTOISA_COMMIT: 2ee948ef636863ed149f176d5327d9db5f349bb6 + PTOISA_COMMIT: a8c3fbf42a2f4a0f609f64e138dda62deefddb8e steps: - name: Install system packages