Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b269e2b
feat: init
fiskrt Mar 24, 2026
e821d3d
feat: up isa version, need etc.. for mix-kernels
fiskrt Mar 24, 2026
a83b06d
feat: add simpler cpp case
fiskrt Mar 26, 2026
0ad3cee
feat: rename
fiskrt Mar 26, 2026
4e03435
feat: add mlir example[ WIP ]
fiskrt Mar 26, 2026
da15c6d
feat: add gitignore
fiskrt Mar 26, 2026
041625f
feat: simple bidirectional transfer working in mlir
fiskrt Mar 27, 2026
5137291
feat: now does simple add
fiskrt Mar 27, 2026
6210864
feat: clean working version simple
fiskrt Mar 27, 2026
f9d8812
feat: clean working version simple
fiskrt Mar 27, 2026
d2bf0ca
wip: add transfer ops to dsl
fiskrt Mar 27, 2026
6055f69
feat: docker add compiled cpp and bindings
fiskrt Mar 27, 2026
8ea9459
feat: use classes instead
fiskrt Mar 27, 2026
3c7dbd9
WIP: add builder with multiple funcs
fiskrt Apr 7, 2026
834ca8f
feat: add type arg to const() api
fiskrt Apr 7, 2026
422f5f2
WIP: in decorated function we allow multiple functions
fiskrt Apr 7, 2026
2362f7e
WIP: simplify ir.py
fiskrt Apr 7, 2026
c7b31f4
use new ptodsl api for builder
fiskrt Apr 7, 2026
0597c0a
feat: remove files
fiskrt Apr 7, 2026
57e30c0
test: add old and new
fiskrt Apr 7, 2026
7182e8a
feat: remove docs
fiskrt Apr 7, 2026
48b1eb3
fix: arith import in builder
fiskrt Apr 7, 2026
0c865a1
test: compare to MLIR pybindings
fiskrt Apr 7, 2026
ea019ed
fix: names
fiskrt Apr 8, 2026
f928163
fix: naming
fiskrt Apr 8, 2026
8ae8635
feat: deuglify the wrappers
fiskrt Apr 8, 2026
62db364
feat: add more examples v2c, c2v,
fiskrt Apr 8, 2026
1d83124
feat: add ffts address (needed for bidir comm)
fiskrt Apr 9, 2026
e32afba
feat: unmangle kernel name
fiskrt Apr 9, 2026
8029599
feat: add ffts functionality to api
fiskrt Apr 9, 2026
10bfb1b
feat: add bidir example
fiskrt Apr 9, 2026
56527f4
chore: docker ptoas ver and pto-isa
fiskrt Apr 9, 2026
9d6c1e0
Merge remote-tracking branch 'origin/main' into feat-mix-kernel-gb
fiskrt Apr 9, 2026
d40da05
chore: black
fiskrt Apr 9, 2026
7c2a4a0
chore: black
fiskrt Apr 9, 2026
2779e56
feat: gitignore
fiskrt Apr 9, 2026
49eae78
feat: move files and cleanup
fiskrt Apr 9, 2026
8ca2c8f
test: add ptoas test
fiskrt Apr 9, 2026
d048958
chore: update pto-isa version in ci
fiskrt Apr 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ RUN pip install --no-cache-dir \
ipython jupyterlab matplotlib pandas

# certain operations need latest isa header, not CANN 8.5.0 default
# header on 2026/03/16
ARG PTOISA_COMMIT=313817be696792a4e16a7ea5994ec98e34391613
# header on 2026/03/24
ARG PTOISA_COMMIT=febd8a15a9dc03f87b6aa293c3ab66a67b6e80af
WORKDIR /sources
RUN git clone https://gitcode.com/cann/pto-isa.git \
&& cd pto-isa && git checkout $PTOISA_COMMIT
Expand All @@ -29,10 +29,10 @@ ARG CACHE_BURST=1

# ARG ARCH=x86_64
ARG ARCH=aarch64
ARG RELEASE_REPO=zhangstevenunity/PTOAS
ARG RELEASE_VER=0.9
ARG RELEASE_TAG=v${RELEASE_VER}
ARG WHEEL_NAME=ptoas-${RELEASE_VER}-cp311-none-manylinux_2_34_${ARCH}.whl
ARG RELEASE_REPO=huawei-csl/PTOAS
ARG RELEASE_VER=20260327
ARG RELEASE_TAG=${RELEASE_VER}
ARG WHEEL_NAME=ptoas-0.18-cp311-none-manylinux_2_34_${ARCH}.whl
ARG CLI_TAR_NAME=ptoas-bin-${ARCH}.tar.gz

WORKDIR /installers/
Expand Down
1 change: 1 addition & 0 deletions examples/aot/tpushpop/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
msprof_res/
17 changes: 17 additions & 0 deletions examples/aot/tpushpop/mix-kernel_mlir/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Bidirectional `TPUSH`/`TPOP` MLIR Example

This example mirrors the `mix-kernel_cpp` flow, but starts from
[`bidirectional_example.mlir`](/home/fskogh/pto-dsl/examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir).

The pipeline is:

1. run `ptoas --pto-arch=a3 bidirectional_example.mlir > build_artifacts/bidirectional_example.cpp`
2. compile the generated C++ together with `caller.cpp`
3. build `./tpushpop_mlir_lib.so`
4. launch the generated `pto.entry` kernel from Python

## Run

```bash
python run_bidirectional_example.py
```
108 changes: 108 additions & 0 deletions examples/aot/tpushpop/mix-kernel_mlir/bidirectional_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from ptodsl import pto, tile, to_ir_module
from ptodsl import scalar as s

const = s.const


def meta_data():
dtype = pto.float32
ptr_ty = pto.PtrType(dtype)
i32 = pto.int32
tensor_ty = pto.TensorType(rank=2, dtype=dtype)
tile_view_ty = pto.SubTensorType(shape=[16, 16], dtype=dtype)
x_mat_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="MAT")
x_left_ty = pto.TileBufType(
shape=[16, 16],
dtype=dtype,
memory_space="LEFT",
config=pto.TileBufConfig(blayout="ColMajor", slayout="RowMajor"),
)
x_right_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="RIGHT")
acc_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="ACC")
recv_ty = pto.TileBufType(shape=[16, 16], dtype=dtype, memory_space="VEC")
return locals()


@to_ir_module(meta_data=meta_data, module=True)
def module():
@pto.func(kernel="cube")
def cube_kernel(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty") -> None:
c0 = const(0)
c1 = const(1)
c16 = const(16)
c0_i32 = const(0, type=i32)
c2v_import = pto.import_reserved_buffer(
name="c2v_fifo",
peer_func="@vector_kernel",
)

pto.aic_initialize_pipe(
dir_mask=1,
slot_size=1024,
gm_slot_buffer=gm_slot_buffer,
c2v_consumer_buf=c2v_import,
v2c_consumer_buf=c0_i32,
)

x_mat_tile = pto.alloc_tile(x_mat_ty)
x_left_tile = pto.alloc_tile(x_left_ty)
x_right_tile = pto.alloc_tile(x_right_ty)
acc_tile = pto.alloc_tile(acc_ty)

gm_x_tile_view = pto.slice_view(
tile_view_ty,
source=pto.as_tensor(
tensor_ty,
ptr=gm_x,
shape=[c16, c16],
strides=[c16, c1],
),
offsets=[c0, c0],
sizes=[c16, c16],
)

pto.load(gm_x_tile_view, x_mat_tile)
tile.mov(x_mat_tile, x_left_tile)
tile.mov(x_mat_tile, x_right_tile)
tile.matmul(x_left_tile, x_right_tile, acc_tile)
pto.tpush_to_aiv(acc_tile, 0)

@pto.func(kernel="vector")
def vector_kernel(gm_slot_buffer: "ptr_ty", gm_y: "ptr_ty") -> None:
c0 = const(0)
c1 = const(1)
c16 = const(16)
c0_i32 = const(0, type=i32)
c2v_local = pto.reserve_buffer(name="c2v_fifo", size=4096, location="VEC")

pto.aiv_initialize_pipe(
dir_mask=1,
slot_size=1024,
gm_slot_buffer=gm_slot_buffer,
c2v_consumer_buf=c2v_local,
v2c_consumer_buf=c0_i32,
)

gm_y_tile_view = pto.slice_view(
tile_view_ty,
source=pto.as_tensor(
tensor_ty,
ptr=gm_y,
shape=[c16, c16],
strides=[c16, c1],
),
offsets=[c0, c0],
sizes=[c16, c16],
)

pto.store(pto.tpop_from_aic(recv_ty, 0), gm_y_tile_view)
pto.tfree_from_aic(0)

@pto.func(entry=True)
def call_both(gm_slot_buffer: "ptr_ty", gm_x: "ptr_ty", gm_y: "ptr_ty") -> None:
pto.call(cube_kernel, gm_slot_buffer, gm_x)
pto.call(vector_kernel, gm_slot_buffer, gm_y)


if __name__ == "__main__":
print(module)
89 changes: 89 additions & 0 deletions examples/aot/tpushpop/mix-kernel_mlir/bidirectional_example.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Bidirectional pipe example.
//
// This reduced version only uses the C2V pipe:
// - `c2v_fifo`: cube/kernel `@cube_kernel` pushes to vector/kernel `@vector_kernel`
//
// `gm_slot_buffer` is the GM-backed slot storage for these pipes. The reserve/import
// ops connect each side of the same named FIFO, and `aic/aiv_initialize_pipe`
// binds those FIFO endpoints to the shared GM slot buffer plus each side's local
// consumer buffer.
//
// End-to-end data flow:
// - Cube loads one input matrix `X` from GM.
// - Cube computes `Y = X @ X`.
// - Cube sends that accumulator tile to vector over `c2v_fifo`.
// - Vector pops the tile and stores it to GM as output matrix `Y`.
//
// What is transferred:
// - Cube -> Vector: one full `16 x 16` `f32` accumulator tile `Y = X @ X`
// sent with `pto.tpush_to_aiv` using `split = 0` (no split). Vector receives
// that same logical `16 x 16` tile with `pto.tpop_from_aic` in a vector tile
// type/layout, then stores it to the GM output buffer.
//
// Shape summary:
// - All transferred tiles are `rows=16, cols=16, dtype=f32`
// - Cube-produced C2V tile: `loc=acc`, `blayout=col_major`, `slayout=row_major`
// - Vector-consumed tile after C2V pop: `loc=vec`, `blayout=row_major`, `slayout=none_box`
module {

func.func @call_both(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>) attributes {pto.entry} {
func.call @cube_kernel(%gm_slot_buffer, %gm_x) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
func.call @vector_kernel(%gm_slot_buffer, %gm_y) : (!pto.ptr<f32>, !pto.ptr<f32>) -> ()
return
}

func.func @cube_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_x: !pto.ptr<f32>) attributes {pto.kernel_kind = #pto.kernel_kind<cube>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c2v_import = pto.import_reserved_buffer {
name = "c2v_fifo",
peer_func = @vector_kernel
} -> i32
%c0_i32 = arith.constant 0 : i32
pto.aic_initialize_pipe {dir_mask = 1, slot_size = 1024}
(gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
c2v_consumer_buf = %c2v_import : i32,
v2c_consumer_buf = %c0_i32 : i32)

%x_mat_tile = pto.alloc_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%x_left_tile = pto.alloc_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>
%x_right_tile = pto.alloc_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>
%acc_tile = pto.alloc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
%gm_x_view = pto.make_tensor_view %gm_x, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
%gm_x_tile_view = pto.partition_view %gm_x_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
pto.tload ins(%gm_x_tile_view : !pto.partition_tensor_view<16x16xf32>) outs(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_left_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
pto.tmov ins(%x_mat_tile : !pto.tile_buf<loc=mat, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>) outs(%x_right_tile : !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>)
pto.tmatmul ins(%x_left_tile, %x_right_tile : !pto.tile_buf<loc=left, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=512, pad=0>, !pto.tile_buf<loc=right, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=col_major, fractal=512, pad=0>) outs(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
pto.tpush_to_aiv(%acc_tile : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>) {split = 0}
return
}

func.func @vector_kernel(%gm_slot_buffer: !pto.ptr<f32>, %gm_y: !pto.ptr<f32>)
attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c2v_local = pto.reserve_buffer {
name = "c2v_fifo",
size = 4096,
location = #pto.address_space<vec>,
auto = true
} -> i32
%c0_i32 = arith.constant 0 : i32
pto.aiv_initialize_pipe {dir_mask = 1, slot_size = 1024}
(gm_slot_buffer = %gm_slot_buffer : !pto.ptr<f32>,
c2v_consumer_buf = %c2v_local : i32,
v2c_consumer_buf = %c0_i32 : i32)

%gm_y_view = pto.make_tensor_view %gm_y, shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
%gm_y_tile_view = pto.partition_view %gm_y_view, offsets = [%c0, %c0], sizes = [%c16, %c16] : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
%recv_tile = pto.tpop_from_aic {split = 0}
-> !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tstore ins(%recv_tile : !pto.tile_buf<loc=vec, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%gm_y_tile_view : !pto.partition_tensor_view<16x16xf32>)
pto.tfree_from_aic {split = 0}
return
}

}
17 changes: 17 additions & 0 deletions examples/aot/tpushpop/mix-kernel_mlir/caller.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef KERNEL_CPP
#error "KERNEL_CPP must be defined at compile time."
#endif

#include <cstdint>

#include KERNEL_CPP

extern "C" void call_kernel(
uint32_t blockDim,
void *stream,
uint8_t *gmSlotBuffer,
uint8_t *x,
uint8_t *y)
{
call_both<<<blockDim, nullptr, stream>>>((__gm__ float *)gmSlotBuffer, (__gm__ float *)x, (__gm__ float *)y);
}
38 changes: 38 additions & 0 deletions examples/aot/tpushpop/mix-kernel_mlir/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ARTIFACT_DIR="${SCRIPT_DIR}/build_artifacts"
MLIR_PATH="${SCRIPT_DIR}/bidirectional_example.mlir"
GENERATED_CPP="${ARTIFACT_DIR}/bidirectional_example.cpp"
LIB_PATH="${SCRIPT_DIR}/tpushpop_mlir_lib.so"

mkdir -p "${ARTIFACT_DIR}"
rm -f "${GENERATED_CPP}" "${LIB_PATH}"

MLIR_GEN_PATH="${SCRIPT_DIR}/bidir_gen.mlir"
python bidirectional_builder.py > bidir_gen.mlir
ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_GEN_PATH}" > "${GENERATED_CPP}"

#ptoas --pto-arch=a3 --enable-insert-sync "${MLIR_PATH}" > "${GENERATED_CPP}"



bisheng \
-I/sources/pto-isa/include/ \
-fPIC -shared -D_FORTIFY_SOURCE=2 -O2 -std=c++17 -g \
-Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong \
-xcce -Xhost-start -Xhost-end \
-mllvm -cce-aicore-stack-size=0x8000 \
-mllvm -cce-aicore-function-stack-size=0x8000 \
-mllvm -cce-aicore-record-overflow=true \
-mllvm -cce-aicore-addr-transform \
-mllvm -cce-aicore-dcci-insert-for-scalar=false \
--npu-arch=dav-2201 -DMEMORY_BASE \
-std=gnu++17 \
-DKERNEL_CPP="\"${GENERATED_CPP}\"" \
"${SCRIPT_DIR}/caller.cpp" \
-o "${LIB_PATH}"

echo "Generated ${GENERATED_CPP}."
echo "Built ${LIB_PATH}."
Loading
Loading