Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -e
python ./geglu_builder.py > ./geglu.pto
ptoas --enable-insert-sync ./geglu.pto -o ./geglu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#include "pto/pto-inst.hpp"
using namespace pto;

enum class PTOAutoSyncTailMode : int {
kBarrierAll = 0,
kSetWaitMte3ToSEvent0 = 1,
};

static AICORE inline void ptoas_auto_sync_tail(
PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
switch (mode) {
case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
break;
case PTOAutoSyncTailMode::kBarrierAll:
default:
pipe_barrier(PIPE_ALL);
break;
}
}

__global__ AICORE void _kernel(__gm__ half* v1, __gm__ half* v2, __gm__ half* v3, int32_t v4, int32_t v5) {
unsigned v6 = 1;
unsigned v7 = 0;
const int32_t v8 = 16384;
const int32_t v9 = 1;
const int32_t v10 = 0;
const int64_t v11 = 0;
const int64_t v12 = 32768;
const int64_t v13 = 65536;
const int64_t v14 = 98304;
const int64_t v15 = 131072;
using T = float;

#if defined(__DAV_VEC__)
set_mask_norm();
set_vector_mask(-1, -1);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
if (v5 > v10) {
if (v5 <= v8) {
int64_t v16 = get_block_idx();
int64_t v17 = get_subblockid();
int64_t v18 = get_subblockdim();
int64_t v19 = (int64_t) v18;
int64_t v20 = get_block_num();
int32_t v21 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v20) * (uint64_t) v19);
int32_t v22 = v4 / v21;
int32_t v23 = v4 % v21 != v10 && v4 < v10 == v21 < v10 ? v22 + v9 : v22;
int32_t v24 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v16) * (uint64_t) v19) + (uint64_t) ((int64_t) v17))) * (uint32_t) v23);
int32_t v25 = (int32_t) ((uint32_t) v24 + (uint32_t) v23);
int32_t v26 = (int32_t) ((uint32_t) ((uint32_t) v25 < (uint32_t) v4 ? v25 : v4) - (uint32_t) v24);
int32_t v27 = (int32_t) ((uint32_t) v4 * (uint32_t) v5);
if (v26 > v10) {
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, 16384, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v28;
TASSIGN(v28, v11);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v29 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v5);
__ubuf__ half* v30 = v28.data();
uint64_t v31 = reinterpret_cast<uint64_t>(v30);
TASSIGN(v29, v31);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, 16384, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v32;
TASSIGN(v32, v12);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v33 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v5);
__ubuf__ half* v34 = v32.data();
uint64_t v35 = reinterpret_cast<uint64_t>(v34);
TASSIGN(v33, v35);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, 16384, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v36;
TASSIGN(v36, v13);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v37 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v5);
__ubuf__ half* v38 = v36.data();
uint64_t v39 = reinterpret_cast<uint64_t>(v38);
TASSIGN(v37, v39);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, 16384, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v40;
TASSIGN(v40, v14);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v41 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v5);
__ubuf__ half* v42 = v40.data();
uint64_t v43 = reinterpret_cast<uint64_t>(v42);
TASSIGN(v41, v43);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, 16384, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v44;
TASSIGN(v44, v15);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null> v45 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null, CompactMode::Null>(v5);
__ubuf__ half* v46 = v44.data();
uint64_t v47 = reinterpret_cast<uint64_t>(v46);
TASSIGN(v45, v47);
for (size_t v48 = (size_t) v10; v48 < ((size_t) v26); v48 += (size_t) v9) {
int32_t v49 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v24 + (uint32_t) ((int32_t) v48)) * (uint32_t) v5);
unsigned v50 = (unsigned) v5 * v6;
pto::Shape<1, 1, 1, 1, -1> v51 = pto::Shape<1, 1, 1, 1, -1>(v5);
pto::Stride<-1, -1, -1, -1, 1> v52 = pto::Stride<-1, -1, -1, -1, 1>(v50, v50, v50, v50);
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v53 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v7 + (unsigned) v49 * (unsigned) v9), v51, v52);
unsigned v54 = (unsigned) v5 * v6;
pto::Shape<1, 1, 1, 1, -1> v55 = pto::Shape<1, 1, 1, 1, -1>(v5);
pto::Stride<-1, -1, -1, -1, 1> v56 = pto::Stride<-1, -1, -1, -1, 1>(v54, v54, v54, v54);
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v57 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v7 + (unsigned) v49 * (unsigned) v9), v55, v56);
unsigned v58 = (unsigned) v5 * v6;
pto::Shape<1, 1, 1, 1, -1> v59 = pto::Shape<1, 1, 1, 1, -1>(v5);
pto::Stride<-1, -1, -1, -1, 1> v60 = pto::Stride<-1, -1, -1, -1, 1>(v58, v58, v58, v58);
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v61 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v7 + (unsigned) v49 * (unsigned) v9), v59, v60);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
TLOAD(v29, v53);
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
TLOAD(v33, v57);
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
TSUB(v45, v29, v29);
pipe_barrier(PIPE_V);
TEXP(v37, v45);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
TADD(v41, v29, v29);
pipe_barrier(PIPE_V);
TEXP(v41, v41);
pipe_barrier(PIPE_V);
TSUB(v45, v41, v37);
pipe_barrier(PIPE_V);
TADD(v41, v41, v37);
pipe_barrier(PIPE_V);
TDIV(v45, v45, v41);
pipe_barrier(PIPE_V);
TADD(v41, v37, v45);
pipe_barrier(PIPE_V);
TMUL(v41, v29, v41);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
TADD(v45, v37, v37);
pipe_barrier(PIPE_V);
TDIV(v41, v41, v45);
pipe_barrier(PIPE_V);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
TMUL(v41, v41, v33);
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
pipe_barrier(PIPE_MTE3);
TSTORE(v61, v41);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
};
};
};
}
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
#endif // __DAV_VEC__

ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
return;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
module {
func.func @_kernel(%arg0: !pto.ptr<f16>, %arg1: !pto.ptr<f16>, %arg2: !pto.ptr<f16>, %arg3: i32, %arg4: i32) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c16384 = arith.constant 16384 : index
%0 = arith.index_cast %arg3 : i32 to index
%1 = arith.index_cast %arg4 : i32 to index
pto.section.vector {
%2 = arith.cmpi sgt, %1, %c0 : index
scf.if %2 {
%3 = arith.cmpi sge, %c16384, %1 : index
scf.if %3 {
%4 = pto.get_block_idx
%5 = pto.get_subblock_idx
%6 = pto.get_subblock_num
%7 = pto.get_block_num
%8 = arith.muli %4, %6 : i64
%9 = arith.addi %8, %5 : i64
%10 = arith.index_cast %9 : i64 to index
%11 = arith.muli %7, %6 : i64
%12 = arith.index_cast %11 : i64 to index
%13 = arith.ceildivsi %0, %12 : index
%14 = arith.muli %10, %13 : index
%15 = arith.addi %14, %13 : index
%16 = arith.minui %15, %0 : index
%17 = arith.subi %16, %14 : index
%18 = arith.muli %0, %1 : index
%19 = pto.make_tensor_view %arg0, shape = [%18], strides = [%c1] : !pto.tensor_view<?xf16>
%20 = pto.make_tensor_view %arg1, shape = [%18], strides = [%c1] : !pto.tensor_view<?xf16>
%21 = pto.make_tensor_view %arg2, shape = [%18], strides = [%c1] : !pto.tensor_view<?xf16>
%22 = arith.cmpi sgt, %17, %c0 : index
scf.if %22 {
%23 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%24 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%25 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%26 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%27 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
scf.for %arg5 = %c0 to %17 step %c1 {
%28 = arith.addi %14, %arg5 : index
%29 = arith.muli %28, %1 : index
%30 = pto.partition_view %19, offsets = [%29], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
%31 = pto.partition_view %20, offsets = [%29], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
%32 = pto.partition_view %21, offsets = [%29], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
pto.tload ins(%30 : !pto.partition_tensor_view<1x16384xf16>) outs(%23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tload ins(%31 : !pto.partition_tensor_view<1x16384xf16>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tsub ins(%23, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.texp ins(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tadd ins(%23, %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.texp ins(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tsub ins(%26, %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tadd ins(%26, %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tdiv ins(%27, %26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tadd ins(%25, %27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tmul ins(%23, %26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tadd ins(%25, %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tdiv ins(%26, %27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tmul ins(%26, %24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tstore ins(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%32 : !pto.partition_tensor_view<1x16384xf16>)
}
}
}
}
}
return
}
}

Loading
Loading