From 005c0e366054210b63563a9e139538ffdf0eb497 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 12 Mar 2026 15:51:03 +0000 Subject: [PATCH 1/2] wip --- CMakeLists.txt | 14 ++++++++++++-- csrc/host/utils.h | 1 + csrc/kernel/kernel_abs.cpp | 2 -- csrc/kernel/kernel_batch_matrix_square.cpp | 2 -- csrc/kernel/kernel_simple_matmul.cpp | 3 --- csrc/kernel/kernel_tri_inv_col_sweep.cpp | 9 +++------ csrc/kernel/kernel_tri_inv_rec_unroll.cpp | 3 +-- csrc/kernel/kernel_tri_inv_trick.cpp | 2 -- csrc/kernel/kernel_utils.h | 1 - 9 files changed, 17 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f24285ff..77fb6212 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,8 +47,14 @@ set(CMAKE_COMPILER bisheng) set(CMAKE_C_COMPILER ${CMAKE_COMPILER}) set(CMAKE_CXX_COMPILER ${CMAKE_COMPILER}) -add_compile_options(-D_FORTIFY_SOURCE=2 -O2 -std=c++17 -Wno-macro-redefined - -Wno-ignored-attributes -fstack-protector-strong) +add_compile_options( + -D_FORTIFY_SOURCE=2 + -O2 + -DMEMORY_BASE + -std=c++17 + -Wno-macro-redefined + -Wno-ignored-attributes + -fstack-protector-strong) add_link_options(-s -Wl,-z,relro -Wl,-z,now) @@ -114,6 +120,10 @@ ascendc_library( csrc/kernel/kernel_tri_inv_rec_unroll.cpp csrc/kernel/kernel_tri_inv_trick.cpp) +# TODO(anastasios): Configure this depending on the NPU device. For A5, use +# REGISTER_BASE. +ascendc_compile_definitions(no_workspace_kernel PRIVATE MEMORY_BASE=1) + ascendc_include_directories( no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include ${libpto_isa_headers_SOURCE_DIR}/include/pto/common) diff --git a/csrc/host/utils.h b/csrc/host/utils.h index db004410..e458783b 100644 --- a/csrc/host/utils.h +++ b/csrc/host/utils.h @@ -90,6 +90,7 @@ inline at::Tensor CopyTensorHostToDevice(const at::Tensor& cpu_tensor) { * @param [in] scalar_data_type Data type of scalar * @return at::Tensor Tensor on NPU containing the `cpu_scalar`. */ +[[maybe_unused]] inline at::Tensor CopyScalarToDevice(const c10::Scalar& cpu_scalar, at::ScalarType scalar_data_type) { return CopyTensorHostToDevice( diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 496e0395..e14bcc51 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -9,8 +9,6 @@ for the full License text. #if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) -#define MEMORY_BASE - #include #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" diff --git a/csrc/kernel/kernel_batch_matrix_square.cpp b/csrc/kernel/kernel_batch_matrix_square.cpp index 34827034..0704f127 100644 --- a/csrc/kernel/kernel_batch_matrix_square.cpp +++ b/csrc/kernel/kernel_batch_matrix_square.cpp @@ -6,8 +6,6 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ - -#define MEMORY_BASE #include #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp index 465ca488..a3a5b69e 100644 --- a/csrc/kernel/kernel_simple_matmul.cpp +++ b/csrc/kernel/kernel_simple_matmul.cpp @@ -9,7 +9,6 @@ for the full License text. #if defined __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) // Placeholder for VEC compilation (the real kernel is CUBE-only). -#define MEMORY_BASE #include extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a, @@ -25,8 +24,6 @@ extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a, #elif (__CHECK_FEATURE_AT_PRECOMPILE) || \ (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) -#define MEMORY_BASE - #include #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index 9277bcc2..a60e32d2 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -7,10 +7,6 @@ See LICENSE in the root of the software repository: for the full License text. */ -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - -#define MEMORY_BASE - #include #include "kernel_utils.h" @@ -32,6 +28,8 @@ using namespace pto; template AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out, uint32_t total_length) { +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) + set_mask_norm(); set_vector_mask(-1, -1); @@ -143,6 +141,7 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out, } wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +#endif } extern "C" __global__ AICORE void triv_inv_col_sweep_fp16( @@ -170,5 +169,3 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32( runTTriInv((__gm__ float*)x, (__gm__ float*)z, in_length); } } - -#endif diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp index 238a3940..b9ff8ead 100644 --- a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp +++ b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp @@ -7,7 +7,7 @@ See LICENSE in the root of the software repository: for the full License text. */ -#define MEMORY_BASE +#include #include #include "kernel_utils.h" @@ -448,7 +448,6 @@ AICORE inline void TriInvRecUnrollKernel(__gm__ OutputT* M_inv, /* Initializations */ constexpr uint32_t TileLen = MatrixSize * MatrixSize; constexpr uint32_t FractalSize = 16; // fractal size for half - constexpr uint32_t NumFractalsRowWise = MatrixSize / FractalSize; constexpr uint32_t NumL0Buffers = 2; if (get_block_idx() * NumTilesPerCubeIter >= total_tiles) { diff --git a/csrc/kernel/kernel_tri_inv_trick.cpp b/csrc/kernel/kernel_tri_inv_trick.cpp index 206a1618..3b3ac470 100644 --- a/csrc/kernel/kernel_tri_inv_trick.cpp +++ b/csrc/kernel/kernel_tri_inv_trick.cpp @@ -6,8 +6,6 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ - -#define MEMORY_BASE #include #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index 0c1a9d80..0b6ae4c0 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -8,7 +8,6 @@ for the full License text. */ #pragma once -#define MEMORY_BASE #include namespace kernel_utils { From 9810a061fbec3fc0a22e399e02de739750b0f2d3 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 12 Mar 2026 16:36:11 +0000 Subject: [PATCH 2/2] (refactor) define MEMORY_BASE in kernel_utills.h --- CMakeLists.txt | 14 ++--------- csrc/kernel/kernel_abs.cpp | 10 +++----- csrc/kernel/kernel_batch_matrix_square.cpp | 3 +-- csrc/kernel/kernel_simple_matmul.cpp | 28 ++++------------------ csrc/kernel/kernel_tri_inv_col_sweep.cpp | 4 ---- csrc/kernel/kernel_tri_inv_rec_unroll.cpp | 4 ---- csrc/kernel/kernel_tri_inv_trick.cpp | 3 +-- csrc/kernel/kernel_utils.h | 6 +++++ 8 files changed, 18 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77fb6212..f24285ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,14 +47,8 @@ set(CMAKE_COMPILER bisheng) set(CMAKE_C_COMPILER ${CMAKE_COMPILER}) set(CMAKE_CXX_COMPILER ${CMAKE_COMPILER}) -add_compile_options( - -D_FORTIFY_SOURCE=2 - -O2 - -DMEMORY_BASE - -std=c++17 - -Wno-macro-redefined - -Wno-ignored-attributes - -fstack-protector-strong) +add_compile_options(-D_FORTIFY_SOURCE=2 -O2 -std=c++17 -Wno-macro-redefined + -Wno-ignored-attributes -fstack-protector-strong) add_link_options(-s -Wl,-z,relro -Wl,-z,now) @@ -120,10 +114,6 @@ ascendc_library( csrc/kernel/kernel_tri_inv_rec_unroll.cpp csrc/kernel/kernel_tri_inv_trick.cpp) -# TODO(anastasios): Configure this depending on the NPU device. For A5, use -# REGISTER_BASE. -ascendc_compile_definitions(no_workspace_kernel PRIVATE MEMORY_BASE=1) - ascendc_include_directories( no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include ${libpto_isa_headers_SOURCE_DIR}/include/pto/common) diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index e14bcc51..a0022d38 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -7,16 +7,13 @@ See LICENSE in the root of the software repository: for the full License text. */ -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - -#include - -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" +#include "kernel_utils.h" using namespace pto; template AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) { +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) // define GlobalData on global memory with shape and stride using ShapeDim5 = pto::Shape<1, 1, 1, matrix_size, matrix_size>; using StrideDim5 = pto::Stride<1, 1, 1, matrix_size, 1>; @@ -76,6 +73,7 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) { set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); } +#endif } extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, @@ -91,5 +89,3 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, // main kernel, in_length is dynamic input runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); } - -#endif diff --git a/csrc/kernel/kernel_batch_matrix_square.cpp b/csrc/kernel/kernel_batch_matrix_square.cpp index 0704f127..c794c827 100644 --- a/csrc/kernel/kernel_batch_matrix_square.cpp +++ b/csrc/kernel/kernel_batch_matrix_square.cpp @@ -6,9 +6,8 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ -#include -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" +#include "kernel_utils.h" using namespace pto; diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp index a3a5b69e..755bcfc2 100644 --- a/csrc/kernel/kernel_simple_matmul.cpp +++ b/csrc/kernel/kernel_simple_matmul.cpp @@ -6,27 +6,7 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ -#if defined __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - -// Placeholder for VEC compilation (the real kernel is CUBE-only). -#include - -extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a, - __gm__ void* b, - __gm__ void* c, - uint32_t matrix_size) {} - -extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a, - __gm__ void* b, - __gm__ void* c, - uint32_t matrix_size) {} - -#elif (__CHECK_FEATURE_AT_PRECOMPILE) || \ - (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) - -#include - -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" +#include "kernel_utils.h" using namespace pto; @@ -45,6 +25,9 @@ AICORE inline void WaitFlag(uint32_t id) { template AICORE void runKernelSimpleMatMul(__gm__ InputT* a, __gm__ InputT* b, __gm__ OutputT* c) { +#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ + (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation + constexpr uint32_t tile_len = matrix_size * matrix_size; /* Global Memory / Tensors */ @@ -113,6 +96,7 @@ AICORE void runKernelSimpleMatMul(__gm__ InputT* a, __gm__ InputT* b, SetFlag(0); // M pipe sets flag for FIX pipe WaitFlag(0); // FIX pipe waits for M pipe to set flag TSTORE(c_global_out, c_l0_tile); +#endif } template @@ -158,5 +142,3 @@ extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a, run_simple_matmul((__gm__ float*)a, (__gm__ float*)b, (__gm__ float*)c, matrix_size); } - -#endif diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index a60e32d2..33dc16b6 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -7,12 +7,8 @@ See LICENSE in the root of the software repository: for the full License text. */ -#include - #include "kernel_utils.h" -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" - using namespace pto; /** diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp index b9ff8ead..86bf90f6 100644 --- a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp +++ b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp @@ -7,12 +7,8 @@ See LICENSE in the root of the software repository: for the full License text. */ -#include -#include - #include "kernel_utils.h" -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" using namespace pto; using namespace kernel_utils; diff --git a/csrc/kernel/kernel_tri_inv_trick.cpp b/csrc/kernel/kernel_tri_inv_trick.cpp index 3b3ac470..b62d3ae5 100644 --- a/csrc/kernel/kernel_tri_inv_trick.cpp +++ b/csrc/kernel/kernel_tri_inv_trick.cpp @@ -6,9 +6,8 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ -#include -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" +#include "kernel_utils.h" using namespace pto; diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index 0b6ae4c0..20936361 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -8,8 +8,14 @@ for the full License text. */ #pragma once +#define MEMORY_BASE + +#include #include +// To avoid #include "kernel_operator.h" +#define GM_ADDR __gm__ uint8_t* + namespace kernel_utils { /** * @brief Do a sync step (set-wait flag) between two pipes.