From a3fe045e0db44bffe38a0fbbef9b71712eb9cf6b Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 13 May 2026 06:03:52 +0000 Subject: [PATCH 1/6] (abs) add CPU support --- Makefile | 14 +++++++++-- csrc/kernel/kernel_abs.cpp | 21 ++++++++++++---- csrc/kernel/kernel_utils.h | 22 +++++++++++++++++ run_abs_cpu.py | 49 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 run_abs_cpu.py diff --git a/Makefile b/Makefile index 0bf7a3e8..155b1223 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ CSRC_KERNEL_DIR := csrc/kernel .PHONY: clean setup_once build_cmake build_wheel install docs test test_tri_inv clean: - rm -rf build/ dist/ extra-info/ *.egg-info/ kernel_meta/ pto_kernels-*.whl + rm -rf build/ dist/ extra-info/ *.so *.egg-info/ kernel_meta/ pto_kernels-*.whl setup_once: pip3 install -r requirements.txt @@ -31,10 +31,20 @@ compile_%: -I$(CSRC_KERNEL_DIR) \ -I$(PTO_LIB_PATH)/include \ --npu-arch=dav-2201 \ - -Wno-ignored-attributes \ + -Wno-ignored-attributes \ $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ -o libkernel_$*.so +compile_cpu_%: + g++-15 -fPIC -shared -D__CPU_SIM -std=c++20 \ + -I$(CSRC_KERNEL_DIR) \ + -I$(PTO_LIB_PATH)/include \ + -D_FORTIFY_SOURCE=2 -Wno-macro-redefined -Wno-ignored-attributes \ + -fstack-protector-strong \ + $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ + -o libkernel_$*.so + + install: python3 -m pip install --force-reinstall pto_kernels-*.whl diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index d75e729d..a729fd3a 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -107,7 +107,7 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, uint32_t in_length) { -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) +#if defined(__DAV_C220_VEC__) constexpr uint32_t TILE_LEN = 128; runTAbs((__gm__ half*)x, (__gm__ half*)z, in_length); #else @@ -119,7 +119,7 @@ extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) +#if defined(__DAV_C220_VEC__) constexpr uint32_t TILE_LEN = 128; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); @@ -131,6 +131,19 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, } extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, - uint8_t* y, uint32_t in_length) { - vabs_fp16<<>>(x, y, in_length); + uint8_t* z, uint32_t in_length) { +#ifndef __CPU_SIM + vabs_fp16<<>>(x, z, in_length); +#else + printf("Running CPU mode. block_dim=%d , in_length=%d\n", blockDim, + in_length); + set_block_num(blockDim); + for (uint32_t i = 0; i < blockDim; ++i) { + printf("hello:%d\n", i); + { + pto::cpu_sim::ScopedExecutionContext ctx(i, 0, 2); + vabs_fp16(x, z, in_length); + } + } +#endif } diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index 61acd5bf..48c266be 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -8,9 +8,31 @@ for the full License text. */ #pragma once +#include #include #include +#ifdef __CPU_SIM + +#define __DAV_C220_VEC__ +#define __DAV_C220_CUBE__ + +/** + * * @brief Thread-local variables to store block information for CPU + * simulation. + * */ +inline thread_local uint32_t g_block_num = 1; + +/** + * * @brief Global accessor for block number in CPU simulation. + * * + * * We need this function because pto/common/cpu_stub.hpp doesn't define it. + * */ +extern "C" uint32_t get_block_num() { return g_block_num; } + +extern "C" void set_block_num(uint32_t block_num) { g_block_num = block_num; } +#endif + // clang-format off: so it does not get wrongfully flagged by linter #ifndef GM_ADDR #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" diff --git a/run_abs_cpu.py b/run_abs_cpu.py new file mode 100644 index 00000000..d255f2fd --- /dev/null +++ b/run_abs_cpu.py @@ -0,0 +1,49 @@ +import os +import ctypes +import torch + +# Select device "cpu" or "npu" +DEVICE = "cpu" + +if __name__ == "__main__": + + try: + lib_path = "libkernel_abs.so" + lib_path = os.path.abspath(lib_path) + lib = ctypes.CDLL(lib_path) + print(f"Loaded library from {lib_path}") + + lib.call_vabs_fp16.restype = None + lib.call_vabs_fp16.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # y + ctypes.c_void_p, # x + ctypes.c_int, # N + ] + stream_ptr = torch.npu.current_stream()._as_parameter_ # noqa + + def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + block_num = 4 + length = [block_num, 128] + + x = torch.randn(length, device="cpu", dtype=torch.float16).to(DEVICE) + actual = torch.empty_like(x) + expected = torch.abs(x) + + print(f"Input: {x[-20:]}") + lib.call_vabs_fp16( + block_num, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(actual), + x.numel(), + ) + is_close = torch.allclose(actual, expected) + print(f"Is all close? {is_close}") + print(actual[0, :10]) + print(expected[0, :10]) + finally: + del lib # triggers dlclose in CPython From 4881ab2bbcdd6fea4989e4a00f92e391f297d231 Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 13 May 2026 06:05:44 +0000 Subject: [PATCH 2/6] remove printfs --- csrc/kernel/kernel_abs.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index a729fd3a..618059b5 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -135,11 +135,8 @@ extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, #ifndef __CPU_SIM vabs_fp16<<>>(x, z, in_length); #else - printf("Running CPU mode. block_dim=%d , in_length=%d\n", blockDim, - in_length); set_block_num(blockDim); for (uint32_t i = 0; i < blockDim; ++i) { - printf("hello:%d\n", i); { pto::cpu_sim::ScopedExecutionContext ctx(i, 0, 2); vabs_fp16(x, z, in_length); From b8a926383d38358853345369f3badcf79c5d1fa5 Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 13 May 2026 06:21:51 +0000 Subject: [PATCH 3/6] remove local thread --- csrc/kernel/kernel_abs.cpp | 1 - csrc/kernel/kernel_utils.h | 11 ++--------- run_abs_cpu.py | 1 - 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 618059b5..e45a918c 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -135,7 +135,6 @@ extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, #ifndef __CPU_SIM vabs_fp16<<>>(x, z, in_length); #else - set_block_num(blockDim); for (uint32_t i = 0; i < blockDim; ++i) { { pto::cpu_sim::ScopedExecutionContext ctx(i, 0, 2); diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index 48c266be..b7d9d4d8 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -14,23 +14,16 @@ for the full License text. #ifdef __CPU_SIM +// To make sure that both vector and cube code is executed #define __DAV_C220_VEC__ #define __DAV_C220_CUBE__ -/** - * * @brief Thread-local variables to store block information for CPU - * simulation. - * */ -inline thread_local uint32_t g_block_num = 1; - /** * * @brief Global accessor for block number in CPU simulation. * * * * We need this function because pto/common/cpu_stub.hpp doesn't define it. * */ -extern "C" uint32_t get_block_num() { return g_block_num; } - -extern "C" void set_block_num(uint32_t block_num) { g_block_num = block_num; } +extern "C" uint32_t get_block_num() { return 1; } #endif // clang-format off: so it does not get wrongfully flagged by linter diff --git a/run_abs_cpu.py b/run_abs_cpu.py index d255f2fd..d4a04a8c 100644 --- a/run_abs_cpu.py +++ b/run_abs_cpu.py @@ -33,7 +33,6 @@ def torch_to_ctypes(tensor): actual = torch.empty_like(x) expected = torch.abs(x) - print(f"Input: {x[-20:]}") lib.call_vabs_fp16( block_num, stream_ptr, From ec9ff8b7d168595752822c4fc1a8c708b7e8629d Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 13 May 2026 06:24:35 +0000 Subject: [PATCH 4/6] fix --- csrc/kernel/kernel_utils.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index b7d9d4d8..d7624d81 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -19,10 +19,10 @@ for the full License text. #define __DAV_C220_CUBE__ /** - * * @brief Global accessor for block number in CPU simulation. - * * - * * We need this function because pto/common/cpu_stub.hpp doesn't define it. - * */ + * @brief Global accessor for block number in CPU simulation. + * + * We need this function because pto/common/cpu_stub.hpp doesn't define it. + */ extern "C" uint32_t get_block_num() { return 1; } #endif From 0da10b6d8fe5a3c7d07ea7f86cde4da154e39daf Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 13 May 2026 07:16:18 +0000 Subject: [PATCH 5/6] Revert "remove local thread" This reverts commit b8a926383d38358853345369f3badcf79c5d1fa5. --- csrc/kernel/kernel_abs.cpp | 1 + csrc/kernel/kernel_utils.h | 14 ++++++++++++-- run_abs_cpu.py | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index e45a918c..618059b5 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -135,6 +135,7 @@ extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, #ifndef __CPU_SIM vabs_fp16<<>>(x, z, in_length); #else + set_block_num(blockDim); for (uint32_t i = 0; i < blockDim; ++i) { { pto::cpu_sim::ScopedExecutionContext ctx(i, 0, 2); diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index d7624d81..a30f3eaa 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -14,16 +14,26 @@ for the full License text. #ifdef __CPU_SIM -// To make sure that both vector and cube code is executed #define __DAV_C220_VEC__ #define __DAV_C220_CUBE__ +/** + * * @brief Thread-local variables to store block information for CPU + * simulation. + * */ +inline thread_local uint32_t g_block_num = 1; + /** * @brief Global accessor for block number in CPU simulation. * * We need this function because pto/common/cpu_stub.hpp doesn't define it. */ -extern "C" uint32_t get_block_num() { return 1; } +extern "C" uint32_t get_block_num() { return g_block_num; } + +/** + * @brief Set the number of blocks. + */ +extern "C" void set_block_num(uint32_t block_num) { g_block_num = block_num; } #endif // clang-format off: so it does not get wrongfully flagged by linter diff --git a/run_abs_cpu.py b/run_abs_cpu.py index d4a04a8c..d255f2fd 100644 --- a/run_abs_cpu.py +++ b/run_abs_cpu.py @@ -33,6 +33,7 @@ def torch_to_ctypes(tensor): actual = torch.empty_like(x) expected = torch.abs(x) + print(f"Input: {x[-20:]}") lib.call_vabs_fp16( block_num, stream_ptr, From af8d45563ccfe0ae9e89dbe66bfb2c2b44301312 Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 13 May 2026 11:46:41 +0000 Subject: [PATCH 6/6] fix --- CMakeLists.txt | 2 +- Makefile | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c9e5eba7..a6054f79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.16.0) project(pto-kernels) set(LINUX TRUE) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/Makefile b/Makefile index 155b1223..e8e7c5af 100644 --- a/Makefile +++ b/Makefile @@ -39,8 +39,10 @@ compile_cpu_%: g++-15 -fPIC -shared -D__CPU_SIM -std=c++20 \ -I$(CSRC_KERNEL_DIR) \ -I$(PTO_LIB_PATH)/include \ - -D_FORTIFY_SOURCE=2 -Wno-macro-redefined -Wno-ignored-attributes \ - -fstack-protector-strong \ + -D_FORTIFY_SOURCE=2 \ + -Wno-macro-redefined \ + -Wno-ignored-attributes \ + -fstack-protector-strong \ $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ -o libkernel_$*.so