diff --git a/Makefile b/Makefile index 00d4c776..340fe970 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ CSRC_KERNEL_DIR := csrc/kernel .PHONY: clean setup_once build wheel install docs test test_tri_inv clean: - rm -rf build/ dist/ extra-info/ *.egg-info/ kernel_meta/ pto_kernels-*.whl + rm -rf build/ dist/ extra-info/ *.so *.egg-info/ kernel_meta/ pto_kernels-*.whl setup_once: pip3 install -r requirements.txt @@ -31,10 +31,22 @@ compile_%: -I$(CSRC_KERNEL_DIR) \ -I$(PTO_LIB_PATH)/include \ --npu-arch=dav-2201 \ - -Wno-ignored-attributes \ + -Wno-ignored-attributes \ $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ -o libkernel_$*.so +compile_cpu_%: + g++-15 -fPIC -shared -D__CPU_SIM -std=c++20 \ + -I$(CSRC_KERNEL_DIR) \ + -I$(PTO_LIB_PATH)/include \ + -D_FORTIFY_SOURCE=2 \ + -Wno-macro-redefined \ + -Wno-ignored-attributes \ + -fstack-protector-strong \ + $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ + -o libkernel_$*.so + + install: python3 -m pip install --force-reinstall pto_kernels-*.whl diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index f7792cff..b068e409 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -131,6 +131,16 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, } extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, - uint8_t* y, uint32_t in_length) { - vabs_fp16<<>>(x, y, in_length); + uint8_t* z, uint32_t in_length) { +#ifndef __CPU_SIM + vabs_fp16<<>>(x, z, in_length); +#else + set_block_num(blockDim); + for (uint32_t i = 0; i < blockDim; ++i) { + { + pto::cpu_sim::ScopedExecutionContext ctx(i, 0, 2); + vabs_fp16(x, z, in_length); + } + } +#endif } diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index 1ae74498..a9ad05d3 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -8,9 +8,34 @@ for the full License text. */ #pragma once +#include #include #include +#ifdef __CPU_SIM + +#define __DAV_C220_VEC__ +#define __DAV_C220_CUBE__ + +/** + * * @brief Thread-local variables to store block information for CPU + * simulation. + * */ +inline thread_local uint32_t g_block_num = 1; + +/** + * @brief Global accessor for block number in CPU simulation. + * + * We need this function because pto/common/cpu_stub.hpp doesn't define it. + */ +extern "C" uint32_t get_block_num() { return g_block_num; } + +/** + * @brief Set the number of blocks. + */ +extern "C" void set_block_num(uint32_t block_num) { g_block_num = block_num; } +#endif + // clang-format off: so it does not get wrongfully flagged by linter #ifndef GM_ADDR #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" diff --git a/run_abs_cpu.py b/run_abs_cpu.py new file mode 100644 index 00000000..d255f2fd --- /dev/null +++ b/run_abs_cpu.py @@ -0,0 +1,49 @@ +import os +import ctypes +import torch + +# Select device "cpu" or "npu" +DEVICE = "cpu" + +if __name__ == "__main__": + + try: + lib_path = "libkernel_abs.so" + lib_path = os.path.abspath(lib_path) + lib = ctypes.CDLL(lib_path) + print(f"Loaded library from {lib_path}") + + lib.call_vabs_fp16.restype = None + lib.call_vabs_fp16.argtypes = [ + ctypes.c_uint32, # blockDim + ctypes.c_void_p, # stream + ctypes.c_void_p, # y + ctypes.c_void_p, # x + ctypes.c_int, # N + ] + stream_ptr = torch.npu.current_stream()._as_parameter_ # noqa + + def torch_to_ctypes(tensor): + return ctypes.c_void_p(tensor.data_ptr()) + + block_num = 4 + length = [block_num, 128] + + x = torch.randn(length, device="cpu", dtype=torch.float16).to(DEVICE) + actual = torch.empty_like(x) + expected = torch.abs(x) + + print(f"Input: {x[-20:]}") + lib.call_vabs_fp16( + block_num, + stream_ptr, + torch_to_ctypes(x), + torch_to_ctypes(actual), + x.numel(), + ) + is_close = torch.allclose(actual, expected) + print(f"Is all close? {is_close}") + print(actual[0, :10]) + print(expected[0, :10]) + finally: + del lib # triggers dlclose in CPython