Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ CSRC_KERNEL_DIR := csrc/kernel
.PHONY: clean setup_once build wheel install docs test test_tri_inv

clean:
rm -rf build/ dist/ extra-info/ *.egg-info/ kernel_meta/ pto_kernels-*.whl
rm -rf build/ dist/ extra-info/ *.so *.egg-info/ kernel_meta/ pto_kernels-*.whl

setup_once:
pip3 install -r requirements.txt
Expand All @@ -31,10 +31,22 @@ compile_%:
-I$(CSRC_KERNEL_DIR) \
-I$(PTO_LIB_PATH)/include \
--npu-arch=dav-2201 \
-Wno-ignored-attributes \
-Wno-ignored-attributes \
$(CSRC_KERNEL_DIR)/kernel_$*.cpp \
-o libkernel_$*.so

compile_cpu_%:
g++-15 -fPIC -shared -D__CPU_SIM -std=c++20 \
-I$(CSRC_KERNEL_DIR) \
-I$(PTO_LIB_PATH)/include \
-D_FORTIFY_SOURCE=2 \
-Wno-macro-redefined \
-Wno-ignored-attributes \
-fstack-protector-strong \
$(CSRC_KERNEL_DIR)/kernel_$*.cpp \
-o libkernel_$*.so



install:
python3 -m pip install --force-reinstall pto_kernels-*.whl
Expand Down
14 changes: 12 additions & 2 deletions csrc/kernel/kernel_abs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,16 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z,
}

extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x,
uint8_t* y, uint32_t in_length) {
vabs_fp16<<<blockDim * 2, nullptr, stream>>>(x, y, in_length);
uint8_t* z, uint32_t in_length) {
#ifndef __CPU_SIM
vabs_fp16<<<blockDim * 2, nullptr, stream>>>(x, z, in_length);
#else
set_block_num(blockDim);
for (uint32_t i = 0; i < blockDim; ++i) {
{
pto::cpu_sim::ScopedExecutionContext ctx(i, 0, 2);
vabs_fp16(x, z, in_length);
}
}
#endif
}
25 changes: 25 additions & 0 deletions csrc/kernel/kernel_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,34 @@ for the full License text.
*/
#pragma once

#include <cstdint>
#include <pto/pto-inst.hpp>
#include <type_traits>

#ifdef __CPU_SIM

#define __DAV_C220_VEC__
#define __DAV_C220_CUBE__

/**
* * @brief Thread-local variables to store block information for CPU
* simulation.
* */
inline thread_local uint32_t g_block_num = 1;

/**
* @brief Global accessor for block number in CPU simulation.
*
* We need this function because pto/common/cpu_stub.hpp doesn't define it.
*/
extern "C" uint32_t get_block_num() { return g_block_num; }

/**
* @brief Set the number of blocks.
*/
extern "C" void set_block_num(uint32_t block_num) { g_block_num = block_num; }
#endif

// clang-format off: so it does not get wrongfully flagged by linter
#ifndef GM_ADDR
#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
Expand Down
49 changes: 49 additions & 0 deletions run_abs_cpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
import ctypes
import torch

# Select device "cpu" or "npu"
DEVICE = "cpu"

if __name__ == "__main__":

try:
lib_path = "libkernel_abs.so"
lib_path = os.path.abspath(lib_path)
lib = ctypes.CDLL(lib_path)
print(f"Loaded library from {lib_path}")

lib.call_vabs_fp16.restype = None
lib.call_vabs_fp16.argtypes = [
ctypes.c_uint32, # blockDim
ctypes.c_void_p, # stream
ctypes.c_void_p, # y
ctypes.c_void_p, # x
ctypes.c_int, # N
]
stream_ptr = torch.npu.current_stream()._as_parameter_ # noqa

def torch_to_ctypes(tensor):
return ctypes.c_void_p(tensor.data_ptr())

block_num = 4
length = [block_num, 128]

x = torch.randn(length, device="cpu", dtype=torch.float16).to(DEVICE)
actual = torch.empty_like(x)
expected = torch.abs(x)

print(f"Input: {x[-20:]}")
lib.call_vabs_fp16(
block_num,
stream_ptr,
torch_to_ctypes(x),
torch_to_ctypes(actual),
x.numel(),
)
is_close = torch.allclose(actual, expected)
print(f"Is all close? {is_close}")
print(actual[0, :10])
print(expected[0, :10])
finally:
del lib # triggers dlclose in CPython
Loading