Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 31 additions & 35 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,9 @@ add_link_options(-s -Wl,-z,relro -Wl,-z,now)
set(CMAKE_CPP_COMPILE_OPTIONS -xc++ "SHELL:-include stdint.h"
"SHELL:-include stddef.h")

include_directories(${ASCEND_HOME_PATH}/include
include_directories(${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_DRIVER_PATH}/kernel/inc)

if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR
${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR
${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR
${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(
FATAL_ERROR
"ascendc_kernel_cmake does not exist, please check whether the cann package is installed."
)
endif()

include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

include(FetchContent)

# certain operations need newer pto-isa header, not CANN 8.5.0 default (pin
Expand Down Expand Up @@ -108,21 +90,35 @@ message("* TORCH_NPU_PATH : ${TORCH_NPU_PATH}")
message("* TORCH_LIBRARIES : ${TORCH_LIBRARIES}")
message("***********************************************************")

ascendc_library(
no_workspace_kernel
SHARED
csrc/kernel/kernel_tri_inv_col_sweep.cpp
csrc/kernel/kernel_abs.cpp
csrc/kernel/kernel_csr_gather.cpp
csrc/kernel/kernel_simple_matmul.cpp
csrc/kernel/kernel_batch_matrix_square.cpp
csrc/kernel/kernel_tri_inv_rec_unroll.cpp
csrc/kernel/kernel_tri_inv_trick.cpp
csrc/kernel/kernel_swiglu.cpp)

ascendc_include_directories(
no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include
${libpto_isa_headers_SOURCE_DIR}/include/pto/common)
# Kernel translation units that are compiled together into
# libno_workspace_kernel.so. Absolute paths are used because the same list is
# passed both on the bisheng command line and as DEPENDS of the custom command.
set(KERNEL_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_col_sweep.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_abs.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_simple_matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_batch_matrix_square.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_trick.cpp
# kernel_csr_gather.cpp is currently excluded from the kernel library build.
# ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_csr_gather.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_swiglu.cpp)

# Location of the kernel shared library produced by the bisheng rule below.
set(NO_WORKSPACE_KERNEL_LIB
    ${CMAKE_CURRENT_BINARY_DIR}/libno_workspace_kernel.so)

# Compile every file in KERNEL_SOURCES into a single shared library by invoking
# the bisheng compiler directly (one command, all sources, one output).
#
# VERBATIM makes CMake escape each argument consistently for the build tool
# instead of relying on platform-dependent quoting.
#
# NOTE(review): DEPENDS lists only the .cpp files, so headers included by the
# kernels are not tracked here and editing one will not re-run this rule.
# NOTE(review): -O0 -g produces an unoptimised debug build — confirm that this
# is the intended configuration for the installed library.
add_custom_command(
  OUTPUT ${NO_WORKSPACE_KERNEL_LIB}
  COMMAND
    bisheng -fPIC -shared -xcce -O0 -g -std=c++17 --npu-arch=dav-2201
    -I${libpto_isa_headers_SOURCE_DIR}/include ${KERNEL_SOURCES} -o
    ${NO_WORKSPACE_KERNEL_LIB}
  DEPENDS ${KERNEL_SOURCES}
  COMMENT "Building no_workspace_kernel with bisheng compiler"
  VERBATIM)
Comment thread
zouzias marked this conversation as resolved.

# Driver target for the custom command that outputs NO_WORKSPACE_KERNEL_LIB.
# ALL makes it part of the default build, so the library is always produced.
add_custom_target(no_workspace_kernel_build ALL
DEPENDS ${NO_WORKSPACE_KERNEL_LIB})

# Expose the externally built library under the target name
# `no_workspace_kernel` so other targets can refer to it like a regular
# library. GLOBAL makes the imported target visible outside this directory.
add_library(no_workspace_kernel SHARED IMPORTED GLOBAL)
set_target_properties(no_workspace_kernel PROPERTIES IMPORTED_LOCATION
${NO_WORKSPACE_KERNEL_LIB})
# An imported target builds nothing itself; this dependency is followed by the
# targets that depend on it, so the .so exists before they are built.
add_dependencies(no_workspace_kernel no_workspace_kernel_build)

pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp)

Expand Down Expand Up @@ -154,7 +150,7 @@ if(PIP_INSTALL)
LINK_FLAGS "-Wl,-rpath,\${ORIGIN}/lib")

# install dynamic libraries under site-packages/pto_kernels/lib
install(TARGETS no_workspace_kernel LIBRARY DESTINATION pto_kernels/lib)
install(FILES ${NO_WORKSPACE_KERNEL_LIB} DESTINATION pto_kernels/lib)
else()
message(STATUS ">>>======================================================")
message(STATUS ">>> Ignoring dynamic libraries COPY inside Python wheel.")
Expand Down
18 changes: 17 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
# https://github.com/huawei-csl/pto-kernels/
# for the full License text.
# --------------------------------------------------------------------------------
.PHONY: clean setup_once build_wheel install test
PTO_LIB_PATH ?= $(ASCEND_TOOLKIT_HOME)
CSRC_KERNEL_DIR := csrc/kernel

.PHONY: clean setup_once build_cmake build_wheel install docs test test_tri_inv

clean:
rm -rf build/ dist/ extra-info/ *.egg-info/ kernel_meta/
Expand All @@ -20,6 +23,19 @@ build_cmake: clean
build_wheel:
export CMAKE_GENERATOR="Unix Makefiles" && pip wheel -v . --extra-index-url https://download.pytorch.org/whl/cpu


# Pattern rule for building a single kernel as a standalone shared library,
# without building the whole wheel package. Useful for development and
# debugging of individual kernels.
#
# The stem after 'compile_' selects the source file: 'make compile_abs'
# compiles '$(CSRC_KERNEL_DIR)/kernel_abs.cpp' into 'libkernel_abs.so', which is
# written to the directory make is run from.
#
# Headers are searched in $(CSRC_KERNEL_DIR) and $(PTO_LIB_PATH)/include;
# PTO_LIB_PATH defaults to $(ASCEND_TOOLKIT_HOME) and can be overridden on the
# command line, e.g. 'make compile_abs PTO_LIB_PATH=/path/to/pto'.
compile_%:
bisheng -fPIC -shared -xcce -DMEMORY_BASE -O2 -std=c++17 \
-I$(CSRC_KERNEL_DIR) \
-I$(PTO_LIB_PATH)/include \
--npu-arch=dav-2201 \
-Wno-ignored-attributes \
$(CSRC_KERNEL_DIR)/kernel_$*.cpp \
-o libkernel_$*.so


install:
python3 -m pip install --force-reinstall pto_kernels-*.whl

Expand Down
6 changes: 3 additions & 3 deletions csrc/host/pybind11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ for the full License text.

#include "torch_abs.h"
#include "torch_batch_matrix_square.h"
#include "torch_csr_gather.h"
// #include "torch_csr_gather.h"
#include "torch_simple_matmul.h"
#include "torch_swiglu.h"
#include "torch_tri_inv.h"
#include "torch_tri_inv_col_sweep.h"
#include "torch_tri_inv_rec_unroll.h"
#include "torch_tri_inv_trick.h"

Expand All @@ -37,7 +37,7 @@ PYBIND11_MODULE(pto_kernels_ops, m) {
pybind11::arg("device_id") = 0);
m.def("pto_abs", &pto_isa_ops::run_abs);
m.def("pto_batch_matrix_square", &pto_isa_ops::run_batch_matrix_square);
m.def("pto_csr_gather", &pto_isa_ops::run_csr_gather);
// m.def("pto_csr_gather", &pto_isa_ops::run_csr_gather);
m.def("pto_simple_matmul", &pto_isa_ops::run_simple_matmul);
m.def("pto_swiglu", &pto_isa_ops::run_swiglu, py::arg("x"),
py::arg("dim") = -1);
Expand Down
23 changes: 14 additions & 9 deletions csrc/host/torch_abs.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@ for the full License text.
#pragma once

#include <ATen/ATen.h>
#include <acl/acl.h>
#include <torch/library.h>

#include "aclrtlaunch_vabs_fp16.h"
#include "aclrtlaunch_vabs_fp32.h"
#include "utils.h"

namespace pto_isa_ops {
/**
 * @brief Launch entry point for the fp16 element-wise absolute value kernel.
 *
 * Declared with C linkage; the definition is not part of this header
 * (presumably provided by the kernel shared library — TODO confirm).
 *
 * @param blockDim     Number of blocks to launch.
 * @param stream       ACL stream the launch is issued on.
 * @param x            Input buffer (run_abs passes the input tensor).
 * @param y            Output buffer (run_abs passes the result tensor).
 * @param num_elements Total number of elements to process.
 */
extern "C" void call_vabs_fp16(uint32_t blockDim, aclrtStream stream, void* x,
void* y, uint32_t num_elements);

/**
 * @brief Launch entry point for the fp32 element-wise absolute value kernel.
 *
 * Same parameters as call_vabs_fp16.
 */
extern "C" void call_vabs_fp32(uint32_t blockDim, aclrtStream stream, void* x,
void* y, uint32_t num_elements);

namespace pto_isa_ops {
/**
* @brief Runs element-wise absolute value.
*
Expand All @@ -26,26 +30,27 @@ namespace pto_isa_ops {

at::Tensor run_abs(const at::Tensor& x) {
const auto dtype = x.options().dtype();
at::Tensor z = at::empty_like(x);
const at::Tensor z = at::empty_like(x);
// Define the number of blocks of vector core
const uint32_t total_size = x.numel();
// FIXME: tile length is fixed to 128 for now
constexpr uint32_t TILE_SIZE = 128;

// Persistent kernel launch parameter
uint32_t total_tiles = (total_size + TILE_SIZE - 1) / TILE_SIZE;
uint32_t block_dim = GetNumVectorCores();
uint32_t block_dim = GetNumCubeCores();

if (total_tiles < block_dim) {
block_dim = total_tiles;
}

auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
if (dtype == at::kHalf) {
EXEC_KERNEL_CMD(vabs_fp16, block_dim, x, z, total_size);

call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z),
total_size);
} else if (dtype == at::kFloat) {
EXEC_KERNEL_CMD(vabs_fp32, block_dim, x, z, total_size);

call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z),
total_size);
} else {
throw std::runtime_error("Unsupported dtype for `pto_abs` kernel");
}
Expand Down
17 changes: 13 additions & 4 deletions csrc/host/torch_batch_matrix_square.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ for the full License text.
#pragma once

#include <ATen/ATen.h>
#include <acl/acl.h>
#include <torch/library.h>

#include "aclrtlaunch_batch_matrix_square_fp16.h"
#include "aclrtlaunch_batch_matrix_square_fp32.h"
#include "utils.h"

/**
 * @brief Launch entry points for the batch matrix square kernels (fp16/fp32).
 *
 * Declared with C linkage; the definitions are not part of this header
 * (presumably provided by the kernel shared library — TODO confirm).
 *
 * Note the argument order: the output buffer comes before the input buffer.
 *
 * @param blockDim    Number of blocks to launch.
 * @param stream      ACL stream the launch is issued on.
 * @param z           Output buffer (run_batch_matrix_square passes the result
 *                    tensor it returns).
 * @param x           Input buffer.
 * @param matrix_size Matrix size forwarded from run_batch_matrix_square.
 */
extern "C" void call_batch_matrix_square_fp16(uint32_t blockDim,
aclrtStream stream, void* z,
void* x, uint32_t matrix_size);
extern "C" void call_batch_matrix_square_fp32(uint32_t blockDim,
aclrtStream stream, void* z,
void* x, uint32_t matrix_size);

namespace pto_isa_ops {

/**
Expand Down Expand Up @@ -46,10 +52,13 @@ at::Tensor run_batch_matrix_square(const at::Tensor& x) {
at::zeros({block_dim, matrix_size, matrix_size},
at::TensorOptions().dtype(dtype_out).device(device));

auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
if (dtype == at::kHalf) {
EXEC_KERNEL_CMD(batch_matrix_square_fp16, block_dim, z, x, matrix_size);
call_batch_matrix_square_fp16(block_dim, acl_stream, ConvertType(z),
ConvertType(x), matrix_size);
} else if (dtype == at::kFloat) {
EXEC_KERNEL_CMD(batch_matrix_square_fp32, block_dim, z, x, matrix_size);
call_batch_matrix_square_fp32(block_dim, acl_stream, ConvertType(z),
ConvertType(x), matrix_size);
}

return z;
Expand Down
16 changes: 12 additions & 4 deletions csrc/host/torch_simple_matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@ for the full License text.
#include <ATen/ATen.h>
#include <torch/library.h>

#include "aclrtlaunch_simple_matmul_fp16.h"
#include "aclrtlaunch_simple_matmul_fp32.h"
#include "utils.h"

/**
 * @brief Launch entry points for the simple matmul kernels (fp16/fp32).
 *
 * Declared with C linkage; the definitions are not part of this header
 * (presumably provided by the kernel shared library — TODO confirm).
 *
 * NOTE(review): aclrtStream is used here but no <acl/acl.h> include is
 * visible next to the other includes of this header — confirm that "utils.h"
 * (or another include) declares it.
 *
 * @param blockDim    Number of blocks to launch.
 * @param stream      ACL stream the launch is issued on.
 * @param a           First input buffer.
 * @param b           Second input buffer.
 * @param c           Output buffer (run_simple_matmul passes the tensor it
 *                    returns).
 * @param matrix_size Matrix size forwarded from run_simple_matmul.
 */
extern "C" void call_simple_matmul_fp16(uint32_t blockDim, aclrtStream stream,
void* a, void* b, void* c,
uint32_t matrix_size);
extern "C" void call_simple_matmul_fp32(uint32_t blockDim, aclrtStream stream,
void* a, void* b, void* c,
uint32_t matrix_size);

namespace pto_isa_ops {

/**
Expand Down Expand Up @@ -45,10 +50,13 @@ at::Tensor run_simple_matmul(const at::Tensor& a, const at::Tensor& b) {
at::ones({matrix_size, matrix_size},
at::TensorOptions().dtype(dtype_out).device(device));

auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
if (dtype == at::kHalf) {
EXEC_KERNEL_CMD(simple_matmul_fp16, block_dim, a, b, c, matrix_size);
call_simple_matmul_fp16(block_dim, acl_stream, ConvertType(a),
ConvertType(b), ConvertType(c), matrix_size);
} else if (dtype == at::kFloat) {
EXEC_KERNEL_CMD(simple_matmul_fp32, block_dim, a, b, c, matrix_size);
call_simple_matmul_fp32(block_dim, acl_stream, ConvertType(a),
ConvertType(b), ConvertType(c), matrix_size);
}

return c;
Expand Down
8 changes: 6 additions & 2 deletions csrc/host/torch_swiglu.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ for the full License text.

#include <limits>

#include "aclrtlaunch_swiglu_fp16.h"
#include "utils.h"

/**
 * @brief Launch entry point for the fp16 SwiGLU kernel.
 *
 * Declared with C linkage; the definition is not part of this header
 * (presumably provided by the kernel shared library — TODO confirm).
 *
 * The declaration is placed after the includes, matching the other host
 * headers, so that it does not depend on what earlier includes happen to
 * declare.
 *
 * NOTE(review): this launcher is declared as returning uint32_t while the
 * other call_* launchers return void, and run_swiglu ignores the result —
 * confirm the return type against the kernel-side definition.
 *
 * @param blockDim Number of blocks to launch.
 * @param stream   ACL stream the launch is issued on.
 * @param x        Input buffer.
 * @param y        Output buffer (run_swiglu passes the tensor it returns).
 * @param batch    Batch count forwarded from run_swiglu.
 * @param input_n  Input width forwarded from run_swiglu.
 */
extern "C" uint32_t call_swiglu_fp16(uint32_t blockDim, aclrtStream stream,
                                     void* x, void* y, uint32_t batch,
                                     uint32_t input_n);

namespace pto_isa_ops {
Expand Down Expand Up @@ -62,8 +64,10 @@ at::Tensor run_swiglu(const at::Tensor& x, int64_t dim = -1) {
const uint32_t input_n = static_cast<uint32_t>(input_n_i64);
const uint32_t block_dim = GetNumCubeCores();

auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
at::Tensor y = at::empty({batch_i64, output_n_i64}, x.options());
EXEC_KERNEL_CMD(swiglu_fp16, block_dim, x, y, batch, input_n);
call_swiglu_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(y), batch,
input_n);
return y;
}

Expand Down
26 changes: 17 additions & 9 deletions csrc/host/torch_tri_inv.h → csrc/host/torch_tri_inv_col_sweep.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,18 @@ for the full License text.
#include <ATen/ATen.h>
#include <torch/library.h>

#include "aclrtlaunch_triv_inv_col_sweep_fp16.h"
#include "aclrtlaunch_triv_inv_col_sweep_fp32.h"
#include "utils.h"

/**
 * @brief Launch entry point for the fp16 column-sweep triangular inverse
 * kernel.
 *
 * Declared with C linkage; the definition is not part of this header
 * (presumably provided by the kernel shared library — TODO confirm).
 *
 * Note the argument order: the output buffer comes before the input buffer.
 *
 * NOTE(review): the symbol is spelled "triv_inv" while the file and the
 * Python-facing names use "tri_inv" — the spelling must match the kernel-side
 * definition, so do not rename one side only.
 *
 * @param blockDim    Number of blocks to launch.
 * @param stream      ACL stream the launch is issued on.
 * @param M_inv       Output buffer (run_tri_inv passes the tensor it returns).
 * @param M           Input buffer.
 * @param num_elems   Element count forwarded from run_tri_inv.
 * @param matrix_size Matrix size forwarded from run_tri_inv.
 */
extern "C" void call_triv_inv_col_sweep_fp16(uint32_t blockDim,
aclrtStream stream, void* M_inv,
void* M, uint32_t num_elems,
uint32_t matrix_size);

/**
 * @brief Launch entry point for the fp32 column-sweep triangular inverse
 * kernel.
 *
 * Same parameters as call_triv_inv_col_sweep_fp16.
 */
extern "C" void call_triv_inv_col_sweep_fp32(uint32_t blockDim,
aclrtStream stream, void* M_inv,
void* M, uint32_t num_elems,
uint32_t matrix_size);

namespace pto_isa_ops {

/**
Expand Down Expand Up @@ -48,16 +56,16 @@ at::Tensor run_tri_inv(const at::Tensor& x) {

const at::Tensor z = at::empty_like(x);

auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
if (dtype == at::kHalf) {
EXEC_KERNEL_CMD(triv_inv_col_sweep_fp16, block_dim, x, z, num_elems,
matrix_size);

call_triv_inv_col_sweep_fp16(block_dim, acl_stream, ConvertType(z),
ConvertType(x), num_elems, matrix_size);
} else if (dtype == at::kFloat) {
EXEC_KERNEL_CMD(triv_inv_col_sweep_fp32, block_dim, x, z, num_elems,
matrix_size);

call_triv_inv_col_sweep_fp32(block_dim, acl_stream, ConvertType(z),
ConvertType(x), num_elems, matrix_size);
} else {
throw std::runtime_error("Unsupported dtype for `tri_inv` kernel");
throw std::runtime_error(
"Unsupported dtype for `triv_inv_col_sweep` kernel");
}

return z;
Expand Down
20 changes: 15 additions & 5 deletions csrc/host/torch_tri_inv_rec_unroll.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,16 @@ for the full License text.
#pragma once

#include <ATen/ATen.h>
#include <acl/acl.h>
#include <torch/library.h>

#include "aclrtlaunch_tri_inv_rec_unroll_fp16.h"
#include "utils.h"

/**
 * @brief Launch entry point for the fp16 recursive-unroll triangular inverse
 * kernel.
 *
 * Declared with C linkage; the definition is not part of this header
 * (presumably provided by the kernel shared library — TODO confirm).
 *
 * @param blockDim       Number of blocks to launch.
 * @param stream         ACL stream the launch is issued on.
 * @param M_inv          Output buffer.
 * @param M              Input buffer.
 * @param I_neg          Auxiliary buffer; the caller passes a tensor whose
 *                       diagonal has been filled with -1.
 * @param matrix_size    Matrix size forwarded from the caller.
 * @param num_matrices   Number of matrices; the caller passes its tile count.
 * @param num_bsnd_heads Head count forwarded from the caller.
 * @param cu_seqlens     Pointer to the cu_seqlens data, or nullptr; the caller
 *                       passes nullptr when cu_seqlens has exactly one element.
 */
extern "C" void call_tri_inv_rec_unroll_fp16(
uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg,
uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads,
void* cu_seqlens);

namespace pto_isa_ops {

/**
Expand Down Expand Up @@ -72,14 +77,19 @@ at::Tensor run_tri_inv_rec_unroll(
at::TensorOptions().dtype(dtype).device(device));
I_neg.fill_diagonal_(-1);

auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
if (dtype == at::kHalf) {
if (cu_seqlens.numel() == 1) {
void* void_null_ptr = nullptr;
EXEC_KERNEL_CMD(tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg,
matrix_size, total_tiles, num_bsnd_heads, void_null_ptr);
call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv),
ConvertType(M), ConvertType(I_neg),
matrix_size, total_tiles, num_bsnd_heads,
void_null_ptr);
} else {
EXEC_KERNEL_CMD(tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg,
matrix_size, total_tiles, num_bsnd_heads, cu_seqlens);
call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv),
ConvertType(M), ConvertType(I_neg),
matrix_size, total_tiles, num_bsnd_heads,
cu_seqlens.data_ptr());
}
}

Expand Down
Loading
Loading