Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 28 additions & 32 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,9 @@ add_link_options(-s -Wl,-z,relro -Wl,-z,now)
set(CMAKE_CPP_COMPILE_OPTIONS -xc++ "SHELL:-include stdint.h"
"SHELL:-include stddef.h")

include_directories(${ASCEND_HOME_PATH}/include
include_directories(${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_DRIVER_PATH}/kernel/inc)

if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR
${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR
${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR
${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(
FATAL_ERROR
"ascendc_kernel_cmake does not exist, please check whether the cann package is installed."
)
endif()

include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

include(FetchContent)

FetchContent_Declare(
Expand Down Expand Up @@ -104,19 +86,33 @@ message("* TORCH_NPU_PATH : ${TORCH_NPU_PATH}")
message("* TORCH_LIBRARIES : ${TORCH_LIBRARIES}")
message("***********************************************************")

ascendc_library(
no_workspace_kernel
SHARED
csrc/kernel/kernel_tri_inv_col_sweep.cpp
csrc/kernel/kernel_abs.cpp
csrc/kernel/kernel_simple_matmul.cpp
csrc/kernel/kernel_batch_matrix_square.cpp
csrc/kernel/kernel_tri_inv_rec_unroll.cpp
csrc/kernel/kernel_tri_inv_trick.cpp)
set(KERNEL_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_col_sweep.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_abs.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_simple_matmul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_batch_matrix_square.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_trick.cpp)

set(NO_WORKSPACE_KERNEL_LIB
${CMAKE_CURRENT_BINARY_DIR}/libno_workspace_kernel.so)

add_custom_command(
OUTPUT ${NO_WORKSPACE_KERNEL_LIB}
COMMAND
bisheng -fPIC -shared -xcce -O2 -std=c++17 --npu-arch=dav-2201
-I${libpto_isa_headers_SOURCE_DIR}/include ${KERNEL_SOURCES} -o
${NO_WORKSPACE_KERNEL_LIB}
DEPENDS ${KERNEL_SOURCES}
COMMENT "Building no_workspace_kernel with bisheng compiler")
Comment thread
zouzias marked this conversation as resolved.

add_custom_target(no_workspace_kernel_build ALL
DEPENDS ${NO_WORKSPACE_KERNEL_LIB})

ascendc_include_directories(
no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include
${libpto_isa_headers_SOURCE_DIR}/include/pto/common)
add_library(no_workspace_kernel SHARED IMPORTED GLOBAL)
set_target_properties(no_workspace_kernel PROPERTIES IMPORTED_LOCATION
${NO_WORKSPACE_KERNEL_LIB})
add_dependencies(no_workspace_kernel no_workspace_kernel_build)

pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp)

Expand Down Expand Up @@ -148,7 +144,7 @@ if(PIP_INSTALL)
LINK_FLAGS "-Wl,-rpath,\${ORIGIN}/lib")

# install dynamic libraries under site-packages/pto_kernels/libs
install(TARGETS no_workspace_kernel LIBRARY DESTINATION pto_kernels/lib)
install(FILES ${NO_WORKSPACE_KERNEL_LIB} DESTINATION pto_kernels/lib)
else()
message(STATUS ">>>======================================================")
message(STATUS ">>> Ignoring dynamic libraries COPY inside Python wheel.")
Expand Down
2 changes: 1 addition & 1 deletion csrc/host/pybind11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ for the full License text.
#include "torch_abs.h"
#include "torch_batch_matrix_square.h"
#include "torch_simple_matmul.h"
#include "torch_tri_inv.h"
#include "torch_tri_inv_col_sweep.h"
#include "torch_tri_inv_rec_unroll.h"
#include "torch_tri_inv_trick.h"

Expand Down
9 changes: 7 additions & 2 deletions csrc/host/torch_abs.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,17 @@ for the full License text.
#pragma once

#include <ATen/ATen.h>
#include <acl/acl.h>
#include <torch/library.h>

#include "aclrtlaunch_vabs_fp16.h"
#include "aclrtlaunch_vabs_fp32.h"
#include "utils.h"

extern "C" aclError vabs_fp16(uint32_t blockDim, aclrtStream stream, void* x,
void* z, uint32_t in_length);

extern "C" aclError vabs_fp32(uint32_t blockDim, aclrtStream stream, void* x,
void* z, uint32_t in_length);

namespace pto_isa_ops {

/**
Expand Down
10 changes: 8 additions & 2 deletions csrc/host/torch_batch_matrix_square.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@ for the full License text.
#pragma once

#include <ATen/ATen.h>
#include <acl/acl.h>
#include <torch/library.h>

#include "aclrtlaunch_batch_matrix_square_fp16.h"
#include "aclrtlaunch_batch_matrix_square_fp32.h"
#include "utils.h"

extern "C" aclError batch_matrix_square_fp16(uint32_t blockDim,
aclrtStream stream, void* z,
void* x, uint32_t matrix_size);
extern "C" aclError batch_matrix_square_fp32(uint32_t blockDim,
aclrtStream stream, void* z,
void* x, uint32_t matrix_size);

namespace pto_isa_ops {

/**
Expand Down
9 changes: 7 additions & 2 deletions csrc/host/torch_simple_matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@ for the full License text.
#include <ATen/ATen.h>
#include <torch/library.h>

#include "aclrtlaunch_simple_matmul_fp16.h"
#include "aclrtlaunch_simple_matmul_fp32.h"
#include "utils.h"

extern "C" aclError simple_matmul_fp16(uint32_t blockDim, aclrtStream stream,
void* a, void* b, void* c,
uint32_t matrix_size);
extern "C" aclError simple_matmul_fp32(uint32_t blockDim, aclrtStream stream,
void* a, void* b, void* c,
uint32_t matrix_size);

namespace pto_isa_ops {

/**
Expand Down
12 changes: 10 additions & 2 deletions csrc/host/torch_tri_inv.h → csrc/host/torch_tri_inv_col_sweep.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,18 @@ for the full License text.
#include <ATen/ATen.h>
#include <torch/library.h>

#include "aclrtlaunch_triv_inv_col_sweep_fp16.h"
#include "aclrtlaunch_triv_inv_col_sweep_fp32.h"
#include "utils.h"

extern "C" aclError triv_inv_col_sweep_fp16(uint32_t blockDim,
aclrtStream stream, void* M_inv,
void* M, uint32_t num_elems,
uint32_t matrix_size);

extern "C" aclError triv_inv_col_sweep_fp32(uint32_t blockDim,
aclrtStream stream, void* M_inv,
void* M, uint32_t num_elems,
uint32_t matrix_size);

namespace pto_isa_ops {

/**
Expand Down
9 changes: 8 additions & 1 deletion csrc/host/torch_tri_inv_rec_unroll.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,18 @@ for the full License text.
#pragma once

#include <ATen/ATen.h>
#include <acl/acl.h>
#include <torch/library.h>

#include "aclrtlaunch_tri_inv_rec_unroll_fp16.h"
#include "utils.h"

extern "C" aclError tri_inv_rec_unroll_fp16(
uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg,
uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads);
extern "C" aclError tri_inv_rec_unroll_fp32(
uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg,
uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads);

namespace pto_isa_ops {

/**
Expand Down
6 changes: 5 additions & 1 deletion csrc/host/torch_tri_inv_trick.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@ for the full License text.
#include <ATen/ATen.h>
#include <torch/library.h>

#include "aclrtlaunch_tri_inv_trick_fp16.h"
#include "utils.h"

extern "C" aclError tri_inv_trick_fp16(uint32_t blockDim, aclrtStream stream,
void* M_inv, void* M, void* I_neg,
uint32_t matrix_size,
uint32_t max_block_size);

namespace pto_isa_ops {

/**
Expand Down
29 changes: 13 additions & 16 deletions csrc/host/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ namespace pto_isa_ops {

#define DEVICE_TYPE c10::DeviceType::PrivateUse1

// Copied from tools/build/asc_rt/ascendc_runtime.h to avoid dependency on the
// header file. See
// https://gitcode.com/cann/asc-devkit/blob/v8.5.0/tools/build/asc_rt/ascendc_runtime.h
#define ASSERT_RETVAL(exp, ret) \
do { \
if (!(exp)) { \
Expand Down Expand Up @@ -132,18 +129,18 @@ constexpr auto ConvertTypes(Ts&... args) {
return std::make_tuple(ConvertType(args)...);
}

#define EXEC_KERNEL_CMD(kernel_name, blockdim, ...) \
do { \
auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \
auto converted_params = pto_isa_ops::ConvertTypes(__VA_ARGS__); \
auto acl_call = [acl_stream, blockdim, converted_params]() -> int { \
std::apply( \
[&](auto&&... params) { \
ACLRT_LAUNCH_KERNEL(kernel_name)(blockdim, acl_stream, params...); \
}, \
converted_params); \
return 0; \
}; \
at_npu::native::OpCommand::RunOpApi(#kernel_name, acl_call); \
#define EXEC_KERNEL_CMD(kernel_name, blockdim, ...) \
do { \
auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \
auto converted_params = pto_isa_ops::ConvertTypes(__VA_ARGS__); \
auto acl_call = [acl_stream, blockdim, converted_params]() -> int { \
std::apply( \
[&](auto&&... params) { \
kernel_name(blockdim, acl_stream, params...); \
}, \
converted_params); \
return 0; \
}; \
at_npu::native::OpCommand::RunOpApi(#kernel_name, acl_call); \
} while (false)
} // namespace pto_isa_ops
9 changes: 4 additions & 5 deletions csrc/kernel/kernel_abs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@ See LICENSE in the root of the software repository:
https://github.com/huawei-csl/pto-kernels/
for the full License text.
*/

#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

#define MEMORY_BASE

#include <pto/pto-inst.hpp>
Expand All @@ -28,6 +25,8 @@ using namespace pto;
*/
template <typename T, unsigned TILE_LEN>
AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) {
#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

// define GlobalData on global memory with shape and stride
using ShapeDim5 = pto::Shape<1, 1, 1, TILE_LEN, TILE_LEN>;
using StrideDim5 = pto::Stride<1, 1, 1, TILE_LEN, 1>;
Expand Down Expand Up @@ -83,6 +82,8 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) {
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
}

#endif
}

extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z,
Expand All @@ -96,5 +97,3 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z,
constexpr unsigned TILE_LEN = 64;
runTAbs<float, TILE_LEN>((__gm__ float*)x, (__gm__ float*)z, in_length);
}

#endif
27 changes: 4 additions & 23 deletions csrc/kernel/kernel_simple_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,10 @@ See LICENSE in the root of the software repository:
https://github.com/huawei-csl/pto-kernels/
for the full License text.
*/
#if defined __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

// Placeholder for VEC compilation (the real kernel is CUBE-only).
#define MEMORY_BASE
#include <pto/common/type.hpp>

extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a,
__gm__ void* b,
__gm__ void* c,
uint32_t matrix_size) {}

extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a,
__gm__ void* b,
__gm__ void* c,
uint32_t matrix_size) {}

#elif (__CHECK_FEATURE_AT_PRECOMPILE) || \
(__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__))

#define MEMORY_BASE

#include <pto/pto-inst.hpp>

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"

using namespace pto;

constexpr unsigned NUM_BLOCKS = 20; // number of AICs
Expand Down Expand Up @@ -124,6 +103,9 @@ AICORE void run_simple_matmul(__gm__ T* a, __gm__ T* b, __gm__ float* c,
static_assert(std::is_same_v<T, half> or std::is_same_v<T, float>,
"simple_matmul supports only fp16/fp32.");

#if (__CHECK_FEATURE_AT_PRECOMPILE) || \
(__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation

switch (matrix_size) {
case 16:
runKernelSimpleMatMul<T, float, 16>(a, b, c);
Expand All @@ -144,6 +126,7 @@ AICORE void run_simple_matmul(__gm__ T* a, __gm__ T* b, __gm__ float* c,
runKernelSimpleMatMul<T, float, 128>(a, b, c);
break;
}
#endif
}

extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a,
Expand All @@ -161,5 +144,3 @@ extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a,
run_simple_matmul<float>((__gm__ float*)a, (__gm__ float*)b, (__gm__ float*)c,
matrix_size);
}

#endif
10 changes: 4 additions & 6 deletions csrc/kernel/kernel_tri_inv_col_sweep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,12 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

#define MEMORY_BASE

#include <pto/pto-inst.hpp>

#include "kernel_utils.h"

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
#define GM_ADDR __gm__ uint8_t*

using namespace pto;

Expand All @@ -32,6 +29,8 @@ using namespace pto;
template <typename T, unsigned S /* Matrix Size */>
AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
uint32_t total_length) {
#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

set_mask_norm();
set_vector_mask(-1, -1);

Expand Down Expand Up @@ -143,6 +142,7 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
}
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
#endif
}

extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
Expand Down Expand Up @@ -170,5 +170,3 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32(
runTTriInv<float, 128>((__gm__ float*)x, (__gm__ float*)z, in_length);
}
}

#endif
Loading