Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ jobs:
run: |
apt-get update
apt-get install -yq python-is-python3 python3-dev cmake
wget http://container-obsfs-filesystem.obs.cn-north-4.myhuaweicloud.com/package/cann/pto-isa/version_compile/master/release_version/ubuntu_x86/cann-pto-isa_linux-x86_64.run
chmod +x cann-pto-isa_linux-x86_64.run
./cann-pto-isa_linux-x86_64.run --full -q --install-path=/usr/local/Ascend/ascend-toolkit
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib # to avoid libascend_hal.so issue
Expand Down Expand Up @@ -121,6 +124,9 @@ jobs:
run: |
apt-get update
apt-get install -yq python-is-python3 python3-dev cmake
wget http://container-obsfs-filesystem.obs.cn-north-4.myhuaweicloud.com/package/cann/pto-isa/version_compile/master/release_version/ubuntu_x86/cann-pto-isa_linux-x86_64.run
chmod +x cann-pto-isa_linux-x86_64.run
./cann-pto-isa_linux-x86_64.run --full -q --install-path=/usr/local/pto-isa
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib # to avoid libascend_hal.so issue
Expand Down
22 changes: 7 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,6 @@ endif()

include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

include(FetchContent)

# certain operations need newer pto-isa header, not CANN 8.5.0 default (pin
# commit on 2026/03/16
# https://gitcode.com/cann/pto-isa/commit/313817be696792a4e16a7ea5994ec98e34391613?ref=master)
# to use default CANN 8.5.0 headers set GIT_TAG "8.5.0"
FetchContent_Declare(
libpto_isa_headers
GIT_REPOSITORY https://gitcode.com/cann/pto-isa.git
GIT_TAG v9.0.0)

FetchContent_Populate(libpto_isa_headers)

# TORCH_NPU_PATH is the location where PyTorch Ascend Adapter (torch_npu) is
# installed.
execute_process(
Expand Down Expand Up @@ -139,8 +126,12 @@ ascendc_library(
csrc/kernel/kernel_gdn_wy_fast.cpp)

ascendc_include_directories(
no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include
${libpto_isa_headers_SOURCE_DIR}/include/pto/common)
no_workspace_kernel PRIVATE ${ASCEND_CANN_PACKAGE_PATH}/include
${ASCEND_CANN_PACKAGE_PATH}/include/pto/common)

# TODO: enable -Wall -Wextra -Werror after fixing all warnings in the kernel
# code ascendc_compile_options(no_workspace_kernel PRIVATE -Wall -Wextra
# -Werror)

if(BASE_MODE STREQUAL "MEMORY")
message(STATUS "BASE_MODE is MEMORY")
Expand All @@ -156,6 +147,7 @@ pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp)
# pybind11 does not work with C++20, so we set C++17 for the pybind11 module
# target
set_target_properties(pto_kernels_ops PROPERTIES CXX_STANDARD 17)
target_compile_options(pto_kernels_ops PRIVATE -Wall -Wextra -Werror)

target_link_libraries(pto_kernels_ops PRIVATE ${TORCH_LIBRARIES} c10 torch_cpu
torch_npu no_workspace_kernel)
Expand Down
35 changes: 18 additions & 17 deletions csrc/kernel/kernel_scan_ul1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,10 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#include <pto/pto-inst.hpp>

#include "kernel_utils.h"

using namespace pto;

constexpr unsigned UB_SIZE = 0x30000; // 192KB UB of A2A3

/**
* @brief Kernel implementation for scan operation on a single cube.
*
Expand All @@ -37,8 +33,6 @@ template <typename InputT, typename OutputT, uint32_t matrix_size>
AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
__gm__ InputT* u, __gm__ InputT* l,
__gm__ OutputT* s) {
#if defined(__DAV_CUBE__)

// Type definitions for different memory levels
// GM
using Shape = pto::Shape<1, 1, 1, matrix_size, matrix_size>;
Expand All @@ -50,15 +44,10 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
using TileL1In =
Tile<TileType::Mat, InputT, matrix_size, matrix_size, BLayout::ColMajor,
matrix_size, matrix_size, SLayout::RowMajor, 512>;
using TileL1Out =
Tile<TileType::Mat, OutputT, matrix_size, matrix_size, BLayout::ColMajor,
matrix_size, matrix_size, SLayout::RowMajor, 512>;

// L0
using TileL0A = TileLeft<InputT, matrix_size, matrix_size>;
using TileL0AOut = TileLeft<OutputT, matrix_size, matrix_size>;
using TileL0B = TileRight<InputT, matrix_size, matrix_size>;
using TileL0BOut = TileRight<OutputT, matrix_size, matrix_size>;
using TileL0C = TileAcc<OutputT, matrix_size, matrix_size>;

// GM Data
Expand All @@ -79,8 +68,6 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
TASSIGN(xL1, 0x0);
const uint32_t tile_l1_in_byte_size =
matrix_size * matrix_size * sizeof(InputT);
const uint32_t tile_l1_out_byte_size =
matrix_size * matrix_size * sizeof(OutputT);
TASSIGN(oL1, 0x0 + tile_l1_in_byte_size);
TASSIGN(uL1, 0x0 + 2 * tile_l1_in_byte_size);
TASSIGN(c1L1, 0x0 + 3 * tile_l1_in_byte_size);
Expand Down Expand Up @@ -184,10 +171,6 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);

TSTORE(sGlobal, sL0);

#else
// Nothing to do on VEC
#endif
}

template <typename T>
Expand Down Expand Up @@ -219,15 +202,33 @@ extern "C" __global__ AICORE void scan_ul1_fp16(__gm__ void* x, __gm__ void* o,
__gm__ void* u, __gm__ void* l,
__gm__ void* s,
uint32_t matrix_size) {
#if defined(__DAV_CUBE__)
run_scan_ul1((__gm__ half*)x, (__gm__ half*)o, (__gm__ half*)u,
(__gm__ half*)l, (__gm__ float*)s, matrix_size);
#else
(void)x;
(void)o;
(void)u;
(void)l;
(void)s;
(void)matrix_size;
#endif
}

extern "C" __global__ AICORE void scan_ul1_fp32(__gm__ void* x, __gm__ void* o,
__gm__ void* u,

__gm__ void* l, __gm__ void* s,
uint32_t matrix_size) {
#if defined(__DAV_CUBE__)
run_scan_ul1((__gm__ float*)x, (__gm__ float*)o, (__gm__ float*)u,
(__gm__ float*)l, (__gm__ float*)s, matrix_size);
#else
(void)x;
(void)o;
(void)u;
(void)l;
(void)s;
(void)matrix_size;
#endif
}
3 changes: 0 additions & 3 deletions csrc/kernel/kernel_simple_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ for the full License text.

using namespace pto;

constexpr unsigned NUM_BLOCKS = 20; // number of AICs
constexpr unsigned UB_SIZE = 0x30000; // 192KB UB of A2A3

template <pipe_t SrcPipe, pipe_t DstPipe>
AICORE inline void SetFlag(uint32_t id) {
set_flag(SrcPipe, DstPipe, static_cast<event_t>(id));
Expand Down
11 changes: 10 additions & 1 deletion csrc/kernel/kernel_tri_inv_col_sweep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,6 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
GM_ADDR x, GM_ADDR z, uint32_t in_length, uint32_t matrix_size) {
#if defined(__DAV_VEC__)

if (matrix_size == 16) {
runTTriInv<half, 16>((__gm__ half*)x, (__gm__ half*)z, in_length);
} else if (matrix_size == 32) {
Expand All @@ -167,6 +166,11 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
} else if (matrix_size == 128) {
runTTriInv<half, 128>((__gm__ half*)x, (__gm__ half*)z, in_length);
}
#else
(void)x;
(void)z;
(void)in_length;
(void)matrix_size;
#endif
}

Expand All @@ -183,5 +187,10 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32(
} else if (matrix_size == 128) {
runTTriInv<float, 128>((__gm__ float*)x, (__gm__ float*)z, in_length);
}
#else
(void)x;
(void)z;
(void)in_length;
(void)matrix_size;
#endif
}
6 changes: 3 additions & 3 deletions csrc/kernel/kernel_tri_inv_ns.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,6 @@ AICORE void runKernelTriInvNS(__gm__ OutputT* M_inv, __gm__ InputT* M,
#if defined(__DAV_CUBE__) // Cube compilation

constexpr uint32_t TileLen = MatrixSize * MatrixSize;
const uint32_t global_index = get_block_idx() * TileLen;
constexpr uint32_t NumL0Buffers = 2;
const uint32_t max_iters_per_aic = kernel_utils::CeilDiv(
total_tiles, (uint32_t)(NumTilesPerCubeIter * get_block_num()));
Expand Down Expand Up @@ -390,12 +389,13 @@ extern "C" __global__ AICORE void tri_inv_ns_fp16(
__gm__ void* tensor_out, __gm__ void* tensor_in,
__gm__ void* identity_neg_in, __gm__ void* identity_over_n_in,
uint32_t matrix_size, uint32_t num_iters, uint32_t num_matrices) {
if (num_matrices <= get_block_num()) {
const uint32_t block_dim = get_block_num();
if (num_matrices <= block_dim) {
run_tri_inv_ns<half, 1>((__gm__ float*)tensor_out, (__gm__ half*)tensor_in,
(__gm__ half*)identity_neg_in,
(__gm__ half*)identity_over_n_in, matrix_size,
num_iters, num_matrices);
} else if (num_matrices <= 2 * get_block_num()) {
} else if (num_matrices <= 2 * block_dim) {
run_tri_inv_ns<half, 2>((__gm__ float*)tensor_out, (__gm__ half*)tensor_in,
(__gm__ half*)identity_neg_in,
(__gm__ half*)identity_over_n_in, matrix_size,
Expand Down
10 changes: 5 additions & 5 deletions csrc/kernel/kernel_tri_inv_rec_unroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,6 @@ AICORE inline void TriInvRecUnrollKernel(__gm__ OutputT* M_inv,
/* Initializations */
constexpr uint32_t TileLen = MatrixSize * MatrixSize;
constexpr uint32_t FractalSize = 16; // fractal size for half
constexpr uint32_t NumFractalsRowWise = MatrixSize / FractalSize;
constexpr uint32_t NumL0Buffers = 2;

if (get_block_idx() * NumTilesPerCubeIter >= total_tiles) {
Expand Down Expand Up @@ -753,13 +752,14 @@ AICORE void run_tri_inv_rec_unroll_per_num_matrices(
__gm__ OutputT* tensor_out, __gm__ InputT* tensor_in, uint32_t matrix_size,
uint32_t num_matrices, uint32_t num_bsnd_heads, uint32_t is_lower = 0,
__gm__ int32_t* cu_seqlens = nullptr) {
const uint32_t num_blocks = static_cast<uint32_t>(get_block_num());
if (num_bsnd_heads == 0) {
if (num_matrices <= get_block_num()) {
if (num_matrices <= num_blocks) {
run_tri_inv_rec_unroll<half, OutputT, 1 /* NumTilesPerCubeIter */,
false /* IsBSND */>(
tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
is_lower, cu_seqlens);
} else if (num_matrices <= 2 * get_block_num()) {
} else if (num_matrices <= 2 * num_blocks) {
run_tri_inv_rec_unroll<half, OutputT, 2 /* NumTilesPerCubeIter */,
false /* IsBSND */>(
tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
Expand All @@ -771,12 +771,12 @@ AICORE void run_tri_inv_rec_unroll_per_num_matrices(
is_lower, cu_seqlens);
}
} else {
if (num_matrices <= get_block_num()) {
if (num_matrices <= num_blocks) {
run_tri_inv_rec_unroll<half, OutputT, 1 /* NumTilesPerCubeIter */,
true /* IsBSND */>(
tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
is_lower, cu_seqlens);
} else if (num_matrices <= 2 * get_block_num()) {
} else if (num_matrices <= 2 * num_blocks) {
run_tri_inv_rec_unroll<half, OutputT, 2 /* NumTilesPerCubeIter */,
true /* IsBSND */>(
tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
Expand Down
13 changes: 13 additions & 0 deletions csrc/kernel/kernel_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,20 @@ for the full License text.
*/
#pragma once

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wall"
#pragma clang diagnostic ignored "-Wextra"
#include <pto/pto-inst.hpp>
#pragma clang diagnostic pop
#elif defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wall"
#pragma GCC diagnostic ignored "-Wextra"
#include <pto/pto-inst.hpp>
#pragma GCC diagnostic pop
#endif

#include <type_traits>

// clang-format off: so it does not get wrongfully flagged by linter
Expand Down
Loading