diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6afa01cb..0d788208 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -85,6 +85,9 @@ jobs: run: | apt-get update apt-get install -yq python-is-python3 python3-dev cmake + wget http://container-obsfs-filesystem.obs.cn-north-4.myhuaweicloud.com/package/cann/pto-isa/version_compile/master/release_version/ubuntu_x86/cann-pto-isa_linux-x86_64.run + chmod +x cann-pto-isa_linux-x86_64.run + ./cann-pto-isa_linux-x86_64.run --full -q --install-path=/usr/local/Ascend/ascend-toolkit source /usr/local/Ascend/ascend-toolkit/set_env.sh source /usr/local/Ascend/nnal/atb/set_env.sh export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib # to avoid libascend_hal.so issue @@ -121,6 +124,9 @@ jobs: run: | apt-get update apt-get install -yq python-is-python3 python3-dev cmake + wget http://container-obsfs-filesystem.obs.cn-north-4.myhuaweicloud.com/package/cann/pto-isa/version_compile/master/release_version/ubuntu_x86/cann-pto-isa_linux-x86_64.run + chmod +x cann-pto-isa_linux-x86_64.run + ./cann-pto-isa_linux-x86_64.run --full -q --install-path=/usr/local/pto-isa source /usr/local/Ascend/ascend-toolkit/set_env.sh source /usr/local/Ascend/nnal/atb/set_env.sh export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib # to avoid libascend_hal.so issue diff --git a/CMakeLists.txt b/CMakeLists.txt index 40c2a84d..cbc92f42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,19 +87,6 @@ endif() include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) -include(FetchContent) - -# certain operations need newer pto-isa header, not CANN 8.5.0 default (pin -# commit on 2026/03/16 -# https://gitcode.com/cann/pto-isa/commit/313817be696792a4e16a7ea5994ec98e34391613?ref=master) -# to use default CANN 8.5.0 headers set GIT_TAG "8.5.0" -FetchContent_Declare( - libpto_isa_headers - GIT_REPOSITORY https://gitcode.com/cann/pto-isa.git - GIT_TAG v9.0.0) - -FetchContent_Populate(libpto_isa_headers) - # TORCH_NPU_PATH is the location where PyTorch Ascend Adapter (torch_npu) is # installed. execute_process( @@ -139,8 +126,12 @@ ascendc_library( csrc/kernel/kernel_gdn_wy_fast.cpp) ascendc_include_directories( - no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include - ${libpto_isa_headers_SOURCE_DIR}/include/pto/common) + no_workspace_kernel PRIVATE ${ASCEND_CANN_PACKAGE_PATH}/include + ${ASCEND_CANN_PACKAGE_PATH}/include/pto/common) + +# TODO: enable -Wall -Wextra -Werror after fixing all warnings in the kernel +# code ascendc_compile_options(no_workspace_kernel PRIVATE -Wall -Wextra +# -Werror) if(BASE_MODE STREQUAL "MEMORY") message(STATUS "BASE_MODE is MEMORY") @@ -156,6 +147,7 @@ pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp) # pybind11 does not work with C++20, so we set C++17 for the pybind11 module # target set_target_properties(pto_kernels_ops PROPERTIES CXX_STANDARD 17) +target_compile_options(pto_kernels_ops PRIVATE -Wall -Wextra -Werror) target_link_libraries(pto_kernels_ops PRIVATE ${TORCH_LIBRARIES} c10 torch_cpu torch_npu no_workspace_kernel) diff --git a/csrc/kernel/kernel_scan_ul1.cpp b/csrc/kernel/kernel_scan_ul1.cpp index 7b50b92b..13aa8e6e 100644 --- a/csrc/kernel/kernel_scan_ul1.cpp +++ b/csrc/kernel/kernel_scan_ul1.cpp @@ -7,14 +7,10 @@ See LICENSE in the root of the software repository: for the full License text. */ -#include - #include "kernel_utils.h" using namespace pto; -constexpr unsigned UB_SIZE = 0x30000; // 192KB UB of A2A3 - /** * @brief Kernel implementation for scan operation on a single cube. * @@ -37,8 +33,6 @@ template AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o, __gm__ InputT* u, __gm__ InputT* l, __gm__ OutputT* s) { -#if defined(__DAV_CUBE__) - // Type definitions for different memory levels // GM using Shape = pto::Shape<1, 1, 1, matrix_size, matrix_size>; @@ -50,15 +44,10 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o, using TileL1In = Tile; - using TileL1Out = - Tile; // L0 using TileL0A = TileLeft; - using TileL0AOut = TileLeft; using TileL0B = TileRight; - using TileL0BOut = TileRight; using TileL0C = TileAcc; // GM Data @@ -79,8 +68,6 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o, TASSIGN(xL1, 0x0); const uint32_t tile_l1_in_byte_size = matrix_size * matrix_size * sizeof(InputT); - const uint32_t tile_l1_out_byte_size = - matrix_size * matrix_size * sizeof(OutputT); TASSIGN(oL1, 0x0 + tile_l1_in_byte_size); TASSIGN(uL1, 0x0 + 2 * tile_l1_in_byte_size); TASSIGN(c1L1, 0x0 + 3 * tile_l1_in_byte_size); @@ -184,10 +171,6 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o, wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); TSTORE(sGlobal, sL0); - -#else -// Nothing to do on VEC -#endif } template @@ -219,8 +202,17 @@ extern "C" __global__ AICORE void scan_ul1_fp16(__gm__ void* x, __gm__ void* o, __gm__ void* u, __gm__ void* l, __gm__ void* s, uint32_t matrix_size) { +#if defined(__DAV_CUBE__) run_scan_ul1((__gm__ half*)x, (__gm__ half*)o, (__gm__ half*)u, (__gm__ half*)l, (__gm__ float*)s, matrix_size); +#else + (void)x; + (void)o; + (void)u; + (void)l; + (void)s; + (void)matrix_size; +#endif } extern "C" __global__ AICORE void scan_ul1_fp32(__gm__ void* x, __gm__ void* o, @@ -228,6 +220,15 @@ extern "C" __global__ AICORE void scan_ul1_fp32(__gm__ void* x, __gm__ void* o, __gm__ void* l, __gm__ void* s, uint32_t matrix_size) { +#if defined(__DAV_CUBE__) run_scan_ul1((__gm__ float*)x, (__gm__ float*)o, (__gm__ float*)u, (__gm__ float*)l, (__gm__ float*)s, matrix_size); +#else + (void)x; + (void)o; + (void)u; + (void)l; + (void)s; + (void)matrix_size; +#endif } diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp index d8a6515c..35fe6870 100644 --- a/csrc/kernel/kernel_simple_matmul.cpp +++ b/csrc/kernel/kernel_simple_matmul.cpp @@ -11,9 +11,6 @@ for the full License text. using namespace pto; -constexpr unsigned NUM_BLOCKS = 20; // number of AICs -constexpr unsigned UB_SIZE = 0x30000; // 192KB UB of A2A3 - template AICORE inline void SetFlag(uint32_t id) { set_flag(SrcPipe, DstPipe, static_cast(id)); diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index b645006a..d23823c8 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -157,7 +157,6 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out, extern "C" __global__ AICORE void triv_inv_col_sweep_fp16( GM_ADDR x, GM_ADDR z, uint32_t in_length, uint32_t matrix_size) { #if defined(__DAV_VEC__) - if (matrix_size == 16) { runTTriInv((__gm__ half*)x, (__gm__ half*)z, in_length); } else if (matrix_size == 32) { @@ -167,6 +166,11 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp16( } else if (matrix_size == 128) { runTTriInv((__gm__ half*)x, (__gm__ half*)z, in_length); } +#else + (void)x; + (void)z; + (void)in_length; + (void)matrix_size; #endif } @@ -183,5 +187,10 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32( } else if (matrix_size == 128) { runTTriInv((__gm__ float*)x, (__gm__ float*)z, in_length); } +#else + (void)x; + (void)z; + (void)in_length; + (void)matrix_size; #endif } diff --git a/csrc/kernel/kernel_tri_inv_ns.cpp b/csrc/kernel/kernel_tri_inv_ns.cpp index d871e13f..656ece5a 100644 --- a/csrc/kernel/kernel_tri_inv_ns.cpp +++ b/csrc/kernel/kernel_tri_inv_ns.cpp @@ -212,7 +212,6 @@ AICORE void runKernelTriInvNS(__gm__ OutputT* M_inv, __gm__ InputT* M, #if defined(__DAV_CUBE__) // Cube compilation constexpr uint32_t TileLen = MatrixSize * MatrixSize; - const uint32_t global_index = get_block_idx() * TileLen; constexpr uint32_t NumL0Buffers = 2; const uint32_t max_iters_per_aic = kernel_utils::CeilDiv( total_tiles, (uint32_t)(NumTilesPerCubeIter * get_block_num())); @@ -390,12 +389,13 @@ extern "C" __global__ AICORE void tri_inv_ns_fp16( __gm__ void* tensor_out, __gm__ void* tensor_in, __gm__ void* identity_neg_in, __gm__ void* identity_over_n_in, uint32_t matrix_size, uint32_t num_iters, uint32_t num_matrices) { - if (num_matrices <= get_block_num()) { + const uint32_t block_dim = get_block_num(); + if (num_matrices <= block_dim) { run_tri_inv_ns((__gm__ float*)tensor_out, (__gm__ half*)tensor_in, (__gm__ half*)identity_neg_in, (__gm__ half*)identity_over_n_in, matrix_size, num_iters, num_matrices); - } else if (num_matrices <= 2 * get_block_num()) { + } else if (num_matrices <= 2 * block_dim) { run_tri_inv_ns((__gm__ float*)tensor_out, (__gm__ half*)tensor_in, (__gm__ half*)identity_neg_in, (__gm__ half*)identity_over_n_in, matrix_size, diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp index 3677b0c0..ce6e4640 100644 --- a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp +++ b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp @@ -470,7 +470,6 @@ AICORE inline void TriInvRecUnrollKernel(__gm__ OutputT* M_inv, /* Initializations */ constexpr uint32_t TileLen = MatrixSize * MatrixSize; constexpr uint32_t FractalSize = 16; // fractal size for half - constexpr uint32_t NumFractalsRowWise = MatrixSize / FractalSize; constexpr uint32_t NumL0Buffers = 2; if (get_block_idx() * NumTilesPerCubeIter >= total_tiles) { @@ -753,13 +752,14 @@ AICORE void run_tri_inv_rec_unroll_per_num_matrices( __gm__ OutputT* tensor_out, __gm__ InputT* tensor_in, uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads, uint32_t is_lower = 0, __gm__ int32_t* cu_seqlens = nullptr) { + const uint32_t num_blocks = static_cast(get_block_num()); if (num_bsnd_heads == 0) { - if (num_matrices <= get_block_num()) { + if (num_matrices <= num_blocks) { run_tri_inv_rec_unroll( tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads, is_lower, cu_seqlens); - } else if (num_matrices <= 2 * get_block_num()) { + } else if (num_matrices <= 2 * num_blocks) { run_tri_inv_rec_unroll( tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads, @@ -771,12 +771,12 @@ AICORE void run_tri_inv_rec_unroll_per_num_matrices( is_lower, cu_seqlens); } } else { - if (num_matrices <= get_block_num()) { + if (num_matrices <= num_blocks) { run_tri_inv_rec_unroll( tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads, is_lower, cu_seqlens); - } else if (num_matrices <= 2 * get_block_num()) { + } else if (num_matrices <= 2 * num_blocks) { run_tri_inv_rec_unroll( tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads, diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h index 61acd5bf..17e7a912 100644 --- a/csrc/kernel/kernel_utils.h +++ b/csrc/kernel/kernel_utils.h @@ -8,7 +8,20 @@ for the full License text. */ #pragma once +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wall" +#pragma clang diagnostic ignored "-Wextra" #include +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wall" +#pragma GCC diagnostic ignored "-Wextra" +#include +#pragma GCC diagnostic pop +#endif + #include // clang-format off: so it does not get wrongfully flagged by linter