diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6afa01cb..0d788208 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -85,6 +85,9 @@ jobs:
         run: |
           apt-get update
           apt-get install -yq python-is-python3 python3-dev cmake
+          wget http://container-obsfs-filesystem.obs.cn-north-4.myhuaweicloud.com/package/cann/pto-isa/version_compile/master/release_version/ubuntu_x86/cann-pto-isa_linux-x86_64.run
+          chmod +x cann-pto-isa_linux-x86_64.run
+          ./cann-pto-isa_linux-x86_64.run --full -q --install-path=/usr/local/Ascend/ascend-toolkit
           source /usr/local/Ascend/ascend-toolkit/set_env.sh
           source /usr/local/Ascend/nnal/atb/set_env.sh
           export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib # to avoid libascend_hal.so issue
@@ -121,6 +124,9 @@ jobs:
         run: |
           apt-get update
           apt-get install -yq python-is-python3 python3-dev cmake
+          wget http://container-obsfs-filesystem.obs.cn-north-4.myhuaweicloud.com/package/cann/pto-isa/version_compile/master/release_version/ubuntu_x86/cann-pto-isa_linux-x86_64.run
+          chmod +x cann-pto-isa_linux-x86_64.run
+          ./cann-pto-isa_linux-x86_64.run --full -q --install-path=/usr/local/pto-isa
           source /usr/local/Ascend/ascend-toolkit/set_env.sh
           source /usr/local/Ascend/nnal/atb/set_env.sh
           export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib # to avoid libascend_hal.so issue
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40c2a84d..cbc92f42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,19 +87,6 @@ endif()
 
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 
-include(FetchContent)
-
-# certain operations need newer pto-isa header, not CANN 8.5.0 default (pin
-# commit on 2026/03/16
-# https://gitcode.com/cann/pto-isa/commit/313817be696792a4e16a7ea5994ec98e34391613?ref=master)
-# to use default CANN 8.5.0 headers set GIT_TAG "8.5.0"
-FetchContent_Declare(
-  libpto_isa_headers
-  GIT_REPOSITORY https://gitcode.com/cann/pto-isa.git
-  GIT_TAG v9.0.0)
-
-FetchContent_Populate(libpto_isa_headers)
-
 # TORCH_NPU_PATH is the location where PyTorch Ascend Adapter (torch_npu) is
 # installed.
 execute_process(
@@ -139,8 +126,12 @@ ascendc_library(
   csrc/kernel/kernel_gdn_wy_fast.cpp)
 
 ascendc_include_directories(
-  no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include
-  ${libpto_isa_headers_SOURCE_DIR}/include/pto/common)
+  no_workspace_kernel PRIVATE ${ASCEND_CANN_PACKAGE_PATH}/include
+  ${ASCEND_CANN_PACKAGE_PATH}/include/pto/common)
+
+# TODO: enable -Wall -Wextra -Werror after fixing all warnings in the kernel
+# code ascendc_compile_options(no_workspace_kernel PRIVATE -Wall -Wextra
+# -Werror)
 
 if(BASE_MODE STREQUAL "MEMORY")
   message(STATUS "BASE_MODE is MEMORY")
@@ -156,6 +147,7 @@ pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp)
 # pybind11 does not work with C++20, so we set C++17 for the pybind11 module
 # target
 set_target_properties(pto_kernels_ops PROPERTIES CXX_STANDARD 17)
+target_compile_options(pto_kernels_ops PRIVATE -Wall -Wextra -Werror)
 
 target_link_libraries(pto_kernels_ops PRIVATE ${TORCH_LIBRARIES} c10 torch_cpu
                                               torch_npu no_workspace_kernel)
diff --git a/csrc/kernel/kernel_scan_ul1.cpp b/csrc/kernel/kernel_scan_ul1.cpp
index 7b50b92b..13aa8e6e 100644
--- a/csrc/kernel/kernel_scan_ul1.cpp
+++ b/csrc/kernel/kernel_scan_ul1.cpp
@@ -7,14 +7,10 @@ See LICENSE in the root of the software repository:
 for the full License text.
 */
 
-#include <pto/pto-inst.hpp>
-
 #include "kernel_utils.h"
 
 using namespace pto;
 
-constexpr unsigned UB_SIZE = 0x30000;  // 192KB UB of A2A3
-
 /**
  * @brief Kernel implementation for scan operation on a single cube.
  *
@@ -37,8 +33,6 @@ template <typename InputT, typename OutputT, uint32_t matrix_size>
 AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
                              __gm__ InputT* u, __gm__ InputT* l,
                              __gm__ OutputT* s) {
-#if defined(__DAV_CUBE__)
-
   // Type definitions for different memory levels
   // GM
   using Shape = pto::Shape<1, 1, 1, matrix_size, matrix_size>;
@@ -50,15 +44,10 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
   using TileL1In =
       Tile<TileType::Mat, InputT, matrix_size, matrix_size, BLayout::ColMajor,
            matrix_size, matrix_size, SLayout::RowMajor, 512>;
-  using TileL1Out =
-      Tile<TileType::Mat, OutputT, matrix_size, matrix_size, BLayout::ColMajor,
-           matrix_size, matrix_size, SLayout::RowMajor, 512>;
 
   // L0
   using TileL0A = TileLeft<InputT, matrix_size, matrix_size>;
-  using TileL0AOut = TileLeft<OutputT, matrix_size, matrix_size>;
   using TileL0B = TileRight<InputT, matrix_size, matrix_size>;
-  using TileL0BOut = TileRight<OutputT, matrix_size, matrix_size>;
   using TileL0C = TileAcc<OutputT, matrix_size, matrix_size>;
 
   // GM Data
@@ -79,8 +68,6 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
   TASSIGN(xL1, 0x0);
   const uint32_t tile_l1_in_byte_size =
       matrix_size * matrix_size * sizeof(InputT);
-  const uint32_t tile_l1_out_byte_size =
-      matrix_size * matrix_size * sizeof(OutputT);
   TASSIGN(oL1, 0x0 + tile_l1_in_byte_size);
   TASSIGN(uL1, 0x0 + 2 * tile_l1_in_byte_size);
   TASSIGN(c1L1, 0x0 + 3 * tile_l1_in_byte_size);
@@ -184,10 +171,6 @@ AICORE void runKernelScanUl1(__gm__ InputT* x, __gm__ InputT* o,
   wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
 
   TSTORE(sGlobal, sL0);
-
-#else
-// Nothing to do on VEC
-#endif
 }
 
 template <typename T>
@@ -219,8 +202,17 @@ extern "C" __global__ AICORE void scan_ul1_fp16(__gm__ void* x, __gm__ void* o,
                                                 __gm__ void* u, __gm__ void* l,
                                                 __gm__ void* s,
                                                 uint32_t matrix_size) {
+#if defined(__DAV_CUBE__)
   run_scan_ul1((__gm__ half*)x, (__gm__ half*)o, (__gm__ half*)u,
                (__gm__ half*)l, (__gm__ float*)s, matrix_size);
+#else
+  (void)x;
+  (void)o;
+  (void)u;
+  (void)l;
+  (void)s;
+  (void)matrix_size;
+#endif
 }
 
 extern "C" __global__ AICORE void scan_ul1_fp32(__gm__ void* x, __gm__ void* o,
@@ -228,6 +220,15 @@ extern "C" __global__ AICORE void scan_ul1_fp32(__gm__ void* x, __gm__ void* o,
 
                                                 __gm__ void* l, __gm__ void* s,
                                                 uint32_t matrix_size) {
+#if defined(__DAV_CUBE__)
   run_scan_ul1((__gm__ float*)x, (__gm__ float*)o, (__gm__ float*)u,
                (__gm__ float*)l, (__gm__ float*)s, matrix_size);
+#else
+  (void)x;
+  (void)o;
+  (void)u;
+  (void)l;
+  (void)s;
+  (void)matrix_size;
+#endif
 }
diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp
index d8a6515c..35fe6870 100644
--- a/csrc/kernel/kernel_simple_matmul.cpp
+++ b/csrc/kernel/kernel_simple_matmul.cpp
@@ -11,9 +11,6 @@ for the full License text.
 
 using namespace pto;
 
-constexpr unsigned NUM_BLOCKS = 20;    // number of AICs
-constexpr unsigned UB_SIZE = 0x30000;  // 192KB UB of A2A3
-
 template <pipe_t SrcPipe, pipe_t DstPipe>
 AICORE inline void SetFlag(uint32_t id) {
   set_flag(SrcPipe, DstPipe, static_cast<event_t>(id));
diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp
index b645006a..d23823c8 100644
--- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp
+++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp
@@ -157,7 +157,6 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
 extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
     GM_ADDR x, GM_ADDR z, uint32_t in_length, uint32_t matrix_size) {
 #if defined(__DAV_VEC__)
-
   if (matrix_size == 16) {
     runTTriInv<half, 16>((__gm__ half*)x, (__gm__ half*)z, in_length);
   } else if (matrix_size == 32) {
@@ -167,6 +166,11 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
   } else if (matrix_size == 128) {
     runTTriInv<half, 128>((__gm__ half*)x, (__gm__ half*)z, in_length);
   }
+#else
+  (void)x;
+  (void)z;
+  (void)in_length;
+  (void)matrix_size;
 #endif
 }
 
@@ -183,5 +187,10 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32(
   } else if (matrix_size == 128) {
     runTTriInv<float, 128>((__gm__ float*)x, (__gm__ float*)z, in_length);
   }
+#else
+  (void)x;
+  (void)z;
+  (void)in_length;
+  (void)matrix_size;
 #endif
 }
diff --git a/csrc/kernel/kernel_tri_inv_ns.cpp b/csrc/kernel/kernel_tri_inv_ns.cpp
index d871e13f..656ece5a 100644
--- a/csrc/kernel/kernel_tri_inv_ns.cpp
+++ b/csrc/kernel/kernel_tri_inv_ns.cpp
@@ -212,7 +212,6 @@ AICORE void runKernelTriInvNS(__gm__ OutputT* M_inv, __gm__ InputT* M,
 #if defined(__DAV_CUBE__)  // Cube compilation
 
   constexpr uint32_t TileLen = MatrixSize * MatrixSize;
-  const uint32_t global_index = get_block_idx() * TileLen;
   constexpr uint32_t NumL0Buffers = 2;
   const uint32_t max_iters_per_aic = kernel_utils::CeilDiv(
       total_tiles, (uint32_t)(NumTilesPerCubeIter * get_block_num()));
@@ -390,12 +389,13 @@ extern "C" __global__ AICORE void tri_inv_ns_fp16(
     __gm__ void* tensor_out, __gm__ void* tensor_in,
     __gm__ void* identity_neg_in, __gm__ void* identity_over_n_in,
     uint32_t matrix_size, uint32_t num_iters, uint32_t num_matrices) {
-  if (num_matrices <= get_block_num()) {
+  const uint32_t block_dim = get_block_num();
+  if (num_matrices <= block_dim) {
     run_tri_inv_ns<half, 1>((__gm__ float*)tensor_out, (__gm__ half*)tensor_in,
                             (__gm__ half*)identity_neg_in,
                             (__gm__ half*)identity_over_n_in, matrix_size,
                             num_iters, num_matrices);
-  } else if (num_matrices <= 2 * get_block_num()) {
+  } else if (num_matrices <= 2 * block_dim) {
     run_tri_inv_ns<half, 2>((__gm__ float*)tensor_out, (__gm__ half*)tensor_in,
                             (__gm__ half*)identity_neg_in,
                             (__gm__ half*)identity_over_n_in, matrix_size,
diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
index 3677b0c0..ce6e4640 100644
--- a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
+++ b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
@@ -470,7 +470,6 @@ AICORE inline void TriInvRecUnrollKernel(__gm__ OutputT* M_inv,
   /* Initializations */
   constexpr uint32_t TileLen = MatrixSize * MatrixSize;
   constexpr uint32_t FractalSize = 16;  // fractal size for half
-  constexpr uint32_t NumFractalsRowWise = MatrixSize / FractalSize;
   constexpr uint32_t NumL0Buffers = 2;
 
   if (get_block_idx() * NumTilesPerCubeIter >= total_tiles) {
@@ -753,13 +752,14 @@ AICORE void run_tri_inv_rec_unroll_per_num_matrices(
     __gm__ OutputT* tensor_out, __gm__ InputT* tensor_in, uint32_t matrix_size,
     uint32_t num_matrices, uint32_t num_bsnd_heads, uint32_t is_lower = 0,
     __gm__ int32_t* cu_seqlens = nullptr) {
+  const uint32_t num_blocks = static_cast<uint32_t>(get_block_num());
   if (num_bsnd_heads == 0) {
-    if (num_matrices <= get_block_num()) {
+    if (num_matrices <= num_blocks) {
       run_tri_inv_rec_unroll<half, OutputT, 1 /* NumTilesPerCubeIter */,
                              false /* IsBSND */>(
           tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
           is_lower, cu_seqlens);
-    } else if (num_matrices <= 2 * get_block_num()) {
+    } else if (num_matrices <= 2 * num_blocks) {
       run_tri_inv_rec_unroll<half, OutputT, 2 /* NumTilesPerCubeIter */,
                              false /* IsBSND */>(
           tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
@@ -771,12 +771,12 @@ AICORE void run_tri_inv_rec_unroll_per_num_matrices(
           is_lower, cu_seqlens);
     }
   } else {
-    if (num_matrices <= get_block_num()) {
+    if (num_matrices <= num_blocks) {
       run_tri_inv_rec_unroll<half, OutputT, 1 /* NumTilesPerCubeIter */,
                              true /* IsBSND */>(
           tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
           is_lower, cu_seqlens);
-    } else if (num_matrices <= 2 * get_block_num()) {
+    } else if (num_matrices <= 2 * num_blocks) {
       run_tri_inv_rec_unroll<half, OutputT, 2 /* NumTilesPerCubeIter */,
                              true /* IsBSND */>(
           tensor_out, tensor_in, matrix_size, num_matrices, num_bsnd_heads,
diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h
index 61acd5bf..17e7a912 100644
--- a/csrc/kernel/kernel_utils.h
+++ b/csrc/kernel/kernel_utils.h
@@ -8,7 +8,20 @@ for the full License text.
 */
 #pragma once
 
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wextra"
 #include <pto/pto-inst.hpp>
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wall"
+#pragma GCC diagnostic ignored "-Wextra"
+#include <pto/pto-inst.hpp>
+#pragma GCC diagnostic pop
+#endif
+
 #include <type_traits>
 
 // clang-format off: so it does not get wrongfully flagged by linter