huawei-csl · zouzias · Mar 12, 2026 · Mar 12, 2026
diff --git a/csrc/host/utils.h b/csrc/host/utils.h
@@ -90,6 +90,7 @@ inline at::Tensor CopyTensorHostToDevice(const at::Tensor& cpu_tensor) {
  * @param [in] scalar_data_type Data type of scalar
  * @return at::Tensor Tensor on NPU containing the `cpu_scalar`.
  */
+[[maybe_unused]]
 inline at::Tensor CopyScalarToDevice(const c10::Scalar& cpu_scalar,
                                      at::ScalarType scalar_data_type) {
   return CopyTensorHostToDevice(

diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp
@@ -7,18 +7,13 @@ See LICENSE in the root of the software repository:
 for the full License text.
 */
 
-#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)
-
-#define MEMORY_BASE
-
-#include <pto/pto-inst.hpp>
-
-#define GM_ADDR __gm__ uint8_t*  // To avoid #include "kernel_operator.h"
+#include "kernel_utils.h"
 
 using namespace pto;
 
 template <typename T, unsigned matrix_size>
 AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) {
+#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)
   // define GlobalData on global memory with shape and stride
   using ShapeDim5 = pto::Shape<1, 1, 1, matrix_size, matrix_size>;
   using StrideDim5 = pto::Stride<1, 1, 1, matrix_size, 1>;
@@ -78,6 +73,7 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) {
     set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
     wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
   }
+#endif
 }
 
 extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z,
@@ -93,5 +89,3 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z,
   // main kernel, in_length is dynamic input
   runTAbs<float, martix_size>((__gm__ float*)x, (__gm__ float*)z, in_length);
 }
-
-#endif
diff --git a/csrc/kernel/kernel_batch_matrix_square.cpp b/csrc/kernel/kernel_batch_matrix_square.cpp
@@ -7,10 +7,7 @@ See LICENSE in the root of the software repository:
 for the full License text.
 */
 
-#define MEMORY_BASE
-#include <pto/pto-inst.hpp>
-
-#define GM_ADDR __gm__ uint8_t*  // To avoid #include "kernel_operator.h"
+#include "kernel_utils.h"
 
 using namespace pto;
 

diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp
@@ -6,30 +6,7 @@ See LICENSE in the root of the software repository:
 https://github.com/huawei-csl/pto-kernels/
 for the full License text.
 */
-#if defined __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)
-
-// Placeholder for VEC compilation (the real kernel is CUBE-only).
-#define MEMORY_BASE
-#include <pto/common/type.hpp>
-
-extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a,
-                                                     __gm__ void* b,
-                                                     __gm__ void* c,
-                                                     uint32_t matrix_size) {}
-
-extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a,
-                                                     __gm__ void* b,
-                                                     __gm__ void* c,
-                                                     uint32_t matrix_size) {}
-
-#elif (__CHECK_FEATURE_AT_PRECOMPILE) || \
-    (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__))
-
-#define MEMORY_BASE
-
-#include <pto/pto-inst.hpp>
-
-#define GM_ADDR __gm__ uint8_t*  // To avoid #include "kernel_operator.h"
+#include "kernel_utils.h"
 
 using namespace pto;
 
@@ -48,6 +25,9 @@ AICORE inline void WaitFlag(uint32_t id) {
 template <typename InputT, typename OutputT, uint32_t matrix_size>
 AICORE void runKernelSimpleMatMul(__gm__ InputT* a, __gm__ InputT* b,
                                   __gm__ OutputT* c) {
+#if (__CHECK_FEATURE_AT_PRECOMPILE) || \
+    (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__))  // Cube compilation
+
   constexpr uint32_t tile_len = matrix_size * matrix_size;
 
   /* Global Memory / Tensors */
@@ -116,6 +96,7 @@ AICORE void runKernelSimpleMatMul(__gm__ InputT* a, __gm__ InputT* b,
   SetFlag<PIPE_M, PIPE_FIX>(0);   // M pipe sets flag for FIX pipe
   WaitFlag<PIPE_M, PIPE_FIX>(0);  // FIX pipe waits for M pipe to set flag
   TSTORE(c_global_out, c_l0_tile);
+#endif
 }
 
 template <typename T>
@@ -161,5 +142,3 @@ extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a,
   run_simple_matmul<float>((__gm__ float*)a, (__gm__ float*)b, (__gm__ float*)c,
                            matrix_size);
 }
-
-#endif
diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp
@@ -7,16 +7,8 @@ See LICENSE in the root of the software repository:
 for the full License text.
 */
 
-#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)
-
-#define MEMORY_BASE
-
-#include <pto/pto-inst.hpp>
-
 #include "kernel_utils.h"
 
-#define GM_ADDR __gm__ uint8_t*  // To avoid #include "kernel_operator.h"
-
 using namespace pto;
 
 /**
@@ -32,6 +24,8 @@ using namespace pto;
 template <typename T, unsigned S /* Matrix Size */>
 AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
                        uint32_t total_length) {
+#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)
+
   set_mask_norm();
   set_vector_mask(-1, -1);
 
@@ -143,6 +137,7 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
   }
   wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
   wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+#endif
 }
 
 extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
@@ -170,5 +165,3 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32(
     runTTriInv<float, 128>((__gm__ float*)x, (__gm__ float*)z, in_length);
   }
 }
-
-#endif
diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
@@ -7,12 +7,8 @@ See LICENSE in the root of the software repository:
 for the full License text.
 */
 
-#define MEMORY_BASE
-#include <pto/pto-inst.hpp>
-
 #include "kernel_utils.h"
 
-#define GM_ADDR __gm__ uint8_t*  // To avoid #include "kernel_operator.h"
 using namespace pto;
 using namespace kernel_utils;
 
@@ -448,7 +444,6 @@ AICORE inline void TriInvRecUnrollKernel(__gm__ OutputT* M_inv,
   /* Initializations */
   constexpr uint32_t TileLen = MatrixSize * MatrixSize;
   constexpr uint32_t FractalSize = 16;  // fractal size for half
-  constexpr uint32_t NumFractalsRowWise = MatrixSize / FractalSize;
   constexpr uint32_t NumL0Buffers = 2;
 
   if (get_block_idx() * NumTilesPerCubeIter >= total_tiles) {

diff --git a/csrc/kernel/kernel_tri_inv_trick.cpp b/csrc/kernel/kernel_tri_inv_trick.cpp
@@ -7,10 +7,7 @@ See LICENSE in the root of the software repository:
 for the full License text.
 */
 
-#define MEMORY_BASE
-#include <pto/pto-inst.hpp>
-
-#define GM_ADDR __gm__ uint8_t*  // To avoid #include "kernel_operator.h"
+#include "kernel_utils.h"
 
 using namespace pto;
 

diff --git a/csrc/kernel/kernel_utils.h b/csrc/kernel/kernel_utils.h
@@ -9,8 +9,13 @@ for the full License text.
 #pragma once
 
 #define MEMORY_BASE
+
+#include <cstdint>
 #include <pto/pto-inst.hpp>
 
+// To avoid #include "kernel_operator.h"
+#define GM_ADDR __gm__ uint8_t*
+
 namespace kernel_utils {
 /**
  * @brief Do a sync step (set-wait flag) between two pipes.