huawei-csl · zouzias · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -55,27 +55,9 @@ add_link_options(-s -Wl,-z,relro -Wl,-z,now)
 set(CMAKE_CPP_COMPILE_OPTIONS -xc++ "SHELL:-include stdint.h"
                               "SHELL:-include stddef.h")
 
-include_directories(${ASCEND_HOME_PATH}/include
+include_directories(${ASCEND_CANN_PACKAGE_PATH}/include
                     ${ASCEND_DRIVER_PATH}/kernel/inc)
 
-if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
-  set(ASCENDC_CMAKE_DIR
-      ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
-elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-  set(ASCENDC_CMAKE_DIR
-      ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-  set(ASCENDC_CMAKE_DIR
-      ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-else()
-  message(
-    FATAL_ERROR
-      "ascendc_kernel_cmake does not exist, please check whether the cann package is installed."
-  )
-endif()
-
-include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
-
 include(FetchContent)
 
 # certain operations need newer pto-isa header, not CANN 8.5.0 default (pin
@@ -108,21 +90,49 @@ message("* TORCH_NPU_PATH           : ${TORCH_NPU_PATH}")
 message("* TORCH_LIBRARIES          : ${TORCH_LIBRARIES}")
 message("***********************************************************")
 
-ascendc_library(
-  no_workspace_kernel
-  SHARED
-  csrc/kernel/kernel_tri_inv_col_sweep.cpp
-  csrc/kernel/kernel_abs.cpp
-  csrc/kernel/kernel_csr_gather.cpp
-  csrc/kernel/kernel_simple_matmul.cpp
-  csrc/kernel/kernel_batch_matrix_square.cpp
-  csrc/kernel/kernel_tri_inv_rec_unroll.cpp
-  csrc/kernel/kernel_tri_inv_trick.cpp
-  csrc/kernel/kernel_swiglu.cpp)
-
-ascendc_include_directories(
-  no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include
-  ${libpto_isa_headers_SOURCE_DIR}/include/pto/common)
+# Device kernel sources that are compiled together into
+# libno_workspace_kernel.so by the bisheng rule below. kernel_csr_gather.cpp
+# stays commented out to match its disabled binding in csrc/host/pybind11.cpp.
+set(KERNEL_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_col_sweep.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_abs.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_simple_matmul.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_batch_matrix_square.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_rec_unroll.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_trick.cpp
+    # ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_csr_gather.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_swiglu.cpp)
+
+# Absolute path of the shared library produced by the custom command.
+set(NO_WORKSPACE_KERNEL_LIB
+    ${CMAKE_CURRENT_BINARY_DIR}/libno_workspace_kernel.so)
+
+# One bisheng invocation compiles every kernel source into the shared library.
+# The include path carries the pto-isa root and its pto/common subdirectory.
+# VERBATIM passes each argument to the build tool unmodified.
+# NOTE(review): -O0 -g yields unoptimized kernels, whereas the Makefile
+# compile_% rule uses -O2 -- confirm this is intended for wheel builds.
+add_custom_command(
+  OUTPUT ${NO_WORKSPACE_KERNEL_LIB}
+  COMMAND
+    bisheng -fPIC -shared -xcce -O0 -g -std=c++17 --npu-arch=dav-2201
+    -I${libpto_isa_headers_SOURCE_DIR}/include
+    -I${libpto_isa_headers_SOURCE_DIR}/include/pto/common ${KERNEL_SOURCES} -o
+    ${NO_WORKSPACE_KERNEL_LIB}
+  DEPENDS ${KERNEL_SOURCES}
+  COMMENT "Building no_workspace_kernel with bisheng compiler"
+  VERBATIM)
+
+# Part of the default build (ALL): building it runs the kernel library rule.
+add_custom_target(no_workspace_kernel_build ALL
+                  DEPENDS ${NO_WORKSPACE_KERNEL_LIB})
+
+# Expose the generated file as an imported target named no_workspace_kernel;
+# the dependency edge ties it to the custom target that produces the file.
+add_library(no_workspace_kernel SHARED IMPORTED GLOBAL)
+set_target_properties(no_workspace_kernel PROPERTIES IMPORTED_LOCATION
+                                                     ${NO_WORKSPACE_KERNEL_LIB})
+add_dependencies(no_workspace_kernel no_workspace_kernel_build)
 
 pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp)
 
@@ -154,7 +150,7 @@ if(PIP_INSTALL)
                LINK_FLAGS "-Wl,-rpath,\${ORIGIN}/lib")
 
-  # install dynamic libraries under site-packages/pto_kernels/libs
+  # install dynamic libraries under site-packages/pto_kernels/lib
-  install(TARGETS no_workspace_kernel LIBRARY DESTINATION pto_kernels/lib)
+  install(FILES ${NO_WORKSPACE_KERNEL_LIB} DESTINATION pto_kernels/lib)
 else()
   message(STATUS ">>>======================================================")
   message(STATUS ">>> Ignoring dynamic libraries COPY inside Python wheel.")

diff --git a/Makefile b/Makefile
@@ -5,7 +5,10 @@
 # https://github.com/huawei-csl/pto-kernels/
 # for the full License text.
 # --------------------------------------------------------------------------------
-.PHONY: clean setup_once build_wheel install test
+PTO_LIB_PATH    ?= $(ASCEND_TOOLKIT_HOME)
+CSRC_KERNEL_DIR := csrc/kernel
+
+.PHONY: clean setup_once build_cmake build_wheel install docs test test_tri_inv
 
 clean:
 	rm -rf build/ dist/ extra-info/ *.egg-info/ kernel_meta/
@@ -20,6 +23,19 @@ build_cmake: clean
 build_wheel:
 	export CMAKE_GENERATOR="Unix Makefiles" && pip wheel -v  . --extra-index-url https://download.pytorch.org/whl/cpu
 
+
+# Pattern rule: 'make compile_<name>' compiles csrc/kernel/kernel_<name>.cpp into libkernel_<name>.so in the current directory (e.g. 'make compile_abs'),
+# without building the whole wheel package. Headers are searched in $(CSRC_KERNEL_DIR) and $(PTO_LIB_PATH)/include; PTO_LIB_PATH defaults to $(ASCEND_TOOLKIT_HOME).
+compile_%:
+	bisheng -fPIC -shared -xcce -DMEMORY_BASE -O2 -std=c++17 \
+		-I$(CSRC_KERNEL_DIR) \
+		-I$(PTO_LIB_PATH)/include \
+		--npu-arch=dav-2201 \
+	        -Wno-ignored-attributes \
+		$(CSRC_KERNEL_DIR)/kernel_$*.cpp \
+		-o libkernel_$*.so
+
+
 install:
 	python3 -m pip install --force-reinstall pto_kernels-*.whl
 

diff --git a/csrc/host/pybind11.cpp b/csrc/host/pybind11.cpp
@@ -11,10 +11,10 @@ for the full License text.
 
 #include "torch_abs.h"
 #include "torch_batch_matrix_square.h"
-#include "torch_csr_gather.h"
+// #include "torch_csr_gather.h"
 #include "torch_simple_matmul.h"
 #include "torch_swiglu.h"
-#include "torch_tri_inv.h"
+#include "torch_tri_inv_col_sweep.h"
 #include "torch_tri_inv_rec_unroll.h"
 #include "torch_tri_inv_trick.h"
 
@@ -37,7 +37,7 @@ PYBIND11_MODULE(pto_kernels_ops, m) {
       pybind11::arg("device_id") = 0);
   m.def("pto_abs", &pto_isa_ops::run_abs);
   m.def("pto_batch_matrix_square", &pto_isa_ops::run_batch_matrix_square);
-  m.def("pto_csr_gather", &pto_isa_ops::run_csr_gather);
+  //  m.def("pto_csr_gather", &pto_isa_ops::run_csr_gather);
   m.def("pto_simple_matmul", &pto_isa_ops::run_simple_matmul);
   m.def("pto_swiglu", &pto_isa_ops::run_swiglu, py::arg("x"),
         py::arg("dim") = -1);

diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h
@@ -9,14 +9,18 @@ for the full License text.
 #pragma once
 
 #include <ATen/ATen.h>
+#include <acl/acl.h>
 #include <torch/library.h>
 
-#include "aclrtlaunch_vabs_fp16.h"
-#include "aclrtlaunch_vabs_fp32.h"
 #include "utils.h"
 
-namespace pto_isa_ops {
+extern "C" void call_vabs_fp16(uint32_t blockDim, aclrtStream stream, void* x,
+                               void* y, uint32_t num_elements);
+
+extern "C" void call_vabs_fp32(uint32_t blockDim, aclrtStream stream, void* x,
+                               void* y, uint32_t num_elements);
 
+namespace pto_isa_ops {
 /**
  * @brief Runs element-wise absolute value.
  *
@@ -26,26 +30,27 @@ namespace pto_isa_ops {
 
 at::Tensor run_abs(const at::Tensor& x) {
   const auto dtype = x.options().dtype();
-  at::Tensor z = at::empty_like(x);
+  const at::Tensor z = at::empty_like(x);
   // Define the number of blocks of vector core
   const uint32_t total_size = x.numel();
   // FIXME: tile length is fixed to 128 for now
   constexpr uint32_t TILE_SIZE = 128;
 
   // Persistent kernel launch parameter
   uint32_t total_tiles = (total_size + TILE_SIZE - 1) / TILE_SIZE;
-  uint32_t block_dim = GetNumVectorCores();
+  uint32_t block_dim = GetNumCubeCores();
 
   if (total_tiles < block_dim) {
     block_dim = total_tiles;
   }
 
+  auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
   if (dtype == at::kHalf) {
-    EXEC_KERNEL_CMD(vabs_fp16, block_dim, x, z, total_size);
-
+    call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z),
+                   total_size);
   } else if (dtype == at::kFloat) {
-    EXEC_KERNEL_CMD(vabs_fp32, block_dim, x, z, total_size);
-
+    call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z),
+                   total_size);
   } else {
     throw std::runtime_error("Unsupported dtype for `pto_abs` kernel");
   }

diff --git a/csrc/host/torch_batch_matrix_square.h b/csrc/host/torch_batch_matrix_square.h
@@ -9,12 +9,18 @@ for the full License text.
 #pragma once
 
 #include <ATen/ATen.h>
+#include <acl/acl.h>
 #include <torch/library.h>
 
-#include "aclrtlaunch_batch_matrix_square_fp16.h"
-#include "aclrtlaunch_batch_matrix_square_fp32.h"
 #include "utils.h"
 
+extern "C" void call_batch_matrix_square_fp16(uint32_t blockDim,
+                                              aclrtStream stream, void* z,
+                                              void* x, uint32_t matrix_size);
+extern "C" void call_batch_matrix_square_fp32(uint32_t blockDim,
+                                              aclrtStream stream, void* z,
+                                              void* x, uint32_t matrix_size);
+
 namespace pto_isa_ops {
 
 /**
@@ -46,10 +52,13 @@ at::Tensor run_batch_matrix_square(const at::Tensor& x) {
       at::zeros({block_dim, matrix_size, matrix_size},
                 at::TensorOptions().dtype(dtype_out).device(device));
 
+  auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
   if (dtype == at::kHalf) {
-    EXEC_KERNEL_CMD(batch_matrix_square_fp16, block_dim, z, x, matrix_size);
+    call_batch_matrix_square_fp16(block_dim, acl_stream, ConvertType(z),
+                                  ConvertType(x), matrix_size);
   } else if (dtype == at::kFloat) {
-    EXEC_KERNEL_CMD(batch_matrix_square_fp32, block_dim, z, x, matrix_size);
+    call_batch_matrix_square_fp32(block_dim, acl_stream, ConvertType(z),
+                                  ConvertType(x), matrix_size);
   }
 
   return z;

diff --git a/csrc/host/torch_simple_matmul.h b/csrc/host/torch_simple_matmul.h
@@ -11,10 +11,20 @@ for the full License text.
 #include <ATen/ATen.h>
+#include <acl/acl.h>
 #include <torch/library.h>
 
-#include "aclrtlaunch_simple_matmul_fp16.h"
-#include "aclrtlaunch_simple_matmul_fp32.h"
 #include "utils.h"
 
+// Host-callable launchers for the simple-matmul kernels (C linkage, defined
+// outside this header). run_simple_matmul calls them with the launch block
+// count, the current NPU stream, the a/b/c buffers and matrix_size.
+// aclrtStream is declared by <acl/acl.h>.
+extern "C" void call_simple_matmul_fp16(uint32_t blockDim, aclrtStream stream,
+                                        void* a, void* b, void* c,
+                                        uint32_t matrix_size);
+extern "C" void call_simple_matmul_fp32(uint32_t blockDim, aclrtStream stream,
+                                        void* a, void* b, void* c,
+                                        uint32_t matrix_size);
+
 namespace pto_isa_ops {
 
 /**
@@ -45,10 +50,13 @@ at::Tensor run_simple_matmul(const at::Tensor& a, const at::Tensor& b) {
       at::ones({matrix_size, matrix_size},
                at::TensorOptions().dtype(dtype_out).device(device));
 
+  auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
   if (dtype == at::kHalf) {
-    EXEC_KERNEL_CMD(simple_matmul_fp16, block_dim, a, b, c, matrix_size);
+    call_simple_matmul_fp16(block_dim, acl_stream, ConvertType(a),
+                            ConvertType(b), ConvertType(c), matrix_size);
   } else if (dtype == at::kFloat) {
-    EXEC_KERNEL_CMD(simple_matmul_fp32, block_dim, a, b, c, matrix_size);
+    call_simple_matmul_fp32(block_dim, acl_stream, ConvertType(a),
+                            ConvertType(b), ConvertType(c), matrix_size);
   }
 
   return c;

diff --git a/csrc/host/torch_swiglu.h b/csrc/host/torch_swiglu.h
@@ -13,7 +13,15 @@ for the full License text.
 
 #include <limits>
 
-#include "aclrtlaunch_swiglu_fp16.h"
 #include "utils.h"
 
+// Host-callable launcher for the fp16 SwiGLU kernel (C linkage, defined
+// outside this header). Placed after the includes so the types it names are
+// already declared when the prototype is parsed.
+// NOTE(review): returns uint32_t although run_swiglu ignores the result and
+// the sibling call_* launchers return void -- confirm against the definition.
+extern "C" uint32_t call_swiglu_fp16(uint32_t blockDim, aclrtStream stream,
+                                     void* x, void* y, uint32_t batch,
+                                     uint32_t input_n);
+
 namespace pto_isa_ops {
@@ -62,8 +64,10 @@ at::Tensor run_swiglu(const at::Tensor& x, int64_t dim = -1) {
   const uint32_t input_n = static_cast<uint32_t>(input_n_i64);
   const uint32_t block_dim = GetNumCubeCores();
 
+  auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
   at::Tensor y = at::empty({batch_i64, output_n_i64}, x.options());
-  EXEC_KERNEL_CMD(swiglu_fp16, block_dim, x, y, batch, input_n);
+  call_swiglu_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(y), batch,
+                   input_n);
   return y;
 }
 

diff --git a/csrc/host/torch_tri_inv.h → csrc/host/torch_tri_inv_col_sweep.h b/csrc/host/torch_tri_inv.h → csrc/host/torch_tri_inv_col_sweep.h
@@ -11,10 +11,22 @@ for the full License text.
 #include <ATen/ATen.h>
+#include <acl/acl.h>
 #include <torch/library.h>
 
-#include "aclrtlaunch_triv_inv_col_sweep_fp16.h"
-#include "aclrtlaunch_triv_inv_col_sweep_fp32.h"
 #include "utils.h"
 
+// Host-callable launchers for the triv_inv_col_sweep kernels (C linkage,
+// defined outside this header). run_tri_inv passes its output tensor as M_inv
+// and its input tensor as M. aclrtStream is declared by <acl/acl.h>.
+extern "C" void call_triv_inv_col_sweep_fp16(uint32_t blockDim,
+                                             aclrtStream stream, void* M_inv,
+                                             void* M, uint32_t num_elems,
+                                             uint32_t matrix_size);
+
+extern "C" void call_triv_inv_col_sweep_fp32(uint32_t blockDim,
+                                             aclrtStream stream, void* M_inv,
+                                             void* M, uint32_t num_elems,
+                                             uint32_t matrix_size);
+
 namespace pto_isa_ops {
 
 /**
@@ -48,16 +56,16 @@ at::Tensor run_tri_inv(const at::Tensor& x) {
 
   const at::Tensor z = at::empty_like(x);
 
+  auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
   if (dtype == at::kHalf) {
-    EXEC_KERNEL_CMD(triv_inv_col_sweep_fp16, block_dim, x, z, num_elems,
-                    matrix_size);
-
+    call_triv_inv_col_sweep_fp16(block_dim, acl_stream, ConvertType(z),
+                                 ConvertType(x), num_elems, matrix_size);
   } else if (dtype == at::kFloat) {
-    EXEC_KERNEL_CMD(triv_inv_col_sweep_fp32, block_dim, x, z, num_elems,
-                    matrix_size);
-
+    call_triv_inv_col_sweep_fp32(block_dim, acl_stream, ConvertType(z),
+                                 ConvertType(x), num_elems, matrix_size);
   } else {
-    throw std::runtime_error("Unsupported dtype for `tri_inv` kernel");
+    throw std::runtime_error(
+        "Unsupported dtype for `triv_inv_col_sweep` kernel");
   }
 
   return z;

diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h
@@ -9,11 +9,16 @@ for the full License text.
 #pragma once
 
 #include <ATen/ATen.h>
+#include <acl/acl.h>
 #include <torch/library.h>
 
-#include "aclrtlaunch_tri_inv_rec_unroll_fp16.h"
 #include "utils.h"
 
+extern "C" void call_tri_inv_rec_unroll_fp16(
+    uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg,
+    uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads,
+    void* cu_seqlens);
+
 namespace pto_isa_ops {
 
 /**
@@ -72,14 +77,19 @@ at::Tensor run_tri_inv_rec_unroll(
                 at::TensorOptions().dtype(dtype).device(device));
   I_neg.fill_diagonal_(-1);
 
+  auto acl_stream = c10_npu::getCurrentNPUStream().stream(true);
   if (dtype == at::kHalf) {
     if (cu_seqlens.numel() == 1) {
       void* void_null_ptr = nullptr;
-      EXEC_KERNEL_CMD(tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg,
-                      matrix_size, total_tiles, num_bsnd_heads, void_null_ptr);
+      call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv),
+                                   ConvertType(M), ConvertType(I_neg),
+                                   matrix_size, total_tiles, num_bsnd_heads,
+                                   void_null_ptr);
     } else {
-      EXEC_KERNEL_CMD(tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg,
-                      matrix_size, total_tiles, num_bsnd_heads, cu_seqlens);
+      call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv),
+                                   ConvertType(M), ConvertType(I_neg),
+                                   matrix_size, total_tiles, num_bsnd_heads,
+                                   cu_seqlens.data_ptr());
     }
   }