Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions csrc/host/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ inline at::Tensor CopyTensorHostToDevice(const at::Tensor& cpu_tensor) {
* @param [in] scalar_data_type Data type of scalar
* @return at::Tensor Tensor on NPU containing the `cpu_scalar`.
*/
[[maybe_unused]]
inline at::Tensor CopyScalarToDevice(const c10::Scalar& cpu_scalar,
at::ScalarType scalar_data_type) {
return CopyTensorHostToDevice(
Expand Down
12 changes: 3 additions & 9 deletions csrc/kernel/kernel_abs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,13 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

#define MEMORY_BASE

#include <pto/pto-inst.hpp>

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
#include "kernel_utils.h"

using namespace pto;

template <typename T, unsigned matrix_size>
AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) {
#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)
// define GlobalData on global memory with shape and stride
using ShapeDim5 = pto::Shape<1, 1, 1, matrix_size, matrix_size>;
using StrideDim5 = pto::Stride<1, 1, 1, matrix_size, 1>;
Expand Down Expand Up @@ -78,6 +73,7 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) {
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
}
#endif
}

extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z,
Expand All @@ -93,5 +89,3 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z,
// main kernel, in_length is dynamic input
runTAbs<float, martix_size>((__gm__ float*)x, (__gm__ float*)z, in_length);
}

#endif
5 changes: 1 addition & 4 deletions csrc/kernel/kernel_batch_matrix_square.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#define MEMORY_BASE
#include <pto/pto-inst.hpp>

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
#include "kernel_utils.h"

using namespace pto;

Expand Down
31 changes: 5 additions & 26 deletions csrc/kernel/kernel_simple_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,7 @@ See LICENSE in the root of the software repository:
https://github.com/huawei-csl/pto-kernels/
for the full License text.
*/
#if defined __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

// Placeholder for VEC compilation (the real kernel is CUBE-only).
#define MEMORY_BASE
#include <pto/common/type.hpp>

extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a,
__gm__ void* b,
__gm__ void* c,
uint32_t matrix_size) {}

extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a,
__gm__ void* b,
__gm__ void* c,
uint32_t matrix_size) {}

#elif (__CHECK_FEATURE_AT_PRECOMPILE) || \
(__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__))

#define MEMORY_BASE

#include <pto/pto-inst.hpp>

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
#include "kernel_utils.h"

using namespace pto;

Expand All @@ -48,6 +25,9 @@ AICORE inline void WaitFlag(uint32_t id) {
template <typename InputT, typename OutputT, uint32_t matrix_size>
AICORE void runKernelSimpleMatMul(__gm__ InputT* a, __gm__ InputT* b,
__gm__ OutputT* c) {
#if (__CHECK_FEATURE_AT_PRECOMPILE) || \
(__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation

constexpr uint32_t tile_len = matrix_size * matrix_size;

/* Global Memory / Tensors */
Expand Down Expand Up @@ -116,6 +96,7 @@ AICORE void runKernelSimpleMatMul(__gm__ InputT* a, __gm__ InputT* b,
SetFlag<PIPE_M, PIPE_FIX>(0); // M pipe sets flag for FIX pipe
WaitFlag<PIPE_M, PIPE_FIX>(0); // FIX pipe waits for M pipe to set flag
TSTORE(c_global_out, c_l0_tile);
#endif
}

template <typename T>
Expand Down Expand Up @@ -161,5 +142,3 @@ extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a,
run_simple_matmul<float>((__gm__ float*)a, (__gm__ float*)b, (__gm__ float*)c,
matrix_size);
}

#endif
13 changes: 3 additions & 10 deletions csrc/kernel/kernel_tri_inv_col_sweep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,8 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

#define MEMORY_BASE

#include <pto/pto-inst.hpp>

#include "kernel_utils.h"

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"

using namespace pto;

/**
Expand All @@ -32,6 +24,8 @@ using namespace pto;
template <typename T, unsigned S /* Matrix Size */>
AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
uint32_t total_length) {
#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__)

set_mask_norm();
set_vector_mask(-1, -1);

Expand Down Expand Up @@ -143,6 +137,7 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out,
}
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
#endif
}

extern "C" __global__ AICORE void triv_inv_col_sweep_fp16(
Expand Down Expand Up @@ -170,5 +165,3 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32(
runTTriInv<float, 128>((__gm__ float*)x, (__gm__ float*)z, in_length);
}
}

#endif
5 changes: 0 additions & 5 deletions csrc/kernel/kernel_tri_inv_rec_unroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,8 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#define MEMORY_BASE
#include <pto/pto-inst.hpp>

#include "kernel_utils.h"

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
using namespace pto;
using namespace kernel_utils;

Expand Down Expand Up @@ -448,7 +444,6 @@ AICORE inline void TriInvRecUnrollKernel(__gm__ OutputT* M_inv,
/* Initializations */
constexpr uint32_t TileLen = MatrixSize * MatrixSize;
constexpr uint32_t FractalSize = 16; // fractal size for half
constexpr uint32_t NumFractalsRowWise = MatrixSize / FractalSize;
constexpr uint32_t NumL0Buffers = 2;

if (get_block_idx() * NumTilesPerCubeIter >= total_tiles) {
Expand Down
5 changes: 1 addition & 4 deletions csrc/kernel/kernel_tri_inv_trick.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ See LICENSE in the root of the software repository:
for the full License text.
*/

#define MEMORY_BASE
#include <pto/pto-inst.hpp>

#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h"
#include "kernel_utils.h"

using namespace pto;

Expand Down
5 changes: 5 additions & 0 deletions csrc/kernel/kernel_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,13 @@ for the full License text.
#pragma once

#define MEMORY_BASE

#include <cstdint>
#include <pto/pto-inst.hpp>

// To avoid #include "kernel_operator.h"
#define GM_ADDR __gm__ uint8_t*

namespace kernel_utils {
/**
* @brief Do a sync step (set-wait flag) between two pipes.
Expand Down
Loading