From d413a66004df747f0994c6f0dcbab2fb5427b533 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Wed, 1 Apr 2026 21:22:00 +0000 Subject: [PATCH 01/23] (cmake) use bisheng --- CMakeLists.txt | 60 +++++++++++++-------------- csrc/host/torch_abs.h | 9 +++- csrc/host/torch_batch_matrix_square.h | 12 +++++- csrc/host/torch_simple_matmul.h | 11 ++++- csrc/host/torch_tri_inv.h | 14 ++++++- csrc/host/torch_tri_inv_rec_unroll.h | 9 +++- csrc/host/torch_tri_inv_trick.h | 5 ++- csrc/host/utils.h | 2 + 8 files changed, 80 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f24285ff..78f27c8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,27 +55,9 @@ add_link_options(-s -Wl,-z,relro -Wl,-z,now) set(CMAKE_CPP_COMPILE_OPTIONS -xc++ "SHELL:-include stdint.h" "SHELL:-include stddef.h") -include_directories(${ASCEND_HOME_PATH}/include +include_directories(${ASCEND_CANN_PACKAGE_PATH}/include ${ASCEND_DRIVER_PATH}/kernel/inc) -if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR - ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR - ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) - set(ASCENDC_CMAKE_DIR - ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) -else() - message( - FATAL_ERROR - "ascendc_kernel_cmake does not exist, please check whether the cann package is installed." - ) -endif() - -include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) - include(FetchContent) FetchContent_Declare( @@ -104,19 +86,33 @@ message("* TORCH_NPU_PATH : ${TORCH_NPU_PATH}") message("* TORCH_LIBRARIES : ${TORCH_LIBRARIES}") message("***********************************************************") -ascendc_library( - no_workspace_kernel - SHARED - csrc/kernel/kernel_tri_inv_col_sweep.cpp - csrc/kernel/kernel_abs.cpp - csrc/kernel/kernel_simple_matmul.cpp - csrc/kernel/kernel_batch_matrix_square.cpp - csrc/kernel/kernel_tri_inv_rec_unroll.cpp - csrc/kernel/kernel_tri_inv_trick.cpp) - -ascendc_include_directories( - no_workspace_kernel PRIVATE ${libpto_isa_headers_SOURCE_DIR}/include - ${libpto_isa_headers_SOURCE_DIR}/include/pto/common) +set(KERNEL_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_col_sweep.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_simple_matmul.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_batch_matrix_square.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_rec_unroll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_trick.cpp) + +set(NO_WORKSPACE_KERNEL_LIB + ${CMAKE_CURRENT_BINARY_DIR}/libno_workspace_kernel.so) + +add_custom_command( + OUTPUT ${NO_WORKSPACE_KERNEL_LIB} + COMMAND + bisheng -fPIC -shared -xcce -O2 -std=c++17 --npu-arch=dav-2201 + -I${libpto_isa_headers_SOURCE_DIR}/include ${KERNEL_SOURCES} -o + ${NO_WORKSPACE_KERNEL_LIB} + DEPENDS ${KERNEL_SOURCES} + COMMENT "Building no_workspace_kernel with bisheng compiler") + +add_custom_target(no_workspace_kernel_build ALL + DEPENDS ${NO_WORKSPACE_KERNEL_LIB}) + +add_library(no_workspace_kernel SHARED IMPORTED GLOBAL) +set_target_properties(no_workspace_kernel PROPERTIES IMPORTED_LOCATION + ${NO_WORKSPACE_KERNEL_LIB}) +add_dependencies(no_workspace_kernel no_workspace_kernel_build) pybind11_add_module(pto_kernels_ops csrc/host/pybind11.cpp) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 55fc2361..ee5da15b 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -9,12 +9,17 @@ for the full License text. #pragma once #include +#include #include -#include "aclrtlaunch_vabs_fp16.h" -#include "aclrtlaunch_vabs_fp32.h" #include "utils.h" +extern "C" aclError aclrtlaunch_vabs_fp16(uint32_t blockDim, aclrtStream stream, + void* x, void* z, uint32_t in_length); + +extern "C" aclError aclrtlaunch_vabs_fp32(uint32_t blockDim, aclrtStream stream, + void* x, void* z, uint32_t in_length); + namespace pto_isa_ops { /** diff --git a/csrc/host/torch_batch_matrix_square.h b/csrc/host/torch_batch_matrix_square.h index 662d3cc2..6321a711 100644 --- a/csrc/host/torch_batch_matrix_square.h +++ b/csrc/host/torch_batch_matrix_square.h @@ -9,12 +9,20 @@ for the full License text. #pragma once #include +#include #include -#include "aclrtlaunch_batch_matrix_square_fp16.h" -#include "aclrtlaunch_batch_matrix_square_fp32.h" #include "utils.h" +extern "C" aclError aclrtlaunch_batch_matrix_square_fp16(uint32_t blockDim, + aclrtStream stream, + void* z, void* x, + uint32_t matrix_size); +extern "C" aclError aclrtlaunch_batch_matrix_square_fp32(uint32_t blockDim, + aclrtStream stream, + void* z, void* x, + uint32_t matrix_size); + namespace pto_isa_ops { /** diff --git a/csrc/host/torch_simple_matmul.h b/csrc/host/torch_simple_matmul.h index bb517be9..ecfa11cd 100644 --- a/csrc/host/torch_simple_matmul.h +++ b/csrc/host/torch_simple_matmul.h @@ -11,10 +11,17 @@ for the full License text. #include #include -#include "aclrtlaunch_simple_matmul_fp16.h" -#include "aclrtlaunch_simple_matmul_fp32.h" #include "utils.h" +extern "C" aclError aclrtlaunch_simple_matmul_fp16(uint32_t blockDim, + aclrtStream stream, void* a, + void* b, void* c, + uint32_t matrix_size); +extern "C" aclError aclrtlaunch_simple_matmul_fp32(uint32_t blockDim, + aclrtStream stream, void* a, + void* b, void* c, + uint32_t matrix_size); + namespace pto_isa_ops { /** diff --git a/csrc/host/torch_tri_inv.h b/csrc/host/torch_tri_inv.h index 951e8d4e..5140b059 100644 --- a/csrc/host/torch_tri_inv.h +++ b/csrc/host/torch_tri_inv.h @@ -11,10 +11,20 @@ for the full License text. #include #include -#include "aclrtlaunch_triv_inv_col_sweep_fp16.h" -#include "aclrtlaunch_triv_inv_col_sweep_fp32.h" #include "utils.h" +extern "C" aclError aclrtlaunch_triv_inv_col_sweep_fp16(uint32_t blockDim, + aclrtStream stream, + void* M_inv, void* M, + uint32_t num_elems, + uint32_t matrix_size); + +extern "C" aclError aclrtlaunch_triv_inv_col_sweep_fp32(uint32_t blockDim, + aclrtStream stream, + void* M_inv, void* M, + uint32_t num_elems, + uint32_t matrix_size); + namespace pto_isa_ops { /** diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h index 2b08a308..7a698bd6 100644 --- a/csrc/host/torch_tri_inv_rec_unroll.h +++ b/csrc/host/torch_tri_inv_rec_unroll.h @@ -9,11 +9,18 @@ for the full License text. #pragma once #include +#include #include -#include "aclrtlaunch_tri_inv_rec_unroll_fp16.h" #include "utils.h" +extern "C" aclError aclrtlaunch_tri_inv_rec_unroll_fp16( + uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, + uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads); +extern "C" aclError aclrtlaunch_tri_inv_rec_unroll_fp32( + uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, + uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads); + namespace pto_isa_ops { /** diff --git a/csrc/host/torch_tri_inv_trick.h b/csrc/host/torch_tri_inv_trick.h index 36f6ac2d..98146d32 100644 --- a/csrc/host/torch_tri_inv_trick.h +++ b/csrc/host/torch_tri_inv_trick.h @@ -11,9 +11,12 @@ for the full License text. #include #include -#include "aclrtlaunch_tri_inv_trick_fp16.h" #include "utils.h" +extern "C" aclError aclrtlaunch_tri_inv_trick_fp16( + uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, + uint32_t matrix_size, uint32_t max_block_size); + namespace pto_isa_ops { /** diff --git a/csrc/host/utils.h b/csrc/host/utils.h index db004410..8b5b0d9f 100644 --- a/csrc/host/utils.h +++ b/csrc/host/utils.h @@ -25,6 +25,8 @@ namespace pto_isa_ops { // Copied from tools/build/asc_rt/ascendc_runtime.h to avoid dependency on the // header file. See // https://gitcode.com/cann/asc-devkit/blob/v8.5.0/tools/build/asc_rt/ascendc_runtime.h +#define ACLRT_LAUNCH_KERNEL(kernel_name) aclrtlaunch_##kernel_name + #define ASSERT_RETVAL(exp, ret) \ do { \ if (!(exp)) { \ From 702c91facb66135e745fdd3799c96dac9ff9cb2d Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Wed, 1 Apr 2026 21:34:53 +0000 Subject: [PATCH 02/23] fix --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 78f27c8a..7034466e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,7 +144,7 @@ if(PIP_INSTALL) LINK_FLAGS "-Wl,-rpath,\${ORIGIN}/lib") # install dynamic libraries under site-packages/pto_kernels/libs - install(TARGETS no_workspace_kernel LIBRARY DESTINATION pto_kernels/lib) + install(FILES ${NO_WORKSPACE_KERNEL_LIB} DESTINATION pto_kernels/lib) else() message(STATUS ">>>======================================================") message(STATUS ">>> Ignoring dynamic libraries COPY inside Python wheel.") From b45cee6aa96851b465545f5ebd9d4506efd89afb Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Wed, 1 Apr 2026 21:43:55 +0000 Subject: [PATCH 03/23] fix --- csrc/host/torch_abs.h | 8 +++---- csrc/host/torch_batch_matrix_square.h | 14 ++++++------ csrc/host/torch_simple_matmul.h | 14 ++++++------ csrc/host/torch_tri_inv.h | 20 ++++++++--------- csrc/host/torch_tri_inv_rec_unroll.h | 4 ++-- csrc/host/torch_tri_inv_trick.h | 7 +++--- csrc/host/utils.h | 31 +++++++++++---------------- 7 files changed, 44 insertions(+), 54 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index ee5da15b..51aaaabc 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -14,11 +14,11 @@ for the full License text. #include "utils.h" -extern "C" aclError aclrtlaunch_vabs_fp16(uint32_t blockDim, aclrtStream stream, - void* x, void* z, uint32_t in_length); +extern "C" aclError vabs_fp16(uint32_t blockDim, aclrtStream stream, void* x, + void* z, uint32_t in_length); -extern "C" aclError aclrtlaunch_vabs_fp32(uint32_t blockDim, aclrtStream stream, - void* x, void* z, uint32_t in_length); +extern "C" aclError vabs_fp32(uint32_t blockDim, aclrtStream stream, void* x, + void* z, uint32_t in_length); namespace pto_isa_ops { diff --git a/csrc/host/torch_batch_matrix_square.h b/csrc/host/torch_batch_matrix_square.h index 6321a711..f5aab24a 100644 --- a/csrc/host/torch_batch_matrix_square.h +++ b/csrc/host/torch_batch_matrix_square.h @@ -14,14 +14,12 @@ for the full License text. #include "utils.h" -extern "C" aclError aclrtlaunch_batch_matrix_square_fp16(uint32_t blockDim, - aclrtStream stream, - void* z, void* x, - uint32_t matrix_size); -extern "C" aclError aclrtlaunch_batch_matrix_square_fp32(uint32_t blockDim, - aclrtStream stream, - void* z, void* x, - uint32_t matrix_size); +extern "C" aclError batch_matrix_square_fp16(uint32_t blockDim, + aclrtStream stream, void* z, + void* x, uint32_t matrix_size); +extern "C" aclError batch_matrix_square_fp32(uint32_t blockDim, + aclrtStream stream, void* z, + void* x, uint32_t matrix_size); namespace pto_isa_ops { diff --git a/csrc/host/torch_simple_matmul.h b/csrc/host/torch_simple_matmul.h index ecfa11cd..01228c32 100644 --- a/csrc/host/torch_simple_matmul.h +++ b/csrc/host/torch_simple_matmul.h @@ -13,14 +13,12 @@ for the full License text. #include "utils.h" -extern "C" aclError aclrtlaunch_simple_matmul_fp16(uint32_t blockDim, - aclrtStream stream, void* a, - void* b, void* c, - uint32_t matrix_size); -extern "C" aclError aclrtlaunch_simple_matmul_fp32(uint32_t blockDim, - aclrtStream stream, void* a, - void* b, void* c, - uint32_t matrix_size); +extern "C" aclError simple_matmul_fp16(uint32_t blockDim, aclrtStream stream, + void* a, void* b, void* c, + uint32_t matrix_size); +extern "C" aclError simple_matmul_fp32(uint32_t blockDim, aclrtStream stream, + void* a, void* b, void* c, + uint32_t matrix_size); namespace pto_isa_ops { diff --git a/csrc/host/torch_tri_inv.h b/csrc/host/torch_tri_inv.h index 5140b059..9cb30a12 100644 --- a/csrc/host/torch_tri_inv.h +++ b/csrc/host/torch_tri_inv.h @@ -13,17 +13,15 @@ for the full License text. #include "utils.h" -extern "C" aclError aclrtlaunch_triv_inv_col_sweep_fp16(uint32_t blockDim, - aclrtStream stream, - void* M_inv, void* M, - uint32_t num_elems, - uint32_t matrix_size); - -extern "C" aclError aclrtlaunch_triv_inv_col_sweep_fp32(uint32_t blockDim, - aclrtStream stream, - void* M_inv, void* M, - uint32_t num_elems, - uint32_t matrix_size); +extern "C" aclError triv_inv_col_sweep_fp16(uint32_t blockDim, + aclrtStream stream, void* M_inv, + void* M, uint32_t num_elems, + uint32_t matrix_size); + +extern "C" aclError triv_inv_col_sweep_fp32(uint32_t blockDim, + aclrtStream stream, void* M_inv, + void* M, uint32_t num_elems, + uint32_t matrix_size); namespace pto_isa_ops { diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h index 7a698bd6..ec9d0b20 100644 --- a/csrc/host/torch_tri_inv_rec_unroll.h +++ b/csrc/host/torch_tri_inv_rec_unroll.h @@ -14,10 +14,10 @@ for the full License text. #include "utils.h" -extern "C" aclError aclrtlaunch_tri_inv_rec_unroll_fp16( +extern "C" aclError tri_inv_rec_unroll_fp16( uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads); -extern "C" aclError aclrtlaunch_tri_inv_rec_unroll_fp32( +extern "C" aclError tri_inv_rec_unroll_fp32( uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads); diff --git a/csrc/host/torch_tri_inv_trick.h b/csrc/host/torch_tri_inv_trick.h index 98146d32..7d4380c1 100644 --- a/csrc/host/torch_tri_inv_trick.h +++ b/csrc/host/torch_tri_inv_trick.h @@ -13,9 +13,10 @@ for the full License text. #include "utils.h" -extern "C" aclError aclrtlaunch_tri_inv_trick_fp16( - uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, - uint32_t matrix_size, uint32_t max_block_size); +extern "C" aclError tri_inv_trick_fp16(uint32_t blockDim, aclrtStream stream, + void* M_inv, void* M, void* I_neg, + uint32_t matrix_size, + uint32_t max_block_size); namespace pto_isa_ops { diff --git a/csrc/host/utils.h b/csrc/host/utils.h index 8b5b0d9f..681771a7 100644 --- a/csrc/host/utils.h +++ b/csrc/host/utils.h @@ -22,11 +22,6 @@ namespace pto_isa_ops { #define DEVICE_TYPE c10::DeviceType::PrivateUse1 -// Copied from tools/build/asc_rt/ascendc_runtime.h to avoid dependency on the -// header file. See -// https://gitcode.com/cann/asc-devkit/blob/v8.5.0/tools/build/asc_rt/ascendc_runtime.h -#define ACLRT_LAUNCH_KERNEL(kernel_name) aclrtlaunch_##kernel_name - #define ASSERT_RETVAL(exp, ret) \ do { \ if (!(exp)) { \ @@ -134,18 +129,18 @@ constexpr auto ConvertTypes(Ts&... args) { return std::make_tuple(ConvertType(args)...); } -#define EXEC_KERNEL_CMD(kernel_name, blockdim, ...) \ - do { \ - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ - auto converted_params = pto_isa_ops::ConvertTypes(__VA_ARGS__); \ - auto acl_call = [acl_stream, blockdim, converted_params]() -> int { \ - std::apply( \ - [&](auto&&... params) { \ - ACLRT_LAUNCH_KERNEL(kernel_name)(blockdim, acl_stream, params...); \ - }, \ - converted_params); \ - return 0; \ - }; \ - at_npu::native::OpCommand::RunOpApi(#kernel_name, acl_call); \ +#define EXEC_KERNEL_CMD(kernel_name, blockdim, ...) \ + do { \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + auto converted_params = pto_isa_ops::ConvertTypes(__VA_ARGS__); \ + auto acl_call = [acl_stream, blockdim, converted_params]() -> int { \ + std::apply( \ + [&](auto&&... params) { \ + kernel_name(blockdim, acl_stream, params...); \ + }, \ + converted_params); \ + return 0; \ + }; \ + at_npu::native::OpCommand::RunOpApi(#kernel_name, acl_call); \ } while (false) } // namespace pto_isa_ops From 4780099ca53700869080fee8b3521966e1455209 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Wed, 1 Apr 2026 21:54:17 +0000 Subject: [PATCH 04/23] fix abs --- csrc/kernel/kernel_abs.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 92abbdd1..9c0514d7 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -6,9 +6,6 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ - -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - #define MEMORY_BASE #include @@ -28,6 +25,8 @@ using namespace pto; */ template AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) { +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) + // define GlobalData on global memory with shape and stride using ShapeDim5 = pto::Shape<1, 1, 1, TILE_LEN, TILE_LEN>; using StrideDim5 = pto::Stride<1, 1, 1, TILE_LEN, 1>; @@ -83,6 +82,8 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) { set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); } + +#endif } extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, @@ -96,5 +97,3 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, constexpr unsigned TILE_LEN = 64; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); } - -#endif From 1217f9ce9ee5c66a75006e70e2bb01be9b993a5b Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Wed, 1 Apr 2026 22:06:56 +0000 Subject: [PATCH 05/23] fix --- csrc/host/pybind11.cpp | 2 +- ...ch_tri_inv.h => torch_tri_inv_col_sweep.h} | 0 csrc/kernel/kernel_simple_matmul.cpp | 27 +++---------------- csrc/kernel/kernel_tri_inv_col_sweep.cpp | 10 +++---- 4 files changed, 9 insertions(+), 30 deletions(-) rename csrc/host/{torch_tri_inv.h => torch_tri_inv_col_sweep.h} (100%) diff --git a/csrc/host/pybind11.cpp b/csrc/host/pybind11.cpp index bcd429c1..b0dc7312 100644 --- a/csrc/host/pybind11.cpp +++ b/csrc/host/pybind11.cpp @@ -12,7 +12,7 @@ for the full License text. #include "torch_abs.h" #include "torch_batch_matrix_square.h" #include "torch_simple_matmul.h" -#include "torch_tri_inv.h" +#include "torch_tri_inv_col_sweep.h" #include "torch_tri_inv_rec_unroll.h" #include "torch_tri_inv_trick.h" diff --git a/csrc/host/torch_tri_inv.h b/csrc/host/torch_tri_inv_col_sweep.h similarity index 100% rename from csrc/host/torch_tri_inv.h rename to csrc/host/torch_tri_inv_col_sweep.h diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp index 465ca488..e9d051a5 100644 --- a/csrc/kernel/kernel_simple_matmul.cpp +++ b/csrc/kernel/kernel_simple_matmul.cpp @@ -6,31 +6,10 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ -#if defined __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) -// Placeholder for VEC compilation (the real kernel is CUBE-only). #define MEMORY_BASE -#include - -extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a, - __gm__ void* b, - __gm__ void* c, - uint32_t matrix_size) {} - -extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a, - __gm__ void* b, - __gm__ void* c, - uint32_t matrix_size) {} - -#elif (__CHECK_FEATURE_AT_PRECOMPILE) || \ - (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) - -#define MEMORY_BASE - #include -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" - using namespace pto; constexpr unsigned NUM_BLOCKS = 20; // number of AICs @@ -124,6 +103,9 @@ AICORE void run_simple_matmul(__gm__ T* a, __gm__ T* b, __gm__ float* c, static_assert(std::is_same_v or std::is_same_v, "simple_matmul supports only fp16/fp32."); +#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ + (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation + switch (matrix_size) { case 16: runKernelSimpleMatMul(a, b, c); @@ -144,6 +126,7 @@ AICORE void run_simple_matmul(__gm__ T* a, __gm__ T* b, __gm__ float* c, runKernelSimpleMatMul(a, b, c); break; } +#endif } extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a, @@ -161,5 +144,3 @@ extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a, run_simple_matmul((__gm__ float*)a, (__gm__ float*)b, (__gm__ float*)c, matrix_size); } - -#endif diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index 9277bcc2..009b78de 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -7,15 +7,12 @@ See LICENSE in the root of the software repository: for the full License text. */ -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - #define MEMORY_BASE - #include #include "kernel_utils.h" -#define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" +#define GM_ADDR __gm__ uint8_t* using namespace pto; @@ -32,6 +29,8 @@ using namespace pto; template AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out, uint32_t total_length) { +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) + set_mask_norm(); set_vector_mask(-1, -1); @@ -143,6 +142,7 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out, } wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +#endif } extern "C" __global__ AICORE void triv_inv_col_sweep_fp16( @@ -170,5 +170,3 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32( runTTriInv((__gm__ float*)x, (__gm__ float*)z, in_length); } } - -#endif From 39451822eeff390bbf725895d4b25de2d52fb16d Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 09:16:40 +0000 Subject: [PATCH 06/23] fix --- csrc/kernel/kernel_abs.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 9c0514d7..95987ee5 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -86,14 +86,17 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_length) { #endif } -extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, - uint32_t in_length) { +__global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, uint32_t in_length) { constexpr unsigned TILE_LEN = 64; runTAbs((__gm__ half*)x, (__gm__ half*)z, in_length); } -extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, - uint32_t in_length) { +__global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { constexpr unsigned TILE_LEN = 64; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); } + +extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, + uint8_t* y, uint32_t num_elements) { + vabs_fp16<<>>(x, y, num_elements); +} From ef0711f59bd66f206d0cb93d8bef7a4115112dfd Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 14:47:27 +0000 Subject: [PATCH 07/23] fix --- csrc/host/torch_abs.h | 15 ++++++++------- csrc/kernel/kernel_abs.cpp | 5 +++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 51aaaabc..16eeaf23 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -14,14 +14,13 @@ for the full License text. #include "utils.h" -extern "C" aclError vabs_fp16(uint32_t blockDim, aclrtStream stream, void* x, - void* z, uint32_t in_length); +extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, void* x, + void* y, uint32_t num_elements); -extern "C" aclError vabs_fp32(uint32_t blockDim, aclrtStream stream, void* x, - void* z, uint32_t in_length); +extern "C" void call_vabs_fp32(uint32_t blockDim, void* stream, void* x, + void* y, uint32_t num_elements); namespace pto_isa_ops { - /** * @brief Runs element-wise absolute value. * @@ -44,10 +43,12 @@ at::Tensor run_abs(const at::Tensor& x) { } if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(vabs_fp16, block_dim, x, z, total_len); + call_vabs_fp16(block_dim, nullptr, ConvertType(x), ConvertType(z), + total_len); } else if (dtype == at::kFloat) { - EXEC_KERNEL_CMD(vabs_fp32, block_dim, x, z, total_len); + call_vabs_fp32(block_dim, nullptr, ConvertType(x), ConvertType(z), + total_len); } else { throw std::runtime_error("Unsupported dtype for `pto_abs` kernel"); diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 95987ee5..28015bcc 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -100,3 +100,8 @@ extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, uint8_t* y, uint32_t num_elements) { vabs_fp16<<>>(x, y, num_elements); } + +extern "C" void call_vabs_fp32(uint32_t blockDim, void* stream, uint8_t* x, + uint8_t* y, uint32_t num_elements) { + vabs_fp32<<>>(x, y, num_elements); +} From 6e9c71637650fa7d7e20076276260d4c04caab0d Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 14:52:36 +0000 Subject: [PATCH 08/23] abs pass tests --- csrc/host/torch_abs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 16eeaf23..762d5df1 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -42,12 +42,14 @@ at::Tensor run_abs(const at::Tensor& x) { "pto_abs supports only inputs with length that is multiple of 64."); } + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + if (dtype == at::kHalf) { - call_vabs_fp16(block_dim, nullptr, ConvertType(x), ConvertType(z), + call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), total_len); } else if (dtype == at::kFloat) { - call_vabs_fp32(block_dim, nullptr, ConvertType(x), ConvertType(z), + call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z), total_len); } else { From 19cfc0795177ef9ddea6ae2686bb1bb7ff650796 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 16:06:53 +0000 Subject: [PATCH 09/23] fix --- csrc/host/torch_abs.h | 2 -- csrc/host/torch_batch_matrix_square.h | 7 ++++-- csrc/host/torch_simple_matmul.h | 7 ++++-- csrc/host/torch_tri_inv_col_sweep.h | 14 ++++++------ csrc/host/torch_tri_inv_rec_unroll.h | 6 +++-- csrc/host/torch_tri_inv_trick.h | 7 ++++-- csrc/kernel/kernel_abs.cpp | 8 +++---- csrc/kernel/kernel_batch_matrix_square.cpp | 20 +++++++++++++---- csrc/kernel/kernel_simple_matmul.cpp | 26 +++++++++++++++------- csrc/kernel/kernel_tri_inv_col_sweep.cpp | 26 ++++++++++++++++++---- csrc/kernel/kernel_tri_inv_rec_unroll.cpp | 19 ++++++++++++---- csrc/kernel/kernel_tri_inv_trick.cpp | 9 ++++++++ 12 files changed, 110 insertions(+), 41 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 762d5df1..381cc15c 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -43,11 +43,9 @@ at::Tensor run_abs(const at::Tensor& x) { } auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); - if (dtype == at::kHalf) { call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), total_len); - } else if (dtype == at::kFloat) { call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z), total_len); diff --git a/csrc/host/torch_batch_matrix_square.h b/csrc/host/torch_batch_matrix_square.h index f5aab24a..64ffa38c 100644 --- a/csrc/host/torch_batch_matrix_square.h +++ b/csrc/host/torch_batch_matrix_square.h @@ -52,10 +52,13 @@ at::Tensor run_batch_matrix_square(const at::Tensor& x) { at::zeros({block_dim, matrix_size, matrix_size}, at::TensorOptions().dtype(dtype_out).device(device)); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(batch_matrix_square_fp16, block_dim, z, x, matrix_size); + call_batch_matrix_square_fp16(block_dim, acl_stream, ConvertType(z), + ConvertType(x), matrix_size); } else if (dtype == at::kFloat) { - EXEC_KERNEL_CMD(batch_matrix_square_fp32, block_dim, z, x, matrix_size); + call_batch_matrix_square_fp32(block_dim, acl_stream, ConvertType(z), + ConvertType(x), matrix_size); } return z; diff --git a/csrc/host/torch_simple_matmul.h b/csrc/host/torch_simple_matmul.h index 01228c32..61e676ad 100644 --- a/csrc/host/torch_simple_matmul.h +++ b/csrc/host/torch_simple_matmul.h @@ -50,10 +50,13 @@ at::Tensor run_simple_matmul(const at::Tensor& a, const at::Tensor& b) { at::ones({matrix_size, matrix_size}, at::TensorOptions().dtype(dtype_out).device(device)); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(simple_matmul_fp16, block_dim, a, b, c, matrix_size); + call_simple_matmul_fp16(block_dim, acl_stream, ConvertType(a), + ConvertType(b), ConvertType(c), matrix_size); } else if (dtype == at::kFloat) { - EXEC_KERNEL_CMD(simple_matmul_fp32, block_dim, a, b, c, matrix_size); + call_simple_matmul_fp32(block_dim, acl_stream, ConvertType(a), + ConvertType(b), ConvertType(c), matrix_size); } return c; diff --git a/csrc/host/torch_tri_inv_col_sweep.h b/csrc/host/torch_tri_inv_col_sweep.h index 9cb30a12..17a338cf 100644 --- a/csrc/host/torch_tri_inv_col_sweep.h +++ b/csrc/host/torch_tri_inv_col_sweep.h @@ -56,16 +56,16 @@ at::Tensor run_tri_inv(const at::Tensor& x) { const at::Tensor z = at::empty_like(x); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(triv_inv_col_sweep_fp16, block_dim, x, z, num_elems, - matrix_size); - + call_triv_inv_col_sweep_fp16(block_dim, acl_stream, ConvertType(z), + ConvertType(x), num_elems, matrix_size); } else if (dtype == at::kFloat) { - EXEC_KERNEL_CMD(triv_inv_col_sweep_fp32, block_dim, x, z, num_elems, - matrix_size); - + call_triv_inv_col_sweep_fp32(block_dim, acl_stream, ConvertType(z), + ConvertType(x), num_elems, matrix_size); } else { - throw std::runtime_error("Unsupported dtype for `tri_inv` kernel"); + throw std::runtime_error( + "Unsupported dtype for `triv_inv_col_sweep` kernel"); } return z; diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h index ec9d0b20..96c79e00 100644 --- a/csrc/host/torch_tri_inv_rec_unroll.h +++ b/csrc/host/torch_tri_inv_rec_unroll.h @@ -66,9 +66,11 @@ at::Tensor run_tri_inv_rec_unroll(const at::Tensor& M, at::TensorOptions().dtype(dtype).device(device)); I_neg.fill_diagonal_(-1); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg, - matrix_size, total_tiles, num_bsnd_heads); + call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv), + ConvertType(M), ConvertType(I_neg), + matrix_size, total_tiles, num_bsnd_heads); } return M_inv; diff --git a/csrc/host/torch_tri_inv_trick.h b/csrc/host/torch_tri_inv_trick.h index 7d4380c1..b84c951c 100644 --- a/csrc/host/torch_tri_inv_trick.h +++ b/csrc/host/torch_tri_inv_trick.h @@ -52,9 +52,12 @@ at::Tensor run_tri_inv_trick(const at::Tensor& M) { at::zeros({matrix_size, matrix_size}, at::TensorOptions().dtype(dtype).device(device)); I_neg.fill_diagonal_(-1); + + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(tri_inv_trick_fp16, block_dim, M_inv, M, I_neg, matrix_size, - max_block_size); + call_tri_inv_trick_fp16(block_dim, acl_stream, ConvertType(M_inv), + ConvertType(M), ConvertType(I_neg), matrix_size, + max_block_size); } return M_inv; diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 28015bcc..434e8a4e 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -96,12 +96,12 @@ __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); } -extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, uint8_t* x, +extern "C" void call_vabs_fp16(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint32_t num_elements) { - vabs_fp16<<>>(x, y, num_elements); + vabs_fp16<<>>(x, y, num_elements); } -extern "C" void call_vabs_fp32(uint32_t blockDim, void* stream, uint8_t* x, +extern "C" void call_vabs_fp32(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint32_t num_elements) { - vabs_fp32<<>>(x, y, num_elements); + vabs_fp32<<>>(x, y, num_elements); } diff --git a/csrc/kernel/kernel_batch_matrix_square.cpp b/csrc/kernel/kernel_batch_matrix_square.cpp index 34827034..d29501f4 100644 --- a/csrc/kernel/kernel_batch_matrix_square.cpp +++ b/csrc/kernel/kernel_batch_matrix_square.cpp @@ -108,13 +108,25 @@ AICORE void run_batch_matrix_square(__gm__ float* z, __gm__ InputT* x, } } -extern "C" __global__ AICORE void batch_matrix_square_fp16( - __gm__ void* z, __gm__ void* x, uint32_t matrix_size) { +__global__ AICORE void batch_matrix_square_fp16(__gm__ void* z, __gm__ void* x, + uint32_t matrix_size) { run_batch_matrix_square((__gm__ float*)z, (__gm__ half*)x, matrix_size); } -extern "C" __global__ AICORE void batch_matrix_square_fp32( - __gm__ void* z, __gm__ void* x, uint32_t matrix_size) { +__global__ AICORE void batch_matrix_square_fp32(__gm__ void* z, __gm__ void* x, + uint32_t matrix_size) { run_batch_matrix_square((__gm__ float*)z, (__gm__ float*)x, matrix_size); } + +extern "C" void call_batch_matrix_square_fp16(uint32_t block_dim, void* stream, + uint8_t* z, uint8_t* x, + uint32_t matrix_size) { + batch_matrix_square_fp16<<>>(z, x, num_elements); +} + +extern "C" void call_batch_matrix_square_fp32(uint32_t block_dim, void* stream, + uint8_t* z, uint8_t* x, + uint32_t matrix_size) { + batch_matrix_square_fp32<<>>(z, x, num_elements); +} diff --git a/csrc/kernel/kernel_simple_matmul.cpp b/csrc/kernel/kernel_simple_matmul.cpp index e9d051a5..c044d082 100644 --- a/csrc/kernel/kernel_simple_matmul.cpp +++ b/csrc/kernel/kernel_simple_matmul.cpp @@ -129,18 +129,28 @@ AICORE void run_simple_matmul(__gm__ T* a, __gm__ T* b, __gm__ float* c, #endif } -extern "C" __global__ AICORE void simple_matmul_fp16(__gm__ void* a, - __gm__ void* b, - __gm__ void* c, - uint32_t matrix_size) { +__global__ AICORE void simple_matmul_fp16(__gm__ void* a, __gm__ void* b, + __gm__ void* c, + uint32_t matrix_size) { run_simple_matmul((__gm__ half*)a, (__gm__ half*)b, (__gm__ float*)c, matrix_size); } -extern "C" __global__ AICORE void simple_matmul_fp32(__gm__ void* a, - __gm__ void* b, - __gm__ void* c, - uint32_t matrix_size) { +__global__ AICORE void simple_matmul_fp32(__gm__ void* a, __gm__ void* b, + __gm__ void* c, + uint32_t matrix_size) { run_simple_matmul((__gm__ float*)a, (__gm__ float*)b, (__gm__ float*)c, matrix_size); } + +extern "C" void call_simple_matmul_fp16(uint32_t block_dim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* c, + uint32_t matrix_size) { + simple_matmul_fp16<<>>(a, b, c, matrix_size); +} + +extern "C" void call_simple_matmul_fp32(uint32_t block_dim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* c, + uint32_t matrix_size) { + simple_matmul_fp32<<>>(a, b, c, matrix_size); +} diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index 009b78de..22131b22 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -145,8 +145,9 @@ AICORE void runTTriInv(__gm__ T* vec_in, __gm__ T* vec_out, #endif } -extern "C" __global__ AICORE void triv_inv_col_sweep_fp16( - GM_ADDR x, GM_ADDR z, uint32_t in_length, uint32_t matrix_size) { +__global__ AICORE void triv_inv_col_sweep_fp16(GM_ADDR x, GM_ADDR z, + uint32_t in_length, + uint32_t matrix_size) { if (matrix_size == 16) { runTTriInv((__gm__ half*)x, (__gm__ half*)z, in_length); } else if (matrix_size == 32) { @@ -158,8 +159,9 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp16( } } -extern "C" __global__ AICORE void triv_inv_col_sweep_fp32( - GM_ADDR x, GM_ADDR z, uint32_t in_length, uint32_t matrix_size) { +__global__ AICORE void triv_inv_col_sweep_fp32(GM_ADDR x, GM_ADDR z, + uint32_t in_length, + uint32_t matrix_size) { if (matrix_size == 16) { runTTriInv((__gm__ float*)x, (__gm__ float*)z, in_length); } else if (matrix_size == 32) { @@ -170,3 +172,19 @@ extern "C" __global__ AICORE void triv_inv_col_sweep_fp32( runTTriInv((__gm__ float*)x, (__gm__ float*)z, in_length); } } + +extern "C" void triv_inv_col_sweep_fp16(uint32_t block_dim, void* stream, + uint8_t* x, uint8_t* y, + uint32_t in_length, + uint32_t matrix_size) { + triv_inv_col_sweep_fp16<<>>(x, y, in_length, + matrix_size); +} + +extern "C" void triv_inv_col_sweep_fp32(uint32_t block_dim, void* stream, + uint8_t* x, uint8_t* y, + uint32_t in_length, + uint32_t matrix_size) { + triv_inv_col_sweep_fp32<<>>(x, y, in_length, + matrix_size); +} diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp index 238a3940..1cfda3f9 100644 --- a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp +++ b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp @@ -666,10 +666,12 @@ AICORE void run_tri_inv_rec_unroll(__gm__ float* tensor_out, * strided accesses. If each tile is stored consecutively (and row-wise) in * memory, then num_bsnd_heads=0. */ -extern "C" __global__ AICORE void tri_inv_rec_unroll_fp16( - __gm__ void* tensor_out, __gm__ void* tensor_in, - __gm__ void* minus_identity_in, uint32_t matrix_size, uint32_t num_matrices, - uint32_t num_bsnd_heads) { +__global__ AICORE void tri_inv_rec_unroll_fp16(__gm__ void* tensor_out, + __gm__ void* tensor_in, + __gm__ void* minus_identity_in, + uint32_t matrix_size, + uint32_t num_matrices, + uint32_t num_bsnd_heads) { if (num_bsnd_heads == 0) { if (num_matrices <= get_block_num()) { run_tri_inv_rec_unroll>>( + tensor_out, tensor_in, minus_identity_in, matrix_size, num_matrices, + num_bsnd_heads); +} diff --git a/csrc/kernel/kernel_tri_inv_trick.cpp b/csrc/kernel/kernel_tri_inv_trick.cpp index 206a1618..ea1b5fca 100644 --- a/csrc/kernel/kernel_tri_inv_trick.cpp +++ b/csrc/kernel/kernel_tri_inv_trick.cpp @@ -214,3 +214,12 @@ extern "C" __global__ AICORE void tri_inv_trick_fp16(__gm__ void* tensor_out, (__gm__ half*)identity_in, matrix_size, max_block_size); } + +extern "C" void call_tri_inv_trick_fp16(uint32_t block_dim, void* stream, + uint8_t* tensor_out, uint8_t* tensor_in, + uint8_t* identity_in, + uint32_t matrix_size, + uint32_t max_block_size) { + tri_inv_trick_fp16<<>>( + tensor_out, tensor_in, identity_in, matrix_size, max_block_size); +} From 3853f46e4d614314ee1cad73635f85eca10d8cb9 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 16:19:29 +0000 Subject: [PATCH 10/23] fix --- csrc/host/torch_batch_matrix_square.h | 12 ++++++------ csrc/host/torch_simple_matmul.h | 12 ++++++------ csrc/host/torch_tri_inv_col_sweep.h | 18 ++++++++++-------- csrc/host/torch_tri_inv_rec_unroll.h | 5 +---- csrc/host/torch_tri_inv_trick.h | 8 ++++---- csrc/kernel/kernel_batch_matrix_square.cpp | 4 ++-- csrc/kernel/kernel_tri_inv_col_sweep.cpp | 8 ++++---- 7 files changed, 33 insertions(+), 34 deletions(-) diff --git a/csrc/host/torch_batch_matrix_square.h b/csrc/host/torch_batch_matrix_square.h index 64ffa38c..51fec411 100644 --- a/csrc/host/torch_batch_matrix_square.h +++ b/csrc/host/torch_batch_matrix_square.h @@ -14,12 +14,12 @@ for the full License text. #include "utils.h" -extern "C" aclError batch_matrix_square_fp16(uint32_t blockDim, - aclrtStream stream, void* z, - void* x, uint32_t matrix_size); -extern "C" aclError batch_matrix_square_fp32(uint32_t blockDim, - aclrtStream stream, void* z, - void* x, uint32_t matrix_size); +extern "C" void call_batch_matrix_square_fp16(uint32_t blockDim, + aclrtStream stream, void* z, + void* x, uint32_t matrix_size); +extern "C" void call_batch_matrix_square_fp32(uint32_t blockDim, + aclrtStream stream, void* z, + void* x, uint32_t matrix_size); namespace pto_isa_ops { diff --git a/csrc/host/torch_simple_matmul.h b/csrc/host/torch_simple_matmul.h index 61e676ad..649ebbef 100644 --- a/csrc/host/torch_simple_matmul.h +++ b/csrc/host/torch_simple_matmul.h @@ -13,12 +13,12 @@ for the full License text. #include "utils.h" -extern "C" aclError simple_matmul_fp16(uint32_t blockDim, aclrtStream stream, - void* a, void* b, void* c, - uint32_t matrix_size); -extern "C" aclError simple_matmul_fp32(uint32_t blockDim, aclrtStream stream, - void* a, void* b, void* c, - uint32_t matrix_size); +extern "C" void call_simple_matmul_fp16(uint32_t blockDim, aclrtStream stream, + void* a, void* b, void* c, + uint32_t matrix_size); +extern "C" void call_simple_matmul_fp32(uint32_t blockDim, aclrtStream stream, + void* a, void* b, void* c, + uint32_t matrix_size); namespace pto_isa_ops { diff --git a/csrc/host/torch_tri_inv_col_sweep.h b/csrc/host/torch_tri_inv_col_sweep.h index 17a338cf..de1578a9 100644 --- a/csrc/host/torch_tri_inv_col_sweep.h +++ b/csrc/host/torch_tri_inv_col_sweep.h @@ -13,15 +13,17 @@ for the full License text. #include "utils.h" -extern "C" aclError triv_inv_col_sweep_fp16(uint32_t blockDim, - aclrtStream stream, void* M_inv, - void* M, uint32_t num_elems, - uint32_t matrix_size); +extern "C" aclError call_triv_inv_col_sweep_fp16(uint32_t blockDim, + aclrtStream stream, + void* M_inv, void* M, + uint32_t num_elems, + uint32_t matrix_size); -extern "C" aclError triv_inv_col_sweep_fp32(uint32_t blockDim, - aclrtStream stream, void* M_inv, - void* M, uint32_t num_elems, - uint32_t matrix_size); +extern "C" aclError call_triv_inv_col_sweep_fp32(uint32_t blockDim, + aclrtStream stream, + void* M_inv, void* M, + uint32_t num_elems, + uint32_t matrix_size); namespace pto_isa_ops { diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h index 96c79e00..15cacdf6 100644 --- a/csrc/host/torch_tri_inv_rec_unroll.h +++ b/csrc/host/torch_tri_inv_rec_unroll.h @@ -14,10 +14,7 @@ for the full License text. #include "utils.h" -extern "C" aclError tri_inv_rec_unroll_fp16( - uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, - uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads); -extern "C" aclError tri_inv_rec_unroll_fp32( +extern "C" void call_tri_inv_rec_unroll_fp16( uint32_t blockDim, aclrtStream stream, void* M_inv, void* M, void* I_neg, uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads); diff --git a/csrc/host/torch_tri_inv_trick.h b/csrc/host/torch_tri_inv_trick.h index b84c951c..55a53b45 100644 --- a/csrc/host/torch_tri_inv_trick.h +++ b/csrc/host/torch_tri_inv_trick.h @@ -13,10 +13,10 @@ for the full License text. #include "utils.h" -extern "C" aclError tri_inv_trick_fp16(uint32_t blockDim, aclrtStream stream, - void* M_inv, void* M, void* I_neg, - uint32_t matrix_size, - uint32_t max_block_size); +extern "C" void call_tri_inv_trick_fp16(uint32_t blockDim, aclrtStream stream, + void* M_inv, void* M, void* I_neg, + uint32_t matrix_size, + uint32_t max_block_size); namespace pto_isa_ops { diff --git a/csrc/kernel/kernel_batch_matrix_square.cpp b/csrc/kernel/kernel_batch_matrix_square.cpp index d29501f4..3bc3feb7 100644 --- a/csrc/kernel/kernel_batch_matrix_square.cpp +++ b/csrc/kernel/kernel_batch_matrix_square.cpp @@ -122,11 +122,11 @@ __global__ AICORE void batch_matrix_square_fp32(__gm__ void* z, __gm__ void* x, extern "C" void call_batch_matrix_square_fp16(uint32_t block_dim, void* stream, uint8_t* z, uint8_t* x, uint32_t matrix_size) { - batch_matrix_square_fp16<<>>(z, x, num_elements); + batch_matrix_square_fp16<<>>(z, x, matrix_size); } extern "C" void call_batch_matrix_square_fp32(uint32_t block_dim, void* stream, uint8_t* z, uint8_t* x, uint32_t matrix_size) { - batch_matrix_square_fp32<<>>(z, x, num_elements); + batch_matrix_square_fp32<<>>(z, x, matrix_size); } diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index 22131b22..cbcdb96a 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -181,10 +181,10 @@ extern "C" void triv_inv_col_sweep_fp16(uint32_t block_dim, void* stream, matrix_size); } -extern "C" void triv_inv_col_sweep_fp32(uint32_t block_dim, void* stream, - uint8_t* x, uint8_t* y, - uint32_t in_length, - uint32_t matrix_size) { +extern "C" void call_triv_inv_col_sweep_fp32(uint32_t block_dim, void* stream, + uint8_t* x, uint8_t* y, + uint32_t in_length, + uint32_t matrix_size) { triv_inv_col_sweep_fp32<<>>(x, y, in_length, matrix_size); } From c234e4e630e47f7a3d6df8137918bf1f39157200 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 16:20:01 +0000 Subject: [PATCH 11/23] fix --- csrc/kernel/kernel_tri_inv_col_sweep.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index cbcdb96a..4d0838f1 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -173,10 +173,10 @@ __global__ AICORE void triv_inv_col_sweep_fp32(GM_ADDR x, GM_ADDR z, } } -extern "C" void triv_inv_col_sweep_fp16(uint32_t block_dim, void* stream, - uint8_t* x, uint8_t* y, - uint32_t in_length, - uint32_t matrix_size) { +extern "C" void call_triv_inv_col_sweep_fp16(uint32_t block_dim, void* stream, + uint8_t* x, uint8_t* y, + uint32_t in_length, + uint32_t matrix_size) { triv_inv_col_sweep_fp16<<>>(x, y, in_length, matrix_size); } From 57e13b247f0d169536a07806ec2e943528c33eeb Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 16:34:56 +0000 Subject: [PATCH 12/23] fix sync --- csrc/host/torch_batch_matrix_square.h | 2 +- csrc/host/torch_simple_matmul.h | 2 +- csrc/host/torch_tri_inv_col_sweep.h | 2 +- csrc/host/torch_tri_inv_rec_unroll.h | 2 +- csrc/host/torch_tri_inv_trick.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/csrc/host/torch_batch_matrix_square.h b/csrc/host/torch_batch_matrix_square.h index 51fec411..55e72606 100644 --- a/csrc/host/torch_batch_matrix_square.h +++ b/csrc/host/torch_batch_matrix_square.h @@ -52,7 +52,7 @@ at::Tensor run_batch_matrix_square(const at::Tensor& x) { at::zeros({block_dim, matrix_size, matrix_size}, at::TensorOptions().dtype(dtype_out).device(device)); - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { call_batch_matrix_square_fp16(block_dim, acl_stream, ConvertType(z), ConvertType(x), matrix_size); diff --git a/csrc/host/torch_simple_matmul.h b/csrc/host/torch_simple_matmul.h index 649ebbef..13e78833 100644 --- a/csrc/host/torch_simple_matmul.h +++ b/csrc/host/torch_simple_matmul.h @@ -50,7 +50,7 @@ at::Tensor run_simple_matmul(const at::Tensor& a, const at::Tensor& b) { at::ones({matrix_size, matrix_size}, at::TensorOptions().dtype(dtype_out).device(device)); - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { call_simple_matmul_fp16(block_dim, acl_stream, ConvertType(a), ConvertType(b), ConvertType(c), matrix_size); diff --git a/csrc/host/torch_tri_inv_col_sweep.h b/csrc/host/torch_tri_inv_col_sweep.h index de1578a9..37b6e327 100644 --- a/csrc/host/torch_tri_inv_col_sweep.h +++ b/csrc/host/torch_tri_inv_col_sweep.h @@ -58,7 +58,7 @@ at::Tensor run_tri_inv(const at::Tensor& x) { const at::Tensor z = at::empty_like(x); - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { call_triv_inv_col_sweep_fp16(block_dim, acl_stream, ConvertType(z), ConvertType(x), num_elems, matrix_size); diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h index 15cacdf6..b79c3929 100644 --- a/csrc/host/torch_tri_inv_rec_unroll.h +++ b/csrc/host/torch_tri_inv_rec_unroll.h @@ -63,7 +63,7 @@ at::Tensor run_tri_inv_rec_unroll(const at::Tensor& M, at::TensorOptions().dtype(dtype).device(device)); I_neg.fill_diagonal_(-1); - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv), ConvertType(M), ConvertType(I_neg), diff --git a/csrc/host/torch_tri_inv_trick.h b/csrc/host/torch_tri_inv_trick.h index 55a53b45..1e28af35 100644 --- a/csrc/host/torch_tri_inv_trick.h +++ b/csrc/host/torch_tri_inv_trick.h @@ -53,7 +53,7 @@ at::Tensor run_tri_inv_trick(const at::Tensor& M) { at::TensorOptions().dtype(dtype).device(device)); I_neg.fill_diagonal_(-1); - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { call_tri_inv_trick_fp16(block_dim, acl_stream, ConvertType(M_inv), ConvertType(M), ConvertType(I_neg), matrix_size, From 068b9bc83571529f9c2d0da5abd2beb36726a903 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 2 Apr 2026 16:45:52 +0000 Subject: [PATCH 13/23] fix --- csrc/kernel/kernel_tri_inv_col_sweep.cpp | 14 ++++++++------ csrc/kernel/kernel_tri_inv_rec_unroll.cpp | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/csrc/kernel/kernel_tri_inv_col_sweep.cpp b/csrc/kernel/kernel_tri_inv_col_sweep.cpp index 4d0838f1..09f17253 100644 --- a/csrc/kernel/kernel_tri_inv_col_sweep.cpp +++ b/csrc/kernel/kernel_tri_inv_col_sweep.cpp @@ -174,17 +174,19 @@ __global__ AICORE void triv_inv_col_sweep_fp32(GM_ADDR x, GM_ADDR z, } extern "C" void call_triv_inv_col_sweep_fp16(uint32_t block_dim, void* stream, - uint8_t* x, uint8_t* y, + uint8_t* tensor_out, + uint8_t* tensor_in, uint32_t in_length, uint32_t matrix_size) { - triv_inv_col_sweep_fp16<<>>(x, y, in_length, - matrix_size); + triv_inv_col_sweep_fp16<<>>( + tensor_in, tensor_out, in_length, matrix_size); } extern "C" void call_triv_inv_col_sweep_fp32(uint32_t block_dim, void* stream, - uint8_t* x, uint8_t* y, + uint8_t* tensor_out, + uint8_t* tensor_in, uint32_t in_length, uint32_t matrix_size) { - triv_inv_col_sweep_fp32<<>>(x, y, in_length, - matrix_size); + triv_inv_col_sweep_fp32<<>>( + tensor_in, tensor_out, in_length, matrix_size); } diff --git a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp index 1cfda3f9..5e94cddc 100644 --- a/csrc/kernel/kernel_tri_inv_rec_unroll.cpp +++ b/csrc/kernel/kernel_tri_inv_rec_unroll.cpp @@ -716,7 +716,7 @@ __global__ AICORE void tri_inv_rec_unroll_fp16(__gm__ void* tensor_out, } extern "C" void call_tri_inv_rec_unroll_fp16( - uint32_t block_dim, void* stream, uint8_t* tensor_in, uint8_t* tensor_out, + uint32_t block_dim, void* stream, uint8_t* tensor_out, uint8_t* tensor_in, uint8_t* minus_identity_in, uint32_t matrix_size, uint32_t num_matrices, uint32_t num_bsnd_heads) { tri_inv_rec_unroll_fp16<<>>( From 80d6eebbae3376145b675d803c21ea04ffb87956 Mon Sep 17 00:00:00 2001 From: anastasios Date: Fri, 3 Apr 2026 10:57:41 +0000 Subject: [PATCH 14/23] (abs) stream set flag to true --- csrc/host/torch_abs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 381cc15c..c6871621 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -42,7 +42,7 @@ at::Tensor run_abs(const at::Tensor& x) { "pto_abs supports only inputs with length that is multiple of 64."); } - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), total_len); From dfc37e71e5597295b62ceac6d324d578e63c1c50 Mon Sep 17 00:00:00 2001 From: anastasios Date: Sat, 11 Apr 2026 10:06:19 +0000 Subject: [PATCH 15/23] WIP --- csrc/host/torch_abs.h | 6 ++++-- csrc/host/torch_tri_inv_col_sweep.h | 18 ++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index cfb36b75..fb8e2b7d 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -14,9 +14,11 @@ for the full License text. #include "utils.h" -extern "C" void vabs_fp16(void* x, void* y, uint32_t num_elements); +extern "C" void call_vabs_fp16(uint32_t blockDim, aclrtStream stream, void* x, + void* y, uint32_t num_elements); -extern "C" void vabs_fp32(void* x, void* y, uint32_t num_elements); +extern "C" void call_vabs_fp32(uint32_t blockDim, aclrtStream stream, void* x, + void* y, uint32_t num_elements); namespace pto_isa_ops { /** diff --git a/csrc/host/torch_tri_inv_col_sweep.h b/csrc/host/torch_tri_inv_col_sweep.h index 37b6e327..b15dbd3c 100644 --- a/csrc/host/torch_tri_inv_col_sweep.h +++ b/csrc/host/torch_tri_inv_col_sweep.h @@ -13,17 +13,15 @@ for the full License text. #include "utils.h" -extern "C" aclError call_triv_inv_col_sweep_fp16(uint32_t blockDim, - aclrtStream stream, - void* M_inv, void* M, - uint32_t num_elems, - uint32_t matrix_size); +extern "C" void call_triv_inv_col_sweep_fp16(uint32_t blockDim, + aclrtStream stream, void* M_inv, + void* M, uint32_t num_elems, + uint32_t matrix_size); -extern "C" aclError call_triv_inv_col_sweep_fp32(uint32_t blockDim, - aclrtStream stream, - void* M_inv, void* M, - uint32_t num_elems, - uint32_t matrix_size); +extern "C" void call_triv_inv_col_sweep_fp32(uint32_t blockDim, + aclrtStream stream, void* M_inv, + void* M, uint32_t num_elems, + uint32_t matrix_size); namespace pto_isa_ops { From 7cc67898c2f1864efe51ccf260161b93bdc0730c Mon Sep 17 00:00:00 2001 From: anastasios Date: Tue, 28 Apr 2026 23:26:44 +0000 Subject: [PATCH 16/23] fix --- csrc/host/torch_abs.h | 6 ++++-- csrc/host/torch_swiglu.h | 9 ++++++--- csrc/kernel/kernel_abs.cpp | 18 +++++++++++++----- csrc/kernel/kernel_swiglu.cpp | 10 +++++++--- csrc/kernel/kernel_tri_inv_trick.cpp | 10 +++++----- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index fb8e2b7d..3feca31d 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -46,9 +46,11 @@ at::Tensor run_abs(const at::Tensor& x) { auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(vabs_fp16, block_dim, x, z, total_size); + call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), + total_size); } else if (dtype == at::kFloat) { - EXEC_KERNEL_CMD(vabs_fp32, block_dim, x, z, total_size); + call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z), + total_size); } else { throw std::runtime_error("Unsupported dtype for `pto_abs` kernel"); } diff --git a/csrc/host/torch_swiglu.h b/csrc/host/torch_swiglu.h index 7e6bb349..12eb6a56 100644 --- a/csrc/host/torch_swiglu.h +++ b/csrc/host/torch_swiglu.h @@ -13,8 +13,9 @@ for the full License text. #include -extern "C" uint32_t swiglu_fp16(void* x, void* y, uint32_t batch, - uint32_t input_n); +extern "C" uint32_t call_swiglu_fp16(uint32_t blockDim, aclrtStream stream, + void* x, void* y, uint32_t batch, + uint32_t input_n); #include "utils.h" namespace pto_isa_ops { @@ -63,8 +64,10 @@ at::Tensor run_swiglu(const at::Tensor& x, int64_t dim = -1) { const uint32_t input_n = static_cast(input_n_i64); const uint32_t block_dim = GetNumCubeCores(); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); at::Tensor y = at::empty({batch_i64, output_n_i64}, x.options()); - EXEC_KERNEL_CMD(swiglu_fp16, block_dim, x, y, batch, input_n); + call_swiglu_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(y), batch, + input_n); return y; } diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index d80eb5ec..f61fb8de 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -108,14 +108,22 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { #endif } -extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, - uint32_t in_length) { +__global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, uint32_t in_length) { constexpr unsigned TILE_LEN = 128; runTAbs((__gm__ half*)x, (__gm__ half*)z, in_length); } -extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, - uint32_t in_length) { +__global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { constexpr unsigned TILE_LEN = 128; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); -} \ No newline at end of file +} + +extern "C" void call_vabs_fp16(uint32_t block_dim, void* stream, GM_ADDR x, + GM_ADDR z, uint32_t num_elements) { + vabs_fp16<<>>(x, z, num_elements); +} + +extern "C" void call_vabs_fp32(uint32_t block_dim, void* stream, GM_ADDR x, + GM_ADDR z, uint32_t num_elements) { + vabs_fp32<<>>(x, z, num_elements); +} diff --git a/csrc/kernel/kernel_swiglu.cpp b/csrc/kernel/kernel_swiglu.cpp index a165a1e9..a7b1aa71 100644 --- a/csrc/kernel/kernel_swiglu.cpp +++ b/csrc/kernel/kernel_swiglu.cpp @@ -400,9 +400,8 @@ AICORE void runTSwiGLU(__gm__ T* x, __gm__ T* y, uint32_t batch, #endif -extern "C" __global__ AICORE void swiglu_fp16(GM_ADDR x, GM_ADDR y, - uint32_t batch, - uint32_t input_n) { +__global__ AICORE void swiglu_fp16(GM_ADDR x, GM_ADDR y, uint32_t batch, + uint32_t input_n) { #if defined(__DAV_VEC__) const uint32_t num_cores = get_block_num() * get_subblockdim(); const uint32_t vid = get_block_idx() * get_subblockdim() + get_subblockid(); @@ -415,3 +414,8 @@ extern "C" __global__ AICORE void swiglu_fp16(GM_ADDR x, GM_ADDR y, (void)input_n; #endif } + +extern "C" void call_swiglu_fp16(uint32_t block_dim, void* stream, GM_ADDR x, + GM_ADDR y, uint32_t batch, uint32_t input_n) { + swiglu_fp16<<>>(x, y, batch, input_n); +} diff --git a/csrc/kernel/kernel_tri_inv_trick.cpp b/csrc/kernel/kernel_tri_inv_trick.cpp index ea1b5fca..0136f241 100644 --- a/csrc/kernel/kernel_tri_inv_trick.cpp +++ b/csrc/kernel/kernel_tri_inv_trick.cpp @@ -205,11 +205,11 @@ AICORE void run_tri_inv_trick(__gm__ float* tensor_out, } } -extern "C" __global__ AICORE void tri_inv_trick_fp16(__gm__ void* tensor_out, - __gm__ void* tensor_in, - __gm__ void* identity_in, - uint32_t matrix_size, - uint32_t max_block_size) { +__global__ AICORE void tri_inv_trick_fp16(__gm__ void* tensor_out, + __gm__ void* tensor_in, + __gm__ void* identity_in, + uint32_t matrix_size, + uint32_t max_block_size) { run_tri_inv_trick((__gm__ float*)tensor_out, (__gm__ half*)tensor_in, (__gm__ half*)identity_in, matrix_size, max_block_size); From 5c1498ea8a1842a919ac271091c3a6cbf165e6e9 Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 29 Apr 2026 10:00:00 +0000 Subject: [PATCH 17/23] fix --- csrc/host/torch_abs.h | 2 ++ csrc/kernel/kernel_abs.cpp | 8 ++++---- csrc/kernel/kernel_batch_matrix_square.cpp | 13 +++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 3feca31d..4dd8bdae 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -55,6 +55,8 @@ at::Tensor run_abs(const at::Tensor& x) { throw std::runtime_error("Unsupported dtype for `pto_abs` kernel"); } + aclrtSynchronizeStream(acl_stream); + return z; } } // namespace pto_isa_ops diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index f61fb8de..81e9b695 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -25,8 +25,6 @@ using namespace pto; */ template AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - // Define GM tile type using ShapeDim5 = pto::Shape<1, 1, 1, 1, DYNAMIC>; using StrideDim5 = pto::Stride<1, 1, 1, 1, 1>; @@ -104,18 +102,20 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { // Signal end of MTE3 (current store) to vector core set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); } - -#endif } __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, uint32_t in_length) { +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) constexpr unsigned TILE_LEN = 128; runTAbs((__gm__ half*)x, (__gm__ half*)z, in_length); +#endif } __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) constexpr unsigned TILE_LEN = 128; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); +#endif } extern "C" void call_vabs_fp16(uint32_t block_dim, void* stream, GM_ADDR x, diff --git a/csrc/kernel/kernel_batch_matrix_square.cpp b/csrc/kernel/kernel_batch_matrix_square.cpp index 3bc3feb7..409b4006 100644 --- a/csrc/kernel/kernel_batch_matrix_square.cpp +++ b/csrc/kernel/kernel_batch_matrix_square.cpp @@ -16,9 +16,6 @@ using namespace pto; template AICORE void runKernelBatchMatrixSquare(__gm__ OutputT* z, __gm__ InputT* x) { -#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ - (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation - constexpr uint32_t TileLen = MatrixSize * MatrixSize; const uint32_t global_index = get_block_idx() * TileLen; @@ -79,9 +76,6 @@ AICORE void runKernelBatchMatrixSquare(__gm__ OutputT* z, __gm__ InputT* x) { wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); // FIX pipe waits for M pipe to set flag TSTORE(z_global_out, c_l0_tile); -#else -// Nothing to do on AIV -#endif } template @@ -110,13 +104,20 @@ AICORE void run_batch_matrix_square(__gm__ float* z, __gm__ InputT* x, __global__ AICORE void batch_matrix_square_fp16(__gm__ void* z, __gm__ void* x, uint32_t matrix_size) { +#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ + (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation run_batch_matrix_square((__gm__ float*)z, (__gm__ half*)x, matrix_size); +#endif } __global__ AICORE void batch_matrix_square_fp32(__gm__ void* z, __gm__ void* x, uint32_t matrix_size) { +#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ + (__CCE_AICORE__ == 220 && defined(__DAV_C220_CUBE__)) // Cube compilation + run_batch_matrix_square((__gm__ float*)z, (__gm__ float*)x, matrix_size); +#endif } extern "C" void call_batch_matrix_square_fp16(uint32_t block_dim, void* stream, From ae8c488d5512a41f4d22b8f00b438a3af7f61d6f Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 29 Apr 2026 12:53:46 +0000 Subject: [PATCH 18/23] WIP --- csrc/host/torch_abs.h | 6 ++---- csrc/host/torch_tri_inv_rec_unroll.h | 12 ++++++++---- csrc/kernel/kernel_abs.cpp | 12 ++++++++---- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 4dd8bdae..95eac0e7 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -30,7 +30,7 @@ namespace pto_isa_ops { at::Tensor run_abs(const at::Tensor& x) { const auto dtype = x.options().dtype(); - at::Tensor z = at::empty_like(x); + const at::Tensor z = at::empty_like(x); // Define the number of blocks of vector core const uint32_t total_size = x.numel(); // FIXME: tile length is fixed to 128 for now @@ -44,7 +44,7 @@ at::Tensor run_abs(const at::Tensor& x) { block_dim = total_tiles; } - auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), total_size); @@ -55,8 +55,6 @@ at::Tensor run_abs(const at::Tensor& x) { throw std::runtime_error("Unsupported dtype for `pto_abs` kernel"); } - aclrtSynchronizeStream(acl_stream); - return z; } } // namespace pto_isa_ops diff --git a/csrc/host/torch_tri_inv_rec_unroll.h b/csrc/host/torch_tri_inv_rec_unroll.h index 9beeb3a5..ca362a83 100644 --- a/csrc/host/torch_tri_inv_rec_unroll.h +++ b/csrc/host/torch_tri_inv_rec_unroll.h @@ -81,11 +81,15 @@ at::Tensor run_tri_inv_rec_unroll( if (dtype == at::kHalf) { if (cu_seqlens.numel() == 1) { void* void_null_ptr = nullptr; - EXEC_KERNEL_CMD(call_tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg, - matrix_size, total_tiles, num_bsnd_heads, void_null_ptr); + call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv), + ConvertType(M), ConvertType(I_neg), + matrix_size, total_tiles, num_bsnd_heads, + void_null_ptr); } else { - EXEC_KERNEL_CMD(call_tri_inv_rec_unroll_fp16, block_dim, M_inv, M, I_neg, - matrix_size, total_tiles, num_bsnd_heads, cu_seqlens); + call_tri_inv_rec_unroll_fp16(block_dim, acl_stream, ConvertType(M_inv), + ConvertType(M), ConvertType(I_neg), + matrix_size, total_tiles, num_bsnd_heads, + cu_seqlens.data_ptr()); } } diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 81e9b695..9fba8a7f 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -104,15 +104,19 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { } } -__global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, uint32_t in_length) { -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) +extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, + uint32_t in_length) { +#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ + __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) constexpr unsigned TILE_LEN = 128; runTAbs((__gm__ half*)x, (__gm__ half*)z, in_length); #endif } -__global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { -#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) +extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, + uint32_t in_length) { +#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ + __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) constexpr unsigned TILE_LEN = 128; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); #endif From 8f7e7246859fcd18966dcba1be0c29306ec71b8d Mon Sep 17 00:00:00 2001 From: anastasios Date: Wed, 29 Apr 2026 15:56:03 +0000 Subject: [PATCH 19/23] fix --- csrc/host/torch_abs.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 95eac0e7..3c179a83 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -44,13 +44,10 @@ at::Tensor run_abs(const at::Tensor& x) { block_dim = total_tiles; } - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); if (dtype == at::kHalf) { - call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), - total_size); + EXEC_KERNEL_CMD(call_vabs_fp16, block_dim, x, z, total_size); } else if (dtype == at::kFloat) { - call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z), - total_size); + EXEC_KERNEL_CMD(call_vabs_fp32, block_dim, x, z, total_size); } else { throw std::runtime_error("Unsupported dtype for `pto_abs` kernel"); } From 2e0622a8e71e2b82f07050eff309607864777ea2 Mon Sep 17 00:00:00 2001 From: anastasios Date: Thu, 30 Apr 2026 17:31:42 +0000 Subject: [PATCH 20/23] fix --- csrc/host/torch_abs.h | 9 ++++++--- csrc/kernel/kernel_abs.cpp | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/csrc/host/torch_abs.h b/csrc/host/torch_abs.h index 3c179a83..bd0d58b5 100644 --- a/csrc/host/torch_abs.h +++ b/csrc/host/torch_abs.h @@ -38,16 +38,19 @@ at::Tensor run_abs(const at::Tensor& x) { // Persistent kernel launch parameter uint32_t total_tiles = (total_size + TILE_SIZE - 1) / TILE_SIZE; - uint32_t block_dim = GetNumVectorCores(); + uint32_t block_dim = GetNumCubeCores(); if (total_tiles < block_dim) { block_dim = total_tiles; } + auto acl_stream = c10_npu::getCurrentNPUStream().stream(true); if (dtype == at::kHalf) { - EXEC_KERNEL_CMD(call_vabs_fp16, block_dim, x, z, total_size); + call_vabs_fp16(block_dim, acl_stream, ConvertType(x), ConvertType(z), + total_size); } else if (dtype == at::kFloat) { - EXEC_KERNEL_CMD(call_vabs_fp32, block_dim, x, z, total_size); + call_vabs_fp32(block_dim, acl_stream, ConvertType(x), ConvertType(z), + total_size); } else { throw std::runtime_error("Unsupported dtype for `pto_abs` kernel"); } diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index 9fba8a7f..b641efca 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -124,10 +124,10 @@ extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, extern "C" void call_vabs_fp16(uint32_t block_dim, void* stream, GM_ADDR x, GM_ADDR z, uint32_t num_elements) { - vabs_fp16<<>>(x, z, num_elements); + vabs_fp16<<>>(x, z, num_elements); } extern "C" void call_vabs_fp32(uint32_t block_dim, void* stream, GM_ADDR x, GM_ADDR z, uint32_t num_elements) { - vabs_fp32<<>>(x, z, num_elements); + vabs_fp32<<>>(x, z, num_elements); } From 621d636897aa6a8b3f50a02210cb854e0915a923 Mon Sep 17 00:00:00 2001 From: Anastasios Zouzias Date: Thu, 30 Apr 2026 19:23:57 +0200 Subject: [PATCH 21/23] (makefile) introduce 'make compile_' (#145) * Fix formatting of Makefile for shared library build --------- Co-authored-by: anastasios --- Makefile | 18 ++- csrc/kernel/constants.h | 307 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 csrc/kernel/constants.h diff --git a/Makefile b/Makefile index 666f32b2..7228fa0f 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,10 @@ # https://github.com/huawei-csl/pto-kernels/ # for the full License text. # -------------------------------------------------------------------------------- -.PHONY: clean setup_once build_wheel install test +PTO_LIB_PATH ?= $(ASCEND_TOOLKIT_HOME) +CSRC_KERNEL_DIR := csrc/kernel + +.PHONY: clean setup_once build_cmake build_wheel install docs test test_tri_inv clean: rm -rf build/ dist/ extra-info/ *.egg-info/ kernel_meta/ @@ -20,6 +23,19 @@ build_cmake: clean build_wheel: export CMAKE_GENERATOR="Unix Makefiles" && pip wheel -v . --extra-index-url https://download.pytorch.org/whl/cpu + +# 'make compile_abs' compiles 'kernel_abs.cpp' into 'libkernel_abs.so' without building the whole wheel package. +# This is useful for development and debugging of individual kernels. +compile_%: + bisheng -fPIC -shared -xcce -DMEMORY_BASE -O2 -std=c++17 \ + -I$(CSRC_KERNEL_DIR) \ + -I$(PTO_LIB_PATH)/include \ + --npu-arch=dav-2201 \ + -Wno-ignored-attributes \ + $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ + -o libkernel_$*.so + + install: python3 -m pip install --force-reinstall pto_kernels-*.whl diff --git a/csrc/kernel/constants.h b/csrc/kernel/constants.h new file mode 100644 index 00000000..647c1f25 --- /dev/null +++ b/csrc/kernel/constants.h @@ -0,0 +1,307 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +All rights reserved. + +See LICENSE in the root of the software repository: +https://github.com/huawei-csl/pto-kernels/ +for the full License text. +*/ + +/** + * Minus identity matrices statically allocated in the binary of the + * kernel. + * + * To generate these matrices below, you can run the following Python code and +copy-paste the output into this file. Each matrix is stored in row-major order, +and has a trailing comma after each row to facilitate copy-pasting.: + * + * +```python +import numpy as np +for s in [16, 32, 64, 128]: + I_minus_s = -np.eye(s).astype("int") + np.savetxt(f"matrix_{s}.csv", I_minus_s, fmt='%5.0f', delimiter=",") + !sed '$!s/$/,/' matrix_{s}_with_trailing_comma.csv > matrix_{s}_final.csv +``` + */ +#pragma once + +#ifndef MEMORY_BASE +#define MEMORY_BASE +#endif +#include + +#define CONST_HALF_TO_GM(x) \ + reinterpret_cast<__gm__ half*>(const_cast<__gm__ half*>((x))) + +// clang-format off +constexpr static __gm__ half minus_eye_fp16_16[256] = {-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1}; + +constexpr static __gm__ half minus_eye_fp16_32[1024] = {-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1}; + + +constexpr static __gm__ half minus_eye_fp16_64[4096] = {-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1}; + +constexpr static __gm__ half minus_eye_fp16_128[16384] = { + -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1}; +// clang-format on + +AICORE inline __gm__ half* load_minus_eye_fp16_matrix(uint32_t matmul_size) { + switch (matmul_size) { + case 16: + return CONST_HALF_TO_GM(minus_eye_fp16_16); + break; + + case 32: + return CONST_HALF_TO_GM(minus_eye_fp16_32); + break; + + case 64: + return CONST_HALF_TO_GM(minus_eye_fp16_64); + break; + + case 128: + return CONST_HALF_TO_GM(minus_eye_fp16_128); + break; + + default: + return nullptr; + break; + } +} From 31b67b0c57ae408a87119250bfd71f121c8939ba Mon Sep 17 00:00:00 2001 From: anastasios Date: Thu, 30 Apr 2026 17:45:09 +0000 Subject: [PATCH 22/23] fix --- CMakeLists.txt | 7 +++---- Makefile | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 518d5679..771ebbbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,9 +97,8 @@ set(KERNEL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_batch_matrix_square.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_rec_unroll.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_tri_inv_trick.cpp -# ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_csr_gather.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_swiglu.cpp -) + # ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_csr_gather.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernel/kernel_swiglu.cpp) set(NO_WORKSPACE_KERNEL_LIB ${CMAKE_CURRENT_BINARY_DIR}/libno_workspace_kernel.so) @@ -107,7 +106,7 @@ set(NO_WORKSPACE_KERNEL_LIB add_custom_command( OUTPUT ${NO_WORKSPACE_KERNEL_LIB} COMMAND - bisheng -fPIC -shared -xcce -O2 -std=c++17 --npu-arch=dav-2201 + bisheng -fPIC -shared -xcce -O0 -g -std=c++17 --npu-arch=dav-2201 -I${libpto_isa_headers_SOURCE_DIR}/include ${KERNEL_SOURCES} -o ${NO_WORKSPACE_KERNEL_LIB} DEPENDS ${KERNEL_SOURCES} diff --git a/Makefile b/Makefile index 7228fa0f..b8c33ac2 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ build_wheel: # 'make compile_abs' compiles 'kernel_abs.cpp' into 'libkernel_abs.so' without building the whole wheel package. # This is useful for development and debugging of individual kernels. compile_%: - bisheng -fPIC -shared -xcce -DMEMORY_BASE -O2 -std=c++17 \ + bisheng -fPIC -shared -xcce -DMEMORY_BASE -O0 -std=c++17 \ -I$(CSRC_KERNEL_DIR) \ -I$(PTO_LIB_PATH)/include \ --npu-arch=dav-2201 \ From ff155ba3d27fbfa701d05b1b8e900aea94435341 Mon Sep 17 00:00:00 2001 From: anastasios Date: Thu, 30 Apr 2026 17:53:19 +0000 Subject: [PATCH 23/23] fix --- Makefile | 4 ++-- csrc/kernel/kernel_abs.cpp | 42 +++++++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index b8c33ac2..31b3a891 100644 --- a/Makefile +++ b/Makefile @@ -27,11 +27,11 @@ build_wheel: # 'make compile_abs' compiles 'kernel_abs.cpp' into 'libkernel_abs.so' without building the whole wheel package. # This is useful for development and debugging of individual kernels. compile_%: - bisheng -fPIC -shared -xcce -DMEMORY_BASE -O0 -std=c++17 \ + bisheng -fPIC -shared -xcce -DMEMORY_BASE -O2 -std=c++17 \ -I$(CSRC_KERNEL_DIR) \ -I$(PTO_LIB_PATH)/include \ --npu-arch=dav-2201 \ - -Wno-ignored-attributes \ + -Wno-ignored-attributes \ $(CSRC_KERNEL_DIR)/kernel_$*.cpp \ -o libkernel_$*.so diff --git a/csrc/kernel/kernel_abs.cpp b/csrc/kernel/kernel_abs.cpp index b641efca..d24e13ab 100644 --- a/csrc/kernel/kernel_abs.cpp +++ b/csrc/kernel/kernel_abs.cpp @@ -6,11 +6,13 @@ See LICENSE in the root of the software repository: https://github.com/huawei-csl/pto-kernels/ for the full License text. */ -#define MEMORY_BASE -#include +#include "kernel_utils.h" +// clang-format off: so it does not get wrongfully flagged by linter +#ifndef GM_ADDR #define GM_ADDR __gm__ uint8_t* // To avoid #include "kernel_operator.h" +#endif using namespace pto; @@ -23,7 +25,7 @@ using namespace pto; * @param z Output tensor * @param total_size Number of elements */ -template +template AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { // Define GM tile type using ShapeDim5 = pto::Shape<1, 1, 1, 1, DYNAMIC>; @@ -102,32 +104,38 @@ AICORE void runTAbs(__gm__ T* x, __gm__ T* z, uint32_t total_size) { // Signal end of MTE3 (current store) to vector core set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); } + + // Cleanup flags + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); } extern "C" __global__ AICORE void vabs_fp16(GM_ADDR x, GM_ADDR z, uint32_t in_length) { -#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ - __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - constexpr unsigned TILE_LEN = 128; +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) + constexpr uint32_t TILE_LEN = 128; runTAbs((__gm__ half*)x, (__gm__ half*)z, in_length); +#else + (void)x; + (void)z; + (void)in_length; #endif } extern "C" __global__ AICORE void vabs_fp32(GM_ADDR x, GM_ADDR z, uint32_t in_length) { -#if (__CHECK_FEATURE_AT_PRECOMPILE) || \ - __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) - constexpr unsigned TILE_LEN = 128; +#if __CCE_AICORE__ == 220 && defined(__DAV_C220_VEC__) + + constexpr uint32_t TILE_LEN = 128; runTAbs((__gm__ float*)x, (__gm__ float*)z, in_length); +#else + (void)x; + (void)z; + (void)in_length; #endif } -extern "C" void call_vabs_fp16(uint32_t block_dim, void* stream, GM_ADDR x, - GM_ADDR z, uint32_t num_elements) { - vabs_fp16<<>>(x, z, num_elements); -} - -extern "C" void call_vabs_fp32(uint32_t block_dim, void* stream, GM_ADDR x, - GM_ADDR z, uint32_t num_elements) { - vabs_fp32<<>>(x, z, num_elements); +extern "C" void call_vabs_fp16(uint32_t blockDim, void* stream, GM_ADDR x, + GM_ADDR y, uint32_t in_length) { + vabs_fp16<<>>(x, y, in_length); }