diff --git a/kernels/manual/a5/README.md b/kernels/manual/a5/README.md index 434177380..3d502c6b9 100644 --- a/kernels/manual/a5/README.md +++ b/kernels/manual/a5/README.md @@ -7,6 +7,7 @@ This directory contains manual, performance-oriented kernel examples targeting A - Flash-Attention kernel: [flash_atten](flash_atten/README.md) - MXFP4 matrix multiplication performance kernel: [matmul_mxfp4_performance](matmul_mxfp4_performance/README.md) - MXFP8 matrix multiplication performance kernel: [matmul_mxfp8_performance](matmul_mxfp8_performance/README.md) +- MHC (Multi-Head Computation) kernels: [mhc](mhc/README.md) ## Common setup diff --git a/kernels/manual/a5/mhc/CMakeLists.txt b/kernels/manual/a5/mhc/CMakeLists.txt new file mode 100644 index 000000000..fdd8c6aa8 --- /dev/null +++ b/kernels/manual/a5/mhc/CMakeLists.txt @@ -0,0 +1,68 @@ +cmake_minimum_required(VERSION 3.16) +project(mhc_kernels) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + +if(NOT DEFINED ENV{ASCEND_HOME_PATH}) + message(FATAL_ERROR "Cannot find ASCEND_HOME_PATH, please run set_env.sh.") +else() + set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH}) +endif() + +set(ASCEND_DRIVER_PATH /usr/local/Ascend/driver) +set(CMAKE_COMPILER bisheng) +set(CMAKE_C_COMPILER ${CMAKE_COMPILER}) +set(CMAKE_CXX_COMPILER ${CMAKE_COMPILER}) + +add_compile_options(-D_FORTIFY_SOURCE=2 -O2 -std=c++17 -Wno-macro-redefined -Wno-ignored-attributes -fstack-protector-strong) +add_link_options(-s -Wl,-z,relro -Wl,-z,now) + +set(CMAKE_CCE_COMPILE_OPTIONS + -xcce -Xhost-start -Xhost-end + "SHELL:-mllvm -cce-aicore-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-function-stack-size=0x8000" + "SHELL:-mllvm -cce-aicore-record-overflow=true" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm -cce-aicore-dcci-insert-for-scalar=false" +) + 
+set(CMAKE_CPP_COMPILE_OPTIONS -xc++ "SHELL:-include stdint.h" "SHELL:-include stddef.h") + +include_directories( + ${PROJECT_SOURCE_DIR}/../../../../include/ + ${ASCEND_HOME_PATH}/include + ${ASCEND_HOME_PATH}/pkg_inc/runtime + ${ASCEND_DRIVER_PATH}/kernel/inc +) + +# Build all 15 kernel .so files +set(MHC_KERNELS + expand_to_mhc_fwd expand_to_mhc_bwd + head_compute_mix_fwd head_compute_mix_bwd + pre_split_mixes_fwd pre_split_mixes_bwd + pre_apply_mix_fwd pre_apply_mix_bwd + pre_norm_fn_fwd + fn_normw_merge_fwd fn_normw_merge_bwd + post_fwd post_bwd + sinkhorn_normalize_fwd sinkhorn_normalize_bwd +) + +foreach(KERNEL ${MHC_KERNELS}) + add_library(${KERNEL}_kernel SHARED ${KERNEL}.cpp) + target_compile_options(${KERNEL}_kernel PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --npu-arch=dav-3510 -DMEMORY_BASE) +endforeach() + +# Build caller .so (wraps expand_to_mhc_fwd for the test) +add_library(mhc_caller SHARED caller.cpp) +target_compile_options(mhc_caller PRIVATE ${CMAKE_CCE_COMPILE_OPTIONS} --npu-arch=dav-3510 -DMEMORY_BASE) + +# Build host test executable +add_executable(mhc_test main.cpp) +target_compile_options(mhc_test PRIVATE ${CMAKE_CPP_COMPILE_OPTIONS}) +target_include_directories(mhc_test PRIVATE ${PROJECT_SOURCE_DIR}/../../../../tests/common) +target_link_directories(mhc_test PUBLIC ${ASCEND_HOME_PATH}/lib64) +target_link_libraries(mhc_test PRIVATE mhc_caller ascendcl stdc++ m pthread) diff --git a/kernels/manual/a5/mhc/README.md b/kernels/manual/a5/mhc/README.md new file mode 100644 index 000000000..f240d119c --- /dev/null +++ b/kernels/manual/a5/mhc/README.md @@ -0,0 +1,49 @@ +# MHC (Multi-Head Computation) Kernels + +PTO-ISA kernels for the MHC architecture from [DeepSeek TileKernels](https://github.com/deepseek-ai/TileKernels). 
+ +## Overview + +MHC extends the standard Transformer residual connection from a single stream to multiple parallel heads with learnable mixing: + +``` +x[m] = Σ_in comb[in, m] * residual[in] + post_mix[m] * up(F(down(x))) +``` + +This directory contains 15 kernels (8 forward + 7 backward) that implement the full MHC forward and backward pass: + +| Kernel | Description | Data types | +|--------|-------------|------------| +| `expand_to_mhc_fwd/bwd` | Broadcast x to multi-head / reduce gradient | bf16 | +| `pre_apply_mix_fwd/bwd` | Weighted sum across heads | bf16 + f32 | +| `pre_norm_fn_fwd` | RMSNorm + FN weight projection | bf16 → f32 | +| `fn_normw_merge_fwd/bwd` | Fuse norm weight with FN weight | f32 | +| `head_compute_mix_fwd/bwd` | Sigmoid activation for head mix | f32 | +| `pre_split_mixes_fwd/bwd` | Split raw params into pre/post/comb | f32 | +| `sinkhorn_normalize_fwd/bwd` | Sinkhorn iteration for doubly-stochastic comb matrix | f32 | +| `post_fwd/bwd` | Final multi-head residual combination | bf16 + f32 | + +## Generation + +These kernels were generated from [PTO-DSL](https://github.com/huawei-csl/pto-dsl) Python source via the following pipeline: + +``` +PTO-DSL Python → MLIR IR (.pto) → ptoas assembler → PTO-ISA C++ (.cpp) +``` + +Source: [PTO-Gym PR #7](https://github.com/PTO-ISA/PTO-Gym/pull/7) (`tilekernels_ptodsl/mhc/`) + +Two post-processing steps were applied for Ascend A5 (dav-3510) compatibility: +1. Tile shapes padded to 32-byte alignment (via DSL `_meta_data` modification) +2. `pipe_barrier(PIPE_V)` replaced with `pipe_barrier(PIPE_ALL)` + +## Build & Run + +```bash +source /usr/local/Ascend/ascend-toolkit/latest/set_env.sh +bash run.sh +``` + +## Parameters + +All kernels use `mhc_mult=4` and `hidden_size=1280` (matching DeepSeek-V3 configuration). 
diff --git a/kernels/manual/a5/mhc/caller.cpp b/kernels/manual/a5/mhc/caller.cpp new file mode 100644 index 000000000..72a408781 --- /dev/null +++ b/kernels/manual/a5/mhc/caller.cpp @@ -0,0 +1,15 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +CANN Open Software License Agreement Version 2.0 + +Caller wrappers for MHC kernels. Each wrapper exports a C function that +launches the corresponding __global__ kernel with the <<<>>> syntax. +*/ +#include "expand_to_mhc_fwd.cpp" +#include + +extern "C" void call_expand_fwd(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *out, int32_t tokens, int32_t hidden) { + tilekernels_mhc_expand_to_mhc_fwd_m4<<>>( + (bfloat16_t *)x, (bfloat16_t *)out, tokens, hidden); +} diff --git a/kernels/manual/a5/mhc/expand_to_mhc_bwd.cpp b/kernels/manual/a5/mhc/expand_to_mhc_bwd.cpp new file mode 100644 index 000000000..675a66419 --- /dev/null +++ b/kernels/manual/a5/mhc/expand_to_mhc_bwd.cpp @@ -0,0 +1,164 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_expand_to_mhc_bwd_m4(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4) { + RoundMode v5 = RoundMode::CAST_RINT; + unsigned v6 = 0; + const int32_t v7 = 4; + const int32_t v8 = 1024; + const int32_t v9 = 1; + const int32_t v10 = 0; + const int32_t v11 = 2; + const int32_t v12 = 3; + const int64_t v13 = 0; + const int64_t v14 = 16384; + const int64_t v15 = 49152; + const int64_t v16 = 81920; + using T = float; + size_t v17 = (size_t) v9; + int32_t v18 = 
(int32_t) ((uint32_t) v3 * (uint32_t) v7); + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v19 = get_block_idx(); + int64_t v20 = get_block_num(); + int32_t v21 = (int32_t) ((int64_t) v20); + int32_t v22 = v3 / v21; + int32_t v23 = v3 % v21 != v10 && v3 < v10 == v21 < v10 ? v22 + v9 : v22; + int32_t v24 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v19) * (uint32_t) v23); + int32_t v25 = (int32_t) ((uint32_t) v24 + (uint32_t) v23); + int32_t v26 = v4 / v8; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + for (size_t v27 = (size_t) v24; v27 < ((size_t) ((uint32_t) v25 < (uint32_t) v3 ? v25 : v3)); v27 += v17) { + int32_t v28 = (int32_t) v27; + for (size_t v29 = (size_t) v10; v29 < ((size_t) (v4 % v8 != v10 && v4 < v10 == v8 < v10 ? v26 + v9 : v26)); v29 += v17) { + int32_t v30 = (int32_t) ((uint32_t) ((int32_t) v29) * (uint32_t) v8); + int32_t v31 = (int32_t) ((uint32_t) v4 - (uint32_t) v30); + int32_t v32 = (uint32_t) v31 < (uint32_t) v8 ? 
v31 : v8; + Tile v33; + TASSIGN(v33, v13); + Tile v34 = Tile(v32); + __ubuf__ bfloat16_t* v35 = v33.data(); + uint64_t v36 = reinterpret_cast(v35); + TASSIGN(v34, v36); + Tile v37; + TASSIGN(v37, v14); + Tile v38 = Tile(v32); + __ubuf__ float* v39 = v37.data(); + uint64_t v40 = reinterpret_cast(v39); + TASSIGN(v38, v40); + Tile v41; + TASSIGN(v41, v15); + Tile v42 = Tile(v32); + __ubuf__ float* v43 = v41.data(); + uint64_t v44 = reinterpret_cast(v43); + TASSIGN(v42, v44); + Tile v45; + TASSIGN(v45, v16); + Tile v46 = Tile(v32); + __ubuf__ bfloat16_t* v47 = v45.data(); + uint64_t v48 = reinterpret_cast(v47); + TASSIGN(v46, v48); + int32_t v49 = (int32_t) ((uint32_t) v28 * (uint32_t) v7); + unsigned v50 = (unsigned) v32; + unsigned v51 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v52 = pto::Shape<1, 1, 1, 1, -1>(v32); + pto::Stride<-1, -1, -1, -1, 1> v53 = pto::Stride<-1, -1, -1, -1, 1>(v51, v51, v51, v51); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v54 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v49 * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v52, v53); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v34, v54); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v38, v34, v5); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMOV(v42, v38); + unsigned v55 = (unsigned) v32; + unsigned v56 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v57 = pto::Shape<1, 1, 1, 1, -1>(v32); + pto::Stride<-1, -1, -1, -1, 1> v58 = pto::Stride<-1, -1, -1, -1, 1>(v56, v56, v56, v56); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v59 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) ((int32_t) (uint32_t) v49 + (uint32_t) v9) * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v57, v58); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v34, v59); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + 
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v38, v34, v5); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + pipe_barrier(PIPE_ALL); + TADD(v42, v42, v38); + unsigned v60 = (unsigned) v32; + unsigned v61 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v62 = pto::Shape<1, 1, 1, 1, -1>(v32); + pto::Stride<-1, -1, -1, -1, 1> v63 = pto::Stride<-1, -1, -1, -1, 1>(v61, v61, v61, v61); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v64 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) ((int32_t) (uint32_t) v49 + (uint32_t) v11) * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v62, v63); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + TLOAD(v34, v64); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TCVT(v38, v34, v5); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TADD(v42, v42, v38); + unsigned v65 = (unsigned) v32; + unsigned v66 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v67 = pto::Shape<1, 1, 1, 1, -1>(v32); + pto::Stride<-1, -1, -1, -1, 1> v68 = pto::Stride<-1, -1, -1, -1, 1>(v66, v66, v66, v66); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v69 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) ((int32_t) (uint32_t) v49 + (uint32_t) v12) * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v67, v68); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v34, v69); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + pipe_barrier(PIPE_ALL); + TCVT(v38, v34, v5); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v42, v42, v38); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v46, v42, v5); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v70 = (unsigned) v32; + unsigned v71 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v72 = pto::Shape<1, 1, 1, 1, -1>(v32); + 
pto::Stride<-1, -1, -1, -1, 1> v73 = pto::Stride<-1, -1, -1, -1, 1>(v71, v71, v71, v71); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v74 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v28 * (unsigned) v4 + (unsigned) v30 * (unsigned) v9), v72, v73); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v74, v46); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + }; + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/expand_to_mhc_fwd.cpp b/kernels/manual/a5/mhc/expand_to_mhc_fwd.cpp new file mode 100644 index 000000000..f8a2aedd2 --- /dev/null +++ b/kernels/manual/a5/mhc/expand_to_mhc_fwd.cpp @@ -0,0 +1,108 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_expand_to_mhc_fwd_m4(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, int32_t v3, int32_t v4) { + unsigned v5 = 0; + const int32_t v6 = 4; + const int32_t v7 = 1024; + const int32_t v8 = 1; + const int32_t v9 = 0; + const int32_t v10 = 2; + const int32_t v11 = 3; + const int64_t v12 = 0; + using T = float; + size_t v13 = (size_t) v8; + int32_t v14 = (int32_t) ((uint32_t) v3 * (uint32_t) v6); + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v15 = 
get_block_idx(); + int64_t v16 = get_block_num(); + int32_t v17 = (int32_t) ((int64_t) v16); + int32_t v18 = v3 / v17; + int32_t v19 = v3 % v17 != v9 && v3 < v9 == v17 < v9 ? v18 + v8 : v18; + int32_t v20 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v15) * (uint32_t) v19); + int32_t v21 = (int32_t) ((uint32_t) v20 + (uint32_t) v19); + int32_t v22 = v4 / v7; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + for (size_t v23 = (size_t) v20; v23 < ((size_t) ((uint32_t) v21 < (uint32_t) v3 ? v21 : v3)); v23 += v13) { + int32_t v24 = (int32_t) v23; + for (size_t v25 = (size_t) v9; v25 < ((size_t) (v4 % v7 != v9 && v4 < v9 == v7 < v9 ? v22 + v8 : v22)); v25 += v13) { + int32_t v26 = (int32_t) ((uint32_t) ((int32_t) v25) * (uint32_t) v7); + int32_t v27 = (int32_t) ((uint32_t) v4 - (uint32_t) v26); + int32_t v28 = (uint32_t) v27 < (uint32_t) v7 ? v27 : v7; + Tile v29; + TASSIGN(v29, v12); + Tile v30 = Tile(v28); + __ubuf__ bfloat16_t* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + unsigned v33 = (unsigned) v28; + unsigned v34 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v35 = pto::Shape<1, 1, 1, 1, -1>(v28); + pto::Stride<-1, -1, -1, -1, 1> v36 = pto::Stride<-1, -1, -1, -1, 1>(v34, v34, v34, v34); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v37 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v5 + (unsigned) v24 * (unsigned) v4 + (unsigned) v26 * (unsigned) v8), v35, v36); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v30, v37); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + int32_t v38 = (int32_t) ((uint32_t) v24 * (uint32_t) v6); + unsigned v39 = (unsigned) v28; + unsigned v40 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v41 = pto::Shape<1, 1, 1, 1, -1>(v28); + pto::Stride<-1, -1, -1, -1, 1> v42 = pto::Stride<-1, -1, -1, -1, 1>(v40, v40, v40, v40); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v43 = GlobalTensor, 
pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v5 + (unsigned) v38 * (unsigned) v4 + (unsigned) v26 * (unsigned) v8), v41, v42); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v43, v30); + unsigned v44 = (unsigned) v28; + unsigned v45 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v46 = pto::Shape<1, 1, 1, 1, -1>(v28); + pto::Stride<-1, -1, -1, -1, 1> v47 = pto::Stride<-1, -1, -1, -1, 1>(v45, v45, v45, v45); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v48 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v5 + (unsigned) ((int32_t) (uint32_t) v38 + (uint32_t) v8) * (unsigned) v4 + (unsigned) v26 * (unsigned) v8), v46, v47); + pipe_barrier(PIPE_MTE3); + TSTORE(v48, v30); + unsigned v49 = (unsigned) v28; + unsigned v50 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v51 = pto::Shape<1, 1, 1, 1, -1>(v28); + pto::Stride<-1, -1, -1, -1, 1> v52 = pto::Stride<-1, -1, -1, -1, 1>(v50, v50, v50, v50); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v53 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v5 + (unsigned) ((int32_t) (uint32_t) v38 + (uint32_t) v10) * (unsigned) v4 + (unsigned) v26 * (unsigned) v8), v51, v52); + pipe_barrier(PIPE_MTE3); + TSTORE(v53, v30); + unsigned v54 = (unsigned) v28; + unsigned v55 = (unsigned) v4; + pto::Shape<1, 1, 1, 1, -1> v56 = pto::Shape<1, 1, 1, 1, -1>(v28); + pto::Stride<-1, -1, -1, -1, 1> v57 = pto::Stride<-1, -1, -1, -1, 1>(v55, v55, v55, v55); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v58 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v5 + (unsigned) ((int32_t) (uint32_t) v38 + (uint32_t) v11) * (unsigned) v4 + (unsigned) v26 * (unsigned) v8), v56, v57); + pipe_barrier(PIPE_MTE3); + TSTORE(v58, v30); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + }; + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + #endif // __DAV_VEC__ + 
+ ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/fn_normw_merge_bwd.cpp b/kernels/manual/a5/mhc/fn_normw_merge_bwd.cpp new file mode 100644 index 000000000..4646f96ca --- /dev/null +++ b/kernels/manual/a5/mhc/fn_normw_merge_bwd.cpp @@ -0,0 +1,1429 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_fn_normw_merge_bwd_m4_h1280(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, __gm__ float* v5) { + unsigned v6 = 23; + unsigned v7 = 22; + unsigned v8 = 21; + unsigned v9 = 20; + unsigned v10 = 19; + unsigned v11 = 18; + unsigned v12 = 17; + unsigned v13 = 16; + unsigned v14 = 15; + unsigned v15 = 14; + unsigned v16 = 13; + unsigned v17 = 12; + unsigned v18 = 11; + unsigned v19 = 10; + unsigned v20 = 9; + unsigned v21 = 8; + unsigned v22 = 7; + unsigned v23 = 6; + unsigned v24 = 5; + unsigned v25 = 4; + unsigned v26 = 3; + unsigned v27 = 2; + unsigned v28 = 1; + unsigned v29 = 0; + const int32_t v30 = 0; + const int32_t v31 = 5120; + const int32_t v32 = 1; + const int32_t v33 = 5; + const int32_t v34 = 1024; + const int64_t v35 = 0; + const int64_t v36 = 32768; + const int64_t v37 = 65536; + const int64_t v38 = 98304; + const int64_t v39 = 131072; + const int64_t v40 = 163840; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v41 = get_block_idx(); + int64_t v42 = get_block_num(); + int32_t v43 = (int32_t) ((int64_t) v42); + int32_t v44 = 
v33 / v43; + int32_t v45 = v33 % v43 != v30 && v33 < v30 == v43 < v30 ? v44 + v32 : v44; + int32_t v46 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v41) * (uint32_t) v45); + int32_t v47 = (int32_t) ((uint32_t) v46 + (uint32_t) v45); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + for (size_t v48 = (size_t) v46; v48 < ((size_t) ((uint32_t) v47 < (uint32_t) v33 ? v47 : v33)); v48 += (size_t) v32) { + int32_t v49 = (int32_t) ((uint32_t) ((int32_t) v48) * (uint32_t) v34); + Tile v50; + TASSIGN(v50, v35); + Tile v51; + __ubuf__ float* v52 = v50.data(); + uint64_t v53 = reinterpret_cast(v52); + TASSIGN(v51, v53); + Tile v54; + TASSIGN(v54, v36); + Tile v55; + __ubuf__ float* v56 = v54.data(); + uint64_t v57 = reinterpret_cast(v56); + TASSIGN(v55, v57); + pto::Shape<1, 1, 1, 1, 1024> v58 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v59 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v60 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v29 + v29 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v58, v59); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v51, v60); + pto::Shape<1, 1, 1, 1, 1024> v61 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v62 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v63 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v5 + (v29 + v29 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v61, v62); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v55, v63); + Tile v64; + TASSIGN(v64, v37); + Tile v65; + __ubuf__ float* v66 = v64.data(); + uint64_t v67 = reinterpret_cast(v66); + TASSIGN(v65, v67); + Tile v68; + TASSIGN(v68, v38); + Tile v69; + __ubuf__ float* v70 = 
v68.data(); + uint64_t v71 = reinterpret_cast(v70); + TASSIGN(v69, v71); + Tile v72; + TASSIGN(v72, v39); + Tile v73; + __ubuf__ float* v74 = v72.data(); + uint64_t v75 = reinterpret_cast(v74); + TASSIGN(v73, v75); + Tile v76; + TASSIGN(v76, v40); + Tile v77; + __ubuf__ float* v78 = v76.data(); + uint64_t v79 = reinterpret_cast(v78); + TASSIGN(v77, v79); + pto::Shape<1, 1, 1, 1, 1024> v80 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v81 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v82 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v29 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v80, v81); + TLOAD(v65, v82); + pto::Shape<1, 1, 1, 1, 1024> v83 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v84 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v85 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v29 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v83, v84); + TLOAD(v69, v85); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v86 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v87 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v88 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v29 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v86, v87); + TLOAD(v73, v88); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(v77, v69, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v73, v73, v77); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v88, v73); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + 
pipe_barrier(PIPE_ALL); + TMUL(v77, v69, v65); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v77); + Tile v89; + TASSIGN(v89, v37); + Tile v90; + __ubuf__ float* v91 = v89.data(); + uint64_t v92 = reinterpret_cast(v91); + TASSIGN(v90, v92); + Tile v93; + TASSIGN(v93, v38); + Tile v94; + __ubuf__ float* v95 = v93.data(); + uint64_t v96 = reinterpret_cast(v95); + TASSIGN(v94, v96); + Tile v97; + TASSIGN(v97, v39); + Tile v98; + __ubuf__ float* v99 = v97.data(); + uint64_t v100 = reinterpret_cast(v99); + TASSIGN(v98, v100); + Tile v101; + TASSIGN(v101, v40); + Tile v102; + __ubuf__ float* v103 = v101.data(); + uint64_t v104 = reinterpret_cast(v103); + TASSIGN(v102, v104); + pto::Shape<1, 1, 1, 1, 1024> v105 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v106 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v107 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v28 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v105, v106); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v90, v107); + pto::Shape<1, 1, 1, 1, 1024> v108 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v109 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v110 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v28 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v108, v109); + TLOAD(v94, v110); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + pto::Shape<1, 1, 1, 1, 1024> v111 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v112 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v113 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v28 * (unsigned) v31 + (unsigned) v49 * 
(unsigned) v32), v111, v112); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v98, v113); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v102, v94, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TADD(v98, v98, v102); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v113, v98); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + pipe_barrier(PIPE_ALL); + TMUL(v102, v94, v90); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v102); + Tile v114; + TASSIGN(v114, v37); + Tile v115; + __ubuf__ float* v116 = v114.data(); + uint64_t v117 = reinterpret_cast(v116); + TASSIGN(v115, v117); + Tile v118; + TASSIGN(v118, v38); + Tile v119; + __ubuf__ float* v120 = v118.data(); + uint64_t v121 = reinterpret_cast(v120); + TASSIGN(v119, v121); + Tile v122; + TASSIGN(v122, v39); + Tile v123; + __ubuf__ float* v124 = v122.data(); + uint64_t v125 = reinterpret_cast(v124); + TASSIGN(v123, v125); + Tile v126; + TASSIGN(v126, v40); + Tile v127; + __ubuf__ float* v128 = v126.data(); + uint64_t v129 = reinterpret_cast(v128); + TASSIGN(v127, v129); + pto::Shape<1, 1, 1, 1, 1024> v130 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v131 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v132 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v27 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v130, v131); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + TLOAD(v115, v132); + pto::Shape<1, 1, 1, 1, 1024> v133 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v134 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v135 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 
+ v27 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v133, v134); + TLOAD(v119, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + pto::Shape<1, 1, 1, 1, 1024> v136 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v137 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v138 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v27 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v136, v137); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + TLOAD(v123, v138); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TMUL(v127, v119, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + TADD(v123, v123, v127); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v138, v123); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TMUL(v127, v119, v115); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v127); + Tile v139; + TASSIGN(v139, v37); + Tile v140; + __ubuf__ float* v141 = v139.data(); + uint64_t v142 = reinterpret_cast(v141); + TASSIGN(v140, v142); + Tile v143; + TASSIGN(v143, v38); + Tile v144; + __ubuf__ float* v145 = v143.data(); + uint64_t v146 = reinterpret_cast(v145); + TASSIGN(v144, v146); + Tile v147; + TASSIGN(v147, v39); + Tile v148; + __ubuf__ float* v149 = v147.data(); + uint64_t v150 = reinterpret_cast(v149); + TASSIGN(v148, v150); + Tile v151; + TASSIGN(v151, v40); + Tile v152; + __ubuf__ float* v153 = v151.data(); + uint64_t v154 = reinterpret_cast(v153); + TASSIGN(v152, v154); + pto::Shape<1, 1, 1, 1, 1024> v155 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v156 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v157 = GlobalTensor, pto::Stride<5120, 
5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v26 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v155, v156); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v140, v157); + pto::Shape<1, 1, 1, 1, 1024> v158 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v159 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v160 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v26 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v158, v159); + TLOAD(v144, v160); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + pto::Shape<1, 1, 1, 1, 1024> v161 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v162 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v163 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v26 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v161, v162); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + TLOAD(v148, v163); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TMUL(v152, v144, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(v148, v148, v152); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TSTORE(v163, v148); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + pipe_barrier(PIPE_ALL); + TMUL(v152, v144, v140); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v152); + Tile v164; + TASSIGN(v164, v37); + Tile v165; + __ubuf__ float* v166 = v164.data(); + uint64_t v167 = reinterpret_cast(v166); + TASSIGN(v165, v167); + Tile v168; + TASSIGN(v168, v38); + Tile v169; + __ubuf__ float* v170 = v168.data(); + uint64_t v171 = reinterpret_cast(v170); + TASSIGN(v169, v171); + Tile v172; + TASSIGN(v172, v39); + Tile v173; + __ubuf__ float* 
v174 = v172.data(); + uint64_t v175 = reinterpret_cast(v174); + TASSIGN(v173, v175); + Tile v176; + TASSIGN(v176, v40); + Tile v177; + __ubuf__ float* v178 = v176.data(); + uint64_t v179 = reinterpret_cast(v178); + TASSIGN(v177, v179); + pto::Shape<1, 1, 1, 1, 1024> v180 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v181 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v182 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v25 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v180, v181); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + TLOAD(v165, v182); + pto::Shape<1, 1, 1, 1, 1024> v183 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v184 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v185 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v25 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v183, v184); + TLOAD(v169, v185); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v186 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v187 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v188 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v25 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v186, v187); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + TLOAD(v173, v188); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v177, v169, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v173, v173, v177); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TSTORE(v188, v173); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + 
pipe_barrier(PIPE_ALL); + TMUL(v177, v169, v165); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v177); + Tile v189; + TASSIGN(v189, v37); + Tile v190; + __ubuf__ float* v191 = v189.data(); + uint64_t v192 = reinterpret_cast(v191); + TASSIGN(v190, v192); + Tile v193; + TASSIGN(v193, v38); + Tile v194; + __ubuf__ float* v195 = v193.data(); + uint64_t v196 = reinterpret_cast(v195); + TASSIGN(v194, v196); + Tile v197; + TASSIGN(v197, v39); + Tile v198; + __ubuf__ float* v199 = v197.data(); + uint64_t v200 = reinterpret_cast(v199); + TASSIGN(v198, v200); + Tile v201; + TASSIGN(v201, v40); + Tile v202; + __ubuf__ float* v203 = v201.data(); + uint64_t v204 = reinterpret_cast(v203); + TASSIGN(v202, v204); + pto::Shape<1, 1, 1, 1, 1024> v205 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v206 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v207 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v24 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v205, v206); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v190, v207); + pto::Shape<1, 1, 1, 1, 1024> v208 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v209 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v210 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v24 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v208, v209); + TLOAD(v194, v210); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v211 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v212 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v213 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v24 * (unsigned) 
v31 + (unsigned) v49 * (unsigned) v32), v211, v212); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + TLOAD(v198, v213); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v202, v194, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v198, v198, v202); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + TSTORE(v213, v198); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + pipe_barrier(PIPE_ALL); + TMUL(v202, v194, v190); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v202); + Tile v214; + TASSIGN(v214, v37); + Tile v215; + __ubuf__ float* v216 = v214.data(); + uint64_t v217 = reinterpret_cast(v216); + TASSIGN(v215, v217); + Tile v218; + TASSIGN(v218, v38); + Tile v219; + __ubuf__ float* v220 = v218.data(); + uint64_t v221 = reinterpret_cast(v220); + TASSIGN(v219, v221); + Tile v222; + TASSIGN(v222, v39); + Tile v223; + __ubuf__ float* v224 = v222.data(); + uint64_t v225 = reinterpret_cast(v224); + TASSIGN(v223, v225); + Tile v226; + TASSIGN(v226, v40); + Tile v227; + __ubuf__ float* v228 = v226.data(); + uint64_t v229 = reinterpret_cast(v228); + TASSIGN(v227, v229); + pto::Shape<1, 1, 1, 1, 1024> v230 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v231 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v232 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v23 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v230, v231); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + TLOAD(v215, v232); + pto::Shape<1, 1, 1, 1, 1024> v233 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v234 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v235 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 
1>, pto::Layout::ND>(v3 + (v29 + v23 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v233, v234); + TLOAD(v219, v235); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v236 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v237 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v238 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v23 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v236, v237); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + TLOAD(v223, v238); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v227, v219, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v223, v223, v227); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + TSTORE(v238, v223); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v227, v219, v215); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v227); + Tile v239; + TASSIGN(v239, v37); + Tile v240; + __ubuf__ float* v241 = v239.data(); + uint64_t v242 = reinterpret_cast(v241); + TASSIGN(v240, v242); + Tile v243; + TASSIGN(v243, v38); + Tile v244; + __ubuf__ float* v245 = v243.data(); + uint64_t v246 = reinterpret_cast(v245); + TASSIGN(v244, v246); + Tile v247; + TASSIGN(v247, v39); + Tile v248; + __ubuf__ float* v249 = v247.data(); + uint64_t v250 = reinterpret_cast(v249); + TASSIGN(v248, v250); + Tile v251; + TASSIGN(v251, v40); + Tile v252; + __ubuf__ float* v253 = v251.data(); + uint64_t v254 = reinterpret_cast(v253); + TASSIGN(v252, v254); + pto::Shape<1, 1, 1, 1, 1024> v255 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v256 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v257 = 
GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v22 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v255, v256); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v240, v257); + pto::Shape<1, 1, 1, 1, 1024> v258 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v259 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v260 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v22 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v258, v259); + TLOAD(v244, v260); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v261 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v262 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v263 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v22 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v261, v262); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v248, v263); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v252, v244, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v248, v248, v252); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + TSTORE(v263, v248); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v252, v244, v240); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v252); + Tile v264; + TASSIGN(v264, v37); + Tile v265; + __ubuf__ float* v266 = v264.data(); + uint64_t v267 = reinterpret_cast(v266); + TASSIGN(v265, v267); + Tile v268; + TASSIGN(v268, v38); + Tile v269; + __ubuf__ float* v270 = v268.data(); + uint64_t v271 = reinterpret_cast(v270); + TASSIGN(v269, v271); + Tile v272; + TASSIGN(v272, v39); + 
Tile v273; + __ubuf__ float* v274 = v272.data(); + uint64_t v275 = reinterpret_cast(v274); + TASSIGN(v273, v275); + Tile v276; + TASSIGN(v276, v40); + Tile v277; + __ubuf__ float* v278 = v276.data(); + uint64_t v279 = reinterpret_cast(v278); + TASSIGN(v277, v279); + pto::Shape<1, 1, 1, 1, 1024> v280 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v281 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v282 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v21 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v280, v281); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v265, v282); + pto::Shape<1, 1, 1, 1, 1024> v283 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v284 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v285 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v21 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v283, v284); + TLOAD(v269, v285); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v286 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v287 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v288 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v21 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v286, v287); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v273, v288); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v277, v269, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v273, v273, v277); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v288, v273); + set_flag(PIPE_MTE3, 
PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v277, v269, v265); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v277); + Tile v289; + TASSIGN(v289, v37); + Tile v290; + __ubuf__ float* v291 = v289.data(); + uint64_t v292 = reinterpret_cast(v291); + TASSIGN(v290, v292); + Tile v293; + TASSIGN(v293, v38); + Tile v294; + __ubuf__ float* v295 = v293.data(); + uint64_t v296 = reinterpret_cast(v295); + TASSIGN(v294, v296); + Tile v297; + TASSIGN(v297, v39); + Tile v298; + __ubuf__ float* v299 = v297.data(); + uint64_t v300 = reinterpret_cast(v299); + TASSIGN(v298, v300); + Tile v301; + TASSIGN(v301, v40); + Tile v302; + __ubuf__ float* v303 = v301.data(); + uint64_t v304 = reinterpret_cast(v303); + TASSIGN(v302, v304); + pto::Shape<1, 1, 1, 1, 1024> v305 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v306 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v307 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v20 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v305, v306); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v290, v307); + pto::Shape<1, 1, 1, 1, 1024> v308 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v309 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v310 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v20 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v308, v309); + TLOAD(v294, v310); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v311 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v312 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v313 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + 
(v29 + v20 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v311, v312); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v298, v313); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v302, v294, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v298, v298, v302); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v313, v298); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v302, v294, v290); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v302); + Tile v314; + TASSIGN(v314, v37); + Tile v315; + __ubuf__ float* v316 = v314.data(); + uint64_t v317 = reinterpret_cast(v316); + TASSIGN(v315, v317); + Tile v318; + TASSIGN(v318, v38); + Tile v319; + __ubuf__ float* v320 = v318.data(); + uint64_t v321 = reinterpret_cast(v320); + TASSIGN(v319, v321); + Tile v322; + TASSIGN(v322, v39); + Tile v323; + __ubuf__ float* v324 = v322.data(); + uint64_t v325 = reinterpret_cast(v324); + TASSIGN(v323, v325); + Tile v326; + TASSIGN(v326, v40); + Tile v327; + __ubuf__ float* v328 = v326.data(); + uint64_t v329 = reinterpret_cast(v328); + TASSIGN(v327, v329); + pto::Shape<1, 1, 1, 1, 1024> v330 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v331 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v332 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v19 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v330, v331); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v315, v332); + pto::Shape<1, 1, 1, 1, 1024> v333 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v334 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v335 = GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v19 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v333, v334); + TLOAD(v319, v335); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v336 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v337 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v338 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v19 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v336, v337); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v323, v338); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v327, v319, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v323, v323, v327); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v338, v323); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v327, v319, v315); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v327); + Tile v339; + TASSIGN(v339, v37); + Tile v340; + __ubuf__ float* v341 = v339.data(); + uint64_t v342 = reinterpret_cast(v341); + TASSIGN(v340, v342); + Tile v343; + TASSIGN(v343, v38); + Tile v344; + __ubuf__ float* v345 = v343.data(); + uint64_t v346 = reinterpret_cast(v345); + TASSIGN(v344, v346); + Tile v347; + TASSIGN(v347, v39); + Tile v348; + __ubuf__ float* v349 = v347.data(); + uint64_t v350 = reinterpret_cast(v349); + TASSIGN(v348, v350); + Tile v351; + TASSIGN(v351, v40); + Tile v352; + __ubuf__ float* v353 = v351.data(); + uint64_t v354 = reinterpret_cast(v353); + TASSIGN(v352, v354); + pto::Shape<1, 1, 1, 1, 1024> v355 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v356 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 
5120, 1>, pto::Layout::ND> v357 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v18 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v355, v356); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v340, v357); + pto::Shape<1, 1, 1, 1, 1024> v358 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v359 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v360 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v18 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v358, v359); + TLOAD(v344, v360); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v361 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v362 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v363 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v18 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v361, v362); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v348, v363); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v352, v344, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v348, v348, v352); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v363, v348); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v352, v344, v340); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v352); + Tile v364; + TASSIGN(v364, v37); + Tile v365; + __ubuf__ float* v366 = v364.data(); + uint64_t v367 = reinterpret_cast(v366); + TASSIGN(v365, v367); + Tile v368; + TASSIGN(v368, v38); + Tile v369; + __ubuf__ float* v370 = v368.data(); + uint64_t v371 = reinterpret_cast(v370); + TASSIGN(v369, v371); + 
Tile v372; + TASSIGN(v372, v39); + Tile v373; + __ubuf__ float* v374 = v372.data(); + uint64_t v375 = reinterpret_cast(v374); + TASSIGN(v373, v375); + Tile v376; + TASSIGN(v376, v40); + Tile v377; + __ubuf__ float* v378 = v376.data(); + uint64_t v379 = reinterpret_cast(v378); + TASSIGN(v377, v379); + pto::Shape<1, 1, 1, 1, 1024> v380 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v381 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v382 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v17 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v380, v381); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v365, v382); + pto::Shape<1, 1, 1, 1, 1024> v383 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v384 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v385 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v17 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v383, v384); + TLOAD(v369, v385); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v386 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v387 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v388 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v17 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v386, v387); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v373, v388); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v377, v369, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v373, v373, v377); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + 
TSTORE(v388, v373); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v377, v369, v365); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v377); + Tile v389; + TASSIGN(v389, v37); + Tile v390; + __ubuf__ float* v391 = v389.data(); + uint64_t v392 = reinterpret_cast(v391); + TASSIGN(v390, v392); + Tile v393; + TASSIGN(v393, v38); + Tile v394; + __ubuf__ float* v395 = v393.data(); + uint64_t v396 = reinterpret_cast(v395); + TASSIGN(v394, v396); + Tile v397; + TASSIGN(v397, v39); + Tile v398; + __ubuf__ float* v399 = v397.data(); + uint64_t v400 = reinterpret_cast(v399); + TASSIGN(v398, v400); + Tile v401; + TASSIGN(v401, v40); + Tile v402; + __ubuf__ float* v403 = v401.data(); + uint64_t v404 = reinterpret_cast(v403); + TASSIGN(v402, v404); + pto::Shape<1, 1, 1, 1, 1024> v405 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v406 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v407 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v16 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v405, v406); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v390, v407); + pto::Shape<1, 1, 1, 1, 1024> v408 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v409 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v410 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v16 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v408, v409); + TLOAD(v394, v410); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v411 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v412 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v413 = GlobalTensor, pto::Stride<5120, 
5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v16 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v411, v412); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v398, v413); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v402, v394, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v398, v398, v402); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v413, v398); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v402, v394, v390); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v402); + Tile v414; + TASSIGN(v414, v37); + Tile v415; + __ubuf__ float* v416 = v414.data(); + uint64_t v417 = reinterpret_cast(v416); + TASSIGN(v415, v417); + Tile v418; + TASSIGN(v418, v38); + Tile v419; + __ubuf__ float* v420 = v418.data(); + uint64_t v421 = reinterpret_cast(v420); + TASSIGN(v419, v421); + Tile v422; + TASSIGN(v422, v39); + Tile v423; + __ubuf__ float* v424 = v422.data(); + uint64_t v425 = reinterpret_cast(v424); + TASSIGN(v423, v425); + Tile v426; + TASSIGN(v426, v40); + Tile v427; + __ubuf__ float* v428 = v426.data(); + uint64_t v429 = reinterpret_cast(v428); + TASSIGN(v427, v429); + pto::Shape<1, 1, 1, 1, 1024> v430 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v431 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v432 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v15 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v430, v431); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v415, v432); + pto::Shape<1, 1, 1, 1, 1024> v433 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v434 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, 
pto::Layout::ND> v435 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v15 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v433, v434); + TLOAD(v419, v435); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v436 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v437 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v438 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v15 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v436, v437); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v423, v438); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v427, v419, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v423, v423, v427); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v438, v423); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v427, v419, v415); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v427); + Tile v439; + TASSIGN(v439, v37); + Tile v440; + __ubuf__ float* v441 = v439.data(); + uint64_t v442 = reinterpret_cast(v441); + TASSIGN(v440, v442); + Tile v443; + TASSIGN(v443, v38); + Tile v444; + __ubuf__ float* v445 = v443.data(); + uint64_t v446 = reinterpret_cast(v445); + TASSIGN(v444, v446); + Tile v447; + TASSIGN(v447, v39); + Tile v448; + __ubuf__ float* v449 = v447.data(); + uint64_t v450 = reinterpret_cast(v449); + TASSIGN(v448, v450); + Tile v451; + TASSIGN(v451, v40); + Tile v452; + __ubuf__ float* v453 = v451.data(); + uint64_t v454 = reinterpret_cast(v453); + TASSIGN(v452, v454); + pto::Shape<1, 1, 1, 1, 1024> v455 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v456 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + 
GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v457 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v14 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v455, v456); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v440, v457); + pto::Shape<1, 1, 1, 1, 1024> v458 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v459 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v460 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v14 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v458, v459); + TLOAD(v444, v460); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v461 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v462 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v463 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v14 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v461, v462); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v448, v463); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v452, v444, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v448, v448, v452); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v463, v448); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v452, v444, v440); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v452); + Tile v464; + TASSIGN(v464, v37); + Tile v465; + __ubuf__ float* v466 = v464.data(); + uint64_t v467 = reinterpret_cast(v466); + TASSIGN(v465, v467); + Tile v468; + TASSIGN(v468, v38); + Tile v469; + __ubuf__ float* v470 = v468.data(); + uint64_t v471 = 
reinterpret_cast(v470); + TASSIGN(v469, v471); + Tile v472; + TASSIGN(v472, v39); + Tile v473; + __ubuf__ float* v474 = v472.data(); + uint64_t v475 = reinterpret_cast(v474); + TASSIGN(v473, v475); + Tile v476; + TASSIGN(v476, v40); + Tile v477; + __ubuf__ float* v478 = v476.data(); + uint64_t v479 = reinterpret_cast(v478); + TASSIGN(v477, v479); + pto::Shape<1, 1, 1, 1, 1024> v480 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v481 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v482 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v13 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v480, v481); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v465, v482); + pto::Shape<1, 1, 1, 1, 1024> v483 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v484 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v485 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v13 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v483, v484); + TLOAD(v469, v485); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v486 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v487 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v488 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v13 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v486, v487); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v473, v488); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v477, v469, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v473, v473, v477); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + 
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v488, v473); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v477, v469, v465); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v477); + Tile v489; + TASSIGN(v489, v37); + Tile v490; + __ubuf__ float* v491 = v489.data(); + uint64_t v492 = reinterpret_cast(v491); + TASSIGN(v490, v492); + Tile v493; + TASSIGN(v493, v38); + Tile v494; + __ubuf__ float* v495 = v493.data(); + uint64_t v496 = reinterpret_cast(v495); + TASSIGN(v494, v496); + Tile v497; + TASSIGN(v497, v39); + Tile v498; + __ubuf__ float* v499 = v497.data(); + uint64_t v500 = reinterpret_cast(v499); + TASSIGN(v498, v500); + Tile v501; + TASSIGN(v501, v40); + Tile v502; + __ubuf__ float* v503 = v501.data(); + uint64_t v504 = reinterpret_cast(v503); + TASSIGN(v502, v504); + pto::Shape<1, 1, 1, 1, 1024> v505 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v506 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v507 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v12 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v505, v506); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v490, v507); + pto::Shape<1, 1, 1, 1, 1024> v508 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v509 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v510 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v12 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v508, v509); + TLOAD(v494, v510); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v511 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v512 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, 
pto::Layout::ND> v513 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v12 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v511, v512); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v498, v513); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v502, v494, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v498, v498, v502); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v513, v498); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v502, v494, v490); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v502); + Tile v514; + TASSIGN(v514, v37); + Tile v515; + __ubuf__ float* v516 = v514.data(); + uint64_t v517 = reinterpret_cast(v516); + TASSIGN(v515, v517); + Tile v518; + TASSIGN(v518, v38); + Tile v519; + __ubuf__ float* v520 = v518.data(); + uint64_t v521 = reinterpret_cast(v520); + TASSIGN(v519, v521); + Tile v522; + TASSIGN(v522, v39); + Tile v523; + __ubuf__ float* v524 = v522.data(); + uint64_t v525 = reinterpret_cast(v524); + TASSIGN(v523, v525); + Tile v526; + TASSIGN(v526, v40); + Tile v527; + __ubuf__ float* v528 = v526.data(); + uint64_t v529 = reinterpret_cast(v528); + TASSIGN(v527, v529); + pto::Shape<1, 1, 1, 1, 1024> v530 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v531 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v532 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v11 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v530, v531); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v515, v532); + pto::Shape<1, 1, 1, 1, 1024> v533 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v534 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + 
GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v535 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v11 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v533, v534); + TLOAD(v519, v535); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v536 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v537 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v538 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v11 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v536, v537); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v523, v538); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v527, v519, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v523, v523, v527); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v538, v523); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v527, v519, v515); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v527); + Tile v539; + TASSIGN(v539, v37); + Tile v540; + __ubuf__ float* v541 = v539.data(); + uint64_t v542 = reinterpret_cast(v541); + TASSIGN(v540, v542); + Tile v543; + TASSIGN(v543, v38); + Tile v544; + __ubuf__ float* v545 = v543.data(); + uint64_t v546 = reinterpret_cast(v545); + TASSIGN(v544, v546); + Tile v547; + TASSIGN(v547, v39); + Tile v548; + __ubuf__ float* v549 = v547.data(); + uint64_t v550 = reinterpret_cast(v549); + TASSIGN(v548, v550); + Tile v551; + TASSIGN(v551, v40); + Tile v552; + __ubuf__ float* v553 = v551.data(); + uint64_t v554 = reinterpret_cast(v553); + TASSIGN(v552, v554); + pto::Shape<1, 1, 1, 1, 1024> v555 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> 
v556 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v557 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v10 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v555, v556); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v540, v557); + pto::Shape<1, 1, 1, 1, 1024> v558 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v559 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v560 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v10 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v558, v559); + TLOAD(v544, v560); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v561 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v562 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v563 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v10 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v561, v562); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v548, v563); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v552, v544, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v548, v548, v552); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v563, v548); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v552, v544, v540); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v552); + Tile v564; + TASSIGN(v564, v37); + Tile v565; + __ubuf__ float* v566 = v564.data(); + uint64_t v567 = reinterpret_cast(v566); + TASSIGN(v565, v567); + Tile v568; + TASSIGN(v568, v38); + Tile v569; + __ubuf__ 
float* v570 = v568.data(); + uint64_t v571 = reinterpret_cast(v570); + TASSIGN(v569, v571); + Tile v572; + TASSIGN(v572, v39); + Tile v573; + __ubuf__ float* v574 = v572.data(); + uint64_t v575 = reinterpret_cast(v574); + TASSIGN(v573, v575); + Tile v576; + TASSIGN(v576, v40); + Tile v577; + __ubuf__ float* v578 = v576.data(); + uint64_t v579 = reinterpret_cast(v578); + TASSIGN(v577, v579); + pto::Shape<1, 1, 1, 1, 1024> v580 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v581 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v582 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v9 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v580, v581); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v565, v582); + pto::Shape<1, 1, 1, 1, 1024> v583 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v584 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v585 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v9 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v583, v584); + TLOAD(v569, v585); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v586 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v587 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v588 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v9 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v586, v587); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v573, v588); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v577, v569, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v573, v573, v577); + 
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v588, v573); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v577, v569, v565); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v577); + Tile v589; + TASSIGN(v589, v37); + Tile v590; + __ubuf__ float* v591 = v589.data(); + uint64_t v592 = reinterpret_cast(v591); + TASSIGN(v590, v592); + Tile v593; + TASSIGN(v593, v38); + Tile v594; + __ubuf__ float* v595 = v593.data(); + uint64_t v596 = reinterpret_cast(v595); + TASSIGN(v594, v596); + Tile v597; + TASSIGN(v597, v39); + Tile v598; + __ubuf__ float* v599 = v597.data(); + uint64_t v600 = reinterpret_cast(v599); + TASSIGN(v598, v600); + Tile v601; + TASSIGN(v601, v40); + Tile v602; + __ubuf__ float* v603 = v601.data(); + uint64_t v604 = reinterpret_cast(v603); + TASSIGN(v602, v604); + pto::Shape<1, 1, 1, 1, 1024> v605 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v606 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v607 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v8 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v605, v606); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v590, v607); + pto::Shape<1, 1, 1, 1, 1024> v608 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v609 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v610 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v8 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v608, v609); + TLOAD(v594, v610); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v611 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v612 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v613 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v8 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v611, v612); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v598, v613); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v602, v594, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v598, v598, v602); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v613, v598); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v602, v594, v590); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v602); + Tile v614; + TASSIGN(v614, v37); + Tile v615; + __ubuf__ float* v616 = v614.data(); + uint64_t v617 = reinterpret_cast(v616); + TASSIGN(v615, v617); + Tile v618; + TASSIGN(v618, v38); + Tile v619; + __ubuf__ float* v620 = v618.data(); + uint64_t v621 = reinterpret_cast(v620); + TASSIGN(v619, v621); + Tile v622; + TASSIGN(v622, v39); + Tile v623; + __ubuf__ float* v624 = v622.data(); + uint64_t v625 = reinterpret_cast(v624); + TASSIGN(v623, v625); + Tile v626; + TASSIGN(v626, v40); + Tile v627; + __ubuf__ float* v628 = v626.data(); + uint64_t v629 = reinterpret_cast(v628); + TASSIGN(v627, v629); + pto::Shape<1, 1, 1, 1, 1024> v630 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v631 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v632 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v7 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v630, v631); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v615, v632); + pto::Shape<1, 1, 1, 1, 1024> v633 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v634 = 
pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v635 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v7 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v633, v634); + TLOAD(v619, v635); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v636 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v637 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v638 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v7 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v636, v637); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v623, v638); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v627, v619, v51); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v623, v623, v627); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v638, v623); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v627, v619, v615); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v627); + Tile v639; + TASSIGN(v639, v37); + Tile v640; + __ubuf__ float* v641 = v639.data(); + uint64_t v642 = reinterpret_cast(v641); + TASSIGN(v640, v642); + Tile v643; + TASSIGN(v643, v38); + Tile v644; + __ubuf__ float* v645 = v643.data(); + uint64_t v646 = reinterpret_cast(v645); + TASSIGN(v644, v646); + Tile v647; + TASSIGN(v647, v39); + Tile v648; + __ubuf__ float* v649 = v647.data(); + uint64_t v650 = reinterpret_cast(v649); + TASSIGN(v648, v650); + Tile v651; + TASSIGN(v651, v40); + Tile v652; + __ubuf__ float* v653 = v651.data(); + uint64_t v654 = reinterpret_cast(v653); + TASSIGN(v652, v654); + pto::Shape<1, 1, 1, 1, 1024> v655 = pto::Shape<1, 1, 1, 1, 1024>(); + 
pto::Stride<5120, 5120, 5120, 5120, 1> v656 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v657 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v29 + v6 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v655, v656); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v640, v657); + pto::Shape<1, 1, 1, 1, 1024> v658 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v659 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v660 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v29 + v6 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v658, v659); + TLOAD(v644, v660); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v661 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v662 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v663 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v4 + (v29 + v6 * (unsigned) v31 + (unsigned) v49 * (unsigned) v32), v661, v662); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v648, v663); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TMUL(v652, v644, v51); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v648, v648, v652); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v663, v648); + pipe_barrier(PIPE_ALL); + TMUL(v652, v644, v640); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v652); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v63, v55); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/fn_normw_merge_fwd.cpp b/kernels/manual/a5/mhc/fn_normw_merge_fwd.cpp new file mode 100644 index 000000000..b7f198d19 --- /dev/null +++ b/kernels/manual/a5/mhc/fn_normw_merge_fwd.cpp @@ -0,0 +1,259 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_fn_normw_merge_fwd_m4_h1280(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3) { + unsigned v4 = 4096; + unsigned v5 = 3072; + unsigned v6 = 2048; + unsigned v7 = 1024; + unsigned v8 = 0; + const int32_t v9 = 0; + const int32_t v10 = 24; + const int32_t v11 = 5120; + const int32_t v12 = 1; + const int64_t v13 = 0; + const int64_t v14 = 32768; + const int64_t v15 = 65536; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v16 = get_block_idx(); + int64_t v17 = get_block_num(); + int32_t v18 = (int32_t) ((int64_t) v17); + int32_t v19 = v10 / v18; + int32_t v20 = v10 % v18 != v9 && v10 < v9 == v18 < v9 ? 
v19 + v12 : v19; + int32_t v21 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v16) * (uint32_t) v20); + int32_t v22 = (int32_t) ((uint32_t) v21 + (uint32_t) v20); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + for (size_t v23 = (size_t) v21; v23 < ((size_t) ((uint32_t) v22 < (uint32_t) v10 ? v22 : v10)); v23 += (size_t) v12) { + int32_t v24 = (int32_t) v23; + Tile v25; + TASSIGN(v25, v13); + Tile v26; + __ubuf__ float* v27 = v25.data(); + uint64_t v28 = reinterpret_cast(v27); + TASSIGN(v26, v28); + Tile v29; + TASSIGN(v29, v14); + Tile v30; + __ubuf__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + Tile v33; + TASSIGN(v33, v15); + Tile v34; + __ubuf__ float* v35 = v33.data(); + uint64_t v36 = reinterpret_cast(v35); + TASSIGN(v34, v36); + pto::Shape<1, 1, 1, 1, 1024> v37 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v38 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v39 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v8 + (unsigned) v24 * (unsigned) v11 + v8 * (unsigned) v12), v37, v38); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v26, v39); + pto::Shape<1, 1, 1, 1, 1024> v40 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v41 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v42 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v11 + v8 * (unsigned) v12), v40, v41); + TLOAD(v30, v42); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMUL(v34, v26, v30); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 
1, 1, 1, 1024> v43 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v44 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v45 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v8 + (unsigned) v24 * (unsigned) v11 + v8 * (unsigned) v12), v43, v44); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v45, v34); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + Tile v46; + TASSIGN(v46, v13); + Tile v47; + __ubuf__ float* v48 = v46.data(); + uint64_t v49 = reinterpret_cast(v48); + TASSIGN(v47, v49); + Tile v50; + TASSIGN(v50, v14); + Tile v51; + __ubuf__ float* v52 = v50.data(); + uint64_t v53 = reinterpret_cast(v52); + TASSIGN(v51, v53); + Tile v54; + TASSIGN(v54, v15); + Tile v55; + __ubuf__ float* v56 = v54.data(); + uint64_t v57 = reinterpret_cast(v56); + TASSIGN(v55, v57); + pto::Shape<1, 1, 1, 1, 1024> v58 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v59 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v60 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v8 + (unsigned) v24 * (unsigned) v11 + v7 * (unsigned) v12), v58, v59); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v47, v60); + pto::Shape<1, 1, 1, 1, 1024> v61 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v62 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v63 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v11 + v7 * (unsigned) v12), v61, v62); + TLOAD(v51, v63); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + TMUL(v55, v47, v51); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_V, PIPE_MTE3, 
EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1024> v64 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v65 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v66 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v8 + (unsigned) v24 * (unsigned) v11 + v7 * (unsigned) v12), v64, v65); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v66, v55); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + Tile v67; + TASSIGN(v67, v13); + Tile v68; + __ubuf__ float* v69 = v67.data(); + uint64_t v70 = reinterpret_cast(v69); + TASSIGN(v68, v70); + Tile v71; + TASSIGN(v71, v14); + Tile v72; + __ubuf__ float* v73 = v71.data(); + uint64_t v74 = reinterpret_cast(v73); + TASSIGN(v72, v74); + Tile v75; + TASSIGN(v75, v15); + Tile v76; + __ubuf__ float* v77 = v75.data(); + uint64_t v78 = reinterpret_cast(v77); + TASSIGN(v76, v78); + pto::Shape<1, 1, 1, 1, 1024> v79 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v80 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v81 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v8 + (unsigned) v24 * (unsigned) v11 + v6 * (unsigned) v12), v79, v80); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + TLOAD(v68, v81); + pto::Shape<1, 1, 1, 1, 1024> v82 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v83 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v84 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v11 + v6 * (unsigned) v12), v82, v83); + TLOAD(v72, v84); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + TMUL(v76, v68, v72); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE3, 
EVENT_ID2); + pto::Shape<1, 1, 1, 1, 1024> v85 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v86 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v87 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v8 + (unsigned) v24 * (unsigned) v11 + v6 * (unsigned) v12), v85, v86); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v87, v76); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + Tile v88; + TASSIGN(v88, v13); + Tile v89; + __ubuf__ float* v90 = v88.data(); + uint64_t v91 = reinterpret_cast(v90); + TASSIGN(v89, v91); + Tile v92; + TASSIGN(v92, v14); + Tile v93; + __ubuf__ float* v94 = v92.data(); + uint64_t v95 = reinterpret_cast(v94); + TASSIGN(v93, v95); + Tile v96; + TASSIGN(v96, v15); + Tile v97; + __ubuf__ float* v98 = v96.data(); + uint64_t v99 = reinterpret_cast(v98); + TASSIGN(v97, v99); + pto::Shape<1, 1, 1, 1, 1024> v100 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v101 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v102 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v8 + (unsigned) v24 * (unsigned) v11 + v5 * (unsigned) v12), v100, v101); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v89, v102); + pto::Shape<1, 1, 1, 1, 1024> v103 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v104 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v105 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v11 + v5 * (unsigned) v12), v103, v104); + TLOAD(v93, v105); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + TMUL(v97, v89, v93); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + set_flag(PIPE_V, 
PIPE_MTE3, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1024> v106 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v107 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v108 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v8 + (unsigned) v24 * (unsigned) v11 + v5 * (unsigned) v12), v106, v107); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TSTORE(v108, v97); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID5); + Tile v109; + TASSIGN(v109, v13); + Tile v110; + __ubuf__ float* v111 = v109.data(); + uint64_t v112 = reinterpret_cast(v111); + TASSIGN(v110, v112); + Tile v113; + TASSIGN(v113, v14); + Tile v114; + __ubuf__ float* v115 = v113.data(); + uint64_t v116 = reinterpret_cast(v115); + TASSIGN(v114, v116); + Tile v117; + TASSIGN(v117, v15); + Tile v118; + __ubuf__ float* v119 = v117.data(); + uint64_t v120 = reinterpret_cast(v119); + TASSIGN(v118, v120); + pto::Shape<1, 1, 1, 1, 1024> v121 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v122 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v123 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v8 + (unsigned) v24 * (unsigned) v11 + v4 * (unsigned) v12), v121, v122); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + TLOAD(v110, v123); + pto::Shape<1, 1, 1, 1, 1024> v124 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v125 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v126 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v8 + v8 * (unsigned) v11 + v4 * (unsigned) v12), v124, v125); + TLOAD(v114, v126); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID5); + TMUL(v118, v110, v114); + 
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + pto::Shape<1, 1, 1, 1, 1024> v127 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v128 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v129 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v3 + (v8 + (unsigned) v24 * (unsigned) v11 + v4 * (unsigned) v12), v127, v128); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TSTORE(v129, v118); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/head_compute_mix_bwd.cpp b/kernels/manual/a5/mhc/head_compute_mix_bwd.cpp new file mode 100644 index 000000000..09ac8bcfd --- /dev/null +++ b/kernels/manual/a5/mhc/head_compute_mix_bwd.cpp @@ -0,0 +1,253 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_head_compute_mix_bwd_m4(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, __gm__ float* v5, __gm__ float* v6, __gm__ float* v7, int32_t v8) { + unsigned v9 = 0; + const int32_t v10 = 1; + const int32_t v11 = 0; + const int32_t v12 = 4; + const float v13 = 0.0f; + const float v14 = -1.0f; + const float v15 = 1.0f; + const 
int64_t v16 = 160; + const int64_t v17 = 192; + const int64_t v18 = 0; + const int64_t v19 = 32; + const int64_t v20 = 64; + const int64_t v21 = 224; + const int64_t v22 = 256; + const int64_t v23 = 288; + const int64_t v24 = 320; + const int64_t v25 = 352; + const int64_t v26 = 384; + const int64_t v27 = 416; + const int64_t v28 = 448; + const int64_t v29 = 480; + const int64_t v30 = 512; + const int64_t v31 = 96; + const int64_t v32 = 128; + const int64_t v33 = 544; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile v34; + TASSIGN(v34, v16); + Tile v35; + __ubuf__ float* v36 = v34.data(); + uint64_t v37 = reinterpret_cast(v36); + TASSIGN(v35, v37); + Tile v38; + TASSIGN(v38, v17); + Tile v39; + __ubuf__ float* v40 = v38.data(); + uint64_t v41 = reinterpret_cast(v40); + TASSIGN(v39, v41); + Tile v42; + TASSIGN(v42, v18); + Tile v43; + __ubuf__ float* v44 = v42.data(); + uint64_t v45 = reinterpret_cast(v44); + TASSIGN(v43, v45); + Tile v46; + TASSIGN(v46, v19); + Tile v47; + __ubuf__ float* v48 = v46.data(); + uint64_t v49 = reinterpret_cast(v48); + TASSIGN(v47, v49); + Tile v50; + TASSIGN(v50, v20); + Tile v51; + __ubuf__ float* v52 = v50.data(); + uint64_t v53 = reinterpret_cast(v52); + TASSIGN(v51, v53); + Tile v54; + TASSIGN(v54, v21); + Tile v55; + __ubuf__ float* v56 = v54.data(); + uint64_t v57 = reinterpret_cast(v56); + TASSIGN(v55, v57); + Tile v58; + TASSIGN(v58, v22); + Tile v59; + __ubuf__ float* v60 = v58.data(); + uint64_t v61 = reinterpret_cast(v60); + TASSIGN(v59, v61); + Tile v62; + TASSIGN(v62, v23); + Tile v63; + __ubuf__ float* v64 = v62.data(); + uint64_t v65 = reinterpret_cast(v64); + TASSIGN(v63, v65); + Tile v66; + TASSIGN(v66, v24); + Tile v67; + __ubuf__ float* v68 = v66.data(); + uint64_t v69 = reinterpret_cast(v68); + TASSIGN(v67, v69); + Tile v70; + TASSIGN(v70, v25); + Tile v71; + __ubuf__ float* v72 = v70.data(); + uint64_t v73 = reinterpret_cast(v72); + TASSIGN(v71, v73); + Tile v74; 
+ TASSIGN(v74, v26); + Tile v75; + __ubuf__ float* v76 = v74.data(); + uint64_t v77 = reinterpret_cast(v76); + TASSIGN(v75, v77); + Tile v78; + TASSIGN(v78, v27); + Tile v79; + __ubuf__ float* v80 = v78.data(); + uint64_t v81 = reinterpret_cast(v80); + TASSIGN(v79, v81); + Tile v82; + TASSIGN(v82, v28); + Tile v83; + __ubuf__ float* v84 = v82.data(); + uint64_t v85 = reinterpret_cast(v84); + TASSIGN(v83, v85); + Tile v86; + TASSIGN(v86, v29); + Tile v87; + __ubuf__ float* v88 = v86.data(); + uint64_t v89 = reinterpret_cast(v88); + TASSIGN(v87, v89); + Tile v90; + TASSIGN(v90, v30); + Tile v91; + __ubuf__ float* v92 = v90.data(); + uint64_t v93 = reinterpret_cast(v92); + TASSIGN(v91, v93); + Tile v94; + TASSIGN(v94, v31); + Tile v95; + __ubuf__ float* v96 = v94.data(); + uint64_t v97 = reinterpret_cast(v96); + TASSIGN(v95, v97); + Tile v98; + TASSIGN(v98, v32); + Tile v99; + __ubuf__ float* v100 = v98.data(); + uint64_t v101 = reinterpret_cast(v100); + TASSIGN(v99, v101); + Tile v102; + TASSIGN(v102, v33); + Tile v103; + __ubuf__ float* v104 = v102.data(); + uint64_t v105 = reinterpret_cast(v104); + TASSIGN(v103, v105); + pto::Shape<1, 1, 1, 1, 4> v106 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v107 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v108 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v9 + v9 * (unsigned) v12 + v9 * (unsigned) v10), v106, v107); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TLOAD(v43, v108); + pto::Shape<1, 1, 1, 1, 1> v109 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<1, 1, 1, 1, 1> v110 = pto::Stride<1, 1, 1, 1, 1>(); + GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, pto::Layout::ND> v111 = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, pto::Layout::ND>(v3 + (v9 + v9 * (unsigned) v10 + v9 * (unsigned) v10), v109, v110); + TLOAD(v47, v111); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v51, v47); + TMULS(v95, v47, v13); + TMULS(v99, v43, v13); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + for (size_t v112 = (size_t) v11; v112 < ((size_t) v8); v112 += (size_t) v10) { + int32_t v113 = (int32_t) v112; + pto::Shape<1, 1, 1, 1, 4> v114 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v115 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v116 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v2 + (v9 + (unsigned) v113 * (unsigned) v12 + v9 * (unsigned) v10), v114, v115); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v35, v116); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 4> v117 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v118 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v119 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v1 + (v9 + (unsigned) v113 * (unsigned) v12 + v9 * (unsigned) v10), v117, v118); + TLOAD(v39, v119); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v55, v35, v51); + pipe_barrier(PIPE_ALL); + TADD(v55, v55, v43); + pipe_barrier(PIPE_ALL); + TMULS(v59, v55, v14); + pipe_barrier(PIPE_ALL); + TEXP(v63, v59); + pipe_barrier(PIPE_ALL); + TADDS(v67, v63, v15); + pipe_barrier(PIPE_ALL); + TRECIP(v71, v67); + pipe_barrier(PIPE_ALL); + TMULS(v75, v71, v14); + pipe_barrier(PIPE_ALL); + TADDS(v75, v75, v15); + pipe_barrier(PIPE_ALL); + TMUL(v79, v71, v75); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TMUL(v79, v79, v39); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMUL(v83, v79, v51); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 4> v120 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v121 = pto::Stride<4, 4, 4, 4, 1>(); + 
GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v122 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v9 + (unsigned) v113 * (unsigned) v12 + v9 * (unsigned) v10), v120, v121); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v122, v83); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TADD(v99, v99, v79); + TMUL(v87, v79, v35); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TROWSUM(v91, v87, v103); + pipe_barrier(PIPE_ALL); + TADD(v95, v95, v91); + } + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v123 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<1, 1, 1, 1, 1> v124 = pto::Stride<1, 1, 1, 1, 1>(); + GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, pto::Layout::ND> v125 = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, pto::Layout::ND>(v6 + (v9 + v9 * (unsigned) v10 + v9 * (unsigned) v10), v123, v124); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v125, v95); + pto::Shape<1, 1, 1, 1, 4> v126 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v127 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v128 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v7 + (v9 + v9 * (unsigned) v12 + v9 * (unsigned) v10), v126, v127); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v128, v99); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/head_compute_mix_fwd.cpp b/kernels/manual/a5/mhc/head_compute_mix_fwd.cpp new file mode 100644 index 000000000..d671a0000 --- /dev/null +++ b/kernels/manual/a5/mhc/head_compute_mix_fwd.cpp @@ -0,0 +1,170 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 
= 1,
+};
+// Kernel-exit sync helper: kSetWaitMte3ToSEvent0 sets and then waits EVENT_ID0 from PIPE_MTE3 to PIPE_S; any other mode issues pipe_barrier(PIPE_ALL).
+static AICORE inline void ptoas_auto_sync_tail(
+    PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+  switch (mode) {
+  case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+    break;
+  case PTOAutoSyncTailMode::kBarrierAll:
+  default:
+    pipe_barrier(PIPE_ALL);
+    break;
+  }
+}
+// head_compute_mix forward (m4). For every row r handled by this block: load 4 floats from v1 + 4*r, multiply by the single value loaded from v2, add the 4 floats loaded from v3, then store 1/(1 + exp(-z)) + v12 to v4 + 4*r. v5 is the total row count. Op meanings are read off the TMUL/TADD/TMULS/TEXP/TADDS/TRECIP names - TODO confirm against the PTO ISA.
+__global__ AICORE void tilekernels_mhc_head_compute_mix_fwd_m4(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, int32_t v5) {
+  unsigned v6 = 0;  // zero term of the global-offset expressions below
+  const int32_t v7 = 0;  // zero used by the rounding test on v5 / v25
+  const int32_t v8 = 1;  // loop step and rounding increment
+  const int32_t v9 = 4;  // elements per row: row r starts at element r * v9
+  const float v10 = -1.0f;  // factor that negates z before TEXP
+  const float v11 = 1.0f;  // the "1 +" of the denominator
+  const float v12 = 9.99999997E-7f;  // ~1e-6, added to the final value
+  const int64_t v13 = 96;  // TASSIGN address of the input-row tile v30/v31 (presumably a byte offset in the __ubuf__ space - confirm)
+  const int64_t v14 = 0;  // address of v34/v35 (values loaded from v3)
+  const int64_t v15 = 32;  // address of v38/v39 (value loaded from v2)
+  const int64_t v16 = 64;  // address of v42/v43 (TROWEXPAND result)
+  const int64_t v17 = 128;  // address of v46/v47 (z)
+  const int64_t v18 = 160;  // address of v50/v51 (-z)
+  const int64_t v19 = 192;  // address of v54/v55 (exp(-z))
+  const int64_t v20 = 224;  // address of v58/v59 (1 + exp(-z))
+  const int64_t v21 = 256;  // address of v62/v63 (reciprocal)
+  const int64_t v22 = 288;  // address of v66/v67 (stored result)
+  using T = float;
+// Everything from here to the matching #endif is compiled only when __DAV_VEC__ is defined.
+  #if defined(__DAV_VEC__)
+  set_mask_norm();
+  set_vector_mask(-1, -1);
+  int64_t v23 = get_block_idx();  // value of get_block_idx(), used as this block's position in the row split
+  int64_t v24 = get_block_num();  // value of get_block_num(), used as the number of row chunks
+  int32_t v25 = (int32_t) ((int64_t) v24);
+  int32_t v26 = v5 / v25;
+  int32_t v27 = v5 % v25 != v7 && v5 < v7 == v25 < v7 ? v26 + v8 : v26;  // rows per block: v5 / v25, plus one when the remainder is non-zero and both operands have the same sign (round up)
+  int32_t v28 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v23) * (uint32_t) v27);  // first row of this block
+  int32_t v29 = (int32_t) ((uint32_t) v28 + (uint32_t) v27);  // one past this block's last row; clamped to v5 in the loop bound
+  Tile v30;  // NOTE(review): Tile / reinterpret_cast / GlobalTensor appear without template arguments in this text - confirm against the generator output
+  TASSIGN(v30, v13);
+  Tile v31;  // each tile pair below follows this pattern: the second tile is bound to the address returned by the first tile's data()
+  __ubuf__ float* v32 = v30.data();
+  uint64_t v33 = reinterpret_cast(v32);
+  TASSIGN(v31, v33);  // v31: input row, filled by TLOAD from v1 inside the loop
+  Tile v34;
+  TASSIGN(v34, v14);
+  Tile v35;
+  __ubuf__ float* v36 = v34.data();
+  uint64_t v37 = reinterpret_cast(v36);
+  TASSIGN(v35, v37);  // v35: the 4 values loaded once from v3
+  Tile v38;
+  TASSIGN(v38, v15);
+  Tile v39;
+  __ubuf__ float* v40 = v38.data();
+  uint64_t v41 = reinterpret_cast(v40);
+  TASSIGN(v39, v41);  // v39: the single value loaded once from v2
+  Tile v42;
+  TASSIGN(v42, v16);
+  Tile v43;
+  __ubuf__ float* v44 = v42.data();
+  uint64_t v45 = reinterpret_cast(v44);
+  TASSIGN(v43, v45);  // v43: TROWEXPAND(v39), the multiplier used by TMUL
+  Tile v46;
+  TASSIGN(v46, v17);
+  Tile v47;
+  __ubuf__ float* v48 = v46.data();
+  uint64_t v49 = reinterpret_cast(v48);
+  TASSIGN(v47, v49);  // v47: z = v31 * v43 + v35
+  Tile v50;
+  TASSIGN(v50, v18);
+  Tile v51;
+  __ubuf__ float* v52 = v50.data();
+  uint64_t v53 = reinterpret_cast(v52);
+  TASSIGN(v51, v53);  // v51: z * v10
+  Tile v54;
+  TASSIGN(v54, v19);
+  Tile v55;
+  __ubuf__ float* v56 = v54.data();
+  uint64_t v57 = reinterpret_cast(v56);
+  TASSIGN(v55, v57);  // v55: TEXP(v51)
+  Tile v58;
+  TASSIGN(v58, v20);
+  Tile v59;
+  __ubuf__ float* v60 = v58.data();
+  uint64_t v61 = reinterpret_cast(v60);
+  TASSIGN(v59, v61);  // v59: v55 + v11
+  Tile v62;
+  TASSIGN(v62, v21);
+  Tile v63;
+  __ubuf__ float* v64 = v62.data();
+  uint64_t v65 = reinterpret_cast(v64);
+  TASSIGN(v63, v65);  // v63: TRECIP(v59)
+  Tile v66;
+  TASSIGN(v66, v22);
+  Tile v67;
+  __ubuf__ float* v68 = v66.data();
+  uint64_t v69 = reinterpret_cast(v68);
+  TASSIGN(v67, v69);  // v67: v63 + v12, the tile written back by TSTORE
+  pto::Shape<1, 1, 1, 1, 4> v70 = pto::Shape<1, 1, 1, 1, 4>();
+  pto::Stride<4, 4, 4, 4, 1> v71 = pto::Stride<4, 4, 4, 4, 1>();
+  GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v72 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v6 + v6 * (unsigned) v9 + v6 * (unsigned) v8), v70, v71);  // 4-element view at v3 + 0
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);  // primed so the first in-loop wait before TLOAD(v31, ...) passes
+  set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);  // not waited on inside the loop; consumed by the wait after the loop
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);  // primed so the first in-loop wait before writing v67 passes
+  set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);  // not waited on inside the loop; consumed by the wait after the loop
+  TLOAD(v35, v72);
+  pto::Shape<1, 1, 1, 1, 1> v73 = pto::Shape<1, 1, 1, 1, 1>();
+  pto::Stride<1, 1, 1, 1, 1> v74 = pto::Stride<1, 1, 1, 1, 1>();
+  GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, pto::Layout::ND> v75 = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, pto::Layout::ND>(v2 + (v6 + v6 * (unsigned) v8 + v6 * (unsigned) v8), v73, v74);  // 1-element view at v2 + 0
+  TLOAD(v39, v75);
+  set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);  // set/wait pair placed between the two TLOADs above and the first vector op that reads them
+  wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
+  TROWEXPAND(v43, v39);  // presumably broadcasts the single v2 value across the row - confirm
+  for (size_t v76 = (size_t) v28; v76 < ((size_t) ((uint32_t) v29 < (uint32_t) v5 ? v29 : v5)); v76 += (size_t) v8) {  // rows [v28, min(v29, v5)), one row per iteration
+    int32_t v77 = (int32_t) v76;
+    pto::Shape<1, 1, 1, 1, 4> v78 = pto::Shape<1, 1, 1, 1, 4>();
+    pto::Stride<4, 4, 4, 4, 1> v79 = pto::Stride<4, 4, 4, 4, 1>();
+    GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v80 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v77 * (unsigned) v9 + v6 * (unsigned) v8), v78, v79);  // 4-element view of input row v77
+    wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);  // matches the set_flag issued after TMUL, the only reader of v31
+    TLOAD(v31, v80);
+    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);  // set/wait pair between the load of v31 and the vector ops that read it
+    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
+    pipe_barrier(PIPE_ALL);
+    TMUL(v47, v31, v43);  // z = input row * expanded v2 value
+    set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);  // v31 has been read; pairs with the wait before the next TLOAD(v31, ...)
+    pipe_barrier(PIPE_ALL);
+    TADD(v47, v47, v35);  // z += values loaded from v3
+    pipe_barrier(PIPE_ALL);
+    TMULS(v51, v47, v10);  // -z
+    pipe_barrier(PIPE_ALL);
+    TEXP(v55, v51);  // exp(-z)
+    pipe_barrier(PIPE_ALL);
+    TADDS(v59, v55, v11);  // 1 + exp(-z)
+    pipe_barrier(PIPE_ALL);
+    TRECIP(v63, v59);  // 1 / (1 + exp(-z))
+    pipe_barrier(PIPE_ALL);
+    wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);  // matches the set_flag issued after the previous TSTORE of v67
+    TADDS(v67, v63, v12);  // add ~1e-6
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);  // v67 is complete; pairs with the wait before TSTORE
+    pto::Shape<1, 1, 1, 1, 4> v81 = pto::Shape<1, 1, 1, 1, 4>();
+    pto::Stride<4, 4, 4, 4, 1> v82 = pto::Stride<4, 4, 4, 4, 1>();
+    GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v83 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v6 + (unsigned) v77 * (unsigned) v9 + v6 * (unsigned) v8), v81, v82);  // 4-element view of output row v77
+    wait_flag(PIPE_V, PIPE_MTE3,
EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v83, v67); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/main.cpp b/kernels/manual/a5/mhc/main.cpp new file mode 100644 index 000000000..012677215 --- /dev/null +++ b/kernels/manual/a5/mhc/main.cpp @@ -0,0 +1,89 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +CANN Open Software License Agreement Version 2.0 +*/ + +#include "acl/acl.h" +#include "test_common.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace PtoTestCommon; + +/* ---------- kernel launch declarations (from .so) ---------- */ +extern "C" void call_expand_fwd(uint32_t blockDim, void *stream, + uint8_t *x, uint8_t *out, int32_t tokens, int32_t hidden); + +/* ---------- bf16 helpers ---------- */ +static uint16_t f32_to_bf16(float v) { + uint32_t u; + memcpy(&u, &v, 4); + return (uint16_t)(u >> 16); +} +static float bf16_to_f32(uint16_t v) { + uint32_t u = (uint32_t)v << 16; + float f; + memcpy(&f, &u, 4); + return f; +} + +/* ---------- expand_to_mhc_fwd golden ---------- */ +static void golden_expand_fwd(const uint16_t *x, uint16_t *out, + int tokens, int hidden, int mhc) { + for (int t = 0; t < tokens; t++) + for (int m = 0; m < mhc; m++) + memcpy(out + (t * mhc + m) * hidden, x + t * hidden, hidden * sizeof(uint16_t)); +} + +int main(int argc, char **argv) { + const int tokens = 64, hidden = 1280, mhc = 4; + const int blockDim = 32; + + /* ACL init */ + aclInit(nullptr); + aclrtSetDevice(0); + void *stream = nullptr; + aclrtCreateStream(&stream); + + /* host data */ + const int x_elems = tokens * hidden; + const int out_elems = tokens * mhc * hidden; + std::vector h_x(x_elems), h_out(out_elems, 0), 
h_golden(out_elems); + + srand(42); + for (int i = 0; i < x_elems; i++) + h_x[i] = f32_to_bf16((float)(rand() % 1000 - 500) / 100.0f); + + golden_expand_fwd(h_x.data(), h_golden.data(), tokens, hidden, mhc); + + /* device memory */ + void *d_x = nullptr, *d_out = nullptr; + aclrtMalloc(&d_x, x_elems * 2, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc(&d_out, out_elems * 2, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMemcpy(d_x, x_elems * 2, h_x.data(), x_elems * 2, ACL_MEMCPY_HOST_TO_DEVICE); + + /* launch */ + call_expand_fwd(blockDim, stream, + (uint8_t *)d_x, (uint8_t *)d_out, tokens, hidden); + aclrtSynchronizeStream(stream); + + /* copy back */ + aclrtMemcpy(h_out.data(), out_elems * 2, d_out, out_elems * 2, ACL_MEMCPY_DEVICE_TO_HOST); + + /* verify */ + bool pass = (memcmp(h_out.data(), h_golden.data(), out_elems * 2) == 0); + printf("expand_to_mhc_fwd: %s\n", pass ? "PASSED" : "FAILED"); + + /* cleanup */ + aclrtFree(d_x); + aclrtFree(d_out); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + return pass ? 
0 : 1; +} diff --git a/kernels/manual/a5/mhc/post_bwd.cpp b/kernels/manual/a5/mhc/post_bwd.cpp new file mode 100644 index 000000000..358418dea --- /dev/null +++ b/kernels/manual/a5/mhc/post_bwd.cpp @@ -0,0 +1,2740 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_post_bwd_m4(__gm__ bfloat16_t* v1, __gm__ float* v2, __gm__ bfloat16_t* v3, __gm__ float* v4, __gm__ bfloat16_t* v5, __gm__ float* v6, __gm__ bfloat16_t* v7, __gm__ float* v8, __gm__ bfloat16_t* v9, int32_t v10, int32_t v11) { + unsigned v12 = 15; + unsigned v13 = 14; + unsigned v14 = 13; + unsigned v15 = 12; + unsigned v16 = 11; + unsigned v17 = 10; + unsigned v18 = 9; + unsigned v19 = 8; + unsigned v20 = 7; + unsigned v21 = 6; + unsigned v22 = 5; + unsigned v23 = 4; + unsigned v24 = 3; + unsigned v25 = 2; + unsigned v26 = 1; + RoundMode v27 = RoundMode::CAST_RINT; + unsigned v28 = 0; + const int32_t v29 = 16; + const int32_t v30 = 4; + const int32_t v31 = 1024; + const int32_t v32 = 1; + const int32_t v33 = 0; + const int32_t v34 = 2; + const int32_t v35 = 3; + const int64_t v36 = 0; + const int64_t v37 = 256; + const int64_t v38 = 33024; + const int64_t v39 = 16640; + const int64_t v40 = 65792; + const int64_t v41 = 114944; + const int64_t v42 = 16384; + const int64_t v43 = 49152; + const int64_t v44 = 49408; + const int64_t v45 = 82176; + using T = float; + size_t v46 = (size_t) v33; + size_t v47 = (size_t) v32; + int32_t v48 = (int32_t) ((uint32_t) v10 * (uint32_t) v30); + + #if 
defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v49 = get_block_idx(); + int64_t v50 = get_block_num(); + int32_t v51 = (int32_t) ((int64_t) v50); + int32_t v52 = v10 / v51; + int32_t v53 = v10 % v51 != v33 && v10 < v33 == v51 < v33 ? v52 + v32 : v52; + int32_t v54 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v49) * (uint32_t) v53); + int32_t v55 = (int32_t) ((uint32_t) v54 + (uint32_t) v53); + int32_t v56 = v11 / v31; + size_t v57 = (size_t) (v11 % v31 != v33 && v11 < v33 == v31 < v33 ? v56 + v32 : v56); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + for (size_t v58 = (size_t) v54; v58 < ((size_t) ((uint32_t) v55 < (uint32_t) v10 ? v55 : v10)); v58 += v47) { + int32_t v59 = (int32_t) v58; + int32_t v60 = (int32_t) ((uint32_t) v59 * (uint32_t) v30); + Tile v61; + TASSIGN(v61, v36); + Tile v62; + __ubuf__ float* v63 = v61.data(); + uint64_t v64 = reinterpret_cast(v63); + TASSIGN(v62, v64); + pto::Shape<1, 1, 1, 1, 1> v65 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v66 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v67 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v28 + (unsigned) v59 * (unsigned) v30 + v28 * (unsigned) v32), v65, v66); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + TLOAD(v62, v67); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v62, v62, v62); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v68 = v46; 
v68 < v57; v68 += v47) { + int32_t v69 = (int32_t) ((uint32_t) ((int32_t) v68) * (uint32_t) v31); + int32_t v70 = (int32_t) ((uint32_t) v11 - (uint32_t) v69); + int32_t v71 = (uint32_t) v70 < (uint32_t) v31 ? v70 : v31; + Tile v72; + TASSIGN(v72, v37); + Tile v73 = Tile(v71); + __ubuf__ bfloat16_t* v74 = v72.data(); + uint64_t v75 = reinterpret_cast(v74); + TASSIGN(v73, v75); + Tile v76; + TASSIGN(v76, v38); + Tile v77 = Tile(v71); + __ubuf__ float* v78 = v76.data(); + uint64_t v79 = reinterpret_cast(v78); + TASSIGN(v77, v79); + Tile v80; + TASSIGN(v80, v39); + Tile v81 = Tile(v71); + __ubuf__ bfloat16_t* v82 = v80.data(); + uint64_t v83 = reinterpret_cast(v82); + TASSIGN(v81, v83); + Tile v84; + TASSIGN(v84, v40); + Tile v85 = Tile(v71); + __ubuf__ float* v86 = v84.data(); + uint64_t v87 = reinterpret_cast(v86); + TASSIGN(v85, v87); + Tile v88; + TASSIGN(v88, v37); + Tile v89 = Tile(v71); + __ubuf__ float* v90 = v88.data(); + uint64_t v91 = reinterpret_cast(v90); + TASSIGN(v89, v91); + Tile v92; + TASSIGN(v92, v38); + Tile v93 = Tile(v71); + __ubuf__ float* v94 = v92.data(); + uint64_t v95 = reinterpret_cast(v94); + TASSIGN(v93, v95); + Tile v96; + TASSIGN(v96, v40); + Tile v97; + __ubuf__ float* v98 = v96.data(); + uint64_t v99 = reinterpret_cast(v98); + TASSIGN(v97, v99); + unsigned v100 = (unsigned) v71; + unsigned v101 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v102 = pto::Shape<1, 1, 1, 1, -1>(v71); + pto::Stride<-1, -1, -1, -1, 1> v103 = pto::Stride<-1, -1, -1, -1, 1>(v101, v101, v101, v101); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v104 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v69 * (unsigned) v32), v102, v103); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v73, v104); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + unsigned v105 = (unsigned) v71; + unsigned v106 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v107 = pto::Shape<1, 1, 1, 1, -1>(v71); 
+ pto::Stride<-1, -1, -1, -1, 1> v108 = pto::Stride<-1, -1, -1, -1, 1>(v106, v106, v106, v106); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v109 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v5 + (v28 + (unsigned) v59 * (unsigned) v11 + (unsigned) v69 * (unsigned) v32), v107, v108); + TLOAD(v81, v109); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TCVT(v77, v73, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TCVT(v85, v81, v27); + pipe_barrier(PIPE_ALL); + TMUL(v89, v77, v85); + pipe_barrier(PIPE_ALL); + TROWSUM(v97, v89, v93); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v62, v62, v97); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v110 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v111 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v112 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v8 + (v28 + (unsigned) v59 * (unsigned) v30 + v28 * (unsigned) v32), v110, v111); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v112, v62); + int32_t v113 = (int32_t) ((uint32_t) v60 + (uint32_t) v32); + Tile v114; + TASSIGN(v114, v36); + Tile v115; + __ubuf__ float* v116 = v114.data(); + uint64_t v117 = reinterpret_cast(v116); + TASSIGN(v115, v117); + pto::Shape<1, 1, 1, 1, 1> v118 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v119 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v120 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v28 + (unsigned) v59 * (unsigned) v30 + v26 * (unsigned) v32), v118, v119); + TLOAD(v115, v120); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TSUB(v115, v115, v115); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + 
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + for (size_t v121 = v46; v121 < v57; v121 += v47) { + int32_t v122 = (int32_t) ((uint32_t) ((int32_t) v121) * (uint32_t) v31); + int32_t v123 = (int32_t) ((uint32_t) v11 - (uint32_t) v122); + int32_t v124 = (uint32_t) v123 < (uint32_t) v31 ? v123 : v31; + Tile v125; + TASSIGN(v125, v37); + Tile v126 = Tile(v124); + __ubuf__ bfloat16_t* v127 = v125.data(); + uint64_t v128 = reinterpret_cast(v127); + TASSIGN(v126, v128); + Tile v129; + TASSIGN(v129, v38); + Tile v130 = Tile(v124); + __ubuf__ float* v131 = v129.data(); + uint64_t v132 = reinterpret_cast(v131); + TASSIGN(v130, v132); + Tile v133; + TASSIGN(v133, v39); + Tile v134 = Tile(v124); + __ubuf__ bfloat16_t* v135 = v133.data(); + uint64_t v136 = reinterpret_cast(v135); + TASSIGN(v134, v136); + Tile v137; + TASSIGN(v137, v40); + Tile v138 = Tile(v124); + __ubuf__ float* v139 = v137.data(); + uint64_t v140 = reinterpret_cast(v139); + TASSIGN(v138, v140); + Tile v141; + TASSIGN(v141, v37); + Tile v142 = Tile(v124); + __ubuf__ float* v143 = v141.data(); + uint64_t v144 = reinterpret_cast(v143); + TASSIGN(v142, v144); + Tile v145; + TASSIGN(v145, v38); + Tile v146 = Tile(v124); + __ubuf__ float* v147 = v145.data(); + uint64_t v148 = reinterpret_cast(v147); + TASSIGN(v146, v148); + Tile v149; + TASSIGN(v149, v40); + Tile v150; + __ubuf__ float* v151 = v149.data(); + uint64_t v152 = reinterpret_cast(v151); + TASSIGN(v150, v152); + unsigned v153 = (unsigned) v124; + unsigned v154 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v155 = pto::Shape<1, 1, 1, 1, -1>(v124); + pto::Stride<-1, -1, -1, -1, 1> v156 = pto::Stride<-1, -1, -1, -1, 1>(v154, v154, v154, v154); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v157 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v122 * (unsigned) v32), v155, v156); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v126, v157); + set_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID4); + unsigned v158 = (unsigned) v124; + unsigned v159 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v160 = pto::Shape<1, 1, 1, 1, -1>(v124); + pto::Stride<-1, -1, -1, -1, 1> v161 = pto::Stride<-1, -1, -1, -1, 1>(v159, v159, v159, v159); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v162 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v5 + (v28 + (unsigned) v59 * (unsigned) v11 + (unsigned) v122 * (unsigned) v32), v160, v161); + TLOAD(v134, v162); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + TCVT(v130, v126, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + pipe_barrier(PIPE_ALL); + TCVT(v138, v134, v27); + pipe_barrier(PIPE_ALL); + TMUL(v142, v130, v138); + pipe_barrier(PIPE_ALL); + TROWSUM(v150, v142, v146); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v115, v115, v150); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1> v163 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v164 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v165 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v8 + (v28 + (unsigned) v59 * (unsigned) v30 + v26 * (unsigned) v32), v163, v164); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v165, v115); + int32_t v166 = (int32_t) ((uint32_t) v60 + (uint32_t) v34); + Tile v167; + TASSIGN(v167, v36); + Tile v168; + __ubuf__ float* v169 = v167.data(); + uint64_t v170 = reinterpret_cast(v169); + TASSIGN(v168, v170); + pto::Shape<1, 1, 1, 1, 1> v171 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v172 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v173 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v28 + (unsigned) v59 * (unsigned) v30 + v25 * (unsigned) v32), v171, v172); + 
TLOAD(v168, v173); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + TSUB(v168, v168, v168); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + for (size_t v174 = v46; v174 < v57; v174 += v47) { + int32_t v175 = (int32_t) ((uint32_t) ((int32_t) v174) * (uint32_t) v31); + int32_t v176 = (int32_t) ((uint32_t) v11 - (uint32_t) v175); + int32_t v177 = (uint32_t) v176 < (uint32_t) v31 ? v176 : v31; + Tile v178; + TASSIGN(v178, v37); + Tile v179 = Tile(v177); + __ubuf__ bfloat16_t* v180 = v178.data(); + uint64_t v181 = reinterpret_cast(v180); + TASSIGN(v179, v181); + Tile v182; + TASSIGN(v182, v38); + Tile v183 = Tile(v177); + __ubuf__ float* v184 = v182.data(); + uint64_t v185 = reinterpret_cast(v184); + TASSIGN(v183, v185); + Tile v186; + TASSIGN(v186, v39); + Tile v187 = Tile(v177); + __ubuf__ bfloat16_t* v188 = v186.data(); + uint64_t v189 = reinterpret_cast(v188); + TASSIGN(v187, v189); + Tile v190; + TASSIGN(v190, v40); + Tile v191 = Tile(v177); + __ubuf__ float* v192 = v190.data(); + uint64_t v193 = reinterpret_cast(v192); + TASSIGN(v191, v193); + Tile v194; + TASSIGN(v194, v37); + Tile v195 = Tile(v177); + __ubuf__ float* v196 = v194.data(); + uint64_t v197 = reinterpret_cast(v196); + TASSIGN(v195, v197); + Tile v198; + TASSIGN(v198, v38); + Tile v199 = Tile(v177); + __ubuf__ float* v200 = v198.data(); + uint64_t v201 = reinterpret_cast(v200); + TASSIGN(v199, v201); + Tile v202; + TASSIGN(v202, v40); + Tile v203; + __ubuf__ float* v204 = v202.data(); + uint64_t v205 = reinterpret_cast(v204); + TASSIGN(v203, v205); + unsigned v206 = (unsigned) v177; + unsigned v207 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v208 = pto::Shape<1, 1, 1, 1, -1>(v177); + pto::Stride<-1, -1, -1, -1, 1> v209 = pto::Stride<-1, -1, -1, -1, 1>(v207, v207, v207, v207); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v210 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + 
(unsigned) v166 * (unsigned) v11 + (unsigned) v175 * (unsigned) v32), v208, v209); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v179, v210); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v211 = (unsigned) v177; + unsigned v212 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v213 = pto::Shape<1, 1, 1, 1, -1>(v177); + pto::Stride<-1, -1, -1, -1, 1> v214 = pto::Stride<-1, -1, -1, -1, 1>(v212, v212, v212, v212); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v215 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v5 + (v28 + (unsigned) v59 * (unsigned) v11 + (unsigned) v175 * (unsigned) v32), v213, v214); + TLOAD(v187, v215); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v183, v179, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v191, v187, v27); + pipe_barrier(PIPE_ALL); + TMUL(v195, v183, v191); + pipe_barrier(PIPE_ALL); + TROWSUM(v203, v195, v199); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TADD(v168, v168, v203); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + pto::Shape<1, 1, 1, 1, 1> v216 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v217 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v218 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v8 + (v28 + (unsigned) v59 * (unsigned) v30 + v25 * (unsigned) v32), v216, v217); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v218, v168); + int32_t v219 = (int32_t) ((uint32_t) v60 + (uint32_t) v35); + Tile v220; + TASSIGN(v220, v36); + Tile v221; + __ubuf__ float* v222 = v220.data(); + uint64_t v223 = reinterpret_cast(v222); + TASSIGN(v221, v223); + pto::Shape<1, 1, 1, 1, 1> v224 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v225 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 
1>, pto::Layout::ND> v226 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v28 + (unsigned) v59 * (unsigned) v30 + v24 * (unsigned) v32), v224, v225); + TLOAD(v221, v226); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v221, v221, v221); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + for (size_t v227 = v46; v227 < v57; v227 += v47) { + int32_t v228 = (int32_t) ((uint32_t) ((int32_t) v227) * (uint32_t) v31); + int32_t v229 = (int32_t) ((uint32_t) v11 - (uint32_t) v228); + int32_t v230 = (uint32_t) v229 < (uint32_t) v31 ? v229 : v31; + Tile v231; + TASSIGN(v231, v37); + Tile v232 = Tile(v230); + __ubuf__ bfloat16_t* v233 = v231.data(); + uint64_t v234 = reinterpret_cast(v233); + TASSIGN(v232, v234); + Tile v235; + TASSIGN(v235, v38); + Tile v236 = Tile(v230); + __ubuf__ float* v237 = v235.data(); + uint64_t v238 = reinterpret_cast(v237); + TASSIGN(v236, v238); + Tile v239; + TASSIGN(v239, v39); + Tile v240 = Tile(v230); + __ubuf__ bfloat16_t* v241 = v239.data(); + uint64_t v242 = reinterpret_cast(v241); + TASSIGN(v240, v242); + Tile v243; + TASSIGN(v243, v40); + Tile v244 = Tile(v230); + __ubuf__ float* v245 = v243.data(); + uint64_t v246 = reinterpret_cast(v245); + TASSIGN(v244, v246); + Tile v247; + TASSIGN(v247, v37); + Tile v248 = Tile(v230); + __ubuf__ float* v249 = v247.data(); + uint64_t v250 = reinterpret_cast(v249); + TASSIGN(v248, v250); + Tile v251; + TASSIGN(v251, v38); + Tile v252 = Tile(v230); + __ubuf__ float* v253 = v251.data(); + uint64_t v254 = reinterpret_cast(v253); + TASSIGN(v252, v254); + Tile v255; + TASSIGN(v255, v40); + Tile v256; + __ubuf__ float* v257 = v255.data(); + uint64_t v258 = reinterpret_cast(v257); + TASSIGN(v256, v258); + unsigned v259 = (unsigned) v230; + unsigned v260 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v261 = pto::Shape<1, 1, 1, 1, -1>(v230); + pto::Stride<-1, -1, -1, -1, 1> v262 = pto::Stride<-1, -1, -1, -1, 
1>(v260, v260, v260, v260); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v263 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v228 * (unsigned) v32), v261, v262); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v232, v263); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v264 = (unsigned) v230; + unsigned v265 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v266 = pto::Shape<1, 1, 1, 1, -1>(v230); + pto::Stride<-1, -1, -1, -1, 1> v267 = pto::Stride<-1, -1, -1, -1, 1>(v265, v265, v265, v265); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v268 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v5 + (v28 + (unsigned) v59 * (unsigned) v11 + (unsigned) v228 * (unsigned) v32), v266, v267); + TLOAD(v240, v268); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v236, v232, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v244, v240, v27); + pipe_barrier(PIPE_ALL); + TMUL(v248, v236, v244); + pipe_barrier(PIPE_ALL); + TROWSUM(v256, v248, v252); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v221, v221, v256); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + pto::Shape<1, 1, 1, 1, 1> v269 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v270 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v271 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v8 + (v28 + (unsigned) v59 * (unsigned) v30 + v24 * (unsigned) v32), v269, v270); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TSTORE(v271, v221); + Tile v272; + TASSIGN(v272, v36); + Tile v273; + __ubuf__ float* v274 = v272.data(); + uint64_t v275 = reinterpret_cast(v274); + TASSIGN(v273, v275); + pto::Shape<1, 1, 1, 1, 1> v276 = pto::Shape<1, 1, 1, 1, 
1>(); + pto::Stride<16, 16, 16, 16, 1> v277 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v278 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v28 * (unsigned) v32), v276, v277); + TLOAD(v273, v278); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v273, v273, v273); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v279 = v46; v279 < v57; v279 += v47) { + int32_t v280 = (int32_t) ((uint32_t) ((int32_t) v279) * (uint32_t) v31); + int32_t v281 = (int32_t) ((uint32_t) v11 - (uint32_t) v280); + int32_t v282 = (uint32_t) v281 < (uint32_t) v31 ? v281 : v31; + Tile v283; + TASSIGN(v283, v37); + Tile v284 = Tile(v282); + __ubuf__ bfloat16_t* v285 = v283.data(); + uint64_t v286 = reinterpret_cast(v285); + TASSIGN(v284, v286); + Tile v287; + TASSIGN(v287, v38); + Tile v288 = Tile(v282); + __ubuf__ float* v289 = v287.data(); + uint64_t v290 = reinterpret_cast(v289); + TASSIGN(v288, v290); + Tile v291; + TASSIGN(v291, v39); + Tile v292 = Tile(v282); + __ubuf__ bfloat16_t* v293 = v291.data(); + uint64_t v294 = reinterpret_cast(v293); + TASSIGN(v292, v294); + Tile v295; + TASSIGN(v295, v40); + Tile v296 = Tile(v282); + __ubuf__ float* v297 = v295.data(); + uint64_t v298 = reinterpret_cast(v297); + TASSIGN(v296, v298); + Tile v299; + TASSIGN(v299, v37); + Tile v300 = Tile(v282); + __ubuf__ float* v301 = v299.data(); + uint64_t v302 = reinterpret_cast(v301); + TASSIGN(v300, v302); + Tile v303; + TASSIGN(v303, v38); + Tile v304 = Tile(v282); + __ubuf__ float* v305 = v303.data(); + uint64_t v306 = reinterpret_cast(v305); + TASSIGN(v304, v306); + Tile v307; + TASSIGN(v307, v40); + Tile v308; + __ubuf__ float* v309 = v307.data(); + uint64_t v310 = reinterpret_cast(v309); + TASSIGN(v308, v310); + unsigned v311 = (unsigned) v282; + unsigned v312 = (unsigned) v11; + pto::Shape<1, 
1, 1, 1, -1> v313 = pto::Shape<1, 1, 1, 1, -1>(v282); + pto::Stride<-1, -1, -1, -1, 1> v314 = pto::Stride<-1, -1, -1, -1, 1>(v312, v312, v312, v312); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v315 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v280 * (unsigned) v32), v313, v314); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v284, v315); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v316 = (unsigned) v282; + unsigned v317 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v318 = pto::Shape<1, 1, 1, 1, -1>(v282); + pto::Stride<-1, -1, -1, -1, 1> v319 = pto::Stride<-1, -1, -1, -1, 1>(v317, v317, v317, v317); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v320 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v280 * (unsigned) v32), v318, v319); + TLOAD(v292, v320); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v288, v284, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v296, v292, v27); + pipe_barrier(PIPE_ALL); + TMUL(v300, v288, v296); + pipe_barrier(PIPE_ALL); + TROWSUM(v308, v300, v304); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v273, v273, v308); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v321 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v322 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v323 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v28 * (unsigned) v32), v321, v322); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TSTORE(v323, v273); + Tile v324; + TASSIGN(v324, v36); + Tile v325; + __ubuf__ float* v326 
= v324.data(); + uint64_t v327 = reinterpret_cast(v326); + TASSIGN(v325, v327); + pto::Shape<1, 1, 1, 1, 1> v328 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v329 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v330 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v26 * (unsigned) v32), v328, v329); + TLOAD(v325, v330); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v325, v325, v325); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v331 = v46; v331 < v57; v331 += v47) { + int32_t v332 = (int32_t) ((uint32_t) ((int32_t) v331) * (uint32_t) v31); + int32_t v333 = (int32_t) ((uint32_t) v11 - (uint32_t) v332); + int32_t v334 = (uint32_t) v333 < (uint32_t) v31 ? v333 : v31; + Tile v335; + TASSIGN(v335, v37); + Tile v336 = Tile(v334); + __ubuf__ bfloat16_t* v337 = v335.data(); + uint64_t v338 = reinterpret_cast(v337); + TASSIGN(v336, v338); + Tile v339; + TASSIGN(v339, v38); + Tile v340 = Tile(v334); + __ubuf__ float* v341 = v339.data(); + uint64_t v342 = reinterpret_cast(v341); + TASSIGN(v340, v342); + Tile v343; + TASSIGN(v343, v39); + Tile v344 = Tile(v334); + __ubuf__ bfloat16_t* v345 = v343.data(); + uint64_t v346 = reinterpret_cast(v345); + TASSIGN(v344, v346); + Tile v347; + TASSIGN(v347, v40); + Tile v348 = Tile(v334); + __ubuf__ float* v349 = v347.data(); + uint64_t v350 = reinterpret_cast(v349); + TASSIGN(v348, v350); + Tile v351; + TASSIGN(v351, v37); + Tile v352 = Tile(v334); + __ubuf__ float* v353 = v351.data(); + uint64_t v354 = reinterpret_cast(v353); + TASSIGN(v352, v354); + Tile v355; + TASSIGN(v355, v38); + Tile v356 = Tile(v334); + __ubuf__ float* v357 = v355.data(); + uint64_t v358 = reinterpret_cast(v357); + TASSIGN(v356, v358); + Tile v359; + TASSIGN(v359, v40); + Tile v360; + __ubuf__ float* v361 = v359.data(); + uint64_t 
v362 = reinterpret_cast(v361); + TASSIGN(v360, v362); + unsigned v363 = (unsigned) v334; + unsigned v364 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v365 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v366 = pto::Stride<-1, -1, -1, -1, 1>(v364, v364, v364, v364); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v367 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v332 * (unsigned) v32), v365, v366); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v336, v367); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v368 = (unsigned) v334; + unsigned v369 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v370 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v371 = pto::Stride<-1, -1, -1, -1, 1>(v369, v369, v369, v369); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v372 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v332 * (unsigned) v32), v370, v371); + TLOAD(v344, v372); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v340, v336, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v348, v344, v27); + pipe_barrier(PIPE_ALL); + TMUL(v352, v340, v348); + pipe_barrier(PIPE_ALL); + TROWSUM(v360, v352, v356); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v325, v325, v360); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v373 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v374 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v375 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v26 * (unsigned) v32), v373, 
v374); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + TSTORE(v375, v325); + Tile v376; + TASSIGN(v376, v36); + Tile v377; + __ubuf__ float* v378 = v376.data(); + uint64_t v379 = reinterpret_cast(v378); + TASSIGN(v377, v379); + pto::Shape<1, 1, 1, 1, 1> v380 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v381 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v382 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v25 * (unsigned) v32), v380, v381); + TLOAD(v377, v382); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v377, v377, v377); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v383 = v46; v383 < v57; v383 += v47) { + int32_t v384 = (int32_t) ((uint32_t) ((int32_t) v383) * (uint32_t) v31); + int32_t v385 = (int32_t) ((uint32_t) v11 - (uint32_t) v384); + int32_t v386 = (uint32_t) v385 < (uint32_t) v31 ? 
v385 : v31; + Tile v387; + TASSIGN(v387, v37); + Tile v388 = Tile(v386); + __ubuf__ bfloat16_t* v389 = v387.data(); + uint64_t v390 = reinterpret_cast(v389); + TASSIGN(v388, v390); + Tile v391; + TASSIGN(v391, v38); + Tile v392 = Tile(v386); + __ubuf__ float* v393 = v391.data(); + uint64_t v394 = reinterpret_cast(v393); + TASSIGN(v392, v394); + Tile v395; + TASSIGN(v395, v39); + Tile v396 = Tile(v386); + __ubuf__ bfloat16_t* v397 = v395.data(); + uint64_t v398 = reinterpret_cast(v397); + TASSIGN(v396, v398); + Tile v399; + TASSIGN(v399, v40); + Tile v400 = Tile(v386); + __ubuf__ float* v401 = v399.data(); + uint64_t v402 = reinterpret_cast(v401); + TASSIGN(v400, v402); + Tile v403; + TASSIGN(v403, v37); + Tile v404 = Tile(v386); + __ubuf__ float* v405 = v403.data(); + uint64_t v406 = reinterpret_cast(v405); + TASSIGN(v404, v406); + Tile v407; + TASSIGN(v407, v38); + Tile v408 = Tile(v386); + __ubuf__ float* v409 = v407.data(); + uint64_t v410 = reinterpret_cast(v409); + TASSIGN(v408, v410); + Tile v411; + TASSIGN(v411, v40); + Tile v412; + __ubuf__ float* v413 = v411.data(); + uint64_t v414 = reinterpret_cast(v413); + TASSIGN(v412, v414); + unsigned v415 = (unsigned) v386; + unsigned v416 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v417 = pto::Shape<1, 1, 1, 1, -1>(v386); + pto::Stride<-1, -1, -1, -1, 1> v418 = pto::Stride<-1, -1, -1, -1, 1>(v416, v416, v416, v416); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v419 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v384 * (unsigned) v32), v417, v418); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v388, v419); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v420 = (unsigned) v386; + unsigned v421 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v422 = pto::Shape<1, 1, 1, 1, -1>(v386); + pto::Stride<-1, -1, -1, -1, 1> v423 = pto::Stride<-1, -1, -1, -1, 1>(v421, v421, v421, v421); + GlobalTensor, pto::Stride<-1, -1, 
-1, -1, 1>, pto::Layout::ND> v424 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v384 * (unsigned) v32), v422, v423); + TLOAD(v396, v424); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v392, v388, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v400, v396, v27); + pipe_barrier(PIPE_ALL); + TMUL(v404, v392, v400); + pipe_barrier(PIPE_ALL); + TROWSUM(v412, v404, v408); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v377, v377, v412); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v425 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v426 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v427 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v25 * (unsigned) v32), v425, v426); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + TSTORE(v427, v377); + Tile v428; + TASSIGN(v428, v36); + Tile v429; + __ubuf__ float* v430 = v428.data(); + uint64_t v431 = reinterpret_cast(v430); + TASSIGN(v429, v431); + pto::Shape<1, 1, 1, 1, 1> v432 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v433 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v434 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v24 * (unsigned) v32), v432, v433); + TLOAD(v429, v434); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v429, v429, v429); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v435 = v46; v435 < v57; v435 += v47) { + int32_t v436 = (int32_t) ((uint32_t) ((int32_t) 
v435) * (uint32_t) v31); + int32_t v437 = (int32_t) ((uint32_t) v11 - (uint32_t) v436); + int32_t v438 = (uint32_t) v437 < (uint32_t) v31 ? v437 : v31; + Tile v439; + TASSIGN(v439, v37); + Tile v440 = Tile(v438); + __ubuf__ bfloat16_t* v441 = v439.data(); + uint64_t v442 = reinterpret_cast(v441); + TASSIGN(v440, v442); + Tile v443; + TASSIGN(v443, v38); + Tile v444 = Tile(v438); + __ubuf__ float* v445 = v443.data(); + uint64_t v446 = reinterpret_cast(v445); + TASSIGN(v444, v446); + Tile v447; + TASSIGN(v447, v39); + Tile v448 = Tile(v438); + __ubuf__ bfloat16_t* v449 = v447.data(); + uint64_t v450 = reinterpret_cast(v449); + TASSIGN(v448, v450); + Tile v451; + TASSIGN(v451, v40); + Tile v452 = Tile(v438); + __ubuf__ float* v453 = v451.data(); + uint64_t v454 = reinterpret_cast(v453); + TASSIGN(v452, v454); + Tile v455; + TASSIGN(v455, v37); + Tile v456 = Tile(v438); + __ubuf__ float* v457 = v455.data(); + uint64_t v458 = reinterpret_cast(v457); + TASSIGN(v456, v458); + Tile v459; + TASSIGN(v459, v38); + Tile v460 = Tile(v438); + __ubuf__ float* v461 = v459.data(); + uint64_t v462 = reinterpret_cast(v461); + TASSIGN(v460, v462); + Tile v463; + TASSIGN(v463, v40); + Tile v464; + __ubuf__ float* v465 = v463.data(); + uint64_t v466 = reinterpret_cast(v465); + TASSIGN(v464, v466); + unsigned v467 = (unsigned) v438; + unsigned v468 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v469 = pto::Shape<1, 1, 1, 1, -1>(v438); + pto::Stride<-1, -1, -1, -1, 1> v470 = pto::Stride<-1, -1, -1, -1, 1>(v468, v468, v468, v468); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v471 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v436 * (unsigned) v32), v469, v470); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v440, v471); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v472 = (unsigned) v438; + unsigned v473 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v474 = pto::Shape<1, 1, 1, 1, 
-1>(v438); + pto::Stride<-1, -1, -1, -1, 1> v475 = pto::Stride<-1, -1, -1, -1, 1>(v473, v473, v473, v473); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v476 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v436 * (unsigned) v32), v474, v475); + TLOAD(v448, v476); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v444, v440, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v452, v448, v27); + pipe_barrier(PIPE_ALL); + TMUL(v456, v444, v452); + pipe_barrier(PIPE_ALL); + TROWSUM(v464, v456, v460); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v429, v429, v464); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v477 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v478 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v479 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v24 * (unsigned) v32), v477, v478); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + TSTORE(v479, v429); + Tile v480; + TASSIGN(v480, v36); + Tile v481; + __ubuf__ float* v482 = v480.data(); + uint64_t v483 = reinterpret_cast(v482); + TASSIGN(v481, v483); + pto::Shape<1, 1, 1, 1, 1> v484 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v485 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v486 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v23 * (unsigned) v32), v484, v485); + TLOAD(v481, v486); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v481, v481, v481); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + 
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v487 = v46; v487 < v57; v487 += v47) { + int32_t v488 = (int32_t) ((uint32_t) ((int32_t) v487) * (uint32_t) v31); + int32_t v489 = (int32_t) ((uint32_t) v11 - (uint32_t) v488); + int32_t v490 = (uint32_t) v489 < (uint32_t) v31 ? v489 : v31; + Tile v491; + TASSIGN(v491, v37); + Tile v492 = Tile(v490); + __ubuf__ bfloat16_t* v493 = v491.data(); + uint64_t v494 = reinterpret_cast(v493); + TASSIGN(v492, v494); + Tile v495; + TASSIGN(v495, v38); + Tile v496 = Tile(v490); + __ubuf__ float* v497 = v495.data(); + uint64_t v498 = reinterpret_cast(v497); + TASSIGN(v496, v498); + Tile v499; + TASSIGN(v499, v39); + Tile v500 = Tile(v490); + __ubuf__ bfloat16_t* v501 = v499.data(); + uint64_t v502 = reinterpret_cast(v501); + TASSIGN(v500, v502); + Tile v503; + TASSIGN(v503, v40); + Tile v504 = Tile(v490); + __ubuf__ float* v505 = v503.data(); + uint64_t v506 = reinterpret_cast(v505); + TASSIGN(v504, v506); + Tile v507; + TASSIGN(v507, v37); + Tile v508 = Tile(v490); + __ubuf__ float* v509 = v507.data(); + uint64_t v510 = reinterpret_cast(v509); + TASSIGN(v508, v510); + Tile v511; + TASSIGN(v511, v38); + Tile v512 = Tile(v490); + __ubuf__ float* v513 = v511.data(); + uint64_t v514 = reinterpret_cast(v513); + TASSIGN(v512, v514); + Tile v515; + TASSIGN(v515, v40); + Tile v516; + __ubuf__ float* v517 = v515.data(); + uint64_t v518 = reinterpret_cast(v517); + TASSIGN(v516, v518); + unsigned v519 = (unsigned) v490; + unsigned v520 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v521 = pto::Shape<1, 1, 1, 1, -1>(v490); + pto::Stride<-1, -1, -1, -1, 1> v522 = pto::Stride<-1, -1, -1, -1, 1>(v520, v520, v520, v520); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v523 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v488 * (unsigned) v32), v521, v522); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v492, v523); + set_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + unsigned v524 = (unsigned) v490; + unsigned v525 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v526 = pto::Shape<1, 1, 1, 1, -1>(v490); + pto::Stride<-1, -1, -1, -1, 1> v527 = pto::Stride<-1, -1, -1, -1, 1>(v525, v525, v525, v525); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v528 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v488 * (unsigned) v32), v526, v527); + TLOAD(v500, v528); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v496, v492, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v504, v500, v27); + pipe_barrier(PIPE_ALL); + TMUL(v508, v496, v504); + pipe_barrier(PIPE_ALL); + TROWSUM(v516, v508, v512); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v481, v481, v516); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v529 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v530 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v531 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v23 * (unsigned) v32), v529, v530); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v531, v481); + Tile v532; + TASSIGN(v532, v36); + Tile v533; + __ubuf__ float* v534 = v532.data(); + uint64_t v535 = reinterpret_cast(v534); + TASSIGN(v533, v535); + pto::Shape<1, 1, 1, 1, 1> v536 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v537 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v538 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v22 * (unsigned) v32), v536, v537); + TLOAD(v533, v538); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v533, v533, v533); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v539 = v46; v539 < v57; v539 += v47) { + int32_t v540 = (int32_t) ((uint32_t) ((int32_t) v539) * (uint32_t) v31); + int32_t v541 = (int32_t) ((uint32_t) v11 - (uint32_t) v540); + int32_t v542 = (uint32_t) v541 < (uint32_t) v31 ? v541 : v31; + Tile v543; + TASSIGN(v543, v37); + Tile v544 = Tile(v542); + __ubuf__ bfloat16_t* v545 = v543.data(); + uint64_t v546 = reinterpret_cast(v545); + TASSIGN(v544, v546); + Tile v547; + TASSIGN(v547, v38); + Tile v548 = Tile(v542); + __ubuf__ float* v549 = v547.data(); + uint64_t v550 = reinterpret_cast(v549); + TASSIGN(v548, v550); + Tile v551; + TASSIGN(v551, v39); + Tile v552 = Tile(v542); + __ubuf__ bfloat16_t* v553 = v551.data(); + uint64_t v554 = reinterpret_cast(v553); + TASSIGN(v552, v554); + Tile v555; + TASSIGN(v555, v40); + Tile v556 = Tile(v542); + __ubuf__ float* v557 = v555.data(); + uint64_t v558 = reinterpret_cast(v557); + TASSIGN(v556, v558); + Tile v559; + TASSIGN(v559, v37); + Tile v560 = Tile(v542); + __ubuf__ float* v561 = v559.data(); + uint64_t v562 = reinterpret_cast(v561); + TASSIGN(v560, v562); + Tile v563; + TASSIGN(v563, v38); + Tile v564 = Tile(v542); + __ubuf__ float* v565 = v563.data(); + uint64_t v566 = reinterpret_cast(v565); + TASSIGN(v564, v566); + Tile v567; + TASSIGN(v567, v40); + Tile v568; + __ubuf__ float* v569 = v567.data(); + uint64_t v570 = reinterpret_cast(v569); + TASSIGN(v568, v570); + unsigned v571 = (unsigned) v542; + unsigned v572 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v573 = pto::Shape<1, 1, 1, 1, -1>(v542); + pto::Stride<-1, -1, -1, -1, 1> v574 = pto::Stride<-1, -1, -1, -1, 1>(v572, v572, v572, v572); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v575 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * 
(unsigned) v11 + (unsigned) v540 * (unsigned) v32), v573, v574); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v544, v575); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v576 = (unsigned) v542; + unsigned v577 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v578 = pto::Shape<1, 1, 1, 1, -1>(v542); + pto::Stride<-1, -1, -1, -1, 1> v579 = pto::Stride<-1, -1, -1, -1, 1>(v577, v577, v577, v577); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v580 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v540 * (unsigned) v32), v578, v579); + TLOAD(v552, v580); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v548, v544, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v556, v552, v27); + pipe_barrier(PIPE_ALL); + TMUL(v560, v548, v556); + pipe_barrier(PIPE_ALL); + TROWSUM(v568, v560, v564); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v533, v533, v568); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v581 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v582 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v583 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v22 * (unsigned) v32), v581, v582); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v583, v533); + Tile v584; + TASSIGN(v584, v36); + Tile v585; + __ubuf__ float* v586 = v584.data(); + uint64_t v587 = reinterpret_cast(v586); + TASSIGN(v585, v587); + pto::Shape<1, 1, 1, 1, 1> v588 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v589 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v590 = GlobalTensor, 
pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v21 * (unsigned) v32), v588, v589); + TLOAD(v585, v590); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v585, v585, v585); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v591 = v46; v591 < v57; v591 += v47) { + int32_t v592 = (int32_t) ((uint32_t) ((int32_t) v591) * (uint32_t) v31); + int32_t v593 = (int32_t) ((uint32_t) v11 - (uint32_t) v592); + int32_t v594 = (uint32_t) v593 < (uint32_t) v31 ? v593 : v31; + Tile v595; + TASSIGN(v595, v37); + Tile v596 = Tile(v594); + __ubuf__ bfloat16_t* v597 = v595.data(); + uint64_t v598 = reinterpret_cast(v597); + TASSIGN(v596, v598); + Tile v599; + TASSIGN(v599, v38); + Tile v600 = Tile(v594); + __ubuf__ float* v601 = v599.data(); + uint64_t v602 = reinterpret_cast(v601); + TASSIGN(v600, v602); + Tile v603; + TASSIGN(v603, v39); + Tile v604 = Tile(v594); + __ubuf__ bfloat16_t* v605 = v603.data(); + uint64_t v606 = reinterpret_cast(v605); + TASSIGN(v604, v606); + Tile v607; + TASSIGN(v607, v40); + Tile v608 = Tile(v594); + __ubuf__ float* v609 = v607.data(); + uint64_t v610 = reinterpret_cast(v609); + TASSIGN(v608, v610); + Tile v611; + TASSIGN(v611, v37); + Tile v612 = Tile(v594); + __ubuf__ float* v613 = v611.data(); + uint64_t v614 = reinterpret_cast(v613); + TASSIGN(v612, v614); + Tile v615; + TASSIGN(v615, v38); + Tile v616 = Tile(v594); + __ubuf__ float* v617 = v615.data(); + uint64_t v618 = reinterpret_cast(v617); + TASSIGN(v616, v618); + Tile v619; + TASSIGN(v619, v40); + Tile v620; + __ubuf__ float* v621 = v619.data(); + uint64_t v622 = reinterpret_cast(v621); + TASSIGN(v620, v622); + unsigned v623 = (unsigned) v594; + unsigned v624 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v625 = pto::Shape<1, 1, 1, 1, -1>(v594); + pto::Stride<-1, -1, -1, -1, 1> v626 = pto::Stride<-1, -1, -1, -1, 1>(v624, v624, v624, v624); + GlobalTensor, 
pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v627 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v592 * (unsigned) v32), v625, v626); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v596, v627); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v628 = (unsigned) v594; + unsigned v629 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v630 = pto::Shape<1, 1, 1, 1, -1>(v594); + pto::Stride<-1, -1, -1, -1, 1> v631 = pto::Stride<-1, -1, -1, -1, 1>(v629, v629, v629, v629); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v632 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v592 * (unsigned) v32), v630, v631); + TLOAD(v604, v632); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v600, v596, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v608, v604, v27); + pipe_barrier(PIPE_ALL); + TMUL(v612, v600, v608); + pipe_barrier(PIPE_ALL); + TROWSUM(v620, v612, v616); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v585, v585, v620); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v633 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v634 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v635 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v21 * (unsigned) v32), v633, v634); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v635, v585); + Tile v636; + TASSIGN(v636, v36); + Tile v637; + __ubuf__ float* v638 = v636.data(); + uint64_t v639 = reinterpret_cast(v638); + TASSIGN(v637, v639); + pto::Shape<1, 1, 1, 1, 1> v640 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 
16, 16, 16, 1> v641 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v642 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v20 * (unsigned) v32), v640, v641); + TLOAD(v637, v642); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v637, v637, v637); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v643 = v46; v643 < v57; v643 += v47) { + int32_t v644 = (int32_t) ((uint32_t) ((int32_t) v643) * (uint32_t) v31); + int32_t v645 = (int32_t) ((uint32_t) v11 - (uint32_t) v644); + int32_t v646 = (uint32_t) v645 < (uint32_t) v31 ? v645 : v31; + Tile v647; + TASSIGN(v647, v37); + Tile v648 = Tile(v646); + __ubuf__ bfloat16_t* v649 = v647.data(); + uint64_t v650 = reinterpret_cast(v649); + TASSIGN(v648, v650); + Tile v651; + TASSIGN(v651, v38); + Tile v652 = Tile(v646); + __ubuf__ float* v653 = v651.data(); + uint64_t v654 = reinterpret_cast(v653); + TASSIGN(v652, v654); + Tile v655; + TASSIGN(v655, v39); + Tile v656 = Tile(v646); + __ubuf__ bfloat16_t* v657 = v655.data(); + uint64_t v658 = reinterpret_cast(v657); + TASSIGN(v656, v658); + Tile v659; + TASSIGN(v659, v40); + Tile v660 = Tile(v646); + __ubuf__ float* v661 = v659.data(); + uint64_t v662 = reinterpret_cast(v661); + TASSIGN(v660, v662); + Tile v663; + TASSIGN(v663, v37); + Tile v664 = Tile(v646); + __ubuf__ float* v665 = v663.data(); + uint64_t v666 = reinterpret_cast(v665); + TASSIGN(v664, v666); + Tile v667; + TASSIGN(v667, v38); + Tile v668 = Tile(v646); + __ubuf__ float* v669 = v667.data(); + uint64_t v670 = reinterpret_cast(v669); + TASSIGN(v668, v670); + Tile v671; + TASSIGN(v671, v40); + Tile v672; + __ubuf__ float* v673 = v671.data(); + uint64_t v674 = reinterpret_cast(v673); + TASSIGN(v672, v674); + unsigned v675 = (unsigned) v646; + unsigned v676 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v677 = 
pto::Shape<1, 1, 1, 1, -1>(v646); + pto::Stride<-1, -1, -1, -1, 1> v678 = pto::Stride<-1, -1, -1, -1, 1>(v676, v676, v676, v676); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v679 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v644 * (unsigned) v32), v677, v678); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v648, v679); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v680 = (unsigned) v646; + unsigned v681 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v682 = pto::Shape<1, 1, 1, 1, -1>(v646); + pto::Stride<-1, -1, -1, -1, 1> v683 = pto::Stride<-1, -1, -1, -1, 1>(v681, v681, v681, v681); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v684 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v644 * (unsigned) v32), v682, v683); + TLOAD(v656, v684); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v652, v648, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v660, v656, v27); + pipe_barrier(PIPE_ALL); + TMUL(v664, v652, v660); + pipe_barrier(PIPE_ALL); + TROWSUM(v672, v664, v668); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v637, v637, v672); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v685 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v686 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v687 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v20 * (unsigned) v32), v685, v686); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v687, v637); + Tile v688; + TASSIGN(v688, v36); + Tile v689; + __ubuf__ float* v690 = v688.data(); + 
uint64_t v691 = reinterpret_cast(v690); + TASSIGN(v689, v691); + pto::Shape<1, 1, 1, 1, 1> v692 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v693 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v694 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v19 * (unsigned) v32), v692, v693); + TLOAD(v689, v694); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v689, v689, v689); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v695 = v46; v695 < v57; v695 += v47) { + int32_t v696 = (int32_t) ((uint32_t) ((int32_t) v695) * (uint32_t) v31); + int32_t v697 = (int32_t) ((uint32_t) v11 - (uint32_t) v696); + int32_t v698 = (uint32_t) v697 < (uint32_t) v31 ? v697 : v31; + Tile v699; + TASSIGN(v699, v37); + Tile v700 = Tile(v698); + __ubuf__ bfloat16_t* v701 = v699.data(); + uint64_t v702 = reinterpret_cast(v701); + TASSIGN(v700, v702); + Tile v703; + TASSIGN(v703, v38); + Tile v704 = Tile(v698); + __ubuf__ float* v705 = v703.data(); + uint64_t v706 = reinterpret_cast(v705); + TASSIGN(v704, v706); + Tile v707; + TASSIGN(v707, v39); + Tile v708 = Tile(v698); + __ubuf__ bfloat16_t* v709 = v707.data(); + uint64_t v710 = reinterpret_cast(v709); + TASSIGN(v708, v710); + Tile v711; + TASSIGN(v711, v40); + Tile v712 = Tile(v698); + __ubuf__ float* v713 = v711.data(); + uint64_t v714 = reinterpret_cast(v713); + TASSIGN(v712, v714); + Tile v715; + TASSIGN(v715, v37); + Tile v716 = Tile(v698); + __ubuf__ float* v717 = v715.data(); + uint64_t v718 = reinterpret_cast(v717); + TASSIGN(v716, v718); + Tile v719; + TASSIGN(v719, v38); + Tile v720 = Tile(v698); + __ubuf__ float* v721 = v719.data(); + uint64_t v722 = reinterpret_cast(v721); + TASSIGN(v720, v722); + Tile v723; + TASSIGN(v723, v40); + Tile v724; + __ubuf__ float* v725 = v723.data(); + uint64_t v726 = 
reinterpret_cast(v725); + TASSIGN(v724, v726); + unsigned v727 = (unsigned) v698; + unsigned v728 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v729 = pto::Shape<1, 1, 1, 1, -1>(v698); + pto::Stride<-1, -1, -1, -1, 1> v730 = pto::Stride<-1, -1, -1, -1, 1>(v728, v728, v728, v728); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v731 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v696 * (unsigned) v32), v729, v730); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v700, v731); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v732 = (unsigned) v698; + unsigned v733 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v734 = pto::Shape<1, 1, 1, 1, -1>(v698); + pto::Stride<-1, -1, -1, -1, 1> v735 = pto::Stride<-1, -1, -1, -1, 1>(v733, v733, v733, v733); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v736 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v696 * (unsigned) v32), v734, v735); + TLOAD(v708, v736); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v704, v700, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v712, v708, v27); + pipe_barrier(PIPE_ALL); + TMUL(v716, v704, v712); + pipe_barrier(PIPE_ALL); + TROWSUM(v724, v716, v720); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v689, v689, v724); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v737 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v738 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v739 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v19 * (unsigned) v32), v737, v738); + 
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v739, v689); + Tile v740; + TASSIGN(v740, v36); + Tile v741; + __ubuf__ float* v742 = v740.data(); + uint64_t v743 = reinterpret_cast(v742); + TASSIGN(v741, v743); + pto::Shape<1, 1, 1, 1, 1> v744 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v745 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v746 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v18 * (unsigned) v32), v744, v745); + TLOAD(v741, v746); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v741, v741, v741); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v747 = v46; v747 < v57; v747 += v47) { + int32_t v748 = (int32_t) ((uint32_t) ((int32_t) v747) * (uint32_t) v31); + int32_t v749 = (int32_t) ((uint32_t) v11 - (uint32_t) v748); + int32_t v750 = (uint32_t) v749 < (uint32_t) v31 ? 
v749 : v31; + Tile v751; + TASSIGN(v751, v37); + Tile v752 = Tile(v750); + __ubuf__ bfloat16_t* v753 = v751.data(); + uint64_t v754 = reinterpret_cast(v753); + TASSIGN(v752, v754); + Tile v755; + TASSIGN(v755, v38); + Tile v756 = Tile(v750); + __ubuf__ float* v757 = v755.data(); + uint64_t v758 = reinterpret_cast(v757); + TASSIGN(v756, v758); + Tile v759; + TASSIGN(v759, v39); + Tile v760 = Tile(v750); + __ubuf__ bfloat16_t* v761 = v759.data(); + uint64_t v762 = reinterpret_cast(v761); + TASSIGN(v760, v762); + Tile v763; + TASSIGN(v763, v40); + Tile v764 = Tile(v750); + __ubuf__ float* v765 = v763.data(); + uint64_t v766 = reinterpret_cast(v765); + TASSIGN(v764, v766); + Tile v767; + TASSIGN(v767, v37); + Tile v768 = Tile(v750); + __ubuf__ float* v769 = v767.data(); + uint64_t v770 = reinterpret_cast(v769); + TASSIGN(v768, v770); + Tile v771; + TASSIGN(v771, v38); + Tile v772 = Tile(v750); + __ubuf__ float* v773 = v771.data(); + uint64_t v774 = reinterpret_cast(v773); + TASSIGN(v772, v774); + Tile v775; + TASSIGN(v775, v40); + Tile v776; + __ubuf__ float* v777 = v775.data(); + uint64_t v778 = reinterpret_cast(v777); + TASSIGN(v776, v778); + unsigned v779 = (unsigned) v750; + unsigned v780 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v781 = pto::Shape<1, 1, 1, 1, -1>(v750); + pto::Stride<-1, -1, -1, -1, 1> v782 = pto::Stride<-1, -1, -1, -1, 1>(v780, v780, v780, v780); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v783 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v748 * (unsigned) v32), v781, v782); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v752, v783); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v784 = (unsigned) v750; + unsigned v785 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v786 = pto::Shape<1, 1, 1, 1, -1>(v750); + pto::Stride<-1, -1, -1, -1, 1> v787 = pto::Stride<-1, -1, -1, -1, 1>(v785, v785, v785, v785); + GlobalTensor, pto::Stride<-1, -1, 
-1, -1, 1>, pto::Layout::ND> v788 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v748 * (unsigned) v32), v786, v787); + TLOAD(v760, v788); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v756, v752, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v764, v760, v27); + pipe_barrier(PIPE_ALL); + TMUL(v768, v756, v764); + pipe_barrier(PIPE_ALL); + TROWSUM(v776, v768, v772); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v741, v741, v776); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v789 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v790 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v791 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v18 * (unsigned) v32), v789, v790); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v791, v741); + Tile v792; + TASSIGN(v792, v36); + Tile v793; + __ubuf__ float* v794 = v792.data(); + uint64_t v795 = reinterpret_cast(v794); + TASSIGN(v793, v795); + pto::Shape<1, 1, 1, 1, 1> v796 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v797 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v798 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v17 * (unsigned) v32), v796, v797); + TLOAD(v793, v798); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v793, v793, v793); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v799 = v46; v799 < v57; v799 += v47) { + int32_t v800 = (int32_t) ((uint32_t) ((int32_t) 
v799) * (uint32_t) v31); + int32_t v801 = (int32_t) ((uint32_t) v11 - (uint32_t) v800); + int32_t v802 = (uint32_t) v801 < (uint32_t) v31 ? v801 : v31; + Tile v803; + TASSIGN(v803, v37); + Tile v804 = Tile(v802); + __ubuf__ bfloat16_t* v805 = v803.data(); + uint64_t v806 = reinterpret_cast(v805); + TASSIGN(v804, v806); + Tile v807; + TASSIGN(v807, v38); + Tile v808 = Tile(v802); + __ubuf__ float* v809 = v807.data(); + uint64_t v810 = reinterpret_cast(v809); + TASSIGN(v808, v810); + Tile v811; + TASSIGN(v811, v39); + Tile v812 = Tile(v802); + __ubuf__ bfloat16_t* v813 = v811.data(); + uint64_t v814 = reinterpret_cast(v813); + TASSIGN(v812, v814); + Tile v815; + TASSIGN(v815, v40); + Tile v816 = Tile(v802); + __ubuf__ float* v817 = v815.data(); + uint64_t v818 = reinterpret_cast(v817); + TASSIGN(v816, v818); + Tile v819; + TASSIGN(v819, v37); + Tile v820 = Tile(v802); + __ubuf__ float* v821 = v819.data(); + uint64_t v822 = reinterpret_cast(v821); + TASSIGN(v820, v822); + Tile v823; + TASSIGN(v823, v38); + Tile v824 = Tile(v802); + __ubuf__ float* v825 = v823.data(); + uint64_t v826 = reinterpret_cast(v825); + TASSIGN(v824, v826); + Tile v827; + TASSIGN(v827, v40); + Tile v828; + __ubuf__ float* v829 = v827.data(); + uint64_t v830 = reinterpret_cast(v829); + TASSIGN(v828, v830); + unsigned v831 = (unsigned) v802; + unsigned v832 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v833 = pto::Shape<1, 1, 1, 1, -1>(v802); + pto::Stride<-1, -1, -1, -1, 1> v834 = pto::Stride<-1, -1, -1, -1, 1>(v832, v832, v832, v832); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v835 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v800 * (unsigned) v32), v833, v834); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v804, v835); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v836 = (unsigned) v802; + unsigned v837 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v838 = pto::Shape<1, 1, 1, 1, 
-1>(v802); + pto::Stride<-1, -1, -1, -1, 1> v839 = pto::Stride<-1, -1, -1, -1, 1>(v837, v837, v837, v837); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v840 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v800 * (unsigned) v32), v838, v839); + TLOAD(v812, v840); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v808, v804, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v816, v812, v27); + pipe_barrier(PIPE_ALL); + TMUL(v820, v808, v816); + pipe_barrier(PIPE_ALL); + TROWSUM(v828, v820, v824); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v793, v793, v828); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v841 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v842 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v843 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v17 * (unsigned) v32), v841, v842); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v843, v793); + Tile v844; + TASSIGN(v844, v36); + Tile v845; + __ubuf__ float* v846 = v844.data(); + uint64_t v847 = reinterpret_cast(v846); + TASSIGN(v845, v847); + pto::Shape<1, 1, 1, 1, 1> v848 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v849 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v850 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v16 * (unsigned) v32), v848, v849); + TLOAD(v845, v850); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v845, v845, v845); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + 
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v851 = v46; v851 < v57; v851 += v47) { + int32_t v852 = (int32_t) ((uint32_t) ((int32_t) v851) * (uint32_t) v31); + int32_t v853 = (int32_t) ((uint32_t) v11 - (uint32_t) v852); + int32_t v854 = (uint32_t) v853 < (uint32_t) v31 ? v853 : v31; + Tile v855; + TASSIGN(v855, v37); + Tile v856 = Tile(v854); + __ubuf__ bfloat16_t* v857 = v855.data(); + uint64_t v858 = reinterpret_cast(v857); + TASSIGN(v856, v858); + Tile v859; + TASSIGN(v859, v38); + Tile v860 = Tile(v854); + __ubuf__ float* v861 = v859.data(); + uint64_t v862 = reinterpret_cast(v861); + TASSIGN(v860, v862); + Tile v863; + TASSIGN(v863, v39); + Tile v864 = Tile(v854); + __ubuf__ bfloat16_t* v865 = v863.data(); + uint64_t v866 = reinterpret_cast(v865); + TASSIGN(v864, v866); + Tile v867; + TASSIGN(v867, v40); + Tile v868 = Tile(v854); + __ubuf__ float* v869 = v867.data(); + uint64_t v870 = reinterpret_cast(v869); + TASSIGN(v868, v870); + Tile v871; + TASSIGN(v871, v37); + Tile v872 = Tile(v854); + __ubuf__ float* v873 = v871.data(); + uint64_t v874 = reinterpret_cast(v873); + TASSIGN(v872, v874); + Tile v875; + TASSIGN(v875, v38); + Tile v876 = Tile(v854); + __ubuf__ float* v877 = v875.data(); + uint64_t v878 = reinterpret_cast(v877); + TASSIGN(v876, v878); + Tile v879; + TASSIGN(v879, v40); + Tile v880; + __ubuf__ float* v881 = v879.data(); + uint64_t v882 = reinterpret_cast(v881); + TASSIGN(v880, v882); + unsigned v883 = (unsigned) v854; + unsigned v884 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v885 = pto::Shape<1, 1, 1, 1, -1>(v854); + pto::Stride<-1, -1, -1, -1, 1> v886 = pto::Stride<-1, -1, -1, -1, 1>(v884, v884, v884, v884); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v887 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v852 * (unsigned) v32), v885, v886); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v856, v887); + set_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + unsigned v888 = (unsigned) v854; + unsigned v889 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v890 = pto::Shape<1, 1, 1, 1, -1>(v854); + pto::Stride<-1, -1, -1, -1, 1> v891 = pto::Stride<-1, -1, -1, -1, 1>(v889, v889, v889, v889); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v892 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v852 * (unsigned) v32), v890, v891); + TLOAD(v864, v892); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v860, v856, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v868, v864, v27); + pipe_barrier(PIPE_ALL); + TMUL(v872, v860, v868); + pipe_barrier(PIPE_ALL); + TROWSUM(v880, v872, v876); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v845, v845, v880); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v893 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v894 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v895 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v16 * (unsigned) v32), v893, v894); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v895, v845); + Tile v896; + TASSIGN(v896, v36); + Tile v897; + __ubuf__ float* v898 = v896.data(); + uint64_t v899 = reinterpret_cast(v898); + TASSIGN(v897, v899); + pto::Shape<1, 1, 1, 1, 1> v900 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v901 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v902 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v15 * (unsigned) v32), v900, v901); + TLOAD(v897, v902); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v897, v897, v897); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v903 = v46; v903 < v57; v903 += v47) { + int32_t v904 = (int32_t) ((uint32_t) ((int32_t) v903) * (uint32_t) v31); + int32_t v905 = (int32_t) ((uint32_t) v11 - (uint32_t) v904); + int32_t v906 = (uint32_t) v905 < (uint32_t) v31 ? v905 : v31; + Tile v907; + TASSIGN(v907, v37); + Tile v908 = Tile(v906); + __ubuf__ bfloat16_t* v909 = v907.data(); + uint64_t v910 = reinterpret_cast(v909); + TASSIGN(v908, v910); + Tile v911; + TASSIGN(v911, v38); + Tile v912 = Tile(v906); + __ubuf__ float* v913 = v911.data(); + uint64_t v914 = reinterpret_cast(v913); + TASSIGN(v912, v914); + Tile v915; + TASSIGN(v915, v39); + Tile v916 = Tile(v906); + __ubuf__ bfloat16_t* v917 = v915.data(); + uint64_t v918 = reinterpret_cast(v917); + TASSIGN(v916, v918); + Tile v919; + TASSIGN(v919, v40); + Tile v920 = Tile(v906); + __ubuf__ float* v921 = v919.data(); + uint64_t v922 = reinterpret_cast(v921); + TASSIGN(v920, v922); + Tile v923; + TASSIGN(v923, v37); + Tile v924 = Tile(v906); + __ubuf__ float* v925 = v923.data(); + uint64_t v926 = reinterpret_cast(v925); + TASSIGN(v924, v926); + Tile v927; + TASSIGN(v927, v38); + Tile v928 = Tile(v906); + __ubuf__ float* v929 = v927.data(); + uint64_t v930 = reinterpret_cast(v929); + TASSIGN(v928, v930); + Tile v931; + TASSIGN(v931, v40); + Tile v932; + __ubuf__ float* v933 = v931.data(); + uint64_t v934 = reinterpret_cast(v933); + TASSIGN(v932, v934); + unsigned v935 = (unsigned) v906; + unsigned v936 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v937 = pto::Shape<1, 1, 1, 1, -1>(v906); + pto::Stride<-1, -1, -1, -1, 1> v938 = pto::Stride<-1, -1, -1, -1, 1>(v936, v936, v936, v936); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v939 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * 
(unsigned) v11 + (unsigned) v904 * (unsigned) v32), v937, v938); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v908, v939); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v940 = (unsigned) v906; + unsigned v941 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v942 = pto::Shape<1, 1, 1, 1, -1>(v906); + pto::Stride<-1, -1, -1, -1, 1> v943 = pto::Stride<-1, -1, -1, -1, 1>(v941, v941, v941, v941); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v944 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v904 * (unsigned) v32), v942, v943); + TLOAD(v916, v944); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v912, v908, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v920, v916, v27); + pipe_barrier(PIPE_ALL); + TMUL(v924, v912, v920); + pipe_barrier(PIPE_ALL); + TROWSUM(v932, v924, v928); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v897, v897, v932); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v945 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v946 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v947 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v15 * (unsigned) v32), v945, v946); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v947, v897); + Tile v948; + TASSIGN(v948, v36); + Tile v949; + __ubuf__ float* v950 = v948.data(); + uint64_t v951 = reinterpret_cast(v950); + TASSIGN(v949, v951); + pto::Shape<1, 1, 1, 1, 1> v952 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v953 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v954 = GlobalTensor, 
pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v14 * (unsigned) v32), v952, v953); + TLOAD(v949, v954); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v949, v949, v949); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v955 = v46; v955 < v57; v955 += v47) { + int32_t v956 = (int32_t) ((uint32_t) ((int32_t) v955) * (uint32_t) v31); + int32_t v957 = (int32_t) ((uint32_t) v11 - (uint32_t) v956); + int32_t v958 = (uint32_t) v957 < (uint32_t) v31 ? v957 : v31; + Tile v959; + TASSIGN(v959, v37); + Tile v960 = Tile(v958); + __ubuf__ bfloat16_t* v961 = v959.data(); + uint64_t v962 = reinterpret_cast(v961); + TASSIGN(v960, v962); + Tile v963; + TASSIGN(v963, v38); + Tile v964 = Tile(v958); + __ubuf__ float* v965 = v963.data(); + uint64_t v966 = reinterpret_cast(v965); + TASSIGN(v964, v966); + Tile v967; + TASSIGN(v967, v39); + Tile v968 = Tile(v958); + __ubuf__ bfloat16_t* v969 = v967.data(); + uint64_t v970 = reinterpret_cast(v969); + TASSIGN(v968, v970); + Tile v971; + TASSIGN(v971, v40); + Tile v972 = Tile(v958); + __ubuf__ float* v973 = v971.data(); + uint64_t v974 = reinterpret_cast(v973); + TASSIGN(v972, v974); + Tile v975; + TASSIGN(v975, v37); + Tile v976 = Tile(v958); + __ubuf__ float* v977 = v975.data(); + uint64_t v978 = reinterpret_cast(v977); + TASSIGN(v976, v978); + Tile v979; + TASSIGN(v979, v38); + Tile v980 = Tile(v958); + __ubuf__ float* v981 = v979.data(); + uint64_t v982 = reinterpret_cast(v981); + TASSIGN(v980, v982); + Tile v983; + TASSIGN(v983, v40); + Tile v984; + __ubuf__ float* v985 = v983.data(); + uint64_t v986 = reinterpret_cast(v985); + TASSIGN(v984, v986); + unsigned v987 = (unsigned) v958; + unsigned v988 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v989 = pto::Shape<1, 1, 1, 1, -1>(v958); + pto::Stride<-1, -1, -1, -1, 1> v990 = pto::Stride<-1, -1, -1, -1, 1>(v988, v988, v988, v988); + GlobalTensor, 
pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v991 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v956 * (unsigned) v32), v989, v990); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v960, v991); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v992 = (unsigned) v958; + unsigned v993 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v994 = pto::Shape<1, 1, 1, 1, -1>(v958); + pto::Stride<-1, -1, -1, -1, 1> v995 = pto::Stride<-1, -1, -1, -1, 1>(v993, v993, v993, v993); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v996 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v956 * (unsigned) v32), v994, v995); + TLOAD(v968, v996); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v964, v960, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v972, v968, v27); + pipe_barrier(PIPE_ALL); + TMUL(v976, v964, v972); + pipe_barrier(PIPE_ALL); + TROWSUM(v984, v976, v980); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v949, v949, v984); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v997 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v998 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v999 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v14 * (unsigned) v32), v997, v998); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v999, v949); + Tile v1000; + TASSIGN(v1000, v36); + Tile v1001; + __ubuf__ float* v1002 = v1000.data(); + uint64_t v1003 = reinterpret_cast(v1002); + TASSIGN(v1001, v1003); + pto::Shape<1, 1, 1, 1, 1> v1004 = pto::Shape<1, 1, 1, 1, 1>(); + 
pto::Stride<16, 16, 16, 16, 1> v1005 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v1006 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v13 * (unsigned) v32), v1004, v1005); + TLOAD(v1001, v1006); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v1001, v1001, v1001); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v1007 = v46; v1007 < v57; v1007 += v47) { + int32_t v1008 = (int32_t) ((uint32_t) ((int32_t) v1007) * (uint32_t) v31); + int32_t v1009 = (int32_t) ((uint32_t) v11 - (uint32_t) v1008); + int32_t v1010 = (uint32_t) v1009 < (uint32_t) v31 ? v1009 : v31; + Tile v1011; + TASSIGN(v1011, v37); + Tile v1012 = Tile(v1010); + __ubuf__ bfloat16_t* v1013 = v1011.data(); + uint64_t v1014 = reinterpret_cast(v1013); + TASSIGN(v1012, v1014); + Tile v1015; + TASSIGN(v1015, v38); + Tile v1016 = Tile(v1010); + __ubuf__ float* v1017 = v1015.data(); + uint64_t v1018 = reinterpret_cast(v1017); + TASSIGN(v1016, v1018); + Tile v1019; + TASSIGN(v1019, v39); + Tile v1020 = Tile(v1010); + __ubuf__ bfloat16_t* v1021 = v1019.data(); + uint64_t v1022 = reinterpret_cast(v1021); + TASSIGN(v1020, v1022); + Tile v1023; + TASSIGN(v1023, v40); + Tile v1024 = Tile(v1010); + __ubuf__ float* v1025 = v1023.data(); + uint64_t v1026 = reinterpret_cast(v1025); + TASSIGN(v1024, v1026); + Tile v1027; + TASSIGN(v1027, v37); + Tile v1028 = Tile(v1010); + __ubuf__ float* v1029 = v1027.data(); + uint64_t v1030 = reinterpret_cast(v1029); + TASSIGN(v1028, v1030); + Tile v1031; + TASSIGN(v1031, v38); + Tile v1032 = Tile(v1010); + __ubuf__ float* v1033 = v1031.data(); + uint64_t v1034 = reinterpret_cast(v1033); + TASSIGN(v1032, v1034); + Tile v1035; + TASSIGN(v1035, v40); + Tile v1036; + __ubuf__ float* v1037 = v1035.data(); + uint64_t v1038 = reinterpret_cast(v1037); + TASSIGN(v1036, v1038); + 
unsigned v1039 = (unsigned) v1010; + unsigned v1040 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1041 = pto::Shape<1, 1, 1, 1, -1>(v1010); + pto::Stride<-1, -1, -1, -1, 1> v1042 = pto::Stride<-1, -1, -1, -1, 1>(v1040, v1040, v1040, v1040); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1043 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v1008 * (unsigned) v32), v1041, v1042); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1012, v1043); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v1044 = (unsigned) v1010; + unsigned v1045 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1046 = pto::Shape<1, 1, 1, 1, -1>(v1010); + pto::Stride<-1, -1, -1, -1, 1> v1047 = pto::Stride<-1, -1, -1, -1, 1>(v1045, v1045, v1045, v1045); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1048 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1008 * (unsigned) v32), v1046, v1047); + TLOAD(v1020, v1048); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1016, v1012, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v1024, v1020, v27); + pipe_barrier(PIPE_ALL); + TMUL(v1028, v1016, v1024); + pipe_barrier(PIPE_ALL); + TROWSUM(v1036, v1028, v1032); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v1001, v1001, v1036); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v1049 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v1050 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v1051 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v13 * (unsigned) v32), v1049, v1050); + 
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v1051, v1001); + Tile v1052; + TASSIGN(v1052, v36); + Tile v1053; + __ubuf__ float* v1054 = v1052.data(); + uint64_t v1055 = reinterpret_cast(v1054); + TASSIGN(v1053, v1055); + pto::Shape<1, 1, 1, 1, 1> v1056 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v1057 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v1058 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v2 + (v28 + (unsigned) v59 * (unsigned) v29 + v12 * (unsigned) v32), v1056, v1057); + TLOAD(v1053, v1058); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSUB(v1053, v1053, v1053); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v1059 = v46; v1059 < v57; v1059 += v47) { + int32_t v1060 = (int32_t) ((uint32_t) ((int32_t) v1059) * (uint32_t) v31); + int32_t v1061 = (int32_t) ((uint32_t) v11 - (uint32_t) v1060); + int32_t v1062 = (uint32_t) v1061 < (uint32_t) v31 ? 
v1061 : v31; + Tile v1063; + TASSIGN(v1063, v37); + Tile v1064 = Tile(v1062); + __ubuf__ bfloat16_t* v1065 = v1063.data(); + uint64_t v1066 = reinterpret_cast(v1065); + TASSIGN(v1064, v1066); + Tile v1067; + TASSIGN(v1067, v38); + Tile v1068 = Tile(v1062); + __ubuf__ float* v1069 = v1067.data(); + uint64_t v1070 = reinterpret_cast(v1069); + TASSIGN(v1068, v1070); + Tile v1071; + TASSIGN(v1071, v39); + Tile v1072 = Tile(v1062); + __ubuf__ bfloat16_t* v1073 = v1071.data(); + uint64_t v1074 = reinterpret_cast(v1073); + TASSIGN(v1072, v1074); + Tile v1075; + TASSIGN(v1075, v40); + Tile v1076 = Tile(v1062); + __ubuf__ float* v1077 = v1075.data(); + uint64_t v1078 = reinterpret_cast(v1077); + TASSIGN(v1076, v1078); + Tile v1079; + TASSIGN(v1079, v37); + Tile v1080 = Tile(v1062); + __ubuf__ float* v1081 = v1079.data(); + uint64_t v1082 = reinterpret_cast(v1081); + TASSIGN(v1080, v1082); + Tile v1083; + TASSIGN(v1083, v38); + Tile v1084 = Tile(v1062); + __ubuf__ float* v1085 = v1083.data(); + uint64_t v1086 = reinterpret_cast(v1085); + TASSIGN(v1084, v1086); + Tile v1087; + TASSIGN(v1087, v40); + Tile v1088; + __ubuf__ float* v1089 = v1087.data(); + uint64_t v1090 = reinterpret_cast(v1089); + TASSIGN(v1088, v1090); + unsigned v1091 = (unsigned) v1062; + unsigned v1092 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1093 = pto::Shape<1, 1, 1, 1, -1>(v1062); + pto::Stride<-1, -1, -1, -1, 1> v1094 = pto::Stride<-1, -1, -1, -1, 1>(v1092, v1092, v1092, v1092); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1095 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1060 * (unsigned) v32), v1093, v1094); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1064, v1095); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v1096 = (unsigned) v1062; + unsigned v1097 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1098 = pto::Shape<1, 1, 1, 1, -1>(v1062); + pto::Stride<-1, -1, -1, -1, 1> v1099 = 
pto::Stride<-1, -1, -1, -1, 1>(v1097, v1097, v1097, v1097); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1100 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1060 * (unsigned) v32), v1098, v1099); + TLOAD(v1072, v1100); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1068, v1064, v27); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v1076, v1072, v27); + pipe_barrier(PIPE_ALL); + TMUL(v1080, v1068, v1076); + pipe_barrier(PIPE_ALL); + TROWSUM(v1088, v1080, v1084); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v1053, v1053, v1088); + }; + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v1101 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v1102 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v1103 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v28 + (unsigned) v59 * (unsigned) v29 + v12 * (unsigned) v32), v1101, v1102); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v1103, v1053); + for (size_t v1104 = v46; v1104 < v57; v1104 += v47) { + int32_t v1105 = (int32_t) ((uint32_t) ((int32_t) v1104) * (uint32_t) v31); + int32_t v1106 = (int32_t) ((uint32_t) v11 - (uint32_t) v1105); + int32_t v1107 = (uint32_t) v1106 < (uint32_t) v31 ? 
v1106 : v31; + Tile v1108; + TASSIGN(v1108, v41); + Tile v1109 = Tile(v1107); + __ubuf__ float* v1110 = v1108.data(); + uint64_t v1111 = reinterpret_cast(v1110); + TASSIGN(v1109, v1111); + Tile v1112; + TASSIGN(v1112, v36); + Tile v1113 = Tile(v1107); + __ubuf__ bfloat16_t* v1114 = v1112.data(); + uint64_t v1115 = reinterpret_cast(v1114); + TASSIGN(v1113, v1115); + Tile v1116; + TASSIGN(v1116, v42); + Tile v1117 = Tile(v1107); + __ubuf__ float* v1118 = v1116.data(); + uint64_t v1119 = reinterpret_cast(v1118); + TASSIGN(v1117, v1119); + Tile v1120; + TASSIGN(v1120, v43); + Tile v1121; + __ubuf__ float* v1122 = v1120.data(); + uint64_t v1123 = reinterpret_cast(v1122); + TASSIGN(v1121, v1123); + Tile v1124; + TASSIGN(v1124, v44); + Tile v1125 = Tile(v1107); + __ubuf__ float* v1126 = v1124.data(); + uint64_t v1127 = reinterpret_cast(v1126); + TASSIGN(v1125, v1127); + Tile v1128; + TASSIGN(v1128, v45); + Tile v1129 = Tile(v1107); + __ubuf__ float* v1130 = v1128.data(); + uint64_t v1131 = reinterpret_cast(v1130); + TASSIGN(v1129, v1131); + Tile v1132; + TASSIGN(v1132, v36); + Tile v1133 = Tile(v1107); + __ubuf__ bfloat16_t* v1134 = v1132.data(); + uint64_t v1135 = reinterpret_cast(v1134); + TASSIGN(v1133, v1135); + unsigned v1136 = (unsigned) v1107; + unsigned v1137 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1138 = pto::Shape<1, 1, 1, 1, -1>(v1107); + pto::Stride<-1, -1, -1, -1, 1> v1139 = pto::Stride<-1, -1, -1, -1, 1>(v1137, v1137, v1137, v1137); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1140 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v1105 * (unsigned) v32), v1138, v1139); + TLOAD(v1113, v1140); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1117, v1113, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1121, v278); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + 
TROWEXPAND(v1125, v1121); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1129, v1117, v1125); + pipe_barrier(PIPE_ALL); + TMOV(v1109, v1129); + unsigned v1141 = (unsigned) v1107; + unsigned v1142 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1143 = pto::Shape<1, 1, 1, 1, -1>(v1107); + pto::Stride<-1, -1, -1, -1, 1> v1144 = pto::Stride<-1, -1, -1, -1, 1>(v1142, v1142, v1142, v1142); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1145 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v1105 * (unsigned) v32), v1143, v1144); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1113, v1145); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1117, v1113, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1121, v330); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1125, v1121); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1129, v1117, v1125); + pipe_barrier(PIPE_ALL); + TADD(v1109, v1109, v1129); + unsigned v1146 = (unsigned) v1107; + unsigned v1147 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1148 = pto::Shape<1, 1, 1, 1, -1>(v1107); + pto::Stride<-1, -1, -1, -1, 1> v1149 = pto::Stride<-1, -1, -1, -1, 1>(v1147, v1147, v1147, v1147); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1150 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v1105 * (unsigned) v32), v1148, v1149); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1113, v1150); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1117, v1113, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1121, v382); + set_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1125, v1121); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1129, v1117, v1125); + pipe_barrier(PIPE_ALL); + TADD(v1109, v1109, v1129); + unsigned v1151 = (unsigned) v1107; + unsigned v1152 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1153 = pto::Shape<1, 1, 1, 1, -1>(v1107); + pto::Stride<-1, -1, -1, -1, 1> v1154 = pto::Stride<-1, -1, -1, -1, 1>(v1152, v1152, v1152, v1152); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1155 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1105 * (unsigned) v32), v1153, v1154); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1113, v1155); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1117, v1113, v27); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1121, v434); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1125, v1121); + pipe_barrier(PIPE_ALL); + TMUL(v1129, v1117, v1125); + pipe_barrier(PIPE_ALL); + TADD(v1109, v1109, v1129); + pipe_barrier(PIPE_ALL); + TCVT(v1133, v1109, v27); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v1156 = (unsigned) v1107; + unsigned v1157 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1158 = pto::Shape<1, 1, 1, 1, -1>(v1107); + pto::Stride<-1, -1, -1, -1, 1> v1159 = pto::Stride<-1, -1, -1, -1, 1>(v1157, v1157, v1157, v1157); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1160 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v7 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v1105 * (unsigned) v32), v1158, v1159); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v1160, v1133); + }; + for (size_t v1161 = v46; v1161 < v57; v1161 += v47) { + int32_t v1162 = (int32_t) ((uint32_t) ((int32_t) v1161) * (uint32_t) v31); + int32_t 
v1163 = (int32_t) ((uint32_t) v11 - (uint32_t) v1162); + int32_t v1164 = (uint32_t) v1163 < (uint32_t) v31 ? v1163 : v31; + Tile v1165; + TASSIGN(v1165, v41); + Tile v1166 = Tile(v1164); + __ubuf__ float* v1167 = v1165.data(); + uint64_t v1168 = reinterpret_cast(v1167); + TASSIGN(v1166, v1168); + Tile v1169; + TASSIGN(v1169, v36); + Tile v1170 = Tile(v1164); + __ubuf__ bfloat16_t* v1171 = v1169.data(); + uint64_t v1172 = reinterpret_cast(v1171); + TASSIGN(v1170, v1172); + Tile v1173; + TASSIGN(v1173, v42); + Tile v1174 = Tile(v1164); + __ubuf__ float* v1175 = v1173.data(); + uint64_t v1176 = reinterpret_cast(v1175); + TASSIGN(v1174, v1176); + Tile v1177; + TASSIGN(v1177, v43); + Tile v1178; + __ubuf__ float* v1179 = v1177.data(); + uint64_t v1180 = reinterpret_cast(v1179); + TASSIGN(v1178, v1180); + Tile v1181; + TASSIGN(v1181, v44); + Tile v1182 = Tile(v1164); + __ubuf__ float* v1183 = v1181.data(); + uint64_t v1184 = reinterpret_cast(v1183); + TASSIGN(v1182, v1184); + Tile v1185; + TASSIGN(v1185, v45); + Tile v1186 = Tile(v1164); + __ubuf__ float* v1187 = v1185.data(); + uint64_t v1188 = reinterpret_cast(v1187); + TASSIGN(v1186, v1188); + Tile v1189; + TASSIGN(v1189, v36); + Tile v1190 = Tile(v1164); + __ubuf__ bfloat16_t* v1191 = v1189.data(); + uint64_t v1192 = reinterpret_cast(v1191); + TASSIGN(v1190, v1192); + unsigned v1193 = (unsigned) v1164; + unsigned v1194 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1195 = pto::Shape<1, 1, 1, 1, -1>(v1164); + pto::Stride<-1, -1, -1, -1, 1> v1196 = pto::Stride<-1, -1, -1, -1, 1>(v1194, v1194, v1194, v1194); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1197 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v1162 * (unsigned) v32), v1195, v1196); + TLOAD(v1170, v1197); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1174, v1170, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + 
TLOAD(v1178, v486); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1182, v1178); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1186, v1174, v1182); + pipe_barrier(PIPE_ALL); + TMOV(v1166, v1186); + unsigned v1198 = (unsigned) v1164; + unsigned v1199 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1200 = pto::Shape<1, 1, 1, 1, -1>(v1164); + pto::Stride<-1, -1, -1, -1, 1> v1201 = pto::Stride<-1, -1, -1, -1, 1>(v1199, v1199, v1199, v1199); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1202 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v1162 * (unsigned) v32), v1200, v1201); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1170, v1202); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1174, v1170, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1178, v538); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1182, v1178); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1186, v1174, v1182); + pipe_barrier(PIPE_ALL); + TADD(v1166, v1166, v1186); + unsigned v1203 = (unsigned) v1164; + unsigned v1204 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1205 = pto::Shape<1, 1, 1, 1, -1>(v1164); + pto::Stride<-1, -1, -1, -1, 1> v1206 = pto::Stride<-1, -1, -1, -1, 1>(v1204, v1204, v1204, v1204); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1207 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v1162 * (unsigned) v32), v1205, v1206); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1170, v1207); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1174, v1170, v27); + set_flag(PIPE_V, PIPE_MTE2, 
EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1178, v590); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1182, v1178); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1186, v1174, v1182); + pipe_barrier(PIPE_ALL); + TADD(v1166, v1166, v1186); + unsigned v1208 = (unsigned) v1164; + unsigned v1209 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1210 = pto::Shape<1, 1, 1, 1, -1>(v1164); + pto::Stride<-1, -1, -1, -1, 1> v1211 = pto::Stride<-1, -1, -1, -1, 1>(v1209, v1209, v1209, v1209); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1212 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1162 * (unsigned) v32), v1210, v1211); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1170, v1212); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1174, v1170, v27); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1178, v642); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1182, v1178); + pipe_barrier(PIPE_ALL); + TMUL(v1186, v1174, v1182); + pipe_barrier(PIPE_ALL); + TADD(v1166, v1166, v1186); + pipe_barrier(PIPE_ALL); + TCVT(v1190, v1166, v27); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v1213 = (unsigned) v1164; + unsigned v1214 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1215 = pto::Shape<1, 1, 1, 1, -1>(v1164); + pto::Stride<-1, -1, -1, -1, 1> v1216 = pto::Stride<-1, -1, -1, -1, 1>(v1214, v1214, v1214, v1214); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1217 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v7 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v1162 * (unsigned) v32), v1215, v1216); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v1217, v1190); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, 
EVENT_ID6); + for (size_t v1218 = v46; v1218 < v57; v1218 += v47) { + int32_t v1219 = (int32_t) ((uint32_t) ((int32_t) v1218) * (uint32_t) v31); + int32_t v1220 = (int32_t) ((uint32_t) v11 - (uint32_t) v1219); + int32_t v1221 = (uint32_t) v1220 < (uint32_t) v31 ? v1220 : v31; + Tile v1222; + TASSIGN(v1222, v41); + Tile v1223 = Tile(v1221); + __ubuf__ float* v1224 = v1222.data(); + uint64_t v1225 = reinterpret_cast(v1224); + TASSIGN(v1223, v1225); + Tile v1226; + TASSIGN(v1226, v36); + Tile v1227 = Tile(v1221); + __ubuf__ bfloat16_t* v1228 = v1226.data(); + uint64_t v1229 = reinterpret_cast(v1228); + TASSIGN(v1227, v1229); + Tile v1230; + TASSIGN(v1230, v42); + Tile v1231 = Tile(v1221); + __ubuf__ float* v1232 = v1230.data(); + uint64_t v1233 = reinterpret_cast(v1232); + TASSIGN(v1231, v1233); + Tile v1234; + TASSIGN(v1234, v43); + Tile v1235; + __ubuf__ float* v1236 = v1234.data(); + uint64_t v1237 = reinterpret_cast(v1236); + TASSIGN(v1235, v1237); + Tile v1238; + TASSIGN(v1238, v44); + Tile v1239 = Tile(v1221); + __ubuf__ float* v1240 = v1238.data(); + uint64_t v1241 = reinterpret_cast(v1240); + TASSIGN(v1239, v1241); + Tile v1242; + TASSIGN(v1242, v45); + Tile v1243 = Tile(v1221); + __ubuf__ float* v1244 = v1242.data(); + uint64_t v1245 = reinterpret_cast(v1244); + TASSIGN(v1243, v1245); + Tile v1246; + TASSIGN(v1246, v36); + Tile v1247 = Tile(v1221); + __ubuf__ bfloat16_t* v1248 = v1246.data(); + uint64_t v1249 = reinterpret_cast(v1248); + TASSIGN(v1247, v1249); + unsigned v1250 = (unsigned) v1221; + unsigned v1251 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1252 = pto::Shape<1, 1, 1, 1, -1>(v1221); + pto::Stride<-1, -1, -1, -1, 1> v1253 = pto::Stride<-1, -1, -1, -1, 1>(v1251, v1251, v1251, v1251); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1254 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v1219 * (unsigned) v32), v1252, v1253); + TLOAD(v1227, v1254); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1231, v1227, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1235, v694); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1239, v1235); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1243, v1231, v1239); + pipe_barrier(PIPE_ALL); + TMOV(v1223, v1243); + unsigned v1255 = (unsigned) v1221; + unsigned v1256 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1257 = pto::Shape<1, 1, 1, 1, -1>(v1221); + pto::Stride<-1, -1, -1, -1, 1> v1258 = pto::Stride<-1, -1, -1, -1, 1>(v1256, v1256, v1256, v1256); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1259 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v1219 * (unsigned) v32), v1257, v1258); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1227, v1259); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1231, v1227, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1235, v746); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1239, v1235); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1243, v1231, v1239); + pipe_barrier(PIPE_ALL); + TADD(v1223, v1223, v1243); + unsigned v1260 = (unsigned) v1221; + unsigned v1261 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1262 = pto::Shape<1, 1, 1, 1, -1>(v1221); + pto::Stride<-1, -1, -1, -1, 1> v1263 = pto::Stride<-1, -1, -1, -1, 1>(v1261, v1261, v1261, v1261); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1264 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v1219 * (unsigned) v32), v1262, v1263); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1227, 
v1264); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1231, v1227, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1235, v798); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1239, v1235); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1243, v1231, v1239); + pipe_barrier(PIPE_ALL); + TADD(v1223, v1223, v1243); + unsigned v1265 = (unsigned) v1221; + unsigned v1266 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1267 = pto::Shape<1, 1, 1, 1, -1>(v1221); + pto::Stride<-1, -1, -1, -1, 1> v1268 = pto::Stride<-1, -1, -1, -1, 1>(v1266, v1266, v1266, v1266); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1269 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1219 * (unsigned) v32), v1267, v1268); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1227, v1269); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1231, v1227, v27); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1235, v850); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1239, v1235); + pipe_barrier(PIPE_ALL); + TMUL(v1243, v1231, v1239); + pipe_barrier(PIPE_ALL); + TADD(v1223, v1223, v1243); + pipe_barrier(PIPE_ALL); + TCVT(v1247, v1223, v27); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v1270 = (unsigned) v1221; + unsigned v1271 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1272 = pto::Shape<1, 1, 1, 1, -1>(v1221); + pto::Stride<-1, -1, -1, -1, 1> v1273 = pto::Stride<-1, -1, -1, -1, 1>(v1271, v1271, v1271, v1271); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1274 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v7 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v1219 * (unsigned) v32), 
v1272, v1273); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v1274, v1247); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + for (size_t v1275 = v46; v1275 < v57; v1275 += v47) { + int32_t v1276 = (int32_t) ((uint32_t) ((int32_t) v1275) * (uint32_t) v31); + int32_t v1277 = (int32_t) ((uint32_t) v11 - (uint32_t) v1276); + int32_t v1278 = (uint32_t) v1277 < (uint32_t) v31 ? v1277 : v31; + Tile v1279; + TASSIGN(v1279, v41); + Tile v1280 = Tile(v1278); + __ubuf__ float* v1281 = v1279.data(); + uint64_t v1282 = reinterpret_cast(v1281); + TASSIGN(v1280, v1282); + Tile v1283; + TASSIGN(v1283, v36); + Tile v1284 = Tile(v1278); + __ubuf__ bfloat16_t* v1285 = v1283.data(); + uint64_t v1286 = reinterpret_cast(v1285); + TASSIGN(v1284, v1286); + Tile v1287; + TASSIGN(v1287, v42); + Tile v1288 = Tile(v1278); + __ubuf__ float* v1289 = v1287.data(); + uint64_t v1290 = reinterpret_cast(v1289); + TASSIGN(v1288, v1290); + Tile v1291; + TASSIGN(v1291, v43); + Tile v1292; + __ubuf__ float* v1293 = v1291.data(); + uint64_t v1294 = reinterpret_cast(v1293); + TASSIGN(v1292, v1294); + Tile v1295; + TASSIGN(v1295, v44); + Tile v1296 = Tile(v1278); + __ubuf__ float* v1297 = v1295.data(); + uint64_t v1298 = reinterpret_cast(v1297); + TASSIGN(v1296, v1298); + Tile v1299; + TASSIGN(v1299, v45); + Tile v1300 = Tile(v1278); + __ubuf__ float* v1301 = v1299.data(); + uint64_t v1302 = reinterpret_cast(v1301); + TASSIGN(v1300, v1302); + Tile v1303; + TASSIGN(v1303, v36); + Tile v1304 = Tile(v1278); + __ubuf__ bfloat16_t* v1305 = v1303.data(); + uint64_t v1306 = reinterpret_cast(v1305); + TASSIGN(v1304, v1306); + unsigned v1307 = (unsigned) v1278; + unsigned v1308 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1309 = pto::Shape<1, 1, 1, 1, -1>(v1278); + pto::Stride<-1, -1, -1, -1, 1> v1310 = pto::Stride<-1, -1, -1, -1, 1>(v1308, v1308, v1308, v1308); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1311 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 
1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v1276 * (unsigned) v32), v1309, v1310); + TLOAD(v1284, v1311); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1288, v1284, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1292, v902); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1296, v1292); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1300, v1288, v1296); + pipe_barrier(PIPE_ALL); + TMOV(v1280, v1300); + unsigned v1312 = (unsigned) v1278; + unsigned v1313 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1314 = pto::Shape<1, 1, 1, 1, -1>(v1278); + pto::Stride<-1, -1, -1, -1, 1> v1315 = pto::Stride<-1, -1, -1, -1, 1>(v1313, v1313, v1313, v1313); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1316 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v1276 * (unsigned) v32), v1314, v1315); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1284, v1316); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1288, v1284, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1292, v954); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1296, v1292); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1300, v1288, v1296); + pipe_barrier(PIPE_ALL); + TADD(v1280, v1280, v1300); + unsigned v1317 = (unsigned) v1278; + unsigned v1318 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1319 = pto::Shape<1, 1, 1, 1, -1>(v1278); + pto::Stride<-1, -1, -1, -1, 1> v1320 = pto::Stride<-1, -1, -1, -1, 1>(v1318, v1318, v1318, v1318); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1321 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + 
(unsigned) v166 * (unsigned) v11 + (unsigned) v1276 * (unsigned) v32), v1319, v1320); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1284, v1321); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1288, v1284, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1292, v1006); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1296, v1292); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1300, v1288, v1296); + pipe_barrier(PIPE_ALL); + TADD(v1280, v1280, v1300); + unsigned v1322 = (unsigned) v1278; + unsigned v1323 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1324 = pto::Shape<1, 1, 1, 1, -1>(v1278); + pto::Stride<-1, -1, -1, -1, 1> v1325 = pto::Stride<-1, -1, -1, -1, 1>(v1323, v1323, v1323, v1323); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1326 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1276 * (unsigned) v32), v1324, v1325); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1284, v1326); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1288, v1284, v27); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1292, v1058); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1296, v1292); + pipe_barrier(PIPE_ALL); + TMUL(v1300, v1288, v1296); + pipe_barrier(PIPE_ALL); + TADD(v1280, v1280, v1300); + pipe_barrier(PIPE_ALL); + TCVT(v1304, v1280, v27); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v1327 = (unsigned) v1278; + unsigned v1328 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1329 = pto::Shape<1, 1, 1, 1, -1>(v1278); + pto::Stride<-1, -1, -1, -1, 1> v1330 = pto::Stride<-1, -1, -1, -1, 1>(v1328, v1328, v1328, v1328); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1331 = 
GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v7 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1276 * (unsigned) v32), v1329, v1330); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v1331, v1304); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + for (size_t v1332 = v46; v1332 < v57; v1332 += v47) { + int32_t v1333 = (int32_t) ((uint32_t) ((int32_t) v1332) * (uint32_t) v31); + int32_t v1334 = (int32_t) ((uint32_t) v11 - (uint32_t) v1333); + int32_t v1335 = (uint32_t) v1334 < (uint32_t) v31 ? v1334 : v31; + Tile v1336; + TASSIGN(v1336, v41); + Tile v1337 = Tile(v1335); + __ubuf__ float* v1338 = v1336.data(); + uint64_t v1339 = reinterpret_cast(v1338); + TASSIGN(v1337, v1339); + Tile v1340; + TASSIGN(v1340, v36); + Tile v1341 = Tile(v1335); + __ubuf__ bfloat16_t* v1342 = v1340.data(); + uint64_t v1343 = reinterpret_cast(v1342); + TASSIGN(v1341, v1343); + Tile v1344; + TASSIGN(v1344, v42); + Tile v1345 = Tile(v1335); + __ubuf__ float* v1346 = v1344.data(); + uint64_t v1347 = reinterpret_cast(v1346); + TASSIGN(v1345, v1347); + Tile v1348; + TASSIGN(v1348, v43); + Tile v1349; + __ubuf__ float* v1350 = v1348.data(); + uint64_t v1351 = reinterpret_cast(v1350); + TASSIGN(v1349, v1351); + Tile v1352; + TASSIGN(v1352, v44); + Tile v1353 = Tile(v1335); + __ubuf__ float* v1354 = v1352.data(); + uint64_t v1355 = reinterpret_cast(v1354); + TASSIGN(v1353, v1355); + Tile v1356; + TASSIGN(v1356, v45); + Tile v1357 = Tile(v1335); + __ubuf__ float* v1358 = v1356.data(); + uint64_t v1359 = reinterpret_cast(v1358); + TASSIGN(v1357, v1359); + Tile v1360; + TASSIGN(v1360, v36); + Tile v1361 = Tile(v1335); + __ubuf__ bfloat16_t* v1362 = v1360.data(); + uint64_t v1363 = reinterpret_cast(v1362); + TASSIGN(v1361, v1363); + unsigned v1364 = (unsigned) v1335; + unsigned v1365 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1366 = pto::Shape<1, 1, 1, 1, -1>(v1335); + pto::Stride<-1, -1, -1, -1, 1> v1367 = pto::Stride<-1, -1, -1, -1, 
1>(v1365, v1365, v1365, v1365); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1368 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v60 * (unsigned) v11 + (unsigned) v1333 * (unsigned) v32), v1366, v1367); + TLOAD(v1341, v1368); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1345, v1341, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1349, v67); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1353, v1349); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1357, v1345, v1353); + pipe_barrier(PIPE_ALL); + TMOV(v1337, v1357); + unsigned v1369 = (unsigned) v1335; + unsigned v1370 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1371 = pto::Shape<1, 1, 1, 1, -1>(v1335); + pto::Stride<-1, -1, -1, -1, 1> v1372 = pto::Stride<-1, -1, -1, -1, 1>(v1370, v1370, v1370, v1370); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1373 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v113 * (unsigned) v11 + (unsigned) v1333 * (unsigned) v32), v1371, v1372); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1341, v1373); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1345, v1341, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1349, v120); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1353, v1349); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1357, v1345, v1353); + pipe_barrier(PIPE_ALL); + TADD(v1337, v1337, v1357); + unsigned v1374 = (unsigned) v1335; + unsigned v1375 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1376 = pto::Shape<1, 1, 1, 1, -1>(v1335); + pto::Stride<-1, -1, -1, -1, 1> v1377 = pto::Stride<-1, -1, -1, -1, 1>(v1375, v1375, v1375, v1375); + 
GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1378 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v166 * (unsigned) v11 + (unsigned) v1333 * (unsigned) v32), v1376, v1377); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1341, v1378); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1345, v1341, v27); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1349, v173); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1353, v1349); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v1357, v1345, v1353); + pipe_barrier(PIPE_ALL); + TADD(v1337, v1337, v1357); + unsigned v1379 = (unsigned) v1335; + unsigned v1380 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1381 = pto::Shape<1, 1, 1, 1, -1>(v1335); + pto::Stride<-1, -1, -1, -1, 1> v1382 = pto::Stride<-1, -1, -1, -1, 1>(v1380, v1380, v1380, v1380); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1383 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v28 + (unsigned) v219 * (unsigned) v11 + (unsigned) v1333 * (unsigned) v32), v1381, v1382); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v1341, v1383); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1345, v1341, v27); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v1349, v226); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v1353, v1349); + pipe_barrier(PIPE_ALL); + TMUL(v1357, v1345, v1353); + pipe_barrier(PIPE_ALL); + TADD(v1337, v1337, v1357); + pipe_barrier(PIPE_ALL); + TCVT(v1361, v1337, v27); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v1384 = (unsigned) v1335; + unsigned v1385 = (unsigned) v11; + pto::Shape<1, 1, 1, 1, -1> v1386 = pto::Shape<1, 1, 1, 1, -1>(v1335); + pto::Stride<-1, -1, -1, -1, 1> 
v1387 = pto::Stride<-1, -1, -1, -1, 1>(v1385, v1385, v1385, v1385); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v1388 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v9 + (v28 + (unsigned) v59 * (unsigned) v11 + (unsigned) v1333 * (unsigned) v32), v1386, v1387); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v1388, v1361); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/post_fwd.cpp b/kernels/manual/a5/mhc/post_fwd.cpp new file mode 100644 index 000000000..faafdcc76 --- /dev/null +++ b/kernels/manual/a5/mhc/post_fwd.cpp @@ -0,0 +1,883 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +/* Selects the end-of-kernel synchronization that ptoas_auto_sync_tail emits. */ enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, /* full barrier: pipe_barrier(PIPE_ALL) */ + kSetWaitMte3ToSEvent0 = 1, /* set_flag then wait_flag on PIPE_MTE3 -> PIPE_S with EVENT_ID0 */ +}; + +/* Emits the tail synchronization chosen by mode. The default argument, kBarrierAll, and any unlisted value all take the pipe_barrier(PIPE_ALL) path. */ static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: /* NOTE(review): presumably lets the scalar pipe wait only for outstanding MTE3 work instead of stalling every pipe; confirm against the intrinsic documentation. */ + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: /* kBarrierAll shares this path with unrecognized modes */ + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_post_fwd_m4(__gm__ float* v1, __gm__ bfloat16_t* v2, __gm__ float* v3, __gm__ bfloat16_t* v4, __gm__ bfloat16_t* v5, int32_t v6, int32_t v7) { + unsigned v8 = 15; + unsigned v9 = 11; + unsigned v10 = 7; + unsigned v11 = 3; + unsigned v12 = 14; + unsigned v13 = 10; + unsigned v14 = 6; + unsigned v15 = 2; + unsigned v16 
= 13; + unsigned v17 = 9; + unsigned v18 = 5; + unsigned v19 = 1; + unsigned v20 = 12; + unsigned v21 = 8; + unsigned v22 = 4; + RoundMode v23 = RoundMode::CAST_RINT; + unsigned v24 = 0; + const int32_t v25 = 16; + const int32_t v26 = 4; + const int32_t v27 = 1024; + const int32_t v28 = 1; + const int32_t v29 = 0; + const int32_t v30 = 2; + const int32_t v31 = 3; + const int64_t v32 = 0; + const int64_t v33 = 256; + const int64_t v34 = 16640; + const int64_t v35 = 114944; + const int64_t v36 = 49408; + const int64_t v37 = 82176; + const int64_t v38 = 115200; + using T = float; + size_t v39 = (size_t) v29; + size_t v40 = (size_t) v28; + int32_t v41 = (int32_t) ((uint32_t) v6 * (uint32_t) v26); + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v42 = get_block_idx(); + int64_t v43 = get_block_num(); + int32_t v44 = (int32_t) ((int64_t) v43); + int32_t v45 = v6 / v44; + int32_t v46 = v6 % v44 != v29 && v6 < v29 == v44 < v29 ? v45 + v28 : v45; + int32_t v47 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v42) * (uint32_t) v46); + int32_t v48 = (int32_t) ((uint32_t) v47 + (uint32_t) v46); + int32_t v49 = v7 / v27; + size_t v50 = (size_t) (v7 % v27 != v29 && v7 < v29 == v27 < v29 ? v49 + v28 : v49); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + for (size_t v51 = (size_t) v47; v51 < ((size_t) ((uint32_t) v48 < (uint32_t) v6 ? 
v48 : v6)); v51 += v40) { + int32_t v52 = (int32_t) v51; + int32_t v53 = (int32_t) ((uint32_t) v52 * (uint32_t) v26); + Tile v54; + TASSIGN(v54, v32); + Tile v55; + __ubuf__ float* v56 = v54.data(); + uint64_t v57 = reinterpret_cast(v56); + TASSIGN(v55, v57); + pto::Shape<1, 1, 1, 1, 1> v58 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v59 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v60 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v24 + (unsigned) v52 * (unsigned) v26 + v24 * (unsigned) v28), v58, v59); + pipe_barrier(PIPE_MTE2); + TLOAD(v55, v60); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + for (size_t v61 = v39; v61 < v50; v61 += v40) { + int32_t v62 = (int32_t) ((uint32_t) ((int32_t) v61) * (uint32_t) v27); + int32_t v63 = (int32_t) ((uint32_t) v7 - (uint32_t) v62); + int32_t v64 = (uint32_t) v63 < (uint32_t) v27 ? 
v63 : v27; + Tile v65; + TASSIGN(v65, v33); + Tile v66 = Tile(v64); + __ubuf__ bfloat16_t* v67 = v65.data(); + uint64_t v68 = reinterpret_cast(v67); + TASSIGN(v66, v68); + Tile v69; + TASSIGN(v69, v34); + Tile v70 = Tile(v64); + __ubuf__ float* v71 = v69.data(); + uint64_t v72 = reinterpret_cast(v71); + TASSIGN(v70, v72); + Tile v73; + TASSIGN(v73, v33); + Tile v74 = Tile(v64); + __ubuf__ bfloat16_t* v75 = v73.data(); + uint64_t v76 = reinterpret_cast(v75); + TASSIGN(v74, v76); + Tile v77; + TASSIGN(v77, v34); + Tile v78 = Tile(v64); + __ubuf__ float* v79 = v77.data(); + uint64_t v80 = reinterpret_cast(v79); + TASSIGN(v78, v80); + Tile v81; + TASSIGN(v81, v35); + Tile v82; + __ubuf__ float* v83 = v81.data(); + uint64_t v84 = reinterpret_cast(v83); + TASSIGN(v82, v84); + Tile v85; + TASSIGN(v85, v36); + Tile v86 = Tile(v64); + __ubuf__ float* v87 = v85.data(); + uint64_t v88 = reinterpret_cast(v87); + TASSIGN(v86, v88); + Tile v89; + TASSIGN(v89, v37); + Tile v90 = Tile(v64); + __ubuf__ float* v91 = v89.data(); + uint64_t v92 = reinterpret_cast(v91); + TASSIGN(v90, v92); + Tile v93; + TASSIGN(v93, v38); + Tile v94 = Tile(v64); + __ubuf__ float* v95 = v93.data(); + uint64_t v96 = reinterpret_cast(v95); + TASSIGN(v94, v96); + Tile v97; + TASSIGN(v97, v33); + Tile v98 = Tile(v64); + __ubuf__ bfloat16_t* v99 = v97.data(); + uint64_t v100 = reinterpret_cast(v99); + TASSIGN(v98, v100); + unsigned v101 = (unsigned) v64; + unsigned v102 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v103 = pto::Shape<1, 1, 1, 1, -1>(v64); + pto::Stride<-1, -1, -1, -1, 1> v104 = pto::Stride<-1, -1, -1, -1, 1>(v102, v102, v102, v102); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v105 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v24 + (unsigned) v52 * (unsigned) v7 + (unsigned) v62 * (unsigned) v28), v103, v104); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + TLOAD(v66, v105); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + TCVT(v70, v66, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TROWEXPAND(v86, v55); + pipe_barrier(PIPE_ALL); + TMUL(v90, v70, v86); + unsigned v106 = (unsigned) v64; + unsigned v107 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v108 = pto::Shape<1, 1, 1, 1, -1>(v64); + pto::Stride<-1, -1, -1, -1, 1> v109 = pto::Stride<-1, -1, -1, -1, 1>(v107, v107, v107, v107); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v110 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v53 * (unsigned) v7 + (unsigned) v62 * (unsigned) v28), v108, v109); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v74, v110); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TCVT(v78, v74, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v111 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v112 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v113 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v24 * (unsigned) v28), v111, v112); + TLOAD(v82, v113); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TROWEXPAND(v86, v82); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v94, v78, v86); + pipe_barrier(PIPE_ALL); + TADD(v90, v90, v94); + unsigned v114 = (unsigned) v64; + unsigned v115 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v116 = pto::Shape<1, 1, 1, 1, -1>(v64); + pto::Stride<-1, -1, -1, -1, 1> v117 = pto::Stride<-1, -1, -1, -1, 1>(v115, v115, v115, v115); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v118 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) ((int32_t) (uint32_t) v53 + (uint32_t) v28) * (unsigned) v7 + (unsigned) v62 * (unsigned) v28), v116, v117); + wait_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID1); + TLOAD(v74, v118); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TCVT(v78, v74, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1> v119 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v120 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v121 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v22 * (unsigned) v28), v119, v120); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v82, v121); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + TROWEXPAND(v86, v82); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TMUL(v94, v78, v86); + pipe_barrier(PIPE_ALL); + TADD(v90, v90, v94); + unsigned v122 = (unsigned) v64; + unsigned v123 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v124 = pto::Shape<1, 1, 1, 1, -1>(v64); + pto::Stride<-1, -1, -1, -1, 1> v125 = pto::Stride<-1, -1, -1, -1, 1>(v123, v123, v123, v123); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v126 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) ((int32_t) (uint32_t) v53 + (uint32_t) v30) * (unsigned) v7 + (unsigned) v62 * (unsigned) v28), v124, v125); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + TLOAD(v74, v126); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + TCVT(v78, v74, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + pto::Shape<1, 1, 1, 1, 1> v127 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v128 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v129 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v21 * (unsigned) v28), v127, v128); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v82, v129); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + TROWEXPAND(v86, v82); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TMUL(v94, v78, v86); + pipe_barrier(PIPE_ALL); + TADD(v90, v90, v94); + unsigned v130 = (unsigned) v64; + unsigned v131 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v132 = pto::Shape<1, 1, 1, 1, -1>(v64); + pto::Stride<-1, -1, -1, -1, 1> v133 = pto::Stride<-1, -1, -1, -1, 1>(v131, v131, v131, v131); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v134 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) ((int32_t) (uint32_t) v53 + (uint32_t) v31) * (unsigned) v7 + (unsigned) v62 * (unsigned) v28), v132, v133); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + TLOAD(v74, v134); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v78, v74, v23); + pto::Shape<1, 1, 1, 1, 1> v135 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v136 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v137 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v20 * (unsigned) v28), v135, v136); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v82, v137); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v86, v82); + pipe_barrier(PIPE_ALL); + TMUL(v94, v78, v86); + pipe_barrier(PIPE_ALL); + TADD(v90, v90, v94); + pipe_barrier(PIPE_ALL); + TCVT(v98, v90, v23); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v138 = (unsigned) v64; + unsigned v139 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v140 = pto::Shape<1, 1, 1, 1, -1>(v64); + pto::Stride<-1, -1, -1, -1, 1> v141 = pto::Stride<-1, -1, -1, -1, 1>(v139, v139, v139, v139); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v142 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, 
pto::Layout::ND>(v5 + (v24 + (unsigned) v53 * (unsigned) v7 + (unsigned) v62 * (unsigned) v28), v140, v141); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v142, v98); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + int32_t v143 = (int32_t) ((uint32_t) v53 + (uint32_t) v28); + Tile v144; + TASSIGN(v144, v32); + Tile v145; + __ubuf__ float* v146 = v144.data(); + uint64_t v147 = reinterpret_cast(v146); + TASSIGN(v145, v147); + pto::Shape<1, 1, 1, 1, 1> v148 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v149 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v150 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v24 + (unsigned) v52 * (unsigned) v26 + v19 * (unsigned) v28), v148, v149); + pipe_barrier(PIPE_MTE2); + TLOAD(v145, v150); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + for (size_t v151 = v39; v151 < v50; v151 += v40) { + int32_t v152 = (int32_t) ((uint32_t) ((int32_t) v151) * (uint32_t) v27); + int32_t v153 = (int32_t) ((uint32_t) v7 - (uint32_t) v152); + int32_t v154 = (uint32_t) v153 < (uint32_t) v27 ? 
v153 : v27; + Tile v155; + TASSIGN(v155, v33); + Tile v156 = Tile(v154); + __ubuf__ bfloat16_t* v157 = v155.data(); + uint64_t v158 = reinterpret_cast(v157); + TASSIGN(v156, v158); + Tile v159; + TASSIGN(v159, v34); + Tile v160 = Tile(v154); + __ubuf__ float* v161 = v159.data(); + uint64_t v162 = reinterpret_cast(v161); + TASSIGN(v160, v162); + Tile v163; + TASSIGN(v163, v33); + Tile v164 = Tile(v154); + __ubuf__ bfloat16_t* v165 = v163.data(); + uint64_t v166 = reinterpret_cast(v165); + TASSIGN(v164, v166); + Tile v167; + TASSIGN(v167, v34); + Tile v168 = Tile(v154); + __ubuf__ float* v169 = v167.data(); + uint64_t v170 = reinterpret_cast(v169); + TASSIGN(v168, v170); + Tile v171; + TASSIGN(v171, v35); + Tile v172; + __ubuf__ float* v173 = v171.data(); + uint64_t v174 = reinterpret_cast(v173); + TASSIGN(v172, v174); + Tile v175; + TASSIGN(v175, v36); + Tile v176 = Tile(v154); + __ubuf__ float* v177 = v175.data(); + uint64_t v178 = reinterpret_cast(v177); + TASSIGN(v176, v178); + Tile v179; + TASSIGN(v179, v37); + Tile v180 = Tile(v154); + __ubuf__ float* v181 = v179.data(); + uint64_t v182 = reinterpret_cast(v181); + TASSIGN(v180, v182); + Tile v183; + TASSIGN(v183, v38); + Tile v184 = Tile(v154); + __ubuf__ float* v185 = v183.data(); + uint64_t v186 = reinterpret_cast(v185); + TASSIGN(v184, v186); + Tile v187; + TASSIGN(v187, v33); + Tile v188 = Tile(v154); + __ubuf__ bfloat16_t* v189 = v187.data(); + uint64_t v190 = reinterpret_cast(v189); + TASSIGN(v188, v190); + unsigned v191 = (unsigned) v154; + unsigned v192 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v193 = pto::Shape<1, 1, 1, 1, -1>(v154); + pto::Stride<-1, -1, -1, -1, 1> v194 = pto::Stride<-1, -1, -1, -1, 1>(v192, v192, v192, v192); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v195 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v24 + (unsigned) v52 * (unsigned) v7 + (unsigned) v152 * (unsigned) v28), v193, v194); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + 
TLOAD(v156, v195); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v160, v156, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + TROWEXPAND(v176, v145); + pipe_barrier(PIPE_ALL); + TMUL(v180, v160, v176); + unsigned v196 = (unsigned) v154; + unsigned v197 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v198 = pto::Shape<1, 1, 1, 1, -1>(v154); + pto::Stride<-1, -1, -1, -1, 1> v199 = pto::Stride<-1, -1, -1, -1, 1>(v197, v197, v197, v197); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v200 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v53 * (unsigned) v7 + (unsigned) v152 * (unsigned) v28), v198, v199); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + TLOAD(v164, v200); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TCVT(v168, v164, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v201 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v202 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v203 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v19 * (unsigned) v28), v201, v202); + TLOAD(v172, v203); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v176, v172); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v184, v168, v176); + pipe_barrier(PIPE_ALL); + TADD(v180, v180, v184); + unsigned v204 = (unsigned) v154; + unsigned v205 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v206 = pto::Shape<1, 1, 1, 1, -1>(v154); + pto::Stride<-1, -1, -1, -1, 1> v207 = pto::Stride<-1, -1, -1, -1, 1>(v205, v205, v205, v205); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v208 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v143 * (unsigned) v7 
+ (unsigned) v152 * (unsigned) v28), v206, v207); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v164, v208); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v168, v164, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v209 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v210 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v211 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v18 * (unsigned) v28), v209, v210); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v172, v211); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v176, v172); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v184, v168, v176); + pipe_barrier(PIPE_ALL); + TADD(v180, v180, v184); + unsigned v212 = (unsigned) v154; + unsigned v213 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v214 = pto::Shape<1, 1, 1, 1, -1>(v154); + pto::Stride<-1, -1, -1, -1, 1> v215 = pto::Stride<-1, -1, -1, -1, 1>(v213, v213, v213, v213); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v216 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) ((int32_t) (uint32_t) v53 + (uint32_t) v30) * (unsigned) v7 + (unsigned) v152 * (unsigned) v28), v214, v215); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v164, v216); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v168, v164, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v217 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v218 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v219 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v17 * (unsigned) 
v28), v217, v218); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v172, v219); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v176, v172); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v184, v168, v176); + pipe_barrier(PIPE_ALL); + TADD(v180, v180, v184); + unsigned v220 = (unsigned) v154; + unsigned v221 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v222 = pto::Shape<1, 1, 1, 1, -1>(v154); + pto::Stride<-1, -1, -1, -1, 1> v223 = pto::Stride<-1, -1, -1, -1, 1>(v221, v221, v221, v221); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v224 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) ((int32_t) (uint32_t) v53 + (uint32_t) v31) * (unsigned) v7 + (unsigned) v152 * (unsigned) v28), v222, v223); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v164, v224); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v168, v164, v23); + pto::Shape<1, 1, 1, 1, 1> v225 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v226 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v227 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v16 * (unsigned) v28), v225, v226); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v172, v227); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v176, v172); + pipe_barrier(PIPE_ALL); + TMUL(v184, v168, v176); + pipe_barrier(PIPE_ALL); + TADD(v180, v180, v184); + pipe_barrier(PIPE_ALL); + TCVT(v188, v180, v23); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + unsigned v228 = (unsigned) v154; + unsigned v229 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v230 = pto::Shape<1, 1, 1, 1, -1>(v154); + pto::Stride<-1, -1, -1, -1, 1> v231 = pto::Stride<-1, -1, -1, -1, 1>(v229, v229, v229, v229); + GlobalTensor, 
pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v232 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v5 + (v24 + (unsigned) v143 * (unsigned) v7 + (unsigned) v152 * (unsigned) v28), v230, v231); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pipe_barrier(PIPE_MTE3); + TSTORE(v232, v188); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + int32_t v233 = (int32_t) ((uint32_t) v53 + (uint32_t) v30); + Tile v234; + TASSIGN(v234, v32); + Tile v235; + __ubuf__ float* v236 = v234.data(); + uint64_t v237 = reinterpret_cast(v236); + TASSIGN(v235, v237); + pto::Shape<1, 1, 1, 1, 1> v238 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v239 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v240 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v24 + (unsigned) v52 * (unsigned) v26 + v15 * (unsigned) v28), v238, v239); + pipe_barrier(PIPE_MTE2); + TLOAD(v235, v240); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + for (size_t v241 = v39; v241 < v50; v241 += v40) { + int32_t v242 = (int32_t) ((uint32_t) ((int32_t) v241) * (uint32_t) v27); + int32_t v243 = (int32_t) ((uint32_t) v7 - (uint32_t) v242); + int32_t v244 = (uint32_t) v243 < (uint32_t) v27 ? 
v243 : v27; + Tile v245; + TASSIGN(v245, v33); + Tile v246 = Tile(v244); + __ubuf__ bfloat16_t* v247 = v245.data(); + uint64_t v248 = reinterpret_cast(v247); + TASSIGN(v246, v248); + Tile v249; + TASSIGN(v249, v34); + Tile v250 = Tile(v244); + __ubuf__ float* v251 = v249.data(); + uint64_t v252 = reinterpret_cast(v251); + TASSIGN(v250, v252); + Tile v253; + TASSIGN(v253, v33); + Tile v254 = Tile(v244); + __ubuf__ bfloat16_t* v255 = v253.data(); + uint64_t v256 = reinterpret_cast(v255); + TASSIGN(v254, v256); + Tile v257; + TASSIGN(v257, v34); + Tile v258 = Tile(v244); + __ubuf__ float* v259 = v257.data(); + uint64_t v260 = reinterpret_cast(v259); + TASSIGN(v258, v260); + Tile v261; + TASSIGN(v261, v35); + Tile v262; + __ubuf__ float* v263 = v261.data(); + uint64_t v264 = reinterpret_cast(v263); + TASSIGN(v262, v264); + Tile v265; + TASSIGN(v265, v36); + Tile v266 = Tile(v244); + __ubuf__ float* v267 = v265.data(); + uint64_t v268 = reinterpret_cast(v267); + TASSIGN(v266, v268); + Tile v269; + TASSIGN(v269, v37); + Tile v270 = Tile(v244); + __ubuf__ float* v271 = v269.data(); + uint64_t v272 = reinterpret_cast(v271); + TASSIGN(v270, v272); + Tile v273; + TASSIGN(v273, v38); + Tile v274 = Tile(v244); + __ubuf__ float* v275 = v273.data(); + uint64_t v276 = reinterpret_cast(v275); + TASSIGN(v274, v276); + Tile v277; + TASSIGN(v277, v33); + Tile v278 = Tile(v244); + __ubuf__ bfloat16_t* v279 = v277.data(); + uint64_t v280 = reinterpret_cast(v279); + TASSIGN(v278, v280); + unsigned v281 = (unsigned) v244; + unsigned v282 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v283 = pto::Shape<1, 1, 1, 1, -1>(v244); + pto::Stride<-1, -1, -1, -1, 1> v284 = pto::Stride<-1, -1, -1, -1, 1>(v282, v282, v282, v282); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v285 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v24 + (unsigned) v52 * (unsigned) v7 + (unsigned) v242 * (unsigned) v28), v283, v284); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + 
TLOAD(v246, v285); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v250, v246, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TROWEXPAND(v266, v235); + pipe_barrier(PIPE_ALL); + TMUL(v270, v250, v266); + unsigned v286 = (unsigned) v244; + unsigned v287 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v288 = pto::Shape<1, 1, 1, 1, -1>(v244); + pto::Stride<-1, -1, -1, -1, 1> v289 = pto::Stride<-1, -1, -1, -1, 1>(v287, v287, v287, v287); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v290 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v53 * (unsigned) v7 + (unsigned) v242 * (unsigned) v28), v288, v289); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v254, v290); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TCVT(v258, v254, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v291 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v292 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v293 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v15 * (unsigned) v28), v291, v292); + TLOAD(v262, v293); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v266, v262); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v274, v258, v266); + pipe_barrier(PIPE_ALL); + TADD(v270, v270, v274); + unsigned v294 = (unsigned) v244; + unsigned v295 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v296 = pto::Shape<1, 1, 1, 1, -1>(v244); + pto::Stride<-1, -1, -1, -1, 1> v297 = pto::Stride<-1, -1, -1, -1, 1>(v295, v295, v295, v295); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v298 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v143 * (unsigned) v7 
+ (unsigned) v242 * (unsigned) v28), v296, v297); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v254, v298); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v258, v254, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v299 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v300 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v301 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v14 * (unsigned) v28), v299, v300); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v262, v301); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v266, v262); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v274, v258, v266); + pipe_barrier(PIPE_ALL); + TADD(v270, v270, v274); + unsigned v302 = (unsigned) v244; + unsigned v303 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v304 = pto::Shape<1, 1, 1, 1, -1>(v244); + pto::Stride<-1, -1, -1, -1, 1> v305 = pto::Stride<-1, -1, -1, -1, 1>(v303, v303, v303, v303); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v306 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v233 * (unsigned) v7 + (unsigned) v242 * (unsigned) v28), v304, v305); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v254, v306); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v258, v254, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v307 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v308 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v309 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v13 * (unsigned) v28), v307, v308); + wait_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID1); + TLOAD(v262, v309); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v266, v262); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v274, v258, v266); + pipe_barrier(PIPE_ALL); + TADD(v270, v270, v274); + unsigned v310 = (unsigned) v244; + unsigned v311 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v312 = pto::Shape<1, 1, 1, 1, -1>(v244); + pto::Stride<-1, -1, -1, -1, 1> v313 = pto::Stride<-1, -1, -1, -1, 1>(v311, v311, v311, v311); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v314 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) ((int32_t) (uint32_t) v53 + (uint32_t) v31) * (unsigned) v7 + (unsigned) v242 * (unsigned) v28), v312, v313); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v254, v314); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v258, v254, v23); + pto::Shape<1, 1, 1, 1, 1> v315 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v316 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v317 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v12 * (unsigned) v28), v315, v316); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v262, v317); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v266, v262); + pipe_barrier(PIPE_ALL); + TMUL(v274, v258, v266); + pipe_barrier(PIPE_ALL); + TADD(v270, v270, v274); + pipe_barrier(PIPE_ALL); + TCVT(v278, v270, v23); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + unsigned v318 = (unsigned) v244; + unsigned v319 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v320 = pto::Shape<1, 1, 1, 1, -1>(v244); + pto::Stride<-1, -1, -1, -1, 1> v321 = pto::Stride<-1, -1, -1, -1, 1>(v319, v319, v319, v319); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, 
pto::Layout::ND> v322 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v5 + (v24 + (unsigned) v233 * (unsigned) v7 + (unsigned) v242 * (unsigned) v28), v320, v321); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pipe_barrier(PIPE_MTE3); + TSTORE(v322, v278); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + int32_t v323 = (int32_t) ((uint32_t) v53 + (uint32_t) v31); + Tile v324; + TASSIGN(v324, v32); + Tile v325; + __ubuf__ float* v326 = v324.data(); + uint64_t v327 = reinterpret_cast(v326); + TASSIGN(v325, v327); + pto::Shape<1, 1, 1, 1, 1> v328 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v329 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v330 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v24 + (unsigned) v52 * (unsigned) v26 + v11 * (unsigned) v28), v328, v329); + pipe_barrier(PIPE_MTE2); + TLOAD(v325, v330); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + for (size_t v331 = v39; v331 < v50; v331 += v40) { + int32_t v332 = (int32_t) ((uint32_t) ((int32_t) v331) * (uint32_t) v27); + int32_t v333 = (int32_t) ((uint32_t) v7 - (uint32_t) v332); + int32_t v334 = (uint32_t) v333 < (uint32_t) v27 ? 
v333 : v27; + Tile v335; + TASSIGN(v335, v33); + Tile v336 = Tile(v334); + __ubuf__ bfloat16_t* v337 = v335.data(); + uint64_t v338 = reinterpret_cast(v337); + TASSIGN(v336, v338); + Tile v339; + TASSIGN(v339, v34); + Tile v340 = Tile(v334); + __ubuf__ float* v341 = v339.data(); + uint64_t v342 = reinterpret_cast(v341); + TASSIGN(v340, v342); + Tile v343; + TASSIGN(v343, v33); + Tile v344 = Tile(v334); + __ubuf__ bfloat16_t* v345 = v343.data(); + uint64_t v346 = reinterpret_cast(v345); + TASSIGN(v344, v346); + Tile v347; + TASSIGN(v347, v34); + Tile v348 = Tile(v334); + __ubuf__ float* v349 = v347.data(); + uint64_t v350 = reinterpret_cast(v349); + TASSIGN(v348, v350); + Tile v351; + TASSIGN(v351, v35); + Tile v352; + __ubuf__ float* v353 = v351.data(); + uint64_t v354 = reinterpret_cast(v353); + TASSIGN(v352, v354); + Tile v355; + TASSIGN(v355, v36); + Tile v356 = Tile(v334); + __ubuf__ float* v357 = v355.data(); + uint64_t v358 = reinterpret_cast(v357); + TASSIGN(v356, v358); + Tile v359; + TASSIGN(v359, v37); + Tile v360 = Tile(v334); + __ubuf__ float* v361 = v359.data(); + uint64_t v362 = reinterpret_cast(v361); + TASSIGN(v360, v362); + Tile v363; + TASSIGN(v363, v38); + Tile v364 = Tile(v334); + __ubuf__ float* v365 = v363.data(); + uint64_t v366 = reinterpret_cast(v365); + TASSIGN(v364, v366); + Tile v367; + TASSIGN(v367, v33); + Tile v368 = Tile(v334); + __ubuf__ bfloat16_t* v369 = v367.data(); + uint64_t v370 = reinterpret_cast(v369); + TASSIGN(v368, v370); + unsigned v371 = (unsigned) v334; + unsigned v372 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v373 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v374 = pto::Stride<-1, -1, -1, -1, 1>(v372, v372, v372, v372); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v375 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v24 + (unsigned) v52 * (unsigned) v7 + (unsigned) v332 * (unsigned) v28), v373, v374); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + 
TLOAD(v336, v375); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v340, v336, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TROWEXPAND(v356, v325); + pipe_barrier(PIPE_ALL); + TMUL(v360, v340, v356); + unsigned v376 = (unsigned) v334; + unsigned v377 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v378 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v379 = pto::Stride<-1, -1, -1, -1, 1>(v377, v377, v377, v377); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v380 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v53 * (unsigned) v7 + (unsigned) v332 * (unsigned) v28), v378, v379); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v344, v380); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TCVT(v348, v344, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v381 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v382 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v383 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v11 * (unsigned) v28), v381, v382); + TLOAD(v352, v383); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v356, v352); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v364, v348, v356); + pipe_barrier(PIPE_ALL); + TADD(v360, v360, v364); + unsigned v384 = (unsigned) v334; + unsigned v385 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v386 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v387 = pto::Stride<-1, -1, -1, -1, 1>(v385, v385, v385, v385); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v388 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v143 * (unsigned) v7 
+ (unsigned) v332 * (unsigned) v28), v386, v387); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v344, v388); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v348, v344, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v389 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v390 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v391 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v10 * (unsigned) v28), v389, v390); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v352, v391); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v356, v352); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v364, v348, v356); + pipe_barrier(PIPE_ALL); + TADD(v360, v360, v364); + unsigned v392 = (unsigned) v334; + unsigned v393 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v394 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v395 = pto::Stride<-1, -1, -1, -1, 1>(v393, v393, v393, v393); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v396 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v233 * (unsigned) v7 + (unsigned) v332 * (unsigned) v28), v394, v395); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v344, v396); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v348, v344, v23); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v397 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v398 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v399 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v9 * (unsigned) v28), v397, v398); + wait_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID1); + TLOAD(v352, v399); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v356, v352); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + pipe_barrier(PIPE_ALL); + TMUL(v364, v348, v356); + pipe_barrier(PIPE_ALL); + TADD(v360, v360, v364); + unsigned v400 = (unsigned) v334; + unsigned v401 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v402 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v403 = pto::Stride<-1, -1, -1, -1, 1>(v401, v401, v401, v401); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v404 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v24 + (unsigned) v323 * (unsigned) v7 + (unsigned) v332 * (unsigned) v28), v402, v403); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v344, v404); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v348, v344, v23); + pto::Shape<1, 1, 1, 1, 1> v405 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<16, 16, 16, 16, 1> v406 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v407 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v1 + (v24 + (unsigned) v52 * (unsigned) v25 + v8 * (unsigned) v28), v405, v406); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v352, v407); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v356, v352); + pipe_barrier(PIPE_ALL); + TMUL(v364, v348, v356); + pipe_barrier(PIPE_ALL); + TADD(v360, v360, v364); + pipe_barrier(PIPE_ALL); + TCVT(v368, v360, v23); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + unsigned v408 = (unsigned) v334; + unsigned v409 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v410 = pto::Shape<1, 1, 1, 1, -1>(v334); + pto::Stride<-1, -1, -1, -1, 1> v411 = pto::Stride<-1, -1, -1, -1, 1>(v409, v409, v409, v409); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v412 = GlobalTensor, pto::Stride<-1, 
-1, -1, -1, 1>, pto::Layout::ND>(v5 + (v24 + (unsigned) v323 * (unsigned) v7 + (unsigned) v332 * (unsigned) v28), v410, v411); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + pipe_barrier(PIPE_MTE3); + TSTORE(v412, v368); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + }; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/pre_apply_mix_bwd.cpp b/kernels/manual/a5/mhc/pre_apply_mix_bwd.cpp new file mode 100644 index 000000000..991086ac6 --- /dev/null +++ b/kernels/manual/a5/mhc/pre_apply_mix_bwd.cpp @@ -0,0 +1,744 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +/* PTOAutoSyncTailMode selects which synchronization ptoas_auto_sync_tail issues. kBarrierAll (0): a single pipe_barrier(PIPE_ALL). kSetWaitMte3ToSEvent0 (1): set_flag followed by wait_flag on (PIPE_MTE3, PIPE_S, EVENT_ID0). */ +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; +/* ptoas_auto_sync_tail: issues the synchronization chosen by `mode`. `mode` defaults to kBarrierAll; any value that is not kSetWaitMte3ToSEvent0 (including values outside the enum) takes the default branch and runs pipe_barrier(PIPE_ALL). Returns nothing. NOTE(review): presumably meant to be the last synchronization a kernel performs before returning; confirm against the call sites. */ +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_pre_apply_mix_bwd_m4(__gm__ bfloat16_t* v1, __gm__ bfloat16_t* v2, __gm__ float* v3, __gm__ bfloat16_t* v4, __gm__ float* v5, int32_t v6, int32_t v7) { + unsigned v8 = 3; + unsigned v9 = 2; + unsigned v10 = 1; + RoundMode v11 = RoundMode::CAST_RINT; + unsigned v12 = 0; + const int32_t v13 = 4; + const int32_t v14 = 1024; + const int32_t v15 = 1; + const int32_t v16 = 0; + const int32_t v17 = 2; + const int32_t v18 = 3; 
+ const int64_t v19 = 0; + const int64_t v20 = 512; + const int64_t v21 = 256; + const int64_t v22 = 768; + const int64_t v23 = 49920; + const int64_t v24 = 17152; + const int64_t v25 = 82688; + const int64_t v26 = 33536; + const int64_t v27 = 115456; + const int64_t v28 = 148224; + using T = float; + size_t v29 = (size_t) v16; + size_t v30 = (size_t) v15; + int32_t v31 = (int32_t) ((uint32_t) v6 * (uint32_t) v13); + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v32 = get_block_idx(); + int64_t v33 = get_block_num(); + int32_t v34 = (int32_t) ((int64_t) v33); + int32_t v35 = v6 / v34; + int32_t v36 = v6 % v34 != v16 && v6 < v16 == v34 < v16 ? v35 + v15 : v35; + int32_t v37 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v32) * (uint32_t) v36); + int32_t v38 = (int32_t) ((uint32_t) v37 + (uint32_t) v36); + int32_t v39 = v7 / v14; + size_t v40 = (size_t) (v7 % v14 != v16 && v7 < v16 == v14 < v16 ? v39 + v15 : v39); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + for (size_t v41 = (size_t) v37; v41 < ((size_t) ((uint32_t) v38 < (uint32_t) v6 ? 
v38 : v6)); v41 += v30) { + int32_t v42 = (int32_t) v41; + int32_t v43 = (int32_t) ((uint32_t) v42 * (uint32_t) v13); + Tile v44; + TASSIGN(v44, v19); + Tile v45; + __ubuf__ float* v46 = v44.data(); + uint64_t v47 = reinterpret_cast(v46); + TASSIGN(v45, v47); + Tile v48; + TASSIGN(v48, v20); + Tile v49; + __ubuf__ float* v50 = v48.data(); + uint64_t v51 = reinterpret_cast(v50); + TASSIGN(v49, v51); + Tile v52; + TASSIGN(v52, v21); + Tile v53; + __ubuf__ float* v54 = v52.data(); + uint64_t v55 = reinterpret_cast(v54); + TASSIGN(v53, v55); + pto::Shape<1, 1, 1, 1, 1> v56 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v57 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v58 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v12 + (unsigned) v42 * (unsigned) v13 + v12 * (unsigned) v15), v56, v57); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v45, v58); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMOV(v53, v45); + pipe_barrier(PIPE_ALL); + TSUB(v53, v53, v45); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + for (size_t v59 = v29; v59 < v40; v59 += v30) { + int32_t v60 = (int32_t) ((uint32_t) ((int32_t) v59) * (uint32_t) v14); + int32_t v61 = (int32_t) ((uint32_t) v7 - (uint32_t) v60); + int32_t v62 = (uint32_t) v61 < (uint32_t) v14 ? 
v61 : v14; + Tile v63; + TASSIGN(v63, v22); + Tile v64 = Tile(v62); + __ubuf__ bfloat16_t* v65 = v63.data(); + uint64_t v66 = reinterpret_cast(v65); + TASSIGN(v64, v66); + Tile v67; + TASSIGN(v67, v23); + Tile v68 = Tile(v62); + __ubuf__ float* v69 = v67.data(); + uint64_t v70 = reinterpret_cast(v69); + TASSIGN(v68, v70); + Tile v71; + TASSIGN(v71, v24); + Tile v72 = Tile(v62); + __ubuf__ bfloat16_t* v73 = v71.data(); + uint64_t v74 = reinterpret_cast(v73); + TASSIGN(v72, v74); + Tile v75; + TASSIGN(v75, v25); + Tile v76 = Tile(v62); + __ubuf__ float* v77 = v75.data(); + uint64_t v78 = reinterpret_cast(v77); + TASSIGN(v76, v78); + Tile v79; + TASSIGN(v79, v26); + Tile v80 = Tile(v62); + __ubuf__ bfloat16_t* v81 = v79.data(); + uint64_t v82 = reinterpret_cast(v81); + TASSIGN(v80, v82); + Tile v83; + TASSIGN(v83, v22); + Tile v84 = Tile(v62); + __ubuf__ float* v85 = v83.data(); + uint64_t v86 = reinterpret_cast(v85); + TASSIGN(v84, v86); + Tile v87; + TASSIGN(v87, v27); + Tile v88 = Tile(v62); + __ubuf__ float* v89 = v87.data(); + uint64_t v90 = reinterpret_cast(v89); + TASSIGN(v88, v90); + Tile v91; + TASSIGN(v91, v28); + Tile v92 = Tile(v62); + __ubuf__ float* v93 = v91.data(); + uint64_t v94 = reinterpret_cast(v93); + TASSIGN(v92, v94); + Tile v95; + TASSIGN(v95, v22); + Tile v96 = Tile(v62); + __ubuf__ float* v97 = v95.data(); + uint64_t v98 = reinterpret_cast(v97); + TASSIGN(v96, v98); + Tile v99; + TASSIGN(v99, v26); + Tile v100 = Tile(v62); + __ubuf__ bfloat16_t* v101 = v99.data(); + uint64_t v102 = reinterpret_cast(v101); + TASSIGN(v100, v102); + unsigned v103 = (unsigned) v62; + unsigned v104 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v105 = pto::Shape<1, 1, 1, 1, -1>(v62); + pto::Stride<-1, -1, -1, -1, 1> v106 = pto::Stride<-1, -1, -1, -1, 1>(v104, v104, v104, v104); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v107 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v12 + (unsigned) v42 * (unsigned) v7 + (unsigned) 
v60 * (unsigned) v15), v105, v106); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v64, v107); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + unsigned v108 = (unsigned) v62; + unsigned v109 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v110 = pto::Shape<1, 1, 1, 1, -1>(v62); + pto::Stride<-1, -1, -1, -1, 1> v111 = pto::Stride<-1, -1, -1, -1, 1>(v109, v109, v109, v109); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v112 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v12 + (unsigned) v43 * (unsigned) v7 + (unsigned) v60 * (unsigned) v15), v110, v111); + TLOAD(v72, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + unsigned v113 = (unsigned) v62; + unsigned v114 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v115 = pto::Shape<1, 1, 1, 1, -1>(v62); + pto::Stride<-1, -1, -1, -1, 1> v116 = pto::Stride<-1, -1, -1, -1, 1>(v114, v114, v114, v114); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v117 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v12 + (unsigned) v43 * (unsigned) v7 + (unsigned) v60 * (unsigned) v15), v115, v116); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v80, v117); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TCVT(v68, v64, v11); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCVT(v76, v72, v11); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TCVT(v84, v80, v11); + TROWEXPAND(v88, v45); + pipe_barrier(PIPE_ALL); + TMUL(v92, v68, v88); + pipe_barrier(PIPE_ALL); + TADD(v84, v84, v92); + pipe_barrier(PIPE_ALL); + TCVT(v100, v84, v11); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v117, v100); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TMUL(v92, v68, v76); + pipe_barrier(PIPE_ALL); + TROWSUM(v49, v92, v96); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v53, v53, v49); + }; + set_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1> v118 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v119 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v120 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v12 + (unsigned) v42 * (unsigned) v13 + v12 * (unsigned) v15), v118, v119); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pipe_barrier(PIPE_MTE3); + TSTORE(v120, v53); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + int32_t v121 = (int32_t) ((uint32_t) v43 + (uint32_t) v15); + Tile v122; + TASSIGN(v122, v19); + Tile v123; + __ubuf__ float* v124 = v122.data(); + uint64_t v125 = reinterpret_cast(v124); + TASSIGN(v123, v125); + Tile v126; + TASSIGN(v126, v20); + Tile v127; + __ubuf__ float* v128 = v126.data(); + uint64_t v129 = reinterpret_cast(v128); + TASSIGN(v127, v129); + Tile v130; + TASSIGN(v130, v21); + Tile v131; + __ubuf__ float* v132 = v130.data(); + uint64_t v133 = reinterpret_cast(v132); + TASSIGN(v131, v133); + pto::Shape<1, 1, 1, 1, 1> v134 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v135 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v136 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v12 + (unsigned) v42 * (unsigned) v13 + v10 * (unsigned) v15), v134, v135); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v123, v136); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + TMOV(v131, v123); + pipe_barrier(PIPE_ALL); + TSUB(v131, v131, v123); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, 
EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + for (size_t v137 = v29; v137 < v40; v137 += v30) { + int32_t v138 = (int32_t) ((uint32_t) ((int32_t) v137) * (uint32_t) v14); + int32_t v139 = (int32_t) ((uint32_t) v7 - (uint32_t) v138); + int32_t v140 = (uint32_t) v139 < (uint32_t) v14 ? v139 : v14; + Tile v141; + TASSIGN(v141, v22); + Tile v142 = Tile(v140); + __ubuf__ bfloat16_t* v143 = v141.data(); + uint64_t v144 = reinterpret_cast(v143); + TASSIGN(v142, v144); + Tile v145; + TASSIGN(v145, v23); + Tile v146 = Tile(v140); + __ubuf__ float* v147 = v145.data(); + uint64_t v148 = reinterpret_cast(v147); + TASSIGN(v146, v148); + Tile v149; + TASSIGN(v149, v24); + Tile v150 = Tile(v140); + __ubuf__ bfloat16_t* v151 = v149.data(); + uint64_t v152 = reinterpret_cast(v151); + TASSIGN(v150, v152); + Tile v153; + TASSIGN(v153, v25); + Tile v154 = Tile(v140); + __ubuf__ float* v155 = v153.data(); + uint64_t v156 = reinterpret_cast(v155); + TASSIGN(v154, v156); + Tile v157; + TASSIGN(v157, v26); + Tile v158 = Tile(v140); + __ubuf__ bfloat16_t* v159 = v157.data(); + uint64_t v160 = reinterpret_cast(v159); + TASSIGN(v158, v160); + Tile v161; + TASSIGN(v161, v22); + Tile v162 = Tile(v140); + __ubuf__ float* v163 = v161.data(); + uint64_t v164 = reinterpret_cast(v163); + TASSIGN(v162, v164); + Tile v165; + TASSIGN(v165, v27); + Tile v166 = Tile(v140); + __ubuf__ float* v167 = v165.data(); + uint64_t v168 = reinterpret_cast(v167); + TASSIGN(v166, v168); + Tile v169; + TASSIGN(v169, v28); + Tile v170 = Tile(v140); + __ubuf__ float* v171 = v169.data(); + uint64_t v172 = reinterpret_cast(v171); + TASSIGN(v170, v172); + Tile v173; + TASSIGN(v173, v22); + Tile v174 = Tile(v140); + __ubuf__ float* v175 = v173.data(); + uint64_t v176 = reinterpret_cast(v175); + TASSIGN(v174, v176); + Tile v177; + TASSIGN(v177, v26); + Tile v178 = Tile(v140); + 
__ubuf__ bfloat16_t* v179 = v177.data(); + uint64_t v180 = reinterpret_cast(v179); + TASSIGN(v178, v180); + unsigned v181 = (unsigned) v140; + unsigned v182 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v183 = pto::Shape<1, 1, 1, 1, -1>(v140); + pto::Stride<-1, -1, -1, -1, 1> v184 = pto::Stride<-1, -1, -1, -1, 1>(v182, v182, v182, v182); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v185 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v12 + (unsigned) v42 * (unsigned) v7 + (unsigned) v138 * (unsigned) v15), v183, v184); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v142, v185); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + unsigned v186 = (unsigned) v140; + unsigned v187 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v188 = pto::Shape<1, 1, 1, 1, -1>(v140); + pto::Stride<-1, -1, -1, -1, 1> v189 = pto::Stride<-1, -1, -1, -1, 1>(v187, v187, v187, v187); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v190 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v12 + (unsigned) v121 * (unsigned) v7 + (unsigned) v138 * (unsigned) v15), v188, v189); + TLOAD(v150, v190); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + unsigned v191 = (unsigned) v140; + unsigned v192 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v193 = pto::Shape<1, 1, 1, 1, -1>(v140); + pto::Stride<-1, -1, -1, -1, 1> v194 = pto::Stride<-1, -1, -1, -1, 1>(v192, v192, v192, v192); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v195 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v12 + (unsigned) v121 * (unsigned) v7 + (unsigned) v138 * (unsigned) v15), v193, v194); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + TLOAD(v158, v195); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + TCVT(v146, v142, v11); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + TCVT(v154, v150, v11); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v162, v158, 
v11); + TROWEXPAND(v166, v123); + pipe_barrier(PIPE_ALL); + TMUL(v170, v146, v166); + pipe_barrier(PIPE_ALL); + TADD(v162, v162, v170); + pipe_barrier(PIPE_ALL); + TCVT(v178, v162, v11); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pipe_barrier(PIPE_MTE3); + TSTORE(v195, v178); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + TMUL(v170, v146, v154); + pipe_barrier(PIPE_ALL); + TROWSUM(v127, v170, v174); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v131, v131, v127); + }; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + pto::Shape<1, 1, 1, 1, 1> v196 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v197 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v198 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v12 + (unsigned) v42 * (unsigned) v13 + v10 * (unsigned) v15), v196, v197); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TSTORE(v198, v131); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + int32_t v199 = (int32_t) ((uint32_t) v43 + (uint32_t) v17); + Tile v200; + TASSIGN(v200, v19); + Tile v201; + __ubuf__ float* v202 = v200.data(); + uint64_t v203 = reinterpret_cast(v202); + TASSIGN(v201, v203); + Tile v204; + TASSIGN(v204, v20); + Tile v205; + __ubuf__ float* v206 = v204.data(); + uint64_t v207 = reinterpret_cast(v206); + TASSIGN(v205, v207); + Tile v208; + TASSIGN(v208, v21); + Tile v209; + __ubuf__ float* v210 = v208.data(); + uint64_t v211 = reinterpret_cast(v210); + TASSIGN(v209, v211); + pto::Shape<1, 1, 1, 1, 1> v212 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v213 = pto::Stride<4, 4, 4, 4, 1>(); + 
GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v214 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v12 + (unsigned) v42 * (unsigned) v13 + v9 * (unsigned) v15), v212, v213); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v201, v214); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + TMOV(v209, v201); + pipe_barrier(PIPE_ALL); + TSUB(v209, v209, v201); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + for (size_t v215 = v29; v215 < v40; v215 += v30) { + int32_t v216 = (int32_t) ((uint32_t) ((int32_t) v215) * (uint32_t) v14); + int32_t v217 = (int32_t) ((uint32_t) v7 - (uint32_t) v216); + int32_t v218 = (uint32_t) v217 < (uint32_t) v14 ? v217 : v14; + Tile v219; + TASSIGN(v219, v22); + Tile v220 = Tile(v218); + __ubuf__ bfloat16_t* v221 = v219.data(); + uint64_t v222 = reinterpret_cast(v221); + TASSIGN(v220, v222); + Tile v223; + TASSIGN(v223, v23); + Tile v224 = Tile(v218); + __ubuf__ float* v225 = v223.data(); + uint64_t v226 = reinterpret_cast(v225); + TASSIGN(v224, v226); + Tile v227; + TASSIGN(v227, v24); + Tile v228 = Tile(v218); + __ubuf__ bfloat16_t* v229 = v227.data(); + uint64_t v230 = reinterpret_cast(v229); + TASSIGN(v228, v230); + Tile v231; + TASSIGN(v231, v25); + Tile v232 = Tile(v218); + __ubuf__ float* v233 = v231.data(); + uint64_t v234 = reinterpret_cast(v233); + TASSIGN(v232, v234); + Tile v235; + TASSIGN(v235, v26); + Tile v236 = Tile(v218); + __ubuf__ bfloat16_t* v237 = v235.data(); + uint64_t v238 = reinterpret_cast(v237); + TASSIGN(v236, v238); + Tile v239; + TASSIGN(v239, v22); + Tile v240 = Tile(v218); + __ubuf__ float* v241 = v239.data(); + uint64_t v242 = reinterpret_cast(v241); + TASSIGN(v240, v242); + Tile v243; 
+ TASSIGN(v243, v27); + Tile v244 = Tile(v218); + __ubuf__ float* v245 = v243.data(); + uint64_t v246 = reinterpret_cast(v245); + TASSIGN(v244, v246); + Tile v247; + TASSIGN(v247, v28); + Tile v248 = Tile(v218); + __ubuf__ float* v249 = v247.data(); + uint64_t v250 = reinterpret_cast(v249); + TASSIGN(v248, v250); + Tile v251; + TASSIGN(v251, v22); + Tile v252 = Tile(v218); + __ubuf__ float* v253 = v251.data(); + uint64_t v254 = reinterpret_cast(v253); + TASSIGN(v252, v254); + Tile v255; + TASSIGN(v255, v26); + Tile v256 = Tile(v218); + __ubuf__ bfloat16_t* v257 = v255.data(); + uint64_t v258 = reinterpret_cast(v257); + TASSIGN(v256, v258); + unsigned v259 = (unsigned) v218; + unsigned v260 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v261 = pto::Shape<1, 1, 1, 1, -1>(v218); + pto::Stride<-1, -1, -1, -1, 1> v262 = pto::Stride<-1, -1, -1, -1, 1>(v260, v260, v260, v260); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v263 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v12 + (unsigned) v42 * (unsigned) v7 + (unsigned) v216 * (unsigned) v15), v261, v262); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v220, v263); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v264 = (unsigned) v218; + unsigned v265 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v266 = pto::Shape<1, 1, 1, 1, -1>(v218); + pto::Stride<-1, -1, -1, -1, 1> v267 = pto::Stride<-1, -1, -1, -1, 1>(v265, v265, v265, v265); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v268 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v12 + (unsigned) v199 * (unsigned) v7 + (unsigned) v216 * (unsigned) v15), v266, v267); + TLOAD(v228, v268); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + unsigned v269 = (unsigned) v218; + unsigned v270 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v271 = pto::Shape<1, 1, 1, 1, -1>(v218); + pto::Stride<-1, -1, -1, -1, 1> v272 = pto::Stride<-1, -1, -1, -1, 1>(v270, v270, v270, v270); + GlobalTensor, 
pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v273 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v12 + (unsigned) v199 * (unsigned) v7 + (unsigned) v216 * (unsigned) v15), v271, v272); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v236, v273); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v224, v220, v11); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TCVT(v232, v228, v11); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCVT(v240, v236, v11); + TROWEXPAND(v244, v201); + pipe_barrier(PIPE_ALL); + TMUL(v248, v224, v244); + pipe_barrier(PIPE_ALL); + TADD(v240, v240, v248); + pipe_barrier(PIPE_ALL); + TCVT(v256, v240, v11); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + pipe_barrier(PIPE_MTE3); + TSTORE(v273, v256); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TMUL(v248, v224, v232); + pipe_barrier(PIPE_ALL); + TROWSUM(v205, v248, v252); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v209, v209, v205); + }; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1> v274 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v275 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v276 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v12 + (unsigned) v42 * (unsigned) v13 + v9 * (unsigned) v15), v274, v275); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + TSTORE(v276, v209); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + int32_t v277 = (int32_t) ((uint32_t) v43 + (uint32_t) v18); + Tile v278; + TASSIGN(v278, v19); + Tile v279; + 
__ubuf__ float* v280 = v278.data(); + uint64_t v281 = reinterpret_cast(v280); + TASSIGN(v279, v281); + Tile v282; + TASSIGN(v282, v20); + Tile v283; + __ubuf__ float* v284 = v282.data(); + uint64_t v285 = reinterpret_cast(v284); + TASSIGN(v283, v285); + Tile v286; + TASSIGN(v286, v21); + Tile v287; + __ubuf__ float* v288 = v286.data(); + uint64_t v289 = reinterpret_cast(v288); + TASSIGN(v287, v289); + pto::Shape<1, 1, 1, 1, 1> v290 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v291 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v292 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v3 + (v12 + (unsigned) v42 * (unsigned) v13 + v8 * (unsigned) v15), v290, v291); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v279, v292); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + TMOV(v287, v279); + pipe_barrier(PIPE_ALL); + TSUB(v287, v287, v279); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + for (size_t v293 = v29; v293 < v40; v293 += v30) { + int32_t v294 = (int32_t) ((uint32_t) ((int32_t) v293) * (uint32_t) v14); + int32_t v295 = (int32_t) ((uint32_t) v7 - (uint32_t) v294); + int32_t v296 = (uint32_t) v295 < (uint32_t) v14 ? 
v295 : v14; + Tile v297; + TASSIGN(v297, v22); + Tile v298 = Tile(v296); + __ubuf__ bfloat16_t* v299 = v297.data(); + uint64_t v300 = reinterpret_cast(v299); + TASSIGN(v298, v300); + Tile v301; + TASSIGN(v301, v23); + Tile v302 = Tile(v296); + __ubuf__ float* v303 = v301.data(); + uint64_t v304 = reinterpret_cast(v303); + TASSIGN(v302, v304); + Tile v305; + TASSIGN(v305, v24); + Tile v306 = Tile(v296); + __ubuf__ bfloat16_t* v307 = v305.data(); + uint64_t v308 = reinterpret_cast(v307); + TASSIGN(v306, v308); + Tile v309; + TASSIGN(v309, v25); + Tile v310 = Tile(v296); + __ubuf__ float* v311 = v309.data(); + uint64_t v312 = reinterpret_cast(v311); + TASSIGN(v310, v312); + Tile v313; + TASSIGN(v313, v26); + Tile v314 = Tile(v296); + __ubuf__ bfloat16_t* v315 = v313.data(); + uint64_t v316 = reinterpret_cast(v315); + TASSIGN(v314, v316); + Tile v317; + TASSIGN(v317, v22); + Tile v318 = Tile(v296); + __ubuf__ float* v319 = v317.data(); + uint64_t v320 = reinterpret_cast(v319); + TASSIGN(v318, v320); + Tile v321; + TASSIGN(v321, v27); + Tile v322 = Tile(v296); + __ubuf__ float* v323 = v321.data(); + uint64_t v324 = reinterpret_cast(v323); + TASSIGN(v322, v324); + Tile v325; + TASSIGN(v325, v28); + Tile v326 = Tile(v296); + __ubuf__ float* v327 = v325.data(); + uint64_t v328 = reinterpret_cast(v327); + TASSIGN(v326, v328); + Tile v329; + TASSIGN(v329, v22); + Tile v330 = Tile(v296); + __ubuf__ float* v331 = v329.data(); + uint64_t v332 = reinterpret_cast(v331); + TASSIGN(v330, v332); + Tile v333; + TASSIGN(v333, v26); + Tile v334 = Tile(v296); + __ubuf__ bfloat16_t* v335 = v333.data(); + uint64_t v336 = reinterpret_cast(v335); + TASSIGN(v334, v336); + unsigned v337 = (unsigned) v296; + unsigned v338 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v339 = pto::Shape<1, 1, 1, 1, -1>(v296); + pto::Stride<-1, -1, -1, -1, 1> v340 = pto::Stride<-1, -1, -1, -1, 1>(v338, v338, v338, v338); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v341 = GlobalTensor, 
pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v12 + (unsigned) v42 * (unsigned) v7 + (unsigned) v294 * (unsigned) v15), v339, v340); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v298, v341); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + unsigned v342 = (unsigned) v296; + unsigned v343 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v344 = pto::Shape<1, 1, 1, 1, -1>(v296); + pto::Stride<-1, -1, -1, -1, 1> v345 = pto::Stride<-1, -1, -1, -1, 1>(v343, v343, v343, v343); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v346 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v2 + (v12 + (unsigned) v277 * (unsigned) v7 + (unsigned) v294 * (unsigned) v15), v344, v345); + TLOAD(v306, v346); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + unsigned v347 = (unsigned) v296; + unsigned v348 = (unsigned) v7; + pto::Shape<1, 1, 1, 1, -1> v349 = pto::Shape<1, 1, 1, 1, -1>(v296); + pto::Stride<-1, -1, -1, -1, 1> v350 = pto::Stride<-1, -1, -1, -1, 1>(v348, v348, v348, v348); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v351 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v4 + (v12 + (unsigned) v277 * (unsigned) v7 + (unsigned) v294 * (unsigned) v15), v349, v350); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v314, v351); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v302, v298, v11); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TCVT(v310, v306, v11); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCVT(v318, v314, v11); + TROWEXPAND(v322, v279); + pipe_barrier(PIPE_ALL); + TMUL(v326, v302, v322); + pipe_barrier(PIPE_ALL); + TADD(v318, v318, v326); + pipe_barrier(PIPE_ALL); + TCVT(v334, v318, v11); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + pipe_barrier(PIPE_MTE3); + TSTORE(v351, v334); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TMUL(v326, v302, v310); + pipe_barrier(PIPE_ALL); + 
TROWSUM(v283, v326, v330); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v287, v287, v283); + }; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1> v352 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v353 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v354 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v12 + (unsigned) v42 * (unsigned) v13 + v8 * (unsigned) v15), v352, v353); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + TSTORE(v354, v287); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/pre_apply_mix_fwd.cpp b/kernels/manual/a5/mhc/pre_apply_mix_fwd.cpp new file mode 100644 index 000000000..b320b6508 --- /dev/null +++ b/kernels/manual/a5/mhc/pre_apply_mix_fwd.cpp @@ -0,0 +1,233 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case 
PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_pre_apply_mix_fwd_m4(__gm__ bfloat16_t* v1, __gm__ float* v2, __gm__ bfloat16_t* v3, int32_t v4, int32_t v5) { + unsigned v6 = 3; + unsigned v7 = 2; + unsigned v8 = 1; + RoundMode v9 = RoundMode::CAST_RINT; + unsigned v10 = 0; + const int32_t v11 = 4; + const int32_t v12 = 1024; + const int32_t v13 = 1; + const int32_t v14 = 0; + const int32_t v15 = 2; + const int32_t v16 = 3; + const int64_t v17 = 0; + const int64_t v18 = 16384; + const int64_t v19 = 49152; + const int64_t v20 = 49408; + const int64_t v21 = 82176; + const int64_t v22 = 114944; + const int64_t v23 = 147712; + using T = float; + size_t v24 = (size_t) v13; + int32_t v25 = (int32_t) ((uint32_t) v4 * (uint32_t) v11); + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v26 = get_block_idx(); + int64_t v27 = get_block_num(); + int32_t v28 = (int32_t) ((int64_t) v27); + int32_t v29 = v4 / v28; + int32_t v30 = v4 % v28 != v14 && v4 < v14 == v28 < v14 ? v29 + v13 : v29; + int32_t v31 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v26) * (uint32_t) v30); + int32_t v32 = (int32_t) ((uint32_t) v31 + (uint32_t) v30); + int32_t v33 = v5 / v12; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + for (size_t v34 = (size_t) v31; v34 < ((size_t) ((uint32_t) v32 < (uint32_t) v4 ? v32 : v4)); v34 += v24) { + int32_t v35 = (int32_t) v34; + for (size_t v36 = (size_t) v14; v36 < ((size_t) (v5 % v12 != v14 && v5 < v14 == v12 < v14 ? 
v33 + v13 : v33)); v36 += v24) { + int32_t v37 = (int32_t) ((uint32_t) ((int32_t) v36) * (uint32_t) v12); + int32_t v38 = (int32_t) ((uint32_t) v5 - (uint32_t) v37); + int32_t v39 = (uint32_t) v38 < (uint32_t) v12 ? v38 : v12; + Tile v40; + TASSIGN(v40, v17); + Tile v41 = Tile(v39); + __ubuf__ bfloat16_t* v42 = v40.data(); + uint64_t v43 = reinterpret_cast(v42); + TASSIGN(v41, v43); + Tile v44; + TASSIGN(v44, v18); + Tile v45 = Tile(v39); + __ubuf__ float* v46 = v44.data(); + uint64_t v47 = reinterpret_cast(v46); + TASSIGN(v45, v47); + Tile v48; + TASSIGN(v48, v19); + Tile v49; + __ubuf__ float* v50 = v48.data(); + uint64_t v51 = reinterpret_cast(v50); + TASSIGN(v49, v51); + Tile v52; + TASSIGN(v52, v20); + Tile v53 = Tile(v39); + __ubuf__ float* v54 = v52.data(); + uint64_t v55 = reinterpret_cast(v54); + TASSIGN(v53, v55); + Tile v56; + TASSIGN(v56, v21); + Tile v57 = Tile(v39); + __ubuf__ float* v58 = v56.data(); + uint64_t v59 = reinterpret_cast(v58); + TASSIGN(v57, v59); + Tile v60; + TASSIGN(v60, v22); + Tile v61 = Tile(v39); + __ubuf__ float* v62 = v60.data(); + uint64_t v63 = reinterpret_cast(v62); + TASSIGN(v61, v63); + Tile v64; + TASSIGN(v64, v23); + Tile v65 = Tile(v39); + __ubuf__ bfloat16_t* v66 = v64.data(); + uint64_t v67 = reinterpret_cast(v66); + TASSIGN(v65, v67); + int32_t v68 = (int32_t) ((uint32_t) v35 * (uint32_t) v11); + unsigned v69 = (unsigned) v39; + unsigned v70 = (unsigned) v5; + pto::Shape<1, 1, 1, 1, -1> v71 = pto::Shape<1, 1, 1, 1, -1>(v39); + pto::Stride<-1, -1, -1, -1, 1> v72 = pto::Stride<-1, -1, -1, -1, 1>(v70, v70, v70, v70); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v73 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v10 + (unsigned) v68 * (unsigned) v5 + (unsigned) v37 * (unsigned) v13), v71, v72); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v41, v73); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v45, v41, v9); + set_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID4); + pto::Shape<1, 1, 1, 1, 1> v74 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v75 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v76 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v2 + (v10 + (unsigned) v35 * (unsigned) v11 + v10 * (unsigned) v13), v74, v75); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v49, v76); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPAND(v53, v49); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + pipe_barrier(PIPE_ALL); + TMUL(v57, v45, v53); + pipe_barrier(PIPE_ALL); + TMOV(v61, v57); + unsigned v77 = (unsigned) v39; + unsigned v78 = (unsigned) v5; + pto::Shape<1, 1, 1, 1, -1> v79 = pto::Shape<1, 1, 1, 1, -1>(v39); + pto::Stride<-1, -1, -1, -1, 1> v80 = pto::Stride<-1, -1, -1, -1, 1>(v78, v78, v78, v78); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v81 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v10 + (unsigned) ((int32_t) (uint32_t) v68 + (uint32_t) v13) * (unsigned) v5 + (unsigned) v37 * (unsigned) v13), v79, v80); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v41, v81); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCVT(v45, v41, v9); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + pto::Shape<1, 1, 1, 1, 1> v82 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v83 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v84 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v2 + (v10 + (unsigned) v35 * (unsigned) v11 + v8 * (unsigned) v13), v82, v83); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + TLOAD(v49, v84); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TROWEXPAND(v53, v49); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + pipe_barrier(PIPE_ALL); + TMUL(v57, v45, v53); + pipe_barrier(PIPE_ALL); + TADD(v61, v61, 
v57); + unsigned v85 = (unsigned) v39; + unsigned v86 = (unsigned) v5; + pto::Shape<1, 1, 1, 1, -1> v87 = pto::Shape<1, 1, 1, 1, -1>(v39); + pto::Stride<-1, -1, -1, -1, 1> v88 = pto::Stride<-1, -1, -1, -1, 1>(v86, v86, v86, v86); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v89 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v10 + (unsigned) ((int32_t) (uint32_t) v68 + (uint32_t) v15) * (unsigned) v5 + (unsigned) v37 * (unsigned) v13), v87, v88); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v41, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + TCVT(v45, v41, v9); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + pto::Shape<1, 1, 1, 1, 1> v90 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v91 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v92 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v2 + (v10 + (unsigned) v35 * (unsigned) v11 + v7 * (unsigned) v13), v90, v91); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + TLOAD(v49, v92); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + TROWEXPAND(v53, v49); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + pipe_barrier(PIPE_ALL); + TMUL(v57, v45, v53); + pipe_barrier(PIPE_ALL); + TADD(v61, v61, v57); + unsigned v93 = (unsigned) v39; + unsigned v94 = (unsigned) v5; + pto::Shape<1, 1, 1, 1, -1> v95 = pto::Shape<1, 1, 1, 1, -1>(v39); + pto::Stride<-1, -1, -1, -1, 1> v96 = pto::Stride<-1, -1, -1, -1, 1>(v94, v94, v94, v94); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v97 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v10 + (unsigned) ((int32_t) (uint32_t) v68 + (uint32_t) v16) * (unsigned) v5 + (unsigned) v37 * (unsigned) v13), v95, v96); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v41, v97); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + TCVT(v45, 
v41, v9); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v98 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<4, 4, 4, 4, 1> v99 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v100 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v2 + (v10 + (unsigned) v35 * (unsigned) v11 + v6 * (unsigned) v13), v98, v99); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + TLOAD(v49, v100); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v53, v49); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TMUL(v57, v45, v53); + pipe_barrier(PIPE_ALL); + TADD(v61, v61, v57); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v65, v61, v9); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + unsigned v101 = (unsigned) v39; + unsigned v102 = (unsigned) v5; + pto::Shape<1, 1, 1, 1, -1> v103 = pto::Shape<1, 1, 1, 1, -1>(v39); + pto::Stride<-1, -1, -1, -1, 1> v104 = pto::Stride<-1, -1, -1, -1, 1>(v102, v102, v102, v102); + GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v105 = GlobalTensor, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v3 + (v10 + (unsigned) v35 * (unsigned) v5 + (unsigned) v37 * (unsigned) v13), v103, v104); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v105, v65); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + }; + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/pre_norm_fn_fwd.cpp b/kernels/manual/a5/mhc/pre_norm_fn_fwd.cpp new file mode 100644 index 000000000..6d9b0ba1e --- /dev/null +++ b/kernels/manual/a5/mhc/pre_norm_fn_fwd.cpp @@ 
-0,0 +1,7106 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_pre_norm_fn_fwd_m4_h1280(__gm__ bfloat16_t* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 23; + unsigned v6 = 22; + unsigned v7 = 21; + unsigned v8 = 20; + unsigned v9 = 19; + unsigned v10 = 18; + unsigned v11 = 17; + unsigned v12 = 16; + unsigned v13 = 15; + unsigned v14 = 14; + unsigned v15 = 13; + unsigned v16 = 12; + unsigned v17 = 11; + unsigned v18 = 10; + unsigned v19 = 9; + unsigned v20 = 8; + unsigned v21 = 7; + unsigned v22 = 6; + unsigned v23 = 5; + unsigned v24 = 4; + unsigned v25 = 3; + unsigned v26 = 2; + unsigned v27 = 1; + unsigned v28 = 4096; + unsigned v29 = 3072; + unsigned v30 = 2048; + unsigned v31 = 1024; + RoundMode v32 = RoundMode::CAST_RINT; + unsigned v33 = 0; + const int32_t v34 = 0; + const int32_t v35 = 24; + const int32_t v36 = 5120; + const int32_t v37 = 1; + const float v38 = 1.95312503E-4f; + const float v39 = 9.99999997E-7f; + const int64_t v40 = 0; + const int64_t v41 = 16384; + const int64_t v42 = 49152; + const int64_t v43 = 32768; + const int64_t v44 = 256; + const int64_t v45 = 16640; + const int64_t v46 = 49408; + const int64_t v47 = 33024; + const int64_t v48 = 82176; + const int64_t v49 = 512; + const int64_t v50 = 49664; + const int64_t v51 = 16896; + const int64_t v52 = 82432; + const int64_t v53 = 33280; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v54 = get_block_idx(); + 
int64_t v55 = get_block_num(); + int32_t v56 = (int32_t) ((int64_t) v55); + int32_t v57 = v4 / v56; + int32_t v58 = v4 % v56 != v34 && v4 < v34 == v56 < v34 ? v57 + v37 : v57; + int32_t v59 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v54) * (uint32_t) v58); + int32_t v60 = (int32_t) ((uint32_t) v59 + (uint32_t) v58); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + for (size_t v61 = (size_t) v59; v61 < ((size_t) ((uint32_t) v60 < (uint32_t) v4 ? v60 : v4)); v61 += (size_t) v37) { + int32_t v62 = (int32_t) v61; + Tile v63; + TASSIGN(v63, v40); + Tile v64; + __ubuf__ float* v65 = v63.data(); + uint64_t v66 = reinterpret_cast(v65); + TASSIGN(v64, v66); + Tile v67; + TASSIGN(v67, v40); + Tile v68; + __ubuf__ bfloat16_t* v69 = v67.data(); + uint64_t v70 = reinterpret_cast(v69); + TASSIGN(v68, v70); + Tile v71; + TASSIGN(v71, v41); + Tile v72; + __ubuf__ float* v73 = v71.data(); + uint64_t v74 = reinterpret_cast(v73); + TASSIGN(v72, v74); + Tile v75; + TASSIGN(v75, v42); + Tile v76; + __ubuf__ float* v77 = v75.data(); + uint64_t v78 = reinterpret_cast(v77); + TASSIGN(v76, v78); + Tile v79; + TASSIGN(v79, v40); + Tile v80; + __ubuf__ float* v81 = v79.data(); + uint64_t v82 = reinterpret_cast(v81); + TASSIGN(v80, v82); + Tile v83; + TASSIGN(v83, v43); + Tile v84; + __ubuf__ float* v85 = v83.data(); + uint64_t v86 = reinterpret_cast(v85); + TASSIGN(v84, v86); + pto::Shape<1, 1, 1, 1, 1024> v87 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v88 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v89 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v33 + (unsigned) v62 * (unsigned) v36 + v33 * (unsigned) v37), v87, v88); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v68, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + 
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v72, v68, v32); + pipe_barrier(PIPE_ALL); + TMUL(v76, v72, v72); + pipe_barrier(PIPE_ALL); + TROWSUM(v84, v76, v80); + pipe_barrier(PIPE_ALL); + TMOV(v64, v84); + Tile v90; + TASSIGN(v90, v44); + Tile v91; + __ubuf__ bfloat16_t* v92 = v90.data(); + uint64_t v93 = reinterpret_cast(v92); + TASSIGN(v91, v93); + Tile v94; + TASSIGN(v94, v45); + Tile v95; + __ubuf__ float* v96 = v94.data(); + uint64_t v97 = reinterpret_cast(v96); + TASSIGN(v95, v97); + Tile v98; + TASSIGN(v98, v46); + Tile v99; + __ubuf__ float* v100 = v98.data(); + uint64_t v101 = reinterpret_cast(v100); + TASSIGN(v99, v101); + Tile v102; + TASSIGN(v102, v44); + Tile v103; + __ubuf__ float* v104 = v102.data(); + uint64_t v105 = reinterpret_cast(v104); + TASSIGN(v103, v105); + Tile v106; + TASSIGN(v106, v47); + Tile v107; + __ubuf__ float* v108 = v106.data(); + uint64_t v109 = reinterpret_cast(v108); + TASSIGN(v107, v109); + pto::Shape<1, 1, 1, 1, 1024> v110 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v111 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v112 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v33 + (unsigned) v62 * (unsigned) v36 + v31 * (unsigned) v37), v110, v111); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v91, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TCVT(v95, v91, v32); + pipe_barrier(PIPE_ALL); + TMUL(v99, v95, v95); + pipe_barrier(PIPE_ALL); + TROWSUM(v107, v99, v103); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v64, v64, v107); + Tile v113; + TASSIGN(v113, v44); + Tile v114; + __ubuf__ bfloat16_t* v115 = v113.data(); + uint64_t v116 = reinterpret_cast(v115); + TASSIGN(v114, v116); + Tile v117; + TASSIGN(v117, v45); + Tile v118; + __ubuf__ float* v119 = v117.data(); + uint64_t v120 = reinterpret_cast(v119); + 
TASSIGN(v118, v120); + Tile v121; + TASSIGN(v121, v46); + Tile v122; + __ubuf__ float* v123 = v121.data(); + uint64_t v124 = reinterpret_cast(v123); + TASSIGN(v122, v124); + Tile v125; + TASSIGN(v125, v44); + Tile v126; + __ubuf__ float* v127 = v125.data(); + uint64_t v128 = reinterpret_cast(v127); + TASSIGN(v126, v128); + Tile v129; + TASSIGN(v129, v47); + Tile v130; + __ubuf__ float* v131 = v129.data(); + uint64_t v132 = reinterpret_cast(v131); + TASSIGN(v130, v132); + pto::Shape<1, 1, 1, 1, 1024> v133 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v134 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v135 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v33 + (unsigned) v62 * (unsigned) v36 + v30 * (unsigned) v37), v133, v134); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v114, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCVT(v118, v114, v32); + pipe_barrier(PIPE_ALL); + TMUL(v122, v118, v118); + pipe_barrier(PIPE_ALL); + TROWSUM(v130, v122, v126); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + pipe_barrier(PIPE_ALL); + TADD(v64, v64, v130); + Tile v136; + TASSIGN(v136, v44); + Tile v137; + __ubuf__ bfloat16_t* v138 = v136.data(); + uint64_t v139 = reinterpret_cast(v138); + TASSIGN(v137, v139); + Tile v140; + TASSIGN(v140, v45); + Tile v141; + __ubuf__ float* v142 = v140.data(); + uint64_t v143 = reinterpret_cast(v142); + TASSIGN(v141, v143); + Tile v144; + TASSIGN(v144, v46); + Tile v145; + __ubuf__ float* v146 = v144.data(); + uint64_t v147 = reinterpret_cast(v146); + TASSIGN(v145, v147); + Tile v148; + TASSIGN(v148, v44); + Tile v149; + __ubuf__ float* v150 = v148.data(); + uint64_t v151 = reinterpret_cast(v150); + TASSIGN(v149, v151); + Tile v152; + TASSIGN(v152, v47); + Tile v153; + __ubuf__ float* v154 = v152.data(); + uint64_t v155 = reinterpret_cast(v154); + TASSIGN(v153, v155); 
+ pto::Shape<1, 1, 1, 1, 1024> v156 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v157 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v158 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v33 + (unsigned) v62 * (unsigned) v36 + v29 * (unsigned) v37), v156, v157); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + TLOAD(v137, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TCVT(v141, v137, v32); + pipe_barrier(PIPE_ALL); + TMUL(v145, v141, v141); + pipe_barrier(PIPE_ALL); + TROWSUM(v153, v145, v149); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TADD(v64, v64, v153); + Tile v159; + TASSIGN(v159, v44); + Tile v160; + __ubuf__ bfloat16_t* v161 = v159.data(); + uint64_t v162 = reinterpret_cast(v161); + TASSIGN(v160, v162); + Tile v163; + TASSIGN(v163, v45); + Tile v164; + __ubuf__ float* v165 = v163.data(); + uint64_t v166 = reinterpret_cast(v165); + TASSIGN(v164, v166); + Tile v167; + TASSIGN(v167, v46); + Tile v168; + __ubuf__ float* v169 = v167.data(); + uint64_t v170 = reinterpret_cast(v169); + TASSIGN(v168, v170); + Tile v171; + TASSIGN(v171, v44); + Tile v172; + __ubuf__ float* v173 = v171.data(); + uint64_t v174 = reinterpret_cast(v173); + TASSIGN(v172, v174); + Tile v175; + TASSIGN(v175, v47); + Tile v176; + __ubuf__ float* v177 = v175.data(); + uint64_t v178 = reinterpret_cast(v177); + TASSIGN(v176, v178); + pto::Shape<1, 1, 1, 1, 1024> v179 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v180 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v181 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v1 + (v33 + (unsigned) v62 * (unsigned) v36 + v28 * (unsigned) v37), v179, v180); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID4); + TLOAD(v160, v181); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + TCVT(v164, v160, v32); + pipe_barrier(PIPE_ALL); + TMUL(v168, v164, v164); + pipe_barrier(PIPE_ALL); + TROWSUM(v176, v168, v172); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + pipe_barrier(PIPE_ALL); + TADD(v64, v64, v176); + pipe_barrier(PIPE_ALL); + TMULS(v64, v64, v38); + pipe_barrier(PIPE_ALL); + TADDS(v64, v64, v39); + pipe_barrier(PIPE_ALL); + TRSQRT(v64, v64); + Tile v182; + TASSIGN(v182, v44); + Tile v183; + __ubuf__ float* v184 = v182.data(); + uint64_t v185 = reinterpret_cast(v184); + TASSIGN(v183, v185); + Tile v186; + TASSIGN(v186, v44); + Tile v187; + __ubuf__ bfloat16_t* v188 = v186.data(); + uint64_t v189 = reinterpret_cast(v188); + TASSIGN(v187, v189); + Tile v190; + TASSIGN(v190, v46); + Tile v191; + __ubuf__ float* v192 = v190.data(); + uint64_t v193 = reinterpret_cast(v192); + TASSIGN(v191, v193); + Tile v194; + TASSIGN(v194, v45); + Tile v195; + __ubuf__ float* v196 = v194.data(); + uint64_t v197 = reinterpret_cast(v196); + TASSIGN(v195, v197); + Tile v198; + TASSIGN(v198, v48); + Tile v199; + __ubuf__ float* v200 = v198.data(); + uint64_t v201 = reinterpret_cast(v200); + TASSIGN(v199, v201); + Tile v202; + TASSIGN(v202, v44); + Tile v203; + __ubuf__ float* v204 = v202.data(); + uint64_t v205 = reinterpret_cast(v204); + TASSIGN(v203, v205); + Tile v206; + TASSIGN(v206, v47); + Tile v207; + __ubuf__ float* v208 = v206.data(); + uint64_t v209 = reinterpret_cast(v208); + TASSIGN(v207, v209); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID5); + TLOAD(v187, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + pto::Shape<1, 1, 1, 1, 1024> v210 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v211 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v212 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v33 * (unsigned) v36 + v33 * (unsigned) v37), v210, 
v211); + TLOAD(v195, v212); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + TCVT(v191, v187, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + TMUL(v199, v191, v195); + pipe_barrier(PIPE_ALL); + TROWSUM(v207, v199, v203); + pipe_barrier(PIPE_ALL); + TMOV(v183, v207); + Tile v213; + TASSIGN(v213, v49); + Tile v214; + __ubuf__ bfloat16_t* v215 = v213.data(); + uint64_t v216 = reinterpret_cast(v215); + TASSIGN(v214, v216); + Tile v217; + TASSIGN(v217, v50); + Tile v218; + __ubuf__ float* v219 = v217.data(); + uint64_t v220 = reinterpret_cast(v219); + TASSIGN(v218, v220); + Tile v221; + TASSIGN(v221, v51); + Tile v222; + __ubuf__ float* v223 = v221.data(); + uint64_t v224 = reinterpret_cast(v223); + TASSIGN(v222, v224); + Tile v225; + TASSIGN(v225, v52); + Tile v226; + __ubuf__ float* v227 = v225.data(); + uint64_t v228 = reinterpret_cast(v227); + TASSIGN(v226, v228); + Tile v229; + TASSIGN(v229, v49); + Tile v230; + __ubuf__ float* v231 = v229.data(); + uint64_t v232 = reinterpret_cast(v231); + TASSIGN(v230, v232); + Tile v233; + TASSIGN(v233, v53); + Tile v234; + __ubuf__ float* v235 = v233.data(); + uint64_t v236 = reinterpret_cast(v235); + TASSIGN(v234, v236); + TLOAD(v214, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v237 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v238 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v239 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v33 * (unsigned) v36 + v31 * (unsigned) v37), v237, v238); + TLOAD(v222, v239); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v218, v214, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v226, v218, v222); + pipe_barrier(PIPE_ALL); + TROWSUM(v234, v226, v230); + set_flag(PIPE_V, 
PIPE_MTE2, EVENT_ID6); + pipe_barrier(PIPE_ALL); + TADD(v183, v183, v234); + Tile v240; + TASSIGN(v240, v49); + Tile v241; + __ubuf__ bfloat16_t* v242 = v240.data(); + uint64_t v243 = reinterpret_cast(v242); + TASSIGN(v241, v243); + Tile v244; + TASSIGN(v244, v50); + Tile v245; + __ubuf__ float* v246 = v244.data(); + uint64_t v247 = reinterpret_cast(v246); + TASSIGN(v245, v247); + Tile v248; + TASSIGN(v248, v51); + Tile v249; + __ubuf__ float* v250 = v248.data(); + uint64_t v251 = reinterpret_cast(v250); + TASSIGN(v249, v251); + Tile v252; + TASSIGN(v252, v52); + Tile v253; + __ubuf__ float* v254 = v252.data(); + uint64_t v255 = reinterpret_cast(v254); + TASSIGN(v253, v255); + Tile v256; + TASSIGN(v256, v49); + Tile v257; + __ubuf__ float* v258 = v256.data(); + uint64_t v259 = reinterpret_cast(v258); + TASSIGN(v257, v259); + Tile v260; + TASSIGN(v260, v53); + Tile v261; + __ubuf__ float* v262 = v260.data(); + uint64_t v263 = reinterpret_cast(v262); + TASSIGN(v261, v263); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID6); + TLOAD(v241, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v264 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v265 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v266 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v33 * (unsigned) v36 + v30 * (unsigned) v37), v264, v265); + TLOAD(v249, v266); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v245, v241, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v253, v245, v249); + pipe_barrier(PIPE_ALL); + TROWSUM(v261, v253, v257); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + pipe_barrier(PIPE_ALL); + TADD(v183, v183, v261); + Tile v267; + TASSIGN(v267, v49); + Tile v268; + __ubuf__ bfloat16_t* v269 = v267.data(); + uint64_t v270 = reinterpret_cast(v269); + TASSIGN(v268, 
v270); + Tile v271; + TASSIGN(v271, v50); + Tile v272; + __ubuf__ float* v273 = v271.data(); + uint64_t v274 = reinterpret_cast(v273); + TASSIGN(v272, v274); + Tile v275; + TASSIGN(v275, v51); + Tile v276; + __ubuf__ float* v277 = v275.data(); + uint64_t v278 = reinterpret_cast(v277); + TASSIGN(v276, v278); + Tile v279; + TASSIGN(v279, v52); + Tile v280; + __ubuf__ float* v281 = v279.data(); + uint64_t v282 = reinterpret_cast(v281); + TASSIGN(v280, v282); + Tile v283; + TASSIGN(v283, v49); + Tile v284; + __ubuf__ float* v285 = v283.data(); + uint64_t v286 = reinterpret_cast(v285); + TASSIGN(v284, v286); + Tile v287; + TASSIGN(v287, v53); + Tile v288; + __ubuf__ float* v289 = v287.data(); + uint64_t v290 = reinterpret_cast(v289); + TASSIGN(v288, v290); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID7); + TLOAD(v268, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v291 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v292 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v293 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v33 * (unsigned) v36 + v29 * (unsigned) v37), v291, v292); + TLOAD(v276, v293); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v272, v268, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v280, v272, v276); + pipe_barrier(PIPE_ALL); + TROWSUM(v288, v280, v284); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v183, v183, v288); + Tile v294; + TASSIGN(v294, v49); + Tile v295; + __ubuf__ bfloat16_t* v296 = v294.data(); + uint64_t v297 = reinterpret_cast(v296); + TASSIGN(v295, v297); + Tile v298; + TASSIGN(v298, v50); + Tile v299; + __ubuf__ float* v300 = v298.data(); + uint64_t v301 = reinterpret_cast(v300); + TASSIGN(v299, v301); + Tile v302; + TASSIGN(v302, v51); + Tile v303; + __ubuf__ float* 
v304 = v302.data(); + uint64_t v305 = reinterpret_cast(v304); + TASSIGN(v303, v305); + Tile v306; + TASSIGN(v306, v52); + Tile v307; + __ubuf__ float* v308 = v306.data(); + uint64_t v309 = reinterpret_cast(v308); + TASSIGN(v307, v309); + Tile v310; + TASSIGN(v310, v49); + Tile v311; + __ubuf__ float* v312 = v310.data(); + uint64_t v313 = reinterpret_cast(v312); + TASSIGN(v311, v313); + Tile v314; + TASSIGN(v314, v53); + Tile v315; + __ubuf__ float* v316 = v314.data(); + uint64_t v317 = reinterpret_cast(v316); + TASSIGN(v315, v317); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v295, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v318 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v319 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v320 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v33 * (unsigned) v36 + v28 * (unsigned) v37), v318, v319); + TLOAD(v303, v320); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v299, v295, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v307, v299, v303); + pipe_barrier(PIPE_ALL); + TROWSUM(v315, v307, v311); + pipe_barrier(PIPE_ALL); + TADD(v183, v183, v315); + pipe_barrier(PIPE_ALL); + TMUL(v183, v183, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v321 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v322 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v323 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v33 * (unsigned) v37), v321, v322); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v323, v183); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v324; + TASSIGN(v324, v44); + Tile v325; + 
__ubuf__ float* v326 = v324.data(); + uint64_t v327 = reinterpret_cast(v326); + TASSIGN(v325, v327); + Tile v328; + TASSIGN(v328, v44); + Tile v329; + __ubuf__ bfloat16_t* v330 = v328.data(); + uint64_t v331 = reinterpret_cast(v330); + TASSIGN(v329, v331); + Tile v332; + TASSIGN(v332, v46); + Tile v333; + __ubuf__ float* v334 = v332.data(); + uint64_t v335 = reinterpret_cast(v334); + TASSIGN(v333, v335); + Tile v336; + TASSIGN(v336, v45); + Tile v337; + __ubuf__ float* v338 = v336.data(); + uint64_t v339 = reinterpret_cast(v338); + TASSIGN(v337, v339); + Tile v340; + TASSIGN(v340, v48); + Tile v341; + __ubuf__ float* v342 = v340.data(); + uint64_t v343 = reinterpret_cast(v342); + TASSIGN(v341, v343); + Tile v344; + TASSIGN(v344, v44); + Tile v345; + __ubuf__ float* v346 = v344.data(); + uint64_t v347 = reinterpret_cast(v346); + TASSIGN(v345, v347); + Tile v348; + TASSIGN(v348, v47); + Tile v349; + __ubuf__ float* v350 = v348.data(); + uint64_t v351 = reinterpret_cast(v350); + TASSIGN(v349, v351); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v329, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v352 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v353 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v354 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v27 * (unsigned) v36 + v33 * (unsigned) v37), v352, v353); + TLOAD(v337, v354); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v333, v329, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v341, v333, v337); + pipe_barrier(PIPE_ALL); + TROWSUM(v349, v341, v345); + pipe_barrier(PIPE_ALL); + TMOV(v325, v349); + Tile v355; + TASSIGN(v355, v49); + Tile v356; + __ubuf__ bfloat16_t* v357 = v355.data(); + uint64_t v358 = reinterpret_cast(v357); + TASSIGN(v356, v358); + Tile v359; + 
TASSIGN(v359, v50); + Tile v360; + __ubuf__ float* v361 = v359.data(); + uint64_t v362 = reinterpret_cast(v361); + TASSIGN(v360, v362); + Tile v363; + TASSIGN(v363, v51); + Tile v364; + __ubuf__ float* v365 = v363.data(); + uint64_t v366 = reinterpret_cast(v365); + TASSIGN(v364, v366); + Tile v367; + TASSIGN(v367, v52); + Tile v368; + __ubuf__ float* v369 = v367.data(); + uint64_t v370 = reinterpret_cast(v369); + TASSIGN(v368, v370); + Tile v371; + TASSIGN(v371, v49); + Tile v372; + __ubuf__ float* v373 = v371.data(); + uint64_t v374 = reinterpret_cast(v373); + TASSIGN(v372, v374); + Tile v375; + TASSIGN(v375, v53); + Tile v376; + __ubuf__ float* v377 = v375.data(); + uint64_t v378 = reinterpret_cast(v377); + TASSIGN(v376, v378); + TLOAD(v356, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v379 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v380 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v381 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v27 * (unsigned) v36 + v31 * (unsigned) v37), v379, v380); + TLOAD(v364, v381); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v360, v356, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v368, v360, v364); + pipe_barrier(PIPE_ALL); + TROWSUM(v376, v368, v372); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v325, v325, v376); + Tile v382; + TASSIGN(v382, v49); + Tile v383; + __ubuf__ bfloat16_t* v384 = v382.data(); + uint64_t v385 = reinterpret_cast(v384); + TASSIGN(v383, v385); + Tile v386; + TASSIGN(v386, v50); + Tile v387; + __ubuf__ float* v388 = v386.data(); + uint64_t v389 = reinterpret_cast(v388); + TASSIGN(v387, v389); + Tile v390; + TASSIGN(v390, v51); + Tile v391; + __ubuf__ float* v392 = v390.data(); + uint64_t v393 = reinterpret_cast(v392); + 
TASSIGN(v391, v393); + Tile v394; + TASSIGN(v394, v52); + Tile v395; + __ubuf__ float* v396 = v394.data(); + uint64_t v397 = reinterpret_cast(v396); + TASSIGN(v395, v397); + Tile v398; + TASSIGN(v398, v49); + Tile v399; + __ubuf__ float* v400 = v398.data(); + uint64_t v401 = reinterpret_cast(v400); + TASSIGN(v399, v401); + Tile v402; + TASSIGN(v402, v53); + Tile v403; + __ubuf__ float* v404 = v402.data(); + uint64_t v405 = reinterpret_cast(v404); + TASSIGN(v403, v405); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v383, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v406 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v407 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v408 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v27 * (unsigned) v36 + v30 * (unsigned) v37), v406, v407); + TLOAD(v391, v408); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v387, v383, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v395, v387, v391); + pipe_barrier(PIPE_ALL); + TROWSUM(v403, v395, v399); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v325, v325, v403); + Tile v409; + TASSIGN(v409, v49); + Tile v410; + __ubuf__ bfloat16_t* v411 = v409.data(); + uint64_t v412 = reinterpret_cast(v411); + TASSIGN(v410, v412); + Tile v413; + TASSIGN(v413, v50); + Tile v414; + __ubuf__ float* v415 = v413.data(); + uint64_t v416 = reinterpret_cast(v415); + TASSIGN(v414, v416); + Tile v417; + TASSIGN(v417, v51); + Tile v418; + __ubuf__ float* v419 = v417.data(); + uint64_t v420 = reinterpret_cast(v419); + TASSIGN(v418, v420); + Tile v421; + TASSIGN(v421, v52); + Tile v422; + __ubuf__ float* v423 = v421.data(); + uint64_t v424 = reinterpret_cast(v423); + TASSIGN(v422, v424); + Tile v425; + TASSIGN(v425, v49); + Tile v426; + 
__ubuf__ float* v427 = v425.data(); + uint64_t v428 = reinterpret_cast(v427); + TASSIGN(v426, v428); + Tile v429; + TASSIGN(v429, v53); + Tile v430; + __ubuf__ float* v431 = v429.data(); + uint64_t v432 = reinterpret_cast(v431); + TASSIGN(v430, v432); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v410, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v433 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v434 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v435 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v27 * (unsigned) v36 + v29 * (unsigned) v37), v433, v434); + TLOAD(v418, v435); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v414, v410, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v422, v414, v418); + pipe_barrier(PIPE_ALL); + TROWSUM(v430, v422, v426); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v325, v325, v430); + Tile v436; + TASSIGN(v436, v49); + Tile v437; + __ubuf__ bfloat16_t* v438 = v436.data(); + uint64_t v439 = reinterpret_cast(v438); + TASSIGN(v437, v439); + Tile v440; + TASSIGN(v440, v50); + Tile v441; + __ubuf__ float* v442 = v440.data(); + uint64_t v443 = reinterpret_cast(v442); + TASSIGN(v441, v443); + Tile v444; + TASSIGN(v444, v51); + Tile v445; + __ubuf__ float* v446 = v444.data(); + uint64_t v447 = reinterpret_cast(v446); + TASSIGN(v445, v447); + Tile v448; + TASSIGN(v448, v52); + Tile v449; + __ubuf__ float* v450 = v448.data(); + uint64_t v451 = reinterpret_cast(v450); + TASSIGN(v449, v451); + Tile v452; + TASSIGN(v452, v49); + Tile v453; + __ubuf__ float* v454 = v452.data(); + uint64_t v455 = reinterpret_cast(v454); + TASSIGN(v453, v455); + Tile v456; + TASSIGN(v456, v53); + Tile v457; + __ubuf__ float* v458 = v456.data(); + uint64_t v459 = 
reinterpret_cast(v458); + TASSIGN(v457, v459); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v437, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v460 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v461 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v462 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v27 * (unsigned) v36 + v28 * (unsigned) v37), v460, v461); + TLOAD(v445, v462); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v441, v437, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v449, v441, v445); + pipe_barrier(PIPE_ALL); + TROWSUM(v457, v449, v453); + pipe_barrier(PIPE_ALL); + TADD(v325, v325, v457); + pipe_barrier(PIPE_ALL); + TMUL(v325, v325, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v463 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v464 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v465 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v27 * (unsigned) v37), v463, v464); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v465, v325); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + Tile v466; + TASSIGN(v466, v44); + Tile v467; + __ubuf__ float* v468 = v466.data(); + uint64_t v469 = reinterpret_cast(v468); + TASSIGN(v467, v469); + Tile v470; + TASSIGN(v470, v44); + Tile v471; + __ubuf__ bfloat16_t* v472 = v470.data(); + uint64_t v473 = reinterpret_cast(v472); + TASSIGN(v471, v473); + Tile v474; + TASSIGN(v474, v46); + Tile v475; + __ubuf__ float* v476 = v474.data(); + uint64_t v477 = reinterpret_cast(v476); + TASSIGN(v475, v477); + Tile v478; + TASSIGN(v478, v45); + Tile v479; + __ubuf__ float* v480 = v478.data(); + uint64_t v481 = 
reinterpret_cast(v480); + TASSIGN(v479, v481); + Tile v482; + TASSIGN(v482, v48); + Tile v483; + __ubuf__ float* v484 = v482.data(); + uint64_t v485 = reinterpret_cast(v484); + TASSIGN(v483, v485); + Tile v486; + TASSIGN(v486, v44); + Tile v487; + __ubuf__ float* v488 = v486.data(); + uint64_t v489 = reinterpret_cast(v488); + TASSIGN(v487, v489); + Tile v490; + TASSIGN(v490, v47); + Tile v491; + __ubuf__ float* v492 = v490.data(); + uint64_t v493 = reinterpret_cast(v492); + TASSIGN(v491, v493); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3); + TLOAD(v471, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v494 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v495 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v496 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v26 * (unsigned) v36 + v33 * (unsigned) v37), v494, v495); + TLOAD(v479, v496); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v475, v471, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v483, v475, v479); + pipe_barrier(PIPE_ALL); + TROWSUM(v491, v483, v487); + pipe_barrier(PIPE_ALL); + TMOV(v467, v491); + Tile v497; + TASSIGN(v497, v49); + Tile v498; + __ubuf__ bfloat16_t* v499 = v497.data(); + uint64_t v500 = reinterpret_cast(v499); + TASSIGN(v498, v500); + Tile v501; + TASSIGN(v501, v50); + Tile v502; + __ubuf__ float* v503 = v501.data(); + uint64_t v504 = reinterpret_cast(v503); + TASSIGN(v502, v504); + Tile v505; + TASSIGN(v505, v51); + Tile v506; + __ubuf__ float* v507 = v505.data(); + uint64_t v508 = reinterpret_cast(v507); + TASSIGN(v506, v508); + Tile v509; + TASSIGN(v509, v52); + Tile v510; + __ubuf__ float* v511 = v509.data(); + uint64_t v512 = reinterpret_cast(v511); + TASSIGN(v510, v512); + Tile v513; + TASSIGN(v513, v49); + Tile v514; + __ubuf__ float* v515 = 
v513.data(); + uint64_t v516 = reinterpret_cast(v515); + TASSIGN(v514, v516); + Tile v517; + TASSIGN(v517, v53); + Tile v518; + __ubuf__ float* v519 = v517.data(); + uint64_t v520 = reinterpret_cast(v519); + TASSIGN(v518, v520); + TLOAD(v498, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v521 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v522 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v523 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v26 * (unsigned) v36 + v31 * (unsigned) v37), v521, v522); + TLOAD(v506, v523); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v502, v498, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v510, v502, v506); + pipe_barrier(PIPE_ALL); + TROWSUM(v518, v510, v514); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v467, v467, v518); + Tile v524; + TASSIGN(v524, v49); + Tile v525; + __ubuf__ bfloat16_t* v526 = v524.data(); + uint64_t v527 = reinterpret_cast(v526); + TASSIGN(v525, v527); + Tile v528; + TASSIGN(v528, v50); + Tile v529; + __ubuf__ float* v530 = v528.data(); + uint64_t v531 = reinterpret_cast(v530); + TASSIGN(v529, v531); + Tile v532; + TASSIGN(v532, v51); + Tile v533; + __ubuf__ float* v534 = v532.data(); + uint64_t v535 = reinterpret_cast(v534); + TASSIGN(v533, v535); + Tile v536; + TASSIGN(v536, v52); + Tile v537; + __ubuf__ float* v538 = v536.data(); + uint64_t v539 = reinterpret_cast(v538); + TASSIGN(v537, v539); + Tile v540; + TASSIGN(v540, v49); + Tile v541; + __ubuf__ float* v542 = v540.data(); + uint64_t v543 = reinterpret_cast(v542); + TASSIGN(v541, v543); + Tile v544; + TASSIGN(v544, v53); + Tile v545; + __ubuf__ float* v546 = v544.data(); + uint64_t v547 = reinterpret_cast(v546); + TASSIGN(v545, v547); + wait_flag(PIPE_V, PIPE_MTE2, 
EVENT_ID2); + TLOAD(v525, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v548 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v549 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v550 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v26 * (unsigned) v36 + v30 * (unsigned) v37), v548, v549); + TLOAD(v533, v550); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v529, v525, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v537, v529, v533); + pipe_barrier(PIPE_ALL); + TROWSUM(v545, v537, v541); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v467, v467, v545); + Tile v551; + TASSIGN(v551, v49); + Tile v552; + __ubuf__ bfloat16_t* v553 = v551.data(); + uint64_t v554 = reinterpret_cast(v553); + TASSIGN(v552, v554); + Tile v555; + TASSIGN(v555, v50); + Tile v556; + __ubuf__ float* v557 = v555.data(); + uint64_t v558 = reinterpret_cast(v557); + TASSIGN(v556, v558); + Tile v559; + TASSIGN(v559, v51); + Tile v560; + __ubuf__ float* v561 = v559.data(); + uint64_t v562 = reinterpret_cast(v561); + TASSIGN(v560, v562); + Tile v563; + TASSIGN(v563, v52); + Tile v564; + __ubuf__ float* v565 = v563.data(); + uint64_t v566 = reinterpret_cast(v565); + TASSIGN(v564, v566); + Tile v567; + TASSIGN(v567, v49); + Tile v568; + __ubuf__ float* v569 = v567.data(); + uint64_t v570 = reinterpret_cast(v569); + TASSIGN(v568, v570); + Tile v571; + TASSIGN(v571, v53); + Tile v572; + __ubuf__ float* v573 = v571.data(); + uint64_t v574 = reinterpret_cast(v573); + TASSIGN(v572, v574); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v552, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v575 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v576 = pto::Stride<5120, 5120, 5120, 
5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v577 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v26 * (unsigned) v36 + v29 * (unsigned) v37), v575, v576); + TLOAD(v560, v577); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v556, v552, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v564, v556, v560); + pipe_barrier(PIPE_ALL); + TROWSUM(v572, v564, v568); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v467, v467, v572); + Tile v578; + TASSIGN(v578, v49); + Tile v579; + __ubuf__ bfloat16_t* v580 = v578.data(); + uint64_t v581 = reinterpret_cast(v580); + TASSIGN(v579, v581); + Tile v582; + TASSIGN(v582, v50); + Tile v583; + __ubuf__ float* v584 = v582.data(); + uint64_t v585 = reinterpret_cast(v584); + TASSIGN(v583, v585); + Tile v586; + TASSIGN(v586, v51); + Tile v587; + __ubuf__ float* v588 = v586.data(); + uint64_t v589 = reinterpret_cast(v588); + TASSIGN(v587, v589); + Tile v590; + TASSIGN(v590, v52); + Tile v591; + __ubuf__ float* v592 = v590.data(); + uint64_t v593 = reinterpret_cast(v592); + TASSIGN(v591, v593); + Tile v594; + TASSIGN(v594, v49); + Tile v595; + __ubuf__ float* v596 = v594.data(); + uint64_t v597 = reinterpret_cast(v596); + TASSIGN(v595, v597); + Tile v598; + TASSIGN(v598, v53); + Tile v599; + __ubuf__ float* v600 = v598.data(); + uint64_t v601 = reinterpret_cast(v600); + TASSIGN(v599, v601); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v579, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v602 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v603 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v604 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v26 * (unsigned) v36 + v28 * (unsigned) v37), 
v602, v603); + TLOAD(v587, v604); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v583, v579, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v591, v583, v587); + pipe_barrier(PIPE_ALL); + TROWSUM(v599, v591, v595); + pipe_barrier(PIPE_ALL); + TADD(v467, v467, v599); + pipe_barrier(PIPE_ALL); + TMUL(v467, v467, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pto::Shape<1, 1, 1, 1, 1> v605 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v606 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v607 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v26 * (unsigned) v37), v605, v606); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v607, v467); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4); + Tile v608; + TASSIGN(v608, v44); + Tile v609; + __ubuf__ float* v610 = v608.data(); + uint64_t v611 = reinterpret_cast(v610); + TASSIGN(v609, v611); + Tile v612; + TASSIGN(v612, v44); + Tile v613; + __ubuf__ bfloat16_t* v614 = v612.data(); + uint64_t v615 = reinterpret_cast(v614); + TASSIGN(v613, v615); + Tile v616; + TASSIGN(v616, v46); + Tile v617; + __ubuf__ float* v618 = v616.data(); + uint64_t v619 = reinterpret_cast(v618); + TASSIGN(v617, v619); + Tile v620; + TASSIGN(v620, v45); + Tile v621; + __ubuf__ float* v622 = v620.data(); + uint64_t v623 = reinterpret_cast(v622); + TASSIGN(v621, v623); + Tile v624; + TASSIGN(v624, v48); + Tile v625; + __ubuf__ float* v626 = v624.data(); + uint64_t v627 = reinterpret_cast(v626); + TASSIGN(v625, v627); + Tile v628; + TASSIGN(v628, v44); + Tile v629; + __ubuf__ float* v630 = v628.data(); + uint64_t v631 = reinterpret_cast(v630); + TASSIGN(v629, v631); + Tile v632; + TASSIGN(v632, v47); + Tile v633; + __ubuf__ float* v634 = v632.data(); + uint64_t v635 = reinterpret_cast(v634); + TASSIGN(v633, v635); + wait_flag(PIPE_MTE3, PIPE_MTE2, 
EVENT_ID4); + TLOAD(v613, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v636 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v637 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v638 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v25 * (unsigned) v36 + v33 * (unsigned) v37), v636, v637); + TLOAD(v621, v638); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v617, v613, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v625, v617, v621); + pipe_barrier(PIPE_ALL); + TROWSUM(v633, v625, v629); + pipe_barrier(PIPE_ALL); + TMOV(v609, v633); + Tile v639; + TASSIGN(v639, v49); + Tile v640; + __ubuf__ bfloat16_t* v641 = v639.data(); + uint64_t v642 = reinterpret_cast(v641); + TASSIGN(v640, v642); + Tile v643; + TASSIGN(v643, v50); + Tile v644; + __ubuf__ float* v645 = v643.data(); + uint64_t v646 = reinterpret_cast(v645); + TASSIGN(v644, v646); + Tile v647; + TASSIGN(v647, v51); + Tile v648; + __ubuf__ float* v649 = v647.data(); + uint64_t v650 = reinterpret_cast(v649); + TASSIGN(v648, v650); + Tile v651; + TASSIGN(v651, v52); + Tile v652; + __ubuf__ float* v653 = v651.data(); + uint64_t v654 = reinterpret_cast(v653); + TASSIGN(v652, v654); + Tile v655; + TASSIGN(v655, v49); + Tile v656; + __ubuf__ float* v657 = v655.data(); + uint64_t v658 = reinterpret_cast(v657); + TASSIGN(v656, v658); + Tile v659; + TASSIGN(v659, v53); + Tile v660; + __ubuf__ float* v661 = v659.data(); + uint64_t v662 = reinterpret_cast(v661); + TASSIGN(v660, v662); + TLOAD(v640, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v663 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v664 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v665 = 
GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v25 * (unsigned) v36 + v31 * (unsigned) v37), v663, v664); + TLOAD(v648, v665); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v644, v640, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v652, v644, v648); + pipe_barrier(PIPE_ALL); + TROWSUM(v660, v652, v656); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v609, v609, v660); + Tile v666; + TASSIGN(v666, v49); + Tile v667; + __ubuf__ bfloat16_t* v668 = v666.data(); + uint64_t v669 = reinterpret_cast(v668); + TASSIGN(v667, v669); + Tile v670; + TASSIGN(v670, v50); + Tile v671; + __ubuf__ float* v672 = v670.data(); + uint64_t v673 = reinterpret_cast(v672); + TASSIGN(v671, v673); + Tile v674; + TASSIGN(v674, v51); + Tile v675; + __ubuf__ float* v676 = v674.data(); + uint64_t v677 = reinterpret_cast(v676); + TASSIGN(v675, v677); + Tile v678; + TASSIGN(v678, v52); + Tile v679; + __ubuf__ float* v680 = v678.data(); + uint64_t v681 = reinterpret_cast(v680); + TASSIGN(v679, v681); + Tile v682; + TASSIGN(v682, v49); + Tile v683; + __ubuf__ float* v684 = v682.data(); + uint64_t v685 = reinterpret_cast(v684); + TASSIGN(v683, v685); + Tile v686; + TASSIGN(v686, v53); + Tile v687; + __ubuf__ float* v688 = v686.data(); + uint64_t v689 = reinterpret_cast(v688); + TASSIGN(v687, v689); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v667, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v690 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v691 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v692 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v25 * (unsigned) v36 + v30 * (unsigned) v37), v690, v691); + TLOAD(v675, v692); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + 
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v671, v667, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v679, v671, v675); + pipe_barrier(PIPE_ALL); + TROWSUM(v687, v679, v683); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v609, v609, v687); + Tile v693; + TASSIGN(v693, v49); + Tile v694; + __ubuf__ bfloat16_t* v695 = v693.data(); + uint64_t v696 = reinterpret_cast(v695); + TASSIGN(v694, v696); + Tile v697; + TASSIGN(v697, v50); + Tile v698; + __ubuf__ float* v699 = v697.data(); + uint64_t v700 = reinterpret_cast(v699); + TASSIGN(v698, v700); + Tile v701; + TASSIGN(v701, v51); + Tile v702; + __ubuf__ float* v703 = v701.data(); + uint64_t v704 = reinterpret_cast(v703); + TASSIGN(v702, v704); + Tile v705; + TASSIGN(v705, v52); + Tile v706; + __ubuf__ float* v707 = v705.data(); + uint64_t v708 = reinterpret_cast(v707); + TASSIGN(v706, v708); + Tile v709; + TASSIGN(v709, v49); + Tile v710; + __ubuf__ float* v711 = v709.data(); + uint64_t v712 = reinterpret_cast(v711); + TASSIGN(v710, v712); + Tile v713; + TASSIGN(v713, v53); + Tile v714; + __ubuf__ float* v715 = v713.data(); + uint64_t v716 = reinterpret_cast(v715); + TASSIGN(v714, v716); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v694, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v717 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v718 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v719 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v25 * (unsigned) v36 + v29 * (unsigned) v37), v717, v718); + TLOAD(v702, v719); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v698, v694, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v706, v698, v702); + pipe_barrier(PIPE_ALL); + TROWSUM(v714, v706, v710); + 
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v609, v609, v714); + Tile v720; + TASSIGN(v720, v49); + Tile v721; + __ubuf__ bfloat16_t* v722 = v720.data(); + uint64_t v723 = reinterpret_cast(v722); + TASSIGN(v721, v723); + Tile v724; + TASSIGN(v724, v50); + Tile v725; + __ubuf__ float* v726 = v724.data(); + uint64_t v727 = reinterpret_cast(v726); + TASSIGN(v725, v727); + Tile v728; + TASSIGN(v728, v51); + Tile v729; + __ubuf__ float* v730 = v728.data(); + uint64_t v731 = reinterpret_cast(v730); + TASSIGN(v729, v731); + Tile v732; + TASSIGN(v732, v52); + Tile v733; + __ubuf__ float* v734 = v732.data(); + uint64_t v735 = reinterpret_cast(v734); + TASSIGN(v733, v735); + Tile v736; + TASSIGN(v736, v49); + Tile v737; + __ubuf__ float* v738 = v736.data(); + uint64_t v739 = reinterpret_cast(v738); + TASSIGN(v737, v739); + Tile v740; + TASSIGN(v740, v53); + Tile v741; + __ubuf__ float* v742 = v740.data(); + uint64_t v743 = reinterpret_cast(v742); + TASSIGN(v741, v743); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v721, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v744 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v745 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v746 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v25 * (unsigned) v36 + v28 * (unsigned) v37), v744, v745); + TLOAD(v729, v746); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v725, v721, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v733, v725, v729); + pipe_barrier(PIPE_ALL); + TROWSUM(v741, v733, v737); + pipe_barrier(PIPE_ALL); + TADD(v609, v609, v741); + pipe_barrier(PIPE_ALL); + TMUL(v609, v609, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 1> v747 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 
24, 24, 24, 1> v748 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v749 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v25 * (unsigned) v37), v747, v748); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TSTORE(v749, v609); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + Tile v750; + TASSIGN(v750, v44); + Tile v751; + __ubuf__ float* v752 = v750.data(); + uint64_t v753 = reinterpret_cast(v752); + TASSIGN(v751, v753); + Tile v754; + TASSIGN(v754, v44); + Tile v755; + __ubuf__ bfloat16_t* v756 = v754.data(); + uint64_t v757 = reinterpret_cast(v756); + TASSIGN(v755, v757); + Tile v758; + TASSIGN(v758, v46); + Tile v759; + __ubuf__ float* v760 = v758.data(); + uint64_t v761 = reinterpret_cast(v760); + TASSIGN(v759, v761); + Tile v762; + TASSIGN(v762, v45); + Tile v763; + __ubuf__ float* v764 = v762.data(); + uint64_t v765 = reinterpret_cast(v764); + TASSIGN(v763, v765); + Tile v766; + TASSIGN(v766, v48); + Tile v767; + __ubuf__ float* v768 = v766.data(); + uint64_t v769 = reinterpret_cast(v768); + TASSIGN(v767, v769); + Tile v770; + TASSIGN(v770, v44); + Tile v771; + __ubuf__ float* v772 = v770.data(); + uint64_t v773 = reinterpret_cast(v772); + TASSIGN(v771, v773); + Tile v774; + TASSIGN(v774, v47); + Tile v775; + __ubuf__ float* v776 = v774.data(); + uint64_t v777 = reinterpret_cast(v776); + TASSIGN(v775, v777); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID5); + TLOAD(v755, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v778 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v779 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v780 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v24 * (unsigned) v36 + v33 * (unsigned) v37), v778, v779); + TLOAD(v763, v780); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + 
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v759, v755, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v767, v759, v763); + pipe_barrier(PIPE_ALL); + TROWSUM(v775, v767, v771); + pipe_barrier(PIPE_ALL); + TMOV(v751, v775); + Tile v781; + TASSIGN(v781, v49); + Tile v782; + __ubuf__ bfloat16_t* v783 = v781.data(); + uint64_t v784 = reinterpret_cast(v783); + TASSIGN(v782, v784); + Tile v785; + TASSIGN(v785, v50); + Tile v786; + __ubuf__ float* v787 = v785.data(); + uint64_t v788 = reinterpret_cast(v787); + TASSIGN(v786, v788); + Tile v789; + TASSIGN(v789, v51); + Tile v790; + __ubuf__ float* v791 = v789.data(); + uint64_t v792 = reinterpret_cast(v791); + TASSIGN(v790, v792); + Tile v793; + TASSIGN(v793, v52); + Tile v794; + __ubuf__ float* v795 = v793.data(); + uint64_t v796 = reinterpret_cast(v795); + TASSIGN(v794, v796); + Tile v797; + TASSIGN(v797, v49); + Tile v798; + __ubuf__ float* v799 = v797.data(); + uint64_t v800 = reinterpret_cast(v799); + TASSIGN(v798, v800); + Tile v801; + TASSIGN(v801, v53); + Tile v802; + __ubuf__ float* v803 = v801.data(); + uint64_t v804 = reinterpret_cast(v803); + TASSIGN(v802, v804); + TLOAD(v782, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v805 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v806 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v807 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v24 * (unsigned) v36 + v31 * (unsigned) v37), v805, v806); + TLOAD(v790, v807); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v786, v782, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v794, v786, v790); + pipe_barrier(PIPE_ALL); + TROWSUM(v802, v794, v798); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v751, v751, v802); + Tile 
v808; + TASSIGN(v808, v49); + Tile v809; + __ubuf__ bfloat16_t* v810 = v808.data(); + uint64_t v811 = reinterpret_cast(v810); + TASSIGN(v809, v811); + Tile v812; + TASSIGN(v812, v50); + Tile v813; + __ubuf__ float* v814 = v812.data(); + uint64_t v815 = reinterpret_cast(v814); + TASSIGN(v813, v815); + Tile v816; + TASSIGN(v816, v51); + Tile v817; + __ubuf__ float* v818 = v816.data(); + uint64_t v819 = reinterpret_cast(v818); + TASSIGN(v817, v819); + Tile v820; + TASSIGN(v820, v52); + Tile v821; + __ubuf__ float* v822 = v820.data(); + uint64_t v823 = reinterpret_cast(v822); + TASSIGN(v821, v823); + Tile v824; + TASSIGN(v824, v49); + Tile v825; + __ubuf__ float* v826 = v824.data(); + uint64_t v827 = reinterpret_cast(v826); + TASSIGN(v825, v827); + Tile v828; + TASSIGN(v828, v53); + Tile v829; + __ubuf__ float* v830 = v828.data(); + uint64_t v831 = reinterpret_cast(v830); + TASSIGN(v829, v831); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v809, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v832 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v833 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v834 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v24 * (unsigned) v36 + v30 * (unsigned) v37), v832, v833); + TLOAD(v817, v834); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v813, v809, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v821, v813, v817); + pipe_barrier(PIPE_ALL); + TROWSUM(v829, v821, v825); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v751, v751, v829); + Tile v835; + TASSIGN(v835, v49); + Tile v836; + __ubuf__ bfloat16_t* v837 = v835.data(); + uint64_t v838 = reinterpret_cast(v837); + TASSIGN(v836, v838); + Tile v839; + TASSIGN(v839, v50); + Tile v840; + __ubuf__ float* v841 = 
v839.data(); + uint64_t v842 = reinterpret_cast(v841); + TASSIGN(v840, v842); + Tile v843; + TASSIGN(v843, v51); + Tile v844; + __ubuf__ float* v845 = v843.data(); + uint64_t v846 = reinterpret_cast(v845); + TASSIGN(v844, v846); + Tile v847; + TASSIGN(v847, v52); + Tile v848; + __ubuf__ float* v849 = v847.data(); + uint64_t v850 = reinterpret_cast(v849); + TASSIGN(v848, v850); + Tile v851; + TASSIGN(v851, v49); + Tile v852; + __ubuf__ float* v853 = v851.data(); + uint64_t v854 = reinterpret_cast(v853); + TASSIGN(v852, v854); + Tile v855; + TASSIGN(v855, v53); + Tile v856; + __ubuf__ float* v857 = v855.data(); + uint64_t v858 = reinterpret_cast(v857); + TASSIGN(v856, v858); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v836, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v859 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v860 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v861 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v24 * (unsigned) v36 + v29 * (unsigned) v37), v859, v860); + TLOAD(v844, v861); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v840, v836, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v848, v840, v844); + pipe_barrier(PIPE_ALL); + TROWSUM(v856, v848, v852); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v751, v751, v856); + Tile v862; + TASSIGN(v862, v49); + Tile v863; + __ubuf__ bfloat16_t* v864 = v862.data(); + uint64_t v865 = reinterpret_cast(v864); + TASSIGN(v863, v865); + Tile v866; + TASSIGN(v866, v50); + Tile v867; + __ubuf__ float* v868 = v866.data(); + uint64_t v869 = reinterpret_cast(v868); + TASSIGN(v867, v869); + Tile v870; + TASSIGN(v870, v51); + Tile v871; + __ubuf__ float* v872 = v870.data(); + uint64_t v873 = reinterpret_cast(v872); + TASSIGN(v871, 
v873); + Tile v874; + TASSIGN(v874, v52); + Tile v875; + __ubuf__ float* v876 = v874.data(); + uint64_t v877 = reinterpret_cast(v876); + TASSIGN(v875, v877); + Tile v878; + TASSIGN(v878, v49); + Tile v879; + __ubuf__ float* v880 = v878.data(); + uint64_t v881 = reinterpret_cast(v880); + TASSIGN(v879, v881); + Tile v882; + TASSIGN(v882, v53); + Tile v883; + __ubuf__ float* v884 = v882.data(); + uint64_t v885 = reinterpret_cast(v884); + TASSIGN(v883, v885); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v863, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v886 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v887 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v888 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v24 * (unsigned) v36 + v28 * (unsigned) v37), v886, v887); + TLOAD(v871, v888); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v867, v863, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v875, v867, v871); + pipe_barrier(PIPE_ALL); + TROWSUM(v883, v875, v879); + pipe_barrier(PIPE_ALL); + TADD(v751, v751, v883); + pipe_barrier(PIPE_ALL); + TMUL(v751, v751, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + pto::Shape<1, 1, 1, 1, 1> v889 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v890 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v891 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v24 * (unsigned) v37), v889, v890); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TSTORE(v891, v751); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + Tile v892; + TASSIGN(v892, v44); + Tile v893; + __ubuf__ float* v894 = v892.data(); + uint64_t v895 = reinterpret_cast(v894); + TASSIGN(v893, v895); + Tile 
v896; + TASSIGN(v896, v44); + Tile v897; + __ubuf__ bfloat16_t* v898 = v896.data(); + uint64_t v899 = reinterpret_cast(v898); + TASSIGN(v897, v899); + Tile v900; + TASSIGN(v900, v46); + Tile v901; + __ubuf__ float* v902 = v900.data(); + uint64_t v903 = reinterpret_cast(v902); + TASSIGN(v901, v903); + Tile v904; + TASSIGN(v904, v45); + Tile v905; + __ubuf__ float* v906 = v904.data(); + uint64_t v907 = reinterpret_cast(v906); + TASSIGN(v905, v907); + Tile v908; + TASSIGN(v908, v48); + Tile v909; + __ubuf__ float* v910 = v908.data(); + uint64_t v911 = reinterpret_cast(v910); + TASSIGN(v909, v911); + Tile v912; + TASSIGN(v912, v44); + Tile v913; + __ubuf__ float* v914 = v912.data(); + uint64_t v915 = reinterpret_cast(v914); + TASSIGN(v913, v915); + Tile v916; + TASSIGN(v916, v47); + Tile v917; + __ubuf__ float* v918 = v916.data(); + uint64_t v919 = reinterpret_cast(v918); + TASSIGN(v917, v919); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID6); + TLOAD(v897, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v920 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v921 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v922 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v23 * (unsigned) v36 + v33 * (unsigned) v37), v920, v921); + TLOAD(v905, v922); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v901, v897, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v909, v901, v905); + pipe_barrier(PIPE_ALL); + TROWSUM(v917, v909, v913); + pipe_barrier(PIPE_ALL); + TMOV(v893, v917); + Tile v923; + TASSIGN(v923, v49); + Tile v924; + __ubuf__ bfloat16_t* v925 = v923.data(); + uint64_t v926 = reinterpret_cast(v925); + TASSIGN(v924, v926); + Tile v927; + TASSIGN(v927, v50); + Tile v928; + __ubuf__ float* v929 = v927.data(); + uint64_t v930 = 
reinterpret_cast(v929); + TASSIGN(v928, v930); + Tile v931; + TASSIGN(v931, v51); + Tile v932; + __ubuf__ float* v933 = v931.data(); + uint64_t v934 = reinterpret_cast(v933); + TASSIGN(v932, v934); + Tile v935; + TASSIGN(v935, v52); + Tile v936; + __ubuf__ float* v937 = v935.data(); + uint64_t v938 = reinterpret_cast(v937); + TASSIGN(v936, v938); + Tile v939; + TASSIGN(v939, v49); + Tile v940; + __ubuf__ float* v941 = v939.data(); + uint64_t v942 = reinterpret_cast(v941); + TASSIGN(v940, v942); + Tile v943; + TASSIGN(v943, v53); + Tile v944; + __ubuf__ float* v945 = v943.data(); + uint64_t v946 = reinterpret_cast(v945); + TASSIGN(v944, v946); + TLOAD(v924, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v947 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v948 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v949 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v23 * (unsigned) v36 + v31 * (unsigned) v37), v947, v948); + TLOAD(v932, v949); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v928, v924, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v936, v928, v932); + pipe_barrier(PIPE_ALL); + TROWSUM(v944, v936, v940); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v893, v893, v944); + Tile v950; + TASSIGN(v950, v49); + Tile v951; + __ubuf__ bfloat16_t* v952 = v950.data(); + uint64_t v953 = reinterpret_cast(v952); + TASSIGN(v951, v953); + Tile v954; + TASSIGN(v954, v50); + Tile v955; + __ubuf__ float* v956 = v954.data(); + uint64_t v957 = reinterpret_cast(v956); + TASSIGN(v955, v957); + Tile v958; + TASSIGN(v958, v51); + Tile v959; + __ubuf__ float* v960 = v958.data(); + uint64_t v961 = reinterpret_cast(v960); + TASSIGN(v959, v961); + Tile v962; + TASSIGN(v962, v52); + Tile v963; + __ubuf__ float* v964 
= v962.data(); + uint64_t v965 = reinterpret_cast(v964); + TASSIGN(v963, v965); + Tile v966; + TASSIGN(v966, v49); + Tile v967; + __ubuf__ float* v968 = v966.data(); + uint64_t v969 = reinterpret_cast(v968); + TASSIGN(v967, v969); + Tile v970; + TASSIGN(v970, v53); + Tile v971; + __ubuf__ float* v972 = v970.data(); + uint64_t v973 = reinterpret_cast(v972); + TASSIGN(v971, v973); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v951, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v974 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v975 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v976 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v23 * (unsigned) v36 + v30 * (unsigned) v37), v974, v975); + TLOAD(v959, v976); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v955, v951, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v963, v955, v959); + pipe_barrier(PIPE_ALL); + TROWSUM(v971, v963, v967); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v893, v893, v971); + Tile v977; + TASSIGN(v977, v49); + Tile v978; + __ubuf__ bfloat16_t* v979 = v977.data(); + uint64_t v980 = reinterpret_cast(v979); + TASSIGN(v978, v980); + Tile v981; + TASSIGN(v981, v50); + Tile v982; + __ubuf__ float* v983 = v981.data(); + uint64_t v984 = reinterpret_cast(v983); + TASSIGN(v982, v984); + Tile v985; + TASSIGN(v985, v51); + Tile v986; + __ubuf__ float* v987 = v985.data(); + uint64_t v988 = reinterpret_cast(v987); + TASSIGN(v986, v988); + Tile v989; + TASSIGN(v989, v52); + Tile v990; + __ubuf__ float* v991 = v989.data(); + uint64_t v992 = reinterpret_cast(v991); + TASSIGN(v990, v992); + Tile v993; + TASSIGN(v993, v49); + Tile v994; + __ubuf__ float* v995 = v993.data(); + uint64_t v996 = reinterpret_cast(v995); + TASSIGN(v994, 
v996); + Tile v997; + TASSIGN(v997, v53); + Tile v998; + __ubuf__ float* v999 = v997.data(); + uint64_t v1000 = reinterpret_cast(v999); + TASSIGN(v998, v1000); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v978, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1001 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1002 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1003 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v23 * (unsigned) v36 + v29 * (unsigned) v37), v1001, v1002); + TLOAD(v986, v1003); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v982, v978, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v990, v982, v986); + pipe_barrier(PIPE_ALL); + TROWSUM(v998, v990, v994); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v893, v893, v998); + Tile v1004; + TASSIGN(v1004, v49); + Tile v1005; + __ubuf__ bfloat16_t* v1006 = v1004.data(); + uint64_t v1007 = reinterpret_cast(v1006); + TASSIGN(v1005, v1007); + Tile v1008; + TASSIGN(v1008, v50); + Tile v1009; + __ubuf__ float* v1010 = v1008.data(); + uint64_t v1011 = reinterpret_cast(v1010); + TASSIGN(v1009, v1011); + Tile v1012; + TASSIGN(v1012, v51); + Tile v1013; + __ubuf__ float* v1014 = v1012.data(); + uint64_t v1015 = reinterpret_cast(v1014); + TASSIGN(v1013, v1015); + Tile v1016; + TASSIGN(v1016, v52); + Tile v1017; + __ubuf__ float* v1018 = v1016.data(); + uint64_t v1019 = reinterpret_cast(v1018); + TASSIGN(v1017, v1019); + Tile v1020; + TASSIGN(v1020, v49); + Tile v1021; + __ubuf__ float* v1022 = v1020.data(); + uint64_t v1023 = reinterpret_cast(v1022); + TASSIGN(v1021, v1023); + Tile v1024; + TASSIGN(v1024, v53); + Tile v1025; + __ubuf__ float* v1026 = v1024.data(); + uint64_t v1027 = reinterpret_cast(v1026); + TASSIGN(v1025, v1027); + 
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1005, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1028 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1029 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1030 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v23 * (unsigned) v36 + v28 * (unsigned) v37), v1028, v1029); + TLOAD(v1013, v1030); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1009, v1005, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1017, v1009, v1013); + pipe_barrier(PIPE_ALL); + TROWSUM(v1025, v1017, v1021); + pipe_barrier(PIPE_ALL); + TADD(v893, v893, v1025); + pipe_barrier(PIPE_ALL); + TMUL(v893, v893, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + pto::Shape<1, 1, 1, 1, 1> v1031 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1032 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1033 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v23 * (unsigned) v37), v1031, v1032); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + TSTORE(v1033, v893); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + Tile v1034; + TASSIGN(v1034, v44); + Tile v1035; + __ubuf__ float* v1036 = v1034.data(); + uint64_t v1037 = reinterpret_cast(v1036); + TASSIGN(v1035, v1037); + Tile v1038; + TASSIGN(v1038, v44); + Tile v1039; + __ubuf__ bfloat16_t* v1040 = v1038.data(); + uint64_t v1041 = reinterpret_cast(v1040); + TASSIGN(v1039, v1041); + Tile v1042; + TASSIGN(v1042, v46); + Tile v1043; + __ubuf__ float* v1044 = v1042.data(); + uint64_t v1045 = reinterpret_cast(v1044); + TASSIGN(v1043, v1045); + Tile v1046; + TASSIGN(v1046, v45); + Tile v1047; + __ubuf__ float* v1048 = v1046.data(); + uint64_t v1049 = 
reinterpret_cast(v1048); + TASSIGN(v1047, v1049); + Tile v1050; + TASSIGN(v1050, v48); + Tile v1051; + __ubuf__ float* v1052 = v1050.data(); + uint64_t v1053 = reinterpret_cast(v1052); + TASSIGN(v1051, v1053); + Tile v1054; + TASSIGN(v1054, v44); + Tile v1055; + __ubuf__ float* v1056 = v1054.data(); + uint64_t v1057 = reinterpret_cast(v1056); + TASSIGN(v1055, v1057); + Tile v1058; + TASSIGN(v1058, v47); + Tile v1059; + __ubuf__ float* v1060 = v1058.data(); + uint64_t v1061 = reinterpret_cast(v1060); + TASSIGN(v1059, v1061); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID7); + TLOAD(v1039, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1062 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1063 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1064 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v22 * (unsigned) v36 + v33 * (unsigned) v37), v1062, v1063); + TLOAD(v1047, v1064); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1043, v1039, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1051, v1043, v1047); + pipe_barrier(PIPE_ALL); + TROWSUM(v1059, v1051, v1055); + pipe_barrier(PIPE_ALL); + TMOV(v1035, v1059); + Tile v1065; + TASSIGN(v1065, v49); + Tile v1066; + __ubuf__ bfloat16_t* v1067 = v1065.data(); + uint64_t v1068 = reinterpret_cast(v1067); + TASSIGN(v1066, v1068); + Tile v1069; + TASSIGN(v1069, v50); + Tile v1070; + __ubuf__ float* v1071 = v1069.data(); + uint64_t v1072 = reinterpret_cast(v1071); + TASSIGN(v1070, v1072); + Tile v1073; + TASSIGN(v1073, v51); + Tile v1074; + __ubuf__ float* v1075 = v1073.data(); + uint64_t v1076 = reinterpret_cast(v1075); + TASSIGN(v1074, v1076); + Tile v1077; + TASSIGN(v1077, v52); + Tile v1078; + __ubuf__ float* v1079 = v1077.data(); + uint64_t v1080 = reinterpret_cast(v1079); + 
TASSIGN(v1078, v1080); + Tile v1081; + TASSIGN(v1081, v49); + Tile v1082; + __ubuf__ float* v1083 = v1081.data(); + uint64_t v1084 = reinterpret_cast(v1083); + TASSIGN(v1082, v1084); + Tile v1085; + TASSIGN(v1085, v53); + Tile v1086; + __ubuf__ float* v1087 = v1085.data(); + uint64_t v1088 = reinterpret_cast(v1087); + TASSIGN(v1086, v1088); + TLOAD(v1066, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1089 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1090 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1091 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v22 * (unsigned) v36 + v31 * (unsigned) v37), v1089, v1090); + TLOAD(v1074, v1091); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1070, v1066, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1078, v1070, v1074); + pipe_barrier(PIPE_ALL); + TROWSUM(v1086, v1078, v1082); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1035, v1035, v1086); + Tile v1092; + TASSIGN(v1092, v49); + Tile v1093; + __ubuf__ bfloat16_t* v1094 = v1092.data(); + uint64_t v1095 = reinterpret_cast(v1094); + TASSIGN(v1093, v1095); + Tile v1096; + TASSIGN(v1096, v50); + Tile v1097; + __ubuf__ float* v1098 = v1096.data(); + uint64_t v1099 = reinterpret_cast(v1098); + TASSIGN(v1097, v1099); + Tile v1100; + TASSIGN(v1100, v51); + Tile v1101; + __ubuf__ float* v1102 = v1100.data(); + uint64_t v1103 = reinterpret_cast(v1102); + TASSIGN(v1101, v1103); + Tile v1104; + TASSIGN(v1104, v52); + Tile v1105; + __ubuf__ float* v1106 = v1104.data(); + uint64_t v1107 = reinterpret_cast(v1106); + TASSIGN(v1105, v1107); + Tile v1108; + TASSIGN(v1108, v49); + Tile v1109; + __ubuf__ float* v1110 = v1108.data(); + uint64_t v1111 = reinterpret_cast(v1110); + TASSIGN(v1109, v1111); + Tile 
v1112; + TASSIGN(v1112, v53); + Tile v1113; + __ubuf__ float* v1114 = v1112.data(); + uint64_t v1115 = reinterpret_cast(v1114); + TASSIGN(v1113, v1115); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1093, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1116 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1117 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1118 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v22 * (unsigned) v36 + v30 * (unsigned) v37), v1116, v1117); + TLOAD(v1101, v1118); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1097, v1093, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1105, v1097, v1101); + pipe_barrier(PIPE_ALL); + TROWSUM(v1113, v1105, v1109); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1035, v1035, v1113); + Tile v1119; + TASSIGN(v1119, v49); + Tile v1120; + __ubuf__ bfloat16_t* v1121 = v1119.data(); + uint64_t v1122 = reinterpret_cast(v1121); + TASSIGN(v1120, v1122); + Tile v1123; + TASSIGN(v1123, v50); + Tile v1124; + __ubuf__ float* v1125 = v1123.data(); + uint64_t v1126 = reinterpret_cast(v1125); + TASSIGN(v1124, v1126); + Tile v1127; + TASSIGN(v1127, v51); + Tile v1128; + __ubuf__ float* v1129 = v1127.data(); + uint64_t v1130 = reinterpret_cast(v1129); + TASSIGN(v1128, v1130); + Tile v1131; + TASSIGN(v1131, v52); + Tile v1132; + __ubuf__ float* v1133 = v1131.data(); + uint64_t v1134 = reinterpret_cast(v1133); + TASSIGN(v1132, v1134); + Tile v1135; + TASSIGN(v1135, v49); + Tile v1136; + __ubuf__ float* v1137 = v1135.data(); + uint64_t v1138 = reinterpret_cast(v1137); + TASSIGN(v1136, v1138); + Tile v1139; + TASSIGN(v1139, v53); + Tile v1140; + __ubuf__ float* v1141 = v1139.data(); + uint64_t v1142 = reinterpret_cast(v1141); + TASSIGN(v1140, v1142); 
+ wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1120, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1143 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1144 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1145 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v22 * (unsigned) v36 + v29 * (unsigned) v37), v1143, v1144); + TLOAD(v1128, v1145); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1124, v1120, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1132, v1124, v1128); + pipe_barrier(PIPE_ALL); + TROWSUM(v1140, v1132, v1136); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1035, v1035, v1140); + Tile v1146; + TASSIGN(v1146, v49); + Tile v1147; + __ubuf__ bfloat16_t* v1148 = v1146.data(); + uint64_t v1149 = reinterpret_cast(v1148); + TASSIGN(v1147, v1149); + Tile v1150; + TASSIGN(v1150, v50); + Tile v1151; + __ubuf__ float* v1152 = v1150.data(); + uint64_t v1153 = reinterpret_cast(v1152); + TASSIGN(v1151, v1153); + Tile v1154; + TASSIGN(v1154, v51); + Tile v1155; + __ubuf__ float* v1156 = v1154.data(); + uint64_t v1157 = reinterpret_cast(v1156); + TASSIGN(v1155, v1157); + Tile v1158; + TASSIGN(v1158, v52); + Tile v1159; + __ubuf__ float* v1160 = v1158.data(); + uint64_t v1161 = reinterpret_cast(v1160); + TASSIGN(v1159, v1161); + Tile v1162; + TASSIGN(v1162, v49); + Tile v1163; + __ubuf__ float* v1164 = v1162.data(); + uint64_t v1165 = reinterpret_cast(v1164); + TASSIGN(v1163, v1165); + Tile v1166; + TASSIGN(v1166, v53); + Tile v1167; + __ubuf__ float* v1168 = v1166.data(); + uint64_t v1169 = reinterpret_cast(v1168); + TASSIGN(v1167, v1169); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1147, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1170 = 
pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1171 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1172 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v22 * (unsigned) v36 + v28 * (unsigned) v37), v1170, v1171); + TLOAD(v1155, v1172); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1151, v1147, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1159, v1151, v1155); + pipe_barrier(PIPE_ALL); + TROWSUM(v1167, v1159, v1163); + pipe_barrier(PIPE_ALL); + TADD(v1035, v1035, v1167); + pipe_barrier(PIPE_ALL); + TMUL(v1035, v1035, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + pto::Shape<1, 1, 1, 1, 1> v1173 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1174 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1175 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v22 * (unsigned) v37), v1173, v1174); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + TSTORE(v1175, v1035); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v1176; + TASSIGN(v1176, v44); + Tile v1177; + __ubuf__ float* v1178 = v1176.data(); + uint64_t v1179 = reinterpret_cast(v1178); + TASSIGN(v1177, v1179); + Tile v1180; + TASSIGN(v1180, v44); + Tile v1181; + __ubuf__ bfloat16_t* v1182 = v1180.data(); + uint64_t v1183 = reinterpret_cast(v1182); + TASSIGN(v1181, v1183); + Tile v1184; + TASSIGN(v1184, v46); + Tile v1185; + __ubuf__ float* v1186 = v1184.data(); + uint64_t v1187 = reinterpret_cast(v1186); + TASSIGN(v1185, v1187); + Tile v1188; + TASSIGN(v1188, v45); + Tile v1189; + __ubuf__ float* v1190 = v1188.data(); + uint64_t v1191 = reinterpret_cast(v1190); + TASSIGN(v1189, v1191); + Tile v1192; + TASSIGN(v1192, v48); + Tile v1193; + __ubuf__ float* v1194 = v1192.data(); + 
uint64_t v1195 = reinterpret_cast(v1194); + TASSIGN(v1193, v1195); + Tile v1196; + TASSIGN(v1196, v44); + Tile v1197; + __ubuf__ float* v1198 = v1196.data(); + uint64_t v1199 = reinterpret_cast(v1198); + TASSIGN(v1197, v1199); + Tile v1200; + TASSIGN(v1200, v47); + Tile v1201; + __ubuf__ float* v1202 = v1200.data(); + uint64_t v1203 = reinterpret_cast(v1202); + TASSIGN(v1201, v1203); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v1181, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1204 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1205 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1206 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v21 * (unsigned) v36 + v33 * (unsigned) v37), v1204, v1205); + TLOAD(v1189, v1206); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1185, v1181, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1193, v1185, v1189); + pipe_barrier(PIPE_ALL); + TROWSUM(v1201, v1193, v1197); + pipe_barrier(PIPE_ALL); + TMOV(v1177, v1201); + Tile v1207; + TASSIGN(v1207, v49); + Tile v1208; + __ubuf__ bfloat16_t* v1209 = v1207.data(); + uint64_t v1210 = reinterpret_cast(v1209); + TASSIGN(v1208, v1210); + Tile v1211; + TASSIGN(v1211, v50); + Tile v1212; + __ubuf__ float* v1213 = v1211.data(); + uint64_t v1214 = reinterpret_cast(v1213); + TASSIGN(v1212, v1214); + Tile v1215; + TASSIGN(v1215, v51); + Tile v1216; + __ubuf__ float* v1217 = v1215.data(); + uint64_t v1218 = reinterpret_cast(v1217); + TASSIGN(v1216, v1218); + Tile v1219; + TASSIGN(v1219, v52); + Tile v1220; + __ubuf__ float* v1221 = v1219.data(); + uint64_t v1222 = reinterpret_cast(v1221); + TASSIGN(v1220, v1222); + Tile v1223; + TASSIGN(v1223, v49); + Tile v1224; + __ubuf__ float* v1225 = v1223.data(); + uint64_t v1226 = 
reinterpret_cast(v1225); + TASSIGN(v1224, v1226); + Tile v1227; + TASSIGN(v1227, v53); + Tile v1228; + __ubuf__ float* v1229 = v1227.data(); + uint64_t v1230 = reinterpret_cast(v1229); + TASSIGN(v1228, v1230); + TLOAD(v1208, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1231 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1232 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1233 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v21 * (unsigned) v36 + v31 * (unsigned) v37), v1231, v1232); + TLOAD(v1216, v1233); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1212, v1208, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1220, v1212, v1216); + pipe_barrier(PIPE_ALL); + TROWSUM(v1228, v1220, v1224); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1177, v1177, v1228); + Tile v1234; + TASSIGN(v1234, v49); + Tile v1235; + __ubuf__ bfloat16_t* v1236 = v1234.data(); + uint64_t v1237 = reinterpret_cast(v1236); + TASSIGN(v1235, v1237); + Tile v1238; + TASSIGN(v1238, v50); + Tile v1239; + __ubuf__ float* v1240 = v1238.data(); + uint64_t v1241 = reinterpret_cast(v1240); + TASSIGN(v1239, v1241); + Tile v1242; + TASSIGN(v1242, v51); + Tile v1243; + __ubuf__ float* v1244 = v1242.data(); + uint64_t v1245 = reinterpret_cast(v1244); + TASSIGN(v1243, v1245); + Tile v1246; + TASSIGN(v1246, v52); + Tile v1247; + __ubuf__ float* v1248 = v1246.data(); + uint64_t v1249 = reinterpret_cast(v1248); + TASSIGN(v1247, v1249); + Tile v1250; + TASSIGN(v1250, v49); + Tile v1251; + __ubuf__ float* v1252 = v1250.data(); + uint64_t v1253 = reinterpret_cast(v1252); + TASSIGN(v1251, v1253); + Tile v1254; + TASSIGN(v1254, v53); + Tile v1255; + __ubuf__ float* v1256 = v1254.data(); + uint64_t v1257 = reinterpret_cast(v1256); + 
TASSIGN(v1255, v1257); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1235, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1258 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1259 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1260 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v21 * (unsigned) v36 + v30 * (unsigned) v37), v1258, v1259); + TLOAD(v1243, v1260); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1239, v1235, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1247, v1239, v1243); + pipe_barrier(PIPE_ALL); + TROWSUM(v1255, v1247, v1251); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1177, v1177, v1255); + Tile v1261; + TASSIGN(v1261, v49); + Tile v1262; + __ubuf__ bfloat16_t* v1263 = v1261.data(); + uint64_t v1264 = reinterpret_cast(v1263); + TASSIGN(v1262, v1264); + Tile v1265; + TASSIGN(v1265, v50); + Tile v1266; + __ubuf__ float* v1267 = v1265.data(); + uint64_t v1268 = reinterpret_cast(v1267); + TASSIGN(v1266, v1268); + Tile v1269; + TASSIGN(v1269, v51); + Tile v1270; + __ubuf__ float* v1271 = v1269.data(); + uint64_t v1272 = reinterpret_cast(v1271); + TASSIGN(v1270, v1272); + Tile v1273; + TASSIGN(v1273, v52); + Tile v1274; + __ubuf__ float* v1275 = v1273.data(); + uint64_t v1276 = reinterpret_cast(v1275); + TASSIGN(v1274, v1276); + Tile v1277; + TASSIGN(v1277, v49); + Tile v1278; + __ubuf__ float* v1279 = v1277.data(); + uint64_t v1280 = reinterpret_cast(v1279); + TASSIGN(v1278, v1280); + Tile v1281; + TASSIGN(v1281, v53); + Tile v1282; + __ubuf__ float* v1283 = v1281.data(); + uint64_t v1284 = reinterpret_cast(v1283); + TASSIGN(v1282, v1284); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1262, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 
1, 1024> v1285 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1286 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1287 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v21 * (unsigned) v36 + v29 * (unsigned) v37), v1285, v1286); + TLOAD(v1270, v1287); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1266, v1262, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1274, v1266, v1270); + pipe_barrier(PIPE_ALL); + TROWSUM(v1282, v1274, v1278); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1177, v1177, v1282); + Tile v1288; + TASSIGN(v1288, v49); + Tile v1289; + __ubuf__ bfloat16_t* v1290 = v1288.data(); + uint64_t v1291 = reinterpret_cast(v1290); + TASSIGN(v1289, v1291); + Tile v1292; + TASSIGN(v1292, v50); + Tile v1293; + __ubuf__ float* v1294 = v1292.data(); + uint64_t v1295 = reinterpret_cast(v1294); + TASSIGN(v1293, v1295); + Tile v1296; + TASSIGN(v1296, v51); + Tile v1297; + __ubuf__ float* v1298 = v1296.data(); + uint64_t v1299 = reinterpret_cast(v1298); + TASSIGN(v1297, v1299); + Tile v1300; + TASSIGN(v1300, v52); + Tile v1301; + __ubuf__ float* v1302 = v1300.data(); + uint64_t v1303 = reinterpret_cast(v1302); + TASSIGN(v1301, v1303); + Tile v1304; + TASSIGN(v1304, v49); + Tile v1305; + __ubuf__ float* v1306 = v1304.data(); + uint64_t v1307 = reinterpret_cast(v1306); + TASSIGN(v1305, v1307); + Tile v1308; + TASSIGN(v1308, v53); + Tile v1309; + __ubuf__ float* v1310 = v1308.data(); + uint64_t v1311 = reinterpret_cast(v1310); + TASSIGN(v1309, v1311); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1289, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1312 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1313 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + 
GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1314 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v21 * (unsigned) v36 + v28 * (unsigned) v37), v1312, v1313); + TLOAD(v1297, v1314); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1293, v1289, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1301, v1293, v1297); + pipe_barrier(PIPE_ALL); + TROWSUM(v1309, v1301, v1305); + pipe_barrier(PIPE_ALL); + TADD(v1177, v1177, v1309); + pipe_barrier(PIPE_ALL); + TMUL(v1177, v1177, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + pto::Shape<1, 1, 1, 1, 1> v1315 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1316 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1317 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v21 * (unsigned) v37), v1315, v1316); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + TSTORE(v1317, v1177); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v1318; + TASSIGN(v1318, v44); + Tile v1319; + __ubuf__ float* v1320 = v1318.data(); + uint64_t v1321 = reinterpret_cast(v1320); + TASSIGN(v1319, v1321); + Tile v1322; + TASSIGN(v1322, v44); + Tile v1323; + __ubuf__ bfloat16_t* v1324 = v1322.data(); + uint64_t v1325 = reinterpret_cast(v1324); + TASSIGN(v1323, v1325); + Tile v1326; + TASSIGN(v1326, v46); + Tile v1327; + __ubuf__ float* v1328 = v1326.data(); + uint64_t v1329 = reinterpret_cast(v1328); + TASSIGN(v1327, v1329); + Tile v1330; + TASSIGN(v1330, v45); + Tile v1331; + __ubuf__ float* v1332 = v1330.data(); + uint64_t v1333 = reinterpret_cast(v1332); + TASSIGN(v1331, v1333); + Tile v1334; + TASSIGN(v1334, v48); + Tile v1335; + __ubuf__ float* v1336 = v1334.data(); + uint64_t v1337 = reinterpret_cast(v1336); + TASSIGN(v1335, v1337); + Tile v1338; + TASSIGN(v1338, v44); + Tile v1339; + 
__ubuf__ float* v1340 = v1338.data(); + uint64_t v1341 = reinterpret_cast(v1340); + TASSIGN(v1339, v1341); + Tile v1342; + TASSIGN(v1342, v47); + Tile v1343; + __ubuf__ float* v1344 = v1342.data(); + uint64_t v1345 = reinterpret_cast(v1344); + TASSIGN(v1343, v1345); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v1323, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1346 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1347 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1348 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v20 * (unsigned) v36 + v33 * (unsigned) v37), v1346, v1347); + TLOAD(v1331, v1348); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1327, v1323, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1335, v1327, v1331); + pipe_barrier(PIPE_ALL); + TROWSUM(v1343, v1335, v1339); + pipe_barrier(PIPE_ALL); + TMOV(v1319, v1343); + Tile v1349; + TASSIGN(v1349, v49); + Tile v1350; + __ubuf__ bfloat16_t* v1351 = v1349.data(); + uint64_t v1352 = reinterpret_cast(v1351); + TASSIGN(v1350, v1352); + Tile v1353; + TASSIGN(v1353, v50); + Tile v1354; + __ubuf__ float* v1355 = v1353.data(); + uint64_t v1356 = reinterpret_cast(v1355); + TASSIGN(v1354, v1356); + Tile v1357; + TASSIGN(v1357, v51); + Tile v1358; + __ubuf__ float* v1359 = v1357.data(); + uint64_t v1360 = reinterpret_cast(v1359); + TASSIGN(v1358, v1360); + Tile v1361; + TASSIGN(v1361, v52); + Tile v1362; + __ubuf__ float* v1363 = v1361.data(); + uint64_t v1364 = reinterpret_cast(v1363); + TASSIGN(v1362, v1364); + Tile v1365; + TASSIGN(v1365, v49); + Tile v1366; + __ubuf__ float* v1367 = v1365.data(); + uint64_t v1368 = reinterpret_cast(v1367); + TASSIGN(v1366, v1368); + Tile v1369; + TASSIGN(v1369, v53); + Tile v1370; + __ubuf__ float* v1371 = v1369.data(); + 
uint64_t v1372 = reinterpret_cast(v1371); + TASSIGN(v1370, v1372); + TLOAD(v1350, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1373 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1374 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1375 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v20 * (unsigned) v36 + v31 * (unsigned) v37), v1373, v1374); + TLOAD(v1358, v1375); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1354, v1350, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1362, v1354, v1358); + pipe_barrier(PIPE_ALL); + TROWSUM(v1370, v1362, v1366); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1319, v1319, v1370); + Tile v1376; + TASSIGN(v1376, v49); + Tile v1377; + __ubuf__ bfloat16_t* v1378 = v1376.data(); + uint64_t v1379 = reinterpret_cast(v1378); + TASSIGN(v1377, v1379); + Tile v1380; + TASSIGN(v1380, v50); + Tile v1381; + __ubuf__ float* v1382 = v1380.data(); + uint64_t v1383 = reinterpret_cast(v1382); + TASSIGN(v1381, v1383); + Tile v1384; + TASSIGN(v1384, v51); + Tile v1385; + __ubuf__ float* v1386 = v1384.data(); + uint64_t v1387 = reinterpret_cast(v1386); + TASSIGN(v1385, v1387); + Tile v1388; + TASSIGN(v1388, v52); + Tile v1389; + __ubuf__ float* v1390 = v1388.data(); + uint64_t v1391 = reinterpret_cast(v1390); + TASSIGN(v1389, v1391); + Tile v1392; + TASSIGN(v1392, v49); + Tile v1393; + __ubuf__ float* v1394 = v1392.data(); + uint64_t v1395 = reinterpret_cast(v1394); + TASSIGN(v1393, v1395); + Tile v1396; + TASSIGN(v1396, v53); + Tile v1397; + __ubuf__ float* v1398 = v1396.data(); + uint64_t v1399 = reinterpret_cast(v1398); + TASSIGN(v1397, v1399); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1377, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 
1, 1024> v1400 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1401 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1402 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v20 * (unsigned) v36 + v30 * (unsigned) v37), v1400, v1401); + TLOAD(v1385, v1402); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1381, v1377, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1389, v1381, v1385); + pipe_barrier(PIPE_ALL); + TROWSUM(v1397, v1389, v1393); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1319, v1319, v1397); + Tile v1403; + TASSIGN(v1403, v49); + Tile v1404; + __ubuf__ bfloat16_t* v1405 = v1403.data(); + uint64_t v1406 = reinterpret_cast(v1405); + TASSIGN(v1404, v1406); + Tile v1407; + TASSIGN(v1407, v50); + Tile v1408; + __ubuf__ float* v1409 = v1407.data(); + uint64_t v1410 = reinterpret_cast(v1409); + TASSIGN(v1408, v1410); + Tile v1411; + TASSIGN(v1411, v51); + Tile v1412; + __ubuf__ float* v1413 = v1411.data(); + uint64_t v1414 = reinterpret_cast(v1413); + TASSIGN(v1412, v1414); + Tile v1415; + TASSIGN(v1415, v52); + Tile v1416; + __ubuf__ float* v1417 = v1415.data(); + uint64_t v1418 = reinterpret_cast(v1417); + TASSIGN(v1416, v1418); + Tile v1419; + TASSIGN(v1419, v49); + Tile v1420; + __ubuf__ float* v1421 = v1419.data(); + uint64_t v1422 = reinterpret_cast(v1421); + TASSIGN(v1420, v1422); + Tile v1423; + TASSIGN(v1423, v53); + Tile v1424; + __ubuf__ float* v1425 = v1423.data(); + uint64_t v1426 = reinterpret_cast(v1425); + TASSIGN(v1424, v1426); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1404, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1427 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1428 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + 
GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1429 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v20 * (unsigned) v36 + v29 * (unsigned) v37), v1427, v1428); + TLOAD(v1412, v1429); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1408, v1404, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1416, v1408, v1412); + pipe_barrier(PIPE_ALL); + TROWSUM(v1424, v1416, v1420); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1319, v1319, v1424); + Tile v1430; + TASSIGN(v1430, v49); + Tile v1431; + __ubuf__ bfloat16_t* v1432 = v1430.data(); + uint64_t v1433 = reinterpret_cast(v1432); + TASSIGN(v1431, v1433); + Tile v1434; + TASSIGN(v1434, v50); + Tile v1435; + __ubuf__ float* v1436 = v1434.data(); + uint64_t v1437 = reinterpret_cast(v1436); + TASSIGN(v1435, v1437); + Tile v1438; + TASSIGN(v1438, v51); + Tile v1439; + __ubuf__ float* v1440 = v1438.data(); + uint64_t v1441 = reinterpret_cast(v1440); + TASSIGN(v1439, v1441); + Tile v1442; + TASSIGN(v1442, v52); + Tile v1443; + __ubuf__ float* v1444 = v1442.data(); + uint64_t v1445 = reinterpret_cast(v1444); + TASSIGN(v1443, v1445); + Tile v1446; + TASSIGN(v1446, v49); + Tile v1447; + __ubuf__ float* v1448 = v1446.data(); + uint64_t v1449 = reinterpret_cast(v1448); + TASSIGN(v1447, v1449); + Tile v1450; + TASSIGN(v1450, v53); + Tile v1451; + __ubuf__ float* v1452 = v1450.data(); + uint64_t v1453 = reinterpret_cast(v1452); + TASSIGN(v1451, v1453); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1431, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1454 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1455 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1456 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 
+ (v33 + v20 * (unsigned) v36 + v28 * (unsigned) v37), v1454, v1455); + TLOAD(v1439, v1456); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1435, v1431, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1443, v1435, v1439); + pipe_barrier(PIPE_ALL); + TROWSUM(v1451, v1443, v1447); + pipe_barrier(PIPE_ALL); + TADD(v1319, v1319, v1451); + pipe_barrier(PIPE_ALL); + TMUL(v1319, v1319, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v1457 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1458 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1459 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v20 * (unsigned) v37), v1457, v1458); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v1459, v1319); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v1460; + TASSIGN(v1460, v44); + Tile v1461; + __ubuf__ float* v1462 = v1460.data(); + uint64_t v1463 = reinterpret_cast(v1462); + TASSIGN(v1461, v1463); + Tile v1464; + TASSIGN(v1464, v44); + Tile v1465; + __ubuf__ bfloat16_t* v1466 = v1464.data(); + uint64_t v1467 = reinterpret_cast(v1466); + TASSIGN(v1465, v1467); + Tile v1468; + TASSIGN(v1468, v46); + Tile v1469; + __ubuf__ float* v1470 = v1468.data(); + uint64_t v1471 = reinterpret_cast(v1470); + TASSIGN(v1469, v1471); + Tile v1472; + TASSIGN(v1472, v45); + Tile v1473; + __ubuf__ float* v1474 = v1472.data(); + uint64_t v1475 = reinterpret_cast(v1474); + TASSIGN(v1473, v1475); + Tile v1476; + TASSIGN(v1476, v48); + Tile v1477; + __ubuf__ float* v1478 = v1476.data(); + uint64_t v1479 = reinterpret_cast(v1478); + TASSIGN(v1477, v1479); + Tile v1480; + TASSIGN(v1480, v44); + Tile v1481; + __ubuf__ float* v1482 = v1480.data(); + uint64_t v1483 = reinterpret_cast(v1482); + TASSIGN(v1481, v1483); + Tile v1484; + TASSIGN(v1484, v47); + Tile v1485; 
+ __ubuf__ float* v1486 = v1484.data(); + uint64_t v1487 = reinterpret_cast(v1486); + TASSIGN(v1485, v1487); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v1465, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1488 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1489 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1490 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v19 * (unsigned) v36 + v33 * (unsigned) v37), v1488, v1489); + TLOAD(v1473, v1490); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1469, v1465, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1477, v1469, v1473); + pipe_barrier(PIPE_ALL); + TROWSUM(v1485, v1477, v1481); + pipe_barrier(PIPE_ALL); + TMOV(v1461, v1485); + Tile v1491; + TASSIGN(v1491, v49); + Tile v1492; + __ubuf__ bfloat16_t* v1493 = v1491.data(); + uint64_t v1494 = reinterpret_cast(v1493); + TASSIGN(v1492, v1494); + Tile v1495; + TASSIGN(v1495, v50); + Tile v1496; + __ubuf__ float* v1497 = v1495.data(); + uint64_t v1498 = reinterpret_cast(v1497); + TASSIGN(v1496, v1498); + Tile v1499; + TASSIGN(v1499, v51); + Tile v1500; + __ubuf__ float* v1501 = v1499.data(); + uint64_t v1502 = reinterpret_cast(v1501); + TASSIGN(v1500, v1502); + Tile v1503; + TASSIGN(v1503, v52); + Tile v1504; + __ubuf__ float* v1505 = v1503.data(); + uint64_t v1506 = reinterpret_cast(v1505); + TASSIGN(v1504, v1506); + Tile v1507; + TASSIGN(v1507, v49); + Tile v1508; + __ubuf__ float* v1509 = v1507.data(); + uint64_t v1510 = reinterpret_cast(v1509); + TASSIGN(v1508, v1510); + Tile v1511; + TASSIGN(v1511, v53); + Tile v1512; + __ubuf__ float* v1513 = v1511.data(); + uint64_t v1514 = reinterpret_cast(v1513); + TASSIGN(v1512, v1514); + TLOAD(v1492, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 
1024> v1515 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1516 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1517 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v19 * (unsigned) v36 + v31 * (unsigned) v37), v1515, v1516); + TLOAD(v1500, v1517); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1496, v1492, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1504, v1496, v1500); + pipe_barrier(PIPE_ALL); + TROWSUM(v1512, v1504, v1508); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1461, v1461, v1512); + Tile v1518; + TASSIGN(v1518, v49); + Tile v1519; + __ubuf__ bfloat16_t* v1520 = v1518.data(); + uint64_t v1521 = reinterpret_cast(v1520); + TASSIGN(v1519, v1521); + Tile v1522; + TASSIGN(v1522, v50); + Tile v1523; + __ubuf__ float* v1524 = v1522.data(); + uint64_t v1525 = reinterpret_cast(v1524); + TASSIGN(v1523, v1525); + Tile v1526; + TASSIGN(v1526, v51); + Tile v1527; + __ubuf__ float* v1528 = v1526.data(); + uint64_t v1529 = reinterpret_cast(v1528); + TASSIGN(v1527, v1529); + Tile v1530; + TASSIGN(v1530, v52); + Tile v1531; + __ubuf__ float* v1532 = v1530.data(); + uint64_t v1533 = reinterpret_cast(v1532); + TASSIGN(v1531, v1533); + Tile v1534; + TASSIGN(v1534, v49); + Tile v1535; + __ubuf__ float* v1536 = v1534.data(); + uint64_t v1537 = reinterpret_cast(v1536); + TASSIGN(v1535, v1537); + Tile v1538; + TASSIGN(v1538, v53); + Tile v1539; + __ubuf__ float* v1540 = v1538.data(); + uint64_t v1541 = reinterpret_cast(v1540); + TASSIGN(v1539, v1541); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1519, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1542 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1543 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1544 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v19 * (unsigned) v36 + v30 * (unsigned) v37), v1542, v1543); + TLOAD(v1527, v1544); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1523, v1519, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1531, v1523, v1527); + pipe_barrier(PIPE_ALL); + TROWSUM(v1539, v1531, v1535); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1461, v1461, v1539); + Tile v1545; + TASSIGN(v1545, v49); + Tile v1546; + __ubuf__ bfloat16_t* v1547 = v1545.data(); + uint64_t v1548 = reinterpret_cast(v1547); + TASSIGN(v1546, v1548); + Tile v1549; + TASSIGN(v1549, v50); + Tile v1550; + __ubuf__ float* v1551 = v1549.data(); + uint64_t v1552 = reinterpret_cast(v1551); + TASSIGN(v1550, v1552); + Tile v1553; + TASSIGN(v1553, v51); + Tile v1554; + __ubuf__ float* v1555 = v1553.data(); + uint64_t v1556 = reinterpret_cast(v1555); + TASSIGN(v1554, v1556); + Tile v1557; + TASSIGN(v1557, v52); + Tile v1558; + __ubuf__ float* v1559 = v1557.data(); + uint64_t v1560 = reinterpret_cast(v1559); + TASSIGN(v1558, v1560); + Tile v1561; + TASSIGN(v1561, v49); + Tile v1562; + __ubuf__ float* v1563 = v1561.data(); + uint64_t v1564 = reinterpret_cast(v1563); + TASSIGN(v1562, v1564); + Tile v1565; + TASSIGN(v1565, v53); + Tile v1566; + __ubuf__ float* v1567 = v1565.data(); + uint64_t v1568 = reinterpret_cast(v1567); + TASSIGN(v1566, v1568); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1546, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1569 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1570 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1571 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v19 
* (unsigned) v36 + v29 * (unsigned) v37), v1569, v1570); + TLOAD(v1554, v1571); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1550, v1546, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1558, v1550, v1554); + pipe_barrier(PIPE_ALL); + TROWSUM(v1566, v1558, v1562); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1461, v1461, v1566); + Tile v1572; + TASSIGN(v1572, v49); + Tile v1573; + __ubuf__ bfloat16_t* v1574 = v1572.data(); + uint64_t v1575 = reinterpret_cast(v1574); + TASSIGN(v1573, v1575); + Tile v1576; + TASSIGN(v1576, v50); + Tile v1577; + __ubuf__ float* v1578 = v1576.data(); + uint64_t v1579 = reinterpret_cast(v1578); + TASSIGN(v1577, v1579); + Tile v1580; + TASSIGN(v1580, v51); + Tile v1581; + __ubuf__ float* v1582 = v1580.data(); + uint64_t v1583 = reinterpret_cast(v1582); + TASSIGN(v1581, v1583); + Tile v1584; + TASSIGN(v1584, v52); + Tile v1585; + __ubuf__ float* v1586 = v1584.data(); + uint64_t v1587 = reinterpret_cast(v1586); + TASSIGN(v1585, v1587); + Tile v1588; + TASSIGN(v1588, v49); + Tile v1589; + __ubuf__ float* v1590 = v1588.data(); + uint64_t v1591 = reinterpret_cast(v1590); + TASSIGN(v1589, v1591); + Tile v1592; + TASSIGN(v1592, v53); + Tile v1593; + __ubuf__ float* v1594 = v1592.data(); + uint64_t v1595 = reinterpret_cast(v1594); + TASSIGN(v1593, v1595); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1573, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1596 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1597 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1598 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v19 * (unsigned) v36 + v28 * (unsigned) v37), v1596, v1597); + TLOAD(v1581, v1598); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + TCVT(v1577, v1573, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1585, v1577, v1581); + pipe_barrier(PIPE_ALL); + TROWSUM(v1593, v1585, v1589); + pipe_barrier(PIPE_ALL); + TADD(v1461, v1461, v1593); + pipe_barrier(PIPE_ALL); + TMUL(v1461, v1461, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v1599 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1600 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1601 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v19 * (unsigned) v37), v1599, v1600); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v1601, v1461); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v1602; + TASSIGN(v1602, v44); + Tile v1603; + __ubuf__ float* v1604 = v1602.data(); + uint64_t v1605 = reinterpret_cast(v1604); + TASSIGN(v1603, v1605); + Tile v1606; + TASSIGN(v1606, v44); + Tile v1607; + __ubuf__ bfloat16_t* v1608 = v1606.data(); + uint64_t v1609 = reinterpret_cast(v1608); + TASSIGN(v1607, v1609); + Tile v1610; + TASSIGN(v1610, v46); + Tile v1611; + __ubuf__ float* v1612 = v1610.data(); + uint64_t v1613 = reinterpret_cast(v1612); + TASSIGN(v1611, v1613); + Tile v1614; + TASSIGN(v1614, v45); + Tile v1615; + __ubuf__ float* v1616 = v1614.data(); + uint64_t v1617 = reinterpret_cast(v1616); + TASSIGN(v1615, v1617); + Tile v1618; + TASSIGN(v1618, v48); + Tile v1619; + __ubuf__ float* v1620 = v1618.data(); + uint64_t v1621 = reinterpret_cast(v1620); + TASSIGN(v1619, v1621); + Tile v1622; + TASSIGN(v1622, v44); + Tile v1623; + __ubuf__ float* v1624 = v1622.data(); + uint64_t v1625 = reinterpret_cast(v1624); + TASSIGN(v1623, v1625); + Tile v1626; + TASSIGN(v1626, v47); + Tile v1627; + __ubuf__ float* v1628 = v1626.data(); + uint64_t v1629 = reinterpret_cast(v1628); + TASSIGN(v1627, v1629); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + 
TLOAD(v1607, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1630 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1631 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1632 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v18 * (unsigned) v36 + v33 * (unsigned) v37), v1630, v1631); + TLOAD(v1615, v1632); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1611, v1607, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1619, v1611, v1615); + pipe_barrier(PIPE_ALL); + TROWSUM(v1627, v1619, v1623); + pipe_barrier(PIPE_ALL); + TMOV(v1603, v1627); + Tile v1633; + TASSIGN(v1633, v49); + Tile v1634; + __ubuf__ bfloat16_t* v1635 = v1633.data(); + uint64_t v1636 = reinterpret_cast(v1635); + TASSIGN(v1634, v1636); + Tile v1637; + TASSIGN(v1637, v50); + Tile v1638; + __ubuf__ float* v1639 = v1637.data(); + uint64_t v1640 = reinterpret_cast(v1639); + TASSIGN(v1638, v1640); + Tile v1641; + TASSIGN(v1641, v51); + Tile v1642; + __ubuf__ float* v1643 = v1641.data(); + uint64_t v1644 = reinterpret_cast(v1643); + TASSIGN(v1642, v1644); + Tile v1645; + TASSIGN(v1645, v52); + Tile v1646; + __ubuf__ float* v1647 = v1645.data(); + uint64_t v1648 = reinterpret_cast(v1647); + TASSIGN(v1646, v1648); + Tile v1649; + TASSIGN(v1649, v49); + Tile v1650; + __ubuf__ float* v1651 = v1649.data(); + uint64_t v1652 = reinterpret_cast(v1651); + TASSIGN(v1650, v1652); + Tile v1653; + TASSIGN(v1653, v53); + Tile v1654; + __ubuf__ float* v1655 = v1653.data(); + uint64_t v1656 = reinterpret_cast(v1655); + TASSIGN(v1654, v1656); + TLOAD(v1634, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1657 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1658 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1659 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v18 * (unsigned) v36 + v31 * (unsigned) v37), v1657, v1658); + TLOAD(v1642, v1659); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1638, v1634, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1646, v1638, v1642); + pipe_barrier(PIPE_ALL); + TROWSUM(v1654, v1646, v1650); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1603, v1603, v1654); + Tile v1660; + TASSIGN(v1660, v49); + Tile v1661; + __ubuf__ bfloat16_t* v1662 = v1660.data(); + uint64_t v1663 = reinterpret_cast(v1662); + TASSIGN(v1661, v1663); + Tile v1664; + TASSIGN(v1664, v50); + Tile v1665; + __ubuf__ float* v1666 = v1664.data(); + uint64_t v1667 = reinterpret_cast(v1666); + TASSIGN(v1665, v1667); + Tile v1668; + TASSIGN(v1668, v51); + Tile v1669; + __ubuf__ float* v1670 = v1668.data(); + uint64_t v1671 = reinterpret_cast(v1670); + TASSIGN(v1669, v1671); + Tile v1672; + TASSIGN(v1672, v52); + Tile v1673; + __ubuf__ float* v1674 = v1672.data(); + uint64_t v1675 = reinterpret_cast(v1674); + TASSIGN(v1673, v1675); + Tile v1676; + TASSIGN(v1676, v49); + Tile v1677; + __ubuf__ float* v1678 = v1676.data(); + uint64_t v1679 = reinterpret_cast(v1678); + TASSIGN(v1677, v1679); + Tile v1680; + TASSIGN(v1680, v53); + Tile v1681; + __ubuf__ float* v1682 = v1680.data(); + uint64_t v1683 = reinterpret_cast(v1682); + TASSIGN(v1681, v1683); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1661, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1684 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1685 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1686 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v18 
* (unsigned) v36 + v30 * (unsigned) v37), v1684, v1685); + TLOAD(v1669, v1686); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1665, v1661, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1673, v1665, v1669); + pipe_barrier(PIPE_ALL); + TROWSUM(v1681, v1673, v1677); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1603, v1603, v1681); + Tile v1687; + TASSIGN(v1687, v49); + Tile v1688; + __ubuf__ bfloat16_t* v1689 = v1687.data(); + uint64_t v1690 = reinterpret_cast(v1689); + TASSIGN(v1688, v1690); + Tile v1691; + TASSIGN(v1691, v50); + Tile v1692; + __ubuf__ float* v1693 = v1691.data(); + uint64_t v1694 = reinterpret_cast(v1693); + TASSIGN(v1692, v1694); + Tile v1695; + TASSIGN(v1695, v51); + Tile v1696; + __ubuf__ float* v1697 = v1695.data(); + uint64_t v1698 = reinterpret_cast(v1697); + TASSIGN(v1696, v1698); + Tile v1699; + TASSIGN(v1699, v52); + Tile v1700; + __ubuf__ float* v1701 = v1699.data(); + uint64_t v1702 = reinterpret_cast(v1701); + TASSIGN(v1700, v1702); + Tile v1703; + TASSIGN(v1703, v49); + Tile v1704; + __ubuf__ float* v1705 = v1703.data(); + uint64_t v1706 = reinterpret_cast(v1705); + TASSIGN(v1704, v1706); + Tile v1707; + TASSIGN(v1707, v53); + Tile v1708; + __ubuf__ float* v1709 = v1707.data(); + uint64_t v1710 = reinterpret_cast(v1709); + TASSIGN(v1708, v1710); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1688, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1711 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1712 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1713 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v18 * (unsigned) v36 + v29 * (unsigned) v37), v1711, v1712); + TLOAD(v1696, v1713); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + TCVT(v1692, v1688, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1700, v1692, v1696); + pipe_barrier(PIPE_ALL); + TROWSUM(v1708, v1700, v1704); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1603, v1603, v1708); + Tile v1714; + TASSIGN(v1714, v49); + Tile v1715; + __ubuf__ bfloat16_t* v1716 = v1714.data(); + uint64_t v1717 = reinterpret_cast(v1716); + TASSIGN(v1715, v1717); + Tile v1718; + TASSIGN(v1718, v50); + Tile v1719; + __ubuf__ float* v1720 = v1718.data(); + uint64_t v1721 = reinterpret_cast(v1720); + TASSIGN(v1719, v1721); + Tile v1722; + TASSIGN(v1722, v51); + Tile v1723; + __ubuf__ float* v1724 = v1722.data(); + uint64_t v1725 = reinterpret_cast(v1724); + TASSIGN(v1723, v1725); + Tile v1726; + TASSIGN(v1726, v52); + Tile v1727; + __ubuf__ float* v1728 = v1726.data(); + uint64_t v1729 = reinterpret_cast(v1728); + TASSIGN(v1727, v1729); + Tile v1730; + TASSIGN(v1730, v49); + Tile v1731; + __ubuf__ float* v1732 = v1730.data(); + uint64_t v1733 = reinterpret_cast(v1732); + TASSIGN(v1731, v1733); + Tile v1734; + TASSIGN(v1734, v53); + Tile v1735; + __ubuf__ float* v1736 = v1734.data(); + uint64_t v1737 = reinterpret_cast(v1736); + TASSIGN(v1735, v1737); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1715, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1738 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1739 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1740 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v18 * (unsigned) v36 + v28 * (unsigned) v37), v1738, v1739); + TLOAD(v1723, v1740); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1719, v1715, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1727, v1719, v1723); + 
pipe_barrier(PIPE_ALL); + TROWSUM(v1735, v1727, v1731); + pipe_barrier(PIPE_ALL); + TADD(v1603, v1603, v1735); + pipe_barrier(PIPE_ALL); + TMUL(v1603, v1603, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v1741 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1742 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1743 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v18 * (unsigned) v37), v1741, v1742); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v1743, v1603); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v1744; + TASSIGN(v1744, v44); + Tile v1745; + __ubuf__ float* v1746 = v1744.data(); + uint64_t v1747 = reinterpret_cast(v1746); + TASSIGN(v1745, v1747); + Tile v1748; + TASSIGN(v1748, v44); + Tile v1749; + __ubuf__ bfloat16_t* v1750 = v1748.data(); + uint64_t v1751 = reinterpret_cast(v1750); + TASSIGN(v1749, v1751); + Tile v1752; + TASSIGN(v1752, v46); + Tile v1753; + __ubuf__ float* v1754 = v1752.data(); + uint64_t v1755 = reinterpret_cast(v1754); + TASSIGN(v1753, v1755); + Tile v1756; + TASSIGN(v1756, v45); + Tile v1757; + __ubuf__ float* v1758 = v1756.data(); + uint64_t v1759 = reinterpret_cast(v1758); + TASSIGN(v1757, v1759); + Tile v1760; + TASSIGN(v1760, v48); + Tile v1761; + __ubuf__ float* v1762 = v1760.data(); + uint64_t v1763 = reinterpret_cast(v1762); + TASSIGN(v1761, v1763); + Tile v1764; + TASSIGN(v1764, v44); + Tile v1765; + __ubuf__ float* v1766 = v1764.data(); + uint64_t v1767 = reinterpret_cast(v1766); + TASSIGN(v1765, v1767); + Tile v1768; + TASSIGN(v1768, v47); + Tile v1769; + __ubuf__ float* v1770 = v1768.data(); + uint64_t v1771 = reinterpret_cast(v1770); + TASSIGN(v1769, v1771); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v1749, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1772 = pto::Shape<1, 1, 1, 1, 1024>(); + 
pto::Stride<5120, 5120, 5120, 5120, 1> v1773 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1774 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v17 * (unsigned) v36 + v33 * (unsigned) v37), v1772, v1773); + TLOAD(v1757, v1774); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1753, v1749, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1761, v1753, v1757); + pipe_barrier(PIPE_ALL); + TROWSUM(v1769, v1761, v1765); + pipe_barrier(PIPE_ALL); + TMOV(v1745, v1769); + Tile v1775; + TASSIGN(v1775, v49); + Tile v1776; + __ubuf__ bfloat16_t* v1777 = v1775.data(); + uint64_t v1778 = reinterpret_cast(v1777); + TASSIGN(v1776, v1778); + Tile v1779; + TASSIGN(v1779, v50); + Tile v1780; + __ubuf__ float* v1781 = v1779.data(); + uint64_t v1782 = reinterpret_cast(v1781); + TASSIGN(v1780, v1782); + Tile v1783; + TASSIGN(v1783, v51); + Tile v1784; + __ubuf__ float* v1785 = v1783.data(); + uint64_t v1786 = reinterpret_cast(v1785); + TASSIGN(v1784, v1786); + Tile v1787; + TASSIGN(v1787, v52); + Tile v1788; + __ubuf__ float* v1789 = v1787.data(); + uint64_t v1790 = reinterpret_cast(v1789); + TASSIGN(v1788, v1790); + Tile v1791; + TASSIGN(v1791, v49); + Tile v1792; + __ubuf__ float* v1793 = v1791.data(); + uint64_t v1794 = reinterpret_cast(v1793); + TASSIGN(v1792, v1794); + Tile v1795; + TASSIGN(v1795, v53); + Tile v1796; + __ubuf__ float* v1797 = v1795.data(); + uint64_t v1798 = reinterpret_cast(v1797); + TASSIGN(v1796, v1798); + TLOAD(v1776, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1799 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1800 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1801 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 
+ (v33 + v17 * (unsigned) v36 + v31 * (unsigned) v37), v1799, v1800); + TLOAD(v1784, v1801); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1780, v1776, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1788, v1780, v1784); + pipe_barrier(PIPE_ALL); + TROWSUM(v1796, v1788, v1792); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1745, v1745, v1796); + Tile v1802; + TASSIGN(v1802, v49); + Tile v1803; + __ubuf__ bfloat16_t* v1804 = v1802.data(); + uint64_t v1805 = reinterpret_cast(v1804); + TASSIGN(v1803, v1805); + Tile v1806; + TASSIGN(v1806, v50); + Tile v1807; + __ubuf__ float* v1808 = v1806.data(); + uint64_t v1809 = reinterpret_cast(v1808); + TASSIGN(v1807, v1809); + Tile v1810; + TASSIGN(v1810, v51); + Tile v1811; + __ubuf__ float* v1812 = v1810.data(); + uint64_t v1813 = reinterpret_cast(v1812); + TASSIGN(v1811, v1813); + Tile v1814; + TASSIGN(v1814, v52); + Tile v1815; + __ubuf__ float* v1816 = v1814.data(); + uint64_t v1817 = reinterpret_cast(v1816); + TASSIGN(v1815, v1817); + Tile v1818; + TASSIGN(v1818, v49); + Tile v1819; + __ubuf__ float* v1820 = v1818.data(); + uint64_t v1821 = reinterpret_cast(v1820); + TASSIGN(v1819, v1821); + Tile v1822; + TASSIGN(v1822, v53); + Tile v1823; + __ubuf__ float* v1824 = v1822.data(); + uint64_t v1825 = reinterpret_cast(v1824); + TASSIGN(v1823, v1825); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1803, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1826 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1827 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1828 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v17 * (unsigned) v36 + v30 * (unsigned) v37), v1826, v1827); + TLOAD(v1811, v1828); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + 
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1807, v1803, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1815, v1807, v1811); + pipe_barrier(PIPE_ALL); + TROWSUM(v1823, v1815, v1819); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1745, v1745, v1823); + Tile v1829; + TASSIGN(v1829, v49); + Tile v1830; + __ubuf__ bfloat16_t* v1831 = v1829.data(); + uint64_t v1832 = reinterpret_cast(v1831); + TASSIGN(v1830, v1832); + Tile v1833; + TASSIGN(v1833, v50); + Tile v1834; + __ubuf__ float* v1835 = v1833.data(); + uint64_t v1836 = reinterpret_cast(v1835); + TASSIGN(v1834, v1836); + Tile v1837; + TASSIGN(v1837, v51); + Tile v1838; + __ubuf__ float* v1839 = v1837.data(); + uint64_t v1840 = reinterpret_cast(v1839); + TASSIGN(v1838, v1840); + Tile v1841; + TASSIGN(v1841, v52); + Tile v1842; + __ubuf__ float* v1843 = v1841.data(); + uint64_t v1844 = reinterpret_cast(v1843); + TASSIGN(v1842, v1844); + Tile v1845; + TASSIGN(v1845, v49); + Tile v1846; + __ubuf__ float* v1847 = v1845.data(); + uint64_t v1848 = reinterpret_cast(v1847); + TASSIGN(v1846, v1848); + Tile v1849; + TASSIGN(v1849, v53); + Tile v1850; + __ubuf__ float* v1851 = v1849.data(); + uint64_t v1852 = reinterpret_cast(v1851); + TASSIGN(v1850, v1852); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1830, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1853 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1854 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1855 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v17 * (unsigned) v36 + v29 * (unsigned) v37), v1853, v1854); + TLOAD(v1838, v1855); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1834, v1830, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1842, 
v1834, v1838); + pipe_barrier(PIPE_ALL); + TROWSUM(v1850, v1842, v1846); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1745, v1745, v1850); + Tile v1856; + TASSIGN(v1856, v49); + Tile v1857; + __ubuf__ bfloat16_t* v1858 = v1856.data(); + uint64_t v1859 = reinterpret_cast(v1858); + TASSIGN(v1857, v1859); + Tile v1860; + TASSIGN(v1860, v50); + Tile v1861; + __ubuf__ float* v1862 = v1860.data(); + uint64_t v1863 = reinterpret_cast(v1862); + TASSIGN(v1861, v1863); + Tile v1864; + TASSIGN(v1864, v51); + Tile v1865; + __ubuf__ float* v1866 = v1864.data(); + uint64_t v1867 = reinterpret_cast(v1866); + TASSIGN(v1865, v1867); + Tile v1868; + TASSIGN(v1868, v52); + Tile v1869; + __ubuf__ float* v1870 = v1868.data(); + uint64_t v1871 = reinterpret_cast(v1870); + TASSIGN(v1869, v1871); + Tile v1872; + TASSIGN(v1872, v49); + Tile v1873; + __ubuf__ float* v1874 = v1872.data(); + uint64_t v1875 = reinterpret_cast(v1874); + TASSIGN(v1873, v1875); + Tile v1876; + TASSIGN(v1876, v53); + Tile v1877; + __ubuf__ float* v1878 = v1876.data(); + uint64_t v1879 = reinterpret_cast(v1878); + TASSIGN(v1877, v1879); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1857, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1880 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1881 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1882 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v17 * (unsigned) v36 + v28 * (unsigned) v37), v1880, v1881); + TLOAD(v1865, v1882); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1861, v1857, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1869, v1861, v1865); + pipe_barrier(PIPE_ALL); + TROWSUM(v1877, v1869, v1873); + pipe_barrier(PIPE_ALL); + TADD(v1745, v1745, v1877); + 
pipe_barrier(PIPE_ALL); + TMUL(v1745, v1745, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v1883 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v1884 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v1885 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v17 * (unsigned) v37), v1883, v1884); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v1885, v1745); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v1886; + TASSIGN(v1886, v44); + Tile v1887; + __ubuf__ float* v1888 = v1886.data(); + uint64_t v1889 = reinterpret_cast(v1888); + TASSIGN(v1887, v1889); + Tile v1890; + TASSIGN(v1890, v44); + Tile v1891; + __ubuf__ bfloat16_t* v1892 = v1890.data(); + uint64_t v1893 = reinterpret_cast(v1892); + TASSIGN(v1891, v1893); + Tile v1894; + TASSIGN(v1894, v46); + Tile v1895; + __ubuf__ float* v1896 = v1894.data(); + uint64_t v1897 = reinterpret_cast(v1896); + TASSIGN(v1895, v1897); + Tile v1898; + TASSIGN(v1898, v45); + Tile v1899; + __ubuf__ float* v1900 = v1898.data(); + uint64_t v1901 = reinterpret_cast(v1900); + TASSIGN(v1899, v1901); + Tile v1902; + TASSIGN(v1902, v48); + Tile v1903; + __ubuf__ float* v1904 = v1902.data(); + uint64_t v1905 = reinterpret_cast(v1904); + TASSIGN(v1903, v1905); + Tile v1906; + TASSIGN(v1906, v44); + Tile v1907; + __ubuf__ float* v1908 = v1906.data(); + uint64_t v1909 = reinterpret_cast(v1908); + TASSIGN(v1907, v1909); + Tile v1910; + TASSIGN(v1910, v47); + Tile v1911; + __ubuf__ float* v1912 = v1910.data(); + uint64_t v1913 = reinterpret_cast(v1912); + TASSIGN(v1911, v1913); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v1891, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1914 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1915 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 
5120, 5120, 5120, 1>, pto::Layout::ND> v1916 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v16 * (unsigned) v36 + v33 * (unsigned) v37), v1914, v1915); + TLOAD(v1899, v1916); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1895, v1891, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1903, v1895, v1899); + pipe_barrier(PIPE_ALL); + TROWSUM(v1911, v1903, v1907); + pipe_barrier(PIPE_ALL); + TMOV(v1887, v1911); + Tile v1917; + TASSIGN(v1917, v49); + Tile v1918; + __ubuf__ bfloat16_t* v1919 = v1917.data(); + uint64_t v1920 = reinterpret_cast(v1919); + TASSIGN(v1918, v1920); + Tile v1921; + TASSIGN(v1921, v50); + Tile v1922; + __ubuf__ float* v1923 = v1921.data(); + uint64_t v1924 = reinterpret_cast(v1923); + TASSIGN(v1922, v1924); + Tile v1925; + TASSIGN(v1925, v51); + Tile v1926; + __ubuf__ float* v1927 = v1925.data(); + uint64_t v1928 = reinterpret_cast(v1927); + TASSIGN(v1926, v1928); + Tile v1929; + TASSIGN(v1929, v52); + Tile v1930; + __ubuf__ float* v1931 = v1929.data(); + uint64_t v1932 = reinterpret_cast(v1931); + TASSIGN(v1930, v1932); + Tile v1933; + TASSIGN(v1933, v49); + Tile v1934; + __ubuf__ float* v1935 = v1933.data(); + uint64_t v1936 = reinterpret_cast(v1935); + TASSIGN(v1934, v1936); + Tile v1937; + TASSIGN(v1937, v53); + Tile v1938; + __ubuf__ float* v1939 = v1937.data(); + uint64_t v1940 = reinterpret_cast(v1939); + TASSIGN(v1938, v1940); + TLOAD(v1918, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1941 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1942 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1943 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v16 * (unsigned) v36 + v31 * (unsigned) v37), v1941, v1942); + TLOAD(v1926, v1943); + set_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1922, v1918, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1930, v1922, v1926); + pipe_barrier(PIPE_ALL); + TROWSUM(v1938, v1930, v1934); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1887, v1887, v1938); + Tile v1944; + TASSIGN(v1944, v49); + Tile v1945; + __ubuf__ bfloat16_t* v1946 = v1944.data(); + uint64_t v1947 = reinterpret_cast(v1946); + TASSIGN(v1945, v1947); + Tile v1948; + TASSIGN(v1948, v50); + Tile v1949; + __ubuf__ float* v1950 = v1948.data(); + uint64_t v1951 = reinterpret_cast(v1950); + TASSIGN(v1949, v1951); + Tile v1952; + TASSIGN(v1952, v51); + Tile v1953; + __ubuf__ float* v1954 = v1952.data(); + uint64_t v1955 = reinterpret_cast(v1954); + TASSIGN(v1953, v1955); + Tile v1956; + TASSIGN(v1956, v52); + Tile v1957; + __ubuf__ float* v1958 = v1956.data(); + uint64_t v1959 = reinterpret_cast(v1958); + TASSIGN(v1957, v1959); + Tile v1960; + TASSIGN(v1960, v49); + Tile v1961; + __ubuf__ float* v1962 = v1960.data(); + uint64_t v1963 = reinterpret_cast(v1962); + TASSIGN(v1961, v1963); + Tile v1964; + TASSIGN(v1964, v53); + Tile v1965; + __ubuf__ float* v1966 = v1964.data(); + uint64_t v1967 = reinterpret_cast(v1966); + TASSIGN(v1965, v1967); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1945, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1968 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1969 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1970 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v16 * (unsigned) v36 + v30 * (unsigned) v37), v1968, v1969); + TLOAD(v1953, v1970); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1949, v1945, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + 
TMUL(v1957, v1949, v1953); + pipe_barrier(PIPE_ALL); + TROWSUM(v1965, v1957, v1961); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v1887, v1887, v1965); + Tile v1971; + TASSIGN(v1971, v49); + Tile v1972; + __ubuf__ bfloat16_t* v1973 = v1971.data(); + uint64_t v1974 = reinterpret_cast(v1973); + TASSIGN(v1972, v1974); + Tile v1975; + TASSIGN(v1975, v50); + Tile v1976; + __ubuf__ float* v1977 = v1975.data(); + uint64_t v1978 = reinterpret_cast(v1977); + TASSIGN(v1976, v1978); + Tile v1979; + TASSIGN(v1979, v51); + Tile v1980; + __ubuf__ float* v1981 = v1979.data(); + uint64_t v1982 = reinterpret_cast(v1981); + TASSIGN(v1980, v1982); + Tile v1983; + TASSIGN(v1983, v52); + Tile v1984; + __ubuf__ float* v1985 = v1983.data(); + uint64_t v1986 = reinterpret_cast(v1985); + TASSIGN(v1984, v1986); + Tile v1987; + TASSIGN(v1987, v49); + Tile v1988; + __ubuf__ float* v1989 = v1987.data(); + uint64_t v1990 = reinterpret_cast(v1989); + TASSIGN(v1988, v1990); + Tile v1991; + TASSIGN(v1991, v53); + Tile v1992; + __ubuf__ float* v1993 = v1991.data(); + uint64_t v1994 = reinterpret_cast(v1993); + TASSIGN(v1992, v1994); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1972, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v1995 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v1996 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v1997 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v16 * (unsigned) v36 + v29 * (unsigned) v37), v1995, v1996); + TLOAD(v1980, v1997); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v1976, v1972, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v1984, v1976, v1980); + pipe_barrier(PIPE_ALL); + TROWSUM(v1992, v1984, v1988); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); 
+ TADD(v1887, v1887, v1992); + Tile v1998; + TASSIGN(v1998, v49); + Tile v1999; + __ubuf__ bfloat16_t* v2000 = v1998.data(); + uint64_t v2001 = reinterpret_cast(v2000); + TASSIGN(v1999, v2001); + Tile v2002; + TASSIGN(v2002, v50); + Tile v2003; + __ubuf__ float* v2004 = v2002.data(); + uint64_t v2005 = reinterpret_cast(v2004); + TASSIGN(v2003, v2005); + Tile v2006; + TASSIGN(v2006, v51); + Tile v2007; + __ubuf__ float* v2008 = v2006.data(); + uint64_t v2009 = reinterpret_cast(v2008); + TASSIGN(v2007, v2009); + Tile v2010; + TASSIGN(v2010, v52); + Tile v2011; + __ubuf__ float* v2012 = v2010.data(); + uint64_t v2013 = reinterpret_cast(v2012); + TASSIGN(v2011, v2013); + Tile v2014; + TASSIGN(v2014, v49); + Tile v2015; + __ubuf__ float* v2016 = v2014.data(); + uint64_t v2017 = reinterpret_cast(v2016); + TASSIGN(v2015, v2017); + Tile v2018; + TASSIGN(v2018, v53); + Tile v2019; + __ubuf__ float* v2020 = v2018.data(); + uint64_t v2021 = reinterpret_cast(v2020); + TASSIGN(v2019, v2021); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v1999, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2022 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2023 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2024 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v16 * (unsigned) v36 + v28 * (unsigned) v37), v2022, v2023); + TLOAD(v2007, v2024); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2003, v1999, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2011, v2003, v2007); + pipe_barrier(PIPE_ALL); + TROWSUM(v2019, v2011, v2015); + pipe_barrier(PIPE_ALL); + TADD(v1887, v1887, v2019); + pipe_barrier(PIPE_ALL); + TMUL(v1887, v1887, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2025 = pto::Shape<1, 1, 1, 1, 1>(); + 
pto::Stride<24, 24, 24, 24, 1> v2026 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2027 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v16 * (unsigned) v37), v2025, v2026); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2027, v1887); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2028; + TASSIGN(v2028, v44); + Tile v2029; + __ubuf__ float* v2030 = v2028.data(); + uint64_t v2031 = reinterpret_cast(v2030); + TASSIGN(v2029, v2031); + Tile v2032; + TASSIGN(v2032, v44); + Tile v2033; + __ubuf__ bfloat16_t* v2034 = v2032.data(); + uint64_t v2035 = reinterpret_cast(v2034); + TASSIGN(v2033, v2035); + Tile v2036; + TASSIGN(v2036, v46); + Tile v2037; + __ubuf__ float* v2038 = v2036.data(); + uint64_t v2039 = reinterpret_cast(v2038); + TASSIGN(v2037, v2039); + Tile v2040; + TASSIGN(v2040, v45); + Tile v2041; + __ubuf__ float* v2042 = v2040.data(); + uint64_t v2043 = reinterpret_cast(v2042); + TASSIGN(v2041, v2043); + Tile v2044; + TASSIGN(v2044, v48); + Tile v2045; + __ubuf__ float* v2046 = v2044.data(); + uint64_t v2047 = reinterpret_cast(v2046); + TASSIGN(v2045, v2047); + Tile v2048; + TASSIGN(v2048, v44); + Tile v2049; + __ubuf__ float* v2050 = v2048.data(); + uint64_t v2051 = reinterpret_cast(v2050); + TASSIGN(v2049, v2051); + Tile v2052; + TASSIGN(v2052, v47); + Tile v2053; + __ubuf__ float* v2054 = v2052.data(); + uint64_t v2055 = reinterpret_cast(v2054); + TASSIGN(v2053, v2055); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2033, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2056 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2057 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2058 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v15 * (unsigned) v36 + v33 * 
(unsigned) v37), v2056, v2057); + TLOAD(v2041, v2058); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2037, v2033, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2045, v2037, v2041); + pipe_barrier(PIPE_ALL); + TROWSUM(v2053, v2045, v2049); + pipe_barrier(PIPE_ALL); + TMOV(v2029, v2053); + Tile v2059; + TASSIGN(v2059, v49); + Tile v2060; + __ubuf__ bfloat16_t* v2061 = v2059.data(); + uint64_t v2062 = reinterpret_cast(v2061); + TASSIGN(v2060, v2062); + Tile v2063; + TASSIGN(v2063, v50); + Tile v2064; + __ubuf__ float* v2065 = v2063.data(); + uint64_t v2066 = reinterpret_cast(v2065); + TASSIGN(v2064, v2066); + Tile v2067; + TASSIGN(v2067, v51); + Tile v2068; + __ubuf__ float* v2069 = v2067.data(); + uint64_t v2070 = reinterpret_cast(v2069); + TASSIGN(v2068, v2070); + Tile v2071; + TASSIGN(v2071, v52); + Tile v2072; + __ubuf__ float* v2073 = v2071.data(); + uint64_t v2074 = reinterpret_cast(v2073); + TASSIGN(v2072, v2074); + Tile v2075; + TASSIGN(v2075, v49); + Tile v2076; + __ubuf__ float* v2077 = v2075.data(); + uint64_t v2078 = reinterpret_cast(v2077); + TASSIGN(v2076, v2078); + Tile v2079; + TASSIGN(v2079, v53); + Tile v2080; + __ubuf__ float* v2081 = v2079.data(); + uint64_t v2082 = reinterpret_cast(v2081); + TASSIGN(v2080, v2082); + TLOAD(v2060, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2083 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2084 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2085 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v15 * (unsigned) v36 + v31 * (unsigned) v37), v2083, v2084); + TLOAD(v2068, v2085); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2064, v2060, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + 
TMUL(v2072, v2064, v2068); + pipe_barrier(PIPE_ALL); + TROWSUM(v2080, v2072, v2076); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2029, v2029, v2080); + Tile v2086; + TASSIGN(v2086, v49); + Tile v2087; + __ubuf__ bfloat16_t* v2088 = v2086.data(); + uint64_t v2089 = reinterpret_cast(v2088); + TASSIGN(v2087, v2089); + Tile v2090; + TASSIGN(v2090, v50); + Tile v2091; + __ubuf__ float* v2092 = v2090.data(); + uint64_t v2093 = reinterpret_cast(v2092); + TASSIGN(v2091, v2093); + Tile v2094; + TASSIGN(v2094, v51); + Tile v2095; + __ubuf__ float* v2096 = v2094.data(); + uint64_t v2097 = reinterpret_cast(v2096); + TASSIGN(v2095, v2097); + Tile v2098; + TASSIGN(v2098, v52); + Tile v2099; + __ubuf__ float* v2100 = v2098.data(); + uint64_t v2101 = reinterpret_cast(v2100); + TASSIGN(v2099, v2101); + Tile v2102; + TASSIGN(v2102, v49); + Tile v2103; + __ubuf__ float* v2104 = v2102.data(); + uint64_t v2105 = reinterpret_cast(v2104); + TASSIGN(v2103, v2105); + Tile v2106; + TASSIGN(v2106, v53); + Tile v2107; + __ubuf__ float* v2108 = v2106.data(); + uint64_t v2109 = reinterpret_cast(v2108); + TASSIGN(v2107, v2109); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2087, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2110 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2111 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2112 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v15 * (unsigned) v36 + v30 * (unsigned) v37), v2110, v2111); + TLOAD(v2095, v2112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2091, v2087, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2099, v2091, v2095); + pipe_barrier(PIPE_ALL); + TROWSUM(v2107, v2099, v2103); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); 
+ TADD(v2029, v2029, v2107); + Tile v2113; + TASSIGN(v2113, v49); + Tile v2114; + __ubuf__ bfloat16_t* v2115 = v2113.data(); + uint64_t v2116 = reinterpret_cast(v2115); + TASSIGN(v2114, v2116); + Tile v2117; + TASSIGN(v2117, v50); + Tile v2118; + __ubuf__ float* v2119 = v2117.data(); + uint64_t v2120 = reinterpret_cast(v2119); + TASSIGN(v2118, v2120); + Tile v2121; + TASSIGN(v2121, v51); + Tile v2122; + __ubuf__ float* v2123 = v2121.data(); + uint64_t v2124 = reinterpret_cast(v2123); + TASSIGN(v2122, v2124); + Tile v2125; + TASSIGN(v2125, v52); + Tile v2126; + __ubuf__ float* v2127 = v2125.data(); + uint64_t v2128 = reinterpret_cast(v2127); + TASSIGN(v2126, v2128); + Tile v2129; + TASSIGN(v2129, v49); + Tile v2130; + __ubuf__ float* v2131 = v2129.data(); + uint64_t v2132 = reinterpret_cast(v2131); + TASSIGN(v2130, v2132); + Tile v2133; + TASSIGN(v2133, v53); + Tile v2134; + __ubuf__ float* v2135 = v2133.data(); + uint64_t v2136 = reinterpret_cast(v2135); + TASSIGN(v2134, v2136); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2114, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2137 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2138 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2139 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v15 * (unsigned) v36 + v29 * (unsigned) v37), v2137, v2138); + TLOAD(v2122, v2139); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2118, v2114, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2126, v2118, v2122); + pipe_barrier(PIPE_ALL); + TROWSUM(v2134, v2126, v2130); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2029, v2029, v2134); + Tile v2140; + TASSIGN(v2140, v49); + Tile v2141; + __ubuf__ bfloat16_t* v2142 = v2140.data(); + uint64_t v2143 = 
reinterpret_cast(v2142); + TASSIGN(v2141, v2143); + Tile v2144; + TASSIGN(v2144, v50); + Tile v2145; + __ubuf__ float* v2146 = v2144.data(); + uint64_t v2147 = reinterpret_cast(v2146); + TASSIGN(v2145, v2147); + Tile v2148; + TASSIGN(v2148, v51); + Tile v2149; + __ubuf__ float* v2150 = v2148.data(); + uint64_t v2151 = reinterpret_cast(v2150); + TASSIGN(v2149, v2151); + Tile v2152; + TASSIGN(v2152, v52); + Tile v2153; + __ubuf__ float* v2154 = v2152.data(); + uint64_t v2155 = reinterpret_cast(v2154); + TASSIGN(v2153, v2155); + Tile v2156; + TASSIGN(v2156, v49); + Tile v2157; + __ubuf__ float* v2158 = v2156.data(); + uint64_t v2159 = reinterpret_cast(v2158); + TASSIGN(v2157, v2159); + Tile v2160; + TASSIGN(v2160, v53); + Tile v2161; + __ubuf__ float* v2162 = v2160.data(); + uint64_t v2163 = reinterpret_cast(v2162); + TASSIGN(v2161, v2163); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2141, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2164 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2165 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2166 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v15 * (unsigned) v36 + v28 * (unsigned) v37), v2164, v2165); + TLOAD(v2149, v2166); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2145, v2141, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2153, v2145, v2149); + pipe_barrier(PIPE_ALL); + TROWSUM(v2161, v2153, v2157); + pipe_barrier(PIPE_ALL); + TADD(v2029, v2029, v2161); + pipe_barrier(PIPE_ALL); + TMUL(v2029, v2029, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2167 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v2168 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2169 = 
GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v15 * (unsigned) v37), v2167, v2168); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2169, v2029); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2170; + TASSIGN(v2170, v44); + Tile v2171; + __ubuf__ float* v2172 = v2170.data(); + uint64_t v2173 = reinterpret_cast(v2172); + TASSIGN(v2171, v2173); + Tile v2174; + TASSIGN(v2174, v44); + Tile v2175; + __ubuf__ bfloat16_t* v2176 = v2174.data(); + uint64_t v2177 = reinterpret_cast(v2176); + TASSIGN(v2175, v2177); + Tile v2178; + TASSIGN(v2178, v46); + Tile v2179; + __ubuf__ float* v2180 = v2178.data(); + uint64_t v2181 = reinterpret_cast(v2180); + TASSIGN(v2179, v2181); + Tile v2182; + TASSIGN(v2182, v45); + Tile v2183; + __ubuf__ float* v2184 = v2182.data(); + uint64_t v2185 = reinterpret_cast(v2184); + TASSIGN(v2183, v2185); + Tile v2186; + TASSIGN(v2186, v48); + Tile v2187; + __ubuf__ float* v2188 = v2186.data(); + uint64_t v2189 = reinterpret_cast(v2188); + TASSIGN(v2187, v2189); + Tile v2190; + TASSIGN(v2190, v44); + Tile v2191; + __ubuf__ float* v2192 = v2190.data(); + uint64_t v2193 = reinterpret_cast(v2192); + TASSIGN(v2191, v2193); + Tile v2194; + TASSIGN(v2194, v47); + Tile v2195; + __ubuf__ float* v2196 = v2194.data(); + uint64_t v2197 = reinterpret_cast(v2196); + TASSIGN(v2195, v2197); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2175, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2198 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2199 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2200 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v14 * (unsigned) v36 + v33 * (unsigned) v37), v2198, v2199); + TLOAD(v2183, v2200); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + 
TCVT(v2179, v2175, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2187, v2179, v2183); + pipe_barrier(PIPE_ALL); + TROWSUM(v2195, v2187, v2191); + pipe_barrier(PIPE_ALL); + TMOV(v2171, v2195); + Tile v2201; + TASSIGN(v2201, v49); + Tile v2202; + __ubuf__ bfloat16_t* v2203 = v2201.data(); + uint64_t v2204 = reinterpret_cast(v2203); + TASSIGN(v2202, v2204); + Tile v2205; + TASSIGN(v2205, v50); + Tile v2206; + __ubuf__ float* v2207 = v2205.data(); + uint64_t v2208 = reinterpret_cast(v2207); + TASSIGN(v2206, v2208); + Tile v2209; + TASSIGN(v2209, v51); + Tile v2210; + __ubuf__ float* v2211 = v2209.data(); + uint64_t v2212 = reinterpret_cast(v2211); + TASSIGN(v2210, v2212); + Tile v2213; + TASSIGN(v2213, v52); + Tile v2214; + __ubuf__ float* v2215 = v2213.data(); + uint64_t v2216 = reinterpret_cast(v2215); + TASSIGN(v2214, v2216); + Tile v2217; + TASSIGN(v2217, v49); + Tile v2218; + __ubuf__ float* v2219 = v2217.data(); + uint64_t v2220 = reinterpret_cast(v2219); + TASSIGN(v2218, v2220); + Tile v2221; + TASSIGN(v2221, v53); + Tile v2222; + __ubuf__ float* v2223 = v2221.data(); + uint64_t v2224 = reinterpret_cast(v2223); + TASSIGN(v2222, v2224); + TLOAD(v2202, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2225 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2226 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2227 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v14 * (unsigned) v36 + v31 * (unsigned) v37), v2225, v2226); + TLOAD(v2210, v2227); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2206, v2202, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2214, v2206, v2210); + pipe_barrier(PIPE_ALL); + TROWSUM(v2222, v2214, v2218); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + 
pipe_barrier(PIPE_ALL); + TADD(v2171, v2171, v2222); + Tile v2228; + TASSIGN(v2228, v49); + Tile v2229; + __ubuf__ bfloat16_t* v2230 = v2228.data(); + uint64_t v2231 = reinterpret_cast(v2230); + TASSIGN(v2229, v2231); + Tile v2232; + TASSIGN(v2232, v50); + Tile v2233; + __ubuf__ float* v2234 = v2232.data(); + uint64_t v2235 = reinterpret_cast(v2234); + TASSIGN(v2233, v2235); + Tile v2236; + TASSIGN(v2236, v51); + Tile v2237; + __ubuf__ float* v2238 = v2236.data(); + uint64_t v2239 = reinterpret_cast(v2238); + TASSIGN(v2237, v2239); + Tile v2240; + TASSIGN(v2240, v52); + Tile v2241; + __ubuf__ float* v2242 = v2240.data(); + uint64_t v2243 = reinterpret_cast(v2242); + TASSIGN(v2241, v2243); + Tile v2244; + TASSIGN(v2244, v49); + Tile v2245; + __ubuf__ float* v2246 = v2244.data(); + uint64_t v2247 = reinterpret_cast(v2246); + TASSIGN(v2245, v2247); + Tile v2248; + TASSIGN(v2248, v53); + Tile v2249; + __ubuf__ float* v2250 = v2248.data(); + uint64_t v2251 = reinterpret_cast(v2250); + TASSIGN(v2249, v2251); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2229, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2252 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2253 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2254 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v14 * (unsigned) v36 + v30 * (unsigned) v37), v2252, v2253); + TLOAD(v2237, v2254); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2233, v2229, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2241, v2233, v2237); + pipe_barrier(PIPE_ALL); + TROWSUM(v2249, v2241, v2245); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2171, v2171, v2249); + Tile v2255; + TASSIGN(v2255, v49); + Tile v2256; + __ubuf__ bfloat16_t* v2257 = v2255.data(); + 
uint64_t v2258 = reinterpret_cast(v2257); + TASSIGN(v2256, v2258); + Tile v2259; + TASSIGN(v2259, v50); + Tile v2260; + __ubuf__ float* v2261 = v2259.data(); + uint64_t v2262 = reinterpret_cast(v2261); + TASSIGN(v2260, v2262); + Tile v2263; + TASSIGN(v2263, v51); + Tile v2264; + __ubuf__ float* v2265 = v2263.data(); + uint64_t v2266 = reinterpret_cast(v2265); + TASSIGN(v2264, v2266); + Tile v2267; + TASSIGN(v2267, v52); + Tile v2268; + __ubuf__ float* v2269 = v2267.data(); + uint64_t v2270 = reinterpret_cast(v2269); + TASSIGN(v2268, v2270); + Tile v2271; + TASSIGN(v2271, v49); + Tile v2272; + __ubuf__ float* v2273 = v2271.data(); + uint64_t v2274 = reinterpret_cast(v2273); + TASSIGN(v2272, v2274); + Tile v2275; + TASSIGN(v2275, v53); + Tile v2276; + __ubuf__ float* v2277 = v2275.data(); + uint64_t v2278 = reinterpret_cast(v2277); + TASSIGN(v2276, v2278); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2256, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2279 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2280 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2281 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v14 * (unsigned) v36 + v29 * (unsigned) v37), v2279, v2280); + TLOAD(v2264, v2281); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2260, v2256, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2268, v2260, v2264); + pipe_barrier(PIPE_ALL); + TROWSUM(v2276, v2268, v2272); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2171, v2171, v2276); + Tile v2282; + TASSIGN(v2282, v49); + Tile v2283; + __ubuf__ bfloat16_t* v2284 = v2282.data(); + uint64_t v2285 = reinterpret_cast(v2284); + TASSIGN(v2283, v2285); + Tile v2286; + TASSIGN(v2286, v50); + Tile v2287; + __ubuf__ float* v2288 = 
v2286.data(); + uint64_t v2289 = reinterpret_cast(v2288); + TASSIGN(v2287, v2289); + Tile v2290; + TASSIGN(v2290, v51); + Tile v2291; + __ubuf__ float* v2292 = v2290.data(); + uint64_t v2293 = reinterpret_cast(v2292); + TASSIGN(v2291, v2293); + Tile v2294; + TASSIGN(v2294, v52); + Tile v2295; + __ubuf__ float* v2296 = v2294.data(); + uint64_t v2297 = reinterpret_cast(v2296); + TASSIGN(v2295, v2297); + Tile v2298; + TASSIGN(v2298, v49); + Tile v2299; + __ubuf__ float* v2300 = v2298.data(); + uint64_t v2301 = reinterpret_cast(v2300); + TASSIGN(v2299, v2301); + Tile v2302; + TASSIGN(v2302, v53); + Tile v2303; + __ubuf__ float* v2304 = v2302.data(); + uint64_t v2305 = reinterpret_cast(v2304); + TASSIGN(v2303, v2305); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2283, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2306 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2307 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2308 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v14 * (unsigned) v36 + v28 * (unsigned) v37), v2306, v2307); + TLOAD(v2291, v2308); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2287, v2283, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2295, v2287, v2291); + pipe_barrier(PIPE_ALL); + TROWSUM(v2303, v2295, v2299); + pipe_barrier(PIPE_ALL); + TADD(v2171, v2171, v2303); + pipe_barrier(PIPE_ALL); + TMUL(v2171, v2171, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2309 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v2310 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2311 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v14 * (unsigned) 
v37), v2309, v2310); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2311, v2171); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2312; + TASSIGN(v2312, v44); + Tile v2313; + __ubuf__ float* v2314 = v2312.data(); + uint64_t v2315 = reinterpret_cast(v2314); + TASSIGN(v2313, v2315); + Tile v2316; + TASSIGN(v2316, v44); + Tile v2317; + __ubuf__ bfloat16_t* v2318 = v2316.data(); + uint64_t v2319 = reinterpret_cast(v2318); + TASSIGN(v2317, v2319); + Tile v2320; + TASSIGN(v2320, v46); + Tile v2321; + __ubuf__ float* v2322 = v2320.data(); + uint64_t v2323 = reinterpret_cast(v2322); + TASSIGN(v2321, v2323); + Tile v2324; + TASSIGN(v2324, v45); + Tile v2325; + __ubuf__ float* v2326 = v2324.data(); + uint64_t v2327 = reinterpret_cast(v2326); + TASSIGN(v2325, v2327); + Tile v2328; + TASSIGN(v2328, v48); + Tile v2329; + __ubuf__ float* v2330 = v2328.data(); + uint64_t v2331 = reinterpret_cast(v2330); + TASSIGN(v2329, v2331); + Tile v2332; + TASSIGN(v2332, v44); + Tile v2333; + __ubuf__ float* v2334 = v2332.data(); + uint64_t v2335 = reinterpret_cast(v2334); + TASSIGN(v2333, v2335); + Tile v2336; + TASSIGN(v2336, v47); + Tile v2337; + __ubuf__ float* v2338 = v2336.data(); + uint64_t v2339 = reinterpret_cast(v2338); + TASSIGN(v2337, v2339); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2317, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2340 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2341 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2342 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v13 * (unsigned) v36 + v33 * (unsigned) v37), v2340, v2341); + TLOAD(v2325, v2342); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2321, v2317, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2329, v2321, v2325); + 
pipe_barrier(PIPE_ALL); + TROWSUM(v2337, v2329, v2333); + pipe_barrier(PIPE_ALL); + TMOV(v2313, v2337); + Tile v2343; + TASSIGN(v2343, v49); + Tile v2344; + __ubuf__ bfloat16_t* v2345 = v2343.data(); + uint64_t v2346 = reinterpret_cast(v2345); + TASSIGN(v2344, v2346); + Tile v2347; + TASSIGN(v2347, v50); + Tile v2348; + __ubuf__ float* v2349 = v2347.data(); + uint64_t v2350 = reinterpret_cast(v2349); + TASSIGN(v2348, v2350); + Tile v2351; + TASSIGN(v2351, v51); + Tile v2352; + __ubuf__ float* v2353 = v2351.data(); + uint64_t v2354 = reinterpret_cast(v2353); + TASSIGN(v2352, v2354); + Tile v2355; + TASSIGN(v2355, v52); + Tile v2356; + __ubuf__ float* v2357 = v2355.data(); + uint64_t v2358 = reinterpret_cast(v2357); + TASSIGN(v2356, v2358); + Tile v2359; + TASSIGN(v2359, v49); + Tile v2360; + __ubuf__ float* v2361 = v2359.data(); + uint64_t v2362 = reinterpret_cast(v2361); + TASSIGN(v2360, v2362); + Tile v2363; + TASSIGN(v2363, v53); + Tile v2364; + __ubuf__ float* v2365 = v2363.data(); + uint64_t v2366 = reinterpret_cast(v2365); + TASSIGN(v2364, v2366); + TLOAD(v2344, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2367 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2368 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2369 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v13 * (unsigned) v36 + v31 * (unsigned) v37), v2367, v2368); + TLOAD(v2352, v2369); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2348, v2344, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2356, v2348, v2352); + pipe_barrier(PIPE_ALL); + TROWSUM(v2364, v2356, v2360); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2313, v2313, v2364); + Tile v2370; + TASSIGN(v2370, v49); + Tile v2371; + __ubuf__ bfloat16_t* v2372 = 
v2370.data(); + uint64_t v2373 = reinterpret_cast(v2372); + TASSIGN(v2371, v2373); + Tile v2374; + TASSIGN(v2374, v50); + Tile v2375; + __ubuf__ float* v2376 = v2374.data(); + uint64_t v2377 = reinterpret_cast(v2376); + TASSIGN(v2375, v2377); + Tile v2378; + TASSIGN(v2378, v51); + Tile v2379; + __ubuf__ float* v2380 = v2378.data(); + uint64_t v2381 = reinterpret_cast(v2380); + TASSIGN(v2379, v2381); + Tile v2382; + TASSIGN(v2382, v52); + Tile v2383; + __ubuf__ float* v2384 = v2382.data(); + uint64_t v2385 = reinterpret_cast(v2384); + TASSIGN(v2383, v2385); + Tile v2386; + TASSIGN(v2386, v49); + Tile v2387; + __ubuf__ float* v2388 = v2386.data(); + uint64_t v2389 = reinterpret_cast(v2388); + TASSIGN(v2387, v2389); + Tile v2390; + TASSIGN(v2390, v53); + Tile v2391; + __ubuf__ float* v2392 = v2390.data(); + uint64_t v2393 = reinterpret_cast(v2392); + TASSIGN(v2391, v2393); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2371, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2394 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2395 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2396 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v13 * (unsigned) v36 + v30 * (unsigned) v37), v2394, v2395); + TLOAD(v2379, v2396); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2375, v2371, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2383, v2375, v2379); + pipe_barrier(PIPE_ALL); + TROWSUM(v2391, v2383, v2387); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2313, v2313, v2391); + Tile v2397; + TASSIGN(v2397, v49); + Tile v2398; + __ubuf__ bfloat16_t* v2399 = v2397.data(); + uint64_t v2400 = reinterpret_cast(v2399); + TASSIGN(v2398, v2400); + Tile v2401; + TASSIGN(v2401, v50); + Tile v2402; + __ubuf__ float* 
v2403 = v2401.data(); + uint64_t v2404 = reinterpret_cast(v2403); + TASSIGN(v2402, v2404); + Tile v2405; + TASSIGN(v2405, v51); + Tile v2406; + __ubuf__ float* v2407 = v2405.data(); + uint64_t v2408 = reinterpret_cast(v2407); + TASSIGN(v2406, v2408); + Tile v2409; + TASSIGN(v2409, v52); + Tile v2410; + __ubuf__ float* v2411 = v2409.data(); + uint64_t v2412 = reinterpret_cast(v2411); + TASSIGN(v2410, v2412); + Tile v2413; + TASSIGN(v2413, v49); + Tile v2414; + __ubuf__ float* v2415 = v2413.data(); + uint64_t v2416 = reinterpret_cast(v2415); + TASSIGN(v2414, v2416); + Tile v2417; + TASSIGN(v2417, v53); + Tile v2418; + __ubuf__ float* v2419 = v2417.data(); + uint64_t v2420 = reinterpret_cast(v2419); + TASSIGN(v2418, v2420); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2398, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2421 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2422 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2423 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v13 * (unsigned) v36 + v29 * (unsigned) v37), v2421, v2422); + TLOAD(v2406, v2423); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2402, v2398, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2410, v2402, v2406); + pipe_barrier(PIPE_ALL); + TROWSUM(v2418, v2410, v2414); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2313, v2313, v2418); + Tile v2424; + TASSIGN(v2424, v49); + Tile v2425; + __ubuf__ bfloat16_t* v2426 = v2424.data(); + uint64_t v2427 = reinterpret_cast(v2426); + TASSIGN(v2425, v2427); + Tile v2428; + TASSIGN(v2428, v50); + Tile v2429; + __ubuf__ float* v2430 = v2428.data(); + uint64_t v2431 = reinterpret_cast(v2430); + TASSIGN(v2429, v2431); + Tile v2432; + TASSIGN(v2432, v51); + Tile v2433; + __ubuf__ 
float* v2434 = v2432.data(); + uint64_t v2435 = reinterpret_cast(v2434); + TASSIGN(v2433, v2435); + Tile v2436; + TASSIGN(v2436, v52); + Tile v2437; + __ubuf__ float* v2438 = v2436.data(); + uint64_t v2439 = reinterpret_cast(v2438); + TASSIGN(v2437, v2439); + Tile v2440; + TASSIGN(v2440, v49); + Tile v2441; + __ubuf__ float* v2442 = v2440.data(); + uint64_t v2443 = reinterpret_cast(v2442); + TASSIGN(v2441, v2443); + Tile v2444; + TASSIGN(v2444, v53); + Tile v2445; + __ubuf__ float* v2446 = v2444.data(); + uint64_t v2447 = reinterpret_cast(v2446); + TASSIGN(v2445, v2447); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2425, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2448 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2449 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2450 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v13 * (unsigned) v36 + v28 * (unsigned) v37), v2448, v2449); + TLOAD(v2433, v2450); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2429, v2425, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2437, v2429, v2433); + pipe_barrier(PIPE_ALL); + TROWSUM(v2445, v2437, v2441); + pipe_barrier(PIPE_ALL); + TADD(v2313, v2313, v2445); + pipe_barrier(PIPE_ALL); + TMUL(v2313, v2313, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2451 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v2452 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2453 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v13 * (unsigned) v37), v2451, v2452); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2453, v2313); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2454; 
+ TASSIGN(v2454, v44); + Tile v2455; + __ubuf__ float* v2456 = v2454.data(); + uint64_t v2457 = reinterpret_cast(v2456); + TASSIGN(v2455, v2457); + Tile v2458; + TASSIGN(v2458, v44); + Tile v2459; + __ubuf__ bfloat16_t* v2460 = v2458.data(); + uint64_t v2461 = reinterpret_cast(v2460); + TASSIGN(v2459, v2461); + Tile v2462; + TASSIGN(v2462, v46); + Tile v2463; + __ubuf__ float* v2464 = v2462.data(); + uint64_t v2465 = reinterpret_cast(v2464); + TASSIGN(v2463, v2465); + Tile v2466; + TASSIGN(v2466, v45); + Tile v2467; + __ubuf__ float* v2468 = v2466.data(); + uint64_t v2469 = reinterpret_cast(v2468); + TASSIGN(v2467, v2469); + Tile v2470; + TASSIGN(v2470, v48); + Tile v2471; + __ubuf__ float* v2472 = v2470.data(); + uint64_t v2473 = reinterpret_cast(v2472); + TASSIGN(v2471, v2473); + Tile v2474; + TASSIGN(v2474, v44); + Tile v2475; + __ubuf__ float* v2476 = v2474.data(); + uint64_t v2477 = reinterpret_cast(v2476); + TASSIGN(v2475, v2477); + Tile v2478; + TASSIGN(v2478, v47); + Tile v2479; + __ubuf__ float* v2480 = v2478.data(); + uint64_t v2481 = reinterpret_cast(v2480); + TASSIGN(v2479, v2481); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2459, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2482 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2483 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2484 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v12 * (unsigned) v36 + v33 * (unsigned) v37), v2482, v2483); + TLOAD(v2467, v2484); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2463, v2459, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2471, v2463, v2467); + pipe_barrier(PIPE_ALL); + TROWSUM(v2479, v2471, v2475); + pipe_barrier(PIPE_ALL); + TMOV(v2455, v2479); + Tile v2485; + TASSIGN(v2485, v49); + Tile v2486; + 
__ubuf__ bfloat16_t* v2487 = v2485.data(); + uint64_t v2488 = reinterpret_cast(v2487); + TASSIGN(v2486, v2488); + Tile v2489; + TASSIGN(v2489, v50); + Tile v2490; + __ubuf__ float* v2491 = v2489.data(); + uint64_t v2492 = reinterpret_cast(v2491); + TASSIGN(v2490, v2492); + Tile v2493; + TASSIGN(v2493, v51); + Tile v2494; + __ubuf__ float* v2495 = v2493.data(); + uint64_t v2496 = reinterpret_cast(v2495); + TASSIGN(v2494, v2496); + Tile v2497; + TASSIGN(v2497, v52); + Tile v2498; + __ubuf__ float* v2499 = v2497.data(); + uint64_t v2500 = reinterpret_cast(v2499); + TASSIGN(v2498, v2500); + Tile v2501; + TASSIGN(v2501, v49); + Tile v2502; + __ubuf__ float* v2503 = v2501.data(); + uint64_t v2504 = reinterpret_cast(v2503); + TASSIGN(v2502, v2504); + Tile v2505; + TASSIGN(v2505, v53); + Tile v2506; + __ubuf__ float* v2507 = v2505.data(); + uint64_t v2508 = reinterpret_cast(v2507); + TASSIGN(v2506, v2508); + TLOAD(v2486, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2509 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2510 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2511 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v12 * (unsigned) v36 + v31 * (unsigned) v37), v2509, v2510); + TLOAD(v2494, v2511); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2490, v2486, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2498, v2490, v2494); + pipe_barrier(PIPE_ALL); + TROWSUM(v2506, v2498, v2502); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2455, v2455, v2506); + Tile v2512; + TASSIGN(v2512, v49); + Tile v2513; + __ubuf__ bfloat16_t* v2514 = v2512.data(); + uint64_t v2515 = reinterpret_cast(v2514); + TASSIGN(v2513, v2515); + Tile v2516; + TASSIGN(v2516, v50); + Tile v2517; + __ubuf__ float* v2518 = 
v2516.data(); + uint64_t v2519 = reinterpret_cast(v2518); + TASSIGN(v2517, v2519); + Tile v2520; + TASSIGN(v2520, v51); + Tile v2521; + __ubuf__ float* v2522 = v2520.data(); + uint64_t v2523 = reinterpret_cast(v2522); + TASSIGN(v2521, v2523); + Tile v2524; + TASSIGN(v2524, v52); + Tile v2525; + __ubuf__ float* v2526 = v2524.data(); + uint64_t v2527 = reinterpret_cast(v2526); + TASSIGN(v2525, v2527); + Tile v2528; + TASSIGN(v2528, v49); + Tile v2529; + __ubuf__ float* v2530 = v2528.data(); + uint64_t v2531 = reinterpret_cast(v2530); + TASSIGN(v2529, v2531); + Tile v2532; + TASSIGN(v2532, v53); + Tile v2533; + __ubuf__ float* v2534 = v2532.data(); + uint64_t v2535 = reinterpret_cast(v2534); + TASSIGN(v2533, v2535); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2513, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2536 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2537 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2538 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v12 * (unsigned) v36 + v30 * (unsigned) v37), v2536, v2537); + TLOAD(v2521, v2538); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2517, v2513, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2525, v2517, v2521); + pipe_barrier(PIPE_ALL); + TROWSUM(v2533, v2525, v2529); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2455, v2455, v2533); + Tile v2539; + TASSIGN(v2539, v49); + Tile v2540; + __ubuf__ bfloat16_t* v2541 = v2539.data(); + uint64_t v2542 = reinterpret_cast(v2541); + TASSIGN(v2540, v2542); + Tile v2543; + TASSIGN(v2543, v50); + Tile v2544; + __ubuf__ float* v2545 = v2543.data(); + uint64_t v2546 = reinterpret_cast(v2545); + TASSIGN(v2544, v2546); + Tile v2547; + TASSIGN(v2547, v51); + Tile v2548; + __ubuf__ float* 
v2549 = v2547.data(); + uint64_t v2550 = reinterpret_cast(v2549); + TASSIGN(v2548, v2550); + Tile v2551; + TASSIGN(v2551, v52); + Tile v2552; + __ubuf__ float* v2553 = v2551.data(); + uint64_t v2554 = reinterpret_cast(v2553); + TASSIGN(v2552, v2554); + Tile v2555; + TASSIGN(v2555, v49); + Tile v2556; + __ubuf__ float* v2557 = v2555.data(); + uint64_t v2558 = reinterpret_cast(v2557); + TASSIGN(v2556, v2558); + Tile v2559; + TASSIGN(v2559, v53); + Tile v2560; + __ubuf__ float* v2561 = v2559.data(); + uint64_t v2562 = reinterpret_cast(v2561); + TASSIGN(v2560, v2562); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2540, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2563 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2564 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2565 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v12 * (unsigned) v36 + v29 * (unsigned) v37), v2563, v2564); + TLOAD(v2548, v2565); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2544, v2540, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2552, v2544, v2548); + pipe_barrier(PIPE_ALL); + TROWSUM(v2560, v2552, v2556); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2455, v2455, v2560); + Tile v2566; + TASSIGN(v2566, v49); + Tile v2567; + __ubuf__ bfloat16_t* v2568 = v2566.data(); + uint64_t v2569 = reinterpret_cast(v2568); + TASSIGN(v2567, v2569); + Tile v2570; + TASSIGN(v2570, v50); + Tile v2571; + __ubuf__ float* v2572 = v2570.data(); + uint64_t v2573 = reinterpret_cast(v2572); + TASSIGN(v2571, v2573); + Tile v2574; + TASSIGN(v2574, v51); + Tile v2575; + __ubuf__ float* v2576 = v2574.data(); + uint64_t v2577 = reinterpret_cast(v2576); + TASSIGN(v2575, v2577); + Tile v2578; + TASSIGN(v2578, v52); + Tile v2579; + __ubuf__ 
float* v2580 = v2578.data(); + uint64_t v2581 = reinterpret_cast(v2580); + TASSIGN(v2579, v2581); + Tile v2582; + TASSIGN(v2582, v49); + Tile v2583; + __ubuf__ float* v2584 = v2582.data(); + uint64_t v2585 = reinterpret_cast(v2584); + TASSIGN(v2583, v2585); + Tile v2586; + TASSIGN(v2586, v53); + Tile v2587; + __ubuf__ float* v2588 = v2586.data(); + uint64_t v2589 = reinterpret_cast(v2588); + TASSIGN(v2587, v2589); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2567, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2590 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2591 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2592 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v12 * (unsigned) v36 + v28 * (unsigned) v37), v2590, v2591); + TLOAD(v2575, v2592); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2571, v2567, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2579, v2571, v2575); + pipe_barrier(PIPE_ALL); + TROWSUM(v2587, v2579, v2583); + pipe_barrier(PIPE_ALL); + TADD(v2455, v2455, v2587); + pipe_barrier(PIPE_ALL); + TMUL(v2455, v2455, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2593 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v2594 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2595 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v12 * (unsigned) v37), v2593, v2594); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2595, v2455); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2596; + TASSIGN(v2596, v44); + Tile v2597; + __ubuf__ float* v2598 = v2596.data(); + uint64_t v2599 = reinterpret_cast(v2598); + TASSIGN(v2597, v2599); + Tile v2600; 
+ TASSIGN(v2600, v44); + Tile v2601; + __ubuf__ bfloat16_t* v2602 = v2600.data(); + uint64_t v2603 = reinterpret_cast(v2602); + TASSIGN(v2601, v2603); + Tile v2604; + TASSIGN(v2604, v46); + Tile v2605; + __ubuf__ float* v2606 = v2604.data(); + uint64_t v2607 = reinterpret_cast(v2606); + TASSIGN(v2605, v2607); + Tile v2608; + TASSIGN(v2608, v45); + Tile v2609; + __ubuf__ float* v2610 = v2608.data(); + uint64_t v2611 = reinterpret_cast(v2610); + TASSIGN(v2609, v2611); + Tile v2612; + TASSIGN(v2612, v48); + Tile v2613; + __ubuf__ float* v2614 = v2612.data(); + uint64_t v2615 = reinterpret_cast(v2614); + TASSIGN(v2613, v2615); + Tile v2616; + TASSIGN(v2616, v44); + Tile v2617; + __ubuf__ float* v2618 = v2616.data(); + uint64_t v2619 = reinterpret_cast(v2618); + TASSIGN(v2617, v2619); + Tile v2620; + TASSIGN(v2620, v47); + Tile v2621; + __ubuf__ float* v2622 = v2620.data(); + uint64_t v2623 = reinterpret_cast(v2622); + TASSIGN(v2621, v2623); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2601, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2624 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2625 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2626 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v11 * (unsigned) v36 + v33 * (unsigned) v37), v2624, v2625); + TLOAD(v2609, v2626); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2605, v2601, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2613, v2605, v2609); + pipe_barrier(PIPE_ALL); + TROWSUM(v2621, v2613, v2617); + pipe_barrier(PIPE_ALL); + TMOV(v2597, v2621); + Tile v2627; + TASSIGN(v2627, v49); + Tile v2628; + __ubuf__ bfloat16_t* v2629 = v2627.data(); + uint64_t v2630 = reinterpret_cast(v2629); + TASSIGN(v2628, v2630); + Tile v2631; + TASSIGN(v2631, v50); + Tile 
v2632; + __ubuf__ float* v2633 = v2631.data(); + uint64_t v2634 = reinterpret_cast(v2633); + TASSIGN(v2632, v2634); + Tile v2635; + TASSIGN(v2635, v51); + Tile v2636; + __ubuf__ float* v2637 = v2635.data(); + uint64_t v2638 = reinterpret_cast(v2637); + TASSIGN(v2636, v2638); + Tile v2639; + TASSIGN(v2639, v52); + Tile v2640; + __ubuf__ float* v2641 = v2639.data(); + uint64_t v2642 = reinterpret_cast(v2641); + TASSIGN(v2640, v2642); + Tile v2643; + TASSIGN(v2643, v49); + Tile v2644; + __ubuf__ float* v2645 = v2643.data(); + uint64_t v2646 = reinterpret_cast(v2645); + TASSIGN(v2644, v2646); + Tile v2647; + TASSIGN(v2647, v53); + Tile v2648; + __ubuf__ float* v2649 = v2647.data(); + uint64_t v2650 = reinterpret_cast(v2649); + TASSIGN(v2648, v2650); + TLOAD(v2628, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2651 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2652 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2653 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v11 * (unsigned) v36 + v31 * (unsigned) v37), v2651, v2652); + TLOAD(v2636, v2653); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2632, v2628, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2640, v2632, v2636); + pipe_barrier(PIPE_ALL); + TROWSUM(v2648, v2640, v2644); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2597, v2597, v2648); + Tile v2654; + TASSIGN(v2654, v49); + Tile v2655; + __ubuf__ bfloat16_t* v2656 = v2654.data(); + uint64_t v2657 = reinterpret_cast(v2656); + TASSIGN(v2655, v2657); + Tile v2658; + TASSIGN(v2658, v50); + Tile v2659; + __ubuf__ float* v2660 = v2658.data(); + uint64_t v2661 = reinterpret_cast(v2660); + TASSIGN(v2659, v2661); + Tile v2662; + TASSIGN(v2662, v51); + Tile v2663; + __ubuf__ float* v2664 = 
v2662.data(); + uint64_t v2665 = reinterpret_cast(v2664); + TASSIGN(v2663, v2665); + Tile v2666; + TASSIGN(v2666, v52); + Tile v2667; + __ubuf__ float* v2668 = v2666.data(); + uint64_t v2669 = reinterpret_cast(v2668); + TASSIGN(v2667, v2669); + Tile v2670; + TASSIGN(v2670, v49); + Tile v2671; + __ubuf__ float* v2672 = v2670.data(); + uint64_t v2673 = reinterpret_cast(v2672); + TASSIGN(v2671, v2673); + Tile v2674; + TASSIGN(v2674, v53); + Tile v2675; + __ubuf__ float* v2676 = v2674.data(); + uint64_t v2677 = reinterpret_cast(v2676); + TASSIGN(v2675, v2677); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2655, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2678 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2679 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2680 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v11 * (unsigned) v36 + v30 * (unsigned) v37), v2678, v2679); + TLOAD(v2663, v2680); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2659, v2655, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2667, v2659, v2663); + pipe_barrier(PIPE_ALL); + TROWSUM(v2675, v2667, v2671); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2597, v2597, v2675); + Tile v2681; + TASSIGN(v2681, v49); + Tile v2682; + __ubuf__ bfloat16_t* v2683 = v2681.data(); + uint64_t v2684 = reinterpret_cast(v2683); + TASSIGN(v2682, v2684); + Tile v2685; + TASSIGN(v2685, v50); + Tile v2686; + __ubuf__ float* v2687 = v2685.data(); + uint64_t v2688 = reinterpret_cast(v2687); + TASSIGN(v2686, v2688); + Tile v2689; + TASSIGN(v2689, v51); + Tile v2690; + __ubuf__ float* v2691 = v2689.data(); + uint64_t v2692 = reinterpret_cast(v2691); + TASSIGN(v2690, v2692); + Tile v2693; + TASSIGN(v2693, v52); + Tile v2694; + __ubuf__ float* 
v2695 = v2693.data(); + uint64_t v2696 = reinterpret_cast(v2695); + TASSIGN(v2694, v2696); + Tile v2697; + TASSIGN(v2697, v49); + Tile v2698; + __ubuf__ float* v2699 = v2697.data(); + uint64_t v2700 = reinterpret_cast(v2699); + TASSIGN(v2698, v2700); + Tile v2701; + TASSIGN(v2701, v53); + Tile v2702; + __ubuf__ float* v2703 = v2701.data(); + uint64_t v2704 = reinterpret_cast(v2703); + TASSIGN(v2702, v2704); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2682, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2705 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2706 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2707 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v11 * (unsigned) v36 + v29 * (unsigned) v37), v2705, v2706); + TLOAD(v2690, v2707); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2686, v2682, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2694, v2686, v2690); + pipe_barrier(PIPE_ALL); + TROWSUM(v2702, v2694, v2698); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2597, v2597, v2702); + Tile v2708; + TASSIGN(v2708, v49); + Tile v2709; + __ubuf__ bfloat16_t* v2710 = v2708.data(); + uint64_t v2711 = reinterpret_cast(v2710); + TASSIGN(v2709, v2711); + Tile v2712; + TASSIGN(v2712, v50); + Tile v2713; + __ubuf__ float* v2714 = v2712.data(); + uint64_t v2715 = reinterpret_cast(v2714); + TASSIGN(v2713, v2715); + Tile v2716; + TASSIGN(v2716, v51); + Tile v2717; + __ubuf__ float* v2718 = v2716.data(); + uint64_t v2719 = reinterpret_cast(v2718); + TASSIGN(v2717, v2719); + Tile v2720; + TASSIGN(v2720, v52); + Tile v2721; + __ubuf__ float* v2722 = v2720.data(); + uint64_t v2723 = reinterpret_cast(v2722); + TASSIGN(v2721, v2723); + Tile v2724; + TASSIGN(v2724, v49); + Tile v2725; + __ubuf__ 
float* v2726 = v2724.data(); + uint64_t v2727 = reinterpret_cast(v2726); + TASSIGN(v2725, v2727); + Tile v2728; + TASSIGN(v2728, v53); + Tile v2729; + __ubuf__ float* v2730 = v2728.data(); + uint64_t v2731 = reinterpret_cast(v2730); + TASSIGN(v2729, v2731); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2709, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2732 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2733 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2734 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v11 * (unsigned) v36 + v28 * (unsigned) v37), v2732, v2733); + TLOAD(v2717, v2734); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2713, v2709, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2721, v2713, v2717); + pipe_barrier(PIPE_ALL); + TROWSUM(v2729, v2721, v2725); + pipe_barrier(PIPE_ALL); + TADD(v2597, v2597, v2729); + pipe_barrier(PIPE_ALL); + TMUL(v2597, v2597, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2735 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v2736 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2737 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v11 * (unsigned) v37), v2735, v2736); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2737, v2597); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2738; + TASSIGN(v2738, v44); + Tile v2739; + __ubuf__ float* v2740 = v2738.data(); + uint64_t v2741 = reinterpret_cast(v2740); + TASSIGN(v2739, v2741); + Tile v2742; + TASSIGN(v2742, v44); + Tile v2743; + __ubuf__ bfloat16_t* v2744 = v2742.data(); + uint64_t v2745 = reinterpret_cast(v2744); + TASSIGN(v2743, v2745); + Tile 
v2746; + TASSIGN(v2746, v46); + Tile v2747; + __ubuf__ float* v2748 = v2746.data(); + uint64_t v2749 = reinterpret_cast(v2748); + TASSIGN(v2747, v2749); + Tile v2750; + TASSIGN(v2750, v45); + Tile v2751; + __ubuf__ float* v2752 = v2750.data(); + uint64_t v2753 = reinterpret_cast(v2752); + TASSIGN(v2751, v2753); + Tile v2754; + TASSIGN(v2754, v48); + Tile v2755; + __ubuf__ float* v2756 = v2754.data(); + uint64_t v2757 = reinterpret_cast(v2756); + TASSIGN(v2755, v2757); + Tile v2758; + TASSIGN(v2758, v44); + Tile v2759; + __ubuf__ float* v2760 = v2758.data(); + uint64_t v2761 = reinterpret_cast(v2760); + TASSIGN(v2759, v2761); + Tile v2762; + TASSIGN(v2762, v47); + Tile v2763; + __ubuf__ float* v2764 = v2762.data(); + uint64_t v2765 = reinterpret_cast(v2764); + TASSIGN(v2763, v2765); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2743, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2766 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2767 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2768 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v10 * (unsigned) v36 + v33 * (unsigned) v37), v2766, v2767); + TLOAD(v2751, v2768); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2747, v2743, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2755, v2747, v2751); + pipe_barrier(PIPE_ALL); + TROWSUM(v2763, v2755, v2759); + pipe_barrier(PIPE_ALL); + TMOV(v2739, v2763); + Tile v2769; + TASSIGN(v2769, v49); + Tile v2770; + __ubuf__ bfloat16_t* v2771 = v2769.data(); + uint64_t v2772 = reinterpret_cast(v2771); + TASSIGN(v2770, v2772); + Tile v2773; + TASSIGN(v2773, v50); + Tile v2774; + __ubuf__ float* v2775 = v2773.data(); + uint64_t v2776 = reinterpret_cast(v2775); + TASSIGN(v2774, v2776); + Tile v2777; + TASSIGN(v2777, v51); + Tile 
v2778; + __ubuf__ float* v2779 = v2777.data(); + uint64_t v2780 = reinterpret_cast(v2779); + TASSIGN(v2778, v2780); + Tile v2781; + TASSIGN(v2781, v52); + Tile v2782; + __ubuf__ float* v2783 = v2781.data(); + uint64_t v2784 = reinterpret_cast(v2783); + TASSIGN(v2782, v2784); + Tile v2785; + TASSIGN(v2785, v49); + Tile v2786; + __ubuf__ float* v2787 = v2785.data(); + uint64_t v2788 = reinterpret_cast(v2787); + TASSIGN(v2786, v2788); + Tile v2789; + TASSIGN(v2789, v53); + Tile v2790; + __ubuf__ float* v2791 = v2789.data(); + uint64_t v2792 = reinterpret_cast(v2791); + TASSIGN(v2790, v2792); + TLOAD(v2770, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2793 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2794 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2795 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v10 * (unsigned) v36 + v31 * (unsigned) v37), v2793, v2794); + TLOAD(v2778, v2795); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2774, v2770, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2782, v2774, v2778); + pipe_barrier(PIPE_ALL); + TROWSUM(v2790, v2782, v2786); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2739, v2739, v2790); + Tile v2796; + TASSIGN(v2796, v49); + Tile v2797; + __ubuf__ bfloat16_t* v2798 = v2796.data(); + uint64_t v2799 = reinterpret_cast(v2798); + TASSIGN(v2797, v2799); + Tile v2800; + TASSIGN(v2800, v50); + Tile v2801; + __ubuf__ float* v2802 = v2800.data(); + uint64_t v2803 = reinterpret_cast(v2802); + TASSIGN(v2801, v2803); + Tile v2804; + TASSIGN(v2804, v51); + Tile v2805; + __ubuf__ float* v2806 = v2804.data(); + uint64_t v2807 = reinterpret_cast(v2806); + TASSIGN(v2805, v2807); + Tile v2808; + TASSIGN(v2808, v52); + Tile v2809; + __ubuf__ float* v2810 = 
v2808.data(); + uint64_t v2811 = reinterpret_cast(v2810); + TASSIGN(v2809, v2811); + Tile v2812; + TASSIGN(v2812, v49); + Tile v2813; + __ubuf__ float* v2814 = v2812.data(); + uint64_t v2815 = reinterpret_cast(v2814); + TASSIGN(v2813, v2815); + Tile v2816; + TASSIGN(v2816, v53); + Tile v2817; + __ubuf__ float* v2818 = v2816.data(); + uint64_t v2819 = reinterpret_cast(v2818); + TASSIGN(v2817, v2819); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2797, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2820 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2821 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2822 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v10 * (unsigned) v36 + v30 * (unsigned) v37), v2820, v2821); + TLOAD(v2805, v2822); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2801, v2797, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2809, v2801, v2805); + pipe_barrier(PIPE_ALL); + TROWSUM(v2817, v2809, v2813); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2739, v2739, v2817); + Tile v2823; + TASSIGN(v2823, v49); + Tile v2824; + __ubuf__ bfloat16_t* v2825 = v2823.data(); + uint64_t v2826 = reinterpret_cast(v2825); + TASSIGN(v2824, v2826); + Tile v2827; + TASSIGN(v2827, v50); + Tile v2828; + __ubuf__ float* v2829 = v2827.data(); + uint64_t v2830 = reinterpret_cast(v2829); + TASSIGN(v2828, v2830); + Tile v2831; + TASSIGN(v2831, v51); + Tile v2832; + __ubuf__ float* v2833 = v2831.data(); + uint64_t v2834 = reinterpret_cast(v2833); + TASSIGN(v2832, v2834); + Tile v2835; + TASSIGN(v2835, v52); + Tile v2836; + __ubuf__ float* v2837 = v2835.data(); + uint64_t v2838 = reinterpret_cast(v2837); + TASSIGN(v2836, v2838); + Tile v2839; + TASSIGN(v2839, v49); + Tile v2840; + __ubuf__ float* 
v2841 = v2839.data(); + uint64_t v2842 = reinterpret_cast(v2841); + TASSIGN(v2840, v2842); + Tile v2843; + TASSIGN(v2843, v53); + Tile v2844; + __ubuf__ float* v2845 = v2843.data(); + uint64_t v2846 = reinterpret_cast(v2845); + TASSIGN(v2844, v2846); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2824, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2847 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2848 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2849 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v10 * (unsigned) v36 + v29 * (unsigned) v37), v2847, v2848); + TLOAD(v2832, v2849); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2828, v2824, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2836, v2828, v2832); + pipe_barrier(PIPE_ALL); + TROWSUM(v2844, v2836, v2840); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2739, v2739, v2844); + Tile v2850; + TASSIGN(v2850, v49); + Tile v2851; + __ubuf__ bfloat16_t* v2852 = v2850.data(); + uint64_t v2853 = reinterpret_cast(v2852); + TASSIGN(v2851, v2853); + Tile v2854; + TASSIGN(v2854, v50); + Tile v2855; + __ubuf__ float* v2856 = v2854.data(); + uint64_t v2857 = reinterpret_cast(v2856); + TASSIGN(v2855, v2857); + Tile v2858; + TASSIGN(v2858, v51); + Tile v2859; + __ubuf__ float* v2860 = v2858.data(); + uint64_t v2861 = reinterpret_cast(v2860); + TASSIGN(v2859, v2861); + Tile v2862; + TASSIGN(v2862, v52); + Tile v2863; + __ubuf__ float* v2864 = v2862.data(); + uint64_t v2865 = reinterpret_cast(v2864); + TASSIGN(v2863, v2865); + Tile v2866; + TASSIGN(v2866, v49); + Tile v2867; + __ubuf__ float* v2868 = v2866.data(); + uint64_t v2869 = reinterpret_cast(v2868); + TASSIGN(v2867, v2869); + Tile v2870; + TASSIGN(v2870, v53); + Tile v2871; + __ubuf__ 
float* v2872 = v2870.data(); + uint64_t v2873 = reinterpret_cast(v2872); + TASSIGN(v2871, v2873); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2851, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2874 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2875 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2876 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v10 * (unsigned) v36 + v28 * (unsigned) v37), v2874, v2875); + TLOAD(v2859, v2876); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2855, v2851, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2863, v2855, v2859); + pipe_barrier(PIPE_ALL); + TROWSUM(v2871, v2863, v2867); + pipe_barrier(PIPE_ALL); + TADD(v2739, v2739, v2871); + pipe_barrier(PIPE_ALL); + TMUL(v2739, v2739, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v2877 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v2878 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v2879 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v10 * (unsigned) v37), v2877, v2878); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v2879, v2739); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v2880; + TASSIGN(v2880, v44); + Tile v2881; + __ubuf__ float* v2882 = v2880.data(); + uint64_t v2883 = reinterpret_cast(v2882); + TASSIGN(v2881, v2883); + Tile v2884; + TASSIGN(v2884, v44); + Tile v2885; + __ubuf__ bfloat16_t* v2886 = v2884.data(); + uint64_t v2887 = reinterpret_cast(v2886); + TASSIGN(v2885, v2887); + Tile v2888; + TASSIGN(v2888, v46); + Tile v2889; + __ubuf__ float* v2890 = v2888.data(); + uint64_t v2891 = reinterpret_cast(v2890); + TASSIGN(v2889, v2891); + Tile 
v2892; + TASSIGN(v2892, v45); + Tile v2893; + __ubuf__ float* v2894 = v2892.data(); + uint64_t v2895 = reinterpret_cast(v2894); + TASSIGN(v2893, v2895); + Tile v2896; + TASSIGN(v2896, v48); + Tile v2897; + __ubuf__ float* v2898 = v2896.data(); + uint64_t v2899 = reinterpret_cast(v2898); + TASSIGN(v2897, v2899); + Tile v2900; + TASSIGN(v2900, v44); + Tile v2901; + __ubuf__ float* v2902 = v2900.data(); + uint64_t v2903 = reinterpret_cast(v2902); + TASSIGN(v2901, v2903); + Tile v2904; + TASSIGN(v2904, v47); + Tile v2905; + __ubuf__ float* v2906 = v2904.data(); + uint64_t v2907 = reinterpret_cast(v2906); + TASSIGN(v2905, v2907); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v2885, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2908 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2909 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2910 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v9 * (unsigned) v36 + v33 * (unsigned) v37), v2908, v2909); + TLOAD(v2893, v2910); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2889, v2885, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2897, v2889, v2893); + pipe_barrier(PIPE_ALL); + TROWSUM(v2905, v2897, v2901); + pipe_barrier(PIPE_ALL); + TMOV(v2881, v2905); + Tile v2911; + TASSIGN(v2911, v49); + Tile v2912; + __ubuf__ bfloat16_t* v2913 = v2911.data(); + uint64_t v2914 = reinterpret_cast(v2913); + TASSIGN(v2912, v2914); + Tile v2915; + TASSIGN(v2915, v50); + Tile v2916; + __ubuf__ float* v2917 = v2915.data(); + uint64_t v2918 = reinterpret_cast(v2917); + TASSIGN(v2916, v2918); + Tile v2919; + TASSIGN(v2919, v51); + Tile v2920; + __ubuf__ float* v2921 = v2919.data(); + uint64_t v2922 = reinterpret_cast(v2921); + TASSIGN(v2920, v2922); + Tile v2923; + TASSIGN(v2923, v52); + Tile 
v2924; + __ubuf__ float* v2925 = v2923.data(); + uint64_t v2926 = reinterpret_cast(v2925); + TASSIGN(v2924, v2926); + Tile v2927; + TASSIGN(v2927, v49); + Tile v2928; + __ubuf__ float* v2929 = v2927.data(); + uint64_t v2930 = reinterpret_cast(v2929); + TASSIGN(v2928, v2930); + Tile v2931; + TASSIGN(v2931, v53); + Tile v2932; + __ubuf__ float* v2933 = v2931.data(); + uint64_t v2934 = reinterpret_cast(v2933); + TASSIGN(v2932, v2934); + TLOAD(v2912, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2935 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2936 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2937 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v9 * (unsigned) v36 + v31 * (unsigned) v37), v2935, v2936); + TLOAD(v2920, v2937); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2916, v2912, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2924, v2916, v2920); + pipe_barrier(PIPE_ALL); + TROWSUM(v2932, v2924, v2928); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2881, v2881, v2932); + Tile v2938; + TASSIGN(v2938, v49); + Tile v2939; + __ubuf__ bfloat16_t* v2940 = v2938.data(); + uint64_t v2941 = reinterpret_cast(v2940); + TASSIGN(v2939, v2941); + Tile v2942; + TASSIGN(v2942, v50); + Tile v2943; + __ubuf__ float* v2944 = v2942.data(); + uint64_t v2945 = reinterpret_cast(v2944); + TASSIGN(v2943, v2945); + Tile v2946; + TASSIGN(v2946, v51); + Tile v2947; + __ubuf__ float* v2948 = v2946.data(); + uint64_t v2949 = reinterpret_cast(v2948); + TASSIGN(v2947, v2949); + Tile v2950; + TASSIGN(v2950, v52); + Tile v2951; + __ubuf__ float* v2952 = v2950.data(); + uint64_t v2953 = reinterpret_cast(v2952); + TASSIGN(v2951, v2953); + Tile v2954; + TASSIGN(v2954, v49); + Tile v2955; + __ubuf__ float* v2956 = 
v2954.data(); + uint64_t v2957 = reinterpret_cast(v2956); + TASSIGN(v2955, v2957); + Tile v2958; + TASSIGN(v2958, v53); + Tile v2959; + __ubuf__ float* v2960 = v2958.data(); + uint64_t v2961 = reinterpret_cast(v2960); + TASSIGN(v2959, v2961); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2939, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2962 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2963 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2964 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v9 * (unsigned) v36 + v30 * (unsigned) v37), v2962, v2963); + TLOAD(v2947, v2964); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2943, v2939, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2951, v2943, v2947); + pipe_barrier(PIPE_ALL); + TROWSUM(v2959, v2951, v2955); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2881, v2881, v2959); + Tile v2965; + TASSIGN(v2965, v49); + Tile v2966; + __ubuf__ bfloat16_t* v2967 = v2965.data(); + uint64_t v2968 = reinterpret_cast(v2967); + TASSIGN(v2966, v2968); + Tile v2969; + TASSIGN(v2969, v50); + Tile v2970; + __ubuf__ float* v2971 = v2969.data(); + uint64_t v2972 = reinterpret_cast(v2971); + TASSIGN(v2970, v2972); + Tile v2973; + TASSIGN(v2973, v51); + Tile v2974; + __ubuf__ float* v2975 = v2973.data(); + uint64_t v2976 = reinterpret_cast(v2975); + TASSIGN(v2974, v2976); + Tile v2977; + TASSIGN(v2977, v52); + Tile v2978; + __ubuf__ float* v2979 = v2977.data(); + uint64_t v2980 = reinterpret_cast(v2979); + TASSIGN(v2978, v2980); + Tile v2981; + TASSIGN(v2981, v49); + Tile v2982; + __ubuf__ float* v2983 = v2981.data(); + uint64_t v2984 = reinterpret_cast(v2983); + TASSIGN(v2982, v2984); + Tile v2985; + TASSIGN(v2985, v53); + Tile v2986; + __ubuf__ float* 
v2987 = v2985.data(); + uint64_t v2988 = reinterpret_cast(v2987); + TASSIGN(v2986, v2988); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2966, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v2989 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v2990 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v2991 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v9 * (unsigned) v36 + v29 * (unsigned) v37), v2989, v2990); + TLOAD(v2974, v2991); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2970, v2966, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v2978, v2970, v2974); + pipe_barrier(PIPE_ALL); + TROWSUM(v2986, v2978, v2982); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v2881, v2881, v2986); + Tile v2992; + TASSIGN(v2992, v49); + Tile v2993; + __ubuf__ bfloat16_t* v2994 = v2992.data(); + uint64_t v2995 = reinterpret_cast(v2994); + TASSIGN(v2993, v2995); + Tile v2996; + TASSIGN(v2996, v50); + Tile v2997; + __ubuf__ float* v2998 = v2996.data(); + uint64_t v2999 = reinterpret_cast(v2998); + TASSIGN(v2997, v2999); + Tile v3000; + TASSIGN(v3000, v51); + Tile v3001; + __ubuf__ float* v3002 = v3000.data(); + uint64_t v3003 = reinterpret_cast(v3002); + TASSIGN(v3001, v3003); + Tile v3004; + TASSIGN(v3004, v52); + Tile v3005; + __ubuf__ float* v3006 = v3004.data(); + uint64_t v3007 = reinterpret_cast(v3006); + TASSIGN(v3005, v3007); + Tile v3008; + TASSIGN(v3008, v49); + Tile v3009; + __ubuf__ float* v3010 = v3008.data(); + uint64_t v3011 = reinterpret_cast(v3010); + TASSIGN(v3009, v3011); + Tile v3012; + TASSIGN(v3012, v53); + Tile v3013; + __ubuf__ float* v3014 = v3012.data(); + uint64_t v3015 = reinterpret_cast(v3014); + TASSIGN(v3013, v3015); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v2993, 
v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3016 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3017 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3018 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v9 * (unsigned) v36 + v28 * (unsigned) v37), v3016, v3017); + TLOAD(v3001, v3018); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v2997, v2993, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3005, v2997, v3001); + pipe_barrier(PIPE_ALL); + TROWSUM(v3013, v3005, v3009); + pipe_barrier(PIPE_ALL); + TADD(v2881, v2881, v3013); + pipe_barrier(PIPE_ALL); + TMUL(v2881, v2881, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v3019 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v3020 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v3021 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v9 * (unsigned) v37), v3019, v3020); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v3021, v2881); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v3022; + TASSIGN(v3022, v44); + Tile v3023; + __ubuf__ float* v3024 = v3022.data(); + uint64_t v3025 = reinterpret_cast(v3024); + TASSIGN(v3023, v3025); + Tile v3026; + TASSIGN(v3026, v44); + Tile v3027; + __ubuf__ bfloat16_t* v3028 = v3026.data(); + uint64_t v3029 = reinterpret_cast(v3028); + TASSIGN(v3027, v3029); + Tile v3030; + TASSIGN(v3030, v46); + Tile v3031; + __ubuf__ float* v3032 = v3030.data(); + uint64_t v3033 = reinterpret_cast(v3032); + TASSIGN(v3031, v3033); + Tile v3034; + TASSIGN(v3034, v45); + Tile v3035; + __ubuf__ float* v3036 = v3034.data(); + uint64_t v3037 = reinterpret_cast(v3036); + TASSIGN(v3035, v3037); + Tile 
v3038; + TASSIGN(v3038, v48); + Tile v3039; + __ubuf__ float* v3040 = v3038.data(); + uint64_t v3041 = reinterpret_cast(v3040); + TASSIGN(v3039, v3041); + Tile v3042; + TASSIGN(v3042, v44); + Tile v3043; + __ubuf__ float* v3044 = v3042.data(); + uint64_t v3045 = reinterpret_cast(v3044); + TASSIGN(v3043, v3045); + Tile v3046; + TASSIGN(v3046, v47); + Tile v3047; + __ubuf__ float* v3048 = v3046.data(); + uint64_t v3049 = reinterpret_cast(v3048); + TASSIGN(v3047, v3049); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v3027, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3050 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3051 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3052 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v8 * (unsigned) v36 + v33 * (unsigned) v37), v3050, v3051); + TLOAD(v3035, v3052); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3031, v3027, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3039, v3031, v3035); + pipe_barrier(PIPE_ALL); + TROWSUM(v3047, v3039, v3043); + pipe_barrier(PIPE_ALL); + TMOV(v3023, v3047); + Tile v3053; + TASSIGN(v3053, v49); + Tile v3054; + __ubuf__ bfloat16_t* v3055 = v3053.data(); + uint64_t v3056 = reinterpret_cast(v3055); + TASSIGN(v3054, v3056); + Tile v3057; + TASSIGN(v3057, v50); + Tile v3058; + __ubuf__ float* v3059 = v3057.data(); + uint64_t v3060 = reinterpret_cast(v3059); + TASSIGN(v3058, v3060); + Tile v3061; + TASSIGN(v3061, v51); + Tile v3062; + __ubuf__ float* v3063 = v3061.data(); + uint64_t v3064 = reinterpret_cast(v3063); + TASSIGN(v3062, v3064); + Tile v3065; + TASSIGN(v3065, v52); + Tile v3066; + __ubuf__ float* v3067 = v3065.data(); + uint64_t v3068 = reinterpret_cast(v3067); + TASSIGN(v3066, v3068); + Tile v3069; + TASSIGN(v3069, v49); + Tile 
v3070; + __ubuf__ float* v3071 = v3069.data(); + uint64_t v3072 = reinterpret_cast(v3071); + TASSIGN(v3070, v3072); + Tile v3073; + TASSIGN(v3073, v53); + Tile v3074; + __ubuf__ float* v3075 = v3073.data(); + uint64_t v3076 = reinterpret_cast(v3075); + TASSIGN(v3074, v3076); + TLOAD(v3054, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3077 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3078 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3079 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v8 * (unsigned) v36 + v31 * (unsigned) v37), v3077, v3078); + TLOAD(v3062, v3079); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3058, v3054, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3066, v3058, v3062); + pipe_barrier(PIPE_ALL); + TROWSUM(v3074, v3066, v3070); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3023, v3023, v3074); + Tile v3080; + TASSIGN(v3080, v49); + Tile v3081; + __ubuf__ bfloat16_t* v3082 = v3080.data(); + uint64_t v3083 = reinterpret_cast(v3082); + TASSIGN(v3081, v3083); + Tile v3084; + TASSIGN(v3084, v50); + Tile v3085; + __ubuf__ float* v3086 = v3084.data(); + uint64_t v3087 = reinterpret_cast(v3086); + TASSIGN(v3085, v3087); + Tile v3088; + TASSIGN(v3088, v51); + Tile v3089; + __ubuf__ float* v3090 = v3088.data(); + uint64_t v3091 = reinterpret_cast(v3090); + TASSIGN(v3089, v3091); + Tile v3092; + TASSIGN(v3092, v52); + Tile v3093; + __ubuf__ float* v3094 = v3092.data(); + uint64_t v3095 = reinterpret_cast(v3094); + TASSIGN(v3093, v3095); + Tile v3096; + TASSIGN(v3096, v49); + Tile v3097; + __ubuf__ float* v3098 = v3096.data(); + uint64_t v3099 = reinterpret_cast(v3098); + TASSIGN(v3097, v3099); + Tile v3100; + TASSIGN(v3100, v53); + Tile v3101; + __ubuf__ float* v3102 = 
v3100.data(); + uint64_t v3103 = reinterpret_cast(v3102); + TASSIGN(v3101, v3103); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3081, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3104 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3105 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3106 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v8 * (unsigned) v36 + v30 * (unsigned) v37), v3104, v3105); + TLOAD(v3089, v3106); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3085, v3081, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3093, v3085, v3089); + pipe_barrier(PIPE_ALL); + TROWSUM(v3101, v3093, v3097); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3023, v3023, v3101); + Tile v3107; + TASSIGN(v3107, v49); + Tile v3108; + __ubuf__ bfloat16_t* v3109 = v3107.data(); + uint64_t v3110 = reinterpret_cast(v3109); + TASSIGN(v3108, v3110); + Tile v3111; + TASSIGN(v3111, v50); + Tile v3112; + __ubuf__ float* v3113 = v3111.data(); + uint64_t v3114 = reinterpret_cast(v3113); + TASSIGN(v3112, v3114); + Tile v3115; + TASSIGN(v3115, v51); + Tile v3116; + __ubuf__ float* v3117 = v3115.data(); + uint64_t v3118 = reinterpret_cast(v3117); + TASSIGN(v3116, v3118); + Tile v3119; + TASSIGN(v3119, v52); + Tile v3120; + __ubuf__ float* v3121 = v3119.data(); + uint64_t v3122 = reinterpret_cast(v3121); + TASSIGN(v3120, v3122); + Tile v3123; + TASSIGN(v3123, v49); + Tile v3124; + __ubuf__ float* v3125 = v3123.data(); + uint64_t v3126 = reinterpret_cast(v3125); + TASSIGN(v3124, v3126); + Tile v3127; + TASSIGN(v3127, v53); + Tile v3128; + __ubuf__ float* v3129 = v3127.data(); + uint64_t v3130 = reinterpret_cast(v3129); + TASSIGN(v3128, v3130); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3108, v158); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3131 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3132 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3133 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v8 * (unsigned) v36 + v29 * (unsigned) v37), v3131, v3132); + TLOAD(v3116, v3133); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3112, v3108, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3120, v3112, v3116); + pipe_barrier(PIPE_ALL); + TROWSUM(v3128, v3120, v3124); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3023, v3023, v3128); + Tile v3134; + TASSIGN(v3134, v49); + Tile v3135; + __ubuf__ bfloat16_t* v3136 = v3134.data(); + uint64_t v3137 = reinterpret_cast(v3136); + TASSIGN(v3135, v3137); + Tile v3138; + TASSIGN(v3138, v50); + Tile v3139; + __ubuf__ float* v3140 = v3138.data(); + uint64_t v3141 = reinterpret_cast(v3140); + TASSIGN(v3139, v3141); + Tile v3142; + TASSIGN(v3142, v51); + Tile v3143; + __ubuf__ float* v3144 = v3142.data(); + uint64_t v3145 = reinterpret_cast(v3144); + TASSIGN(v3143, v3145); + Tile v3146; + TASSIGN(v3146, v52); + Tile v3147; + __ubuf__ float* v3148 = v3146.data(); + uint64_t v3149 = reinterpret_cast(v3148); + TASSIGN(v3147, v3149); + Tile v3150; + TASSIGN(v3150, v49); + Tile v3151; + __ubuf__ float* v3152 = v3150.data(); + uint64_t v3153 = reinterpret_cast(v3152); + TASSIGN(v3151, v3153); + Tile v3154; + TASSIGN(v3154, v53); + Tile v3155; + __ubuf__ float* v3156 = v3154.data(); + uint64_t v3157 = reinterpret_cast(v3156); + TASSIGN(v3155, v3157); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3135, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3158 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> 
v3159 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3160 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v8 * (unsigned) v36 + v28 * (unsigned) v37), v3158, v3159); + TLOAD(v3143, v3160); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3139, v3135, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3147, v3139, v3143); + pipe_barrier(PIPE_ALL); + TROWSUM(v3155, v3147, v3151); + pipe_barrier(PIPE_ALL); + TADD(v3023, v3023, v3155); + pipe_barrier(PIPE_ALL); + TMUL(v3023, v3023, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v3161 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v3162 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v3163 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v8 * (unsigned) v37), v3161, v3162); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v3163, v3023); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v3164; + TASSIGN(v3164, v44); + Tile v3165; + __ubuf__ float* v3166 = v3164.data(); + uint64_t v3167 = reinterpret_cast(v3166); + TASSIGN(v3165, v3167); + Tile v3168; + TASSIGN(v3168, v44); + Tile v3169; + __ubuf__ bfloat16_t* v3170 = v3168.data(); + uint64_t v3171 = reinterpret_cast(v3170); + TASSIGN(v3169, v3171); + Tile v3172; + TASSIGN(v3172, v46); + Tile v3173; + __ubuf__ float* v3174 = v3172.data(); + uint64_t v3175 = reinterpret_cast(v3174); + TASSIGN(v3173, v3175); + Tile v3176; + TASSIGN(v3176, v45); + Tile v3177; + __ubuf__ float* v3178 = v3176.data(); + uint64_t v3179 = reinterpret_cast(v3178); + TASSIGN(v3177, v3179); + Tile v3180; + TASSIGN(v3180, v48); + Tile v3181; + __ubuf__ float* v3182 = v3180.data(); + uint64_t v3183 = reinterpret_cast(v3182); + TASSIGN(v3181, v3183); + Tile 
v3184; + TASSIGN(v3184, v44); + Tile v3185; + __ubuf__ float* v3186 = v3184.data(); + uint64_t v3187 = reinterpret_cast(v3186); + TASSIGN(v3185, v3187); + Tile v3188; + TASSIGN(v3188, v47); + Tile v3189; + __ubuf__ float* v3190 = v3188.data(); + uint64_t v3191 = reinterpret_cast(v3190); + TASSIGN(v3189, v3191); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v3169, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3192 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3193 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3194 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v7 * (unsigned) v36 + v33 * (unsigned) v37), v3192, v3193); + TLOAD(v3177, v3194); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3173, v3169, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3181, v3173, v3177); + pipe_barrier(PIPE_ALL); + TROWSUM(v3189, v3181, v3185); + pipe_barrier(PIPE_ALL); + TMOV(v3165, v3189); + Tile v3195; + TASSIGN(v3195, v49); + Tile v3196; + __ubuf__ bfloat16_t* v3197 = v3195.data(); + uint64_t v3198 = reinterpret_cast(v3197); + TASSIGN(v3196, v3198); + Tile v3199; + TASSIGN(v3199, v50); + Tile v3200; + __ubuf__ float* v3201 = v3199.data(); + uint64_t v3202 = reinterpret_cast(v3201); + TASSIGN(v3200, v3202); + Tile v3203; + TASSIGN(v3203, v51); + Tile v3204; + __ubuf__ float* v3205 = v3203.data(); + uint64_t v3206 = reinterpret_cast(v3205); + TASSIGN(v3204, v3206); + Tile v3207; + TASSIGN(v3207, v52); + Tile v3208; + __ubuf__ float* v3209 = v3207.data(); + uint64_t v3210 = reinterpret_cast(v3209); + TASSIGN(v3208, v3210); + Tile v3211; + TASSIGN(v3211, v49); + Tile v3212; + __ubuf__ float* v3213 = v3211.data(); + uint64_t v3214 = reinterpret_cast(v3213); + TASSIGN(v3212, v3214); + Tile v3215; + TASSIGN(v3215, v53); + Tile 
v3216; + __ubuf__ float* v3217 = v3215.data(); + uint64_t v3218 = reinterpret_cast(v3217); + TASSIGN(v3216, v3218); + TLOAD(v3196, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3219 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3220 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3221 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v7 * (unsigned) v36 + v31 * (unsigned) v37), v3219, v3220); + TLOAD(v3204, v3221); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3200, v3196, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3208, v3200, v3204); + pipe_barrier(PIPE_ALL); + TROWSUM(v3216, v3208, v3212); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3165, v3165, v3216); + Tile v3222; + TASSIGN(v3222, v49); + Tile v3223; + __ubuf__ bfloat16_t* v3224 = v3222.data(); + uint64_t v3225 = reinterpret_cast(v3224); + TASSIGN(v3223, v3225); + Tile v3226; + TASSIGN(v3226, v50); + Tile v3227; + __ubuf__ float* v3228 = v3226.data(); + uint64_t v3229 = reinterpret_cast(v3228); + TASSIGN(v3227, v3229); + Tile v3230; + TASSIGN(v3230, v51); + Tile v3231; + __ubuf__ float* v3232 = v3230.data(); + uint64_t v3233 = reinterpret_cast(v3232); + TASSIGN(v3231, v3233); + Tile v3234; + TASSIGN(v3234, v52); + Tile v3235; + __ubuf__ float* v3236 = v3234.data(); + uint64_t v3237 = reinterpret_cast(v3236); + TASSIGN(v3235, v3237); + Tile v3238; + TASSIGN(v3238, v49); + Tile v3239; + __ubuf__ float* v3240 = v3238.data(); + uint64_t v3241 = reinterpret_cast(v3240); + TASSIGN(v3239, v3241); + Tile v3242; + TASSIGN(v3242, v53); + Tile v3243; + __ubuf__ float* v3244 = v3242.data(); + uint64_t v3245 = reinterpret_cast(v3244); + TASSIGN(v3243, v3245); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3223, v135); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3246 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3247 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3248 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v7 * (unsigned) v36 + v30 * (unsigned) v37), v3246, v3247); + TLOAD(v3231, v3248); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3227, v3223, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3235, v3227, v3231); + pipe_barrier(PIPE_ALL); + TROWSUM(v3243, v3235, v3239); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3165, v3165, v3243); + Tile v3249; + TASSIGN(v3249, v49); + Tile v3250; + __ubuf__ bfloat16_t* v3251 = v3249.data(); + uint64_t v3252 = reinterpret_cast(v3251); + TASSIGN(v3250, v3252); + Tile v3253; + TASSIGN(v3253, v50); + Tile v3254; + __ubuf__ float* v3255 = v3253.data(); + uint64_t v3256 = reinterpret_cast(v3255); + TASSIGN(v3254, v3256); + Tile v3257; + TASSIGN(v3257, v51); + Tile v3258; + __ubuf__ float* v3259 = v3257.data(); + uint64_t v3260 = reinterpret_cast(v3259); + TASSIGN(v3258, v3260); + Tile v3261; + TASSIGN(v3261, v52); + Tile v3262; + __ubuf__ float* v3263 = v3261.data(); + uint64_t v3264 = reinterpret_cast(v3263); + TASSIGN(v3262, v3264); + Tile v3265; + TASSIGN(v3265, v49); + Tile v3266; + __ubuf__ float* v3267 = v3265.data(); + uint64_t v3268 = reinterpret_cast(v3267); + TASSIGN(v3266, v3268); + Tile v3269; + TASSIGN(v3269, v53); + Tile v3270; + __ubuf__ float* v3271 = v3269.data(); + uint64_t v3272 = reinterpret_cast(v3271); + TASSIGN(v3270, v3272); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3250, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3273 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> 
v3274 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3275 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v7 * (unsigned) v36 + v29 * (unsigned) v37), v3273, v3274); + TLOAD(v3258, v3275); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3254, v3250, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3262, v3254, v3258); + pipe_barrier(PIPE_ALL); + TROWSUM(v3270, v3262, v3266); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3165, v3165, v3270); + Tile v3276; + TASSIGN(v3276, v49); + Tile v3277; + __ubuf__ bfloat16_t* v3278 = v3276.data(); + uint64_t v3279 = reinterpret_cast(v3278); + TASSIGN(v3277, v3279); + Tile v3280; + TASSIGN(v3280, v50); + Tile v3281; + __ubuf__ float* v3282 = v3280.data(); + uint64_t v3283 = reinterpret_cast(v3282); + TASSIGN(v3281, v3283); + Tile v3284; + TASSIGN(v3284, v51); + Tile v3285; + __ubuf__ float* v3286 = v3284.data(); + uint64_t v3287 = reinterpret_cast(v3286); + TASSIGN(v3285, v3287); + Tile v3288; + TASSIGN(v3288, v52); + Tile v3289; + __ubuf__ float* v3290 = v3288.data(); + uint64_t v3291 = reinterpret_cast(v3290); + TASSIGN(v3289, v3291); + Tile v3292; + TASSIGN(v3292, v49); + Tile v3293; + __ubuf__ float* v3294 = v3292.data(); + uint64_t v3295 = reinterpret_cast(v3294); + TASSIGN(v3293, v3295); + Tile v3296; + TASSIGN(v3296, v53); + Tile v3297; + __ubuf__ float* v3298 = v3296.data(); + uint64_t v3299 = reinterpret_cast(v3298); + TASSIGN(v3297, v3299); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3277, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3300 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3301 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3302 = GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v7 * (unsigned) v36 + v28 * (unsigned) v37), v3300, v3301); + TLOAD(v3285, v3302); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3281, v3277, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3289, v3281, v3285); + pipe_barrier(PIPE_ALL); + TROWSUM(v3297, v3289, v3293); + pipe_barrier(PIPE_ALL); + TADD(v3165, v3165, v3297); + pipe_barrier(PIPE_ALL); + TMUL(v3165, v3165, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v3303 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v3304 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v3305 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v7 * (unsigned) v37), v3303, v3304); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v3305, v3165); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v3306; + TASSIGN(v3306, v44); + Tile v3307; + __ubuf__ float* v3308 = v3306.data(); + uint64_t v3309 = reinterpret_cast(v3308); + TASSIGN(v3307, v3309); + Tile v3310; + TASSIGN(v3310, v44); + Tile v3311; + __ubuf__ bfloat16_t* v3312 = v3310.data(); + uint64_t v3313 = reinterpret_cast(v3312); + TASSIGN(v3311, v3313); + Tile v3314; + TASSIGN(v3314, v46); + Tile v3315; + __ubuf__ float* v3316 = v3314.data(); + uint64_t v3317 = reinterpret_cast(v3316); + TASSIGN(v3315, v3317); + Tile v3318; + TASSIGN(v3318, v45); + Tile v3319; + __ubuf__ float* v3320 = v3318.data(); + uint64_t v3321 = reinterpret_cast(v3320); + TASSIGN(v3319, v3321); + Tile v3322; + TASSIGN(v3322, v48); + Tile v3323; + __ubuf__ float* v3324 = v3322.data(); + uint64_t v3325 = reinterpret_cast(v3324); + TASSIGN(v3323, v3325); + Tile v3326; + TASSIGN(v3326, v44); + Tile v3327; + __ubuf__ float* v3328 = v3326.data(); + uint64_t v3329 = reinterpret_cast(v3328); + TASSIGN(v3327, 
v3329); + Tile v3330; + TASSIGN(v3330, v47); + Tile v3331; + __ubuf__ float* v3332 = v3330.data(); + uint64_t v3333 = reinterpret_cast(v3332); + TASSIGN(v3331, v3333); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v3311, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3334 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3335 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3336 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v6 * (unsigned) v36 + v33 * (unsigned) v37), v3334, v3335); + TLOAD(v3319, v3336); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3315, v3311, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3323, v3315, v3319); + pipe_barrier(PIPE_ALL); + TROWSUM(v3331, v3323, v3327); + pipe_barrier(PIPE_ALL); + TMOV(v3307, v3331); + Tile v3337; + TASSIGN(v3337, v49); + Tile v3338; + __ubuf__ bfloat16_t* v3339 = v3337.data(); + uint64_t v3340 = reinterpret_cast(v3339); + TASSIGN(v3338, v3340); + Tile v3341; + TASSIGN(v3341, v50); + Tile v3342; + __ubuf__ float* v3343 = v3341.data(); + uint64_t v3344 = reinterpret_cast(v3343); + TASSIGN(v3342, v3344); + Tile v3345; + TASSIGN(v3345, v51); + Tile v3346; + __ubuf__ float* v3347 = v3345.data(); + uint64_t v3348 = reinterpret_cast(v3347); + TASSIGN(v3346, v3348); + Tile v3349; + TASSIGN(v3349, v52); + Tile v3350; + __ubuf__ float* v3351 = v3349.data(); + uint64_t v3352 = reinterpret_cast(v3351); + TASSIGN(v3350, v3352); + Tile v3353; + TASSIGN(v3353, v49); + Tile v3354; + __ubuf__ float* v3355 = v3353.data(); + uint64_t v3356 = reinterpret_cast(v3355); + TASSIGN(v3354, v3356); + Tile v3357; + TASSIGN(v3357, v53); + Tile v3358; + __ubuf__ float* v3359 = v3357.data(); + uint64_t v3360 = reinterpret_cast(v3359); + TASSIGN(v3358, v3360); + TLOAD(v3338, v112); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3361 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3362 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3363 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v6 * (unsigned) v36 + v31 * (unsigned) v37), v3361, v3362); + TLOAD(v3346, v3363); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3342, v3338, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3350, v3342, v3346); + pipe_barrier(PIPE_ALL); + TROWSUM(v3358, v3350, v3354); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3307, v3307, v3358); + Tile v3364; + TASSIGN(v3364, v49); + Tile v3365; + __ubuf__ bfloat16_t* v3366 = v3364.data(); + uint64_t v3367 = reinterpret_cast(v3366); + TASSIGN(v3365, v3367); + Tile v3368; + TASSIGN(v3368, v50); + Tile v3369; + __ubuf__ float* v3370 = v3368.data(); + uint64_t v3371 = reinterpret_cast(v3370); + TASSIGN(v3369, v3371); + Tile v3372; + TASSIGN(v3372, v51); + Tile v3373; + __ubuf__ float* v3374 = v3372.data(); + uint64_t v3375 = reinterpret_cast(v3374); + TASSIGN(v3373, v3375); + Tile v3376; + TASSIGN(v3376, v52); + Tile v3377; + __ubuf__ float* v3378 = v3376.data(); + uint64_t v3379 = reinterpret_cast(v3378); + TASSIGN(v3377, v3379); + Tile v3380; + TASSIGN(v3380, v49); + Tile v3381; + __ubuf__ float* v3382 = v3380.data(); + uint64_t v3383 = reinterpret_cast(v3382); + TASSIGN(v3381, v3383); + Tile v3384; + TASSIGN(v3384, v53); + Tile v3385; + __ubuf__ float* v3386 = v3384.data(); + uint64_t v3387 = reinterpret_cast(v3386); + TASSIGN(v3385, v3387); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3365, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3388 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> 
v3389 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3390 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v6 * (unsigned) v36 + v30 * (unsigned) v37), v3388, v3389); + TLOAD(v3373, v3390); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3369, v3365, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3377, v3369, v3373); + pipe_barrier(PIPE_ALL); + TROWSUM(v3385, v3377, v3381); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3307, v3307, v3385); + Tile v3391; + TASSIGN(v3391, v49); + Tile v3392; + __ubuf__ bfloat16_t* v3393 = v3391.data(); + uint64_t v3394 = reinterpret_cast(v3393); + TASSIGN(v3392, v3394); + Tile v3395; + TASSIGN(v3395, v50); + Tile v3396; + __ubuf__ float* v3397 = v3395.data(); + uint64_t v3398 = reinterpret_cast(v3397); + TASSIGN(v3396, v3398); + Tile v3399; + TASSIGN(v3399, v51); + Tile v3400; + __ubuf__ float* v3401 = v3399.data(); + uint64_t v3402 = reinterpret_cast(v3401); + TASSIGN(v3400, v3402); + Tile v3403; + TASSIGN(v3403, v52); + Tile v3404; + __ubuf__ float* v3405 = v3403.data(); + uint64_t v3406 = reinterpret_cast(v3405); + TASSIGN(v3404, v3406); + Tile v3407; + TASSIGN(v3407, v49); + Tile v3408; + __ubuf__ float* v3409 = v3407.data(); + uint64_t v3410 = reinterpret_cast(v3409); + TASSIGN(v3408, v3410); + Tile v3411; + TASSIGN(v3411, v53); + Tile v3412; + __ubuf__ float* v3413 = v3411.data(); + uint64_t v3414 = reinterpret_cast(v3413); + TASSIGN(v3412, v3414); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3392, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3415 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3416 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3417 = GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v6 * (unsigned) v36 + v29 * (unsigned) v37), v3415, v3416); + TLOAD(v3400, v3417); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3396, v3392, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3404, v3396, v3400); + pipe_barrier(PIPE_ALL); + TROWSUM(v3412, v3404, v3408); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3307, v3307, v3412); + Tile v3418; + TASSIGN(v3418, v49); + Tile v3419; + __ubuf__ bfloat16_t* v3420 = v3418.data(); + uint64_t v3421 = reinterpret_cast(v3420); + TASSIGN(v3419, v3421); + Tile v3422; + TASSIGN(v3422, v50); + Tile v3423; + __ubuf__ float* v3424 = v3422.data(); + uint64_t v3425 = reinterpret_cast(v3424); + TASSIGN(v3423, v3425); + Tile v3426; + TASSIGN(v3426, v51); + Tile v3427; + __ubuf__ float* v3428 = v3426.data(); + uint64_t v3429 = reinterpret_cast(v3428); + TASSIGN(v3427, v3429); + Tile v3430; + TASSIGN(v3430, v52); + Tile v3431; + __ubuf__ float* v3432 = v3430.data(); + uint64_t v3433 = reinterpret_cast(v3432); + TASSIGN(v3431, v3433); + Tile v3434; + TASSIGN(v3434, v49); + Tile v3435; + __ubuf__ float* v3436 = v3434.data(); + uint64_t v3437 = reinterpret_cast(v3436); + TASSIGN(v3435, v3437); + Tile v3438; + TASSIGN(v3438, v53); + Tile v3439; + __ubuf__ float* v3440 = v3438.data(); + uint64_t v3441 = reinterpret_cast(v3440); + TASSIGN(v3439, v3441); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3419, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3442 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3443 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3444 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v6 * (unsigned) v36 + v28 * (unsigned) v37), v3442, v3443); + TLOAD(v3427, v3444); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3423, v3419, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3431, v3423, v3427); + pipe_barrier(PIPE_ALL); + TROWSUM(v3439, v3431, v3435); + pipe_barrier(PIPE_ALL); + TADD(v3307, v3307, v3439); + pipe_barrier(PIPE_ALL); + TMUL(v3307, v3307, v64); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v3445 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v3446 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v3447 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v6 * (unsigned) v37), v3445, v3446); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v3447, v3307); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + Tile v3448; + TASSIGN(v3448, v44); + Tile v3449; + __ubuf__ float* v3450 = v3448.data(); + uint64_t v3451 = reinterpret_cast(v3450); + TASSIGN(v3449, v3451); + Tile v3452; + TASSIGN(v3452, v44); + Tile v3453; + __ubuf__ bfloat16_t* v3454 = v3452.data(); + uint64_t v3455 = reinterpret_cast(v3454); + TASSIGN(v3453, v3455); + Tile v3456; + TASSIGN(v3456, v46); + Tile v3457; + __ubuf__ float* v3458 = v3456.data(); + uint64_t v3459 = reinterpret_cast(v3458); + TASSIGN(v3457, v3459); + Tile v3460; + TASSIGN(v3460, v45); + Tile v3461; + __ubuf__ float* v3462 = v3460.data(); + uint64_t v3463 = reinterpret_cast(v3462); + TASSIGN(v3461, v3463); + Tile v3464; + TASSIGN(v3464, v48); + Tile v3465; + __ubuf__ float* v3466 = v3464.data(); + uint64_t v3467 = reinterpret_cast(v3466); + TASSIGN(v3465, v3467); + Tile v3468; + TASSIGN(v3468, v44); + Tile v3469; + __ubuf__ float* v3470 = v3468.data(); + uint64_t v3471 = reinterpret_cast(v3470); + TASSIGN(v3469, v3471); + Tile v3472; + TASSIGN(v3472, v47); + Tile v3473; + __ubuf__ float* v3474 = v3472.data(); + uint64_t v3475 = reinterpret_cast(v3474); + 
TASSIGN(v3473, v3475); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v3453, v89); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3476 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3477 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3478 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v5 * (unsigned) v36 + v33 * (unsigned) v37), v3476, v3477); + TLOAD(v3461, v3478); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3457, v3453, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3465, v3457, v3461); + pipe_barrier(PIPE_ALL); + TROWSUM(v3473, v3465, v3469); + pipe_barrier(PIPE_ALL); + TMOV(v3449, v3473); + Tile v3479; + TASSIGN(v3479, v49); + Tile v3480; + __ubuf__ bfloat16_t* v3481 = v3479.data(); + uint64_t v3482 = reinterpret_cast(v3481); + TASSIGN(v3480, v3482); + Tile v3483; + TASSIGN(v3483, v50); + Tile v3484; + __ubuf__ float* v3485 = v3483.data(); + uint64_t v3486 = reinterpret_cast(v3485); + TASSIGN(v3484, v3486); + Tile v3487; + TASSIGN(v3487, v51); + Tile v3488; + __ubuf__ float* v3489 = v3487.data(); + uint64_t v3490 = reinterpret_cast(v3489); + TASSIGN(v3488, v3490); + Tile v3491; + TASSIGN(v3491, v52); + Tile v3492; + __ubuf__ float* v3493 = v3491.data(); + uint64_t v3494 = reinterpret_cast(v3493); + TASSIGN(v3492, v3494); + Tile v3495; + TASSIGN(v3495, v49); + Tile v3496; + __ubuf__ float* v3497 = v3495.data(); + uint64_t v3498 = reinterpret_cast(v3497); + TASSIGN(v3496, v3498); + Tile v3499; + TASSIGN(v3499, v53); + Tile v3500; + __ubuf__ float* v3501 = v3499.data(); + uint64_t v3502 = reinterpret_cast(v3501); + TASSIGN(v3500, v3502); + TLOAD(v3480, v112); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3503 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> 
v3504 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3505 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v5 * (unsigned) v36 + v31 * (unsigned) v37), v3503, v3504); + TLOAD(v3488, v3505); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3484, v3480, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3492, v3484, v3488); + pipe_barrier(PIPE_ALL); + TROWSUM(v3500, v3492, v3496); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3449, v3449, v3500); + Tile v3506; + TASSIGN(v3506, v49); + Tile v3507; + __ubuf__ bfloat16_t* v3508 = v3506.data(); + uint64_t v3509 = reinterpret_cast(v3508); + TASSIGN(v3507, v3509); + Tile v3510; + TASSIGN(v3510, v50); + Tile v3511; + __ubuf__ float* v3512 = v3510.data(); + uint64_t v3513 = reinterpret_cast(v3512); + TASSIGN(v3511, v3513); + Tile v3514; + TASSIGN(v3514, v51); + Tile v3515; + __ubuf__ float* v3516 = v3514.data(); + uint64_t v3517 = reinterpret_cast(v3516); + TASSIGN(v3515, v3517); + Tile v3518; + TASSIGN(v3518, v52); + Tile v3519; + __ubuf__ float* v3520 = v3518.data(); + uint64_t v3521 = reinterpret_cast(v3520); + TASSIGN(v3519, v3521); + Tile v3522; + TASSIGN(v3522, v49); + Tile v3523; + __ubuf__ float* v3524 = v3522.data(); + uint64_t v3525 = reinterpret_cast(v3524); + TASSIGN(v3523, v3525); + Tile v3526; + TASSIGN(v3526, v53); + Tile v3527; + __ubuf__ float* v3528 = v3526.data(); + uint64_t v3529 = reinterpret_cast(v3528); + TASSIGN(v3527, v3529); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3507, v135); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3530 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3531 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3532 = GlobalTensor, 
pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v5 * (unsigned) v36 + v30 * (unsigned) v37), v3530, v3531); + TLOAD(v3515, v3532); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3511, v3507, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3519, v3511, v3515); + pipe_barrier(PIPE_ALL); + TROWSUM(v3527, v3519, v3523); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3449, v3449, v3527); + Tile v3533; + TASSIGN(v3533, v49); + Tile v3534; + __ubuf__ bfloat16_t* v3535 = v3533.data(); + uint64_t v3536 = reinterpret_cast(v3535); + TASSIGN(v3534, v3536); + Tile v3537; + TASSIGN(v3537, v50); + Tile v3538; + __ubuf__ float* v3539 = v3537.data(); + uint64_t v3540 = reinterpret_cast(v3539); + TASSIGN(v3538, v3540); + Tile v3541; + TASSIGN(v3541, v51); + Tile v3542; + __ubuf__ float* v3543 = v3541.data(); + uint64_t v3544 = reinterpret_cast(v3543); + TASSIGN(v3542, v3544); + Tile v3545; + TASSIGN(v3545, v52); + Tile v3546; + __ubuf__ float* v3547 = v3545.data(); + uint64_t v3548 = reinterpret_cast(v3547); + TASSIGN(v3546, v3548); + Tile v3549; + TASSIGN(v3549, v49); + Tile v3550; + __ubuf__ float* v3551 = v3549.data(); + uint64_t v3552 = reinterpret_cast(v3551); + TASSIGN(v3550, v3552); + Tile v3553; + TASSIGN(v3553, v53); + Tile v3554; + __ubuf__ float* v3555 = v3553.data(); + uint64_t v3556 = reinterpret_cast(v3555); + TASSIGN(v3554, v3556); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3534, v158); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3557 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3558 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3559 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v5 * (unsigned) v36 + v29 * (unsigned) v37), v3557, v3558); + TLOAD(v3542, v3559); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3538, v3534, v32); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3546, v3538, v3542); + pipe_barrier(PIPE_ALL); + TROWSUM(v3554, v3546, v3550); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + pipe_barrier(PIPE_ALL); + TADD(v3449, v3449, v3554); + Tile v3560; + TASSIGN(v3560, v49); + Tile v3561; + __ubuf__ bfloat16_t* v3562 = v3560.data(); + uint64_t v3563 = reinterpret_cast(v3562); + TASSIGN(v3561, v3563); + Tile v3564; + TASSIGN(v3564, v50); + Tile v3565; + __ubuf__ float* v3566 = v3564.data(); + uint64_t v3567 = reinterpret_cast(v3566); + TASSIGN(v3565, v3567); + Tile v3568; + TASSIGN(v3568, v51); + Tile v3569; + __ubuf__ float* v3570 = v3568.data(); + uint64_t v3571 = reinterpret_cast(v3570); + TASSIGN(v3569, v3571); + Tile v3572; + TASSIGN(v3572, v52); + Tile v3573; + __ubuf__ float* v3574 = v3572.data(); + uint64_t v3575 = reinterpret_cast(v3574); + TASSIGN(v3573, v3575); + Tile v3576; + TASSIGN(v3576, v49); + Tile v3577; + __ubuf__ float* v3578 = v3576.data(); + uint64_t v3579 = reinterpret_cast(v3578); + TASSIGN(v3577, v3579); + Tile v3580; + TASSIGN(v3580, v53); + Tile v3581; + __ubuf__ float* v3582 = v3580.data(); + uint64_t v3583 = reinterpret_cast(v3582); + TASSIGN(v3581, v3583); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v3561, v181); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1024> v3584 = pto::Shape<1, 1, 1, 1, 1024>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v3585 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v3586 = GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>(v2 + (v33 + v5 * (unsigned) v36 + v28 * (unsigned) v37), v3584, v3585); + TLOAD(v3569, v3586); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v3565, v3561, v32); + pipe_barrier(PIPE_ALL); + 
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v3573, v3565, v3569); + pipe_barrier(PIPE_ALL); + TROWSUM(v3581, v3573, v3577); + pipe_barrier(PIPE_ALL); + TADD(v3449, v3449, v3581); + pipe_barrier(PIPE_ALL); + TMUL(v3449, v3449, v64); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v3587 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<24, 24, 24, 24, 1> v3588 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v3589 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v33 + (unsigned) v62 * (unsigned) v35 + v5 * (unsigned) v37), v3587, v3588); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v3589, v3449); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/pre_split_mixes_bwd.cpp b/kernels/manual/a5/mhc/pre_split_mixes_bwd.cpp new file mode 100644 index 000000000..533a3bac8 --- /dev/null +++ b/kernels/manual/a5/mhc/pre_split_mixes_bwd.cpp @@ -0,0 +1,462 @@ +#include "pto/pto-inst.hpp" +using namespace pto; +/* Selector for the synchronization that ptoas_auto_sync_tail() issues. kBarrierAll (0) requests pipe_barrier(PIPE_ALL); kSetWaitMte3ToSEvent0 (1) requests a set_flag/wait_flag pair on (PIPE_MTE3, PIPE_S, EVENT_ID0). */ +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; +/* Issues the synchronization chosen by `mode`, which defaults to kBarrierAll. For kSetWaitMte3ToSEvent0 it calls set_flag and then wait_flag with (PIPE_MTE3, PIPE_S, EVENT_ID0). For kBarrierAll, and for any value that matches no enumerator (the shared `default:` label), it calls pipe_barrier(PIPE_ALL). Returns nothing. NOTE(review): the `ptoas_` prefix suggests this helper is emitted by a code generator and is intended as the final call of a kernel body; that is inferred from the name and from the call that ends the preceding kernel in this patch, so confirm before hand-editing. */ +static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_pre_split_mixes_bwd_m4(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, __gm__ float* v5, __gm__ float* v6, 
__gm__ float* v7, __gm__ float* v8, __gm__ float* v9, __gm__ float* v10, int32_t v11) { + unsigned v12 = 8; + unsigned v13 = 4; + unsigned v14 = 24; + unsigned v15 = 2; + unsigned v16 = 1; + unsigned v17 = 0; + const int32_t v18 = 1; + const int32_t v19 = 0; + const int32_t v20 = 4; + const int32_t v21 = 16; + const int32_t v22 = 24; + const int32_t v23 = 3; + const float v24 = 0.0f; + const float v25 = -1.0f; + const float v26 = 1.0f; + const float v27 = 0.5f; + const int64_t v28 = 544; + const int64_t v29 = 96; + const int64_t v30 = 0; + const int64_t v31 = 32; + const int64_t v32 = 64; + const int64_t v33 = 192; + const int64_t v34 = 224; + const int64_t v35 = 256; + const int64_t v36 = 640; + const int64_t v37 = 672; + const int64_t v38 = 704; + const int64_t v39 = 768; + const int64_t v40 = 800; + const int64_t v41 = 832; + const int64_t v42 = 864; + const int64_t v43 = 896; + const int64_t v44 = 928; + const int64_t v45 = 960; + const int64_t v46 = 992; + const int64_t v47 = 1024; + const int64_t v48 = 1056; + const int64_t v49 = 1120; + const int64_t v50 = 1152; + const int64_t v51 = 1216; + const int64_t v52 = 1248; + const int64_t v53 = 1312; + const int64_t v54 = 320; + const int64_t v55 = 352; + const int64_t v56 = 384; + const int64_t v57 = 416; + const int64_t v58 = 448; + const int64_t v59 = 480; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile v60; + TASSIGN(v60, v28); + Tile v61; + TASSIGN(v61, v29); + Tile v62; + TASSIGN(v62, v30); + Tile v63; + __ubuf__ float* v64 = v62.data(); + uint64_t v65 = reinterpret_cast(v64); + TASSIGN(v63, v65); + Tile v66; + TASSIGN(v66, v31); + Tile v67; + __ubuf__ float* v68 = v66.data(); + uint64_t v69 = reinterpret_cast(v68); + TASSIGN(v67, v69); + Tile v70; + TASSIGN(v70, v32); + Tile v71; + __ubuf__ float* v72 = v70.data(); + uint64_t v73 = reinterpret_cast(v72); + TASSIGN(v71, v73); + Tile v74; + TASSIGN(v74, v33); + Tile v75; + __ubuf__ float* v76 = v74.data(); 
+ uint64_t v77 = reinterpret_cast(v76); + TASSIGN(v75, v77); + Tile v78; + TASSIGN(v78, v34); + Tile v79; + __ubuf__ float* v80 = v78.data(); + uint64_t v81 = reinterpret_cast(v80); + TASSIGN(v79, v81); + Tile v82; + TASSIGN(v82, v35); + Tile v83; + TASSIGN(v83, v36); + Tile v84; + __ubuf__ float* v85 = v83.data(); + uint64_t v86 = reinterpret_cast(v85); + TASSIGN(v84, v86); + Tile v87; + TASSIGN(v87, v37); + Tile v88; + __ubuf__ float* v89 = v87.data(); + uint64_t v90 = reinterpret_cast(v89); + TASSIGN(v88, v90); + Tile v91; + TASSIGN(v91, v38); + Tile v92; + TASSIGN(v92, v39); + Tile v93; + __ubuf__ float* v94 = v92.data(); + uint64_t v95 = reinterpret_cast(v94); + TASSIGN(v93, v95); + Tile v96; + TASSIGN(v96, v40); + Tile v97; + __ubuf__ float* v98 = v96.data(); + uint64_t v99 = reinterpret_cast(v98); + TASSIGN(v97, v99); + Tile v100; + TASSIGN(v100, v41); + Tile v101; + __ubuf__ float* v102 = v100.data(); + uint64_t v103 = reinterpret_cast(v102); + TASSIGN(v101, v103); + Tile v104; + TASSIGN(v104, v42); + Tile v105; + __ubuf__ float* v106 = v104.data(); + uint64_t v107 = reinterpret_cast(v106); + TASSIGN(v105, v107); + Tile v108; + TASSIGN(v108, v43); + Tile v109; + __ubuf__ float* v110 = v108.data(); + uint64_t v111 = reinterpret_cast(v110); + TASSIGN(v109, v111); + Tile v112; + TASSIGN(v112, v44); + Tile v113; + __ubuf__ float* v114 = v112.data(); + uint64_t v115 = reinterpret_cast(v114); + TASSIGN(v113, v115); + Tile v116; + TASSIGN(v116, v45); + Tile v117; + __ubuf__ float* v118 = v116.data(); + uint64_t v119 = reinterpret_cast(v118); + TASSIGN(v117, v119); + Tile v120; + TASSIGN(v120, v46); + Tile v121; + __ubuf__ float* v122 = v120.data(); + uint64_t v123 = reinterpret_cast(v122); + TASSIGN(v121, v123); + Tile v124; + TASSIGN(v124, v47); + Tile v125; + __ubuf__ float* v126 = v124.data(); + uint64_t v127 = reinterpret_cast(v126); + TASSIGN(v125, v127); + Tile v128; + TASSIGN(v128, v48); + Tile v129; + TASSIGN(v129, v49); + Tile v130; + __ubuf__ float* v131 
= v129.data(); + uint64_t v132 = reinterpret_cast(v131); + TASSIGN(v130, v132); + Tile v133; + TASSIGN(v133, v50); + Tile v134; + TASSIGN(v134, v51); + Tile v135; + __ubuf__ float* v136 = v134.data(); + uint64_t v137 = reinterpret_cast(v136); + TASSIGN(v135, v137); + Tile v138; + TASSIGN(v138, v52); + Tile v139; + TASSIGN(v139, v53); + Tile v140; + __ubuf__ float* v141 = v139.data(); + uint64_t v142 = reinterpret_cast(v141); + TASSIGN(v140, v142); + Tile v143; + TASSIGN(v143, v54); + Tile v144; + __ubuf__ float* v145 = v143.data(); + uint64_t v146 = reinterpret_cast(v145); + TASSIGN(v144, v146); + Tile v147; + TASSIGN(v147, v55); + Tile v148; + __ubuf__ float* v149 = v147.data(); + uint64_t v150 = reinterpret_cast(v149); + TASSIGN(v148, v150); + Tile v151; + TASSIGN(v151, v56); + Tile v152; + __ubuf__ float* v153 = v151.data(); + uint64_t v154 = reinterpret_cast(v153); + TASSIGN(v152, v154); + Tile v155; + TASSIGN(v155, v57); + Tile v156; + __ubuf__ float* v157 = v155.data(); + uint64_t v158 = reinterpret_cast(v157); + TASSIGN(v156, v158); + Tile v159; + TASSIGN(v159, v58); + Tile v160; + __ubuf__ float* v161 = v159.data(); + uint64_t v162 = reinterpret_cast(v161); + TASSIGN(v160, v162); + Tile v163; + TASSIGN(v163, v59); + pto::Shape<1, 1, 1, 1, 1> v164 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v165 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v166 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v6 + (v17 + v17 * (unsigned) v23 + v17 * (unsigned) v18), v164, v165); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TLOAD(v63, v166); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v167 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v168 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v169 = 
GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v6 + (v17 + v17 * (unsigned) v23 + v16 * (unsigned) v18), v167, v168); + TLOAD(v67, v169); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v170 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v171 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v172 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v6 + (v17 + v17 * (unsigned) v23 + v15 * (unsigned) v18), v170, v171); + TLOAD(v71, v172); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + pto::Shape<1, 1, 1, 1, 24> v173 = pto::Shape<1, 1, 1, 1, 24>(); + pto::Stride<24, 24, 24, 24, 1> v174 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v175 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v7 + (v17 + v17 * (unsigned) v22 + v17 * (unsigned) v18), v173, v174); + TLOAD(v61, v175); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v75, v63); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPAND(v79, v67); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TROWEXPAND(v82, v71); + TMULS(v144, v63, v24); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TMULS(v148, v67, v24); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TMULS(v152, v71, v24); + __ubuf__ float* v176 = v61.data(); + __ubuf__ float* v177 = v176 + (v17 + v17 * v14 + v17 * v16); + __ubuf__ float* v178 = (__ubuf__ float*) v177; + Tile v179; + uint64_t v180 = reinterpret_cast(v177); + TASSIGN(v179, v180); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TMULS(v156, v179, v24); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + __ubuf__ float* v181 = v61.data(); + __ubuf__ float* v182 = v181 + (v17 + v17 * v14 + v13 * v16); + __ubuf__ float* v183 = (__ubuf__ float*) v182; + Tile v184; + uint64_t v185 = reinterpret_cast(v182); + TASSIGN(v184, v185); + TMULS(v160, v184, v24); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + __ubuf__ 
float* v186 = v61.data(); + __ubuf__ float* v187 = v186 + (v17 + v17 * v14 + v12 * v16); + __ubuf__ float* v188 = (__ubuf__ float*) v187; + Tile v189; + uint64_t v190 = reinterpret_cast(v187); + TASSIGN(v189, v190); + TMULS(v163, v189, v24); + for (size_t v191 = (size_t) v19; v191 < ((size_t) v11); v191 += (size_t) v18) { + int32_t v192 = (int32_t) v191; + pto::Shape<1, 1, 1, 1, 24> v193 = pto::Shape<1, 1, 1, 1, 24>(); + pto::Stride<24, 24, 24, 24, 1> v194 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v195 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v4 + (v17 + (unsigned) v192 * (unsigned) v22 + v17 * (unsigned) v18), v193, v194); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v60, v195); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + pto::Shape<1, 1, 1, 1, 4> v196 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v197 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v198 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v1 + (v17 + (unsigned) v192 * (unsigned) v20 + v17 * (unsigned) v18), v196, v197); + TLOAD(v84, v198); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + pto::Shape<1, 1, 1, 1, 4> v199 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v200 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v201 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v2 + (v17 + (unsigned) v192 * (unsigned) v20 + v17 * (unsigned) v18), v199, v200); + TLOAD(v88, v201); + pto::Shape<1, 1, 1, 1, 16> v202 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<16, 16, 16, 16, 1> v203 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v204 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v3 + (v17 + (unsigned) v192 * (unsigned) v21 + v17 * (unsigned) v18), v202, v203); + TLOAD(v91, v204); + pto::Shape<1, 1, 1, 1, 4> v205 = 
pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v206 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v207 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v17 + (unsigned) v192 * (unsigned) v20 + v17 * (unsigned) v18), v205, v206); + TLOAD(v93, v207); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + __ubuf__ float* v208 = v60.data(); + __ubuf__ float* v209 = v208 + (v17 + v17 * v14 + v17 * v16); + __ubuf__ float* v210 = (__ubuf__ float*) v209; + Tile v211; + uint64_t v212 = reinterpret_cast(v209); + TASSIGN(v211, v212); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + pipe_barrier(PIPE_ALL); + TMUL(v97, v211, v75); + pipe_barrier(PIPE_ALL); + TADD(v97, v97, v179); + pipe_barrier(PIPE_ALL); + TMULS(v101, v97, v25); + pipe_barrier(PIPE_ALL); + TEXP(v105, v101); + pipe_barrier(PIPE_ALL); + TADDS(v109, v105, v26); + pipe_barrier(PIPE_ALL); + TRECIP(v113, v109); + pipe_barrier(PIPE_ALL); + TMULS(v117, v113, v25); + pipe_barrier(PIPE_ALL); + TADDS(v117, v117, v26); + pipe_barrier(PIPE_ALL); + TMUL(v121, v113, v117); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID5); + TMUL(v121, v121, v84); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMUL(v125, v121, v75); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 4> v213 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<24, 24, 24, 24, 1> v214 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v215 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v8 + (v17 + (unsigned) v192 * (unsigned) v22 + v17 * (unsigned) v18), v213, v214); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v215, v125); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + TADD(v156, v156, v121); + TMUL(v130, v121, v211); + pipe_barrier(PIPE_ALL); + TROWSUM(v140, v130, v135); + pipe_barrier(PIPE_ALL); + TADD(v144, v144, v140); + __ubuf__ float* v216 = 
v60.data(); + __ubuf__ float* v217 = v216 + (v17 + v17 * v14 + v13 * v16); + __ubuf__ float* v218 = (__ubuf__ float*) v217; + Tile v219; + uint64_t v220 = reinterpret_cast(v217); + TASSIGN(v219, v220); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID6); + TMULS(v117, v93, v27); + pipe_barrier(PIPE_ALL); + TMULS(v117, v117, v25); + pipe_barrier(PIPE_ALL); + TADDS(v117, v117, v26); + TMUL(v121, v88, v93); + pipe_barrier(PIPE_ALL); + TMUL(v121, v121, v117); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + TMUL(v125, v121, v79); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 4> v221 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<24, 24, 24, 24, 1> v222 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v223 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v8 + (v17 + (unsigned) v192 * (unsigned) v22 + v13 * (unsigned) v18), v221, v222); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v223, v125); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TADD(v160, v160, v121); + TMUL(v130, v121, v219); + pipe_barrier(PIPE_ALL); + TROWSUM(v140, v130, v135); + pipe_barrier(PIPE_ALL); + TADD(v148, v148, v140); + __ubuf__ float* v224 = v60.data(); + __ubuf__ float* v225 = v224 + (v17 + v17 * v14 + v12 * v16); + __ubuf__ float* v226 = (__ubuf__ float*) v225; + Tile v227; + uint64_t v228 = reinterpret_cast(v225); + TASSIGN(v227, v228); + TMUL(v128, v91, v82); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pto::Shape<1, 1, 1, 1, 16> v229 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<24, 24, 24, 24, 1> v230 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v231 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v8 + (v17 + (unsigned) v192 * (unsigned) v22 + v12 * (unsigned) v18), v229, v230); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pipe_barrier(PIPE_MTE3); + TSTORE(v231, v128); + TADD(v163, v163, v91); + TMUL(v133, v91, 
v227); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TROWSUM(v140, v133, v138); + pipe_barrier(PIPE_ALL); + TADD(v152, v152, v140); + } + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + pto::Shape<1, 1, 1, 1, 1> v232 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v233 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v234 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v9 + (v17 + v17 * (unsigned) v23 + v17 * (unsigned) v18), v232, v233); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + TSTORE(v234, v144); + pto::Shape<1, 1, 1, 1, 1> v235 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v236 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v237 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v9 + (v17 + v17 * (unsigned) v23 + v16 * (unsigned) v18), v235, v236); + pipe_barrier(PIPE_MTE3); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TSTORE(v237, v148); + pto::Shape<1, 1, 1, 1, 1> v238 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v239 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v240 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v9 + (v17 + v17 * (unsigned) v23 + v15 * (unsigned) v18), v238, v239); + pipe_barrier(PIPE_MTE3); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID5); + TSTORE(v240, v152); + pto::Shape<1, 1, 1, 1, 4> v241 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<24, 24, 24, 24, 1> v242 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v243 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v10 + (v17 + v17 * (unsigned) v22 + v17 * (unsigned) v18), v241, v242); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID6); + TSTORE(v243, v156); + pto::Shape<1, 1, 1, 1, 4> v244 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<24, 24, 24, 24, 1> v245 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, 
pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v246 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v10 + (v17 + v17 * (unsigned) v22 + v13 * (unsigned) v18), v244, v245); + pipe_barrier(PIPE_MTE3); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID7); + TSTORE(v246, v160); + pto::Shape<1, 1, 1, 1, 16> v247 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<24, 24, 24, 24, 1> v248 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v249 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v10 + (v17 + v17 * (unsigned) v22 + v12 * (unsigned) v18), v247, v248); + pipe_barrier(PIPE_MTE3); + TSTORE(v249, v163); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/pre_split_mixes_fwd.cpp b/kernels/manual/a5/mhc/pre_split_mixes_fwd.cpp new file mode 100644 index 000000000..d80be5d60 --- /dev/null +++ b/kernels/manual/a5/mhc/pre_split_mixes_fwd.cpp @@ -0,0 +1,277 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +/* Selects the synchronization that ptoas_auto_sync_tail() issues just before the kernel returns: a barrier on all pipes, or one MTE3->S set/wait pair on event 0. */ enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +/* Kernel-exit synchronization helper. kSetWaitMte3ToSEvent0 issues set_flag then wait_flag on (PIPE_MTE3, PIPE_S, EVENT_ID0); kBarrierAll and any unrecognized mode both end in pipe_barrier(PIPE_ALL). */ static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +/* Kernel entry. Per the pto::Shape/Stride of the GlobalTensors built below: reads 24 floats per row from v1, three single floats from v2 (offsets 0, 1, 2) and one 24-float vector from v3; writes 4 floats per row to v4, 4 floats per row to v5 and 16 floats per row to v6. v7 is the total row count that is divided across blocks. NOTE(review): the Tile/GlobalTensor template arguments are not visible in this text, so tile shapes are not documented here. */ __global__ AICORE void tilekernels_mhc_pre_split_mixes_fwd_m4(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, __gm__ float* v5, __gm__ float* v6, int32_t v7) { + unsigned v8 = 8; + unsigned v9 = 4; + unsigned v10 = 24; + unsigned v11 = 2; + unsigned v12 = 1; +
unsigned v13 = 0; + const int32_t v14 = 0; + const int32_t v15 = 1; + const int32_t v16 = 24; + const int32_t v17 = 4; + const int32_t v18 = 16; + const int32_t v19 = 3; + const float v20 = -1.0f; + const float v21 = 1.0f; + const float v22 = 0.00999999977f; + const float v23 = 2.0f; + /* v24..v38 are the addresses passed to TASSIGN for the tiles declared below (presumably offsets into the __ubuf__ buffer that data() points into -- TODO confirm). */ const int64_t v24 = 320; + const int64_t v25 = 96; + const int64_t v26 = 0; + const int64_t v27 = 32; + const int64_t v28 = 64; + const int64_t v29 = 192; + const int64_t v30 = 224; + const int64_t v31 = 256; + const int64_t v32 = 416; + const int64_t v33 = 448; + const int64_t v34 = 480; + const int64_t v35 = 512; + const int64_t v36 = 544; + const int64_t v37 = 576; + const int64_t v38 = 608; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + /* Row partition: v43 = ceil(v7 / block count), this block owns rows [v44, v45) with v44 = block index * v43; the loop condition further clamps the end to v7. */ int64_t v39 = get_block_idx(); + int64_t v40 = get_block_num(); + int32_t v41 = (int32_t) ((int64_t) v40); + int32_t v42 = v7 / v41; + int32_t v43 = v7 % v41 != v14 && v7 < v14 == v41 < v14 ? v42 + v15 : v42; + int32_t v44 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v39) * (uint32_t) v43); + int32_t v45 = (int32_t) ((uint32_t) v44 + (uint32_t) v43); + /* Tile setup. Where two tiles are declared back to back (e.g. v48/v49), the second is TASSIGNed to the first one's data() address, so both name the same storage. */ Tile v46; + TASSIGN(v46, v24); + Tile v47; + TASSIGN(v47, v25); + Tile v48; + TASSIGN(v48, v26); + Tile v49; + __ubuf__ float* v50 = v48.data(); + uint64_t v51 = reinterpret_cast(v50); + TASSIGN(v49, v51); + Tile v52; + TASSIGN(v52, v27); + Tile v53; + __ubuf__ float* v54 = v52.data(); + uint64_t v55 = reinterpret_cast(v54); + TASSIGN(v53, v55); + Tile v56; + TASSIGN(v56, v28); + Tile v57; + __ubuf__ float* v58 = v56.data(); + uint64_t v59 = reinterpret_cast(v58); + TASSIGN(v57, v59); + Tile v60; + TASSIGN(v60, v29); + Tile v61; + __ubuf__ float* v62 = v60.data(); + uint64_t v63 = reinterpret_cast(v62); + TASSIGN(v61, v63); + Tile v64; + TASSIGN(v64, v30); + Tile v65; + __ubuf__ float* v66 = v64.data(); + uint64_t v67 = reinterpret_cast(v66); + TASSIGN(v65, v67); + Tile v68; + TASSIGN(v68, v31); + Tile v69; + TASSIGN(v69, v32); + Tile v70; + __ubuf__
float* v71 = v69.data(); + uint64_t v72 = reinterpret_cast(v71); + TASSIGN(v70, v72); + Tile v73; + TASSIGN(v73, v33); + Tile v74; + __ubuf__ float* v75 = v73.data(); + uint64_t v76 = reinterpret_cast(v75); + TASSIGN(v74, v76); + Tile v77; + TASSIGN(v77, v34); + Tile v78; + __ubuf__ float* v79 = v77.data(); + uint64_t v80 = reinterpret_cast(v79); + TASSIGN(v78, v80); + Tile v81; + TASSIGN(v81, v35); + Tile v82; + __ubuf__ float* v83 = v81.data(); + uint64_t v84 = reinterpret_cast(v83); + TASSIGN(v82, v84); + Tile v85; + TASSIGN(v85, v36); + Tile v86; + __ubuf__ float* v87 = v85.data(); + uint64_t v88 = reinterpret_cast(v87); + TASSIGN(v86, v88); + Tile v89; + TASSIGN(v89, v37); + Tile v90; + __ubuf__ float* v91 = v89.data(); + uint64_t v92 = reinterpret_cast(v91); + TASSIGN(v90, v92); + Tile v93; + TASSIGN(v93, v38); + /* One-time setup before the row loop: the four set_flag calls below are matched by the four wait_flag calls after the loop; then v2[0], v2[1], v2[2] are loaded into v49, v53, v57, each load followed by its own MTE2->V event (IDs 0, 1, 2). */ pto::Shape<1, 1, 1, 1, 1> v94 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v95 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v96 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v2 + (v13 + v13 * (unsigned) v19 + v13 * (unsigned) v15), v94, v95); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TLOAD(v49, v96); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 1> v97 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v98 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v99 = GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND>(v2 + (v13 + v13 * (unsigned) v19 + v12 * (unsigned) v15), v97, v98); + TLOAD(v53, v99); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 1> v100 = pto::Shape<1, 1, 1, 1, 1>(); + pto::Stride<3, 3, 3, 3, 1> v101 = pto::Stride<3, 3, 3, 3, 1>(); + GlobalTensor, pto::Stride<3, 3, 3, 3, 1>, pto::Layout::ND> v102 = GlobalTensor, pto::Stride<3, 3, 3,
3, 1>, pto::Layout::ND>(v2 + (v13 + v13 * (unsigned) v19 + v11 * (unsigned) v15), v100, v101); + TLOAD(v57, v102); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + /* Load the 24-float vector at v3 into v47. NOTE(review): no MTE2->V event is set for this load; its first reader (TADD on v109 in the loop) is preceded by pipe_barrier(PIPE_ALL). */ pto::Shape<1, 1, 1, 1, 24> v103 = pto::Shape<1, 1, 1, 1, 24>(); + pto::Stride<24, 24, 24, 24, 1> v104 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v105 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v3 + (v13 + v13 * (unsigned) v16 + v13 * (unsigned) v15), v103, v104); + TLOAD(v47, v105); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWEXPAND(v61, v49); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPAND(v65, v53); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TROWEXPAND(v68, v57); + /* v109, v114 and v119 are tiles placed at float offsets 0, 4 and 8 from v47.data(). */ __ubuf__ float* v106 = v47.data(); + __ubuf__ float* v107 = v106 + (v13 + v13 * v10 + v13 * v12); + __ubuf__ float* v108 = (__ubuf__ float*) v107; + Tile v109; + uint64_t v110 = reinterpret_cast(v107); + TASSIGN(v109, v110); + __ubuf__ float* v111 = v47.data(); + __ubuf__ float* v112 = v111 + (v13 + v13 * v10 + v9 * v12); + __ubuf__ float* v113 = (__ubuf__ float*) v112; + Tile v114; + uint64_t v115 = reinterpret_cast(v112); + TASSIGN(v114, v115); + __ubuf__ float* v116 = v47.data(); + __ubuf__ float* v117 = v116 + (v13 + v13 * v10 + v8 * v12); + __ubuf__ float* v118 = (__ubuf__ float*) v117; + Tile v119; + uint64_t v120 = reinterpret_cast(v117); + TASSIGN(v119, v120); + /* Row loop over this block's share; the upper bound is min(v45, v7) compared as unsigned. Each iteration loads one 24-float row of v1 into v46 and produces three outputs from the sub-tiles at float offsets 0, 4 and 8. */ for (size_t v121 = (size_t) v44; v121 < ((size_t) ((uint32_t) v45 < (uint32_t) v7 ?
v45 : v7)); v121 += (size_t) v15) { + int32_t v122 = (int32_t) v121; + pto::Shape<1, 1, 1, 1, 24> v123 = pto::Shape<1, 1, 1, 1, 24>(); + pto::Stride<24, 24, 24, 24, 1> v124 = pto::Stride<24, 24, 24, 24, 1>(); + GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND> v125 = GlobalTensor, pto::Stride<24, 24, 24, 24, 1>, pto::Layout::ND>(v1 + (v13 + (unsigned) v122 * (unsigned) v16 + v13 * (unsigned) v15), v123, v124); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v46, v125); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + __ubuf__ float* v126 = v46.data(); + __ubuf__ float* v127 = v126 + (v13 + v13 * v10 + v13 * v12); + __ubuf__ float* v128 = (__ubuf__ float*) v127; + Tile v129; + uint64_t v130 = reinterpret_cast(v127); + TASSIGN(v129, v130); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + pipe_barrier(PIPE_ALL); + /* Output 1 (row offset 0): v90 = 1 / (1 + exp(-(v129 * v61 + v109))) + v22, stored as 4 floats at v4 + row * 4 -- assuming the T* ops are elementwise as their names suggest. */ TMUL(v70, v129, v61); + pipe_barrier(PIPE_ALL); + TADD(v70, v70, v109); + pipe_barrier(PIPE_ALL); + TMULS(v74, v70, v20); + pipe_barrier(PIPE_ALL); + TEXP(v78, v74); + pipe_barrier(PIPE_ALL); + TADDS(v82, v78, v21); + pipe_barrier(PIPE_ALL); + TRECIP(v86, v82); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TADDS(v90, v86, v22); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 4> v131 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v132 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v133 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v4 + (v13 + (unsigned) v122 * (unsigned) v17 + v13 * (unsigned) v15), v131, v132); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v133, v90); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + __ubuf__ float* v134 = v46.data(); + __ubuf__ float* v135 = v134 + (v13 + v13 * v10 + v9 * v12); + __ubuf__ float* v136 = (__ubuf__ float*) v135; + Tile v137; + uint64_t v138 = reinterpret_cast(v135); + TASSIGN(v137, v138); + /* Output 2 (row offset 4): same multiply/add/negate/exp/add-one/reciprocal chain using v65 and v114, then scaled by v23; v90 is reused as the store source, so the MTE3->V wait on EVENT_ID2 (set after the previous TSTORE) comes before it is overwritten. Stored as 4 floats at v5 + row * 4. */ TMUL(v70, v137, v65); + pipe_barrier(PIPE_ALL); + TADD(v70, v70, v114); + pipe_barrier(PIPE_ALL);
+ TMULS(v74, v70, v20); + pipe_barrier(PIPE_ALL); + TEXP(v78, v74); + pipe_barrier(PIPE_ALL); + TADDS(v82, v78, v21); + pipe_barrier(PIPE_ALL); + TRECIP(v86, v82); + pipe_barrier(PIPE_ALL); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + TMULS(v90, v86, v23); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 4> v139 = pto::Shape<1, 1, 1, 1, 4>(); + pto::Stride<4, 4, 4, 4, 1> v140 = pto::Stride<4, 4, 4, 4, 1>(); + GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND> v141 = GlobalTensor, pto::Stride<4, 4, 4, 4, 1>, pto::Layout::ND>(v5 + (v13 + (unsigned) v122 * (unsigned) v17 + v13 * (unsigned) v15), v139, v140); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v141, v90); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + __ubuf__ float* v142 = v46.data(); + __ubuf__ float* v143 = v142 + (v13 + v13 * v10 + v8 * v12); + __ubuf__ float* v144 = (__ubuf__ float*) v143; + Tile v145; + uint64_t v146 = reinterpret_cast(v143); + TASSIGN(v145, v146); + /* Output 3 (row offset 8): v93 = v145 * v68 + v119, stored as 16 floats at v6 + row * 16. The V->MTE2 EVENT_ID0 set right after the TMUL follows the last read of the v46 row in this iteration and pairs with the wait before the next TLOAD into v46. */ TMUL(v93, v145, v68); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TADD(v93, v93, v119); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pto::Shape<1, 1, 1, 1, 16> v147 = pto::Shape<1, 1, 1, 1, 16>(); + pto::Stride<16, 16, 16, 16, 1> v148 = pto::Stride<16, 16, 16, 16, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND> v149 = GlobalTensor, pto::Stride<16, 16, 16, 16, 1>, pto::Layout::ND>(v6 + (v13 + (unsigned) v122 * (unsigned) v18 + v13 * (unsigned) v15), v147, v148); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v149, v93); + } + /* Consume the four events that were set before the loop (V->MTE2 IDs 0/1, MTE3->V IDs 0/1) so every set_flag has a matching wait_flag. */ wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/run.sh b/kernels/manual/a5/mhc/run.sh new file mode 100644 index 000000000..43edade09 --- /dev/null +++ b/kernels/manual/a5/mhc/run.sh @@
-0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail +cd "$(dirname "$0")" + +BUILD_DIR=build +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +cmake .. -DRUN_MODE=npu +make -j$(nproc) + +echo "" +echo "=== Running expand_to_mhc_fwd verification ===" +./mhc_test diff --git a/kernels/manual/a5/mhc/sinkhorn_normalize_bwd.cpp b/kernels/manual/a5/mhc/sinkhorn_normalize_bwd.cpp new file mode 100644 index 000000000..5d8fff1bb --- /dev/null +++ b/kernels/manual/a5/mhc/sinkhorn_normalize_bwd.cpp @@ -0,0 +1,1100 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +/* Selects the synchronization that ptoas_auto_sync_tail() issues: a barrier on all pipes, or one MTE3->S set/wait pair on event 0. */ enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +/* Synchronization helper taking a PTOAutoSyncTailMode (default kBarrierAll). kSetWaitMte3ToSEvent0 issues set_flag then wait_flag on (PIPE_MTE3, PIPE_S, EVENT_ID0); kBarrierAll and any unrecognized mode both end in pipe_barrier(PIPE_ALL). */ static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +__global__ AICORE void tilekernels_mhc_sinkhorn_bwd_m4_r10(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + const bool v5 = true; + unsigned v6 = 0; + const int32_t v7 = 0; + const int32_t v8 = 1; + const int32_t v9 = 4; + const float v10 = 9.99999997E-7f; + const int64_t v11 = 0; + const int64_t v12 = 1600; + const int64_t v13 = 1856; + const int64_t v14 = 256; + const int64_t v15 = 512; + const int64_t v16 = 768; + const int64_t v17 = 1024; + const int64_t v18 = 1056; + const int64_t v19 = 1088; + const int64_t v20 = 1344; + const int64_t v21 = 2112; + const int64_t v22 = 2368; + const int64_t v23 = 4160; + const int64_t v24 = 3136; + const int64_t v25 = 4192; + const int64_t v26 = 2624; + const int64_t v27 = 2880; + const int64_t v28 = 3392; + const int64_t v29 = 4448; + const int64_t v30 = 3648; + const int64_t v31 = 3904; + const int64_t v32 = 4704; + const int64_t v33 = 4736; + const int64_t v34 = 4992; + const int64_t v35 = 5248;
+ const int64_t v36 = 5504; + const int64_t v37 = 5536; + const int64_t v38 = 5792; + const int64_t v39 = 6048; + const int64_t v40 = 6304; + const int64_t v41 = 6336; + const int64_t v42 = 6592; + const int64_t v43 = 6848; + const int64_t v44 = 7104; + const int64_t v45 = 7136; + const int64_t v46 = 7392; + const int64_t v47 = 7648; + const int64_t v48 = 7904; + const int64_t v49 = 7936; + const int64_t v50 = 8192; + const int64_t v51 = 8448; + const int64_t v52 = 8704; + const int64_t v53 = 8736; + const int64_t v54 = 8992; + const int64_t v55 = 9248; + const int64_t v56 = 9504; + const int64_t v57 = 9536; + const int64_t v58 = 9792; + const int64_t v59 = 10048; + const int64_t v60 = 10304; + const int64_t v61 = 10336; + const int64_t v62 = 10592; + const int64_t v63 = 10848; + const int64_t v64 = 11104; + const int64_t v65 = 11136; + const int64_t v66 = 11392; + const int64_t v67 = 11648; + const int64_t v68 = 11904; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v69 = get_block_idx(); + int64_t v70 = get_block_num(); + int32_t v71 = (int32_t) ((int64_t) v70); + int32_t v72 = v4 / v71; + int32_t v73 = v4 % v71 != v7 && v4 < v7 == v71 < v7 ? 
v72 + v8 : v72; + int32_t v74 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v69) * (uint32_t) v73); + int32_t v75 = (int32_t) ((uint32_t) v74 + (uint32_t) v73); + int32_t v76 = (int32_t) ((uint32_t) v4 * (uint32_t) v9); + Tile v77; + TASSIGN(v77, v11); + Tile v78; + __ubuf__ float* v79 = v77.data(); + uint64_t v80 = reinterpret_cast(v79); + TASSIGN(v78, v80); + Tile v81; + TASSIGN(v81, v12); + Tile v82; + __ubuf__ float* v83 = v81.data(); + uint64_t v84 = reinterpret_cast(v83); + TASSIGN(v82, v84); + Tile v85; + TASSIGN(v85, v13); + Tile v86; + __ubuf__ float* v87 = v85.data(); + uint64_t v88 = reinterpret_cast(v87); + TASSIGN(v86, v88); + Tile v89; + TASSIGN(v89, v14); + Tile v90; + __ubuf__ float* v91 = v89.data(); + uint64_t v92 = reinterpret_cast(v91); + TASSIGN(v90, v92); + Tile v93; + TASSIGN(v93, v15); + Tile v94; + __ubuf__ float* v95 = v93.data(); + uint64_t v96 = reinterpret_cast(v95); + TASSIGN(v94, v96); + Tile v97; + TASSIGN(v97, v16); + Tile v98; + __ubuf__ float* v99 = v97.data(); + uint64_t v100 = reinterpret_cast(v99); + TASSIGN(v98, v100); + Tile v101; + TASSIGN(v101, v17); + Tile v102; + __ubuf__ float* v103 = v101.data(); + uint64_t v104 = reinterpret_cast(v103); + TASSIGN(v102, v104); + Tile v105; + TASSIGN(v105, v18); + Tile v106; + __ubuf__ float* v107 = v105.data(); + uint64_t v108 = reinterpret_cast(v107); + TASSIGN(v106, v108); + Tile v109; + TASSIGN(v109, v19); + Tile v110; + __ubuf__ float* v111 = v109.data(); + uint64_t v112 = reinterpret_cast(v111); + TASSIGN(v110, v112); + Tile v113; + TASSIGN(v113, v20); + Tile v114; + __ubuf__ float* v115 = v113.data(); + uint64_t v116 = reinterpret_cast(v115); + TASSIGN(v114, v116); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + for (size_t v117 = (size_t) v74; v117 < ((size_t) ((uint32_t) v75 < (uint32_t) v4 ? 
v75 : v4)); v117 += (size_t) v8) { + int32_t v118 = (int32_t) ((uint32_t) ((int32_t) v117) * (uint32_t) v9); + pto::Shape<1, 1, 1, 4, 4> v119 = pto::Shape<1, 1, 1, 4, 4>(); + pto::Stride<16, 16, 16, 4, 1> v120 = pto::Stride<16, 16, 16, 4, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND> v121 = GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND>(v2 + (v6 + (unsigned) v118 * (unsigned) v9 + v6 * (unsigned) v8), v119, v120); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v78, v121); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 4, 4> v122 = pto::Shape<1, 1, 1, 4, 4>(); + pto::Stride<16, 16, 16, 4, 1> v123 = pto::Stride<16, 16, 16, 4, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND> v124 = GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND>(v1 + (v6 + (unsigned) v118 * (unsigned) v9 + v6 * (unsigned) v8), v122, v123); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v82, v124); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile v125; + TASSIGN(v125, v21); + Tile v126; + __ubuf__ float* v127 = v125.data(); + uint64_t v128 = reinterpret_cast(v127); + TASSIGN(v126, v128); + Tile v129; + TASSIGN(v129, v22); + Tile v130; + __ubuf__ float* v131 = v129.data(); + uint64_t v132 = reinterpret_cast(v131); + TASSIGN(v130, v132); + Tile v133; + TASSIGN(v133, v23); + Tile v134; + __ubuf__ float* v135 = v133.data(); + uint64_t v136 = reinterpret_cast(v135); + TASSIGN(v134, v136); + Tile v137; + TASSIGN(v137, v24); + Tile v138; + __ubuf__ float* v139 = v137.data(); + uint64_t v140 = reinterpret_cast(v139); + TASSIGN(v138, v140); + Tile v141; + TASSIGN(v141, v25); + Tile v142; + __ubuf__ float* v143 = v141.data(); + uint64_t v144 = reinterpret_cast(v143); + TASSIGN(v142, v144); + Tile v145; + TASSIGN(v145, v26); + Tile v146; + __ubuf__ float* v147 = v145.data(); + uint64_t v148 = reinterpret_cast(v147); + TASSIGN(v146, v148); + Tile v149; + TASSIGN(v149, v27); + Tile v150; + __ubuf__ float* v151 = 
v149.data(); + uint64_t v152 = reinterpret_cast(v151); + TASSIGN(v150, v152); + Tile v153; + TASSIGN(v153, v28); + Tile v154; + __ubuf__ float* v155 = v153.data(); + uint64_t v156 = reinterpret_cast(v155); + TASSIGN(v154, v156); + Tile v157; + TASSIGN(v157, v29); + Tile v158; + __ubuf__ float* v159 = v157.data(); + uint64_t v160 = reinterpret_cast(v159); + TASSIGN(v158, v160); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWMAX(v130, v78, v126); + pipe_barrier(PIPE_ALL); + TROWEXPANDSUB(v146, v78, v130); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TEXP(v150, v146); + pipe_barrier(PIPE_ALL); + TROWSUM(v130, v150, v126); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v154, v150, v138); + Tile v161; + TASSIGN(v161, v30); + Tile v162; + __ubuf__ float* v163 = v161.data(); + uint64_t v164 = reinterpret_cast(v163); + TASSIGN(v162, v164); + pipe_barrier(PIPE_ALL); + TMOV(v162, v154); + pipe_barrier(PIPE_ALL); + TADDS(v154, v154, v10); + Tile v165; + TASSIGN(v165, v31); + Tile v166; + __ubuf__ float* v167 = v165.data(); + uint64_t v168 = reinterpret_cast(v167); + TASSIGN(v166, v168); + pipe_barrier(PIPE_ALL); + TMOV(v166, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v169; + TASSIGN(v169, v32); + Tile v170; + __ubuf__ float* v171 = v169.data(); + uint64_t v172 = reinterpret_cast(v171); + TASSIGN(v170, v172); + TMOV(v170, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v173; + TASSIGN(v173, v33); + Tile v174; + __ubuf__ float* v175 = v173.data(); + uint64_t v176 = reinterpret_cast(v175); + TASSIGN(v174, v176); + pipe_barrier(PIPE_ALL); + TMOV(v174, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile 
v177; + TASSIGN(v177, v34); + Tile v178; + __ubuf__ float* v179 = v177.data(); + uint64_t v180 = reinterpret_cast(v179); + TASSIGN(v178, v180); + TMOV(v178, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v181; + TASSIGN(v181, v35); + Tile v182; + __ubuf__ float* v183 = v181.data(); + uint64_t v184 = reinterpret_cast(v183); + TASSIGN(v182, v184); + pipe_barrier(PIPE_ALL); + TMOV(v182, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v185; + TASSIGN(v185, v36); + Tile v186; + __ubuf__ float* v187 = v185.data(); + uint64_t v188 = reinterpret_cast(v187); + TASSIGN(v186, v188); + TMOV(v186, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v189; + TASSIGN(v189, v37); + Tile v190; + __ubuf__ float* v191 = v189.data(); + uint64_t v192 = reinterpret_cast(v191); + TASSIGN(v190, v192); + pipe_barrier(PIPE_ALL); + TMOV(v190, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v193; + TASSIGN(v193, v38); + Tile v194; + __ubuf__ float* v195 = v193.data(); + uint64_t v196 = reinterpret_cast(v195); + TASSIGN(v194, v196); + TMOV(v194, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v197; + TASSIGN(v197, v39); + Tile v198; + __ubuf__ float* v199 = v197.data(); + uint64_t v200 = reinterpret_cast(v199); + TASSIGN(v198, v200); + pipe_barrier(PIPE_ALL); + TMOV(v198, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v201; + TASSIGN(v201, v40); + Tile v202; + __ubuf__ float* v203 = v201.data(); + uint64_t v204 = reinterpret_cast(v203); + TASSIGN(v202, v204); + TMOV(v202, v134); + pipe_barrier(PIPE_ALL); + 
TMOV(v154, v158); + Tile v205; + TASSIGN(v205, v41); + Tile v206; + __ubuf__ float* v207 = v205.data(); + uint64_t v208 = reinterpret_cast(v207); + TASSIGN(v206, v208); + pipe_barrier(PIPE_ALL); + TMOV(v206, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v209; + TASSIGN(v209, v42); + Tile v210; + __ubuf__ float* v211 = v209.data(); + uint64_t v212 = reinterpret_cast(v211); + TASSIGN(v210, v212); + TMOV(v210, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v213; + TASSIGN(v213, v43); + Tile v214; + __ubuf__ float* v215 = v213.data(); + uint64_t v216 = reinterpret_cast(v215); + TASSIGN(v214, v216); + pipe_barrier(PIPE_ALL); + TMOV(v214, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v217; + TASSIGN(v217, v44); + Tile v218; + __ubuf__ float* v219 = v217.data(); + uint64_t v220 = reinterpret_cast(v219); + TASSIGN(v218, v220); + TMOV(v218, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v221; + TASSIGN(v221, v45); + Tile v222; + __ubuf__ float* v223 = v221.data(); + uint64_t v224 = reinterpret_cast(v223); + TASSIGN(v222, v224); + pipe_barrier(PIPE_ALL); + TMOV(v222, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v225; + TASSIGN(v225, v46); + Tile v226; + __ubuf__ float* v227 = v225.data(); + uint64_t v228 = reinterpret_cast(v227); + TASSIGN(v226, v228); + TMOV(v226, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v229; + TASSIGN(v229, v47); + Tile v230; + __ubuf__ float* v231 = v229.data(); + uint64_t v232 = reinterpret_cast(v231); + TASSIGN(v230, v232); + pipe_barrier(PIPE_ALL); + 
TMOV(v230, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v233; + TASSIGN(v233, v48); + Tile v234; + __ubuf__ float* v235 = v233.data(); + uint64_t v236 = reinterpret_cast(v235); + TASSIGN(v234, v236); + TMOV(v234, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v237; + TASSIGN(v237, v49); + Tile v238; + __ubuf__ float* v239 = v237.data(); + uint64_t v240 = reinterpret_cast(v239); + TASSIGN(v238, v240); + pipe_barrier(PIPE_ALL); + TMOV(v238, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v241; + TASSIGN(v241, v50); + Tile v242; + __ubuf__ float* v243 = v241.data(); + uint64_t v244 = reinterpret_cast(v243); + TASSIGN(v242, v244); + TMOV(v242, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v245; + TASSIGN(v245, v51); + Tile v246; + __ubuf__ float* v247 = v245.data(); + uint64_t v248 = reinterpret_cast(v247); + TASSIGN(v246, v248); + pipe_barrier(PIPE_ALL); + TMOV(v246, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v249; + TASSIGN(v249, v52); + Tile v250; + __ubuf__ float* v251 = v249.data(); + uint64_t v252 = reinterpret_cast(v251); + TASSIGN(v250, v252); + TMOV(v250, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v253; + TASSIGN(v253, v53); + Tile v254; + __ubuf__ float* v255 = v253.data(); + uint64_t v256 = reinterpret_cast(v255); + TASSIGN(v254, v256); + pipe_barrier(PIPE_ALL); + TMOV(v254, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, 
v154, v138); + Tile v257; + TASSIGN(v257, v54); + Tile v258; + __ubuf__ float* v259 = v257.data(); + uint64_t v260 = reinterpret_cast(v259); + TASSIGN(v258, v260); + TMOV(v258, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v261; + TASSIGN(v261, v55); + Tile v262; + __ubuf__ float* v263 = v261.data(); + uint64_t v264 = reinterpret_cast(v263); + TASSIGN(v262, v264); + pipe_barrier(PIPE_ALL); + TMOV(v262, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v265; + TASSIGN(v265, v56); + Tile v266; + __ubuf__ float* v267 = v265.data(); + uint64_t v268 = reinterpret_cast(v267); + TASSIGN(v266, v268); + TMOV(v266, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v269; + TASSIGN(v269, v57); + Tile v270; + __ubuf__ float* v271 = v269.data(); + uint64_t v272 = reinterpret_cast(v271); + TASSIGN(v270, v272); + pipe_barrier(PIPE_ALL); + TMOV(v270, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v273; + TASSIGN(v273, v58); + Tile v274; + __ubuf__ float* v275 = v273.data(); + uint64_t v276 = reinterpret_cast(v275); + TASSIGN(v274, v276); + TMOV(v274, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v277; + TASSIGN(v277, v59); + Tile v278; + __ubuf__ float* v279 = v277.data(); + uint64_t v280 = reinterpret_cast(v279); + TASSIGN(v278, v280); + pipe_barrier(PIPE_ALL); + TMOV(v278, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v281; + TASSIGN(v281, v60); + Tile v282; + __ubuf__ float* v283 = v281.data(); + uint64_t v284 = reinterpret_cast(v283); + TASSIGN(v282, v284); + TMOV(v282, v134); + 
pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v285; + TASSIGN(v285, v61); + Tile v286; + __ubuf__ float* v287 = v285.data(); + uint64_t v288 = reinterpret_cast(v287); + TASSIGN(v286, v288); + pipe_barrier(PIPE_ALL); + TMOV(v286, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v289; + TASSIGN(v289, v62); + Tile v290; + __ubuf__ float* v291 = v289.data(); + uint64_t v292 = reinterpret_cast(v291); + TASSIGN(v290, v292); + TMOV(v290, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v293; + TASSIGN(v293, v63); + Tile v294; + __ubuf__ float* v295 = v293.data(); + uint64_t v296 = reinterpret_cast(v295); + TASSIGN(v294, v296); + pipe_barrier(PIPE_ALL); + TMOV(v294, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v297; + TASSIGN(v297, v64); + Tile v298; + __ubuf__ float* v299 = v297.data(); + uint64_t v300 = reinterpret_cast(v299); + TASSIGN(v298, v300); + TMOV(v298, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v301; + TASSIGN(v301, v65); + Tile v302; + __ubuf__ float* v303 = v301.data(); + uint64_t v304 = reinterpret_cast(v303); + TASSIGN(v302, v304); + pipe_barrier(PIPE_ALL); + TMOV(v302, v154); + TROWSUM(v130, v154, v126); + pipe_barrier(PIPE_ALL); + TADDS(v130, v130, v10); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v138, v130); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v138); + Tile v305; + TASSIGN(v305, v66); + Tile v306; + __ubuf__ float* v307 = v305.data(); + uint64_t v308 = reinterpret_cast(v307); + TASSIGN(v306, v308); + TMOV(v306, v130); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + Tile v309; + TASSIGN(v309, v67); + Tile v310; + __ubuf__ float* v311 = v309.data(); + uint64_t v312 = reinterpret_cast(v311); + TASSIGN(v310, v312); + 
pipe_barrier(PIPE_ALL); + TMOV(v310, v154); + TCOLSUM(v134, v154, v126, v5); + pipe_barrier(PIPE_ALL); + TADDS(v134, v134, v10); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v142, v134); + pipe_barrier(PIPE_ALL); + TDIV(v158, v154, v142); + Tile v313; + TASSIGN(v313, v68); + Tile v314; + __ubuf__ float* v315 = v313.data(); + uint64_t v316 = reinterpret_cast(v315); + TASSIGN(v314, v316); + TMOV(v314, v134); + pipe_barrier(PIPE_ALL); + TMOV(v154, v158); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v86, v82, v310); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v314); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v302); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v306); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v294); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v298); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v286); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); 
+ TMOV(v98, v290); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v278); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v282); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v270); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v274); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v262); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v266); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v254); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v258); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + 
TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v246); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v250); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v238); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v242); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v230); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v234); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v222); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v226); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + 
TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v214); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v218); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v206); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v210); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v198); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v202); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v190); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v194); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + 
TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v182); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v186); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v174); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + TMOV(v98, v178); + pipe_barrier(PIPE_ALL); + TADDS(v98, v98, v10); + pipe_barrier(PIPE_ALL); + TDIV(v94, v94, v98); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v98); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v166); + pipe_barrier(PIPE_ALL); + TCOLSUM(v102, v86, v90, v5); + TMOV(v106, v170); + pipe_barrier(PIPE_ALL); + TADDS(v106, v106, v10); + pipe_barrier(PIPE_ALL); + TDIV(v102, v102, v106); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v102); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v110, v106); + pipe_barrier(PIPE_ALL); + TDIV(v114, v86, v110); + pipe_barrier(PIPE_ALL); + TMOV(v82, v114); + pipe_barrier(PIPE_ALL); + TMUL(v86, v82, v162); + pipe_barrier(PIPE_ALL); + TROWSUM(v94, v86, v90); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v110, v94); + pipe_barrier(PIPE_ALL); + TSUB(v86, v82, v110); + pipe_barrier(PIPE_ALL); + TMUL(v82, v86, v162); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 4, 4> v317 = pto::Shape<1, 1, 1, 4, 4>(); + pto::Stride<16, 16, 16, 4, 1> v318 = pto::Stride<16, 16, 16, 4, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND> v319 = 
GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND>(v3 + (v6 + (unsigned) v118 * (unsigned) v9 + v6 * (unsigned) v8), v317, v318); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v319, v82); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} diff --git a/kernels/manual/a5/mhc/sinkhorn_normalize_fwd.cpp b/kernels/manual/a5/mhc/sinkhorn_normalize_fwd.cpp new file mode 100644 index 000000000..46b20c943 --- /dev/null +++ b/kernels/manual/a5/mhc/sinkhorn_normalize_fwd.cpp @@ -0,0 +1,348 @@ +#include "pto/pto-inst.hpp" +using namespace pto; + +/* Selects which synchronization ptoas_auto_sync_tail() issues at the end of a kernel. */ enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +/* Kernel-tail synchronization helper. kSetWaitMte3ToSEvent0 issues a set_flag/wait_flag pair on (PIPE_MTE3, PIPE_S, EVENT_ID0); kBarrierAll and any unrecognized mode issue pipe_barrier(PIPE_ALL). */ static AICORE inline void ptoas_auto_sync_tail( + PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +/* Sinkhorn forward kernel over 4x4 float tiles (fully unrolled, machine-numbered locals). Each block takes a contiguous range of the v3 tiles; every tile is loaded from v1, normalized in place on-chip, and stored to v2 at the same element offset. v1: source buffer read with TLOAD. v2: destination buffer written with TSTORE. v3: number of 4x4 tiles, divided across get_block_num() blocks. The body is compiled only when __DAV_VEC__ is defined; the closing ptoas_auto_sync_tail() call is unconditional. */ __global__ AICORE void tilekernels_mhc_sinkhorn_fwd_m4_r10(__gm__ float* v1, __gm__ float* v2, int32_t v3) { + /* v4 is the last argument of every TCOLSUM call; v5 is a zero term in the global offset expressions */ const bool v4 = true; + unsigned v5 = 0; + const int32_t v6 = 0; + const int32_t v7 = 1; + const int32_t v8 = 4; + /* v9 (about 1e-6) is added with TADDS after the softmax and to every row/column sum before it is used as a divisor */ const float v9 = 9.99999997E-7f; + /* v10..v19 are the values passed to TASSIGN to place the working tiles (presumably byte offsets in the __ubuf__ space - confirm) */ const int64_t v10 = 0; + const int64_t v11 = 512; + const int64_t v12 = 256; + const int64_t v13 = 1792; + const int64_t v14 = 1280; + const int64_t v15 = 1824; + const int64_t v16 = 768; + const int64_t v17 = 1024; + const int64_t v18 = 1536; + const int64_t v19 = 2080; + using T = float; + + #if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v20 = get_block_idx(); + int64_t v21 
= get_block_num(); + int32_t v22 = (int32_t) ((int64_t) v21); + /* v23, v24: tiles per block = v3 / v22, bumped by one when there is a remainder and v3 and v22 have the same sign */ int32_t v23 = v3 / v22; + int32_t v24 = v3 % v22 != v6 && v3 < v6 == v22 < v6 ? v23 + v7 : v23; + /* v25, v26: start and unclamped end of this block's tile range */ int32_t v25 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v20) * (uint32_t) v24); + int32_t v26 = (int32_t) ((uint32_t) v25 + (uint32_t) v24); + /* v27 is not referenced again in this kernel */ int32_t v27 = (int32_t) ((uint32_t) v3 * (uint32_t) v8); + /* v28 is placed via v10 and v29 is bound to v28's data() address; v29 is the TLOAD destination */ Tile v28; + TASSIGN(v28, v10); + Tile v29; + __ubuf__ float* v30 = v28.data(); + uint64_t v31 = reinterpret_cast(v30); + TASSIGN(v29, v31); + /* Every event set here is waited on later: the EVENT_ID0 ones inside the loop, the EVENT_ID1 ones only after it */ set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + /* Iterate over this block's tiles; the end bound is min(v26, v3) under an unsigned comparison */ for (size_t v32 = (size_t) v25; v32 < ((size_t) ((uint32_t) v26 < (uint32_t) v3 ? v26 : v3)); v32 += (size_t) v7) { + /* v33 * v8 is the element offset of tile v32 (16 floats per tile); the same offset is applied to v1 and v2 */ int32_t v33 = (int32_t) ((uint32_t) ((int32_t) v32) * (uint32_t) v8); + pto::Shape<1, 1, 1, 4, 4> v34 = pto::Shape<1, 1, 1, 4, 4>(); + pto::Stride<16, 16, 16, 4, 1> v35 = pto::Stride<16, 16, 16, 4, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND> v36 = GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND>(v1 + (v5 + (unsigned) v33 * (unsigned) v8 + v5 * (unsigned) v7), v34, v35); + /* The set_flag matching this wait is issued right after the last vector op that reads v29 */ wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v29, v36); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + /* Working tiles: in each pair the first tile is placed via an address constant and the second is bound to the first one's data() address */ Tile v37; + TASSIGN(v37, v11); + Tile v38; + __ubuf__ float* v39 = v37.data(); + uint64_t v40 = reinterpret_cast(v39); + TASSIGN(v38, v40); + Tile v41; + TASSIGN(v41, v12); + Tile v42; + __ubuf__ float* v43 = v41.data(); + uint64_t v44 = reinterpret_cast(v43); + TASSIGN(v42, v44); + Tile v45; + TASSIGN(v45, v13); + Tile v46; + __ubuf__ float* v47 = v45.data(); + uint64_t v48 = reinterpret_cast(v47); + TASSIGN(v46, v48); + Tile v49; + TASSIGN(v49, v14); + Tile v50; + __ubuf__ float* v51 = v49.data(); + uint64_t v52 = reinterpret_cast(v51); + TASSIGN(v50, v52); + Tile v53; + TASSIGN(v53, v15); + Tile v54; + __ubuf__ float* v55 = v53.data(); + uint64_t v56 = reinterpret_cast(v55); + 
TASSIGN(v54, v56); + Tile v57; + TASSIGN(v57, v16); + Tile v58; + __ubuf__ float* v59 = v57.data(); + uint64_t v60 = reinterpret_cast(v59); + TASSIGN(v58, v60); + Tile v61; + TASSIGN(v61, v17); + Tile v62; + __ubuf__ float* v63 = v61.data(); + uint64_t v64 = reinterpret_cast(v63); + TASSIGN(v62, v64); + Tile v65; + TASSIGN(v65, v18); + Tile v66; + __ubuf__ float* v67 = v65.data(); + uint64_t v68 = reinterpret_cast(v67); + TASSIGN(v66, v68); + Tile v69; + TASSIGN(v69, v19); + Tile v70; + __ubuf__ float* v71 = v69.data(); + uint64_t v72 = reinterpret_cast(v71); + TASSIGN(v70, v72); + /* Row softmax of v29 into v66, reading the op names literally: subtract the row max, exponentiate, divide by the row sum */ wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TROWMAX(v42, v29, v38); + pipe_barrier(PIPE_ALL); + TROWEXPANDSUB(v58, v29, v42); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_ALL); + TEXP(v62, v58); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v62, v38); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + /* v66 is also the TSTORE source; the set_flag matching this wait is issued right after that store */ wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TDIV(v66, v62, v50); + pipe_barrier(PIPE_ALL); + TADDS(v66, v66, v9); + pipe_barrier(PIPE_ALL); + /* Column step: v66 is divided by (column sums + v9), going through v70. The unrolled sequence from here contains 10 column steps with 9 row steps between them */ TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + /* Row step: v66 is divided by (row sums + v9), going through v70 */ TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + 
pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + 
TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TROWSUM(v42, v66, v38); + pipe_barrier(PIPE_ALL); + TADDS(v42, v42, v9); + pipe_barrier(PIPE_ALL); + TROWEXPAND(v50, v42); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v50); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + pipe_barrier(PIPE_ALL); + TCOLSUM(v46, v66, v38, v4); + pipe_barrier(PIPE_ALL); + TADDS(v46, v46, v9); + pipe_barrier(PIPE_ALL); + TCOLEXPAND(v54, v46); + pipe_barrier(PIPE_ALL); + TDIV(v70, v66, v54); + pipe_barrier(PIPE_ALL); + TMOV(v66, v70); + /* v66 now holds the finished tile: signal the store pipe, then write it to v2 at the offset it was loaded from */ set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 4, 4> v73 = pto::Shape<1, 1, 1, 4, 4>(); + pto::Stride<16, 16, 16, 4, 1> v74 = pto::Stride<16, 16, 16, 4, 1>(); + GlobalTensor, pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND> v75 = GlobalTensor, 
pto::Stride<16, 16, 16, 4, 1>, pto::Layout::ND>(v2 + (v5 + (unsigned) v33 * (unsigned) v8 + v5 * (unsigned) v7), v73, v74); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v75, v66); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + /* Consume the events that are still set once the loop has finished, one wait per outstanding set */ wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + #endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +}