diff --git a/include/pto/cpu/TPush.hpp b/include/pto/cpu/TPush.hpp index f0ae9d77d..1a6a5167d 100644 --- a/include/pto/cpu/TPush.hpp +++ b/include/pto/cpu/TPush.hpp @@ -46,7 +46,7 @@ enum class TransferDir : uint8_t template PTO_INTERNAL constexpr bool IsC2VProducerTile() { - return TileProd::Loc == TileType::Acc; + return TileProd::Loc == TileType::Acc || TileProd::Loc == TileType::Mat; } template @@ -693,14 +693,14 @@ struct TPipe { if (fifo.GM_SLOT_BUFFER != nullptr) { popTileFromGMFiFo(fifo, tile); return true; - } else if constexpr (TPipe::is_c2v) { + } else if constexpr (TPipe::is_c2v && TileCons::Loc == TileType::Vec) { // && TileCons::Loc != TileType::Vec if constexpr (Split == TileSplitAxis::TILE_NO_SPLIT) { - popTileFromVecFiFo(fifo, tile); + popTileFromVecFiFoSplit(fifo, tile); // popTileFromVecFiFoSplit } else { popTileFromVecFiFoSplit(fifo, tile); } return false; - } else if constexpr (TPipe::is_v2c) { + } else if constexpr (TPipe::is_v2c && TileCons::Loc != TileType::Vec) { // && TileCons::Loc == TileType::Vec popTileFromMatFiFo(fifo, tile); return false; } @@ -728,10 +728,18 @@ PTO_INTERNAL void TPush_c2v(Pipe &pipe, TileProd &tile, size_t entryBase, size_t (Split == TileSplitAxis::TILE_LEFT_RIGHT) ? 
(TileProd::Cols / 2) : static_cast(TileProd::Cols); if constexpr (Split == TileSplitAxis::TILE_NO_SPLIT) { - using SlotTile = Tile; - SlotTile slotTile; - TASSIGN(slotTile, static_cast(pipe.fifo.C2V_CONSUMER_BUF + entryBase)); - cpu_pipe::CopyTileWindow(slotTile, tile, 0, 0); + // using SlotTile = Tile; + // SlotTile slotTile; + // TASSIGN(slotTile, static_cast(pipe.fifo.C2V_CONSUMER_BUF + entryBase)); + // cpu_pipe::CopyTileWindow(slotTile, tile, 0, 0); + auto &slotStorage = Pipe::GetSharedState().local_slot_storage[slotIndex]; + for (uint32_t splitIndex = 0; splitIndex < cpu_pipe::GetSplitCount(); ++splitIndex) { + auto *slotPtr = reinterpret_cast(slotStorage.data() + splitIndex * Pipe::RingFiFo::SLOT_SIZE + + pipe.prod.entryOffset); + const uint32_t rowOffset = (Split == TileSplitAxis::TILE_UP_DOWN) ? splitIndex * consRows : 0; + const uint32_t colOffset = (Split == TileSplitAxis::TILE_LEFT_RIGHT) ? splitIndex * consCols : 0; + cpu_pipe::CopyTileWindowToLinear(slotPtr, consCols, tile, consRows, rowOffset, colOffset); + } } else { auto &slotStorage = Pipe::GetSharedState().local_slot_storage[slotIndex]; for (uint32_t splitIndex = 0; splitIndex < cpu_pipe::GetSplitCount(); ++splitIndex) { @@ -795,15 +803,15 @@ PTO_INTERNAL void TPUSH_IMPL(Pipe &pipe, TileProd &tile) TSTORE(globalData, tile); } else { using GlobalData = GlobalTensor, Stride<1, 1, 1, cols, 1>>; - auto *addr = + auto *addr = reinterpret_cast<__gm__ T *>(reinterpret_cast(pipe.fifo.GM_SLOT_BUFFER) + entryBase); GlobalData globalData(addr); TSTORE(globalData, tile); } - } else if constexpr (Pipe::is_c2v) { - TPush_c2v(pipe, tile, entryBase, slotIndex); - } else if constexpr (Pipe::is_v2c) { + } else if constexpr (Pipe::is_v2c && TileProd::Loc == TileType::Vec) { TPush_v2c(pipe, tile, entryBase); + } else if constexpr (Pipe::is_c2v && TileProd::Loc != TileType::Vec) { + TPush_c2v(pipe, tile, entryBase, slotIndex); } if (pipe.prod.getRecordStatus()) { pipe.prod.template record(); diff --git 
a/tests/cpu/st/testcase/CMakeLists.txt b/tests/cpu/st/testcase/CMakeLists.txt index 3dd7b1ad7..f94ab314d 100644 --- a/tests/cpu/st/testcase/CMakeLists.txt +++ b/tests/cpu/st/testcase/CMakeLists.txt @@ -37,120 +37,122 @@ endfunction() find_package(Threads REQUIRED) set(ALL_TESTCASES -hashfind -mgather -mscatter -setgetval -tabs -tadd -taddc -tadds -taddsc -tassign_alias -tand -tands -targreduceop -taxpy -tbroadcast -tci -tcmp -tcmps -tcolexpand -tcolexpandop -tcolmax -tcolmin -tcolprod -tcolreduceidx -tcolsum -tconcat -tcvt -tdequant -tdiv -tdivs -texp -texpands -textract -tfillpad -tflashattn -tfmod -tfmods -tgather -tgatherb -tget -tget_async -tget_scale_addr -tgetscaleaddr -thistogram -timg2col -tinsert -tload -tloadconv -tlog -tlrelu -tmatmul -tmatmul_layout -#tmatmul_mx -tmax -tmaxs -tmin -tmins -tmov -tmrgsort -tmul -tmuls -tneg -tnot -tnotify -tor -tors -tpartadd -tpartmul -tpartmax -tpartmin -tprefetch -tprelu -tpushpop -tput -tput_async -tquant -trandom -trecip -treduce -trelu -trem -trems -treshape -trowexpand -trowexpandop -trowmax -trowmin -trowreduceidx -trowsum -trsqrt -tscatter -tsel -tsels -tshl -tshls -tshr -tshrs -tsort32 -tsqrt -tstore -tsub -tsubview -tsubc -tsubs -tsubsc -ttest -ttrans -ttri -twait -txor -txors -tpushpop_cv_nosplit -tpushpop_cv -tpushpop_vc_nosplit +# hashfind +# mgather +# mscatter +# setgetval +# tabs +# tadd +# taddc +# tadds +# taddsc +# tassign_alias +# tand +# tands +# targreduceop +# taxpy +# tbroadcast +# tci +# tcmp +# tcmps +# tcolexpand +# tcolexpandop +# tcolmax +# tcolmin +# tcolprod +# tcolreduceidx +# tcolsum +# tconcat +# tcvt +# tdequant +# tdiv +# tdivs +# texp +# texpands +# textract +# tfillpad +# tflashattn +# tfmod +# tfmods +# tgather +# tgatherb +# tget +# tget_async +# tget_scale_addr +# tgetscaleaddr +# thistogram +# timg2col +# tinsert +# tload +# tloadconv +# tlog +# tlrelu +# tmatmul +# tmatmul_layout +# #tmatmul_mx +# tmax +# tmaxs +# tmin +# tmins +# tmov +# tmrgsort +# tmul +# tmuls +# tneg +# 
tnot +# tnotify +# tor +# tors +# tpartadd +# tpartmul +# tpartmax +# tpartmin +# tprefetch +# tprelu +# tpushpop +# tput +# tput_async +# tquant +# trandom +# trecip +# treduce +# trelu +# trem +# trems +# treshape +# trowexpand +# trowexpandop +# trowmax +# trowmin +# trowreduceidx +# trowsum +# trsqrt +# tscatter +# tsel +# tsels +# tshl +# tshls +# tshr +# tshrs +# tsort32 +# tsqrt +# tstore +# tsub +# tsubview +# tsubc +# tsubs +# tsubsc +# ttest +# ttrans +# ttri +# twait +# txor +# txors +# tpushpop_cv_nosplit +# tpushpop_cv +# tpushpop_vc_nosplit +tpush_a3 +tpush_a5 ) foreach(TESTCASE ${ALL_TESTCASES}) diff --git a/tests/cpu/st/testcase/tpush_a3/CMakeLists.txt b/tests/cpu/st/testcase/tpush_a3/CMakeLists.txt new file mode 100644 index 000000000..51273f0d9 --- /dev/null +++ b/tests/cpu/st/testcase/tpush_a3/CMakeLists.txt @@ -0,0 +1 @@ +pto_cpu_sim_st(tpush_a3) diff --git a/tests/cpu/st/testcase/tpush_a3/gen_data.py b/tests/cpu/st/testcase/tpush_a3/gen_data.py new file mode 100644 index 000000000..5c99124f3 --- /dev/null +++ b/tests/cpu/st/testcase/tpush_a3/gen_data.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +import numpy as np +import os + +# Generate random inputs +a = np.random.randn(32, 64).astype(np.float32) +b = np.random.randn(64, 512).astype(np.float32) +c_prev = np.random.randn(32, 512).astype(np.float32) + +# Compute golden output: c = c_prev + matmul(a + 1, b) +c_golden = c_prev + np.matmul(a + 1.0, b) + +# Save as raw binary files + + +case_name = "TPUSH_A3Test.case_1" +if not os.path.exists(case_name): + os.makedirs(case_name) +original_dir = os.getcwd() +os.chdir(case_name) + +a.tofile("a.bin") +b.tofile("b.bin") +c_prev.tofile("c.bin") +c_golden.tofile("golden.bin") + +os.chdir(original_dir) \ No newline at end of file diff --git a/tests/cpu/st/testcase/tpush_a3/main.cpp b/tests/cpu/st/testcase/tpush_a3/main.cpp new file mode 100644 index 000000000..ad4889ff1 --- /dev/null +++ b/tests/cpu/st/testcase/tpush_a3/main.cpp @@ -0,0 +1,332 @@ +/** 
+Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_common.h" + +using namespace std; +using namespace pto; +using namespace PtoTestCommon; + +namespace { +using T = float; + +class TPUSH_A3Test : public testing::Test { +protected: + void SetUp() override + {} + void TearDown() override + {} +}; + +std::string GetGoldenDir() +{ + const testing::TestInfo *testInfo = testing::UnitTest::GetInstance()->current_test_info(); + const std::string caseName = testInfo->name(); + std::string suiteName = testInfo->test_suite_name(); + std::string fullPath = "../" + suiteName + "." 
+ caseName; + return fullPath; +} + +// Pipe and Communication +using MainPipe = TPipe<0, Direction::DIR_BOTH, 8192, 4, 4, false>; + +static __aicore__ void main_incore_0_aic(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, int32_t v5) { + unsigned v6 = 0; + const int32_t v7 = 8; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 512; + const int32_t v11 = 32; + const int64_t v12 = 0; + const int64_t v13 = 32768; + const int32_t v14 = 0; + using T = float; + + auto v15 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 4, false>(v4, v14, v14); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v16 = (size_t) v14; v16 < ((size_t) v7); v16 += (size_t) v9) { + Tile v17 = Tile(v8, v8); + TASSIGN(v17, v13); + Tile v18 = Tile(v8, v8); + __cbuf__ float* v19 = v17.data(); + uint64_t v20 = reinterpret_cast(v19); + TASSIGN(v18, v20); + pto::Shape<1, 1, 1, 64, 64> v21 = pto::Shape<1, 1, 1, 64, 64>(); + pto::Stride<32768, 32768, 32768, 512, 1> v22 = pto::Stride<32768, 32768, 32768, 512, 1>(); + GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND> v23 = GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND>(v3 + (v6 + v6 * (unsigned) v10 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v5 * (uint32_t) v7) + (uint32_t) ((int32_t) v16)) * (uint32_t) v8) * (unsigned) v9), v21, v22); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v18, v23); + Tile v24 = Tile(v11, v8); + TPOP, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v15, v24); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile v25 = Tile(v11, v8); + TASSIGN(v25, v12); + Tile v26 = Tile(v11, v8); + __ca__ float* v27 = v25.data(); + uint64_t v28 = reinterpret_cast(v27); + TASSIGN(v26, v28); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TMOV(v26, v24); 
+ TFREE, TileSplitAxis::TILE_LEFT_RIGHT>(v15); + Tile v29 = Tile(v8, v8); + TASSIGN(v29, v12); + Tile v30 = Tile(v8, v8); + __cb__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + TMOV(v30, v18); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile v33 = Tile(v11, v8); + TASSIGN(v33, v12); + Tile v34 = Tile(v11, v8); + __cc__ float* v35 = v33.data(); + uint64_t v36 = reinterpret_cast(v35); + TASSIGN(v34, v36); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v34, v26, v30); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v15, v34); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + + return; +} + +static __aicore__ void main_incore_0_aiv(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, __gm__ float* v4, int32_t v5) { + unsigned v6 = 0; + const float v7 = 1.0f; + const int32_t v8 = 8; + const int32_t v9 = 64; + const int32_t v10 = 1; + const int32_t v11 = 512; + const int32_t v12 = 32; + const int64_t v13 = 45056; + const int64_t v14 = 40960; + const int64_t v15 = 36864; + const int64_t v16 = 32768; + const int32_t v17 = 0; + using T = float; + + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v18 = get_subblockid(); + auto v19 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 4, false>(v4, v17, v17); + Tile v20 = Tile(v12, v12); + TASSIGN(v20, v16); + Tile v21 = Tile(v12, v12); + __ubuf__ float* v22 = v20.data(); + uint64_t v23 = reinterpret_cast(v22); + TASSIGN(v21, v23); + int32_t v24 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v18) * (uint32_t) v12); + pto::Shape<1, 1, 1, 32, 32> v25 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<2048, 2048, 
2048, 64, 1> v26 = pto::Stride<2048, 2048, 2048, 64, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND> v27 = GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND>(v2 + (v6 + v6 * (unsigned) v9 + (unsigned) v24 * (unsigned) v10), v25, v26); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TLOAD(v21, v27); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + for (size_t v28 = (size_t) v17; v28 < ((size_t) v8); v28 += (size_t) v10) { + Tile v29 = Tile(v12, v12); + TASSIGN(v29, v15); + Tile v30 = Tile(v12, v12); + __ubuf__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + pto::Shape<1, 1, 1, 32, 32> v33 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<16384, 16384, 16384, 512, 1> v34 = pto::Stride<16384, 16384, 16384, 512, 1>(); + GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND>(v1 + (v6 + v6 * (unsigned) v11 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v5 * (uint32_t) v8) + (uint32_t) ((int32_t) v28)) * (uint32_t) v9) + (uint32_t) v24) * (unsigned) v10), v33, v34); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v30, v35); + Tile v36 = Tile(v12, v12); + TASSIGN(v36, v14); + Tile v37 = Tile(v12, v12); + __ubuf__ float* v38 = v36.data(); + uint64_t v39 = reinterpret_cast(v38); + TASSIGN(v37, v39); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TADDS(v37, v21, v7); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v19, v37); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + Tile v40 = Tile(v12, v12); + TPOP, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v19, v40); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile v41 = Tile(v12, v12); + TASSIGN(v41, v13); 
+ Tile v42 = Tile(v12, v12); + __ubuf__ float* v43 = v41.data(); + uint64_t v44 = reinterpret_cast(v43); + TASSIGN(v42, v44); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v42, v30, v40); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TFREE, TileSplitAxis::TILE_LEFT_RIGHT>(v19); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v35, v42); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + + return; +} + +void* g_shared_storage_ptr = nullptr; +// A simple function matching GetPipeSharedStateInjectedHookFn signature +extern "C" void* GlobalPipeHook(uint64_t key, size_t size) { + // We'll use a global pointer to store the allocated memory + return g_shared_storage_ptr; +} + + + +inline void LaunchTPut(T *out, T *A, T *B, T *C) { + size_t v5 = 0; + std::cout<<"Start"<(g_shared_storage_ptr)->~SharedState(); + // free(g_shared_storage_ptr); +} + +void test_tpush() +{ + size_t ARow = 32, ACol = 64, BRow = 64, BCol = 512, CRow = 32, CCol = 512; + size_t ASize = ARow * ACol * sizeof(T); + size_t BSize = BRow * BCol * sizeof(T); + size_t CSize = CRow * CCol * sizeof(T); + + aclInit(nullptr); + aclrtSetDevice(0); + aclrtStream stream; + aclrtCreateStream(&stream); + + T *dstHost, *srcAHost, *srcBHost, *srcCHost; + T *dstDevice, *srcADevice, *srcBDevice, *srcCDevice; + + aclrtMallocHost((void **)(&dstHost), CSize); + aclrtMallocHost((void **)(&srcAHost), ASize); + aclrtMallocHost((void **)(&srcBHost), BSize); + aclrtMallocHost((void **)(&srcCHost), CSize); + + aclrtMalloc((void **)&dstDevice, CSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcADevice, ASize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcBDevice, BSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcCDevice, CSize, ACL_MEM_MALLOC_HUGE_FIRST); + + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/a.bin", ASize, srcAHost, ASize)); + 
CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/b.bin", BSize, srcBHost, BSize)); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/c.bin", CSize, srcCHost, CSize)); + + aclrtMemcpy(srcADevice, ASize, srcAHost, ASize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(srcBDevice, BSize, srcBHost, BSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(srcCDevice, CSize, srcCHost, CSize, ACL_MEMCPY_HOST_TO_DEVICE); + LaunchTPut(dstDevice, srcADevice, srcBDevice, srcCDevice); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, CSize, dstDevice, CSize, ACL_MEMCPY_DEVICE_TO_HOST); + + WriteFile(GetGoldenDir() + "/output.bin", srcCDevice, CSize); + + aclrtFree(dstDevice); + aclrtFree(srcADevice); + aclrtFree(srcBDevice); + aclrtFree(srcCDevice); + + aclrtFreeHost(dstHost); + aclrtFreeHost(srcAHost); + aclrtFreeHost(srcBHost); + aclrtFreeHost(srcCHost); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + + size_t elem_count = CSize / sizeof(T); + + std::vector golden(elem_count); + std::vector devFinal(elem_count); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/golden.bin", CSize, golden.data(), CSize)); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/output.bin", CSize, devFinal.data(), CSize)); + + bool ret = ResultCmp(golden, devFinal, 0.001f); + + EXPECT_TRUE(ret); +} + +TEST_F(TPUSH_A3Test, case_1) +{ + test_tpush(); +} + +} diff --git a/tests/cpu/st/testcase/tpush_a5/CMakeLists.txt b/tests/cpu/st/testcase/tpush_a5/CMakeLists.txt new file mode 100644 index 000000000..a7e91400e --- /dev/null +++ b/tests/cpu/st/testcase/tpush_a5/CMakeLists.txt @@ -0,0 +1 @@ +pto_cpu_sim_st(tpush_a5) diff --git a/tests/cpu/st/testcase/tpush_a5/gen_data.py b/tests/cpu/st/testcase/tpush_a5/gen_data.py new file mode 100644 index 000000000..621070397 --- /dev/null +++ b/tests/cpu/st/testcase/tpush_a5/gen_data.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +import numpy as np +import os + +# Generate random inputs +a = np.random.randn(32, 64).astype(np.float32) +b = 
np.random.randn(64, 512).astype(np.float32) +c_prev = np.random.randn(32, 512).astype(np.float32) + +# Compute golden output: c = c_prev + matmul(a + 1, b) +c_golden = c_prev + np.matmul(a + 1.0, b) + +# Save as raw binary files + + +case_name = "TPUSH_A5Test.case_1" +if not os.path.exists(case_name): + os.makedirs(case_name) +original_dir = os.getcwd() +os.chdir(case_name) + +a.tofile("a.bin") +b.tofile("b.bin") +c_prev.tofile("c.bin") +c_golden.tofile("golden.bin") + +os.chdir(original_dir) + +#------------------------------------------------------ + +c_golden = c_prev + np.matmul(a, b) + +# Save as raw binary files + + +case_name = "TPUSH_A5Test.case_4" +if not os.path.exists(case_name): + os.makedirs(case_name) +original_dir = os.getcwd() +os.chdir(case_name) + +a.tofile("a.bin") +b.tofile("b.bin") +c_prev.tofile("c.bin") +c_golden.tofile("golden.bin") + +os.chdir(original_dir) \ No newline at end of file diff --git a/tests/cpu/st/testcase/tpush_a5/main.cpp b/tests/cpu/st/testcase/tpush_a5/main.cpp new file mode 100644 index 000000000..f5df5ca1e --- /dev/null +++ b/tests/cpu/st/testcase/tpush_a5/main.cpp @@ -0,0 +1,1039 @@ +/** +Copyright (c) 2026 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_common.h" + +using namespace std; +using namespace pto; +using namespace PtoTestCommon; + +namespace { +using T = float; + +class TPUSH_A5Test : public testing::Test { +protected: + void SetUp() override + {} + void TearDown() override + {} +}; + +std::string GetGoldenDir() +{ + const testing::TestInfo *testInfo = testing::UnitTest::GetInstance()->current_test_info(); + const std::string caseName = testInfo->name(); + std::string suiteName = testInfo->test_suite_name(); + std::string fullPath = "../" + suiteName + "." + caseName; + return fullPath; +} + +static __aicore__ void main_incore_0_aic_BI_LEFTRIGHT(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const int32_t v7 = 8; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 512; + const int32_t v11 = 32; + const int64_t v12 = 0; + const int64_t v13 = 32768; + const int32_t v14 = 0; + using T = float; + + auto v15 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, false>(v6, v14, v14); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v16 = (size_t) v14; v16 < ((size_t) v7); v16 += (size_t) v9) { + Tile v17 = Tile(v8, v8); + TASSIGN(v17, v13); + Tile v18 = Tile(v8, v8); + __cbuf__ float* v19 = v17.data(); + uint64_t v20 = reinterpret_cast(v19); + TASSIGN(v18, v20); + pto::Shape<1, 1, 1, 64, 64> v21 = pto::Shape<1, 1, 1, 64, 64>(); + pto::Stride<32768, 32768, 32768, 512, 1> v22 = pto::Stride<32768, 32768, 32768, 512, 1>(); + GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND> v23 = GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND>(v3 + (v5 + v5 * (unsigned) v10 + 
(unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v7) + (uint32_t) ((int32_t) v16)) * (uint32_t) v8) * (unsigned) v9), v21, v22); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v18, v23); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile v24 = Tile(v11, v8); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v15, v24); + set_flag(PIPE_S, PIPE_MTE1, EVENT_ID0); + Tile v25 = Tile(v11, v8); + TASSIGN(v25, v12); + Tile v26 = Tile(v11, v8); + __ca__ float* v27 = v25.data(); + uint64_t v28 = reinterpret_cast(v27); + TASSIGN(v26, v28); + wait_flag(PIPE_S, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TMOV(v26, v24); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + TFREE, TileSplitAxis::TILE_LEFT_RIGHT>(v15); + Tile v29 = Tile(v8, v8); + TASSIGN(v29, v12); + Tile v30 = Tile(v8, v8); + __cb__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(v30, v18); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile v33 = Tile(v11, v8); + TASSIGN(v33, v12); + Tile v34 = Tile(v11, v8); + __cc__ float* v35 = v33.data(); + uint64_t v36 = reinterpret_cast(v35); + TASSIGN(v34, v36); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v34, v26, v30); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v15, v34); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + + return; +} + +static __aicore__ void main_incore_0_aiv_BI_LEFTRIGHT(__gm__ float* 
v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const float v7 = 1.0f; + const int32_t v8 = 8; + const int32_t v9 = 64; + const int32_t v10 = 1; + const int32_t v11 = 512; + const int32_t v12 = 32; + const int64_t v13 = 45056; + const int64_t v14 = 40960; + const int64_t v15 = 36864; + const int64_t v16 = 32768; + const int32_t v17 = 0; + using T = float; + + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v18 = get_subblockid(); + auto v19 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, false>(v6, v17, v17); + Tile v20 = Tile(v12, v12); + TASSIGN(v20, v16); + Tile v21 = Tile(v12, v12); + __ubuf__ float* v22 = v20.data(); + uint64_t v23 = reinterpret_cast(v22); + TASSIGN(v21, v23); + int32_t v24 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v18) * (uint32_t) v12); + pto::Shape<1, 1, 1, 32, 32> v25 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<2048, 2048, 2048, 64, 1> v26 = pto::Stride<2048, 2048, 2048, 64, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND> v27 = GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND>(v2 + (v5 + v5 * (unsigned) v9 + (unsigned) v24 * (unsigned) v10), v25, v26); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_S, EVENT_ID1); + TLOAD(v21, v27); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + for (size_t v28 = (size_t) v17; v28 < ((size_t) v8); v28 += (size_t) v10) { + Tile v29 = Tile(v12, v12); + TASSIGN(v29, v15); + Tile v30 = Tile(v12, v12); + __ubuf__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + pto::Shape<1, 1, 1, 32, 32> v33 = pto::Shape<1, 1, 1, 32, 32>(); + pto::Stride<16384, 16384, 16384, 512, 1> v34 = pto::Stride<16384, 16384, 16384, 512, 1>(); + GlobalTensor, 
pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND>(v1 + (v5 + v5 * (unsigned) v11 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v8) + (uint32_t) ((int32_t) v28)) * (uint32_t) v9) + (uint32_t) v24) * (unsigned) v10), v33, v34); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v30, v35); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile v36 = Tile(v12, v12); + TASSIGN(v36, v14); + Tile v37 = Tile(v12, v12); + __ubuf__ float* v38 = v36.data(); + uint64_t v39 = reinterpret_cast(v38); + TASSIGN(v37, v39); + TADDS(v37, v21, v7); + Tile v40 = Tile(v12, v12); + TASSIGN(v40, v13); + Tile v41 = Tile(v12, v12); + __ubuf__ float* v42 = v40.data(); + uint64_t v43 = reinterpret_cast(v42); + TASSIGN(v41, v43); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMOV(v41, v37); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v19, v41); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + Tile v44 = Tile(v12, v12); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_LEFT_RIGHT>(v19, v44); + set_flag(PIPE_S, PIPE_V, EVENT_ID0); + Tile v45 = Tile(v12, v12); + TASSIGN(v45, v15); + Tile v46 = Tile(v12, v12); + __ubuf__ float* v47 = v45.data(); + uint64_t v48 = reinterpret_cast(v47); + TASSIGN(v46, v48); + wait_flag(PIPE_S, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v46, v30, v44); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TFREE, TileSplitAxis::TILE_LEFT_RIGHT>(v19); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pipe_barrier(PIPE_MTE3); + TSTORE(v35, v46); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, 
EVENT_ID1); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + wait_flag(PIPE_V, PIPE_S, EVENT_ID1); + + return; +} + +static __aicore__ void main_incore_0_aic_BI_NOSPLIT(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const int32_t v7 = 8; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 512; + const int32_t v11 = 32; + const int64_t v12 = 0; + const int64_t v13 = 32768; + const int32_t v14 = 0; + using T = float; + + auto v15 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, true>(v6, v14, v14); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v16 = (size_t) v14; v16 < ((size_t) v7); v16 += (size_t) v9) { + Tile v17 = Tile(v8, v8); + TASSIGN(v17, v13); + Tile v18 = Tile(v8, v8); + __cbuf__ float* v19 = v17.data(); + uint64_t v20 = reinterpret_cast(v19); + TASSIGN(v18, v20); + pto::Shape<1, 1, 1, 64, 64> v21 = pto::Shape<1, 1, 1, 64, 64>(); + pto::Stride<32768, 32768, 32768, 512, 1> v22 = pto::Stride<32768, 32768, 32768, 512, 1>(); + GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND> v23 = GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND>(v3 + (v5 + v5 * (unsigned) v10 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v7) + (uint32_t) ((int32_t) v16)) * (uint32_t) v8) * (unsigned) v9), v21, v22); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v18, v23); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile v24 = Tile(v11, v8); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_NO_SPLIT>(v15, v24); + set_flag(PIPE_S, PIPE_MTE1, EVENT_ID0); + Tile v25 = Tile(v11, v8); + TASSIGN(v25, v12); + Tile v26 = Tile(v11, v8); + __ca__ float* v27 = v25.data(); + 
uint64_t v28 = reinterpret_cast(v27); + TASSIGN(v26, v28); + wait_flag(PIPE_S, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TMOV(v26, v24); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + TFREE, TileSplitAxis::TILE_NO_SPLIT>(v15); + Tile v29 = Tile(v8, v8); + TASSIGN(v29, v12); + Tile v30 = Tile(v8, v8); + __cb__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(v30, v18); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile v33 = Tile(v11, v8); + TASSIGN(v33, v12); + Tile v34 = Tile(v11, v8); + __cc__ float* v35 = v33.data(); + uint64_t v36 = reinterpret_cast(v35); + TASSIGN(v34, v36); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v34, v26, v30); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_NO_SPLIT>(v15, v34); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + + return; +} + +static __aicore__ void main_incore_0_aiv_BI_NOSPLIT(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const float v7 = 1.0f; + const int32_t v8 = 8; + const int32_t v9 = 64; + const int32_t v10 = 1; + const int32_t v11 = 512; + const int32_t v12 = 32; + const int64_t v13 = 57344; + const int64_t v14 = 49152; + const int64_t v15 = 40960; + const int64_t v16 = 32768; + const int32_t v17 = 0; + using T = float; + + set_mask_norm(); + set_vector_mask(-1, -1); + if (get_subblockid() == 0) { + auto v18 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, true>(v6, v17, v17); + 
Tile v19 = Tile(v12, v9); + TASSIGN(v19, v16); + Tile v20 = Tile(v12, v9); + __ubuf__ float* v21 = v19.data(); + uint64_t v22 = reinterpret_cast(v21); + TASSIGN(v20, v22); + pto::Shape<1, 1, 1, 32, 64> v23 = pto::Shape<1, 1, 1, 32, 64>(); + pto::Stride<2048, 2048, 2048, 64, 1> v24 = pto::Stride<2048, 2048, 2048, 64, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND> v25 = GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND>(v2 + (v5 + v5 * (unsigned) v9 + v5 * (unsigned) v10), v23, v24); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_S, EVENT_ID1); + TLOAD(v20, v25); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + for (size_t v26 = (size_t) v17; v26 < ((size_t) v8); v26 += (size_t) v10) { + Tile v27 = Tile(v12, v9); + TASSIGN(v27, v15); + Tile v28 = Tile(v12, v9); + __ubuf__ float* v29 = v27.data(); + uint64_t v30 = reinterpret_cast(v29); + TASSIGN(v28, v30); + pto::Shape<1, 1, 1, 32, 64> v31 = pto::Shape<1, 1, 1, 32, 64>(); + pto::Stride<16384, 16384, 16384, 512, 1> v32 = pto::Stride<16384, 16384, 16384, 512, 1>(); + GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND> v33 = GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND>(v1 + (v5 + v5 * (unsigned) v11 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v8) + (uint32_t) ((int32_t) v26)) * (uint32_t) v9) * (unsigned) v10), v31, v32); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v28, v33); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile v34 = Tile(v12, v9); + TASSIGN(v34, v14); + Tile v35 = Tile(v12, v9); + __ubuf__ float* v36 = v34.data(); + uint64_t v37 = reinterpret_cast(v36); + TASSIGN(v35, v37); + TADDS(v35, v20, v7); + Tile v38 = Tile(v12, v9); + 
TASSIGN(v38, v13); + Tile v39 = Tile(v12, v9); + __ubuf__ float* v40 = v38.data(); + uint64_t v41 = reinterpret_cast(v40); + TASSIGN(v39, v41); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMOV(v39, v35); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_NO_SPLIT>(v18, v39); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + Tile v42 = Tile(v12, v9); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_NO_SPLIT>(v18, v42); + set_flag(PIPE_S, PIPE_V, EVENT_ID0); + Tile v43 = Tile(v12, v9); + TASSIGN(v43, v15); + Tile v44 = Tile(v12, v9); + __ubuf__ float* v45 = v43.data(); + uint64_t v46 = reinterpret_cast(v45); + TASSIGN(v44, v46); + wait_flag(PIPE_S, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v44, v28, v42); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TFREE, TileSplitAxis::TILE_NO_SPLIT>(v18); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pipe_barrier(PIPE_MTE3); + TSTORE(v33, v44); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + wait_flag(PIPE_V, PIPE_S, EVENT_ID1); + } + + return; +} + +static __aicore__ void main_incore_0_aic_BI_TOPDOWN(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const int32_t v7 = 8; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 512; + const int32_t v11 = 32; + const int64_t v12 = 0; + const int64_t v13 = 32768; + const int32_t v14 = 0; + using T = float; + + auto v15 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, false>(v6, v14, v14); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + 
set_flag(PIPE_MTE1, PIPE_S, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v16 = (size_t) v14; v16 < ((size_t) v7); v16 += (size_t) v9) { + Tile v17 = Tile(v8, v8); + TASSIGN(v17, v13); + Tile v18 = Tile(v8, v8); + __cbuf__ float* v19 = v17.data(); + uint64_t v20 = reinterpret_cast(v19); + TASSIGN(v18, v20); + pto::Shape<1, 1, 1, 64, 64> v21 = pto::Shape<1, 1, 1, 64, 64>(); + pto::Stride<32768, 32768, 32768, 512, 1> v22 = pto::Stride<32768, 32768, 32768, 512, 1>(); + GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND> v23 = GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND>(v3 + (v5 + v5 * (unsigned) v10 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v7) + (uint32_t) ((int32_t) v16)) * (uint32_t) v8) * (unsigned) v9), v21, v22); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v18, v23); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile v24 = Tile(v11, v8); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_UP_DOWN>(v15, v24); + set_flag(PIPE_S, PIPE_MTE1, EVENT_ID0); + Tile v25 = Tile(v11, v8); + TASSIGN(v25, v12); + Tile v26 = Tile(v11, v8); + __ca__ float* v27 = v25.data(); + uint64_t v28 = reinterpret_cast(v27); + TASSIGN(v26, v28); + wait_flag(PIPE_S, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TMOV(v26, v24); + set_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + TFREE, TileSplitAxis::TILE_UP_DOWN>(v15); + Tile v29 = Tile(v8, v8); + TASSIGN(v29, v12); + Tile v30 = Tile(v8, v8); + __cb__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(v30, v18); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile v33 = Tile(v11, v8); + TASSIGN(v33, v12); + Tile v34 = Tile(v11, v8); + __cc__ float* v35 = v33.data(); + uint64_t v36 = 
reinterpret_cast(v35); + TASSIGN(v34, v36); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v34, v26, v30); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_UP_DOWN>(v15, v34); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_S, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + + return; +} + +static __aicore__ void main_incore_0_aiv_BI_TOPDOWN(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const float v7 = 1.0f; + const int32_t v8 = 8; + const int32_t v9 = 16; + const int32_t v10 = 64; + const int32_t v11 = 1; + const int32_t v12 = 512; + const int64_t v13 = 45056; + const int64_t v14 = 40960; + const int64_t v15 = 36864; + const int64_t v16 = 32768; + const int32_t v17 = 0; + using T = float; + + set_mask_norm(); + set_vector_mask(-1, -1); + int64_t v18 = get_subblockid(); + auto v19 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, false>(v6, v17, v17); + Tile v20 = Tile(v9, v10); + TASSIGN(v20, v16); + Tile v21 = Tile(v9, v10); + __ubuf__ float* v22 = v20.data(); + uint64_t v23 = reinterpret_cast(v22); + TASSIGN(v21, v23); + int32_t v24 = (int32_t) ((uint32_t) ((int32_t) (int64_t) v18) * (uint32_t) v9); + pto::Shape<1, 1, 1, 16, 64> v25 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<1024, 1024, 1024, 64, 1> v26 = pto::Stride<1024, 1024, 1024, 64, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND> v27 = GlobalTensor, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND>(v2 + (v5 + (unsigned) v24 * (unsigned) v10 + v5 * (unsigned) v11), v25, v26); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, 
PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_S, EVENT_ID1); + TLOAD(v21, v27); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + for (size_t v28 = (size_t) v17; v28 < ((size_t) v8); v28 += (size_t) v11) { + Tile v29 = Tile(v9, v10); + TASSIGN(v29, v15); + Tile v30 = Tile(v9, v10); + __ubuf__ float* v31 = v29.data(); + uint64_t v32 = reinterpret_cast(v31); + TASSIGN(v30, v32); + pto::Shape<1, 1, 1, 16, 64> v33 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<8192, 8192, 8192, 512, 1> v34 = pto::Stride<8192, 8192, 8192, 512, 1>(); + GlobalTensor, pto::Stride<8192, 8192, 8192, 512, 1>, pto::Layout::ND> v35 = GlobalTensor, pto::Stride<8192, 8192, 8192, 512, 1>, pto::Layout::ND>(v1 + (v5 + (unsigned) v24 * (unsigned) v12 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v8) + (uint32_t) ((int32_t) v28)) * (uint32_t) v10) * (unsigned) v11), v33, v34); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v30, v35); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile v36 = Tile(v9, v10); + TASSIGN(v36, v14); + Tile v37 = Tile(v9, v10); + __ubuf__ float* v38 = v36.data(); + uint64_t v39 = reinterpret_cast(v38); + TASSIGN(v37, v39); + TADDS(v37, v21, v7); + Tile v40 = Tile(v9, v10); + TASSIGN(v40, v13); + Tile v41 = Tile(v9, v10); + __ubuf__ float* v42 = v40.data(); + uint64_t v43 = reinterpret_cast(v42); + TASSIGN(v41, v43); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TMOV(v41, v37); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_UP_DOWN>(v19, v41); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + Tile v44 = Tile(v9, v10); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_UP_DOWN>(v19, v44); + set_flag(PIPE_S, PIPE_V, EVENT_ID0); + Tile v45 = Tile(v9, v10); + TASSIGN(v45, v15); + 
Tile v46 = Tile(v9, v10); + __ubuf__ float* v47 = v45.data(); + uint64_t v48 = reinterpret_cast(v47); + TASSIGN(v46, v48); + wait_flag(PIPE_S, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TADD(v46, v30, v44); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TFREE, TileSplitAxis::TILE_UP_DOWN>(v19); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pipe_barrier(PIPE_MTE3); + TSTORE(v35, v46); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + wait_flag(PIPE_V, PIPE_S, EVENT_ID1); + + return; +} + +static __aicore__ void main_incore_0_aic_C2V_NOSPLIT(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const int32_t v7 = 8; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 512; + const int32_t v11 = 32; + const int64_t v12 = 16384; + const int64_t v13 = 0; + const int32_t v14 = 0; + using T = float; + + auto v15 = TPipe<0, Direction::DIR_C2V, 8192, 8, 2, true>(v6, v14, v14); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v16 = (size_t) v14; v16 < ((size_t) v7); v16 += (size_t) v9) { + Tile v17 = Tile(v8, v8); + TASSIGN(v17, v13); + Tile v18 = Tile(v8, v8); + __cbuf__ float* v19 = v17.data(); + uint64_t v20 = reinterpret_cast(v19); + TASSIGN(v18, v20); + pto::Shape<1, 1, 1, 64, 64> v21 = pto::Shape<1, 1, 1, 64, 64>(); + pto::Stride<32768, 32768, 32768, 512, 1> v22 = pto::Stride<32768, 32768, 32768, 512, 1>(); + GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND> v23 = GlobalTensor, pto::Stride<32768, 32768, 32768, 512, 1>, pto::Layout::ND>(v2 + (v5 + v5 * 
(unsigned) v10 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v7) + (uint32_t) ((int32_t) v16)) * (uint32_t) v8) * (unsigned) v9), v21, v22); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v18, v23); + Tile v24 = Tile(v11, v8); + TASSIGN(v24, v12); + Tile v25 = Tile(v11, v8); + __cbuf__ float* v26 = v24.data(); + uint64_t v27 = reinterpret_cast(v26); + TASSIGN(v25, v27); + pto::Shape<1, 1, 1, 32, 64> v28 = pto::Shape<1, 1, 1, 32, 64>(); + pto::Stride<2048, 2048, 2048, 64, 1> v29 = pto::Stride<2048, 2048, 2048, 64, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND> v30 = GlobalTensor, pto::Stride<2048, 2048, 2048, 64, 1>, pto::Layout::ND>(v3 + (v5 + v5 * (unsigned) v8 + v5 * (unsigned) v9), v28, v29); + TLOAD(v25, v30); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile v31 = Tile(v11, v8); + TASSIGN(v31, v13); + Tile v32 = Tile(v11, v8); + __ca__ float* v33 = v31.data(); + uint64_t v34 = reinterpret_cast(v33); + TASSIGN(v32, v34); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TMOV(v32, v25); + Tile v35 = Tile(v8, v8); + TASSIGN(v35, v13); + Tile v36 = Tile(v8, v8); + __cb__ float* v37 = v35.data(); + uint64_t v38 = reinterpret_cast(v37); + TASSIGN(v36, v38); + TMOV(v36, v18); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile v39 = Tile(v11, v8); + TASSIGN(v39, v13); + Tile v40 = Tile(v11, v8); + __cc__ float* v41 = v39.data(); + uint64_t v42 = reinterpret_cast(v41); + TASSIGN(v40, v42); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v40, v32, v36); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TPUSH, Tile, TileSplitAxis::TILE_NO_SPLIT>(v15, v40); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, 
EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + + return; +} + +static __aicore__ void main_incore_0_aiv_C2V_NOSPLIT(__gm__ float* v1, __gm__ float* v2, __gm__ float* v3, int32_t v4) { + unsigned v5 = 0; + __gm__ void * v6 = nullptr; + const int32_t v7 = 8; + const int32_t v8 = 64; + const int32_t v9 = 1; + const int32_t v10 = 512; + const int32_t v11 = 32; + const int64_t v12 = 65536; + const int32_t v13 = 0; + using T = float; + + set_mask_norm(); + set_vector_mask(-1, -1); + if (get_subblockid() == 0) { + auto v14 = TPipe<0, Direction::DIR_C2V, 8192, 8, 2, true>(v6, v13, v13); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_S, EVENT_ID1); + for (size_t v15 = (size_t) v13; v15 < ((size_t) v7); v15 += (size_t) v9) { + Tile v16 = Tile(v11, v8); + TASSIGN(v16, v12); + Tile v17 = Tile(v11, v8); + __ubuf__ float* v18 = v16.data(); + uint64_t v19 = reinterpret_cast(v18); + TASSIGN(v17, v19); + pto::Shape<1, 1, 1, 32, 64> v20 = pto::Shape<1, 1, 1, 32, 64>(); + pto::Stride<16384, 16384, 16384, 512, 1> v21 = pto::Stride<16384, 16384, 16384, 512, 1>(); + GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND> v22 = GlobalTensor, pto::Stride<16384, 16384, 16384, 512, 1>, pto::Layout::ND>(v1 + (v5 + v5 * (unsigned) v10 + (unsigned) ((int32_t) (uint32_t) ((int32_t) (uint32_t) ((int32_t) (uint32_t) v4 * (uint32_t) v7) + (uint32_t) ((int32_t) v15)) * (uint32_t) v8) * (unsigned) v9), v20, v21); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v17, v22); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile v23 = Tile(v11, v8); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + TPOP, Tile, TileSplitAxis::TILE_NO_SPLIT>(v14, v23); + set_flag(PIPE_S, PIPE_V, EVENT_ID0); + Tile v24 = Tile(v11, v8); + TASSIGN(v24, v12); + Tile v25 = Tile(v11, v8); + __ubuf__ float* v26 = v24.data(); + uint64_t v27 = reinterpret_cast(v26); 
+ TASSIGN(v25, v27); + wait_flag(PIPE_S, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(v25, v17, v23); + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TFREE, TileSplitAxis::TILE_NO_SPLIT>(v14); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_MTE3); + TSTORE(v22, v25); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + wait_flag(PIPE_V, PIPE_S, EVENT_ID1); + } + + return; +} + + +void* g_shared_storage_ptr = nullptr; +// A simple function matching GetPipeSharedStateInjectedHookFn signature +extern "C" void* GlobalPipeHook(uint64_t key, size_t size) { + // We'll use a global pointer to store the allocated memory + return g_shared_storage_ptr; +} + + + +inline void LaunchTPut_BI_LEFTRIGHT(T *out, T *A, T *B, T *C) { + using MainPipe_1 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, true>; + size_t v5 = 0; + std::cout<<"Start"<(g_shared_storage_ptr)->~SharedState(); + // free(g_shared_storage_ptr); +} + +inline void LaunchTPut_BI_NOSPLIT(T *out, T *A, T *B, T *C) { + using MainPipe_2 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, true>; + size_t v5 = 0; + std::cout<<"Start"<(g_shared_storage_ptr)->~SharedState(); + // free(g_shared_storage_ptr); +} + +inline void LaunchTPut_BI_TOPDOWN(T *out, T *A, T *B, T *C) { + using MainPipe_3 = TPipe<0, Direction::DIR_BOTH, 8192, 4, 2, false>; + size_t v5 = 0; + std::cout<<"Start"<(g_shared_storage_ptr)->~SharedState(); + // free(g_shared_storage_ptr); +} + +inline void LaunchTPut_C2V_NOSPLIT(T *out, T *A, T *B, T *C) { + using MainPipe = TPipe<0, Direction::DIR_C2V, 8192, 8, 2, true>; + size_t v5 = 0; + std::cout<<"Start"<(g_shared_storage_ptr)->~SharedState(); + // free(g_shared_storage_ptr); +} + + +template +void test_tpush() +{ + size_t ARow = 32, ACol = 64, BRow = 64, BCol = 512, CRow = 32, CCol = 512; + size_t ASize = ARow * 
ACol * sizeof(T); + size_t BSize = BRow * BCol * sizeof(T); + size_t CSize = CRow * CCol * sizeof(T); + + aclInit(nullptr); + aclrtSetDevice(0); + aclrtStream stream; + aclrtCreateStream(&stream); + + T *dstHost, *srcAHost, *srcBHost, *srcCHost; + T *dstDevice, *srcADevice, *srcBDevice, *srcCDevice; + + aclrtMallocHost((void **)(&dstHost), CSize); + aclrtMallocHost((void **)(&srcAHost), ASize); + aclrtMallocHost((void **)(&srcBHost), BSize); + aclrtMallocHost((void **)(&srcCHost), CSize); + + aclrtMalloc((void **)&dstDevice, CSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcADevice, ASize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcBDevice, BSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcCDevice, CSize, ACL_MEM_MALLOC_HUGE_FIRST); + + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/a.bin", ASize, srcAHost, ASize)); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/b.bin", BSize, srcBHost, BSize)); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/c.bin", CSize, srcCHost, CSize)); + + aclrtMemcpy(srcADevice, ASize, srcAHost, ASize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(srcBDevice, BSize, srcBHost, BSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(srcCDevice, CSize, srcCHost, CSize, ACL_MEMCPY_HOST_TO_DEVICE); + + if(key == 1) { + LaunchTPut_BI_LEFTRIGHT(dstDevice, srcADevice, srcBDevice, srcCDevice); + } else if (key == 2) { + LaunchTPut_BI_NOSPLIT(dstDevice, srcADevice, srcBDevice, srcCDevice); + } else if (key == 3) { + LaunchTPut_BI_TOPDOWN(dstDevice, srcADevice, srcBDevice, srcCDevice); + } else if (key == 4) { + LaunchTPut_C2V_NOSPLIT(dstDevice, srcADevice, srcBDevice, srcCDevice); + } + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, CSize, dstDevice, CSize, ACL_MEMCPY_DEVICE_TO_HOST); + + WriteFile(GetGoldenDir() + "/output.bin", srcCDevice, CSize); + + aclrtFree(dstDevice); + aclrtFree(srcADevice); + aclrtFree(srcBDevice); + aclrtFree(srcCDevice); + + aclrtFreeHost(dstHost); + aclrtFreeHost(srcAHost); + 
aclrtFreeHost(srcBHost); + aclrtFreeHost(srcCHost); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + + size_t elem_count = CSize / sizeof(T); + + std::vector golden(elem_count); + std::vector devFinal(elem_count); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/golden.bin", CSize, golden.data(), CSize)); + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/output.bin", CSize, devFinal.data(), CSize)); + + bool ret = ResultCmp(golden, devFinal, 0.001f); + + EXPECT_TRUE(ret); +} + +// TEST_F(TPUSH_A5Test, case_1) +// { +// test_tpush<1>(); +// } + +// TEST_F(TPUSH_A5Test, case_2) +// { +// test_tpush<2>(); +// } + +// TEST_F(TPUSH_A5Test, case_3) +// { +// test_tpush<3>(); +// } + +TEST_F(TPUSH_A5Test, case_4) +{ + test_tpush<4>(); +} + +}