diff --git a/include/pto/common/pto_instr.hpp b/include/pto/common/pto_instr.hpp index 74606495d..4b12360e8 100644 --- a/include/pto/common/pto_instr.hpp +++ b/include/pto/common/pto_instr.hpp @@ -982,6 +982,15 @@ PTO_INST RecordEvent TGATHER(DstTileData &dst, SrcTileData &src, WaitEvents &... return {}; } +template +PTO_INST RecordEvent TSCATTER(DstTileData &dst, SrcTileData &src, WaitEvents &...events) +{ + TSYNC(events...); + TSCATTER_IMPL(dst, src); + return {}; +} + template PTO_INST RecordEvent TPARTADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { diff --git a/include/pto/costmodel/pto_instr.hpp b/include/pto/costmodel/pto_instr.hpp index 43d48c1d1..2eb07e60a 100644 --- a/include/pto/costmodel/pto_instr.hpp +++ b/include/pto/costmodel/pto_instr.hpp @@ -109,13 +109,13 @@ PTO_INST void TSYNC() } template -PTO_INST void TSYNC(WaitEvents &... events) +PTO_INST void TSYNC(WaitEvents &...events) { WaitAllEvents(events...); } template -PTO_INST RecordEvent TADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TADD, dst, src0, src1); @@ -123,7 +123,7 @@ PTO_INST RecordEvent TADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TABS(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TABS(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TABS, dst, src); @@ -131,7 +131,7 @@ PTO_INST RecordEvent TABS(TileDataDst &dst, TileDataSrc &src, WaitEvents &... ev } template -PTO_INST RecordEvent TAND(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TAND(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TAND, dst, src0, src1); @@ -139,7 +139,7 @@ PTO_INST RecordEvent TAND(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TOR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TOR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TOR, dst, src0, src1); @@ -147,7 +147,7 @@ PTO_INST RecordEvent TOR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src } template -PTO_INST RecordEvent TSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSUB, dst, src0, src1); @@ -156,7 +156,7 @@ PTO_INST RecordEvent TSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr template PTO_INST RecordEvent TSUBVIEW(TileDataDst &dst, TileDataSrc &src, uint16_t rowIdx, uint16_t colIdx, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSUBVIEW, dst, src, rowIdx, colIdx); @@ -164,7 +164,7 @@ PTO_INST RecordEvent TSUBVIEW(TileDataDst &dst, TileDataSrc &src, uint16_t rowId } template -PTO_INST RecordEvent TMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMUL, dst, src0, src1); @@ -172,7 +172,7 @@ PTO_INST RecordEvent TMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMIN, dst, src0, src1); @@ -180,7 +180,7 @@ PTO_INST RecordEvent TMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMAX, dst, src0, src1); @@ -188,7 +188,7 @@ PTO_INST RecordEvent TMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TEXPANDS(TileData &dst, typename TileData::DType scalar, WaitEvents &... events) +PTO_INST RecordEvent TEXPANDS(TileData &dst, typename TileData::DType scalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TEXPANDS, dst, scalar); @@ -196,7 +196,7 @@ PTO_INST RecordEvent TEXPANDS(TileData &dst, typename TileData::DType scalar, Wa } template -PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, WaitEvents &... events) +PTO_INST RecordEvent TLOAD(TileData &dst, GlobalData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TLOAD, dst, src); @@ -212,7 +212,7 @@ PTO_INST RecordEvent TPREFETCH(TileData &dst, GlobalData &src) template PTO_INST RecordEvent TCMPS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType src1, CmpMode mode, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCMPS, dst, src0, src1, mode); @@ -222,7 +222,7 @@ PTO_INST RecordEvent TCMPS(TileDataDst &dst, TileDataSrc &src0, typename TileDat template , typename... WaitEvents> PTO_INST RecordEvent TCMPS(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, CmpMode mode, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCMPS, dst, src0, src1, mode); @@ -231,7 +231,7 @@ PTO_INST RecordEvent TCMPS(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &s template PTO_INST RecordEvent TCMP(TileDataDst &dst, TileDataSrc &src0, TileDataSrc &src1, CmpMode cmpMode, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCMP, dst, src0, src1, cmpMode); @@ -239,7 +239,7 @@ PTO_INST RecordEvent TCMP(TileDataDst &dst, TileDataSrc &src0, TileDataSrc &src1 } template -PTO_INST RecordEvent TCONCAT(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCONCAT(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCONCAT, dst, src0, src1); @@ -247,7 +247,7 @@ PTO_INST RecordEvent TCONCAT(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 } template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, AtomicType::AtomicNone), dst, src); @@ -256,7 +256,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... even // UF-aware overload: allow selecting unit-flag phase while keeping the TSTORE name. template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, AtomicType::AtomicNone, Phase), dst, src); @@ -264,7 +264,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... even } template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, atomicType), dst, src); @@ -272,7 +272,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... even } template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, atomicType, Phase), dst, src); @@ -281,7 +281,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... even template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, atomicType, reluPreMode), dst, src); @@ -290,7 +290,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... even template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, atomicType, reluPreMode, Phase), dst, src); @@ -299,7 +299,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, WaitEvents &... even template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantScalar, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantScalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, atomicType, reluPreMode), dst, src, @@ -309,7 +309,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantSca template -PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantScalar, WaitEvents &... events) +PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantScalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE, PTO_TEMPLATE_ARGS(TileData, GlobalData, atomicType, reluPreMode, Phase), dst, src, @@ -319,7 +319,7 @@ PTO_INST RecordEvent TSTORE(GlobalData &dst, TileData &src, uint64_t preQuantSca template -PTO_INST RecordEvent TSTORE_FP(GlobalData &dst, TileData &src, FpTileData &fp, WaitEvents &... events) +PTO_INST RecordEvent TSTORE_FP(GlobalData &dst, TileData &src, FpTileData &fp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TSTORE_FP, PTO_TEMPLATE_ARGS(TileData, GlobalData, FpTileData, atomicType, reluPreMode), dst, src, @@ -328,7 +328,7 @@ PTO_INST RecordEvent TSTORE_FP(GlobalData &dst, TileData &src, FpTileData &fp, W } template -PTO_INST RecordEvent TDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TDIV, dst, src0, src1); @@ -336,7 +336,7 @@ PTO_INST RecordEvent TDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TSHL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TSHL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSHL, dst, src0, src1); @@ -344,7 +344,7 @@ PTO_INST RecordEvent TSHL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TSHR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TSHR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSHR, dst, src0, src1); @@ -352,7 +352,7 @@ PTO_INST RecordEvent TSHR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TAND(TileData &dst, TileData &src0, TileData &src1, WaitEvents &... events) +PTO_INST RecordEvent TAND(TileData &dst, TileData &src0, TileData &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TAND, dst, src0, src1); @@ -360,7 +360,7 @@ PTO_INST RecordEvent TAND(TileData &dst, TileData &src0, TileData &src1, WaitEve } template -PTO_INST RecordEvent TOR(TileData &dst, TileData &src0, TileData &src1, WaitEvents &... events) +PTO_INST RecordEvent TOR(TileData &dst, TileData &src0, TileData &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TOR, dst, src0, src1); @@ -370,7 +370,7 @@ PTO_INST RecordEvent TOR(TileData &dst, TileData &src0, TileData &src1, WaitEven template PTO_INST RecordEvent TXOR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TXOR, dst, src0, src1, tmp); @@ -378,7 +378,7 @@ PTO_INST RecordEvent TXOR(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TLOG(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TLOG(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TLOG, dst, src); @@ -386,7 +386,7 @@ PTO_INST RecordEvent TLOG(TileDataDst &dst, TileDataSrc &src, WaitEvents &... ev } template -PTO_INST RecordEvent TRECIP(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TRECIP(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TDIVS, dst, 1, src); @@ -396,7 +396,7 @@ PTO_INST RecordEvent TRECIP(TileDataDst &dst, TileDataSrc &src, WaitEvents &... template PTO_INST RecordEvent TPRELU(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TPRELU, dst, src0, src1, tmp); @@ -404,7 +404,7 @@ PTO_INST RecordEvent TPRELU(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 & } template -PTO_INST RecordEvent TPRINT(TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TPRINT(TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TPRINT, src); @@ -412,7 +412,7 @@ PTO_INST RecordEvent TPRINT(TileData &src, WaitEvents &... events) } template -PTO_INST RecordEvent TADDC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents &... events) +PTO_INST RecordEvent TADDC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TADDC, dst, src0, src1, src2); @@ -420,7 +420,7 @@ PTO_INST RecordEvent TADDC(TileData &dst, TileData &src0, TileData &src1, TileDa } template -PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents &... events) +PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileData &src2, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSUBC, dst, src0, src1, src2); @@ -428,7 +428,7 @@ PTO_INST RecordEvent TSUBC(TileData &dst, TileData &src0, TileData &src1, TileDa } template -PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &... events) +PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMATMUL, cMatrix, aMatrix, bMatrix); @@ -437,7 +437,7 @@ PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMa // UF-aware overload enabling unit-flag selection via AccPhase while retaining the TMATMUL name. template -PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &... events) +PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMATMUL, PTO_TEMPLATE_ARGS(Phase), cMatrix, aMatrix, bMatrix); @@ -446,7 +446,7 @@ PTO_INST RecordEvent TMATMUL(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMa template PTO_INST RecordEvent TMATMUL_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMATMUL_ACC, cOutMatrix, cInMatrix, aMatrix, bMatrix); @@ -456,7 +456,7 @@ PTO_INST RecordEvent TMATMUL_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLe // UF-aware overloads for TMATMUL_ACC: explicit input/output or shared accumulator tile. template PTO_INST RecordEvent TMATMUL_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMATMUL_ACC, PTO_TEMPLATE_ARGS(Phase), cOutMatrix, cInMatrix, aMatrix, bMatrix); @@ -465,7 +465,7 @@ PTO_INST RecordEvent TMATMUL_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLe template -PTO_INST RecordEvent TMATMUL_ACC(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &... events) +PTO_INST RecordEvent TMATMUL_ACC(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMATMUL_ACC, PTO_TEMPLATE_ARGS(Phase), cMatrix, aMatrix, bMatrix); @@ -474,7 +474,7 @@ PTO_INST RecordEvent TMATMUL_ACC(TileRes &cMatrix, TileLeft &aMatrix, TileRight template PTO_INST RecordEvent TMATMUL_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMATMUL_BIAS, cMatrix, aMatrix, bMatrix, biasData); @@ -485,7 +485,7 @@ PTO_INST RecordEvent TMATMUL_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight template PTO_INST RecordEvent TMATMUL_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMATMUL_BIAS, PTO_TEMPLATE_ARGS(Phase), cMatrix, aMatrix, bMatrix, biasData); @@ -493,7 +493,7 @@ PTO_INST RecordEvent TMATMUL_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight } template -PTO_INST RecordEvent TGEMV(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &... events) +PTO_INST RecordEvent TGEMV(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TGEMV, cMatrix, aMatrix, bMatrix); @@ -501,7 +501,7 @@ PTO_INST RecordEvent TGEMV(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatr } template -PTO_INST RecordEvent TGEMV(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &... events) +PTO_INST RecordEvent TGEMV(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TGEMV, PTO_TEMPLATE_ARGS(Phase), cMatrix, aMatrix, bMatrix); @@ -510,7 +510,7 @@ PTO_INST RecordEvent TGEMV(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatr template PTO_INST RecordEvent TGEMV_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TGEMV_ACC, cOutMatrix, cInMatrix, aMatrix, bMatrix); @@ -519,7 +519,7 @@ PTO_INST RecordEvent TGEMV_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft template PTO_INST RecordEvent TGEMV_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft &aMatrix, TileRight &bMatrix, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TGEMV_ACC, PTO_TEMPLATE_ARGS(Phase), cOutMatrix, cInMatrix, aMatrix, bMatrix); @@ -528,7 +528,7 @@ PTO_INST RecordEvent TGEMV_ACC(TileRes &cOutMatrix, TileRes &cInMatrix, TileLeft template PTO_INST RecordEvent TGEMV_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TGEMV_BIAS, cMatrix, aMatrix, bMatrix, biasData); @@ -538,7 +538,7 @@ PTO_INST RecordEvent TGEMV_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight & template PTO_INST RecordEvent TGEMV_BIAS(TileRes &cMatrix, TileLeft &aMatrix, TileRight &bMatrix, TileBias &biasData, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TGEMV_BIAS, PTO_TEMPLATE_ARGS(Phase), cMatrix, aMatrix, bMatrix, biasData); @@ -549,7 +549,7 @@ template PTO_INST RecordEvent TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, TmpTileData &tmp, Src0TileData &src0, Src1TileData &src1, Src2TileData &src2, Src3TileData &src3, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T( @@ -562,7 +562,7 @@ PTO_INST RecordEvent TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executed template PTO_INST RecordEvent TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, TmpTileData &tmp, - Src0TileData &src0, Src1TileData &src1, Src2TileData &src2, WaitEvents &... events) + Src0TileData &src0, Src1TileData &src1, Src2TileData &src2, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMRGSORT, @@ -574,7 +574,7 @@ PTO_INST RecordEvent TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executed template PTO_INST RecordEvent TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executedNumList, TmpTileData &tmp, - Src0TileData &src0, Src1TileData &src1, WaitEvents &... events) + Src0TileData &src0, Src1TileData &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMRGSORT, PTO_TEMPLATE_ARGS(DstTileData, TmpTileData, Src0TileData, Src1TileData, exhausted), dst, @@ -583,7 +583,7 @@ PTO_INST RecordEvent TMRGSORT(DstTileData &dst, MrgSortExecutedNumList &executed } template -PTO_INST RecordEvent TMRGSORT(DstTileData &dst, SrcTileData &src, uint32_t blockLen, WaitEvents &... events) +PTO_INST RecordEvent TMRGSORT(DstTileData &dst, SrcTileData &src, uint32_t blockLen, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMRGSORT, dst, src, blockLen); @@ -592,7 +592,7 @@ PTO_INST RecordEvent TMRGSORT(DstTileData &dst, SrcTileData &src, uint32_t block template PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, uint16_t indexRow = 0, uint16_t indexCol = 0, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TEXTRACT, dst, src, indexRow, indexCol); @@ -601,7 +601,7 @@ PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, uint16_t index template PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, uint16_t indexRow, uint16_t indexCol, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TEXTRACT, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, reluMode), dst, src, indexRow, indexCol); @@ -611,7 +611,7 @@ PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, uint16_t index template PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, uint16_t indexRow, - uint16_t indexCol, WaitEvents &... events) + uint16_t indexCol, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TEXTRACT, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, reluMode), dst, src, preQuantScalar, @@ -622,7 +622,7 @@ PTO_INST RecordEvent TEXTRACT(DstTileData &dst, SrcTileData &src, uint64_t preQu template PTO_INST RecordEvent TEXTRACT_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, uint16_t indexRow, - uint16_t indexCol, WaitEvents &... events) + uint16_t indexCol, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TEXTRACT_FP, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, FpTileData, reluMode), dst, src, fp, @@ -633,7 +633,7 @@ PTO_INST RecordEvent TEXTRACT_FP(DstTileData &dst, SrcTileData &src, FpTileData template PTO_INST RecordEvent TIMG2COL(TileData &dst, ConvTileData &src, uint16_t posM = 0, uint16_t posK = 0, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TIMG2COL, PTO_TEMPLATE_ARGS(TileData, ConvTileData, FmatrixMode), dst, src, posM, posK); @@ -641,7 +641,7 @@ PTO_INST RecordEvent TIMG2COL(TileData &dst, ConvTileData &src, uint16_t posM = } template -PTO_INST RecordEvent SETFMATRIX(ConvTileData &src, WaitEvents &... events) +PTO_INST RecordEvent SETFMATRIX(ConvTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(SETFMATRIX, PTO_TEMPLATE_ARGS(ConvTileData, FmatrixMode), src); @@ -650,7 +650,7 @@ PTO_INST RecordEvent SETFMATRIX(ConvTileData &src, WaitEvents &... events) #ifdef PTO_NPU_ARCH_A2A3 template -PTO_INST RecordEvent SET_IMG2COL_RPT(ConvTileData &src, WaitEvents &... events) +PTO_INST RecordEvent SET_IMG2COL_RPT(ConvTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(SET_IMG2COL_RPT, PTO_TEMPLATE_ARGS(ConvTileData), src); @@ -658,7 +658,7 @@ PTO_INST RecordEvent SET_IMG2COL_RPT(ConvTileData &src, WaitEvents &... events) } template -PTO_INST RecordEvent SET_IMG2COL_PADDING(ConvTileData &src, WaitEvents &... events) +PTO_INST RecordEvent SET_IMG2COL_PADDING(ConvTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(SET_IMG2COL_PADDING, PTO_TEMPLATE_ARGS(ConvTileData), src); @@ -667,7 +667,7 @@ PTO_INST RecordEvent SET_IMG2COL_PADDING(ConvTileData &src, WaitEvents &... even #endif template PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, uint16_t indexRow, uint16_t indexCol, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TINSERT, dst, src, indexRow, indexCol); @@ -676,7 +676,7 @@ PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, uint16_t indexR template PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, uint16_t indexRow, uint16_t indexCol, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TINSERT, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, reluMode), dst, src, indexRow, indexCol); @@ -686,7 +686,7 @@ PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, uint16_t indexR template PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, uint16_t indexRow, - uint16_t indexCol, WaitEvents &... events) + uint16_t indexCol, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TINSERT, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, reluMode), dst, src, preQuantScalar, indexRow, @@ -697,7 +697,7 @@ PTO_INST RecordEvent TINSERT(DstTileData &dst, SrcTileData &src, uint64_t preQua template PTO_INST RecordEvent TINSERT_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, uint16_t indexRow, - uint16_t indexCol, WaitEvents &... events) + uint16_t indexCol, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TINSERT_FP, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, FpTileData, reluMode), dst, src, fp, @@ -707,7 +707,7 @@ PTO_INST RecordEvent TINSERT_FP(DstTileData &dst, SrcTileData &src, FpTileData & template = 0, typename... WaitEvents> -PTO_INST RecordEvent TFILLPAD(TileData &dst, TileData &src, WaitEvents &... events) +PTO_INST RecordEvent TFILLPAD(TileData &dst, TileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TFILLPAD, PTO_TEMPLATE_ARGS(TileData, PadVal), dst, src); @@ -717,7 +717,7 @@ PTO_INST RecordEvent TFILLPAD(TileData &dst, TileData &src, WaitEvents &... even template = 0, typename... WaitEvents> -PTO_INST RecordEvent TFILLPAD(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TFILLPAD(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TFILLPAD, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData), dst, src); @@ -725,7 +725,7 @@ PTO_INST RecordEvent TFILLPAD(DstTileData &dst, SrcTileData &src, WaitEvents &.. } template -PTO_INST RecordEvent TFILLPAD_INPLACE(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TFILLPAD_INPLACE(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TFILLPAD_INPLACE, dst, src); @@ -733,7 +733,7 @@ PTO_INST RecordEvent TFILLPAD_INPLACE(DstTileData &dst, SrcTileData &src, WaitEv } template -PTO_INST RecordEvent TFILLPAD_EXPAND(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TFILLPAD_EXPAND(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TFILLPAD_EXPAND, dst, src); @@ -757,7 +757,7 @@ PTO_INST RecordEvent TSORT32(DstTileData &dst, SrcTileData &src, IdxTileData &id template PTO_INST RecordEvent TGATHER(TileDataD &dst, TileDataS0 &src0, TileDataS1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TGATHER, dst, src0, src1, tmp); @@ -767,7 +767,7 @@ PTO_INST RecordEvent TGATHER(TileDataD &dst, TileDataS0 &src0, TileDataS1 &src1, template PTO_INST RecordEvent TGATHER(TileDataD &dst, TileDataS &src0, typename TileDataS::DType k_value, TileDataC &cdst, - TileDataTmp &tmp, WaitEvents &... events) + TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TGATHER, PTO_TEMPLATE_ARGS(TileDataD, TileDataS, TileDataC, TileDataTmp, cmpMode, offset), dst, @@ -776,7 +776,7 @@ PTO_INST RecordEvent TGATHER(TileDataD &dst, TileDataS &src0, typename TileDataS } template -PTO_INST RecordEvent TCI(TileData &dst, T start, WaitEvents &... events) +PTO_INST RecordEvent TCI(TileData &dst, T start, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TCI, PTO_TEMPLATE_ARGS(TileData, T, descending), dst, start); @@ -784,7 +784,7 @@ PTO_INST RecordEvent TCI(TileData &dst, T start, WaitEvents &... events) } template -PTO_INST RecordEvent TTRI(TileData &dst, int diagonal, WaitEvents &... events) +PTO_INST RecordEvent TTRI(TileData &dst, int diagonal, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TTRI, PTO_TEMPLATE_ARGS(TileData, isUpperOrLower), dst, diagonal); @@ -792,15 +792,24 @@ PTO_INST RecordEvent TTRI(TileData &dst, int diagonal, WaitEvents &... events) } template -PTO_INST RecordEvent TGATHER(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TGATHER(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TGATHER, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, maskPattern), dst, src); return {}; } +template +PTO_INST RecordEvent TSCATTER(DstTileData &dst, SrcTileData &src, WaitEvents &...events) +{ + TSYNC(events...); + MAP_INSTR_IMPL_T(TSCATTER, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, maskPattern), dst, src); + return {}; +} + template -PTO_INST RecordEvent TPARTADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TPARTADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TPARTADD, dst, src0, src1); @@ -808,7 +817,7 @@ PTO_INST RecordEvent TPARTADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 } template -PTO_INST RecordEvent TPARTMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TPARTMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TPARTMUL, dst, src0, src1); @@ -816,7 +825,7 @@ PTO_INST RecordEvent TPARTMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 } template -PTO_INST RecordEvent TPARTMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TPARTMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TPARTMAX, dst, src0, src1); @@ -824,7 +833,7 @@ PTO_INST RecordEvent TPARTMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 } template -PTO_INST RecordEvent TPARTMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TPARTMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TPARTMIN, dst, src0, src1); @@ -833,7 +842,7 @@ PTO_INST RecordEvent TPARTMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 template PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, TmpTileData &tmp, RoundMode mode, SaturationMode satMode, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCVT, dst, src, tmp, mode, satMode); @@ -841,7 +850,7 @@ PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, TmpTileData &tmp, Roun } template -PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, TmpTileData &tmp, RoundMode mode, WaitEvents &... events) +PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, TmpTileData &tmp, RoundMode mode, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCVT, dst, src, tmp, mode); @@ -849,8 +858,7 @@ PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, TmpTileData &tmp, Roun } template -PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, SaturationMode satMode, - WaitEvents &... events) +PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, SaturationMode satMode, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCVT, dst, src, mode, satMode); @@ -858,7 +866,7 @@ PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, Satura } template -PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, WaitEvents &... events) +PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCVT, dst, src, mode); @@ -866,7 +874,7 @@ PTO_INST RecordEvent TCVT(TileDataD &dst, TileDataS &src, RoundMode mode, WaitEv } template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMOV, dst, src); @@ -874,7 +882,7 @@ PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &... ev } template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMOV, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, reluMode), dst, src); @@ -883,7 +891,7 @@ PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &... ev template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &... events) +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMOV, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, mode, reluMode), dst, src); @@ -892,7 +900,7 @@ PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, WaitEvents &... ev template -PTO_INST RecordEvent TMOV_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents &... events) +PTO_INST RecordEvent TMOV_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMOV_FP, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, FpTileData, reluMode), dst, src, fp); @@ -901,7 +909,7 @@ PTO_INST RecordEvent TMOV_FP(DstTileData &dst, SrcTileData &src, FpTileData &fp, template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents &... events) +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, FpTileData &fp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMOV, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, FpTileData, mode, reluMode), dst, src, fp); @@ -910,7 +918,7 @@ PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, FpTileData &fp, Wa template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents &... events) +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMOV, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, reluMode), dst, src, preQuantScalar); @@ -919,7 +927,7 @@ PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantS template -PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents &... events) +PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantScalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TMOV, PTO_TEMPLATE_ARGS(DstTileData, SrcTileData, mode, reluMode), dst, src, preQuantScalar); @@ -927,7 +935,7 @@ PTO_INST RecordEvent TMOV(DstTileData &dst, SrcTileData &src, uint64_t preQuantS } template -PTO_INST RecordEvent TROWSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TROWSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWSUM, dst, src, tmp); @@ -935,7 +943,7 @@ PTO_INST RecordEvent TROWSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp } template -PTO_INST RecordEvent TROWPROD(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TROWPROD(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWPROD, dst, src, tmp); @@ -943,7 +951,7 @@ PTO_INST RecordEvent TROWPROD(TileDataOut &dst, TileDataIn &src, TileDataTmp &tm } template -PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, WaitEvents &... events) +PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLSUM, dst, src); @@ -951,7 +959,7 @@ PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, WaitEvents &... } template -PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, bool isBinary, WaitEvents &... events) +PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, bool isBinary, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLSUM, dst, src, tmp, isBinary); @@ -959,7 +967,7 @@ PTO_INST RecordEvent TCOLSUM(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp } template -PTO_INST RecordEvent TCOLPROD(TileDataOut &dst, TileDataIn &src, WaitEvents &... events) +PTO_INST RecordEvent TCOLPROD(TileDataOut &dst, TileDataIn &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLPROD, dst, src); @@ -967,7 +975,7 @@ PTO_INST RecordEvent TCOLPROD(TileDataOut &dst, TileDataIn &src, WaitEvents &... } template -PTO_INST RecordEvent TCOLMAX(TileDataOut &dst, TileDataIn &src, WaitEvents &... events) +PTO_INST RecordEvent TCOLMAX(TileDataOut &dst, TileDataIn &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLMAX, dst, src); @@ -975,7 +983,7 @@ PTO_INST RecordEvent TCOLMAX(TileDataOut &dst, TileDataIn &src, WaitEvents &... } template -PTO_INST RecordEvent TROWMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TROWMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWMAX, dst, src, tmp); @@ -983,7 +991,7 @@ PTO_INST RecordEvent TROWMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp } template -PTO_INST RecordEvent TROWARGMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TROWARGMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWARGMAX, dst, src, tmp); @@ -991,7 +999,7 @@ PTO_INST RecordEvent TROWARGMAX(TileDataOut &dst, TileDataIn &src, TileDataTmp & } template -PTO_INST RecordEvent TRESHAPE(TileDataOut &dst, TileDataIn &src, WaitEvents &... events) +PTO_INST RecordEvent TRESHAPE(TileDataOut &dst, TileDataIn &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TRESHAPE, dst, src); @@ -999,7 +1007,7 @@ PTO_INST RecordEvent TRESHAPE(TileDataOut &dst, TileDataIn &src, WaitEvents &... } template -PTO_INST RecordEvent TROWMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TROWMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWMIN, dst, src, tmp); @@ -1007,7 +1015,7 @@ PTO_INST RecordEvent TROWMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp } template -PTO_INST RecordEvent TROWARGMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TROWARGMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWARGMIN, dst, src, tmp); @@ -1017,7 +1025,7 @@ PTO_INST RecordEvent TROWARGMIN(TileDataOut &dst, TileDataIn &src, TileDataTmp & template PTO_INST RecordEvent TSELS(TileDataDst &dst, TileDataMask &mask, TileDataSrc &src, TileDataTmp &tmp, - typename TileDataSrc::DType scalar, WaitEvents &... events) + typename TileDataSrc::DType scalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSELS, dst, mask, src, tmp, scalar); @@ -1026,7 +1034,7 @@ PTO_INST RecordEvent TSELS(TileDataDst &dst, TileDataMask &mask, TileDataSrc &sr template PTO_INST RecordEvent TSEL(TileData &dst, MaskTile &selMask, TileData &src0, TileData &src1, TmpTile &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSEL, dst, selMask, src0, src1, tmp); @@ -1034,7 +1042,7 @@ PTO_INST RecordEvent TSEL(TileData &dst, MaskTile &selMask, TileData &src0, Tile } template -PTO_INST RecordEvent TTRANS(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TTRANS(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TTRANS, dst, src, tmp); @@ -1043,7 +1051,7 @@ PTO_INST RecordEvent TTRANS(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp template PTO_INST RecordEvent TMINS(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMINS, dst, src, scalar); @@ -1051,7 +1059,7 @@ PTO_INST RecordEvent TMINS(TileDataDst &dst, TileDataSrc &src, typename TileData } template -PTO_INST RecordEvent TROWEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPAND, dst, src); @@ -1059,7 +1067,7 @@ PTO_INST RecordEvent TROWEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents & } template -PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDDIV, dst, src0, src1); @@ -1069,7 +1077,7 @@ PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDat template PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDDIV, dst, src0, src1, tmp); @@ -1077,7 +1085,7 @@ PTO_INST RecordEvent TROWEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDMUL, dst, src0, src1); @@ -1087,7 +1095,7 @@ PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDat template PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDMUL, dst, src0, src1, tmp); @@ -1095,7 +1103,7 @@ PTO_INST RecordEvent TROWEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDSUB, dst, src0, src1); @@ -1105,7 +1113,7 @@ PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDat template PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDSUB, dst, src0, src1, tmp); @@ -1113,7 +1121,7 @@ PTO_INST RecordEvent TROWEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDADD, dst, src0, src1); @@ -1123,7 +1131,7 @@ PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDat template PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDADD, dst, src0, src1, tmp); @@ -1131,7 +1139,7 @@ PTO_INST RecordEvent TROWEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDMAX, dst, src0, src1); @@ -1141,7 +1149,7 @@ PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDat template PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDMAX, dst, src0, src1, tmp); @@ -1149,7 +1157,7 @@ PTO_INST RecordEvent TROWEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDMIN, dst, src0, src1); @@ -1159,7 +1167,7 @@ PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDat template PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDMIN, dst, src0, src1, tmp); @@ -1167,7 +1175,7 @@ PTO_INST RecordEvent TROWEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TROWEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TROWEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDEXPDIF, dst, src0, src1); @@ -1177,7 +1185,7 @@ PTO_INST RecordEvent TROWEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, Tile template PTO_INST RecordEvent TROWEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TROWEXPANDEXPDIF, dst, src0, src1, tmp); @@ -1185,7 +1193,7 @@ PTO_INST RecordEvent TROWEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, Tile } template -PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TRSQRT, dst, src); @@ -1194,7 +1202,7 @@ PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents &... template > -PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp, WaitEvents &... events) +PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TRSQRT, dst, src, tmp); @@ -1202,7 +1210,7 @@ PTO_INST RecordEvent TRSQRT(TileDataDst &dst, TileDataSrc &src, TileDataTmp &tmp } template -PTO_INST RecordEvent TSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSQRT, dst, src); @@ -1210,7 +1218,7 @@ PTO_INST RecordEvent TSQRT(TileDataDst &dst, TileDataSrc &src, WaitEvents &... e } template -PTO_INST RecordEvent TEXP(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TEXP(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TEXP, dst, src); @@ -1218,7 +1226,7 @@ PTO_INST RecordEvent TEXP(TileDataDst &dst, TileDataSrc &src, WaitEvents &... ev } template -PTO_INST RecordEvent TNOT(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TNOT(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TNOT, dst, src); @@ -1226,7 +1234,7 @@ PTO_INST RecordEvent TNOT(TileDataDst &dst, TileDataSrc &src, WaitEvents &... ev } template -PTO_INST RecordEvent TRELU(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TRELU(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TRELU, dst, src); @@ -1234,7 +1242,7 @@ PTO_INST RecordEvent TRELU(TileDataDst &dst, TileDataSrc &src, WaitEvents &... e } template -PTO_INST RecordEvent TGATHERB(TileDataDst &dst, TileDataSrc &src, TileDataOffset &offset, WaitEvents &... events) +PTO_INST RecordEvent TGATHERB(TileDataDst &dst, TileDataSrc &src, TileDataOffset &offset, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TGATHERB, dst, src, offset); @@ -1243,7 +1251,7 @@ PTO_INST RecordEvent TGATHERB(TileDataDst &dst, TileDataSrc &src, TileDataOffset template PTO_INST RecordEvent TADDS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TADDS, dst, src0, scalar); @@ -1252,7 +1260,7 @@ PTO_INST RecordEvent TADDS(TileDataDst &dst, TileDataSrc &src0, typename TileDat template PTO_INST RecordEvent TAXPY(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TAXPY, dst, src0, scalar); @@ -1261,7 +1269,7 @@ PTO_INST RecordEvent TAXPY(TileDataDst &dst, TileDataSrc &src0, typename TileDat template PTO_INST RecordEvent TSUBS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSUBS, dst, src0, scalar); @@ -1270,7 +1278,7 @@ PTO_INST RecordEvent TSUBS(TileDataDst &dst, TileDataSrc &src0, typename TileDat template PTO_INST RecordEvent TDIVS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TDIVS, dst, src0, scalar); @@ -1279,7 +1287,7 @@ PTO_INST RecordEvent TDIVS(TileDataDst &dst, TileDataSrc &src0, typename TileDat template PTO_INST RecordEvent TMULS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMULS, dst, src0, scalar); @@ -1288,7 +1296,7 @@ PTO_INST RecordEvent TMULS(TileDataDst &dst, TileDataSrc &src0, typename TileDat template PTO_INST RecordEvent TDIVS(TileDataDst &dst, typename TileDataDst::DType scalar, TileDataSrc &src0, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TDIVS, dst, scalar, src0); @@ -1297,7 +1305,7 @@ PTO_INST RecordEvent TDIVS(TileDataDst &dst, typename TileDataDst::DType scalar, template PTO_INST RecordEvent TFMODS(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TFMODS, dst, src, scalar); @@ -1306,7 +1314,7 @@ PTO_INST RecordEvent TFMODS(TileDataDst &dst, TileDataSrc &src, typename TileDat template PTO_INST RecordEvent TREMS(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TREMS, dst, src, scalar); @@ -1315,7 +1323,7 @@ PTO_INST RecordEvent TREMS(TileDataDst &dst, TileDataSrc &src, typename TileData template PTO_INST RecordEvent TMAXS(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TMAXS, dst, src, scalar); @@ -1324,7 +1332,7 @@ PTO_INST RecordEvent TMAXS(TileDataDst &dst, TileDataSrc &src, typename TileData template PTO_INST RecordEvent TANDS(TileDataDst &dst, TileDataSrc &src, typename TileDataDst::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TANDS, dst, src, scalar); @@ -1332,8 +1340,7 @@ PTO_INST RecordEvent TANDS(TileDataDst &dst, TileDataSrc &src, typename TileData } template -PTO_INST RecordEvent TORS(TileDataDst &dst, TileDataSrc &src, typename TileDataDst::DType scalar, - WaitEvents &... events) +PTO_INST RecordEvent TORS(TileDataDst &dst, TileDataSrc &src, typename TileDataDst::DType scalar, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TORS, dst, src, scalar); @@ -1342,7 +1349,7 @@ PTO_INST RecordEvent TORS(TileDataDst &dst, TileDataSrc &src, typename TileDataD template PTO_INST RecordEvent TSHLS(TileDataDst &dst, TileDataSrc &src, typename TileDataDst::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSHLS, dst, src, scalar); @@ -1351,7 +1358,7 @@ PTO_INST RecordEvent TSHLS(TileDataDst &dst, TileDataSrc &src, typename TileData template PTO_INST RecordEvent TSHRS(TileDataDst &dst, TileDataSrc &src, typename TileDataDst::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSHRS, dst, src, scalar); @@ -1360,7 +1367,7 @@ PTO_INST RecordEvent TSHRS(TileDataDst &dst, TileDataSrc &src, typename TileData template PTO_INST RecordEvent TXORS(TileDataDst &dst, TileDataSrc &src0, typename TileDataSrc::DType scalar, TileDataTmp &tmp, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TXORS, dst, src0, scalar, tmp); @@ -1369,7 +1376,7 @@ PTO_INST RecordEvent TXORS(TileDataDst &dst, TileDataSrc &src0, typename TileDat template PTO_INST RecordEvent TLRELU(TileDataDst &dst, TileDataSrc &src, typename TileDataSrc::DType scalar, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TLRELU, dst, src, scalar); @@ -1378,7 +1385,7 @@ PTO_INST RecordEvent TLRELU(TileDataDst &dst, TileDataSrc &src, typename TileDat template PTO_INST RecordEvent TADDSC(TileData &dst, TileData &src0, typename TileData::DType scalar, TileData &src1, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TADDSC, dst, src0, scalar, src1); @@ -1387,7 +1394,7 @@ PTO_INST RecordEvent TADDSC(TileData &dst, TileData &src0, typename TileData::DT template PTO_INST RecordEvent TSUBSC(TileData &dst, TileData &src0, typename TileData::DType scalar, TileData &src1, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSUBSC, dst, src0, scalar, src1); @@ -1395,7 +1402,7 @@ PTO_INST RecordEvent TSUBSC(TileData &dst, TileData &src0, typename TileData::DT } template -PTO_INST RecordEvent TCOLMIN(TileDataOut &dst, TileDataIn &src, WaitEvents &... events) +PTO_INST RecordEvent TCOLMIN(TileDataOut &dst, TileDataIn &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLMIN, dst, src); @@ -1403,7 +1410,7 @@ PTO_INST RecordEvent TCOLMIN(TileDataOut &dst, TileDataIn &src, WaitEvents &... } template -PTO_INST RecordEvent TSCATTER(TileDataD &dst, TileDataS &src, TileDataI &indexes, WaitEvents &... events) +PTO_INST RecordEvent TSCATTER(TileDataD &dst, TileDataS &src, TileDataI &indexes, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TSCATTER, dst, src, indexes); @@ -1411,7 +1418,7 @@ PTO_INST RecordEvent TSCATTER(TileDataD &dst, TileDataS &src, TileDataI &indexes } template -PTO_INST RecordEvent TCOLEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPAND, dst, src); @@ -1419,7 +1426,7 @@ PTO_INST RecordEvent TCOLEXPAND(TileDataDst &dst, TileDataSrc &src, WaitEvents & } template -PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, TileInd &indexes, WaitEvents &... events) +PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, TileInd &indexes, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(MGATHER, dst, src, indexes); @@ -1427,7 +1434,7 @@ PTO_INST RecordEvent MGATHER(TileDst &dst, GlobalData &src, TileInd &indexes, Wa } template -PTO_INST RecordEvent MSCATTER(GlobalData &dst, TileSrc &src, TileInd &indexes, WaitEvents &... events) +PTO_INST RecordEvent MSCATTER(GlobalData &dst, TileSrc &src, TileInd &indexes, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(MSCATTER, dst, src, indexes); @@ -1435,7 +1442,7 @@ PTO_INST RecordEvent MSCATTER(GlobalData &dst, TileSrc &src, TileInd &indexes, W } template -PTO_INST RecordEvent TNEG(TileDataDst &dst, TileDataSrc &src, WaitEvents &... events) +PTO_INST RecordEvent TNEG(TileDataDst &dst, TileDataSrc &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TNEG, dst, src); @@ -1443,7 +1450,7 @@ PTO_INST RecordEvent TNEG(TileDataDst &dst, TileDataSrc &src, WaitEvents &... ev } template -PTO_INST RecordEvent TCOLEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDDIV, dst, src0, src1); @@ -1451,7 +1458,7 @@ PTO_INST RecordEvent TCOLEXPANDDIV(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TCOLEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDMUL, dst, src0, src1); @@ -1459,7 +1466,7 @@ PTO_INST RecordEvent TCOLEXPANDMUL(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TCOLEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDADD, dst, src0, src1); @@ -1467,7 +1474,7 @@ PTO_INST RecordEvent TCOLEXPANDADD(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TCOLEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDMAX, dst, src0, src1); @@ -1475,7 +1482,7 @@ PTO_INST RecordEvent TCOLEXPANDMAX(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TCOLEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDMIN, dst, src0, src1); @@ -1483,7 +1490,7 @@ PTO_INST RecordEvent TCOLEXPANDMIN(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TCOLEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDSUB, dst, src0, src1); @@ -1491,7 +1498,7 @@ PTO_INST RecordEvent TCOLEXPANDSUB(TileDataDst &dst, TileDataSrc0 &src0, TileDat } template -PTO_INST RecordEvent TCOLEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TCOLEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TCOLEXPANDEXPDIF, dst, src0, src1); @@ -1500,7 +1507,7 @@ PTO_INST RecordEvent TCOLEXPANDEXPDIF(TileDataDst &dst, TileDataSrc0 &src0, Tile template PTO_INST RecordEvent TDEQUANT(TileDataDst &dst, TileDataSrc &src, TileDataPara &scale, TileDataPara &offset, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TDEQUANT, dst, src, scale, offset); @@ -1508,7 +1515,7 @@ PTO_INST RecordEvent TDEQUANT(TileDataDst &dst, TileDataSrc &src, TileDataPara & } template -PTO_INST RecordEvent TREM(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TREM(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TREM, dst, src0, src1); @@ -1516,7 +1523,7 @@ PTO_INST RecordEvent TREM(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &sr } template -PTO_INST RecordEvent TFMOD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &... events) +PTO_INST RecordEvent TFMOD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &src1, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TFMOD, dst, src0, src1); @@ -1524,7 +1531,7 @@ PTO_INST RecordEvent TFMOD(TileDataDst &dst, TileDataSrc0 &src0, TileDataSrc1 &s } template -PTO_INST RecordEvent TPUSH(Pipe &pipe, TileProd &tile, WaitEvents &... events) +PTO_INST RecordEvent TPUSH(Pipe &pipe, TileProd &tile, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TPUSH, PTO_TEMPLATE_ARGS(Pipe, TileProd, Split), pipe, tile); @@ -1532,7 +1539,7 @@ PTO_INST RecordEvent TPUSH(Pipe &pipe, TileProd &tile, WaitEvents &... events) } template -PTO_INST RecordEvent TPUSH(TileData &tile, Pipe &pipe, WaitEvents &... events) +PTO_INST RecordEvent TPUSH(TileData &tile, Pipe &pipe, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TPUSH, PTO_TEMPLATE_ARGS(TileData, Pipe), tile, pipe); @@ -1540,7 +1547,7 @@ PTO_INST RecordEvent TPUSH(TileData &tile, Pipe &pipe, WaitEvents &... events) } template -PTO_INST RecordEvent TPOP(Pipe &pipe, TileCons &tile, WaitEvents &... events) +PTO_INST RecordEvent TPOP(Pipe &pipe, TileCons &tile, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TPOP, PTO_TEMPLATE_ARGS(Pipe, TileCons, Split), pipe, tile); @@ -1548,7 +1555,7 @@ PTO_INST RecordEvent TPOP(Pipe &pipe, TileCons &tile, WaitEvents &... events) } template -PTO_INST RecordEvent TPOP(TileData &tile, Pipe &pipe, WaitEvents &... events) +PTO_INST RecordEvent TPOP(TileData &tile, Pipe &pipe, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TPOP, PTO_TEMPLATE_ARGS(TileData, Pipe), tile, pipe); @@ -1556,7 +1563,7 @@ PTO_INST RecordEvent TPOP(TileData &tile, Pipe &pipe, WaitEvents &... events) } template -PTO_INST RecordEvent TFREE(Pipe &pipe, WaitEvents &... events) +PTO_INST RecordEvent TFREE(Pipe &pipe, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TFREE, PTO_TEMPLATE_ARGS(Pipe, Split), pipe); @@ -1564,7 +1571,7 @@ PTO_INST RecordEvent TFREE(Pipe &pipe, WaitEvents &... events) } template -PTO_INST RecordEvent TFREE(Pipe &pipe, WaitEvents &... events) +PTO_INST RecordEvent TFREE(Pipe &pipe, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TFREE, PTO_TEMPLATE_ARGS(Pipe), pipe); @@ -1573,7 +1580,7 @@ PTO_INST RecordEvent TFREE(Pipe &pipe, WaitEvents &... events) template PTO_INST RecordEvent TQUANT(TileDataOut &dst, TileDataSrc &src, TileDataPara &scale, TileDataPara *offset = nullptr, - WaitEvents &... events) + WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL_T(TQUANT, PTO_TEMPLATE_ARGS(quant_type, TileDataOut, TileDataSrc, TileDataPara), dst, src, scale, @@ -1582,7 +1589,7 @@ PTO_INST RecordEvent TQUANT(TileDataOut &dst, TileDataSrc &src, TileDataPara &sc } template -PTO_INST RecordEvent TGET_SCALE_ADDR(TileDataOut &dst, TileDataIn &src, WaitEvents &... events) +PTO_INST RecordEvent TGET_SCALE_ADDR(TileDataOut &dst, TileDataIn &src, WaitEvents &...events) { TSYNC(events...); MAP_INSTR_IMPL(TGET_SCALE_ADDR, dst, src); diff --git a/include/pto/cpu/TScatter.hpp b/include/pto/cpu/TScatter.hpp index 57ba3d881..9d7fff414 100644 --- a/include/pto/cpu/TScatter.hpp +++ b/include/pto/cpu/TScatter.hpp @@ -12,6 +12,7 @@ See LICENSE in the root of the software repository for the full text of the Lice #define TSCATTER_HPP #include "pto/cpu/tile_offsets.hpp" +#include "pto/cpu/TGather.hpp" #include #include @@ -40,6 +41,43 @@ PTO_INTERNAL void TSCATTER_IMPL(TileDataDst &dst, TileDataSrc &src, TileInd &ind } } +template +PTO_INTERNAL void TScatter(typename DstTileData::TileDType dst, typename SrcTileData::TileDType src, unsigned validRow, + unsigned validCol) +{ + unsigned sR = 0; + unsigned sC = 0; + for (unsigned r = 0; r < validRow; r++) { + for (unsigned c = 0; c < validCol; c++) { + const size_t didx = GetTileElementOffset(r, c); + if (MaskSelect(maskPattern, c)) { + const size_t sidx = GetTileElementOffset(sR, sC); + dst[didx] = static_cast(src[sidx]); + if (++sC == SrcTileData::Cols) { + sC = 0; + sR++; + } + } else { + dst[didx] = static_cast(0); + } + } + } +} + +template +PTO_INTERNAL void TSCATTER_IMPL(DstTileData &dst, SrcTileData &src) +{ + using T = typename SrcTileData::DType; + static_assert(sizeof(T) == 2 || sizeof(T) == 4, "TSCATTER: src element type must be 16 or 32-bit wide"); + static_assert((DstTileData::Loc == TileType::Vec) && (SrcTileData::Loc == TileType::Vec), + "TSCATTER: expect vec TileType"); + static_assert((DstTileData::isRowMajor && SrcTileData::isRowMajor), "TSCATTER: expect row major"); + static_assert((sizeof(typename DstTileData::DType) == sizeof(T)), + "TSCATTER: expect same type size for dst and src"); + assert(src.GetValidCol() == SrcTileData::Cols); + TScatter(dst.data(), src.data(), src.GetValidRow(), dst.GetValidCol()); +} + } // namespace pto #endif diff --git a/tests/cpu/st/testcase/tscatter/gen_data.py b/tests/cpu/st/testcase/tscatter/gen_data.py index 70d731fd1..8e3467e32 100644 --- a/tests/cpu/st/testcase/tscatter/gen_data.py +++ b/tests/cpu/st/testcase/tscatter/gen_data.py @@ -15,6 +15,42 @@ np.random.seed(19) +P0101 = 1 +P1010 = 2 +P0001 = 3 +P0010 = 4 +P0100 = 5 +P1000 = 6 +P1111 = 7 + +FLOAT_P0101_ROW = 4 +FLOAT_P0101_COL = 64 +FLOAT_P1010_ROW = 7 +FLOAT_P1010_COL = 1024 +FLOAT_P0001_ROW = 3 +FLOAT_P0001_COL = 1056 +FLOAT_P0010_ROW = 4 +FLOAT_P0010_COL = 128 +FLOAT_P0100_ROW = 5 +FLOAT_P0100_COL = 256 +FLOAT_P1000_ROW = 6 +FLOAT_P1000_COL = 288 +FLOAT_P1111_ROW = 7 +FLOAT_P1111_COL = 320 + +HALF_P0101_ROW = 5 +HALF_P0101_COL = 128 +HALF_P1010_ROW = 7 +HALF_P1010_COL = 1024 +HALF_P0001_ROW = 3 +HALF_P0001_COL = 1024 +HALF_P0010_ROW = 4 +HALF_P0010_COL = 128 +HALF_P0100_ROW = 5 +HALF_P0100_COL = 256 +HALF_P1000_ROW = 6 +HALF_P1000_COL = 256 + def gen_case(case_dir: str, rows: int, cols: int): os.makedirs(case_dir, exist_ok=True) @@ -34,6 +70,110 @@ def gen_case(case_dir: str, rows: int, cols: int): os.chdir("..") +class TScatterParamsMasked: + def __init__(self, name, src_type, row, dst_col, pattern): + self.testname = name + self.src_type = src_type + self.row = row + self.dst_col = dst_col + self.pattern = pattern + + +def gen_masked_scatter_golden(param: TScatterParamsMasked): + original_dir = os.getcwd() + os.makedirs(param.testname, exist_ok=True) + os.chdir(param.testname) + + row = param.row + dst_col = param.dst_col + pattern = param.pattern + + if pattern == P0101: + src_col = dst_col // 2 + mask_indices = set(range(0, dst_col, 2)) + elif pattern == P1010: + src_col = dst_col // 2 + mask_indices = set(range(1, dst_col, 2)) + elif pattern == P0001: + src_col = dst_col // 4 + mask_indices = set(range(0, dst_col, 4)) + elif pattern == P0010: + src_col = dst_col // 4 + mask_indices = set(range(1, dst_col, 4)) + elif pattern == P0100: + src_col = dst_col // 4 + mask_indices = set(range(2, dst_col, 4)) + elif pattern == P1000: + src_col = dst_col // 4 + mask_indices = set(range(3, dst_col, 4)) + elif pattern == P1111: + src_col = dst_col + mask_indices = set(range(0, dst_col)) + else: + raise ValueError(f"Unsupported pattern: {pattern}") + + src = np.random.randint(1, 100, [row, src_col]).astype(param.src_type) + dst = np.zeros([row, dst_col], dtype=param.src_type) + + for r in range(row): + sidx = 0 + for c in range(dst_col): + if c in mask_indices: + dst[r, c] = src.flat[r * src_col + sidx] + sidx += 1 + + src.tofile("./x1_gm.bin") + dst.tofile("./golden.bin") + os.chdir(original_dir) + + if __name__ == "__main__": gen_case("TSCATTERTest.case_float_16x16_16x16_16x16", 16, 16) + masked_cases = [ + # float + TScatterParamsMasked("TSCATTERTest.case_masked_float_P0101", + np.float32, FLOAT_P0101_ROW, FLOAT_P0101_COL, P0101), + TScatterParamsMasked("TSCATTERTest.case_masked_float_P1010", + np.float32, FLOAT_P1010_ROW, FLOAT_P1010_COL, P1010), + TScatterParamsMasked("TSCATTERTest.case_masked_float_P0001", + np.float32, FLOAT_P0001_ROW, FLOAT_P0001_COL, P0001), + TScatterParamsMasked("TSCATTERTest.case_masked_float_P0010", + np.float32, FLOAT_P0010_ROW, FLOAT_P0010_COL, P0010), + TScatterParamsMasked("TSCATTERTest.case_masked_float_P0100", + np.float32, FLOAT_P0100_ROW, FLOAT_P0100_COL, P0100), + TScatterParamsMasked("TSCATTERTest.case_masked_float_P1000", + np.float32, FLOAT_P1000_ROW, FLOAT_P1000_COL, P1000), + TScatterParamsMasked("TSCATTERTest.case_masked_float_P1111", + np.float32, FLOAT_P1111_ROW, FLOAT_P1111_COL, P1111), + # half + TScatterParamsMasked("TSCATTERTest.case_masked_half_P0101", + np.float16, HALF_P0101_ROW, HALF_P0101_COL, P0101), + TScatterParamsMasked("TSCATTERTest.case_masked_half_P1010", + np.float16, HALF_P1010_ROW, HALF_P1010_COL, P1010), + TScatterParamsMasked("TSCATTERTest.case_masked_half_P0001", + np.float16, HALF_P0001_ROW, HALF_P0001_COL, P0001), + TScatterParamsMasked("TSCATTERTest.case_masked_half_P0100", + np.float16, HALF_P0100_ROW, HALF_P0100_COL, P0100), + TScatterParamsMasked("TSCATTERTest.case_masked_half_P1000", + np.float16, HALF_P1000_ROW, HALF_P1000_COL, P1000), + # uint16 / int16 + TScatterParamsMasked("TSCATTERTest.case_masked_U16_P0101", + np.uint16, HALF_P0101_ROW, HALF_P0101_COL, P0101), + TScatterParamsMasked("TSCATTERTest.case_masked_U16_P1010", + np.uint16, HALF_P1010_ROW, HALF_P1010_COL, P1010), + TScatterParamsMasked("TSCATTERTest.case_masked_I16_P0001", + np.int16, HALF_P0001_ROW, HALF_P0001_COL, P0001), + TScatterParamsMasked("TSCATTERTest.case_masked_I16_P0010", + np.int16, HALF_P0010_ROW, HALF_P0010_COL, P0010), + # uint32 / int32 + TScatterParamsMasked("TSCATTERTest.case_masked_U32_P0100", + np.uint32, FLOAT_P0100_ROW, FLOAT_P0100_COL, P0100), + TScatterParamsMasked("TSCATTERTest.case_masked_I32_P1000", + np.int32, FLOAT_P1000_ROW, FLOAT_P1000_COL, P1000), + TScatterParamsMasked("TSCATTERTest.case_masked_I32_P1111", + np.int32, FLOAT_P1111_ROW, FLOAT_P1111_COL, P1111), + ] + + for case in masked_cases: + gen_masked_scatter_golden(case) diff --git a/tests/cpu/st/testcase/tscatter/main.cpp b/tests/cpu/st/testcase/tscatter/main.cpp index efb6ee282..f714fee7a 100644 --- a/tests/cpu/st/testcase/tscatter/main.cpp +++ b/tests/cpu/st/testcase/tscatter/main.cpp @@ -9,6 +9,7 @@ See LICENSE in the root of the software repository for the full text of the Lice */ #include "test_common.h" +#include "tscatter_common.h" #include #include @@ -96,3 +97,158 @@ TEST_F(TSCATTERTest, case_float_16x16_16x16_16x16) { test_tscatter<16, 16>(); } + +// --- Mask-pattern TSCATTER tests --- + +template +void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); + +template +void test_scatter_masked() +{ + constexpr uint32_t SRC_COL = DST_COL / MASK_DIVISOR; + size_t srcSize = ROW * SRC_COL * sizeof(T); + size_t dstSize = ROW * DST_COL * sizeof(T); + + aclInit(nullptr); + aclrtSetDevice(0); + aclrtStream stream; + aclrtCreateStream(&stream); + + uint8_t *dstHost, *srcHost; + uint8_t *dstDevice, *srcDevice; + + aclrtMallocHost((void **)(&dstHost), dstSize); + aclrtMallocHost((void **)(&srcHost), srcSize); + aclrtMalloc((void **)&dstDevice, dstSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&srcDevice, srcSize, ACL_MEM_MALLOC_HUGE_FIRST); + + size_t readSize = srcSize; + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/x1_gm.bin", readSize, srcHost, srcSize)); + + aclrtMemcpy(srcDevice, srcSize, srcHost, srcSize, ACL_MEMCPY_HOST_TO_DEVICE); + launchTSCATTER_masked(dstDevice, srcDevice, stream); + + aclrtSynchronizeStream(stream); + aclrtMemcpy(dstHost, dstSize, dstDevice, dstSize, ACL_MEMCPY_DEVICE_TO_HOST); + + WriteFile(GetGoldenDir() + "/output_z.bin", dstHost, dstSize); + + aclrtFree(dstDevice); + aclrtFree(srcDevice); + aclrtFreeHost(dstHost); + aclrtFreeHost(srcHost); + aclrtDestroyStream(stream); + aclrtResetDevice(0); + aclFinalize(); + + constexpr size_t numElements = ROW * DST_COL; + std::vector golden(numElements); + std::vector devFinal(numElements); + readSize = dstSize; + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/golden.bin", readSize, golden.data(), dstSize)); + readSize = dstSize; + CHECK_RESULT_GTEST(ReadFile(GetGoldenDir() + "/output_z.bin", readSize, devFinal.data(), dstSize)); + + bool ret = ResultCmp(golden, devFinal, 0.001f); + EXPECT_TRUE(ret); +} + +// float +TEST_F(TSCATTERTest, case_masked_float_P0101) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_float_P1010) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_float_P0001) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_float_P0010) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_float_P0100) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_float_P1000) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_float_P1111) +{ + test_scatter_masked(); +} + +// half +TEST_F(TSCATTERTest, case_masked_half_P0101) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_half_P1010) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_half_P0001) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_half_P0100) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_half_P1000) +{ + test_scatter_masked(); +} + +// uint16 / int16 +TEST_F(TSCATTERTest, case_masked_U16_P0101) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_U16_P1010) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_I16_P0001) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_I16_P0010) +{ + test_scatter_masked(); +} + +// uint32 / int32 +TEST_F(TSCATTERTest, case_masked_U32_P0100) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_I32_P1000) +{ + test_scatter_masked(); +} + +TEST_F(TSCATTERTest, case_masked_I32_P1111) +{ + test_scatter_masked(); +} diff --git a/tests/cpu/st/testcase/tscatter/tscatter_common.h b/tests/cpu/st/testcase/tscatter/tscatter_common.h new file mode 100644 index 000000000..9471295ea --- /dev/null +++ b/tests/cpu/st/testcase/tscatter/tscatter_common.h @@ -0,0 +1,59 @@ +/** +Copyright (c) 2025 Huawei Technologies Co., Ltd. +This program is free software, you can redistribute it and/or modify it under the terms and conditions of +CANN Open Software License Agreement Version 2.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. +*/ + +#define FP0101 1 +#define FP1010 2 +#define FP0001 3 +#define FP0010 4 +#define FP0100 5 +#define FP1000 6 +#define FP1111 7 + +#define HP0101 11 +#define HP1010 12 +#define HP0001 13 +#define HP0100 15 +#define HP1000 16 + +#define U16P0101 21 +#define U16P1010 22 +#define I16P0001 23 +#define I16P0010 24 +#define U32P0100 25 +#define I32P1000 26 +#define I32P1111 27 + +#define FLOAT_P0101_ROW 4 +#define FLOAT_P0101_COL 64 +#define FLOAT_P1010_ROW 7 +#define FLOAT_P1010_COL 1024 +#define FLOAT_P0001_ROW 3 +#define FLOAT_P0001_COL 1056 +#define FLOAT_P0010_ROW 4 +#define FLOAT_P0010_COL 128 +#define FLOAT_P0100_ROW 5 +#define FLOAT_P0100_COL 256 +#define FLOAT_P1000_ROW 6 +#define FLOAT_P1000_COL 288 +#define FLOAT_P1111_ROW 7 +#define FLOAT_P1111_COL 320 + +#define HALF_P0101_ROW 5 +#define HALF_P0101_COL 128 +#define HALF_P1010_ROW 7 +#define HALF_P1010_COL 1024 +#define HALF_P0001_ROW 3 +#define HALF_P0001_COL 1024 +#define HALF_P0010_ROW 4 +#define HALF_P0010_COL 128 +#define HALF_P0100_ROW 5 +#define HALF_P0100_COL 256 +#define HALF_P1000_ROW 6 +#define HALF_P1000_COL 256 diff --git a/tests/cpu/st/testcase/tscatter/tscatter_kernel.cpp b/tests/cpu/st/testcase/tscatter/tscatter_kernel.cpp index c0eb32e6b..f829d41b2 100644 --- a/tests/cpu/st/testcase/tscatter/tscatter_kernel.cpp +++ b/tests/cpu/st/testcase/tscatter/tscatter_kernel.cpp @@ -10,6 +10,7 @@ See LICENSE in the root of the software repository for the full text of the Lice #include #include +#include "tscatter_common.h" using namespace pto; @@ -51,3 +52,225 @@ void LaunchTScatter(float *out, float *src, uint16_t *idx, void *stream) } template void LaunchTScatter<16, 16>(float *out, float *src, uint16_t *idx, void *stream); + +// --- Mask-pattern TSCATTER --- + +template +AICORE void runTScatterMasked(__gm__ T __out__ *out, __gm__ T __in__ *src) +{ + using DynShapeSrc = Shape<1, 1, 1, kSrcRows_, kSrcCols_>; + using DynStridSrc = Stride<1, 1, 1, kSrcCols_, 1>; + using GlobalSrc = GlobalTensor; + + using DynShapeDst = Shape<1, 1, 1, kDstRows_, kDstCols_>; + using DynStridDst = Stride<1, 1, 1, kDstCols_, 1>; + using GlobalDst = GlobalTensor; + + using SrcTileData = Tile; + using DstTileData = Tile; + + SrcTileData srcTile(kSrcRows_, kSrcCols_); + DstTileData dstTile(kDstRows_, kDstCols_); + TASSIGN(srcTile, 0x0); + TASSIGN(dstTile, 0x0 + kSrcRows_ * kSrcCols_ * sizeof(T)); + + GlobalSrc srcGlobal(src); + GlobalDst dstGlobal(out); + + TLOAD(srcTile, srcGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TSCATTER(dstTile, srcTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, dstTile); + out = dstGlobal.data(); +} + +// --- float launchers --- + +extern "C" __global__ AICORE void launchTSCATTER_FP0101(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_FP1010(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_FP0001(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_FP0010(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_FP0100(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_FP1000(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_FP1111(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ float *>(out), reinterpret_cast<__gm__ float *>(src)); +} + +// --- half launchers --- + +extern "C" __global__ AICORE void launchTSCATTER_HP0101(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ half *>(out), reinterpret_cast<__gm__ half *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_HP1010(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ half *>(out), reinterpret_cast<__gm__ half *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_HP0001(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ half *>(out), reinterpret_cast<__gm__ half *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_HP0100(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ half *>(out), reinterpret_cast<__gm__ half *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_HP1000(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ half *>(out), reinterpret_cast<__gm__ half *>(src)); +} + +// --- uint16/int16/uint32/int32 launchers --- + +extern "C" __global__ AICORE void launchTSCATTER_U16P0101(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ uint16_t *>(out), reinterpret_cast<__gm__ uint16_t *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_U16P1010(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ uint16_t *>(out), reinterpret_cast<__gm__ uint16_t *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_I16P0001(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ int16_t *>(out), reinterpret_cast<__gm__ int16_t *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_I16P0010(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ int16_t *>(out), reinterpret_cast<__gm__ int16_t *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_U32P0100(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ uint32_t *>(out), + reinterpret_cast<__gm__ uint32_t *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_I32P1000(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked(reinterpret_cast<__gm__ int32_t *>(out), + reinterpret_cast<__gm__ int32_t *>(src)); +} + +extern "C" __global__ AICORE void launchTSCATTER_I32P1111(__gm__ uint8_t *out, __gm__ uint8_t *src) +{ + runTScatterMasked( + reinterpret_cast<__gm__ int32_t *>(out), reinterpret_cast<__gm__ int32_t *>(src)); +} + +// --- dispatch --- + +template +void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream) +{ + if constexpr (tilingKey == FP0101) { + launchTSCATTER_FP0101(out, src); + } else if constexpr (tilingKey == FP1010) { + launchTSCATTER_FP1010(out, src); + } else if constexpr (tilingKey == FP0001) { + launchTSCATTER_FP0001(out, src); + } else if constexpr (tilingKey == FP0010) { + launchTSCATTER_FP0010(out, src); + } else if constexpr (tilingKey == FP0100) { + launchTSCATTER_FP0100(out, src); + } else if constexpr (tilingKey == FP1000) { + launchTSCATTER_FP1000(out, src); + } else if constexpr (tilingKey == FP1111) { + launchTSCATTER_FP1111(out, src); + } else if constexpr (tilingKey == HP0101) { + launchTSCATTER_HP0101(out, src); + } else if constexpr (tilingKey == HP1010) { + launchTSCATTER_HP1010(out, src); + } else if constexpr (tilingKey == HP0001) { + launchTSCATTER_HP0001(out, src); + } else if constexpr (tilingKey == HP0100) { + launchTSCATTER_HP0100(out, src); + } else if constexpr (tilingKey == HP1000) { + launchTSCATTER_HP1000(out, src); + } else if constexpr (tilingKey == U16P0101) { + launchTSCATTER_U16P0101(out, src); + } else if constexpr (tilingKey == U16P1010) { + launchTSCATTER_U16P1010(out, src); + } else if constexpr (tilingKey == I16P0001) { + launchTSCATTER_I16P0001(out, src); + } else if constexpr (tilingKey == I16P0010) { + launchTSCATTER_I16P0010(out, src); + } else if constexpr (tilingKey == U32P0100) { + launchTSCATTER_U32P0100(out, src); + } else if constexpr (tilingKey == I32P1000) { + launchTSCATTER_I32P1000(out, src); + } else if constexpr (tilingKey == I32P1111) { + launchTSCATTER_I32P1111(out, src); + } +} + +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream); +template void launchTSCATTER_masked(uint8_t *out, uint8_t *src, void *stream);