/*******************************************************************************
* INTEL CONFIDENTIAL
* Copyright 2025 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
*******************************************************************************/


#include "generator.hpp"
#include "hw_utils.hpp"
#include "layout_utils.hpp"
#include "state_utils.hpp"
#include "ngen_object_helpers.hpp"

#include "internal/namespace_start.hxx"

using namespace ngen;
using namespace ngen::utils;
using std::vector;

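// TLB warmup for GEMM: touch every surface the kernel will read (A, B, and any
//  2D quantization data) ahead of the main loop, so that page-table walks are
//  absorbed before they can stall loads on the critical path. Each surface is
//  assigned to a different work-item in the workgroup (the `whose` index).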
template <HW hw>
void BLASKernelGenerator<hw>::gemmTLBWarmup(const GEMMProblem &problem, const GEMMStrategy &strategy, GEMMState &state)
{
    auto lid = state.ra.allocSub<uint32_t>();
    int whose = 0;

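    // Flatten the (m, n[, k]) local IDs into a single linear work-item index.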
    emad(1, lid, state.inputs.localIDM, state.inputs.localIDN, strategy.wg[LoopM], strategy, state);
    if (strategy.kParallelLocal)
        emad(1, lid, lid, state.inputs.localIDK, strategy.wg[LoopM] * strategy.wg[LoopN], strategy, state);

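    // If A is 2D-quantized, also warm up its scale/offset arrays. Their extents
    //  are the group-wise-reduced dimensions (m / aqGroupM) x (k / aqGroupK).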
    if (problem.quantized2DA()) {
        auto mq = state.ra.allocSub<uint32_t>();
        auto kq = state.ra.allocSub<uint32_t>();
        divDown(mq, state.inputs.m, problem.aqGroupM, strategy, state);
        divDown(kq, state.inputs.k, problem.aqGroupK, strategy, state);
        if (problem.aScale2D) {
            tlbWarmup(problem.Ta_scale, problem.A_scale, strategy.A_scale, state.inputs.aScalePtr,
                      mq, kq, state.inputs.ldaq, lid, whose++, problem, strategy, state);
        }
        if (problem.aoPtrDims == 2) {
            tlbWarmup(problem.Tao, problem.AO, strategy.AO, state.inputs.aoPtr,
                      mq, kq, state.inputs.ldaq, lid, whose++, problem, strategy, state);
        }
        state.ra.safeRelease(mq);
        state.ra.safeRelease(kq);
    }

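    // Likewise for B's 2D quantization arrays, with extents
    //  (k / bqGroupK) x (n / bqGroupN).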
    if (problem.quantized2DB()) {
        auto kq = state.ra.allocSub<uint32_t>();
        auto nq = state.ra.allocSub<uint32_t>();
        divDown(kq, state.inputs.k, problem.bqGroupK, strategy, state);
        divDown(nq, state.inputs.n, problem.bqGroupN, strategy, state);
        if (problem.bScale2D) {
            tlbWarmup(problem.Tb_scale, problem.B_scale, strategy.B_scale, state.inputs.bScalePtr,
                      kq, nq, state.inputs.ldbq, lid, whose++, problem, strategy, state);
        }
        if (problem.boPtrDims == 2) {
            tlbWarmup(problem.Tbo, problem.BO, strategy.BO, state.inputs.boPtr,
                      kq, nq, state.inputs.ldbq, lid, whose++, problem, strategy, state);
        }
        state.ra.safeRelease(kq);
        state.ra.safeRelease(nq);
    }

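    // Finally, warm up the A and B matrices themselves.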
    tlbWarmup(problem.Ta_ext, problem.A, strategy.A, state.effA,
              state.inputs.m, state.inputs.k, state.inputs.lda, lid, whose++,
              problem, strategy, state);
    tlbWarmup(problem.Tb_ext, problem.B, strategy.B, state.effB,
              state.inputs.k, state.inputs.n, state.inputs.ldb, lid, whose++,
              problem, strategy, state);

    state.ra.safeRelease(lid);
}

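// Warm up the TLB for a single matrix. The matrix's footprint in bytes is
//  estimated as (leading dimension) x (outer dimension) x (element size),
//  clamped to a fixed limit, then handed to the address-based variant below.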
template <HW hw>
void BLASKernelGenerator<hw>::tlbWarmup(Type T, const MatrixAddressing &atype, const MatrixAddressingStrategy &astrategy,
                                        const Subregister &ptr, const Subregister &r, const Subregister &c,
                                        const Subregister &ld, const Subregister &lid, int whose,
                                        const CommonProblem &problem, const CommonStrategy &strategy, CommonState &state)
{
    auto flag = state.raVFlag.alloc();
    const uint32_t byteLimit = 256 * 1024 * 1024;

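    // bytes = ld * (outer dimension), scaled by the element size, saturating
    //  at byteLimit if the product overflows 32 bits or exceeds the limit.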
    auto bytes = state.ra.allocSub<uint64_t>();
    emul(1, bytes, ld, isColMajor(atype.layout) ? c : r, strategy, state);
    cmp(1 | nz | flag, bytes.ud(1), 0);
    cmp(1 | ~flag | gt | flag, bytes.ud(), byteLimit / T);
    emulConstant(1, bytes.ud(), bytes.ud(), T, strategy, state);
    mov(1 | flag, bytes.ud(), byteLimit);

    state.raVFlag.safeRelease(flag);

    tlbWarmup(astrategy.base, ptr, bytes.ud(), lid, whose, problem, strategy, state);

    state.ra.safeRelease(bytes);
}

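// Address-based TLB warmup: the work-item whose linear ID matches `whose`
//  strides through the buffer at 64KB intervals, issuing one masked byte load
//  per SIMD lane per iteration. The low address bits are XORed with a small
//  per-lane "twiddle" (stepped in 64-byte units, wrapped within a 4KB page) so
//  that the probes don't all land on the same cache sets.
//
// Roughly, each lane's probe sequence is equivalent to the host-side sketch
//  below (illustrative only; `nProbes` and `touch` are stand-ins for the
//  per-lane count computation and the masked load, respectively):
//
//      // nProbes ~ ceil(bytes / 64KB), distributed across lanes
//      for (uint64_t i = lane; i < nProbes; i += simd) {
//          uint64_t off     = i << 16;             // 64KB stride
//          uint32_t twiddle = (i << 6) & 0xFFF;    // wraps within one 4KB page
//          touch(ptr + (off ^ twiddle));           // one byte per 64KB block
//      }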
template <HW hw>
void BLASKernelGenerator<hw>::tlbWarmup(AddressBase base, const Subregister &ptr, const Subregister &bytes,
                                        const Subregister &lid, int whose,
                                        const CommonProblem &problem, const CommonStrategy &strategy, CommonState &state)
{
    bool a64 = base.isA64();
    auto Taddr = a64 ? DataType::uq : DataType::ud;
    const int simd = elementsPerGRF<uint32_t>(hw);
    const int log2Stride = 16;              // 64KB stride.
    const int log2TwiddleStride = 6;

    int udStride = a64 ? 2 : 1;
    auto addr = state.ra.allocRange(udStride);
    auto addr0 = addr[0].retype(Taddr);
    auto addrLo = addr0.ud(0)(udStride);
    auto off = state.ra.allocRange(udStride);
    auto off0 = off[0].ud(0)(udStride);
    auto twiddle = state.ra.alloc().ud();
    auto data = state.ra.alloc().ud();
    auto count = state.ra.alloc().d();
    auto flag = state.raVFlag.alloc();

    extendIndexVec(simd, state);

    auto iv = accessIndexVec(0, state)(1);

    cmp(1 | nz | flag, lid, whose);         /* Check whether this is the responsible thread */

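    // Initial per-lane addresses: lane i starts at ptr + i*64KB, with its low
    //  bits perturbed by i*64 bytes.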
    shl(simd, off0, iv, log2Stride);
    shl(simd, twiddle, iv, log2TwiddleStride);
    eadd(simd, addr0, ptr, off0, strategy, state);
    xor_(simd, addrLo, addrLo, twiddle);    /* Perturb low bits to avoid cache hotspotting */

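    // Per-lane probe counts: count[i] = ceil(bytes / 64KB) + simd - i, so the
    //  loop below runs until every lane has covered its share of the buffer.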
    add(1, count, bytes, ((simd + 1) << log2Stride) - 1);
    shr(1, count, count, log2Stride);
    add(simd, count, count[0], -iv);

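    // Only the responsible work-item runs the probe loop; all others skip it.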
    Label lTop, lSkip;
    jmpi(1 | flag, lSkip);

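    // Probe loop: decrement per-lane counts, load one byte per active lane
    //  (load to null on XeHPC+, which supports prefetch-only loads), then
    //  advance each address by simd*64KB and re-apply a fresh twiddle.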
    mark(lTop);
    add(simd | gt | flag, count, count, -simd);
    if (hw >= HW::XeHPC)
        load(simd | flag, null, D8U32 | L1C_L3C, base, addr);
    else if (hw >= HW::XeHPG)
        load(simd | flag, data, D8U32 | L1C_L3C, base, addr);
    else
        load(simd | flag, data, scattered_byte(), base, addr);
    xor_(simd, addrLo, addrLo, twiddle);
    add(simd, twiddle, twiddle, simd << log2TwiddleStride);
    and_(simd, twiddle, twiddle, 0xFFF);    /* Don't cross 4K page boundaries */
    eadd(simd, addr0, addr0, simd << log2Stride, strategy, state);
    xor_(simd, addrLo, addrLo, twiddle);
    jmpi(1 | flag, lTop);
    mark(lSkip);

    releaseIndexVec(state);
    state.raVFlag.safeRelease(flag);
    state.ra.safeRelease(off);
    state.ra.safeRelease(twiddle);
    state.ra.safeRelease(addr);
    state.ra.safeRelease(data);
    state.ra.safeRelease(count);
}

#include "internal/namespace_end.hxx"