@@ -7218,7 +7218,8 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
72187218 void Optimizer::HWWorkaround ()
72197219 {
72207220 if ((kernel.getInt32KernelAttr (Attributes::ATTR_Target) == VISA_CM) &&
7221- builder.getJitInfo ()->spillMemUsed > 0 && builder.hasFusedEUWA ())
7221+ builder.hasFusedEUWA () &&
7222+ (builder.getJitInfo ()->spillMemUsed > 0 || builder.getJitInfo ()->numFlagSpillStore > 0 ))
72227223 {
72237224 // For now, do it for CM/VC. Will turn it on for all.
72247225 doNoMaskWA_postRA ();
@@ -11917,6 +11918,10 @@ void Optimizer::doNoMaskWA()
1191711918 for (auto II = BB->begin (), IE = BB->end (); II != IE; ++II)
1191811919 {
1191911920 G4_INST* I = *II;
11921+
11922+ // Mark all instructions as created by preRA to avoid re-processing them postRA
11923+ I->setCreatedPreRA (true );
11924+
1192011925 if (!isCandidateInst (I, fg))
1192111926 {
1192211927 continue ;
@@ -12103,21 +12108,48 @@ void Optimizer::doNoMaskWA()
1210312108// // scratch space spill: SP_GRF_V77_3 from offset[4x32];
1210412109// (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
1210512110//
12111+ // For flag spill:
12112+ // Need WA as well due to the following case:
12113+ //
12114+ // After RA:
12115+ // BB_19:
12116+ // (W) mov (1|M0) r34.8<1>:uw f0.1<0;1,0>:uw
12117+ // ...
12118+ // BB_21:
12119+ // (W) mov (1|M0) f1.1<1>:uw r34.8<0;1,0>:uw
12120+ //
12121+ // If BB_19 should be skipped but runs due to this HW bug, r34.8 will be updated
12122+ // with f0.1, which holds an undefined value. Then at BB_21, reading from r34.8
12123+ // will get a garbage value!
12124+ //
1210612125// Note this works only for NoMaskWA=2
1210712126//
1210812127void Optimizer::doNoMaskWA_postRA ()
1210912128{
1211012129 std::vector<INST_LIST_ITER> NoMaskCandidates;
1211112130 G4_ExecSize simdsize = fg.getKernel ()->getSimdSize ();
12131+ const bool HasFlagSpill = (builder.getJitInfo ()->numFlagSpillStore > 0 );
12132+
12133+ auto isCandidate = [&](G4_INST* I) {
12134+ if (I->getCreatedPreRA () || !I->isWriteEnableInst ())
12135+ {
12136+ return false ;
12137+ }
1211212138
12113- auto isCandidate = [](G4_INST* I) {
12114- if (I->isSend () && I->isWriteEnableInst () &&
12115- I->getPredicate () == nullptr &&
12139+ // If it is a global flag spill or a global GRF spill, the WA is needed.
12140+ // For now, global checking is not available.
12141+
12142+ // 1. flag spill
12143+ if (HasFlagSpill &&
12144+ I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag () &&
12145+ I->getExecSize () == g4::SIMD1 && I->getPredicate () == nullptr )
12146+ {
12147+ return true ;
12148+ }
12149+ // 2. GRF spill
12150+ if (I->isSend () && I->getPredicate () == nullptr &&
1211612151 (I->getDst () == nullptr || I->getDst ()->isNullReg ()))
1211712152 {
12118- // This shall be a spill (write).
12119- // May check if the spilled var is global. We only need
12120- // to do WA for global spill!
1212112153 return true ;
1212212154 }
1212312155 return false ;
@@ -12175,6 +12207,16 @@ void Optimizer::doNoMaskWA_postRA()
1217512207 // (W & f0.0.any16h) send (16|M0) ...
1217612208 // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
1217712209 //
12210+ // For flag spill, the sequence is the same as the above except for the case in which
12211+ // the WAFlag is the same as the spilled flag. For example,
12212+ //
12213+ // (W) mov (1|M0) r34.8<1>:uw f0.0<0;1,0>:uw
12214+ //
12215+ // 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
12216+ // 2. (W) mov (1|M0) f0.0<1>:uw DW1:uw // WARestore
12217+ // (W & f0.0.any16h) mov r34.8<1>:uw DW0.0<0;1,0>:uw
12218+ // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12219+ //
1217812220 // Todo: check if save/restore is needed to avoid redundant save/restore.
1217912221 //
1218012222 G4_Declare* saveTmp = builder.getEUFusionWATmpVar (); // 2DW;
@@ -12215,10 +12257,15 @@ void Optimizer::doNoMaskWA_postRA()
1221512257 // Without optimization, always do save/restore
1221612258 bool needSave = true ;
1221712259 bool needRestore = true ;
12260+
12261+ // WA flag register to use: flag number wafregnum, sub-register wafsregnum
12262+ uint32_t wafregnum = 0 ;
12263+ uint32_t wafsregnum = 0 ;
12264+
1221812265 G4_Type Ty = (simdsize > 16 ) ? Type_UD : Type_UW;
1221912266 G4_Declare* flagDcl = builder.createTempFlag ((Ty == Type_UW ? 1 : 2 ), " waflag" );
1222012267 G4_RegVar* flagVar = flagDcl->getRegVar ();
12221- flagVar->setPhyReg (builder.phyregpool .getFlagAreg (0 ), 0 );
12268+ flagVar->setPhyReg (builder.phyregpool .getFlagAreg (wafregnum ), wafsregnum );
1222212269
1222312270 // Save flag, create WA mask, save WAflag
1222412271 createMov1 (BB, WAInsts[0 ], saveVar, saveOff, flagVar, 0 , Ty); // save
@@ -12239,6 +12286,24 @@ void Optimizer::doNoMaskWA_postRA()
1223912286 G4_INST* I = *currII;
1224012287 G4_Predicate* newPred = builder.createPredicate (
1224112288 PredState_Plus, flagVar, 0 , waPredCtrl);
12289+ if (I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag ())
12290+ {
12291+ G4_SrcRegRegion* srcReg = I->getSrc (0 )->asSrcRegRegion ();
12292+ G4_RegVar* baseVar = static_cast <G4_RegVar*>(srcReg->getBase ());
12293+ assert (baseVar->isPhyRegAssigned ());
12294+
12295+ // For flag, G4_Areg has the flag number and G4_RegVar has the subRegOff.
12296+ // (SrcRegRegion's regOff/subRegOff is always 0/0.)
12297+ G4_Areg* flagReg = baseVar->getPhyReg ()->getAreg ();
12298+ uint32_t subRegOff = baseVar->getPhyRegOff ();
12299+ if (flagReg->getFlagNum () == wafregnum &&
12300+ (Ty == Type_UD /* 32bit flag */ || subRegOff == wafsregnum /* 16bit flag */ ))
12301+ {
12302+ G4_SrcRegRegion* S = builder.createSrc (
12303+ saveVar, 0 , saveOff, builder.getRegionScalar (), Ty);
12304+ I->setSrc (S, 0 );
12305+ }
12306+ }
1224212307 I->setPredicate (newPred);
1224312308
1224412309 if (i == (sz - 1 ) || needRestore) {
0 commit comments