Skip to content

Commit

Permalink
Ensure predicate cache is reset when control flow leaves block
Browse files Browse the repository at this point in the history
Whenever the control float leaves the block, it might clobber the
predicate register so we reset the cache whenever that happens.

The difficulty here is that the cache is valid only during IR generation
so we need to make sure we catch all the cases during this pass where
the execution might leave the block.

Fixes FEX-Emu#4264
  • Loading branch information
pmatos committed Jan 13, 2025
1 parent 8cfc016 commit c1313a7
Show file tree
Hide file tree
Showing 9 changed files with 82 additions and 16 deletions.
2 changes: 1 addition & 1 deletion FEXCore/Scripts/json_ir_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def parse_ops(ops):
(OpArg.Type == "GPR" or
OpArg.Type == "GPRPair" or
OpArg.Type == "FPR" or
OpArg.Type == "PR")):
OpArg.Type == "PRED")):
OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class")

OpArg.Name = ArgName
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, Split the load.
Expand Down Expand Up @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
Expand Down
16 changes: 12 additions & 4 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,40 +125,51 @@ class OpDispatchBuilder final : public IREmitter {

// Need to clear any named constants that were cached.
ClearCachedNamedConstants();

// Clear predicate cache for x87 ldst
ResetInitPredicateCache();
}

IRPair<IROp_Jump> Jump() {
FlushRegisterCache();
ResetInitPredicateCache();
return _Jump();
}
IRPair<IROp_Jump> Jump(Ref _TargetBlock) {
FlushRegisterCache();
ResetInitPredicateCache();
return _Jump(_TargetBlock);
}
IRPair<IROp_CondJump> CondJump(Ref _Cmp1, Ref _Cmp2, Ref _TrueBlock, Ref _FalseBlock, CondClassType _Cond = {COND_NEQ},
IR::OpSize _CompareSize = OpSize::iInvalid) {
FlushRegisterCache();
ResetInitPredicateCache();
return _CondJump(_Cmp1, _Cmp2, _TrueBlock, _FalseBlock, _Cond, _CompareSize);
}
IRPair<IROp_CondJump> CondJump(Ref ssa0, CondClassType cond = {COND_NEQ}) {
FlushRegisterCache();
ResetInitPredicateCache();
return _CondJump(ssa0, cond);
}
IRPair<IROp_CondJump> CondJump(Ref ssa0, Ref ssa1, Ref ssa2, CondClassType cond = {COND_NEQ}) {
FlushRegisterCache();
ResetInitPredicateCache();
return _CondJump(ssa0, ssa1, ssa2, cond);
}
IRPair<IROp_CondJump> CondJumpNZCV(CondClassType Cond) {
FlushRegisterCache();
ResetInitPredicateCache();
return _CondJump(InvalidNode, InvalidNode, InvalidNode, InvalidNode, Cond, OpSize::iInvalid, true);
}
IRPair<IROp_CondJump> CondJumpBit(Ref Src, unsigned Bit, bool Set) {
FlushRegisterCache();
ResetInitPredicateCache();
auto InlineConst = _InlineConstant(Bit);
return _CondJump(Src, InlineConst, InvalidNode, InvalidNode, {Set ? COND_TSTNZ : COND_TSTZ}, OpSize::iInvalid, false);
}
IRPair<IROp_ExitFunction> ExitFunction(Ref NewRIP) {
FlushRegisterCache();
ResetInitPredicateCache();
return _ExitFunction(NewRIP);
}
IRPair<IROp_Break> Break(BreakDefinition Reason) {
Expand All @@ -167,6 +178,7 @@ class OpDispatchBuilder final : public IREmitter {
}
IRPair<IROp_Thunk> Thunk(Ref ArgPtr, SHA256Sum ThunkNameHash) {
FlushRegisterCache();
ResetInitPredicateCache();
return _Thunk(ArgPtr, ThunkNameHash);
}

Expand Down Expand Up @@ -718,7 +730,6 @@ class OpDispatchBuilder final : public IREmitter {
void FNINIT(OpcodeArgs);

void X87ModifySTP(OpcodeArgs, bool Inc);
void X87SinCos(OpcodeArgs);
void X87FYL2X(OpcodeArgs, bool IsFYL2XP1);
void X87LDENV(OpcodeArgs);
void X87FLDCW(OpcodeArgs);
Expand Down Expand Up @@ -764,9 +775,6 @@ class OpDispatchBuilder final : public IREmitter {
void FTSTF64(OpcodeArgs);
void FRNDINTF64(OpcodeArgs);
void FSQRTF64(OpcodeArgs);
void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87SinCosF64(OpcodeArgs);
void X87FLDCWF64(OpcodeArgs);
void X87TANF64(OpcodeArgs);
void X87ATANF64(OpcodeArgs);
Expand Down
25 changes: 24 additions & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {

void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
Ref PredReg = Invalid();
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
}
_StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down Expand Up @@ -164,6 +168,9 @@ void OpDispatchBuilder::FADD(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispa
if (Op->Src[0].IsNone()) { // Implicit argument case
auto Offset = Op->OP & 7;
auto St0 = 0;
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}
if (ResInST0 == OpResult::RES_STI) {
_F80AddStack(Offset, St0);
} else {
Expand Down Expand Up @@ -194,6 +201,9 @@ void OpDispatchBuilder::FMUL(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispa
if (Op->Src[0].IsNone()) { // Implicit argument case
auto offset = Op->OP & 7;
auto st0 = 0;
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}
if (ResInST0 == OpResult::RES_STI) {
_F80MulStack(offset, st0);
} else {
Expand Down Expand Up @@ -230,6 +240,9 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re
const auto St0 = 0;
const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0;

if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}
if (Reverse ^ (ResInST0 == OpResult::RES_STI)) {
_F80DivStack(Result, Offset, St0);
} else {
Expand Down Expand Up @@ -271,6 +284,9 @@ void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re
const auto St0 = 0;
const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0;

if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}
if (Reverse ^ (ResInST0 == OpResult::RES_STI)) {
_F80SubStack(Result, Offset, St0);
} else {
Expand Down Expand Up @@ -589,6 +605,10 @@ void OpDispatchBuilder::FXCH(OpcodeArgs) {
}

void OpDispatchBuilder::X87FYL2X(OpcodeArgs, bool IsFYL2XP1) {
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}

if (IsFYL2XP1) {
// create an add between top of stack and 1.
Ref One = ReducedPrecisionMode ? _VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, _Constant(0x3FF0000000000000)) :
Expand Down Expand Up @@ -671,6 +691,9 @@ void OpDispatchBuilder::FTST(OpcodeArgs) {

void OpDispatchBuilder::X87OpHelper(OpcodeArgs, FEXCore::IR::IROps IROp, bool ZeroC2) {
DeriveOp(Result, IROp, _F80SCALEStack());
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}
if (ZeroC2) {
SetRFLAG<FEXCore::X86State::X87FLAG_C2_LOC>(_Constant(0));
}
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {

void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
_StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width);

if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
Expand Down
5 changes: 3 additions & 2 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2788,13 +2788,14 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
"StoreSize is the store size for conversion:",
"Float: 80-bit, 64-bit, or 32-bit",
"Int: 64-bit, 32-bit, 16-bit"
"Int: 64-bit, 32-bit, 16-bit",
"If possible, it will use the PredReg for an SVE store."
],
"HasSideEffects": true,
"X87": true
Expand Down
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
34 changes: 33 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: MIT
#pragma once

#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"
#include "Interface/IR/IntrusiveIRList.h"

Expand All @@ -9,9 +10,9 @@

#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/fextl/unordered_map.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down Expand Up @@ -45,6 +46,37 @@ class IREmitter {
}
void ResetWorkingList();

// Predicate Cache Implementation
// This lives here rather than OpcodeDispatcher because x87StackOptimization Pass
// also needs it.
struct PredicateKey {
ARMEmitter::PredicatePattern Pattern;
OpSize Size;
bool operator==(const PredicateKey& rhs) const = default;
};

struct PredicateKeyHash {
size_t operator()(const PredicateKey& key) const {
return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid));
}
};
fextl::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache;

Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) {
PredicateKey Key {Pattern, Size};
auto ValIt = InitPredicateCache.find(Key);
if (ValIt == InitPredicateCache.end()) {
auto Predicate = _InitPredicate(Size, static_cast<uint8_t>(FEXCore::ToUnderlying(Pattern)));
InitPredicateCache[Key] = Predicate;
return Predicate;
}
return ValIt->second;
}

void ResetInitPredicateCache() {
InitPredicateCache.clear();
}

/**
* @name IR allocation routines
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) {
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
if (Op->StoreSize == OpSize::f80Bit) {
Ref PredReg = CurrentIR.GetNode(Op->PredReg);
bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg;
if (CanUsePredicateStore) {
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
Expand Down

0 comments on commit c1313a7

Please sign in to comment.