Skip to content

Commit

Permalink
Ensure predicate cache is reset when control flow leaves block
Browse files Browse the repository at this point in the history
Whenever the control float leaves the block, it might clobber the
predicate register so we reset the cache whenever that happens.

The difficulty here is that the cache is valid only during IR generation
so we need to make sure we catch all the cases during this pass where
the execution might leave the block.

Fixes FEX-Emu#4264
  • Loading branch information
pmatos committed Jan 16, 2025
1 parent 48c03d7 commit 3e4ba0d
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 19 deletions.
2 changes: 1 addition & 1 deletion FEXCore/Scripts/json_ir_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def parse_ops(ops):
(OpArg.Type == "GPR" or
OpArg.Type == "GPRPair" or
OpArg.Type == "FPR" or
OpArg.Type == "PR")):
OpArg.Type == "PRED")):
OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class")

OpArg.Name = ArgName
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
} else {
// For X87 extended doubles, Split the load.
Expand Down Expand Up @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
Expand Down
8 changes: 3 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,6 @@ class OpDispatchBuilder final : public IREmitter {
void FNINIT(OpcodeArgs);

void X87ModifySTP(OpcodeArgs, bool Inc);
void X87SinCos(OpcodeArgs);
void X87FYL2X(OpcodeArgs, bool IsFYL2XP1);
void X87LDENV(OpcodeArgs);
void X87FLDCW(OpcodeArgs);
Expand Down Expand Up @@ -764,9 +763,6 @@ class OpDispatchBuilder final : public IREmitter {
void FTSTF64(OpcodeArgs);
void FRNDINTF64(OpcodeArgs);
void FSQRTF64(OpcodeArgs);
void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp);
void X87SinCosF64(OpcodeArgs);
void X87FLDCWF64(OpcodeArgs);
void X87TANF64(OpcodeArgs);
void X87ATANF64(OpcodeArgs);
Expand Down Expand Up @@ -1175,9 +1171,11 @@ class OpDispatchBuilder final : public IREmitter {
}

void FlushRegisterCache(bool SRAOnly = false) {
// At block boundaries, fix up the carry flag.

// At block boundaries, fix up the carry flag, and reset the predicate cache.
if (!SRAOnly) {
RectifyCarryInvert(CFInvertedABI);
ResetInitPredicateCache();
}

CalculateDeferredFlags();
Expand Down
34 changes: 31 additions & 3 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {

void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
Ref PredReg = Invalid();
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
}
_StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down Expand Up @@ -161,6 +165,8 @@ void OpDispatchBuilder::FIST(OpcodeArgs, bool Truncate) {
}

void OpDispatchBuilder::FADD(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
ResetInitPredicateCache();

if (Op->Src[0].IsNone()) { // Implicit argument case
auto Offset = Op->OP & 7;
auto St0 = 0;
Expand Down Expand Up @@ -191,6 +197,8 @@ void OpDispatchBuilder::FADD(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispa
}

void OpDispatchBuilder::FMUL(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
ResetInitPredicateCache();

if (Op->Src[0].IsNone()) { // Implicit argument case
auto offset = Op->OP & 7;
auto st0 = 0;
Expand Down Expand Up @@ -225,6 +233,8 @@ void OpDispatchBuilder::FMUL(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispa
}

void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) {
ResetInitPredicateCache();

if (Op->Src[0].IsNone()) {
const auto Offset = Op->OP & 7;
const auto St0 = 0;
Expand Down Expand Up @@ -266,6 +276,8 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re
}

void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) {
ResetInitPredicateCache();

if (Op->Src[0].IsNone()) {
const auto Offset = Op->OP & 7;
const auto St0 = 0;
Expand Down Expand Up @@ -589,6 +601,10 @@ void OpDispatchBuilder::FXCH(OpcodeArgs) {
}

void OpDispatchBuilder::X87FYL2X(OpcodeArgs, bool IsFYL2XP1) {
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}

if (IsFYL2XP1) {
// create an add between top of stack and 1.
Ref One = ReducedPrecisionMode ? _VCastFromGPR(OpSize::i64Bit, OpSize::i64Bit, _Constant(0x3FF0000000000000)) :
Expand All @@ -600,6 +616,10 @@ void OpDispatchBuilder::X87FYL2X(OpcodeArgs, bool IsFYL2XP1) {
}

void OpDispatchBuilder::FCOMI(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::FCOMIFlags WhichFlags, bool PopTwice) {
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}

Ref arg {};
Ref b {};

Expand Down Expand Up @@ -655,6 +675,10 @@ void OpDispatchBuilder::FCOMI(OpcodeArgs, IR::OpSize Width, bool Integer, OpDisp
}

void OpDispatchBuilder::FTST(OpcodeArgs) {
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}

Ref Res = _F80StackTest(0);

Ref HostFlag_CF = _Bfe(OpSize::i64Bit, 1, FCMP_FLAG_LT, Res);
Expand All @@ -670,6 +694,10 @@ void OpDispatchBuilder::FTST(OpcodeArgs) {
}

void OpDispatchBuilder::X87OpHelper(OpcodeArgs, FEXCore::IR::IROps IROp, bool ZeroC2) {
if (!ReducedPrecisionMode) {
ResetInitPredicateCache();
}

DeriveOp(Result, IROp, _F80SCALEStack());
if (ZeroC2) {
SetRFLAG<FEXCore::X86State::X87FLAG_C2_LOC>(_Constant(0));
Expand Down Expand Up @@ -751,13 +779,11 @@ void OpDispatchBuilder::FNINIT(OpcodeArgs) {
}

void OpDispatchBuilder::X87FFREE(OpcodeArgs) {

_InvalidateStack(Op->OP & 7);
}

void OpDispatchBuilder::X87EMMS(OpcodeArgs) {
// Tags all get set to 0b11

_InvalidateStack(0xff);
}

Expand Down Expand Up @@ -829,6 +855,8 @@ void OpDispatchBuilder::X87FXAM(OpcodeArgs) {
}

void OpDispatchBuilder::X87FXTRACT(OpcodeArgs) {
ResetInitPredicateCache();

auto Top = _ReadStackValue(0);

_PopStackDestroy();
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {

void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
_StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width);

if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
Expand Down
5 changes: 3 additions & 2 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2788,13 +2788,14 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
"StoreSize is the store size for conversion:",
"Float: 80-bit, 64-bit, or 32-bit",
"Int: 64-bit, 32-bit, 16-bit"
"Int: 64-bit, 32-bit, 16-bit",
"If possible, it will use the PredReg for an SVE store."
],
"HasSideEffects": true,
"X87": true
Expand Down
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
34 changes: 33 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: MIT
#pragma once

#include "CodeEmitter/Emitter.h"
#include "Interface/IR/IR.h"
#include "Interface/IR/IntrusiveIRList.h"

Expand All @@ -9,9 +10,9 @@

#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/fextl/unordered_map.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down Expand Up @@ -45,6 +46,37 @@ class IREmitter {
}
void ResetWorkingList();

// Predicate Cache Implementation
// This lives here rather than OpcodeDispatcher because x87StackOptimization Pass
// also needs it.
struct PredicateKey {
ARMEmitter::PredicatePattern Pattern;
OpSize Size;
bool operator==(const PredicateKey& rhs) const = default;
};

struct PredicateKeyHash {
size_t operator()(const PredicateKey& key) const {
return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid));
}
};
fextl::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache;

Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) {
PredicateKey Key {Pattern, Size};
auto ValIt = InitPredicateCache.find(Key);
if (ValIt == InitPredicateCache.end()) {
auto Predicate = _InitPredicate(Size, static_cast<uint8_t>(FEXCore::ToUnderlying(Pattern)));
InitPredicateCache[Key] = Predicate;
return Predicate;
}
return ValIt->second;
}

void ResetInitPredicateCache() {
InitPredicateCache.clear();
}

/**
* @name IR allocation routines
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) {
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
if (Op->StoreSize == OpSize::f80Bit) {
Ref PredReg = CurrentIR.GetNode(Op->PredReg);
bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg;
if (CanUsePredicateStore) {
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
Expand Down

0 comments on commit 3e4ba0d

Please sign in to comment.