Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x87 fst/fld optimization for different addrmodes #4266

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ desc: Handles x86/64 x87 to IR
#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/Utils/FPState.h>

#include <cmath>
#include <stddef.h>
#include <stdint.h>

Expand Down Expand Up @@ -129,8 +130,23 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {
}

void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
// Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
// FIXME: Is TSO relevant for x87?
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

// Index scale is a power of 2?
LOGMAN_THROW_A_FMT(A.IndexScale > 0 && (A.IndexScale & (A.IndexScale - 1)) == 0, "Invalid index scale");

Ref Addr = A.Base ? A.Base : _Constant(0);
if (A.Index) {
Ref ScaledIndex = A.Index;
if (A.IndexScale > 1) {
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(std::log2(A.IndexScale)));
}
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
}

_StoreStackMem(OpSize::i128Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
16 changes: 14 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,21 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
}

void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

// Index scale is a power of 2?
LOGMAN_THROW_A_FMT(A.IndexScale > 0 && (A.IndexScale & (A.IndexScale - 1)) == 0, "Invalid index scale");

Ref Addr = A.Base ? A.Base : _Constant(0);
if (A.Index) {
Ref ScaledIndex = A.Index;
if (A.IndexScale > 1) {
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(std::log2(A.IndexScale)));
}
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
}

_StoreStackMem(OpSize::i64Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2788,7 +2788,7 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, i1:$Float": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"SourceSize is 128-bit for F80 values, 64-bit for low precision.",
Expand Down
1 change: 0 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include <FEXCore/fextl/vector.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down
51 changes: 35 additions & 16 deletions FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,18 @@ class X87StackOptimization final : public Pass {
// Helpers
Ref RotateRight8(uint32_t V, Ref Amount);

// Returns true only when Node is an OP_CONSTANT op whose constant value is 0.
// Any other op kind yields false without inspecting its payload.
bool IsZero(Ref Node) {
  auto NodeHeader = IR->GetOp<IR::IROp_Header>(Node);
  // Short-circuit keeps the IROp_Constant view from being taken on non-constant ops.
  return NodeHeader->Op == OP_CONSTANT && NodeHeader->C<IROp_Constant>()->Constant == 0;
}


// Handles a Unary operation.
// Takes the op we are handling, the Node for the reduced precision case and the node for the normal case.
// Depending on the type of Op64, we might need to pass a couple of extra constant arguments, this happens
Expand Down Expand Up @@ -245,6 +257,7 @@ class X87StackOptimization final : public Pass {
bool SlowPath = false;
// Keeping IREmitter not to pass arguments around
IREmitter* IREmit = nullptr;
IRListView* IR;
};

inline void X87StackOptimization::InvalidateCaches() {
Expand Down Expand Up @@ -537,6 +550,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {

// Initialize IREmit member
IREmit = Emit;
IR = &CurrentIR;

// Run optimization proper
for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) {
Expand Down Expand Up @@ -780,11 +794,12 @@ void X87StackOptimization::Run(IREmitter* Emit) {
break;
}

case OP_STORESTACKMEMORY: {
const auto* Op = IROp->C<IROp_StoreStackMemory>();
case OP_STORESTACKMEM: {
const auto* Op = IROp->C<IROp_StoreStackMem>();
const auto& Value = MigrateToSlowPath_IfInvalid();
Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
Ref AddrNode = CurrentIR.GetNode(Op->Addr);
Ref Offset = CurrentIR.GetNode(Op->Offset);

// On the fast path we can optimize memory copies.
// If we are doing:
Expand All @@ -796,45 +811,49 @@ void X87StackOptimization::Run(IREmitter* Emit) {
// or similar. As long as the source size and dest size are one and the same.
// This will avoid any conversions between source and stack element size and conversion back.
if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) {
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second);
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->second, AddrNode, Offset,
OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
} else {
if (ReducedPrecisionMode) {
switch (Op->StoreSize) {
case OpSize::i32Bit: {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i32Bit, AddrNode, StackNode);
break;
}
case OpSize::i32Bit:
case OpSize::i64Bit: {
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
if (Op->StoreSize == OpSize::i32Bit) {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
}
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
break;
}
case OpSize::f80Bit: {
StackNode = IREmit->_F80CVTTo(StackNode, OpSize::i64Bit);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, GetConstant(8), OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
break;
}
default: ERROR_AND_DIE_FMT("Unsupported x87 size");
}
} else {
} else { // !ReducedPrecisionMode
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
if (!IsZero(Offset)) {
AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset);
}
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit);
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
}
} else {
IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
}
}
}
Expand Down
44 changes: 44 additions & 0 deletions unittests/32Bit_ASM/X87/FST_AddrModes.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x3f800000",
"RBX": "0x3f800000",
"RCX": "0x3f800000",
"RBP": "0x3f800000",
"RDI": "0x3f800000",
"RSP": "0x3f800000"
},
"MemoryRegions": {
"0xf0000000": "4096"
},
"Mode": "32BIT"
}
%endif

; 32-bit test: stores ST0 with `fst dword` through six addressing forms
; (base, base+disp, base+index, base+index*scale, base+index+disp,
; base+index*scale+disp) and reads each location back into a register.
; ST0 holds 1.0, whose single-precision encoding is 0x3f800000 - the value
; the config above expects in every checked register.
; NOTE(review): the MemoryRegions entry in the config is not referenced by the
; code below; all accesses go through `base` - confirm it is still needed.

section .bss
base resb 4096

section .text

; Setup: ST0 = 1.0, edx = buffer base, esi = index (0x64)
; NOTE(review): RIP-relative addressing only exists in 64-bit mode; confirm how
; the assembler treats `rel` in this 32-bit test.
fld1
lea edx, [rel base]
mov esi, 0x64

; Test fst: one non-popping store per addressing form.
; Resulting byte offsets from base: 0x0, 0xa, 0x64, 0x190, 0x6e, 0x19a.
; Each store is 4 bytes wide, so none of them overlap.
fst dword [edx]
fst dword [edx + 0xa]
fst dword [edx + esi]
fst dword [edx + esi * 4]
fst dword [edx + esi + 0xa]
fst dword [edx + esi * 4 + 0xa]

; Result check: reload every location with the same addressing form used for
; the store. esp is overwritten last, immediately before hlt.
mov eax, dword [edx]
mov ebx, dword [edx + 0xa]
mov ecx, dword [edx + esi]
mov ebp, dword [edx + esi * 4]
mov edi, dword [edx + esi + 0xa]
mov esp, dword [edx + esi * 4 + 0xa]

hlt
25 changes: 25 additions & 0 deletions unittests/ASM/X87/DB_07_2.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
%ifdef CONFIG
{
"RegData": {
"MM7": ["0x8000000000000000", "0x4000"]
}
}
%endif

; Round-trips an 80-bit value through memory: load 2.0 from `data`, store it
; to `data2` with `fstp tword`, then load it back from `data2`.
; The config above expects MM7 to contain 2.0 in x87 extended format
; (significand 0x8000000000000000, sign/exponent word 0x4000).

; Push 2.0 from `data`; the `+ 8 * 0` term adds nothing to the address.
lea rdx, [rel data]
fld tword [rdx + 8 * 0]

; Materialize the destination address in rax first so the fstp operand is a
; bare base register.
lea rdx, [rel data2]
lea rax, [rdx + 8 * 0]
fstp tword [rax]
; Reload from data2, which is initialized to 0.0 below; reading back 2.0 shows
; the store reached the intended address.
fld tword [rdx + 8 * 0]

hlt

; Data: each 10-byte `dt` value is followed by an 8-byte zero quadword.
align 8
data:
dt 2.0
dq 0
data2:
dt 0.0
dq 0
43 changes: 43 additions & 0 deletions unittests/ASM/X87/FST_AddrModes.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x3f800000",
"RBX": "0x3f800000",
"RCX": "0x3f800000",
"R8": "0x3f800000",
"R9": "0x3f800000",
"R10": "0x3f800000"
},
"MemoryRegions": {
"0x100000000": "4096"
}
}
%endif

; 64-bit test: stores ST0 with `fst dword` through six addressing forms
; (base, base+disp, base+index, base+index*scale, base+index+disp,
; base+index*scale+disp) and reads each location back into a register.
; ST0 holds 1.0, whose single-precision encoding is 0x3f800000 - the value
; the config above expects in every checked register.
; NOTE(review): the MemoryRegions entry in the config is not referenced by the
; code below; all accesses go through `base` - confirm it is still needed.

section .bss
base resb 4096

section .text

; Setup: ST0 = 1.0, rdx = buffer base, rsi = index (0x64)
fld1
lea rdx, [rel base]
mov rsi, 0x64

; Test fst: one non-popping store per addressing form.
; Resulting byte offsets from base: 0x0, 0xa, 0x64, 0x190, 0x6e, 0x19a.
; Each store is 4 bytes wide, so none of them overlap.
fst dword [rdx]
fst dword [rdx + 0xa]
fst dword [rdx + rsi]
fst dword [rdx + rsi * 4]
fst dword [rdx + rsi + 0xa]
fst dword [rdx + rsi * 4 + 0xa]

; Result check: reload every location with the same addressing form used for
; the store. Writing a 32-bit register zero-extends into the full 64-bit
; register, matching the 0x3f800000 values in the config.
mov eax, dword [rdx]
mov ebx, dword [rdx + 0xa]
mov ecx, dword [rdx + rsi]
mov r8d, dword [rdx + rsi * 4]
mov r9d, dword [rdx + rsi + 0xa]
mov r10d, dword [rdx + rsi * 4 + 0xa]

hlt
Loading
Loading