Skip to content

Commit

Permalink
x87 fst/fld optimization for different addrmodes
Browse files Browse the repository at this point in the history
Includes tests and instcountci files.
When the x87 optimizations were implemented, we missed
optimizing the different addressing modes. This commit addresses that gap.

Discussed in #4252.
  • Loading branch information
pmatos committed Jan 10, 2025
1 parent a18b2d0 commit 20d6c5e
Show file tree
Hide file tree
Showing 10 changed files with 28,666 additions and 21 deletions.
16 changes: 14 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,20 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {
}

void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
// Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
// FIXME: Is TSO relevant for x87?
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

Ref Addr = A.Base ? A.Base : _Constant(0);
if (A.Index) {
Ref ScaledIndex = A.Index;
if (A.IndexScale > 1) {
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(A.IndexScale >> 1));
}
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
}

_StoreStackMem(OpSize::i128Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
13 changes: 11 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,18 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
}

void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

Ref Addr = A.Base ? A.Base : _Constant(0);
if (A.Index) {
Ref ScaledIndex = A.Index;
if (A.IndexScale > 1) {
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(A.IndexScale));
}
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
}

_StoreStackMem(OpSize::i64Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2788,7 +2788,7 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, i1:$Float": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
Expand Down
36 changes: 20 additions & 16 deletions FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -780,11 +780,12 @@ void X87StackOptimization::Run(IREmitter* Emit) {
break;
}

case OP_STORESTACKMEMORY: {
const auto* Op = IROp->C<IROp_StoreStackMemory>();
case OP_STORESTACKMEM: {
const auto* Op = IROp->C<IROp_StoreStackMem>();
const auto& Value = MigrateToSlowPath_IfInvalid();
Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
Ref AddrNode = CurrentIR.GetNode(Op->Addr);
Ref Offset = CurrentIR.GetNode(Op->Offset);

// On the fast path we can optimize memory copies.
// If we are doing:
Expand All @@ -796,45 +797,48 @@ void X87StackOptimization::Run(IREmitter* Emit) {
// or similar. As long as the source size and dest size are one and the same.
// This will avoid any conversions between source and stack element size and conversion back.
if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) {
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second);
fprintf(stderr, "Optimizing memcpy\n");
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->second, AddrNode, Offset,
OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
} else {
if (ReducedPrecisionMode) {
switch (Op->StoreSize) {
case OpSize::i32Bit: {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i32Bit, AddrNode, StackNode);
break;
}
case OpSize::i32Bit:
case OpSize::i64Bit: {
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
if (Op->StoreSize == OpSize::i32Bit) {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
}
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
break;
}
case OpSize::f80Bit: {
StackNode = IREmit->_F80CVTTo(StackNode, OpSize::i64Bit);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, GetConstant(8), OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
break;
}
default: ERROR_AND_DIE_FMT("Unsupported x87 size");
}
} else {
} else { // !ReducedPrecisionMode
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5);
AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset);
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit);
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
}
} else {
IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
}
}
}
Expand Down
44 changes: 44 additions & 0 deletions unittests/32Bit_ASM/X87/FST_AddrModes.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x3f800000",
"RBX": "0x3f800000",
"RCX": "0x3f800000",
"RBP": "0x3f800000",
"RDI": "0x3f800000",
"RSP": "0x3f800000"
},
"MemoryRegions": {
"0xf0000000": "4096"
},
"Mode": "32BIT"
}
%endif

; 32-bit test: store ST(0) with `fst dword` through six addressing-mode
; shapes (base, base+disp, base+index, base+index*scale, base+index+disp,
; base+index*scale+disp) and read every slot back into its own register.
;
; RegData above lists 0x3f800000 for each of the six result registers;
; that is the IEEE-754 single-precision encoding of 1.0, the value that
; fld1 pushes. Presumably the test harness compares the final register
; state against RegData.
;
; The MemoryRegions entry in CONFIG (0xf0000000) is not referenced by the
; code below; all stores go to the .bss buffer `base`.

section .bss
; 4096-byte scratch buffer that receives the stores.
base resb 4096

section .text

; Setup: ST(0) = 1.0, edx = buffer address, esi = index (0x64).
fld1
; NOTE(review): `rel` asks for RIP-relative addressing, which only exists in
; 64-bit mode - confirm the assembler accepts/ignores it for this 32-bit test.
lea edx, [rel base]
mov esi, 0x64

; Test fst (no pop, so ST(0) stays 1.0 for every store).
; With esi = 0x64 the six 4-byte stores land at buffer offsets
; 0x0, 0xa, 0x64, 0x190, 0x6e and 0x19a: none overlap, and all lie
; inside the 4096-byte buffer.
fst dword [edx]
fst dword [edx + 0xa]
fst dword [edx + esi]
fst dword [edx + esi * 4]
fst dword [edx + esi + 0xa]
fst dword [edx + esi * 4 + 0xa]

; Result check: reload each slot using the same addressing form it was
; stored with. ebp and edi are loaded before esp; esp is overwritten last
; and only hlt follows, so no stack access happens afterwards.
mov eax, dword [edx]
mov ebx, dword [edx + 0xa]
mov ecx, dword [edx + esi]
mov ebp, dword [edx + esi * 4]
mov edi, dword [edx + esi + 0xa]
mov esp, dword [edx + esi * 4 + 0xa]

hlt
43 changes: 43 additions & 0 deletions unittests/ASM/X87/FST_AddrModes.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x3f800000",
"RBX": "0x3f800000",
"RCX": "0x3f800000",
"R8": "0x3f800000",
"R9": "0x3f800000",
"R10": "0x3f800000"
},
"MemoryRegions": {
"0x100000000": "4096"
}
}
%endif

; 64-bit test: store ST(0) with `fst dword` through six addressing-mode
; shapes (base, base+disp, base+index, base+index*scale, base+index+disp,
; base+index*scale+disp) and read every slot back into its own register.
;
; RegData above lists 0x3f800000 for each of the six result registers;
; that is the IEEE-754 single-precision encoding of 1.0, the value that
; fld1 pushes. Presumably the test harness compares the final register
; state against RegData.
;
; The MemoryRegions entry in CONFIG (0x100000000) is not referenced by the
; code below; all stores go to the .bss buffer `base`.

section .bss
; 4096-byte scratch buffer that receives the stores.
base resb 4096

section .text

; Setup: ST(0) = 1.0, rdx = buffer address, rsi = index (0x64).
fld1
lea rdx, [rel base]
mov rsi, 0x64

; Test fst (no pop, so ST(0) stays 1.0 for every store).
; With rsi = 0x64 the six 4-byte stores land at buffer offsets
; 0x0, 0xa, 0x64, 0x190, 0x6e and 0x19a: none overlap, and all lie
; inside the 4096-byte buffer.
fst dword [rdx]
fst dword [rdx + 0xa]
fst dword [rdx + rsi]
fst dword [rdx + rsi * 4]
fst dword [rdx + rsi + 0xa]
fst dword [rdx + rsi * 4 + 0xa]

; Result check: reload each slot using the same addressing form it was
; stored with. The loads target 32-bit registers, which zero-extends into
; the full 64-bit register, matching the 0x3f800000 values in RegData.
mov eax, dword [rdx]
mov ebx, dword [rdx + 0xa]
mov ecx, dword [rdx + rsi]
mov r8d, dword [rdx + rsi * 4]
mov r9d, dword [rdx + rsi + 0xa]
mov r10d, dword [rdx + rsi * 4 + 0xa]

hlt
Loading

0 comments on commit 20d6c5e

Please sign in to comment.