Skip to content

Commit

Permalink
Merge pull request FEX-Emu#4292 from pmatos/EnsurePredCacheReset2
Browse files Browse the repository at this point in the history
Predicate cache alternative implementation
  • Loading branch information
Sonicadvance1 authored Jan 29, 2025
2 parents 2a4c169 + ddd241f commit b148cc6
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 69 deletions.
19 changes: 11 additions & 8 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: MIT
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"
#include "FEXCore/Core/X86Enums.h"
#include "FEXCore/Utils/AllocatorHooks.h"
#include "Interface/Core/Dispatcher/Dispatcher.h"
#include "Interface/Context/Context.h"

Expand Down Expand Up @@ -60,8 +59,8 @@ namespace x64 {
// p6 and p7 registers are used as temporaries no not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5.
constexpr std::array<ARMEmitter::PRegister, 3> PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

constexpr unsigned RAPairs = 6;

Expand Down Expand Up @@ -100,6 +99,7 @@ namespace x64 {
ARMEmitter::Reg::r20,
ARMEmitter::Reg::r21,
ARMEmitter::Reg::r22,
// PF/AF must be last.
REG_PF,
REG_AF,
};
Expand All @@ -112,8 +112,8 @@ namespace x64 {
// p6 and p7 registers are used as temporaries no not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5.
constexpr std::array<ARMEmitter::PRegister, 3> PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

constexpr unsigned RAPairs = 6;

Expand Down Expand Up @@ -249,8 +249,8 @@ namespace x32 {
// p6 and p7 registers are used as temporaries no not added here for RA
// See PREF_TMP_16B and PREF_TMP_32B
// p0-p1 are also used in the jit as temps.
// Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5.
constexpr std::array<ARMEmitter::PRegister, 4> PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};
// Also p8-p15 cannot be used can only encode p0-p7, p2 is a special register, so we're left with p3-p5.
constexpr std::array<ARMEmitter::PRegister, 3> PR = {ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5};

// All are caller saved
constexpr std::array<ARMEmitter::VRegister, 8> SRAFPR = {
Expand Down Expand Up @@ -631,7 +631,7 @@ void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Regi
}
#endif

if (SetPredRegs) {
if (SetPredRegs && (EmitterCTX->HostFeatures.SupportsSVE256 || EmitterCTX->HostFeatures.SupportsSVE128)) {
// Set up predicate registers.
// We don't bother spilling these in SpillStaticRegs,
// since all that matters is we restore them on a fill.
Expand All @@ -643,6 +643,9 @@ void Arm64Emitter::FillSpecialRegs(ARMEmitter::Register TmpReg, ARMEmitter::Regi
if (EmitterCTX->HostFeatures.SupportsSVE128) {
ptrue(ARMEmitter::SubRegSize::i8Bit, PRED_TMP_16B, ARMEmitter::PredicatePattern::SVE_VL16);
}

// Fill in the predicate register for the x87 ldst SVE optimization.
ptrue(ARMEmitter::SubRegSize::i16Bit, PRED_X87_SVEOPT, ARMEmitter::PredicatePattern::SVE_VL5);
}
}

Expand Down
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ constexpr auto REG_AF = ARMEmitter::Reg::r27;
// Vector temporaries
constexpr auto VTMP1 = ARMEmitter::VReg::v0;
constexpr auto VTMP2 = ARMEmitter::VReg::v1;

// Predicate register for X87 SVE Optimization
constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2;

#else
constexpr auto TMP1 = ARMEmitter::XReg::x10;
constexpr auto TMP2 = ARMEmitter::XReg::x11;
Expand All @@ -65,6 +69,9 @@ constexpr auto VTMP2 = ARMEmitter::VReg::v17;
constexpr auto EC_CALL_CHECKER_PC_REG = ARMEmitter::XReg::x9;
constexpr auto EC_ENTRY_CPUAREA_REG = ARMEmitter::XReg::x17;

// Predicate register for X87 SVE Optimization
constexpr auto SVE_OPT_PRED = ARMEmitter::PReg::p2;

// These structures are not included in the standard Windows headers, define the offsets of members we care about for EC here.
constexpr size_t TEB_CPU_AREA_OFFSET = 0x1788;
constexpr size_t TEB_PEB_OFFSET = 0x60;
Expand All @@ -79,6 +86,9 @@ constexpr uint64_t EC_CODE_BITMAP_MAX_ADDRESS = 1ULL << 47;
// Will force one single instruction block to be generated first if set when entering the JIT filling SRA.
constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP1;

// Predicate to use in the X87 SVE optimization
constexpr ARMEmitter::PRegister PRED_X87_SVEOPT = ARMEmitter::PReg::p2;

// Predicate register temporaries (used when AVX support is enabled)
// PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1.
// PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1.
Expand Down
24 changes: 9 additions & 15 deletions FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ tags: backend|arm64
#include "FEXCore/Core/X86Enums.h"
#include "FEXCore/Utils/LogManager.h"
#include "Interface/Context/Context.h"
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"
#include "Interface/Core/CPUID.h"
#include "Interface/Core/JIT/JITClass.h"
#include "Interface/IR/IR.h"
Expand Down Expand Up @@ -1590,21 +1591,14 @@ DEF_OP(StoreMem) {
}
}

DEF_OP(InitPredicate) {
const auto Op = IROp->C<IR::IROp_InitPredicate>();
const auto OpSize = IROp->Size;
ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), static_cast<ARMEmitter::PredicatePattern>(Op->Pattern));
}
DEF_OP(StoreMemX87SVEOptPredicate) {
const auto Op = IROp->C<IR::IROp_StoreMemX87SVEOptPredicate>();
const auto Predicate = PRED_X87_SVEOPT;

DEF_OP(StoreMemPredicate) {
const auto Op = IROp->C<IR::IROp_StoreMemPredicate>();
const auto Predicate = GetPReg(Op->Mask.ID());
LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemX87SVEOptPredicate needs SVE support");

const auto RegData = GetVReg(Op->Value.ID());
const auto MemReg = GetReg(Op->Addr.ID());

LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemPredicate needs SVE support");

const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0);

switch (IROp->ElementSize) {
Expand All @@ -1628,13 +1622,13 @@ DEF_OP(StoreMemPredicate) {
}
}

DEF_OP(LoadMemPredicate) {
const auto Op = IROp->C<IR::IROp_LoadMemPredicate>();
DEF_OP(LoadMemX87SVEOptPredicate) {
const auto Op = IROp->C<IR::IROp_LoadMemX87SVEOptPredicate>();
const auto Dst = GetVReg(Node);
const auto Predicate = GetPReg(Op->Mask.ID());
const auto Predicate = PRED_X87_SVEOPT;
const auto MemReg = GetReg(Op->Addr.ID());

LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemPredicate needs SVE support");
LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemX87SVEOptPredicate needs SVE support");

const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0);

Expand Down
7 changes: 2 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4313,9 +4313,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T
if (OpSize == OpSize::f80Bit) {
Ref MemSrc = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
// Using SVE we can load this with a single instruction.
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc);
return _LoadMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, MemSrc);
} else {
// For X87 extended doubles, Split the load.
auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? OpSize : Align);
Expand Down Expand Up @@ -4448,8 +4446,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl
if (OpSize == OpSize::f80Bit) {
Ref MemStoreDst = LoadEffectiveAddress(A, true);
if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) {
auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst);
_StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, MemStoreDst);
} else {
// For X87 extended doubles, split before storing
_StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align);
Expand Down
15 changes: 6 additions & 9 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -567,19 +567,16 @@
]
},

"PRED = InitPredicate OpSize:#Size, u8:$Pattern": {
"Desc": ["Initialize predicate register from Pattern"],
"DestSize": "Size"
},

"StoreMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, PRED:$Mask, GPR:$Addr": {
"Desc": [ "Stores a value to memory using SVE predicate mask." ],
"StoreMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, GPR:$Addr": {
"Desc": [ "Stores a value to memory using SVE predicate mask that's designed",
"specifically for use in the X87 SVE Ldst optimization." ],
"DestSize": "RegisterSize",
"HasSideEffects": true,
"ElementSize": "ElementSize"
},
"FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": {
"Desc": [ "Loads a value to memory using SVE predicate mask." ],
"FPR = LoadMemX87SVEOptPredicate OpSize:#RegisterSize, OpSize:#ElementSize, GPR:$Addr": {
"Desc": [ "Loads a value to memory using SVE predicate mask that's designed",
"specifically for use in the X87 SVE Ldst optimization." ],
"DestSize": "RegisterSize",
"ElementSize": "ElementSize"
},
Expand Down
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/IR/IREmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) {
case FPRClass:
case GPRFixedClass:
case FPRFixedClass:
case PREDClass:
case InvalidClass: return Class;
default: break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "FEXCore/IR/IR.h"
#include "FEXCore/Utils/Profiler.h"
#include "FEXCore/Core/HostFeatures.h"
#include "CodeEmitter/Emitter.h"
#include "Interface/Core/ArchHelpers/Arm64Emitter.h"

#include <array>
#include <cstddef>
Expand Down Expand Up @@ -838,13 +838,12 @@ void X87StackOptimization::Run(IREmitter* Emit) {
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Op->StoreSize == OpSize::f80Bit) {
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
if (!IsZero(Offset)) {
AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset);
}
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
IREmit->_StoreMemX87SVEOptPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
Expand Down
36 changes: 36 additions & 0 deletions unittests/ASM/X87/MemcopyWithCPUID.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
%ifdef CONFIG
{
"RegData": {
"RBX": "0x8000000000000000",
"RCX": "0x3fff"
}
}
%endif

; Related to #4274 - ensures that if cpuid clobbers the predicate register,
; we reset the predicate cache.

section .data
align 8

data:
dt 1.0

section .bss
align 8

data2:
resb 10

section .text
lea r8, [rel data]
fld tword [r8]

mov rax, 0x0
cpuid ; Will this instruction clobber the predicate register?

fstp tword [rel data2]

mov rbx, [rel data2]
mov rcx, [rel data2+8]
hlt
Loading

0 comments on commit b148cc6

Please sign in to comment.