diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index cda8d9c509..8ef4842249 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -223,7 +223,7 @@ def parse_ops(ops): (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or OpArg.Type == "FPR" or - OpArg.Type == "PR")): + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 6dd2d8b108..f729823364 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9545e87612..6225f2085e 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -718,7 +718,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +763,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); @@ -1175,9 +1171,11 @@ class OpDispatchBuilder final : public IREmitter { } void FlushRegisterCache(bool SRAOnly = false) { - // At block boundaries, fix up the carry flag. + + // At block boundaries, fix up the carry flag, and reset the predicate cache. if (!SRAOnly) { RectifyCarryInvert(CFInvertedABI); + ResetInitPredicateCache(); } CalculateDeferredFlags(); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 1470768f8f..66da88425f 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i128Bit, true, Width); + Ref PredReg = Invalid(); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); + } + _StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index ca4e91f0b5..7313125185 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i64Bit, true, Width); + _StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 1cb3e38690..5414619689 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -2788,13 +2788,14 @@ "HasSideEffects": true, "X87": true }, - "StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { + "StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit", - "Int: 64-bit, 32-bit, 16-bit" + "Int: 64-bit, 32-bit, 16-bit", + "If possible, it will use the PredReg for an SVE store." ], "HasSideEffects": true, "X87": true diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 0850187b1c..95cb2e73dd 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 0cfc4027be..c5af4efdd3 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,9 +10,9 @@ #include #include +#include #include -#include #include #include @@ -45,6 +46,37 @@ class IREmitter { } void ResetWorkingList(); + // Predicate Cache Implementation + // This lives here rather than OpcodeDispatcher because x87StackOptimization Pass + // also needs it. + struct PredicateKey { + ARMEmitter::PredicatePattern Pattern; + OpSize Size; + bool operator==(const PredicateKey& rhs) const = default; + }; + + struct PredicateKeyHash { + size_t operator()(const PredicateKey& key) const { + return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid)); + } + }; + fextl::unordered_map InitPredicateCache; + + Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) { + PredicateKey Key {Pattern, Size}; + auto ValIt = InitPredicateCache.find(Key); + if (ValIt == InitPredicateCache.end()) { + auto Predicate = _InitPredicate(Size, static_cast(FEXCore::ToUnderlying(Pattern))); + InitPredicateCache[Key] = Predicate; + return Predicate; + } + return ValIt->second; + } + + void ResetInitPredicateCache() { + InitPredicateCache.clear(); + } + /** * @name IR allocation routines * diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 247b54633e..476d9f9eab 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + if (Op->StoreSize == OpSize::f80Bit) { + Ref PredReg = CurrentIR.GetNode(Op->PredReg); + bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg; + if (CanUsePredicateStore) { + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);