From 3160e0a4307c3aa2df8cba4ba38d49e5d201f015 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 11 Feb 2025 12:54:00 -0800 Subject: [PATCH 1/2] FEXCore: Keep PCMPISTRI arguments in vectors longer This reduces our codegen size and removes a few umov instructions. Performance falls within noise but this small change will allow us to do more vector optimizations in C code in the future. --- FEXCore/Source/Common/SoftFloat.h | 17 +++---- FEXCore/Source/Common/VectorRegType.h | 16 +++++++ .../Fallbacks/InterpreterFallbacks.cpp | 2 +- .../Fallbacks/StringCompareFallbacks.cpp | 20 ++++----- .../Interpreter/Fallbacks/VectorFallbacks.h | 3 +- .../Core/Interpreter/InterpreterOps.h | 4 +- FEXCore/Source/Interface/Core/JIT/JIT.cpp | 45 ++++++++++--------- 7 files changed, 59 insertions(+), 48 deletions(-) create mode 100644 FEXCore/Source/Common/VectorRegType.h diff --git a/FEXCore/Source/Common/SoftFloat.h b/FEXCore/Source/Common/SoftFloat.h index f7169e3e01..5757efde75 100644 --- a/FEXCore/Source/Common/SoftFloat.h +++ b/FEXCore/Source/Common/SoftFloat.h @@ -10,14 +10,7 @@ #include #include -#ifdef _M_ARM_64 -// Can't use uint8x16_t directly from arm_neon.h here. -// Overrides softfloat-3e's defines which causes problems. -using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t; -#elif defined(_M_X86_64) -#include -using VectorRegType = __m128i; -#endif +#include "Common/VectorRegType.h" extern "C" { #include "SoftFloat-3e/platform.h" @@ -485,8 +478,8 @@ struct FEX_PACKED X80SoftFloat { return FEXCore::BitCast(Result); } - VectorRegType ToVector() const { - VectorRegType Ret {}; + FEXCore::VectorRegType ToVector() const { + FEXCore::VectorRegType Ret {}; memcpy(&Ret, this, sizeof(*this)); return Ret; } @@ -582,7 +575,7 @@ struct FEX_PACKED X80SoftFloat { *this = i32_to_extF80(rhs); } - X80SoftFloat(const VectorRegType rhs) { + X80SoftFloat(const FEXCore::VectorRegType rhs) { memcpy(this, &rhs, sizeof(*this)); } @@ -592,7 +585,7 @@ struct FEX_PACKED X80SoftFloat { Sign = rhs.signExp >> 15; } - operator VectorRegType() const { + operator FEXCore::VectorRegType() const { return ToVector(); } diff --git a/FEXCore/Source/Common/VectorRegType.h b/FEXCore/Source/Common/VectorRegType.h new file mode 100644 index 0000000000..8265832b6f --- /dev/null +++ b/FEXCore/Source/Common/VectorRegType.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: MIT +#pragma once + +#ifdef _M_X86_64 +#include +#endif + +namespace FEXCore { +#ifdef _M_ARM_64 +// Can't use uint8x16_t directly from arm_neon.h here. +// Overrides softfloat-3e's defines which causes problems. +using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t; +#elif defined(_M_X86_64) +using VectorRegType = __m128i; +#endif +} // namespace FEXCore diff --git a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp index 8e88715eb1..16d045e025 100644 --- a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp +++ b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/InterpreterFallbacks.cpp @@ -230,7 +230,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I SupportsPreserveAllABI}; return true; case IR::OP_VPCMPISTRX: - *Info = {FABI_I32_I128_I128_I16, (void*)&FEXCore::CPU::OpHandlers::handle, Core::OPINDEX_VPCMPISTRX, SupportsPreserveAllABI}; + *Info = {FABI_I32_V128_V128_I16, (void*)&FEXCore::CPU::OpHandlers::handle, Core::OPINDEX_VPCMPISTRX, SupportsPreserveAllABI}; return true; default: break; diff --git a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/StringCompareFallbacks.cpp b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/StringCompareFallbacks.cpp index 6a33b2e910..bb0735834b 100644 --- a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/StringCompareFallbacks.cpp +++ b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/StringCompareFallbacks.cpp @@ -1,5 +1,4 @@ // SPDX-License-Identifier: MIT -#include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h" #include "Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h" #include "Interface/IR/IR.h" @@ -11,12 +10,11 @@ namespace FEXCore::CPU { #ifdef _M_ARM_64 -FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& data, uint16_t control) { +FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) { const auto is_using_words = (control & 1) != 0; if (is_using_words) { - uint16x8_t a {}; - memcpy(&a, &data, sizeof(a)); + uint16x8_t a = vreinterpretq_u16_u8(data); uint16x8_t VIndexes {}; const uint16x8_t VIndex16 = vdupq_n_u16(8); uint16_t Indexes[8] = { @@ -27,21 +25,19 @@ FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& da auto SelectResult = vbslq_u16(MaskResult, VIndexes, VIndex16); return vminvq_u16(SelectResult); } else { - uint8x16_t a {}; - memcpy(&a, &data, sizeof(a)); uint8x16_t VIndexes {}; const uint8x16_t VIndex16 = vdupq_n_u8(16); uint8_t Indexes[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; memcpy(&VIndexes, Indexes, sizeof(VIndexes)); - auto MaskResult = vceqzq_u8(a); + auto MaskResult = vceqzq_u8(data); auto SelectResult = vbslq_u8(MaskResult, VIndexes, VIndex16); return vminvq_u8(SelectResult); } } #else -FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& data, uint16_t control) { +FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) { const auto* data_u8 = reinterpret_cast(&data); const auto is_using_words = (control & 1) != 0; @@ -80,12 +76,16 @@ FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& da // to be the max length possible for the given character size specified // in the control flags (16 characters for 8-bit, and 8 characters for 16-bit). // -FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers::handle(__uint128_t lhs, __uint128_t rhs, uint16_t control) { +FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers::handle(FEXCore::VectorRegType lhs, FEXCore::VectorRegType rhs, uint16_t control) { // Subtract by 1 in order to make validity limits 0-based const auto valid_lhs = GetImplicitLength(lhs, control) - 1; const auto valid_rhs = GetImplicitLength(rhs, control) - 1; + __uint128_t lhs_i; + memcpy(&lhs_i, &lhs, sizeof(lhs_i)); + __uint128_t rhs_i; + memcpy(&rhs_i, &rhs, sizeof(rhs_i)); - return OpHandlers::MainBody(lhs, valid_lhs, rhs, valid_rhs, control); + return OpHandlers::MainBody(lhs_i, valid_lhs, rhs_i, valid_rhs, control); } } // namespace FEXCore::CPU diff --git a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h index 7a8e9c3198..0328eec5fb 100644 --- a/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h +++ b/FEXCore/Source/Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h @@ -8,6 +8,7 @@ #include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h" #include "Interface/IR/IR.h" +#include "Common/VectorRegType.h" namespace FEXCore::CPU { @@ -343,7 +344,7 @@ struct OpHandlers { template<> struct OpHandlers { - FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(__uint128_t lhs, __uint128_t rhs, uint16_t control); + FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(VectorRegType lhs, VectorRegType rhs, uint16_t control); }; } // namespace FEXCore::CPU diff --git a/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h b/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h index aa53b130d4..d256373e17 100644 --- a/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h +++ b/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h @@ -1,8 +1,6 @@ // SPDX-License-Identifier: MIT #pragma once -#include -#include #include #include @@ -31,7 +29,7 @@ enum FallbackABI { FABI_F80_I16_F80, FABI_F80_I16_F80_F80, FABI_I32_I64_I64_I128_I128_I16, - FABI_I32_I128_I128_I16, + FABI_I32_V128_V128_I16, }; struct FallbackInfo { diff --git a/FEXCore/Source/Interface/Core/JIT/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/JIT.cpp index f7db5d6d3f..3227dbf5c8 100644 --- a/FEXCore/Source/Interface/Core/JIT/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/JIT.cpp @@ -126,7 +126,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -142,7 +142,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -163,7 +163,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2); } else { blr(ARMEmitter::Reg::r2); } @@ -181,7 +181,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -205,7 +205,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -265,7 +265,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -288,7 +288,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -305,7 +305,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -336,7 +336,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -359,7 +359,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { mov(ARMEmitter::VReg::v0.Q(), Src1.Q()); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -384,7 +384,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { blr(ARMEmitter::Reg::r1); } @@ -428,7 +428,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { FillI32Result(); } break; - case FABI_I32_I128_I128_I16: { + case FABI_I32_V128_V128_I16: { SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); const auto Op = IROp->C(); @@ -437,19 +437,22 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) { const auto Src2 = GetVReg(Op->RHS.ID()); const auto Control = Op->Control; - umov(ARMEmitter::Reg::r0, Src1, 0); - umov(ARMEmitter::Reg::r1, Src1, 1); - - umov(ARMEmitter::Reg::r2, Src2, 0); - umov(ARMEmitter::Reg::r3, Src2, 1); + if (!TMP_ABIARGS) { + mov(VTMP1.Q(), Src1.Q()); + mov(ARMEmitter::VReg::v1.Q(), Src2.Q()); + mov(ARMEmitter::VReg::v0.Q(), VTMP1.Q()); + } else { + mov(ARMEmitter::VReg::v0.Q(), Src1.Q()); + mov(ARMEmitter::VReg::v1.Q(), Src2.Q()); + } - movz(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r4, Control); + movz(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r0, Control); - ldr(ARMEmitter::XReg::x5, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); + ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r5); + GenerateIndirectRuntimeCall(ARMEmitter::Reg::r1); } else { - blr(ARMEmitter::Reg::r5); + blr(ARMEmitter::Reg::r1); } FillI32Result(); From 549cdc4c2c04ed423b4ae8885c52acdf51d89791 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 11 Feb 2025 12:57:43 -0800 Subject: [PATCH 2/2] InstcountCI: Update --- .../InstructionCountCI/FlagM/HotBlocks.json | 14 ++++------ .../InstructionCountCI/SSE42_Strings.json | 28 ++++++++----------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks.json b/unittests/InstructionCountCI/FlagM/HotBlocks.json index 08b4be34d5..0f314aab2a 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks.json @@ -759,7 +759,7 @@ ] }, "pcmpistri xmm0, xmm1, 0_0_00_11_01b": { - "ExpectedInstructionCount": 38, + "ExpectedInstructionCount": 36, "Comment": [ "A Hat In Time spends at least 5% CPU time in this instruction", "Comes from vcruntime140.dll wcsstr" @@ -776,13 +776,11 @@ "st1 {v2.2d, v3.2d}, [x0], #32", "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64", "str x30, [x0], #16", - "mov x0, v16.d[0]", - "mov x1, v16.d[1]", - "mov x2, v17.d[0]", - "mov x3, v17.d[1]", - "mov w4, #0xd", - "ldr x5, [x28, #1760]", - "blr x5", + "mov v0.16b, v16.16b", + "mov v1.16b, v17.16b", + "mov w0, #0xd", + "ldr x1, [x28, #1760]", + "blr x1", "ldr w4, [x28, #1000]", "msr nzcv, x4", "ldp x4, x7, [x28, #280]", diff --git a/unittests/InstructionCountCI/SSE42_Strings.json b/unittests/InstructionCountCI/SSE42_Strings.json index 9829d7c564..3647de8013 100644 --- a/unittests/InstructionCountCI/SSE42_Strings.json +++ b/unittests/InstructionCountCI/SSE42_Strings.json @@ -128,7 +128,7 @@ ] }, "pcmpistrm xmm0, xmm1, 0_0_00_00_00b": { - "ExpectedInstructionCount": 34, + "ExpectedInstructionCount": 32, "Comment": [ "0x66 0x0f 0x3A 0x62" ], @@ -144,13 +144,11 @@ "st1 {v2.2d, v3.2d}, [x0], #32", "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64", "str x30, [x0], #16", - "mov x0, v16.d[0]", - "mov x1, v16.d[1]", - "mov x2, v17.d[0]", - "mov x3, v17.d[1]", - "mov w4, #0x0", - "ldr x5, [x28, #1760]", - "blr x5", + "mov v0.16b, v16.16b", + "mov v1.16b, v17.16b", + "mov w0, #0x0", + "ldr x1, [x28, #1760]", + "blr x1", "ldr w4, [x28, #1000]", "msr nzcv, x4", "ldp x4, x7, [x28, #280]", @@ -170,7 +168,7 @@ ] }, "pcmpistri xmm0, xmm1, 0_0_00_00_00b": { - "ExpectedInstructionCount": 38, + "ExpectedInstructionCount": 36, "Comment": [ "0x66 0x0f 0x3A 0x63" ], @@ -186,13 +184,11 @@ "st1 {v2.2d, v3.2d}, [x0], #32", "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64", "str x30, [x0], #16", - "mov x0, v16.d[0]", - "mov x1, v16.d[1]", - "mov x2, v17.d[0]", - "mov x3, v17.d[1]", - "mov w4, #0x0", - "ldr x5, [x28, #1760]", - "blr x5", + "mov v0.16b, v16.16b", + "mov v1.16b, v17.16b", + "mov w0, #0x0", + "ldr x1, [x28, #1760]", + "blr x1", "ldr w4, [x28, #1000]", "msr nzcv, x4", "ldp x4, x7, [x28, #280]",