Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEXCore: Keep PCMPISTRI arguments in vectors longer #4347

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions FEXCore/Source/Common/SoftFloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,7 @@
#include <cstring>
#include <stdint.h>

#ifdef _M_ARM_64
// Can't use uint8x16_t directly from arm_neon.h here.
// Overrides softfloat-3e's defines which causes problems.
using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t;
#elif defined(_M_X86_64)
#include <xmmintrin.h>
using VectorRegType = __m128i;
#endif
#include "Common/VectorRegType.h"

extern "C" {
#include "SoftFloat-3e/platform.h"
Expand Down Expand Up @@ -485,8 +478,8 @@ struct FEX_PACKED X80SoftFloat {
return FEXCore::BitCast<double>(Result);
}

VectorRegType ToVector() const {
VectorRegType Ret {};
FEXCore::VectorRegType ToVector() const {
FEXCore::VectorRegType Ret {};
memcpy(&Ret, this, sizeof(*this));
return Ret;
}
Expand Down Expand Up @@ -582,7 +575,7 @@ struct FEX_PACKED X80SoftFloat {
*this = i32_to_extF80(rhs);
}

X80SoftFloat(const VectorRegType rhs) {
X80SoftFloat(const FEXCore::VectorRegType rhs) {
memcpy(this, &rhs, sizeof(*this));
}

Expand All @@ -592,7 +585,7 @@ struct FEX_PACKED X80SoftFloat {
Sign = rhs.signExp >> 15;
}

operator VectorRegType() const {
operator FEXCore::VectorRegType() const {
return ToVector();
}

Expand Down
16 changes: 16 additions & 0 deletions FEXCore/Source/Common/VectorRegType.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// SPDX-License-Identifier: MIT
#pragma once

#ifdef _M_X86_64
#include <xmmintrin.h>
#endif

namespace FEXCore {
#ifdef _M_ARM_64
// Can't use uint8x16_t directly from arm_neon.h here.
// Overrides softfloat-3e's defines which causes problems.
using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t;
#elif defined(_M_X86_64)
using VectorRegType = __m128i;
#endif
} // namespace FEXCore
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
SupportsPreserveAllABI};
return true;
case IR::OP_VPCMPISTRX:
*Info = {FABI_I32_I128_I128_I16, (void*)&FEXCore::CPU::OpHandlers<IR::OP_VPCMPISTRX>::handle, Core::OPINDEX_VPCMPISTRX, SupportsPreserveAllABI};
*Info = {FABI_I32_V128_V128_I16, (void*)&FEXCore::CPU::OpHandlers<IR::OP_VPCMPISTRX>::handle, Core::OPINDEX_VPCMPISTRX, SupportsPreserveAllABI};
return true;

default: break;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: MIT
#include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h"
#include "Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h"
#include "Interface/IR/IR.h"

Expand All @@ -11,12 +10,11 @@

namespace FEXCore::CPU {
#ifdef _M_ARM_64
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& data, uint16_t control) {
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) {
const auto is_using_words = (control & 1) != 0;

if (is_using_words) {
uint16x8_t a {};
memcpy(&a, &data, sizeof(a));
uint16x8_t a = vreinterpretq_u16_u8(data);
uint16x8_t VIndexes {};
const uint16x8_t VIndex16 = vdupq_n_u16(8);
uint16_t Indexes[8] = {
Expand All @@ -27,21 +25,19 @@ FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& da
auto SelectResult = vbslq_u16(MaskResult, VIndexes, VIndex16);
return vminvq_u16(SelectResult);
} else {
uint8x16_t a {};
memcpy(&a, &data, sizeof(a));
uint8x16_t VIndexes {};
const uint8x16_t VIndex16 = vdupq_n_u8(16);
uint8_t Indexes[16] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
memcpy(&VIndexes, Indexes, sizeof(VIndexes));
auto MaskResult = vceqzq_u8(a);
auto MaskResult = vceqzq_u8(data);
auto SelectResult = vbslq_u8(MaskResult, VIndexes, VIndex16);
return vminvq_u8(SelectResult);
}
}
#else
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& data, uint16_t control) {
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) {
const auto* data_u8 = reinterpret_cast<const uint8_t*>(&data);
const auto is_using_words = (control & 1) != 0;

Expand Down Expand Up @@ -80,12 +76,16 @@ FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& da
// to be the max length possible for the given character size specified
// in the control flags (16 characters for 8-bit, and 8 characters for 16-bit).
//
FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers<IR::OP_VPCMPISTRX>::handle(__uint128_t lhs, __uint128_t rhs, uint16_t control) {
FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers<IR::OP_VPCMPISTRX>::handle(FEXCore::VectorRegType lhs, FEXCore::VectorRegType rhs, uint16_t control) {
// Subtract by 1 in order to make validity limits 0-based
const auto valid_lhs = GetImplicitLength(lhs, control) - 1;
const auto valid_rhs = GetImplicitLength(rhs, control) - 1;
__uint128_t lhs_i;
memcpy(&lhs_i, &lhs, sizeof(lhs_i));
__uint128_t rhs_i;
memcpy(&rhs_i, &rhs, sizeof(rhs_i));

return OpHandlers<IR::OP_VPCMPESTRX>::MainBody(lhs, valid_lhs, rhs, valid_rhs, control);
return OpHandlers<IR::OP_VPCMPESTRX>::MainBody(lhs_i, valid_lhs, rhs_i, valid_rhs, control);
}

} // namespace FEXCore::CPU
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h"
#include "Interface/IR/IR.h"
#include "Common/VectorRegType.h"

namespace FEXCore::CPU {

Expand Down Expand Up @@ -343,7 +344,7 @@ struct OpHandlers<IR::OP_VPCMPESTRX> {

template<>
struct OpHandlers<IR::OP_VPCMPISTRX> {
FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(__uint128_t lhs, __uint128_t rhs, uint16_t control);
FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(VectorRegType lhs, VectorRegType rhs, uint16_t control);
};

} // namespace FEXCore::CPU
4 changes: 1 addition & 3 deletions FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: MIT
#pragma once

#include <array>
#include <cstddef>
#include <cstdint>

#include <FEXCore/Core/CoreState.h>
Expand Down Expand Up @@ -31,7 +29,7 @@ enum FallbackABI {
FABI_F80_I16_F80,
FABI_F80_I16_F80_F80,
FABI_I32_I64_I64_I128_I128_I16,
FABI_I32_I128_I128_I16,
FABI_I32_V128_V128_I16,
};

struct FallbackInfo {
Expand Down
45 changes: 24 additions & 21 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW));
ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, float>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, float>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -142,7 +142,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW));
ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, double>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, double>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -163,7 +163,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW));
ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, uint32_t>(ARMEmitter::Reg::r2);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, uint32_t>(ARMEmitter::Reg::r2);
} else {
blr(ARMEmitter::Reg::r2);
}
Expand All @@ -181,7 +181,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<float, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<float, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -205,7 +205,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<double, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<double, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand Down Expand Up @@ -265,7 +265,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint32_t, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint32_t, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -288,7 +288,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint32_t, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint32_t, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -305,7 +305,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint64_t, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint64_t, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand Down Expand Up @@ -336,7 +336,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint64_t, uint16_t, VectorRegType, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint64_t, uint16_t, FEXCore::VectorRegType, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -359,7 +359,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
mov(ARMEmitter::VReg::v0.Q(), Src1.Q());

if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -384,7 +384,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, VectorRegType, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, FEXCore::VectorRegType, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand Down Expand Up @@ -428,7 +428,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

FillI32Result();
} break;
case FABI_I32_I128_I128_I16: {
case FABI_I32_V128_V128_I16: {
SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true);

const auto Op = IROp->C<IR::IROp_VPCMPISTRX>();
Expand All @@ -437,19 +437,22 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
const auto Src2 = GetVReg(Op->RHS.ID());
const auto Control = Op->Control;

umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r0, Src1, 0);
umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r1, Src1, 1);

umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r2, Src2, 0);
umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r3, Src2, 1);
if (!TMP_ABIARGS) {
mov(VTMP1.Q(), Src1.Q());
mov(ARMEmitter::VReg::v1.Q(), Src2.Q());
mov(ARMEmitter::VReg::v0.Q(), VTMP1.Q());
} else {
mov(ARMEmitter::VReg::v0.Q(), Src1.Q());
mov(ARMEmitter::VReg::v1.Q(), Src2.Q());
}

movz(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r4, Control);
movz(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r0, Control);

ldr(ARMEmitter::XReg::x5, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint32_t, uint64_t, uint64_t, uint64_t, uint64_t, uint16_t>(ARMEmitter::Reg::r5);
GenerateIndirectRuntimeCall<uint32_t, FEXCore::VectorRegType, FEXCore::VectorRegType, uint16_t>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r5);
blr(ARMEmitter::Reg::r1);
}

FillI32Result();
Expand Down
14 changes: 6 additions & 8 deletions unittests/InstructionCountCI/FlagM/HotBlocks.json
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,7 @@
]
},
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
"ExpectedInstructionCount": 38,
"ExpectedInstructionCount": 36,
"Comment": [
"A Hat In Time spends at least 5% CPU time in this instruction",
"Comes from vcruntime140.dll wcsstr"
Expand All @@ -776,13 +776,11 @@
"st1 {v2.2d, v3.2d}, [x0], #32",
"st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64",
"str x30, [x0], #16",
"mov x0, v16.d[0]",
"mov x1, v16.d[1]",
"mov x2, v17.d[0]",
"mov x3, v17.d[1]",
"mov w4, #0xd",
"ldr x5, [x28, #1760]",
"blr x5",
"mov v0.16b, v16.16b",
"mov v1.16b, v17.16b",
"mov w0, #0xd",
"ldr x1, [x28, #1760]",
"blr x1",
"ldr w4, [x28, #1000]",
"msr nzcv, x4",
"ldp x4, x7, [x28, #280]",
Expand Down
28 changes: 12 additions & 16 deletions unittests/InstructionCountCI/SSE42_Strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
]
},
"pcmpistrm xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 34,
"ExpectedInstructionCount": 32,
"Comment": [
"0x66 0x0f 0x3A 0x62"
],
Expand All @@ -144,13 +144,11 @@
"st1 {v2.2d, v3.2d}, [x0], #32",
"st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64",
"str x30, [x0], #16",
"mov x0, v16.d[0]",
"mov x1, v16.d[1]",
"mov x2, v17.d[0]",
"mov x3, v17.d[1]",
"mov w4, #0x0",
"ldr x5, [x28, #1760]",
"blr x5",
"mov v0.16b, v16.16b",
"mov v1.16b, v17.16b",
"mov w0, #0x0",
"ldr x1, [x28, #1760]",
"blr x1",
"ldr w4, [x28, #1000]",
"msr nzcv, x4",
"ldp x4, x7, [x28, #280]",
Expand All @@ -170,7 +168,7 @@
]
},
"pcmpistri xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 38,
"ExpectedInstructionCount": 36,
"Comment": [
"0x66 0x0f 0x3A 0x63"
],
Expand All @@ -186,13 +184,11 @@
"st1 {v2.2d, v3.2d}, [x0], #32",
"st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64",
"str x30, [x0], #16",
"mov x0, v16.d[0]",
"mov x1, v16.d[1]",
"mov x2, v17.d[0]",
"mov x3, v17.d[1]",
"mov w4, #0x0",
"ldr x5, [x28, #1760]",
"blr x5",
"mov v0.16b, v16.16b",
"mov v1.16b, v17.16b",
"mov w0, #0x0",
"ldr x1, [x28, #1760]",
"blr x1",
"ldr w4, [x28, #1000]",
"msr nzcv, x4",
"ldp x4, x7, [x28, #280]",
Expand Down
Loading