Skip to content

Commit

Permalink
Merge pull request #4347 from Sonicadvance1/pcmpistri_vector
Browse files Browse the repository at this point in the history
FEXCore: Keep PCMPISTRI arguments in vectors longer
  • Loading branch information
Sonicadvance1 authored Feb 11, 2025
2 parents 6a39a8d + 549cdc4 commit 1b144ba
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 72 deletions.
17 changes: 5 additions & 12 deletions FEXCore/Source/Common/SoftFloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,7 @@
#include <cstring>
#include <stdint.h>

#ifdef _M_ARM_64
// Can't use uint8x16_t directly from arm_neon.h here.
// Overrides softfloat-3e's defines which causes problems.
using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t;
#elif defined(_M_X86_64)
#include <xmmintrin.h>
using VectorRegType = __m128i;
#endif
#include "Common/VectorRegType.h"

extern "C" {
#include "SoftFloat-3e/platform.h"
Expand Down Expand Up @@ -485,8 +478,8 @@ struct FEX_PACKED X80SoftFloat {
return FEXCore::BitCast<double>(Result);
}

VectorRegType ToVector() const {
VectorRegType Ret {};
FEXCore::VectorRegType ToVector() const {
FEXCore::VectorRegType Ret {};
memcpy(&Ret, this, sizeof(*this));
return Ret;
}
Expand Down Expand Up @@ -582,7 +575,7 @@ struct FEX_PACKED X80SoftFloat {
*this = i32_to_extF80(rhs);
}

X80SoftFloat(const VectorRegType rhs) {
X80SoftFloat(const FEXCore::VectorRegType rhs) {
memcpy(this, &rhs, sizeof(*this));
}

Expand All @@ -592,7 +585,7 @@ struct FEX_PACKED X80SoftFloat {
Sign = rhs.signExp >> 15;
}

operator VectorRegType() const {
operator FEXCore::VectorRegType() const {
return ToVector();
}

Expand Down
16 changes: 16 additions & 0 deletions FEXCore/Source/Common/VectorRegType.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// SPDX-License-Identifier: MIT
#pragma once

#ifdef _M_X86_64
#include <xmmintrin.h>
#endif

namespace FEXCore {
#ifdef _M_ARM_64
// Can't use uint8x16_t directly from arm_neon.h here.
// Overrides softfloat-3e's defines which causes problems.
using VectorRegType = __attribute__((neon_vector_type(16))) uint8_t;
#elif defined(_M_X86_64)
using VectorRegType = __m128i;
#endif
} // namespace FEXCore
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ bool InterpreterOps::GetFallbackHandler(bool SupportsPreserveAllABI, const IR::I
SupportsPreserveAllABI};
return true;
case IR::OP_VPCMPISTRX:
*Info = {FABI_I32_I128_I128_I16, (void*)&FEXCore::CPU::OpHandlers<IR::OP_VPCMPISTRX>::handle, Core::OPINDEX_VPCMPISTRX, SupportsPreserveAllABI};
*Info = {FABI_I32_V128_V128_I16, (void*)&FEXCore::CPU::OpHandlers<IR::OP_VPCMPISTRX>::handle, Core::OPINDEX_VPCMPISTRX, SupportsPreserveAllABI};
return true;

default: break;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: MIT
#include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h"
#include "Interface/Core/Interpreter/Fallbacks/VectorFallbacks.h"
#include "Interface/IR/IR.h"

Expand All @@ -11,12 +10,11 @@

namespace FEXCore::CPU {
#ifdef _M_ARM_64
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& data, uint16_t control) {
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) {
const auto is_using_words = (control & 1) != 0;

if (is_using_words) {
uint16x8_t a {};
memcpy(&a, &data, sizeof(a));
uint16x8_t a = vreinterpretq_u16_u8(data);
uint16x8_t VIndexes {};
const uint16x8_t VIndex16 = vdupq_n_u16(8);
uint16_t Indexes[8] = {
Expand All @@ -27,21 +25,19 @@ FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& da
auto SelectResult = vbslq_u16(MaskResult, VIndexes, VIndex16);
return vminvq_u16(SelectResult);
} else {
uint8x16_t a {};
memcpy(&a, &data, sizeof(a));
uint8x16_t VIndexes {};
const uint8x16_t VIndex16 = vdupq_n_u8(16);
uint8_t Indexes[16] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
memcpy(&VIndexes, Indexes, sizeof(VIndexes));
auto MaskResult = vceqzq_u8(a);
auto MaskResult = vceqzq_u8(data);
auto SelectResult = vbslq_u8(MaskResult, VIndexes, VIndex16);
return vminvq_u8(SelectResult);
}
}
#else
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& data, uint16_t control) {
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(FEXCore::VectorRegType data, uint16_t control) {
const auto* data_u8 = reinterpret_cast<const uint8_t*>(&data);
const auto is_using_words = (control & 1) != 0;

Expand Down Expand Up @@ -80,12 +76,16 @@ FEXCORE_PRESERVE_ALL_ATTR static int32_t GetImplicitLength(const __uint128_t& da
// to be the max length possible for the given character size specified
// in the control flags (16 characters for 8-bit, and 8 characters for 16-bit).
//
FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers<IR::OP_VPCMPISTRX>::handle(__uint128_t lhs, __uint128_t rhs, uint16_t control) {
FEXCORE_PRESERVE_ALL_ATTR uint32_t OpHandlers<IR::OP_VPCMPISTRX>::handle(FEXCore::VectorRegType lhs, FEXCore::VectorRegType rhs, uint16_t control) {
// Subtract by 1 in order to make validity limits 0-based
const auto valid_lhs = GetImplicitLength(lhs, control) - 1;
const auto valid_rhs = GetImplicitLength(rhs, control) - 1;
__uint128_t lhs_i;
memcpy(&lhs_i, &lhs, sizeof(lhs_i));
__uint128_t rhs_i;
memcpy(&rhs_i, &rhs, sizeof(rhs_i));

return OpHandlers<IR::OP_VPCMPESTRX>::MainBody(lhs, valid_lhs, rhs, valid_rhs, control);
return OpHandlers<IR::OP_VPCMPESTRX>::MainBody(lhs_i, valid_lhs, rhs_i, valid_rhs, control);
}

} // namespace FEXCore::CPU
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "Interface/Core/Interpreter/Fallbacks/FallbackOpHandler.h"
#include "Interface/IR/IR.h"
#include "Common/VectorRegType.h"

namespace FEXCore::CPU {

Expand Down Expand Up @@ -343,7 +344,7 @@ struct OpHandlers<IR::OP_VPCMPESTRX> {

template<>
struct OpHandlers<IR::OP_VPCMPISTRX> {
FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(__uint128_t lhs, __uint128_t rhs, uint16_t control);
FEXCORE_PRESERVE_ALL_ATTR static uint32_t handle(VectorRegType lhs, VectorRegType rhs, uint16_t control);
};

} // namespace FEXCore::CPU
4 changes: 1 addition & 3 deletions FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: MIT
#pragma once

#include <array>
#include <cstddef>
#include <cstdint>

#include <FEXCore/Core/CoreState.h>
Expand Down Expand Up @@ -31,7 +29,7 @@ enum FallbackABI {
FABI_F80_I16_F80,
FABI_F80_I16_F80_F80,
FABI_I32_I64_I64_I128_I128_I16,
FABI_I32_I128_I128_I16,
FABI_I32_V128_V128_I16,
};

struct FallbackInfo {
Expand Down
45 changes: 24 additions & 21 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW));
ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, float>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, float>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -142,7 +142,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW));
ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, double>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, double>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -163,7 +163,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW));
ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, uint32_t>(ARMEmitter::Reg::r2);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, uint32_t>(ARMEmitter::Reg::r2);
} else {
blr(ARMEmitter::Reg::r2);
}
Expand All @@ -181,7 +181,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<float, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<float, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -205,7 +205,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<double, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<double, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand Down Expand Up @@ -265,7 +265,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint32_t, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint32_t, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -288,7 +288,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint32_t, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint32_t, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -305,7 +305,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint64_t, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint64_t, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand Down Expand Up @@ -336,7 +336,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint64_t, uint16_t, VectorRegType, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<uint64_t, uint16_t, FEXCore::VectorRegType, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -359,7 +359,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
mov(ARMEmitter::VReg::v0.Q(), Src1.Q());

if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand All @@ -384,7 +384,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<VectorRegType, uint16_t, VectorRegType, VectorRegType>(ARMEmitter::Reg::r1);
GenerateIndirectRuntimeCall<FEXCore::VectorRegType, uint16_t, FEXCore::VectorRegType, FEXCore::VectorRegType>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r1);
}
Expand Down Expand Up @@ -428,7 +428,7 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {

FillI32Result();
} break;
case FABI_I32_I128_I128_I16: {
case FABI_I32_V128_V128_I16: {
SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true);

const auto Op = IROp->C<IR::IROp_VPCMPISTRX>();
Expand All @@ -437,19 +437,22 @@ void Arm64JITCore::Op_Unhandled(const IR::IROp_Header* IROp, IR::NodeID Node) {
const auto Src2 = GetVReg(Op->RHS.ID());
const auto Control = Op->Control;

umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r0, Src1, 0);
umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r1, Src1, 1);

umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r2, Src2, 0);
umov<ARMEmitter::SubRegSize::i64Bit>(ARMEmitter::Reg::r3, Src2, 1);
if (!TMP_ABIARGS) {
mov(VTMP1.Q(), Src1.Q());
mov(ARMEmitter::VReg::v1.Q(), Src2.Q());
mov(ARMEmitter::VReg::v0.Q(), VTMP1.Q());
} else {
mov(ARMEmitter::VReg::v0.Q(), Src1.Q());
mov(ARMEmitter::VReg::v1.Q(), Src2.Q());
}

movz(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r4, Control);
movz(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r0, Control);

ldr(ARMEmitter::XReg::x5, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex]));
if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] {
GenerateIndirectRuntimeCall<uint32_t, uint64_t, uint64_t, uint64_t, uint64_t, uint16_t>(ARMEmitter::Reg::r5);
GenerateIndirectRuntimeCall<uint32_t, FEXCore::VectorRegType, FEXCore::VectorRegType, uint16_t>(ARMEmitter::Reg::r1);
} else {
blr(ARMEmitter::Reg::r5);
blr(ARMEmitter::Reg::r1);
}

FillI32Result();
Expand Down
14 changes: 6 additions & 8 deletions unittests/InstructionCountCI/FlagM/HotBlocks.json
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,7 @@
]
},
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
"ExpectedInstructionCount": 38,
"ExpectedInstructionCount": 36,
"Comment": [
"A Hat In Time spends at least 5% CPU time in this instruction",
"Comes from vcruntime140.dll wcsstr"
Expand All @@ -776,13 +776,11 @@
"st1 {v2.2d, v3.2d}, [x0], #32",
"st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64",
"str x30, [x0], #16",
"mov x0, v16.d[0]",
"mov x1, v16.d[1]",
"mov x2, v17.d[0]",
"mov x3, v17.d[1]",
"mov w4, #0xd",
"ldr x5, [x28, #1760]",
"blr x5",
"mov v0.16b, v16.16b",
"mov v1.16b, v17.16b",
"mov w0, #0xd",
"ldr x1, [x28, #1760]",
"blr x1",
"ldr w4, [x28, #1000]",
"msr nzcv, x4",
"ldp x4, x7, [x28, #280]",
Expand Down
28 changes: 12 additions & 16 deletions unittests/InstructionCountCI/SSE42_Strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
]
},
"pcmpistrm xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 34,
"ExpectedInstructionCount": 32,
"Comment": [
"0x66 0x0f 0x3A 0x62"
],
Expand All @@ -144,13 +144,11 @@
"st1 {v2.2d, v3.2d}, [x0], #32",
"st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64",
"str x30, [x0], #16",
"mov x0, v16.d[0]",
"mov x1, v16.d[1]",
"mov x2, v17.d[0]",
"mov x3, v17.d[1]",
"mov w4, #0x0",
"ldr x5, [x28, #1760]",
"blr x5",
"mov v0.16b, v16.16b",
"mov v1.16b, v17.16b",
"mov w0, #0x0",
"ldr x1, [x28, #1760]",
"blr x1",
"ldr w4, [x28, #1000]",
"msr nzcv, x4",
"ldp x4, x7, [x28, #280]",
Expand All @@ -170,7 +168,7 @@
]
},
"pcmpistri xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 38,
"ExpectedInstructionCount": 36,
"Comment": [
"0x66 0x0f 0x3A 0x63"
],
Expand All @@ -186,13 +184,11 @@
"st1 {v2.2d, v3.2d}, [x0], #32",
"st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64",
"str x30, [x0], #16",
"mov x0, v16.d[0]",
"mov x1, v16.d[1]",
"mov x2, v17.d[0]",
"mov x3, v17.d[1]",
"mov w4, #0x0",
"ldr x5, [x28, #1760]",
"blr x5",
"mov v0.16b, v16.16b",
"mov v1.16b, v17.16b",
"mov w0, #0x0",
"ldr x1, [x28, #1760]",
"blr x1",
"ldr w4, [x28, #1000]",
"msr nzcv, x4",
"ldp x4, x7, [x28, #280]",
Expand Down

0 comments on commit 1b144ba

Please sign in to comment.