From 407e26bfee9536fdd6672589774c928450df33b3 Mon Sep 17 00:00:00 2001
From: Billy Laws
Date: Mon, 8 Jan 2024 16:33:36 +0000
Subject: [PATCH] FEXCore: Use TMP1-4 for values that need preserving across
 spills

The ARM64EC SRA layout will use x0-3 for x86_64 registers, as such any
values passed as arguments to C ABI functions need to be proxied through
the temporaries and moved into place as appropriate.
---
 .../Interface/Core/ArchHelpers/Arm64Emitter.h |   1 +
 .../Interface/Core/Dispatcher/Dispatcher.cpp  | 169 +++++++----
 .../Interface/Core/JIT/Arm64/ALUOps.cpp       |  48 ++---
 .../Interface/Core/JIT/Arm64/BranchOps.cpp    |  99 +++++-----
 .../Source/Interface/Core/JIT/Arm64/JIT.cpp   | 171 ++++++++++--------
 .../FlagM/SecondaryModRM.json                 |  28 +--
 unittests/InstructionCountCI/FlagM/x87.json   |  32 ++--
 .../InstructionCountCI/FlagM/x87_f64.json     |  24 +--
 .../InstructionCountCI/SecondaryModRM.json    |  28 +--
 unittests/InstructionCountCI/x87.json         |  32 ++--
 unittests/InstructionCountCI/x87_f64.json     |  24 +--
 11 files changed, 323 insertions(+), 333 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
index 21ce5adf58..abfdf8567d 100644
--- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
+++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
@@ -43,6 +43,7 @@ constexpr auto TMP1 = FEXCore::ARMEmitter::XReg::x0;
 constexpr auto TMP2 = FEXCore::ARMEmitter::XReg::x1;
 constexpr auto TMP3 = FEXCore::ARMEmitter::XReg::x2;
 constexpr auto TMP4 = FEXCore::ARMEmitter::XReg::x3;
+constexpr bool TMP_ABIARGS = true; // TMP{1-4} map to ABI arguments 0-3

 // Vector temporaries
 constexpr auto VTMP1 = FEXCore::ARMEmitter::VReg::v0;
diff --git a/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp b/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp
index 5b96be4dc3..db756602f7 100644
--- a/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp
+++ b/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp
@@ -103,80 +103,80 @@ void Dispatcher::EmitDispatcher() {
   AbsoluteLoopTopAddress = GetCursorAddress();

   // Load in our RIP
-  // Don't modify x2 since it contains our RIP once the block doesn't exist
-  auto RipReg = ARMEmitter::XReg::x2;
+  // Don't modify TMP3 since it contains our RIP once the block doesn't exist
+  auto RipReg = TMP3;
   ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip));

   // L1 Cache
-  ldr(ARMEmitter::XReg::x0, STATE_PTR(CpuStateFrame, Pointers.Common.L1Pointer));
+  ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.L1Pointer));

-  and_(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, RipReg.R(), LookupCache::L1_ENTRIES_MASK);
-  add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, ARMEmitter::Reg::r0, ARMEmitter::Reg::r3, ARMEmitter::ShiftType::LSL , 4);
-  ldp(ARMEmitter::XReg::x3, ARMEmitter::XReg::x0, ARMEmitter::Reg::r0, 0);
-  sub(ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, RipReg);
-  cbnz(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, &FullLookup);
+  and_(ARMEmitter::Size::i64Bit, TMP4, RipReg.R(), LookupCache::L1_ENTRIES_MASK);
+  add(ARMEmitter::Size::i64Bit, TMP1, TMP1, TMP4, ARMEmitter::ShiftType::LSL , 4);
+  ldp(TMP4, TMP1, TMP1, 0);
+  sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, RipReg);
+  cbnz(ARMEmitter::Size::i64Bit, TMP1, &FullLookup);

-  br(ARMEmitter::Reg::r3);
+  br(TMP4);

   // L1C check failed, do a full lookup
   Bind(&FullLookup);

   // This is the block cache lookup routine
   // It matches what is going on it LookupCache.h::FindBlock
-  ldr(ARMEmitter::XReg::x0, STATE_PTR(CpuStateFrame, Pointers.Common.L2Pointer));
+ ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.L2Pointer)); // Mask the address by the virtual address size so we can check for aliases uint64_t VirtualMemorySize = CTX->Config.VirtualMemSize; if (std::popcount(VirtualMemorySize) == 1) { - and_(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, RipReg.R(), VirtualMemorySize - 1); + and_(ARMEmitter::Size::i64Bit, TMP4, RipReg.R(), VirtualMemorySize - 1); } else { - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, VirtualMemorySize); - and_(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, RipReg.R(), ARMEmitter::Reg::r3); + LoadConstant(ARMEmitter::Size::i64Bit, TMP4, VirtualMemorySize); + and_(ARMEmitter::Size::i64Bit, TMP4, RipReg.R(), TMP4); } ARMEmitter::ForwardLabel NoBlock; { // Offset the address and add to our page pointer - lsr(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, ARMEmitter::Reg::r3, 12); + lsr(ARMEmitter::Size::i64Bit, TMP2, TMP4, 12); // Load the pointer from the offset - ldr(ARMEmitter::XReg::x0, ARMEmitter::Reg::r0, ARMEmitter::Reg::r1, ARMEmitter::ExtendedType::LSL_64, 3); + ldr(TMP1, TMP1, TMP2, ARMEmitter::ExtendedType::LSL_64, 3); // If page pointer is zero then we have no block - cbz(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, &NoBlock); + cbz(ARMEmitter::Size::i64Bit, TMP1, &NoBlock); // Steal the page offset - and_(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, ARMEmitter::Reg::r3, 0x0FFF); + and_(ARMEmitter::Size::i64Bit, TMP2, TMP4, 0x0FFF); // Shift the offset by the size of the block cache entry - add(ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, ARMEmitter::XReg::x1, ARMEmitter::ShiftType::LSL, (int)log2(sizeof(FEXCore::LookupCache::LookupCacheEntry))); + add(TMP1, TMP1, TMP2, ARMEmitter::ShiftType::LSL, (int)log2(sizeof(FEXCore::LookupCache::LookupCacheEntry))); // The the full LookupCacheEntry with a single LDP. // Check the guest address first to ensure it maps to the address we are currently at. // This fixes aliasing problems - ldp(ARMEmitter::XReg::x3, ARMEmitter::XReg::x1, ARMEmitter::Reg::r0, 0); + ldp(TMP4, TMP2, TMP1, 0); // If the guest address doesn't match, Compile the block. - sub(ARMEmitter::XReg::x1, ARMEmitter::XReg::x1, RipReg); - cbnz(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, &NoBlock); + sub(TMP2, TMP2, RipReg); + cbnz(ARMEmitter::Size::i64Bit, TMP2, &NoBlock); // Check the host address to see if it matches, else compile the block. 
- cbz(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, &NoBlock); + cbz(ARMEmitter::Size::i64Bit, TMP4, &NoBlock); // If we've made it here then we have a real compiled block { // update L1 cache - ldr(ARMEmitter::XReg::x0, STATE_PTR(CpuStateFrame, Pointers.Common.L1Pointer)); + ldr(TMP1, STATE_PTR(CpuStateFrame, Pointers.Common.L1Pointer)); - and_(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, RipReg.R(), LookupCache::L1_ENTRIES_MASK); - add(ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, ARMEmitter::XReg::x1, ARMEmitter::ShiftType::LSL, 4); - stp(ARMEmitter::XReg::x3, ARMEmitter::XReg::x2, ARMEmitter::Reg::r0); + and_(ARMEmitter::Size::i64Bit, TMP2, RipReg.R(), LookupCache::L1_ENTRIES_MASK); + add(TMP1, TMP1, TMP2, ARMEmitter::ShiftType::LSL, 4); + stp(TMP4, TMP3, TMP1); // Jump to the block - br(ARMEmitter::Reg::r3); + br(TMP4); } } @@ -212,17 +212,21 @@ void Dispatcher::EmitDispatcher() { blr(ARMEmitter::Reg::r2); } + if (!TMP_ABIARGS) { + mov(TMP1, ARMEmitter::XReg::x0); + } + FillStaticRegs(); - ldr(ARMEmitter::XReg::x1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); - sub(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x1, ARMEmitter::XReg::x1, 1); - str(ARMEmitter::XReg::x1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); + ldr(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); + sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 1); + str(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); // Trigger segfault if any deferred signals are pending - ldr(ARMEmitter::XReg::x1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalFaultAddress)); - str(ARMEmitter::XReg::zr, ARMEmitter::XReg::x1, 0); + ldr(TMP2, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalFaultAddress)); + str(ARMEmitter::XReg::zr, TMP2, 0); - br(ARMEmitter::Reg::r0); + br(TMP1); } // Need to create the block @@ -231,6 +235,10 @@ void Dispatcher::EmitDispatcher() { SpillStaticRegs(TMP1); + if (!TMP_ABIARGS) { + mov(ARMEmitter::XReg::x2, TMP3); + } + ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, 1); str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); @@ -250,9 +258,9 @@ void Dispatcher::EmitDispatcher() { FillStaticRegs(); - ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); - sub(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, 1); - str(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); + ldr(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); + sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 1); + str(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalRefCount)); // Trigger segfault if any deferred signals are pending ldr(TMP1, STATE, offsetof(FEXCore::Core::CPUState, DeferredSignalFaultAddress)); @@ -395,98 +403,45 @@ void Dispatcher::EmitDispatcher() { b(&LoopTop); } - { - LUDIVHandlerAddress = GetCursorAddress(); + auto EmitLongALUOpHandler = [&](auto R, auto Offset) { + auto Address = GetCursorAddress(); - PushDynamicRegsAndLR(ARMEmitter::Reg::r3); - SpillStaticRegs(ARMEmitter::Reg::r3); + PushDynamicRegsAndLR(TMP4); + SpillStaticRegs(TMP4); - ldr(ARMEmitter::XReg::x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUDIV)); - if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r3); - } - else { - 
blr(ARMEmitter::Reg::r3); + if (!TMP_ABIARGS) { + mov(ARMEmitter::XReg::x0, TMP1); + mov(ARMEmitter::XReg::x1, TMP2); + mov(ARMEmitter::XReg::x2, TMP3); } - FillStaticRegs(); - // Result is now in x0 - // Fix the stack and any values that were stepped on - PopDynamicRegsAndLR(); - - // Go back to our code block - ret(); - } - - { - LDIVHandlerAddress = GetCursorAddress(); - - PushDynamicRegsAndLR(ARMEmitter::Reg::r3); - SpillStaticRegs(ARMEmitter::Reg::r3); - - ldr(ARMEmitter::XReg::x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LDIV)); + ldr(ARMEmitter::XReg::x3, R, Offset); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(ARMEmitter::Reg::r3); } else { blr(ARMEmitter::Reg::r3); } - FillStaticRegs(); - // Result is now in x0 - // Fix the stack and any values that were stepped on - PopDynamicRegsAndLR(); - - // Go back to our code block - ret(); - } - - { - LUREMHandlerAddress = GetCursorAddress(); - - PushDynamicRegsAndLR(ARMEmitter::Reg::r3); - SpillStaticRegs(ARMEmitter::Reg::r3); - ldr(ARMEmitter::XReg::x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUREM)); - if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r3); - } - else { - blr(ARMEmitter::Reg::r3); + if (!TMP_ABIARGS) { + mov(TMP1, ARMEmitter::XReg::x0); } - FillStaticRegs(); - - // Result is now in x0 - // Fix the stack and any values that were stepped on - PopDynamicRegsAndLR(); - // Go back to our code block - ret(); - } - - { - LREMHandlerAddress = GetCursorAddress(); - - PushDynamicRegsAndLR(ARMEmitter::Reg::r3); - SpillStaticRegs(ARMEmitter::Reg::r3); - - ldr(ARMEmitter::XReg::x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LREM)); - - if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { - GenerateIndirectRuntimeCall(ARMEmitter::Reg::r3); - } - else { - blr(ARMEmitter::Reg::r3); - } FillStaticRegs(); - // Result is now in x0 // Fix the stack and any values that were stepped on PopDynamicRegsAndLR(); // Go back to our code block ret(); - } + return Address; + }; + + LUDIVHandlerAddress = EmitLongALUOpHandler(STATE_PTR(CpuStateFrame, Pointers.AArch64.LUDIV)); + LDIVHandlerAddress = EmitLongALUOpHandler(STATE_PTR(CpuStateFrame, Pointers.AArch64.LDIV)); + LUREMHandlerAddress = EmitLongALUOpHandler(STATE_PTR(CpuStateFrame, Pointers.AArch64.LUREM)); + LREMHandlerAddress = EmitLongALUOpHandler(STATE_PTR(CpuStateFrame, Pointers.AArch64.LREM)); Bind(&l_CTX); dc64(reinterpret_cast(CTX)); diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp index 30dbd410da..95f67911bd 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp @@ -861,18 +861,18 @@ DEF_OP(LDiv) { // Long divide { - mov(EmitSize, ARMEmitter::Reg::r0, Upper); - mov(EmitSize, ARMEmitter::Reg::r1, Lower); - mov(EmitSize, ARMEmitter::Reg::r2, Divisor); + mov(EmitSize, TMP1, Upper); + mov(EmitSize, TMP2, Lower); + mov(EmitSize, TMP3, Divisor); - ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LDIVHandler)); + ldr(TMP4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LDIVHandler)); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); - blr(ARMEmitter::Reg::r3); + blr(TMP4); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); // Move result to its destination register - mov(EmitSize, Dst, ARMEmitter::Reg::r0); + mov(EmitSize, Dst, TMP1); // Skip 64-bit path b(&LongDIVRet); @@ -929,18 +929,18 @@ 
DEF_OP(LUDiv) { // Long divide { - mov(EmitSize, ARMEmitter::Reg::r0, Upper); - mov(EmitSize, ARMEmitter::Reg::r1, Lower); - mov(EmitSize, ARMEmitter::Reg::r2, Divisor); + mov(EmitSize, TMP1, Upper); + mov(EmitSize, TMP2, Lower); + mov(EmitSize, TMP3, Divisor); - ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LUDIVHandler)); + ldr(TMP4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LUDIVHandler)); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); - blr(ARMEmitter::Reg::r3); + blr(TMP4); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); // Move result to its destination register - mov(EmitSize, Dst, ARMEmitter::Reg::r0); + mov(EmitSize, Dst, TMP1); // Skip 64-bit path b(&LongDIVRet); @@ -1005,18 +1005,18 @@ DEF_OP(LRem) { // Long divide { - mov(EmitSize, ARMEmitter::Reg::r0, Upper); - mov(EmitSize, ARMEmitter::Reg::r1, Lower); - mov(EmitSize, ARMEmitter::Reg::r2, Divisor); + mov(EmitSize, TMP1, Upper); + mov(EmitSize, TMP2, Lower); + mov(EmitSize, TMP3, Divisor); - ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LREMHandler)); + ldr(TMP4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LREMHandler)); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); - blr(ARMEmitter::Reg::r3); + blr(TMP4); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); // Move result to its destination register - mov(EmitSize, Dst, ARMEmitter::Reg::r0); + mov(EmitSize, Dst, TMP1); // Skip 64-bit path b(&LongDIVRet); @@ -1075,18 +1075,18 @@ DEF_OP(LURem) { // Long divide { - mov(EmitSize, ARMEmitter::Reg::r0, Upper); - mov(EmitSize, ARMEmitter::Reg::r1, Lower); - mov(EmitSize, ARMEmitter::Reg::r2, Divisor); + mov(EmitSize, TMP1, Upper); + mov(EmitSize, TMP2, Lower); + mov(EmitSize, TMP3, Divisor); - ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LUREMHandler)); + ldr(TMP4, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.AArch64.LUREMHandler)); str(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, -16); - blr(ARMEmitter::Reg::r3); + blr(TMP4); ldr(ARMEmitter::XReg::lr, ARMEmitter::Reg::rsp, 16); // Move result to its destination register - mov(EmitSize, Dst, ARMEmitter::Reg::r0); + mov(EmitSize, Dst, TMP1); // Skip 64-bit path b(&LongDIVRet); diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp index 9470d6bc7e..c982e84df5 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/BranchOps.cpp @@ -55,8 +55,8 @@ DEF_OP(ExitFunction) { if (IsInlineConstant(Op->NewRIP, &NewRIP) || IsInlineEntrypointOffset(Op->NewRIP, &NewRIP)) { ARMEmitter::SingleUseForwardLabel l_BranchHost; - ldr(ARMEmitter::XReg::x0, &l_BranchHost); - blr(ARMEmitter::Reg::r0); + ldr(TMP1, &l_BranchHost); + blr(TMP1); Bind(&l_BranchHost); dc64(ThreadState->CurrentFrame->Pointers.Common.ExitFunctionLinker); @@ -67,16 +67,16 @@ DEF_OP(ExitFunction) { auto RipReg = GetReg(Op->NewRIP.ID()); // L1 Cache - ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.L1Pointer)); + ldr(TMP1, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.L1Pointer)); - and_(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, RipReg, LookupCache::L1_ENTRIES_MASK); - add(ARMEmitter::XReg::x0, ARMEmitter::XReg::x0, ARMEmitter::XReg::x3, ARMEmitter::ShiftType::LSL, 4); + and_(ARMEmitter::Size::i64Bit, TMP4, RipReg, LookupCache::L1_ENTRIES_MASK); + add(TMP1, 
TMP1, TMP4, ARMEmitter::ShiftType::LSL, 4); // Note: sub+cbnz used over cmp+br to preserve flags. - ldp(ARMEmitter::XReg::x1, ARMEmitter::XReg::x0, ARMEmitter::Reg::r0, 0); - sub(TMP1, ARMEmitter::XReg::x0, RipReg.X()); + ldp(TMP2, TMP1, TMP1, 0); + sub(TMP1, TMP1, RipReg.X()); cbnz(ARMEmitter::Size::i64Bit, TMP1, &FullLookup); - br(ARMEmitter::Reg::r1); + br(TMP2); Bind(&FullLookup); ldr(TMP1, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.DispatcherLoopTop)); @@ -350,58 +350,58 @@ DEF_OP(ValidateCode) { int idx = 0; LoadConstant(ARMEmitter::Size::i64Bit, GetReg(Node), 0); - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, Entry + Op->Offset); - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, 1); + LoadConstant(ARMEmitter::Size::i64Bit, TMP1, Entry + Op->Offset); + LoadConstant(ARMEmitter::Size::i64Bit, TMP2, 1); const auto Dst = GetReg(Node); while (len >= 8) { - ldr(ARMEmitter::XReg::x2, ARMEmitter::Reg::r0, idx); - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, *(const uint32_t *)(OldCode + idx)); - cmp(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r3); - csel(ARMEmitter::Size::i64Bit, Dst, Dst, ARMEmitter::Reg::r1, ARMEmitter::Condition::CC_EQ); + ldr(ARMEmitter::XReg::x2, TMP1, idx); + LoadConstant(ARMEmitter::Size::i64Bit, TMP4, *(const uint32_t *)(OldCode + idx)); + cmp(ARMEmitter::Size::i64Bit, TMP3, TMP4); + csel(ARMEmitter::Size::i64Bit, Dst, Dst, TMP2, ARMEmitter::Condition::CC_EQ); len -= 8; idx += 8; } while (len >= 4) { - ldr(ARMEmitter::WReg::w2, ARMEmitter::Reg::r0, idx); - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, *(const uint32_t *)(OldCode + idx)); - cmp(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r3); - csel(ARMEmitter::Size::i64Bit, Dst, Dst, ARMEmitter::Reg::r1, ARMEmitter::Condition::CC_EQ); + ldr(ARMEmitter::WReg::w2, TMP1, idx); + LoadConstant(ARMEmitter::Size::i64Bit, TMP4, *(const uint32_t *)(OldCode + idx)); + cmp(ARMEmitter::Size::i32Bit, TMP3, TMP4); + csel(ARMEmitter::Size::i64Bit, Dst, Dst, TMP2, ARMEmitter::Condition::CC_EQ); len -= 4; idx += 4; } while (len >= 2) { - ldrh(ARMEmitter::Reg::r2, ARMEmitter::Reg::r0, idx); - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, *(const uint16_t *)(OldCode + idx)); - cmp(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r3); - csel(ARMEmitter::Size::i64Bit, Dst, Dst, ARMEmitter::Reg::r1, ARMEmitter::Condition::CC_EQ); + ldrh(TMP3, TMP1, idx); + LoadConstant(ARMEmitter::Size::i64Bit, TMP4, *(const uint16_t *)(OldCode + idx)); + cmp(ARMEmitter::Size::i32Bit, TMP3, TMP4); + csel(ARMEmitter::Size::i64Bit, Dst, Dst, TMP2, ARMEmitter::Condition::CC_EQ); len -= 2; idx += 2; } while (len >= 1) { - ldrb(ARMEmitter::Reg::r2, ARMEmitter::Reg::r0, idx); - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r3, *(const uint8_t *)(OldCode + idx)); - cmp(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r2, ARMEmitter::Reg::r3); - csel(ARMEmitter::Size::i64Bit, Dst, Dst, ARMEmitter::Reg::r1, ARMEmitter::Condition::CC_EQ); + ldrb(TMP3, TMP1, idx); + LoadConstant(ARMEmitter::Size::i64Bit, TMP4, *(const uint8_t *)(OldCode + idx)); + cmp(ARMEmitter::Size::i32Bit, TMP3, TMP4); + csel(ARMEmitter::Size::i64Bit, Dst, Dst, TMP2, ARMEmitter::Condition::CC_EQ); len -= 1; idx += 1; } } DEF_OP(ThreadRemoveCodeEntry) { + PushDynamicRegsAndLR(TMP4); + SpillStaticRegs(TMP4); + // Arguments are passed as follows: // X0: Thread // X1: RIP - - PushDynamicRegsAndLR(TMP1); - SpillStaticRegs(TMP1); - 
mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, STATE.R()); + LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, Entry); ldr(ARMEmitter::XReg::x2, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.ThreadRemoveCodeEntryFromJIT)); @@ -420,16 +420,23 @@ DEF_OP(ThreadRemoveCodeEntry) { DEF_OP(CPUID) { auto Op = IROp->C(); - PushDynamicRegsAndLR(TMP1); - SpillStaticRegs(TMP1); + mov(ARMEmitter::Size::i64Bit, TMP2, GetReg(Op->Function.ID())); + mov(ARMEmitter::Size::i64Bit, TMP3, GetReg(Op->Leaf.ID())); + + PushDynamicRegsAndLR(TMP4); + SpillStaticRegs(TMP4); // x0 = CPUID Handler // x1 = CPUID Function // x2 = CPUID Leaf ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.CPUIDObj)); ldr(ARMEmitter::XReg::x3, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.CPUIDFunction)); - mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, GetReg(Op->Function.ID())); - mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, GetReg(Op->Leaf.ID())); + + if (!TMP_ABIARGS) { + mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r1, TMP2); + mov(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r2, TMP3); + } + if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall<__uint128_t, void*, uint64_t, uint64_t>(ARMEmitter::Reg::r3); } @@ -437,6 +444,11 @@ DEF_OP(CPUID) { blr(ARMEmitter::Reg::r3); } + if (!TMP_ABIARGS) { + mov(ARMEmitter::Size::i64Bit, TMP1, ARMEmitter::Reg::r0); + mov(ARMEmitter::Size::i64Bit, TMP2, ARMEmitter::Reg::r1); + } + FillStaticRegs(); PopDynamicRegsAndLR(); @@ -444,21 +456,22 @@ DEF_OP(CPUID) { // Results are in x0, x1 // Results want to be in a i64v2 vector auto Dst = GetRegPair(Node); - mov(ARMEmitter::Size::i64Bit, Dst.first, ARMEmitter::Reg::r0); - mov(ARMEmitter::Size::i64Bit, Dst.second, ARMEmitter::Reg::r1); + mov(ARMEmitter::Size::i64Bit, Dst.first, TMP1); + mov(ARMEmitter::Size::i64Bit, Dst.second, TMP2); } DEF_OP(XGetBV) { auto Op = IROp->C(); - PushDynamicRegsAndLR(TMP1); - SpillStaticRegs(TMP1); + PushDynamicRegsAndLR(TMP4); + SpillStaticRegs(TMP4); + + mov(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, GetReg(Op->Function.ID())); // x0 = CPUID Handler // x1 = XCR Function ldr(ARMEmitter::XReg::x0, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.CPUIDObj)); ldr(ARMEmitter::XReg::x2, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.XCRFunction)); - mov(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, GetReg(Op->Function.ID())); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall(ARMEmitter::Reg::r2); } @@ -466,6 +479,10 @@ DEF_OP(XGetBV) { blr(ARMEmitter::Reg::r2); } + if (!TMP_ABIARGS) { + mov(ARMEmitter::Size::i64Bit, TMP1, ARMEmitter::Reg::r0); + } + FillStaticRegs(); PopDynamicRegsAndLR(); @@ -473,8 +490,8 @@ DEF_OP(XGetBV) { // Results are in x0 // Results want to be in a i32v2 vector auto Dst = GetRegPair(Node); - mov(ARMEmitter::Size::i32Bit, Dst.first, ARMEmitter::Reg::r0); - lsr(ARMEmitter::Size::i64Bit, Dst.second, ARMEmitter::Reg::r0, 32); + mov(ARMEmitter::Size::i32Bit, Dst.first, TMP1); + lsr(ARMEmitter::Size::i64Bit, Dst.second, TMP1, 32); } #undef DEF_OP diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp index b9a120ebb0..e13cbc28b1 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp @@ -84,6 +84,40 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { 
LOGMAN_MSG_A_FMT("Unhandled IR Op: {}", FEXCore::IR::GetName(IROp->Op)); #endif } else { + auto FillF80Result = [&]() { + if (!TMP_ABIARGS) { + mov(TMP1, ARMEmitter::XReg::x0); + mov(TMP2, ARMEmitter::XReg::x1); + } + + FillForABICall(Info.SupportsPreserveAllABI, true); + + const auto Dst = GetVReg(Node); + eor(Dst.Q(), Dst.Q(), Dst.Q()); + ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, TMP1); + ins(ARMEmitter::SubRegSize::i16Bit, Dst, 4, TMP2); + }; + + auto FillF64Result = [&]() { + if (!TMP_ABIARGS) { + mov(VTMP1.D(), ARMEmitter::DReg::d0); + } + FillForABICall(Info.SupportsPreserveAllABI, true); + + const auto Dst = GetVReg(Node); + mov(Dst.D(), VTMP1.D()); + }; + + auto FillI32Result = [&]() { + if (!TMP_ABIARGS) { + mov(TMP1.W(), ARMEmitter::WReg::w0); + } + FillForABICall(Info.SupportsPreserveAllABI, true); + + const auto Dst = GetReg(Node); + mov(Dst.W(), TMP1.W()); + }; + switch(Info.ABI) { case FABI_F80_I16_F32:{ SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); @@ -99,12 +133,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r1); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - eor(Dst.Q(), Dst.Q(), Dst.Q()); - ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, ARMEmitter::Reg::r0); - ins(ARMEmitter::SubRegSize::i16Bit, Dst, 4, ARMEmitter::Reg::r1); + FillF80Result(); } break; @@ -122,12 +151,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r1); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - eor(Dst.Q(), Dst.Q(), Dst.Q()); - ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, ARMEmitter::Reg::r0); - ins(ARMEmitter::SubRegSize::i16Bit, Dst, 4, ARMEmitter::Reg::r1); + FillF80Result(); } break; @@ -136,13 +160,13 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); const auto Src1 = GetReg(IROp->Args[0].ID()); - ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); if (Info.ABI == FABI_F80_I16_I16) { sxth(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, Src1); } else { mov(ARMEmitter::Size::i32Bit, ARMEmitter::Reg::r1, Src1); } + ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); ldr(ARMEmitter::XReg::x2, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { GenerateIndirectRuntimeCall<__uint128_t, uint16_t, uint32_t>(ARMEmitter::Reg::r2); @@ -151,12 +175,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r2); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - eor(Dst.Q(), Dst.Q(), Dst.Q()); - ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, ARMEmitter::Reg::r0); - ins(ARMEmitter::SubRegSize::i16Bit, Dst, 4, ARMEmitter::Reg::r1); + FillF80Result(); } break; @@ -177,10 +196,13 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r3); } + if (!TMP_ABIARGS) { + fmov(VTMP1.S(), ARMEmitter::SReg::s0); + } FillForABICall(Info.SupportsPreserveAllABI, true); const auto Dst = GetVReg(Node); - fmov(Dst.S(), ARMEmitter::SReg::s0); + fmov(Dst.S(), VTMP1.S()); } break; @@ -201,10 +223,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r3); } - FillForABICall(Info.SupportsPreserveAllABI, true); 
- - const auto Dst = GetVReg(Node); - mov(Dst.D(), ARMEmitter::DReg::d0); + FillF64Result(); } break; @@ -223,21 +242,24 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r1); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - mov(Dst.D(), ARMEmitter::DReg::d0); + FillF64Result(); } break; case FABI_F64_I16_F64_F64: { - SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); - const auto Src1 = GetVReg(IROp->Args[0].ID()); const auto Src2 = GetVReg(IROp->Args[1].ID()); - mov(ARMEmitter::DReg::d0, Src1.D()); - mov(ARMEmitter::DReg::d1, Src2.D()); + mov(VTMP1.D(), Src1.D()); + mov(VTMP2.D(), Src2.D()); + + SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); + + if (!TMP_ABIARGS) { + mov(ARMEmitter::DReg::d0, VTMP1.D()); + mov(ARMEmitter::DReg::d1, VTMP2.D()); + } + ldrh(ARMEmitter::WReg::w0, STATE, offsetof(FEXCore::Core::CPUState, FCW)); ldr(ARMEmitter::XReg::x1, STATE_PTR(CpuStateFrame, Pointers.Common.FallbackHandlerPointers[Info.HandlerIndex])); if (!CTX->Config.DisableVixlIndirectCalls) [[unlikely]] { @@ -247,10 +269,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r1); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - mov(Dst.D(), ARMEmitter::DReg::d0); + FillF64Result(); } break; @@ -271,10 +290,13 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r3); } + if (!TMP_ABIARGS) { + mov(TMP1, ARMEmitter::XReg::x0); + } FillForABICall(Info.SupportsPreserveAllABI, true); const auto Dst = GetReg(Node); - sxth(ARMEmitter::Size::i64Bit, Dst, ARMEmitter::Reg::r0); + sxth(ARMEmitter::Size::i64Bit, Dst, TMP1); } break; case FABI_I32_I16_F80:{ @@ -294,10 +316,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r3); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetReg(Node); - mov(ARMEmitter::Size::i32Bit, Dst, ARMEmitter::Reg::r0); + FillI32Result(); } break; case FABI_I64_I16_F80:{ @@ -316,10 +335,14 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { else { blr(ARMEmitter::Reg::r3); } + + if (!TMP_ABIARGS) { + mov(TMP1, ARMEmitter::XReg::x0); + } FillForABICall(Info.SupportsPreserveAllABI, true); const auto Dst = GetReg(Node); - mov(ARMEmitter::Size::i64Bit, Dst, ARMEmitter::Reg::r0); + mov(ARMEmitter::Size::i64Bit, Dst, TMP1); } break; case FABI_I64_I16_F80_F80:{ @@ -342,10 +365,14 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { else { blr(ARMEmitter::Reg::r5); } + + if (!TMP_ABIARGS) { + mov(TMP1, ARMEmitter::XReg::x0); + } FillForABICall(Info.SupportsPreserveAllABI, true); const auto Dst = GetReg(Node); - mov(ARMEmitter::Size::i64Bit, Dst, ARMEmitter::Reg::r0); + mov(ARMEmitter::Size::i64Bit, Dst, TMP1); } break; case FABI_F80_I16_F80:{ @@ -365,12 +392,7 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r3); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - eor(Dst.Q(), Dst.Q(), Dst.Q()); - ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, ARMEmitter::Reg::r0); - ins(ARMEmitter::SubRegSize::i16Bit, Dst, 4, ARMEmitter::Reg::r1); + FillF80Result(); } break; case FABI_F80_I16_F80_F80:{ @@ -394,27 +416,28 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { 
blr(ARMEmitter::Reg::r5); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetVReg(Node); - eor(Dst.Q(), Dst.Q(), Dst.Q()); - ins(ARMEmitter::SubRegSize::i64Bit, Dst, 0, ARMEmitter::Reg::r0); - ins(ARMEmitter::SubRegSize::i16Bit, Dst, 4, ARMEmitter::Reg::r1); + FillF80Result(); } break; case FABI_I32_I64_I64_I128_I128_I16: { - SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); - const auto Op = IROp->C(); + const auto SrcRAX = GetReg(Op->RAX.ID()); + const auto SrcRDX = GetReg(Op->RDX.ID()); + + mov(TMP1, SrcRAX.X()); + mov(TMP2, SrcRDX.X()); + + SpillForABICall(Info.SupportsPreserveAllABI, TMP3, true); + const auto Control = Op->Control; const auto Src1 = GetVReg(Op->LHS.ID()); const auto Src2 = GetVReg(Op->RHS.ID()); - const auto SrcRAX = GetReg(Op->RAX.ID()); - const auto SrcRDX = GetReg(Op->RDX.ID()); - mov(ARMEmitter::XReg::x0, SrcRAX.X()); - mov(ARMEmitter::XReg::x1, SrcRDX.X()); + if (!TMP_ABIARGS) { + mov(ARMEmitter::XReg::x0, TMP1); + mov(ARMEmitter::XReg::x1, TMP2); + } umov(ARMEmitter::Reg::r2, Src1, 0); umov(ARMEmitter::Reg::r3, Src1, 1); @@ -432,12 +455,9 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r7); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetReg(Node); - mov(Dst.W(), ARMEmitter::WReg::w0); - break; + FillI32Result(); } + break; case FABI_I32_I128_I128_I16: { SpillForABICall(Info.SupportsPreserveAllABI, TMP1, true); @@ -463,12 +483,9 @@ void Arm64JITCore::Op_Unhandled(IR::IROp_Header const *IROp, IR::NodeID Node) { blr(ARMEmitter::Reg::r5); } - FillForABICall(Info.SupportsPreserveAllABI, true); - - const auto Dst = GetReg(Node); - mov(Dst.W(), ARMEmitter::WReg::w0); - break; + FillI32Result(); } + break; case FABI_UNKNOWN: default: #if defined(ASSERTIONS_ENABLED) && ASSERTIONS_ENABLED @@ -486,8 +503,8 @@ static void DirectBlockDelinker(FEXCore::Core::CpuStateFrame *Frame, FEXCore::Co uintptr_t branch = (uintptr_t)(Record) - 8; FEXCore::ARMEmitter::Emitter emit((uint8_t*)(branch), 8); FEXCore::ARMEmitter::SingleUseForwardLabel l_BranchHost; - emit.ldr(FEXCore::ARMEmitter::XReg::x0, &l_BranchHost); - emit.blr(FEXCore::ARMEmitter::Reg::r0); + emit.ldr(TMP1, &l_BranchHost); + emit.blr(TMP1); emit.Bind(&l_BranchHost); emit.dc64(LinkerAddress); FEXCore::ARMEmitter::Emitter::ClearICache((void*)branch, 8); @@ -763,8 +780,8 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, if (vixl::aarch64::Assembler::IsImmAddSub(TotalSpillSlotsSize)) { sub(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, TotalSpillSlotsSize); } else { - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, TotalSpillSlotsSize); - sub(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::rsp, ARMEmitter::XReg::rsp, ARMEmitter::XReg::x0, ARMEmitter::ExtendedType::LSL_64, 0); + LoadConstant(ARMEmitter::Size::i64Bit, TMP1, TotalSpillSlotsSize); + sub(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::rsp, ARMEmitter::XReg::rsp, TMP1, ARMEmitter::ExtendedType::LSL_64, 0); } } @@ -910,8 +927,8 @@ void Arm64JITCore::ResetStack() { add(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::rsp, ARMEmitter::Reg::rsp, TotalSpillSlotsSize); } else { // Too big to fit in a 12bit immediate - LoadConstant(ARMEmitter::Size::i64Bit, ARMEmitter::Reg::r0, TotalSpillSlotsSize); - add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::rsp, ARMEmitter::XReg::rsp, ARMEmitter::XReg::x0, ARMEmitter::ExtendedType::LSL_64, 0); + LoadConstant(ARMEmitter::Size::i64Bit, TMP1, TotalSpillSlotsSize); + 
add(ARMEmitter::Size::i64Bit, ARMEmitter::XReg::rsp, ARMEmitter::XReg::rsp, TMP1, ARMEmitter::ExtendedType::LSL_64, 0); } } diff --git a/unittests/InstructionCountCI/FlagM/SecondaryModRM.json b/unittests/InstructionCountCI/FlagM/SecondaryModRM.json index b3a09f0174..a74055f7d0 100644 --- a/unittests/InstructionCountCI/FlagM/SecondaryModRM.json +++ b/unittests/InstructionCountCI/FlagM/SecondaryModRM.json @@ -18,14 +18,14 @@ "Comment": "0xF 0x01 /2 RM-0", "ExpectedArm64ASM": [ "sub sp, sp, #0xf0 (240)", - "mov x0, sp", - "st1 {v2.2d, v3.2d}, [x0], #32", - "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64", - "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", - "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", - "str x30, [x0]", - "mrs x0, nzcv", - "str w0, [x28, #728]", + "mov x3, sp", + "st1 {v2.2d, v3.2d}, [x3], #32", + "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x3], #64", + "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x3], #64", + "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x3], #64", + "str x30, [x3]", + "mrs x3, nzcv", + "str w3, [x28, #728]", "stp x4, x5, [x28, #8]", "stp x6, x7, [x28, #24]", "stp x8, x9, [x28, #40]", @@ -36,14 +36,14 @@ "stp x19, x29, [x28, #120]", "str x26, [x28, #752]", "str x27, [x28, #760]", - "add x0, x28, #0xc0 (192)", - "st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64", - "st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64", - "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0], #64", - "st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x0], #64", + "add x3, x28, #0xc0 (192)", + "st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #64", + "st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x3], #64", + "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x3], #64", + "st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x3], #64", + "mov w1, w5", "ldr x0, [x28, #1240]", "ldr x2, [x28, #1256]", - "mov w1, w5", "blr x2", "ldr w4, [x28, #728]", "msr nzcv, x4", diff --git a/unittests/InstructionCountCI/FlagM/x87.json b/unittests/InstructionCountCI/FlagM/x87.json index b4d7ffcdaf..d6c37e418d 100644 --- a/unittests/InstructionCountCI/FlagM/x87.json +++ b/unittests/InstructionCountCI/FlagM/x87.json @@ -8122,8 +8122,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8245,8 +8245,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8368,8 +8368,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8497,8 +8497,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8634,8 +8634,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8757,8 +8757,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, 
[x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8880,8 +8880,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -9003,8 +9003,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -18990,8 +18990,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19113,8 +19113,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19236,8 +19236,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19365,8 +19365,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19502,8 +19502,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19625,8 +19625,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19748,8 +19748,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19871,8 +19871,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", diff --git a/unittests/InstructionCountCI/FlagM/x87_f64.json b/unittests/InstructionCountCI/FlagM/x87_f64.json index 14a4b6db71..770663bd18 100644 --- a/unittests/InstructionCountCI/FlagM/x87_f64.json +++ b/unittests/InstructionCountCI/FlagM/x87_f64.json @@ -2376,6 +2376,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2400,8 +2402,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, 
v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1640]", "blr x1", @@ -2529,6 +2529,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v3.8b", + "mov v1.8b, v2.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2553,8 +2555,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v3.8b", - "mov v1.8b, v2.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1624]", "blr x1", @@ -2629,6 +2629,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2653,8 +2655,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1656]", "blr x1", @@ -2724,6 +2724,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2748,8 +2750,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1648]", "blr x1", @@ -2804,6 +2804,8 @@ "mov x20, #0x3ff0000000000000", "fmov d4, x20", "fadd d2, d2, d4", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2828,8 +2830,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1640]", "blr x1", @@ -3028,6 +3028,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -3052,8 +3054,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1664]", "blr x1", diff --git a/unittests/InstructionCountCI/SecondaryModRM.json b/unittests/InstructionCountCI/SecondaryModRM.json index c192b6d354..7cbc2234b4 100644 --- a/unittests/InstructionCountCI/SecondaryModRM.json +++ b/unittests/InstructionCountCI/SecondaryModRM.json @@ -18,14 +18,14 @@ "Comment": "0xF 0x01 /2 RM-0", "ExpectedArm64ASM": [ "sub sp, sp, #0xf0 (240)", - "mov x0, sp", - "st1 {v2.2d, v3.2d}, [x0], #32", - "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64", - "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", - "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", - "str x30, [x0]", - "mrs x0, nzcv", - "str w0, [x28, #728]", + "mov x3, sp", + "st1 {v2.2d, v3.2d}, [x3], #32", + "st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x3], #64", + "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x3], #64", + "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x3], #64", + "str x30, [x3]", + "mrs x3, nzcv", + "str w3, [x28, #728]", "stp x4, x5, [x28, #8]", "stp x6, x7, [x28, #24]", "stp x8, x9, [x28, #40]", @@ -36,14 +36,14 @@ "stp x19, x29, [x28, #120]", "str x26, [x28, #752]", "str x27, [x28, #760]", - "add x0, x28, #0xc0 (192)", - "st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64", - "st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64", - "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x0], 
#64", - "st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x0], #64", + "add x3, x28, #0xc0 (192)", + "st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x3], #64", + "st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x3], #64", + "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x3], #64", + "st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [x3], #64", + "mov w1, w5", "ldr x0, [x28, #1240]", "ldr x2, [x28, #1256]", - "mov w1, w5", "blr x2", "ldr w4, [x28, #728]", "msr nzcv, x4", diff --git a/unittests/InstructionCountCI/x87.json b/unittests/InstructionCountCI/x87.json index 27a74841f3..245ba23545 100644 --- a/unittests/InstructionCountCI/x87.json +++ b/unittests/InstructionCountCI/x87.json @@ -8121,8 +8121,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8244,8 +8244,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8367,8 +8367,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8496,8 +8496,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8633,8 +8633,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8756,8 +8756,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -8879,8 +8879,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -9002,8 +9002,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "mov w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1440]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19005,8 +19005,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19128,8 +19128,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19251,8 +19251,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, 
[x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19380,8 +19380,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19517,8 +19517,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19640,8 +19640,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19763,8 +19763,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", @@ -19886,8 +19886,8 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "ldrh w0, [x28, #1152]", "sxth w1, w21", + "ldrh w0, [x28, #1152]", "ldr x2, [x28, #1432]", "blr x2", "ld1 {v2.2d, v3.2d}, [sp], #32", diff --git a/unittests/InstructionCountCI/x87_f64.json b/unittests/InstructionCountCI/x87_f64.json index 286e08b0db..91758fdc2b 100644 --- a/unittests/InstructionCountCI/x87_f64.json +++ b/unittests/InstructionCountCI/x87_f64.json @@ -2394,6 +2394,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2418,8 +2420,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1640]", "blr x1", @@ -2547,6 +2547,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v3.8b", + "mov v1.8b, v2.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2571,8 +2573,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v3.8b", - "mov v1.8b, v2.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1624]", "blr x1", @@ -2647,6 +2647,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2671,8 +2673,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1656]", "blr x1", @@ -2742,6 +2742,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2766,8 +2768,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1648]", "blr x1", @@ -2822,6 +2822,8 @@ "mov x20, 
#0x3ff0000000000000", "fmov d4, x20", "fadd d2, d2, d4", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -2846,8 +2848,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1640]", "blr x1", @@ -3046,6 +3046,8 @@ "ldr d2, [x0, #768]", "add x0, x28, x21, lsl #4", "ldr d3, [x0, #768]", + "mov v0.8b, v2.8b", + "mov v1.8b, v3.8b", "mrs x0, nzcv", "str w0, [x28, #728]", "stp x4, x5, [x28, #8]", @@ -3070,8 +3072,6 @@ "st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64", "st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64", "str x30, [x0]", - "mov v0.8b, v2.8b", - "mov v1.8b, v3.8b", "ldrh w0, [x28, #1152]", "ldr x1, [x28, #1664]", "blr x1",