From 136fa78825c002c673fc733dbd0ee19864e94547 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 16 Dec 2023 20:42:37 -0800 Subject: [PATCH 1/7] FEXCore: Implements an efficient spin-loop API This will only be used internally inside of FEXCore for efficient shared codecach backpatch spin-loops. --- FEXCore/Source/CMakeLists.txt | 1 + FEXCore/Source/Utils/FutexSpinWait.cpp | 27 ++ FEXCore/Source/Utils/FutexSpinWait.h | 266 +++++++++++++++++++ FEXCore/unittests/APITests/FutexSpinTest.cpp | 85 ++++++ 4 files changed, 379 insertions(+) create mode 100644 FEXCore/Source/Utils/FutexSpinWait.cpp create mode 100644 FEXCore/Source/Utils/FutexSpinWait.h create mode 100644 FEXCore/unittests/APITests/FutexSpinTest.cpp diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt index 36b820dc25..d3381af54c 100644 --- a/FEXCore/Source/CMakeLists.txt +++ b/FEXCore/Source/CMakeLists.txt @@ -5,6 +5,7 @@ set (FEXCORE_BASE_SRCS Utils/Allocator.cpp Utils/CPUInfo.cpp Utils/FileLoading.cpp + Utils/FutexSpinWait.cpp Utils/ForcedAssert.cpp Utils/LogManager.cpp ) diff --git a/FEXCore/Source/Utils/FutexSpinWait.cpp b/FEXCore/Source/Utils/FutexSpinWait.cpp new file mode 100644 index 0000000000..057dea917e --- /dev/null +++ b/FEXCore/Source/Utils/FutexSpinWait.cpp @@ -0,0 +1,27 @@ +#include "Utils/FutexSpinWait.h" + +namespace FEXCore::Utils::FutexSpinWait { +#ifdef _M_ARM_64 + constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL; + + static uint32_t GetCycleCounterFrequency() { + uint64_t Result{}; + __asm("mrs %[Res], CNTFRQ_EL0" + : [Res] "=r" (Result)); + return Result; + } + + static uint64_t CalculateCyclesPerNanosecond() { + // Snapdragon devices historically use a 19.2Mhz cycle counter frequency + // This means that the number of cycles per nanosecond ends up being 52.0833... + // + // ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz. + // This means the number of cycles per nanosecond ends up being 1. + uint64_t CounterFrequency = GetCycleCounterFrequency(); + return NanosecondsInSecond / CounterFrequency; + } + + uint32_t CycleCounterFrequency = GetCycleCounterFrequency(); + uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond(); +#endif +} diff --git a/FEXCore/Source/Utils/FutexSpinWait.h b/FEXCore/Source/Utils/FutexSpinWait.h new file mode 100644 index 0000000000..7bb53561d5 --- /dev/null +++ b/FEXCore/Source/Utils/FutexSpinWait.h @@ -0,0 +1,266 @@ +#include +#include +#include +#include + +namespace FEXCore::Utils::FutexSpinWait { + /** + * @brief This provides routines to implement implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces. + * + * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact + * by putting the CPU in to a lower-power state using WFE. + * On platforms tested, WFE will put the CPU in to a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time + * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit + * that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits. + * + * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using. + * + * It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this implementation is + * slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient. + * + * On non-ARM platforms it is truly a spin-loop, which is okay for debugging only. + */ +#ifdef _M_ARM_64 + +#define SPINLOOP_BODY(LoadExclusiveOp, LoadAtomicOp, RegSize) \ + /* Prime the exclusive monitor with the passed in address. */ \ + #LoadExclusiveOp " %" #RegSize "[Tmp], [%[Futex]]; \ + /* WFE will wait for either the memory to change or spurious wake-up. */ \ + wfe; \ + /* Load with acquire to get the result of memory. */ \ + " #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; " + +#define SPINLOOP_8BIT SPINLOOP_BODY(ldaxrb, ldarb, w) +#define SPINLOOP_16BIT SPINLOOP_BODY(ldaxrh, ldarh, w) +#define SPINLOOP_32BIT SPINLOOP_BODY(ldaxr, ldar, w) +#define SPINLOOP_64BIT SPINLOOP_BODY(ldaxr, ldar, x) + + extern uint32_t CycleCounterFrequency; + extern uint64_t CyclesPerNanosecond; + + ///< Get the raw cycle counter which is synchronizing. + /// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature. + static inline uint64_t GetCycleCounter() { + uint64_t Result{}; + __asm volatile(R"( + isb; + mrs %[Res], CNTVCT_EL0; + )" + : [Res] "=r" (Result)); + return Result; + } + + ///< Converts nanoseconds to number of cycles. + /// If the cycle counter is 1Ghz then this is a direct 1:1 map. + static inline uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) { + const auto NanosecondCount = Nanoseconds.count(); + return NanosecondCount / CyclesPerNanosecond; + } + + template + static inline void Wait(T *Futex, TT ExpectedValue) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + T Tmp{}; + T Result = AtomicFutex->load(); + + // Early exit if possible. + if (Result == ExpectedValue) return; + + do { + if constexpr (sizeof(T) == 1) { + __asm volatile(SPINLOOP_8BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else if constexpr (sizeof(T) == 2) { + __asm volatile(SPINLOOP_16BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else if constexpr (sizeof(T) == 4) { + __asm volatile(SPINLOOP_32BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else if constexpr (sizeof(T) == 8) { + __asm volatile(SPINLOOP_64BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else { + static_assert(!std::is_same_v, "Invalid"); + } + } while (Result != ExpectedValue); + } + + template + void Wait(uint8_t*, uint8_t); + template + void Wait(uint16_t*, uint16_t); + template + void Wait(uint32_t*, uint32_t); + template + void Wait(uint64_t*, uint64_t); + + template + static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + + T Tmp{}; + T Result = AtomicFutex->load(); + + // Early exit if possible. + if (Result == ExpectedValue) return true; + + const auto TimeoutCycles = ConvertNanosecondsToCycles(Timeout); + const auto Begin = GetCycleCounter(); + + do { + if constexpr (sizeof(T) == 1) { + __asm volatile(SPINLOOP_8BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else if constexpr (sizeof(T) == 2) { + __asm volatile(SPINLOOP_16BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else if constexpr (sizeof(T) == 4) { + __asm volatile(SPINLOOP_32BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else if constexpr (sizeof(T) == 8) { + __asm volatile(SPINLOOP_64BIT + : [Result] "=r" (Result) + , [Tmp] "=r" (Tmp) + : [Futex] "r" (Futex) + , [ExpectedValue] "r" (ExpectedValue) + : "memory"); + } + else { + static_assert(!std::is_same_v, "Invalid"); + } + + const auto CurrentCycleCounter = GetCycleCounter(); + if ((CurrentCycleCounter - Begin) >= TimeoutCycles) { + // Couldn't get value before timeout. + return false; + } + } while (Result != ExpectedValue); + + // We got our result. + return true; + } + + template + bool Wait(uint8_t*, uint8_t, std::chrono::nanoseconds const &); + template + bool Wait(uint16_t*, uint16_t, std::chrono::nanoseconds const &); + template + bool Wait(uint32_t*, uint32_t, std::chrono::nanoseconds const &); + template + bool Wait(uint64_t*, uint64_t, std::chrono::nanoseconds const &); + +#else + template + static inline void Wait(T *Futex, TT ExpectedValue) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + T Tmp{}; + T Result = AtomicFutex->load(); + + // Early exit if possible. + if (Result == ExpectedValue) return; + + do { + Result = AtomicFutex->load(); + } while (Result != ExpectedValue); + } + + template + static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + + T Tmp{}; + T Result = AtomicFutex->load(); + + // Early exit if possible. + if (Result == ExpectedValue) return true; + + const auto Begin = std::chrono::high_resolution_clock::now(); + + do { + Result = AtomicFutex->load(); + + const auto CurrentCycleCounter = std::chrono::high_resolution_clock::now(); + if ((CurrentCycleCounter - Begin) >= Timeout) { + // Couldn't get value before timeout. + return false; + } + } while (Result != ExpectedValue); + + // We got our result. + return true; + } +#endif + + template + static inline void lock(T *Futex) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + T Expected{}; + T Desired {1}; + + // Try to CAS immediately. + if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return; + + do { + // Wait until the futex is unlocked. + Wait(Futex, 0); + } while (!AtomicFutex->compare_exchange_strong(Expected, Desired)); + } + + template + static inline bool try_lock(T *Futex) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + T Expected{}; + T Desired {1}; + + // Try to CAS immediately. + if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return true; + + return false; + } + + template + static inline void unlock(T *Futex) { + std::atomic *AtomicFutex = reinterpret_cast*>(Futex); + AtomicFutex->store(0); + } + +#undef SPINLOOP_8BIT +#undef SPINLOOP_16BIT +#undef SPINLOOP_32BIT +#undef SPINLOOP_64BIT +} diff --git a/FEXCore/unittests/APITests/FutexSpinTest.cpp b/FEXCore/unittests/APITests/FutexSpinTest.cpp new file mode 100644 index 0000000000..9090992540 --- /dev/null +++ b/FEXCore/unittests/APITests/FutexSpinTest.cpp @@ -0,0 +1,85 @@ +#include "Utils/FutexSpinWait.h" +#include +#include +#include + +constexpr auto SleepAmount = std::chrono::milliseconds(250); + +TEST_CASE("FutexSpin-Timed-8bit") { + uint8_t Test{}; + + auto now = std::chrono::high_resolution_clock::now(); + FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + auto end = std::chrono::high_resolution_clock::now(); + auto diff = end - now; + + // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late. + REQUIRE(std::chrono::duration_cast(diff) >= std::chrono::duration_cast(SleepAmount)); +} + +TEST_CASE("FutexSpin-Sleep-8bit") { + constexpr auto SleepAmount = std::chrono::seconds(1); + + uint8_t Test{}; + std::atomic ActualSpinLoop{}; + std::chrono::nanoseconds SleptAmount; + + std::thread t([&Test, &SleptAmount, &ActualSpinLoop]() { + auto now = std::chrono::high_resolution_clock::now(); + ActualSpinLoop.store(1); + FEXCore::Utils::FutexSpinWait::Wait(&Test, 1); + auto end = std::chrono::high_resolution_clock::now(); + SleptAmount = end - now; + }); + + // Wait until the second thread lets us know to stop waiting sleeping. + while(ActualSpinLoop.load() == 0); + + // sleep this thread for the sleep amount. + std::this_thread::sleep_for(SleepAmount); + + // Set the futex + FEXCore::Utils::FutexSpinWait::lock(&Test); + + // Wait for the thread to get done. + t.join(); + + // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late. + REQUIRE(SleptAmount >= std::chrono::duration_cast(SleepAmount)); +} + +TEST_CASE("FutexSpin-Timed-16bit") { + uint16_t Test{}; + + auto now = std::chrono::high_resolution_clock::now(); + FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + auto end = std::chrono::high_resolution_clock::now(); + auto diff = end - now; + + // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late. + REQUIRE(std::chrono::duration_cast(diff) >= std::chrono::duration_cast(SleepAmount)); +} + +TEST_CASE("FutexSpin-Timed-32bit") { + uint32_t Test{}; + + auto now = std::chrono::high_resolution_clock::now(); + FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + auto end = std::chrono::high_resolution_clock::now(); + auto diff = end - now; + + // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late. + REQUIRE(std::chrono::duration_cast(diff) >= std::chrono::duration_cast(SleepAmount)); +} + +TEST_CASE("FutexSpin-Timed-64bit") { + uint64_t Test{}; + + auto now = std::chrono::high_resolution_clock::now(); + FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + auto end = std::chrono::high_resolution_clock::now(); + auto diff = end - now; + + // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late. + REQUIRE(std::chrono::duration_cast(diff) >= std::chrono::duration_cast(SleepAmount)); +} From 2c5dd20f3c291ae954063f7420be5c1f18284459 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 28 Dec 2023 17:35:57 -0800 Subject: [PATCH 2/7] FutexSpinWait: Implement spin-loop Unique mutex. --- FEXCore/Source/Utils/FutexSpinWait.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/FEXCore/Source/Utils/FutexSpinWait.h b/FEXCore/Source/Utils/FutexSpinWait.h index 7bb53561d5..c75252c8d2 100644 --- a/FEXCore/Source/Utils/FutexSpinWait.h +++ b/FEXCore/Source/Utils/FutexSpinWait.h @@ -263,4 +263,18 @@ namespace FEXCore::Utils::FutexSpinWait { #undef SPINLOOP_16BIT #undef SPINLOOP_32BIT #undef SPINLOOP_64BIT + template + class UniqueSpinMutex final { + public: + UniqueSpinMutex(T *Futex) + : Futex {Futex} { + FEXCore::Utils::FutexSpinWait::lock(Futex); + } + + ~UniqueSpinMutex() { + FEXCore::Utils::FutexSpinWait::unlock(Futex); + } + private: + T *Futex; + }; } From 39f49782daf9730a338ac09e1803b3663fd1bc01 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 28 Dec 2023 17:37:03 -0800 Subject: [PATCH 3/7] Arm64: Move ParanoidTSO checks up out of the non-paranoid code bath --- FEXCore/Source/Utils/ArchHelpers/Arm64.cpp | 123 ++++++++++----------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp index 5de71fa0bc..17c1599eff 100644 --- a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp +++ b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp @@ -2044,9 +2044,11 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ uint32_t Size = (Instr & 0xC000'0000) >> 30; uint32_t AddrReg = (Instr >> 5) & 0x1F; uint32_t DataReg = Instr & 0x1F; - if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* - (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* - if (ParanoidTSO) { + + // ParanoidTSO path doesn't modify any code. + if (ParanoidTSO) [[unlikely]] { + if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* + (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, 0)) { // Skip this instruction now return std::make_pair(true, 4); @@ -2056,20 +2058,7 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ return NotHandled; } } - else { - uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000; - LDR |= Size << 30; - LDR |= AddrReg << 5; - LDR |= DataReg; - PC[0] = LDR; - PC[1] = DMB_LD; // Back-patch the half-barrier. - ClearICache(&PC[-1], 16); - // With the instruction modified, now execute again. - return std::make_pair(true, 0); - } - } - else if ( (Instr & LDAXR_MASK) == STLR_INST) { // STLR* - if (ParanoidTSO) { + else if ( (Instr & LDAXR_MASK) == STLR_INST) { // STLR* if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, 0)) { // Skip this instruction now return std::make_pair(true, 4); @@ -2079,22 +2068,9 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ return NotHandled; } } - else { - uint32_t STR = 0b0011'1000'0011'1111'0110'1000'0000'0000; - STR |= Size << 30; - STR |= AddrReg << 5; - STR |= DataReg; - PC[-1] = DMB; // Back-patch the half-barrier. - PC[0] = STR; - ClearICache(&PC[-1], 16); - // Back up one instruction and have another go - return std::make_pair(true, -4); - } - } - else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR* - // Extract the 9-bit offset from the instruction - int32_t Offset = static_cast(Instr) << 11 >> 23; - if (ParanoidTSO) { + else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR* + // Extract the 9-bit offset from the instruction + int32_t Offset = static_cast(Instr) << 11 >> 23; if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, Offset)) { // Skip this instruction now return std::make_pair(true, 4); @@ -2104,23 +2080,9 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ return NotHandled; } } - else { - uint32_t LDUR = 0b0011'1000'0100'0000'0000'0000'0000'0000; - LDUR |= Size << 30; - LDUR |= AddrReg << 5; - LDUR |= DataReg; - LDUR |= Instr & (0b1'1111'1111 << 9); - PC[0] = LDUR; - PC[1] = DMB_LD; // Back-patch the half-barrier. - ClearICache(&PC[-1], 16); - // With the instruction modified, now execute again. - return std::make_pair(true, 0); - } - } - else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR* - // Extract the 9-bit offset from the instruction - int32_t Offset = static_cast(Instr) << 11 >> 23; - if (ParanoidTSO) { + else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR* + // Extract the 9-bit offset from the instruction + int32_t Offset = static_cast(Instr) << 11 >> 23; if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, Offset)) { // Skip this instruction now return std::make_pair(true, 4); @@ -2130,18 +2092,55 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ return NotHandled; } } - else { - uint32_t STUR = 0b0011'1000'0000'0000'0000'0000'0000'0000; - STUR |= Size << 30; - STUR |= AddrReg << 5; - STUR |= DataReg; - STUR |= Instr & (0b1'1111'1111 << 9); - PC[-1] = DMB; // Back-patch the half-barrier. - PC[0] = STUR; - ClearICache(&PC[-1], 16); - // Back up one instruction and have another go - return std::make_pair(true, -4); - } + } + + if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* + (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* + uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000; + LDR |= Size << 30; + LDR |= AddrReg << 5; + LDR |= DataReg; + PC[0] = LDR; + PC[1] = DMB_LD; // Back-patch the half-barrier. + ClearICache(&PC[-1], 16); + // With the instruction modified, now execute again. + return std::make_pair(true, 0); + } + else if ( (Instr & LDAXR_MASK) == STLR_INST) { // STLR* + uint32_t STR = 0b0011'1000'0011'1111'0110'1000'0000'0000; + STR |= Size << 30; + STR |= AddrReg << 5; + STR |= DataReg; + PC[-1] = DMB; // Back-patch the half-barrier. + PC[0] = STR; + ClearICache(&PC[-1], 16); + // Back up one instruction and have another go + return std::make_pair(true, -4); + } + else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR* + // Extract the 9-bit offset from the instruction + uint32_t LDUR = 0b0011'1000'0100'0000'0000'0000'0000'0000; + LDUR |= Size << 30; + LDUR |= AddrReg << 5; + LDUR |= DataReg; + LDUR |= Instr & (0b1'1111'1111 << 9); + PC[0] = LDUR; + PC[1] = DMB_LD; // Back-patch the half-barrier. + ClearICache(&PC[-1], 16); + // With the instruction modified, now execute again. + return std::make_pair(true, 0); + } + else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR* + uint32_t STUR = 0b0011'1000'0000'0000'0000'0000'0000'0000; + STUR |= Size << 30; + STUR |= AddrReg << 5; + STUR |= DataReg; + STUR |= Instr & (0b1'1111'1111 << 9); + PC[-1] = DMB; // Back-patch the half-barrier. + PC[0] = STUR; + ClearICache(&PC[-1], 16); + // Back up one instruction and have another go + return std::make_pair(true, -4); } else if ((Instr & ArchHelpers::Arm64::LDAXP_MASK) == ArchHelpers::Arm64::LDAXP_INST) { // LDAXP //Should be compare and swap pair only. LDAXP not used elsewhere From e18453cb575b4d73af351c61ae1ba9231911d971 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 28 Dec 2023 17:38:31 -0800 Subject: [PATCH 4/7] Jitarm64: Implements spin-loop futex for JIT blocks This will ensure that multiple concurrent SIGBUS handlers in the same code block doesn't modify the same code. --- FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp | 1 + FEXCore/Source/Utils/ArchHelpers/Arm64.cpp | 13 +++++++++++++ FEXCore/include/FEXCore/Core/CPUBackend.h | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp index b9a120ebb0..db471785bb 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp @@ -841,6 +841,7 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, // TODO: This needs to be a data RIP relocation once code caching works. // Current relocation code doesn't support this feature yet. JITBlockTail->RIP = Entry; + JITBlockTail->SpinLockFutex = 0; { // Store the RIP entries. diff --git a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp index 17c1599eff..eb716806e2 100644 --- a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp +++ b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp @@ -1,4 +1,8 @@ // SPDX-License-Identifier: MIT + +#include "Utils/FutexSpinWait.h" + +#include #include #include #include @@ -2094,6 +2098,15 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ } } + const auto Frame = Thread->CurrentFrame; + const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; + auto InlineHeader = reinterpret_cast(BlockBegin); + auto InlineTail = reinterpret_cast(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail); + + // Lock code mutex during any SIGBUS handling that potentially changes code. + // Need to be careful to not read any code part-way through modification. + FEXCore::Utils::FutexSpinWait::UniqueSpinMutex lk(&InlineTail->SpinLockFutex); + if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000; diff --git a/FEXCore/include/FEXCore/Core/CPUBackend.h b/FEXCore/include/FEXCore/Core/CPUBackend.h index 4f9cab7b0f..dd34561cf6 100644 --- a/FEXCore/include/FEXCore/Core/CPUBackend.h +++ b/FEXCore/include/FEXCore/Core/CPUBackend.h @@ -98,6 +98,11 @@ namespace CPU { // Offset after this block to the start of the RIP entries. uint32_t OffsetToRIPEntries; + + // Shared-code modification spin-loop futex. + uint32_t SpinLockFutex; + + uint32_t _Pad; }; // Entries that live after the JITCodeTail. From ab6c00bbcfafd66d6364a80c83c982478f3a8b14 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 28 Dec 2023 17:56:32 -0800 Subject: [PATCH 5/7] FEXCore/Utils: Rename FutexSpinWait to SpinWaitLock --- FEXCore/Source/CMakeLists.txt | 2 +- FEXCore/Source/Utils/ArchHelpers/Arm64.cpp | 4 ++-- .../Utils/{FutexSpinWait.cpp => SpinWaitLock.cpp} | 4 ++-- .../Utils/{FutexSpinWait.h => SpinWaitLock.h} | 6 +++--- FEXCore/unittests/APITests/FutexSpinTest.cpp | 14 +++++++------- 5 files changed, 15 insertions(+), 15 deletions(-) rename FEXCore/Source/Utils/{FutexSpinWait.cpp => SpinWaitLock.cpp} (91%) rename FEXCore/Source/Utils/{FutexSpinWait.h => SpinWaitLock.h} (98%) diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt index d3381af54c..4279591fcf 100644 --- a/FEXCore/Source/CMakeLists.txt +++ b/FEXCore/Source/CMakeLists.txt @@ -5,9 +5,9 @@ set (FEXCORE_BASE_SRCS Utils/Allocator.cpp Utils/CPUInfo.cpp Utils/FileLoading.cpp - Utils/FutexSpinWait.cpp Utils/ForcedAssert.cpp Utils/LogManager.cpp + Utils/SpinWaitLock.cpp ) if (NOT MINGW_BUILD) diff --git a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp index eb716806e2..dc32a33705 100644 --- a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp +++ b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp @@ -1,6 +1,6 @@ // SPDX-License-Identifier: MIT -#include "Utils/FutexSpinWait.h" +#include "Utils/SpinWaitLock.h" #include #include @@ -2105,7 +2105,7 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_ // Lock code mutex during any SIGBUS handling that potentially changes code. // Need to be careful to not read any code part-way through modification. - FEXCore::Utils::FutexSpinWait::UniqueSpinMutex lk(&InlineTail->SpinLockFutex); + FEXCore::Utils::SpinWaitLock::UniqueSpinMutex lk(&InlineTail->SpinLockFutex); if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR* (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR* diff --git a/FEXCore/Source/Utils/FutexSpinWait.cpp b/FEXCore/Source/Utils/SpinWaitLock.cpp similarity index 91% rename from FEXCore/Source/Utils/FutexSpinWait.cpp rename to FEXCore/Source/Utils/SpinWaitLock.cpp index 057dea917e..b2f011adec 100644 --- a/FEXCore/Source/Utils/FutexSpinWait.cpp +++ b/FEXCore/Source/Utils/SpinWaitLock.cpp @@ -1,6 +1,6 @@ -#include "Utils/FutexSpinWait.h" +#include "Utils/SpinWaitLock.h" -namespace FEXCore::Utils::FutexSpinWait { +namespace FEXCore::Utils::SpinWaitLock { #ifdef _M_ARM_64 constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL; diff --git a/FEXCore/Source/Utils/FutexSpinWait.h b/FEXCore/Source/Utils/SpinWaitLock.h similarity index 98% rename from FEXCore/Source/Utils/FutexSpinWait.h rename to FEXCore/Source/Utils/SpinWaitLock.h index c75252c8d2..509f43ac24 100644 --- a/FEXCore/Source/Utils/FutexSpinWait.h +++ b/FEXCore/Source/Utils/SpinWaitLock.h @@ -3,7 +3,7 @@ #include #include -namespace FEXCore::Utils::FutexSpinWait { +namespace FEXCore::Utils::SpinWaitLock { /** * @brief This provides routines to implement implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces. * @@ -268,11 +268,11 @@ namespace FEXCore::Utils::FutexSpinWait { public: UniqueSpinMutex(T *Futex) : Futex {Futex} { - FEXCore::Utils::FutexSpinWait::lock(Futex); + FEXCore::Utils::SpinWaitLock::lock(Futex); } ~UniqueSpinMutex() { - FEXCore::Utils::FutexSpinWait::unlock(Futex); + FEXCore::Utils::SpinWaitLock::unlock(Futex); } private: T *Futex; diff --git a/FEXCore/unittests/APITests/FutexSpinTest.cpp b/FEXCore/unittests/APITests/FutexSpinTest.cpp index 9090992540..9871b318cc 100644 --- a/FEXCore/unittests/APITests/FutexSpinTest.cpp +++ b/FEXCore/unittests/APITests/FutexSpinTest.cpp @@ -1,4 +1,4 @@ -#include "Utils/FutexSpinWait.h" +#include "Utils/SpinWaitLock.h" #include #include #include @@ -9,7 +9,7 @@ TEST_CASE("FutexSpin-Timed-8bit") { uint8_t Test{}; auto now = std::chrono::high_resolution_clock::now(); - FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount); auto end = std::chrono::high_resolution_clock::now(); auto diff = end - now; @@ -27,7 +27,7 @@ TEST_CASE("FutexSpin-Sleep-8bit") { std::thread t([&Test, &SleptAmount, &ActualSpinLoop]() { auto now = std::chrono::high_resolution_clock::now(); ActualSpinLoop.store(1); - FEXCore::Utils::FutexSpinWait::Wait(&Test, 1); + FEXCore::Utils::SpinWaitLock::Wait(&Test, 1); auto end = std::chrono::high_resolution_clock::now(); SleptAmount = end - now; }); @@ -39,7 +39,7 @@ TEST_CASE("FutexSpin-Sleep-8bit") { std::this_thread::sleep_for(SleepAmount); // Set the futex - FEXCore::Utils::FutexSpinWait::lock(&Test); + FEXCore::Utils::SpinWaitLock::lock(&Test); // Wait for the thread to get done. t.join(); @@ -52,7 +52,7 @@ TEST_CASE("FutexSpin-Timed-16bit") { uint16_t Test{}; auto now = std::chrono::high_resolution_clock::now(); - FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount); auto end = std::chrono::high_resolution_clock::now(); auto diff = end - now; @@ -64,7 +64,7 @@ TEST_CASE("FutexSpin-Timed-32bit") { uint32_t Test{}; auto now = std::chrono::high_resolution_clock::now(); - FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount); auto end = std::chrono::high_resolution_clock::now(); auto diff = end - now; @@ -76,7 +76,7 @@ TEST_CASE("FutexSpin-Timed-64bit") { uint64_t Test{}; auto now = std::chrono::high_resolution_clock::now(); - FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount); + FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount); auto end = std::chrono::high_resolution_clock::now(); auto diff = end - now; From 2af7e997f48f9772fb038fc739d988ce84e22965 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 30 Dec 2023 20:04:27 -0800 Subject: [PATCH 6/7] Spinlocks: Fix assembly Need to have a source be +r so it doesn't get overwritten. --- FEXCore/Source/Utils/SpinWaitLock.h | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/FEXCore/Source/Utils/SpinWaitLock.h b/FEXCore/Source/Utils/SpinWaitLock.h index 509f43ac24..08dbddcad0 100644 --- a/FEXCore/Source/Utils/SpinWaitLock.h +++ b/FEXCore/Source/Utils/SpinWaitLock.h @@ -71,32 +71,32 @@ namespace FEXCore::Utils::SpinWaitLock { __asm volatile(SPINLOOP_8BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else if constexpr (sizeof(T) == 2) { __asm volatile(SPINLOOP_16BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else if constexpr (sizeof(T) == 4) { __asm volatile(SPINLOOP_32BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else if constexpr (sizeof(T) == 8) { __asm volatile(SPINLOOP_64BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else { @@ -132,32 +132,32 @@ namespace FEXCore::Utils::SpinWaitLock { __asm volatile(SPINLOOP_8BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else if constexpr (sizeof(T) == 2) { __asm volatile(SPINLOOP_16BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else if constexpr (sizeof(T) == 4) { __asm volatile(SPINLOOP_32BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else if constexpr (sizeof(T) == 8) { __asm volatile(SPINLOOP_64BIT : [Result] "=r" (Result) , [Tmp] "=r" (Tmp) - : [Futex] "r" (Futex) - , [ExpectedValue] "r" (ExpectedValue) + , [Futex] "+r" (Futex) + : [ExpectedValue] "r" (ExpectedValue) : "memory"); } else { From a6c57f71e97f03f1101f97984f5970c1bb29a2a1 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Wed, 17 Jan 2024 10:41:16 -0800 Subject: [PATCH 7/7] SpinWaitLock: Fixes potential extra wait that would occur on contended lock We had a chance of doing an additional bogus wfe if the expected value was hit in one iteration of a loop. Not the biggest problem on current hardware where WFE only ever sleeps for 1-4 system cycles, but on future hardware where WFE might actually sleep for longer then this could have been an issue. --- FEXCore/Source/Utils/SpinWaitLock.h | 181 ++++++++++++++++------------ 1 file changed, 101 insertions(+), 80 deletions(-) diff --git a/FEXCore/Source/Utils/SpinWaitLock.h b/FEXCore/Source/Utils/SpinWaitLock.h index 08dbddcad0..d0a3510d08 100644 --- a/FEXCore/Source/Utils/SpinWaitLock.h +++ b/FEXCore/Source/Utils/SpinWaitLock.h @@ -22,18 +22,25 @@ namespace FEXCore::Utils::SpinWaitLock { */ #ifdef _M_ARM_64 -#define SPINLOOP_BODY(LoadExclusiveOp, LoadAtomicOp, RegSize) \ +#define LOADEXCLUSIVE(LoadExclusiveOp, RegSize) \ /* Prime the exclusive monitor with the passed in address. */ \ - #LoadExclusiveOp " %" #RegSize "[Tmp], [%[Futex]]; \ + #LoadExclusiveOp " %" #RegSize "[Result], [%[Futex]];" + +#define SPINLOOP_BODY(LoadAtomicOp, RegSize) \ /* WFE will wait for either the memory to change or spurious wake-up. */ \ - wfe; \ + "wfe;" \ /* Load with acquire to get the result of memory. */ \ - " #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; " + #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; " + +#define SPINLOOP_WFE_LDX_8BIT LOADEXCLUSIVE(ldaxrb, w) +#define SPINLOOP_WFE_LDX_16BIT LOADEXCLUSIVE(ldaxrh, w) +#define SPINLOOP_WFE_LDX_32BIT LOADEXCLUSIVE(ldaxr, w) +#define SPINLOOP_WFE_LDX_64BIT LOADEXCLUSIVE(ldaxr, x) -#define SPINLOOP_8BIT SPINLOOP_BODY(ldaxrb, ldarb, w) -#define SPINLOOP_16BIT SPINLOOP_BODY(ldaxrh, ldarh, w) -#define SPINLOOP_32BIT SPINLOOP_BODY(ldaxr, ldar, w) -#define SPINLOOP_64BIT SPINLOOP_BODY(ldaxr, ldar, x) +#define SPINLOOP_8BIT SPINLOOP_BODY(ldarb, w) +#define SPINLOOP_16BIT SPINLOOP_BODY(ldarh, w) +#define SPINLOOP_32BIT SPINLOOP_BODY(ldar, w) +#define SPINLOOP_64BIT SPINLOOP_BODY(ldar, x) extern uint32_t CycleCounterFrequency; extern uint64_t CyclesPerNanosecond; @@ -57,51 +64,98 @@ namespace FEXCore::Utils::SpinWaitLock { return NanosecondCount / CyclesPerNanosecond; } + static inline uint8_t LoadExclusive(uint8_t *Futex) { + uint8_t Result{}; + __asm volatile(SPINLOOP_WFE_LDX_8BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint16_t LoadExclusive(uint16_t *Futex) { + uint16_t Result{}; + __asm volatile(SPINLOOP_WFE_LDX_16BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint32_t LoadExclusive(uint32_t *Futex) { + uint32_t Result{}; + __asm volatile(SPINLOOP_WFE_LDX_32BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint64_t LoadExclusive(uint64_t *Futex) { + uint64_t Result{}; + __asm volatile(SPINLOOP_WFE_LDX_64BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint8_t WFELoadAtomic(uint8_t *Futex) { + uint8_t Result{}; + __asm volatile(SPINLOOP_8BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint16_t WFELoadAtomic(uint16_t *Futex) { + uint16_t Result{}; + __asm volatile(SPINLOOP_16BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint32_t WFELoadAtomic(uint32_t *Futex) { + uint32_t Result{}; + __asm volatile(SPINLOOP_32BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + + static inline uint64_t WFELoadAtomic(uint64_t *Futex) { + uint64_t Result{}; + __asm volatile(SPINLOOP_64BIT + : [Result] "=r" (Result) + , [Futex] "+r" (Futex) + :: "memory"); + + return Result; + } + template static inline void Wait(T *Futex, TT ExpectedValue) { std::atomic *AtomicFutex = reinterpret_cast*>(Futex); - T Tmp{}; T Result = AtomicFutex->load(); // Early exit if possible. if (Result == ExpectedValue) return; do { - if constexpr (sizeof(T) == 1) { - __asm volatile(SPINLOOP_8BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else if constexpr (sizeof(T) == 2) { - __asm volatile(SPINLOOP_16BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else if constexpr (sizeof(T) == 4) { - __asm volatile(SPINLOOP_32BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else if constexpr (sizeof(T) == 8) { - __asm volatile(SPINLOOP_64BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else { - static_assert(!std::is_same_v, "Invalid"); - } + Result = LoadExclusive(Futex); + if (Result == ExpectedValue) return; + Result = WFELoadAtomic(Futex); } while (Result != ExpectedValue); } @@ -118,7 +172,6 @@ namespace FEXCore::Utils::SpinWaitLock { static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) { std::atomic *AtomicFutex = reinterpret_cast*>(Futex); - T Tmp{}; T Result = AtomicFutex->load(); // Early exit if possible. @@ -128,41 +181,9 @@ namespace FEXCore::Utils::SpinWaitLock { const auto Begin = GetCycleCounter(); do { - if constexpr (sizeof(T) == 1) { - __asm volatile(SPINLOOP_8BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else if constexpr (sizeof(T) == 2) { - __asm volatile(SPINLOOP_16BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else if constexpr (sizeof(T) == 4) { - __asm volatile(SPINLOOP_32BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else if constexpr (sizeof(T) == 8) { - __asm volatile(SPINLOOP_64BIT - : [Result] "=r" (Result) - , [Tmp] "=r" (Tmp) - , [Futex] "+r" (Futex) - : [ExpectedValue] "r" (ExpectedValue) - : "memory"); - } - else { - static_assert(!std::is_same_v, "Invalid"); - } + Result = LoadExclusive(Futex); + if (Result == ExpectedValue) return true; + Result = WFELoadAtomic(Futex); const auto CurrentCycleCounter = GetCycleCounter(); if ((CurrentCycleCounter - Begin) >= TimeoutCycles) {