From 567666cae88dea84bf3fed05663e67246ee53e30 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sat, 16 Dec 2023 20:42:37 -0800
Subject: [PATCH] FEXCore: Implements an efficient spin-loop API

This will only be used internally inside of FEXCore for efficient shared
codecache backpatch spin-loops.
---
 FEXCore/Source/CMakeLists.txt                |   1 +
 FEXCore/Source/Utils/FutexSpinWait.cpp       |  27 ++
 FEXCore/Source/Utils/FutexSpinWait.h         | 243 +++++++++++++++++
 FEXCore/unittests/APITests/FutexSpinTest.cpp |  85 ++++++
 4 files changed, 356 insertions(+)
 create mode 100644 FEXCore/Source/Utils/FutexSpinWait.cpp
 create mode 100644 FEXCore/Source/Utils/FutexSpinWait.h
 create mode 100644 FEXCore/unittests/APITests/FutexSpinTest.cpp

diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt
index 1f4ed43f81..44625d03bf 100644
--- a/FEXCore/Source/CMakeLists.txt
+++ b/FEXCore/Source/CMakeLists.txt
@@ -5,6 +5,7 @@ set (FEXCORE_BASE_SRCS
   Utils/Allocator.cpp
   Utils/CPUInfo.cpp
   Utils/FileLoading.cpp
+  Utils/FutexSpinWait.cpp
   Utils/ForcedAssert.cpp
   Utils/LogManager.cpp
   )
diff --git a/FEXCore/Source/Utils/FutexSpinWait.cpp b/FEXCore/Source/Utils/FutexSpinWait.cpp
new file mode 100644
index 0000000000..057dea917e
--- /dev/null
+++ b/FEXCore/Source/Utils/FutexSpinWait.cpp
@@ -0,0 +1,27 @@
+#include "Utils/FutexSpinWait.h"
+
+namespace FEXCore::Utils::FutexSpinWait {
+#ifdef _M_ARM_64
+  constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;
+
+  static uint32_t GetCycleCounterFrequency() {
+    uint64_t Result{};
+    __asm("mrs %[Res], CNTFRQ_EL0"
+      : [Res] "=r" (Result));
+    return Result;
+  }
+
+  static uint64_t CalculateCyclesPerNanosecond() {
+    // Snapdragon devices historically use a 19.2Mhz cycle counter frequency.
+    // This means that the number of nanoseconds per cycle ends up being 52.0833... (truncated to 52 by the division below).
+    //
+    // ARMv8.6 and ARMv9.1 require the cycle counter frequency to be 1Ghz.
+    // This means the number of nanoseconds per cycle ends up being 1.
+    uint64_t CounterFrequency = GetCycleCounterFrequency();
+    return NanosecondsInSecond / CounterFrequency;
+  }
+
+  uint32_t CycleCounterFrequency = GetCycleCounterFrequency();
+  uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond();
+#endif
+}
diff --git a/FEXCore/Source/Utils/FutexSpinWait.h b/FEXCore/Source/Utils/FutexSpinWait.h
new file mode 100644
index 0000000000..7bb53561d5
--- /dev/null
+++ b/FEXCore/Source/Utils/FutexSpinWait.h
@@ -0,0 +1,243 @@
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <type_traits>
+
+namespace FEXCore::Utils::FutexSpinWait {
+  /**
+   * @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
+   *
+   * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
+   * by putting the CPU into a lower-power state using WFE.
+   * On platforms tested, WFE will put the CPU into a lower-power state for upwards of 52ns per WFE, which isn't a significant amount of time
+   * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower-power state for longer. This also has the added benefit
+   * that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits.
+   *
+   * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
+   *
+   * It should be noted that this implementation has a few dozen cycles of start-up time, which means the overhead for invoking this implementation is
+   * slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient.
+   *
+   * On non-ARM platforms it is truly a spin-loop, which is okay for debugging only.
+   */
+#ifdef _M_ARM_64
+
+#define SPINLOOP_BODY(LoadExclusiveOp, LoadAtomicOp, RegSize) \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  #LoadExclusiveOp " %" #RegSize "[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  " #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; "
+
+#define SPINLOOP_8BIT  SPINLOOP_BODY(ldaxrb, ldarb, w)
+#define SPINLOOP_16BIT SPINLOOP_BODY(ldaxrh, ldarh, w)
+#define SPINLOOP_32BIT SPINLOOP_BODY(ldaxr,  ldar,  w)
+#define SPINLOOP_64BIT SPINLOOP_BODY(ldaxr,  ldar,  x)
+
+  ///< Frequency of the cycle counter in Hz, read from CNTFRQ_EL0.
+  extern uint32_t CycleCounterFrequency;
+  ///< Despite the name this holds nanoseconds per counter cycle (1'000'000'000 / frequency, truncated).
+  extern uint64_t CyclesPerNanosecond;
+
+  ///< Get the raw cycle counter which is synchronizing.
+  /// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
+  static inline uint64_t GetCycleCounter() {
+    uint64_t Result{};
+    __asm volatile(R"(
+      isb;
+      mrs %[Res], CNTVCT_EL0;
+    )"
+    : [Res] "=r" (Result));
+    return Result;
+  }
+
+  ///< Converts nanoseconds to number of cycles.
+  /// If the cycle counter is 1Ghz then this is a direct 1:1 map.
+  static inline uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
+    const auto NanosecondCount = Nanoseconds.count();
+    return NanosecondCount / CyclesPerNanosecond;
+  }
+
+  ///< Primes the exclusive monitor on Futex, sleeps in WFE, then reloads the value with acquire semantics.
+  /// WFE can wake up spuriously, so the caller has to check the returned value itself.
+  template<typename T>
+  static inline T WaitForEventAndLoad(T *Futex) {
+    T Tmp{};
+    T Result{};
+
+    // Tmp is written by the first instruction while Futex is still needed by the last one.
+    // It has to be an early-clobber output so the two never get assigned the same register.
+    if constexpr (sizeof(T) == 1) {
+      __asm volatile(SPINLOOP_8BIT
+        : [Result] "=r" (Result)
+        , [Tmp] "=&r" (Tmp)
+        : [Futex] "r" (Futex)
+        : "memory");
+    }
+    else if constexpr (sizeof(T) == 2) {
+      __asm volatile(SPINLOOP_16BIT
+        : [Result] "=r" (Result)
+        , [Tmp] "=&r" (Tmp)
+        : [Futex] "r" (Futex)
+        : "memory");
+    }
+    else if constexpr (sizeof(T) == 4) {
+      __asm volatile(SPINLOOP_32BIT
+        : [Result] "=r" (Result)
+        , [Tmp] "=&r" (Tmp)
+        : [Futex] "r" (Futex)
+        : "memory");
+    }
+    else if constexpr (sizeof(T) == 8) {
+      __asm volatile(SPINLOOP_64BIT
+        : [Result] "=r" (Result)
+        , [Tmp] "=&r" (Tmp)
+        : [Futex] "r" (Futex)
+        : "memory");
+    }
+    else {
+      static_assert(!std::is_same_v<T, T>, "Invalid");
+    }
+
+    return Result;
+  }
+
+  ///< Blocks until the value stored at Futex equals ExpectedValue.
+  template<typename T, typename TT = T>
+  static inline void Wait(T *Futex, TT ExpectedValue) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Result = AtomicFutex->load();
+
+    // Only go to sleep while the value doesn't match; returns immediately if it already does.
+    while (Result != ExpectedValue) {
+      Result = WaitForEventAndLoad(Futex);
+    }
+  }
+
+  template
+  void Wait<uint8_t>(uint8_t*, uint8_t);
+  template
+  void Wait<uint16_t>(uint16_t*, uint16_t);
+  template
+  void Wait<uint32_t>(uint32_t*, uint32_t);
+  template
+  void Wait<uint64_t>(uint64_t*, uint64_t);
+
+  ///< Waits until the value stored at Futex equals ExpectedValue or until Timeout has elapsed.
+  /// Returns true if the expected value was observed, false if the timeout expired first.
+  template<typename T, typename TT = T>
+  static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Result = AtomicFutex->load();
+
+    // Early exit if possible.
+    if (Result == ExpectedValue) return true;
+
+    const auto TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
+    const auto Begin = GetCycleCounter();
+
+    do {
+      Result = WaitForEventAndLoad(Futex);
+
+      // Check the value before the clock so a result arriving right at the deadline isn't reported as a timeout.
+      if (Result == ExpectedValue) {
+        // We got our result.
+        return true;
+      }
+    } while ((GetCycleCounter() - Begin) < TimeoutCycles);
+
+    // Couldn't get value before timeout.
+    return false;
+  }
+
+  template
+  bool Wait<uint8_t>(uint8_t*, uint8_t, std::chrono::nanoseconds const &);
+  template
+  bool Wait<uint16_t>(uint16_t*, uint16_t, std::chrono::nanoseconds const &);
+  template
+  bool Wait<uint32_t>(uint32_t*, uint32_t, std::chrono::nanoseconds const &);
+  template
+  bool Wait<uint64_t>(uint64_t*, uint64_t, std::chrono::nanoseconds const &);
+
+#else
+  ///< Blocks until the value stored at Futex equals ExpectedValue.
+  template<typename T, typename TT = T>
+  static inline void Wait(T *Futex, TT ExpectedValue) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+    // Plain spin-loop; returns immediately if the value already matches.
+    while (AtomicFutex->load() != ExpectedValue) {
+    }
+  }
+
+  ///< Waits until the value stored at Futex equals ExpectedValue or until Timeout has elapsed.
+  /// Returns true if the expected value was observed, false if the timeout expired first.
+  template<typename T, typename TT = T>
+  static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+    // Early exit if possible.
+    if (AtomicFutex->load() == ExpectedValue) return true;
+
+    const auto Begin = std::chrono::high_resolution_clock::now();
+
+    do {
+      // Check the value before the clock so a result arriving right at the deadline isn't reported as a timeout.
+      if (AtomicFutex->load() == ExpectedValue) {
+        // We got our result.
+        return true;
+      }
+    } while ((std::chrono::high_resolution_clock::now() - Begin) < Timeout);
+
+    // Couldn't get value before timeout.
+    return false;
+  }
+#endif
+
+  ///< Acquires the futex by changing it from 0 to 1, waiting for as long as it is held elsewhere.
+  template<typename T>
+  static inline void lock(T *Futex) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Expected{};
+    T Desired {1};
+
+    // Try to CAS immediately.
+    if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;
+
+    do {
+      // Wait until the futex is unlocked.
+      Wait(Futex, 0);
+
+      // A failed CAS overwrites Expected with the value it observed (the locked state).
+      // Reset it, otherwise the retry could "succeed" by swapping 1 -> 1 while someone else holds the lock.
+      Expected = T{};
+    } while (!AtomicFutex->compare_exchange_strong(Expected, Desired));
+  }
+
+  ///< Attempts to acquire the futex without waiting. Returns true if it was acquired.
+  template<typename T>
+  static inline bool try_lock(T *Futex) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Expected{};
+    T Desired {1};
+
+    // A single CAS attempt from unlocked (0) to locked (1).
+    return AtomicFutex->compare_exchange_strong(Expected, Desired);
+  }
+
+  ///< Releases the futex by storing 0 to it.
+  template<typename T>
+  static inline void unlock(T *Futex) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    AtomicFutex->store(0);
+  }
+
+#undef SPINLOOP_BODY
+#undef SPINLOOP_8BIT
+#undef SPINLOOP_16BIT
+#undef SPINLOOP_32BIT
+#undef SPINLOOP_64BIT
+}
diff --git a/FEXCore/unittests/APITests/FutexSpinTest.cpp b/FEXCore/unittests/APITests/FutexSpinTest.cpp
new file mode 100644
index 0000000000..9090992540
--- /dev/null
+++ b/FEXCore/unittests/APITests/FutexSpinTest.cpp
@@ -0,0 +1,85 @@
+#include "Utils/FutexSpinWait.h"
+#include <catch2/catch_test_macros.hpp>
+#include <chrono>
+#include <thread>
+
+constexpr auto SleepAmount = std::chrono::milliseconds(250);
+
+TEST_CASE("FutexSpin-Timed-8bit") {
+  uint8_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Sleep-8bit") {
+  constexpr auto SleepAmount = std::chrono::seconds(1);
+
+  uint8_t Test{};
+  std::atomic<uint32_t> ActualSpinLoop{};
+  std::chrono::nanoseconds SleptAmount;
+
+  std::thread t([&Test, &SleptAmount, &ActualSpinLoop]() {
+    auto now = std::chrono::high_resolution_clock::now();
+    ActualSpinLoop.store(1);
+    FEXCore::Utils::FutexSpinWait::Wait(&Test, 1);
+    auto end = std::chrono::high_resolution_clock::now();
+    SleptAmount = end - now;
+  });
+
+  // Wait until the second thread lets us know it is about to start waiting.
+  while(ActualSpinLoop.load() == 0);
+
+  // Sleep this thread for the sleep amount.
+  std::this_thread::sleep_for(SleepAmount);
+
+  // Set the futex
+  FEXCore::Utils::FutexSpinWait::lock(&Test);
+
+  // Wait for the thread to get done.
+  t.join();
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(SleptAmount >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-16bit") {
+  uint16_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-32bit") {
+  uint32_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-64bit") {
+  uint64_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}