diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt
index 1f4ed43f81..44625d03bf 100644
--- a/FEXCore/Source/CMakeLists.txt
+++ b/FEXCore/Source/CMakeLists.txt
@@ -5,6 +5,7 @@ set (FEXCORE_BASE_SRCS
   Utils/Allocator.cpp
   Utils/CPUInfo.cpp
   Utils/FileLoading.cpp
+  Utils/FutexSpinWait.cpp
   Utils/ForcedAssert.cpp
   Utils/LogManager.cpp
   )
diff --git a/FEXCore/Source/Utils/FutexSpinWait.cpp b/FEXCore/Source/Utils/FutexSpinWait.cpp
new file mode 100644
index 0000000000..0326ba5e66
--- /dev/null
+++ b/FEXCore/Source/Utils/FutexSpinWait.cpp
@@ -0,0 +1,59 @@
+#include "Utils/FutexSpinWait.h"
+
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+
+namespace FEXCore::Utils {
+#ifdef _M_ARM_64
+  std::atomic<bool> FEXCore::Utils::FutexSpinWait::StaticDataInitialized;
+  std::mutex FEXCore::Utils::FutexSpinWait::StaticInitMutex;
+  uint32_t FEXCore::Utils::FutexSpinWait::CycleCounterFrequency;
+  uint64_t FEXCore::Utils::FutexSpinWait::CyclesPerNanosecond;
+
+  constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;
+
+  ///< Reads the cycle counter frequency that the system reports in CNTFRQ_EL0.
+  static uint32_t GetCycleCounterFrequency() {
+    uint64_t Result{};
+    __asm("mrs %[Res], CNTFRQ_EL0"
+      : [Res] "=r" (Result));
+    // The frequency is stored as 32 bits, so make the narrowing explicit.
+    return static_cast<uint32_t>(Result);
+  }
+
+  ///< Calculates how many nanoseconds elapse per tick of the cycle counter.
+  /// NOTE: Despite its name, the returned value is nanoseconds-per-cycle, which is
+  /// the divisor that ConvertNanosecondsToCycles applies to a nanosecond count.
+  static uint64_t CalculateCyclesPerNanosecond(uint64_t CounterFrequency) {
+    // Fairly trivial calculation but broken out for additional information.
+    //
+    // Snapdragon devices historically use a 19.2Mhz cycle counter frequency
+    // This means that the number of nanoseconds per cycle ends up being 52.0833...
+    // which the integer division below truncates to 52.
+    //
+    // ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz.
+    // This means the number of nanoseconds per cycle ends up being 1.
+    //
+    // A frequency of zero would be a division by zero here, and a frequency
+    // above 1Ghz would truncate to zero and turn in to a division by zero in
+    // ConvertNanosecondsToCycles. Fall back to a 1:1 mapping for both cases.
+    if (CounterFrequency == 0 || CounterFrequency > NanosecondsInSecond) {
+      return 1;
+    }
+    return NanosecondsInSecond / CounterFrequency;
+  }
+
+  void FEXCore::Utils::FutexSpinWait::Init() {
+    std::unique_lock lk {StaticInitMutex};
+    // Another thread may have finished the initialization while we waited on the mutex.
+    if (StaticDataInitialized.load(std::memory_order_relaxed)) return;
+
+    CycleCounterFrequency = GetCycleCounterFrequency();
+    CyclesPerNanosecond = CalculateCyclesPerNanosecond(CycleCounterFrequency);
+    // Release pairs with the acquire load in the timed Wait, so the values written
+    // above are visible to any thread that observes the flag as set.
+    StaticDataInitialized.store(true, std::memory_order_release);
+  }
+#endif
+}
diff --git a/FEXCore/Source/Utils/FutexSpinWait.h b/FEXCore/Source/Utils/FutexSpinWait.h
new file mode 100644
index 0000000000..4a88a0eab8
--- /dev/null
+++ b/FEXCore/Source/Utils/FutexSpinWait.h
@@ -0,0 +1,296 @@
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+
+namespace FEXCore::Utils {
+#define SPINLOOP_8BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxrb %w[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldarb %w[Result], [%[Futex]]; \
+"
+#define SPINLOOP_16BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxrh %w[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldarh %w[Result], [%[Futex]]; \
+"
+#define SPINLOOP_32BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxr %w[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldar %w[Result], [%[Futex]]; \
+"
+#define SPINLOOP_64BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxr %x[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldar %x[Result], [%[Futex]]; \
+"
+
+  /**
+   * @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
+   *
+   * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
+   * by putting the CPU in to a lower-power state using WFE.
+   * On platforms tested, WFE will put the CPU in to a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time
+   * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit
+   * that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits.
+   *
+   * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
+   *
+   * It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this implementation is
+   * slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient.
+   *
+   * On non-ARM platforms it is truly a spin-loop. Which is okay for debugging only.
+   */
+  class FutexSpinWait final {
+    public:
+#ifdef _M_ARM_64
+      /**
+       * @brief Waits until the value at `Futex` is equal to `ExpectedValue`.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue The value that ends the wait once it is observed.
+       */
+      template<typename T, typename TT = T>
+      static void Wait(T *Futex, TT ExpectedValue) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        // Early exit if possible.
+        if (AtomicFutex->load() == ExpectedValue) return;
+
+        // This path never reads the cycle counter data, so Init() isn't required here.
+        while (WaitForEventAndLoad(Futex) != ExpectedValue) {}
+      }
+
+      /**
+       * @brief Waits until the value at `Futex` is equal to `ExpectedValue`, or until `Timeout` has elapsed.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue The value that ends the wait once it is observed.
+       * @param Timeout Maximum amount of time to wait, measured with the cycle counter.
+       *
+       * @return true if the expected value was observed, false if the timeout elapsed first.
+       */
+      template<typename T, typename TT = T>
+      static bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        // Early exit if possible.
+        if (AtomicFutex->load() == ExpectedValue) return true;
+
+        // Acquire pairs with the release store in Init() so the conversion data it wrote is visible.
+        if (!StaticDataInitialized.load(std::memory_order_acquire)) [[unlikely]] Init();
+
+        const auto TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
+        const auto Begin = GetCycleCounter();
+
+        for (;;) {
+          // Check the value before the deadline so that a value arriving on the
+          // final wake-up is still reported as a success.
+          if (WaitForEventAndLoad(Futex) == ExpectedValue) {
+            // We got our result.
+            return true;
+          }
+
+          if ((GetCycleCounter() - Begin) >= TimeoutCycles) {
+            // Couldn't get value before timeout.
+            return false;
+          }
+        }
+      }
+#else
+      /**
+       * @brief Spins until the value at `Futex` is equal to `ExpectedValue`.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue The value that ends the wait once it is observed.
+       */
+      template<typename T, typename TT = T>
+      static void Wait(T *Futex, TT ExpectedValue) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        while (AtomicFutex->load() != ExpectedValue) {}
+      }
+
+      /**
+       * @brief Spins until the value at `Futex` is equal to `ExpectedValue`, or until `Timeout` has elapsed.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue The value that ends the wait once it is observed.
+       * @param Timeout Maximum amount of time to spin for.
+       *
+       * @return true if the expected value was observed, false if the timeout elapsed first.
+       */
+      template<typename T, typename TT = T>
+      static bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        // Early exit if possible.
+        if (AtomicFutex->load() == ExpectedValue) return true;
+
+        const auto Begin = std::chrono::high_resolution_clock::now();
+
+        for (;;) {
+          if (AtomicFutex->load() == ExpectedValue) {
+            // We got our result.
+            return true;
+          }
+
+          if ((std::chrono::high_resolution_clock::now() - Begin) >= Timeout) {
+            // Couldn't get value before timeout.
+            return false;
+          }
+        }
+      }
+#endif
+
+      /**
+       * @brief Takes the lock stored at `Futex`: waits for it to read 0 and then swaps in 1.
+       *
+       * @param Futex Address of the lock word. 0 is unlocked, 1 is locked.
+       */
+      template<typename T>
+      static void lock(T *Futex) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+        T Expected{};
+        const T Desired {1};
+
+        // Try to CAS immediately.
+        if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;
+
+        do {
+          // Wait until the futex is unlocked.
+          Wait(Futex, 0);
+
+          // A failed CAS writes the value it observed (the locked value) in to Expected.
+          // Reset it, otherwise the next CAS would compare against "locked" and could
+          // succeed while another thread owns the lock.
+          Expected = T{};
+        } while (!AtomicFutex->compare_exchange_strong(Expected, Desired));
+      }
+
+      /**
+       * @brief Makes a single attempt to take the lock stored at `Futex`.
+       *
+       * @param Futex Address of the lock word. 0 is unlocked, 1 is locked.
+       *
+       * @return true if the lock was taken, false if it was already held.
+       */
+      template<typename T>
+      static bool try_lock(T *Futex) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+        T Expected{};
+        const T Desired {1};
+
+        return AtomicFutex->compare_exchange_strong(Expected, Desired);
+      }
+
+      /**
+       * @brief Releases the lock stored at `Futex` by storing 0 to it.
+       *
+       * @param Futex Address of the lock word. 0 is unlocked, 1 is locked.
+       */
+      template<typename T>
+      static void unlock(T *Futex) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+        AtomicFutex->store(0);
+      }
+
+    private:
+#ifdef _M_ARM_64
+      // Static initialization
+      static std::atomic<bool> StaticDataInitialized;
+      static std::mutex StaticInitMutex;
+
+      static uint32_t CycleCounterFrequency;
+      ///< NOTE: Despite the name this holds nanoseconds per cycle counter tick.
+      static uint64_t CyclesPerNanosecond;
+
+      static void Init();
+
+      ///< Primes the exclusive monitor on `Futex`, executes WFE and returns the value loaded afterwards.
+      /// The wake-up can be spurious, so callers must check the returned value and loop.
+      template<typename T>
+      static T WaitForEventAndLoad(T *Futex) {
+        T Tmp{};
+        T Result{};
+
+        // Tmp is written by the first instruction while Futex is still read by the last one,
+        // so it must be early-clobber to stop the compiler from giving both the same register.
+        if constexpr (sizeof(T) == 1) {
+          __asm volatile(SPINLOOP_8BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else if constexpr (sizeof(T) == 2) {
+          __asm volatile(SPINLOOP_16BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else if constexpr (sizeof(T) == 4) {
+          __asm volatile(SPINLOOP_32BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else if constexpr (sizeof(T) == 8) {
+          __asm volatile(SPINLOOP_64BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else {
+          static_assert(!std::is_same_v<T, T>, "Invalid");
+        }
+
+        return Result;
+      }
+
+      ///< Get the raw cycle counter which is synchronizing.
+      /// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
+      static uint64_t GetCycleCounter() {
+        uint64_t Result{};
+        __asm volatile(R"(
+          isb;
+          mrs %[Res], CNTVCT_EL0;
+        )"
+        : [Res] "=r" (Result));
+        return Result;
+      }
+
+      ///< Converts nanoseconds to number of cycles.
+      /// If the cycle counter is 1Ghz then this is a direct 1:1 map.
+      static uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
+        const auto NanosecondCount = Nanoseconds.count();
+        // A negative count would wrap to a huge unsigned value, so treat it as already expired.
+        if (NanosecondCount <= 0) return 0;
+        return static_cast<uint64_t>(NanosecondCount) / CyclesPerNanosecond;
+      }
+#endif
+  };
+
+#undef SPINLOOP_8BIT
+#undef SPINLOOP_16BIT
+#undef SPINLOOP_32BIT
+#undef SPINLOOP_64BIT
+}
diff --git a/FEXCore/unittests/APITests/FutexSpinTest.cpp b/FEXCore/unittests/APITests/FutexSpinTest.cpp
new file mode 100644
index 0000000000..9090992540
--- /dev/null
+++ b/FEXCore/unittests/APITests/FutexSpinTest.cpp
@@ -0,0 +1,118 @@
+#include "Utils/FutexSpinWait.h"
+
+// Works with both the split Catch2 v3 header layout and the single Catch2 v2 header.
+#if __has_include(<catch2/catch_test_macros.hpp>)
+#include <catch2/catch_test_macros.hpp>
+#else
+#include <catch2/catch.hpp>
+#endif
+
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <thread>
+
+constexpr auto SleepAmount = std::chrono::milliseconds(250);
+
+TEST_CASE("FutexSpin-Timed-8bit") {
+  uint8_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  const bool GotValue = FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // Nothing ever stores the expected value, so the wait can only end by timing out.
+  REQUIRE_FALSE(GotValue);
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Sleep-8bit") {
+  constexpr auto SleepAmount = std::chrono::seconds(1);
+
+  uint8_t Test{};
+  std::atomic<uint32_t> ActualSpinLoop{};
+  std::chrono::nanoseconds SleptAmount{};
+
+  std::thread t([&Test, &SleptAmount, &ActualSpinLoop]() {
+    auto now = std::chrono::high_resolution_clock::now();
+    ActualSpinLoop.store(1);
+    FEXCore::Utils::FutexSpinWait::Wait(&Test, 1);
+    auto end = std::chrono::high_resolution_clock::now();
+    SleptAmount = end - now;
+  });
+
+  // Wait until the second thread lets us know to stop waiting sleeping.
+  while(ActualSpinLoop.load() == 0);
+
+  // sleep this thread for the sleep amount.
+  std::this_thread::sleep_for(SleepAmount);
+
+  // Set the futex
+  FEXCore::Utils::FutexSpinWait::lock(&Test);
+
+  // Wait for the thread to get done.
+  t.join();
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(SleptAmount >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-16bit") {
+  uint16_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  const bool GotValue = FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // Nothing ever stores the expected value, so the wait can only end by timing out.
+  REQUIRE_FALSE(GotValue);
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-32bit") {
+  uint32_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  const bool GotValue = FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // Nothing ever stores the expected value, so the wait can only end by timing out.
+  REQUIRE_FALSE(GotValue);
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-64bit") {
+  uint64_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  const bool GotValue = FEXCore::Utils::FutexSpinWait::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // Nothing ever stores the expected value, so the wait can only end by timing out.
+  REQUIRE_FALSE(GotValue);
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-TryLock-32bit") {
+  uint32_t Test{};
+
+  // An unlocked futex can be taken exactly once.
+  REQUIRE(FEXCore::Utils::FutexSpinWait::try_lock(&Test));
+  REQUIRE_FALSE(FEXCore::Utils::FutexSpinWait::try_lock(&Test));
+
+  // Unlocking makes it available again.
+  FEXCore::Utils::FutexSpinWait::unlock(&Test);
+  REQUIRE(FEXCore::Utils::FutexSpinWait::try_lock(&Test));
+}