-
Notifications
You must be signed in to change notification settings - Fork 136
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FEXCore: Implements an efficient spin-loop API
This will only be used internally inside of FEXCore for efficient shared codecache backpatch spin-loops.
- Loading branch information
1 parent
68d6cf5
commit 567666c
Showing
4 changed files
with
379 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#include "Utils/FutexSpinWait.h" | ||
|
||
namespace FEXCore::Utils::FutexSpinWait { | ||
#ifdef _M_ARM_64 | ||
// One second expressed in nanoseconds; the numerator when turning the counter frequency into a period.
constexpr uint64_t NanosecondsInSecond {1'000'000'000ULL};
|
||
/// Reads the CNTFRQ_EL0 system register and returns its value narrowed to 32 bits.
static uint32_t GetCycleCounterFrequency() {
  uint64_t Frequency {};
  __asm("mrs %[Res], CNTFRQ_EL0"
        : [Res] "=r"(Frequency));
  // MRS fills a full 64-bit register; the return type keeps only the low 32 bits.
  return static_cast<uint32_t>(Frequency);
}
|
||
static uint64_t CalculateCyclesPerNanosecond() { | ||
// Snapdragon devices historically use a 19.2Mhz cycle counter frequency | ||
// This means that the number of cycles per nanosecond ends up being 52.0833... | ||
// | ||
// ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz. | ||
// This means the number of cycles per nanosecond ends up being 1. | ||
uint64_t CounterFrequency = GetCycleCounterFrequency(); | ||
return NanosecondsInSecond / CounterFrequency; | ||
} | ||
|
||
uint32_t CycleCounterFrequency = GetCycleCounterFrequency(); | ||
uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond(); | ||
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,266 @@ | ||
#include <atomic> | ||
#include <chrono> | ||
#include <mutex> | ||
#include <type_traits> | ||
|
||
namespace FEXCore::Utils::FutexSpinWait { | ||
/** | ||
* @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces. | ||
* | ||
* Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact | ||
* by putting the CPU in to a lower-power state using WFE. | ||
* On platforms tested, WFE will put the CPU in to a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time | ||
* but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit | ||
* that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits. | ||
* | ||
* FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using. | ||
* | ||
* It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this implementation is | ||
* slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient. | ||
* | ||
* On non-ARM platforms it is truly a spin-loop, which is okay for debugging only. | ||
*/ | ||
#ifdef _M_ARM_64 | ||
|
||
// Builds the inline-assembly template for one wait iteration out of adjacent string literals.
//   LoadExclusiveOp - load-exclusive mnemonic used to arm the exclusive monitor.
//   LoadAtomicOp    - load-acquire mnemonic used to read the value after waking.
//   RegSize         - operand modifier (`w` or `x`) selecting the register width.
// The template expects the asm operands [Tmp], [Result] and [Futex].
// The explanatory comments live at preprocessor level so they are never handed to the assembler.
#define SPINLOOP_BODY(LoadExclusiveOp, LoadAtomicOp, RegSize) \
  /* Prime the exclusive monitor with the passed in address. */ \
  #LoadExclusiveOp " %" #RegSize "[Tmp], [%[Futex]]; " \
  /* WFE will wait for either the memory to change or spurious wake-up. */ \
  "wfe; " \
  /* Load with acquire to get the result of memory. */ \
  #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; "

// One template per supported operand size.
#define SPINLOOP_8BIT SPINLOOP_BODY(ldaxrb, ldarb, w)
#define SPINLOOP_16BIT SPINLOOP_BODY(ldaxrh, ldarh, w)
#define SPINLOOP_32BIT SPINLOOP_BODY(ldaxr, ldar, w)
#define SPINLOOP_64BIT SPINLOOP_BODY(ldaxr, ldar, x)
|
||
// Globals consumed by the timed wait; their definitions live outside this header.
/// Divisor applied by ConvertNanosecondsToCycles() to a nanosecond count.
extern uint64_t CyclesPerNanosecond;
/// Frequency of the cycle counter.
extern uint32_t CycleCounterFrequency;
|
||
/// Returns the current value of the CNTVCT_EL0 counter register.
/// An `isb` is issued immediately before the read, which is what makes this read synchronizing.
/// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
static inline uint64_t GetCycleCounter() {
  uint64_t Cycles {};
  __asm volatile(R"(
  isb;
  mrs %[Res], CNTVCT_EL0;
  )"
                 : [Res] "=r"(Cycles));
  return Cycles;
}
|
||
///< Converts nanoseconds to number of cycles. | ||
/// If the cycle counter is 1Ghz then this is a direct 1:1 map. | ||
static inline uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) { | ||
const auto NanosecondCount = Nanoseconds.count(); | ||
return NanosecondCount / CyclesPerNanosecond; | ||
} | ||
|
||
template<typename T, typename TT = T> | ||
static inline void Wait(T *Futex, TT ExpectedValue) { | ||
std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex); | ||
T Tmp{}; | ||
T Result = AtomicFutex->load(); | ||
|
||
// Early exit if possible. | ||
if (Result == ExpectedValue) return; | ||
|
||
do { | ||
if constexpr (sizeof(T) == 1) { | ||
__asm volatile(SPINLOOP_8BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else if constexpr (sizeof(T) == 2) { | ||
__asm volatile(SPINLOOP_16BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else if constexpr (sizeof(T) == 4) { | ||
__asm volatile(SPINLOOP_32BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else if constexpr (sizeof(T) == 8) { | ||
__asm volatile(SPINLOOP_64BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else { | ||
static_assert(!std::is_same_v<T, T>, "Invalid"); | ||
} | ||
} while (Result != ExpectedValue); | ||
} | ||
|
||
template | ||
void Wait<uint8_t>(uint8_t*, uint8_t); | ||
template | ||
void Wait<uint16_t>(uint16_t*, uint16_t); | ||
template | ||
void Wait<uint32_t>(uint32_t*, uint32_t); | ||
template | ||
void Wait<uint64_t>(uint64_t*, uint64_t); | ||
|
||
template<typename T, typename TT> | ||
static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) { | ||
std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex); | ||
|
||
T Tmp{}; | ||
T Result = AtomicFutex->load(); | ||
|
||
// Early exit if possible. | ||
if (Result == ExpectedValue) return true; | ||
|
||
const auto TimeoutCycles = ConvertNanosecondsToCycles(Timeout); | ||
const auto Begin = GetCycleCounter(); | ||
|
||
do { | ||
if constexpr (sizeof(T) == 1) { | ||
__asm volatile(SPINLOOP_8BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else if constexpr (sizeof(T) == 2) { | ||
__asm volatile(SPINLOOP_16BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else if constexpr (sizeof(T) == 4) { | ||
__asm volatile(SPINLOOP_32BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else if constexpr (sizeof(T) == 8) { | ||
__asm volatile(SPINLOOP_64BIT | ||
: [Result] "=r" (Result) | ||
, [Tmp] "=r" (Tmp) | ||
: [Futex] "r" (Futex) | ||
, [ExpectedValue] "r" (ExpectedValue) | ||
: "memory"); | ||
} | ||
else { | ||
static_assert(!std::is_same_v<T, T>, "Invalid"); | ||
} | ||
|
||
const auto CurrentCycleCounter = GetCycleCounter(); | ||
if ((CurrentCycleCounter - Begin) >= TimeoutCycles) { | ||
// Couldn't get value before timeout. | ||
return false; | ||
} | ||
} while (Result != ExpectedValue); | ||
|
||
// We got our result. | ||
return true; | ||
} | ||
|
||
template | ||
bool Wait<uint8_t>(uint8_t*, uint8_t, std::chrono::nanoseconds const &); | ||
template | ||
bool Wait<uint16_t>(uint16_t*, uint16_t, std::chrono::nanoseconds const &); | ||
template | ||
bool Wait<uint32_t>(uint32_t*, uint32_t, std::chrono::nanoseconds const &); | ||
template | ||
bool Wait<uint64_t>(uint64_t*, uint64_t, std::chrono::nanoseconds const &); | ||
|
||
#else | ||
/**
 * @brief Spins until `*Futex` is observed to be equal to `ExpectedValue`.
 *
 * This is a plain busy loop over an atomic load; it returns immediately when the value already matches.
 *
 * @tparam T  Type of the futex word.
 * @tparam TT Type of the value to compare against; defaults to `T`.
 * @param Futex Address of the word to watch.
 * @param ExpectedValue Value that ends the wait.
 */
template<typename T, typename TT = T>
static inline void Wait(T *Futex, TT ExpectedValue) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

  // The first load doubles as the early-exit check.
  while (AtomicFutex->load() != ExpectedValue) {
  }
}
|
||
/**
 * @brief Spins until `*Futex` is observed to be equal to `ExpectedValue`, or until `Timeout` has elapsed.
 *
 * Elapsed time is measured with `std::chrono::steady_clock` so that wall-clock adjustments cannot
 * shorten or lengthen the wait.
 *
 * @tparam T  Type of the futex word.
 * @tparam TT Type of the value to compare against.
 * @param Futex Address of the word to watch.
 * @param ExpectedValue Value that ends the wait successfully.
 * @param Timeout Maximum time to wait.
 * @return true if the expected value was observed, false if the timeout elapsed first.
 */
template<typename T, typename TT>
static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

  // Early exit if possible.
  if (AtomicFutex->load() == ExpectedValue) return true;

  const auto Begin = std::chrono::steady_clock::now();

  for (;;) {
    // Check the value before the deadline so a match on the final iteration is not reported as a timeout.
    if (AtomicFutex->load() == ExpectedValue) {
      // We got our result.
      return true;
    }

    if ((std::chrono::steady_clock::now() - Begin) >= Timeout) {
      // Couldn't get value before timeout.
      return false;
    }
  }
}
#endif | ||
|
||
/**
 * @brief Acquires the spin-lock stored at `Futex` by atomically changing it from 0 to 1.
 *
 * An uncontended lock is taken with a single compare-exchange. Otherwise the function waits with
 * Wait() until the word reads 0 and then retries the compare-exchange, looping until it succeeds.
 *
 * @tparam T Type of the futex word.
 * @param Futex Address of the lock word; 0 means unlocked, 1 means locked.
 */
template<typename T>
static inline void lock(T *Futex) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
  const T Desired {1};
  T Expected{};

  // Try to CAS immediately.
  if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;

  do {
    // Wait until the futex is unlocked.
    Wait(Futex, 0);

    // A failed compare-exchange stores the value it observed into Expected. Reset it so the retry
    // only succeeds on an unlocked (zero) word; otherwise it could "succeed" by swapping 1 for 1
    // while another thread still holds the lock.
    Expected = T{};
  } while (!AtomicFutex->compare_exchange_strong(Expected, Desired));
}
|
||
/**
 * @brief Makes a single attempt to take the spin-lock stored at `Futex`.
 *
 * @tparam T Type of the futex word.
 * @param Futex Address of the lock word.
 * @return true if the word was atomically changed from 0 to 1, false if the compare-exchange failed.
 */
template<typename T>
static inline bool try_lock(T *Futex) {
  auto *LockWord = reinterpret_cast<std::atomic<T>*>(Futex);
  T Unlocked{};
  const T Locked {1};

  // One strong compare-exchange; its outcome is the answer.
  return LockWord->compare_exchange_strong(Unlocked, Locked);
}
|
||
/**
 * @brief Releases the spin-lock stored at `Futex` by atomically storing 0 to it.
 *
 * @tparam T Type of the futex word.
 * @param Futex Address of the lock word.
 */
template<typename T>
static inline void unlock(T *Futex) {
  reinterpret_cast<std::atomic<T>*>(Futex)->store(0);
}
|
||
// Keep the assembly helper macros private to this header, including the SPINLOOP_BODY builder.
#undef SPINLOOP_BODY
#undef SPINLOOP_8BIT
#undef SPINLOOP_16BIT
#undef SPINLOOP_32BIT
#undef SPINLOOP_64BIT
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#include "Utils/FutexSpinWait.h" | ||
#include <catch2/catch.hpp> | ||
#include <chrono> | ||
#include <thread> | ||
|
||
// Timeout handed to every timed-wait test case in this file.
constexpr std::chrono::milliseconds SleepAmount {250};
|
||
TEST_CASE("FutexSpin-Timed-8bit") {
  // The word stays zero, so the timed wait for the value 1 can only end by timing out.
  uint8_t Futex {};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
  const auto Minimum = std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount);
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= Minimum);
}
|
||
TEST_CASE("FutexSpin-Sleep-8bit") {
  // How long the main thread sleeps before it sets the futex.
  constexpr auto MainThreadSleep = std::chrono::seconds(1);

  uint8_t Futex {};
  std::atomic<uint8_t> WaiterStarted {};
  std::chrono::nanoseconds WaiterElapsed;

  std::thread Waiter([&Futex, &WaiterElapsed, &WaiterStarted]() {
    const auto Start = std::chrono::high_resolution_clock::now();
    // Tell the main thread that the timestamp has been taken and the wait is about to begin.
    WaiterStarted.store(1);
    FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1);
    WaiterElapsed = std::chrono::high_resolution_clock::now() - Start;
  });

  // Spin until the second thread signals that it has started.
  while (WaiterStarted.load() == 0) {
  }

  // Sleep this thread for the sleep amount.
  std::this_thread::sleep_for(MainThreadSleep);

  // Set the futex; lock() changes it from 0 to 1, which is the value the waiter expects.
  FEXCore::Utils::FutexSpinWait::lock(&Futex);

  // Wait for the thread to get done.
  Waiter.join();

  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
  const auto Minimum = std::chrono::duration_cast<std::chrono::nanoseconds>(MainThreadSleep);
  REQUIRE(WaiterElapsed >= Minimum);
}
|
||
TEST_CASE("FutexSpin-Timed-16bit") {
  // The word stays zero, so the timed wait for the value 1 can only end by timing out.
  uint16_t Futex {};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
  const auto Minimum = std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount);
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= Minimum);
}
|
||
TEST_CASE("FutexSpin-Timed-32bit") {
  // The word stays zero, so the timed wait for the value 1 can only end by timing out.
  uint32_t Futex {};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
  const auto Minimum = std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount);
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= Minimum);
}
|
||
TEST_CASE("FutexSpin-Timed-64bit") {
  // The word stays zero, so the timed wait for the value 1 can only end by timing out.
  uint64_t Futex {};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
  const auto Minimum = std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount);
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= Minimum);
}