-
Notifications
You must be signed in to change notification settings - Fork 136
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FEXCore: Implements an efficient spin-loop API
This will only be used internally inside of FEXCore for efficient shared code-cache backpatch spin-loops.
- Loading branch information
1 parent
12923ba
commit e51a16a
Showing
4 changed files
with
408 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#include "Utils/FutexSpinWait.h" | ||
|
||
#include <mutex> | ||
|
||
namespace FEXCore::Utils { | ||
#ifdef _M_ARM_64 | ||
std::atomic<bool> FEXCore::Utils::FutexSpinWait::StaticDataInitialized; | ||
std::mutex FEXCore::Utils::FutexSpinWait::StaticInitMutex; | ||
uint32_t FEXCore::Utils::FutexSpinWait::CycleCounterFrequency; | ||
uint64_t FEXCore::Utils::FutexSpinWait::CyclesPerNanosecond; | ||
|
||
constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL; | ||
|
||
/// Reads the system counter frequency from the CNTFRQ_EL0 system register.
/// The register is read through a 64-bit general purpose register and the
/// value is then truncated to the 32-bit return type.
static uint32_t GetCycleCounterFrequency() {
  uint64_t RawFrequency {};
  __asm("mrs %[Res], CNTFRQ_EL0"
        : [Res] "=r"(RawFrequency));
  return static_cast<uint32_t>(RawFrequency);
}
|
||
static uint64_t CalculateCyclesPerNanosecond(uint64_t CounterFrequency) { | ||
// Fairly trivial calculation but broken out for additional information. | ||
// | ||
// Snapdragon devices historically use a 19.2Mhz cycle counter frequency | ||
// This means that the number of cycles per nanosecond ends up being 52.0833... | ||
// | ||
// ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz. | ||
// This means the number of cycles per nanosecond ends up being 1. | ||
return NanosecondsInSecond / CounterFrequency; | ||
} | ||
|
||
void FEXCore::Utils::FutexSpinWait::Init() { | ||
std::unique_lock lk {StaticInitMutex}; | ||
if (StaticDataInitialized == true) return; | ||
|
||
CycleCounterFrequency = GetCycleCounterFrequency(); | ||
CyclesPerNanosecond = CalculateCyclesPerNanosecond(CycleCounterFrequency); | ||
StaticDataInitialized = true; | ||
} | ||
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,281 @@ | ||
#include <atomic>
#include <chrono>
#include <cstdint>
#include <mutex>
#include <type_traits>
|
||
namespace FEXCore::Utils { | ||
/**
 * @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
 *
 * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
 * by putting the CPU into a lower-power state using WFE.
 * On platforms tested, WFE will put the CPU into a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time
 * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit
 * that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits.
 *
 * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
 *
 * It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this implementation is
 * slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient.
 *
 * On non-ARM platforms it is truly a spin-loop. Which is okay for debugging only.
 *
 * All entry points take a plain `T*` and access it through `std::atomic<T>`; on the ARM path `T` must be 1, 2, 4 or 8 bytes wide.
 */
class FutexSpinWait final {
public:
  /**
   * @brief Blocks until `*Futex` is observed to be equal to `ExpectedValue`.
   *
   * @param Futex Address of the value to watch.
   * @param ExpectedValue Value to wait for; compared with `==` against the loaded `T`.
   */
  template<typename T, typename TT>
  static void Wait(T *Futex, TT ExpectedValue) {
    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

    // Early exit if possible.
    if (AtomicFutex->load() == ExpectedValue) return;

    // No timing data is needed for the untimed wait, so Init() isn't required here.
    while (WaitForUpdate(Futex) != ExpectedValue) {
    }
  }

  /**
   * @brief Waits until `*Futex` equals `ExpectedValue` or until `Timeout` has elapsed.
   *
   * The value is always checked before the timeout, so observing the expected value
   * on the final iteration is reported as success. A zero or negative timeout still
   * performs at least one check of the value.
   *
   * @param Futex Address of the value to watch.
   * @param ExpectedValue Value to wait for.
   * @param Timeout Maximum time to wait.
   * @return true if the expected value was observed, false if the timeout expired first.
   */
  template<typename T, typename TT>
  static bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

    // Early exit if possible.
    if (AtomicFutex->load() == ExpectedValue) return true;

#ifdef _M_ARM_64
    // Acquire pairs with the store in Init() so the frequency data written before
    // the flag was set is visible to this thread.
    if (!StaticDataInitialized.load(std::memory_order_acquire)) [[unlikely]] Init();

    const uint64_t TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
    const uint64_t Begin = GetCycleCounter();
    const auto TimedOut = [&]() { return (GetCycleCounter() - Begin) >= TimeoutCycles; };
#else
    // steady_clock is monotonic; a wall-clock adjustment must not affect the timeout.
    const auto Begin = std::chrono::steady_clock::now();
    const auto TimedOut = [&]() { return (std::chrono::steady_clock::now() - Begin) >= Timeout; };
#endif

    for (;;) {
      // We got our result.
      if (WaitForUpdate(Futex) == ExpectedValue) return true;

      // Couldn't get value before timeout.
      if (TimedOut()) return false;
    }
  }

  /**
   * @brief Acquires the lock stored at `Futex` (0 = unlocked, 1 = locked), waiting as long as necessary.
   */
  template<typename T>
  static void lock(T *Futex) {
    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
    const T Desired {1};

    for (;;) {
      // compare_exchange overwrites `Expected` with the current value on failure,
      // so it must be reset to "unlocked" for every attempt. Reusing a stale value
      // of 1 would let a 1 -> 1 exchange "succeed" while another thread holds the lock.
      T Expected {};
      if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;

      // Wait until the futex is unlocked.
      Wait(Futex, 0);
    }
  }

  /**
   * @brief Attempts to acquire the lock stored at `Futex` without waiting.
   * @return true if the lock was acquired, false if it was already held.
   */
  template<typename T>
  static bool try_lock(T *Futex) {
    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
    T Expected {};
    const T Desired {1};
    return AtomicFutex->compare_exchange_strong(Expected, Desired);
  }

  /**
   * @brief Releases the lock stored at `Futex` by storing zero.
   */
  template<typename T>
  static void unlock(T *Futex) {
    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
    AtomicFutex->store(0);
  }

private:
  /**
   * @brief Performs one iteration of the wait loop and returns the value loaded from `*Futex`.
   *
   * On ARM64 this primes the exclusive monitor with the futex address, executes WFE
   * (which returns once the memory changes or on a spurious wake-up) and then loads
   * the value with acquire semantics. Elsewhere it is a plain atomic load.
   */
  template<typename T>
  static T WaitForUpdate(T *Futex) {
#ifdef _M_ARM_64
    static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "Invalid");

    [[maybe_unused]] T Tmp {};
    T Result {};

    // `Tmp` is written by the first instruction while `Futex` is still needed by the
    // last one, so it is marked early-clobber ("=&r") to stop the compiler from
    // placing both in the same register.
    if constexpr (sizeof(T) == 1) {
      __asm volatile(
        "ldaxrb %w[Tmp], [%[Futex]];\n"    // Prime the exclusive monitor.
        "wfe;\n"                           // Wait for a change or spurious wake-up.
        "ldarb %w[Result], [%[Futex]];\n"  // Load the result with acquire.
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 2) {
      __asm volatile(
        "ldaxrh %w[Tmp], [%[Futex]];\n"
        "wfe;\n"
        "ldarh %w[Result], [%[Futex]];\n"
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 4) {
      __asm volatile(
        "ldaxr %w[Tmp], [%[Futex]];\n"
        "wfe;\n"
        "ldar %w[Result], [%[Futex]];\n"
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else {
      __asm volatile(
        "ldaxr %x[Tmp], [%[Futex]];\n"
        "wfe;\n"
        "ldar %x[Result], [%[Futex]];\n"
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }

    return Result;
#else
    return reinterpret_cast<std::atomic<T>*>(Futex)->load();
#endif
  }

#ifdef _M_ARM_64
  // Static initialization
  static std::atomic<bool> StaticDataInitialized;
  static std::mutex StaticInitMutex;

  static uint32_t CycleCounterFrequency;
  static uint64_t CyclesPerNanosecond;

  static void Init();

  ///< Get the raw cycle counter which is synchronizing.
  /// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
  static uint64_t GetCycleCounter() {
    uint64_t Result {};
    __asm volatile(R"(
      isb;
      mrs %[Res], CNTVCT_EL0;
    )"
    : [Res] "=r" (Result));
    return Result;
  }

  ///< Converts nanoseconds to number of cycles.
  /// If the cycle counter is 1GHz then this is a direct 1:1 map.
  /// Zero and negative durations convert to zero cycles. The conversion uses the
  /// counter frequency directly (cycles = ns * Hz / 1e9), split into whole seconds
  /// and a remainder so the intermediate product cannot overflow for the remainder.
  static uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
    const auto NanosecondCount = Nanoseconds.count();
    if (NanosecondCount <= 0) return 0;

    constexpr uint64_t NanosecondsPerSecond = 1'000'000'000ULL;
    const uint64_t Total = static_cast<uint64_t>(NanosecondCount);
    const uint64_t Frequency = CycleCounterFrequency;

    const uint64_t WholeSecondCycles = (Total / NanosecondsPerSecond) * Frequency;
    const uint64_t RemainderCycles = ((Total % NanosecondsPerSecond) * Frequency) / NanosecondsPerSecond;
    return WholeSecondCycles + RemainderCycles;
  }
#endif
};
} |
Oops, something went wrong.