Skip to content

Commit

Permalink
FEXCore: Implements an efficient spin-loop API
Browse files Browse the repository at this point in the history
This will only be used internally inside of FEXCore for efficient shared
code cache backpatch spin-loops.
  • Loading branch information
Sonicadvance1 committed Dec 17, 2023
1 parent 12923ba commit e51a16a
Show file tree
Hide file tree
Showing 4 changed files with 408 additions and 0 deletions.
1 change: 1 addition & 0 deletions FEXCore/Source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set (FEXCORE_BASE_SRCS
Utils/Allocator.cpp
Utils/CPUInfo.cpp
Utils/FileLoading.cpp
Utils/FutexSpinWait.cpp
Utils/ForcedAssert.cpp
Utils/LogManager.cpp
)
Expand Down
41 changes: 41 additions & 0 deletions FEXCore/Source/Utils/FutexSpinWait.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "Utils/FutexSpinWait.h"

#include <mutex>

namespace FEXCore::Utils {
#ifdef _M_ARM_64
// Out-of-class definitions for the static data members declared by FutexSpinWait.
// Init() fills in the two timer values and then sets StaticDataInitialized, all while holding StaticInitMutex.

// Set to true by Init() once the timer values below have been written.
std::atomic<bool> FEXCore::Utils::FutexSpinWait::StaticDataInitialized;
// Serializes Init() so the timer values are only computed once.
std::mutex FEXCore::Utils::FutexSpinWait::StaticInitMutex;
// Value returned by GetCycleCounterFrequency() (read from CNTFRQ_EL0).
uint32_t FEXCore::Utils::FutexSpinWait::CycleCounterFrequency;
// Result of CalculateCyclesPerNanosecond(CycleCounterFrequency), i.e. NanosecondsInSecond / frequency.
uint64_t FEXCore::Utils::FutexSpinWait::CyclesPerNanosecond;

// Number of nanoseconds in one second; the numerator used by CalculateCyclesPerNanosecond().
constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;

/**
 * @brief Reads the CNTFRQ_EL0 system register.
 *
 * The register is read through a 64-bit temporary and the value is truncated to 32 bits on return.
 *
 * @return The low 32 bits of CNTFRQ_EL0.
 */
static uint32_t GetCycleCounterFrequency() {
  uint64_t RawFrequency{};
  __asm("mrs %[Res], CNTFRQ_EL0"
    : [Res] "=r" (RawFrequency));
  return static_cast<uint32_t>(RawFrequency);
}

/**
 * @brief Converts a cycle counter frequency into the whole number of nanoseconds that one counter tick lasts.
 *
 * Despite the function name, the quotient computed here is `NanosecondsInSecond / CounterFrequency`,
 * which is nanoseconds per cycle rather than cycles per nanosecond. The name is kept for existing callers.
 *
 * Fairly trivial calculation but broken out for additional information.
 *
 * Snapdragon devices historically use a 19.2Mhz cycle counter frequency.
 * This means that one cycle lasts 52.0833... nanoseconds, which truncates to 52 here.
 *
 * ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz.
 * This means that one cycle lasts exactly 1 nanosecond.
 *
 * @param CounterFrequency Cycle counter frequency in Hz.
 * @return Nanoseconds per counter tick, never less than 1.
 */
static uint64_t CalculateCyclesPerNanosecond(uint64_t CounterFrequency) {
  // A frequency of zero would divide by zero here, and anything above 1Ghz would truncate to zero.
  // The result is used as a divisor when converting timeouts, so never hand back zero.
  // NOTE(review): falling back to 1 treats an unreported frequency like a 1Ghz counter - confirm this is the desired default.
  if (CounterFrequency == 0 || CounterFrequency >= NanosecondsInSecond) {
    return 1;
  }

  return NanosecondsInSecond / CounterFrequency;
}

void FEXCore::Utils::FutexSpinWait::Init() {
std::unique_lock lk {StaticInitMutex};
if (StaticDataInitialized == true) return;

CycleCounterFrequency = GetCycleCounterFrequency();
CyclesPerNanosecond = CalculateCyclesPerNanosecond(CycleCounterFrequency);
StaticDataInitialized = true;
}
#endif
}
281 changes: 281 additions & 0 deletions FEXCore/Source/Utils/FutexSpinWait.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
#include <atomic>
#include <chrono>
#include <cstdint>
#include <mutex>
#include <type_traits>

namespace FEXCore::Utils {
// Inline assembly bodies for one wait step, one per operand width (8, 16, 32 and 64 bits).
//
// Each string expects the surrounding asm statement to provide three named operands:
//   [Futex]  - register holding the address to watch.
//   [Tmp]    - scratch output written by the exclusive load.
//   [Result] - output that receives the value loaded after the WFE.
// The 8, 16 and 32-bit variants use the %w (32-bit) register view for Tmp and Result, the 64-bit variant uses %x.
//
// NOTE(review): Tmp is written by the first instruction while Futex is still needed by the last one,
// so statements using these bodies should bind Tmp as an early-clobber ("=&r") output.
#define SPINLOOP_8BIT " \
/* Prime the exclusive monitor with the passed in address. */ \
ldaxrb %w[Tmp], [%[Futex]]; \
/* WFE will wait for either the memory to change or spurious wake-up. */ \
wfe; \
/* Load with acquire to get the result of memory. */ \
ldarb %w[Result], [%[Futex]]; \
"
#define SPINLOOP_16BIT " \
/* Prime the exclusive monitor with the passed in address. */ \
ldaxrh %w[Tmp], [%[Futex]]; \
/* WFE will wait for either the memory to change or spurious wake-up. */ \
wfe; \
/* Load with acquire to get the result of memory. */ \
ldarh %w[Result], [%[Futex]]; \
"
#define SPINLOOP_32BIT " \
/* Prime the exclusive monitor with the passed in address. */ \
ldaxr %w[Tmp], [%[Futex]]; \
/* WFE will wait for either the memory to change or spurious wake-up. */ \
wfe; \
/* Load with acquire to get the result of memory. */ \
ldar %w[Result], [%[Futex]]; \
"
#define SPINLOOP_64BIT " \
/* Prime the exclusive monitor with the passed in address. */ \
ldaxr %x[Tmp], [%[Futex]]; \
/* WFE will wait for either the memory to change or spurious wake-up. */ \
wfe; \
/* Load with acquire to get the result of memory. */ \
ldar %x[Result], [%[Futex]]; \
"

/**
 * @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
 *
 * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
 * by putting the CPU in to a lower-power state using WFE.
 * On platforms tested, WFE will put the CPU in to a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time
 * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit
 * that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits.
 *
 * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
 *
 * It should be noted that the timed wait has a few dozen cycles of start-up time. Which means the overhead for invoking it is
 * slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient.
 *
 * On non-ARM platforms it is truly a spin-loop. Which is okay for debugging only.
 *
 * All functions are static and operate on a caller-owned integer that is accessed as a `std::atomic<T>`.
 * For the lock helpers a value of 0 means unlocked and 1 means locked.
 */
class FutexSpinWait final {
public:
  /**
   * @brief Blocks until `*Futex` compares equal to `ExpectedValue`.
   *
   * @param Futex Address of the value to watch. Must be 1, 2, 4 or 8 bytes wide on ARM64.
   * @param ExpectedValue Value to wait for.
   */
  template<typename T, typename TT>
  static void Wait(T *Futex, TT ExpectedValue) {
    // Early exit if possible.
    if (AsAtomic(Futex)->load() == ExpectedValue) return;

    // The untimed wait never reads the cycle counter, so no static initialization is required here.
    while (SpinOnce(Futex) != ExpectedValue) {
    }
  }

  /**
   * @brief Blocks until `*Futex` compares equal to `ExpectedValue` or until `Timeout` has elapsed.
   *
   * The value is always sampled at least once after the initial check, and a matching value wins over an
   * expired timeout.
   *
   * @param Futex Address of the value to watch. Must be 1, 2, 4 or 8 bytes wide on ARM64.
   * @param ExpectedValue Value to wait for.
   * @param Timeout Maximum time to wait. Zero or negative timeouts expire after the first re-check.
   * @return true if the expected value was observed, false if the timeout expired first.
   */
  template<typename T, typename TT>
  static bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
    // Early exit if possible.
    if (AsAtomic(Futex)->load() == ExpectedValue) return true;

#ifdef _M_ARM_64
    // Acquire so that the plain static timer data written by Init() is visible once the flag reads true.
    if (!StaticDataInitialized.load(std::memory_order_acquire)) [[unlikely]] Init();

    const uint64_t TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
    const uint64_t Begin = GetCycleCounter();
    const auto TimedOut = [&]() {
      return (GetCycleCounter() - Begin) >= TimeoutCycles;
    };
#else
    // A monotonic clock, so wall-clock adjustments can't stretch or cut short the timeout.
    const auto Begin = std::chrono::steady_clock::now();
    const auto TimedOut = [&]() {
      return (std::chrono::steady_clock::now() - Begin) >= Timeout;
    };
#endif

    for (;;) {
      // Check the value before the clock so a value that arrived in time is never reported as a timeout.
      if (SpinOnce(Futex) == ExpectedValue) {
        // We got our result.
        return true;
      }

      if (TimedOut()) {
        // Couldn't get value before timeout.
        return false;
      }
    }
  }

  /**
   * @brief Acquires the lock stored in `*Futex`, waiting until it can change the value from 0 to 1.
   *
   * @param Futex Address of the lock word.
   */
  template<typename T>
  static void lock(T *Futex) {
    std::atomic<T> *AtomicFutex = AsAtomic(Futex);
    const T Desired {1};

    for (;;) {
      // compare_exchange overwrites `Expected` with the observed value on failure, so it must be reset
      // to the unlocked value for every attempt. Reusing a stale `1` would let a 1 -> 1 exchange
      // "succeed" while another thread still owns the lock.
      T Expected{};
      if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;

      // Wait until the futex is unlocked.
      Wait(Futex, 0);
    }
  }

  /**
   * @brief Attempts to acquire the lock stored in `*Futex` without waiting.
   *
   * @param Futex Address of the lock word.
   * @return true if the value was changed from 0 to 1, false if it was not 0.
   */
  template<typename T>
  static bool try_lock(T *Futex) {
    T Expected{};
    const T Desired {1};
    return AsAtomic(Futex)->compare_exchange_strong(Expected, Desired);
  }

  /**
   * @brief Releases the lock stored in `*Futex` by storing 0.
   *
   * @param Futex Address of the lock word.
   */
  template<typename T>
  static void unlock(T *Futex) {
    AsAtomic(Futex)->store(T{});
  }

private:
  ///< Reinterprets the caller's storage as a std::atomic of the same type.
  /// NOTE(review): this relies on std::atomic<T> sharing the object representation of T; only the size is checked here.
  template<typename T>
  static std::atomic<T> *AsAtomic(T *Futex) {
    static_assert(sizeof(std::atomic<T>) == sizeof(T), "std::atomic<T> must be the same size as T");
    return reinterpret_cast<std::atomic<T>*>(Futex);
  }

  ///< Performs one wait step and returns the value of `*Futex` that was loaded afterwards.
  /// On ARM64 this is the exclusive-load + WFE + load-acquire sequence, elsewhere it is a plain atomic load.
  template<typename T>
  static T SpinOnce(T *Futex) {
#ifdef _M_ARM_64
    T Tmp{};
    T Result{};

    // Tmp is written by the first instruction while Futex is still needed by the last one, so the
    // outputs are early-clobber ("=&r") to stop the compiler from assigning them Futex's register.
    if constexpr (sizeof(T) == 1) {
      __asm volatile(SPINLOOP_8BIT
        : [Result] "=&r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 2) {
      __asm volatile(SPINLOOP_16BIT
        : [Result] "=&r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 4) {
      __asm volatile(SPINLOOP_32BIT
        : [Result] "=&r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 8) {
      __asm volatile(SPINLOOP_64BIT
        : [Result] "=&r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else {
      static_assert(!std::is_same_v<T, T>, "Invalid");
    }

    return Result;
#else
    return AsAtomic(Futex)->load();
#endif
  }

#ifdef _M_ARM_64
  // Static initialization
  static std::atomic<bool> StaticDataInitialized;
  static std::mutex StaticInitMutex;

  static uint32_t CycleCounterFrequency;
  // Used as the divisor that turns nanoseconds into cycles, so it holds the duration of one cycle in nanoseconds.
  static uint64_t CyclesPerNanosecond;

  static void Init();

  ///< Get the raw cycle counter which is synchronizing.
  /// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
  static uint64_t GetCycleCounter() {
    uint64_t Result{};
    __asm volatile(R"(
      isb;
      mrs %[Res], CNTVCT_EL0;
    )"
    : [Res] "=r" (Result));
    return Result;
  }

  ///< Converts nanoseconds to number of cycles.
  /// If the cycle counter is 1Ghz then this is a direct 1:1 map.
  /// Zero and negative durations convert to zero cycles instead of wrapping to a huge unsigned count.
  static uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
    const auto NanosecondCount = Nanoseconds.count();
    if (NanosecondCount <= 0) return 0;

    // Never divide by zero, even if the static data ended up with a zero value.
    const uint64_t Divisor = CyclesPerNanosecond != 0 ? CyclesPerNanosecond : 1;
    return static_cast<uint64_t>(NanosecondCount) / Divisor;
  }
#endif
};

#undef SPINLOOP_8BIT
#undef SPINLOOP_16BIT
#undef SPINLOOP_32BIT
#undef SPINLOOP_64BIT
}
Loading

0 comments on commit e51a16a

Please sign in to comment.