Skip to content

Commit

Permalink
FEXCore: Implements an efficient spin-loop API
Browse files Browse the repository at this point in the history
This will only be used internally inside of FEXCore for efficient shared
codecache backpatch spin-loops.
  • Loading branch information
Sonicadvance1 committed Dec 18, 2023
1 parent 68d6cf5 commit 567666c
Show file tree
Hide file tree
Showing 4 changed files with 379 additions and 0 deletions.
1 change: 1 addition & 0 deletions FEXCore/Source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set (FEXCORE_BASE_SRCS
Utils/Allocator.cpp
Utils/CPUInfo.cpp
Utils/FileLoading.cpp
Utils/FutexSpinWait.cpp
Utils/ForcedAssert.cpp
Utils/LogManager.cpp
)
Expand Down
27 changes: 27 additions & 0 deletions FEXCore/Source/Utils/FutexSpinWait.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "Utils/FutexSpinWait.h"

namespace FEXCore::Utils::FutexSpinWait {
#ifdef _M_ARM_64
// Number of nanoseconds in one second; used below as the numerator when dividing by the counter frequency.
constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;

// Reads the CNTFRQ_EL0 system register and returns it as the cycle counter frequency.
// The register is read into a 64-bit temporary and implicitly narrowed to the 32-bit return type.
static uint32_t GetCycleCounterFrequency() {
uint64_t Result{};
__asm("mrs %[Res], CNTFRQ_EL0"
: [Res] "=r" (Result));
return Result;
}

// Returns the length of one cycle counter tick in whole nanoseconds, never less than 1.
//
// Despite the name, the value is "nanoseconds per cycle" (NanosecondsInSecond / frequency),
// truncated towards zero. Callers divide a nanosecond count by this value to get a cycle count,
// so a result of zero must never escape from here.
static uint64_t CalculateCyclesPerNanosecond() {
  // Snapdragon devices historically use a 19.2Mhz cycle counter frequency.
  // This means that one cycle lasts 52.0833... nanoseconds, which truncates to 52.
  //
  // ARMv8.6 and ARMv9.1 requires the cycle counter frequency to be 1Ghz.
  // This means that one cycle lasts exactly 1 nanosecond.
  const uint64_t CounterFrequency = GetCycleCounterFrequency();

  // A frequency register that reads as zero would otherwise be a divide-by-zero right here.
  if (CounterFrequency == 0) {
    return 1;
  }

  // Counters faster than 1Ghz truncate to zero; clamp so later divisions by this value stay defined.
  const uint64_t NanosecondsPerCycle = NanosecondsInSecond / CounterFrequency;
  return NanosecondsPerCycle != 0 ? NanosecondsPerCycle : 1;
}

// Globals initialized at start-up from the helpers in this file.
// CycleCounterFrequency holds the value read from CNTFRQ_EL0.
// CyclesPerNanosecond holds NanosecondsInSecond / frequency, i.e. whole nanoseconds per counter cycle.
uint32_t CycleCounterFrequency = GetCycleCounterFrequency();
uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond();
#endif
}
266 changes: 266 additions & 0 deletions FEXCore/Source/Utils/FutexSpinWait.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
#include <atomic>
#include <chrono>
#include <mutex>
#include <type_traits>

namespace FEXCore::Utils::FutexSpinWait {
/**
* @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
*
* Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
* by putting the CPU in to a lower-power state using WFE.
* On platforms tested, WFE will put the CPU in to a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time
* but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit
* that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits.
*
* FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
*
* It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this implementation is
* slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient.
*
* On non-ARM platforms it is truly a spin-loop, which is okay for debugging only.
*/
#ifdef _M_ARM_64

// Builds the inline-assembly template string used by the Wait() overloads.
//   LoadExclusiveOp: exclusive-load mnemonic that primes the exclusive monitor on [Futex].
//   LoadAtomicOp:    load-acquire mnemonic that re-reads [Futex] after the WFE.
//   RegSize:         operand modifier (`w` or `x`) applied to the Tmp and Result operands.
// The expansion expects the enclosing asm statement to provide operands named Tmp, Result and Futex.
// NOTE(review): after line splicing the second and third /* */ comments appear to sit inside the
// string literal, so they would be handed to the assembler rather than stripped by the preprocessor - confirm.
#define SPINLOOP_BODY(LoadExclusiveOp, LoadAtomicOp, RegSize) \
/* Prime the exclusive monitor with the passed in address. */ \
#LoadExclusiveOp " %" #RegSize "[Tmp], [%[Futex]]; \
/* WFE will wait for either the memory to change or spurious wake-up. */ \
wfe; \
/* Load with acquire to get the result of memory. */ \
" #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; "

// Per-width instantiations of the template: byte, halfword, 32-bit word and 64-bit doubleword.
#define SPINLOOP_8BIT SPINLOOP_BODY(ldaxrb, ldarb, w)
#define SPINLOOP_16BIT SPINLOOP_BODY(ldaxrh, ldarh, w)
#define SPINLOOP_32BIT SPINLOOP_BODY(ldaxr, ldar, w)
#define SPINLOOP_64BIT SPINLOOP_BODY(ldaxr, ldar, x)

// Declarations of the counter-frequency globals; their definitions live in FutexSpinWait.cpp.
// CyclesPerNanosecond is used as the divisor when converting a nanosecond count to counter cycles.
extern uint32_t CycleCounterFrequency;
extern uint64_t CyclesPerNanosecond;

/// Get the raw cycle counter which is synchronizing.
/// An `isb` is issued immediately before `CNTVCT_EL0` is read.
/// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
/// @return The value read from `CNTVCT_EL0`.
static inline uint64_t GetCycleCounter() {
uint64_t Result{};
__asm volatile(R"(
isb;
mrs %[Res], CNTVCT_EL0;
)"
: [Res] "=r" (Result));
return Result;
}

///< Converts nanoseconds to number of cycles.
/// If the cycle counter is 1Ghz then this is a direct 1:1 map.
static inline uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
const auto NanosecondCount = Nanoseconds.count();
return NanosecondCount / CyclesPerNanosecond;
}

/**
 * @brief Blocks until the value at `Futex` compares equal to `ExpectedValue`.
 *
 * The value is first checked with a plain atomic load. If it doesn't match, the loop primes the
 * exclusive monitor on the address, executes WFE and then re-reads the value with a load-acquire.
 *
 * @tparam T Futex storage type; must be 1, 2, 4 or 8 bytes wide.
 * @tparam TT Type of the value to compare against.
 * @param Futex Address to watch.
 * @param ExpectedValue Value that ends the wait.
 *
 * NOTE(review): the value returned by the exclusive load is not inspected before WFE, so a store that
 * lands between the previous check and priming the monitor presumably relies on a spurious wake-up
 * to be noticed - confirm this is acceptable on all targets.
 */
template<typename T, typename TT = T>
static inline void Wait(T *Futex, TT ExpectedValue) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
  T Tmp{};
  T Result = AtomicFutex->load();

  // Early exit if possible.
  if (Result == ExpectedValue) return;

  // `Tmp` is written by the first instruction of the sequence while `Futex` is still read by the last one.
  // It must therefore be early-clobber ("=&r") so the compiler never places it in the same register as `Futex`.
  // The expected value is only compared in C++, so it isn't passed in to the asm statement.
  do {
    if constexpr (sizeof(T) == 1) {
      __asm volatile(SPINLOOP_8BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 2) {
      __asm volatile(SPINLOOP_16BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 4) {
      __asm volatile(SPINLOOP_32BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 8) {
      __asm volatile(SPINLOOP_64BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else {
      // Dependent-false assert: only fires if an unsupported width is actually instantiated.
      static_assert(!std::is_same_v<T, T>, "Invalid");
    }
  } while (Result != ExpectedValue);
}

// Explicit instantiations of the untimed Wait() for the unsigned 8, 16, 32 and 64-bit integer types.
template
void Wait<uint8_t>(uint8_t*, uint8_t);
template
void Wait<uint16_t>(uint16_t*, uint16_t);
template
void Wait<uint32_t>(uint32_t*, uint32_t);
template
void Wait<uint64_t>(uint64_t*, uint64_t);

/**
 * @brief Blocks until the value at `Futex` compares equal to `ExpectedValue` or `Timeout` elapses.
 *
 * The value is first checked with a plain atomic load. If it doesn't match, the loop primes the
 * exclusive monitor on the address, executes WFE and then re-reads the value with a load-acquire.
 * Elapsed time is measured with the cycle counter.
 *
 * @tparam T Futex storage type; must be 1, 2, 4 or 8 bytes wide.
 * @tparam TT Type of the value to compare against.
 * @param Futex Address to watch.
 * @param ExpectedValue Value that ends the wait.
 * @param Timeout Maximum time to wait.
 * @return true if the expected value was observed, false if the timeout elapsed first.
 *
 * NOTE(review): the value returned by the exclusive load is not inspected before WFE, so a store that
 * lands between the previous check and priming the monitor presumably relies on a spurious wake-up
 * to be noticed - confirm this is acceptable on all targets.
 */
template<typename T, typename TT>
static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

  T Tmp{};
  T Result = AtomicFutex->load();

  // Early exit if possible.
  if (Result == ExpectedValue) return true;

  const auto TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
  const auto Begin = GetCycleCounter();

  // `Tmp` is written by the first instruction of the sequence while `Futex` is still read by the last one.
  // It must therefore be early-clobber ("=&r") so the compiler never places it in the same register as `Futex`.
  // The expected value is only compared in C++, so it isn't passed in to the asm statement.
  do {
    if constexpr (sizeof(T) == 1) {
      __asm volatile(SPINLOOP_8BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 2) {
      __asm volatile(SPINLOOP_16BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 4) {
      __asm volatile(SPINLOOP_32BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else if constexpr (sizeof(T) == 8) {
      __asm volatile(SPINLOOP_64BIT
        : [Result] "=r" (Result)
        , [Tmp] "=&r" (Tmp)
        : [Futex] "r" (Futex)
        : "memory");
    }
    else {
      // Dependent-false assert: only fires if an unsupported width is actually instantiated.
      static_assert(!std::is_same_v<T, T>, "Invalid");
    }

    // Check the value before the clock: a value that was observed must be reported as success
    // even if the timeout expired while waiting for it.
    if (Result == ExpectedValue) {
      // We got our result.
      return true;
    }
  } while ((GetCycleCounter() - Begin) < TimeoutCycles);

  // Couldn't get value before timeout.
  return false;
}

// Explicit instantiations of the timed Wait() for the unsigned 8, 16, 32 and 64-bit integer types.
template
bool Wait<uint8_t>(uint8_t*, uint8_t, std::chrono::nanoseconds const &);
template
bool Wait<uint16_t>(uint16_t*, uint16_t, std::chrono::nanoseconds const &);
template
bool Wait<uint32_t>(uint32_t*, uint32_t, std::chrono::nanoseconds const &);
template
bool Wait<uint64_t>(uint64_t*, uint64_t, std::chrono::nanoseconds const &);

#else
/**
 * @brief Generic fallback: busy-waits until the value at `Futex` compares equal to `ExpectedValue`.
 *
 * This is a plain spin on an atomic load with no power saving.
 *
 * @tparam T Futex storage type.
 * @tparam TT Type of the value to compare against.
 * @param Futex Address to watch.
 * @param ExpectedValue Value that ends the wait.
 */
template<typename T, typename TT>
static inline void Wait(T *Futex, TT ExpectedValue) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

  // The first iteration doubles as the early-exit check.
  while (AtomicFutex->load() != ExpectedValue) {
    // Spin.
  }
}

/**
 * @brief Generic fallback: busy-waits until the value at `Futex` equals `ExpectedValue` or `Timeout` elapses.
 *
 * Elapsed time is measured with std::chrono::steady_clock so that wall-clock adjustments can
 * neither shorten nor extend the wait.
 *
 * @tparam T Futex storage type.
 * @tparam TT Type of the value to compare against.
 * @param Futex Address to watch.
 * @param ExpectedValue Value that ends the wait.
 * @param Timeout Maximum time to wait.
 * @return true if the expected value was observed, false if the timeout elapsed first.
 */
template<typename T, typename TT>
static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);

  // Early exit if possible.
  if (AtomicFutex->load() == ExpectedValue) return true;

  const auto Begin = std::chrono::steady_clock::now();

  do {
    // Check the value before the clock: a value that was observed must be reported as success
    // even if the timeout expired while waiting for it.
    if (AtomicFutex->load() == ExpectedValue) {
      // We got our result.
      return true;
    }
  } while ((std::chrono::steady_clock::now() - Begin) < Timeout);

  // Couldn't get value before timeout.
  return false;
}
#endif

/**
 * @brief Acquires the spin-lock stored at `Futex`, waiting until it can be taken.
 *
 * The lock is free when the value is 0 and held when it is 1. Acquisition is a 0 -> 1
 * compare-exchange; between failed attempts the caller waits for the value to read as 0.
 *
 * @tparam T Futex storage type.
 * @param Futex Address of the lock word.
 */
template<typename T>
static inline void lock(T *Futex) {
  std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
  const T Desired {1};

  for (;;) {
    // compare_exchange_strong overwrites `Expected` with the observed value when it fails.
    // Reset it for every attempt: reusing a stale `1` would let a 1 -> 1 exchange "succeed"
    // while another thread owns the lock.
    T Expected{};

    // Try to CAS immediately.
    if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;

    // Wait until the futex is unlocked.
    Wait(Futex, 0);
  }
}

/**
 * @brief Makes a single attempt to acquire the spin-lock stored at `Futex`.
 *
 * @tparam T Futex storage type.
 * @param Futex Address of the lock word.
 * @return true if the value was swapped from 0 to 1, false otherwise.
 */
template<typename T>
static inline bool try_lock(T *Futex) {
  T Unlocked{};
  T Locked {1};

  // One CAS and no retry; the outcome of the exchange is the outcome of the call.
  return reinterpret_cast<std::atomic<T>*>(Futex)->compare_exchange_strong(Unlocked, Locked);
}

/**
 * @brief Releases the spin-lock stored at `Futex` by atomically storing 0 to it.
 *
 * @tparam T Futex storage type.
 * @param Futex Address of the lock word.
 */
template<typename T>
static inline void unlock(T *Futex) {
  reinterpret_cast<std::atomic<T>*>(Futex)->store(0);
}

// The asm-template helper macros are private to this header; undefine every one of them,
// including SPINLOOP_BODY, so none leak in to files that include it.
// Undefining a macro that was never defined (the non-ARM path) is harmless.
#undef SPINLOOP_BODY
#undef SPINLOOP_8BIT
#undef SPINLOOP_16BIT
#undef SPINLOOP_32BIT
#undef SPINLOOP_64BIT
}
85 changes: 85 additions & 0 deletions FEXCore/unittests/APITests/FutexSpinTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include "Utils/FutexSpinWait.h"
#include <catch2/catch.hpp>
#include <chrono>
#include <thread>

// Timeout passed to the timed Wait() calls in the FutexSpin-Timed-* test cases below.
constexpr auto SleepAmount = std::chrono::milliseconds(250);

TEST_CASE("FutexSpin-Timed-8bit") {
  // The futex stays at zero, so waiting for 1 can only end through the timeout.
  uint8_t Futex{};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The timed wait must not return before the requested amount has passed; running late is fine.
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
}

TEST_CASE("FutexSpin-Sleep-8bit") {
  constexpr auto SleepAmount = std::chrono::seconds(1);

  uint8_t Futex{};
  std::atomic<uint8_t> WaiterStarted{};
  std::chrono::nanoseconds WaitedFor;

  // The waiter thread records how long the untimed Wait() kept it blocked.
  std::thread Waiter([&Futex, &WaitedFor, &WaiterStarted]() {
    const auto Start = std::chrono::high_resolution_clock::now();
    WaiterStarted.store(1);
    FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1);
    WaitedFor = std::chrono::high_resolution_clock::now() - Start;
  });

  // Hold here until the waiter thread signals that it has started.
  while (WaiterStarted.load() == 0) {
  }

  // Keep the futex untouched for the full sleep amount.
  std::this_thread::sleep_for(SleepAmount);

  // Taking the lock sets the futex to 1, which releases the waiter.
  FEXCore::Utils::FutexSpinWait::lock(&Futex);

  // Let the waiter finish before reading its measurement.
  Waiter.join();

  // The waiter must have been blocked for at least the sleep amount; running late is fine.
  REQUIRE(WaitedFor >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
}

TEST_CASE("FutexSpin-Timed-16bit") {
  // The futex stays at zero, so waiting for 1 can only end through the timeout.
  uint16_t Futex{};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The timed wait must not return before the requested amount has passed; running late is fine.
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
}

TEST_CASE("FutexSpin-Timed-32bit") {
  // The futex stays at zero, so waiting for 1 can only end through the timeout.
  uint32_t Futex{};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The timed wait must not return before the requested amount has passed; running late is fine.
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
}

TEST_CASE("FutexSpin-Timed-64bit") {
  // The futex stays at zero, so waiting for 1 can only end through the timeout.
  uint64_t Futex{};

  const auto Start = std::chrono::high_resolution_clock::now();
  FEXCore::Utils::FutexSpinWait::Wait(&Futex, 1, SleepAmount);
  const auto Elapsed = std::chrono::high_resolution_clock::now() - Start;

  // The timed wait must not return before the requested amount has passed; running late is fine.
  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(Elapsed) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
}

0 comments on commit 567666c

Please sign in to comment.