FEXCore: Implements an efficient spin-loop API

This will only be used internally inside of FEXCore for efficient shared code cache backpatch spin-loops.
FEX-Emu · Dec 17, 2023 · e51a16a · e51a16a
1 parent 12923ba
commit e51a16a
Show file tree

Hide file tree

Showing 4 changed files with 408 additions and 0 deletions.
diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt
@@ -5,6 +5,7 @@ set (FEXCORE_BASE_SRCS
   Utils/Allocator.cpp
   Utils/CPUInfo.cpp
   Utils/FileLoading.cpp
+  Utils/FutexSpinWait.cpp
   Utils/ForcedAssert.cpp
   Utils/LogManager.cpp
   )

diff --git a/FEXCore/Source/Utils/FutexSpinWait.cpp b/FEXCore/Source/Utils/FutexSpinWait.cpp
@@ -0,0 +1,41 @@
+#include "Utils/FutexSpinWait.h"
+
+#include <mutex>
+
+namespace FEXCore::Utils {
+#ifdef _M_ARM_64
+  // Definitions of FutexSpinWait's static data members (declared in Utils/FutexSpinWait.h).
+  // Init() fills these in while holding StaticInitMutex.
+  std::atomic<bool> FEXCore::Utils::FutexSpinWait::StaticDataInitialized;
+  std::mutex FEXCore::Utils::FutexSpinWait::StaticInitMutex;
+  uint32_t FEXCore::Utils::FutexSpinWait::CycleCounterFrequency;
+  uint64_t FEXCore::Utils::FutexSpinWait::CyclesPerNanosecond;
+
+  // Number of nanoseconds in one second; the numerator used by CalculateCyclesPerNanosecond.
+  constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;
+
+  // Reads the CNTFRQ_EL0 system register and returns its value.
+  // The register is read into a 64-bit temporary; only the low 32 bits are returned.
+  static uint32_t GetCycleCounterFrequency() {
+    uint64_t RawFrequency{};
+    __asm("mrs %[Res], CNTFRQ_EL0"
+        : [Res] "=r" (RawFrequency));
+    return static_cast<uint32_t>(RawFrequency);
+  }
+
+  /**
+   * @brief Computes the integer scale factor between nanoseconds and cycle-counter ticks.
+   *
+   * Despite the name, the returned value is the whole number of nanoseconds that elapse per
+   * counter tick (NanosecondsInSecond / CounterFrequency). ConvertNanosecondsToCycles divides a
+   * nanosecond count by this value to obtain a tick count.
+   *
+   * - Snapdragon devices historically use a 19.2MHz counter: 52.0833...ns per tick, truncated to 52.
+   * - ARMv8.6 and ARMv9.1 require a 1GHz counter: exactly 1ns per tick.
+   *
+   * @param CounterFrequency Counter frequency in Hz.
+   * @return Nanoseconds per tick, never less than 1. A frequency of zero or above 1GHz yields 1 so
+   *         that the result is always safe to divide by.
+   */
+  static uint64_t CalculateCyclesPerNanosecond(uint64_t CounterFrequency) {
+    // A zero frequency would divide by zero here; a frequency above 1GHz would truncate to zero
+    // and make every later division by the result fault instead.
+    if (CounterFrequency == 0 || CounterFrequency > NanosecondsInSecond) {
+      return 1;
+    }
+
+    return NanosecondsInSecond / CounterFrequency;
+  }
+
+  // Populates the cycle-counter statics on the first call; later calls return without touching them.
+  // The whole body runs under StaticInitMutex, and the initialized flag is published last.
+  void FEXCore::Utils::FutexSpinWait::Init() {
+    std::lock_guard Guard {StaticInitMutex};
+
+    if (!StaticDataInitialized.load()) {
+      const uint32_t Frequency = GetCycleCounterFrequency();
+      CycleCounterFrequency = Frequency;
+      CyclesPerNanosecond = CalculateCyclesPerNanosecond(Frequency);
+      StaticDataInitialized.store(true);
+    }
+  }
+#endif
+}
diff --git a/FEXCore/Source/Utils/FutexSpinWait.h b/FEXCore/Source/Utils/FutexSpinWait.h
@@ -0,0 +1,281 @@
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+
+namespace FEXCore::Utils {
+  // Inline-assembly templates for one wait iteration, one per access width (8/16/32/64-bit).
+  // Each expands to a single string literal holding three instructions: an exclusive
+  // load-acquire into the `Tmp` operand, a `wfe`, then a load-acquire into the `Result` operand.
+  // The asm statement that uses a template must supply operands named `Tmp`, `Result` and `Futex`.
+  // The /* ... */ text is part of the string literal that is handed to the assembler.
+#define SPINLOOP_8BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxrb %w[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldarb %w[Result], [%[Futex]]; \
+"
+#define SPINLOOP_16BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxrh %w[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldarh %w[Result], [%[Futex]]; \
+"
+#define SPINLOOP_32BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxr %w[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldar %w[Result], [%[Futex]]; \
+"
+#define SPINLOOP_64BIT " \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  ldaxr %x[Tmp], [%[Futex]]; \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  wfe; \
+  /* Load with acquire to get the result of memory. */ \
+  ldar %x[Result], [%[Futex]]; \
+"
+
+  /**
+   * @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
+   *
+   * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
+   * by putting the CPU in to a lower-power state using WFE.
+   * On platforms tested, WFE will put the CPU in to a lower power state for upwards of 52ns per WFE. Which isn't a significant amount of time
+   * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower power state for longer. This also has the added benefit
+   * that atomics aren't abusing the caches when spinning on a cacheline, which has knock-on powersaving benefits.
+   *
+   * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
+   *
+   * It should be noted that this implementation has a few dozen cycles of start-up time. Which means the overhead for invoking this implementation is
+   * slightly higher than a true spin-loop. The hot loop body itself is only three instructions so it is quite efficient.
+   *
+   * On non-ARM platforms it is truly a spin-loop. Which is okay for debugging only.
+   *
+   * Every function reinterprets the passed pointer as a `std::atomic<T>`. On ARM64 `T` must be 1, 2, 4 or 8 bytes wide.
+   * `lock`/`try_lock`/`unlock` treat the value 0 as "unlocked" and 1 as "locked".
+   */
+  class FutexSpinWait final {
+    public:
+#ifdef _M_ARM_64
+      /**
+       * @brief Waits until the value at `Futex` compares equal to `ExpectedValue`.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue Value to wait for.
+       */
+      template<typename T, typename TT>
+      static void Wait(T *Futex, TT ExpectedValue) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        // Early exit if possible.
+        if (AtomicFutex->load() == ExpectedValue) return;
+
+        EnsureInitialized();
+
+        while (WaitForEventAndLoad(Futex) != ExpectedValue) {
+          // Woken up (possibly spuriously) without seeing the expected value; wait again.
+        }
+      }
+
+      /**
+       * @brief Waits until the value at `Futex` compares equal to `ExpectedValue`, or until `Timeout` has elapsed.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue Value to wait for.
+       * @param Timeout Maximum time to wait, measured with the cycle counter. Non-positive timeouts
+       *        still perform one wait iteration before giving up.
+       * @return true if the expected value was observed, false if the timeout expired first.
+       */
+      template<typename T, typename TT>
+      static bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        // Early exit if possible.
+        if (AtomicFutex->load() == ExpectedValue) return true;
+
+        EnsureInitialized();
+
+        const uint64_t TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
+        const uint64_t Begin = GetCycleCounter();
+
+        do {
+          // Check the value before the deadline: a value observed right at the deadline is
+          // still a success and must not be reported as a timeout.
+          if (WaitForEventAndLoad(Futex) == ExpectedValue) {
+            // We got our result.
+            return true;
+          }
+        } while ((GetCycleCounter() - Begin) < TimeoutCycles);
+
+        // Couldn't get value before timeout.
+        return false;
+      }
+#else
+      /**
+       * @brief Spins until the value at `Futex` compares equal to `ExpectedValue`.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue Value to wait for.
+       */
+      template<typename T, typename TT>
+      static void Wait(T *Futex, TT ExpectedValue) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        while (AtomicFutex->load() != ExpectedValue) {
+          // Plain spin; this path has no low-power wait.
+        }
+      }
+
+      /**
+       * @brief Spins until the value at `Futex` compares equal to `ExpectedValue`, or until `Timeout` has elapsed.
+       *
+       * @param Futex Address of the value to watch.
+       * @param ExpectedValue Value to wait for.
+       * @param Timeout Maximum time to wait, measured with std::chrono::high_resolution_clock.
+       * @return true if the expected value was observed, false if the timeout expired first.
+       */
+      template<typename T, typename TT>
+      static bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+        // Early exit if possible.
+        if (AtomicFutex->load() == ExpectedValue) return true;
+
+        const auto Begin = std::chrono::high_resolution_clock::now();
+
+        do {
+          // Check the value before the deadline: a value observed right at the deadline is
+          // still a success and must not be reported as a timeout.
+          if (AtomicFutex->load() == ExpectedValue) {
+            // We got our result.
+            return true;
+          }
+        } while ((std::chrono::high_resolution_clock::now() - Begin) < Timeout);
+
+        // Couldn't get value before timeout.
+        return false;
+      }
+#endif
+
+      /**
+       * @brief Acquires the lock stored at `Futex`, waiting for it to become 0 whenever it is held.
+       *
+       * @param Futex Address of the lock word (0 = unlocked, 1 = locked).
+       */
+      template<typename T>
+      static void lock(T *Futex) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+        T Expected{};
+        const T Desired {1};
+
+        // Try to CAS immediately, and again after every wait.
+        while (!AtomicFutex->compare_exchange_strong(Expected, Desired)) {
+          // Wait until the futex is unlocked.
+          Wait(Futex, 0);
+
+          // A failed compare-exchange overwrites Expected with the value it observed (the locked
+          // value). Reset it, otherwise the retry could "succeed" by swapping locked for locked
+          // while another thread still owns the lock.
+          Expected = T{};
+        }
+      }
+
+      /**
+       * @brief Makes a single attempt to acquire the lock stored at `Futex`.
+       *
+       * @param Futex Address of the lock word (0 = unlocked, 1 = locked).
+       * @return true if the lock word was changed from 0 to 1 by this call, false otherwise.
+       */
+      template<typename T>
+      static bool try_lock(T *Futex) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+        T Expected{};
+        const T Desired {1};
+
+        return AtomicFutex->compare_exchange_strong(Expected, Desired);
+      }
+
+      /**
+       * @brief Releases the lock stored at `Futex` by storing 0 to it.
+       *
+       * @param Futex Address of the lock word.
+       */
+      template<typename T>
+      static void unlock(T *Futex) {
+        std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+        AtomicFutex->store(0);
+      }
+
+    private:
+#ifdef _M_ARM_64
+      // Static initialization
+      static std::atomic<bool> StaticDataInitialized;
+      static std::mutex StaticInitMutex;
+
+      static uint32_t CycleCounterFrequency;
+      static uint64_t CyclesPerNanosecond;
+
+      static void Init();
+
+      /// Runs Init() if the static data has not been published yet.
+      /// The acquire load orders the later reads of the cycle-counter statics after the writes
+      /// that Init() performed before it set the flag.
+      static void EnsureInitialized() {
+        if (!StaticDataInitialized.load(std::memory_order_acquire)) [[unlikely]] Init();
+      }
+
+      /// Executes one wait iteration on `Futex` and returns the value loaded after the wake-up.
+      /// Selects the SPINLOOP_* template that matches the width of T.
+      template<typename T>
+      static T WaitForEventAndLoad(T *Futex) {
+        static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8, "Invalid");
+
+        T Tmp{};
+        T Result{};
+
+        // `Tmp` is written by the first instruction while `Futex` is still read by the last one,
+        // so it is marked early-clobber ("=&r") to keep it out of the register holding `Futex`.
+        if constexpr (sizeof(T) == 1) {
+          __asm volatile(SPINLOOP_8BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else if constexpr (sizeof(T) == 2) {
+          __asm volatile(SPINLOOP_16BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else if constexpr (sizeof(T) == 4) {
+          __asm volatile(SPINLOOP_32BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+        else {
+          __asm volatile(SPINLOOP_64BIT
+            : [Result] "=r" (Result)
+            , [Tmp] "=&r" (Tmp)
+            : [Futex] "r" (Futex)
+            : "memory");
+        }
+
+        return Result;
+      }
+
+      /// Get the raw cycle counter which is synchronizing.
+      /// `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
+      static uint64_t GetCycleCounter() {
+        uint64_t Result{};
+        __asm volatile(R"(
+          isb;
+          mrs %[Res], CNTVCT_EL0;
+        )"
+        : [Res] "=r" (Result));
+        return Result;
+      }
+
+      /// Converts nanoseconds to number of cycles.
+      /// If the cycle counter is 1Ghz then this is a direct 1:1 map.
+      /// Non-positive durations convert to zero cycles.
+      static uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
+        const auto NanosecondCount = Nanoseconds.count();
+
+        // count() is signed; without this a negative timeout would wrap to a huge unsigned value.
+        if (NanosecondCount <= 0) return 0;
+
+        // Never divide by zero, even if the stored scale factor ended up as zero.
+        const uint64_t Divisor = CyclesPerNanosecond != 0 ? CyclesPerNanosecond : 1;
+        return static_cast<uint64_t>(NanosecondCount) / Divisor;
+      }
+#endif
+  };
+
+  // The SPINLOOP_* templates are only needed by the class above; remove them so they do not
+  // leak into files that include this header.
+#undef SPINLOOP_8BIT
+#undef SPINLOOP_16BIT
+#undef SPINLOOP_32BIT
+#undef SPINLOOP_64BIT
+}