diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt
index 36b820dc25..4279591fcf 100644
--- a/FEXCore/Source/CMakeLists.txt
+++ b/FEXCore/Source/CMakeLists.txt
@@ -7,6 +7,7 @@ set (FEXCORE_BASE_SRCS
   Utils/FileLoading.cpp
   Utils/ForcedAssert.cpp
   Utils/LogManager.cpp
+  Utils/SpinWaitLock.cpp
   )
 
 if (NOT MINGW_BUILD)
diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
index e13cbc28b1..f04f46d5d7 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
@@ -858,6 +858,7 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry,
   // TODO: This needs to be a data RIP relocation once code caching works.
   // Current relocation code doesn't support this feature yet.
   JITBlockTail->RIP = Entry;
+  JITBlockTail->SpinLockFutex = 0;
 
   {
     // Store the RIP entries.
diff --git a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp
index 5de71fa0bc..dc32a33705 100644
--- a/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp
+++ b/FEXCore/Source/Utils/ArchHelpers/Arm64.cpp
@@ -1,4 +1,8 @@
 // SPDX-License-Identifier: MIT
+
+#include "Utils/SpinWaitLock.h"
+
+#include <FEXCore/Core/CPUBackend.h>
 #include
 #include
 #include
@@ -2044,9 +2048,11 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
   uint32_t Size = (Instr & 0xC000'0000) >> 30;
   uint32_t AddrReg = (Instr >> 5) & 0x1F;
   uint32_t DataReg = Instr & 0x1F;
-  if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR*
-      (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR*
-    if (ParanoidTSO) {
+
+  // ParanoidTSO path doesn't modify any code.
+  if (ParanoidTSO) [[unlikely]] {
+    if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR*
+        (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR*
       if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, 0)) {
         // Skip this instruction now
         return std::make_pair(true, 4);
@@ -2056,20 +2062,7 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
         return NotHandled;
       }
     }
-    else {
-      uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000;
-      LDR |= Size << 30;
-      LDR |= AddrReg << 5;
-      LDR |= DataReg;
-      PC[0] = LDR;
-      PC[1] = DMB_LD; // Back-patch the half-barrier.
-      ClearICache(&PC[-1], 16);
-      // With the instruction modified, now execute again.
-      return std::make_pair(true, 0);
-    }
-  }
-  else if ((Instr & LDAXR_MASK) == STLR_INST) { // STLR*
-    if (ParanoidTSO) {
+    else if ((Instr & LDAXR_MASK) == STLR_INST) { // STLR*
       if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, 0)) {
         // Skip this instruction now
         return std::make_pair(true, 4);
@@ -2079,22 +2072,9 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
         return NotHandled;
       }
     }
-    else {
-      uint32_t STR = 0b0011'1000'0011'1111'0110'1000'0000'0000;
-      STR |= Size << 30;
-      STR |= AddrReg << 5;
-      STR |= DataReg;
-      PC[-1] = DMB; // Back-patch the half-barrier.
-      PC[0] = STR;
-      ClearICache(&PC[-1], 16);
-      // Back up one instruction and have another go
-      return std::make_pair(true, -4);
-    }
-  }
-  else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR*
-    // Extract the 9-bit offset from the instruction
-    int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
-    if (ParanoidTSO) {
+    else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR*
+      // Extract the 9-bit offset from the instruction
+      int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
       if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, Offset)) {
         // Skip this instruction now
         return std::make_pair(true, 4);
@@ -2104,23 +2084,9 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
         return NotHandled;
       }
     }
-    else {
-      uint32_t LDUR = 0b0011'1000'0100'0000'0000'0000'0000'0000;
-      LDUR |= Size << 30;
-      LDUR |= AddrReg << 5;
-      LDUR |= DataReg;
-      LDUR |= Instr & (0b1'1111'1111 << 12);
-      PC[0] = LDUR;
-      PC[1] = DMB_LD; // Back-patch the half-barrier.
-      ClearICache(&PC[-1], 16);
-      // With the instruction modified, now execute again.
-      return std::make_pair(true, 0);
-    }
-  }
-  else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR*
-    // Extract the 9-bit offset from the instruction
-    int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
-    if (ParanoidTSO) {
+    else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR*
+      // Extract the 9-bit offset from the instruction
+      int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
       if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, Offset)) {
         // Skip this instruction now
         return std::make_pair(true, 4);
@@ -2130,18 +2096,64 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
         return NotHandled;
       }
     }
-    else {
-      uint32_t STUR = 0b0011'1000'0000'0000'0000'0000'0000'0000;
-      STUR |= Size << 30;
-      STUR |= AddrReg << 5;
-      STUR |= DataReg;
-      STUR |= Instr & (0b1'1111'1111 << 12);
-      PC[-1] = DMB; // Back-patch the half-barrier.
-      PC[0] = STUR;
-      ClearICache(&PC[-1], 16);
-      // Back up one instruction and have another go
-      return std::make_pair(true, -4);
-    }
+  }
+
+  const auto Frame = Thread->CurrentFrame;
+  const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader;
+  auto InlineHeader = reinterpret_cast<const CPU::CPUBackend::JITCodeHeader *>(BlockBegin);
+  auto InlineTail = reinterpret_cast<CPU::CPUBackend::JITCodeTail *>(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail);
+
+  // Lock code mutex during any SIGBUS handling that potentially changes code.
+  // Need to be careful to not read any code part-way through modification.
+  FEXCore::Utils::SpinWaitLock::UniqueSpinMutex lk(&InlineTail->SpinLockFutex);
+
+  if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR*
+      (Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR*
+    uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000;
+    LDR |= Size << 30;
+    LDR |= AddrReg << 5;
+    LDR |= DataReg;
+    PC[0] = LDR;
+    PC[1] = DMB_LD; // Back-patch the half-barrier.
+    ClearICache(&PC[-1], 16);
+    // With the instruction modified, now execute again.
+    return std::make_pair(true, 0);
+  }
+  else if ((Instr & LDAXR_MASK) == STLR_INST) { // STLR*
+    uint32_t STR = 0b0011'1000'0011'1111'0110'1000'0000'0000;
+    STR |= Size << 30;
+    STR |= AddrReg << 5;
+    STR |= DataReg;
+    PC[-1] = DMB; // Back-patch the half-barrier.
+    PC[0] = STR;
+    ClearICache(&PC[-1], 16);
+    // Back up one instruction and have another go
+    return std::make_pair(true, -4);
+  }
+  else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR*
+    // Copy the 9-bit offset field straight across from the instruction
+    uint32_t LDUR = 0b0011'1000'0100'0000'0000'0000'0000'0000;
+    LDUR |= Size << 30;
+    LDUR |= AddrReg << 5;
+    LDUR |= DataReg;
+    LDUR |= Instr & (0b1'1111'1111 << 12);
+    PC[0] = LDUR;
+    PC[1] = DMB_LD; // Back-patch the half-barrier.
+    ClearICache(&PC[-1], 16);
+    // With the instruction modified, now execute again.
+    return std::make_pair(true, 0);
+  }
+  else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR*
+    // Copy the 9-bit offset field straight across from the instruction
+    uint32_t STUR = 0b0011'1000'0000'0000'0000'0000'0000'0000;
+    STUR |= Size << 30;
+    STUR |= AddrReg << 5;
+    STUR |= DataReg;
+    STUR |= Instr & (0b1'1111'1111 << 12);
+    PC[-1] = DMB; // Back-patch the half-barrier.
+    PC[0] = STUR;
+    ClearICache(&PC[-1], 16);
+    // Back up one instruction and have another go
+    return std::make_pair(true, -4);
   }
   else if ((Instr & ArchHelpers::Arm64::LDAXP_MASK) == ArchHelpers::Arm64::LDAXP_INST) { // LDAXP
     //Should be compare and swap pair only. LDAXP not used elsewhere
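For reference, the `static_cast<int32_t>(Instr) << 11 >> 23` dance above recovers the signed 9-bit immediate that LDAPUR/STLUR encode in bits [20:12]: the left shift by 11 parks the immediate's sign bit in bit 31, and the arithmetic right shift by 23 drags the field back down with sign extension. A minimal standalone sketch of the round-trip (the `EncodeImm9` helper is hypothetical, written only for this illustration):

    #include <cassert>
    #include <cstdint>

    // Recover the signed imm9 field from bits [20:12], as the handler does.
    static int32_t ExtractImm9(uint32_t Instr) {
      return static_cast<int32_t>(Instr) << 11 >> 23;
    }

    // Hypothetical helper: place a signed 9-bit offset into bits [20:12].
    static uint32_t EncodeImm9(int32_t Offset) {
      return (static_cast<uint32_t>(Offset) & 0x1FF) << 12;
    }

    int main() {
      assert(ExtractImm9(EncodeImm9(255)) == 255);   // Most positive imm9
      assert(ExtractImm9(EncodeImm9(-256)) == -256); // Most negative imm9
      assert(ExtractImm9(EncodeImm9(-16)) == -16);   // Sign extension engaged
      return 0;
    }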
diff --git a/FEXCore/Source/Utils/SpinWaitLock.cpp b/FEXCore/Source/Utils/SpinWaitLock.cpp
new file mode 100644
index 0000000000..b2f011adec
--- /dev/null
+++ b/FEXCore/Source/Utils/SpinWaitLock.cpp
@@ -0,0 +1,27 @@
+#include "Utils/SpinWaitLock.h"
+
+namespace FEXCore::Utils::SpinWaitLock {
+#ifdef _M_ARM_64
+  constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;
+
+  static uint32_t GetCycleCounterFrequency() {
+    uint64_t Result{};
+    __asm("mrs %[Res], CNTFRQ_EL0"
+      : [Res] "=r" (Result));
+    return Result;
+  }
+
+  static uint64_t CalculateCyclesPerNanosecond() {
+    // Snapdragon devices historically use a 19.2MHz cycle counter frequency.
+    // This means the number of nanoseconds per cycle ends up being 52.0833...
+    //
+    // ARMv8.6 and ARMv9.1 require the cycle counter frequency to be 1GHz.
+    // This means the number of nanoseconds per cycle ends up being 1.
+    uint64_t CounterFrequency = GetCycleCounterFrequency();
+    return NanosecondsInSecond / CounterFrequency;
+  }
+
+  uint32_t CycleCounterFrequency = GetCycleCounterFrequency();
+  uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond();
+#endif
+}
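Worth spelling out: the quotient `NanosecondsInSecond / CounterFrequency` is nanoseconds-per-cycle, despite the `CyclesPerNanosecond` name, and the integer division truncates 52.0833... down to 52. Timeouts converted through it therefore come out marginally long rather than short, which is the safe direction for a wait. A quick standalone check of the arithmetic (a sketch, not part of the change):

    #include <cstdint>
    #include <cstdio>

    int main() {
      constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;

      // Snapdragon-style 19.2MHz counter: 1e9 / 19.2e6 = 52.0833..., truncated to 52.
      std::printf("19.2MHz -> %llu ns per cycle\n",
                  static_cast<unsigned long long>(NanosecondsInSecond / 19'200'000ULL));

      // ARMv8.6 / ARMv9.1 mandate a 1GHz counter: exactly 1ns per cycle.
      std::printf("1GHz    -> %llu ns per cycle\n",
                  static_cast<unsigned long long>(NanosecondsInSecond / 1'000'000'000ULL));
      return 0;
    }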
diff --git a/FEXCore/Source/Utils/SpinWaitLock.h b/FEXCore/Source/Utils/SpinWaitLock.h
new file mode 100644
index 0000000000..d0a3510d08
--- /dev/null
+++ b/FEXCore/Source/Utils/SpinWaitLock.h
@@ -0,0 +1,301 @@
+#pragma once
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+
+namespace FEXCore::Utils::SpinWaitLock {
+  /**
+   * @brief This provides routines to implement an "efficient spin-loop" using ARM's WFE and exclusive monitor interfaces.
+   *
+   * Spin-loops on mobile devices with a battery can be a bad idea as they burn a bunch of power. This attempts to mitigate some of the impact
+   * by putting the CPU into a lower-power state using WFE.
+   * On the platforms tested, WFE puts the CPU into a lower-power state for upwards of 52ns per WFE, which isn't a significant amount of time
+   * but should still have power savings. Ideally WFE would be able to keep the CPU in a lower-power state for longer. This also has the added
+   * benefit that the atomics aren't abusing the caches when spinning on a cacheline, which has knock-on power-saving benefits.
+   *
+   * FEAT_WFxT adds a new instruction with a timeout, but since the spurious wake-up is so aggressive it isn't worth using.
+   *
+   * It should be noted that this implementation has a few dozen cycles of start-up time, which means the overhead for invoking it is
+   * slightly higher than a true spin-loop. The hot loop body itself is only three instructions, so it is quite efficient.
+   *
+   * On non-ARM platforms it is a plain spin-loop, which is okay for debugging purposes only.
+   */
+#ifdef _M_ARM_64
+
+#define LOADEXCLUSIVE(LoadExclusiveOp, RegSize) \
+  /* Prime the exclusive monitor with the passed in address. */ \
+  #LoadExclusiveOp " %" #RegSize "[Result], [%[Futex]];"
+
+#define SPINLOOP_BODY(LoadAtomicOp, RegSize) \
+  /* WFE will wait for either the memory to change or spurious wake-up. */ \
+  "wfe;" \
+  /* Load with acquire to get the result of memory. */ \
+  #LoadAtomicOp " %" #RegSize "[Result], [%[Futex]]; "
+
+#define SPINLOOP_WFE_LDX_8BIT  LOADEXCLUSIVE(ldaxrb, w)
+#define SPINLOOP_WFE_LDX_16BIT LOADEXCLUSIVE(ldaxrh, w)
+#define SPINLOOP_WFE_LDX_32BIT LOADEXCLUSIVE(ldaxr, w)
+#define SPINLOOP_WFE_LDX_64BIT LOADEXCLUSIVE(ldaxr, x)
+
+#define SPINLOOP_8BIT  SPINLOOP_BODY(ldarb, w)
+#define SPINLOOP_16BIT SPINLOOP_BODY(ldarh, w)
+#define SPINLOOP_32BIT SPINLOOP_BODY(ldar, w)
+#define SPINLOOP_64BIT SPINLOOP_BODY(ldar, x)
+
+  extern uint32_t CycleCounterFrequency;
+  extern uint64_t CyclesPerNanosecond;
+
+  ///< Get the raw cycle counter, behind an ISB so the read isn't reordered.
+  /// `CNTVCTSS_EL0` (the self-synchronizing form) does the same thing without the ISB, but requires the FEAT_ECV feature.
+  static inline uint64_t GetCycleCounter() {
+    uint64_t Result{};
+    __asm volatile(R"(
+      isb;
+      mrs %[Res], CNTVCT_EL0;
+    )"
+    : [Res] "=r" (Result));
+    return Result;
+  }
+
+  ///< Converts nanoseconds to a number of cycle-counter ticks.
+  /// Note `CyclesPerNanosecond` holds nanoseconds-per-cycle, hence the division.
+  /// If the cycle counter runs at 1GHz then this is a direct 1:1 map.
+  static inline uint64_t ConvertNanosecondsToCycles(std::chrono::nanoseconds const &Nanoseconds) {
+    const auto NanosecondCount = Nanoseconds.count();
+    return NanosecondCount / CyclesPerNanosecond;
+  }
+
+  static inline uint8_t LoadExclusive(uint8_t *Futex) {
+    uint8_t Result{};
+    __asm volatile(SPINLOOP_WFE_LDX_8BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint16_t LoadExclusive(uint16_t *Futex) {
+    uint16_t Result{};
+    __asm volatile(SPINLOOP_WFE_LDX_16BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint32_t LoadExclusive(uint32_t *Futex) {
+    uint32_t Result{};
+    __asm volatile(SPINLOOP_WFE_LDX_32BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint64_t LoadExclusive(uint64_t *Futex) {
+    uint64_t Result{};
+    __asm volatile(SPINLOOP_WFE_LDX_64BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint8_t WFELoadAtomic(uint8_t *Futex) {
+    uint8_t Result{};
+    __asm volatile(SPINLOOP_8BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint16_t WFELoadAtomic(uint16_t *Futex) {
+    uint16_t Result{};
+    __asm volatile(SPINLOOP_16BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint32_t WFELoadAtomic(uint32_t *Futex) {
+    uint32_t Result{};
+    __asm volatile(SPINLOOP_32BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  static inline uint64_t WFELoadAtomic(uint64_t *Futex) {
+    uint64_t Result{};
+    __asm volatile(SPINLOOP_64BIT
+      : [Result] "=r" (Result)
+      , [Futex] "+r" (Futex)
+      :: "memory");
+
+    return Result;
+  }
+
+  template<typename T, typename TT>
+  static inline void Wait(T *Futex, TT ExpectedValue) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Result = AtomicFutex->load();
+
+    // Early exit if possible.
+    if (Result == ExpectedValue) return;
+
+    do {
+      Result = LoadExclusive(Futex);
+      if (Result == ExpectedValue) return;
+      Result = WFELoadAtomic(Futex);
+    } while (Result != ExpectedValue);
+  }
+
+  template void Wait(uint8_t*, uint8_t);
+  template void Wait(uint16_t*, uint16_t);
+  template void Wait(uint32_t*, uint32_t);
+  template void Wait(uint64_t*, uint64_t);
+
+  template<typename T, typename TT>
+  static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+    T Result = AtomicFutex->load();
+
+    // Early exit if possible.
+    if (Result == ExpectedValue) return true;
+
+    const auto TimeoutCycles = ConvertNanosecondsToCycles(Timeout);
+    const auto Begin = GetCycleCounter();
+
+    do {
+      Result = LoadExclusive(Futex);
+      if (Result == ExpectedValue) return true;
+      Result = WFELoadAtomic(Futex);
+
+      const auto CurrentCycleCounter = GetCycleCounter();
+      if ((CurrentCycleCounter - Begin) >= TimeoutCycles) {
+        // Couldn't get value before timeout.
+        return false;
+      }
+    } while (Result != ExpectedValue);
+
+    // We got our result.
+    return true;
+  }
+
+  template bool Wait(uint8_t*, uint8_t, std::chrono::nanoseconds const &);
+  template bool Wait(uint16_t*, uint16_t, std::chrono::nanoseconds const &);
+  template bool Wait(uint32_t*, uint32_t, std::chrono::nanoseconds const &);
+  template bool Wait(uint64_t*, uint64_t, std::chrono::nanoseconds const &);
+
+#else
+  template<typename T, typename TT>
+  static inline void Wait(T *Futex, TT ExpectedValue) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Result = AtomicFutex->load();
+
+    // Early exit if possible.
+    if (Result == ExpectedValue) return;
+
+    do {
+      Result = AtomicFutex->load();
+    } while (Result != ExpectedValue);
+  }
+
+  template<typename T, typename TT>
+  static inline bool Wait(T *Futex, TT ExpectedValue, std::chrono::nanoseconds const &Timeout) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+
+    T Result = AtomicFutex->load();
+
+    // Early exit if possible.
+    if (Result == ExpectedValue) return true;
+
+    const auto Begin = std::chrono::high_resolution_clock::now();
+
+    do {
+      Result = AtomicFutex->load();
+
+      const auto CurrentCycleCounter = std::chrono::high_resolution_clock::now();
+      if ((CurrentCycleCounter - Begin) >= Timeout) {
+        // Couldn't get value before timeout.
+        return false;
+      }
+    } while (Result != ExpectedValue);
+
+    // We got our result.
+    return true;
+  }
+#endif
+
+  template<typename T>
+  static inline void lock(T *Futex) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Expected{};
+    T Desired {1};
+
+    // Try to CAS immediately.
+    if (AtomicFutex->compare_exchange_strong(Expected, Desired)) return;
+
+    do {
+      // Wait until the futex is unlocked.
+      Wait(Futex, 0);
+      // compare_exchange_strong updates Expected on failure; reset it so a
+      // stale value can't spuriously "acquire" a lock another thread holds.
+      Expected = 0;
+    } while (!AtomicFutex->compare_exchange_strong(Expected, Desired));
+  }
+
+  template<typename T>
+  static inline bool try_lock(T *Futex) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    T Expected{};
+    T Desired {1};
+
+    // Try to CAS immediately.
+    return AtomicFutex->compare_exchange_strong(Expected, Desired);
+  }
+
+  template<typename T>
+  static inline void unlock(T *Futex) {
+    std::atomic<T> *AtomicFutex = reinterpret_cast<std::atomic<T>*>(Futex);
+    AtomicFutex->store(0);
+  }
+
+#undef LOADEXCLUSIVE
+#undef SPINLOOP_BODY
+#undef SPINLOOP_WFE_LDX_8BIT
+#undef SPINLOOP_WFE_LDX_16BIT
+#undef SPINLOOP_WFE_LDX_32BIT
+#undef SPINLOOP_WFE_LDX_64BIT
+#undef SPINLOOP_8BIT
+#undef SPINLOOP_16BIT
+#undef SPINLOOP_32BIT
+#undef SPINLOOP_64BIT
+
+  template<typename T>
+  class UniqueSpinMutex final {
+    public:
+      UniqueSpinMutex(T *Futex)
+        : Futex {Futex} {
+        FEXCore::Utils::SpinWaitLock::lock(Futex);
+      }
+
+      ~UniqueSpinMutex() {
+        FEXCore::Utils::SpinWaitLock::unlock(Futex);
+      }
+    private:
+      T *Futex;
+  };
+}
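A usage sketch for the interfaces above (a hypothetical call site, not part of this change): a 32-bit futex word embedded in shared state, guarded RAII-style with `UniqueSpinMutex`, plus a bounded wait on a separate word.

    #include "Utils/SpinWaitLock.h"

    #include <chrono>
    #include <cstdint>

    struct SharedState {
      uint32_t Futex{};  // 0 = unlocked, 1 = locked.
      uint32_t Value{};
    };

    void Publish(SharedState *State) {
      // Locks in the constructor, unlocks in the destructor.
      FEXCore::Utils::SpinWaitLock::UniqueSpinMutex lk(&State->Futex);
      State->Value = 42;
    }

    bool WaitForPublish(SharedState *State) {
      // WFE-backed on ARM64; returns false if Value doesn't become 42 within 1ms.
      return FEXCore::Utils::SpinWaitLock::Wait(&State->Value, 42u,
                                                std::chrono::milliseconds(1));
    }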
diff --git a/FEXCore/include/FEXCore/Core/CPUBackend.h b/FEXCore/include/FEXCore/Core/CPUBackend.h
index 4f9cab7b0f..dd34561cf6 100644
--- a/FEXCore/include/FEXCore/Core/CPUBackend.h
+++ b/FEXCore/include/FEXCore/Core/CPUBackend.h
@@ -98,6 +98,11 @@ namespace CPU {
 
       // Offset after this block to the start of the RIP entries.
       uint32_t OffsetToRIPEntries;
+
+      // Shared-code modification spin-loop futex.
+      uint32_t SpinLockFutex;
+
+      uint32_t _Pad;
     };
 
     // Entries that live after the JITCodeTail.
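The explicit `_Pad` keeps the tail's layout free of implicit padding, so the RIP entries that follow start at a predictable, 8-byte-aligned offset. A compile-time check in that spirit, against a sketch of the struct (only `OffsetToRIPEntries`, `SpinLockFutex`, and `_Pad` are confirmed by this hunk; the other fields are assumed for illustration):

    #include <cstdint>

    // Sketch of the tail layout; fields besides the three in the hunk are assumptions.
    struct JITCodeTailSketch {
      uint64_t RIP;                 // Assumed leading 64-bit field.
      uint32_t NumberOfRIPEntries;  // Assumed.
      uint32_t OffsetToRIPEntries;
      uint32_t SpinLockFutex;
      uint32_t _Pad;
    };

    // No implicit padding: the members account for every byte...
    static_assert(sizeof(JITCodeTailSketch) == 8 + 4 * 4, "unexpected implicit padding");
    // ...and the entries that follow the tail stay 8-byte aligned.
    static_assert(sizeof(JITCodeTailSketch) % alignof(uint64_t) == 0, "RIP entries would be misaligned");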
diff --git a/FEXCore/unittests/APITests/FutexSpinTest.cpp b/FEXCore/unittests/APITests/FutexSpinTest.cpp
new file mode 100644
index 0000000000..9871b318cc
--- /dev/null
+++ b/FEXCore/unittests/APITests/FutexSpinTest.cpp
@@ -0,0 +1,85 @@
+#include "Utils/SpinWaitLock.h"
+#include <catch2/catch_all.hpp>
+#include <atomic>
+#include <chrono>
+#include <thread>
+
+constexpr auto SleepAmount = std::chrono::milliseconds(250);
+
+TEST_CASE("FutexSpin-Timed-8bit") {
+  uint8_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Sleep-8bit") {
+  constexpr auto SleepAmount = std::chrono::seconds(1);
+
+  uint8_t Test{};
+  std::atomic<uint8_t> ActualSpinLoop{};
+  std::chrono::nanoseconds SleptAmount;
+
+  std::thread t([&Test, &SleptAmount, &ActualSpinLoop]() {
+    auto now = std::chrono::high_resolution_clock::now();
+    ActualSpinLoop.store(1);
+    FEXCore::Utils::SpinWaitLock::Wait(&Test, 1);
+    auto end = std::chrono::high_resolution_clock::now();
+    SleptAmount = end - now;
+  });
+
+  // Wait until the second thread lets us know it is about to start waiting.
+  while (ActualSpinLoop.load() == 0);
+
+  // Sleep this thread for the sleep amount.
+  std::this_thread::sleep_for(SleepAmount);
+
+  // Lock the futex; this sets it to 1, the value the waiter expects.
+  FEXCore::Utils::SpinWaitLock::lock(&Test);
+
+  // Wait for the thread to get done.
+  t.join();
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(SleptAmount >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-16bit") {
+  uint16_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-32bit") {
+  uint32_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}
+
+TEST_CASE("FutexSpin-Timed-64bit") {
+  uint64_t Test{};
+
+  auto now = std::chrono::high_resolution_clock::now();
+  FEXCore::Utils::SpinWaitLock::Wait(&Test, 1, SleepAmount);
+  auto end = std::chrono::high_resolution_clock::now();
+  auto diff = end - now;
+
+  // The futex spinwait needs to have slept for at /least/ the amount specified. It will always run slightly late.
+  REQUIRE(std::chrono::duration_cast<std::chrono::nanoseconds>(diff) >= std::chrono::duration_cast<std::chrono::nanoseconds>(SleepAmount));
+}