Skip to content

Commit

Permalink
Merge pull request #4291 from Sonicadvance1/profile_stats
Browse files Browse the repository at this point in the history
FEX: Implements new sampling based stats
  • Loading branch information
lioncash authored Feb 11, 2025
2 parents 4186b2a + 602c530 commit 6a39a8d
Show file tree
Hide file tree
Showing 20 changed files with 736 additions and 17 deletions.
8 changes: 8 additions & 0 deletions FEXCore/Source/Interface/Config/Config.json.in
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,14 @@
"Redirects the telemetry folder that FEX usually writes to.",
"By default telemetry data is stored in {$FEX_APP_DATA_LOCATION,{$XDG_DATA_HOME,$HOME}/.fex-emu/Telemetry/}"
]
},
"ProfileStats": {
"Type": "bool",
"Default": "false",
"Desc": [
"Enables FEX's low-overhead sampling profile statistics.",
"Requires a supported version of Mangohud to see the results"
]
}
},
"Hacks": {
Expand Down
3 changes: 2 additions & 1 deletion FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -773,8 +773,9 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
}

uintptr_t ContextImpl::CompileBlock(FEXCore::Core::CpuStateFrame* Frame, uint64_t GuestRIP, uint64_t MaxInst) {
FEXCORE_PROFILE_SCOPED("CompileBlock");
auto Thread = Frame->Thread;
FEXCORE_PROFILE_SCOPED("CompileBlock");
FEXCORE_PROFILE_ACCUMULATION(Thread, AccumulatedJITTime);

// Invalidate might take a unique lock on this, to guarantee that during invalidation no code gets compiled
auto lk = GuardSignalDeferringSection<std::shared_lock>(CodeInvalidationMutex, Thread);
Expand Down
7 changes: 7 additions & 0 deletions FEXCore/include/FEXCore/Debug/InternalThreadState.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ class OpDispatchBuilder;
class PassManager;
} // namespace FEXCore::IR

namespace FEXCore::Profiler {
struct ThreadStats;
};

namespace FEXCore::Core {

// Special-purpose replacement for std::unique_ptr to allow InternalThreadState to be standard layout.
Expand Down Expand Up @@ -95,6 +99,9 @@ struct InternalThreadState : public FEXCore::Allocator::FEXAllocOperators {

std::shared_mutex ObjectCacheRefCounter {};

// This pointer is owned by the frontend.
FEXCore::Profiler::ThreadStats* ThreadStats {};

///< Data pointer for exclusive use by the frontend
void* FrontendPtr;

Expand Down
97 changes: 97 additions & 0 deletions FEXCore/include/FEXCore/Utils/Profiler.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,73 @@
// SPDX-License-Identifier: MIT
#pragma once
#include <atomic>
#include <cstdint>
#include <string_view>

#ifdef _M_X86_64
#include <x86intrin.h>
#endif

#include <FEXCore/Utils/CompilerDefs.h>

namespace FEXCore::Profiler {
// FEXCore live-stats
// Version of the shared-memory stats layout below. Readers (e.g. MangoHud)
// must check this before interpreting the region; bump on any layout change.
constexpr uint8_t STATS_VERSION = 1;
// Identifies which FEX frontend produced the stats region.
enum class AppType : uint8_t {
LINUX_32,
LINUX_64,
WIN_ARM64EC,
WIN_WOW64,
};

// Header at the base of the shared stats memory region. Written by FEX and
// consumed by an out-of-process reader, so this layout is ABI: any change
// requires bumping STATS_VERSION.
struct ThreadStatsHeader {
uint8_t Version;            // STATS_VERSION of the writer.
AppType app_type;           // Which FEX frontend produced this region.
uint8_t _pad[2];
char fex_version[48];       // FEX git-describe string. NOT guaranteed NUL-terminated
                            // (SaveHeader strncpy's at most 48 bytes); readers must bound reads.
std::atomic<uint32_t> Head; // Byte offset of the first live ThreadStats slot; 0 == empty list.
std::atomic<uint32_t> Size; // Total size of the shared region in bytes.
uint32_t Pad;
};

// One per-thread statistics slot. Live slots form a singly-linked list,
// chained by byte offsets in `Next`, starting at ThreadStatsHeader::Head.
struct ThreadStats {
std::atomic<uint32_t> Next; // Byte offset (from region base) of the next slot; 0 terminates the list.
std::atomic<uint32_t> TID;  // Owning thread ID; 0 marks the slot free / to be ignored by readers.

// Accumulated time (In unscaled CPU cycles!)
uint64_t AccumulatedJITTime;
uint64_t AccumulatedSignalTime;

// Accumulated event counts
uint64_t AccumulatedSIGBUSCount;
uint64_t AccumulatedSMCCount;
};

#ifdef ENABLE_FEXCORE_PROFILER

#ifdef _M_ARM_64
/**
* @brief Get the raw cycle counter with synchronizing isb.
*
* `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature.
*/
static inline uint64_t GetCycleCounter() {
uint64_t Result {};
// The ISB serializes the pipeline so CNTVCT_EL0 is not read speculatively
// ahead of preceding instructions (the counter itself is not a serializing read).
__asm volatile(R"(
isb;
mrs %[Res], CNTVCT_EL0;
)"
: [Res] "=r"(Result));
return Result;
}
#else
/**
 * @brief Get the raw cycle counter on x86-64.
 *
 * Uses `rdtscp`, which does not execute until prior instructions have retired,
 * mirroring the light serialization of the ARM64 `isb` path.
 */
static inline uint64_t GetCycleCounter() {
  unsigned Aux;
  return __rdtscp(&Aux);
}
#endif

FEX_DEFAULT_VISIBILITY void Init();
FEX_DEFAULT_VISIBILITY void Shutdown();
FEX_DEFAULT_VISIBILITY void TraceObject(std::string_view const Format);
Expand All @@ -34,6 +94,36 @@ class ProfilerBlock final {
// Declare a scoped profile block variable with a fixed name.
#define FEXCORE_PROFILE_SCOPED(name) FEXCore::Profiler::ProfilerBlock UniqueScopeName(ScopedBlock_, __LINE__)(name)

template<typename T, size_t FlatOffset = 0>
class AccumulationBlock final {
public:
AccumulationBlock(T* Stat)
: Begin {GetCycleCounter()}
, Stat {Stat} {}

~AccumulationBlock() {
const auto Duration = GetCycleCounter() - Begin + FlatOffset;
if (Stat) {
auto ref = std::atomic_ref<T>(*Stat);
ref.fetch_add(Duration, std::memory_order_relaxed);
}
}

private:
uint64_t Begin;
T* Stat;
};

// Declares a scoped accumulator that adds the cycles spent in the current
// scope to ThreadState->ThreadStats->Stat. When the thread has no stats slot,
// nullptr is passed and the block records nothing.
#define FEXCORE_PROFILE_ACCUMULATION(ThreadState, Stat) \
FEXCore::Profiler::AccumulationBlock<decltype(ThreadState->ThreadStats->Stat)> UniqueScopeName(ScopedAccumulation_, __LINE__)( \
ThreadState->ThreadStats ? &ThreadState->ThreadStats->Stat : nullptr);
// Immediately adds `value` to a stat, guarded on the thread having a stats
// slot. Note: the increment is a plain (non-atomic) read-modify-write.
#define FEXCORE_PROFILE_INSTANT_INCREMENT(ThreadState, Stat, value) \
do { \
if (ThreadState->ThreadStats) { \
ThreadState->ThreadStats->Stat += value; \
} \
} while (0)

#else
[[maybe_unused]]
static void Init() {}
Expand All @@ -50,5 +140,12 @@ static void TraceObject(std::string_view const, uint64_t) {}
#define FEXCORE_PROFILE_SCOPED(...) \
do { \
} while (0)
#define FEXCORE_PROFILE_ACCUMULATION(...) \
do { \
} while (0)
#define FEXCORE_PROFILE_INSTANT_INCREMENT(...) \
do { \
} while (0)

#endif
} // namespace FEXCore::Profiler
3 changes: 2 additions & 1 deletion Source/Common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ set(SRCS
EnvironmentLoader.cpp
HostFeatures.cpp
JSONPool.cpp
StringUtil.cpp)
StringUtil.cpp
Profiler.cpp)

if (NOT MINGW_BUILD)
list (APPEND SRCS
Expand Down
120 changes: 120 additions & 0 deletions Source/Common/Profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// SPDX-License-Identifier: MIT
#include "Common/Profiler.h"
#include "git_version.h"

#include <FEXCore/Debug/InternalThreadState.h>

namespace FEX::Profiler {
// Initializes the shared-region header and slot bookkeeping.
// Requires Base/CurrentSize to already be set by the frontend allocator;
// silently no-ops when the backing allocation failed (Base == nullptr).
void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) {
if (!Base) {
return;
}

Head = reinterpret_cast<FEXCore::Profiler::ThreadStatsHeader*>(Base);
// Publish the region size first so a reader never scans past valid memory.
Head->Size.store(CurrentSize, std::memory_order_relaxed);
Head->Version = FEXCore::Profiler::STATS_VERSION;

// NOTE(review): strncpy with n == min(size, 48) never writes a terminating
// NUL when the git string fills (or exceeds) the copy length — presumably the
// mapping is zero-initialized so shorter strings terminate; readers must
// still bound reads to sizeof(fex_version). Confirm against the frontend.
std::string_view GitString = GIT_DESCRIBE_STRING;
strncpy(Head->fex_version, GitString.data(), std::min(GitString.size(), sizeof(Head->fex_version)));
Head->app_type = AppType;

// The slot array begins immediately after the header.
Stats = reinterpret_cast<FEXCore::Profiler::ThreadStats*>(reinterpret_cast<uint64_t>(Base) + sizeof(FEXCore::Profiler::ThreadStatsHeader));

RemainingSlots = TotalSlotsFromSize();
}

// Doubles the backing allocation to make room for more slots.
// Returns false when the frontend could not grow the mapping any further.
bool StatAllocBase::AllocateMoreSlots() {
  // Remember the current capacity so the number of gained slots can be derived.
  const auto PreviousSlotCount = TotalSlotsFromSize();

  // The frontend returns the achieved size; an unchanged size signals failure.
  const uint32_t GrownSize = FrontendAllocateSlots(CurrentSize * 2);
  if (GrownSize == CurrentSize) {
    return false;
  }

  CurrentSize = GrownSize;
  // Publish the enlarged region size so readers may scan the new slots.
  Head->Size.store(CurrentSize, std::memory_order_relaxed);
  RemainingSlots = TotalSlotsFromSize() - PreviousSlotCount;

  return true;
}

// Claims a free ThreadStats slot for thread `TID`, growing the backing region
// if all slots are in use, and links it onto the reader-visible list.
// Returns nullptr when the region cannot grow any further.
FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateSlot(uint32_t TID) {
if (!RemainingSlots) {
if (!AllocateMoreSlots()) {
return nullptr;
}
}

// Find a free slot (TID == 0 marks a slot as free).
// NOTE(review): if RemainingSlots over-counts, this loop falls through with
// AllocatedSlot pointing at the last, occupied slot — the bookkeeping above
// is what guarantees a free slot exists.
store_memory_barrier();
FEXCore::Profiler::ThreadStats* AllocatedSlot {};
for (size_t i = 0; i < TotalSlotsFromSize(); ++i) {
AllocatedSlot = &Stats[i];
if (AllocatedSlot->TID.load(std::memory_order_relaxed) == 0) {
break;
}
}

--RemainingSlots;

// Slot might be reused, just zero it now.
// BUGFIX: zero exactly one slot. The previous sizeof(ThreadStatsHeader)
// (64 bytes) is larger than a ThreadStats slot (40 bytes), so it clobbered
// the beginning of the following slot.
memset(AllocatedSlot, 0, sizeof(FEXCore::Profiler::ThreadStats));

// TID != 0 means slot is allocated; publish it after the slot is zeroed.
AllocatedSlot->TID.store(TID, std::memory_order_relaxed);

// Link the slot into the singly-linked list of byte offsets the reader walks.
if (Head->Head.load(std::memory_order_relaxed) == 0) {
Head->Head.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed);
} else {
StatTail->Next.store(OffsetFromStat(AllocatedSlot), std::memory_order_relaxed);
}

// Update the tail.
StatTail = AllocatedSlot;
return AllocatedSlot;
}

// Returns a slot to the free pool and unlinks it from the reader-visible
// singly-linked list. Safe to call with nullptr (no-op).
void StatAllocBase::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) {
if (!AllocatedSlot) {
return;
}

// TID == 0 will signal the reader to ignore this slot & deallocate it!
AllocatedSlot->TID.store(0, std::memory_order_relaxed);

// Make the TID clear visible before the list is restitched below.
store_memory_barrier();

const auto SlotOffset = OffsetFromStat(AllocatedSlot);
const auto AllocatedSlotNext = AllocatedSlot->Next.load(std::memory_order_relaxed);

const bool IsTail = AllocatedSlot == StatTail;

// Update the linked list.
if (Head->Head == SlotOffset) {
// Removing the list head: point the header at our successor.
Head->Head.store(AllocatedSlotNext, std::memory_order_relaxed);
if (IsTail) {
// Head and tail at once: the list is now empty.
StatTail = nullptr;
}
} else {
// Otherwise scan all slots for the predecessor (the slot whose Next offset
// points at us) and splice it to our successor. O(slot count).
for (size_t i = 0; i < TotalSlotsFromSize(); ++i) {
auto Slot = &Stats[i];
auto NextSlotOffset = Slot->Next.load(std::memory_order_relaxed);

if (NextSlotOffset == SlotOffset) {
Slot->Next.store(AllocatedSlotNext, std::memory_order_relaxed);

if (IsTail) {
// This slot is now the tail.
StatTail = Slot;
}
break;
}
}
}

++RemainingSlots;
}

} // namespace FEX::Profiler
69 changes: 69 additions & 0 deletions Source/Common/Profiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// SPDX-License-Identifier: MIT
/*
$info$
tags: Common|Profiler
desc: Frontend profiler common code
$end_info$
*/
#pragma once
#include <FEXCore/Utils/Profiler.h>

namespace FEXCore::Core {
struct InternalThreadState;
}

#ifdef _M_ARM_64
// Store-store barrier: ensures earlier stores become visible before any later
// stores. `dmb ishst` orders stores within the inner-shareable domain.
static inline void store_memory_barrier() {
asm volatile("dmb ishst;" ::: "memory");
}

#else
static inline void store_memory_barrier() {
// Intentionally empty.
// x86 is strongly memory ordered with regular loadstores. No need for barrier.
}
#endif

namespace FEX::Profiler {
// Shared base for per-frontend stat allocators. Manages a shared-memory region
// laid out as [ThreadStatsHeader][ThreadStats slot array...], handing out and
// reclaiming slots. Allocation/growth of the backing memory is delegated to
// the frontend via FrontendAllocateSlots().
class StatAllocBase {
protected:
// Claims a free slot for thread `TID`, growing the region when exhausted.
// Returns nullptr when allocation fails.
FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID);
// Releases a slot previously returned by AllocateSlot; nullptr is a no-op.
void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot);

// Byte offset of a slot from the region base. Offsets (not raw pointers) are
// what the reader-visible linked list stores, since readers map the region at
// a different address. Truncation to 32 bits is safe: region <= MAX_STATS_SIZE.
uint32_t OffsetFromStat(FEXCore::Profiler::ThreadStats* Stat) const {
return reinterpret_cast<uint64_t>(Stat) - reinterpret_cast<uint64_t>(Base);
}
// Number of usable slots in the current region.
// NOTE(review): the trailing `- 1` reserves one slot of slack — presumably so
// slot scans can never run past the mapping; confirm original intent.
uint32_t TotalSlotsFromSize() const {
return (CurrentSize - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1;
}
// Same computation for an arbitrary size, usable without an instance.
static uint32_t TotalSlotsFromSize(uint32_t Size) {
return (Size - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1;
}

// Converts a region byte offset back to a slot array index.
static uint32_t SlotIndexFromOffset(uint32_t Offset) {
return (Offset - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats);
}

// Fills in the region header (version, app type, FEX version string) and
// initializes slot bookkeeping. Requires Base/CurrentSize to be set first.
void SaveHeader(FEXCore::Profiler::AppType AppType);

void* Base;                                    // Start of the shared mapping; owned by the frontend.
uint32_t CurrentSize {};                       // Currently mapped size in bytes.
FEXCore::Profiler::ThreadStatsHeader* Head {}; // Header view over Base.
FEXCore::Profiler::ThreadStats* Stats;         // First slot, directly after the header.
FEXCore::Profiler::ThreadStats* StatTail {};   // Tail of the reader-visible slot list.
uint32_t RemainingSlots;                       // Free slots left before a grow is required.

// Limited to 4MB which should be a few hundred threads of tracking capability.
// I (Sonicadvance1) wanted to reserve 128MB of VA space because it's cheap, but ran in to a bug when running WINE.
// WINE allocates [0x7fff'fe00'0000, 0x7fff'ffff'0000) which /consistently/ overlaps with FEX's sigaltstack.
// This only occurs when this stat allocation size is large as the top-down allocation pushes the alt-stack further.
// Additionally, only occurs on 48-bit VA systems, as mmap on lesser VA will fail regardless.
// TODO: Bump allocation size up once FEXCore's allocator can first use the 128TB of blocked VA space on 48-bit systems.
constexpr static uint32_t MAX_STATS_SIZE = 4 * 1024 * 1024;

private:
// Frontend hook: grow the mapping to `NewSize` bytes (or as close as possible)
// and return the resulting size. Returning the old size means the grow failed.
virtual uint32_t FrontendAllocateSlots(uint32_t NewSize) = 0;
bool AllocateMoreSlots();
};

} // namespace FEX::Profiler
Loading

0 comments on commit 6a39a8d

Please sign in to comment.