Skip to content

Commit

Permalink
D8_00 passes
Browse files Browse the repository at this point in the history
Generating prologue and epilogue properly.
  • Loading branch information
pmatos committed Apr 2, 2024
1 parent fee85e9 commit db49d4e
Showing 1 changed file with 235 additions and 81 deletions.
316 changes: 235 additions & 81 deletions FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "Interface/IR/IREmitter.h"
#include "Interface/IR/PassManager.h"
#include <FEXCore/IR/IR.h>
#include <FEXCore/IR/IntrusiveIRList.h>
#include <FEXCore/Utils/Profiler.h>
#include <FEXCore/fextl/vector.h>

Expand All @@ -10,35 +9,164 @@

namespace FEXCore::IR {

template<typename T>
class CircularBuffer {
private:
using StorageType = typename std::aligned_storage<sizeof(T), alignof(T)>::type;
fextl::vector<StorageType> buffer;
fextl::vector<bool> constructed; // TODO(pmatos): We probably can use something better here
int index; // Current insertion index

public:
CircularBuffer(std::size_t size)
: buffer(size)
, constructed(size, false)
, index(0) {}

~CircularBuffer() {
for (std::size_t i = 0; i < size(); ++i) {
if (constructed[i]) {
reinterpret_cast<T*>(&buffer[i])->~T();
}
}
}

template<typename... Args>
void push(Args&&... args) {
index = (index - 1 + size()) % size();
std::size_t pos = index;
if (constructed[pos]) {
reinterpret_cast<T*>(&buffer[pos])->~T();
}
LogMan::Msg::DFmt("Push to {}\n", index);
new (&buffer[pos]) T(std::forward<Args>(args)...);
constructed[pos] = true;
}

template<typename... Args>
void setTop(Args&&... args) {
std::size_t pos = index;
if (constructed[pos]) {
reinterpret_cast<T*>(&buffer[pos])->~T();
}
LogMan::Msg::DFmt("SetTop to {}\n", index);
new (&buffer[pos]) T(std::forward<Args>(args)...);
constructed[pos] = true;
}

void pop() {
if (!constructed.empty() && constructed[index]) {
LogMan::Msg::DFmt("Pop\n");
std::size_t popIndex = (index + 1) % size();
reinterpret_cast<T*>(&buffer[popIndex])->~T();
constructed[popIndex] = false;
index = popIndex;
}
}

T& top() {
LogMan::Msg::DFmt("Top\n");
std::size_t pos = index;
return *reinterpret_cast<T*>(&buffer[pos]);
}

const T& top(size_t offset = 0) const {
size_t pos = index;
return *reinterpret_cast<const T*>(&buffer[(pos + offset) % size()]);
}

inline size_t count() const {
size_t sz = 0;
for (size_t i = 0; i < constructed.size(); ++i) {
if (constructed[i]) {
sz++;
}
}
LogMan::Msg::DFmt("Count: {}\n", sz);
return sz;
}

inline size_t size() const {
return constructed.size();
}

inline T& operator[](size_t i) {
return *reinterpret_cast<T*>(&buffer[i]);
}

inline const T& operator[](size_t i) const {
return *reinterpret_cast<const T*>(&buffer[i]);
}

inline void clear() {
for (std::size_t i = 0; i < size(); ++i) {
if (constructed[i]) {
reinterpret_cast<T*>(&buffer[i])->~T();
constructed[i] = false;
}
}
index = 0;
}

inline bool valid(size_t i) const {
return constructed[i];
}
};

class X87StackOptimization final : public FEXCore::IR::Pass {
public:
bool Run(IREmitter *IREmit) override;
bool Run(IREmitter* IREmit) override;

private:
// FIXME: copy from OpcodeDispatcher.h
[[nodiscard]] uint32_t MMBaseOffset() {
// FIXME(pmatos): copy from OpcodeDispatcher.h
[[nodiscard]]
uint32_t MMBaseOffset() {
return static_cast<uint32_t>(offsetof(Core::CPUState, mm[0][0]));
}

// Top Management Helpers
OrderedNode* GetX87Top(IREmitter* IREmit);
void SetX87Top(IREmitter* IREmit, OrderedNode* Value);
void SetX87ValidTag(IREmitter* IREmit, OrderedNode* Value, bool Valid);

struct StackMemberInfo {
IR::OpSize SourceDataSize; // Size of SourceDataNode
IR::OpSize StackDataSize; // Size of the loaded data (??? FIXME)
IR::NodeID SourceDataNodeID; // ID of the node
IR::OrderedNode* SourceDataNode; // Reference to the value pushed to stack
IR::OrderedNode* DataLoadNode; // Reference to the IR node that loaded the data
bool InterpretAsFloat {}; // True if this is a floating point value, false if integer


};
fextl::vector<StackMemberInfo> StackData{8};
// Index on vector is offset to top value at start of block
CircularBuffer<StackMemberInfo> StackData {8};
};

bool X87StackOptimization::Run(IREmitter *IREmit) {
// Emits a 1-byte GPR-class context load of the x87 TOP field
// (flags[X87FLAG_TOP_LOC] in CPUState) and returns the resulting node.
OrderedNode* X87StackOptimization::GetX87Top(IREmitter* IREmit) {
  const auto TopFieldOffset = offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC;
  return IREmit->_LoadContext(1, GPRClass, TopFieldOffset);
}

// Emits a 1-byte GPR-class context store of `Value` into the x87 TOP field
// (flags[X87FLAG_TOP_LOC] in CPUState).
void X87StackOptimization::SetX87Top(IREmitter* IREmit, OrderedNode* Value) {
  const auto TopFieldOffset = offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC;
  IREmit->_StoreContext(1, GPRClass, Value, TopFieldOffset);
}

// Emits IR that sets (Valid == true) or clears (Valid == false) bit `Value`
// of the 1-byte AbridgedFTW field in CPUState.
// `Value` is the node holding the bit position to update.
void X87StackOptimization::SetX87ValidTag(IREmitter* IREmit, OrderedNode* Value, bool Valid) {
  const auto TagWordOffset = offsetof(FEXCore::Core::CPUState, AbridgedFTW);

  // if we are popping then we must first mark this location as empty
  OrderedNode* CurrentTags = IREmit->_LoadContext(1, GPRClass, TagWordOffset);

  // Single-bit mask: 1 << Value.
  OrderedNode* One = IREmit->_Constant(1);
  OrderedNode* BitMask = IREmit->_Lshl(OpSize::i32Bit, One, Value);

  OrderedNode* UpdatedTags;
  if (Valid) {
    UpdatedTags = IREmit->_Or(OpSize::i32Bit, CurrentTags, BitMask);
  } else {
    UpdatedTags = IREmit->_Andn(OpSize::i32Bit, CurrentTags, BitMask);
  }

  IREmit->_StoreContext(1, GPRClass, UpdatedTags, TagWordOffset);
}

bool X87StackOptimization::Run(IREmitter* IREmit) {
FEXCORE_PROFILE_SCOPED("PassManager::x87StackOpt");

bool Changed = false;
auto CurrentIR = IREmit->ViewIR();
auto *OriginalWriteCursor = IREmit->GetWriteCursor();
auto* OriginalWriteCursor = IREmit->GetWriteCursor();

auto *HeaderOp = CurrentIR.GetHeader();
auto* HeaderOp = CurrentIR.GetHeader();
LOGMAN_THROW_AA_FMT(HeaderOp->Header.Op == OP_IRHEADER, "First op wasn't IRHeader");

if (!HeaderOp->HasX87) {
Expand All @@ -54,86 +182,102 @@ bool X87StackOptimization::Run(IREmitter *IREmit) {
// through the x87 tag register.
// TODO(pmatos)

// Get beginning of block
// FIXME(pmatos): there must be a better way to do this.
auto [BlockBegin, BlockBeginHeader] = *CurrentIR.GetBlocks().begin();
auto [CodeBegin, IROpBegin] = *CurrentIR.GetCode(BlockBegin).begin();

// Get Top at beginning of block
IREmit->SetWriteCursor(CodeBegin);
auto* orig_top = GetX87Top(IREmit);

// Run optimization proper
for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) {
for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) {
switch (IROp->Op) {
case IR::OP_PUSHSTACK: {
LogMan::Msg::DFmt("OP_PUSHSTACK\n");
const auto *Op = IROp->C<IR::IROp_PushStack>();
auto SourceNodeID = Op->X80Src.ID();
auto *SourceNode = CurrentIR.GetNode(Op->X80Src);
auto *SourceNodeOp = CurrentIR.GetOp<IROp_Header>(SourceNode);
auto SourceNodeSize = SourceNodeOp->Size;
StackData.emplace_back(StackMemberInfo {
.SourceDataSize = IR::SizeToOpSize(SourceNodeSize),
.StackDataSize = IR::SizeToOpSize(Op->LoadSize),
.SourceDataNodeID = SourceNodeID,
.SourceDataNode = SourceNode,
.DataLoadNode = CodeNode,
.InterpretAsFloat = Op->Float,
});

LogMan::Msg::DFmt("Stack depth at: {}", StackData.size());
IREmit->SetWriteCursor(CodeNode);
IREmit->Remove(CodeNode); // Remove PushStack - it's a nop, we just need to track the stack
Changed = true;
break;
case IR::OP_PUSHSTACK: {
LogMan::Msg::DFmt("OP_PUSHSTACK\n");
const auto* Op = IROp->C<IR::IROp_PushStack>();
auto SourceNodeID = Op->X80Src.ID();
auto* SourceNode = CurrentIR.GetNode(Op->X80Src);
auto* SourceNodeOp = CurrentIR.GetOp<IROp_Header>(SourceNode);
auto SourceNodeSize = SourceNodeOp->Size;
StackData.push(StackMemberInfo {
.SourceDataSize = IR::SizeToOpSize(SourceNodeSize),
.StackDataSize = IR::SizeToOpSize(Op->LoadSize),
.SourceDataNodeID = SourceNodeID,
.SourceDataNode = SourceNode,
.DataLoadNode = CodeNode,
.InterpretAsFloat = Op->Float,
});

LogMan::Msg::DFmt("Stack depth at: {}", StackData.count());
IREmit->SetWriteCursor(CodeNode);
IREmit->Remove(CodeNode); // Remove PushStack - it's a nop, we just need to track the stack
Changed = true;
break;
}
case IR::OP_POPSTACKMEMORY: {
LogMan::Msg::DFmt("OP_POPSTACKMEMORY\n");
const auto* Op = IROp->C<IR::IROp_PopStackMemory>();
const auto& StackMember = StackData.top();
if (Op->Float == StackMember.InterpretAsFloat && Op->StoreSize == StackMember.StackDataSize && Op->StoreSize == StackMember.SourceDataSize) {
LogMan::Msg::DFmt("Could optimize memcpy!");
}
case IR::OP_POPSTACKMEMORY: {
LogMan::Msg::DFmt("OP_POPSTACKMEMORY\n");
const auto *Op = IROp->C<IR::IROp_PopStackMemory>();
const auto& StackMember = StackData.back();
if (Op->Float == StackMember.InterpretAsFloat &&
Op->StoreSize == StackMember.StackDataSize &&
Op->StoreSize == StackMember.SourceDataSize) {
LogMan::Msg::DFmt("Could optimize memcpy!");
}

IREmit->SetWriteCursor(CodeNode);
IREmit->SetWriteCursor(CodeNode);

auto *AddrNode = CurrentIR.GetNode(Op->Addr);
if (StackMember.SourceDataSize == OpSize::i128Bit) {
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackMember.SourceDataNode, 1);
auto NewLocation = IREmit->_Add(OpSize::i64Bit, AddrNode, IREmit->_Constant(8));
IREmit->_VStoreVectorElement(OpSize::i128Bit, OpSize::i16Bit, StackMember.SourceDataNode, 4, NewLocation);
}
else {
IREmit->_StoreMem(FPRClass, StackMember.SourceDataSize, AddrNode, StackMember.SourceDataNode, 1);
}
auto* AddrNode = CurrentIR.GetNode(Op->Addr);
if (StackMember.SourceDataSize == OpSize::i128Bit) {
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackMember.SourceDataNode, 1);
auto NewLocation = IREmit->_Add(OpSize::i64Bit, AddrNode, IREmit->_Constant(8));
IREmit->_VStoreVectorElement(OpSize::i128Bit, OpSize::i16Bit, StackMember.SourceDataNode, 4, NewLocation);
} else {
IREmit->_StoreMem(FPRClass, StackMember.SourceDataSize, AddrNode, StackMember.SourceDataNode, 1);
}

IREmit->Remove(StackMember.DataLoadNode);
IREmit->Remove(CodeNode);
Changed = true;
IREmit->Remove(StackMember.DataLoadNode);
IREmit->Remove(CodeNode);
Changed = true;

StackData.pop_back();
LogMan::Msg::DFmt("Stack depth at: {}", StackData.size());
break;
}
case IR::OP_F80ADDSTACK: {
LogMan::Msg::DFmt("OP_F80ADDSTACK\n");
const auto* Op = IROp->C<IR::IROp_F80AddStack>();
(void)Op; // avoid warning for now
LogMan::Msg::DFmt("Stack depth at: {}", StackData.size());
break;
}
case IR::OP_F80ADDVALUE: {
LogMan::Msg::DFmt("F80ADDVALUE\n");
const auto* Op = IROp->C<IR::IROp_F80AddValue>();
auto* ValueNode = CurrentIR.GetNode(Op->X80Src);

auto StackOffset = Op->SrcStack1;
const auto& StackMember = StackData[StackData.size() - StackOffset - 1];
auto* StackNode = StackMember.SourceDataNode;

IREmit->SetWriteCursor(CodeNode);
IREmit->_F80Add(ValueNode, StackNode);
IREmit->Remove(CodeNode);
Changed = true;
LogMan::Msg::DFmt("Stack depth at: {}", StackData.size());
break;
}
default: break;
StackData.pop();
LogMan::Msg::DFmt("Stack depth at: {}", StackData.count());
break;
}
case IR::OP_F80ADDSTACK: {
LogMan::Msg::DFmt("OP_F80ADDSTACK\n");
const auto* Op = IROp->C<IR::IROp_F80AddStack>();
(void)Op; // avoid warning for now
LogMan::Msg::DFmt("Stack depth at: {}", StackData.count());
break;
}
case IR::OP_F80ADDVALUE: {
LogMan::Msg::DFmt("F80ADDVALUE\n");
const auto* Op = IROp->C<IR::IROp_F80AddValue>();
auto SourceNodeID = Op->X80Src.ID();
auto* ValueNode = CurrentIR.GetNode(Op->X80Src);

auto StackOffset = Op->SrcStack1;
const auto& StackMember = StackData.top(StackOffset);
auto* StackNode = StackMember.SourceDataNode;

IREmit->SetWriteCursor(CodeNode);

auto AddNode = IREmit->_F80Add(ValueNode, StackNode);
// Store it in the stack
StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize,
.StackDataSize = StackMember.StackDataSize,
.SourceDataNodeID = SourceNodeID,
.SourceDataNode = AddNode,
.DataLoadNode = CodeNode,
.InterpretAsFloat = StackMember.InterpretAsFloat});

IREmit->Remove(CodeNode);
Changed = true;
LogMan::Msg::DFmt("Stack depth at: {}", StackData.count());
break;
}
default: break;
}
}
}
Expand All @@ -155,13 +299,23 @@ bool X87StackOptimization::Run(IREmitter *IREmit) {
// context so that the values are correct. Copy SourceDataNode in the
// stack to the respective mmX register.
for (size_t i = 0; i < StackData.size(); ++i) {
if (!StackData.valid(i)) {
continue;
}
LogMan::Msg::DFmt("Writing stack member {} to context", i);
Changed = true;
auto &StackMember = StackData[i];
auto *Node = StackMember.SourceDataNode;
IREmit->_StoreContextIndexed(Node, IREmit->_Constant(i), 16,
MMBaseOffset(), 16, FPRClass);
IREmit->_StoreContextIndexed(Node, IREmit->_Add(OpSize::i32Bit, orig_top, IREmit->_Constant(i)), 16, MMBaseOffset(), 16, FPRClass);
}

// Store new top which is now the original top - the number of elements in stack.
// Careful with underflow wraparound.
auto mask = IREmit->_Constant(0x7);
auto new_top = IREmit->_And(OpSize::i32Bit, IREmit->_Sub(OpSize::i32Bit, orig_top, IREmit->_Constant(1)), mask);
SetX87ValidTag(IREmit, new_top, true);
SetX87Top(IREmit, new_top);

break;
}
}
Expand Down

0 comments on commit db49d4e

Please sign in to comment.