Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Eliminate xblock liveness for shifts #3548

Merged
merged 14 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,84 @@ DEF_OP(Ashr) {
}
}

// Emits host code for the ShiftFlags IR op.
//
// Operands: Result (the already-computed shift result), Src1 (the value that
// was shifted), Src2 (the variable shift count) and PFInput (the incoming PF
// source value). The node's own register (PFOutput) receives the new PF source.
//
// - Src2 == 0: the flag-setting code is skipped, so NZCV is left untouched and
//   the output is PFInput.
// - Src2 != 0: N/Z are set from the result, C is replaced with the last bit
//   shifted out of Src1, V is replaced with the sign change between Src1 and
//   the result (for ASR, V keeps whatever the ands/cmn produced), and the
//   output is the result.
DEF_OP(ShiftFlags) {
  auto Op = IROp->C<IR::IROp_ShiftFlags>();
  const uint8_t OpSize = Op->Size;
  const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

  const auto PFOutput = GetReg(Node);
  const auto PFInput = GetReg(Op->PFInput.ID());
  const auto Dst = GetReg(Op->Result.ID());
  const auto Src1 = GetReg(Op->Src1.ID());
  const auto Src2 = GetReg(Op->Src2.ID());

  // The output register may alias one of the inputs that is still read below.
  // In that case every write to the output is staged in TMP4 and only copied
  // to PFOutput once all inputs have been consumed.
  const bool PFBlocked = (PFOutput == Dst) || (PFOutput == Src1) || (PFOutput == Src2);
  const auto PFTemp = PFBlocked ? TMP4 : PFOutput;

  // Set the output outside the branch to avoid needing an extra leg of the
  // branch. We specifically do not hardcode the PF register anywhere (relying
  // on a tied SRA register instead) to avoid fighting with RA/RCLSE.
  if (PFTemp != PFInput) {
    mov(ARMEmitter::Size::i64Bit, PFTemp, PFInput);
  }

  ARMEmitter::SingleUseForwardLabel Done;
  cbz(EmitSize, Src2, &Done);
  {
    // PF/SF/ZF/OF
    // Write the staged register, never PFOutput directly: when PFBlocked is
    // set, PFOutput is Dst/Src1/Src2 and those are still needed below.
    if (OpSize >= 4) {
      ands(EmitSize, PFTemp, Dst, Dst);
    } else {
      // Shift the sub-32-bit result up so its top bit lands in bit 31 for N/Z.
      unsigned Shift = 32 - (OpSize * 8);
      cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
      mov(ARMEmitter::Size::i64Bit, PFTemp, Dst);
    }

    // Extract the last bit shifted in to CF
    if (Op->Shift == IR::ShiftType::LSL) {
      if (OpSize >= 4) {
        // lsrv uses the count modulo the register width, so -Src2 acts as
        // (width - Src2).
        neg(EmitSize, TMP1, Src2);
      } else {
        mov(EmitSize, TMP1, OpSize * 8);
        sub(EmitSize, TMP1, TMP1, Src2);
      }
    } else {
      sub(ARMEmitter::Size::i64Bit, TMP1, Src2, 1);
    }

    lsrv(EmitSize, TMP1, Src1, TMP1);

    const bool SetOF = Op->Shift != IR::ShiftType::ASR;
    if (SetOF) {
      // Only defined when Shift is 1 else undefined
      // OF flag is set if a sign change occurred
      eor(EmitSize, TMP3, Src1, Dst);
    }

    if (CTX->HostFeatures.SupportsFlagM) {
      // Rotating right by 63 moves bit 0 of TMP1 into the C position.
      rmif(TMP1, 63, (1 << 1) /* C */);

      if (SetOF) {
        // Rotating right by (width - 1) moves the sign bit into the V position.
        rmif(TMP3, OpSize * 8 - 1, (1 << 0) /* V */);
      }
    } else {
      // Without FlagM, patch C (and V) into a copy of NZCV and write it back.
      mrs(TMP2, ARMEmitter::SystemRegister::NZCV);
      bfi(ARMEmitter::Size::i32Bit, TMP2, TMP1, 29 /* C */, 1);

      if (SetOF) {
        lsr(EmitSize, TMP3, TMP3, OpSize * 8 - 1);
        bfi(ARMEmitter::Size::i32Bit, TMP2, TMP3, 28 /* V */, 1);
      }

      msr(ARMEmitter::SystemRegister::NZCV, TMP2);
    }
  }

  // Bind before the final copy so the zero-shift path also publishes the
  // staged value (PFInput) to the real output register.
  Bind(&Done);

  // TODO: Make RA less dumb so this can't happen (e.g. with late-kill).
  if (PFBlocked) {
    mov(ARMEmitter::Size::i64Bit, PFOutput, PFTemp);
  }
}

DEF_OP(Ror) {
auto Op = IROp->C<IR::IROp_Ror>();
const uint8_t OpSize = IROp->Size;
Expand Down
21 changes: 5 additions & 16 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1577,9 +1577,7 @@ void OpDispatchBuilder::SHLOp(OpcodeArgs) {
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);

OrderedNode *Result = _Lshl(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Dest, Src);
StoreResult(GPRClass, Op, Result, -1);

GenerateFlags_ShiftLeft(Op, Result, Dest, Src);
HandleShift(Op, Result, Dest, ShiftType::LSL, Src);
}

template<bool SHL1Bit>
Expand All @@ -1602,9 +1600,7 @@ void OpDispatchBuilder::SHROp(OpcodeArgs) {
auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);

auto ALUOp = _Lshr(IR::SizeToOpSize(std::max<uint8_t>(4, GetSrcSize(Op))), Dest, Src);
StoreResult(GPRClass, Op, ALUOp, -1);

GenerateFlags_ShiftRight(Op, ALUOp, Dest, Src);
HandleShift(Op, ALUOp, Dest, ShiftType::LSR, Src);
}

template<bool SHR1Bit>
Expand Down Expand Up @@ -1663,10 +1659,7 @@ void OpDispatchBuilder::SHLDOp(OpcodeArgs) {
Shift, _Constant(0),
Dest, Res);

StoreResult(GPRClass, Op, Res, -1);

// No need to mask result, upper garbage is ignored in the flag calc
GenerateFlags_ShiftLeft(Op, Res, Dest, Shift);
HandleShift(Op, Res, Dest, ShiftType::LSL, Shift);
}

void OpDispatchBuilder::SHLDImmediateOp(OpcodeArgs) {
Expand Down Expand Up @@ -1734,9 +1727,7 @@ void OpDispatchBuilder::SHRDOp(OpcodeArgs) {
Shift, _Constant(0),
Dest, Res);

StoreResult(GPRClass, Op, Res, -1);

GenerateFlags_ShiftRight(Op, Res, Dest, Shift);
HandleShift(Op, Res, Dest, ShiftType::LSR, Shift);
}

void OpDispatchBuilder::SHRDImmediateOp(OpcodeArgs) {
Expand Down Expand Up @@ -1781,9 +1772,7 @@ void OpDispatchBuilder::ASHROp(OpcodeArgs) {
}

OrderedNode *Result = _Ashr(IR::SizeToOpSize(std::max<uint8_t>(4, GetSrcSize(Op))), Dest, Src);
StoreResult(GPRClass, Op, Result, -1);

GenerateFlags_SignShiftRight(Op, Result, Dest, Src);
HandleShift(Op, Result, Dest, ShiftType::ASR, Src);
}

template<bool SHR1Bit>
Expand Down
79 changes: 16 additions & 63 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,9 @@ friend class FEXCore::IR::PassManager;
TYPE_MUL,
TYPE_UMUL,
TYPE_LOGICAL,
TYPE_LSHL,
TYPE_LSHLI,
TYPE_LSHR,
TYPE_LSHRI,
TYPE_LSHRDI,
TYPE_ASHR,
TYPE_ASHRI,
TYPE_BEXTR,
TYPE_BLSI,
Expand Down Expand Up @@ -1293,6 +1290,8 @@ friend class FEXCore::IR::PassManager;

// Set flag tracking to prepare for a read-modify-write operation on NZCV.
void HandleNZCV_RMW(uint32_t _PossiblySetNZCVBits = ~0) {
CalculateDeferredFlags();

if (NZCVDirty && CachedNZCV)
_StoreNZCV(CachedNZCV);

Expand Down Expand Up @@ -1567,6 +1566,19 @@ friend class FEXCore::IR::PassManager;
SetRFLAG<FEXCore::X86State::X87FLAG_C2_LOC>(V);
}

// Stores the result of a variable-count shift and emits its flag update.
// The PF source value read before the update is passed to the ShiftFlags op
// as its old-PF operand, and the op's value is then handed to CalculatePF.
void HandleShift(X86Tables::DecodedOp Op, OrderedNode *Result,
                 OrderedNode *Dest, ShiftType Shift, OrderedNode *Src) {
  StoreResult(GPRClass, Op, Result, -1);

  // Read the current PF source first, then prepare NZCV for a
  // read-modify-write update.
  auto PreviousPF = GetRFLAG(X86State::RFLAG_PF_RAW_LOC);
  HandleNZCV_RMW();

  OrderedNode *NewPF = _ShiftFlags(OpSizeFromSrc(Op), Result, Dest, Shift, Src, PreviousPF);
  CalculatePF(NewPF);
}

// Helper to derive Dest by a given builder-using Expression with the opcode
// replaced with NewOp. Useful for generic building code. Not safe in general.
// but does the right handling of ImplicitFlagClobber at least and must be
Expand Down Expand Up @@ -1673,7 +1685,7 @@ friend class FEXCore::IR::PassManager;
OrderedNode *Src1;
} OneSource;

// Logical, LSHL, LSHR, ASHR
// Logical
struct {
OrderedNode *Src1;
OrderedNode *Src2;
Expand Down Expand Up @@ -1759,13 +1771,6 @@ friend class FEXCore::IR::PassManager;
PossiblySetNZCVBits |= OldSetNZCVBits;
}

// Runs CalculateFlags through Calculate_ShiftVariable for the given Shift,
// after first dropping the pending deferred-flag state.
template <class F>
void CalculateFlags_ShiftVariable(OrderedNode *Shift, F&& CalculateFlags) {
  // This call is itself the deferred-flag calculation, so the pending state
  // is invalidated up front to keep the flag helpers from recursing into it.
  InvalidateDeferredFlags();

  Calculate_ShiftVariable(Shift, CalculateFlags);
}

/**
* @name These functions are used by the deferred flag handling while it is calculating and storing flags in to RFLAGs.
* @{ */
Expand All @@ -1791,7 +1796,6 @@ friend class FEXCore::IR::PassManager;
void CalculateFlags_ShiftRightImmediate(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift);
void CalculateFlags_ShiftRightDoubleImmediate(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift);
void CalculateFlags_ShiftRightImmediateCommon(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift);
void CalculateFlags_SignShiftRight(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2);
void CalculateFlags_SignShiftRightImmediate(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift);
void CalculateFlags_BEXTR(OrderedNode *Src);
void CalculateFlags_BLSI(uint8_t SrcSize, OrderedNode *Src);
Expand Down Expand Up @@ -1861,57 +1865,6 @@ friend class FEXCore::IR::PassManager;
};
}

// Records a variable left shift (TYPE_LSHL) as the pending deferred flag
// calculation: source size, result and both operands are saved for later.
void GenerateFlags_ShiftLeft(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) {
  // The pending record is about to be replaced, so resolve it first.
  CalculateDeferredFlags();

  const DeferredFlagData Pending {
    .Type = FlagsGenerationType::TYPE_LSHL,
    .SrcSize = GetSrcSize(Op),
    .Res = Res,
    .Sources = {
      .TwoSource = {
        .Src1 = Src1,
        .Src2 = Src2,
      },
    },
  };
  CurrentDeferredFlags = Pending;
}

// Records a variable logical right shift (TYPE_LSHR) as the pending deferred
// flag calculation: source size, result and both operands are saved for later.
void GenerateFlags_ShiftRight(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) {
  // The pending record is about to be replaced, so resolve it first.
  CalculateDeferredFlags();

  const DeferredFlagData Pending {
    .Type = FlagsGenerationType::TYPE_LSHR,
    .SrcSize = GetSrcSize(Op),
    .Res = Res,
    .Sources = {
      .TwoSource = {
        .Src1 = Src1,
        .Src2 = Src2,
      },
    },
  };
  CurrentDeferredFlags = Pending;
}

// Records a variable arithmetic right shift (TYPE_ASHR) as the pending
// deferred flag calculation: source size, result and both operands are saved
// for later.
void GenerateFlags_SignShiftRight(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) {
  // The pending record is about to be replaced, so resolve it first.
  CalculateDeferredFlags();

  const DeferredFlagData Pending {
    .Type = FlagsGenerationType::TYPE_ASHR,
    .SrcSize = GetSrcSize(Op),
    .Res = Res,
    .Sources = {
      .TwoSource = {
        .Src1 = Src1,
        .Src2 = Src2,
      },
    },
  };
  CurrentDeferredFlags = Pending;
}

void GenerateFlags_ShiftLeftImmediate(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift) {
// No flags changed if shift is zero.
if (Shift == 0) return;
Expand Down
88 changes: 0 additions & 88 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,27 +303,13 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) {
CurrentDeferredFlags.Sources.TwoSource.Src1,
CurrentDeferredFlags.Sources.TwoSource.Src2);
break;
case FlagsGenerationType::TYPE_LSHL:
CalculateFlags_ShiftLeft(
CurrentDeferredFlags.SrcSize,
CurrentDeferredFlags.Res,
CurrentDeferredFlags.Sources.TwoSource.Src1,
CurrentDeferredFlags.Sources.TwoSource.Src2);
break;
case FlagsGenerationType::TYPE_LSHLI:
CalculateFlags_ShiftLeftImmediate(
CurrentDeferredFlags.SrcSize,
CurrentDeferredFlags.Res,
CurrentDeferredFlags.Sources.OneSrcImmediate.Src1,
CurrentDeferredFlags.Sources.OneSrcImmediate.Imm);
break;
case FlagsGenerationType::TYPE_LSHR:
CalculateFlags_ShiftRight(
CurrentDeferredFlags.SrcSize,
CurrentDeferredFlags.Res,
CurrentDeferredFlags.Sources.TwoSource.Src1,
CurrentDeferredFlags.Sources.TwoSource.Src2);
break;
case FlagsGenerationType::TYPE_LSHRI:
CalculateFlags_ShiftRightImmediate(
CurrentDeferredFlags.SrcSize,
Expand All @@ -338,13 +324,6 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) {
CurrentDeferredFlags.Sources.OneSrcImmediate.Src1,
CurrentDeferredFlags.Sources.OneSrcImmediate.Imm);
break;
case FlagsGenerationType::TYPE_ASHR:
CalculateFlags_SignShiftRight(
CurrentDeferredFlags.SrcSize,
CurrentDeferredFlags.Res,
CurrentDeferredFlags.Sources.TwoSource.Src1,
CurrentDeferredFlags.Sources.TwoSource.Src2);
break;
case FlagsGenerationType::TYPE_ASHRI:
CalculateFlags_SignShiftRightImmediate(
CurrentDeferredFlags.SrcSize,
Expand Down Expand Up @@ -580,73 +559,6 @@ void OpDispatchBuilder::CalculateFlags_Logical(uint8_t SrcSize, OrderedNode *Res
SetNZ_ZeroCV(SrcSize, Res);
}

void OpDispatchBuilder::CalculateFlags_ShiftLeft(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) {
CalculateFlags_ShiftVariable(Src2, [this, SrcSize, Res, Src1, Src2](){
const auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
SetNZ_ZeroCV(SrcSize, Res);

// Extract the last bit shifted in to CF
auto Size = _Constant(SrcSize * 8);
auto ShiftAmt = _Sub(OpSize, Size, Src2);
auto LastBit = _Lshr(OpSize, Src1, ShiftAmt);
SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(LastBit, 0, true);

CalculatePF(Res);

// AF
// Undefined
_InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC);

// In the case of left shift. OF is only set from the result of <Top Source Bit> XOR <Top Result Bit>
// When Shift > 1 then OF is undefined
auto OFXor = _Xor(OpSize, Src1, Res);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(OFXor, SrcSize * 8 - 1, true);
});
}

void OpDispatchBuilder::CalculateFlags_ShiftRight(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) {
CalculateFlags_ShiftVariable(Src2, [this, SrcSize, Res, Src1, Src2](){
const auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
SetNZ_ZeroCV(SrcSize, Res);

// Extract the last bit shifted in to CF
auto ShiftAmt = _Sub(OpSize::i64Bit, Src2, _Constant(1));
const auto CFSize = IR::SizeToOpSize(std::max<uint8_t>(4u, SrcSize));
auto LastBit = _Lshr(CFSize, Src1, ShiftAmt);
SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(LastBit, 0, true);

CalculatePF(Res);

// AF
// Undefined
_InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC);

// Only defined when Shift is 1 else undefined
// OF flag is set if a sign change occurred
auto val = _Xor(OpSize, Src1, Res);
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(val, SrcSize * 8 - 1, true);
});
}

// Emits the flag calculation for a variable arithmetic right shift.
// SrcSize is the operand size in bytes, Res the shift result, Src1 the value
// that was shifted and Src2 the shift count. The work is wrapped in
// CalculateFlags_ShiftVariable, keyed on the shift count.
void OpDispatchBuilder::CalculateFlags_SignShiftRight(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) {
  CalculateFlags_ShiftVariable(Src2, [this, SrcSize, Res, Src1, Src2]() {
    // SF/ZF/OF
    SetNZ_ZeroCV(SrcSize, Res);

    // CF: shift Src1 right by (count - 1) so the last bit shifted out ends up
    // in bit 0. The shift width follows Src1's own op size, at least 4 bytes.
    const auto CarryWidth = IR::SizeToOpSize(std::max<uint32_t>(4u, GetOpSize(Src1)));
    auto CarryShift = _Sub(OpSize::i64Bit, Src2, _Constant(1));
    auto CarryBit = _Lshr(CarryWidth, Src1, CarryShift);
    SetRFLAG<FEXCore::X86State::RFLAG_CF_RAW_LOC>(CarryBit, 0, true);

    CalculatePF(Res);

    // AF is undefined after this operation.
    _InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC);
  });
}

void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(uint8_t SrcSize, OrderedNode *UnmaskedRes, OrderedNode *Src1, uint64_t Shift) {
// No flags changed if shift is zero
if (Shift == 0) return;
Expand Down
Loading
Loading