Skip to content

Commit

Permalink
JIT: rewrite pdep implementation
Browse files Browse the repository at this point in the history
- use better algorithm that is O(# set bits) instead of O(# total bits)
- eliminate spilling by careful management of our temporaries
- fix nzcv clobber bug (whoops)

Signed-off-by: Alyssa Rosenzweig <[email protected]>
  • Loading branch information
alyssarosenzweig committed Jan 23, 2024
1 parent 4b211b3 commit d049c91
Showing 1 changed file with 39 additions and 48 deletions.
87 changes: 39 additions & 48 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -713,64 +713,55 @@ DEF_OP(PDep) {
LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

const auto Input = GetReg(Op->Input.ID());
const auto Mask = GetReg(Op->Mask.ID());
const auto Dest = GetReg(Node);

const auto ShiftedBitReg = TMP1.R();
const auto BitReg = TMP2.R();
const auto SubMaskReg = TMP3.R();
const auto IndexReg = TMP4.R();
const auto ZeroReg = ARMEmitter::Reg::zr;

const auto InputReg = StaticRegisters[0];
const auto MaskReg = StaticRegisters[1];
const auto DestReg = StaticRegisters[2];

const auto SpillCode = 1U << InputReg.Idx() |
1U << MaskReg.Idx() |
1U << DestReg.Idx();
// PDep implementation follows the ideas from
// http://0x80.pl/articles/pdep-soft-emu.html ... Basically, iterate the *set*
// bits only, which will be faster than the naive implementation as long as
// there are enough holes in the mask.
//
// The specific arm64 assembly used is based on the sequence that clang
// generates for the C code, which is where the instruction scheduling comes
// from: it yields better ILP than I would get by hand. The registers are
// allocated by hand, however, to fit within the tight constraints we have
// here without spilling. Also, we use cbz/cbnz for conditional branching to
// avoid clobbering NZCV.

// We can't clobber these
const auto OrigInput = GetReg(Op->Input.ID());
const auto OrigMask = GetReg(Op->Mask.ID());
const auto ZeroReg = ARMEmitter::Reg::zr;

// So we shadow them in temporaries
const auto Input = TMP1.R();
const auto Mask = TMP2.R();

// these get used variously as scratch
const auto T0 = TMP3.R();
const auto T1 = TMP4.R();

ARMEmitter::SingleUseForwardLabel EarlyExit;
ARMEmitter::BackwardLabel NextBit;
ARMEmitter::SingleUseForwardLabel Done;
cbz(EmitSize, Mask, &EarlyExit);
mov(EmitSize, IndexReg, ZeroReg);

// We sadly need to spill regs for this for the time being
// TODO: Remove when scratch registers can be allocated
// explicitly.
SpillStaticRegs(TMP1, false, SpillCode);

// Handle early exit case
mov(EmitSize, Dest, ZeroReg);
cbz(EmitSize, OrigMask, &Done);

mov(EmitSize, InputReg, Input);
mov(EmitSize, MaskReg, Mask);
mov(EmitSize, DestReg, ZeroReg);
// Setup for first iteration
mov(EmitSize, Input, OrigInput);
mov(EmitSize, Mask, OrigMask);
neg(EmitSize, T0, Mask);
and_(EmitSize, T0, T0, Mask);

// Main loop
Bind(&NextBit);
rbit(EmitSize, ShiftedBitReg, MaskReg);
clz(EmitSize, ShiftedBitReg, ShiftedBitReg);
lsrv(EmitSize, BitReg, InputReg, IndexReg);
and_(EmitSize, BitReg, BitReg, 1);
sub(EmitSize, SubMaskReg, MaskReg, 1);
add(EmitSize, IndexReg, IndexReg, 1);
ands(EmitSize, MaskReg, MaskReg, SubMaskReg);
lslv(EmitSize, ShiftedBitReg, BitReg, ShiftedBitReg);
orr(EmitSize, DestReg, DestReg, ShiftedBitReg);
b(ARMEmitter::Condition::CC_NE, &NextBit);
// Store result in a temp so it doesn't get clobbered.
// and restore it after the re-fill below.
mov(EmitSize, IndexReg, DestReg);
// Restore our registers before leaving
// TODO: Also remove along with above TODO.
FillStaticRegs(false, SpillCode);
mov(EmitSize, Dest, IndexReg);
b(&Done);

// Early exit
Bind(&EarlyExit);
mov(EmitSize, Dest, ZeroReg);
sbfx(EmitSize, T1, Input, 0, 1);
eor(EmitSize, Mask, Mask, T0);
and_(EmitSize, T0, T1, T0);
neg(EmitSize, T1, Mask);
orr(EmitSize, Dest, Dest, T0);
lsr(EmitSize, Input, Input, 1);
and_(EmitSize, T0, Mask, T1);
cbnz(EmitSize, T0, &NextBit);

// All done with nothing to do.
Bind(&Done);
Expand Down

0 comments on commit d049c91

Please sign in to comment.