diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
index 95f67911bd..f0abcbd767 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
@@ -713,64 +713,55 @@ DEF_OP(PDep) {
   LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
   const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
 
-  const auto Input = GetReg(Op->Input.ID());
-  const auto Mask = GetReg(Op->Mask.ID());
   const auto Dest = GetReg(Node);
 
-  const auto ShiftedBitReg = TMP1.R();
-  const auto BitReg = TMP2.R();
-  const auto SubMaskReg = TMP3.R();
-  const auto IndexReg = TMP4.R();
-  const auto ZeroReg = ARMEmitter::Reg::zr;
-
-  const auto InputReg = StaticRegisters[0];
-  const auto MaskReg = StaticRegisters[1];
-  const auto DestReg = StaticRegisters[2];
-
-  const auto SpillCode = 1U << InputReg.Idx() |
-                         1U << MaskReg.Idx() |
-                         1U << DestReg.Idx();
+  // PDep implementation follows the ideas from
+  // http://0x80.pl/articles/pdep-soft-emu.html ... Basically, iterate the *set*
+  // bits only, which will be faster than the naive implementation as long as
+  // there are enough holes in the mask.
+  //
+  // The specific arm64 assembly used is based on the sequence that clang
+  // generates for the C code, since the compiler's scheduling yields better
+  // ILP than I would manage by hand. The registers are allocated by hand,
+  // however, to fit within the tight constraints we have here without spilling.
+  // Also, we use cbz/cbnz for conditional branching to avoid clobbering NZCV.
+
+  // We can't clobber these
+  const auto OrigInput = GetReg(Op->Input.ID());
+  const auto OrigMask = GetReg(Op->Mask.ID());
+  const auto ZeroReg = ARMEmitter::Reg::zr;
+
+  // So we shadow them in temporaries
+  const auto Input = TMP1.R();
+  const auto Mask = TMP2.R();
+
+  // These get used variously as scratch
+  const auto T0 = TMP3.R();
+  const auto T1 = TMP4.R();
 
-  ARMEmitter::SingleUseForwardLabel EarlyExit;
   ARMEmitter::BackwardLabel NextBit;
   ARMEmitter::SingleUseForwardLabel Done;
 
-  cbz(EmitSize, Mask, &EarlyExit);
-  mov(EmitSize, IndexReg, ZeroReg);
-
-  // We sadly need to spill regs for this for the time being
-  // TODO: Remove when scratch registers can be allocated
-  // explicitly.
-  SpillStaticRegs(TMP1, false, SpillCode);
+  // Handle the early exit case
+  mov(EmitSize, Dest, ZeroReg);
+  cbz(EmitSize, OrigMask, &Done);
 
-  mov(EmitSize, InputReg, Input);
-  mov(EmitSize, MaskReg, Mask);
-  mov(EmitSize, DestReg, ZeroReg);
+  // Set up for the first iteration
+  mov(EmitSize, Input, OrigInput);
+  mov(EmitSize, Mask, OrigMask);
+  neg(EmitSize, T0, Mask);
+  and_(EmitSize, T0, T0, Mask);
 
   // Main loop
   Bind(&NextBit);
-  rbit(EmitSize, ShiftedBitReg, MaskReg);
-  clz(EmitSize, ShiftedBitReg, ShiftedBitReg);
-  lsrv(EmitSize, BitReg, InputReg, IndexReg);
-  and_(EmitSize, BitReg, BitReg, 1);
-  sub(EmitSize, SubMaskReg, MaskReg, 1);
-  add(EmitSize, IndexReg, IndexReg, 1);
-  ands(EmitSize, MaskReg, MaskReg, SubMaskReg);
-  lslv(EmitSize, ShiftedBitReg, BitReg, ShiftedBitReg);
-  orr(EmitSize, DestReg, DestReg, ShiftedBitReg);
-  b(ARMEmitter::Condition::CC_NE, &NextBit);
-  // Store result in a temp so it doesn't get clobbered.
-  // and restore it after the re-fill below.
-  mov(EmitSize, IndexReg, DestReg);
-  // Restore our registers before leaving
-  // TODO: Also remove along with above TODO.
-  FillStaticRegs(false, SpillCode);
-  mov(EmitSize, Dest, IndexReg);
-  b(&Done);
-
-  // Early exit
-  Bind(&EarlyExit);
-  mov(EmitSize, Dest, ZeroReg);
+  sbfx(EmitSize, T1, Input, 0, 1);
+  eor(EmitSize, Mask, Mask, T0);
+  and_(EmitSize, T0, T1, T0);
+  neg(EmitSize, T1, Mask);
+  orr(EmitSize, Dest, Dest, T0);
+  lsr(EmitSize, Input, Input, 1);
+  and_(EmitSize, T0, Mask, T1);
+  cbnz(EmitSize, T0, &NextBit);
 
   // All done with nothing to do.
   Bind(&Done);
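
For readers following along, the branchless loop the new assembly mirrors looks roughly like this. This is a sketch of the set-bits-only approach from the linked 0x80.pl article, under a hypothetical name `PDepReference`; it is not code from the patch itself, but each statement lines up with one of the emitted instructions:

```cpp
#include <cstdint>

// Deposit the low bits of `input` into the positions of the set bits of
// `mask`, visiting only the set bits of the mask (64-bit operating size).
uint64_t PDepReference(uint64_t input, uint64_t mask) {
  uint64_t dest = 0;                // mov(Dest, ZeroReg); mask == 0 skips the loop
  while (mask != 0) {
    uint64_t lowest = mask & -mask; // isolate lowest set mask bit (neg + and_)
    uint64_t bit = -(input & 1);    // all-ones if input bit 0 is set (sbfx)
    dest |= lowest & bit;           // deposit the bit, branchlessly (and_ + orr)
    mask ^= lowest;                 // clear that mask bit (eor)
    input >>= 1;                    // advance to the next input bit (lsr)
  }
  return dest;
}
```

Since the loop runs once per *set* mask bit rather than once per bit position, sparse masks exit quickly, and because none of these operations needs flags, the cbz/cbnz control flow leaves NZCV untouched, as the patch comment notes.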