diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp index 84f00b923f..f10ba4cbef 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp @@ -863,6 +863,96 @@ DEF_OP(Ashr) { } }
+// Emits the NZCV and raw-PF update for a variable (register-count) shift.
+//
+// Operands: Result is the already-computed shifted value, Src1 the value
+// before shifting, Src2 the shift count and PFInput the current raw PF. The
+// node's own value is the new raw PF. When the count is zero the flags are
+// left untouched and PFInput is passed through unchanged.
+DEF_OP(ShiftFlags) {
+  auto Op = IROp->C();
+  const uint8_t OpSize = Op->Size;
+  const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
+
+  const auto PFOutput = GetReg(Node);
+  const auto PFInput = GetReg(Op->PFInput.ID());
+  const auto Dst = GetReg(Op->Result.ID());
+  const auto Src1 = GetReg(Op->Src1.ID());
+  const auto Src2 = GetReg(Op->Src2.ID());
+
+  // If the output register aliases an operand that is still read below, the
+  // new PF is built in a scratch register and copied out at the very end.
+  const bool PFBlocked = (PFOutput == Dst) || (PFOutput == Src1) || (PFOutput == Src2);
+  const auto PFTemp = PFBlocked ? TMP4 : PFOutput;
+
+  // Set the output outside the branch to avoid needing an extra leg of the
+  // branch. We specifically do not hardcode the PF register anywhere (relying
+  // on a tied SRA register instead) to avoid fighting with RA/RCLSE.
+  if (PFTemp != PFInput)
+    mov(ARMEmitter::Size::i64Bit, PFTemp, PFInput);
+
+  ARMEmitter::SingleUseForwardLabel Done;
+  cbz(EmitSize, Src2, &Done);
+  {
+    // PF/SF/ZF/OF
+    // Write PF via PFTemp, never PFOutput directly: in the blocked case
+    // PFOutput is Dst/Src1/Src2, all of which are still needed below.
+    if (OpSize >= 4) {
+      ands(EmitSize, PFTemp, Dst, Dst);
+    } else {
+      unsigned Shift = 32 - (OpSize * 8);
+      cmn(EmitSize, ARMEmitter::Reg::zr, Dst, ARMEmitter::ShiftType::LSL, Shift);
+      mov(ARMEmitter::Size::i64Bit, PFTemp, Dst);
+    }
+
+    // Extract the last bit shifted in to CF
+    if (Op->Shift == IR::ShiftType::LSL) {
+      if (OpSize >= 4) {
+        neg(EmitSize, TMP1, Src2);
+      } else {
+        mov(EmitSize, TMP1, OpSize * 8);
+        sub(EmitSize, TMP1, TMP1, Src2);
+      }
+    } else {
+      sub(ARMEmitter::Size::i64Bit, TMP1, Src2, 1);
+    }
+
+    lsrv(EmitSize, TMP1, Src1, TMP1);
+
+    bool SetOF = Op->Shift != IR::ShiftType::ASR;
+    if (SetOF) {
+      // Only defined when Shift is 1 else undefined
+      // OF flag is set if a sign change occurred
+      eor(EmitSize, TMP3, Src1, Dst);
+    }
+
+    if (CTX->HostFeatures.SupportsFlagM) {
+      rmif(TMP1, 63, (1 << 1) /* C */);
+
+      if (SetOF)
+        rmif(TMP3, OpSize * 8 - 1, (1 << 0) /* V */);
+    } else {
+      mrs(TMP2, ARMEmitter::SystemRegister::NZCV);
+      bfi(ARMEmitter::Size::i32Bit, TMP2, TMP1, 29 /* C */, 1);
+
+      if (SetOF) {
+        lsr(EmitSize, TMP3, TMP3, OpSize * 8 - 1);
+        bfi(ARMEmitter::Size::i32Bit, TMP2, TMP3, 28 /* V */, 1);
+      }
+
+      msr(ARMEmitter::SystemRegister::NZCV, TMP2);
+    }
+  }
+
+  // Bind before the copy-out so that the zero-count path also moves the
+  // passed-through PFInput from the scratch register into PFOutput.
+  Bind(&Done);
+
+  // TODO: Make RA less dumb so this can't happen (e.g. with late-kill).
+  if (PFBlocked)
+    mov(ARMEmitter::Size::i64Bit, PFOutput, PFTemp);
+}
+
 DEF_OP(Ror) { auto Op = IROp->C(); const uint8_t OpSize = IROp->Size; diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index e0ead2bfca..9282b91444 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -1577,9 +1577,7 @@ void OpDispatchBuilder::SHLOp(OpcodeArgs) { auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags); OrderedNode *Result = _Lshl(Size == 64 ? OpSize::i64Bit : OpSize::i32Bit, Dest, Src); - StoreResult(GPRClass, Op, Result, -1); - - GenerateFlags_ShiftLeft(Op, Result, Dest, Src); + HandleShift(Op, Result, Dest, ShiftType::LSL, Src); } template @@ -1602,9 +1600,7 @@ void OpDispatchBuilder::SHROp(OpcodeArgs) { auto Src = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags); auto ALUOp = _Lshr(IR::SizeToOpSize(std::max(4, GetSrcSize(Op))), Dest, Src); - StoreResult(GPRClass, Op, ALUOp, -1); - - GenerateFlags_ShiftRight(Op, ALUOp, Dest, Src); + HandleShift(Op, ALUOp, Dest, ShiftType::LSR, Src); } template @@ -1663,10 +1659,7 @@ void OpDispatchBuilder::SHLDOp(OpcodeArgs) { Shift, _Constant(0), Dest, Res); - StoreResult(GPRClass, Op, Res, -1); - - // No need to mask result, upper garbage is ignored in the flag calc - GenerateFlags_ShiftLeft(Op, Res, Dest, Shift); + HandleShift(Op, Res, Dest, ShiftType::LSL, Shift); } void OpDispatchBuilder::SHLDImmediateOp(OpcodeArgs) { @@ -1734,9 +1727,7 @@ void OpDispatchBuilder::SHRDOp(OpcodeArgs) { Shift, _Constant(0), Dest, Res); - StoreResult(GPRClass, Op, Res, -1); - - GenerateFlags_ShiftRight(Op, Res, Dest, Shift); + HandleShift(Op, Res, Dest, ShiftType::LSR, Shift); } void OpDispatchBuilder::SHRDImmediateOp(OpcodeArgs) { @@ -1781,9 +1772,7 @@ void OpDispatchBuilder::ASHROp(OpcodeArgs) { } OrderedNode *Result = _Ashr(IR::SizeToOpSize(std::max(4, GetSrcSize(Op))), Dest, Src); -
StoreResult(GPRClass, Op, Result, -1); - - GenerateFlags_SignShiftRight(Op, Result, Dest, Src); + HandleShift(Op, Result, Dest, ShiftType::ASR, Src); } template diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index f43383dd28..7812dd1477 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -81,12 +81,9 @@ friend class FEXCore::IR::PassManager; TYPE_MUL, TYPE_UMUL, TYPE_LOGICAL, - TYPE_LSHL, TYPE_LSHLI, - TYPE_LSHR, TYPE_LSHRI, TYPE_LSHRDI, - TYPE_ASHR, TYPE_ASHRI, TYPE_BEXTR, TYPE_BLSI, @@ -1293,6 +1290,8 @@ friend class FEXCore::IR::PassManager; // Set flag tracking to prepare for a read-modify-write operation on NZCV. void HandleNZCV_RMW(uint32_t _PossiblySetNZCVBits = ~0) { + CalculateDeferredFlags(); + if (NZCVDirty && CachedNZCV) _StoreNZCV(CachedNZCV); @@ -1567,6 +1566,19 @@ friend class FEXCore::IR::PassManager; SetRFLAG(V); }
+  // Helper to store the result of a variable shift and calculate its flags,
+  // with correct PF handling. Deferred flags are flushed (by HandleNZCV_RMW)
+  // before the old PF is read, since that flush may itself update PF.
+  void HandleShift(X86Tables::DecodedOp Op, OrderedNode *Result,
+                   OrderedNode *Dest, ShiftType Shift, OrderedNode *Src) {
+    StoreResult(GPRClass, Op, Result, -1);
+
+    HandleNZCV_RMW();
+
+    auto OldPF = GetRFLAG(X86State::RFLAG_PF_RAW_LOC);
+    CalculatePF(_ShiftFlags(OpSizeFromSrc(Op), Result, Dest, Shift, Src, OldPF));
+  }
+
 // Helper to derive Dest by a given builder-using Expression with the opcode // replaced with NewOp. Useful for generic building code. Not safe in general.
// but does the right handling of ImplicitFlagClobber at least and must be @@ -1673,7 +1685,7 @@ friend class FEXCore::IR::PassManager; OrderedNode *Src1; } OneSource; - // Logical, LSHL, LSHR, ASHR + // Logical struct { OrderedNode *Src1; OrderedNode *Src2; @@ -1759,13 +1771,6 @@ friend class FEXCore::IR::PassManager; PossiblySetNZCVBits |= OldSetNZCVBits; } - template - void CalculateFlags_ShiftVariable(OrderedNode *Shift, F&& CalculateFlags) { - // We are the ones calculating the deferred flags. Don't recurse! - InvalidateDeferredFlags(); - Calculate_ShiftVariable(Shift, CalculateFlags); - } - /** * @name These functions are used by the deferred flag handling while it is calculating and storing flags in to RFLAGs. * @{ */ @@ -1791,7 +1796,6 @@ friend class FEXCore::IR::PassManager; void CalculateFlags_ShiftRightImmediate(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift); void CalculateFlags_ShiftRightDoubleImmediate(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift); void CalculateFlags_ShiftRightImmediateCommon(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift); - void CalculateFlags_SignShiftRight(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2); void CalculateFlags_SignShiftRightImmediate(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift); void CalculateFlags_BEXTR(OrderedNode *Src); void CalculateFlags_BLSI(uint8_t SrcSize, OrderedNode *Src); @@ -1861,57 +1865,6 @@ friend class FEXCore::IR::PassManager; }; } - void GenerateFlags_ShiftLeft(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) { - // Flags need to be used, generate incoming flags first. 
- CalculateDeferredFlags(); - - CurrentDeferredFlags = DeferredFlagData { - .Type = FlagsGenerationType::TYPE_LSHL, - .SrcSize = GetSrcSize(Op), - .Res = Res, - .Sources = { - .TwoSource = { - .Src1 = Src1, - .Src2 = Src2, - }, - }, - }; - } - - void GenerateFlags_ShiftRight(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) { - // Flags need to be used, generate incoming flags first. - CalculateDeferredFlags(); - - CurrentDeferredFlags = DeferredFlagData { - .Type = FlagsGenerationType::TYPE_LSHR, - .SrcSize = GetSrcSize(Op), - .Res = Res, - .Sources = { - .TwoSource = { - .Src1 = Src1, - .Src2 = Src2, - }, - }, - }; - } - - void GenerateFlags_SignShiftRight(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) { - // Flags need to be used, generate incoming flags first. - CalculateDeferredFlags(); - - CurrentDeferredFlags = DeferredFlagData { - .Type = FlagsGenerationType::TYPE_ASHR, - .SrcSize = GetSrcSize(Op), - .Res = Res, - .Sources = { - .TwoSource = { - .Src1 = Src1, - .Src2 = Src2, - }, - }, - }; - } - void GenerateFlags_ShiftLeftImmediate(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, uint64_t Shift) { // No flags changed if shift is zero. 
if (Shift == 0) return; diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp index 52372af0d6..ba5bc2caeb 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp @@ -303,13 +303,6 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) { CurrentDeferredFlags.Sources.TwoSource.Src1, CurrentDeferredFlags.Sources.TwoSource.Src2); break; - case FlagsGenerationType::TYPE_LSHL: - CalculateFlags_ShiftLeft( - CurrentDeferredFlags.SrcSize, - CurrentDeferredFlags.Res, - CurrentDeferredFlags.Sources.TwoSource.Src1, - CurrentDeferredFlags.Sources.TwoSource.Src2); - break; case FlagsGenerationType::TYPE_LSHLI: CalculateFlags_ShiftLeftImmediate( CurrentDeferredFlags.SrcSize, @@ -317,13 +310,6 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) { CurrentDeferredFlags.Sources.OneSrcImmediate.Src1, CurrentDeferredFlags.Sources.OneSrcImmediate.Imm); break; - case FlagsGenerationType::TYPE_LSHR: - CalculateFlags_ShiftRight( - CurrentDeferredFlags.SrcSize, - CurrentDeferredFlags.Res, - CurrentDeferredFlags.Sources.TwoSource.Src1, - CurrentDeferredFlags.Sources.TwoSource.Src2); - break; case FlagsGenerationType::TYPE_LSHRI: CalculateFlags_ShiftRightImmediate( CurrentDeferredFlags.SrcSize, @@ -338,13 +324,6 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) { CurrentDeferredFlags.Sources.OneSrcImmediate.Src1, CurrentDeferredFlags.Sources.OneSrcImmediate.Imm); break; - case FlagsGenerationType::TYPE_ASHR: - CalculateFlags_SignShiftRight( - CurrentDeferredFlags.SrcSize, - CurrentDeferredFlags.Res, - CurrentDeferredFlags.Sources.TwoSource.Src1, - CurrentDeferredFlags.Sources.TwoSource.Src2); - break; case FlagsGenerationType::TYPE_ASHRI: CalculateFlags_SignShiftRightImmediate( CurrentDeferredFlags.SrcSize, @@ -580,73 +559,6 @@ void 
OpDispatchBuilder::CalculateFlags_Logical(uint8_t SrcSize, OrderedNode *Res SetNZ_ZeroCV(SrcSize, Res); } -void OpDispatchBuilder::CalculateFlags_ShiftLeft(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) { - CalculateFlags_ShiftVariable(Src2, [this, SrcSize, Res, Src1, Src2](){ - const auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; - SetNZ_ZeroCV(SrcSize, Res); - - // Extract the last bit shifted in to CF - auto Size = _Constant(SrcSize * 8); - auto ShiftAmt = _Sub(OpSize, Size, Src2); - auto LastBit = _Lshr(OpSize, Src1, ShiftAmt); - SetRFLAG(LastBit, 0, true); - - CalculatePF(Res); - - // AF - // Undefined - _InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC); - - // In the case of left shift. OF is only set from the result of XOR - // When Shift > 1 then OF is undefined - auto OFXor = _Xor(OpSize, Src1, Res); - SetRFLAG(OFXor, SrcSize * 8 - 1, true); - }); -} - -void OpDispatchBuilder::CalculateFlags_ShiftRight(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) { - CalculateFlags_ShiftVariable(Src2, [this, SrcSize, Res, Src1, Src2](){ - const auto OpSize = SrcSize == 8 ? 
OpSize::i64Bit : OpSize::i32Bit; - SetNZ_ZeroCV(SrcSize, Res); - - // Extract the last bit shifted in to CF - auto ShiftAmt = _Sub(OpSize::i64Bit, Src2, _Constant(1)); - const auto CFSize = IR::SizeToOpSize(std::max(4u, SrcSize)); - auto LastBit = _Lshr(CFSize, Src1, ShiftAmt); - SetRFLAG(LastBit, 0, true); - - CalculatePF(Res); - - // AF - // Undefined - _InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC); - - // Only defined when Shift is 1 else undefined - // OF flag is set if a sign change occurred - auto val = _Xor(OpSize, Src1, Res); - SetRFLAG(val, SrcSize * 8 - 1, true); - }); -} - -void OpDispatchBuilder::CalculateFlags_SignShiftRight(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2) { - CalculateFlags_ShiftVariable(Src2, [this, SrcSize, Res, Src1, Src2](){ - // SF/ZF/OF - SetNZ_ZeroCV(SrcSize, Res); - - // Extract the last bit shifted in to CF - const auto CFSize = IR::SizeToOpSize(std::max(4u, GetOpSize(Src1))); - auto ShiftAmt = _Sub(OpSize::i64Bit, Src2, _Constant(1)); - auto LastBit = _Lshr(CFSize, Src1, ShiftAmt); - SetRFLAG(LastBit, 0, true); - - CalculatePF(Res); - - // AF - // Undefined - _InvalidateFlags(1 << X86State::RFLAG_AF_RAW_LOC); - }); -} - void OpDispatchBuilder::CalculateFlags_ShiftLeftImmediate(uint8_t SrcSize, OrderedNode *UnmaskedRes, OrderedNode *Src1, uint64_t Shift) { // No flags changed if shift is zero if (Shift == 0) return; diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 0e8cbef9ff..e8542f76ee 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -1218,6 +1218,12 @@ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, + "GPR = ShiftFlags OpSize:$Size, GPR:$Result, GPR:$Src1, ShiftType:$Shift, GPR:$Src2, GPR:$PFInput": { + "Desc": ["Set NZCV flags for specified variable integer shift with given result.", + "Returns updated raw PF."], + "HasSideEffects": true, + "DestSize": "8" + }, "GPR = Ror 
OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Integer rotate right" ], diff --git a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp index 5722cf918a..4248d08aee 100644 --- a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp @@ -157,6 +157,17 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp) .Replacement = OP_SBB, }; + case OP_SHIFTFLAGS: + // _ShiftFlags conditionally sets NZCV+PF, which we model here as a + // read-modify-write. Logically, it also conditionally makes AF undefined, + // which we model by omitting AF from both Read and Write sets (since + // "cond ? AF : undef" may be optimized to "AF"). + return { + .Read = FLAG_NZCV | FLAG_P, + .Write = FLAG_NZCV | FLAG_P, + .CanEliminate = true, + }; + case OP_ADDNZCV: case OP_SUBNZCV: case OP_TESTNZ: @@ -366,7 +377,7 @@ bool DeadFlagCalculationEliminination::Run(IREmitter *IREmit) { bool Eliminated = false; if ((FlagsRead & Info.Write) == 0) { - if (Info.CanEliminate) { + if (Info.CanEliminate && CodeNode->GetUses() == 0) { IREmit->Remove(CodeNode); Eliminated = true; Changed = true; diff --git a/unittests/ASM/FEX_bugs/ShiftPF.asm b/unittests/ASM/FEX_bugs/ShiftPF.asm new file mode 100644 index 0000000000..9699a3888c --- /dev/null +++ b/unittests/ASM/FEX_bugs/ShiftPF.asm @@ -0,0 +1,30 @@ +%ifdef CONFIG +{ + "RegData": { + "RBX": "0x6", + "RAX": "1" + } +} +%endif + +; FEX had a bug where variable shifts modified PF but RCLSE ignored this, +; causing RCLSE to invalidly propagate earlier PF results. + +; First set PF to odd +mov rcx, 0 +add rcx, 1 + +; Now do a variable shift that will set PF to even +mov rbx, 3 +mov cl, 1 +shl rbx, cl + +; Save the PF. This should be 1 = even +setp al + +; Trash NZCV. 
This means we'll optimize to calculate PF but not NZCV, which allows +; the extra constant propagation needed to materialize the bug. This instruction is +; otherwise a no-op, but without it we pass by chance. +add rdx, rdx + +hlt diff --git a/unittests/ASM/FEX_bugs/nzcv_rmw.asm b/unittests/ASM/FEX_bugs/nzcv_rmw.asm new file mode 100644 index 0000000000..a3652a9eaf --- /dev/null +++ b/unittests/ASM/FEX_bugs/nzcv_rmw.asm @@ -0,0 +1,38 @@ +%ifdef CONFIG +{ + "RegData": { + "RAX": "0xcafe" + } +} +%endif + +; FEX had a bug where an NZCV RMW would fail to calculate previously deferred +; flags, resulting in garbage flag values + +; First zero NZCV and break visibility +mov rax, 0 +add rax, 1 +jz fexi_fexi_im_so_broken + +jmp .begin +.begin: + +; NZCV is zero. Set it to something nonzero with a deferred flag operation. +mov rax, 0 +popcnt rax, rax + +; Now do a variable shift that preserves flags. This would clear ZF if not for +; the condition on the shift flags. +mov rbx, 100 +mov cl, 0 +sar rbx, cl + +; ZF should still be set. 
+jnz fexi_fexi_im_so_broken + +mov rax, 0xcafe +hlt + +fexi_fexi_im_so_broken: +mov rax, 0xdead +hlt diff --git a/unittests/InstructionCountCI/FlagM/PrimaryGroup.json b/unittests/InstructionCountCI/FlagM/PrimaryGroup.json index 1bed89c1e2..6c1b652617 100644 --- a/unittests/InstructionCountCI/FlagM/PrimaryGroup.json +++ b/unittests/InstructionCountCI/FlagM/PrimaryGroup.json @@ -1676,15 +1676,15 @@ "uxtb w21, w5", "lsl w22, w20, w21", "bfxil x4, x22, #0, #8", - "cbz x21, #+0x24", + "cbz w21, #+0x24", "cmn wzr, w22, lsl #24", - "mov w23, #0x8", - "sub w21, w23, w21", - "lsr w21, w20, w21", - "rmif x21, #63, #nzCv", "mov x26, x22", - "eor w20, w20, w22", - "rmif x20, #7, #nzcV" + "mov w0, #0x8", + "sub w0, w0, w21", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "rmif x0, #63, #nzCv", + "rmif x2, #7, #nzcV" ] }, "shr al, cl": { @@ -1695,14 +1695,14 @@ "uxtb w21, w5", "lsr w22, w20, w21", "bfxil x4, x22, #0, #8", - "cbz x21, #+0x20", + "cbz w21, #+0x20", "cmn wzr, w22, lsl #24", - "sub x21, x21, #0x1 (1)", - "lsr w21, w20, w21", - "rmif x21, #63, #nzCv", "mov x26, x22", - "eor w20, w20, w22", - "rmif x20, #7, #nzcV" + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "rmif x0, #63, #nzCv", + "rmif x2, #7, #nzcV" ] }, "sar al, cl": { @@ -1714,12 +1714,12 @@ "sxtb x20, w20", "asr w22, w20, w21", "bfxil x4, x22, #0, #8", - "cbz x21, #+0x18", + "cbz w21, #+0x18", "cmn wzr, w22, lsl #24", - "sub x21, x21, #0x1 (1)", - "lsr x20, x20, x21", - "rmif x20, #63, #nzCv", - "mov x26, x22" + "mov x26, x22", + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "rmif x0, #63, #nzCv" ] }, "rol ax, cl": { @@ -1943,53 +1943,46 @@ "uxth w21, w5", "lsl w22, w20, w21", "bfxil x4, x22, #0, #16", - "cbz x21, #+0x24", + "cbz w21, #+0x24", "cmn wzr, w22, lsl #16", - "mov w23, #0x10", - "sub w21, w23, w21", - "lsr w21, w20, w21", - "rmif x21, #63, #nzCv", "mov x26, x22", - "eor w20, w20, w22", - "rmif x20, #15, #nzcV" + "mov w0, #0x10", + "sub w0, w0, w21", + "lsr w0, w20, w0", + 
"eor w2, w20, w22", + "rmif x0, #63, #nzCv", + "rmif x2, #15, #nzcV" ] }, "shl eax, cl": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 10, "Comment": "GROUP2 0xd3 /4", "ExpectedArm64ASM": [ "mov w20, w4", "mov w21, w5", - "lsl w22, w20, w21", - "mov x4, x22", - "cbz x21, #+0x24", - "tst w22, w22", - "mov w23, #0x20", - "sub w21, w23, w21", - "lsr w21, w20, w21", - "rmif x21, #63, #nzCv", - "mov x26, x22", - "eor w20, w20, w22", - "rmif x20, #31, #nzcV" + "lsl w4, w20, w21", + "cbz w21, #+0x1c", + "ands w26, w4, w4", + "neg w0, w21", + "lsr w0, w20, w0", + "eor w2, w20, w4", + "rmif x0, #63, #nzCv", + "rmif x2, #31, #nzcV" ] }, "shl rax, cl": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 9, "Comment": "GROUP2 0xd3 /4", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x5", - "lsl x22, x20, x21", - "mov x4, x22", - "cbz x21, #+0x24", - "tst x22, x22", - "mov w23, #0x40", - "sub x21, x23, x21", - "lsr x21, x20, x21", - "rmif x21, #63, #nzCv", - "mov x26, x22", - "eor x20, x20, x22", - "rmif x20, #63, #nzcV" + "lsl x4, x20, x5", + "cbz x5, #+0x1c", + "ands x26, x4, x4", + "neg x0, x5", + "lsr x0, x20, x0", + "eor x2, x20, x4", + "rmif x0, #63, #nzCv", + "rmif x2, #63, #nzcV" ] }, "shr ax, cl": { @@ -2000,50 +1993,45 @@ "uxth w21, w5", "lsr w22, w20, w21", "bfxil x4, x22, #0, #16", - "cbz x21, #+0x20", + "cbz w21, #+0x20", "cmn wzr, w22, lsl #16", - "sub x21, x21, #0x1 (1)", - "lsr w21, w20, w21", - "rmif x21, #63, #nzCv", "mov x26, x22", - "eor w20, w20, w22", - "rmif x20, #15, #nzcV" + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "rmif x0, #63, #nzCv", + "rmif x2, #15, #nzcV" ] }, "shr eax, cl": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 10, "Comment": "GROUP2 0xd3 /5", "ExpectedArm64ASM": [ "mov w20, w4", "mov w21, w5", - "lsr w22, w20, w21", - "mov x4, x22", - "cbz x21, #+0x20", - "tst w22, w22", - "sub x21, x21, #0x1 (1)", - "lsr w21, w20, w21", - "rmif x21, #63, #nzCv", - "mov 
x26, x22", - "eor w20, w20, w22", - "rmif x20, #31, #nzcV" + "lsr w4, w20, w21", + "cbz w21, #+0x1c", + "ands w26, w4, w4", + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "eor w2, w20, w4", + "rmif x0, #63, #nzCv", + "rmif x2, #31, #nzcV" ] }, "shr rax, cl": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 9, "Comment": "GROUP2 0xd3 /5", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x5", - "lsr x22, x20, x21", - "mov x4, x22", - "cbz x21, #+0x20", - "tst x22, x22", - "sub x21, x21, #0x1 (1)", - "lsr x21, x20, x21", - "rmif x21, #63, #nzCv", - "mov x26, x22", - "eor x20, x20, x22", - "rmif x20, #63, #nzcV" + "lsr x4, x20, x5", + "cbz x5, #+0x1c", + "ands x26, x4, x4", + "sub x0, x5, #0x1 (1)", + "lsr x0, x20, x0", + "eor x2, x20, x4", + "rmif x0, #63, #nzCv", + "rmif x2, #63, #nzcV" ] }, "sar ax, cl": { @@ -2055,44 +2043,39 @@ "sxth x20, w20", "asr w22, w20, w21", "bfxil x4, x22, #0, #16", - "cbz x21, #+0x18", + "cbz w21, #+0x18", "cmn wzr, w22, lsl #16", - "sub x21, x21, #0x1 (1)", - "lsr x20, x20, x21", - "rmif x20, #63, #nzCv", - "mov x26, x22" + "mov x26, x22", + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "rmif x0, #63, #nzCv" ] }, "sar eax, cl": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 8, "Comment": "GROUP2 0xd3 /7", "ExpectedArm64ASM": [ "mov w20, w4", "mov w21, w5", - "asr w22, w20, w21", - "mov x4, x22", - "cbz x21, #+0x18", - "tst w22, w22", - "sub x21, x21, #0x1 (1)", - "lsr w20, w20, w21", - "rmif x20, #63, #nzCv", - "mov x26, x22" + "asr w4, w20, w21", + "cbz w21, #+0x14", + "ands w26, w4, w4", + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "rmif x0, #63, #nzCv" ] }, "sar rax, cl": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 7, "Comment": "GROUP2 0xd3 /7", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x5", - "asr x22, x20, x21", - "mov x4, x22", - "cbz x21, #+0x18", - "tst x22, x22", - "sub x21, x21, #0x1 (1)", - "lsr x20, x20, x21", - "rmif x20, #63, #nzCv", - "mov x26, 
x22" + "asr x4, x20, x5", + "cbz x5, #+0x14", + "ands x26, x4, x4", + "sub x0, x5, #0x1 (1)", + "lsr x0, x20, x0", + "rmif x0, #63, #nzCv" ] }, "test bl, 1": { diff --git a/unittests/InstructionCountCI/FlagM/Secondary.json b/unittests/InstructionCountCI/FlagM/Secondary.json index 616b060106..232921e309 100644 --- a/unittests/InstructionCountCI/FlagM/Secondary.json +++ b/unittests/InstructionCountCI/FlagM/Secondary.json @@ -794,19 +794,19 @@ "csel x20, x21, x20, eq", "bfxil x4, x20, #0, #16", "msr nzcv, x23", - "cbz x22, #+0x24", + "cbz w22, #+0x24", "cmn wzr, w20, lsl #16", - "mov w23, #0x10", - "sub w22, w23, w22", - "lsr w22, w21, w22", - "rmif x22, #63, #nzCv", "mov x26, x20", - "eor w20, w21, w20", - "rmif x20, #15, #nzcV" + "mov w0, #0x10", + "sub w0, w0, w22", + "lsr w0, w21, w0", + "eor w2, w21, w20", + "rmif x0, #63, #nzCv", + "rmif x2, #15, #nzcV" ] }, "shld eax, ebx, cl": { - "ExpectedInstructionCount": 21, + "ExpectedInstructionCount": 19, "Comment": "0x0f 0xad", "ExpectedArm64ASM": [ "mov w20, w7", @@ -821,19 +821,17 @@ "csel x20, x21, x20, eq", "mov w4, w20", "msr nzcv, x23", - "cbz x22, #+0x24", - "tst w20, w20", - "mov w23, #0x20", - "sub w22, w23, w22", - "lsr w22, w21, w22", - "rmif x22, #63, #nzCv", - "mov x26, x20", - "eor w20, w21, w20", - "rmif x20, #31, #nzcV" + "cbz w22, #+0x1c", + "ands w26, w20, w20", + "neg w0, w22", + "lsr w0, w21, w0", + "eor w2, w21, w20", + "rmif x0, #63, #nzCv", + "rmif x2, #31, #nzcV" ] }, "shld rax, rbx, cl": { - "ExpectedInstructionCount": 20, + "ExpectedInstructionCount": 17, "Comment": "0x0f 0xad", "ExpectedArm64ASM": [ "mov x20, x4", @@ -844,18 +842,15 @@ "orr x22, x23, x22", "mrs x23, nzcv", "cmp x21, #0x0 (0)", - "csel x22, x20, x22, eq", - "mov x4, x22", + "csel x4, x20, x22, eq", "msr nzcv, x23", - "cbz x21, #+0x24", - "tst x22, x22", - "mov w23, #0x40", - "sub x21, x23, x21", - "lsr x21, x20, x21", - "rmif x21, #63, #nzCv", - "mov x26, x22", - "eor x20, x20, x22", - "rmif x20, #63, #nzcV" + "cbz x21, 
#+0x1c", + "ands x26, x4, x4", + "neg x0, x21", + "lsr x0, x20, x0", + "eor x2, x20, x4", + "rmif x0, #63, #nzCv", + "rmif x2, #63, #nzcV" ] }, "bts ax, bx": { diff --git a/unittests/InstructionCountCI/PrimaryGroup.json b/unittests/InstructionCountCI/PrimaryGroup.json index e17cc53126..bc6ada6e86 100644 --- a/unittests/InstructionCountCI/PrimaryGroup.json +++ b/unittests/InstructionCountCI/PrimaryGroup.json @@ -2024,52 +2024,50 @@ ] }, "shl al, cl": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 16, "Comment": "GROUP2 0xd2 /4", "ExpectedArm64ASM": [ "uxtb w20, w4", "uxtb w21, w5", "lsl w22, w20, w21", "bfxil x4, x22, #0, #8", - "cbz x21, #+0x34", + "cbz w21, #+0x30", "cmn wzr, w22, lsl #24", - "mov w23, #0x8", - "sub w21, w23, w21", - "lsr w21, w20, w21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", "mov x26, x22", - "eor w20, w20, w22", - "ubfx x20, x20, #7, #1", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "mov w0, #0x8", + "sub w0, w0, w21", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #7", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shr al, cl": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 15, "Comment": "GROUP2 0xd2 /5", "ExpectedArm64ASM": [ "uxtb w20, w4", "uxtb w21, w5", "lsr w22, w20, w21", "bfxil x4, x22, #0, #8", - "cbz x21, #+0x30", + "cbz w21, #+0x2c", "cmn wzr, w22, lsl #24", - "sub x21, x21, #0x1 (1)", - "lsr w21, w20, w21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", "mov x26, x22", - "eor w20, w20, w22", - "ubfx x20, x20, #7, #1", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #7", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "sar al, cl": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 13, "Comment": "GROUP2 0xd2 /7", 
"ExpectedArm64ASM": [ "uxtb w20, w4", @@ -2077,15 +2075,14 @@ "sxtb x20, w20", "asr w22, w20, w21", "bfxil x4, x22, #0, #8", - "cbz x21, #+0x24", + "cbz w21, #+0x20", "cmn wzr, w22, lsl #24", - "sub x21, x21, #0x1 (1)", - "lsr x20, x20, x21", - "ubfx x20, x20, #0, #1", - "mrs x21, nzcv", - "orr w20, w21, w20, lsl #29", "mov x26, x22", - "msr nzcv, x20" + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "msr nzcv, x1" ] }, "rol ax, cl": { @@ -2378,142 +2375,124 @@ ] }, "shl ax, cl": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 16, "Comment": "GROUP2 0xd3 /4", "ExpectedArm64ASM": [ "uxth w20, w4", "uxth w21, w5", "lsl w22, w20, w21", "bfxil x4, x22, #0, #16", - "cbz x21, #+0x34", + "cbz w21, #+0x30", "cmn wzr, w22, lsl #16", - "mov w23, #0x10", - "sub w21, w23, w21", - "lsr w21, w20, w21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", "mov x26, x22", - "eor w20, w20, w22", - "ubfx x20, x20, #15, #1", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "mov w0, #0x10", + "sub w0, w0, w21", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #15", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shl eax, cl": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 13, "Comment": "GROUP2 0xd3 /4", "ExpectedArm64ASM": [ "mov w20, w4", "mov w21, w5", - "lsl w22, w20, w21", - "mov x4, x22", - "cbz x21, #+0x34", - "tst w22, w22", - "mov w23, #0x20", - "sub w21, w23, w21", - "lsr w21, w20, w21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", - "mov x26, x22", - "eor w20, w20, w22", - "ubfx x20, x20, #31, #1", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "lsl w4, w20, w21", + "cbz w21, #+0x28", + "ands w26, w4, w4", + "neg w0, w21", + "lsr w0, w20, w0", + "eor w2, w20, w4", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #31", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shl 
rax, cl": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 12, "Comment": "GROUP2 0xd3 /4", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x5", - "lsl x22, x20, x21", - "mov x4, x22", - "cbz x21, #+0x34", - "tst x22, x22", - "mov w23, #0x40", - "sub x21, x23, x21", - "lsr x21, x20, x21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", - "mov x26, x22", - "eor x20, x20, x22", - "lsr x20, x20, #63", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "lsl x4, x20, x5", + "cbz x5, #+0x28", + "ands x26, x4, x4", + "neg x0, x5", + "lsr x0, x20, x0", + "eor x2, x20, x4", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr x2, x2, #63", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shr ax, cl": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 15, "Comment": "GROUP2 0xd3 /5", "ExpectedArm64ASM": [ "uxth w20, w4", "uxth w21, w5", "lsr w22, w20, w21", "bfxil x4, x22, #0, #16", - "cbz x21, #+0x30", + "cbz w21, #+0x2c", "cmn wzr, w22, lsl #16", - "sub x21, x21, #0x1 (1)", - "lsr w21, w20, w21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", "mov x26, x22", - "eor w20, w20, w22", - "ubfx x20, x20, #15, #1", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "eor w2, w20, w22", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #15", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shr eax, cl": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 13, "Comment": "GROUP2 0xd3 /5", "ExpectedArm64ASM": [ "mov w20, w4", "mov w21, w5", - "lsr w22, w20, w21", - "mov x4, x22", - "cbz x21, #+0x30", - "tst w22, w22", - "sub x21, x21, #0x1 (1)", - "lsr w21, w20, w21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", - "mov x26, x22", - "eor w20, w20, w22", - "ubfx x20, x20, #31, #1", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "lsr w4, w20, w21", + "cbz w21, #+0x28", + "ands w26, w4, w4", + 
"sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "eor w2, w20, w4", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #31", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shr rax, cl": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 12, "Comment": "GROUP2 0xd3 /5", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x5", - "lsr x22, x20, x21", - "mov x4, x22", - "cbz x21, #+0x30", - "tst x22, x22", - "sub x21, x21, #0x1 (1)", - "lsr x21, x20, x21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", - "mov x26, x22", - "eor x20, x20, x22", - "lsr x20, x20, #63", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "lsr x4, x20, x5", + "cbz x5, #+0x28", + "ands x26, x4, x4", + "sub x0, x5, #0x1 (1)", + "lsr x0, x20, x0", + "eor x2, x20, x4", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr x2, x2, #63", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "sar ax, cl": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 13, "Comment": "GROUP2 0xd3 /7", "ExpectedArm64ASM": [ "uxth w20, w4", @@ -2521,53 +2500,45 @@ "sxth x20, w20", "asr w22, w20, w21", "bfxil x4, x22, #0, #16", - "cbz x21, #+0x24", + "cbz w21, #+0x20", "cmn wzr, w22, lsl #16", - "sub x21, x21, #0x1 (1)", - "lsr x20, x20, x21", - "ubfx x20, x20, #0, #1", - "mrs x21, nzcv", - "orr w20, w21, w20, lsl #29", "mov x26, x22", - "msr nzcv, x20" + "sub x0, x21, #0x1 (1)", + "lsr w0, w20, w0", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "msr nzcv, x1" ] }, "sar eax, cl": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 10, "Comment": "GROUP2 0xd3 /7", "ExpectedArm64ASM": [ "mov w20, w4", "mov w21, w5", - "asr w22, w20, w21", - "mov x4, x22", - "cbz x21, #+0x24", - "tst w22, w22", - "sub x21, x21, #0x1 (1)", - "lsr w20, w20, w21", - "ubfx x20, x20, #0, #1", - "mrs x21, nzcv", - "orr w20, w21, w20, lsl #29", - "mov x26, x22", - "msr nzcv, x20" + "asr w4, w20, w21", + "cbz w21, #+0x1c", + "ands w26, w4, w4", + "sub x0, x21, #0x1 (1)", + 
"lsr w0, w20, w0", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "msr nzcv, x1" ] }, "sar rax, cl": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 9, "Comment": "GROUP2 0xd3 /7", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x5", - "asr x22, x20, x21", - "mov x4, x22", - "cbz x21, #+0x24", - "tst x22, x22", - "sub x21, x21, #0x1 (1)", - "lsr x20, x20, x21", - "ubfx x20, x20, #0, #1", - "mrs x21, nzcv", - "orr w20, w21, w20, lsl #29", - "mov x26, x22", - "msr nzcv, x20" + "asr x4, x20, x5", + "cbz x5, #+0x1c", + "ands x26, x4, x4", + "sub x0, x5, #0x1 (1)", + "lsr x0, x20, x0", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "msr nzcv, x1" ] }, "test bl, 1": { diff --git a/unittests/InstructionCountCI/Secondary.json b/unittests/InstructionCountCI/Secondary.json index 1b224644ad..ad2bb84704 100644 --- a/unittests/InstructionCountCI/Secondary.json +++ b/unittests/InstructionCountCI/Secondary.json @@ -1588,7 +1588,7 @@ ] }, "shld ax, bx, cl": { - "ExpectedInstructionCount": 26, + "ExpectedInstructionCount": 25, "Comment": "0x0f 0xad", "ExpectedArm64ASM": [ "uxth w20, w7", @@ -1604,23 +1604,22 @@ "csel x20, x21, x20, eq", "bfxil x4, x20, #0, #16", "msr nzcv, x23", - "cbz x22, #+0x34", + "cbz w22, #+0x30", "cmn wzr, w20, lsl #16", - "mov w23, #0x10", - "sub w22, w23, w22", - "lsr w22, w21, w22", - "ubfx x22, x22, #0, #1", - "mrs x23, nzcv", - "orr w22, w23, w22, lsl #29", "mov x26, x20", - "eor w20, w21, w20", - "ubfx x20, x20, #15, #1", - "orr w20, w22, w20, lsl #28", - "msr nzcv, x20" + "mov w0, #0x10", + "sub w0, w0, w22", + "lsr w0, w21, w0", + "eor w2, w21, w20", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #15", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shld eax, ebx, cl": { - "ExpectedInstructionCount": 25, + "ExpectedInstructionCount": 22, "Comment": "0x0f 0xad", "ExpectedArm64ASM": [ "mov w20, w7", @@ -1635,23 +1634,20 @@ "csel x20, x21, x20, eq", "mov w4, w20", "msr nzcv, x23", - "cbz x22, #+0x34", - "tst w20, w20", - "mov 
w23, #0x20", - "sub w22, w23, w22", - "lsr w22, w21, w22", - "ubfx x22, x22, #0, #1", - "mrs x23, nzcv", - "orr w22, w23, w22, lsl #29", - "mov x26, x20", - "eor w20, w21, w20", - "ubfx x20, x20, #31, #1", - "orr w20, w22, w20, lsl #28", - "msr nzcv, x20" + "cbz w22, #+0x28", + "ands w26, w20, w20", + "neg w0, w22", + "lsr w0, w21, w0", + "eor w2, w21, w20", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr w2, w2, #31", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "shld rax, rbx, cl": { - "ExpectedInstructionCount": 24, + "ExpectedInstructionCount": 20, "Comment": "0x0f 0xad", "ExpectedArm64ASM": [ "mov x20, x4", @@ -1662,22 +1658,18 @@ "orr x22, x23, x22", "mrs x23, nzcv", "cmp x21, #0x0 (0)", - "csel x22, x20, x22, eq", - "mov x4, x22", + "csel x4, x20, x22, eq", "msr nzcv, x23", - "cbz x21, #+0x34", - "tst x22, x22", - "mov w23, #0x40", - "sub x21, x23, x21", - "lsr x21, x20, x21", - "ubfx x21, x21, #0, #1", - "mrs x23, nzcv", - "orr w21, w23, w21, lsl #29", - "mov x26, x22", - "eor x20, x20, x22", - "lsr x20, x20, #63", - "orr w20, w21, w20, lsl #28", - "msr nzcv, x20" + "cbz x21, #+0x28", + "ands x26, x4, x4", + "neg x0, x21", + "lsr x0, x20, x0", + "eor x2, x20, x4", + "mrs x1, nzcv", + "bfi w1, w0, #29, #1", + "lsr x2, x2, #63", + "bfi w1, w2, #28, #1", + "msr nzcv, x1" ] }, "push gs": {