From bb10f258084b4c79fabcd244d8a0aabaedf15cdb Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Fri, 20 Dec 2024 13:20:52 -0800 Subject: [PATCH 1/2] OpcodeDispatcher: Minor division improvement No need to extract the subregisters out before operating on them since the long division and long remainder IR operations correctly zero/sign extend the incoming sources as necessary. Saves a couple of instructions. --- .../Source/Interface/Core/OpcodeDispatcher.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 9b08ea64f4..24d875f4f9 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3613,16 +3613,16 @@ void OpDispatchBuilder::DIVOp(OpcodeArgs) { auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp); StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit); } else if (Size == OpSize::i16Bit) { - Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); - Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size); + Ref Src1 = LoadGPRRegister(X86State::REG_RAX); + Ref Src2 = LoadGPRRegister(X86State::REG_RDX); auto UDivOp = _LUDiv(OpSize::i16Bit, Src1, Src2, Divisor); auto URemOp = _LURem(OpSize::i16Bit, Src1, Src2, Divisor); StoreGPRRegister(X86State::REG_RAX, UDivOp, Size); StoreGPRRegister(X86State::REG_RDX, URemOp, Size); } else if (Size == OpSize::i32Bit) { - Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); - Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size); + Ref Src1 = LoadGPRRegister(X86State::REG_RAX); + Ref Src2 = LoadGPRRegister(X86State::REG_RDX); Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LUDiv(OpSize::i32Bit, Src1, Src2, Divisor)); Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LURem(OpSize::i32Bit, Src1, Src2, Divisor)); @@ -3654,7 +3654,7 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) { const auto Size = 
OpSizeFromSrc(Op); if (Size == OpSize::i8Bit) { - Ref Src1 = LoadGPRRegister(X86State::REG_RAX, OpSize::i16Bit); + Ref Src1 = LoadGPRRegister(X86State::REG_RAX); Src1 = _Sbfe(OpSize::i64Bit, 16, 0, Src1); Divisor = _Sbfe(OpSize::i64Bit, 8, 0, Divisor); @@ -3665,16 +3665,16 @@ void OpDispatchBuilder::IDIVOp(OpcodeArgs) { auto ResultAX = _Bfi(GPRSize, 8, 8, UDivOp, URemOp); StoreGPRRegister(X86State::REG_RAX, ResultAX, OpSize::i16Bit); } else if (Size == OpSize::i16Bit) { - Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); - Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size); + Ref Src1 = LoadGPRRegister(X86State::REG_RAX); + Ref Src2 = LoadGPRRegister(X86State::REG_RDX); auto UDivOp = _LDiv(OpSize::i16Bit, Src1, Src2, Divisor); auto URemOp = _LRem(OpSize::i16Bit, Src1, Src2, Divisor); StoreGPRRegister(X86State::REG_RAX, UDivOp, Size); StoreGPRRegister(X86State::REG_RDX, URemOp, Size); } else if (Size == OpSize::i32Bit) { - Ref Src1 = LoadGPRRegister(X86State::REG_RAX, Size); - Ref Src2 = LoadGPRRegister(X86State::REG_RDX, Size); + Ref Src1 = LoadGPRRegister(X86State::REG_RAX); + Ref Src2 = LoadGPRRegister(X86State::REG_RDX); Ref UDivOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LDiv(OpSize::i32Bit, Src1, Src2, Divisor)); Ref URemOp = _Bfe(OpSize::i32Bit, IR::OpSizeAsBits(Size), 0, _LRem(OpSize::i32Bit, Src1, Src2, Divisor)); From ffb745b662edf643d502dd842ac814ebe104ef97 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Fri, 20 Dec 2024 13:22:42 -0800 Subject: [PATCH 2/2] InstCountCI: Update for division improvements --- .../FlagM/PrimaryGroup.json | 21 +++--- .../InstructionCountCI/PrimaryGroup.json | 75 +++++++++---------- 2 files changed, 43 insertions(+), 53 deletions(-) diff --git a/unittests/InstructionCountCI/FlagM/PrimaryGroup.json b/unittests/InstructionCountCI/FlagM/PrimaryGroup.json index aa22c4d33a..f544dbb8c0 100644 --- a/unittests/InstructionCountCI/FlagM/PrimaryGroup.json +++ b/unittests/InstructionCountCI/FlagM/PrimaryGroup.json @@ -2196,12
+2196,11 @@ ] }, "idiv bl": { - "ExpectedInstructionCount": 9, + "ExpectedInstructionCount": 8, "Comment": "GROUP2 0xf6 /7", "ExpectedArm64ASM": [ "uxtb w20, w6", - "uxth w21, w4", - "sxth x21, w21", + "sxth x21, w4", "sxtb x20, w20", "sdiv x22, x21, x20", "sdiv x0, x21, x20", @@ -2367,20 +2366,18 @@ ] }, "div bx": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 10, "Comment": "GROUP2 0xf7 /6", "ExpectedArm64ASM": [ "uxth w20, w6", - "uxth w21, w4", - "uxth w22, w5", - "uxth w0, w21", - "bfi w0, w22, #16, #16", - "udiv w23, w0, w20", - "uxth w0, w21", - "bfi w0, w22, #16, #16", + "uxth w0, w4", + "bfi w0, w5, #16, #16", + "udiv w21, w0, w20", + "uxth w0, w4", + "bfi w0, w5, #16, #16", "udiv w1, w0, w20", "msub w20, w1, w20, w0", - "bfxil x4, x23, #0, #16", + "bfxil x4, x21, #0, #16", "bfxil x5, x20, #0, #16" ] }, diff --git a/unittests/InstructionCountCI/PrimaryGroup.json b/unittests/InstructionCountCI/PrimaryGroup.json index d2d75a8cd1..505de3e107 100644 --- a/unittests/InstructionCountCI/PrimaryGroup.json +++ b/unittests/InstructionCountCI/PrimaryGroup.json @@ -2588,12 +2588,11 @@ ] }, "idiv bl": { - "ExpectedInstructionCount": 9, + "ExpectedInstructionCount": 8, "Comment": "GROUP2 0xf6 /7", "ExpectedArm64ASM": [ "uxtb w20, w6", - "uxth w21, w4", - "sxth x21, w21", + "sxth x21, w4", "sxtb x20, w20", "sdiv x22, x21, x20", "sdiv x0, x21, x20", @@ -2786,39 +2785,36 @@ ] }, "div bx": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 10, "Comment": "GROUP2 0xf7 /6", "ExpectedArm64ASM": [ "uxth w20, w6", - "uxth w21, w4", - "uxth w22, w5", - "uxth w0, w21", - "bfi w0, w22, #16, #16", - "udiv w23, w0, w20", - "uxth w0, w21", - "bfi w0, w22, #16, #16", + "uxth w0, w4", + "bfi w0, w5, #16, #16", + "udiv w21, w0, w20", + "uxth w0, w4", + "bfi w0, w5, #16, #16", "udiv w1, w0, w20", "msub w20, w1, w20, w0", - "bfxil x4, x23, #0, #16", + "bfxil x4, x21, #0, #16", "bfxil x5, x20, #0, #16" ] }, "div ebx": { - "ExpectedInstructionCount": 12, + 
"ExpectedInstructionCount": 11, "Comment": "GROUP2 0xf7 /6", "ExpectedArm64ASM": [ "mov w20, w6", - "mov w21, w4", - "mov w22, w5", - "mov x0, x21", - "bfi x0, x22, #32, #32", - "udiv x23, x0, x20", - "mov w4, w23", - "mov x0, x21", - "bfi x0, x22, #32, #32", + "mov x0, x4", + "bfi x0, x5, #32, #32", + "udiv x21, x0, x20", + "mov w21, w21", + "mov x0, x4", + "bfi x0, x5, #32, #32", "udiv x1, x0, x20", "msub x20, x1, x20, x0", - "mov w5, w20" + "mov w5, w20", + "mov x4, x21" ] }, "div rbx": { @@ -2852,43 +2848,40 @@ ] }, "idiv bx": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 12, "Comment": "GROUP2 0xf7 /7", "ExpectedArm64ASM": [ "uxth w20, w6", - "uxth w21, w4", - "uxth w22, w5", - "uxth w0, w21", - "bfi w0, w22, #16, #16", + "uxth w0, w4", + "bfi w0, w5, #16, #16", "sxth w1, w20", - "sdiv w23, w0, w1", - "uxth w0, w21", - "bfi w0, w22, #16, #16", + "sdiv w21, w0, w1", + "uxth w0, w4", + "bfi w0, w5, #16, #16", "sxth w1, w20", "sdiv w2, w0, w1", "msub w20, w2, w1, w0", - "bfxil x4, x23, #0, #16", + "bfxil x4, x21, #0, #16", "bfxil x5, x20, #0, #16" ] }, "idiv ebx": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 13, "Comment": "GROUP2 0xf7 /7", "ExpectedArm64ASM": [ "mov w20, w6", - "mov w21, w4", - "mov w22, w5", - "mov x0, x21", - "bfi x0, x22, #32, #32", + "mov x0, x4", + "bfi x0, x5, #32, #32", "sxtw x1, w20", - "sdiv x23, x0, x1", - "mov w4, w23", - "mov x0, x21", - "bfi x0, x22, #32, #32", + "sdiv x21, x0, x1", + "mov w21, w21", + "mov x0, x4", + "bfi x0, x5, #32, #32", "sxtw x2, w20", "sdiv x1, x0, x2", "msub x20, x1, x2, x0", - "mov w5, w20" + "mov w5, w20", + "mov x4, x21" ] }, "idiv rbx": {