Merge pull request #4367 from Sonicadvance1/sha1_msg2

OpcodeDispatcher: Implement support for SHA1MSG2 using SHA instructions
FEX-Emu · Feb 19, 2025 · 34a274d · 34a274d
2 parents 02d7261 + 9a70ae1
commit 34a274d
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 17 deletions.
diff --git a/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp b/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp
@@ -169,6 +169,25 @@ DEF_OP(VSha1H) {
   sha1h(Dst.S(), Src.S());
 }
 
+DEF_OP(VSha1SU1) { // Emits the host sha1su1 instruction for the VSha1SU1 IR op, handling register aliasing between Dst and the sources.
+  auto Op = IROp->C<IR::IROp_VSha1SU1>();
+
+  const auto Dst = GetVReg(Node);
+  const auto Src1 = GetVReg(Op->Src1.ID());
+  const auto Src2 = GetVReg(Op->Src2.ID());
+
+  if (Dst == Src1) { // Src1 already lives in Dst, so the instruction can operate on Dst in place.
+    sha1su1(Dst, Src2);
+  } else if (Dst != Src2) { // Dst is distinct from both sources: copying Src1 into Dst clobbers nothing that is still needed.
+    mov(Dst.Q(), Src1.Q());
+    sha1su1(Dst, Src2);
+  } else { // Dst == Src2 (and Dst != Src1): copying Src1 into Dst first would overwrite Src2 before it is read.
+    mov(VTMP1.Q(), Src1.Q()); // Stage Src1 in the temporary register instead.
+    sha1su1(VTMP1, Src2);
+    mov(Dst.Q(), VTMP1.Q()); // Only now is it safe to overwrite Dst (which is Src2).
+  }
+}
+
 DEF_OP(VSha256U0) {
   auto Op = IROp->C<IR::IROp_VSha256U0>();
 

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
@@ -62,29 +62,40 @@ void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) {
   Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
   Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
 
-  // This instruction mostly matches ARMv8's SHA1SU1 instruction but one of the elements are flipped in an unexpected way.
-  // Do all the work without it.
+  Ref Result;
+  if (CTX->HostFeatures.SupportsSHA) {
+    // ARM SHA1 mostly matches x86 semantics, except the input and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0.
+    auto FlipIt = [this](Ref Src) {
+      auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
+      return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
+    };
+    auto Src1 = FlipIt(Dest);
+    auto Src2 = FlipIt(Src);
 
-  const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
+    // The result is swizzled differently than expected
+    Result = FlipIt(_VSha1SU1(Src1, Src2));
+  } else {
+    // Shift the incoming source left by a 32-bit element, inserting Zeros.
+    // This could be slightly improved to use a VInsGPR with the zero register.
+    const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
+    auto Src2Shift = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, ZeroRegister, 12);
+    auto Xor1 = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, Src2Shift);
 
-  // Shift the incoming source left by a 32-bit element, inserting Zeros.
-  // This could be slightly improved to use a VInsGPR with the zero register.
-  auto Src2Shift = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, ZeroRegister, 12);
-  auto Xor1 = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, Src2Shift);
+    // Emulate rotate.
+    auto ShiftLeftXor1 = _VShlI(OpSize::i128Bit, OpSize::i32Bit, Xor1, 1);
+    auto RotatedXor1 = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXor1, Xor1, 31);
 
-  // Emulate rotate.
-  auto ShiftLeftXor1 = _VShlI(OpSize::i128Bit, OpSize::i32Bit, Xor1, 1);
-  auto RotatedXor1 = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXor1, Xor1, 31);
+    // Element0 didn't get XOR'd with anything, so do it now.
+    auto ExtractUpper = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, RotatedXor1, 3);
+    auto XorLower = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, ExtractUpper);
 
-  // Element0 didn't get XOR'd with anything, so do it now.
-  auto ExtractUpper = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, RotatedXor1, 3);
-  auto XorLower = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, ExtractUpper);
+    // Emulate rotate.
+    auto ShiftLeftXorLower = _VShlI(OpSize::i128Bit, OpSize::i32Bit, XorLower, 1);
+    auto RotatedXorLower = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXorLower, XorLower, 31);
 
-  // Emulate rotate.
-  auto ShiftLeftXorLower = _VShlI(OpSize::i128Bit, OpSize::i32Bit, XorLower, 1);
-  auto RotatedXorLower = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXorLower, XorLower, 31);
+    Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 0, 0, RotatedXor1, RotatedXorLower);
+  }
 
-  auto Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 0, 0, RotatedXor1, RotatedXorLower);
 
   StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
 }

diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
@@ -2656,6 +2656,11 @@
         "Desc": "Does vector scalar SHA1H instruction",
         "DestSize": "FEXCore::IR::OpSize::i32Bit"
       },
+      "FPR = VSha1SU1 FPR:$Src1, FPR:$Src2": {
+        "Desc": "Does vector SHA1SU1 instruction",
+        "DestSize": "FEXCore::IR::OpSize::i128Bit",
+        "TiedSource": 0
+      },
       "FPR = VSha256U0 FPR:$Src1, FPR:$Src2": {
         "Desc": "Does vector scalar VSha256U0 instruction",
         "DestSize": "FEXCore::IR::OpSize::i128Bit",

diff --git a/unittests/InstructionCountCI/Crypto/H0F38.json b/unittests/InstructionCountCI/Crypto/H0F38.json
@@ -25,6 +25,21 @@
         "mov v16.s[3], v2.s[3]"
       ]
     },
+    "sha1msg2 xmm0, xmm1": {
+      "ExpectedInstructionCount": 7,
+      "Comment": [
+        "0x66 0x0f 0x38 0xca"
+      ],
+      "ExpectedArm64ASM": [
+        "rev64 v2.4s, v16.4s",
+        "ext v2.16b, v2.16b, v2.16b, #8",
+        "rev64 v3.4s, v17.4s",
+        "ext v3.16b, v3.16b, v3.16b, #8",
+        "unimplemented (Unimplemented)",
+        "rev64 v2.4s, v2.4s",
+        "ext v16.16b, v2.16b, v2.16b, #8"
+      ]
+    },
     "sha256msg1 xmm0, xmm1": {
       "ExpectedInstructionCount": 1,
       "Comment": [