Skip to content

Commit

Permalink
Merge pull request #4367 from Sonicadvance1/sha1_msg2
Browse files Browse the repository at this point in the history
OpcodeDispatcher: Implement support for SHA1MSG2 using SHA instructions
  • Loading branch information
lioncash authored Feb 19, 2025
2 parents 02d7261 + 9a70ae1 commit 34a274d
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 17 deletions.
19 changes: 19 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,25 @@ DEF_OP(VSha1H) {
sha1h(Dst.S(), Src.S());
}

DEF_OP(VSha1SU1) {
  // Emits the AArch64 SHA1SU1 instruction for the VSha1SU1 IR op.
  //
  // The emitted sequence treats the first sha1su1 operand as both an input and
  // the output, so Src1 has to be placed in the destination register before
  // the instruction is issued.
  auto Op = IROp->C<IR::IROp_VSha1SU1>();

  const auto Dst = GetVReg(Node);
  const auto Src1 = GetVReg(Op->Src1.ID());
  const auto Src2 = GetVReg(Op->Src2.ID());

  // The only awkward allocation: Dst shares a register with Src2 but not with
  // Src1. Copying Src1 into Dst would destroy Src2 before it is consumed.
  const bool DstClobbersSrc2 = (Dst != Src1) && (Dst == Src2);

  if (DstClobbersSrc2) {
    // Do the work in the scratch register, then move the result into place.
    mov(VTMP1.Q(), Src1.Q());
    sha1su1(VTMP1, Src2);
    mov(Dst.Q(), VTMP1.Q());
  } else {
    // Either Dst already holds Src1 (no copy needed), or Dst is unrelated to
    // both sources and can be safely seeded with Src1 first.
    if (Dst != Src1) {
      mov(Dst.Q(), Src1.Q());
    }
    sha1su1(Dst, Src2);
  }
}

DEF_OP(VSha256U0) {
auto Op = IROp->C<IR::IROp_VSha256U0>();

Expand Down
45 changes: 28 additions & 17 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,29 +62,40 @@ void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) {
Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags);
Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);

// This instruction mostly matches ARMv8's SHA1SU1 instruction but one of the elements is flipped in an unexpected way.
// Do all the work without it.
Ref Result;
if (CTX->HostFeatures.SupportsSHA) {
// ARM SHA1 mostly matches x86 semantics, except the input and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0.
auto FlipIt = [this](Ref Src) {
auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
};
auto Src1 = FlipIt(Dest);
auto Src2 = FlipIt(Src);

const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
// The result is swizzled differently than expected
Result = FlipIt(_VSha1SU1(Src1, Src2));
} else {
// Shift the incoming source left by a 32-bit element, inserting Zeros.
// This could be slightly improved to use a VInsGPR with the zero register.
const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
auto Src2Shift = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, ZeroRegister, 12);
auto Xor1 = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, Src2Shift);

// Shift the incoming source left by a 32-bit element, inserting Zeros.
// This could be slightly improved to use a VInsGPR with the zero register.
auto Src2Shift = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, ZeroRegister, 12);
auto Xor1 = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, Src2Shift);
// Emulate rotate.
auto ShiftLeftXor1 = _VShlI(OpSize::i128Bit, OpSize::i32Bit, Xor1, 1);
auto RotatedXor1 = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXor1, Xor1, 31);

// Emulate rotate.
auto ShiftLeftXor1 = _VShlI(OpSize::i128Bit, OpSize::i32Bit, Xor1, 1);
auto RotatedXor1 = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXor1, Xor1, 31);
// Element0 didn't get XOR'd with anything, so do it now.
auto ExtractUpper = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, RotatedXor1, 3);
auto XorLower = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, ExtractUpper);

// Element0 didn't get XOR'd with anything, so do it now.
auto ExtractUpper = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, RotatedXor1, 3);
auto XorLower = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, ExtractUpper);
// Emulate rotate.
auto ShiftLeftXorLower = _VShlI(OpSize::i128Bit, OpSize::i32Bit, XorLower, 1);
auto RotatedXorLower = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXorLower, XorLower, 31);

// Emulate rotate.
auto ShiftLeftXorLower = _VShlI(OpSize::i128Bit, OpSize::i32Bit, XorLower, 1);
auto RotatedXorLower = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXorLower, XorLower, 31);
Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 0, 0, RotatedXor1, RotatedXorLower);
}

auto Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 0, 0, RotatedXor1, RotatedXorLower);

StoreResult(FPRClass, Op, Result, OpSize::iInvalid);
}
Expand Down
5 changes: 5 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2656,6 +2656,11 @@
"Desc": "Does vector scalar SHA1H instruction",
"DestSize": "FEXCore::IR::OpSize::i32Bit"
},
"FPR = VSha1SU1 FPR:$Src1, FPR:$Src2": {
"Desc": "Does vector SHA1SU1 instruction",
"DestSize": "FEXCore::IR::OpSize::i128Bit",
"TiedSource": 0
},
"FPR = VSha256U0 FPR:$Src1, FPR:$Src2": {
"Desc": "Does vector scalar VSha256U0 instruction",
"DestSize": "FEXCore::IR::OpSize::i128Bit",
Expand Down
15 changes: 15 additions & 0 deletions unittests/InstructionCountCI/Crypto/H0F38.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@
"mov v16.s[3], v2.s[3]"
]
},
"sha1msg2 xmm0, xmm1": {
"ExpectedInstructionCount": 7,
"Comment": [
"0x66 0x0f 0x38 0xca"
],
"ExpectedArm64ASM": [
"rev64 v2.4s, v16.4s",
"ext v2.16b, v2.16b, v2.16b, #8",
"rev64 v3.4s, v17.4s",
"ext v3.16b, v3.16b, v3.16b, #8",
"unimplemented (Unimplemented)",
"rev64 v2.4s, v2.4s",
"ext v16.16b, v2.16b, v2.16b, #8"
]
},
"sha256msg1 xmm0, xmm1": {
"ExpectedInstructionCount": 1,
"Comment": [
Expand Down

0 comments on commit 34a274d

Please sign in to comment.