From fd391b1b18b07267270469255d7c26581874993e Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 17 Mar 2024 18:40:46 -0700 Subject: [PATCH 1/2] JIT: Optimize pmovmaskb with a named vector constant I was looking at some other JIT overheads and this cropped up as some overhead. Instead of materializing a constant using mov+movk+movk+movk, load it from the named vector constant array. In a micro-benchmark this improved performance by 34%. In bytemark this improved one subbench by 0.82% --- FEXCore/Source/Interface/Core/CPUBackend.cpp | 2 ++ FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp | 2 +- FEXCore/include/FEXCore/IR/IR.h | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/FEXCore/Source/Interface/Core/CPUBackend.cpp b/FEXCore/Source/Interface/Core/CPUBackend.cpp index ddd02a1204..e768a91ef1 100644 --- a/FEXCore/Source/Interface/Core/CPUBackend.cpp +++ b/FEXCore/Source/Interface/Core/CPUBackend.cpp @@ -27,6 +27,8 @@ constexpr static uint64_t NamedVectorConstants[FEXCore::IR::NamedVectorConstant: {0x0706'0504'0302'0100ULL, 0x0F0E'0D0C'FFFF'FFFFULL}, // NAMED_VECTOR_BLENDPS_1011B {0xFFFF'FFFF'0302'0100ULL, 0x0F0E'0D0C'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_1101B {0x0706'0504'FFFF'FFFFULL, 0x0F0E'0D0C'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_1110B + {0x8040'2010'0804'0201ULL, 0x8040'2010'0804'0201ULL}, // NAMED_VECTOR_MOVMASKB + {0x8040'2010'0804'0201ULL, 0x8040'2010'0804'0201ULL}, // NAMED_VECTOR_MOVMASKB_UPPER }; constexpr static auto PSHUFLW_LUT { diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 6532e4d9ab..3ee2d75159 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -1104,7 +1104,7 @@ void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) { const auto ExtractSize = Is256Bit ? 
4 : 2; OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - OrderedNode *VMask = _VDupFromGPR(SrcSize, 8, _Constant(0x80'40'20'10'08'04'02'01ULL)); + OrderedNode *VMask = LoadAndCacheNamedVectorConstant(SrcSize, NAMED_VECTOR_MOVMASKB); auto VCMP = _VCMPLTZ(SrcSize, 1, Src); auto VAnd = _VAnd(SrcSize, 1, VCMP, VMask); diff --git a/FEXCore/include/FEXCore/IR/IR.h b/FEXCore/include/FEXCore/IR/IR.h index 294ad5ba9a..1ca505867e 100644 --- a/FEXCore/include/FEXCore/IR/IR.h +++ b/FEXCore/include/FEXCore/IR/IR.h @@ -533,6 +533,8 @@ enum NamedVectorConstant : uint8_t { NAMED_VECTOR_BLENDPS_1011B, NAMED_VECTOR_BLENDPS_1101B, NAMED_VECTOR_BLENDPS_1110B, + NAMED_VECTOR_MOVMASKB, + NAMED_VECTOR_MOVMASKB_UPPER, NAMED_VECTOR_CONST_POOL_MAX, // Beginning of named constants that don't have a constant pool backing. NAMED_VECTOR_ZERO = NAMED_VECTOR_CONST_POOL_MAX, From 20da1e42448e6f90ce70e4452e9d1336eee5e410 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 17 Mar 2024 18:48:06 -0700 Subject: [PATCH 2/2] InstcountCI: Update for pmovmaskb --- .../InstructionCountCI/Crypto/H0F3A.json | 4 +-- .../InstructionCountCI/FEXOpt/libnss.json | 6 ++--- .../InstructionCountCI/FlagM/Secondary.json | 8 ++---- .../FlagM/Secondary_OpSize.json | 8 ++---- .../InstructionCountCI/FlagM/VEX_map1.json | 17 ++++-------- unittests/InstructionCountCI/H0F38.json | 2 +- unittests/InstructionCountCI/H0F3A.json | 14 +++++----- .../InstructionCountCI/PrimaryGroup.json | 8 +++--- unittests/InstructionCountCI/Secondary.json | 22 +++++++-------- .../InstructionCountCI/Secondary_OpSize.json | 14 ++++------ .../InstructionCountCI/Secondary_REP.json | 2 +- .../InstructionCountCI/Secondary_REPNE.json | 4 +-- unittests/InstructionCountCI/VEX_map1.json | 27 +++++++------------ unittests/InstructionCountCI/VEX_map2.json | 2 +- unittests/InstructionCountCI/VEX_map3.json | 4 +-- 15 files changed, 56 insertions(+), 86 deletions(-) diff --git a/unittests/InstructionCountCI/Crypto/H0F3A.json 
b/unittests/InstructionCountCI/Crypto/H0F3A.json index 62fd0d6802..ba7bbd5e0f 100644 --- a/unittests/InstructionCountCI/Crypto/H0F3A.json +++ b/unittests/InstructionCountCI/Crypto/H0F3A.json @@ -55,7 +55,7 @@ "0x66 0x0f 0x3a 0xdf" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2080]", + "ldr q2, [x28, #2096]", "movi v3.2d, #0x0", "mov v16.16b, v17.16b", "unimplemented (Unimplemented)", @@ -68,7 +68,7 @@ "0x66 0x0f 0x3a 0xdf" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2080]", + "ldr q2, [x28, #2096]", "movi v3.2d, #0x0", "mov v16.16b, v17.16b", "unimplemented (Unimplemented)", diff --git a/unittests/InstructionCountCI/FEXOpt/libnss.json b/unittests/InstructionCountCI/FEXOpt/libnss.json index 6ca42a9feb..88d5048763 100644 --- a/unittests/InstructionCountCI/FEXOpt/libnss.json +++ b/unittests/InstructionCountCI/FEXOpt/libnss.json @@ -197,10 +197,10 @@ "ldr q3, [x11, #272]", "ldr q4, [x11]", "ldr q5, [x11, #16]", - "ldr x0, [x28, #1688]", + "ldr x0, [x28, #1704]", "ldr q6, [x0, #2832]", "tbl v2.16b, {v2.16b}, v6.16b", - "ldr x0, [x28, #1688]", + "ldr x0, [x28, #1704]", "ldr q7, [x0, #432]", "tbl v3.16b, {v3.16b}, v7.16b", "ldr q8, [x11, #32]", @@ -281,7 +281,7 @@ "mov v9.s[2], w25", "mov v9.s[1], w20", "mov v9.s[0], w22", - "ldr x0, [x28, #1688]", + "ldr x0, [x28, #1704]", "ldr q10, [x0, #224]", "tbl v4.16b, {v4.16b}, v10.16b", "mov w20, v9.s[1]", diff --git a/unittests/InstructionCountCI/FlagM/Secondary.json b/unittests/InstructionCountCI/FlagM/Secondary.json index 0dfe194242..c35fc41f3d 100644 --- a/unittests/InstructionCountCI/FlagM/Secondary.json +++ b/unittests/InstructionCountCI/FlagM/Secondary.json @@ -1614,15 +1614,11 @@ ] }, "pmovmskb eax, mm0": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 8, "Comment": "0x0f 0xd7", "ExpectedArm64ASM": [ "ldr d2, [x28, #768]", - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "dup v3.2d, x20", + "ldr d3, [x28, #2208]", "cmlt v2.16b, v2.16b, 
#0", "and v2.16b, v2.16b, v3.16b", "addp v2.16b, v2.16b, v2.16b", diff --git a/unittests/InstructionCountCI/FlagM/Secondary_OpSize.json b/unittests/InstructionCountCI/FlagM/Secondary_OpSize.json index c96d737118..2fc1391da1 100644 --- a/unittests/InstructionCountCI/FlagM/Secondary_OpSize.json +++ b/unittests/InstructionCountCI/FlagM/Secondary_OpSize.json @@ -35,14 +35,10 @@ ] }, "pmovmskb eax, xmm0": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 7, "Comment": "0x66 0x0f 0xd7", "ExpectedArm64ASM": [ - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "dup v2.2d, x20", + "ldr q2, [x28, #2208]", "cmlt v3.16b, v16.16b, #0", "and v2.16b, v3.16b, v2.16b", "addp v2.16b, v2.16b, v2.16b", diff --git a/unittests/InstructionCountCI/FlagM/VEX_map1.json b/unittests/InstructionCountCI/FlagM/VEX_map1.json index a4a1179c59..bce4ae2228 100644 --- a/unittests/InstructionCountCI/FlagM/VEX_map1.json +++ b/unittests/InstructionCountCI/FlagM/VEX_map1.json @@ -67,16 +67,12 @@ ] }, "vpmovmskb rax, xmm0": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 7, "Comment": [ "Map 1 0b01 0xd7 256-bit" ], "ExpectedArm64ASM": [ - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "dup v2.2d, x20", + "ldr q2, [x28, #2208]", "cmlt v3.16b, v16.16b, #0", "and v2.16b, v3.16b, v2.16b", "addp v2.16b, v2.16b, v2.16b", @@ -86,16 +82,13 @@ ] }, "vpmovmskb rax, ymm0": { - "ExpectedInstructionCount": 21, + "ExpectedInstructionCount": 18, "Comment": [ "Map 1 0b01 0xd7 256-bit" ], "ExpectedArm64ASM": [ - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "mov z2.d, x20", + "ldr x0, [x28, #1672]", + "ld1b {z2.b}, p7/z, [x0]", "mrs x0, nzcv", "mov z0.d, #0", "cmplt p0.b, p7/z, z16.b, #0", diff --git a/unittests/InstructionCountCI/H0F38.json b/unittests/InstructionCountCI/H0F38.json 
index 59fd643440..2b8e180cd2 100644 --- a/unittests/InstructionCountCI/H0F38.json +++ b/unittests/InstructionCountCI/H0F38.json @@ -624,7 +624,7 @@ "0x66 0x0f 0x38 0x41" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #1968]", + "ldr q2, [x28, #1984]", "zip1 v3.8h, v2.8h, v17.8h", "zip2 v2.8h, v2.8h, v17.8h", "umin v2.4s, v3.4s, v2.4s", diff --git a/unittests/InstructionCountCI/H0F3A.json b/unittests/InstructionCountCI/H0F3A.json index c79dc9b6eb..6411205ce9 100644 --- a/unittests/InstructionCountCI/H0F3A.json +++ b/unittests/InstructionCountCI/H0F3A.json @@ -315,7 +315,7 @@ "0x66 0x0f 0x3a 0x0c" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2096]", + "ldr q2, [x28, #2112]", "tbx v16.16b, {v17.16b}, v2.16b" ] }, @@ -325,7 +325,7 @@ "0x66 0x0f 0x3a 0x0c" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2112]", + "ldr q2, [x28, #2128]", "tbx v16.16b, {v17.16b}, v2.16b" ] }, @@ -344,7 +344,7 @@ "0x66 0x0f 0x3a 0x0c" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2128]", + "ldr q2, [x28, #2144]", "tbx v16.16b, {v17.16b}, v2.16b" ] }, @@ -364,7 +364,7 @@ "0x66 0x0f 0x3a 0x0c" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2144]", + "ldr q2, [x28, #2160]", "tbx v16.16b, {v17.16b}, v2.16b" ] }, @@ -383,7 +383,7 @@ "0x66 0x0f 0x3a 0x0c" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2160]", + "ldr q2, [x28, #2176]", "tbx v16.16b, {v17.16b}, v2.16b" ] }, @@ -393,7 +393,7 @@ "0x66 0x0f 0x3a 0x0c" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2176]", + "ldr q2, [x28, #2192]", "tbx v16.16b, {v17.16b}, v2.16b" ] }, @@ -462,7 +462,7 @@ "0x66 0x0f 0x3a 0x0e" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1720]", + "ldr x0, [x28, #1736]", "ldr q2, [x0, #3440]", "tbx v16.16b, {v17.16b}, v2.16b" ] diff --git a/unittests/InstructionCountCI/PrimaryGroup.json b/unittests/InstructionCountCI/PrimaryGroup.json index 7d3b13adc6..34a1738496 100644 --- a/unittests/InstructionCountCI/PrimaryGroup.json +++ b/unittests/InstructionCountCI/PrimaryGroup.json @@ -2909,7 +2909,7 @@ "mov x0, x6", "mov x1, x20", "mov x2, x7", - "ldr 
x3, [x28, #2224]", + "ldr x3, [x28, #2272]", "str x30, [sp, #-16]!", "blr x3", "ldr x30, [sp], #16", @@ -2920,7 +2920,7 @@ "mov x0, x6", "mov x1, x20", "mov x2, x7", - "ldr x3, [x28, #2240]", + "ldr x3, [x28, #2288]", "str x30, [sp, #-16]!", "blr x3", "ldr x30, [sp], #16", @@ -2981,7 +2981,7 @@ "mov x0, x6", "mov x1, x20", "mov x2, x7", - "ldr x3, [x28, #2232]", + "ldr x3, [x28, #2280]", "str x30, [sp, #-16]!", "blr x3", "ldr x30, [sp], #16", @@ -2994,7 +2994,7 @@ "mov x0, x6", "mov x1, x20", "mov x2, x7", - "ldr x3, [x28, #2248]", + "ldr x3, [x28, #2296]", "str x30, [sp, #-16]!", "blr x3", "ldr x30, [sp], #16", diff --git a/unittests/InstructionCountCI/Secondary.json b/unittests/InstructionCountCI/Secondary.json index 13aa7e9f5e..c86597fadc 100644 --- a/unittests/InstructionCountCI/Secondary.json +++ b/unittests/InstructionCountCI/Secondary.json @@ -646,7 +646,7 @@ "Comment": "0x0f 0x50", "ExpectedArm64ASM": [ "ushr v2.4s, v16.4s, #31", - "ldr q3, [x28, #2064]", + "ldr q3, [x28, #2080]", "ushl v2.4s, v2.4s, v3.4s", "addv s2, v2.4s", "mov w4, v2.s[0]" @@ -657,7 +657,7 @@ "Comment": "0x0f 0x50", "ExpectedArm64ASM": [ "ushr v2.4s, v16.4s, #31", - "ldr q3, [x28, #2064]", + "ldr q3, [x28, #2080]", "ushl v2.4s, v2.4s, v3.4s", "addv s2, v2.4s", "mov w4, v2.s[0]" @@ -1041,7 +1041,7 @@ "Comment": "0x0f 0x70", "ExpectedArm64ASM": [ "ldr d2, [x28, #784]", - "ldr x0, [x28, #1672]", + "ldr x0, [x28, #1688]", "ldr d3, [x0, #16]", "tbl v2.8b, {v2.16b}, v3.8b", "str d2, [x28, #768]" @@ -1052,7 +1052,7 @@ "Comment": "0x0f 0x70", "ExpectedArm64ASM": [ "ldr d2, [x4]", - "ldr x0, [x28, #1672]", + "ldr x0, [x28, #1688]", "ldr d3, [x0, #16]", "tbl v2.8b, {v2.16b}, v3.8b", "str d2, [x28, #768]" @@ -3315,7 +3315,7 @@ "ExpectedInstructionCount": 3, "Comment": "0x0f 0xc6", "ExpectedArm64ASM": [ - "ldr x0, [x28, #1696]", + "ldr x0, [x28, #1712]", "ldr q2, [x0, #16]", "tbl v16.16b, {v16.16b, v17.16b}, v2.16b" ] @@ -3324,7 +3324,7 @@ "ExpectedInstructionCount": 5, "Comment": "0x0f 0xc6", 
"ExpectedArm64ASM": [ - "ldr x0, [x28, #1696]", + "ldr x0, [x28, #1712]", "ldr q2, [x0, #16]", "mov v0.16b, v17.16b", "mov v1.16b, v16.16b", @@ -3336,7 +3336,7 @@ "Comment": "0x0f 0xc6", "ExpectedArm64ASM": [ "ldr q2, [x4]", - "ldr x0, [x28, #1696]", + "ldr x0, [x28, #1712]", "ldr q3, [x0, #16]", "mov v0.16b, v16.16b", "mov v1.16b, v2.16b", @@ -3430,15 +3430,11 @@ ] }, "pmovmskb eax, mm0": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 8, "Comment": "0x0f 0xd7", "ExpectedArm64ASM": [ "ldr d2, [x28, #768]", - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "dup v3.2d, x20", + "ldr d3, [x28, #2208]", "cmlt v2.16b, v2.16b, #0", "and v2.16b, v2.16b, v3.16b", "addp v2.16b, v2.16b, v2.16b", diff --git a/unittests/InstructionCountCI/Secondary_OpSize.json b/unittests/InstructionCountCI/Secondary_OpSize.json index 162204941e..cf4efd1202 100644 --- a/unittests/InstructionCountCI/Secondary_OpSize.json +++ b/unittests/InstructionCountCI/Secondary_OpSize.json @@ -522,7 +522,7 @@ "0x66 0x0f 0x70" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1688]", + "ldr x0, [x28, #1704]", "ldr q2, [x0, #16]", "tbl v16.16b, {v17.16b}, v2.16b" ] @@ -536,7 +536,7 @@ ], "ExpectedArm64ASM": [ "ldr q2, [x4]", - "ldr x0, [x28, #1688]", + "ldr x0, [x28, #1704]", "ldr q3, [x0, #16]", "tbl v16.16b, {v2.16b}, v3.16b" ] @@ -1014,7 +1014,7 @@ "ExpectedInstructionCount": 3, "Comment": "0x66 0x0f 0xd0", "ExpectedArm64ASM": [ - "ldr q2, [x28, #2032]", + "ldr q2, [x28, #2048]", "eor v2.16b, v17.16b, v2.16b", "fadd v16.2d, v16.2d, v2.2d" ] @@ -1067,14 +1067,10 @@ ] }, "pmovmskb eax, xmm0": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 7, "Comment": "0x66 0x0f 0xd7", "ExpectedArm64ASM": [ - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "dup v2.2d, x20", + "ldr q2, [x28, #2208]", "cmlt v3.16b, v16.16b, #0", "and v2.16b, v3.16b, v2.16b", 
"addp v2.16b, v2.16b, v2.16b", diff --git a/unittests/InstructionCountCI/Secondary_REP.json b/unittests/InstructionCountCI/Secondary_REP.json index a4fa5d02a7..bc1f11e7fb 100644 --- a/unittests/InstructionCountCI/Secondary_REP.json +++ b/unittests/InstructionCountCI/Secondary_REP.json @@ -354,7 +354,7 @@ "0xf3 0x0f 0x70" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1680]", + "ldr x0, [x28, #1696]", "ldr q2, [x0, #16]", "tbl v16.16b, {v17.16b}, v2.16b" ] diff --git a/unittests/InstructionCountCI/Secondary_REPNE.json b/unittests/InstructionCountCI/Secondary_REPNE.json index da29314b7d..01de2bdc18 100644 --- a/unittests/InstructionCountCI/Secondary_REPNE.json +++ b/unittests/InstructionCountCI/Secondary_REPNE.json @@ -296,7 +296,7 @@ "0xf2 0x0f 0x70" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1672]", + "ldr x0, [x28, #1688]", "ldr q2, [x0, #16]", "tbl v16.16b, {v17.16b}, v2.16b" ] @@ -452,7 +452,7 @@ "ExpectedInstructionCount": 3, "Comment": "0xf2 0x0f 0xd0", "ExpectedArm64ASM": [ - "ldr q2, [x28, #2000]", + "ldr q2, [x28, #2016]", "eor v2.16b, v17.16b, v2.16b", "fadd v16.4s, v16.4s, v2.4s" ] diff --git a/unittests/InstructionCountCI/VEX_map1.json b/unittests/InstructionCountCI/VEX_map1.json index f8d057ec79..e1195c29c2 100644 --- a/unittests/InstructionCountCI/VEX_map1.json +++ b/unittests/InstructionCountCI/VEX_map1.json @@ -2755,7 +2755,7 @@ "Map 1 0b00 0xC6 128-bit" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1696]", + "ldr x0, [x28, #1712]", "ldr q2, [x0, #16]", "tbl v16.16b, {v17.16b, v18.16b}, v2.16b" ] @@ -2824,7 +2824,7 @@ "Map 1 0b00 0xC6 128-bit" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1696]", + "ldr x0, [x28, #1712]", "ldr q2, [x0, #32]", "tbl v16.16b, {v17.16b, v18.16b}, v2.16b" ] @@ -2893,7 +2893,7 @@ "Map 1 0b00 0xC6 128-bit" ], "ExpectedArm64ASM": [ - "ldr x0, [x28, #1696]", + "ldr x0, [x28, #1712]", "ldr q2, [x0, #48]", "tbl v16.16b, {v17.16b, v18.16b}, v2.16b" ] @@ -4338,7 +4338,7 @@ "Map 1 0b01 0xd0 128-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, 
#2032]", + "ldr q2, [x28, #2048]", "eor v2.16b, v18.16b, v2.16b", "fadd v16.2d, v17.2d, v2.2d" ] @@ -4361,7 +4361,7 @@ "Map 1 0b11 0xd0 128-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2000]", + "ldr q2, [x28, #2016]", "eor v2.16b, v18.16b, v2.16b", "fadd v16.4s, v17.4s, v2.4s" ] @@ -4493,16 +4493,12 @@ ] }, "vpmovmskb rax, xmm0": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 7, "Comment": [ "Map 1 0b01 0xd7 256-bit" ], "ExpectedArm64ASM": [ - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "dup v2.2d, x20", + "ldr q2, [x28, #2208]", "cmlt v3.16b, v16.16b, #0", "and v2.16b, v3.16b, v2.16b", "addp v2.16b, v2.16b, v2.16b", @@ -4512,16 +4508,13 @@ ] }, "vpmovmskb rax, ymm0": { - "ExpectedInstructionCount": 21, + "ExpectedInstructionCount": 18, "Comment": [ "Map 1 0b01 0xd7 256-bit" ], "ExpectedArm64ASM": [ - "mov x20, #0x201", - "movk x20, #0x804, lsl #16", - "movk x20, #0x2010, lsl #32", - "movk x20, #0x8040, lsl #48", - "mov z2.d, x20", + "ldr x0, [x28, #1672]", + "ld1b {z2.b}, p7/z, [x0]", "mrs x0, nzcv", "mov z0.d, #0", "cmplt p0.b, p7/z, z16.b, #0", diff --git a/unittests/InstructionCountCI/VEX_map2.json b/unittests/InstructionCountCI/VEX_map2.json index d0caa9970d..e98447f879 100644 --- a/unittests/InstructionCountCI/VEX_map2.json +++ b/unittests/InstructionCountCI/VEX_map2.json @@ -1575,7 +1575,7 @@ "Map 2 0b01 0x41 256-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #1968]", + "ldr q2, [x28, #1984]", "zip1 v3.8h, v2.8h, v17.8h", "zip2 v2.8h, v2.8h, v17.8h", "umin v2.4s, v3.4s, v2.4s", diff --git a/unittests/InstructionCountCI/VEX_map3.json b/unittests/InstructionCountCI/VEX_map3.json index b410199957..57330e2cb4 100644 --- a/unittests/InstructionCountCI/VEX_map3.json +++ b/unittests/InstructionCountCI/VEX_map3.json @@ -4799,7 +4799,7 @@ "Map 3 0b01 0xdf 128-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2080]", + "ldr q2, [x28, #2096]", "movi v3.2d, #0x0", "mov v16.16b, 
v17.16b", "unimplemented (Unimplemented)", @@ -4812,7 +4812,7 @@ "Map 3 0b01 0xdf 128-bit" ], "ExpectedArm64ASM": [ - "ldr q2, [x28, #2080]", + "ldr q2, [x28, #2096]", "movi v3.2d, #0x0", "mov v16.16b, v17.16b", "unimplemented (Unimplemented)",