diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 20a72a96dd780c..d3ae09a06232ee 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2249,8 +2249,7 @@ class emitter ssize_t emitGetInsCIdisp(instrDesc* id); unsigned emitGetInsCIargs(instrDesc* id); - inline emitAttr emitGetMemOpSize(instrDesc* id) const; - inline emitAttr emitGetBaseMemOpSize(instrDesc*) const; + inline emitAttr emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast = false) const; // Return the argument count for a direct call "id". int emitGetInsCDinfo(instrDesc* id); @@ -3958,51 +3957,11 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) //----------------------------------------------------------------------------- // emitGetMemOpSize: Get the memory operand size of instrDesc. // -// Note: there are cases when embedded broadcast is enabled, so the memory operand -// size is different from the intrinsic simd size, we will check here if emitter is -// emiting a embedded broadcast enabled instruction. - -// Arguments: -// id - Instruction descriptor -// -emitAttr emitter::emitGetMemOpSize(instrDesc* id) const -{ - if (id->idIsEvexbContextSet()) - { - // should have the assumption that Evex.b now stands for the embedded broadcast context. - // reference: Section 2.7.5 in Intel 64 and ia-32 architectures software developer's manual volume 2. - ssize_t inputSize = GetInputSizeInBytes(id); - switch (inputSize) - { - case 4: - return EA_4BYTE; - case 8: - return EA_8BYTE; - - default: - unreached(); - } - } - else - { - return emitGetBaseMemOpSize(id); - } -} - -//----------------------------------------------------------------------------- -// emitGetMemOpSize: Get the memory operand size of instrDesc. -// -// Note: vextractf128 has a 128-bit output (register or memory) but a 256-bit input (register). -// vinsertf128 is the inverse with a 256-bit output (register), a 256-bit input(register), -// and a 128-bit input (register or memory). -// Similarly, vextractf64x4 has a 256-bit output and 128-bit input and vinsertf64x4 the inverse -// This method is mainly used for such instructions to return the appropriate memory operand -// size, otherwise returns the regular operand size of the instruction. - // Arguments: -// id - Instruction descriptor +// id - Instruction descriptor +// ignoreEmbeddedBroadcast - true to get the non-embedded operand size; otherwise false // -emitAttr emitter::emitGetBaseMemOpSize(instrDesc* id) const +emitAttr emitter::emitGetMemOpSize(instrDesc* id, bool ignoreEmbeddedBroadcast) const { ssize_t memSize = 0; @@ -4018,7 +3977,7 @@ emitAttr emitter::emitGetBaseMemOpSize(instrDesc* id) const else if (tupleType == INS_TT_FULL) { // Embedded broadcast supported, so either loading scalar or full vector - if (id->idIsEvexbContextSet()) + if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) { memSize = GetInputSizeInBytes(id); } @@ -4040,7 +3999,7 @@ emitAttr emitter::emitGetBaseMemOpSize(instrDesc* id) const { memSize = 16; } - else if (id->idIsEvexbContextSet()) + else if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) { memSize = GetInputSizeInBytes(id); } @@ -4052,7 +4011,7 @@ emitAttr emitter::emitGetBaseMemOpSize(instrDesc* id) const else if (tupleType == INS_TT_HALF) { // Embedded broadcast supported, so either loading scalar or half vector - if (id->idIsEvexbContextSet()) + if (id->idIsEvexbContextSet() && !ignoreEmbeddedBroadcast) { memSize = GetInputSizeInBytes(id); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 808fa1360337c5..e3561bbff37978 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10987,7 +10987,7 @@ void emitter::emitDispEmbBroadcastCount(instrDesc* id) const return; } ssize_t baseSize = GetInputSizeInBytes(id); - ssize_t vectorSize = (ssize_t)emitGetBaseMemOpSize(id); + ssize_t vectorSize = (ssize_t)emitGetMemOpSize(id, /* ignoreEmbeddedBroadcast */ true); printf(" {1to%d}", vectorSize / baseSize); } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index bcab097e35f2bf..f687f9139f5981 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20933,6 +20933,13 @@ GenTree* Compiler::gtNewSimdBinOpNode( std::swap(op1, op2); #endif // TARGET_XARCH } +#ifdef TARGET_XARCH + if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsic) && varTypeIsSmall(simdBaseType)) + { + simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_INT; + simdBaseType = JitType2PreciseVarType(simdBaseJitType); + } +#endif // TARGET_XARCH return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); } @@ -25691,6 +25698,15 @@ GenTree* Compiler::gtNewSimdTernaryLogicNode(var_types type, intrinsic = NI_AVX512F_VL_TernaryLogic; } +#ifdef TARGET_XARCH + assert(HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsic)); + if (varTypeIsSmall(simdBaseType)) + { + simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_INT; + simdBaseType = JitType2PreciseVarType(simdBaseJitType); + } +#endif // TARGET_XARCH + return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, op4, intrinsic, simdBaseJitType, simdSize); } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 3d895ff846eb78..e791d2fd064d3c 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6667,6 +6667,20 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool ShouldConstantProp(GenTree* operand, GenTreeVecCon* vecCon); + void NormalizeJitBaseTypeToInt(NamedIntrinsic id, var_types simdBaseType) + { + assert(varTypeIsSmall(simdBaseType)); + + if (varTypeIsUnsigned(simdBaseType)) + { + SetSimdBaseJitType(CORINFO_TYPE_UINT); + } + else + { + SetSimdBaseJitType(CORINFO_TYPE_UINT); + } + } + private: void SetHWIntrinsicId(NamedIntrinsic intrinsicId); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 05ab3da9f3b206..9d3db42ec4c097 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -1818,6 +1818,13 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, if (simdBaseJitType != CORINFO_TYPE_UNDEF) { simdBaseType = JitType2PreciseVarType(simdBaseJitType); +#ifdef TARGET_XARCH + if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsic) && varTypeIsSmall(simdBaseType)) + { + simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_INT; + simdBaseType = JitType2PreciseVarType(simdBaseJitType); + } +#endif // TARGET_XARCH } const unsigned simdSize = HWIntrinsicInfo::lookupSimdSize(this, intrinsic, sig); diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index d1b8340c5677f9..15e942a4157089 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -227,6 +227,9 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic is an embedded masking compatible intrinsic HW_Flag_EmbMaskingCompatible = 0x10000000, + + // The base type of this intrinsic needs to be normalized to int/uint unless it is long/ulong. + HW_Flag_NormalizeSmallTypeToInt = 0x20000000, #elif defined(TARGET_ARM64) // The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand. @@ -755,6 +758,12 @@ struct HWIntrinsicInfo HWIntrinsicFlag flags = lookupFlags(id); return (flags & HW_Flag_MaybeMemoryStore) != 0; } + + static bool NeedsNormalizeSmallTypeToInt(NamedIntrinsic id) + { + HWIntrinsicFlag flags = lookupFlags(id); + return (flags & HW_Flag_NormalizeSmallTypeToInt) != 0; + } #endif static bool NoJmpTableImm(NamedIntrinsic id) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index cb6d1821e3f034..c5b875a9630c2b 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -186,6 +186,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // We need to validate that other phases of the compiler haven't introduced unsupported intrinsics assert(compiler->compIsaSupportedDebugOnly(isa)); assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId)); + assert(!HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId) || !varTypeIsSmall(node->GetSimdBaseType())); bool isTableDriven = genIsTableDrivenHWIntrinsic(intrinsicId, category); insOpts instOptions = INS_OPTS_NONE; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 27adbaacf3ea1b..d44457c727c6fb 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -489,8 +489,8 @@ HARDWARE_INTRINSIC(SSE_X64, ConvertToInt64WithTruncation, HARDWARE_INTRINSIC(SSE2, Add, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) @@ -562,7 +562,7 @@ HARDWARE_INTRINSIC(SSE2, MultiplyAddAdjacent, HARDWARE_INTRINSIC(SSE2, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, Or, 16, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(SSE2, Or, 16, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(SSE2, PackSignedSaturate, 16, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, ShiftLeftLogical, 16, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) @@ -588,7 +588,7 @@ HARDWARE_INTRINSIC(SSE2, SubtractScalar, HARDWARE_INTRINSIC(SSE2, SumAbsoluteDifferences, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, UnpackHigh, 16, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(SSE2, UnpackLow, 16, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) +HARDWARE_INTRINSIC(SSE2, Xor, 16, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp|HW_Flag_NormalizeSmallTypeToInt) #define LAST_NI_SSE2 NI_SSE2_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -802,8 +802,8 @@ HARDWARE_INTRINSIC(AVX2, Abs, HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) @@ -840,7 +840,7 @@ HARDWARE_INTRINSIC(AVX2, MultiplyAddAdjacent, HARDWARE_INTRINSIC(AVX2, MultiplyHigh, 32, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, MultiplyHighRoundScale, 32, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, MultiplyLow, 32, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, Or, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX2, Or, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX2, PackSignedSaturate, 32, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, PackUnsignedSaturate, 32, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) @@ -863,7 +863,7 @@ HARDWARE_INTRINSIC(AVX2, SubtractSaturate, HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, UnpackHigh, 32, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX2, UnpackLow, 32, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) +HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp|HW_Flag_NormalizeSmallTypeToInt) #define LAST_NI_AVX2 NI_AVX2_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -877,8 +877,8 @@ HARDWARE_INTRINSIC(AVX512F, Add, HARDWARE_INTRINSIC(AVX512F, AddScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(AVX512F, AndNot, 64, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, BlendVariable, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, BroadcastVector128ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeMemoryLoad) @@ -933,7 +933,7 @@ HARDWARE_INTRINSIC(AVX512F, DivideScalar, HARDWARE_INTRINSIC(AVX512F, DuplicateEvenIndexed, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, DuplicateOddIndexed, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, ExtractVector128, 64, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextractf128, INS_vextractf128}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, ExtractVector256, 64, 2, {INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextracti64x4, INS_vextractf64x4, INS_vextractf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, ExtractVector256, 64, 2, {INS_vextracti32x8, INS_vextracti32x8, INS_vextracti32x8, INS_vextracti32x8, INS_vextracti32x8, INS_vextracti32x8, INS_vextracti64x4, INS_vextracti64x4, INS_vextractf64x4, INS_vextractf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, Fixup, 64, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfixupimmps, INS_vfixupimmpd}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, FixupScalar, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfixupimmss, INS_vfixupimmsd}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, FusedMultiplyAdd, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vfmadd213ps, INS_vfmadd213pd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_FmaIntrinsic|HW_Flag_RmwIntrinsic|HW_Flag_EmbRoundingCompatible|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) @@ -951,7 +951,7 @@ HARDWARE_INTRINSIC(AVX512F, GetExponentScalar, HARDWARE_INTRINSIC(AVX512F, GetMantissa, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetmantps, INS_vgetmantpd}, HW_Category_IMM, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, GetMantissaScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vgetmantss, INS_vgetmantsd}, HW_Category_IMM, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, InsertVector128, 64, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti32x8, INS_vinserti32x8, INS_vinserti32x8, INS_vinserti32x8, INS_vinserti32x8, INS_vinserti32x8, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_vmovdqa64, INS_vmovdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512F, LoadVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) @@ -960,7 +960,7 @@ HARDWARE_INTRINSIC(AVX512F, Min, HARDWARE_INTRINSIC(AVX512F, Multiply, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F, MultiplyLow, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, MultiplyScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_mulsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, Permute2x64, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, Permute4x32, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, Permute4x64, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) @@ -997,10 +997,10 @@ HARDWARE_INTRINSIC(AVX512F, StoreAligned, HARDWARE_INTRINSIC(AVX512F, StoreAlignedNonTemporal, 64, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX512F, Subtract, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX512F, SubtractScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) -HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F, TernaryLogic, 64, 4, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512F, UnpackHigh, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F, UnpackLow, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp) +HARDWARE_INTRINSIC(AVX512F, Xor, 64, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_vpxorq, INS_vpxorq, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_CanBenefitFromConstantProp|HW_Flag_NormalizeSmallTypeToInt) #define LAST_NI_AVX512F NI_AVX512F_Xor // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1057,7 +1057,7 @@ HARDWARE_INTRINSIC(AVX512F_VL, Scale, HARDWARE_INTRINSIC(AVX512F_VL, ShiftRightArithmetic, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsraq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F_VL, ShiftRightArithmeticVariable, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpsravq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) HARDWARE_INTRINSIC(AVX512F_VL, Shuffle2x128, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vshufi32x4, INS_vshufi32x4, INS_vshufi64x2, INS_vshufi64x2, INS_vshuff32x4, INS_vshuff64x2}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX512F_VL, TernaryLogic, -1, 4, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX512F_VL, TernaryLogic, -1, 4, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) #define LAST_NI_AVX512F_VL NI_AVX512F_VL_TernaryLogic // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -1356,7 +1356,7 @@ HARDWARE_INTRINSIC(AVX10v1, Shuffle2x128, HARDWARE_INTRINSIC(AVX10v1, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX10v1, SubtractScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_EmbRoundingCompatible) HARDWARE_INTRINSIC(AVX10v1, SumAbsoluteDifferencesInBlock32, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_vdbpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_EmbMaskingCompatible) -HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, -1, 4, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible) +HARDWARE_INTRINSIC(AVX10v1, TernaryLogic, -1, 4, {INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogd, INS_vpternlogq, INS_vpternlogq, INS_vpternlogd, INS_vpternlogq}, HW_Category_IMM, HW_Flag_SpecialImport|HW_Flag_FullRangeIMM|HW_Flag_EmbBroadcastCompatible|HW_Flag_EmbMaskingCompatible|HW_Flag_NormalizeSmallTypeToInt) #define LAST_NI_AVX10v1 NI_AVX10v1_TernaryLogic // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index eb0ae8e6ec967e..fb8d63e5aaca27 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1672,6 +1672,11 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) BlockRange().InsertBefore(userIntrin, op4); userIntrin->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, op4); + if (varTypeIsSmall(simdBaseType)) + { + assert(HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(ternaryLogicId)); + userIntrin->NormalizeJitBaseTypeToInt(ternaryLogicId, simdBaseType); + } return nextNode; } } @@ -1737,6 +1742,11 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) BlockRange().InsertBefore(node, control); node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control); + if (varTypeIsSmall(simdBaseType)) + { + assert(HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(ternaryLogicId)); + node->NormalizeJitBaseTypeToInt(ternaryLogicId, simdBaseType); + } return LowerNode(node); } } @@ -1823,6 +1833,10 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerNode(op2); node->ResetHWIntrinsicId(intrinsicId, comp, op1, op2); + if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId) && varTypeIsSmall(simdBaseType)) + { + node->NormalizeJitBaseTypeToInt(intrinsicId, simdBaseType); + } break; } @@ -1882,6 +1896,10 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerNode(op3); node->ResetHWIntrinsicId(intrinsicId, comp, op1, op2, op3); + if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId) && varTypeIsSmall(simdBaseType)) + { + node->NormalizeJitBaseTypeToInt(intrinsicId, simdBaseType); + } break; } @@ -3052,11 +3070,12 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm GenTreeHWIntrinsic* op1Intrinsic = op1->AsHWIntrinsic(); NamedIntrinsic op1IntrinsicId = op1Intrinsic->GetHWIntrinsicId(); - switch (op1IntrinsicId) + bool isScalar = false; + genTreeOps oper = op1Intrinsic->GetOperForHWIntrinsicId(&isScalar); + + switch (oper) { - case NI_AVX512F_And: - case NI_AVX512DQ_And: - case NI_AVX10v1_V512_And: + case GT_AND: { // We have `(x & y) == 0` with GenCondition::EQ (jz, setz, cmovz) // or `(x & y) != 0`with GenCondition::NE (jnz, setnz, cmovnz) @@ -3076,11 +3095,62 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm assert(testIntrinsicId == NI_EVEX_PTESTM); - node->Op(1) = op1Intrinsic->Op(1); - node->Op(2) = op1Intrinsic->Op(2); + GenTree* nestedOp1 = op1Intrinsic->Op(1); + GenTree* nestedOp2 = op1Intrinsic->Op(2); + + if (nestedOp2->isContained() && nestedOp2->OperIsHWIntrinsic()) + { + GenTreeHWIntrinsic* nestedIntrin = nestedOp2->AsHWIntrinsic(); + NamedIntrinsic nestedIntrinId = nestedIntrin->GetHWIntrinsicId(); + + if ((nestedIntrinId == NI_SSE3_MoveAndDuplicate) || + (nestedIntrinId == NI_AVX2_BroadcastScalarToVector128) || + (nestedIntrinId == NI_AVX2_BroadcastScalarToVector256) || + (nestedIntrinId == NI_AVX512F_BroadcastScalarToVector512)) + { + // We need to rewrite the embedded broadcast back to a regular constant + // so that the subsequent containment check for ptestm can determine + // if the embedded broadcast is still relevant + + GenTree* broadcastOp = nestedIntrin->Op(1); + + if (broadcastOp->OperIsHWIntrinsic(NI_Vector128_CreateScalarUnsafe)) + { + BlockRange().Remove(broadcastOp); + broadcastOp = broadcastOp->AsHWIntrinsic()->Op(1); + } + + assert(broadcastOp->OperIsConst()); + + GenTree* vecCns = + comp->gtNewSimdCreateBroadcastNode(simdType, broadcastOp, + op1Intrinsic->GetSimdBaseJitType(), simdSize); + + BlockRange().InsertAfter(broadcastOp, vecCns); + nestedOp2 = vecCns; + + BlockRange().Remove(broadcastOp); + BlockRange().Remove(nestedIntrin); + } + } + + node->Op(1) = nestedOp1; + node->Op(2) = nestedOp2; // Make sure we aren't contained since ptestm will do its own containment check - node->Op(2)->ClearContained(); + nestedOp2->ClearContained(); + + if (varTypeIsSmall(simdBaseType)) + { + // Fixup the base type so embedded broadcast and the mask size checks still work + node->NormalizeJitBaseTypeToInt(testIntrinsicId, simdBaseType); + + simdBaseJitType = node->GetSimdBaseJitType(); + simdBaseType = node->GetSimdBaseType(); + + maskBaseJitType = simdBaseJitType; + maskBaseType = simdBaseType; + } BlockRange().Remove(op1); BlockRange().Remove(op2); @@ -3458,6 +3528,11 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) BlockRange().InsertBefore(node, control); node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control); + if (varTypeIsSmall(simdBaseType)) + { + assert(HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(ternaryLogicId)); + node->NormalizeJitBaseTypeToInt(ternaryLogicId, simdBaseType); + } return LowerNode(node); } diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_105969/Runtime_105969.cs b/src/tests/JIT/Regression/JitBlue/Runtime_105969/Runtime_105969.cs new file mode 100644 index 00000000000000..38ed741f330ef5 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_105969/Runtime_105969.cs @@ -0,0 +1,43 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Runtime.CompilerServices; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using Xunit; + +// Generated by Fuzzlyn v2.2 on 2024-08-04 15:50:11 +// Run on X64 Linux +// Seed: 9513455124659677293-vectort,vector128,vector256,vector512,x86aes,x86avx,x86avx2,x86avx512bw,x86avx512bwvl,x86avx512cd,x86avx512cdvl,x86avx512dq,x86avx512dqvl,x86avx512f,x86avx512fvl,x86avx512fx64,x86avx512vbmi,x86avx512vbmivl,x86bmi1,x86bmi1x64,x86bmi2,x86bmi2x64,x86fma,x86lzcnt,x86lzcntx64,x86pclmulqdq,x86popcnt,x86popcntx64,x86sse,x86ssex64,x86sse2,x86sse2x64,x86sse3,x86sse41,x86sse41x64,x86sse42,x86sse42x64,x86ssse3,x86x86base +// Reduced from 26.0 KiB to 1.1 KiB in 00:00:56 +// Debug: Outputs <0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> +// Release: Outputs <0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> + +public class Runtime_105969 +{ + public static byte s_5; + + [Fact] + public static void TestEntryPoint() + { + if (Avx512BW.IsSupported) + { + var vr16 = Vector512.CreateScalar(s_5); + var vr17 = Vector512.Create(1); + var vr18 = (byte)0; + var vr19 = Vector512.CreateScalar(vr18); + var vr20 = Vector128.Create(0); + var vr21 = Avx512BW.BroadcastScalarToVector512(vr20); + var vr22 = Vector256.Create(1); + var vr23 = Avx512F.InsertVector256(vr21, vr22, 0); + var vr24 = Vector512.Create(249, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0); + var vr25 = Avx512BW.BlendVariable(vr19, vr23, vr24); + var vr26 = Avx512BW.Min(vr17, vr25); + Vector512 vr27 = Avx512BW.UnpackLow(vr16, vr26); + Vector512 expected = Vector512.Create(0, (byte)1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + Assert.Equal(expected, vr27); + } + } +} diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_105969/Runtime_105969.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_105969/Runtime_105969.csproj new file mode 100644 index 00000000000000..de6d5e08882e86 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_105969/Runtime_105969.csproj @@ -0,0 +1,8 @@ + + + True + + + + +