Commit a28e981

Optimize logical right shifts for byte values on XArch
1 parent: b6266a6
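In API terms, this commit teaches the XArch JIT to expand `>>>` on byte-element vectors itself instead of rejecting them during intrinsic import. A minimal before/after sketch of the affected pattern (illustrative only, not code from the commit):

```csharp
using System;
using System.Runtime.Intrinsics;

class ByteShiftDemo
{
    static void Main()
    {
        Vector128<byte> source = Vector128.Create((byte)0xAB);

        // With this commit, the JIT accelerates this byte-wise logical right
        // shift directly (via a 32-bit shift + mask, see gentree.cpp below).
        Vector128<byte> direct = source >>> 4;

        // The hand-written workaround callers used before (and that the library
        // changes below delete): shift 32-bit lanes, then mask off the bits
        // that crossed byte boundaries.
        Vector128<byte> emulated = (source.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)0xF);

        Console.WriteLine(direct == emulated); // True
    }
}
```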

File tree

4 files changed: +46, -18 lines changed

src/coreclr/jit/gentree.cpp (39 additions, 1 deletion)

```diff
@@ -19799,7 +19799,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                 simdBaseType = TYP_LONG;
             }
 
-            assert(!varTypeIsByte(simdBaseType));
+            assert(!varTypeIsByte(simdBaseType) || op == GT_RSZ);
 
             // "over shifting" is platform specific behavior. We will match the C# behavior
             // this requires we mask with (sizeof(T) * 8) - 1 which ensures the shift cannot
@@ -19809,13 +19809,23 @@ GenTree* Compiler::gtNewSimdBinOpNode(
             unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;
 
+            GenTree* nonConstantByteShiftCountOp = NULL;
+
             if (op2->IsCnsIntOrI())
             {
                 op2->AsIntCon()->gtIconVal &= shiftCountMask;
             }
             else
             {
                 op2 = gtNewOperNode(GT_AND, TYP_INT, op2, gtNewIconNode(shiftCountMask));
+
+                if (varTypeIsByte(simdBaseType))
+                {
+                    // Save the intermediate "shiftCount & shiftCountMask" value as we will need it to
+                    // calculate which bits we should mask off after the 32-bit shift we use for 8-bit values.
+                    nonConstantByteShiftCountOp = fgMakeMultiUse(&op2);
+                }
+
                 op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_SSE2_ConvertScalarToVector128Int32, CORINFO_TYPE_INT,
                                                16);
             }
@@ -19908,6 +19918,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
                 assert(op == GT_RSZ);
                 intrinsic = NI_SSE2_ShiftRightLogical;
             }
+
+            if (varTypeIsByte(simdBaseType))
+            {
+                assert(op == GT_RSZ);
+
+                // We don't have actual instructions for shifting bytes, so we'll emulate them
+                // by shifting 32-bit values and masking off the bits that should be zeroed.
+                GenTree* maskAmountOp;
+
+                if (op2->IsCnsIntOrI())
+                {
+                    ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
+                    ssize_t mask       = 255 >> shiftCount;
+
+                    maskAmountOp = gtNewIconNode(mask, type);
+                }
+                else
+                {
+                    assert(nonConstantByteShiftCountOp != NULL);
+
+                    maskAmountOp = gtNewOperNode(GT_RSZ, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+                }
+
+                GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
+                GenTree* maskOp  = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+
+                return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+            }
             break;
         }
```
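The `255 >> shiftCount` mask built above is what makes the 32-bit emulation exact: after shifting a 32-bit lane right, the top `shiftCount` bits of each byte hold bits that leaked in from its higher neighbor, and `255 >> shiftCount` keeps exactly the `8 - shiftCount` bits a true byte shift would keep. A scalar C# check of that equivalence (illustration only; the commit generates this pattern inside the JIT rather than in C#):

```csharp
using System;

class ByteShiftMaskDemo
{
    static void Main()
    {
        for (int value = 0; value <= 255; value++)
        {
            for (int shiftCount = 0; shiftCount < 8; shiftCount++)
            {
                // What a true per-byte logical right shift produces.
                byte expected = (byte)(value >>> shiftCount);

                // The emulation: place all-ones in the neighboring byte (worst
                // case for leakage), shift the 32-bit lane, then apply the mask.
                int lane = value | (0xFF << 8);
                byte emulated = (byte)((lane >>> shiftCount) & (255 >> shiftCount));

                if (expected != emulated)
                    throw new Exception($"mismatch at value={value}, shift={shiftCount}");
            }
        }

        Console.WriteLine("32-bit shift + (255 >> shiftCount) mask matches a byte shift");
    }
}
```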

src/coreclr/jit/hwintrinsicxarch.cpp (0 additions, 6 deletions)

```diff
@@ -2636,12 +2636,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         {
             assert(sig->numArgs == 2);
 
-            if (varTypeIsByte(simdBaseType))
-            {
-                // byte and sbyte would require more work to support
-                break;
-            }
-
             if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
             {
                 op2 = impPopStack().val;
```

src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs (3 additions, 3 deletions)

```diff
@@ -879,10 +879,10 @@ private static Vector128<byte> IndexOfAnyLookupCore(Vector128<byte> source, Vect
 
         // On ARM, we have an instruction for an arithmetic right shift of 1-byte signed values.
         // The shift will map values above 127 to values above 16, which the shuffle will then map to 0.
-        // On X86 and WASM, use a 4-byte value shift with AND 15 to emulate a 1-byte value logical shift.
+        // On X86 and WASM, use a logical right shift instead.
         Vector128<byte> highNibbles = AdvSimd.IsSupported
             ? AdvSimd.ShiftRightArithmetic(source.AsSByte(), 4).AsByte()
-            : (source.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)0xF);
+            : source >>> 4;
 
         // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle.
         // Lookup the rows via the lower nibble and the column via the higher nibble.
@@ -913,7 +913,7 @@ private static Vector256<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector
     private static Vector256<byte> IndexOfAnyLookupCore(Vector256<byte> source, Vector256<byte> bitmapLookup)
     {
         // See comments in IndexOfAnyLookupCore(Vector128<byte>) above for more details.
-        Vector256<byte> highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF);
+        Vector256<byte> highNibbles = source >>> 4;
         Vector256<byte> bitMask = Avx2.Shuffle(bitmapLookup, source);
         Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles);
         Vector256<byte> result = bitMask & bitPositions;
```
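Per the comments in the diff above, `bitmapLookup` is an 8x16 table of bits: the low nibble of each source byte selects a row (the shuffle index) and the high nibble selects a column (the bit position). A scalar sketch of the same test for a single ASCII byte (illustrative only; the helper and needle here are made up, the vectorized code above is the real implementation):

```csharp
using System;

class AsciiBitmapDemo
{
    // bitmap[lowNibble] holds one bit per high nibble; nibbles 0..7 cover ASCII.
    static bool IsInNeedle(byte[] bitmap, byte c)
    {
        int lowNibble = c & 0xF;   // row: the shuffle index into the bitmap
        int highNibble = c >>> 4;  // column: computed by "source >>> 4" above

        // Mirrors the Shuffle over Vector256.Create(0x8040201008040201).AsByte(),
        // whose byte view is the table { 1, 2, 4, 8, 16, 32, 64, 128 }.
        byte bitPosition = (byte)(1 << highNibble);

        return (bitmap[lowNibble] & bitPosition) != 0;
    }

    static void Main()
    {
        // Build the bitmap for an example needle.
        byte[] bitmap = new byte[16];
        foreach (char ch in "abc,;")
            bitmap[ch & 0xF] |= (byte)(1 << (ch >>> 4));

        Console.WriteLine(IsInNeedle(bitmap, (byte)'b')); // True
        Console.WriteLine(IsInNeedle(bitmap, (byte)'x')); // False
    }
}
```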

src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs (4 additions, 8 deletions)

```diff
@@ -131,10 +131,9 @@ private static Vector256<byte> ContainsMask32CharsAvx2(Vector256<byte> charMapLo
     [CompExactlyDependsOn(typeof(Avx2))]
     private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
     {
-        // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
-        Vector256<byte> highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15);
+        Vector256<byte> shifted = values >>> VectorizedIndexShift;
 
-        Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble);
+        Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), shifted);
 
         Vector256<byte> index = values & Vector256.Create((byte)VectorizedIndexMask);
         Vector256<byte> bitMaskLower = Avx2.Shuffle(charMapLower, index);
@@ -175,12 +174,9 @@ private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower,
     [CompExactlyDependsOn(typeof(PackedSimd))]
     private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
     {
-        // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
-        Vector128<byte> highNibble = Sse2.IsSupported
-            ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15)
-            : values >>> VectorizedIndexShift;
+        Vector128<byte> shifted = values >>> VectorizedIndexShift;
 
-        Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble);
+        Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), shifted);
 
         Vector128<byte> index = values & Vector128.Create((byte)VectorizedIndexMask);
         Vector128<byte> bitMask;
```
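Both helpers lean on the same constant: read as bytes, `0x8040201008040201` is the table { 1, 2, 4, 8, 16, 32, 64, 128 }, so shuffling it by the shifted values computes `1 << index` per lane without needing a variable vector shift. A quick standalone check (illustration only):

```csharp
using System;
using System.Runtime.Intrinsics;

class BitPositionTableDemo
{
    static void Main()
    {
        // Little-endian: 0x8040201008040201 stores its low byte first, so the
        // byte view is exactly the powers of two 1, 2, 4, ..., 128.
        Vector128<byte> table = Vector128.Create(0x8040201008040201).AsByte();

        for (int i = 0; i < 8; i++)
        {
            byte expected = (byte)(1 << i);
            if (table.GetElement(i) != expected)
                throw new Exception($"table[{i}] != {expected}");
        }

        Console.WriteLine("Shuffling this table by an index i yields 1 << i");
    }
}
```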
