diff --git a/benchmark/benchmark.csproj b/benchmark/benchmark.csproj index e95e7ab..5a3f2de 100644 --- a/benchmark/benchmark.csproj +++ b/benchmark/benchmark.csproj @@ -2,7 +2,7 @@ Exe - net8.0 + net9.0 enable enable true diff --git a/src/Base64ARMUTF8.cs b/src/Base64ARMUTF8.cs index f22f2ec..cb6acbc 100644 --- a/src/Base64ARMUTF8.cs +++ b/src/Base64ARMUTF8.cs @@ -13,6 +13,7 @@ namespace SimdBase64 { + namespace Arm { public static partial class Base64 { /* @@ -65,16 +66,6 @@ private unsafe static void LoadBlock(Block64* b, char* src) b->chunk3 = AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(m7.AsInt16()), m8.AsInt16()).AsByte(); } - /* [MethodImpl(MethodImplOptions.AggressiveInlining)] - - { - ulong m0 = ToBase64Mask(base64Url, ref b->chunk0, ref error); - ulong m1 = ToBase64Mask(base64Url, ref b->chunk1, ref error); - ulong m2 = ToBase64Mask(base64Url, ref b->chunk2, ref error); - ulong m3 = ToBase64Mask(base64Url, ref b->chunk3, ref error); - return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); - }*/ - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe ulong ToBase64Mask(bool base64Url, Block64* b, ref bool error) { @@ -101,10 +92,10 @@ private static unsafe ulong ToBase64Mask(bool base64Url, Block64* b, ref bool er Vector128 loNibbles3 = b->chunk2 & v0f; // Extract higher nibbles - Vector128 hiNibbles0 = ArmBase.ShiftRightLogical(b->chunk0, 4); - Vector128 hiNibbles1 = ArmBase.ShiftRightLogical(b->chunk1, 4); - Vector128 hiNibbles2 = ArmBase.ShiftRightLogical(b->chunk2, 4); - Vector128 hiNibbles3 = ArmBase.ShiftRightLogical(b->chunk3, 4); + Vector128 hiNibbles0 = AdvSimd.ShiftRightLogical(b->chunk0, 4); + Vector128 hiNibbles1 = AdvSimd.ShiftRightLogical(b->chunk1, 4); + Vector128 hiNibbles2 = AdvSimd.ShiftRightLogical(b->chunk2, 4); + Vector128 hiNibbles3 = AdvSimd.ShiftRightLogical(b->chunk3, 4); // Lookup tables for encoding Vector128 lutLo = base64Url @@ -117,28 +108,27 @@ private static unsafe ulong ToBase64Mask(bool base64Url, Block64* b, ref bool er : Vector128.Create((byte)0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); // Lookup for lower and higher nibbles - Vector128 lo0 = ArmBase.LookupVector128(lutLo, loNibbles0); - Vector128 hi0 = ArmBase.LookupVector128(lutHi, hiNibbles0); - Vector128 lo1 = ArmBase.LookupVector128(lutLo, loNibbles1); - Vector128 hi1 = ArmBase.LookupVector128(lutHi, hiNibbles1); - Vector128 lo2 = ArmBase.LookupVector128(lutLo, loNibbles2); - Vector128 hi2 = ArmBase.LookupVector128(lutHi, hiNibbles2); - Vector128 lo3 = ArmBase.LookupVector128(lutLo, loNibbles3); - Vector128 hi3 = ArmBase.LookupVector128(lutHi, hiNibbles3); + Vector128 lo0 = AdvSimd.Arm64.VectorTableLookup(lutLo, loNibbles0); + Vector128 hi0 = AdvSimd.Arm64.VectorTableLookup(lutHi, hiNibbles0); + Vector128 lo1 = AdvSimd.Arm64.VectorTableLookup(lutLo, loNibbles1); + Vector128 hi1 = AdvSimd.Arm64.VectorTableLookup(lutHi, hiNibbles1); + Vector128 lo2 = AdvSimd.Arm64.VectorTableLookup(lutLo, loNibbles2); + Vector128 hi2 = AdvSimd.Arm64.VectorTableLookup(lutHi, hiNibbles2); + Vector128 lo3 = AdvSimd.Arm64.VectorTableLookup(lutLo, loNibbles3); + Vector128 hi3 = AdvSimd.Arm64.VectorTableLookup(lutHi, hiNibbles3); if (base64Url) { - hi0 = ArmBase.BitwiseClear(hi0, underscore0); - hi1 = ArmBase.BitwiseClear(hi1, underscore1); - hi2 = ArmBase.BitwiseClear(hi2, underscore2); - hi3 = ArmBase.BitwiseClear(hi3, underscore3); + hi0 = AdvSimd.BitwiseClear(hi0, underscore0); + hi1 = AdvSimd.BitwiseClear(hi1, underscore1); + hi2 = AdvSimd.BitwiseClear(hi2, underscore2); + hi3 = AdvSimd.BitwiseClear(hi3, underscore3); } // Check for invalid characters - Vector128 checks = ArmBase.MaxAcross(hi0 | hi1 | hi2 | hi3); + // Note that the maxaccross can be replaced. + error = (AdvSimd.Arm64.MaxAcross(hi0 | hi1 | hi2 | hi3).ToScalar() > 0x3); - error = (checks.ToScalar() > 0x3); - - ushort badCharmask = 0; + ulong badCharmask = 0; if (error) { Vector128 test0 = AdvSimd.CompareTest(lo0, hi0); @@ -151,7 +141,7 @@ private static unsafe ulong ToBase64Mask(bool base64Url, Block64* b, ref bool er Vector128 sum1 = AdvSimd.Arm64.AddPairwise(test2 & bit_mask, test3 & bit_mask); sum0 = AdvSimd.Arm64.AddPairwise(sum0, sum1); sum0 = AdvSimd.Arm64.AddPairwise(sum0, sum0); - badcharmask = sum0.AsUInt64().ToScalar(); + badCharmask = sum0.AsUInt64().ToScalar(); } Vector128 roll_lut = base64Url @@ -159,24 +149,24 @@ private static unsafe ulong ToBase64Mask(bool base64Url, Block64* b, ref bool er 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0) : Vector128.Create((byte)0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); - Vector128 vsecond_last = base64Url + Vector128 SecondLast = base64Url ? Vector128.Create((byte)0x2d) : Vector128.Create((byte)0x2f); - if (base64_url) { - hiNibbles0 = ArmBase.BitwiseClear(hiNibbles0, underscore0); - hiNibbles1 = ArmBase.BitwiseClear(hiNibbles1, underscore1); - hiNibbles2 = ArmBase.BitwiseClear(hiNibbles2, underscore2); - hiNibbles3 = ArmBase.BitwiseClear(hiNibbles3, underscore3); + if (base64Url) { + hiNibbles0 = AdvSimd.BitwiseClear(hiNibbles0, underscore0); + hiNibbles1 = AdvSimd.BitwiseClear(hiNibbles1, underscore1); + hiNibbles2 = AdvSimd.BitwiseClear(hiNibbles2, underscore2); + hiNibbles3 = AdvSimd.BitwiseClear(hiNibbles3, underscore3); } - Vector128 roll0 = ArmBase.LookupVector128(roll_lut, (b->chunks[0] == vsecond_last) + hiNibbles0); - Vector128 roll1 = ArmBase.LookupVector128(roll_lut, (b->chunks[1] == vsecond_last) + hiNibbles1); - Vector128 roll2 = ArmBase.LookupVector128(roll_lut, (b->chunks[2] == vsecond_last) + hiNibbles2); - Vector128 roll3 = ArmBase.LookupVector128(roll_lut, (b->chunks[3] == vsecond_last) + hiNibbles3); - b->chunks[0] += roll0; - b->chunks[1] += roll1; - b->chunks[2] += roll2; - b->chunks[3] += roll3; - return badcharmask; + Vector128 roll0 = AdvSimd.Arm64.VectorTableLookup(roll_lut, AdvSimd.CompareEqual(b->chunk0, SecondLast) + hiNibbles0); + Vector128 roll1 = AdvSimd.Arm64.VectorTableLookup(roll_lut, AdvSimd.CompareEqual(b->chunk1, SecondLast) + hiNibbles1); + Vector128 roll2 = AdvSimd.Arm64.VectorTableLookup(roll_lut, AdvSimd.CompareEqual(b->chunk2, SecondLast) + hiNibbles2); + Vector128 roll3 = AdvSimd.Arm64.VectorTableLookup(roll_lut, AdvSimd.CompareEqual(b->chunk3, SecondLast) + hiNibbles3); + b->chunk0 += roll0; + b->chunk1 += roll1; + b->chunk2 += roll2; + b->chunk3 += roll3; + return badCharmask; } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -217,7 +207,7 @@ private static unsafe void Compress(Vector128 data, ushort mask, byte* out shufmask = shufmask + Vector128.Create(0x08080808, 0x08080808, 0, 0).AsSByte(); // this is the version "nearly pruned" - Vector128 pruned = Ssse3.Shuffle(data.AsSByte(), shufmask); + Vector128 pruned = AdvSimd.Arm64.VectorTableLookup(data.AsSByte(), shufmask); // we still need to put the two halves together. // we compute the popcount of the first half: int pop1 = Tables.BitsSetTable256mul2[mask1]; @@ -230,7 +220,7 @@ private static unsafe void Compress(Vector128 data, ushort mask, byte* out { Vector128 compactmask = Vector128.Load(tablePtr + pop1 * 8); - Vector128 answer = Ssse3.Shuffle(pruned.AsByte(), compactmask); + Vector128 answer = AdvSimd.Arm64.VectorTableLookup(pruned.AsByte(), compactmask); Vector128.Store(answer, output); } } @@ -243,80 +233,34 @@ private static unsafe void CopyBlock(Block64* b, byte* output) Vector128.Store(b->chunk2, output + 32); Vector128.Store(b->chunk3, output + 48); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Base64DecodeBlockSafe(byte* outPtr, Block64* b) - { - Base64Decode(outPtr, b->chunk0); - Base64Decode(outPtr + 12, b->chunk1); - Base64Decode(outPtr + 24, b->chunk2); - byte[] buffer = new byte[16]; - - // Safe memory copy for the last part of the data - fixed (byte* bufferStart = buffer) - { - Base64Decode(bufferStart, b->chunk3); - Buffer.MemoryCopy(bufferStart, outPtr + 36, 12, 12); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe static void Base64Decode(byte* output, Vector128 input) + private static unsafe void Base64DecodeBlock(byte* outPtr, byte* srcPtr) { - // credit: aqrit - Vector128 packShuffle = Vector128.Create(2, 1, 0, 6, - 5, 4, 10, 9, - 8, 14, 13, 12, - -1, -1, -1, -1); - - // Perform the initial multiply and add operation across unsigned 8-bit integers. - Vector128 t0 = Ssse3.MultiplyAddAdjacent(input, Vector128.Create((Int32)0x01400140).AsSByte()); + // Load 4 vectors from src + var (str0, str1, str2, str3) = AdvSimd.Arm64.Load4xVector128AndUnzip(srcPtr); - // Perform another multiply and add to finalize the byte positions. - Vector128 t1 = Sse2.MultiplyAddAdjacent(t0, Vector128.Create((Int32)0x00011000).AsInt16()); - // Shuffle the bytes according to the packShuffle pattern. - Vector128 t2 = Ssse3.Shuffle(t1.AsSByte(), packShuffle).AsByte(); - // Store the output. This writes 16 bytes, but we only need 12. - Vector128.Store(t2, output); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Base64DecodeBlock(byte* outPtr, byte* srcPtr) - { - Base64Decode(outPtr, Vector128.Load(srcPtr)); - Base64Decode(outPtr + 12, Vector128.Load(srcPtr + 16)); - Base64Decode(outPtr + 24, Vector128.Load(srcPtr + 32)); - Base64Decode(outPtr + 36, Vector128.Load(srcPtr + 48)); - } + // Perform bitwise operations to simulate NEON intrinsics + Vector128 outvec0 = AdvSimd.Or( + AdvSimd.ShiftLeftLogical(str0, 2), + AdvSimd.ShiftRightLogical(str1, 4) + ); - // Function to decode a Base64 block into binary data. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Base64DecodeBlock(byte* output, Block64* block) - { - Base64Decode(output, block->chunk0); - Base64Decode(output + 12, block->chunk1); - Base64Decode(output + 24, block->chunk2); - Base64Decode(output + 36, block->chunk3); - } + Vector128 outvec1 = AdvSimd.Or( + AdvSimd.ShiftLeftLogical(str1, 4), + AdvSimd.ShiftRightLogical(str2, 2) + ); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void Base64DecodeBlockSafe(byte* outPtr, byte* srcPtr) - { - Base64Decode(outPtr, Vector128.Load(srcPtr)); - Base64Decode(outPtr + 12, Vector128.Load(srcPtr + 16)); - Base64Decode(outPtr + 24, Vector128.Load(srcPtr + 32)); - Vector128 tempBlock = Vector128.Load(srcPtr + 48); - byte[] buffer = new byte[16]; - fixed (byte* bufferPtr = buffer) - { - Base64Decode(bufferPtr, tempBlock); + Vector128 outvec2 = AdvSimd.Or( + AdvSimd.ShiftLeftLogical(str2, 6), + str3 + ); - // Copy only the first 12 bytes of the decoded fourth block into the output buffer, offset by 36 bytes. - // This step is necessary because the fourth block may not need all 16 bytes if it contains padding characters. - Buffer.MemoryCopy(bufferPtr, outPtr + 36, 12, 12);// DEGUG:Uncomment - } + // Store the result in outData + AdvSimd.Arm64.Store(outPtr, (outvec0, outvec1, outvec2)); } // Caller is responsible for checking tha (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) @@ -356,7 +300,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp int bytesToProcess = source.Length; // skip trailing spaces - while (bytesToProcess > 0 && Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) + while (bytesToProcess > 0 && SimdBase64.Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) { bytesToProcess--; whiteSpaces++; @@ -367,7 +311,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp { bytesToProcess -= 1; equalsigns++; - while (bytesToProcess > 0 && Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) + while (bytesToProcess > 0 && SimdBase64.Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) { bytesToProcess--; whiteSpaces++; @@ -416,7 +360,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp int remainderBytesWritten = 0; OperationStatus result = - Base64WithWhiteSpaceToBinaryScalar(source.Slice(Math.Max(0, bytesConsumed)), dest.Slice(Math.Max(0, bytesWritten)), out remainderBytesConsumed, out remainderBytesWritten, isUrl); + SimdBase64.Base64.Base64WithWhiteSpaceToBinaryScalar(source.Slice(Math.Max(0, bytesConsumed)), dest.Slice(Math.Max(0, bytesWritten)), out remainderBytesConsumed, out remainderBytesWritten, isUrl); bytesConsumed += remainderBytesConsumed; bytesWritten += remainderBytesWritten; @@ -433,25 +377,12 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp } - else if (bufferPtr != startOfBuffer) + else { CopyBlock(&b, bufferPtr); bufferPtr += 64; bufferBytesConsumed += 64; } - else - { - if (dst >= endOfSafe64ByteZone) - { - Base64DecodeBlockSafe(dst, &b); - } - else - { - Base64DecodeBlock(dst, &b); - } - bufferBytesWritten += 48; - dst += 48; - } if (bufferPtr >= (blocksSize - 1) * 64 + startOfBuffer) // We treat the last block separately later on { @@ -461,16 +392,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp bufferBytesWritten += 48; dst += 48; } - if (dst >= endOfSafe64ByteZone) // for the second to last block, we may need to chcek if its unsafe to proceed - { - Base64DecodeBlockSafe(dst, startOfBuffer + (blocksSize - 2) * 64); - } - else - { - Base64DecodeBlock(dst, startOfBuffer + (blocksSize - 2) * 64); - } - - + Base64DecodeBlock(dst, startOfBuffer + (blocksSize - 2) * 64); dst += 48; Buffer.MemoryCopy(startOfBuffer + (blocksSize - 1) * 64, startOfBuffer, 64, 64); @@ -502,7 +424,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp int remainderBytesWritten = 0; OperationStatus result = - Base64WithWhiteSpaceToBinaryScalar(source.Slice(Math.Max(0, bytesConsumed)), dest.Slice(Math.Max(0, bytesWritten)), out remainderBytesConsumed, out remainderBytesWritten, isUrl); + SimdBase64.Base64.Base64WithWhiteSpaceToBinaryScalar(source.Slice(Math.Max(0, bytesConsumed)), dest.Slice(Math.Max(0, bytesWritten)), out remainderBytesConsumed, out remainderBytesWritten, isUrl); bytesConsumed += remainderBytesConsumed; bytesWritten += remainderBytesWritten; @@ -517,15 +439,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp byte* subBufferPtr = startOfBuffer; for (; subBufferPtr + 64 <= bufferPtr; subBufferPtr += 64) { - if (dst >= endOfSafe64ByteZone) - { - Base64DecodeBlockSafe(dst, subBufferPtr); - } - else - { - Base64DecodeBlock(dst, subBufferPtr); - } - + Base64DecodeBlock(dst, subBufferPtr); dst += 48;// 64 bits of base64 decodes to 48 bits } if ((bufferPtr - subBufferPtr) % 64 != 0) @@ -631,7 +545,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMRegular(ReadOnlySp int remainderBytesWritten = 0; OperationStatus result = - Base64WithWhiteSpaceToBinaryScalar(source.Slice(bytesConsumed), dest.Slice(bytesWritten), out remainderBytesConsumed, out remainderBytesWritten, isUrl); + SimdBase64.Base64.Base64WithWhiteSpaceToBinaryScalar(source.Slice(bytesConsumed), dest.Slice(bytesWritten), out remainderBytesConsumed, out remainderBytesWritten, isUrl); if (result == OperationStatus.InvalidData) @@ -696,7 +610,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan 0 && Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) + while (bytesToProcess > 0 && SimdBase64.Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) { bytesToProcess--; whiteSpaces++; @@ -707,7 +621,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan 0 && Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) + while (bytesToProcess > 0 && SimdBase64.Base64.IsAsciiWhiteSpace((char)source[bytesToProcess - 1])) { bytesToProcess--; whiteSpaces++; @@ -756,7 +670,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan= endOfSafe64ByteZone) - { - Base64DecodeBlockSafe(dst, &b); - } - else - { - Base64DecodeBlock(dst, &b); - } - bufferBytesWritten += 48; - dst += 48; - } if (bufferPtr >= (blocksSize - 1) * 64 + startOfBuffer) // We treat the last block separately later on { @@ -801,16 +702,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan= endOfSafe64ByteZone) // for the second to last block, we may need to chcek if its unsafe to proceed - { - Base64DecodeBlockSafe(dst, startOfBuffer + (blocksSize - 2) * 64); - } - else - { - Base64DecodeBlock(dst, startOfBuffer + (blocksSize - 2) * 64); - } - - + Base64DecodeBlock(dst, startOfBuffer + (blocksSize - 2) * 64); dst += 48; Buffer.MemoryCopy(startOfBuffer + (blocksSize - 1) * 64, startOfBuffer, 64, 64); @@ -842,7 +734,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan= endOfSafe64ByteZone) - { - Base64DecodeBlockSafe(dst, subBufferPtr); - } - else - { - Base64DecodeBlock(dst, subBufferPtr); - } + Base64DecodeBlock(dst, subBufferPtr); dst += 48;// 64 bits of base64 decodes to 48 bits } @@ -971,7 +856,7 @@ private unsafe static OperationStatus InnerDecodeFromBase64ARMUrl(ReadOnlySpan Library - net8.0 + net9.0 enable true diff --git a/test/tests.csproj b/test/tests.csproj index ca89c20..860d5cc 100644 --- a/test/tests.csproj +++ b/test/tests.csproj @@ -1,7 +1,7 @@ - net8.0 + net9.0 enable enable