From 9b12ce2dd977931c801180527d70b0153e4c198e Mon Sep 17 00:00:00 2001 From: macaba <1031306+macaba@users.noreply.github.com> Date: Sat, 21 Sep 2024 12:38:25 +0100 Subject: [PATCH] Shuffle scalar fallbacks added --- source/TS.NET.Benchmarks/Program.cs | 6 +- source/TS.NET.Benchmarks/README.md | 16 ++ .../TS.NET.Benchmarks/ShuffleI8Benchmark.cs | 9 +- source/TS.NET.Engine/Program.cs | 19 +- .../TS.NET.Engine/Tasks/ProcessingThread.cs | 49 ++--- source/TS.NET.Tests/ShuffleI8Tests.cs | 19 +- source/TS.NET/Processing/ShuffleI8.cs | 196 ++++++++++++------ 7 files changed, 211 insertions(+), 103 deletions(-) diff --git a/source/TS.NET.Benchmarks/Program.cs b/source/TS.NET.Benchmarks/Program.cs index f38f436..26de802 100644 --- a/source/TS.NET.Benchmarks/Program.cs +++ b/source/TS.NET.Benchmarks/Program.cs @@ -4,12 +4,12 @@ DefaultConfig.Instance.WithOptions(ConfigOptions.JoinSummary); //_ = BenchmarkRunner.Run(typeof(Program).Assembly); -//_ = BenchmarkRunner.Run(); +_ = BenchmarkRunner.Run(); //_ = BenchmarkRunner.Run(); //_ = BenchmarkRunner.Run(); //_ = BenchmarkRunner.Run(); //_ = BenchmarkRunner.Run(); -_ = BenchmarkRunner.Run(); +//_ = BenchmarkRunner.Run(); //_ = BenchmarkRunner.Run(); -_ = BenchmarkRunner.Run(); +//_ = BenchmarkRunner.Run(); Console.ReadKey(); \ No newline at end of file diff --git a/source/TS.NET.Benchmarks/README.md b/source/TS.NET.Benchmarks/README.md index 9a33d12..ee31ccf 100644 --- a/source/TS.NET.Benchmarks/README.md +++ b/source/TS.NET.Benchmarks/README.md @@ -1,3 +1,19 @@ +## ShuffleI8 + +Scalar processing + +| Method | Mean | Error | StdDev | Allocated | +|----------------------------------- |---------:|--------:|--------:|----------:| +| 'Four channel shuffle (125 x 8MS)' | 226.7 ms | 0.37 ms | 0.31 ms | 133 B | +| 'Two channel shuffle (125 x 8MS)' | 238.2 ms | 0.12 ms | 0.10 ms | 21 B | + +AVX2 processing + +| Method | Mean | Error | StdDev | Allocated | +|------------------------------------------------ |---------:|---------:|---------:|----------:| +| 'Four channel shuffle (125 x 8MS)' | 34.85 ms | 0.052 ms | 0.047 ms | 7 B | +| 'Two channel shuffle (125 x 8MS)' | 37.77 ms | 0.099 ms | 0.092 ms | 29 B | + ## RisingEdgeTriggerI8 Scalar processing diff --git a/source/TS.NET.Benchmarks/ShuffleI8Benchmark.cs b/source/TS.NET.Benchmarks/ShuffleI8Benchmark.cs index 1a6ecac..e57ec75 100644 --- a/source/TS.NET.Benchmarks/ShuffleI8Benchmark.cs +++ b/source/TS.NET.Benchmarks/ShuffleI8Benchmark.cs @@ -12,6 +12,7 @@ public class ShuffleI8Benchmark private const int byteBufferSize = 8000000; private readonly Memory input = new sbyte[byteBufferSize]; private readonly Memory output = new sbyte[byteBufferSize]; + private ShuffleI8 shuffle = new ShuffleI8(false); [GlobalSetup] public void Setup() @@ -19,11 +20,11 @@ public void Setup() Waveforms.FourChannelCountSignedByte(input.Span); } - [Benchmark(Description = "Four channel shuffle [production] (125 x 8MS)")] + [Benchmark(Description = "Four channel shuffle (125 x 8MS)")] public void FourChannels() { for (int i = 0; i < 125; i++) - ShuffleI8.FourChannels(input.Span, output.Span); + shuffle.FourChannels(input.Span, output.Span); } //[Benchmark(Description = "Four channel shuffle [run length 1, baseline] (125 x 8MS)")] @@ -89,11 +90,11 @@ public void FourChannels() // Shuffle.FourChannelsRunLength32NoSimd(input.Span, output.Span); //} - [Benchmark(Description = "Two channel shuffle [production] (125 x 8MS)")] + [Benchmark(Description = "Two channel shuffle (125 x 8MS)")] public void TwoChannels() { for (int i = 0; i < 125; i++) - ShuffleI8.TwoChannels(input.Span, output.Span); + shuffle.TwoChannels(input.Span, output.Span); } //[Benchmark(Description = "Two channel shuffle [run length 1,variant A] (125 x 8MS)")] diff --git a/source/TS.NET.Engine/Program.cs b/source/TS.NET.Engine/Program.cs index f202abe..5af1765 100644 --- a/source/TS.NET.Engine/Program.cs +++ b/source/TS.NET.Engine/Program.cs @@ -2,6 +2,7 @@ using Microsoft.Extensions.Logging; using NReco.Logging.File; using System.CommandLine; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using TS.NET; @@ -71,18 +72,22 @@ static void Start(int deviceIndex, string configurationFilePath) }); var logger = loggerFactory.CreateLogger("TS.NET.Engine"); - // Validation of CPU architecture - if (!Avx2.IsSupported) + if (RuntimeInformation.ProcessArchitecture == Architecture.X86 || RuntimeInformation.ProcessArchitecture == Architecture.X64) { - if (AdvSimd.Arm64.IsSupported) + if (!Avx2.IsSupported) { - logger?.LogCritical("AArch64 not yet supported."); - return; + logger?.LogWarning("x86/x64 CPU without AVX2. CPU load will be high."); + } + } + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + if (!AdvSimd.Arm64.IsSupported) + { + logger?.LogWarning("AArch64 CPU without Neon. CPU load will be high."); } else { - logger?.LogCritical("CPU does not support AVX2."); - return; + logger?.LogWarning("AArch64 CPU with Neon. Neon hot paths not implemented. CPU load will be high."); } } diff --git a/source/TS.NET.Engine/Tasks/ProcessingThread.cs b/source/TS.NET.Engine/Tasks/ProcessingThread.cs index 44a0211..86e5cac 100644 --- a/source/TS.NET.Engine/Tasks/ProcessingThread.cs +++ b/source/TS.NET.Engine/Tasks/ProcessingThread.cs @@ -103,15 +103,15 @@ private static void Loop( // Shuffle buffers. Only needed for 2/4 channel modes. Span shuffleBuffer = new sbyte[ThunderscopeMemory.Length]; // --2 channel buffers - int blockLength_2 = (int)ThunderscopeMemory.Length / 2; - Span postShuffleCh1_2 = shuffleBuffer.Slice(0, blockLength_2); - Span postShuffleCh2_2 = shuffleBuffer.Slice(blockLength_2, blockLength_2); + int blockLength_2Ch = (int)ThunderscopeMemory.Length / 2; + Span shuffleBuffer2Ch_1 = shuffleBuffer.Slice(0, blockLength_2Ch); + Span shuffleBuffer2Ch_2 = shuffleBuffer.Slice(blockLength_2Ch, blockLength_2Ch); // --4 channel buffers - int blockLength_4 = (int)ThunderscopeMemory.Length / 4; - Span postShuffleCh1_4 = shuffleBuffer.Slice(0, blockLength_4); - Span postShuffleCh2_4 = shuffleBuffer.Slice(blockLength_4, blockLength_4); - Span postShuffleCh3_4 = shuffleBuffer.Slice(blockLength_4 * 2, blockLength_4); - Span postShuffleCh4_4 = shuffleBuffer.Slice(blockLength_4 * 3, blockLength_4); + int blockLength_4Ch = (int)ThunderscopeMemory.Length / 4; + Span shuffleBuffer4Ch_1 = shuffleBuffer.Slice(0, blockLength_4Ch); + Span shuffleBuffer4Ch_2 = shuffleBuffer.Slice(blockLength_4Ch, blockLength_4Ch); + Span shuffleBuffer4Ch_3 = shuffleBuffer.Slice(blockLength_4Ch * 2, blockLength_4Ch); + Span shuffleBuffer4Ch_4 = shuffleBuffer.Slice(blockLength_4Ch * 3, blockLength_4Ch); Span captureEndIndices = new uint[ThunderscopeMemory.Length / 1000]; // 1000 samples is the minimum window width // Periodic debug display variables @@ -140,6 +140,7 @@ private static void Loop( AdcChannelMode cachedAdcChannelMode = AdcChannelMode.Quad; IEdgeTriggerI8 edgeTriggerI8 = new RisingEdgeTriggerI8(); + ShuffleI8 shuffle = new ShuffleI8(); bool runMode = true; bool forceTriggerLatch = false; // "Latch" because it will reset state back to false. If the force is invoked and a trigger happens anyway, it will be reset (effectively ignoring it and only updating the bridge once). bool singleTriggerLatch = false; // "Latch" because it will reset state back to false. When reset, runTrigger will be set to false. @@ -364,13 +365,13 @@ private static void Loop( break; case AdcChannelMode.Dual: // Shuffle - ShuffleI8.TwoChannels(input: inputDataDto.Memory.SpanI8, output: shuffleBuffer); + shuffle.TwoChannels(input: inputDataDto.Memory.SpanI8, output: shuffleBuffer); // Finished with the memory, return it inputChannel.Write(inputDataDto.Memory); // Write to circular buffer - circularBuffer1.Write(postShuffleCh1_2); - circularBuffer2.Write(postShuffleCh2_2); - streamSampleCounter += postShuffleCh1_2.Length; + circularBuffer1.Write(shuffleBuffer2Ch_1); + circularBuffer2.Write(shuffleBuffer2Ch_2); + streamSampleCounter += shuffleBuffer2Ch_1.Length; // Trigger if (runMode) { @@ -381,9 +382,9 @@ private static void Loop( case TriggerMode.Auto: if (hardwareConfig.IsTriggerChannelAnEnabledChannel(processingConfig.TriggerChannel)) { - var triggerChannelBuffer = postShuffleCh2_2; + var triggerChannelBuffer = shuffleBuffer2Ch_2; if (hardwareConfig.DualChannelModeIsTriggerChannelInFirstPosition(processingConfig.TriggerChannel)) - triggerChannelBuffer = postShuffleCh1_2; + triggerChannelBuffer = shuffleBuffer2Ch_1; uint captureEndCount = 0; edgeTriggerI8.Process(input: triggerChannelBuffer, captureEndIndices: captureEndIndices, out captureEndCount); @@ -435,15 +436,15 @@ private static void Loop( break; case AdcChannelMode.Quad: // Shuffle - ShuffleI8.FourChannels(input: inputDataDto.Memory.SpanI8, output: shuffleBuffer); + shuffle.FourChannels(input: inputDataDto.Memory.SpanI8, output: shuffleBuffer); // Finished with the memory, return it inputChannel.Write(inputDataDto.Memory); // Write to circular buffer - circularBuffer1.Write(postShuffleCh1_4); - circularBuffer2.Write(postShuffleCh2_4); - circularBuffer3.Write(postShuffleCh3_4); - circularBuffer4.Write(postShuffleCh4_4); - streamSampleCounter += postShuffleCh1_4.Length; + circularBuffer1.Write(shuffleBuffer4Ch_1); + circularBuffer2.Write(shuffleBuffer4Ch_2); + circularBuffer3.Write(shuffleBuffer4Ch_3); + circularBuffer4.Write(shuffleBuffer4Ch_4); + streamSampleCounter += shuffleBuffer4Ch_1.Length; // Trigger if (runMode) { @@ -456,10 +457,10 @@ private static void Loop( { var triggerChannelBuffer = processingConfig.TriggerChannel switch { - TriggerChannel.Channel1 => postShuffleCh1_4, - TriggerChannel.Channel2 => postShuffleCh2_4, - TriggerChannel.Channel3 => postShuffleCh3_4, - TriggerChannel.Channel4 => postShuffleCh4_4, + TriggerChannel.Channel1 => shuffleBuffer4Ch_1, + TriggerChannel.Channel2 => shuffleBuffer4Ch_2, + TriggerChannel.Channel3 => shuffleBuffer4Ch_3, + TriggerChannel.Channel4 => shuffleBuffer4Ch_4, _ => throw new ArgumentException("Invalid TriggerChannel value") }; diff --git a/source/TS.NET.Tests/ShuffleI8Tests.cs b/source/TS.NET.Tests/ShuffleI8Tests.cs index 0abc4fe..31e54c3 100644 --- a/source/TS.NET.Tests/ShuffleI8Tests.cs +++ b/source/TS.NET.Tests/ShuffleI8Tests.cs @@ -5,6 +5,8 @@ namespace TS.NET.Tests { public class ShuffleI8Tests { + const bool forceScalar = false; + [Fact] public void ShuffleI8_FourChannels_Samples64() { @@ -12,7 +14,8 @@ public void ShuffleI8_FourChannels_Samples64() ReadOnlySpan input = [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]; Span output = new sbyte[length]; - ShuffleI8.FourChannels(input, output); + var shuffle = new ShuffleI8(forceScalar); + shuffle.FourChannels(input, output); Span expectedOutput = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]; @@ -36,7 +39,8 @@ public void ShuffleI8_FourChannels_Samples128() } Span output = new sbyte[length]; - ShuffleI8.FourChannels(input, output); + var shuffle = new ShuffleI8(forceScalar); + shuffle.FourChannels(input, output); Span expectedOutput = new sbyte[length]; var runLength = length / 4; @@ -65,7 +69,8 @@ public void ShuffleI8_FourChannels_Samples8388608() } Span output = new sbyte[length]; - ShuffleI8.FourChannels(input, output); + var shuffle = new ShuffleI8(forceScalar); + shuffle.FourChannels(input, output); Span expectedOutput = new sbyte[length]; var runLength = length / 4; @@ -94,6 +99,7 @@ public void ShuffleI8_FourChannels_RunLength1_VariantA_Samples128() } Span output = new sbyte[length]; + var shuffle = new ShuffleI8(forceScalar); ShuffleI8.FourChannelsRunLength1VariantA(input, output); Span expectedOutput = new sbyte[length]; @@ -247,6 +253,7 @@ public void ShuffleI8_FourChannels_RunLength32_Samples1024() i += 32; } Span output = new sbyte[length]; + var shuffle = new ShuffleI8(forceScalar); ShuffleI8.FourChannelsRunLength32(input, output); for (int i = 0; i < 256; i++) @@ -279,7 +286,8 @@ public void ShuffleI8_TwoChannels_Samples64() } Span output = new sbyte[length]; - ShuffleI8.TwoChannels(input, output); + var shuffle = new ShuffleI8(forceScalar); + shuffle.TwoChannels(input, output); Span expectedOutput = new sbyte[length]; var runLength = length / 2; @@ -304,7 +312,8 @@ public void ShuffleI8_TwoChannels_Samples8388608() } Span output = new sbyte[length]; - ShuffleI8.TwoChannels(input, output); + var shuffle = new ShuffleI8(forceScalar); + shuffle.TwoChannels(input, output); Span expectedOutput = new sbyte[length]; var runLength = length / 2; diff --git a/source/TS.NET/Processing/ShuffleI8.cs b/source/TS.NET/Processing/ShuffleI8.cs index 57b0f93..1624a37 100644 --- a/source/TS.NET/Processing/ShuffleI8.cs +++ b/source/TS.NET/Processing/ShuffleI8.cs @@ -6,86 +6,162 @@ namespace TS.NET; public class ShuffleI8 { - public static void FourChannels(ReadOnlySpan input, Span output) + enum Architecture { Scalar, AVX2 } // Architectures supported by shuffle processing + private readonly Architecture processingArchitecture; + + public ShuffleI8(bool forceScalar = false) + { + if (Avx2.IsSupported) + processingArchitecture = Architecture.AVX2; + else + processingArchitecture = Architecture.Scalar; + + if (forceScalar) + processingArchitecture = Architecture.Scalar; + } + + public void FourChannels(ReadOnlySpan input, Span output) { - if (input.Length % 64 != 0) - throw new ArgumentException($"Input length must be multiple of 64"); if (input.Length != output.Length) throw new ArgumentException("Array lengths must match"); - Vector256 shuffleMask = Vector256.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15).AsSByte(); - Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7); - int channelBlockSize8B = output.Length / 4; - int ch2Offset64B = channelBlockSize8B / 8; - int ch3Offset64B = (channelBlockSize8B * 2) / 8; - int ch4Offset64B = (channelBlockSize8B * 3) / 8; - unsafe + int channelBlockSizeBytes = output.Length / 4; + + if (processingArchitecture == Architecture.AVX2) { - fixed (sbyte* inputP = input) - fixed (sbyte* outputP = output) + if (input.Length % 64 != 0) + throw new ArgumentException($"Input length must be multiple of 64"); + + int ch2Offset64b = channelBlockSizeBytes / 8; + int ch3Offset64b = (channelBlockSizeBytes * 2) / 8; + int ch4Offset64b = (channelBlockSizeBytes * 3) / 8; + Vector256 shuffleMask = Vector256.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15).AsSByte(); + Vector256 permuteMask = Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7); + unsafe { - sbyte* inputPtr = inputP; - ulong* outputPtr = (ulong*)outputP; - sbyte* finishPtr = inputP + input.Length; - while (inputPtr < finishPtr) + fixed (sbyte* inputP = input) + fixed (sbyte* outputP = output) { - // Note: x2 unroll seems to be the sweet spot in benchmarks - var shuffled1 = Avx2.Shuffle(Avx.LoadVector256(inputPtr), shuffleMask); // shuffled1 = <1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4> - var shuffled2 = Avx2.Shuffle(Avx.LoadVector256(inputPtr + 32), shuffleMask); - var permuted1 = Avx2.PermuteVar8x32(shuffled1.AsInt32(), permuteMask); // permuted1 = <16843009, 16843009, 33686018, 33686018, 50529027, 50529027, 67372036, 67372036> - var permuted2 = Avx2.PermuteVar8x32(shuffled2.AsInt32(), permuteMask); - var permuted1_64 = permuted1.AsUInt64(); - var permuted2_64 = permuted2.AsUInt64(); - - outputPtr[0] = permuted1_64[0]; - outputPtr[1] = permuted2_64[0]; - - outputPtr[0 + ch2Offset64B] = permuted1_64[1]; - outputPtr[1 + ch2Offset64B] = permuted2_64[1]; - - outputPtr[0 + ch3Offset64B] = permuted1_64[2]; - outputPtr[1 + ch3Offset64B] = permuted2_64[2]; - - outputPtr[0 + ch4Offset64B] = permuted1_64[3]; - outputPtr[1 + ch4Offset64B] = permuted2_64[3]; + sbyte* inputPtr = inputP; + ulong* outputPtr = (ulong*)outputP; + sbyte* finishPtr = inputP + input.Length; + while (inputPtr < finishPtr) + { + // Note: x2 unroll seems to be the sweet spot in benchmarks + var shuffled1 = Avx2.Shuffle(Avx.LoadVector256(inputPtr), shuffleMask); // shuffled1 = <1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4> + var shuffled2 = Avx2.Shuffle(Avx.LoadVector256(inputPtr + 32), shuffleMask); + var permuted1 = Avx2.PermuteVar8x32(shuffled1.AsInt32(), permuteMask); // permuted1 = <16843009, 16843009, 33686018, 33686018, 50529027, 50529027, 67372036, 67372036> + var permuted2 = Avx2.PermuteVar8x32(shuffled2.AsInt32(), permuteMask); + var permuted1_64 = permuted1.AsUInt64(); + var permuted2_64 = permuted2.AsUInt64(); + + outputPtr[0] = permuted1_64[0]; + outputPtr[1] = permuted2_64[0]; + + outputPtr[0 + ch2Offset64b] = permuted1_64[1]; + outputPtr[1 + ch2Offset64b] = permuted2_64[1]; + + outputPtr[0 + ch3Offset64b] = permuted1_64[2]; + outputPtr[1 + ch3Offset64b] = permuted2_64[2]; + + outputPtr[0 + ch4Offset64b] = permuted1_64[3]; + outputPtr[1 + ch4Offset64b] = permuted2_64[3]; + + inputPtr += 64; + outputPtr += 2; + } + } + } + } + else + { + if (input.Length % 4 != 0) + throw new ArgumentException($"Input length must be multiple of 4"); - inputPtr += 64; - outputPtr += 2; + int ch2Offset8b = channelBlockSizeBytes; + int ch3Offset8b = (channelBlockSizeBytes * 2); + int ch4Offset8b = (channelBlockSizeBytes * 3); + unsafe + { + fixed (sbyte* inputP = input) + fixed (sbyte* outputP = output) + { + sbyte* inputPtr = inputP; + sbyte* outputPtr = outputP; + sbyte* finishPtr = inputP + input.Length; + while (inputPtr < finishPtr) + { + outputPtr[0] = inputPtr[0]; + outputPtr[0 + ch2Offset8b] = inputPtr[1]; + outputPtr[0 + ch3Offset8b] = inputPtr[2]; + outputPtr[0 + ch4Offset8b] = inputPtr[3]; + inputPtr += 4; + outputPtr++; + } } } } } - public static void TwoChannels(ReadOnlySpan input, Span output) + public void TwoChannels(ReadOnlySpan input, Span output) { - if (input.Length % 32 != 0) - throw new ArgumentException($"Length of samples ({input.Length}) is not multiple of 32"); if (input.Length != output.Length) throw new ArgumentException("Array lengths must match"); - Vector256 shuffleMask = Vector256.Create(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).AsSByte(); - Vector256 permuteMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); - Span outputU64 = MemoryMarshal.Cast(output); - int channelBlockSize = outputU64.Length / 2; - unsafe + int channelBlockSizeBytes = output.Length / 2; + + if (processingArchitecture == Architecture.AVX2) { - fixed (sbyte* inputP = input) - fixed (ulong* outputP = outputU64) + if (input.Length % 32 != 0) + throw new ArgumentException($"Length of samples ({input.Length}) is not multiple of 32"); + + int ch2Offset64b = channelBlockSizeBytes / 8; + Vector256 shuffleMask = Vector256.Create(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).AsSByte(); + Vector256 permuteMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); + unsafe { - sbyte* inputPtr = inputP; - ulong* outputPtr = outputP; - sbyte* finishPtr = inputP + input.Length; - while (inputPtr < finishPtr) + fixed (sbyte* inputP = input) + fixed (sbyte* outputP = output) { - var shuffled1 = Avx2.Shuffle(Avx.LoadVector256(inputPtr), shuffleMask); // shuffled1 = <1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2> - var permuted1 = Avx2.PermuteVar8x32(shuffled1.AsInt32(), permuteMask); // permuted1 = <16843009, 16843009, 16843009, 16843009, 33686018, 33686018, 33686018, 33686018> - var permuted1_64 = permuted1.AsUInt64(); - outputPtr[0] = permuted1_64[0]; - outputPtr[1] = permuted1_64[1]; - outputPtr[0 + channelBlockSize] = permuted1_64[2]; - outputPtr[1 + channelBlockSize] = permuted1_64[3]; - inputPtr += 32; - outputPtr += 2; + sbyte* inputPtr = inputP; + ulong* outputPtr = (ulong*)outputP; + sbyte* finishPtr = inputP + input.Length; + while (inputPtr < finishPtr) + { + var shuffled1 = Avx2.Shuffle(Avx.LoadVector256(inputPtr), shuffleMask); // shuffled1 = <1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2> + var permuted1 = Avx2.PermuteVar8x32(shuffled1.AsInt32(), permuteMask); // permuted1 = <16843009, 16843009, 16843009, 16843009, 33686018, 33686018, 33686018, 33686018> + var permuted1_64 = permuted1.AsUInt64(); + outputPtr[0] = permuted1_64[0]; + outputPtr[1] = permuted1_64[1]; + outputPtr[0 + ch2Offset64b] = permuted1_64[2]; + outputPtr[1 + ch2Offset64b] = permuted1_64[3]; + inputPtr += 32; + outputPtr += 2; + } + } + } + } + else + { + if (input.Length % 2 != 0) + throw new ArgumentException($"Input length must be multiple of 2"); + + int ch2Offset8b = channelBlockSizeBytes; + unsafe + { + fixed (sbyte* inputP = input) + fixed (sbyte* outputP = output) + { + sbyte* inputPtr = inputP; + sbyte* outputPtr = outputP; + sbyte* finishPtr = inputP + input.Length; + while (inputPtr < finishPtr) + { + outputPtr[0] = inputPtr[0]; + outputPtr[0 + ch2Offset8b] = inputPtr[1]; + inputPtr += 2; + outputPtr++; + } } } }