Skip to content

scalar GetIndexOfFirstNonAsciiByte #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 48 additions & 6 deletions benchmark/Benchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ namespace SimdUnicodeBenchmarks
public class Checker
{
List<char[]> names;
List<bool> results;
public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
{
// The runtime as of NET 8.0 has a dedicated method for this, but
// it is not available prior to that, so let us branch.
List<char[]> nonAsciichars;
public List<byte[]> nonAsciiByteArrays; // Declare at the class level

List<bool> results;

public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
{
// The runtime as of NET 8.0 has a dedicated method for this, but
// it is not available prior to that, so let us branch.
#if NET8_0_OR_GREATER
return Ascii.IsValid(s);
#else
Expand Down Expand Up @@ -49,23 +53,45 @@ public static char[] GetRandomASCIIString(uint n)
return chars;
}

/// <summary>
/// Builds a character array of length <paramref name="n"/> drawn from a fixed
/// alphabet that mixes ASCII letters/digits with a handful of Latin-1 and
/// Latin Extended-A characters.
/// </summary>
/// <remarks>
/// The generator is re-created with the same seed on every call, so two calls
/// with the same <paramref name="n"/> return identical content. Because most of
/// the alphabet is ASCII, a short result is not guaranteed to contain any
/// non-ASCII character.
/// </remarks>
/// <param name="n">Number of characters to produce.</param>
/// <returns>A freshly allocated array of <paramref name="n"/> characters.</returns>
public static char[] GetRandomNonASCIIString(uint n)
{
    const string alphabet = "abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ01234567é89šžŸũŭůűųŷŹźŻżŽ";

    // Fixed seed keeps the benchmark input reproducible from run to run.
    Random picker = new Random(12345);
    char[] buffer = new char[n];

    int position = 0;
    while (position < buffer.Length)
    {
        int choice = picker.Next(0, alphabet.Length);
        buffer[position] = alphabet[choice];
        position++;
    }

    return buffer;
}

[Params(100, 200, 500)]


[Params(100, 200, 500,1000,2000)]
public uint N;


/// <summary>
/// Prepares the benchmark inputs: 100 ASCII character arrays, 100 UTF-8 byte
/// arrays encoded from the mixed ASCII/non-ASCII generator, and a matching
/// list of result slots initialised to <c>false</c>. Every input is built
/// from <c>N</c> characters.
/// </summary>
[GlobalSetup]
public void Setup()
{
    const int sampleCount = 100;

    names = new List<char[]>();
    nonAsciiByteArrays = new List<byte[]>();
    results = new List<bool>();

    int built = 0;
    while (built < sampleCount)
    {
        names.Add(GetRandomASCIIString(N));

        // The byte-oriented benchmark consumes UTF-8, so encode the chars up front
        // rather than inside the measured code.
        byte[] encoded = Encoding.UTF8.GetBytes(GetRandomNonASCIIString(N));
        nonAsciiByteArrays.Add(encoded);

        results.Add(false);
        built++;
    }
}


[Benchmark]
public void FastUnicodeIsAscii()
{
Expand Down Expand Up @@ -98,6 +124,22 @@ public void RuntimeIsAscii()
count += 1;
}
}
/// <summary>
/// Runs <c>Ascii.GetIndexOfFirstNonAsciiByte</c> once over every pre-built
/// UTF-8 byte array in <c>nonAsciiByteArrays</c>. The returned index is not used.
/// </summary>
[Benchmark]
public void TestErrorGetIndexOfFirstNonAsciiByteBenchmark()
{
    unsafe
    {
        foreach (byte[] payload in nonAsciiByteArrays)
        {
            // Pin the array so a raw pointer can be handed to the scanner.
            fixed (byte* start = payload)
            {
                nuint firstNonAscii = Ascii.GetIndexOfFirstNonAsciiByte(start, (nuint)payload.Length);
            }
        }
    }
}



}

Expand Down
42 changes: 42 additions & 0 deletions src/Ascii.cs
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,49 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
}
return true;
}

/// <summary>
/// Scalar search for the first byte whose high bit is set (value &gt;= 0x80).
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>
/// The zero-based offset of the first non-ASCII byte, or
/// <paramref name="bufferLength"/> when every byte is ASCII.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
{
    const ulong highBits = 0x8080808080808080;

    byte* limit = pBuffer + bufferLength;
    byte* cursor = pBuffer;

    // Fast path: step over 16-byte blocks while none of their bytes has the high bit set.
    // Two 8-byte loads are OR-ed together so a single mask test covers the whole block.
    while (cursor + 16 <= limit)
    {
        ulong merged = *(ulong*)cursor | *(ulong*)(cursor + 8);
        if ((merged & highBits) != 0)
        {
            // The offending byte is somewhere in this block; the byte scan below pinpoints it.
            break;
        }

        cursor += 16;
    }

    // Byte-wise scan: either locates the byte inside the flagged block,
    // or checks the (fewer than 16) trailing bytes that did not fill a block.
    while (cursor < limit)
    {
        if (*cursor >= 0b10000000)
        {
            return (nuint)(cursor - pBuffer);
        }

        cursor++;
    }

    return bufferLength;
}

}


}
// Further reading:
// https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs
Expand Down
100 changes: 100 additions & 0 deletions test/AsciiTest.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
namespace tests;
using System.Text;
using SimdUnicode;

//TODO (Nick Nuon): Test UTF8 Generator works correctly

public class AsciiTest
{
Expand Down Expand Up @@ -77,6 +80,7 @@ public void HardCodedSequencesTest()
{
Assert.True(SimdUnicode.Ascii.IsAscii(sequence), "Expected valid ASCII sequence");
Assert.True(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to validate ASCII sequence");

}

foreach (var sequence in badsequences)
Expand All @@ -85,4 +89,100 @@ public void HardCodedSequencesTest()
Assert.False(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to invalidate non-ASCII sequence");
}
}

/// <summary>
/// Sanity check of the generator itself: when configured for one-byte code
/// points only, every sequence it produces for requested lengths 1..255 must
/// consist solely of bytes in the ASCII range.
/// </summary>
[Fact]
public void Test_random_ASCII_sequences_of_varying_lengths()
{
    const int NUM_TRIALS = 1000;
    const int MAX_LENGTH = 255;
    RandomUtf8 asciiOnlyGenerator = new RandomUtf8(0, 100, 0, 0, 0); // Only ASCII/one-bytes

    for (int length = 1; length <= MAX_LENGTH; length++)
    {
        int asciiOnlyCount = 0;

        for (int trial = 0; trial < NUM_TRIALS; trial++)
        {
            byte[] candidate = asciiOnlyGenerator.Generate(length);

            // A byte is ASCII exactly when it does not exceed 0x7F.
            bool hasNonAscii = candidate.Any(b => b > 0x7F);
            if (!hasNonAscii)
            {
                asciiOnlyCount++;
            }
        }

        // Every trial for this length must have produced pure ASCII.
        if (asciiOnlyCount == NUM_TRIALS)
        {
            continue;
        }

        throw new Exception($"Invalid ASCII sequences were generated for {length}-byte sequences!");
    }
}


/// <summary>
/// Feeds pure-ASCII buffers to <c>Ascii.GetIndexOfFirstNonAsciiByte</c> and
/// expects the buffer length back, i.e. "no non-ASCII byte found".
/// </summary>
[Fact]
// This mimics the no_error_ASCII test
public void TestNoErrorASCII()
{
    const int NUM_TRIALS = 1000;
    const int LENGTH = 512;
    RandomUtf8 asciiOnlyGenerator = new RandomUtf8(0, 100, 0, 0, 0); // Only ASCII/one-bytes

    unsafe
    {
        int trial = 0;
        while (trial < NUM_TRIALS)
        {
            byte[] ascii = asciiOnlyGenerator.Generate(LENGTH);
            nuint expected = (nuint)ascii.Length;

            // Pin the array so the pointer-based API can read it.
            fixed (byte* pAscii = ascii)
            {
                nuint result = Ascii.GetIndexOfFirstNonAsciiByte(pAscii, expected);
                if (result != expected)
                {
                    throw new Exception($"Unexpected non-ASCII character found at index {result}");
                }
            }

            trial++;
        }
    }
}

/// <summary>
/// For each position of a pure-ASCII buffer, sets that byte's high bit, checks
/// that <c>Ascii.GetIndexOfFirstNonAsciiByte</c> reports exactly that position,
/// then restores the byte before moving on.
/// </summary>
[Fact]
// This mimics the error_ASCII test
public void TestErrorASCII()
{
    const int NUM_TRIALS = 1000;
    const int LENGTH = 512;
    RandomUtf8 asciiOnlyGenerator = new RandomUtf8(0, 100, 0, 0, 0); // Only ASCII/one-bytes

    unsafe
    {
        for (int trial = 0; trial < NUM_TRIALS; trial++)
        {
            byte[] ascii = asciiOnlyGenerator.Generate(LENGTH);
            nuint total = (nuint)ascii.Length;

            // Pin once per buffer; the in-place edits below are visible through the pointer.
            fixed (byte* pAscii = ascii)
            {
                for (int corrupted = 0; corrupted < ascii.Length; corrupted++)
                {
                    // Turn the byte at this position into a non-ASCII value.
                    ascii[corrupted] += 0b10000000;

                    nuint result = Ascii.GetIndexOfFirstNonAsciiByte(pAscii, total);
                    if (result != (nuint)corrupted)
                    {
                        throw new Exception($"Expected non-ASCII character at index {corrupted}, but found at index {result}");
                    }

                    // Undo the corruption so only one byte is ever non-ASCII.
                    ascii[corrupted] -= 0b10000000;
                }
            }
        }
    }
}


}
104 changes: 104 additions & 0 deletions test/helpers/randomutf8.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
using System;
using System.Collections.Generic;
using System.Linq;

/// <summary>
/// Pseudo-random generator of well-formed UTF-8 byte sequences for tests.
/// Each generated code point is a Unicode scalar value (never a surrogate) and
/// is encoded with the shortest form, so the output always decodes under a
/// strict UTF-8 decoder.
/// </summary>
public class RandomUtf8
{
    // Internal random number generator
    private Random gen;

    // Relative weights for each encoded length (index 0 = 1 byte ... index 3 = 4 bytes).
    // They do not have to sum to 100; only their ratios matter.
    private double[] probabilities;

    // Maximum number of bytes a UTF-8 character can be (based on the standard)
    private const int maxByteLength = 4;

    // Number of code points in the surrogate range U+D800..U+DFFF, which UTF-8 must not encode.
    private const int surrogateCount = 0x800;

    /// <summary>
    /// Creates a generator with a fixed seed and a weight for each encoded length.
    /// </summary>
    /// <param name="seed">Seed for the underlying <see cref="Random"/>.</param>
    /// <param name="prob_1byte">Relative weight of 1-byte (ASCII) code points.</param>
    /// <param name="prob_2bytes">Relative weight of 2-byte code points.</param>
    /// <param name="prob_3bytes">Relative weight of 3-byte code points.</param>
    /// <param name="prob_4bytes">Relative weight of 4-byte code points.</param>
    public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, int prob_4bytes)
    {
        // unchecked: a seed above int.MaxValue must wrap, not throw, even in a checked build.
        gen = new Random(unchecked((int)seed));
        probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes };
    }

    /// <summary>
    /// Generates random UTF-8 of at most <paramref name="outputBytes"/> bytes.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <returns>
    /// A byte array that is never longer than <paramref name="outputBytes"/>. It can be
    /// up to three bytes shorter: generation stops as soon as the next code point would
    /// not fit, so that no character is ever cut in half.
    /// </returns>
    public byte[] Generate(int outputBytes)
    {
        List<byte> result = new List<byte>(outputBytes);
        while (result.Count < outputBytes)
        {
            uint codePoint = GenerateCodePoint();
            byte[] utf8Bytes = EncodeToUTF8(codePoint);

            // Ensure we don't exceed the desired length
            if (result.Count + utf8Bytes.Length > outputBytes)
            {
                break;
            }

            result.AddRange(utf8Bytes);
        }
        return result.ToArray();
    }

    /// <summary>
    /// Same as <see cref="Generate(int)"/>, additionally returning the number of bytes produced.
    /// </summary>
    public (byte[] utf8, int count) GenerateCounted(int outputBytes)
    {
        var utf8 = Generate(outputBytes);
        return (utf8, utf8.Length);
    }

    /// <summary>
    /// Re-seeds the generator with <paramref name="seed"/> and then generates a sequence.
    /// The new seed stays in effect for subsequent calls.
    /// </summary>
    public byte[] Generate(int outputBytes, long seed)
    {
        // unchecked: only the low 32 bits of the seed are used.
        gen = new Random(unchecked((int)seed));
        return Generate(outputBytes);
    }

    // Picks an encoded length, then draws a Unicode scalar value uniformly from the
    // range of code points that need exactly that many UTF-8 bytes.
    private uint GenerateCodePoint()
    {
        int byteCount = PickRandomByteCount();

        // Random.Next's upper bound is exclusive, so each bound is "last value + 1".
        switch (byteCount)
        {
            case 1:
                return (uint)gen.Next(0x0000, 0x0080);      // U+0000..U+007F
            case 2:
                return (uint)gen.Next(0x0080, 0x0800);      // U+0080..U+07FF
            case 3:
            {
                // U+0800..U+FFFF without the surrogates: draw from a range that is
                // shorter by the size of the gap, then shift values past the gap.
                int value = gen.Next(0x0800, 0x10000 - surrogateCount);
                return (uint)(value < 0xD800 ? value : value + surrogateCount);
            }
            case 4:
                return (uint)gen.Next(0x10000, 0x110000);   // U+10000..U+10FFFF
            default:
                throw new InvalidOperationException($"Invalid byte count: {byteCount}"); // Guard clause for invalid byte count
        }
    }

    // Picks an encoded length (1..4) with probability proportional to its weight.
    private int PickRandomByteCount()
    {
        double randomValue = gen.NextDouble() * probabilities.Sum();
        double cumulative = 0.0;

        // NextDouble() is in [0, 1), so a strict comparison gives each bucket exactly its
        // share and a bucket with weight 0 can never be selected.
        for (int i = 0; i < maxByteLength; i++)
        {
            cumulative += probabilities[i];
            if (probabilities[i] > 0 && randomValue < cumulative)
            {
                return i + 1; // Return the byte count
            }
        }

        // Only reachable through floating-point rounding or when no weight is positive:
        // fall back to the last bucket that has a positive weight.
        for (int i = maxByteLength - 1; i >= 0; i--)
        {
            if (probabilities[i] > 0)
            {
                return i + 1;
            }
        }

        return 1; // No positive weight at all: default to ASCII
    }

    // Encodes a Unicode scalar value as its shortest-form UTF-8 byte sequence.
    private byte[] EncodeToUTF8(uint codePoint)
    {
        if (codePoint < 0x80)
        {
            // Includes U+0000, which is encoded as the single byte 0x00.
            return new[] { (byte)codePoint };
        }

        if (codePoint < 0x800)
        {
            return new[]
            {
                (byte)(0xC0 | (codePoint >> 6)),
                (byte)(0x80 | (codePoint & 0x3F)),
            };
        }

        if (codePoint < 0x10000)
        {
            return new[]
            {
                (byte)(0xE0 | (codePoint >> 12)),
                (byte)(0x80 | ((codePoint >> 6) & 0x3F)),
                (byte)(0x80 | (codePoint & 0x3F)),
            };
        }

        return new[]
        {
            (byte)(0xF0 | (codePoint >> 18)),
            (byte)(0x80 | ((codePoint >> 12) & 0x3F)),
            (byte)(0x80 | ((codePoint >> 6) & 0x3F)),
            (byte)(0x80 | (codePoint & 0x3F)),
        };
    }
}