simdutf · Nick-Nuon · Oct 27, 2023 · Oct 15, 2023 · Oct 15, 2023 · Oct 16, 2023
diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs
@@ -15,11 +15,15 @@ namespace SimdUnicodeBenchmarks
     public class Checker
     {
         List<char[]> names;
-        List<bool> results;
-        public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
-        {
-            // The runtime as of NET 8.0 has a dedicated method for this, but
-            // it is not available prior to that, so let us branch.
+        List<char[]> nonAsciichars; // NOTE(review): not referenced by any hunk shown here; confirm it is needed
+public List<byte[]> nonAsciiByteArrays; // UTF-8 byte buffers filled in Setup() and scanned by the GetIndexOfFirstNonAsciiByte benchmark
+
+        List<bool> results;
+
+        public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
+        {
+            // The runtime as of NET 8.0 has a dedicated method for this, but
+            // it is not available prior to that, so let us branch.
 #if NET8_0_OR_GREATER
             return Ascii.IsValid(s);
 #else
@@ -49,23 +53,45 @@ public static char[] GetRandomASCIIString(uint n)
             return chars;
         }
 
+        // Seeded once and shared: runs stay reproducible, yet successive calls differ. Not thread-safe.
+        private static readonly Random nonAsciiRandom = new Random(12345);
+
+        /// <summary>Returns <paramref name="n"/> chars drawn from ASCII letters and digits mixed with accented
+        /// Latin-1 Supplement / Latin Extended-A letters, so its UTF-8 form can contain non-ASCII bytes.</summary>
+        public static char[] GetRandomNonASCIIString(uint n)
+        {
+            const string allowedChars = "abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNOPQRSTUVWXYZ01234567é89šžŸũŭůűųŷŹźŻżŽ";
+            var chars = new char[n];
+            for (var i = 0; i < chars.Length; i++)
+            {
+                chars[i] = allowedChars[nonAsciiRandom.Next(0, allowedChars.Length)];
+            }
+            return chars;
+        }
 
-        [Params(100, 200, 500)]
+
+        // Input lengths (in chars) handed to the string generators in Setup().
+        [Params(100, 200, 500,1000,2000)]
         public uint N;
 
+
         [GlobalSetup]
         public void Setup()
         {
             names = new List<char[]>();
+            nonAsciiByteArrays = new List<byte[]>(); // UTF-8 inputs for the GetIndexOfFirstNonAsciiByte benchmark
             results = new List<bool>();
 
             for (int i = 0; i < 100; i++)
             {
                 names.Add(GetRandomASCIIString(N));
+                char[] nonAsciiChars = GetRandomNonASCIIString(N); // N chars from a mixed ASCII / accented-Latin alphabet
+                nonAsciiByteArrays.Add(Encoding.UTF8.GetBytes(nonAsciiChars));  // Store the UTF-8 encoding; its byte length can exceed N
                 results.Add(false);
             }
         }
 
+
         [Benchmark]
         public void FastUnicodeIsAscii()
         {
@@ -98,6 +124,22 @@ public void RuntimeIsAscii()
                 count += 1;
             }
         }
+        /// <summary>Scans every UTF-8 buffer prepared in Setup() and returns the summed indices,
+        /// so the JIT cannot discard the scans as dead code.</summary>
+        [Benchmark]
+        public unsafe nuint TestErrorGetIndexOfFirstNonAsciiByteBenchmark()
+        {
+            nuint total = 0;
+            foreach (byte[] nonAsciiBytes in nonAsciiByteArrays)
+            {
+                fixed (byte* pNonAscii = nonAsciiBytes)
+                {
+                    total += Ascii.GetIndexOfFirstNonAsciiByte(pNonAscii, (nuint)nonAsciiBytes.Length);
+                }
+            }
+            return total;
+        }
+
 
     }
 

diff --git a/src/Ascii.cs b/src/Ascii.cs
@@ -131,7 +131,49 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
             }
             return true;
         }
+
+        /// <summary>
+        /// Returns the index of the first byte in the buffer that is not ASCII (i.e. is &gt;= 0x80),
+        /// or <paramref name="bufferLength"/> when every byte is ASCII.
+        /// </summary>
+        /// <param name="pBuffer">Start of the buffer; it is not dereferenced when the length is zero.</param>
+        /// <param name="bufferLength">Number of readable bytes at <paramref name="pBuffer"/>.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
+        {
+            const ulong HighBitMask = 0x8080808080808080;
+            byte* pBufferEnd = pBuffer + bufferLength;
+            byte* pCurrent = pBuffer;
+
+            // Skip whole 16-byte blocks while they are pure ASCII. The remaining length is compared
+            // (rather than pCurrent + 16) so no pointer beyond the end of the buffer is ever formed.
+            while (pBufferEnd - pCurrent >= 16)
+            {
+                // ReadUnaligned: callers give no alignment guarantee for pBuffer.
+                ulong lo = Unsafe.ReadUnaligned<ulong>(pCurrent);
+                ulong hi = Unsafe.ReadUnaligned<ulong>(pCurrent + 8);
+                if (((lo | hi) & HighBitMask) != 0)
+                {
+                    break; // The culprit is in this block; the scalar loop below pinpoints it.
+                }
+
+                pCurrent += 16;
+            }
+
+            // Scalar scan: finds the byte inside a flagged block and covers the tail of under 16 bytes.
+            for (; pCurrent < pBufferEnd; pCurrent++)
+            {
+                if (*pCurrent >= 0x80)
+                {
+                    return (nuint)(pCurrent - pBuffer);
+                }
+            }
+
+            return bufferLength;
+        }
     }
+
+
 }
 // Further reading:
 // https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs

diff --git a/test/AsciiTest.cs b/test/AsciiTest.cs
@@ -1,5 +1,8 @@
 namespace tests;
 using System.Text;
+using SimdUnicode;
+
+// TODO (Nick Nuon): Test that the UTF-8 generator works correctly.
 
 public class AsciiTest
 {
@@ -77,6 +80,7 @@ public void HardCodedSequencesTest()
         {
             Assert.True(SimdUnicode.Ascii.IsAscii(sequence), "Expected valid ASCII sequence");
             Assert.True(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to validate ASCII sequence");
+
         }
 
         foreach (var sequence in badsequences)
@@ -85,4 +89,100 @@ public void HardCodedSequencesTest()
             Assert.False(SimdUnicode.Ascii.SIMDIsAscii(sequence), "Expected SIMDIsAscii to invalidate non-ASCII sequence");
         }
     }
+
+    /// <summary>
+    /// Sanity-checks the ASCII-only configuration of <c>RandomUtf8</c>: for every requested length
+    /// from 1 to <c>MAX_LENGTH</c>, each generated sequence must have exactly that length and
+    /// contain only bytes in the range 0x00..0x7F.
+    /// </summary>
+    /// <remarks>
+    /// Failures are raised through xUnit assertions, as in <c>HardCodedSequencesTest</c>.
+    /// </remarks>
+    [Fact]
+    public void Test_random_ASCII_sequences_of_varying_lengths()
+    {
+        const int NUM_TRIALS = 1000;
+        const int MAX_LENGTH = 255;
+        RandomUtf8 utf8Generator = new RandomUtf8(0, 100, 0, 0, 0); // Only ASCII/one-bytes
+
+        for (int length = 1; length <= MAX_LENGTH; length++)
+        {
+            for (int i = 0; i < NUM_TRIALS; i++)
+            {
+                byte[] sequence = utf8Generator.Generate(length);
+
+                // One-byte characters always fit, so the output must never come up short.
+                Assert.True(
+                    sequence.Length == length,
+                    $"Expected {length} bytes but the generator produced {sequence.Length}.");
+
+                // A byte is unsigned, so only the upper bound needs checking.
+                Assert.True(
+                    sequence.All(b => b <= 0x7F),
+                    $"Invalid ASCII sequences were generated for {length}-byte sequences!");
+            }
+        }
+    }
+
+
+    /// <summary>
+    /// Mimics the no_error_ASCII test: for a buffer that holds only ASCII bytes,
+    /// <c>GetIndexOfFirstNonAsciiByte</c> must report that nothing was found.
+    /// </summary>
+    [Fact]
+    public unsafe void TestNoErrorASCII()
+    {
+        const int NUM_TRIALS = 1000;
+        const int LENGTH = 512;
+        RandomUtf8 utf8Generator = new RandomUtf8(0, 100, 0, 0, 0);  // Only ASCII/one-bytes
+
+        for (int trial = 0; trial < NUM_TRIALS; trial++)
+        {
+            byte[] ascii = utf8Generator.Generate(LENGTH);
+
+            fixed (byte* pAscii = ascii)
+            {
+                nuint result = Ascii.GetIndexOfFirstNonAsciiByte(pAscii, (nuint)ascii.Length);
+                // The buffer length itself is returned when no non-ASCII byte exists.
+                Assert.True(
+                    result == (nuint)ascii.Length,
+                    $"Unexpected non-ASCII character found at index {result}");
+            }
+        }
+    }
+
+    /// <summary>
+    /// Mimics the error_ASCII test: sets the high bit of one byte at a time in an otherwise
+    /// all-ASCII buffer and expects <c>GetIndexOfFirstNonAsciiByte</c> to report that position.
+    /// </summary>
+    [Fact]
+    public unsafe void TestErrorASCII()
+    {
+        const int NUM_TRIALS = 1000;
+        const int LENGTH = 512;
+        RandomUtf8 utf8Generator = new RandomUtf8(0, 100, 0, 0, 0);  // Only ASCII/one-bytes
+
+        for (int trial = 0; trial < NUM_TRIALS; trial++)
+        {
+            byte[] ascii = utf8Generator.Generate(LENGTH);
+
+            fixed (byte* pAscii = ascii) // pin once per buffer instead of once per position
+            {
+                for (int i = 0; i < ascii.Length; i++)
+                {
+                    byte original = ascii[i];
+                    ascii[i] = (byte)(original | 0x80); // position i becomes the only non-ASCII byte
+
+                    nuint result = Ascii.GetIndexOfFirstNonAsciiByte(pAscii, (nuint)ascii.Length);
+                    ascii[i] = original; // restore the ASCII byte for the next position
+
+                    Assert.True(
+                        result == (nuint)i,
+                        $"Expected non-ASCII character at index {i}, but found at index {result}");
+                }
+            }
+        }
+    }
+
+
 }
diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs
@@ -0,0 +1,104 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+/// <summary>Test helper that produces pseudo-random, well-formed UTF-8. The mix of 1-, 2-, 3- and
+/// 4-byte characters is controlled by the relative weights given to the constructor.</summary>
+public class RandomUtf8
+{
+    // Internal random number generator (replaced when Generate is given an explicit seed)
+    private Random gen;
+
+    // Relative weight of each UTF-8 byte count (index 0 = 1-byte, ..., index 3 = 4-byte)
+    private readonly double[] probabilities;
+
+    // Maximum number of bytes a UTF-8 character can be (based on the standard)
+    private const int maxByteLength = 4;
+
+    /// <summary>Creates a generator from a seed and one relative weight per encoded length.</summary>
+    public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, int prob_4bytes)
+    {
+        gen = new Random((int)seed);
+        probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes };
+    }
+
+    /// <summary>Appends whole random characters until the next one would not fit. The result never
+    /// exceeds <paramref name="outputBytes"/> bytes but can be up to three bytes shorter.</summary>
+    public byte[] Generate(int outputBytes)
+    {
+        List<byte> result = new List<byte>(outputBytes);
+        while (result.Count < outputBytes)
+        {
+            byte[] utf8Bytes = EncodeToUTF8(GenerateCodePoint());
+            // Stop rather than split a character or exceed the desired length
+            if (result.Count + utf8Bytes.Length > outputBytes)
+                break;
+            result.AddRange(utf8Bytes);
+        }
+        return result.ToArray();
+    }
+
+    /// <summary>Generates a sequence and returns it together with its length in bytes.</summary>
+    public (byte[] utf8, int count) GenerateCounted(int outputBytes)
+    {
+        var utf8 = Generate(outputBytes);
+        return (utf8, utf8.Length);
+    }
+
+    /// <summary>Re-seeds the generator with <paramref name="seed"/>, then generates a sequence.</summary>
+    public byte[] Generate(int outputBytes, long seed)
+    {
+        gen = new Random((int)seed);
+        return Generate(outputBytes);
+    }
+
+    // Draws a Unicode scalar value whose UTF-8 encoding has the randomly picked byte count
+    private uint GenerateCodePoint()
+    {
+        int byteCount = PickRandomByteCount();
+        switch (byteCount)
+        {
+            case 1: return (uint)gen.Next(0x00, 0x80);         // U+0000..U+007F
+            case 2: return (uint)gen.Next(0x80, 0x800);        // U+0080..U+07FF
+            case 3:
+                // U+0800..U+FFFF without the surrogates U+D800..U+DFFF, which UTF-8 cannot encode
+                int drawn = gen.Next(0x800, 0xF800);
+                return (uint)(drawn < 0xD800 ? drawn : drawn + 0x800);
+            case 4: return (uint)gen.Next(0x10000, 0x110000);  // U+10000..U+10FFFF
+            default: throw new InvalidOperationException($"Invalid byte count: {byteCount}"); // Guard clause for invalid byte count
+        }
+    }
+
+    // Picks a byte count (1-4) with probability proportional to its weight
+    private int PickRandomByteCount()
+    {
+        double threshold = gen.NextDouble() * probabilities.Sum();
+        double cumulative = 0.0;
+        int lastWeighted = 1; // what an all-zero weight table falls back to
+
+        for (int i = 0; i < maxByteLength; i++)
+        {
+            if (probabilities[i] <= 0)
+                continue; // a zero weight must never be selected
+
+            cumulative += probabilities[i];
+            lastWeighted = i + 1;
+            if (threshold < cumulative)
+                return lastWeighted;
+        }
+        return lastWeighted; // reached only via floating-point rounding or all-zero weights
+    }
+
+    // Encodes a Unicode scalar value as UTF-8; U+0000 yields the single byte 0x00
+    private byte[] EncodeToUTF8(uint codePoint)
+    {
+        if (codePoint < 0x80)
+            return new[] { (byte)codePoint };
+        int length = codePoint < 0x800 ? 2 : codePoint < 0x10000 ? 3 : 4;
+        var bytes = new byte[length];
+        for (int i = length - 1; i > 0; i--, codePoint >>= 6)
+            bytes[i] = (byte)(0x80 | (codePoint & 0x3F)); // continuation byte: 10xxxxxx
+        bytes[0] = (byte)(((0xF00u >> length) & 0xFF) | codePoint); // lead byte: 110xxxxx / 1110xxxx / 11110xxx
+        return bytes;
+    }
+}