-
Notifications
You must be signed in to change notification settings - Fork 7
scalar GetIndexOfFirstNonAsciiByte #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 6 commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
362cba1
tested ASCII generator + untested utf8 generator
Nick-Nuon c6af788
scalar ASCII function itself (untested as of now)
Nick-Nuon e1b850d
Added tests (working)
Nick-Nuon 0f7634d
added benchmarks
Nick-Nuon 2e4e57c
More sensical benchmarks
Nick-Nuon 7a687b8
benchmark changes
Nick-Nuon 70ca2a1
dotnet format
Nick-Nuon 362ac77
Working benchmarks for Runtime
Nick-Nuon fff93b0
Bench ASCII for GetIndexOfFirstNonAsciiByte
Nick-Nuon c59fd98
Remove extraneous files
Nick-Nuon File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
|
||
/// <summary>
/// Pseudo-random generator of well-formed UTF-8 byte streams. Each generated
/// character is 1, 2, 3 or 4 bytes long; the length is drawn according to the
/// relative weights supplied to the constructor.
/// </summary>
public class RandomUtf8
{
    // Maximum number of bytes a UTF-8 character can be (based on the standard)
    private const int maxByteLength = 4;

    // First surrogate code point (U+D800) and the size of the surrogate range
    // (U+D800..U+DFFF). Surrogates are not Unicode scalar values and must never
    // be encoded in UTF-8.
    private const uint surrogateStart = 0xD800;
    private const uint surrogateCount = 0x800;

    // Internal random number generator (replaced when a reseeding overload is used)
    private Random gen;

    // Relative weights for each UTF-8 byte count (index 0 = 1 byte, ... index 3 = 4 bytes)
    private readonly double[] probabilities;

    // Sum of all weights, computed once instead of on every draw
    private readonly double totalWeight;

    /// <summary>
    /// Creates a generator with the given seed and relative weights.
    /// </summary>
    /// <param name="seed">Seed for the underlying <see cref="Random"/>; reinterpreted as a signed 32-bit value.</param>
    /// <param name="prob_1byte">Relative weight of 1-byte characters.</param>
    /// <param name="prob_2bytes">Relative weight of 2-byte characters.</param>
    /// <param name="prob_3bytes">Relative weight of 3-byte characters.</param>
    /// <param name="prob_4bytes">Relative weight of 4-byte characters.</param>
    public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, int prob_4bytes)
    {
        // unchecked: seeds above int.MaxValue wrap instead of throwing in checked builds
        gen = new Random(unchecked((int)seed));
        probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes };
        totalWeight = probabilities.Sum();
    }

    /// <summary>
    /// Generates random, well-formed UTF-8 of at most <paramref name="outputBytes"/> bytes.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <returns>
    /// A byte array containing only complete UTF-8 sequences. Generation stops at the
    /// first character that would not fit, so the result can be up to 3 bytes shorter
    /// than <paramref name="outputBytes"/>.
    /// </returns>
    public byte[] Generate(int outputBytes)
    {
        List<byte> result = new List<byte>(outputBytes);
        while (result.Count < outputBytes)
        {
            byte[] utf8Bytes = EncodeToUTF8(GenerateCodePoint());

            // Never split a character: stop rather than exceed the requested length
            if (result.Count + utf8Bytes.Length > outputBytes)
            {
                break;
            }

            result.AddRange(utf8Bytes);
        }
        return result.ToArray();
    }

    /// <summary>
    /// Same as <see cref="Generate(int)"/>, additionally returning the number of bytes produced.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <returns>The generated bytes and their count (the array length).</returns>
    public (byte[] utf8, int count) GenerateCounted(int outputBytes)
    {
        var utf8 = Generate(outputBytes);
        return (utf8, utf8.Length);
    }

    /// <summary>
    /// Reseeds the generator and then generates a byte sequence.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the number of bytes to produce.</param>
    /// <param name="seed">New seed; only the low 32 bits are used.</param>
    /// <returns>The generated bytes, as for <see cref="Generate(int)"/>.</returns>
    public byte[] Generate(int outputBytes, long seed)
    {
        gen = new Random(unchecked((int)seed));
        return Generate(outputBytes);
    }

    // Draws a random Unicode scalar value whose UTF-8 encoding has the byte length
    // chosen by PickRandomByteCount. Working with scalar values (rather than
    // assembling lead/continuation bytes independently) guarantees that no overlong
    // encodings, surrogates or values above U+10FFFF are produced.
    private uint GenerateCodePoint()
    {
        int byteCount = PickRandomByteCount();

        // Note: Random.Next(min, max) excludes max.
        switch (byteCount)
        {
            case 1:
                return (uint)gen.Next(0x00, 0x80); // U+0000..U+007F
            case 2:
                return (uint)gen.Next(0x80, 0x800); // U+0080..U+07FF
            case 3:
                {
                    // U+0800..U+FFFF minus the surrogate range: draw from a range that is
                    // shorter by the number of surrogates, then jump over the hole.
                    uint codePoint = (uint)gen.Next(0x800, (int)(0x10000 - surrogateCount));
                    return codePoint >= surrogateStart ? codePoint + surrogateCount : codePoint;
                }
            case 4:
                return (uint)gen.Next(0x10000, 0x110000); // U+10000..U+10FFFF
            default:
                throw new InvalidOperationException($"Invalid byte count: {byteCount}"); // Guard clause for invalid byte count
        }
    }

    // Picks a byte count (1..4) with probability proportional to the configured weights.
    private int PickRandomByteCount()
    {
        double randomValue = gen.NextDouble() * totalWeight;
        double cumulative = 0.0;

        // Walk the cumulative distribution until it reaches the drawn value
        for (int i = 0; i < maxByteLength; i++)
        {
            cumulative += probabilities[i];
            if (randomValue <= cumulative)
            {
                return i + 1; // Return the byte count
            }
        }

        return maxByteLength; // Default to max byte length
    }

    // Encodes a Unicode scalar value using the standard UTF-8 bit layout.
    // The caller must pass a valid scalar value (no surrogates, <= U+10FFFF).
    private byte[] EncodeToUTF8(uint codePoint)
    {
        if (codePoint < 0x80)
        {
            // 0xxxxxxx (U+0000 is emitted as a single 0x00 byte)
            return new[] { (byte)codePoint };
        }

        if (codePoint < 0x800)
        {
            // 110xxxxx 10xxxxxx
            return new[]
            {
                (byte)(0xC0 | (codePoint >> 6)),
                (byte)(0x80 | (codePoint & 0x3F)),
            };
        }

        if (codePoint < 0x10000)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            return new[]
            {
                (byte)(0xE0 | (codePoint >> 12)),
                (byte)(0x80 | ((codePoint >> 6) & 0x3F)),
                (byte)(0x80 | (codePoint & 0x3F)),
            };
        }

        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return new[]
        {
            (byte)(0xF0 | (codePoint >> 18)),
            (byte)(0x80 | ((codePoint >> 12) & 0x3F)),
            (byte)(0x80 | ((codePoint >> 6) & 0x3F)),
            (byte)(0x80 | (codePoint & 0x3F)),
        };
    }
}
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.