diff --git a/docs/design.md b/docs/design.md index 0faba7d4..f1991226 100644 --- a/docs/design.md +++ b/docs/design.md @@ -253,130 +253,77 @@ public struct PageHeader ### SlottedArray -The `SlottedArray` component is responsible for storing data in-page. It is capable of mapping a `NibblePath` to a value represented by `ReadOnlySpan`. Empyt values are allowed. +The `SlottedArray` component is responsible for storing data in a page. It is capable of mapping a `NibblePath` to a value represented by `ReadOnlySpan`. Empty values are allowed and are treated as tombstones. Tombstoning is needed to provide the write-through buffering capability: a value can be marked as deleted first and only later be flushed down. #### SlottedArray layout -`SlottedArray` needs to store values with variant lengths over a fixed `Span` provided by the page. To make it work, Paprika uses a modified pattern of the slot array, used by major players in the world of B+ oriented databases (see: [PostgreSQL page layout](https://www.postgresql.org/docs/current/storage-page-layout.html#STORAGE-PAGE-LAYOUT-FIGURE)). How it works then? +`SlottedArray` needs to store values with variable lengths over a fixed `Span` provided by the page. To make it work, Paprika uses a modified pattern of the slot array, used by major players in the world of B+ oriented databases (see: [PostgreSQL page layout](https://www.postgresql.org/docs/current/storage-page-layout.html#STORAGE-PAGE-LAYOUT-FIGURE)). The slot array pattern uses a fixed-size buffer that is provided within the page. It allocates chunks of it from two directions: 1. from `0` forward 2. from the end downward -The first direction, from `0` is used for fixed-size structures that represent slots. Each slot has some metadata, including the most important one, the offset to the start of data. The direction from the end is used to store var length payloads. Paprika diverges from the usual slot array though. 
The slot array assumes that it's up to the higher level to map the slot identifiers to keys. What the page provides is just a container for tuples that stores them and maps them to the `CTID`s (see: [PostgreSQL system columns](https://www.postgresql.org/docs/current/ddl-system-columns.html)). How Paprika uses this approach +The first direction, from `0` is used for fixed-size structures that represent slots. Each slot has some metadata, including the most important one, the offset to the start of data. The direction from the end is used to store var length payloads. Paprika diverges from the usual slot array though. The slot array assumes that it's up to the higher level to map the slot identifiers to keys. What the page provides is just a container for tuples that stores them and maps them to the `CTID`s (see: [PostgreSQL system columns](https://www.postgresql.org/docs/current/ddl-system-columns.html)). -In Paprika, each page level represents a cutoff in the nibble path to make it aligned to the Merkle construct. The key management could be extracted out of the `SlottedArray` component, but it would make it less self-contained. `SlottedArray` then provides `TrySet` and `TryGet` methods that accept nibble paths. This impacts the design of the slot, which is as follows: +Paprika provides a vector-aligned slotted array, that stores lookup data from the beginning and the actual payload from the end. What makes it special is that Paprika uses vectorized instructions (`Vector256` or `Vector128` depending on the architecture) and alignment to its sizes to make the search as efficient as possible. The space that a given slotted array occupies will be then split into the following: -```csharp -private struct Slot -{ - public const int Size = 4; +1. `Header` +1. `Vector` of key hashes +1. `Vector` of `Slot` entries +1. `Vector` of key hashes +1. `Vector` of `Slot` entries +1. ... +1. ... +1. 
data - /// - /// The address currently requires 12 bits [0-11] to address whole page. - /// - private const ushort AddressMask = Page.PageSize - 1; +where each `Vector` is sized and aligned for the machine that it runs on (`Vector` will be 32 bytes on modern `x64` and 16 bytes on `ARM`) and `Vector`s are allocated in pairs (hashes + corresponding slots). Keeping hashes and slots in separate chunks allows for a fast vectorized search over hashes, without the need to scan over slot entries. The entries are inspected only on the hash match. - /// - /// The address of this item. - /// - public ushort ItemAddress { /* bitwise magic */ } - - /// - /// Whether the given entry is deleted or not - /// - public bool IsDeleted => KeyPreamble == KeyPreambleDelete; +The `Header` consists of 8 bytes and keeps track of `hi` and `lo` as in a usual `SlottedArray`. - public byte KeyPreamble { /* bitwise magic */ } +##### Slot - private ushort Raw; +A `Slot` occupies `2 bytes` and represents several values needed to keep track of the value stored in an array. It uses bit-wise operations to store all the information in 2 bytes: - /// - /// Used for vectorized search - /// - public const int HashShiftForSearch = 1; +1. `ItemAddress` - represents the address to go to for the data of the given slot (address currently requires 12 bits [0-11] to address the whole 4 KB page) +2. `Preamble` - shows whether there are some bytes left for the key or other +3. `IsDeleted` - whether the item was deleted in the array but not garbage-collected yet - /// - /// The memorized result of of this item. - /// - public ushort Hash; - - /// - /// Prepares the key for the search. - /// - public static ushort PrepareKey(/* ... */) - { - // ... - } - - public static NibblePath UnPrepareKey(/* ... */) - { - // ... - } -} -``` - -The slot is 4 bytes long. Using the `PrepareKey` method, some of the nibbles are extrated from the key as a `Hash` for fast comparisons. 
It has the actual `ItemAddress` that points to the beginning of the payload. The length of the item is calculated by subtracting the address from the previous slot address. The drawback of this design is a linear search across all the slots when an item must be found. With the expected number of items per page, which should be no bigger than 100, it gives 400 bytes of slots to search through. This should be ok-ish with modern processors as the search uses the vectorized index search. Additionally, it adds some checks for the preamble so that the collissions should not be that likely. - -With this, the `SlottedArray` memory representation looks like the following. - -```bash -┌───────────────┬───────┬───────┬───────────────────────────────┐ -│HEADER │Slot 0 │Slot 1 │ │ -│ │ │ │ │ -│High │Prefix │Prefix │ │ -│Low │Addr │Addr │ ► ► ► │ -│Deleted │ │ │ │ │ │ -│ │ │ │ │ │ │ -├───────────────┴───┼───┴───┼───┘ │ -│ │ │ │ -│ ┌──┼───────┘ │ -│ │ │ │ -│ │ │ │ -│ │ └──────────┐ │ -│ │ │ │ -│ ▼ ▼ │ -│ ┌─────────────┬────────────────────────────────┤ -│ │ │ │ -│ │ │ │ -│ ◄ ◄ ◄ │ DATA │ DATA │ -│ │ for slot1 │ for slot 0 │ -│ │ │ │ -└────────────────┴─────────────┴────────────────────────────────┘ -``` +The `Slot` provides a method called `PrepareKey` that is responsible for extracting the `hash` for the given `NibblePath` key, returning a trimmed version of the key (what goes in the hash is extracted away) and a preamble. There's another counterpart method called `UnPrepareKey` that does the opposite. While `PrepareKey` is used for all the operations like `TrySet` and `TryGet`, `UnPrepareKey` is used only to materialize back the keys. This happens for example when a `SlottedArray` is enumerated. In other cases, the caller does not need to reconstruct the key as they have it. The `SlottedArray` can wrap an arbitrary span of memory so it can be used for any page that wants to store data by key. 
#### Deletion and tombstones -`SlottedArray` uses a tombstoning to mark the given entry as deleted. It's much cheaper to mark something as deleted and collect garbage from time to time than to compress it every single time. The marker of deleteion is frequently called a `tombstone`. To decide whether or not a GC should be called when there's not enough place to just append data, a counter of tombstones is held. If non zero, GC can be used to reclaim memory. +When deleting an item, `SlottedArray` marks the given slot as deleted and then tries to collect deleted slots starting from the last one. Eventually, when the deleted space is scattered across the map, it will run a `Defragment` procedure that copies what is alive and removes all the gaps. + +When a slot is marked as deleted, its hash is set to `~hash`. This is done to prevent it from being searched when performing the vectorized search over hashes. The negation is used so that there's no single value that will make all the deletes collide with the same entry. #### Iteration -`SlottedArray` allows an efficient iteration over each entries using the `map.EnumerateAll()` method. It provides the caller with a `ref struct Enumerator` that does not allocate and allows traversing the map. It's worth to mention that the enumerator allows to delete an entry when enumerating by calling the delete method with the item from the enumerator `map.Delete(item)`. Again, it's based on the tombstoning mentioned above and just marks the data as deleted. +`SlottedArray` allows an efficient iteration of its entries using the `map.EnumerateAll()` method. It provides the caller with a `ref struct Enumerator` that does not allocate and allows traversing the map. There's a special feature of the enumerator that allows deleting an entry when enumerating by calling the delete method with the item from the enumerator `map.Delete(item)`. Again, it's based on marking slots as deleted. 
### Merkle construct From Ethereum's point of view, any storage mechanism needs to be able to calculate the `StateRootHash`. This hash allows us to verify whether the block state is valid. How it is done and what is used underneath is not important as long as the store mechanism can provide the answer to the ultimate question: _what is the StateRootHash of the given block?_ -To address this `Merkle` is implemented as a pre-commit hook. This hook is run when a block is committed to the blockchain. After all, from the point of execution there's no reason to run it before. Merkleization of the tree is split into the following steps executed sequentially: +To address this `Merkle` is implemented as a pre-commit hook. This hook is run when a block is committed to the blockchain. After all, from the point of execution, there's no reason to run it before. Merkleization of the tree is split into the following steps executed sequentially: 1. Visit all Storage operations (SSTORE). For each key: - 1. remember `Account` that `Storage`` belongs to - 1. walk through the MPT of Account Storage to create/amend Trie nodes. This part is marking paths as dirty + 1. remember `Account` that `Storage` belongs to + 1. walk through the MPT of Account Storage to create/amend Trie nodes. This part marks paths as dirty 1. Visit all State operations. For each key: 1. check if it was one of the Storage operations. If yes, remove it from the set above 1. walk through the MPT of Account State to create/amend Trie nodes 1. Visit all the accounts that were not accessed in 2., but were remembered in 1, meaning Accounts that had their storage modified but no changes to codehash, balance, nonce. For each key: 1. walk through the MPT of Account State to create/amend Trie nodes 1. Calculate the Root Hash - 1. for each of accounts that had their storage modified (from 1.), + 1. for each of the accounts that had their storage modified (from 1.), 1. calculate the storage root hash 1. 
store it in the account (decode account, encode, set) 1. calculate the root hash of the State. **Parallel** -It's worth to mention that even though `RLP` of branches is not stored in the database, its transient form is memoized in memory. This greatly improves the overall performance of Merkleization as reduced the number of fetched data from the database (no calls for children). Of course it requires cache invalidation which is done whenever marking the paths is done. +Even though `RLP` of branches is not stored in the database, its transient form is memoized in memory. This greatly improves the overall performance of Merkleization as it reduces the amount of data fetched from the database (no calls for children). Of course, it requires cache invalidation, which is performed whenever the paths are marked. ## Examples diff --git a/src/Paprika.Benchmarks/Program.cs b/src/Paprika.Benchmarks/Program.cs index 6064c545..2f78ee5c 100644 --- a/src/Paprika.Benchmarks/Program.cs +++ b/src/Paprika.Benchmarks/Program.cs @@ -1,6 +1,8 @@ // See https://aka.ms/new-console-template for more information using System.Diagnostics.CodeAnalysis; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Jobs; using BenchmarkDotNet.Running; [assembly: ExcludeFromCodeCoverage] @@ -11,6 +13,16 @@ public class Program { public static void Main(string[] args) { - BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); + // Vector128 + // IConfig config = DefaultConfig.Instance + // .AddJob(Job.Default.WithEnvironmentVariable("DOTNET_EnableAVX2", "0").WithId("Vector128")); + + // Scalar, throw + // IConfig config = DefaultConfig.Instance + // .AddJob(Job.Default.WithEnvironmentVariable("DOTNET_EnableHWIntrinsic", "0").WithId("Vector128")); + + IConfig? 
config = null; + + BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, config); } } diff --git a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs index e51b89b9..96b7cc25 100644 --- a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs +++ b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs @@ -1,261 +1,144 @@ -using System.Numerics; +using System.Runtime.InteropServices; using BenchmarkDotNet.Attributes; using Paprika.Crypto; using Paprika.Data; using Paprika.Store; -using static System.Buffers.Binary.BinaryPrimitives; namespace Paprika.Benchmarks; [DisassemblyDiagnoser(maxDepth: 2)] -public class SlottedArrayBenchmarks +public unsafe class SlottedArrayBenchmarks { - private readonly byte[] _onePage = new byte[Page.PageSize]; + private const int KeyCount = 97; - // defragmentation - private readonly byte[] _defragmentation = new byte[Page.PageSize]; - private readonly byte[] _defragmentationCopy = new byte[Page.PageSize]; - private readonly byte[] _defragmentationValue = new byte[64]; - private readonly ushort _writtenTo; + private const int + BytesPerKey = + 3; // 3 repeated bytes allow to cut off the first nibble and still have a unique key. 
Also, allow storing some key leftover - private readonly byte[] _writtenLittleEndian = new byte[Page.PageSize]; - private readonly byte[] _writtenBigEndian = new byte[Page.PageSize]; - private readonly byte[] _writable = new byte[Page.PageSize]; - private readonly int _to; + private readonly void* _keys; + private readonly void* _map; - // hash collisions are fixed in size to make them comparable - private readonly byte[] _hashCollisions = new byte[Page.PageSize]; - private const int HashCollisionsCount = NibblePath.KeccakNibbleCount; - private static readonly byte[] HashCollisionValue = new byte[13]; + // Hash colliding + private const int HashCollidingKeyCount = 32; - private readonly byte[] _copy0 = new byte[Page.PageSize]; - private readonly byte[] _copy1 = new byte[Page.PageSize]; + // Use first and last as opportunity to collide + private const int BytesPerKeyHashColliding = 3; + private readonly void* _hashCollidingKeys; + private readonly void* _hashCollidingMap; - public SlottedArrayBenchmarks() - { - // Big and small endian tests - { - var little = new SlottedArray(_writtenLittleEndian); - var big = new SlottedArray(_writtenBigEndian); - - Span key = stackalloc byte[4]; - - - while (true) - { - WriteInt32LittleEndian(key, _to); - if (little.TrySet(NibblePath.FromKey(key), key) == false) - { - // filled - break; - } - - WriteInt32BigEndian(key, _to); - if (big.TrySet(NibblePath.FromKey(key), key) == false) - { - // filled - break; - } - - _to++; - } - } - - // Hash collisions tests - { - var zeroes = NibblePath.FromKey(Keccak.Zero); - var hashCollisions = new SlottedArray(_hashCollisions); - - for (var i = 0; i <= HashCollisionsCount; i++) - { - if (!hashCollisions.TrySet(zeroes.SliceTo(i), HashCollisionValue)) - { - throw new Exception($"No place to set hash collision at {i}"); - } - } - } - // Defragmentation - { - var map = new SlottedArray(_defragmentation); - ushort i = 0; - Span key = stackalloc byte[2]; - - // Set as many as possible - while 
(map.TrySet(NibblePath.FromKey(key), _defragmentationValue)) - { - i++; - WriteUInt16LittleEndian(key, i); - } - - _writtenTo = i; - } - } - - [Benchmark] - public int Write_whole_page_of_data() + public SlottedArrayBenchmarks() { - _writable.AsSpan().Clear(); - var map = new SlottedArray(_writable); - - Span key = stackalloc byte[4]; + // Create keys + _keys = AllocAlignedPage(); - int count = 0; - - // fill - for (int i = 0; i < _to; i++) + var span = new Span(_keys, Page.PageSize); + for (byte i = 0; i < KeyCount; i++) { - WriteInt32LittleEndian(key, i); - if (map.TrySet(NibblePath.FromKey(key), key)) + for (var j = 0; j < BytesPerKey; j++) { - count++; + span[i * BytesPerKey + j] = i; } } - return count; - } - - [Benchmark] - public int Read_existing_keys_prefix_different() - { - var map = new SlottedArray(_writtenLittleEndian); - Span key = stackalloc byte[4]; - - var result = 0; + // Map + _map = AllocAlignedPage(); + Span value = stackalloc byte[1]; - // find all values - for (var i = 0; i < _to; i++) + var map = new SlottedArray(new Span(_map, Page.PageSize)); + for (byte i = 0; i < KeyCount; i++) { - WriteInt32LittleEndian(key, i); - if (map.TryGet(NibblePath.FromKey(key), out var data)) + value[0] = i; + if (map.TrySet(GetKey(i, false), value) == false) { - result += data.Length; + throw new Exception("Not enough memory"); } } - return result; - } - - [Benchmark] - public int Read_existing_keys_suffix_different() - { - var map = new SlottedArray(_writtenBigEndian); - Span key = stackalloc byte[4]; + // Hash colliding + _hashCollidingKeys = AllocAlignedPage(); - var result = 0; - - // find all values - for (var i = 0; i < _to; i++) + // Create keys so that two consecutive ones share the hash. + // This should make it somewhat realistic where there are some collisions but not a lot of them. 
+ var hashCollidingKeys = new Span(_hashCollidingKeys, Page.PageSize); + for (byte i = 0; i < HashCollidingKeyCount; i++) { - WriteInt32BigEndian(key, i); - if (map.TryGet(NibblePath.FromKey(key), out var data)) - { - result += data.Length; - } - } - - return result; - } + // 0th divide by 2 to collide + hashCollidingKeys[i * BytesPerKeyHashColliding] = (byte)(i / 2); - [Benchmark] - public int Read_nonexistent_keys() - { - var map = new SlottedArray(_writtenLittleEndian); - Span key = stackalloc byte[4]; + // 1th differentiate with the first + hashCollidingKeys[i * BytesPerKeyHashColliding + 1] = i; - var result = 0; - - // miss all the next - for (int i = _to; i < _to * 2; i++) - { - WriteInt32LittleEndian(key, i); - if (map.TryGet(NibblePath.FromKey(key), out _) == false) - { - result += 1; - } + // 2nd divide by 2 to collide + hashCollidingKeys[i * BytesPerKeyHashColliding + 2] = (byte)(i / 2); } - return result; - } - - [Benchmark] - public int Hash_collisions() - { - var map = new SlottedArray(_hashCollisions); - var zeroes = NibblePath.FromKey(Keccak.Zero); - - var length = 0; + _hashCollidingMap = AllocAlignedPage(); - for (var i = 0; i < HashCollisionsCount; i++) + var hashColliding = new SlottedArray(new Span(_hashCollidingMap, Page.PageSize)); + for (byte i = 0; i < HashCollidingKeyCount; i++) { - if (map.TryGet(zeroes.SliceTo(i), out var value)) + value[0] = i; + if (hashColliding.TrySet(GetHashCollidingKey(i), value) == false) { - length += value.Length; + throw new Exception("Not enough memory"); } } - return length; - } + return; - [Benchmark] - public int EnumerateAll() - { - var map = new SlottedArray(_writtenLittleEndian); - - var length = 0; - foreach (var item in map.EnumerateAll()) + static void* AllocAlignedPage() { - length += item.Key.Length; - length += item.RawData.Length; + const UIntPtr size = Page.PageSize; + var memory = NativeMemory.AlignedAlloc(size, size); + NativeMemory.Clear(memory, size); + return memory; } - - return length; } - 
[Benchmark] - public int Move_to_keys() + [Benchmark(OperationsPerInvoke = 4)] + [Arguments((byte)1, false)] + [Arguments((byte)15, false)] + [Arguments((byte)16, false)] + [Arguments((byte)31, false)] + [Arguments((byte)32, false)] + [Arguments((byte)47, false)] + [Arguments((byte)48, false)] + [Arguments((byte)63, false)] + [Arguments((byte)64, false)] + [Arguments((byte)95, false)] + [Arguments((byte)KeyCount - 1, false)] + public int TryGet(byte index, bool odd) { - var map = new SlottedArray(_writtenLittleEndian); - - _copy0.AsSpan().Clear(); - _copy1.AsSpan().Clear(); - - var map0 = new SlottedArray(_copy0); - var map1 = new SlottedArray(_copy1); - - map.MoveNonEmptyKeysTo(new MapSource(map0, map1)); - - return map.Count + map0.Count + map1.Count; + var map = new SlottedArray(new Span(_map, Page.PageSize)); + var key = GetKey(index, odd); + + var count = 0; + if (map.TryGet(key, out _)) count += 1; + if (map.TryGet(key, out _)) count += 1; + if (map.TryGet(key, out _)) count += 1; + if (map.TryGet(key, out _)) count += 1; + return count; } - /// - /// Multiple rounds of setting and deleting to ensure that tombstones do not impact the search nor insert. - /// Increasing values are used so that slot cannot be easily reused. 
- /// - [Benchmark] - public void Set_And_Delete() + [Benchmark(OperationsPerInvoke = 4)] + [Arguments((byte)1)] + [Arguments((byte)2)] + [Arguments((byte)3)] + [Arguments((byte)4)] + [Arguments((byte)30)] + [Arguments((byte)31)] + public int TryGet_With_Hash_Collisions(byte index) { - const int count = 80; - - Span data = stackalloc byte[count]; - var a = NibblePath.FromKey(stackalloc byte[] { 12, 34, 98 }); - var b = NibblePath.FromKey(stackalloc byte[] { 78, 34, 35 }); - - var map = new SlottedArray(_onePage); - map.Clear(); - - // init by setting a - map.TrySet(a, ReadOnlySpan.Empty); - - for (int i = 1; i < count; i++) - { - var d = data[..i]; - - map.TrySet(b, d); - map.Delete(a); // delete previous a, b above prohibits collect tombstones - map.TrySet(a, d); // set new - map.Delete(b); // delete previous b, a above prohibits collect tombstones - } + var map = new SlottedArray(new Span(_hashCollidingMap, Page.PageSize)); + var key = GetHashCollidingKey(index); + + var count = 0; + if (map.TryGet(key, out _)) count += 1; + if (map.TryGet(key, out _)) count += 1; + if (map.TryGet(key, out _)) count += 1; + if (map.TryGet(key, out _)) count += 1; + return count; } [Benchmark(OperationsPerInvoke = 2)] @@ -288,86 +171,36 @@ public int Prepare_Key(int sliceFrom, int length) hash2 + preamble2 + trimmed2.Length; } - [Benchmark(OperationsPerInvoke = 2)] - [Arguments(0, 0)] - [Arguments(0, 1)] - [Arguments(1, 1)] - [Arguments(0, 2)] - [Arguments(1, 2)] - [Arguments(0, 3)] - [Arguments(1, 3)] - [Arguments(0, 4)] - [Arguments(1, 4)] - [Arguments(0, 6)] - [Arguments(1, 6)] - [Arguments(0, 32)] - [Arguments(1, 31)] - [Arguments(1, 30)] - public int Prepare_Key_UnPrepare(int sliceFrom, int length) + [Benchmark] + public int EnumerateAll() { - var key = NibblePath.FromKey(Keccak.EmptyTreeHash).Slice(sliceFrom, length); - - // prepare - var hash = SlottedArray.PrepareKeyForTests(key, out var preamble, out var trimmed); - var written = trimmed.WriteTo(stackalloc byte[33]); - - 
Span working = stackalloc byte[32]; - - // spin: 1 - var key1 = SlottedArray.UnPrepareKeyForTests(hash, preamble, written, working, out var data); - - // spin: 2 - var key2 = SlottedArray.UnPrepareKeyForTests(hash, preamble, written, working, out data); + var map = new SlottedArray(new Span(_map, Page.PageSize)); + var length = 0; + foreach (var item in map.EnumerateAll()) + { + length += item.Key.Length; + length += item.RawData.Length; + } - return key1.Length + key2.Length; + return length; } - private const int DefragmentOpsCount = 4; - - [Benchmark(OperationsPerInvoke = DefragmentOpsCount)] - public void Defragment_first_key_deleted() + private NibblePath GetKey(byte i, bool odd) { - _defragmentation.CopyTo(_defragmentationCopy.AsSpan()); - var map = new SlottedArray(_defragmentationCopy); + var span = new Span(_keys, BytesPerKey * KeyCount); + var slice = span.Slice(i * BytesPerKey, BytesPerKey); - Span key = stackalloc byte[2]; - var i = _writtenTo; - - // Delete & defragment - for (ushort j = 0; j < DefragmentOpsCount; j++) - { - // Delete first - WriteUInt16LittleEndian(key, j); - map.Delete(NibblePath.FromKey(key)); - - // Encode new key and set - WriteUInt16LittleEndian(key, i++); - map.TrySet(NibblePath.FromKey(key), _defragmentationValue); - } + return NibblePath.FromKey(slice, odd ? 
1 : 0, 4); } - [Benchmark(OperationsPerInvoke = DefragmentOpsCount)] - public void Defragment_last_key_deleted() + private NibblePath GetHashCollidingKey(byte i) { - _defragmentation.CopyTo(_defragmentationCopy.AsSpan()); - - var map = new SlottedArray(_defragmentationCopy); - - Span key = stackalloc byte[2]; - var last = (ushort)(_writtenTo - 1); + var span = new Span(_hashCollidingKeys, BytesPerKeyHashColliding * HashCollidingKeyCount); + var slice = span.Slice(i * BytesPerKeyHashColliding, BytesPerKeyHashColliding); - // Delete & defragment - for (ushort j = 0; j < DefragmentOpsCount; j++) - { - // Delete first - WriteUInt16LittleEndian(key, last); - map.Delete(NibblePath.FromKey(key)); - - // Encode new key and set - WriteUInt16LittleEndian(key, last++); - map.TrySet(NibblePath.FromKey(key), _defragmentationValue); - } + // Use full key + return NibblePath.FromKey(slice, 0, BytesPerKeyHashColliding * NibblePath.NibblePerByte); } } \ No newline at end of file diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Defragment_when_no_more_space.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Defragment_when_no_more_space.verified.bin index 422ac983..fbae61d5 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Defragment_when_no_more_space.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Defragment_when_no_more_space.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=0.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=0.verified.bin index 038a806a..c0c5bb43 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=0.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=0.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=1.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=1.verified.bin index 5635d1b2..9002515a 100644 Binary files 
a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=1.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=1.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=0.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=0.verified.bin index 8addd03b..4b0f4d8f 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=0.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=0.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=1.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=1.verified.bin index 4511a3b4..d3f83608 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=1.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=1.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=0.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=0.verified.bin index 9db9ecc3..93356408 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=0.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=0.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=1.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=1.verified.bin index cd122417..6b1248c5 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=1.verified.bin and 
b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=1.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Delete_Get_AnotherSet.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Delete_Get_AnotherSet.verified.bin index bc14e4a6..7cc70d69 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Delete_Get_AnotherSet.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Delete_Get_AnotherSet.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Empty.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Empty.verified.bin index 43801544..2c6e155b 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Empty.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Empty.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Small_keys_compression.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Small_keys_compression.verified.bin index cd4b4c6b..4e9a9f9c 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Small_keys_compression.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Small_keys_compression.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_resize.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_resize.verified.bin index 6fcca8ad..bc96d424 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_resize.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_resize.verified.bin differ diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_situ.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_situ.verified.bin index 6456fd84..e80592aa 100644 Binary files a/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_situ.verified.bin and b/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_situ.verified.bin differ diff --git 
a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs index 7dcb418e..06c304fb 100644 --- a/src/Paprika.Tests/Data/SlottedArrayTests.cs +++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs @@ -1,8 +1,7 @@ using FluentAssertions; -using NUnit.Framework; using Paprika.Crypto; using Paprika.Data; -using Paprika.Merkle; +using Paprika.Store; namespace Paprika.Tests.Data; @@ -22,7 +21,7 @@ public Task Set_Get_Delete_Get_AnotherSet() { var key0 = Values.Key0.Span; - Span span = stackalloc byte[48]; + Span span = stackalloc byte[SlottedArray.MinimalSizeWithNoData + key0.Length + Data0.Length]; var map = new SlottedArray(span); map.SetAssert(key0, Data0); @@ -120,7 +119,7 @@ public Task Set_Get_Empty() { var key0 = Values.Key0.Span; - Span span = stackalloc byte[48]; + Span span = stackalloc byte[128]; var map = new SlottedArray(span); var data = ReadOnlySpan.Empty; @@ -151,7 +150,7 @@ public Task Set_Get_Empty() public Task Defragment_when_no_more_space() { // by trial and error, found the smallest value that will allow to put these two - Span span = stackalloc byte[88]; + Span span = stackalloc byte[SlottedArray.MinimalSizeWithNoData + 88]; var map = new SlottedArray(span); var key0 = Values.Key0.Span; @@ -179,7 +178,7 @@ public Task Defragment_when_no_more_space() public Task Update_in_situ() { // by trial and error, found the smallest value that will allow to put these two - Span span = stackalloc byte[48]; + Span span = stackalloc byte[128]; var map = new SlottedArray(span); var key1 = Values.Key1.Span; @@ -196,12 +195,12 @@ public Task Update_in_situ() [Test] public Task Update_in_resize() { + var key0 = Values.Key0.Span; + // Update the value, with the next one being bigger. 
- Span span = stackalloc byte[56]; + Span span = stackalloc byte[SlottedArray.MinimalSizeWithNoData + key0.Length + Data0.Length]; var map = new SlottedArray(span); - var key0 = Values.Key0.Span; - map.SetAssert(key0, Data0); map.SetAssert(key0, Data2); @@ -214,7 +213,7 @@ public Task Update_in_resize() [Test] public Task Small_keys_compression() { - Span span = stackalloc byte[256]; + Span span = stackalloc byte[512]; var map = new SlottedArray(span); Span key = stackalloc byte[1]; @@ -244,6 +243,62 @@ public Task Small_keys_compression() return Verify(span.ToArray()); } + [Test(Description = "Make a lot of requests to make breach the vector count")] + public void Breach_VectorSize_with_key_count() + { + const int seed = 13; + var random = new Random(seed); + Span key = stackalloc byte[4]; + + var map = new SlottedArray(new byte[Page.PageSize]); + + const int count = 257; + + for (var i = 0; i < count; i++) + { + random.NextBytes(key); + map.SetAssert(key, [(byte)(i & 255)]); + } + + // reset + random = new Random(seed); + for (var i = 0; i < count; i++) + { + random.NextBytes(key); + map.GetAssert(key, [(byte)(i & 255)]); + } + } + + [Test(Description = "Make a lot of requests to make breach the vector count")] + public void Set_Get_With_Specific_Lengths([Values(8, 16, 32, 64, 68, 72)] int count) + { + const int keyLength = 2; + + Span keys = stackalloc byte[count * 2]; + for (byte i = 0; i < count; i++) + { + keys[i * keyLength] = i; + keys[i * keyLength + 1] = i; + } + + var map = new SlottedArray(new byte[Page.PageSize]); + + for (var i = 0; i < count; i++) + { + map.SetAssert(GetKey(keys, i), GetValue(i)); + } + + for (var i = 0; i < count; i++) + { + map.GetAssert(GetKey(keys, i), GetValue(i)); + } + + return; + + static NibblePath GetKey(Span keys, int i) => NibblePath.FromKey(keys.Slice(i * keyLength, keyLength)); + static ReadOnlySpan GetValue(int i) => new byte[(byte)(i & 255)]; + } + private static ReadOnlySpan Data(byte key) => new[] { key }; [Test] @@ 
-454,14 +509,14 @@ public void Move_to_4() public void Move_to_8() { var original = new SlottedArray(stackalloc byte[512]); - var copy0 = new SlottedArray(stackalloc byte[64]); - var copy1 = new SlottedArray(stackalloc byte[64]); - var copy2 = new SlottedArray(stackalloc byte[64]); - var copy3 = new SlottedArray(stackalloc byte[64]); - var copy4 = new SlottedArray(stackalloc byte[64]); - var copy5 = new SlottedArray(stackalloc byte[64]); - var copy6 = new SlottedArray(stackalloc byte[64]); - var copy7 = new SlottedArray(stackalloc byte[64]); + var copy0 = new SlottedArray(stackalloc byte[128]); + var copy1 = new SlottedArray(stackalloc byte[128]); + var copy2 = new SlottedArray(stackalloc byte[128]); + var copy3 = new SlottedArray(stackalloc byte[128]); + var copy4 = new SlottedArray(stackalloc byte[128]); + var copy5 = new SlottedArray(stackalloc byte[128]); + var copy6 = new SlottedArray(stackalloc byte[128]); + var copy7 = new SlottedArray(stackalloc byte[128]); var key0 = NibblePath.Empty; var key1 = NibblePath.Parse("1"); @@ -635,13 +690,15 @@ public static void DeleteAssert(this SlottedArray map, in ReadOnlySpan key public static void GetAssert(this SlottedArray map, in ReadOnlySpan key, ReadOnlySpan expected) { - map.TryGet(NibblePath.FromKey(key), out var actual).Should().BeTrue(); + var retrieved = map.TryGet(NibblePath.FromKey(key), out var actual); + retrieved.Should().BeTrue(); actual.SequenceEqual(expected).Should().BeTrue("Actual data should equal expected"); } public static void GetAssert(this SlottedArray map, in NibblePath key, ReadOnlySpan expected) { - map.TryGet(key, out var actual).Should().BeTrue(); + var retrieved = map.TryGet(key, out var actual); + retrieved.Should().BeTrue(); actual.SequenceEqual(expected).Should().BeTrue("Actual data should equal expected"); } diff --git a/src/Paprika.Tests/Store/DbTests.cs b/src/Paprika.Tests/Store/DbTests.cs index 4839fcb9..7adf4a3b 100644 --- a/src/Paprika.Tests/Store/DbTests.cs +++ 
b/src/Paprika.Tests/Store/DbTests.cs @@ -1,7 +1,6 @@ using System.Buffers.Binary; using FluentAssertions; using Nethermind.Int256; -using NUnit.Framework; using Paprika.Crypto; using Paprika.Store; using static Paprika.Tests.Values; diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs index a1c9c7c1..fd0064e1 100644 --- a/src/Paprika/Data/SlottedArray.cs +++ b/src/Paprika/Data/SlottedArray.cs @@ -1,8 +1,10 @@ using System.Buffers.Binary; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using Paprika.Store; using Paprika.Utils; @@ -16,8 +18,8 @@ namespace Paprika.Data; /// The map is fixed in since as it's page dependent, hence the name. /// It is a modified version of a slot array, that does not externalize slot indexes. /// -/// It keeps an internal map, now implemented with a not-the-best loop over slots. -/// With the use of key prefix, it should be small enough and fast enough for now. +/// It keeps an internal map, that is aligned with the local hardware vector size, so that even vectors (0th, 2nd, 4th...) +/// are used for hashes, while odd (1st, 3rd, 5th...) are used to store slots. /// public readonly ref struct SlottedArray { @@ -27,31 +29,52 @@ public readonly ref struct SlottedArray private readonly ref Header _header; private readonly Span _data; + private static readonly int VectorSize = + Vector256.IsHardwareAccelerated ? 
Vector256.Count : Vector128.Count; + + private const int VectorsByBatch = 2; + private static readonly int DoubleVectorSize = VectorSize * VectorsByBatch; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int AlignToDoubleVectorSize(int count) => (count + (DoubleVectorSize - 1)) & -DoubleVectorSize; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int AlignToVectorSize(int count) => (count + (VectorSize - 1)) & -VectorSize; + public SlottedArray(Span buffer) { + Debug.Assert(buffer.Length > MinimalSizeWithNoData, + $"The buffer should be reasonably big, more than {MinimalSizeWithNoData}"); + _header = ref Unsafe.As(ref MemoryMarshal.GetReference(buffer)); _data = buffer.Slice(Header.Size); } - private readonly ref Slot this[int index] + public static int MinimalSizeWithNoData => DoubleVectorSize + Header.Size; + + private ref ushort GetHashRef(int index) { - get - { - var offset = index * Slot.Size; - if (offset >= _data.Length - Slot.Size) - { - ThrowIndexOutOfRangeException(); - } + // Hashes are at [0, VectorSize), then [VectorSize*2, VectorSize*3), then [VectorSize*4, VectorSize*5) + // To extract them extract the higher part and multiply by two, then add the lower part. - return ref Unsafe.As(ref Unsafe.Add(ref MemoryMarshal.GetReference(_data), offset)); + var uShortsPerVector = VectorSize / 2; + var mask = uShortsPerVector - 1; + var offset = (index & ~mask) * 2 + (index & mask); - [DoesNotReturn] - [StackTraceHidden] - static void ThrowIndexOutOfRangeException() - { - throw new IndexOutOfRangeException(); - } - } + return ref Unsafe.Add(ref Unsafe.As(ref MemoryMarshal.GetReference(_data)), offset); + } + + private ref Slot GetSlotRef(int index) + { + // Slots are at [VectorSize, VectorSize*2), then [VectorSize*3, VectorSize*4), then [VectorSize*5, VectorSize*6) + // To extract them extract the higher part and multiply by two, then add the lower part. 
+ // Additionally, add one ushorts per vector + var uShortsPerVector = VectorSize / 2; + + var mask = uShortsPerVector - 1; + var offset = (index & ~mask) * 2 + (index & mask) + uShortsPerVector; + + return ref Unsafe.Add(ref Unsafe.As(ref MemoryMarshal.GetReference(_data)), offset); } public bool TrySet(in NibblePath key, ReadOnlySpan data) @@ -79,7 +102,7 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO // does not exist yet, calculate total memory needed var total = GetTotalSpaceRequired(preamble, trimmed, data); - if (_header.Taken + total + Slot.Size > _data.Length) + if (_header.TakenAfterOneMoreSlot + total > _data.Length) { if (_header.Deleted == 0) { @@ -91,18 +114,20 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO Defragment(); // re-evaluate again - if (_header.Taken + total + Slot.Size > _data.Length) + if (_header.TakenAfterOneMoreSlot + total > _data.Length) { // not enough memory return false; } } - var at = _header.Low; - ref var slot = ref this[at / Slot.Size]; + var at = _header.Low / Slot.TotalSize; + + // Write hash at its place + GetHashRef(at) = hash; - // write slot - slot.Hash = hash; + // Writing slot at its place + ref var slot = ref GetSlotRef(at); slot.KeyPreamble = preamble; slot.ItemAddress = (ushort)(_data.Length - _header.High - total); @@ -120,7 +145,7 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO } // commit low and high - _header.Low += Slot.Size; + _header.Low += Slot.TotalSize; _header.High += (ushort)total; return true; @@ -129,7 +154,7 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO /// /// Gets how many slots are used in the map. 
/// - public int Count => _header.Low / Slot.Size; + public int Count => _header.Low / Slot.TotalSize; public int CapacityLeft => _data.Length - _header.Taken; @@ -171,13 +196,13 @@ public bool MoveNext() int index = _index + 1; var to = _map.Count; - ref var slot = ref _map[index]; + ref var slot = ref _map.GetSlotRef(index); while (index < to && slot.IsDeleted) // filter out deleted { // move by 1 index += 1; - slot = ref Unsafe.Add(ref slot, 1); + slot = ref _map.GetSlotRef(index); } if (index < to) @@ -194,9 +219,11 @@ public bool MoveNext() private void Build(out Item value) { - ref var slot = ref _map[_index]; - var span = _map.GetSlotPayload(ref slot); - var key = Slot.UnPrepareKey(slot.Hash, slot.KeyPreamble, span, _bytes.Span, out var data); + ref var slot = ref _map.GetSlotRef(_index); + var hash = _map.GetHashRef(_index); + + var span = _map.GetSlotPayload(_index); + var key = Slot.UnPrepareKey(hash, slot.KeyPreamble, span, _bytes.Span, out var data); value = new Item(key, data, _index); } @@ -223,16 +250,16 @@ public void MoveNonEmptyKeysTo(in MapSource destination, bool treatEmptyAsTombst for (int i = 0; i < to; i++) { - ref var slot = ref this[i]; + ref var slot = ref GetSlotRef(i); if (slot.IsDeleted) continue; if (slot.HasAtLeastOneNibble == false) continue; - var nibble = slot.Nibble0; + var nibble = slot.GetNibble0(GetHashRef(i)); ref readonly var map = ref MapSource.GetMap(destination, nibble); - var payload = GetSlotPayload(ref slot); + var payload = GetSlotPayload(i); Span data; @@ -247,10 +274,11 @@ public void MoveNonEmptyKeysTo(in MapSource destination, bool treatEmptyAsTombst data = payload; } + var hash = GetHashRef(i); if (data.IsEmpty && treatEmptyAsTombstone) { // special case for tombstones in overflows - var index = map.TryGetImpl(trimmed, slot.Hash, slot.KeyPreamble, out _); + var index = map.TryGetImpl(trimmed, hash, slot.KeyPreamble, out _); if (index != NotFound) { map.DeleteImpl(index); @@ -258,7 +286,7 @@ public void 
MoveNonEmptyKeysTo(in MapSource destination, bool treatEmptyAsTombst slot.MarkAsDeleted(); } - else if (map.TrySetImpl(slot.Hash, slot.KeyPreamble, trimmed, data)) + else if (map.TrySetImpl(hash, slot.KeyPreamble, trimmed, data)) { slot.MarkAsDeleted(); moved++; @@ -281,15 +309,15 @@ public void GatherCountStatistics(Span buckets) { Debug.Assert(buckets.Length == BucketCount); - var to = _header.Low / Slot.Size; + var to = _header.Low / Slot.TotalSize; for (var i = 0; i < to; i++) { - ref var slot = ref this[i]; + ref var slot = ref GetSlotRef(i); // extract only not deleted and these which have at least one nibble if (slot.IsDeleted == false && slot.HasAtLeastOneNibble) { - buckets[slot.Nibble0] += 1; + buckets[slot.GetNibble0(GetHashRef(i))] += 1; } } } @@ -327,8 +355,8 @@ public bool Delete(in NibblePath key) private void DeleteImpl(int index, bool collectTombstones = true) { - // mark as deleted first - this[index].MarkAsDeleted(); + // Mark as deleted first + MarkAsDeleted(index); _header.Deleted++; if (collectTombstones) @@ -337,10 +365,24 @@ private void DeleteImpl(int index, bool collectTombstones = true) } } + private void MarkAsDeleted(int index) + { + GetSlotRef(index).MarkAsDeleted(); + + // Provide a different hash so that further searches with TryGet won't be hitting this slot. + // + // We could use a constant value, but then on a collision with an actual value the tail + // performance would be terrible. + // + // The easiest way is to negate the hash that makes it not equal and yet is not a single value. + ref var hash = ref GetHashRef(index); + hash = (ushort)~hash; + } + private void Defragment() { // As data were fitting before, the will fit after so all the checks can be skipped - var count = _header.Low / Slot.Size; + var count = _header.Low / Slot.TotalSize; // The pointer where the writing in the array ended, move it up when written. 
var writeAt = 0; @@ -350,7 +392,7 @@ private void Defragment() for (int i = 0; i < count; i++) { - var slot = this[i]; + ref var slot = ref GetSlotRef(i); var addr = slot.ItemAddress; if (!slot.IsDeleted) @@ -370,10 +412,12 @@ private void Defragment() writtenTo = (ushort)(writtenTo - source.Length); var destination = _data.Slice(writtenTo, source.Length); source.CopyTo(destination); - ref var destinationSlot = ref this[writeAt]; + ref var destinationSlot = ref GetSlotRef(writeAt); + + // Copy hash + GetHashRef(writeAt) = GetHashRef(i); // Copy everything, just overwrite the address - destinationSlot.Hash = slot.Hash; destinationSlot.KeyPreamble = slot.KeyPreamble; destinationSlot.ItemAddress = writtenTo; @@ -386,7 +430,7 @@ private void Defragment() } // Finalize by setting the header - _header.Low = (ushort)(newCount * Slot.Size); + _header.Low = (ushort)(newCount * Slot.TotalSize); _header.High = (ushort)(_data.Length - writtenTo); _header.Deleted = 0; } @@ -399,18 +443,19 @@ private void CollectTombstones() // start with the last written and perform checks and cleanup till all the deleted are gone var index = Count - 1; - while (index >= 0 && this[index].IsDeleted) + while (index >= 0 && GetSlotRef(index).IsDeleted) { // undo writing low - _header.Low -= Slot.Size; + _header.Low -= Slot.TotalSize; // undo writing high - var slice = GetSlotPayload(ref this[index]); + var slice = GetSlotPayload(index); var total = slice.Length; _header.High = (ushort)(_header.High - total); // cleanup - this[index] = default; + // Hash is already replaced with its delete. 
Clean the slot + GetSlotRef(index) = default; _header.Deleted--; // move back by one to see if it's deleted as well @@ -445,70 +490,134 @@ public void Clear() "key encoding is delayed but it might be called twice, here + TrySet")] private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span data) { - var to = _header.Low; - - // uses vectorized search, treating slots as a Span - // if the found index is odd -> found a slot to be queried - - const int notFound = -1; - var span = MemoryMarshal.Cast(_data.Slice(0, to)); + var count = _header.Low / Slot.TotalSize; + var jump = DoubleVectorSize / sizeof(ushort); + var aligned = AlignToDoubleVectorSize(_header.Low) / sizeof(ushort); - var offset = 0; - int index = span.IndexOf(hash); - - if (index == notFound) - { - data = default; - return NotFound; - } + ref var d = ref Unsafe.As(ref MemoryMarshal.GetReference(_data)); - while (index != notFound) + if (Vector256.IsHardwareAccelerated) { - // move offset to the given position - offset += index; + var search = Vector256.Create(hash); - if ((offset & Slot.HashShiftForSearch) == Slot.HashShiftForSearch) + for (var i = 0; i < aligned; i += jump) { - var i = offset / 2; - - ref var slot = ref this[i]; - - // Preamble check is sufficient as IsDeleted is a special value of the preamble - if ( /*slot.IsDeleted == false &&*/ slot.KeyPreamble == preamble) + var value = Vector256.LoadUnsafe(ref d, (UIntPtr)i); + if (Vector256.EqualsAny(value, search)) { - var actual = GetSlotPayload(ref slot); + var matches = Vector256.Equals(value, search).ExtractMostSignificantBits(); - if (slot.HasKeyBytes) + if (i + jump >= aligned) { - if (NibblePath.TryReadFrom(actual, key, out var leftover)) - { - data = leftover; - return i; - } + // Undoing the multiplication done above to calculate aligned, to get the number of items. 
+ var alignedCount = aligned / VectorsByBatch; + var toClear = alignedCount - count; + + // This is the last in batch, masking is required to remove potential hits that are false positive + var hashesPerVector = VectorSize / sizeof(ushort); + var mask = (1U << hashesPerVector - toClear) - 1; + matches &= mask; } - else + + if (matches > 0) { - // The key is contained in the hash, all is equal and good to go! - data = actual; - return i; + var found = TryFind(i / VectorsByBatch, matches, key, preamble, out data); + if (found != NotFound) + { + return found; + } } } } + } + else if (Vector128.IsHardwareAccelerated) + { + var search = Vector128.Create(hash); - if (index + 1 >= span.Length) + for (var i = 0; i < aligned; i += jump) { - // the span is empty and there's not place to move forward - break; + var value = Vector128.LoadUnsafe(ref d, (UIntPtr)i); + if (Vector128.EqualsAny(value, search)) + { + var matches = Vector128.Equals(value, search).ExtractMostSignificantBits(); + + if (i + jump >= aligned) + { + // Undoing the multiplication done above to calculate aligned, to get the number of items. 
+ var alignedCount = aligned / VectorsByBatch; + var toClear = alignedCount - count; + + // This is the last in batch, masking is required to remove potential hits that are false positive + var hashesPerVector = VectorSize / sizeof(ushort); + var mask = (1U << hashesPerVector - toClear) - 1; + matches &= mask; + } + + if (matches > 0) + { + var found = TryFind(i / VectorsByBatch, matches, key, preamble, out data); + if (found != NotFound) + { + return found; + } + } + } } + } + else + { + ThrowNoVectorSupport(); + } - // move next: ushorts sliced to the next - // offset moved by 1 to align - span = span.Slice(index + 1); - offset += 1; + data = default; + return NotFound; - // move to next index - index = span.IndexOf(hash); + [MethodImpl(MethodImplOptions.NoInlining)] + void ThrowNoVectorSupport() + { + throw new NotSupportedException( + $"This platform does not support {nameof(Vector256)} nor {nameof(Vector128)}"); } + } + + private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out Span data) + { + var search = matches; + + Debug.Assert(search != 0); + + do + { + var index = BitOperations.TrailingZeroCount(search); + + // remove the match flag + search ^= 1U << index; + + var i = index + at; + + ref var slot = ref GetSlotRef(i); + + // Preamble check is sufficient as IsDeleted is a special value of the preamble + if ( /*slot.IsDeleted == false &&*/ slot.KeyPreamble == preamble) + { + var actual = GetSlotPayload(i); + + if (slot.HasKeyBytes) + { + if (NibblePath.TryReadFrom(actual, key, out var leftover)) + { + data = leftover; + return i; + } + } + else + { + // The key is contained in the hash, all is equal and good to go! + data = actual; + return i; + } + } + } while (search != 0); data = default; return NotFound; @@ -517,15 +626,19 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span /// Gets the payload pointed to by the given slot without the length prefix. 
/// - private Span GetSlotPayload(ref Slot slot) + [SkipLocalsInit] + private Span GetSlotPayload(int index) { - // assert whether the slot has a previous, if not use data.length - var previousSlotAddress = Unsafe.IsAddressLessThan(ref this[0], ref slot) - ? Unsafe.Add(ref slot, -1).ItemAddress - : _data.Length; + var addr = GetSlotRef(index).ItemAddress; - var length = previousSlotAddress - slot.ItemAddress; - return _data.Slice(slot.ItemAddress, length); + // If this is the first, just slice of data + if (index == 0) + return _data[addr..]; + + // Not the first, calculate on the basis of the address. + var previousSlotAddress = GetSlotRef(index - 1).ItemAddress; + var length = previousSlotAddress - addr; + return _data.Slice(addr, length); } /// @@ -541,19 +654,18 @@ public static ushort PrepareKeyForTests(in NibblePath key, out byte preamble, ou Slot.PrepareKey(key, out preamble, out trimmed); /// - /// The slot is a size of bytes. - /// - /// It consists of two ushort parts, - /// 1. and - /// 2. . - /// - /// is a result of that returns the value to be memoized in a slot. It only 2 bytes so collision may occur. - /// encodes all the metadata related to the slot. + /// The slot is a size of bytes and represents non-hash part of the entry. + /// The separation is done to make the search as vector aligned as possible. /// [StructLayout(LayoutKind.Sequential, Pack = sizeof(byte), Size = Size)] private struct Slot { - public const int Size = 4; + /// + /// The size of with hash combined. + /// + public const int TotalSize = Size + sizeof(ushort); + + public const int Size = 2; /// /// The address currently requires 12 bits [0-11] to address whole page. @@ -580,14 +692,6 @@ public ushort ItemAddress public void MarkAsDeleted() { KeyPreamble = KeyPreambleDelete; - - // Provide a different hash so that further searches with TryGet won't be hitting this slot. 
- // - // We could use a constant value, but then on a collision with an actual value the tail - // performance would be terrible. - // - // The easiest way is to negate the hash that makes it not equal and yet is not a single value. - Hash = (ushort)~Hash; } // Preamble uses all bits that AddressMask does not @@ -612,19 +716,16 @@ public void MarkAsDeleted() public bool HasAtLeastOneNibble => KeyPreamble != KeyPreambleEmpty; // Shift by 12, unless it's odd. If odd, shift by 8 - public byte Nibble0 + public byte GetNibble0(ushort hash) { - get - { - var count = KeyPreamble >> KeyPreambleLengthShift; + var count = KeyPreamble >> KeyPreambleLengthShift; - // Remove the length mask - var hash = (ushort)(Hash ^ GetHashMask(count)); + // Remove the length mask + var h = (ushort)(hash ^ GetHashMask(count)); - return (byte)(0x0F & (hash >> (3 * NibblePath.NibbleShift - - ((Raw >> KeyPreambleShift) & KeyPreambleOddBit) * - NibblePath.NibbleShift))); - } + return (byte)(0x0F & (h >> (3 * NibblePath.NibbleShift - + ((Raw >> KeyPreambleShift) & KeyPreambleOddBit) * + NibblePath.NibbleShift))); } public byte KeyPreamble @@ -637,21 +738,10 @@ public byte KeyPreamble private ushort Raw; - /// - /// Used for vectorized search - /// - public const int HashShiftForSearch = 1; - /// /// The memorized result of of this item. /// - public ushort Hash; - - public override readonly string ToString() - { - return - $"{nameof(Hash)}: {Hash}, {nameof(ItemAddress)}: {ItemAddress}"; - } + public readonly override string ToString() => $"{nameof(ItemAddress)}: {ItemAddress}"; /// /// Mask selected in a way that it can be shifted by 0, 1, 2 and @@ -866,6 +956,8 @@ private struct Header public ushort Deleted; public readonly ushort Taken => (ushort)(Low + High); + + public readonly ushort TakenAfterOneMoreSlot => (ushort)(AlignToDoubleVectorSize(Low + Slot.TotalSize) + High); } }