From a8184508949b2d675547fdbf987298e6c3756764 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Tue, 23 Jul 2024 13:52:38 +0200
Subject: [PATCH 01/19] more vectorization

---
 src/Paprika/Data/SlottedArray.cs | 136 +++++++++++++++++++++----------
 1 file changed, 93 insertions(+), 43 deletions(-)
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index a1c9c7c1..71c80206 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -1,8 +1,10 @@
 using System.Buffers.Binary;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
 using Paprika.Store;
 using Paprika.Utils;
 
@@ -445,69 +447,117 @@ public void Clear()
         "key encoding is delayed but it might be called twice, here + TrySet")]
     private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<byte> data)
     {
-        var to = _header.Low;
+        // Count is the number of ushort hashes to scan from Slots.
+        // Each Slot has the hash as the second one.
+        var count = _header.Low / sizeof(ushort);
 
-        // uses vectorized search, treating slots as a Span<ushort>
-        // if the found index is odd -> found a slot to be queried
+        ref var searchSpace = ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data));
+        ref var currentSearchSpace = ref searchSpace;
 
-        const int notFound = -1;
-        var span = MemoryMarshal.Cast<byte, ushort>(_data.Slice(0, to));
+        // Vectorized search use the same approach, with shuffling batches of two vectors at the time.
+        // As hash is held as ushort (2 bytes) at a Slot struct (4 bytes), we can use shuffle instruction to extract them.
+        // Hashes will be at indexes 1, 3, 5, ... so with shuffle indexes can be shuffled to lower (first vector)
+        // and upper (second vector).
+        // This amortizes the comparison making 2x less comparisons and no false matches (only hashes are compared).
+        // When found, TryFind is executed with all the matches from the given 2*vector size batch.
 
-        var offset = 0;
-        int index = span.IndexOf(hash);
-
-        if (index == notFound)
+        if (Vector256.IsHardwareAccelerated)
         {
-            data = default;
-            return NotFound;
-        }
+            // Consume 2 vectors at the time as each vector will have half of it shuffled away
+            const int batch = 2;
+            ref var twoVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, count - Vector256<ushort>.Count * batch);
 
-        while (index != notFound)
-        {
-            // move offset to the given position
-            offset += index;
+            var search = Vector256.Create(hash);
 
-            if ((offset & Slot.HashShiftForSearch) == Slot.HashShiftForSearch)
+            while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorAwayFromEnd))
             {
-                var i = offset / 2;
+                // There's more than 2 vectors to scan, use shuffling approach.
+                // Pack lower with first hashes, then higher with high
+                var a = Vector256.LoadUnsafe(ref currentSearchSpace);
+                var shuffleLow = Vector256.Create((ushort)1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0);
+                var lower = Vector256.Shuffle(a, shuffleLow).GetLower();
 
-                ref var slot = ref this[i];
+                var b = Vector256.LoadUnsafe(ref currentSearchSpace, (UIntPtr)Vector256<ushort>.Count);
+                var shuffleHigh = Vector256.Create((ushort)0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
+                var higher = Vector256.Shuffle(b, shuffleHigh).GetUpper();
 
-                // Preamble check is sufficient as IsDeleted is a special value of the preamble
-                if ( /*slot.IsDeleted == false &&*/ slot.KeyPreamble == preamble)
+                var combined = Vector256.Create(lower, higher);
+
+                if (Vector256.EqualsAny(combined, search))
                 {
-                    var actual = GetSlotPayload(ref slot);
+                    var matches = Vector256.Equals(combined, search).ExtractMostSignificantBits();
+                    var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
 
-                    if (slot.HasKeyBytes)
-                    {
-                        if (NibblePath.TryReadFrom(actual, key, out var leftover))
-                        {
-                            data = leftover;
-                            return i;
-                        }
-                    }
-                    else
+                    var found = TryFind(at, matches, key, preamble, out data);
+                    if (found != NotFound)
                     {
-                        // The key is contained in the hash, all is equal and good to go!
-                        data = actual;
-                        return i;
+                        return found;
                     }
                 }
+
+                currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<ushort>.Count * batch);
             }
 
-            if (index + 1 >= span.Length)
+            // there might be a leftover here! Optimize by checking whether it can be handled with a mask. If it can, loop one more time
+        }
+
+        // Leftover handling
+        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Slot.HashShiftForSearch);
+
+        ref var end = ref Unsafe.Add(ref searchSpace, count);
+        while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref end))
+        {
+            if (currentSearchSpace == hash)
             {
-                // the span is empty and there's not place to move forward
-                break;
+                var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
+                var found = TryFind(at, 1, key, preamble, out data);
+                if (found != NotFound)
+                {
+                    return found;
+                }
             }
 
-            // move next: ushorts sliced to the next
-            // offset moved by 1 to align
-            span = span.Slice(index + 1);
-            offset += 1;
+            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Slot.Size / sizeof(ushort));
+        }
+
+        data = default;
+        return NotFound;
+    }
+
+    private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out Span<byte> data)
+    {
+        var search = matches;
+        while (search != 0)
+        {
+            var index = BitOperations.TrailingZeroCount(matches);
+
+            // remove the match flag
+            search ^= (uint)(1 << index);
+
+            var i = index + at;
+
+            ref var slot = ref this[i];
+
+            // Preamble check is sufficient as IsDeleted is a special value of the preamble
+            if ( /*slot.IsDeleted == false &&*/ slot.KeyPreamble == preamble)
+            {
+                var actual = GetSlotPayload(ref slot);
 
-            // move to next index
-            index = span.IndexOf(hash);
+                if (slot.HasKeyBytes)
+                {
+                    if (NibblePath.TryReadFrom(actual, key, out var leftover))
+                    {
+                        data = leftover;
+                        return i;
+                    }
+                }
+                else
+                {
+                    // The key is contained in the hash, all is equal and good to go!
+                    data = actual;
+                    return i;
+                }
+            }
         }
 
         data = default;

From dfe3f40686227d4598f467cafbb7b5f8768dd2c7 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Tue, 23 Jul 2024 14:00:46 +0200
Subject: [PATCH 02/19] TryFind loop shortened

---
 src/Paprika/Data/SlottedArray.cs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index 71c80206..b834bafe 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -527,7 +527,10 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
     private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out Span<byte> data)
     {
         var search = matches;
-        while (search != 0)
+
+        Debug.Assert(search != 0);
+
+        do
         {
             var index = BitOperations.TrailingZeroCount(matches);
 
@@ -558,7 +561,7 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
                     return i;
                 }
             }
-        }
+        } while (search != 0);
 
         data = default;
         return NotFound;

From 902f3d14c3d14cf616cec3b1d18ff69c1cf93505 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Tue, 23 Jul 2024 15:32:50 +0200
Subject: [PATCH 03/19] Search fixed

---
 src/Paprika/Data/SlottedArray.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index b834bafe..b9e64769 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -458,7 +458,7 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
         // As hash is held as ushort (2 bytes) at a Slot struct (4 bytes), we can use shuffle instruction to extract them.
         // Hashes will be at indexes 1, 3, 5, ... so with shuffle indexes can be shuffled to lower (first vector)
         // and upper (second vector).
-        // This amortizes the comparison making 2x less comparisons and no false matches (only hashes are compared).
+        // This amortizes the comparison making 2x fewer comparisons and no false matches (only hashes are compared).
         // When found, TryFind is executed with all the matches from the given 2*vector size batch.
 
         if (Vector256.IsHardwareAccelerated)
@@ -498,7 +498,7 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                 currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<ushort>.Count * batch);
             }
 
-            // there might be a leftover here! Optimize by checking whether it can be handled with a mask. If it can, loop one more time
+            // There might be a leftover here! Optimize by checking whether it can be handled with a mask. If it can, loop one more time
         }
 
         // Leftover handling
@@ -532,7 +532,7 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
 
         do
         {
-            var index = BitOperations.TrailingZeroCount(matches);
+            var index = BitOperations.TrailingZeroCount(search);
 
             // remove the match flag
             search ^= (uint)(1 << index);

From a16c1c6faac2acc471ab169efdfe0e298696e975 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Tue, 23 Jul 2024 17:47:47 +0200
Subject: [PATCH 04/19] aligned search

---
 src/Paprika.Tests/Data/SlottedArrayTests.cs |  2 -
 src/Paprika/Data/SlottedArray.cs            | 59 +++++++++++++++++----
 2 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs
index 7dcb418e..b5ab4639 100644
--- a/src/Paprika.Tests/Data/SlottedArrayTests.cs
+++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs
@@ -1,8 +1,6 @@
 using FluentAssertions;
-using NUnit.Framework;
 using Paprika.Crypto;
 using Paprika.Data;
-using Paprika.Merkle;
 
 namespace Paprika.Tests.Data;
 
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index b9e64769..091167e1 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -452,6 +452,8 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
         var count = _header.Low / sizeof(ushort);
 
         ref var searchSpace = ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data));
+        ref var end = ref Unsafe.Add(ref searchSpace, count);
+
         ref var currentSearchSpace = ref searchSpace;
 
         // Vectorized search use the same approach, with shuffling batches of two vectors at the time.
@@ -465,11 +467,18 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
         {
             // Consume 2 vectors at the time as each vector will have half of it shuffled away
             const int batch = 2;
-            ref var twoVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, count - Vector256<ushort>.Count * batch);
+            var batchSize = Vector256<ushort>.Count * batch;
+
+            // Aligned count to the batch size.
+            var alignedCount = (count + (batchSize - 1)) & -batchSize;
+
+            // if aligned count ends before, add batch size to make it aligned.
+            ref var loopEnd = ref Unsafe.Add(ref searchSpace,
+                alignedCount <= _data.Length ? alignedCount : alignedCount - batchSize);
 
             var search = Vector256.Create(hash);
 
-            while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorAwayFromEnd))
+            while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref loopEnd))
             {
                 // There's more than 2 vectors to scan, use shuffling approach.
                 // Pack lower with first hashes, then higher with high
@@ -486,8 +495,21 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                 if (Vector256.EqualsAny(combined, search))
                 {
                     var matches = Vector256.Equals(combined, search).ExtractMostSignificantBits();
-                    var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
 
+                    // Check if this was not a test over the boundary
+                    if (Unsafe.IsAddressGreaterThan(ref Unsafe.Add(ref currentSearchSpace, batchSize), ref end))
+                    {
+                        // It was and it requires removing some bits that might be over the boundary.
+                        var shift = count - (alignedCount - batchSize);
+                        matches &= (1U << shift) - 1;
+                        if (matches == 0)
+                        {
+                            data = default;
+                            return NotFound;
+                        }
+                    }
+
+                    var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
                     var found = TryFind(at, matches, key, preamble, out data);
                     if (found != NotFound)
                     {
@@ -495,21 +517,36 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                     }
                 }
 
-                currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<ushort>.Count * batch);
+                currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, batchSize);
             }
+        }
 
-            // There might be a leftover here! Optimize by checking whether it can be handled with a mask. If it can, loop one more time
+        if (!Unsafe.IsAddressLessThan(ref currentSearchSpace, ref end))
+        {
+            data = default;
+            return NotFound;
         }
+        else
+        {
+            return SlowTail(key, hash, preamble, out data, ref currentSearchSpace, ref end, ref searchSpace);
+        }
+    }
 
+    /// <summary>
+    /// Slow that is just a scan over the leftovers.
+    /// </summary>
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private int SlowTail(in NibblePath key, ushort hash, byte preamble, out Span<byte> data, ref ushort currentSearchSpace, ref ushort end,
+        ref ushort searchSpace)
+    {
         // Leftover handling
-        currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Slot.HashShiftForSearch);
+        ref var search = ref Unsafe.Add(ref currentSearchSpace, Slot.HashShiftForSearch);
 
-        ref var end = ref Unsafe.Add(ref searchSpace, count);
-        while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref end))
+        while (!Unsafe.IsAddressGreaterThan(ref search, ref end))
         {
-            if (currentSearchSpace == hash)
+            if (search == hash)
             {
-                var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
+                var at = (int)Unsafe.ByteOffset(ref searchSpace, ref search) / Slot.Size;
                 var found = TryFind(at, 1, key, preamble, out data);
                 if (found != NotFound)
                 {
@@ -517,7 +554,7 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                 }
             }
 
-            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Slot.Size / sizeof(ushort));
+            search = ref Unsafe.Add(ref currentSearchSpace, Slot.Size / sizeof(ushort));
         }
 
         data = default;

From 2e8823f8a80e8d0f1dbab46180a8e57992a5da93 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Tue, 23 Jul 2024 18:42:00 +0200
Subject: [PATCH 05/19] single vector at a time

---
 src/Paprika/Data/SlottedArray.cs | 35 ++++++++++++++------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index 091167e1..e6f558e1 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -466,8 +466,7 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
         if (Vector256.IsHardwareAccelerated)
         {
             // Consume 2 vectors at the time as each vector will have half of it shuffled away
-            const int batch = 2;
-            var batchSize = Vector256<ushort>.Count * batch;
+            var batchSize = Vector256<ushort>.Count;
 
             // Aligned count to the batch size.
             var alignedCount = (count + (batchSize - 1)) & -batchSize;
@@ -482,26 +481,19 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
             {
                 // There's more than 2 vectors to scan, use shuffling approach.
                 // Pack lower with first hashes, then higher with high
-                var a = Vector256.LoadUnsafe(ref currentSearchSpace);
-                var shuffleLow = Vector256.Create((ushort)1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0);
-                var lower = Vector256.Shuffle(a, shuffleLow).GetLower();
+                var value = Vector256.LoadUnsafe(ref currentSearchSpace);
 
-                var b = Vector256.LoadUnsafe(ref currentSearchSpace, (UIntPtr)Vector256<ushort>.Count);
-                var shuffleHigh = Vector256.Create((ushort)0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
-                var higher = Vector256.Shuffle(b, shuffleHigh).GetUpper();
-
-                var combined = Vector256.Create(lower, higher);
-
-                if (Vector256.EqualsAny(combined, search))
+                if (Vector256.EqualsAny(value, search))
                 {
-                    var matches = Vector256.Equals(combined, search).ExtractMostSignificantBits();
+                    var matches = Vector256.Equals(value, search).ExtractMostSignificantBits();
+                    matches &= 0xAAAAAAAA; // 0b101010 aligned with placement of Slot.Hash
 
                     // Check if this was not a test over the boundary
                     if (Unsafe.IsAddressGreaterThan(ref Unsafe.Add(ref currentSearchSpace, batchSize), ref end))
                     {
                         // It was and it requires removing some bits that might be over the boundary.
                         var shift = count - (alignedCount - batchSize);
-                        matches &= (1U << shift) - 1;
+                        matches &= (1U << (shift * 2)) - 1;
                         if (matches == 0)
                         {
                             data = default;
@@ -509,11 +501,14 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                         }
                     }
 
-                    var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
-                    var found = TryFind(at, matches, key, preamble, out data);
-                    if (found != NotFound)
+                    if (matches > 0)
                     {
-                        return found;
+                        var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
+                        var found = TryFind(at, matches, key, preamble, out data);
+                        if (found != NotFound)
+                        {
+                            return found;
+                        }    
                     }
                 }
 
@@ -569,10 +564,10 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
 
         do
         {
-            var index = BitOperations.TrailingZeroCount(search);
+            var index = (BitOperations.TrailingZeroCount(search) - 1) >> 1;
 
             // remove the match flag
-            search ^= (uint)(1 << index);
+            search ^= (uint)(0b10 << (index * 2));
 
             var i = index + at;
 

From b3e3d981806450c34d17127c9c1f67ad473fc742 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Wed, 24 Jul 2024 14:45:07 +0200
Subject: [PATCH 06/19] SlottedArray made more vector-aware

---
 ...Defragment_when_no_more_space.verified.bin | Bin 88 -> 160 bytes
 ...rrayTests.Enumerate_all_odd=0.verified.bin | Bin 256 -> 256 bytes
 ...rrayTests.Enumerate_all_odd=1.verified.bin | Bin 256 -> 256 bytes
 ...key_oddStart=0_lengthCutOff=0.verified.bin | Bin 128 -> 128 bytes
 ...key_oddStart=0_lengthCutOff=1.verified.bin | Bin 128 -> 128 bytes
 ...key_oddStart=1_lengthCutOff=0.verified.bin | Bin 128 -> 128 bytes
 ...key_oddStart=1_lengthCutOff=1.verified.bin | Bin 128 -> 128 bytes
 ...Set_Get_Delete_Get_AnotherSet.verified.bin | Bin 48 -> 105 bytes
 ...ottedArrayTests.Set_Get_Empty.verified.bin | Bin 48 -> 128 bytes
 ...yTests.Small_keys_compression.verified.bin | Bin 256 -> 512 bytes
 ...edArrayTests.Update_in_resize.verified.bin | Bin 56 -> 105 bytes
 ...ttedArrayTests.Update_in_situ.verified.bin | Bin 48 -> 128 bytes
 src/Paprika.Tests/Data/SlottedArrayTests.cs   | 110 ++++++-
 src/Paprika/Data/SlottedArray.cs              | 306 ++++++++----------
 14 files changed, 221 insertions(+), 195 deletions(-)

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Defragment_when_no_more_space.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Defragment_when_no_more_space.verified.bin
index 422ac9836a4c4ecec874804d3f28463a66428181..fbae61d59c0f0efe1a1f1116d22c5bb04df3c920 100644
GIT binary patch
literal 160
vcmd;JaAE)fc1AWfFarlsz94J?4k<js6>NBcs(J+@6Eh1d8#@DbT2>wakctHX

delta 33
jcmZ3$7$MHV;KTp|`U}_@`4+ITfjJBn6BA`7R;vO4Qg{WU

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=0.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=0.verified.bin
index 038a806a820ded711f20668143ed7fc4c77b32a5..c0c5bb43497a06c8d7d6c49bcf4ea9aebcd673d4 100644
GIT binary patch
literal 256
zcmWe(;Aa2<E(Qi(UUqqQO?EIBgZR$y)!}o)`vor-U`P!u0S;byaS2HlUU^Mbby<0F
E05b;&&;S4c

literal 256
zcmWe(;Aa4V?+jcFUmX~DKR58Qzh5BF{&ImPJ6K}45gfeo;u4Z9yz-i=>az0U0O&sm
A&;S4c

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=1.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_all_odd=1.verified.bin
index 5635d1b2c4c4cceb6a3f266383c7ea80d4f49852..9002515a268af18e4979dec8690688b6218ce56f 100644
GIT binary patch
literal 256
zcmWe(;Aa2<E`|l{yvupN^G*kIF^KOBUkyGNyx;J01BTSl5@3^85SNf-5|GzaRhN|)
F2LPOu3vK`a

literal 256
zcmWe(;Aa4V?+jcFUkw(pe=gu%{(b}R_m>-Zr-LPi8^I>8ATA-vBp|P;sxB)p4gf#9
B3vK`a

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=0.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=0.verified.bin
index 8addd03b220cee7e65fff06c07cb8e27e2b2de14..4b0f4d8fe13631636505689b3b5421b4ff465d83 100644
GIT binary patch
delta 27
gcmZo*Y+&SKVNhTI0mZP1f^tj|3nmJhO-xV#06RhjO8@`>

delta 29
icmZo*Y+&SJVNhTIfrtf)VG|AYCpsuhRA8N$pa1|*4hBmA

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=1.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=0_lengthCutOff=1.verified.bin
index 4511a3b4bb3ebfa3d199540545dec35710b3997f..d3f83608530a60b06c539d1578e05885910bcc57 100644
GIT binary patch
delta 27
gcmZo*Y+&SKVNhTI0jIEuf^tj|3nmJhO-xV#06h=|YybcN

delta 29
icmZo*Y+&SJVNhTIfrtf8VG|AYCpsuhRA8N$pa1|++y-m_

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=0.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=0.verified.bin
index 9db9ecc30ae25a9c0d364741f7167143ed488c2d..93356408ed257a5e181d906e0a73b590dd59b767 100644
GIT binary patch
delta 27
gcmZo*Y+&SKVNhTIffr&E1?89`HcS*Wo0y;g07Fy-;s5{u

delta 29
icmZo*Y+&SJVNhTIfrt$+#3mZ*Pjpb2sK7cgK>+|)pa$Xq

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=1.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Enumerate_long_key_oddStart=1_lengthCutOff=1.verified.bin
index cd122417d5de73c113f704392088c33869788039..6b1248c57d3eb8dc90b3ad45eb9da3a03d5935dc 100644
GIT binary patch
delta 27
gcmZo*Y+&SKVUTA4ffs5M1?89`H%t^XpO`2Q07M-H^8f$<

delta 29
icmZo*Y+&SJVUTA4fyfOn)Fv7lOmtA3sK7EYQ62zTcn0$T

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Delete_Get_AnotherSet.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Delete_Get_AnotherSet.verified.bin
index bc14e4a6a9ad070b812a4a83152150e5b0e36786..7cc70d695b3b5c38b0d5544586be4ba354fdad9a 100644
GIT binary patch
literal 105
kcmZQ!P-FlBb})&Xa9Ds_4kS~-$i&RT%Er!sotBjc0AT(CDgXcg

literal 48
ecmZQ!P-FlB_66)9s)CV;nT3^&odG*7D-QrTBLVRM

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Empty.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Set_Get_Empty.verified.bin
index 4380154428a1277af9579812b0a8f0775c35134e..2c6e155bbf59f8bb55f8fdd30f1dd9480d600c9e 100644
GIT binary patch
literal 128
icmZQ!kY@k^b})&Xh+KeM4zElFBNH<VD;qllb{YVFssaT7

literal 48
dcmZQ!kY@k^&IRmXnxTS`iJ66!jhz8I4FEO#0kr@C

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Small_keys_compression.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Small_keys_compression.verified.bin
index cd4b4c6b0a7316bc23d7904935d4f6dd4fb079b9..4e9a9f9c98cf4e1cd35d1caa1052ccc8d79e259f 100644
GIT binary patch
delta 187
zcmZ|DO-cfB06_7Vj-{D>sHNtl`{oo6yjL#W{%_zRB1DD=5s8Qh5s?Uqkcfzoh}K2>
z9>3M=X1dW?j}CurQzTD;UG~_aM276|-(Zt1B68n;`RUpZ-+lAd7gs*}<kCkUT*xs^
z>NIH4Cgy}P74|vch#FN6=gypZ@13{ac<scoBd<zcnmTkKrz9c7(S_%Rkc58xk6cJQ
EKR&NHP5=M^

delta 157
zcmWl~OG-ik06@{JS!vqGfBDI|wTL@-`^sTcX(d5~$Ri{qL_$PFBt%3aL?lAQadjrA
z5o<*6?g{v}my?|&w-W!6#J8Mn<a{lum7IP_^ivL&!a^c*Ihx7whs36GIFZs=$|JcP
u%GE%w-=)@<N>8e9lIcqJRW3S`evv}>cd;$GmgJwM{v<ce$J>W668r*&-z;<h

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_resize.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_resize.verified.bin
index 6fcca8ada9c8ab674dcc0abd1e91a79a43c366b2..bc96d424bce8f95c2e272a484ef8a7e4e8f4c835 100644
GIT binary patch
literal 105
kcmZQ!P-FlBb})&Xa9Ds_4kS~-$i&RT%Er!somN!`0AUsaIsgCw

literal 56
fcmZQ!P-FlB{srt{8cI|!GBLBTvavH@r&ZMfKwAO{

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_situ.verified.bin b/src/Paprika.Tests/Data/SlottedArrayTests.Update_in_situ.verified.bin
index 6456fd8408442dca53ae8a343b9501739015fd63..e80592aace436e35e89afa2f75a88ffe84f91e7c 100644
GIT binary patch
literal 128
lcmZQ!P-FlBc1AFTlL%jcQwpC%1tSwP3o9Et19n<f9RPu#0#X0~

literal 48
fcmZQ!P-FlB_66*WK&pa~iJ66!jhz8It*Q<HIA8(-

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs
index b5ab4639..063cdad7 100644
--- a/src/Paprika.Tests/Data/SlottedArrayTests.cs
+++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs
@@ -1,6 +1,9 @@
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
 using FluentAssertions;
 using Paprika.Crypto;
 using Paprika.Data;
+using Paprika.Store;
 
 namespace Paprika.Tests.Data;
 
@@ -20,7 +23,7 @@ public Task Set_Get_Delete_Get_AnotherSet()
     {
         var key0 = Values.Key0.Span;
 
-        Span<byte> span = stackalloc byte[48];
+        Span<byte> span = stackalloc byte[SlottedArray.MinimalSizeWithNoData + key0.Length + Data0.Length];
         var map = new SlottedArray(span);
 
         map.SetAssert(key0, Data0);
@@ -118,7 +121,7 @@ public Task Set_Get_Empty()
     {
         var key0 = Values.Key0.Span;
 
-        Span<byte> span = stackalloc byte[48];
+        Span<byte> span = stackalloc byte[128];
         var map = new SlottedArray(span);
 
         var data = ReadOnlySpan<byte>.Empty;
@@ -149,7 +152,7 @@ public Task Set_Get_Empty()
     public Task Defragment_when_no_more_space()
     {
         // by trial and error, found the smallest value that will allow to put these two
-        Span<byte> span = stackalloc byte[88];
+        Span<byte> span = stackalloc byte[SlottedArray.MinimalSizeWithNoData + 88];
         var map = new SlottedArray(span);
 
         var key0 = Values.Key0.Span;
@@ -177,7 +180,7 @@ public Task Defragment_when_no_more_space()
     public Task Update_in_situ()
     {
         // by trial and error, found the smallest value that will allow to put these two
-        Span<byte> span = stackalloc byte[48];
+        Span<byte> span = stackalloc byte[128];
         var map = new SlottedArray(span);
 
         var key1 = Values.Key1.Span;
@@ -194,12 +197,12 @@ public Task Update_in_situ()
     [Test]
     public Task Update_in_resize()
     {
+        var key0 = Values.Key0.Span;
+
         // Update the value, with the next one being bigger.
-        Span<byte> span = stackalloc byte[56];
+        Span<byte> span = stackalloc byte[SlottedArray.MinimalSizeWithNoData + key0.Length + Data0.Length];
         var map = new SlottedArray(span);
 
-        var key0 = Values.Key0.Span;
-
         map.SetAssert(key0, Data0);
         map.SetAssert(key0, Data2);
 
@@ -212,7 +215,7 @@ public Task Update_in_resize()
     [Test]
     public Task Small_keys_compression()
     {
-        Span<byte> span = stackalloc byte[256];
+        Span<byte> span = stackalloc byte[512];
         var map = new SlottedArray(span);
 
         Span<byte> key = stackalloc byte[1];
@@ -242,6 +245,81 @@ public Task Small_keys_compression()
         return Verify(span.ToArray());
     }
 
+    [Test(Description = "Make a lot of requests to make breach the vector count")]
+    public void Breach_VectorSize_with_key_count()
+    {
+        const int seed = 13;
+        var random = new Random(seed);
+        Span<byte> key = stackalloc byte[4];
+
+        var map = new SlottedArray(new byte[Page.PageSize]);
+
+        const int count = 257;
+
+        for (var i = 0; i < count; i++)
+        {
+            random.NextBytes(key);
+            map.SetAssert(key, [(byte)(i & 255)]);
+        }
+
+        // reset
+        random = new Random(seed);
+        for (var i = 0; i < count; i++)
+        {
+            random.NextBytes(key);
+            map.GetAssert(key, [(byte)(i & 255)]);
+        }
+    }
+
+    [Test]
+    public void Roll_over()
+    {
+        const int seed = 13;
+        var random = new Random(seed);
+        Span<byte> key = stackalloc byte[4];
+
+        var map = new SlottedArray(new byte[1024]);
+
+        byte count = 0;
+
+        random.NextBytes(key);
+        while (map.TrySet(NibblePath.FromKey(key), [count]))
+        {
+            count++;
+            random.NextBytes(key);
+        }
+
+        // reset, delete some
+        //random = new Random(seed);
+
+        using var e = map.EnumerateAll();
+        for (var i = 0; i < count; i++)
+        {
+            //random.NextBytes(key);
+            e.MoveNext().Should().BeTrue();
+
+            if (ShouldBeDeleted(i))
+            {
+                map.Delete(e.Current);
+                //map.Delete(NibblePath.FromKey(key)).Should().BeTrue();
+            }
+        }
+
+        // reset, assert
+        random = new Random(seed);
+        for (var i = 0; i < count; i++)
+        {
+            random.NextBytes(key);
+
+            var exist = map.TryGet(NibblePath.FromKey(key), out var data);
+            exist.Should().NotBe(ShouldBeDeleted(i));
+        }
+
+        return;
+
+        static bool ShouldBeDeleted(int i) => i % 2 == 0;
+    }
+
     private static ReadOnlySpan<byte> Data(byte key) => new[] { key };
 
     [Test]
@@ -452,14 +530,14 @@ public void Move_to_4()
     public void Move_to_8()
     {
         var original = new SlottedArray(stackalloc byte[512]);
-        var copy0 = new SlottedArray(stackalloc byte[64]);
-        var copy1 = new SlottedArray(stackalloc byte[64]);
-        var copy2 = new SlottedArray(stackalloc byte[64]);
-        var copy3 = new SlottedArray(stackalloc byte[64]);
-        var copy4 = new SlottedArray(stackalloc byte[64]);
-        var copy5 = new SlottedArray(stackalloc byte[64]);
-        var copy6 = new SlottedArray(stackalloc byte[64]);
-        var copy7 = new SlottedArray(stackalloc byte[64]);
+        var copy0 = new SlottedArray(stackalloc byte[128]);
+        var copy1 = new SlottedArray(stackalloc byte[128]);
+        var copy2 = new SlottedArray(stackalloc byte[128]);
+        var copy3 = new SlottedArray(stackalloc byte[128]);
+        var copy4 = new SlottedArray(stackalloc byte[128]);
+        var copy5 = new SlottedArray(stackalloc byte[128]);
+        var copy6 = new SlottedArray(stackalloc byte[128]);
+        var copy7 = new SlottedArray(stackalloc byte[128]);
 
         var key0 = NibblePath.Empty;
         var key1 = NibblePath.Parse("1");
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index e6f558e1..c8c55d99 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -18,8 +18,8 @@ namespace Paprika.Data;
 /// The map is fixed in since as it's page dependent, hence the name.
 /// It is a modified version of a slot array, that does not externalize slot indexes.
 ///
-/// It keeps an internal map, now implemented with a not-the-best loop over slots.
-/// With the use of key prefix, it should be small enough and fast enough for now.
+/// It keeps an internal map, that is aligned with the local hardware vector size, so that even vectors (0th, 2nd, 4th...)
+/// are used for hashes, while odd (1st, 3rd, 5th...) are used to store slots.
 /// </remarks>
 public readonly ref struct SlottedArray
 {
@@ -29,31 +29,47 @@ public readonly ref struct SlottedArray
     private readonly ref Header _header;
     private readonly Span<byte> _data;
 
+    private static int VectorSize => Vector256.IsHardwareAccelerated ? Vector256<byte>.Count : Vector128<byte>.Count;
+
+    private static int DoubleVectorSize => VectorSize * 2;
+
+    private static int AlignToDoubleVectorSize(int count) => (count + (DoubleVectorSize - 1)) & -DoubleVectorSize;
+
+
     public SlottedArray(Span<byte> buffer)
     {
+        Debug.Assert(buffer.Length > MinimalSizeWithNoData,
+            $"The buffer should be reasonably big, more than {MinimalSizeWithNoData}");
+
         _header = ref Unsafe.As<byte, Header>(ref MemoryMarshal.GetReference(buffer));
         _data = buffer.Slice(Header.Size);
     }
 
-    private readonly ref Slot this[int index]
+    public static int MinimalSizeWithNoData => DoubleVectorSize + Header.Size;
+
+    private ref ushort GetHashRef(int index)
     {
-        get
-        {
-            var offset = index * Slot.Size;
-            if (offset >= _data.Length - Slot.Size)
-            {
-                ThrowIndexOutOfRangeException();
-            }
+        // Hashes are at [0, VectorSize), then [VectorSize*2, VectorSize*3), then [VectorSize*4, VectorSize*5)
+        // To locate a hash, take the higher part of the index, multiply it by two, then add the lower part.
 
-            return ref Unsafe.As<byte, Slot>(ref Unsafe.Add(ref MemoryMarshal.GetReference(_data), offset));
+        var uShortsPerVector = VectorSize / 2;
+        var mask = uShortsPerVector - 1;
+        var offset = (index & ~mask) * 2 + (index & mask);
 
-            [DoesNotReturn]
-            [StackTraceHidden]
-            static void ThrowIndexOutOfRangeException()
-            {
-                throw new IndexOutOfRangeException();
-            }
-        }
+        return ref Unsafe.Add(ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data)), offset);
+    }
+
+    private ref Slot GetSlotRef(int index)
+    {
+        // Slots are at [VectorSize, VectorSize*2), then [VectorSize*3, VectorSize*4), then [VectorSize*5, VectorSize*6) 
+        // To locate a slot, take the higher part of the index, multiply it by two, then add the lower part.
+        // Additionally, add one vector's worth of ushorts to skip over the preceding vector of hashes.
+        var uShortsPerVector = VectorSize / 2;
+
+        var mask = uShortsPerVector - 1;
+        var offset = (index & ~mask) * 2 + (index & mask) + uShortsPerVector;
+
+        return ref Unsafe.Add(ref Unsafe.As<byte, Slot>(ref MemoryMarshal.GetReference(_data)), offset);
     }
 
     public bool TrySet(in NibblePath key, ReadOnlySpan<byte> data)
@@ -81,7 +97,7 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO
         // does not exist yet, calculate total memory needed
         var total = GetTotalSpaceRequired(preamble, trimmed, data);
 
-        if (_header.Taken + total + Slot.Size > _data.Length)
+        if (_header.TakenAfterOneMoreSlot + total > _data.Length)
         {
             if (_header.Deleted == 0)
             {
@@ -93,18 +109,20 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO
             Defragment();
 
             // re-evaluate again
-            if (_header.Taken + total + Slot.Size > _data.Length)
+            if (_header.TakenAfterOneMoreSlot + total > _data.Length)
             {
                 // not enough memory
                 return false;
             }
         }
 
-        var at = _header.Low;
-        ref var slot = ref this[at / Slot.Size];
+        var at = _header.Low / Slot.TotalSize;
 
-        // write slot
-        slot.Hash = hash;
+        // Write hash at its place
+        GetHashRef(at) = hash;
+
+        // Writing slot at its place
+        ref var slot = ref GetSlotRef(at);
         slot.KeyPreamble = preamble;
         slot.ItemAddress = (ushort)(_data.Length - _header.High - total);
 
@@ -122,7 +140,7 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO
         }
 
         // commit low and high
-        _header.Low += Slot.Size;
+        _header.Low += Slot.TotalSize;
         _header.High += (ushort)total;
 
         return true;
@@ -131,7 +149,7 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO
     /// <summary>
     /// Gets how many slots are used in the map.
     /// </summary>
-    public int Count => _header.Low / Slot.Size;
+    public int Count => _header.Low / Slot.TotalSize;
 
     public int CapacityLeft => _data.Length - _header.Taken;
 
@@ -173,13 +191,13 @@ public bool MoveNext()
             int index = _index + 1;
             var to = _map.Count;
 
-            ref var slot = ref _map[index];
+            ref var slot = ref _map.GetSlotRef(index);
 
             while (index < to && slot.IsDeleted) // filter out deleted
             {
                 // move by 1
                 index += 1;
-                slot = ref Unsafe.Add(ref slot, 1);
+                slot = ref _map.GetSlotRef(index);
             }
 
             if (index < to)
@@ -196,9 +214,11 @@ public bool MoveNext()
 
         private void Build(out Item value)
         {
-            ref var slot = ref _map[_index];
-            var span = _map.GetSlotPayload(ref slot);
-            var key = Slot.UnPrepareKey(slot.Hash, slot.KeyPreamble, span, _bytes.Span, out var data);
+            ref var slot = ref _map.GetSlotRef(_index);
+            var hash = _map.GetHashRef(_index);
+
+            var span = _map.GetSlotPayload(_index);
+            var key = Slot.UnPrepareKey(hash, slot.KeyPreamble, span, _bytes.Span, out var data);
 
             value = new Item(key, data, _index);
         }
@@ -225,16 +245,16 @@ public void MoveNonEmptyKeysTo(in MapSource destination, bool treatEmptyAsTombst
 
         for (int i = 0; i < to; i++)
         {
-            ref var slot = ref this[i];
+            ref var slot = ref GetSlotRef(i);
             if (slot.IsDeleted)
                 continue;
 
             if (slot.HasAtLeastOneNibble == false)
                 continue;
 
-            var nibble = slot.Nibble0;
+            var nibble = slot.GetNibble0(GetHashRef(i));
             ref readonly var map = ref MapSource.GetMap(destination, nibble);
-            var payload = GetSlotPayload(ref slot);
+            var payload = GetSlotPayload(i);
 
             Span<byte> data;
 
@@ -249,10 +269,11 @@ public void MoveNonEmptyKeysTo(in MapSource destination, bool treatEmptyAsTombst
                 data = payload;
             }
 
+            var hash = GetHashRef(i);
             if (data.IsEmpty && treatEmptyAsTombstone)
             {
                 // special case for tombstones in overflows
-                var index = map.TryGetImpl(trimmed, slot.Hash, slot.KeyPreamble, out _);
+                var index = map.TryGetImpl(trimmed, hash, slot.KeyPreamble, out _);
                 if (index != NotFound)
                 {
                     map.DeleteImpl(index);
@@ -260,7 +281,7 @@ public void MoveNonEmptyKeysTo(in MapSource destination, bool treatEmptyAsTombst
 
                 slot.MarkAsDeleted();
             }
-            else if (map.TrySetImpl(slot.Hash, slot.KeyPreamble, trimmed, data))
+            else if (map.TrySetImpl(hash, slot.KeyPreamble, trimmed, data))
             {
                 slot.MarkAsDeleted();
                 moved++;
@@ -283,15 +304,15 @@ public void GatherCountStatistics(Span<ushort> buckets)
     {
         Debug.Assert(buckets.Length == BucketCount);
 
-        var to = _header.Low / Slot.Size;
+        var to = _header.Low / Slot.TotalSize;
         for (var i = 0; i < to; i++)
         {
-            ref var slot = ref this[i];
+            ref var slot = ref GetSlotRef(i);
 
             // extract only not deleted and these which have at least one nibble
             if (slot.IsDeleted == false && slot.HasAtLeastOneNibble)
             {
-                buckets[slot.Nibble0] += 1;
+                buckets[slot.GetNibble0(GetHashRef(i))] += 1;
             }
         }
     }
@@ -329,8 +350,8 @@ public bool Delete(in NibblePath key)
 
     private void DeleteImpl(int index, bool collectTombstones = true)
     {
-        // mark as deleted first
-        this[index].MarkAsDeleted();
+        // Mark as deleted first
+        MarkAsDeleted(index);
         _header.Deleted++;
 
         if (collectTombstones)
@@ -339,10 +360,24 @@ private void DeleteImpl(int index, bool collectTombstones = true)
         }
     }
 
+    private void MarkAsDeleted(int index)
+    {
+        GetSlotRef(index).MarkAsDeleted();
+
+        // Provide a different hash so that further searches with TryGet won't be hitting this slot.
+        //
+        // We could use a constant value, but then on a collision with an actual value the tail
+        // performance would be terrible.
+        //
+        // The easiest way is to negate the hash that makes it not equal and yet is not a single value.
+        ref var hash = ref GetHashRef(index);
+        hash = (ushort)~hash;
+    }
+
     private void Defragment()
     {
         // As data were fitting before, the will fit after so all the checks can be skipped
-        var count = _header.Low / Slot.Size;
+        var count = _header.Low / Slot.TotalSize;
 
         // The pointer where the writing in the array ended, move it up when written.
         var writeAt = 0;
@@ -352,7 +387,7 @@ private void Defragment()
 
         for (int i = 0; i < count; i++)
         {
-            var slot = this[i];
+            ref var slot = ref GetSlotRef(i);
             var addr = slot.ItemAddress;
 
             if (!slot.IsDeleted)
@@ -372,10 +407,12 @@ private void Defragment()
                     writtenTo = (ushort)(writtenTo - source.Length);
                     var destination = _data.Slice(writtenTo, source.Length);
                     source.CopyTo(destination);
-                    ref var destinationSlot = ref this[writeAt];
+                    ref var destinationSlot = ref GetSlotRef(writeAt);
+
+                    // Copy hash
+                    GetHashRef(writeAt) = GetHashRef(i);
 
                     // Copy everything, just overwrite the address
-                    destinationSlot.Hash = slot.Hash;
                     destinationSlot.KeyPreamble = slot.KeyPreamble;
                     destinationSlot.ItemAddress = writtenTo;
 
@@ -388,7 +425,7 @@ private void Defragment()
         }
 
         // Finalize by setting the header
-        _header.Low = (ushort)(newCount * Slot.Size);
+        _header.Low = (ushort)(newCount * Slot.TotalSize);
         _header.High = (ushort)(_data.Length - writtenTo);
         _header.Deleted = 0;
     }
@@ -401,18 +438,19 @@ private void CollectTombstones()
         // start with the last written and perform checks and cleanup till all the deleted are gone
         var index = Count - 1;
 
-        while (index >= 0 && this[index].IsDeleted)
+        while (index >= 0 && GetSlotRef(index).IsDeleted)
         {
             // undo writing low
-            _header.Low -= Slot.Size;
+            _header.Low -= Slot.TotalSize;
 
             // undo writing high
-            var slice = GetSlotPayload(ref this[index]);
+            var slice = GetSlotPayload(index);
             var total = slice.Length;
             _header.High = (ushort)(_header.High - total);
 
             // cleanup
-            this[index] = default;
+            // The hash is expected to have been replaced already when the entry was marked as deleted. Clean the slot.
+            GetSlotRef(index) = default;
             _header.Deleted--;
 
             // move back by one to see if it's deleted as well
@@ -447,111 +485,43 @@ public void Clear()
         "key encoding is delayed but it might be called twice, here + TrySet")]
     private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<byte> data)
     {
-        // Count is the number of ushort hashes to scan from Slots.
-        // Each Slot has the hash as the second one.
-        var count = _header.Low / sizeof(ushort);
-
-        ref var searchSpace = ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data));
-        ref var end = ref Unsafe.Add(ref searchSpace, count);
+        var aligned = AlignToDoubleVectorSize(_header.Low);
+        var count = _header.Low / Slot.TotalSize;
 
-        ref var currentSearchSpace = ref searchSpace;
-
-        // Vectorized search use the same approach, with shuffling batches of two vectors at the time.
-        // As hash is held as ushort (2 bytes) at a Slot struct (4 bytes), we can use shuffle instruction to extract them.
-        // Hashes will be at indexes 1, 3, 5, ... so with shuffle indexes can be shuffled to lower (first vector)
-        // and upper (second vector).
-        // This amortizes the comparison making 2x fewer comparisons and no false matches (only hashes are compared).
-        // When found, TryFind is executed with all the matches from the given 2*vector size batch.
+        ref var d = ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data));
 
         if (Vector256.IsHardwareAccelerated)
         {
-            // Consume 2 vectors at the time as each vector will have half of it shuffled away
-            var batchSize = Vector256<ushort>.Count;
-
-            // Aligned count to the batch size.
-            var alignedCount = (count + (batchSize - 1)) & -batchSize;
-
-            // if aligned count ends before, add batch size to make it aligned.
-            ref var loopEnd = ref Unsafe.Add(ref searchSpace,
-                alignedCount <= _data.Length ? alignedCount : alignedCount - batchSize);
-
             var search = Vector256.Create(hash);
+            var jump = DoubleVectorSize / sizeof(ushort);
 
-            while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref loopEnd))
+            for (var i = 0; i < aligned; i += jump)
             {
-                // There's more than 2 vectors to scan, use shuffling approach.
-                // Pack lower with first hashes, then higher with high
-                var value = Vector256.LoadUnsafe(ref currentSearchSpace);
-
+                var value = Vector256.LoadUnsafe(ref d, (UIntPtr)i);
                 if (Vector256.EqualsAny(value, search))
                 {
                     var matches = Vector256.Equals(value, search).ExtractMostSignificantBits();
-                    matches &= 0xAAAAAAAA; // 0b101010 aligned with placement of Slot.Hash
 
-                    // Check if this was not a test over the boundary
-                    if (Unsafe.IsAddressGreaterThan(ref Unsafe.Add(ref currentSearchSpace, batchSize), ref end))
+                    if (i + jump > aligned)
                     {
-                        // It was and it requires removing some bits that might be over the boundary.
-                        var shift = count - (alignedCount - batchSize);
-                        matches &= (1U << (shift * 2)) - 1;
-                        if (matches == 0)
-                        {
-                            data = default;
-                            return NotFound;
-                        }
+                        // This is the last batch; masking is required to remove potential false-positive hits.
+                        var shift = count & (VectorSize - 1);
+                        var mask = (1U << shift) - 1;
+                        matches &= mask;
                     }
 
                     if (matches > 0)
                     {
-                        var at = (int)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / Slot.Size;
-                        var found = TryFind(at, matches, key, preamble, out data);
+                        var found = TryFind(i / sizeof(ushort), matches, key, preamble, out data);
                         if (found != NotFound)
                         {
                             return found;
-                        }    
+                        }
                     }
                 }
-
-                currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, batchSize);
             }
         }
 
-        if (!Unsafe.IsAddressLessThan(ref currentSearchSpace, ref end))
-        {
-            data = default;
-            return NotFound;
-        }
-        else
-        {
-            return SlowTail(key, hash, preamble, out data, ref currentSearchSpace, ref end, ref searchSpace);
-        }
-    }
-
-    /// <summary>
-    /// Slow that is just a scan over the leftovers.
-    /// </summary>
-    [MethodImpl(MethodImplOptions.NoInlining)]
-    private int SlowTail(in NibblePath key, ushort hash, byte preamble, out Span<byte> data, ref ushort currentSearchSpace, ref ushort end,
-        ref ushort searchSpace)
-    {
-        // Leftover handling
-        ref var search = ref Unsafe.Add(ref currentSearchSpace, Slot.HashShiftForSearch);
-
-        while (!Unsafe.IsAddressGreaterThan(ref search, ref end))
-        {
-            if (search == hash)
-            {
-                var at = (int)Unsafe.ByteOffset(ref searchSpace, ref search) / Slot.Size;
-                var found = TryFind(at, 1, key, preamble, out data);
-                if (found != NotFound)
-                {
-                    return found;
-                }
-            }
-
-            search = ref Unsafe.Add(ref currentSearchSpace, Slot.Size / sizeof(ushort));
-        }
-
         data = default;
         return NotFound;
     }
@@ -564,19 +534,19 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
 
         do
         {
-            var index = (BitOperations.TrailingZeroCount(search) - 1) >> 1;
+            var index = BitOperations.TrailingZeroCount(search);
 
             // remove the match flag
-            search ^= (uint)(0b10 << (index * 2));
+            search ^= 1U << index;
 
             var i = index + at;
 
-            ref var slot = ref this[i];
+            ref var slot = ref GetSlotRef(i);
 
             // Preamble check is sufficient as IsDeleted is a special value of the preamble
             if ( /*slot.IsDeleted == false &&*/ slot.KeyPreamble == preamble)
             {
-                var actual = GetSlotPayload(ref slot);
+                var actual = GetSlotPayload(i);
 
                 if (slot.HasKeyBytes)
                 {
@@ -602,14 +572,13 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
     /// <summary>
     /// Gets the payload pointed to by the given slot without the length prefix.
     /// </summary>
-    private Span<byte> GetSlotPayload(ref Slot slot)
+    private Span<byte> GetSlotPayload(int index)
     {
         // assert whether the slot has a previous, if not use data.length
-        var previousSlotAddress = Unsafe.IsAddressLessThan(ref this[0], ref slot)
-            ? Unsafe.Add(ref slot, -1).ItemAddress
-            : _data.Length;
-
+        var previousSlotAddress = index > 0 ? GetSlotRef(index - 1).ItemAddress : _data.Length;
+        ref var slot = ref GetSlotRef(index);
         var length = previousSlotAddress - slot.ItemAddress;
+
         return _data.Slice(slot.ItemAddress, length);
     }
 
@@ -626,19 +595,18 @@ public static ushort PrepareKeyForTests(in NibblePath key, out byte preamble, ou
         Slot.PrepareKey(key, out preamble, out trimmed);
 
     /// <summary>
-    /// The slot is a size of <see cref="Size"/> bytes.
-    ///
-    /// It consists of two ushort parts,
-    /// 1. <see cref="Raw"/> and
-    /// 2. <see cref="Hash"/>.
-    ///
-    /// <see cref="Hash"/> is a result of <see cref="PrepareKey"/> that returns the value to be memoized in a slot. It only 2 bytes so collision may occur.
-    /// <see cref="Raw"/> encodes all the metadata related to the slot.
+    /// The slot has a size of <see cref="Size"/> bytes and represents the non-hash part of the entry.
+    /// The separation is done to make the search as vector aligned as possible.
     /// </summary>
     [StructLayout(LayoutKind.Sequential, Pack = sizeof(byte), Size = Size)]
     private struct Slot
     {
-        public const int Size = 4;
+        /// <summary>
+        /// The size of <see cref="Slot"/> with hash combined.
+        /// </summary>
+        public const int TotalSize = Size + sizeof(ushort);
+
+        public const int Size = 2;
 
         /// <summary>
         /// The address currently requires 12 bits [0-11] to address whole page. 
@@ -665,14 +633,6 @@ public ushort ItemAddress
         public void MarkAsDeleted()
         {
             KeyPreamble = KeyPreambleDelete;
-
-            // Provide a different hash so that further searches with TryGet won't be hitting this slot.
-            //
-            // We could use a constant value, but then on a collision with an actual value the tail
-            // performance would be terrible.
-            //
-            // The easiest way is to negate the hash that makes it not equal and yet is not a single value.
-            Hash = (ushort)~Hash;
         }
 
         // Preamble uses all bits that AddressMask does not
@@ -697,19 +657,16 @@ public void MarkAsDeleted()
         public bool HasAtLeastOneNibble => KeyPreamble != KeyPreambleEmpty;
 
         // Shift by 12, unless it's odd. If odd, shift by 8
-        public byte Nibble0
+        public byte GetNibble0(ushort hash)
         {
-            get
-            {
-                var count = KeyPreamble >> KeyPreambleLengthShift;
+            var count = KeyPreamble >> KeyPreambleLengthShift;
 
-                // Remove the length mask
-                var hash = (ushort)(Hash ^ GetHashMask(count));
+            // Remove the length mask
+            var h = (ushort)(hash ^ GetHashMask(count));
 
-                return (byte)(0x0F & (hash >> (3 * NibblePath.NibbleShift -
-                                               ((Raw >> KeyPreambleShift) & KeyPreambleOddBit) *
-                                               NibblePath.NibbleShift)));
-            }
+            return (byte)(0x0F & (h >> (3 * NibblePath.NibbleShift -
+                                        ((Raw >> KeyPreambleShift) & KeyPreambleOddBit) *
+                                        NibblePath.NibbleShift)));
         }
 
         public byte KeyPreamble
@@ -722,21 +679,10 @@ public byte KeyPreamble
 
         private ushort Raw;
 
-        /// <summary>
-        /// Used for vectorized search
-        /// </summary>
-        public const int HashShiftForSearch = 1;
-
         /// <summary>
         /// The memorized result of <see cref="PrepareKey"/> of this item.
         /// </summary>
-        public ushort Hash;
-
-        public override readonly string ToString()
-        {
-            return
-                $"{nameof(Hash)}: {Hash}, {nameof(ItemAddress)}: {ItemAddress}";
-        }
+        public readonly override string ToString() => $"{nameof(ItemAddress)}: {ItemAddress}";
 
         /// <summary>
         /// Mask selected in a way that it can be shifted by 0, 1, 2 and
@@ -951,6 +897,8 @@ private struct Header
         public ushort Deleted;
 
         public readonly ushort Taken => (ushort)(Low + High);
+
+        public readonly ushort TakenAfterOneMoreSlot => (ushort)(AlignToDoubleVectorSize(Low + Slot.TotalSize) + High);
     }
 }
 

From c22ebee6343d36401dfb4f7a4e651dc7e72670ee Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Wed, 24 Jul 2024 17:36:41 +0200
Subject: [PATCH 07/19] benchmarks reworked

---
 .../SlottedArrayBenchmarks.cs                 | 390 +++---------------
 1 file changed, 54 insertions(+), 336 deletions(-)

diff --git a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
index e51b89b9..2246e57d 100644
--- a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
+++ b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
@@ -1,373 +1,91 @@
-using System.Numerics;
+using System.Runtime.InteropServices;
 using BenchmarkDotNet.Attributes;
-using Paprika.Crypto;
 using Paprika.Data;
 using Paprika.Store;
-using static System.Buffers.Binary.BinaryPrimitives;
 
 namespace Paprika.Benchmarks;
 
 [DisassemblyDiagnoser(maxDepth: 2)]
-public class SlottedArrayBenchmarks
+public unsafe class SlottedArrayBenchmarks
 {
-    private readonly byte[] _onePage = new byte[Page.PageSize];
+    private const int KeyCount = 97;
 
-    // defragmentation
-    private readonly byte[] _defragmentation = new byte[Page.PageSize];
-    private readonly byte[] _defragmentationCopy = new byte[Page.PageSize];
-    private readonly byte[] _defragmentationValue = new byte[64];
-    private readonly ushort _writtenTo;
+    private const int
+        BytesPerKey =
+            3; // 3 repeated bytes allow cutting off the first nibble while still having a unique key. They also leave some key leftover to be stored.
 
-    private readonly byte[] _writtenLittleEndian = new byte[Page.PageSize];
-    private readonly byte[] _writtenBigEndian = new byte[Page.PageSize];
-    private readonly byte[] _writable = new byte[Page.PageSize];
-    private readonly int _to;
-
-    // hash collisions are fixed in size to make them comparable
-    private readonly byte[] _hashCollisions = new byte[Page.PageSize];
-    private const int HashCollisionsCount = NibblePath.KeccakNibbleCount;
-    private static readonly byte[] HashCollisionValue = new byte[13];
-
-    private readonly byte[] _copy0 = new byte[Page.PageSize];
-    private readonly byte[] _copy1 = new byte[Page.PageSize];
+    private readonly void* _keys;
+    private readonly void* _map;
 
     public SlottedArrayBenchmarks()
     {
-        // Big and small endian tests
-        {
-            var little = new SlottedArray(_writtenLittleEndian);
-            var big = new SlottedArray(_writtenBigEndian);
-
-            Span<byte> key = stackalloc byte[4];
-
-
-            while (true)
-            {
-                WriteInt32LittleEndian(key, _to);
-                if (little.TrySet(NibblePath.FromKey(key), key) == false)
-                {
-                    // filled
-                    break;
-                }
-
-                WriteInt32BigEndian(key, _to);
-                if (big.TrySet(NibblePath.FromKey(key), key) == false)
-                {
-                    // filled
-                    break;
-                }
+        // Create keys
+        _keys = AllocAlignedPage();
 
-                _to++;
-            }
-        }
-
-        // Hash collisions tests
+        var span = new Span<byte>(_keys, Page.PageSize);
+        for (byte i = 0; i < KeyCount; i++)
         {
-            var zeroes = NibblePath.FromKey(Keccak.Zero);
-            var hashCollisions = new SlottedArray(_hashCollisions);
-
-            for (var i = 0; i <= HashCollisionsCount; i++)
+            for (var j = 0; j < BytesPerKey; j++)
             {
-                if (!hashCollisions.TrySet(zeroes.SliceTo(i), HashCollisionValue))
-                {
-                    throw new Exception($"No place to set hash collision at {i}");
-                }
+                span[i * BytesPerKey + j] = i;
             }
         }
 
-        // Defragmentation
-        {
-            var map = new SlottedArray(_defragmentation);
-            ushort i = 0;
-            Span<byte> key = stackalloc byte[2];
+        // Map
+        _map = AllocAlignedPage();
+        Span<byte> value = stackalloc byte[1];
 
-            // Set as many as possible
-            while (map.TrySet(NibblePath.FromKey(key), _defragmentationValue))
-            {
-                i++;
-                WriteUInt16LittleEndian(key, i);
-            }
-
-            _writtenTo = i;
-        }
-    }
-
-    [Benchmark]
-    public int Write_whole_page_of_data()
-    {
-        _writable.AsSpan().Clear();
-        var map = new SlottedArray(_writable);
-
-        Span<byte> key = stackalloc byte[4];
-
-        int count = 0;
-
-        // fill 
-        for (int i = 0; i < _to; i++)
+        var map = new SlottedArray(new Span<byte>(_map, Page.PageSize));
+        for (byte i = 0; i < KeyCount; i++)
         {
-            WriteInt32LittleEndian(key, i);
-            if (map.TrySet(NibblePath.FromKey(key), key))
+            value[0] = i;
+            if (map.TrySet(GetKey(i, false), value) == false)
             {
-                count++;
+                throw new Exception("Not enough memory");
             }
         }
 
-        return count;
-    }
+        return;
 
-    [Benchmark]
-    public int Read_existing_keys_prefix_different()
-    {
-        var map = new SlottedArray(_writtenLittleEndian);
-        Span<byte> key = stackalloc byte[4];
-
-        var result = 0;
-
-        // find all values
-        for (var i = 0; i < _to; i++)
+        static void* AllocAlignedPage()
         {
-            WriteInt32LittleEndian(key, i);
-            if (map.TryGet(NibblePath.FromKey(key), out var data))
-            {
-                result += data.Length;
-            }
-        }
-
-        return result;
-    }
-
-    [Benchmark]
-    public int Read_existing_keys_suffix_different()
-    {
-        var map = new SlottedArray(_writtenBigEndian);
-        Span<byte> key = stackalloc byte[4];
-
-        var result = 0;
-
-        // find all values
-        for (var i = 0; i < _to; i++)
-        {
-            WriteInt32BigEndian(key, i);
-            if (map.TryGet(NibblePath.FromKey(key), out var data))
-            {
-                result += data.Length;
-            }
-        }
-
-        return result;
-    }
-
-    [Benchmark]
-    public int Read_nonexistent_keys()
-    {
-        var map = new SlottedArray(_writtenLittleEndian);
-        Span<byte> key = stackalloc byte[4];
-
-        var result = 0;
-
-        // miss all the next
-        for (int i = _to; i < _to * 2; i++)
-        {
-            WriteInt32LittleEndian(key, i);
-            if (map.TryGet(NibblePath.FromKey(key), out _) == false)
-            {
-                result += 1;
-            }
+            const UIntPtr size = Page.PageSize;
+            var memory = NativeMemory.AlignedAlloc(size, size);
+            NativeMemory.Clear(memory, size);
+            return memory;
         }
-
-        return result;
     }
 
-    [Benchmark]
-    public int Hash_collisions()
+    [Benchmark(OperationsPerInvoke = 4)]
+    [Arguments((byte)1, false)]
+    [Arguments((byte)15, false)]
+    [Arguments((byte)16, false)]
+    [Arguments((byte)31, false)]
+    [Arguments((byte)32, false)]
+    [Arguments((byte)47, false)]
+    [Arguments((byte)48, false)]
+    [Arguments((byte)63, false)]
+    [Arguments((byte)64, false)]
+    [Arguments((byte)95, false)]
+    [Arguments((byte)KeyCount - 1, false)]
+    public int TryGet(byte index, bool odd)
     {
-        var map = new SlottedArray(_hashCollisions);
-        var zeroes = NibblePath.FromKey(Keccak.Zero);
-
-        var length = 0;
-
-        for (var i = 0; i < HashCollisionsCount; i++)
-        {
-            if (map.TryGet(zeroes.SliceTo(i), out var value))
-            {
-                length += value.Length;
-            }
-        }
-
-        return length;
-    }
-
-    [Benchmark]
-    public int EnumerateAll()
-    {
-        var map = new SlottedArray(_writtenLittleEndian);
-
-        var length = 0;
-        foreach (var item in map.EnumerateAll())
-        {
-            length += item.Key.Length;
-            length += item.RawData.Length;
-        }
-
-        return length;
-    }
-
-    [Benchmark]
-    public int Move_to_keys()
-    {
-        var map = new SlottedArray(_writtenLittleEndian);
-
-        _copy0.AsSpan().Clear();
-        _copy1.AsSpan().Clear();
-
-        var map0 = new SlottedArray(_copy0);
-        var map1 = new SlottedArray(_copy1);
-
-        map.MoveNonEmptyKeysTo(new MapSource(map0, map1));
-
-        return map.Count + map0.Count + map1.Count;
-    }
-
-    /// <summary>
-    /// Multiple rounds of setting and deleting to ensure that tombstones do not impact the search nor insert.
-    /// Increasing values are used so that slot cannot be easily reused.
-    /// </summary>
-    [Benchmark]
-    public void Set_And_Delete()
-    {
-        const int count = 80;
-
-        Span<byte> data = stackalloc byte[count];
-        var a = NibblePath.FromKey(stackalloc byte[] { 12, 34, 98 });
-        var b = NibblePath.FromKey(stackalloc byte[] { 78, 34, 35 });
-
-        var map = new SlottedArray(_onePage);
-        map.Clear();
-
-        // init by setting a
-        map.TrySet(a, ReadOnlySpan<byte>.Empty);
-
-        for (int i = 1; i < count; i++)
-        {
-            var d = data[..i];
-
-            map.TrySet(b, d);
-            map.Delete(a); // delete previous a, b above prohibits collect tombstones
-            map.TrySet(a, d); // set new
-            map.Delete(b); // delete previous b, a above prohibits collect tombstones
-        }
-    }
-
-    [Benchmark(OperationsPerInvoke = 2)]
-    [Arguments(0, 0)]
-    [Arguments(0, 1)]
-    [Arguments(1, 1)]
-    [Arguments(0, 2)]
-    [Arguments(1, 2)]
-    [Arguments(0, 3)]
-    [Arguments(1, 3)]
-    [Arguments(0, 4)]
-    [Arguments(1, 4)]
-    [Arguments(0, 6)]
-    [Arguments(1, 6)]
-    [Arguments(0, 32)]
-    [Arguments(1, 31)]
-    [Arguments(1, 30)]
-    public int Prepare_Key(int sliceFrom, int length)
-    {
-        var key = NibblePath.FromKey(Keccak.EmptyTreeHash).Slice(sliceFrom, length);
-
-        // spin: 1
-        var hash = SlottedArray.PrepareKeyForTests(key, out var preamble, out var trimmed);
-
-        // spin: 2
-        var hash2 = SlottedArray.PrepareKeyForTests(key, out var preamble2, out var trimmed2);
-
-        return
-            hash + preamble + trimmed.Length +
-            hash2 + preamble2 + trimmed2.Length;
-    }
-
-    [Benchmark(OperationsPerInvoke = 2)]
-    [Arguments(0, 0)]
-    [Arguments(0, 1)]
-    [Arguments(1, 1)]
-    [Arguments(0, 2)]
-    [Arguments(1, 2)]
-    [Arguments(0, 3)]
-    [Arguments(1, 3)]
-    [Arguments(0, 4)]
-    [Arguments(1, 4)]
-    [Arguments(0, 6)]
-    [Arguments(1, 6)]
-    [Arguments(0, 32)]
-    [Arguments(1, 31)]
-    [Arguments(1, 30)]
-    public int Prepare_Key_UnPrepare(int sliceFrom, int length)
-    {
-        var key = NibblePath.FromKey(Keccak.EmptyTreeHash).Slice(sliceFrom, length);
-
-        // prepare
-        var hash = SlottedArray.PrepareKeyForTests(key, out var preamble, out var trimmed);
-        var written = trimmed.WriteTo(stackalloc byte[33]);
-
-        Span<byte> working = stackalloc byte[32];
-
-        // spin: 1
-        var key1 = SlottedArray.UnPrepareKeyForTests(hash, preamble, written, working, out var data);
-
-        // spin: 2
-        var key2 = SlottedArray.UnPrepareKeyForTests(hash, preamble, written, working, out data);
-
-
-        return key1.Length + key2.Length;
-    }
-
-    private const int DefragmentOpsCount = 4;
-
-    [Benchmark(OperationsPerInvoke = DefragmentOpsCount)]
-    public void Defragment_first_key_deleted()
-    {
-        _defragmentation.CopyTo(_defragmentationCopy.AsSpan());
-
-        var map = new SlottedArray(_defragmentationCopy);
-
-        Span<byte> key = stackalloc byte[2];
-        var i = _writtenTo;
-
-        // Delete & defragment
-        for (ushort j = 0; j < DefragmentOpsCount; j++)
-        {
-            // Delete first
-            WriteUInt16LittleEndian(key, j);
-            map.Delete(NibblePath.FromKey(key));
-
-            // Encode new key and set
-            WriteUInt16LittleEndian(key, i++);
-            map.TrySet(NibblePath.FromKey(key), _defragmentationValue);
-        }
+        var map = new SlottedArray(new Span<byte>(_map, Page.PageSize));
+        var key = GetKey(index, odd);
+
+        var count = 0;
+        if (map.TryGet(key, out _)) count += 1;
+        if (map.TryGet(key, out _)) count += 1;
+        if (map.TryGet(key, out _)) count += 1;
+        if (map.TryGet(key, out _)) count += 1;
+        return count;
     }
 
-    [Benchmark(OperationsPerInvoke = DefragmentOpsCount)]
-    public void Defragment_last_key_deleted()
+    private NibblePath GetKey(byte i, bool odd)
     {
-        _defragmentation.CopyTo(_defragmentationCopy.AsSpan());
-
-        var map = new SlottedArray(_defragmentationCopy);
-
-        Span<byte> key = stackalloc byte[2];
-        var last = (ushort)(_writtenTo - 1);
+        var span = new Span<byte>(_keys, Page.PageSize);
+        var slice = span.Slice(i * BytesPerKey, BytesPerKey);
 
-        // Delete & defragment
-        for (ushort j = 0; j < DefragmentOpsCount; j++)
-        {
-            // Delete first
-            WriteUInt16LittleEndian(key, last);
-            map.Delete(NibblePath.FromKey(key));
-
-            // Encode new key and set
-            WriteUInt16LittleEndian(key, last++);
-            map.TrySet(NibblePath.FromKey(key), _defragmentationValue);
-        }
+        return NibblePath.FromKey(slice, odd ? 1 : 0, 4);
     }
 }
\ No newline at end of file

From ef7f8604e14414d0e870f94eecd5512e4b88c080 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Wed, 24 Jul 2024 17:37:24 +0200
Subject: [PATCH 08/19] unneeded Roll_over test removed

---
 src/Paprika.Tests/Data/SlottedArrayTests.cs | 49 ---------------------
 1 file changed, 49 deletions(-)

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs
index 063cdad7..aa77e633 100644
--- a/src/Paprika.Tests/Data/SlottedArrayTests.cs
+++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs
@@ -271,55 +271,6 @@ public void Breach_VectorSize_with_key_count()
         }
     }
 
-    [Test]
-    public void Roll_over()
-    {
-        const int seed = 13;
-        var random = new Random(seed);
-        Span<byte> key = stackalloc byte[4];
-
-        var map = new SlottedArray(new byte[1024]);
-
-        byte count = 0;
-
-        random.NextBytes(key);
-        while (map.TrySet(NibblePath.FromKey(key), [count]))
-        {
-            count++;
-            random.NextBytes(key);
-        }
-
-        // reset, delete some
-        //random = new Random(seed);
-
-        using var e = map.EnumerateAll();
-        for (var i = 0; i < count; i++)
-        {
-            //random.NextBytes(key);
-            e.MoveNext().Should().BeTrue();
-
-            if (ShouldBeDeleted(i))
-            {
-                map.Delete(e.Current);
-                //map.Delete(NibblePath.FromKey(key)).Should().BeTrue();
-            }
-        }
-
-        // reset, assert
-        random = new Random(seed);
-        for (var i = 0; i < count; i++)
-        {
-            random.NextBytes(key);
-
-            var exist = map.TryGet(NibblePath.FromKey(key), out var data);
-            exist.Should().NotBe(ShouldBeDeleted(i));
-        }
-
-        return;
-
-        static bool ShouldBeDeleted(int i) => i % 2 == 0;
-    }
-
     private static ReadOnlySpan<byte> Data(byte key) => new[] { key };
 
     [Test]

From 00263664f2727e704c51d78d9e63acebc3b67722 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Wed, 24 Jul 2024 17:38:24 +0200
Subject: [PATCH 09/19] assert removed

---
 src/Paprika/Data/SlottedArray.cs | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index c8c55d99..7d9c9292 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -29,13 +29,15 @@ public readonly ref struct SlottedArray
     private readonly ref Header _header;
     private readonly Span<byte> _data;
 
-    private static int VectorSize => Vector256.IsHardwareAccelerated ? Vector256<byte>.Count : Vector128<byte>.Count;
+    private static readonly int VectorSize =
+        Vector256.IsHardwareAccelerated ? Vector256<byte>.Count : Vector128<byte>.Count;
 
-    private static int DoubleVectorSize => VectorSize * 2;
+    private const int VectorsByBatch = 2;
+    private static readonly int DoubleVectorSize = VectorSize * VectorsByBatch;
 
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private static int AlignToDoubleVectorSize(int count) => (count + (DoubleVectorSize - 1)) & -DoubleVectorSize;
 
-
     public SlottedArray(Span<byte> buffer)
     {
         Debug.Assert(buffer.Length > MinimalSizeWithNoData,
@@ -143,6 +145,8 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO
         _header.Low += Slot.TotalSize;
         _header.High += (ushort)total;
 
+        //AssertAllSlots();
+
         return true;
     }
 
@@ -358,6 +362,8 @@ private void DeleteImpl(int index, bool collectTombstones = true)
         {
             CollectTombstones();
         }
+
+        //AssertAllSlots();
     }
 
     private void MarkAsDeleted(int index)
@@ -428,6 +434,8 @@ private void Defragment()
         _header.Low = (ushort)(newCount * Slot.TotalSize);
         _header.High = (ushort)(_data.Length - writtenTo);
         _header.Deleted = 0;
+
+        //AssertAllSlots();
     }
 
     /// <summary>
@@ -456,6 +464,8 @@ private void CollectTombstones()
             // move back by one to see if it's deleted as well
             index--;
         }
+
+        //AssertAllSlots();
     }
 
     public bool TryGet(scoped in NibblePath key, out ReadOnlySpan<byte> data)
@@ -502,7 +512,7 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                 {
                     var matches = Vector256.Equals(value, search).ExtractMostSignificantBits();
 
-                    if (i + jump > aligned)
+                    if (i + jump >= aligned)
                     {
                         // This is the last in batch, masking is required to remove potential hits that are false positive
                         var shift = count & (VectorSize - 1);
@@ -512,7 +522,7 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
 
                     if (matches > 0)
                     {
-                        var found = TryFind(i / sizeof(ushort), matches, key, preamble, out data);
+                        var found = TryFind(i / VectorsByBatch, matches, key, preamble, out data);
                         if (found != NotFound)
                         {
                             return found;
@@ -569,6 +579,16 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
         return NotFound;
     }
 
+    // private void AssertAllSlots()
+    // {
+    //     var count = _header.Low / Slot.TotalSize;
+    //
+    //     for (int i = 0; i < count; i++)
+    //     {
+    //         Debug.Assert(GetSlotPayload(i).Length >= 0);
+    //     }
+    // }
+
     /// <summary>
     /// Gets the payload pointed to by the given slot without the length prefix.
     /// </summary>

From 0d955440c61cdc3f7c9e8ffe89d67623f77a4087 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Wed, 24 Jul 2024 18:21:05 +0200
Subject: [PATCH 10/19] slightly simplified payload retrieval

---
 src/Paprika/Data/SlottedArray.cs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index 7d9c9292..9cca74d5 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -592,14 +592,15 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
     /// <summary>
     /// Gets the payload pointed to by the given slot without the length prefix.
     /// </summary>
+    [SkipLocalsInit]
     private Span<byte> GetSlotPayload(int index)
     {
         // assert whether the slot has a previous, if not use data.length
         var previousSlotAddress = index > 0 ? GetSlotRef(index - 1).ItemAddress : _data.Length;
-        ref var slot = ref GetSlotRef(index);
-        var length = previousSlotAddress - slot.ItemAddress;
+        var addr = GetSlotRef(index).ItemAddress;
+        var length = previousSlotAddress - addr;
 
-        return _data.Slice(slot.ItemAddress, length);
+        return _data.Slice(addr, length);
     }
 
     /// <summary>

From dae49e9112f2685c01eb4bcc62ffdcbf4d844986 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Thu, 25 Jul 2024 10:36:02 +0200
Subject: [PATCH 11/19] Vector128 added

---
 src/Paprika.Benchmarks/Program.cs | 14 ++++++++++-
 src/Paprika/Data/SlottedArray.cs  | 42 +++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/Paprika.Benchmarks/Program.cs b/src/Paprika.Benchmarks/Program.cs
index 6064c545..2f78ee5c 100644
--- a/src/Paprika.Benchmarks/Program.cs
+++ b/src/Paprika.Benchmarks/Program.cs
@@ -1,6 +1,8 @@
 // See https://aka.ms/new-console-template for more information
 
 using System.Diagnostics.CodeAnalysis;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Jobs;
 using BenchmarkDotNet.Running;
 
 [assembly: ExcludeFromCodeCoverage]
@@ -11,6 +13,16 @@ public class Program
 {
     public static void Main(string[] args)
     {
-        BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);
+        // Vector128
+        // IConfig config = DefaultConfig.Instance
+        //     .AddJob(Job.Default.WithEnvironmentVariable("DOTNET_EnableAVX2", "0").WithId("Vector128"));
+
+        // Scalar (hardware intrinsics disabled); the vectorized search is expected to throw NotSupportedException
+        // IConfig config = DefaultConfig.Instance
+        //     .AddJob(Job.Default.WithEnvironmentVariable("DOTNET_EnableHWIntrinsic", "0").WithId("Scalar"));
+
+        IConfig? config = null;
+
+        BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, config);
     }
 }
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index 9cca74d5..b41eb19a 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -531,9 +531,51 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
                 }
             }
         }
+        else if (Vector128.IsHardwareAccelerated)
+        {
+            var search = Vector128.Create(hash);
+            var jump = DoubleVectorSize / sizeof(ushort);
+
+            for (var i = 0; i < aligned; i += jump)
+            {
+                var value = Vector128.LoadUnsafe(ref d, (UIntPtr)i);
+                if (Vector128.EqualsAny(value, search))
+                {
+                    var matches = Vector128.Equals(value, search).ExtractMostSignificantBits();
+
+                    if (i + jump >= aligned)
+                    {
+                        // This is the last in batch, masking is required to remove potential hits that are false positive
+                        var shift = count & (VectorSize - 1);
+                        var mask = (1U << shift) - 1;
+                        matches &= mask;
+                    }
+
+                    if (matches > 0)
+                    {
+                        var found = TryFind(i / VectorsByBatch, matches, key, preamble, out data);
+                        if (found != NotFound)
+                        {
+                            return found;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            ThrowNoVectorSupport();
+        }
 
         data = default;
         return NotFound;
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        void ThrowNoVectorSupport()
+        {
+            throw new NotSupportedException(
+                $"This platform does not support {nameof(Vector256)} nor {nameof(Vector128)}");
+        }
     }
 
     private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out Span<byte> data)

From 29025f1ee274b72248fa6ad4cc4ea904e9df130e Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Thu, 25 Jul 2024 15:03:12 +0200
Subject: [PATCH 12/19] fix of the search

---
 src/Paprika.Tests/Data/SlottedArrayTests.cs |  3 ++-
 src/Paprika/Data/SlottedArray.cs            | 28 +++++++++++++--------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs
index aa77e633..9750a7ca 100644
--- a/src/Paprika.Tests/Data/SlottedArrayTests.cs
+++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs
@@ -662,7 +662,8 @@ public static void DeleteAssert(this SlottedArray map, in ReadOnlySpan<byte> key
 
     public static void GetAssert(this SlottedArray map, in ReadOnlySpan<byte> key, ReadOnlySpan<byte> expected)
     {
-        map.TryGet(NibblePath.FromKey(key), out var actual).Should().BeTrue();
+        var retrieved = map.TryGet(NibblePath.FromKey(key), out var actual);
+        retrieved.Should().BeTrue();
         actual.SequenceEqual(expected).Should().BeTrue("Actual data should equal expected");
     }
 
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index b41eb19a..f2b15e91 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -37,6 +37,9 @@ public readonly ref struct SlottedArray
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private static int AlignToDoubleVectorSize(int count) => (count + (DoubleVectorSize - 1)) & -DoubleVectorSize;
+    
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static int AlignToVectorSize(int count) => (count + (VectorSize - 1)) & -VectorSize;
 
     public SlottedArray(Span<byte> buffer)
     {
@@ -495,8 +498,8 @@ public void Clear()
         "key encoding is delayed but it might be called twice, here + TrySet")]
     private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<byte> data)
     {
-        var aligned = AlignToDoubleVectorSize(_header.Low);
         var count = _header.Low / Slot.TotalSize;
+        var aligned = AlignToDoubleVectorSize(_header.Low) / sizeof(ushort);
 
         ref var d = ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data));
 
@@ -621,15 +624,15 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
         return NotFound;
     }
 
-    // private void AssertAllSlots()
-    // {
-    //     var count = _header.Low / Slot.TotalSize;
-    //
-    //     for (int i = 0; i < count; i++)
-    //     {
-    //         Debug.Assert(GetSlotPayload(i).Length >= 0);
-    //     }
-    // }
+    private void AssertAllSlots()
+    {
+        var count = _header.Low / Slot.TotalSize;
+    
+        for (int i = 0; i < count; i++)
+        {
+            Debug.Assert(GetSlotPayload(i).Length >= 0);
+        }
+    }
 
     /// <summary>
     /// Gets the payload pointed to by the given slot without the length prefix.
@@ -642,6 +645,11 @@ private Span<byte> GetSlotPayload(int index)
         var addr = GetSlotRef(index).ItemAddress;
         var length = previousSlotAddress - addr;
 
+        if (length < 0)
+        {
+            AssertAllSlots();
+        }
+        
         return _data.Slice(addr, length);
     }
 

From 0ef13aa6b5ee2e44031107740017d2511da13d47 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Thu, 25 Jul 2024 15:04:49 +0200
Subject: [PATCH 13/19] dummy assert removed

---
 src/Paprika/Data/SlottedArray.cs | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index f2b15e91..e97ba83a 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -148,8 +148,6 @@ private bool TrySetImpl(ushort hash, byte preamble, in NibblePath trimmed, ReadO
         _header.Low += Slot.TotalSize;
         _header.High += (ushort)total;
 
-        //AssertAllSlots();
-
         return true;
     }
 
@@ -365,8 +363,6 @@ private void DeleteImpl(int index, bool collectTombstones = true)
         {
             CollectTombstones();
         }
-
-        //AssertAllSlots();
     }
 
     private void MarkAsDeleted(int index)
@@ -437,8 +433,6 @@ private void Defragment()
         _header.Low = (ushort)(newCount * Slot.TotalSize);
         _header.High = (ushort)(_data.Length - writtenTo);
         _header.Deleted = 0;
-
-        //AssertAllSlots();
     }
 
     /// <summary>
@@ -467,8 +461,6 @@ private void CollectTombstones()
             // move back by one to see if it's deleted as well
             index--;
         }
-
-        //AssertAllSlots();
     }
 
     public bool TryGet(scoped in NibblePath key, out ReadOnlySpan<byte> data)
@@ -624,16 +616,6 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
         return NotFound;
     }
 
-    private void AssertAllSlots()
-    {
-        var count = _header.Low / Slot.TotalSize;
-    
-        for (int i = 0; i < count; i++)
-        {
-            Debug.Assert(GetSlotPayload(i).Length >= 0);
-        }
-    }
-
     /// <summary>
     /// Gets the payload pointed to by the given slot without the length prefix.
     /// </summary>
@@ -644,12 +626,6 @@ private Span<byte> GetSlotPayload(int index)
         var previousSlotAddress = index > 0 ? GetSlotRef(index - 1).ItemAddress : _data.Length;
         var addr = GetSlotRef(index).ItemAddress;
         var length = previousSlotAddress - addr;
-
-        if (length < 0)
-        {
-            AssertAllSlots();
-        }
-        
         return _data.Slice(addr, length);
     }
 

From 90c99b15e2c9aa7ca57727a311592be37f5f41d0 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Thu, 25 Jul 2024 17:42:02 +0200
Subject: [PATCH 14/19] bug fixed

---
 src/Paprika.Tests/Data/SlottedArrayTests.cs | 33 ++++++++++++++++++++-
 src/Paprika.Tests/Store/DbTests.cs          | 13 ++++++--
 src/Paprika/Data/SlottedArray.cs            | 21 ++++++++-----
 3 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs
index 9750a7ca..f611d787 100644
--- a/src/Paprika.Tests/Data/SlottedArrayTests.cs
+++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs
@@ -271,6 +271,36 @@ public void Breach_VectorSize_with_key_count()
         }
     }
 
+    [Test(Description = "Make a lot of requests to make breach the vector count")]
+    public void Set_Get_With_Specific_Lengths([Values(8, 16, 32, 64, 68, 72)] int count)
+    {
+        const int keyLength = 2;
+
+        Span<byte> keys = stackalloc byte[count * 2];
+        for (byte i = 0; i < count; i++)
+        {
+            keys[i * keyLength] = i;
+            keys[i * keyLength + 1] = i;
+        }
+
+        var map = new SlottedArray(new byte[Page.PageSize]);
+
+        for (var i = 0; i < count; i++)
+        {
+            map.SetAssert(GetKey(keys, i), GetValue(i));
+        }
+
+        for (var i = 0; i < count; i++)
+        {
+            map.GetAssert(GetKey(keys, i), GetValue(i));
+        }
+
+        return;
+
+        static NibblePath GetKey(Span<byte> keys, int i) => NibblePath.FromKey(keys.Slice(i * keyLength, keyLength));
+        static ReadOnlySpan<byte> GetValue(int i) => new byte[(byte)(i & 255)];
+    }
+
     private static ReadOnlySpan<byte> Data(byte key) => new[] { key };
 
     [Test]
@@ -669,7 +699,8 @@ public static void GetAssert(this SlottedArray map, in ReadOnlySpan<byte> key, R
 
     public static void GetAssert(this SlottedArray map, in NibblePath key, ReadOnlySpan<byte> expected)
     {
-        map.TryGet(key, out var actual).Should().BeTrue();
+        var retrieved = map.TryGet(key, out var actual);
+        retrieved.Should().BeTrue();
         actual.SequenceEqual(expected).Should().BeTrue("Actual data should equal expected");
     }
 
diff --git a/src/Paprika.Tests/Store/DbTests.cs b/src/Paprika.Tests/Store/DbTests.cs
index 4839fcb9..4743b8f5 100644
--- a/src/Paprika.Tests/Store/DbTests.cs
+++ b/src/Paprika.Tests/Store/DbTests.cs
@@ -1,4 +1,5 @@
 using System.Buffers.Binary;
+using System.Diagnostics;
 using FluentAssertions;
 using Nethermind.Int256;
 using NUnit.Framework;
@@ -197,7 +198,7 @@ public async Task Spin_large()
         using var db = PagedDb.NativeMemoryDb(size);
 
         const int batches = 25;
-        const int storageSlots = 10_000;
+        const int storageSlots = 5_000;
         const int storageKeyLength = 32;
 
         var value = new byte[32];
@@ -209,13 +210,21 @@ public async Task Spin_large()
 
         var readBatches = new List<IReadOnlyBatch>();
 
-        for (var i = 0; i < batches; i++)
+        //for (var i = 0; i < batches; i++)
         {
             using var batch = db.BeginNextBatch();
 
             for (var slot = 0; slot < storageSlots; slot++)
             {
+                if (slot >= 4890)
+                    Debugger.Break();
+
                 batch.SetStorage(account, GetStorageAddress(slot), value);
+
+                if (slot >= 4890)
+                {
+                    batch.AssertStorageValue(account, GetStorageAddress(4890), value);
+                }
             }
 
             await batch.Commit(CommitOptions.FlushDataAndRoot);
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index e97ba83a..72ed3254 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -37,7 +37,7 @@ public readonly ref struct SlottedArray
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private static int AlignToDoubleVectorSize(int count) => (count + (DoubleVectorSize - 1)) & -DoubleVectorSize;
-    
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private static int AlignToVectorSize(int count) => (count + (VectorSize - 1)) & -VectorSize;
 
@@ -491,6 +491,7 @@ public void Clear()
     private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<byte> data)
     {
         var count = _header.Low / Slot.TotalSize;
+        var jump = DoubleVectorSize / sizeof(ushort);
         var aligned = AlignToDoubleVectorSize(_header.Low) / sizeof(ushort);
 
         ref var d = ref Unsafe.As<byte, ushort>(ref MemoryMarshal.GetReference(_data));
@@ -498,7 +499,6 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
         if (Vector256.IsHardwareAccelerated)
         {
             var search = Vector256.Create(hash);
-            var jump = DoubleVectorSize / sizeof(ushort);
 
             for (var i = 0; i < aligned; i += jump)
             {
@@ -509,9 +509,13 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
 
                     if (i + jump >= aligned)
                     {
+                        // Undoing the multiplication done above to calculate aligned, to get the number of items.
+                        var alignedCount = aligned / VectorsByBatch;
+                        var toClear = alignedCount - count;
+
                         // This is the last in batch, masking is required to remove potential hits that are false positive
-                        var shift = count & (VectorSize - 1);
-                        var mask = (1U << shift) - 1;
+                        var hashesPerVector = VectorSize / sizeof(ushort);
+                        var mask = (1U << hashesPerVector - toClear) - 1;
                         matches &= mask;
                     }
 
@@ -529,7 +533,6 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
         else if (Vector128.IsHardwareAccelerated)
         {
             var search = Vector128.Create(hash);
-            var jump = DoubleVectorSize / sizeof(ushort);
 
             for (var i = 0; i < aligned; i += jump)
             {
@@ -540,9 +543,13 @@ private int TryGetImpl(in NibblePath key, ushort hash, byte preamble, out Span<b
 
                     if (i + jump >= aligned)
                     {
+                        // Undoing the multiplication done above to calculate aligned, to get the number of items.
+                        var alignedCount = aligned / VectorsByBatch;
+                        var toClear = alignedCount - count;
+
                         // This is the last in batch, masking is required to remove potential hits that are false positive
-                        var shift = count & (VectorSize - 1);
-                        var mask = (1U << shift) - 1;
+                        var hashesPerVector = VectorSize / sizeof(ushort);
+                        var mask = (1U << hashesPerVector - toClear) - 1;
                         matches &= mask;
                     }
 

From abfb04bc74fc1bde35627fc7c269430db0b5b941 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Thu, 25 Jul 2024 18:12:11 +0200
Subject: [PATCH 15/19] undo dummy testing

---
 src/Paprika.Tests/Store/DbTests.cs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/Paprika.Tests/Store/DbTests.cs b/src/Paprika.Tests/Store/DbTests.cs
index 4743b8f5..7adf4a3b 100644
--- a/src/Paprika.Tests/Store/DbTests.cs
+++ b/src/Paprika.Tests/Store/DbTests.cs
@@ -1,8 +1,6 @@
 using System.Buffers.Binary;
-using System.Diagnostics;
 using FluentAssertions;
 using Nethermind.Int256;
-using NUnit.Framework;
 using Paprika.Crypto;
 using Paprika.Store;
 using static Paprika.Tests.Values;
@@ -198,7 +196,7 @@ public async Task Spin_large()
         using var db = PagedDb.NativeMemoryDb(size);
 
         const int batches = 25;
-        const int storageSlots = 5_000;
+        const int storageSlots = 10_000;
         const int storageKeyLength = 32;
 
         var value = new byte[32];
@@ -210,21 +208,13 @@ public async Task Spin_large()
 
         var readBatches = new List<IReadOnlyBatch>();
 
-        //for (var i = 0; i < batches; i++)
+        for (var i = 0; i < batches; i++)
         {
             using var batch = db.BeginNextBatch();
 
             for (var slot = 0; slot < storageSlots; slot++)
             {
-                if (slot >= 4890)
-                    Debugger.Break();
-
                 batch.SetStorage(account, GetStorageAddress(slot), value);
-
-                if (slot >= 4890)
-                {
-                    batch.AssertStorageValue(account, GetStorageAddress(4890), value);
-                }
             }
 
             await batch.Commit(CommitOptions.FlushDataAndRoot);

From a69bd6c1e364a0fa347d6189d4b6ca4c058f1709 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Fri, 26 Jul 2024 10:28:15 +0200
Subject: [PATCH 16/19] benchmarks updated and GetPayload simplified

---
 .../SlottedArrayBenchmarks.cs                 | 46 +++++++++++++++++++
 src/Paprika.Tests/Data/SlottedArrayTests.cs   |  2 -
 src/Paprika/Data/SlottedArray.cs              |  9 +++-
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
index 2246e57d..68b89954 100644
--- a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
+++ b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
@@ -1,5 +1,6 @@
 using System.Runtime.InteropServices;
 using BenchmarkDotNet.Attributes;
+using Paprika.Crypto;
 using Paprika.Data;
 using Paprika.Store;
 
@@ -81,6 +82,51 @@ public int TryGet(byte index, bool odd)
         return count;
     }
 
+    [Benchmark(OperationsPerInvoke = 2)]
+    [Arguments(0, 0)]
+    [Arguments(0, 1)]
+    [Arguments(1, 1)]
+    [Arguments(0, 2)]
+    [Arguments(1, 2)]
+    [Arguments(0, 3)]
+    [Arguments(1, 3)]
+    [Arguments(0, 4)]
+    [Arguments(1, 4)]
+    [Arguments(0, 6)]
+    [Arguments(1, 6)]
+    [Arguments(0, 32)]
+    [Arguments(1, 31)]
+    [Arguments(1, 30)]
+    public int Prepare_Key(int sliceFrom, int length)
+    {
+        var key = NibblePath.FromKey(Keccak.EmptyTreeHash).Slice(sliceFrom, length);
+
+        // spin: 1
+        var hash = SlottedArray.PrepareKeyForTests(key, out var preamble, out var trimmed);
+
+        // spin: 2
+        var hash2 = SlottedArray.PrepareKeyForTests(key, out var preamble2, out var trimmed2);
+
+        return
+            hash + preamble + trimmed.Length +
+            hash2 + preamble2 + trimmed2.Length;
+    }
+
+    [Benchmark]
+    public int EnumerateAll()
+    {
+        var map = new SlottedArray(new Span<byte>(_map, Page.PageSize));
+
+        var length = 0;
+        foreach (var item in map.EnumerateAll())
+        {
+            length += item.Key.Length;
+            length += item.RawData.Length;
+        }
+
+        return length;
+    }
+
     private NibblePath GetKey(byte i, bool odd)
     {
         var span = new Span<byte>(_keys, Page.PageSize);
diff --git a/src/Paprika.Tests/Data/SlottedArrayTests.cs b/src/Paprika.Tests/Data/SlottedArrayTests.cs
index f611d787..06c304fb 100644
--- a/src/Paprika.Tests/Data/SlottedArrayTests.cs
+++ b/src/Paprika.Tests/Data/SlottedArrayTests.cs
@@ -1,5 +1,3 @@
-using System.Runtime.InteropServices;
-using System.Runtime.Intrinsics;
 using FluentAssertions;
 using Paprika.Crypto;
 using Paprika.Data;
diff --git a/src/Paprika/Data/SlottedArray.cs b/src/Paprika/Data/SlottedArray.cs
index 72ed3254..fd0064e1 100644
--- a/src/Paprika/Data/SlottedArray.cs
+++ b/src/Paprika/Data/SlottedArray.cs
@@ -629,9 +629,14 @@ private int TryFind(int at, uint matches, in NibblePath key, byte preamble, out
     [SkipLocalsInit]
     private Span<byte> GetSlotPayload(int index)
     {
-        // assert whether the slot has a previous, if not use data.length
-        var previousSlotAddress = index > 0 ? GetSlotRef(index - 1).ItemAddress : _data.Length;
         var addr = GetSlotRef(index).ItemAddress;
+
+        // If this is the first slot, its payload runs to the end of the data.
+        if (index == 0)
+            return _data[addr..];
+
+        // Not the first, calculate on the basis of the address.
+        var previousSlotAddress = GetSlotRef(index - 1).ItemAddress;
         var length = previousSlotAddress - addr;
         return _data.Slice(addr, length);
     }

From fe2ccd946182daa5891854e806eae7622ae9cd5c Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Fri, 26 Jul 2024 11:21:56 +0200
Subject: [PATCH 17/19] hash collision benchmark

---
 .../SlottedArrayBenchmarks.cs                 | 71 ++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

diff --git a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
index 68b89954..f2afe93e 100644
--- a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
+++ b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
@@ -18,6 +18,15 @@ private const int
     private readonly void* _keys;
     private readonly void* _map;
 
+    // Hash colliding
+    private const int HashCollidingKeyCount = 32;
+
+    // Use first and last as opportunity to collide
+    private const int BytesPerKeyHashColliding = 3;
+    private readonly void* _hashCollidingKeys;
+    private readonly void* _hashCollidingMap;
+
+
     public SlottedArrayBenchmarks()
     {
         // Create keys
@@ -45,7 +54,37 @@ public SlottedArrayBenchmarks()
                 throw new Exception("Not enough memory");
             }
         }
+        
+        // Hash colliding
+        _hashCollidingKeys = AllocAlignedPage();
+
+        // Create keys so that two consecutive ones share the hash.
+        // This should make it somewhat realistic where there are some collisions but not a lot of them.
+        var hashCollidingKeys = new Span<byte>(_hashCollidingKeys, Page.PageSize);
+        for (byte i = 0; i < HashCollidingKeyCount; i++)
+        {
+            // 0th divide by 2 to collide
+            hashCollidingKeys[i * BytesPerKeyHashColliding] = (byte)(i / 2);
+
+            // 1th differentiate with the first
+            hashCollidingKeys[i * BytesPerKeyHashColliding + 1] = i;
+            
+            // 2nd divide by 2 to collide
+            hashCollidingKeys[i * BytesPerKeyHashColliding + 2] = (byte)(i / 2);
+        }
+        
+        _hashCollidingMap = AllocAlignedPage();
 
+        var hashColliding = new SlottedArray(new Span<byte>(_hashCollidingMap, Page.PageSize));
+        for (byte i = 0; i < HashCollidingKeyCount; i++)
+        {
+            value[0] = i;
+            if (hashColliding.TrySet(GetHashCollidingKey(i), value) == false)
+            {
+                throw new Exception("Not enough memory");
+            }
+        }
+        
         return;
 
         static void* AllocAlignedPage()
@@ -81,6 +120,26 @@ public int TryGet(byte index, bool odd)
         if (map.TryGet(key, out _)) count += 1;
         return count;
     }
+    
+    [Benchmark(OperationsPerInvoke = 4)]
+    [Arguments((byte)1)]
+    [Arguments((byte)2)]
+    [Arguments((byte)3)]
+    [Arguments((byte)4)]
+    [Arguments((byte)30)]
+    [Arguments((byte)31)]
+    public int TryGet_With_Hash_Collisions(byte index)
+    {
+        var map = new SlottedArray(new Span<byte>(_hashCollidingMap, Page.PageSize));
+        var key = GetHashCollidingKey(index);
+
+        var count = 0;
+        if (map.TryGet(key, out _)) count += 1;
+        if (map.TryGet(key, out _)) count += 1;
+        if (map.TryGet(key, out _)) count += 1;
+        if (map.TryGet(key, out _)) count += 1;
+        return count;
+    }
 
     [Benchmark(OperationsPerInvoke = 2)]
     [Arguments(0, 0)]
@@ -129,9 +188,19 @@ public int EnumerateAll()
 
     private NibblePath GetKey(byte i, bool odd)
     {
-        var span = new Span<byte>(_keys, Page.PageSize);
+
+        var span = new Span<byte>(_keys, BytesPerKey * KeyCount);
         var slice = span.Slice(i * BytesPerKey, BytesPerKey);
 
         return NibblePath.FromKey(slice, odd ? 1 : 0, 4);
     }
+    
+    private NibblePath GetHashCollidingKey(byte i)
+    {
+        var span = new Span<byte>(_hashCollidingKeys, BytesPerKeyHashColliding * HashCollidingKeyCount);
+        var slice = span.Slice(i * BytesPerKeyHashColliding, BytesPerKeyHashColliding);
+
+        // Use full key
+        return NibblePath.FromKey(slice,  0, BytesPerKeyHashColliding * NibblePath.NibblePerByte);
+    }
 }
\ No newline at end of file

From 650045fb25047ab1cdbba9aef392475c14940944 Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Fri, 26 Jul 2024 11:53:50 +0200
Subject: [PATCH 18/19] design updated

---
 docs/design.md | 111 +++++++++++++------------------------------------
 1 file changed, 29 insertions(+), 82 deletions(-)

diff --git a/docs/design.md b/docs/design.md
index 0faba7d4..f1991226 100644
--- a/docs/design.md
+++ b/docs/design.md
@@ -253,130 +253,77 @@ public struct PageHeader
 
 ### SlottedArray
 
-The `SlottedArray` component is responsible for storing data in-page. It is capable of mapping a `NibblePath` to a value represented by `ReadOnlySpan<byte>`. Empyt values are allowed.
+The `SlottedArray` component is responsible for storing data in a page. It is capable of mapping a `NibblePath` to a value represented by `ReadOnlySpan<byte>`. Empty values are allowed as they are treated as tombstones. Tombstoning is needed to provide the write-through buffering capability, so that a value can be marked as deleted first and flushed down later.
 
 #### SlottedArray layout
 
-`SlottedArray` needs to store values with variant lengths over a fixed `Span<byte>` provided by the page. To make it work, Paprika uses a modified pattern of the slot array, used by major players in the world of B+ oriented databases (see: [PostgreSQL page layout](https://www.postgresql.org/docs/current/storage-page-layout.html#STORAGE-PAGE-LAYOUT-FIGURE)). How it works then?
+`SlottedArray` needs to store values with variant lengths over a fixed `Span<byte>` provided by the page. To make it work, Paprika uses a modified pattern of the slot array, used by major players in the world of B+ oriented databases (see: [PostgreSQL page layout](https://www.postgresql.org/docs/current/storage-page-layout.html#STORAGE-PAGE-LAYOUT-FIGURE)).
 
 The slot array pattern uses a fixed-size buffer that is provided within the page. It allocates chunks of it from two directions:
 
 1. from `0` forward
 2. from the end downward
 
-The first direction, from `0` is used for fixed-size structures that represent slots. Each slot has some metadata, including the most important one, the offset to the start of data. The direction from the end is used to store var length payloads. Paprika diverges from the usual slot array though. The slot array assumes that it's up to the higher level to map the slot identifiers to keys. What the page provides is just a container for tuples that stores them and maps them to the `CTID`s (see: [PostgreSQL system columns](https://www.postgresql.org/docs/current/ddl-system-columns.html)). How Paprika uses this approach
+The first direction, from `0` is used for fixed-size structures that represent slots. Each slot has some metadata, including the most important one, the offset to the start of data. The direction from the end is used to store var length payloads. Paprika diverges from the usual slot array though. The slot array assumes that it's up to the higher level to map the slot identifiers to keys. What the page provides is just a container for tuples that stores them and maps them to the `CTID`s (see: [PostgreSQL system columns](https://www.postgresql.org/docs/current/ddl-system-columns.html)).
 
-In Paprika, each page level represents a cutoff in the nibble path to make it aligned to the Merkle construct. The key management could be extracted out of the `SlottedArray` component, but it would make it less self-contained. `SlottedArray` then provides `TrySet` and `TryGet` methods that accept nibble paths. This impacts the design of the slot, which is as follows:
+Paprika provides a vector-aligned slotted array that stores lookup data from the beginning and the actual payload from the end. What makes it special is that Paprika uses vectorized instructions (`Vector256` or `Vector128` depending on the architecture) and aligns its data to the vector size to make the search as efficient as possible. The space that a given slotted array occupies is then split into the following:
 
-```csharp
-private struct Slot
-{
-    public const int Size = 4;
+1. `Header`
+1. `Vector` of key hashes
+1. `Vector` of `Slot` entries
+1. `Vector` of key hashes
+1. `Vector` of `Slot` entries
+1. ...
+1. ...
+1. data
 
-    /// <summary>
-    /// The address currently requires 12 bits [0-11] to address whole page. 
-    /// </summary>
-    private const ushort AddressMask = Page.PageSize - 1;
+where each `Vector` is sized to match the machine it runs on (`Vector` will be 32 bytes on modern `x64` and 16 bytes on `ARM`) and `Vector`s are allocated in pairs (hashes + corresponding slots). Keeping hashes and slots in separate chunks allows for a fast vectorized search over hashes, without the need to scan over slot entries. The entries are inspected only on a hash match.
 
-    /// <summary>
-    /// The address of this item.
-    /// </summary>
-    public ushort ItemAddress { /* bitwise magic */ } 
-    
-    /// <summary>
-    /// Whether the given entry is deleted or not
-    /// </summary>
-    public bool IsDeleted => KeyPreamble == KeyPreambleDelete;
+The `Header` consists of 8 bytes and keeps track of `hi` and `lo` as in a usual `SlottedArray`.
 
-    public byte KeyPreamble { /* bitwise magic */ } 
+##### Slot
 
-    private ushort Raw;
+A `Slot` occupies `2 bytes` and represents several values needed to keep track of the value stored in an array. It uses bit-wise operations to store all the information in 2 bytes:
 
-    /// <summary>
-    /// Used for vectorized search
-    /// </summary>
-    public const int HashShiftForSearch = 1;
+1. `ItemAddress` - represents the address to go to for the data of the given slot (address currently requires 12 bits [0-11] to address the whole 4 kb page)
+2. `Preamble` - shows whether there are some bytes left for the key or other
+3. `IsDeleted` - whether the item was deleted in the array but not GCed yet
 
-    /// <summary>
-    /// The memorized result of <see cref="PrepareKey"/> of this item.
-    /// </summary>
-    public ushort Hash;
-
-    /// <summary>
-    /// Prepares the key for the search. 
-    /// </summary>
-    public static ushort PrepareKey(/* ... */)
-    {
-        // ...
-    }
-
-    public static NibblePath UnPrepareKey(/* ... */)
-    {
-        // ...
-    }
-}
-```
-
-The slot is 4 bytes long. Using the `PrepareKey` method, some of the nibbles are extrated from the key as a `Hash` for fast comparisons. It has the actual `ItemAddress` that points to the beginning of the payload. The length of the item is calculated by subtracting the address from the previous slot address. The drawback of this design is a linear search across all the slots when an item must be found. With the expected number of items per page, which should be no bigger than 100, it gives 400 bytes of slots to search through. This should be ok-ish with modern processors as the search uses the vectorized index search. Additionally, it adds some checks for the preamble so that the collissions should not be that likely. 
-
-With this, the `SlottedArray` memory representation looks like the following.
-
-```bash
-┌───────────────┬───────┬───────┬───────────────────────────────┐
-│HEADER         │Slot 0 │Slot 1 │                               │
-│               │       │       │                               │
-│High           │Prefix │Prefix │                               │
-│Low            │Addr   │Addr   │ ► ► ►                         │
-│Deleted        │   │   │   │   │                               │
-│               │   │   │   │   │                               │
-├───────────────┴───┼───┴───┼───┘                               │
-│                   │       │                                   │
-│                ┌──┼───────┘                                   │
-│                │  │                                           │
-│                │  │                                           │
-│                │  └──────────┐                                │
-│                │             │                                │
-│                ▼             ▼                                │
-│                ┌─────────────┬────────────────────────────────┤
-│                │             │                                │
-│                │             │                                │
-│          ◄ ◄ ◄ │    DATA     │             DATA               │
-│                │ for slot1   │          for slot 0            │
-│                │             │                                │
-└────────────────┴─────────────┴────────────────────────────────┘
-```
+The `Slot` provides a method called `PrepareKey` that is responsible for extracting the `hash` for the given `NibblePath` key, returning a trimmed version of the key (what goes in the hash is extracted away) and a preamble. There's another counterpart method called `UnPrepareKey` that does the opposite. While `PrepareKey` is used for all the operations like `TrySet` and `TryGet`, `UnPrepareKey` is used only to materialize back the keys. This happens for example when a `SlottedArray` is enumerated. In other cases, the caller does not need to reconstruct the key as they have it.
 
 The `SlottedArray` can wrap an arbitrary span of memory so it can be used for any page that wants to store data by key.
 
 #### Deletion and tombstones
 
-`SlottedArray` uses a tombstoning to mark the given entry as deleted. It's much cheaper to mark something as deleted and collect garbage from time to time than to compress it every single time. The marker of deleteion is frequently called a `tombstone`. To decide whether or not a GC should be called when there's not enough place to just append data, a counter of tombstones is held. If non zero, GC can be used to reclaim memory.
+When deleting an item, `SlottedArray` marks the given slot as deleted. It then tries to collect deleted slots, starting from the last one. Eventually, when the deleted space is scattered across the map, it will run a `Defragment` procedure that copies what is alive and removes all the gaps.
+
+When a slot is marked as deleted, its hash is set to `~hash`. This is done to prevent it from being searched when performing the vectorized search over hashes. The negation is used so that there's no single value that will make all the deletes collide with the same entry.
 
 #### Iteration
 
-`SlottedArray` allows an efficient iteration over each entries using the `map.EnumerateAll()` method. It provides the caller with a `ref struct Enumerator` that does not allocate and allows traversing the map. It's worth to mention that the enumerator allows to delete an entry when enumerating by calling the delete method with the item from the enumerator `map.Delete(item)`. Again, it's based on the tombstoning mentioned above and just marks the data as deleted.
+`SlottedArray` allows an efficient iteration of its entries using the `map.EnumerateAll()` method. It provides the caller with a `ref struct Enumerator` that does not allocate and allows traversing the map. There's a special feature of the enumerator that allows deleting an entry when enumerating by calling the delete method with the item from the enumerator `map.Delete(item)`. Again, it's based on marking slots as deleted.
 
 ### Merkle construct
 
 From Ethereum's point of view, any storage mechanism needs to be able to calculate the `StateRootHash`. This hash allows us to verify whether the block state is valid. How it is done and what is used underneath is not important as long as the store mechanism can provide the answer to the ultimate question: _what is the StateRootHash of the given block?_
 
-To address this `Merkle` is implemented as a pre-commit hook. This hook is run when a block is committed to the blockchain. After all, from the point of execution there's no reason to run it before. Merkleization of the tree is split into the following steps executed sequentially:
+To address this, `Merkle` is implemented as a pre-commit hook. This hook is run when a block is committed to the blockchain. After all, from the point of execution, there's no reason to run it before. Merkleization of the tree is split into the following steps executed sequentially:
 
 1. Visit all Storage operations (SSTORE). For each key:
-   1. remember `Account` that `Storage`` belongs to
-   1. walk through the MPT of Account Storage to create/amend Trie nodes. This part is marking paths as dirty
+   1. remember `Account` that `Storage` belongs to
+   1. walk through the MPT of Account Storage to create/amend Trie nodes. This part marks paths as dirty
 1. Visit all State operations. For each key:
    1. check if it was one of the Storage operations. If yes, remove it from the set above
    1. walk through the MPT of Account State to create/amend Trie nodes
 1. Visit all the accounts that were not accessed in 2., but were remembered in 1, meaning Accounts that had their storage modified but no changes to codehash, balance, nonce. For each key:
    1. walk through the MPT of Account State to create/amend Trie nodes
 1. Calculate the Root Hash
-   1. for each of accounts that had their storage modified (from 1.),
+   1. for each of the accounts that had their storage modified (from 1.),
       1. calculate the storage root hash
       1. store it in the account (decode account, encode, set)
    1. calculate the root hash of the State. **Parallel**
 
-It's worth to mention that even though `RLP` of branches is not stored in the database, its transient form is memoized in memory. This greatly improves the overall performance of Merkleization as reduced the number of fetched data from the database (no calls for children). Of course it requires cache invalidation which is done whenever marking the paths is done.
+Even though the `RLP` of branches is not stored in the database, its transient form is memoized in memory. This greatly improves the overall performance of Merkleization as it reduces the amount of data fetched from the database (no calls for children). Of course, it requires cache invalidation, which happens whenever paths are marked as dirty.
 
 ## Examples
 

From 109b2db086418997a604296763683780952a43ce Mon Sep 17 00:00:00 2001
From: scooletz <scooletz@gmail.com>
Date: Fri, 26 Jul 2024 12:46:32 +0200
Subject: [PATCH 19/19] format

---
 src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
index f2afe93e..96b7cc25 100644
--- a/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
+++ b/src/Paprika.Benchmarks/SlottedArrayBenchmarks.cs
@@ -54,7 +54,7 @@ public SlottedArrayBenchmarks()
                 throw new Exception("Not enough memory");
             }
         }
-        
+
         // Hash colliding
         _hashCollidingKeys = AllocAlignedPage();
 
@@ -68,11 +68,11 @@ public SlottedArrayBenchmarks()
 
             // 1th differentiate with the first
             hashCollidingKeys[i * BytesPerKeyHashColliding + 1] = i;
-            
+
             // 2nd divide by 2 to collide
             hashCollidingKeys[i * BytesPerKeyHashColliding + 2] = (byte)(i / 2);
         }
-        
+
         _hashCollidingMap = AllocAlignedPage();
 
         var hashColliding = new SlottedArray(new Span<byte>(_hashCollidingMap, Page.PageSize));
@@ -84,7 +84,7 @@ public SlottedArrayBenchmarks()
                 throw new Exception("Not enough memory");
             }
         }
-        
+
         return;
 
         static void* AllocAlignedPage()
@@ -120,7 +120,7 @@ public int TryGet(byte index, bool odd)
         if (map.TryGet(key, out _)) count += 1;
         return count;
     }
-    
+
     [Benchmark(OperationsPerInvoke = 4)]
     [Arguments((byte)1)]
     [Arguments((byte)2)]
@@ -194,13 +194,13 @@ private NibblePath GetKey(byte i, bool odd)
 
         return NibblePath.FromKey(slice, odd ? 1 : 0, 4);
     }
-    
+
     private NibblePath GetHashCollidingKey(byte i)
     {
         var span = new Span<byte>(_hashCollidingKeys, BytesPerKeyHashColliding * HashCollidingKeyCount);
         var slice = span.Slice(i * BytesPerKeyHashColliding, BytesPerKeyHashColliding);
 
         // Use full key
-        return NibblePath.FromKey(slice,  0, BytesPerKeyHashColliding * NibblePath.NibblePerByte);
+        return NibblePath.FromKey(slice, 0, BytesPerKeyHashColliding * NibblePath.NibblePerByte);
     }
 }
\ No newline at end of file