diff --git a/README.md b/README.md index f1ffdf22..36e58da3 100644 --- a/README.md +++ b/README.md @@ -689,7 +689,7 @@ To explicitly disable half-precision support, define the following macro before > But if you are running on different generations of devices, it makes sense to pre-compile the library for all supported generations at once, and dispatch at runtime. > This flag does just that and is used to produce the `simsimd.so` shared library, as well as the Python and other bindings. -`SIMSIMD_TARGET_ARM` (`SIMSIMD_TARGET_NEON`, `SIMSIMD_TARGET_SVE`, `SIMSIMD_TARGET_NEON_F16`, `SIMSIMD_TARGET_SVE_F16`, `SIMSIMD_TARGET_NEON_BF16`, `SIMSIMD_TARGET_SVE_BF16`), `SIMSIMD_TARGET_X86` (`SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`): +`SIMSIMD_TARGET_ARM` (`SIMSIMD_TARGET_NEON`, `SIMSIMD_TARGET_SVE`, `SIMSIMD_TARGET_SVE2`, `SIMSIMD_TARGET_NEON_F16`, `SIMSIMD_TARGET_SVE_F16`, `SIMSIMD_TARGET_NEON_BF16`, `SIMSIMD_TARGET_SVE_BF16`), `SIMSIMD_TARGET_X86` (`SIMSIMD_TARGET_HASWELL`, `SIMSIMD_TARGET_SKYLAKE`, `SIMSIMD_TARGET_ICE`, `SIMSIMD_TARGET_GENOA`, `SIMSIMD_TARGET_SAPPHIRE`): > By default, SimSIMD automatically infers the target architecture and pre-compiles as many kernels as possible. > In some cases, you may want to explicitly disable some of the kernels. diff --git a/build.rs b/build.rs index 684a8d4e..520c1b58 100644 --- a/build.rs +++ b/build.rs @@ -17,7 +17,17 @@ fn main() { let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let flags_to_try = match target_arch.as_str() { - "arm" | "aarch64" => vec!["SIMSIMD_TARGET_NEON", "SIMSIMD_TARGET_SVE"], + "arm" | "aarch64" => vec![ + "SIMSIMD_TARGET_SVE2", + "SIMSIMD_TARGET_SVE_BF16", + "SIMSIMD_TARGET_SVE_F16", + "SIMSIMD_TARGET_SVE_I8", + "SIMSIMD_TARGET_SVE", + "SIMSIMD_TARGET_NEON_BF16", + "SIMSIMD_TARGET_NEON_F16", + "SIMSIMD_TARGET_NEON_I8", + "SIMSIMD_TARGET_NEON", + ], _ => vec![ "SIMSIMD_TARGET_SAPPHIRE", "SIMSIMD_TARGET_GENOA", diff --git a/c/lib.c b/c/lib.c index 24c5d6c6..c4520656 100644 --- a/c/lib.c +++ b/c/lib.c @@ -20,6 +20,9 @@ #if !defined(SIMSIMD_TARGET_SVE) && (defined(__linux__)) #define SIMSIMD_TARGET_SVE 1 #endif +#if !defined(SIMSIMD_TARGET_SVE2) && (defined(__linux__)) +#define SIMSIMD_TARGET_SVE2 1 +#endif #if !defined(SIMSIMD_TARGET_HASWELL) && (defined(_MSC_VER) || defined(__APPLE__) || defined(__linux__)) #define SIMSIMD_TARGET_HASWELL 1 #endif diff --git a/cpp/bench.cxx b/cpp/bench.cxx index 1c792e37..d5871371 100644 --- a/cpp/bench.cxx +++ b/cpp/bench.cxx @@ -473,7 +473,7 @@ void measure_sparse(bm::State& state, metric_at metric, metric_at baseline, std: mean_error /= pairs.size(); state.counters["error"] = mean_error; state.counters["bytes"] = - bm::Counter(iterations * pairs[0].a.size_bytes() * pairs[0].b.size_bytes(), bm::Counter::kIsRate); + bm::Counter(iterations * (pairs[0].a.size_bytes() + pairs[0].b.size_bytes()), bm::Counter::kIsRate); state.counters["pairs"] = bm::Counter(iterations, bm::Counter::kIsRate); state.counters["matches"] = std::accumulate(results_contender.begin(), results_contender.end(), 0.0) / results_contender.size(); @@ -482,8 +482,8 @@ void measure_sparse(bm::State& state, metric_at metric, metric_at baseline, std: template void dense_(std::string name, metric_at* distance_func, metric_at* baseline_func) { using pair_t = vectors_pair_gt; - std::string name_dims = name + "_" + std::to_string(dense_dimensions) + "d"; - bm::RegisterBenchmark(name_dims.c_str(), measure_dense, distance_func, 
baseline_func,
+    std::string bench_name = name + "<" + std::to_string(dense_dimensions) + "d>";
+    bm::RegisterBenchmark(bench_name.c_str(), measure_dense, distance_func, baseline_func,
                           dense_dimensions)
         ->MinTime(default_seconds)
         ->Threads(default_threads);
@@ -495,15 +495,18 @@ void sparse_(std::string name, metric_at* distance_func, metric_at* baseline_fun
     using pair_t = vectors_pair_gt;
 
     // Register different lengths, intersection sizes, and distributions
-    // 2 first lengths * 3 second lengths * 3 intersection sizes = 18 benchmarks for each metric.
-    for (std::size_t first_len : {128, 1024}) { //< 2 lengths
-        for (std::size_t second_len_multiplier : {1, 8, 64}) { //< 3 lengths
-            for (std::size_t intersection_size : {1, 8, 64}) { //< 3 sizes
-
+    // 2 first lengths * 3 second length multipliers * 4 intersection grades = 24 combinations for each
+    // metric, of which the oversized pairs (over 8192 entries in the second array) are skipped below,
+    // leaving 20 registered benchmarks.
+    for (std::size_t first_len : {128, 1024}) {                         //< 2 lengths
+        for (std::size_t second_len_multiplier : {1, 8, 64}) {          //< 3 length multipliers
+            for (double intersection_share : {0.01, 0.05, 0.5, 0.95}) { //< 4 intersection grades
+                std::size_t intersection_size = static_cast<std::size_t>(first_len * intersection_share);
                 std::size_t second_len = first_len * second_len_multiplier;
-                std::string test_name = name + "_" + std::to_string(first_len) + "d^" + std::to_string(second_len) +
-                                        "d_w" + std::to_string(intersection_size) + "matches";
-                bm::RegisterBenchmark(test_name.c_str(), measure_sparse, distance_func,
+                if (second_len > 8192)
+                    continue;
+                std::string bench_name = name + "<|A|=" + std::to_string(first_len) +
+                                         ",|B|=" + std::to_string(second_len) +
+                                         ",|A∩B|=" + std::to_string(intersection_size) + ">";
+                bm::RegisterBenchmark(bench_name.c_str(), measure_sparse, distance_func,
                                       baseline_func, first_len, second_len, intersection_size)
                     ->MinTime(default_seconds)
                     ->Threads(default_threads);
@@ -516,8 +519,8 @@ template
 void curved_(std::string name, metric_at* distance_func, metric_at* baseline_func) {
     using pair_t = vectors_pair_gt;
 
-    std::string name_dims = name + "_" + std::to_string(curved_dimensions) + "d";
-    bm::RegisterBenchmark(name_dims.c_str(), measure_curved, distance_func, baseline_func,
+    std::string bench_name = name + "<" + std::to_string(curved_dimensions) + "d>";
+    bm::RegisterBenchmark(bench_name.c_str(), measure_curved, distance_func, baseline_func,
                           curved_dimensions)
         ->MinTime(default_seconds)
         ->Threads(default_threads);
@@ -570,6 +573,7 @@ int main(int argc, char** argv) {
     std::printf("Compile-time settings:\n");
     std::printf("- Arm NEON support enabled: %s\n", flags[SIMSIMD_TARGET_NEON]);
     std::printf("- Arm SVE support enabled: %s\n", flags[SIMSIMD_TARGET_SVE]);
+    std::printf("- Arm SVE2 support enabled: %s\n", flags[SIMSIMD_TARGET_SVE2]);
     std::printf("- x86 Haswell support enabled: %s\n", flags[SIMSIMD_TARGET_HASWELL]);
     std::printf("- x86 Skylake support enabled: %s\n", flags[SIMSIMD_TARGET_SKYLAKE]);
     std::printf("- x86 Ice Lake support enabled: %s\n", flags[SIMSIMD_TARGET_ICE]);
@@ -585,6 +589,7 @@ int main(int argc, char** argv) {
     std::printf("- Arm SVE F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_f16_k) != 0]);
     std::printf("- Arm SVE BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_bf16_k) != 0]);
     std::printf("- Arm SVE I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_i8_k) != 0]);
+    std::printf("- Arm SVE2 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve2_k) != 0]);
     std::printf("- x86 Haswell support enabled: %s\n", flags[(runtime_caps & simsimd_cap_haswell_k)
!= 0]); std::printf("- x86 Skylake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_skylake_k) != 0]); std::printf("- x86 Ice Lake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_ice_k) != 0]); @@ -663,6 +668,9 @@ int main(int argc, char** argv) { curved_("mahalanobis_f16_neon", simsimd_mahalanobis_f16_neon, simsimd_mahalanobis_f16_accurate); curved_("bilinear_bf16_neon", simsimd_bilinear_bf16_neon, simsimd_bilinear_bf16_accurate); curved_("mahalanobis_bf16_neon", simsimd_mahalanobis_bf16_neon, simsimd_mahalanobis_bf16_accurate); + + sparse_("intersect_u16_neon", simsimd_intersect_u16_neon, simsimd_intersect_u16_accurate); + sparse_("intersect_u32_neon", simsimd_intersect_u32_neon, simsimd_intersect_u32_accurate); #endif #if SIMSIMD_TARGET_SVE @@ -687,10 +695,11 @@ int main(int argc, char** argv) { dense_("vdot_f32c_sve", simsimd_vdot_f32c_sve, simsimd_vdot_f32c_accurate); dense_("dot_f64c_sve", simsimd_dot_f64c_sve, simsimd_dot_f64c_serial); dense_("vdot_f64c_sve", simsimd_vdot_f64c_sve, simsimd_vdot_f64c_serial); +#endif - sparse_("intersect_u16_sve", simsimd_intersect_u16_sve, simsimd_intersect_u16_accurate); - sparse_("intersect_u32_sve", simsimd_intersect_u32_sve, simsimd_intersect_u32_accurate); - +#if SIMSIMD_TARGET_SVE2 + sparse_("intersect_u16_sve2", simsimd_intersect_u16_sve2, simsimd_intersect_u16_accurate); + sparse_("intersect_u32_sve2", simsimd_intersect_u32_sve2, simsimd_intersect_u32_accurate); #endif #if SIMSIMD_TARGET_HASWELL diff --git a/cpp/test.c b/cpp/test.c index 066c4b53..078e7d42 100644 --- a/cpp/test.c +++ b/cpp/test.c @@ -28,6 +28,7 @@ void print_capabilities(void) { printf("Compile-time settings:\n"); printf("- Arm NEON support enabled: %s\n", flags[SIMSIMD_TARGET_NEON]); printf("- Arm SVE support enabled: %s\n", flags[SIMSIMD_TARGET_SVE]); + printf("- Arm SVE2 support enabled: %s\n", flags[SIMSIMD_TARGET_SVE2]); printf("- x86 Haswell support enabled: %s\n", flags[SIMSIMD_TARGET_HASWELL]); printf("- x86 Skylake support enabled: %s\n", flags[SIMSIMD_TARGET_SKYLAKE]); printf("- x86 Ice Lake support enabled: %s\n", flags[SIMSIMD_TARGET_ICE]); @@ -43,6 +44,7 @@ void print_capabilities(void) { printf("- Arm SVE F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_f16_k) != 0]); printf("- Arm SVE BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_bf16_k) != 0]); printf("- Arm SVE I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_i8_k) != 0]); + printf("- Arm SVE2 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve2_k) != 0]); printf("- x86 Haswell support enabled: %s\n", flags[(runtime_caps & simsimd_cap_haswell_k) != 0]); printf("- x86 Skylake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_skylake_k) != 0]); printf("- x86 Ice Lake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_ice_k) != 0]); diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index f81cbfd3..e9ac3b2f 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -181,6 +181,8 @@ typedef enum { simsimd_cap_sve_f16_k = 1 << 25, ///< ARM SVE `f16` capability simsimd_cap_sve_bf16_k = 1 << 26, ///< ARM SVE `bf16` capability simsimd_cap_sve_i8_k = 1 << 27, ///< ARM SVE `i8` capability + simsimd_cap_sve2_k = 1 << 28, ///< ARM SVE2 capability + simsimd_cap_sve2p1_k = 1 << 29, ///< ARM SVE2p1 capability } simsimd_capability_t; @@ -437,8 +439,8 @@ SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities_arm(void) { // - 0b0001: SVE2 is implemented // - 0b0010: SVE2.1 is implemented // This 
value must match the existing indicator obtained from ID_AA64PFR0_EL1:
-    // unsigned supports_sve = ((id_aa64zfr0_el1) & 0xF) >= 1;
-    // unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 2;
+    unsigned supports_sve2 = ((id_aa64zfr0_el1) & 0xF) >= 1;
+    unsigned supports_sve2p1 = ((id_aa64zfr0_el1) & 0xF) >= 2;
     unsigned supports_neon = 1; // NEON is always supported
 
     return (simsimd_capability_t)( //
@@ -450,6 +452,8 @@ SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities_arm(void) {
         (simsimd_cap_sve_f16_k * (supports_sve && supports_fp16)) |      //
         (simsimd_cap_sve_bf16_k * (supports_sve && supports_sve_bf16)) | //
         (simsimd_cap_sve_i8_k * (supports_sve && supports_sve_i8mm)) |   //
+        (simsimd_cap_sve2_k * (supports_sve2)) |                         //
+        (simsimd_cap_sve2p1_k * (supports_sve2p1)) |                     //
         (simsimd_cap_serial_k));
 #else // SIMSIMD_DEFINED_LINUX
     return simsimd_cap_serial_k;
@@ -995,10 +999,17 @@ SIMSIMD_PUBLIC void simsimd_find_metric_punned( //
 
     // Unsigned 16-bit integer vectors
     case simsimd_datatype_u16_k: {
-#if SIMSIMD_TARGET_SVE
-        if (viable & simsimd_cap_sve_k)
+#if SIMSIMD_TARGET_SVE2
+        if (viable & simsimd_cap_sve2_k)
             switch (kind) {
-            case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_sve, *c = simsimd_cap_sve_k; return;
+            case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_sve2, *c = simsimd_cap_sve2_k; return;
+            default: break;
+            }
+#endif
+#if SIMSIMD_TARGET_NEON
+        if (viable & simsimd_cap_neon_k)
+            switch (kind) {
+            case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_neon, *c = simsimd_cap_neon_k; return;
             default: break;
             }
 #endif
@@ -1020,10 +1031,17 @@ SIMSIMD_PUBLIC void simsimd_find_metric_punned( //
 
     // Unsigned 32-bit integer vectors
     case simsimd_datatype_u32_k: {
-#if SIMSIMD_TARGET_SVE
-        if (viable & simsimd_cap_sve_k)
+#if SIMSIMD_TARGET_SVE2
+        if (viable & simsimd_cap_sve2_k)
             switch (kind) {
-            case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_sve, *c = simsimd_cap_sve_k; return;
+            case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_sve2, *c = simsimd_cap_sve2_k; return;
+            default: break;
+            }
+#endif
+#if SIMSIMD_TARGET_NEON
+        if (viable & simsimd_cap_neon_k)
+            switch (kind) {
+            case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u32_neon, *c = simsimd_cap_neon_k; return;
             default: break;
             }
 #endif
@@ -1098,6 +1116,7 @@ SIMSIMD_DYNAMIC int simsimd_uses_sve(void);
 SIMSIMD_DYNAMIC int simsimd_uses_sve_f16(void);
 SIMSIMD_DYNAMIC int simsimd_uses_sve_bf16(void);
 SIMSIMD_DYNAMIC int simsimd_uses_sve_i8(void);
+SIMSIMD_DYNAMIC int simsimd_uses_sve2(void);
 SIMSIMD_DYNAMIC int simsimd_uses_haswell(void);
 SIMSIMD_DYNAMIC int simsimd_uses_skylake(void);
 SIMSIMD_DYNAMIC int simsimd_uses_ice(void);
@@ -1252,6 +1271,7 @@ SIMSIMD_PUBLIC int simsimd_uses_sve(void) { return SIMSIMD_TARGET_ARM && SIMSIMD
 SIMSIMD_PUBLIC int simsimd_uses_sve_f16(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE && SIMSIMD_NATIVE_F16; }
 SIMSIMD_PUBLIC int simsimd_uses_sve_bf16(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE && SIMSIMD_NATIVE_BF16; }
 SIMSIMD_PUBLIC int simsimd_uses_sve_i8(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE; }
+SIMSIMD_PUBLIC int simsimd_uses_sve2(void) { return SIMSIMD_TARGET_ARM && SIMSIMD_TARGET_SVE2; }
 SIMSIMD_PUBLIC int simsimd_uses_haswell(void) { return SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_HASWELL; }
 SIMSIMD_PUBLIC int simsimd_uses_skylake(void) { return SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_SKYLAKE; }
 SIMSIMD_PUBLIC int simsimd_uses_ice(void) { return SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_ICE; }
@@ -1671,8 +1691,10 @@ SIMSIMD_PUBLIC void
simsimd_js_f64(simsimd_f64_t const* a, simsimd_f64_t const*
  */
 SIMSIMD_PUBLIC void simsimd_intersect_u16(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length,
                                           simsimd_size_t b_length, simsimd_distance_t* d) {
-#if SIMSIMD_TARGET_SVE
-    simsimd_intersect_u16_sve(a, b, a_length, b_length, d);
-#elif SIMSIMD_TARGET_SKYLAKE
+#if SIMSIMD_TARGET_SVE2
+    simsimd_intersect_u16_sve2(a, b, a_length, b_length, d);
+#elif SIMSIMD_TARGET_NEON
+    simsimd_intersect_u16_neon(a, b, a_length, b_length, d);
+#elif SIMSIMD_TARGET_ICE
     simsimd_intersect_u16_ice(a, b, a_length, b_length, d);
 #else
@@ -1682,8 +1704,10 @@ SIMSIMD_PUBLIC void simsimd_intersect_u16(simsimd_u16_t const* a, simsimd_u16_t
 
 SIMSIMD_PUBLIC void simsimd_intersect_u32(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length,
                                           simsimd_size_t b_length, simsimd_distance_t* d) {
-#if SIMSIMD_TARGET_SVE
-    simsimd_intersect_u32_sve(a, b, a_length, b_length, d);
-#elif SIMSIMD_TARGET_SKYLAKE
+#if SIMSIMD_TARGET_SVE2
+    simsimd_intersect_u32_sve2(a, b, a_length, b_length, d);
+#elif SIMSIMD_TARGET_NEON
+    simsimd_intersect_u32_neon(a, b, a_length, b_length, d);
+#elif SIMSIMD_TARGET_ICE
     simsimd_intersect_u32_ice(a, b, a_length, b_length, d);
 #else
diff --git a/include/simsimd/sparse.h b/include/simsimd/sparse.h
index e7fa9175..25e92288 100644
--- a/include/simsimd/sparse.h
+++ b/include/simsimd/sparse.h
@@ -61,8 +61,10 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_accurate(simsimd_u32_t const* a, simsi
-/* SIMD-powered backends for Arm SVE, mostly using 32-bit arithmetic over variable-length platform-defined word sizes.
+/* SIMD-powered backends for Arm NEON and SVE2, the latter using 32-bit arithmetic over variable-length platform-defined word sizes.
  * Designed for Arm Graviton 3, Microsoft Cobalt, as well as Nvidia Grace and newer Ampere Altra CPUs.
  */
-SIMSIMD_PUBLIC void simsimd_intersect_u32_sve(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t* results);
-SIMSIMD_PUBLIC void simsimd_intersect_u16_sve(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t* results);
+SIMSIMD_PUBLIC void simsimd_intersect_u32_neon(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t* results);
+SIMSIMD_PUBLIC void simsimd_intersect_u16_neon(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t* results);
+SIMSIMD_PUBLIC void simsimd_intersect_u32_sve2(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t* results);
+SIMSIMD_PUBLIC void simsimd_intersect_u16_sve2(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t* results);
 
 /* SIMD-powered backends for various generations of AVX512 CPUs.
  * Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop.
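+
+/* A usage sketch (illustrative only, not part of this patch): the dynamically
+ * dispatched entry points above pick the best compiled backend at runtime -
+ * SVE2, NEON, AVX-512, or the serial fallback:
+ *
+ *      simsimd_u32_t a[] = {2, 3, 5, 7, 11}; // sorted, unique
+ *      simsimd_u32_t b[] = {3, 5, 8, 13};    // sorted, unique
+ *      simsimd_distance_t matches = 0;
+ *      simsimd_intersect_u32(a, b, 5, 4, &matches); // matches == 2
+ *      int has_sve2 = simsimd_uses_sve2();          // the capability check added above
+ */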
@@ -149,252 +151,653 @@ SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u32, size) // simsimd_intersect_u32_ser #if SIMSIMD_TARGET_X86 #if SIMSIMD_TARGET_ICE #pragma GCC push_options -#pragma GCC target("avx512f", "avx512vl", "bmi2", "avx512bw") -#pragma clang attribute push(__attribute__((target("avx512f,avx512vl,bmi2,avx512bw"))), apply_to = function) - -SIMSIMD_PUBLIC void simsimd_intersect_u16_ice(simsimd_u16_t const* shorter, simsimd_u16_t const* longer, - simsimd_size_t shorter_length, simsimd_size_t longer_length, - simsimd_distance_t* results) { - simsimd_size_t intersection_count = 0; - simsimd_size_t shorter_idx = 0, longer_idx = 0; - simsimd_size_t longer_load_size; - __mmask32 longer_mask; - - while (shorter_idx < shorter_length && longer_idx < longer_length) { - // Load `shorter_member` and broadcast it to shorter vector, load `longer_members_vec` from memory. - simsimd_size_t longer_remaining = longer_length - longer_idx; - simsimd_u16_t shorter_member = shorter[shorter_idx]; - __m512i shorter_member_vec = _mm512_set1_epi16(*(short*)&shorter_member); - __m512i longer_members_vec; - if (longer_remaining < 32) { - longer_load_size = longer_remaining; - longer_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, longer_remaining); - } else { - longer_load_size = 32; - longer_mask = 0xFFFFFFFF; +#pragma GCC target("avx512f", "avx512vl", "bmi2", "lzcnt", "popcnt", "avx512bw", "avx512vbmi2") +#pragma clang attribute push(__attribute__((target("avx512f,avx512vl,bmi2,lzcnt,popcnt,avx512bw,avx512vbmi2"))), \ + apply_to = function) + +/* The AVX-512 implementations are inspired by the "Faster-Than-Native Alternatives + * for x86 VP2INTERSECT Instructions" paper by Guille Diez-Canas, 2022. + * + * https://github.com/mozonaut/vp2intersect + * https://arxiv.org/pdf/2112.06342.pdf + * + * For R&D purposes, it's important to keep the following latencies in mind: + * + * - `_mm512_permutex_epi64` - needs F - 3 cycles latency + * - `_mm512_shuffle_epi8` - needs BW - 1 cycle latency + * - `_mm512_permutexvar_epi16` - needs BW - 4-6 cycles latency + * - `_mm512_permutexvar_epi8` - needs VBMI - 3 cycles latency + */ + +SIMSIMD_INTERNAL simsimd_u32_t _mm512_2intersect_epi16_mask(__m512i a, __m512i b) { + __m512i a1 = _mm512_alignr_epi32(a, a, 4); + __m512i a2 = _mm512_alignr_epi32(a, a, 8); + __m512i a3 = _mm512_alignr_epi32(a, a, 12); + + __m512i b1 = _mm512_shuffle_epi32(b, _MM_PERM_ADCB); + __m512i b2 = _mm512_shuffle_epi32(b, _MM_PERM_BADC); + __m512i b3 = _mm512_shuffle_epi32(b, _MM_PERM_CBAD); + + __m512i b01 = _mm512_shrdi_epi32(b, b, 16); + __m512i b11 = _mm512_shrdi_epi32(b1, b1, 16); + __m512i b21 = _mm512_shrdi_epi32(b2, b2, 16); + __m512i b31 = _mm512_shrdi_epi32(b3, b3, 16); + + __mmask32 nm00 = _mm512_cmpneq_epi16_mask(a, b); + __mmask32 nm01 = _mm512_cmpneq_epi16_mask(a1, b); + __mmask32 nm02 = _mm512_cmpneq_epi16_mask(a2, b); + __mmask32 nm03 = _mm512_cmpneq_epi16_mask(a3, b); + + __mmask32 nm10 = _mm512_mask_cmpneq_epi16_mask(nm00, a, b01); + __mmask32 nm11 = _mm512_mask_cmpneq_epi16_mask(nm01, a1, b01); + __mmask32 nm12 = _mm512_mask_cmpneq_epi16_mask(nm02, a2, b01); + __mmask32 nm13 = _mm512_mask_cmpneq_epi16_mask(nm03, a3, b01); + + __mmask32 nm20 = _mm512_mask_cmpneq_epi16_mask(nm10, a, b1); + __mmask32 nm21 = _mm512_mask_cmpneq_epi16_mask(nm11, a1, b1); + __mmask32 nm22 = _mm512_mask_cmpneq_epi16_mask(nm12, a2, b1); + __mmask32 nm23 = _mm512_mask_cmpneq_epi16_mask(nm13, a3, b1); + + __mmask32 nm30 = _mm512_mask_cmpneq_epi16_mask(nm20, a, b11); + __mmask32 nm31 = 
_mm512_mask_cmpneq_epi16_mask(nm21, a1, b11); + __mmask32 nm32 = _mm512_mask_cmpneq_epi16_mask(nm22, a2, b11); + __mmask32 nm33 = _mm512_mask_cmpneq_epi16_mask(nm23, a3, b11); + + __mmask32 nm40 = _mm512_mask_cmpneq_epi16_mask(nm30, a, b2); + __mmask32 nm41 = _mm512_mask_cmpneq_epi16_mask(nm31, a1, b2); + __mmask32 nm42 = _mm512_mask_cmpneq_epi16_mask(nm32, a2, b2); + __mmask32 nm43 = _mm512_mask_cmpneq_epi16_mask(nm33, a3, b2); + + __mmask32 nm50 = _mm512_mask_cmpneq_epi16_mask(nm40, a, b21); + __mmask32 nm51 = _mm512_mask_cmpneq_epi16_mask(nm41, a1, b21); + __mmask32 nm52 = _mm512_mask_cmpneq_epi16_mask(nm42, a2, b21); + __mmask32 nm53 = _mm512_mask_cmpneq_epi16_mask(nm43, a3, b21); + + __mmask32 nm60 = _mm512_mask_cmpneq_epi16_mask(nm50, a, b3); + __mmask32 nm61 = _mm512_mask_cmpneq_epi16_mask(nm51, a1, b3); + __mmask32 nm62 = _mm512_mask_cmpneq_epi16_mask(nm52, a2, b3); + __mmask32 nm63 = _mm512_mask_cmpneq_epi16_mask(nm53, a3, b3); + + __mmask32 nm70 = _mm512_mask_cmpneq_epi16_mask(nm60, a, b31); + __mmask32 nm71 = _mm512_mask_cmpneq_epi16_mask(nm61, a1, b31); + __mmask32 nm72 = _mm512_mask_cmpneq_epi16_mask(nm62, a2, b31); + __mmask32 nm73 = _mm512_mask_cmpneq_epi16_mask(nm63, a3, b31); + + return ~(simsimd_u32_t)(nm70 & simsimd_u32_rol(nm71, 8) & simsimd_u32_rol(nm72, 16) & simsimd_u32_ror(nm73, 8)); +} + +SIMSIMD_INTERNAL simsimd_u16_t _mm512_2intersect_epi32_mask(__m512i a, __m512i b) { + __m512i a1 = _mm512_alignr_epi32(a, a, 4); + __m512i b1 = _mm512_shuffle_epi32(b, _MM_PERM_ADCB); + __mmask16 nm00 = _mm512_cmpneq_epi32_mask(a, b); + + __m512i a2 = _mm512_alignr_epi32(a, a, 8); + __m512i a3 = _mm512_alignr_epi32(a, a, 12); + __mmask16 nm01 = _mm512_cmpneq_epi32_mask(a1, b); + __mmask16 nm02 = _mm512_cmpneq_epi32_mask(a2, b); + + __mmask16 nm03 = _mm512_cmpneq_epi32_mask(a3, b); + __mmask16 nm10 = _mm512_mask_cmpneq_epi32_mask(nm00, a, b1); + __mmask16 nm11 = _mm512_mask_cmpneq_epi32_mask(nm01, a1, b1); + + __m512i b2 = _mm512_shuffle_epi32(b, _MM_PERM_BADC); + __mmask16 nm12 = _mm512_mask_cmpneq_epi32_mask(nm02, a2, b1); + __mmask16 nm13 = _mm512_mask_cmpneq_epi32_mask(nm03, a3, b1); + __mmask16 nm20 = _mm512_mask_cmpneq_epi32_mask(nm10, a, b2); + + __m512i b3 = _mm512_shuffle_epi32(b, _MM_PERM_CBAD); + __mmask16 nm21 = _mm512_mask_cmpneq_epi32_mask(nm11, a1, b2); + __mmask16 nm22 = _mm512_mask_cmpneq_epi32_mask(nm12, a2, b2); + __mmask16 nm23 = _mm512_mask_cmpneq_epi32_mask(nm13, a3, b2); + + __mmask16 nm0 = _mm512_mask_cmpneq_epi32_mask(nm20, a, b3); + __mmask16 nm1 = _mm512_mask_cmpneq_epi32_mask(nm21, a1, b3); + __mmask16 nm2 = _mm512_mask_cmpneq_epi32_mask(nm22, a2, b3); + __mmask16 nm3 = _mm512_mask_cmpneq_epi32_mask(nm23, a3, b3); + + return ~(simsimd_u16_t)(nm0 & simsimd_u16_rol(nm1, 4) & simsimd_u16_rol(nm2, 8) & simsimd_u16_ror(nm3, 4)); +} + +SIMSIMD_PUBLIC void simsimd_intersect_u16_ice(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length, + simsimd_size_t b_length, simsimd_distance_t* results) { + + // The baseline implementation for very small arrays (2 registers or less) can be quite simple: + if (a_length < 64 && b_length < 64) { + simsimd_intersect_u16_serial(a, b, a_length, b_length, results); + return; + } + + simsimd_u16_t const* const a_end = a + a_length; + simsimd_u16_t const* const b_end = b + b_length; + simsimd_size_t c = 0; + union vec_t { + __m512i zmm; + simsimd_u16_t u16[32]; + simsimd_u8_t u8[64]; + } a_vec, b_vec; + + while (a + 32 < a_end && b + 32 < b_end) { + a_vec.zmm = _mm512_loadu_si512((__m512i const*)a); + b_vec.zmm = 
_mm512_loadu_si512((__m512i const*)b); + + // Intersecting registers with `_mm512_2intersect_epi16_mask` involves a lot of shuffling + // and comparisons, so we want to avoid it if the slices don't overlap at all.. + simsimd_u16_t a_min; + simsimd_u16_t a_max = a_vec.u16[31]; + simsimd_u16_t b_min = b_vec.u16[0]; + simsimd_u16_t b_max = b_vec.u16[31]; + + // If the slices don't overlap, advance the appropriate pointer + while (a_max < b_min && a + 64 < a_end) { + a += 32; + a_vec.zmm = _mm512_loadu_si512((__m512i const*)a); + a_max = a_vec.u16[31]; } - longer_members_vec = _mm512_maskz_loadu_epi16(longer_mask, (__m512i const*)(longer + longer_idx)); - - // Compare `shorter_member` with each element in `longer_members_vec`, - // and jump to the position of the match. There can be only one match at most! - __mmask32 equal_mask = _mm512_mask_cmpeq_epu16_mask(longer_mask, shorter_member_vec, longer_members_vec); - simsimd_size_t equal_count = equal_mask != 0; - intersection_count += equal_count; - - // When comparing a scalar against a sorted array, we can find three types of elements: - // - entries that scalar is greater than, - // - entries that scalar is equal to, - // - entries that scalar is less than, - // ... in that order! Any of them can be an empty set. - __mmask32 greater_mask = _mm512_mask_cmplt_epu16_mask(longer_mask, longer_members_vec, shorter_member_vec); - simsimd_size_t greater_count = _mm_popcnt_u32(greater_mask); - simsimd_size_t smaller_exists = longer_load_size > greater_count - equal_count; - - // Advance the first array: - // - to the next element, if a match was found, - // - to the next element, if the current element is smaller than any elements in the second array. - shorter_idx += equal_count | smaller_exists; - // Advance the second array: - // - to the next element after match, if a match was found, - // - to the first element that is greater than the current element in the first array, if no match was found. - longer_idx += greater_count + equal_count; - - // At any given cycle, take one entry from shorter array and compare it with multiple from the longer array. - // For that, we need to swap the arrays if necessary. 
- if ((shorter_length - shorter_idx) > (longer_length - longer_idx)) { - simsimd_u16_t const* temp_array = shorter; - shorter = longer, longer = temp_array; - simsimd_size_t temp_length = shorter_length; - shorter_length = longer_length, longer_length = temp_length; - simsimd_size_t temp_idx = shorter_idx; - shorter_idx = longer_idx, longer_idx = temp_idx; + a_min = a_vec.u16[0]; + while (b_max < a_min && b + 64 < b_end) { + b += 32; + b_vec.zmm = _mm512_loadu_si512((__m512i const*)b); + b_max = b_vec.u16[31]; } + b_min = b_vec.u16[0]; + + // Now we are likely to have some overlap, so we can intersect the registers + __mmask32 a_matches = _mm512_2intersect_epi16_mask(a_vec.zmm, b_vec.zmm); + + // The paper also contained a very nice procedure for exporting the matches, + // but we don't need it here: + // _mm512_mask_compressstoreu_epi16(c, a_matches, a_vec); + c += _mm_popcnt_u32(a_matches); // The `_popcnt32` symbol isn't recognized by MSVC + + __m512i a_last_broadcasted = _mm512_set1_epi16(*(short const*)&a_max); + __m512i b_last_broadcasted = _mm512_set1_epi16(*(short const*)&b_max); + __mmask32 a_step_mask = _mm512_cmple_epu16_mask(a_vec.zmm, b_last_broadcasted); + __mmask32 b_step_mask = _mm512_cmple_epu16_mask(b_vec.zmm, a_last_broadcasted); + a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask); + b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask); } - *results = intersection_count; + + simsimd_intersect_u16_serial(a, b, a_end - a, b_end - b, results); + *results += c; } -SIMSIMD_PUBLIC void simsimd_intersect_u32_ice(simsimd_u32_t const* shorter, simsimd_u32_t const* longer, - simsimd_size_t shorter_length, simsimd_size_t longer_length, - simsimd_distance_t* results) { - simsimd_size_t intersection_count = 0; - simsimd_size_t shorter_idx = 0, longer_idx = 0; - simsimd_size_t longer_load_size; - __mmask16 longer_mask; - - while (shorter_idx < shorter_length && longer_idx < longer_length) { - // Load `shorter_member` and broadcast it to shorter vector, load `longer_members_vec` from memory. - simsimd_size_t longer_remaining = longer_length - longer_idx; - simsimd_u32_t shorter_member = shorter[shorter_idx]; - __m512i shorter_member_vec = _mm512_set1_epi32(*(int*)&shorter_member); - __m512i longer_members_vec; - if (longer_remaining < 16) { - longer_load_size = longer_remaining; - longer_mask = (__mmask16)_bzhi_u32(0xFFFF, longer_remaining); - } else { - longer_load_size = 16; - longer_mask = 0xFFFF; +SIMSIMD_PUBLIC void simsimd_intersect_u32_ice(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length, + simsimd_size_t b_length, simsimd_distance_t* results) { + + // The baseline implementation for very small arrays (2 registers or less) can be quite simple: + if (a_length < 32 && b_length < 32) { + simsimd_intersect_u32_serial(a, b, a_length, b_length, results); + return; + } + + simsimd_u32_t const* const a_end = a + a_length; + simsimd_u32_t const* const b_end = b + b_length; + simsimd_size_t c = 0; + union vec_t { + __m512i zmm; + simsimd_u32_t u32[16]; + simsimd_u8_t u8[64]; + } a_vec, b_vec; + + while (a + 16 < a_end && b + 16 < b_end) { + a_vec.zmm = _mm512_loadu_si512((__m512i const*)a); + b_vec.zmm = _mm512_loadu_si512((__m512i const*)b); + + // Intersecting registers with `_mm512_2intersect_epi32_mask` involves a lot of shuffling + // and comparisons, so we want to avoid it if the slices don't overlap at all.. 
+ simsimd_u32_t a_min; + simsimd_u32_t a_max = a_vec.u32[15]; + simsimd_u32_t b_min = b_vec.u32[0]; + simsimd_u32_t b_max = b_vec.u32[15]; + + // If the slices don't overlap, advance the appropriate pointer + while (a_max < b_min && a + 32 < a_end) { + a += 16; + a_vec.zmm = _mm512_loadu_si512((__m512i const*)a); + a_max = a_vec.u32[15]; } - longer_members_vec = _mm512_maskz_loadu_epi32(longer_mask, (__m512i const*)(longer + longer_idx)); - - // Compare `shorter_member` with each element in `longer_members_vec`, - // and jump to the position of the match. There can be only one match at most! - __mmask16 equal_mask = _mm512_mask_cmpeq_epu32_mask(longer_mask, shorter_member_vec, longer_members_vec); - simsimd_size_t equal_count = equal_mask != 0; - intersection_count += equal_count; - - // When comparing a scalar against a sorted array, we can find three types of elements: - // - entries that scalar is greater than, - // - entries that scalar is equal to, - // - entries that scalar is less than, - // ... in that order! Any of them can be an empty set. - __mmask16 greater_mask = _mm512_mask_cmplt_epu32_mask(longer_mask, longer_members_vec, shorter_member_vec); - simsimd_size_t greater_count = _mm_popcnt_u32(greater_mask); - simsimd_size_t smaller_exists = longer_load_size > greater_count - equal_count; - - // Advance the first array: - // - to the next element, if a match was found, - // - to the next element, if the current element is smaller than any elements in the second array. - shorter_idx += equal_count | smaller_exists; - // Advance the second array: - // - to the next element after match, if a match was found, - // - to the first element that is greater than the current element in the first array, if no match was found. - longer_idx += greater_count + equal_count; - - // At any given cycle, take one entry from shorter array and compare it with multiple from the longer array. - // For that, we need to swap the arrays if necessary. 
- if ((shorter_length - shorter_idx) > (longer_length - longer_idx)) { - simsimd_u32_t const* temp_array = shorter; - shorter = longer, longer = temp_array; - simsimd_size_t temp_length = shorter_length; - shorter_length = longer_length, longer_length = temp_length; - simsimd_size_t temp_idx = shorter_idx; - shorter_idx = longer_idx, longer_idx = temp_idx; + a_min = a_vec.u32[0]; + while (b_max < a_min && b + 32 < b_end) { + b += 16; + b_vec.zmm = _mm512_loadu_si512((__m512i const*)b); + b_max = b_vec.u32[15]; } + b_min = b_vec.u32[0]; + + // Now we are likely to have some overlap, so we can intersect the registers + __mmask16 a_matches = _mm512_2intersect_epi32_mask(a_vec.zmm, b_vec.zmm); + + // The paper also contained a very nice procedure for exporting the matches, + // but we don't need it here: + // _mm512_mask_compressstoreu_epi32(c, a_matches, a_vec); + c += _mm_popcnt_u32(a_matches); // The `_popcnt32` symbol isn't recognized by MSVC + + __m512i a_last_broadcasted = _mm512_set1_epi32(*(int const*)&a_max); + __m512i b_last_broadcasted = _mm512_set1_epi32(*(int const*)&b_max); + __mmask16 a_step_mask = _mm512_cmple_epu32_mask(a_vec.zmm, b_last_broadcasted); + __mmask16 b_step_mask = _mm512_cmple_epu32_mask(b_vec.zmm, a_last_broadcasted); + a += 32 - _lzcnt_u32((simsimd_u32_t)a_step_mask); + b += 32 - _lzcnt_u32((simsimd_u32_t)b_step_mask); } - *results = intersection_count; + + simsimd_intersect_u32_serial(a, b, a_end - a, b_end - b, results); + *results += c; } #pragma clang attribute pop #pragma GCC pop_options -#endif // SIMSIMD_TARGET_SKYLAKE +#endif // SIMSIMD_TARGET_ICE #endif // SIMSIMD_TARGET_X86 #if SIMSIMD_TARGET_ARM -#if SIMSIMD_TARGET_SVE - +#if SIMSIMD_TARGET_NEON #pragma GCC push_options -#pragma GCC target("arch=armv8.2-a+sve") -#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve"))), apply_to = function) - -SIMSIMD_PUBLIC void simsimd_intersect_u16_sve(simsimd_u16_t const* shorter, simsimd_u16_t const* longer, - simsimd_size_t shorter_length, simsimd_size_t longer_length, - simsimd_distance_t* results) { - - // Temporarily disable SVE: https://github.com/ashvardanian/SimSIMD/issues/168 - simsimd_intersect_u16_serial(shorter, longer, shorter_length, longer_length, results); - return; - - // SVE implementations with 128-bit registers can only fit 8x 16-bit words, - // making this kernel quite inefficient. Let's aim for registers of 256 bits and larger. - simsimd_size_t longer_load_size = svcnth(); - if (longer_load_size < 16) { - simsimd_intersect_u16_serial(shorter, longer, shorter_length, longer_length, results); +#pragma GCC target("arch=armv8.2-a") +#pragma clang attribute push(__attribute__((target("arch=armv8.2-a"))), apply_to = function) + +/** + * @brief Uses `vshrn` to produce a bitmask, similar to `movemask` in SSE. 
+ * https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+ */
+SIMSIMD_INTERNAL simsimd_u64_t _simsimd_u8_to_u4_neon(uint8x16_t vec) {
+    return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0);
+}
+
+SIMSIMD_INTERNAL int _simsimd_clz_u64(simsimd_u64_t x) {
+    // The count is undefined for a zero input: `__builtin_clzll(0)` is UB and the
+    // generic shift loop below would never terminate, so handle that case first.
+    if (x == 0)
+        return 64;
+// On GCC and Clang use the builtin, otherwise use the generic implementation
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_clzll(x);
+#else
+    int n = 0;
+    while ((x & 0x8000000000000000ull) == 0)
+        n++, x <<= 1;
+    return n;
+#endif
+}
+
+SIMSIMD_INTERNAL uint32x4_t _simsimd_intersect_u32x4_neon(uint32x4_t a, uint32x4_t b) {
+    uint32x4_t b1 = vextq_u32(b, b, 1);
+    uint32x4_t b2 = vextq_u32(b, b, 2);
+    uint32x4_t b3 = vextq_u32(b, b, 3);
+    uint32x4_t nm00 = vceqq_u32(a, b);
+    uint32x4_t nm01 = vceqq_u32(a, b1);
+    uint32x4_t nm02 = vceqq_u32(a, b2);
+    uint32x4_t nm03 = vceqq_u32(a, b3);
+    uint32x4_t nm = vorrq_u32(vorrq_u32(nm00, nm01), vorrq_u32(nm02, nm03));
+    return nm;
+}
+
+SIMSIMD_INTERNAL uint16x8_t _simsimd_intersect_u16x8_neon(uint16x8_t a, uint16x8_t b) {
+    uint16x8_t b1 = vextq_u16(b, b, 1);
+    uint16x8_t b2 = vextq_u16(b, b, 2);
+    uint16x8_t b3 = vextq_u16(b, b, 3);
+    uint16x8_t b4 = vextq_u16(b, b, 4);
+    uint16x8_t b5 = vextq_u16(b, b, 5);
+    uint16x8_t b6 = vextq_u16(b, b, 6);
+    uint16x8_t b7 = vextq_u16(b, b, 7);
+    uint16x8_t nm00 = vceqq_u16(a, b);
+    uint16x8_t nm01 = vceqq_u16(a, b1);
+    uint16x8_t nm02 = vceqq_u16(a, b2);
+    uint16x8_t nm03 = vceqq_u16(a, b3);
+    uint16x8_t nm04 = vceqq_u16(a, b4);
+    uint16x8_t nm05 = vceqq_u16(a, b5);
+    uint16x8_t nm06 = vceqq_u16(a, b6);
+    uint16x8_t nm07 = vceqq_u16(a, b7);
+    uint16x8_t nm = vorrq_u16(vorrq_u16(vorrq_u16(nm00, nm01), vorrq_u16(nm02, nm03)),
+                              vorrq_u16(vorrq_u16(nm04, nm05), vorrq_u16(nm06, nm07)));
+    return nm;
+}
+
+SIMSIMD_PUBLIC void simsimd_intersect_u16_neon(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length,
+                                               simsimd_size_t b_length, simsimd_distance_t* results) {
+
+    // The baseline implementation for very small arrays (2 registers or less) can be quite simple:
+    if (a_length < 32 && b_length < 32) {
+        simsimd_intersect_u16_serial(a, b, a_length, b_length, results);
         return;
     }
 
-    simsimd_size_t intersection_count = 0;
-    simsimd_size_t shorter_idx = 0, longer_idx = 0;
-    while (shorter_idx < shorter_length && longer_idx < longer_length) {
-        // Load `shorter_member` and broadcast it, load `longer_members_vec` from memory
-        simsimd_size_t longer_remaining = longer_length - longer_idx;
-        simsimd_u16_t shorter_member = shorter[shorter_idx];
-        svbool_t pg = svwhilelt_b16_u64(longer_idx, longer_length);
-        svuint16_t shorter_member_vec = svdup_n_u16(shorter_member);
-        svuint16_t longer_members_vec = svld1_u16(pg, longer + longer_idx);
-
-        // Compare `shorter_member` with each element in `longer_members_vec`
-        svbool_t equal_mask = svcmpeq_u16(pg, shorter_member_vec, longer_members_vec);
-        simsimd_size_t equal_count = svcntp_b16(svptrue_b16(), equal_mask);
-        intersection_count += equal_count;
-
-        // Count the number of elements in `longer_members_vec` that are less than `shorter_member`
-        svbool_t smaller_mask = svcmplt_u16(pg, longer_members_vec, shorter_member_vec);
-        simsimd_size_t smaller_count = svcntp_b16(svptrue_b16(), smaller_mask);
-
-        // Advance pointers
-        longer_load_size = longer_remaining < longer_load_size ?
longer_remaining : longer_load_size;
-        shorter_idx += (longer_load_size - smaller_count - equal_count) != 0;
-        longer_idx += smaller_count + equal_count;
-
-        // Swap arrays if necessary
-        if ((shorter_length - shorter_idx) > (longer_length - longer_idx)) {
-            simsimd_u16_t const* temp_array = shorter;
-            shorter = longer, longer = temp_array;
-            simsimd_size_t temp_length = shorter_length;
-            shorter_length = longer_length, longer_length = temp_length;
-            simsimd_size_t temp_idx = shorter_idx;
-            shorter_idx = longer_idx, longer_idx = temp_idx;
+    simsimd_u16_t const* const a_end = a + a_length;
+    simsimd_u16_t const* const b_end = b + b_length;
+    union vec_t {
+        uint16x8_t u16x8;
+        simsimd_u16_t u16[8];
+        simsimd_u8_t u8[16];
+    } a_vec, b_vec, c_counts_vec;
+    c_counts_vec.u16x8 = vdupq_n_u16(0);
+
+    while (a + 8 < a_end && b + 8 < b_end) {
+        a_vec.u16x8 = vld1q_u16(a);
+        b_vec.u16x8 = vld1q_u16(b);
+
+        // Intersecting registers with `_simsimd_intersect_u16x8_neon` involves a lot of shuffling
+        // and comparisons, so we want to avoid it if the slices don't overlap at all..
+        simsimd_u16_t a_min;
+        simsimd_u16_t a_max = a_vec.u16[7];
+        simsimd_u16_t b_min = b_vec.u16[0];
+        simsimd_u16_t b_max = b_vec.u16[7];
+
+        // If the slices don't overlap, advance the appropriate pointer
+        while (a_max < b_min && a + 16 < a_end) {
+            a += 8;
+            a_vec.u16x8 = vld1q_u16(a);
+            a_max = a_vec.u16[7];
+        }
+        a_min = a_vec.u16[0];
+        while (b_max < a_min && b + 16 < b_end) {
+            b += 8;
+            b_vec.u16x8 = vld1q_u16(b);
+            b_max = b_vec.u16[7];
+        }
+        b_min = b_vec.u16[0];
+
+        // Now we are likely to have some overlap, so we can intersect the registers.
+        // We could do that by performing a population count on every iteration, but it's not the cheapest option:
+        //
+        //      simsimd_u64_t a_matches = __builtin_popcountll(
+        //          _simsimd_u8_to_u4_neon(vreinterpretq_u8_u16(
+        //              _simsimd_intersect_u16x8_neon(a_vec.u16x8, b_vec.u16x8))));
+        //      c += a_matches / 8;
+        //
+        // Alternatively, we can transform the match-masks into "ones", accumulate them
+        // across iterations, and merge them all in the end.
+        uint16x8_t a_matches = _simsimd_intersect_u16x8_neon(a_vec.u16x8, b_vec.u16x8);
+        c_counts_vec.u16x8 = vaddq_u16(c_counts_vec.u16x8, vandq_u16(a_matches, vdupq_n_u16(1)));
+
+        // Counting leading zeros is tricky. On Arm we can use inline assembly to get the result,
+        // but MSVC doesn't support that:
+        //
+        //      SIMSIMD_INTERNAL int _simsimd_clz_u64(simsimd_u64_t value) {
+        //          simsimd_u64_t result;
+        //          __asm__("clz %x0, %x1" : "=r"(result) : "r"(value));
+        //          return (int)result;
+        //      }
+        //
+        // Alternatively, we can use the `vclz_u32` NEON intrinsic.
+        // It computes the number of leading zeros for both `a_step` and `b_step` in parallel.
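+        //
+        // In scalar form - an illustrative model, not part of the kernel - the step
+        // computation below boils down to:
+        //
+        //      simsimd_size_t a_lanes_to_skip = (64 - _simsimd_clz_u64(a_mask_bits)) / 8;
+        //
+        // with `a_mask_bits` being the hypothetical 64-bit comparison mask, because the
+        // `vshrn` trick above dedicates 8 mask bits to every 16-bit lane. The mask can
+        // only come out empty when the registers don't overlap at all, which is exactly
+        // the zero case that `_simsimd_clz_u64` guards against.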
+ uint16x8_t a_last_broadcasted = vdupq_n_u16(a_max); + uint16x8_t b_last_broadcasted = vdupq_n_u16(b_max); + simsimd_u64_t a_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( // + vreinterpretq_u8_u16(vcleq_u16(a_vec.u16x8, b_last_broadcasted)))); + simsimd_u64_t b_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( // + vreinterpretq_u8_u16(vcleq_u16(b_vec.u16x8, a_last_broadcasted)))); + a += (64 - a_step) / 8; + b += (64 - b_step) / 8; } - *results = intersection_count; -} -SIMSIMD_PUBLIC void simsimd_intersect_u32_sve(simsimd_u32_t const* shorter, simsimd_u32_t const* longer, - simsimd_size_t shorter_length, simsimd_size_t longer_length, - simsimd_distance_t* results) { + simsimd_intersect_u16_serial(a, b, a_end - a, b_end - b, results); + *results += vaddvq_u16(c_counts_vec.u16x8); +} - // Temporarily disable SVE: https://github.com/ashvardanian/SimSIMD/issues/168 - simsimd_intersect_u32_serial(shorter, longer, shorter_length, longer_length, results); - return; +SIMSIMD_PUBLIC void simsimd_intersect_u32_neon(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length, + simsimd_size_t b_length, simsimd_distance_t* results) { - // SVE implementations with 128-bit registers can only fit 4x 32-bit words, - // making this kernel quite inefficient. Let's aim for registers of 256 bits and larger. - simsimd_size_t longer_load_size = svcntw(); - if (longer_load_size < 8) { - simsimd_intersect_u32_serial(shorter, longer, shorter_length, longer_length, results); + // The baseline implementation for very small arrays (2 registers or less) can be quite simple: + if (a_length < 32 && b_length < 32) { + simsimd_intersect_u32_serial(a, b, a_length, b_length, results); return; } - simsimd_size_t intersection_count = 0; - simsimd_size_t shorter_idx = 0, longer_idx = 0; - while (shorter_idx < shorter_length && longer_idx < longer_length) { - // Load `shorter_member` and broadcast it, load `longer_members_vec` from memory - simsimd_size_t longer_remaining = longer_length - longer_idx; - simsimd_u32_t shorter_member = shorter[shorter_idx]; - svbool_t pg = svwhilelt_b32_u64(longer_idx, longer_length); - svuint32_t shorter_member_vec = svdup_n_u32(shorter_member); - svuint32_t longer_members_vec = svld1_u32(pg, longer + longer_idx); - - // Compare `shorter_member` with each element in `longer_members_vec` - svbool_t equal_mask = svcmpeq_u32(pg, shorter_member_vec, longer_members_vec); - simsimd_size_t equal_count = svcntp_b32(svptrue_b32(), equal_mask); - intersection_count += equal_count; - - // Count the number of elements in `longer_members_vec` that are less than `shorter_member` - svbool_t smaller_mask = svcmplt_u32(pg, longer_members_vec, shorter_member_vec); - simsimd_size_t smaller_count = svcntp_b32(svptrue_b32(), smaller_mask); - - // Advance pointers - longer_load_size = longer_remaining < longer_load_size ? 
longer_remaining : longer_load_size;
-        shorter_idx += (longer_load_size - smaller_count - equal_count) != 0;
-        longer_idx += smaller_count + equal_count;
-
-        // Swap arrays if necessary
-        if ((shorter_length - shorter_idx) > (longer_length - longer_idx)) {
-            simsimd_u32_t const* temp_array = shorter;
-            shorter = longer, longer = temp_array;
-            simsimd_size_t temp_length = shorter_length;
-            shorter_length = longer_length, longer_length = temp_length;
-            simsimd_size_t temp_idx = shorter_idx;
-            shorter_idx = longer_idx, longer_idx = temp_idx;
+    simsimd_u32_t const* const a_end = a + a_length;
+    simsimd_u32_t const* const b_end = b + b_length;
+    union vec_t {
+        uint32x4_t u32x4;
+        simsimd_u32_t u32[4];
+        simsimd_u8_t u8[16];
+    } a_vec, b_vec, c_counts_vec;
+    c_counts_vec.u32x4 = vdupq_n_u32(0);
+
+    while (a + 4 < a_end && b + 4 < b_end) {
+        a_vec.u32x4 = vld1q_u32(a);
+        b_vec.u32x4 = vld1q_u32(b);
+
+        // Intersecting registers with `_simsimd_intersect_u32x4_neon` involves a lot of shuffling
+        // and comparisons, so we want to avoid it if the slices don't overlap at all..
+        simsimd_u32_t a_min;
+        simsimd_u32_t a_max = a_vec.u32[3];
+        simsimd_u32_t b_min = b_vec.u32[0];
+        simsimd_u32_t b_max = b_vec.u32[3];
+
+        // If the slices don't overlap, advance the appropriate pointer
+        while (a_max < b_min && a + 8 < a_end) {
+            a += 4;
+            a_vec.u32x4 = vld1q_u32(a);
+            a_max = a_vec.u32[3];
+        }
+        a_min = a_vec.u32[0];
+        while (b_max < a_min && b + 8 < b_end) {
+            b += 4;
+            b_vec.u32x4 = vld1q_u32(b);
+            b_max = b_vec.u32[3];
+        }
+        b_min = b_vec.u32[0];
+
+        // Now we are likely to have some overlap, so we can intersect the registers.
+        // We could do that by performing a population count on every iteration, but it's not the cheapest option:
+        //
+        //      simsimd_u64_t a_matches = __builtin_popcountll(
+        //          _simsimd_u8_to_u4_neon(vreinterpretq_u8_u32(
+        //              _simsimd_intersect_u32x4_neon(a_vec.u32x4, b_vec.u32x4))));
+        //      c += a_matches / 16;
+        //
+        // Alternatively, we can transform the match-masks into "ones", accumulate them
+        // across iterations, and merge them all in the end.
+        uint32x4_t a_matches = _simsimd_intersect_u32x4_neon(a_vec.u32x4, b_vec.u32x4);
+        c_counts_vec.u32x4 = vaddq_u32(c_counts_vec.u32x4, vandq_u32(a_matches, vdupq_n_u32(1)));
+
+        uint32x4_t a_last_broadcasted = vdupq_n_u32(a_max);
+        uint32x4_t b_last_broadcasted = vdupq_n_u32(b_max);
+        simsimd_u64_t a_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( //
+            vreinterpretq_u8_u32(vcleq_u32(a_vec.u32x4, b_last_broadcasted))));
+        simsimd_u64_t b_step = _simsimd_clz_u64(_simsimd_u8_to_u4_neon( //
+            vreinterpretq_u8_u32(vcleq_u32(b_vec.u32x4, a_last_broadcasted))));
+        a += (64 - a_step) / 16;
+        b += (64 - b_step) / 16;
+    }
+
+    simsimd_intersect_u32_serial(a, b, a_end - a, b_end - b, results);
+    *results += vaddvq_u32(c_counts_vec.u32x4);
+}
+
+#pragma clang attribute pop
+#pragma GCC pop_options
+#endif // SIMSIMD_TARGET_NEON
+
+#if SIMSIMD_TARGET_SVE2
+#pragma GCC push_options
+#pragma GCC target("arch=armv8.2-a+sve+sve2")
+#pragma clang attribute push(__attribute__((target("arch=armv8.2-a+sve+sve2"))), apply_to = function)
+
+/* SVE2 introduces many new integer-oriented instructions, extending some of the NEON functionality to
+ * variable-length SVE registers. Those include "compare multiple" intrinsics:
+ *
+ * - `svmatch[_u16]` that matches each scalar in the first vector against all members of a 128-bit lane in the second.
+ * - `svhistcnt[_s32]_z` that does something similar, performing an inclusive prefix scan.
+ * - `svtbx[_u16]` that does an extended table lookup.
+ *
+ * Other notable instructions:
+ *
+ * - `DUP`: Broadcast indexed predicate element
+ *   https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/DUP--predicate---Broadcast-indexed-predicate-element-?lang=en
+ * - `SCLAMP` and `UCLAMP`: clamp values, i.e. combined min+max
+ *   https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/SCLAMP--Signed-clamp-to-minimum-maximum-vector-?lang=en
+ *   https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/UCLAMP--Unsigned-clamp-to-minimum-maximum-vector-?lang=en
+ * - `TBLQ`: Table lookup quadword
+ *   https://developer.arm.com/documentation/ddi0602/2022-12/SVE-Instructions/TBLQ--Programmable-table-lookup-within-each-quadword-vector-segment--zeroing--?lang=en
+ * - `INCB`, `INCD`, `INCH`, `INCW`: Increment a scalar by a multiple of the predicate-constrained element count
+ *   https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/INCB--INCD--INCH--INCW--scalar---Increment-scalar-by-multiple-of-predicate-constraint-element-count-
+ *
+ * Great resources for SVE2 intrinsics:
+ *
+ * > ARM's Scalable Vector Extensions: A Critical Look at SVE2 For Integer Workloads
+ *   https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
+ */
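+
+/* An illustrative scalar model (hypothetical helper, for exposition only) of the
+ * register-level intersection below: for every element of one register, test whether
+ * it occurs anywhere in the other. `svmatch_u16` answers this within one 128-bit
+ * lane, and rotating the second register lane-by-lane with `svext_u16` extends the
+ * test to the whole register. */
+SIMSIMD_INTERNAL simsimd_size_t _simsimd_intersect_registers_model(simsimd_u16_t const* a, simsimd_u16_t const* b,
+                                                                   simsimd_size_t length) {
+    simsimd_size_t count = 0;
+    for (simsimd_size_t i = 0; i != length; ++i) {
+        int found = 0;
+        for (simsimd_size_t j = 0; j != length; ++j)
+            found |= a[i] == b[j];
+        count += (simsimd_size_t)found;
+    }
+    return count;
+}
+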
+SIMSIMD_PUBLIC void simsimd_intersect_u16_sve2(simsimd_u16_t const* a, simsimd_u16_t const* b, simsimd_size_t a_length,
+                                               simsimd_size_t b_length, simsimd_distance_t* results) {
+
+    // A single SVE lane is 128 bits wide, so one lane fits 8 values.
+    simsimd_size_t const register_size = svcnth();
+    simsimd_size_t const lanes_count = register_size / 8;
+    simsimd_size_t a_idx = 0, b_idx = 0;
+    simsimd_size_t c = 0;
+
+    while (a_idx < a_length && b_idx < b_length) {
+        // Load the next register from each array, guarded by progress predicates
+        svbool_t a_progress = svwhilelt_b16_u64(a_idx, a_length);
+        svbool_t b_progress = svwhilelt_b16_u64(b_idx, b_length);
+        svuint16_t a_vec = svld1_u16(a_progress, a + a_idx);
+        svuint16_t b_vec = svld1_u16(b_progress, b + b_idx);
+
+        // Intersecting registers with `svmatch_u16` involves a lot of shuffling
+        // and comparisons, so we want to avoid it if the slices don't overlap at all..
+        simsimd_u16_t a_min;
+        simsimd_u16_t a_max = svlastb(a_progress, a_vec);
+        simsimd_u16_t b_min = svlasta(svpfalse_b(), b_vec);
+        simsimd_u16_t b_max = svlastb(b_progress, b_vec);
+
+        // If the slices don't overlap, advance the appropriate pointer
+        while (a_max < b_min && (a_idx + register_size) < a_length) {
+            a_idx += register_size;
+            a_progress = svwhilelt_b16_u64(a_idx, a_length);
+            a_vec = svld1_u16(a_progress, a + a_idx);
+            a_max = svlastb(a_progress, a_vec);
+        }
+        a_min = svlasta(svpfalse_b(), a_vec);
+        while (b_max < a_min && (b_idx + register_size) < b_length) {
+            b_idx += register_size;
+            b_progress = svwhilelt_b16_u64(b_idx, b_length);
+            b_vec = svld1_u16(b_progress, b + b_idx);
+            b_max = svlastb(b_progress, b_vec);
+        }
+        b_min = svlasta(svpfalse_b(), b_vec);
+
+        // Before we evaluate the intersection size, which will scramble the order of `b_vec`,
+        // let's estimate how much we will need to advance the pointers afterwards.
+        // For that, we don't even need to broadcast the values in SVE, as the whole
+        // register can be compared against a scalar:
+        //
+        //      svuint16_t a_last_broadcasted = svdup_n_u16(a_max);
+        //      svuint16_t b_last_broadcasted = svdup_n_u16(b_max);
+        svbool_t a_mask = svcmple_n_u16(a_progress, a_vec, b_max);
+        svbool_t b_mask = svcmple_n_u16(b_progress, b_vec, a_max);
+        simsimd_u64_t a_step = svcntp_b16(a_progress, a_mask);
+        simsimd_u64_t b_step = svcntp_b16(b_progress, b_mask);
+
+        // Compare `a_vec` with each lane of `b_vec`
+        svbool_t equal_mask = svmatch_u16(a_progress, a_vec, b_vec);
+        for (simsimd_size_t i = 1; i < lanes_count; i++) {
+            b_vec = svext_u16(b_vec, b_vec, 8);
+            equal_mask = svorr_z(svptrue_b16(), equal_mask, svmatch_u16(a_progress, a_vec, b_vec));
+        }
+        simsimd_size_t equal_count = svcntp_b16(svptrue_b16(), equal_mask);
+
+        // Advance
+        a_idx += a_step;
+        b_idx += b_step;
+        c += equal_count;
+    }
+    *results = c;
+}
+
+SIMSIMD_PUBLIC void simsimd_intersect_u32_sve2(simsimd_u32_t const* a, simsimd_u32_t const* b, simsimd_size_t a_length,
+                                               simsimd_size_t b_length, simsimd_distance_t* results) {
+
+    // A single SVE lane is 128 bits wide, so one lane fits 4 values.
+    simsimd_size_t const register_size = svcntw();
+    simsimd_size_t const lanes_count = register_size / 4;
+    simsimd_size_t a_idx = 0, b_idx = 0;
+    simsimd_size_t c = 0;
+
+    while (a_idx < a_length && b_idx < b_length) {
+        // Load the next register from each array, guarded by progress predicates
+        svbool_t a_progress = svwhilelt_b32_u64(a_idx, a_length);
+        svbool_t b_progress = svwhilelt_b32_u64(b_idx, b_length);
+        svuint32_t a_vec = svld1_u32(a_progress, a + a_idx);
+        svuint32_t b_vec = svld1_u32(b_progress, b + b_idx);
+
+        // Intersecting the registers involves a lot of shuffling and comparisons,
+        // so we want to avoid it if the slices don't overlap at all..
+        simsimd_u32_t a_min;
+        simsimd_u32_t a_max = svlastb(a_progress, a_vec);
+        simsimd_u32_t b_min = svlasta(svpfalse_b(), b_vec);
+        simsimd_u32_t b_max = svlastb(b_progress, b_vec);
+
+        // If the slices don't overlap, advance the appropriate pointer
+        while (a_max < b_min && (a_idx + register_size) < a_length) {
+            a_idx += register_size;
+            a_progress = svwhilelt_b32_u64(a_idx, a_length);
+            a_vec = svld1_u32(a_progress, a + a_idx);
+            a_max = svlastb(a_progress, a_vec);
+        }
+        a_min = svlasta(svpfalse_b(), a_vec);
+        while (b_max < a_min && (b_idx + register_size) < b_length) {
+            b_idx += register_size;
+            b_progress = svwhilelt_b32_u64(b_idx, b_length);
+            b_vec = svld1_u32(b_progress, b + b_idx);
+            b_max = svlastb(b_progress, b_vec);
+        }
+        b_min = svlasta(svpfalse_b(), b_vec);
+
+        // Before we evaluate the intersection size, let's estimate how much we will
+        // need to advance the pointers afterwards.
+        // For that, we don't even need to broadcast the values in SVE, as the whole
+        // register can be compared against a scalar:
+        //
+        //      svuint32_t a_last_broadcasted = svdup_n_u32(a_max);
+        //      svuint32_t b_last_broadcasted = svdup_n_u32(b_max);
+        svbool_t a_mask = svcmple_n_u32(a_progress, a_vec, b_max);
+        svbool_t b_mask = svcmple_n_u32(b_progress, b_vec, a_max);
+        simsimd_u64_t a_step = svcntp_b32(a_progress, a_mask);
+        simsimd_u64_t b_step = svcntp_b32(b_progress, b_mask);
+
+        // Comparing `a_vec` with each lane of `b_vec` can't be done with `svmatch`,
+        // the same way as in `simsimd_intersect_u16_sve2`, as that instruction is only
+        // available for 8-bit and 16-bit integers.
+        //
+        //      svbool_t equal_mask = svpfalse_b();
+        //      for (simsimd_size_t i = 0; i < register_size; i++) {
+        //          equal_mask = svorr_z(svptrue_b32(), equal_mask, svcmpeq_u32(a_progress, a_vec, b_vec));
+        //          b_vec = svext_u32(b_vec, b_vec, 1);
+        //      }
+        //      simsimd_size_t equal_count = svcntp_b32(a_progress, equal_mask);
+        //
+        // Alternatively, one can use histogram instructions, like `svhistcnt_u32_z`.
+        // They effectively compute the prefix-matching count, which is equivalent to
+        // the lower triangle of the row-major intersection matrix.
+        // To compute the upper triangle, we can reverse (with `svrev_u32`) the order of
+        // elements and repeat the operation, accumulating the results for top and bottom.
+        // Let's look at 4-element registers as an example:
+        //
+        //      Given α = {A, B, C, D}, β = {X, Y, Z, W}:
+        //
+        //      hist(α, β):          hist(α_rev, β_rev):
+        //
+        //         X Y Z W              W Z Y X
+        //      A  1 0 0 0           D  1 0 0 0
+        //      B  1 1 0 0           C  1 1 0 0
+        //      C  1 1 1 0           B  1 1 1 0
+        //      D  1 1 1 1           A  1 1 1 1
+        //
+        svuint32_t hist_lower = svhistcnt_u32_z(a_progress, a_vec, b_vec);
+        svuint32_t a_rev_vec = svrev_u32(a_vec);
+        svuint32_t b_rev_vec = svrev_u32(b_vec);
+        svuint32_t hist_upper = svrev_u32(svhistcnt_u32_z(svptrue_b32(), a_rev_vec, b_rev_vec));
+        svuint32_t hist = svorr_u32_x(a_progress, hist_lower, hist_upper);
+        svbool_t equal_mask = svcmpne_n_u32(a_progress, hist, 0);
+        simsimd_size_t equal_count = svcntp_b32(a_progress, equal_mask);
+
+        // Advance
+        a_idx += a_step;
+        b_idx += b_step;
+        c += equal_count;
     }
-    *results = intersection_count;
+    *results = c;
 }
 
 #pragma clang attribute pop
 #pragma GCC pop_options
-#endif // SIMSIMD_TARGET_SVE
+#endif // SIMSIMD_TARGET_SVE2
 #endif // SIMSIMD_TARGET_ARM
 
 #ifdef __cplusplus
diff --git a/include/simsimd/types.h b/include/simsimd/types.h
index 1ce6c87c..3693b5aa 100644
--- a/include/simsimd/types.h
+++ b/include/simsimd/types.h
@@ -98,6 +98,16 @@
 #define SIMSIMD_TARGET_SVE_BF16 SIMSIMD_TARGET_SVE
 #endif // !defined(SIMSIMD_TARGET_SVE_BF16)
 
+// Compiling for Arm: SIMSIMD_TARGET_SVE2
+#if !defined(SIMSIMD_TARGET_SVE2) || (SIMSIMD_TARGET_SVE2 && !SIMSIMD_TARGET_ARM)
+#if defined(__ARM_FEATURE_SVE)
+#define SIMSIMD_TARGET_SVE2 SIMSIMD_TARGET_ARM
+#else
+#undef SIMSIMD_TARGET_SVE2
+#define SIMSIMD_TARGET_SVE2 0
+#endif // defined(__ARM_FEATURE_SVE)
+#endif // !defined(SIMSIMD_TARGET_SVE2)
+
 // Compiling for x86: SIMSIMD_TARGET_HASWELL
 //
 // Starting with Ivy Bridge, Intel supports the `F16C` extensions for fast half-precision
@@ -163,11 +173,12 @@
 #include <arm_neon.h>
 #endif
 
-#if SIMSIMD_TARGET_SVE
+#if SIMSIMD_TARGET_SVE || SIMSIMD_TARGET_SVE2
 #include <arm_sve.h>
 #endif
 
-#if SIMSIMD_TARGET_HASWELL || SIMSIMD_TARGET_SKYLAKE
+#if SIMSIMD_TARGET_HASWELL || SIMSIMD_TARGET_SKYLAKE || SIMSIMD_TARGET_ICE || SIMSIMD_TARGET_GENOA || \
+    SIMSIMD_TARGET_SAPPHIRE
 #include <immintrin.h>
 #endif
 
@@ -466,6 +477,13 @@ SIMSIMD_PUBLIC void simsimd_compress_bf16(simsimd_f32_t x, simsimd_bf16_t* resul
     *(unsigned short*)result_ptr = (unsigned short)conv.i;
 }
 
+SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_rol(simsimd_u32_t x, int n) { return (x << n) | (x >> (32 - n)); }
+SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_rol(simsimd_u16_t x, int n) { return (x << n) | (x >> (16 - n)); }
+SIMSIMD_INTERNAL simsimd_u8_t simsimd_u8_rol(simsimd_u8_t x, int n) { return (x << n) | (x >> (8 - n)); }
+SIMSIMD_INTERNAL simsimd_u32_t simsimd_u32_ror(simsimd_u32_t x, int n) { return (x >> n) | (x << (32 - n)); }
+SIMSIMD_INTERNAL simsimd_u16_t simsimd_u16_ror(simsimd_u16_t x, int n) { return (x >> n) | (x << (16 - n)); }
+SIMSIMD_INTERNAL simsimd_u8_t
simsimd_u8_ror(simsimd_u8_t x, int n) { return (x >> n) | (x << (8 - n)); } + #ifdef __cplusplus } // extern "C" #endif diff --git a/setup.py b/setup.py index 8cab9123..be6fe2d9 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ def get_bool_env_w_name(name: str, preference: bool) -> tuple: [ get_bool_env_w_name("SIMSIMD_TARGET_NEON", True), get_bool_env_w_name("SIMSIMD_TARGET_SVE", True), + get_bool_env_w_name("SIMSIMD_TARGET_SVE2", True), get_bool_env_w_name("SIMSIMD_TARGET_HASWELL", True), get_bool_env_w_name("SIMSIMD_TARGET_SKYLAKE", True), get_bool_env_w_name("SIMSIMD_TARGET_ICE", True), @@ -96,6 +97,7 @@ def get_bool_env_w_name(name: str, preference: bool) -> tuple: [ get_bool_env_w_name("SIMSIMD_TARGET_NEON", True), get_bool_env_w_name("SIMSIMD_TARGET_SVE", False), + get_bool_env_w_name("SIMSIMD_TARGET_SVE2", False), get_bool_env_w_name("SIMSIMD_TARGET_HASWELL", True), get_bool_env_w_name("SIMSIMD_TARGET_SKYLAKE", False), get_bool_env_w_name("SIMSIMD_TARGET_ICE", False), @@ -119,6 +121,7 @@ def get_bool_env_w_name(name: str, preference: bool) -> tuple: get_bool_env_w_name("SIMSIMD_TARGET_NEON", True), get_bool_env_w_name("SIMSIMD_TARGET_NEON_BF16", False), get_bool_env_w_name("SIMSIMD_TARGET_SVE", False), + get_bool_env_w_name("SIMSIMD_TARGET_SVE2", False), get_bool_env_w_name("SIMSIMD_TARGET_HASWELL", True), get_bool_env_w_name("SIMSIMD_TARGET_SKYLAKE", True), get_bool_env_w_name("SIMSIMD_TARGET_ICE", True),
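
A minimal sketch of the compile-time opt-out described in the README section at the top of this patch (illustrative file, not part of the patch; assumes the umbrella header is on the include path): defining a target macro to 0 before the include suppresses the corresponding kernels, and the matching capability check then reports them as absent.

    // opt_out_sve2.c - illustrative only
    #define SIMSIMD_TARGET_SVE2 0 // keep the NEON and SVE kernels, skip SVE2
    #include <simsimd/simsimd.h>

    int main(void) { return simsimd_uses_sve2(); } // always returns 0 in this build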