Merge branch 'main-dev' into main-dev-intersect

ashvardanian · Sep 13, 2024 · 0f04a1b · 0f04a1b
2 parents 843cf1c + eeb0580
commit 0f04a1b
Show file tree

Hide file tree

Showing 18 changed files with 731 additions and 379 deletions.
diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml
@@ -64,7 +64,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --no-cache-dir --upgrade pip
-          pip install --no-cache-dir pytest numpy scipy py-cpuinfo pytest-repeat
+          pip install --no-cache-dir py-cpuinfo pytest pytest-repeat numpy scipy tabulate
           python -c "from cpuinfo import get_cpu_info; print(get_cpu_info())"
 
       - name: Build locally on Ubuntu

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,7 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 project(
     simsimd
-    VERSION 5.1.2
+    VERSION 5.2.1
     LANGUAGES C CXX
     DESCRIPTION "Fastest SIMD-Accelerated Vector Similarity Functions for x86 and Arm"
     HOMEPAGE_URL "https://github.com/ashvardanian/simsimd"

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -40,9 +40,9 @@ cmake --build build_release --config Release
 Testing:
 
 ```sh
-pip install -e .                    # to install the package in editable mode
-pip install pytest pytest-repeat    # testing dependencies
-pytest python/test.py -s -x -Wd     # to run tests
+pip install -e .                             # to install the package in editable mode
+pip install pytest pytest-repeat tabulate    # testing dependencies
+pytest python/test.py -s -x -Wd                 # to run tests
 
 # to check supported SIMD instructions:
 python -c "import simsimd; print(simsimd.get_capabilities())" 

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "simsimd"
 description = "Fastest SIMD-Accelerated Vector Similarity Functions for x86 and Arm"
-version = "5.1.2"
+version = "5.2.1"
 edition = "2021"
 license = "Apache-2.0"
 authors = ["Ash Vardanian <[email protected]>"]

diff --git a/README.md b/README.md
@@ -42,7 +42,8 @@ SimSIMD provides an alternative.
 
 ## Features
 
-__SimSIMD__ provides __over 200 SIMD-optimized kernels__ for various distance and similarity measures, accelerating search in [USearch](https://github.com/unum-cloud/usearch) and several DBMS products.
+__SimSIMD__ (Arabic: "سيمسيم دي") is a library of __over 200 SIMD-optimized kernels__ for distance and similarity measures, boosting search performance in [USearch](https://github.com/unum-cloud/usearch) and several database systems.
+Named after the iconic ["Open Sesame"](https://en.wikipedia.org/wiki/Open_sesame) command from _Ali Baba and the Forty Thieves_, it opens the doors to a modern treasure: maximizing the potential of today's hardware for high resource utilization.
 Implemented distance functions include:
 
 - Euclidean (L2) and Cosine (Angular) spatial distances for Vector Search.
@@ -64,6 +65,7 @@ Moreover, SimSIMD...
 - has bindings for [Python](#using-simsimd-in-python), [Rust](#using-simsimd-in-rust) and [JS](#using-simsimd-in-javascript).
 - has Arm backends for NEON and Scalable Vector Extensions (SVE).
 - has x86 backends for Haswell, Skylake, Ice Lake, Genoa, and Sapphire Rapids.
+- integrates easily anywhere, thanks to both compile-time and runtime CPU feature detection!
 
 Due to the high level of fragmentation of SIMD support in different x86 CPUs, SimSIMD generally uses the names of select Intel CPU generations for its backends.
 They, however, also work on AMD CPUs.
@@ -239,6 +241,18 @@ By default, the output distances will be stored in double-precision `f64` floati
 That behavior may not be space-efficient, especially if you are computing the hamming distance between short binary vectors, that will generally fit into 8x smaller `u8` or `u16` types.
 To override this behavior, use the `dtype` argument.
 
+### Helper Functions
+
+You can turn specific backends on or off depending on the exact environment.
+A common case may be avoiding AVX-512 on older AMD CPUs and [Intel Ice Lake](https://travisdowns.github.io/blog/2020/08/19/icl-avx512-freq.html) CPUs to ensure the CPU doesn't change the frequency license and throttle performance.
+
+```py
+$ simsimd.get_capabilities()
+> {'serial': True, 'neon': False, 'sve': False, 'neon_f16': False, 'sve_f16': False, 'neon_bf16': False, 'sve_bf16': False, 'neon_i8': False, 'sve_i8': False, 'haswell': True, 'skylake': True, 'ice': True, 'genoa': True, 'sapphire': True}
+$ simsimd.disable_capability("sapphire")
+$ simsimd.enable_capability("sapphire")
+```
+
 ### Using Python API with USearch
 
 Want to use it in Python with [USearch](https://github.com/unum-cloud/usearch)?
@@ -429,9 +443,9 @@ expected = np.inner(a_f32rounded, b_f32rounded)
 result = simd.inner(a_bf16, b_bf16, "bf16")
 ```
 
-### Dynamic Dispatch
+### Dynamic Dispatch in Rust
 
-SimSIMD provides a dynamic dispatch mechanism to select the most advanced micro-kernel for the current CPU.
+SimSIMD provides a [dynamic dispatch](#dynamic-dispatch) mechanism to select the most advanced micro-kernel for the current CPU.
 You can query supported backends and use the `SimSIMD::capabilities` function to select the best one.
 
 ```rust
@@ -529,10 +543,10 @@ int main() {
 }
 ```
 
-### Dynamic Dispatch
+### Dynamic Dispatch in C
 
 To avoid hard-coding the backend, you can rely on `c/lib.c` to prepackage all possible backends in one binary, and select the most recent CPU features at runtime.
-That feature of the C library is called dynamic dispatch and is extensively used in the Python, JavaScript, and Rust bindings.
+That feature of the C library is called [dynamic dispatch](#dynamic-dispatch) and is extensively used in the Python, JavaScript, and Rust bindings.
 To test which CPU features are available on the machine at runtime, use the following APIs:
 
 ```c
@@ -858,9 +872,20 @@ Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-
 
 Both functions are defined for non-negative numbers, and the logarithm is a key part of their computation.
 
+### Dynamic Dispatch
+
+Most popular software is precompiled and distributed with fairly conservative CPU optimizations, to ensure compatibility with older hardware.
+Database Management platforms, like ClickHouse, and Web Browsers, like Google Chrome, need to run on billions of devices, and they can't afford to be picky about the CPU features.
+For such users SimSIMD provides a dynamic dispatch mechanism, which selects the most advanced micro-kernel for the current CPU at runtime.
+
+You can compile SimSIMD on an old CPU, like Intel Haswell, and run it on a new one, like AMD Genoa, and it will automatically use the most advanced instructions available.
+The reverse is also true: you can compile on a new CPU and run on an old one, and it will automatically fall back to the most basic instructions.
+Moreover, the very first time you probe for CPU capabilities with `simsimd_capabilities()`, it initializes the dynamic dispatch mechanism, and all subsequent calls will be faster and won't face race conditions in multi-threaded environments.
+
 ## Target Specific Backends
 
 SimSIMD exposes all kernels for all backends, and you can select the most advanced one for the current CPU without relying on built-in dispatch mechanisms.
+That's handy for testing and benchmarking, but also in case you want to dispatch a very specific kernel for a very specific CPU, bypassing SimSIMD assignment logic.
 All of the function names follow the same pattern: `simsimd_{function}_{type}_{backend}`.
 
 - The backend can be `serial`, `haswell`, `skylake`, `ice`, `genoa`, `sapphire`, `neon`, or `sve`.

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-5.1.2
+5.2.1
diff --git a/c/lib.c b/c/lib.c
@@ -45,13 +45,6 @@
 extern "C" {
 #endif
 
-simsimd_capability_t simsimd_capabilities(void) {
-    static simsimd_capability_t static_capabilities = simsimd_cap_any_k;
-    if (static_capabilities == simsimd_cap_any_k)
-        static_capabilities = simsimd_capabilities_implementation();
-    return static_capabilities;
-}
-
 // Every time a function is called, it checks if the metric is already loaded. If not, it fetches it.
 // If no metric is found, it returns NaN. We can obtain NaN by dividing 0.0 by 0.0, but that annoys
 // the MSVC compiler. Instead we can directly write-in the signaling NaN (0x7FF0000000000001)
@@ -175,6 +168,72 @@ SIMSIMD_DYNAMIC int simsimd_uses_skylake(void) { return (simsimd_capabilities()
 SIMSIMD_DYNAMIC int simsimd_uses_ice(void) { return (simsimd_capabilities() & simsimd_cap_ice_k) != 0; }
 SIMSIMD_DYNAMIC int simsimd_uses_genoa(void) { return (simsimd_capabilities() & simsimd_cap_genoa_k) != 0; }
 SIMSIMD_DYNAMIC int simsimd_uses_sapphire(void) { return (simsimd_capabilities() & simsimd_cap_sapphire_k) != 0; }
+SIMSIMD_DYNAMIC int simsimd_uses_dynamic_dispatch(void) { return 1; }
+
+simsimd_capability_t simsimd_capabilities(void) {
+    static simsimd_capability_t static_capabilities = simsimd_cap_any_k;
+    if (static_capabilities != simsimd_cap_any_k)
+        return static_capabilities;
+
+    static_capabilities = simsimd_capabilities_implementation();
+
+    // First call only: `simsimd_cap_any_k` marks the cache as "not probed yet", and later calls take the early return above.
+    // In multithreaded applications we need to ensure that the function pointers are pre-initialized, so the first time we are probing for capabilities, we also invoke every metric once with null pointers and zeros as dummy inputs.
+    // NOTE(review): the cache is assigned before these warm-up calls, presumably so that a metric re-entering this function returns early instead of recursing - confirm against the dispatch code.
+    simsimd_distance_t dummy_results_buffer[2];
+    simsimd_distance_t* dummy_results = &dummy_results_buffer[0];
+    void* dummy = 0;
+
+    // Dense kernels (dot, vdot, cos, l2sq, hamming, jaccard, kl, js):
+    simsimd_dot_f16(dummy, dummy, 0, dummy_results);
+    simsimd_dot_bf16(dummy, dummy, 0, dummy_results);
+    simsimd_dot_f32(dummy, dummy, 0, dummy_results);
+    simsimd_dot_f64(dummy, dummy, 0, dummy_results);
+    simsimd_dot_f16c(dummy, dummy, 0, dummy_results);
+    simsimd_dot_bf16c(dummy, dummy, 0, dummy_results);
+    simsimd_dot_f32c(dummy, dummy, 0, dummy_results);
+    simsimd_dot_f64c(dummy, dummy, 0, dummy_results);
+    simsimd_vdot_f16c(dummy, dummy, 0, dummy_results);
+    simsimd_vdot_bf16c(dummy, dummy, 0, dummy_results);
+    simsimd_vdot_f32c(dummy, dummy, 0, dummy_results);
+    simsimd_vdot_f64c(dummy, dummy, 0, dummy_results);
+    simsimd_cos_i8(dummy, dummy, 0, dummy_results);
+    simsimd_cos_f16(dummy, dummy, 0, dummy_results);
+    simsimd_cos_bf16(dummy, dummy, 0, dummy_results);
+    simsimd_cos_f32(dummy, dummy, 0, dummy_results);
+    simsimd_cos_f64(dummy, dummy, 0, dummy_results);
+    simsimd_l2sq_i8(dummy, dummy, 0, dummy_results);
+    simsimd_l2sq_f16(dummy, dummy, 0, dummy_results);
+    simsimd_l2sq_bf16(dummy, dummy, 0, dummy_results);
+    simsimd_l2sq_f32(dummy, dummy, 0, dummy_results);
+    simsimd_l2sq_f64(dummy, dummy, 0, dummy_results);
+    simsimd_hamming_b8(dummy, dummy, 0, dummy_results);
+    simsimd_jaccard_b8(dummy, dummy, 0, dummy_results);
+    simsimd_kl_f16(dummy, dummy, 0, dummy_results);
+    simsimd_kl_bf16(dummy, dummy, 0, dummy_results);
+    simsimd_kl_f32(dummy, dummy, 0, dummy_results);
+    simsimd_kl_f64(dummy, dummy, 0, dummy_results);
+    simsimd_js_f16(dummy, dummy, 0, dummy_results);
+    simsimd_js_bf16(dummy, dummy, 0, dummy_results);
+    simsimd_js_f32(dummy, dummy, 0, dummy_results);
+    simsimd_js_f64(dummy, dummy, 0, dummy_results);
+
+    // Sparse kernels (intersect), called here with two zero arguments instead of one:
+    simsimd_intersect_u16(dummy, dummy, 0, 0, dummy_results);
+    simsimd_intersect_u32(dummy, dummy, 0, 0, dummy_results);
+
+    // Curved kernels (bilinear, mahalanobis), called here with a third null pointer argument:
+    simsimd_bilinear_f64(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_mahalanobis_f64(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_bilinear_f32(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_mahalanobis_f32(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_bilinear_f16(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_mahalanobis_f16(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_bilinear_bf16(dummy, dummy, dummy, 0, dummy_results);
+    simsimd_mahalanobis_bf16(dummy, dummy, dummy, 0, dummy_results);
+
+    return static_capabilities;
+}
 
 #ifdef __cplusplus
 }

diff --git a/cpp/bench.cxx b/cpp/bench.cxx
@@ -737,6 +737,9 @@ int main(int argc, char** argv) {
     dense_<bf16_k>("cos_bf16_genoa", simsimd_cos_bf16_genoa, simsimd_cos_bf16_accurate);
     dense_<bf16_k>("l2sq_bf16_genoa", simsimd_l2sq_bf16_genoa, simsimd_l2sq_bf16_accurate);
 
+    dense_<bf16_k>("dot_bf16c_genoa", simsimd_dot_bf16c_genoa, simsimd_dot_bf16c_accurate);
+    dense_<bf16_k>("vdot_bf16c_genoa", simsimd_vdot_bf16c_genoa, simsimd_vdot_bf16c_accurate);
+
     curved_<bf16_k>("bilinear_bf16_genoa", simsimd_bilinear_bf16_genoa, simsimd_bilinear_bf16_accurate);
     curved_<bf16_k>("mahalanobis_bf16_genoa", simsimd_mahalanobis_bf16_genoa, simsimd_mahalanobis_bf16_accurate);
 #endif