Improve: Infer boolean types in Python

ashvardanian · Nov 19, 2024 · 8b98963 · 8b98963
1 parent 364e736
commit 8b98963
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -162,10 +162,22 @@ dist = simsimd.cosine(vec1, vec2, "int8")
 dist = simsimd.cosine(vec1, vec2, "float16")
 dist = simsimd.cosine(vec1, vec2, "float32")
 dist = simsimd.cosine(vec1, vec2, "float64")
-dist = simsimd.hamming(vec1, vec2, "bit8")
+dist = simsimd.hamming(vec1, vec2, "bin8")
+```
+
+Binary distance functions are computed at the bit level.
+This means that a vector of ten 8-bit integers is treated as a sequence of 80 individual bits, or dimensions.
+This differs from NumPy, which can't handle smaller-than-byte types, but you can still avoid the `bin8` argument by reinterpreting the vector as booleans:
+
+```py
+vec1 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
+vec2 = np.packbits(np.random.randint(2, size=80).astype(np.uint8)).view(np.bool_)
+hamming_distance = simsimd.hamming(vec1, vec2)
+jaccard_distance = simsimd.jaccard(vec1, vec2)
 ```
 
 With other frameworks, like PyTorch, one can get a richer type-system than NumPy, but the lack of good CPython interoperability makes it hard to pass data without copies.
+Here is an example of using SimSIMD with PyTorch to compute the cosine distance between two `bfloat16` vectors:
 
 ```py
 import numpy as np
@@ -181,7 +193,7 @@ torch.randn(8, out=vec2)
 
 # Both libs will look into the same memory buffers and report the same results
 dist_slow = 1 - torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
-dist_fast = simsimd.cosine(buf1, buf2, "bf16")
+dist_fast = simsimd.cosine(buf1, buf2, "bfloat16")
 ```
 
 It also allows using SimSIMD for half-precision complex numbers, which NumPy does not support.
@@ -254,9 +266,9 @@ distances: DistancesTensor = simsimd.cdist(matrix1, matrix2, metric="cosine")
 distances_array: np.ndarray = np.array(distances, copy=True)                    # now managed by NumPy
 ```
 
-### Elementwise Kernels
+### Element-wise Kernels
 
-SimSIMD also provides mixed-precision elementwise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
+SimSIMD also provides mixed-precision element-wise kernels, where the input vectors and the output have the same numeric type, but the intermediate accumulators are of a higher precision.
 
 ```py
 import numpy as np

diff --git a/include/simsimd/spatial.h b/include/simsimd/spatial.h
@@ -1050,7 +1050,7 @@ SIMSIMD_INTERNAL simsimd_distance_t _simsimd_cos_normalize_f32_haswell(simsimd_f
     // Load the squares into an __m128 register for single-precision floating-point operations
     __m128 squares = _mm_set_ps(a2, b2, a2, b2); // We replicate to make use of full register
 
-    // Compute the reciprocal square root of the squares using _mm_rsqrt_ps (single-precision)
+    // Compute the reciprocal square root of the squares using `_mm_rsqrt_ps` (single-precision)
     __m128 rsqrts = _mm_rsqrt_ps(squares);
 
     // Perform one iteration of Newton-Raphson refinement to improve the precision of rsqrt:

diff --git a/python/lib.c b/python/lib.c
@@ -190,7 +190,7 @@ simsimd_datatype_t python_string_to_datatype(char const *name) {
 
     //! Boolean values:
     else if (same_string(name, "bin8") || // SimSIMD-specific
-             same_string(name, "c"))      // Named type
+             same_string(name, "?"))      // Named type
         return simsimd_datatype_b8_k;
 
     // Signed integers:
@@ -276,7 +276,7 @@ char const *datatype_to_python_string(simsimd_datatype_t dtype) {
     case simsimd_datatype_f32c_k: return "Zf";
     case simsimd_datatype_f16c_k: return "Ze";
     // Boolean values:
-    case simsimd_datatype_b8_k: return "c";
+    case simsimd_datatype_b8_k: return "?";
     // Signed integers:
     case simsimd_datatype_i8_k: return "b";
     case simsimd_datatype_i16_k: return "h";

diff --git a/scripts/test.py b/scripts/test.py
@@ -839,6 +839,13 @@ def test_dense_bits(ndim, metric, capability, stats_fixture):
     np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
     collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
 
+    # Aside from overriding the `dtype` parameter, we can also view as booleans
+    result_dt, result = profile(simd_kernel, np.packbits(a).view(np.bool_), np.packbits(b).view(np.bool_))
+    result = np.array(result)
+
+    np.testing.assert_allclose(result, expected, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL)
+    collect_errors(metric, ndim, "bin8", accurate, accurate_dt, expected, expected_dt, result, result_dt, stats_fixture)
+
 
 @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed")
 @pytest.mark.skipif(not scipy_available, reason="SciPy is not installed")