fix: ALP encoding handles infinity, negative zero and nan values (#2555)

robert3005 · web-flow · commit 0dd969d2fb19 · 2025-03-03T17:25:09.000Z
Using NativePType::is_eq and due to rust float -> int cast semantics we can avoid checking whether value is one of the unencodable variants. https://doc.rust-lang.org/stable/reference/expressions/operator-expr.html#numeric-cast for reference
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/encodings/alp/src/alp/compress.rs b/encodings/alp/src/alp/compress.rs
@@ -268,4 +268,22 @@ mod tests {
             Scalar::null_typed::<f64>()
         );
     }
+
+    #[test]
+    fn non_finite_numbers() {
+        let original = PrimitiveArray::new(
+            buffer![0.0f32, -0.0, f32::NAN, f32::NEG_INFINITY, f32::INFINITY],
+            Validity::NonNullable,
+        );
+        let encoded = alp_encode(&original).unwrap();
+        let decoded = encoded.to_primitive().unwrap();
+        for idx in 0..original.len() {
+            let decoded_val = decoded.as_slice::<f32>()[idx];
+            let original_val = original.as_slice::<f32>()[idx];
+            assert!(
+                decoded_val.is_eq(original_val),
+                "Expected {original_val} but got {decoded_val}"
+            );
+        }
+    }
 }
diff --git a/encodings/alp/src/alp/compute/compare.rs b/encodings/alp/src/alp/compute/compare.rs
@@ -245,7 +245,7 @@ mod tests {
         assert_eq!(r_gt, vec![true; 10]);
 
         let r_gte = test_alp_compare(&encoded, -0.0_f32, Operator::Gt).unwrap();
-        assert_eq!(r_gte, vec![false; 10]);
+        assert_eq!(r_gte, vec![true; 10]);
 
         let r_lte = test_alp_compare(&encoded, 0.06051_f32, Operator::Lte).unwrap();
         assert_eq!(r_lte, vec![true; 10]);
diff --git a/encodings/alp/src/alp/mod.rs b/encodings/alp/src/alp/mod.rs
@@ -13,6 +13,7 @@ mod serde;
 pub use array::*;
 pub use compress::*;
 use vortex_buffer::{Buffer, BufferMut};
+use vortex_dtype::NativePType;
 
 const SAMPLE_SIZE: usize = 32;
 
@@ -35,7 +36,7 @@ mod private {
     impl Sealed for f64 {}
 }
 
-pub trait ALPFloat: private::Sealed + Float + Display + 'static {
+pub trait ALPFloat: private::Sealed + Float + Display + NativePType {
     type ALPInt: PrimInt + Display + ToPrimitive + Copy;
 
     const FRACTIONAL_BITS: u8;
@@ -148,9 +149,9 @@ pub trait ALPFloat: private::Sealed + Float + Display + 'static {
 
     #[inline]
     fn encode_single(value: Self, exponents: Exponents) -> Option<Self::ALPInt> {
-        let encoded = unsafe { Self::encode_single_unchecked(value, exponents) };
+        let encoded = Self::encode_single_unchecked(value, exponents);
         let decoded = Self::decode_single(encoded, exponents);
-        if decoded == value {
+        if decoded.is_eq(value) {
             return Some(encoded);
         }
         None
@@ -185,11 +186,10 @@ pub trait ALPFloat: private::Sealed + Float + Display + 'static {
         Self::from_int(encoded) * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]
     }
 
-    /// # Safety
-    ///
-    /// The returned value may not decode back to the original value.
+    /// Encode single float value. The returned value might decode to a different value than passed in.
+    /// Consider using [`Self::encode_single`] if you want the checked version of this function.
     #[inline(always)]
-    unsafe fn encode_single_unchecked(value: Self, exponents: Exponents) -> Self::ALPInt {
+    fn encode_single_unchecked(value: Self, exponents: Exponents) -> Self::ALPInt {
         (value * Self::F10[exponents.e as usize] * Self::IF10[exponents.f as usize])
             .fast_round()
             .as_int()
@@ -212,10 +212,10 @@ fn encode_chunk_unchecked<T: ALPFloat>(
 
     // encode the chunk, counting the number of patches
     let mut chunk_patch_count = 0;
-    encoded_output.extend(chunk.iter().map(|v| {
-        let encoded = unsafe { T::encode_single_unchecked(*v, exp) };
+    encoded_output.extend(chunk.iter().map(|&v| {
+        let encoded = T::encode_single_unchecked(v, exp);
         let decoded = T::decode_single(encoded, exp);
-        let neq = (decoded != *v) as usize;
+        let neq = !decoded.is_eq(v) as usize;
         chunk_patch_count += neq;
         encoded
     }));
@@ -237,7 +237,7 @@ fn encode_chunk_unchecked<T: ALPFloat>(
             // write() is only safe to call more than once because the values are primitive (i.e., Drop is a no-op)
             patch_indices_mut[chunk_patch_index].write(i as u64);
             patch_values_mut[chunk_patch_index].write(chunk[i - num_prev_encoded]);
-            chunk_patch_index += (decoded != chunk[i - num_prev_encoded]) as usize;
+            chunk_patch_index += !decoded.is_eq(chunk[i - num_prev_encoded]) as usize;
         }
         assert_eq!(chunk_patch_index, chunk_patch_count);
         unsafe {
diff --git a/encodings/fastlanes/Cargo.toml b/encodings/fastlanes/Cargo.toml
@@ -24,7 +24,6 @@ itertools = { workspace = true }
 num-traits = { workspace = true }
 rand = { workspace = true, optional = true }
 rkyv = { workspace = true }
-serde = { workspace = true }
 vortex-array = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-dtype = { workspace = true }
diff --git a/wasm-test/Cargo.lock b/wasm-test/Cargo.lock