Implement compare operations for view types (#5900)

XiangpengHao · web-flow · commit d0a88c651991 · 2024-06-17T16:22:01.000-04:00
* compare kernel for view types

* add binary view as well

* make ci happy

* better comments, better readability

* add tests

* add more tests
diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
@@ -20,7 +20,7 @@ use crate::builder::GenericByteViewBuilder;
 use crate::iterator::ArrayIter;
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::{BinaryViewType, ByteViewType, StringViewType};
-use crate::{Array, ArrayAccessor, ArrayRef};
+use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
 use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
 use arrow_schema::{ArrowError, DataType};
@@ -186,6 +186,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         }
     }
 
+    /// Builds a one-element array holding `value` and wraps it in a [`Scalar`]
+    pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
+        let single = Self::from_iter_values(std::iter::once(value));
+        Scalar::new(single)
+    }
+
     /// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
     pub fn from_iter_values<Ptr, I>(iter: I) -> Self
     where
@@ -239,8 +244,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         let v = self.views.get_unchecked(idx);
         let len = *v as u32;
         let b = if len <= 12 {
-            let ptr = self.views.as_ptr() as *const u8;
-            std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize)
+            Self::inline_value(v, len as usize)
         } else {
             let view = ByteView::from(*v);
             let data = self.buffers.get_unchecked(view.buffer_index as usize);
@@ -250,6 +254,17 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         T::Native::from_bytes_unchecked(b)
     }
 
+    /// Borrows `len` bytes of inline data from `view`, starting right after its first 4 bytes.
+    ///
+    /// # Safety
+    /// - `view` must be a valid element from `Self::views()` that adheres to the view layout.
+    /// - `len` must not exceed 12, the number of bytes left in the 16-byte view after the
+    ///   first 4.
+    #[inline(always)]
+    pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
+        debug_assert!(len <= 12);
+        // Reinterpret the view as raw bytes and step over its first 4 bytes.
+        let inline_start = (view as *const u128).cast::<u8>().wrapping_add(4);
+        std::slice::from_raw_parts(inline_start, len)
+    }
+
     /// constructs a new iterator
     pub fn iter(&self) -> ArrayIter<&Self> {
         ArrayIter::new(self)
diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs
@@ -24,10 +24,10 @@
 //!
 
 use arrow_array::cast::AsArray;
-use arrow_array::types::ByteArrayType;
+use arrow_array::types::{ByteArrayType, ByteViewType};
 use arrow_array::{
     downcast_primitive_array, AnyDictionaryArray, Array, ArrowNativeTypeOp, BooleanArray, Datum,
-    FixedSizeBinaryArray, GenericByteArray,
+    FixedSizeBinaryArray, GenericByteArray, GenericByteViewArray,
 };
 use arrow_buffer::bit_util::ceil;
 use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer};
@@ -228,8 +228,10 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray,
             (l, r) => apply(op, l.values().as_ref(), l_s, l_v, r.values().as_ref(), r_s, r_v),
             (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v, r.as_boolean(), r_s, r_v),
             (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v, r.as_string::<i32>(), r_s, r_v),
+            (Utf8View, Utf8View) => apply(op, l.as_string_view(), l_s, l_v, r.as_string_view(), r_s, r_v),
             (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s, l_v, r.as_string::<i64>(), r_s, r_v),
             (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v, r.as_binary::<i32>(), r_s, r_v),
+            (BinaryView, BinaryView) => apply(op, l.as_binary_view(), l_s, l_v, r.as_binary_view(), r_s, r_v),
             (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s, l_v, r.as_binary::<i64>(), r_s, r_v),
             (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op, l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
             (Null, Null) => None,
@@ -459,7 +461,7 @@ fn apply_op_vectored<T: ArrayOrd>(
 }
 
 trait ArrayOrd {
-    type Item: Copy + Default;
+    type Item: Copy;
 
     fn len(&self) -> usize;
 
@@ -538,6 +540,109 @@ impl<'a, T: ByteArrayType> ArrayOrd for &'a GenericByteArray<T> {
     }
 }
 
+/// Comparing two ByteView values is non-trivial.
+/// It takes a bit of patience to understand why we don't just compare two `&[u8]` directly.
+///
+/// ByteView types give us the following two advantages, and we need to be careful not to lose them:
+/// (1) For string/byte values of at most 12 bytes, the entire data is inlined in the view,
+///     meaning that reading one array element requires only one memory access
+///     (StringArray requires two: one for the offset buffer, the other for the value buffer).
+///
+/// (2) For string/byte values longer than 12 bytes, we can still be faster than StringArray/ByteArray
+///     for certain operations, thanks to the 4 bytes inlined in the view.
+///     Consider an equality check:
+///     If the first four bytes of the two strings are different, we can return false immediately
+///     (with just one memory access).
+///     If we are unlucky and the first four bytes are the same, we need to fall back to comparing
+///     the two full strings.
+impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray<T> {
+    /// Item.0 is the array, Item.1 is the index into the array.
+    /// Why don't we just store Item.0[Item.1] as the item?
+    ///  - Because if we do so, we materialize the entire string (i.e., make multiple memory
+    ///    accesses), which might be unnecessary.
+    ///  - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
+    ///    e.g., if the inlined 4 bytes are different, we can directly return unequal without
+    ///    looking at the full string.
+    type Item = (&'a GenericByteViewArray<T>, usize);
+
+    /// # Equality check flow
+    /// (0) if the two lengths differ, the values are unequal; no data bytes are read.
+    /// (1) if both values are at most 12 bytes, we can directly compare the data inlined in the views.
+    /// (2) otherwise both values are longer than 12 bytes:
+    ///     (2.1) if the inlined 4 bytes are different, we can return false immediately.
+    ///     (2.2) o.w., we need to compare the full values.
+    ///
+    /// # Safety
+    /// (1) Indexing. `Self::Item.1` is the index that was handed to the unsafe `value_unchecked`,
+    ///               whose caller is responsible for it being in bounds, so the views are indexed
+    ///               without a bounds check.
+    /// (2) Slice data from view. Every length passed to `inline_value` is at most 12, so the
+    ///               slice stays inside the 16-byte view.
+    fn is_eq(l: Self::Item, r: Self::Item) -> bool {
+        // SAFETY: see (1) above, the indices are in bounds.
+        let l_view = unsafe { l.0.views().get_unchecked(l.1) };
+        let l_len = *l_view as u32;
+
+        // SAFETY: see (1) above, the indices are in bounds.
+        let r_view = unsafe { r.0.views().get_unchecked(r.1) };
+        let r_len = *r_view as u32;
+
+        if l_len != r_len {
+            return false;
+        }
+
+        if l_len <= 12 {
+            // SAFETY: `l_len == r_len` and both are at most 12.
+            let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
+            let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
+            l_data == r_data
+        } else {
+            // SAFETY: 4 <= 12, so the slices stay inside the views.
+            let l_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 4) };
+            let r_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 4) };
+            if l_inlined_data != r_inlined_data {
+                return false;
+            }
+
+            // SAFETY: see (1) above, the indices are in bounds.
+            let l_full_data: &[u8] = unsafe { l.0.value_unchecked(l.1).as_ref() };
+            let r_full_data: &[u8] = unsafe { r.0.value_unchecked(r.1).as_ref() };
+            l_full_data == r_full_data
+        }
+    }
+
+    /// # Ordering check flow
+    /// (1) if both values are at most 12 bytes, we can directly compare the data inlined in the views.
+    /// (2) if any of the values is longer than 12 bytes:
+    ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
+    ///     (2.2) o.w., we need to compare the full values.
+    ///
+    /// # Safety
+    /// (1) Indexing. `Self::Item.1` is the index that was handed to the unsafe `value_unchecked`,
+    ///               whose caller is responsible for it being in bounds, so the views are indexed
+    ///               without a bounds check (the same assumption `is_eq` makes).
+    /// (2) Slice data from view. Every length passed to `inline_value` is at most 12, so the
+    ///               slice stays inside the 16-byte view.
+    fn is_lt(l: Self::Item, r: Self::Item) -> bool {
+        // SAFETY: see (1) above, the indices are in bounds.
+        let l_view = unsafe { l.0.views().get_unchecked(l.1) };
+        let l_len = *l_view as u32;
+
+        // SAFETY: see (1) above, the indices are in bounds.
+        let r_view = unsafe { r.0.views().get_unchecked(r.1) };
+        let r_len = *r_view as u32;
+
+        if l_len <= 12 && r_len <= 12 {
+            // SAFETY: both lengths were just checked to be at most 12.
+            let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
+            let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
+            return l_data < r_data;
+        }
+        // At least one of the values is longer than 12 bytes,
+        // so we try to compare the 4 inlined bytes first.
+        // NOTE(review): the other value may be shorter than 4 bytes, in which case this also reads
+        // the bytes that follow its data inside the view. The result is only correct if those
+        // bytes are zero -- presumably guaranteed by the view layout; confirm against the spec.
+        // SAFETY: 4 <= 12, so the slices stay inside the views.
+        let l_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 4) };
+        let r_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 4) };
+        if r_inlined_data != l_inlined_data {
+            return l_inlined_data < r_inlined_data;
+        }
+        // Unfortunately, we need to compare the full data.
+        // SAFETY: see (1) above, the indices are in bounds.
+        let l_full_data: &[u8] = unsafe { l.0.value_unchecked(l.1).as_ref() };
+        let r_full_data: &[u8] = unsafe { r.0.value_unchecked(r.1).as_ref() };
+        l_full_data < r_full_data
+    }
+
+    fn len(&self) -> usize {
+        Array::len(self)
+    }
+
+    /// Defers the actual read: only the array and the index are captured here, and the
+    /// comparison functions decide how many bytes they need to load.
+    ///
+    /// # Safety
+    /// `idx` must be in bounds for this array; `is_eq` and `is_lt` index with it unchecked.
+    unsafe fn value_unchecked(&self, idx: usize) -> Self::Item {
+        (self, idx)
+    }
+}
+
 impl<'a> ArrayOrd for &'a FixedSizeBinaryArray {
     type Item = &'a [u8];
 
diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs