From cd5886fd8d4cf3fe231d281c2bdb805e72c4e585 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 20 Aug 2024 13:49:17 +0800 Subject: [PATCH] Fix comments --- arrow-array/src/array/byte_view_array.rs | 116 ++++++++++------------- arrow-string/src/like.rs | 21 ++++ arrow-string/src/predicate.rs | 44 ++++++--- 3 files changed, 101 insertions(+), 80 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 237316e0785c..aa512036a28d 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -279,22 +279,6 @@ impl GenericByteViewArray { T::Native::from_bytes_unchecked(b) } - /// Returns the bytes at index `i` - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] { - let v = self.views.get_unchecked(idx); - let len = *v as u32; - if len <= 12 { - Self::inline_value(v, len as usize) - } else { - let view = ByteView::from(*v); - let data = self.buffers.get_unchecked(view.buffer_index as usize); - let offset = view.offset as usize; - data.get_unchecked(offset..offset + len as usize) - } - } - /// Returns the inline value of the view. /// /// # Safety @@ -326,6 +310,54 @@ impl GenericByteViewArray { }) } + /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. + /// If the prefix length is larger than the string length, it will return the empty string. + pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < prefix_len { + return &[] as &[u8]; + } + + if prefix_len <= 4 || len <= 12 { + unsafe { StringViewArray::inline_value(v, prefix_len) } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset..offset + prefix_len) } + } + }) + } + + /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. + /// If the suffix length is larger than the string length, it will return the empty string. + pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < suffix_len { + return &[] as &[u8]; + } + + if len <= 12 { + unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } + } + }) + } + /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { Self { @@ -699,58 +731,6 @@ impl StringViewArray { None => true, }) } - - /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. - /// If the prefix length is larger than the string length, it will return the empty string. - pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator { - self.views().into_iter().map(move |v| { - let len = (*v as u32) as usize; - - if len < prefix_len { - return ""; - } - - let b = if prefix_len <= 4 || len <= 12 { - unsafe { StringViewArray::inline_value(v, prefix_len) } - } else { - let view = ByteView::from(*v); - let data = unsafe { - self.data_buffers() - .get_unchecked(view.buffer_index as usize) - }; - let offset = view.offset as usize; - unsafe { data.get_unchecked(offset..offset + prefix_len) } - }; - - unsafe { str::from_utf8_unchecked(b) } - }) - } - - /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. - /// If the suffix length is larger than the string length, it will return the empty string. - pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator { - self.views().into_iter().map(move |v| { - let len = (*v as u32) as usize; - - if len < suffix_len { - return ""; - } - - let b = if len <= 12 { - unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } - } else { - let view = ByteView::from(*v); - let data = unsafe { - self.data_buffers() - .get_unchecked(view.buffer_index as usize) - }; - let offset = view.offset as usize; - unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } - }; - - unsafe { str::from_utf8_unchecked(b) } - }) - } } impl From> for StringViewArray { diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index e878e241853a..4626be1362e9 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -989,6 +989,27 @@ mod tests { vec![false, true, true, false, false, false, false, true, true, true, true] ); + // 😈 is four bytes long. + test_utf8_scalar!( + test_uff8_array_like_multibyte, + vec![ + "sdlkdfFooßsdfs", + "sdlkdfFooSSdggs", + "sdlkdfFoosssdsd", + "FooS", + "Foos", + "ffooSS", + "ffooß", + "😃sadlksffofsSsh😈klF", + "😱slgffoesSsh😈klF", + "FFKoSS", + "longer than 12 bytes FFKoSS", + ], + "%Ssh😈klF", + like, + vec![false, false, false, false, false, false, false, true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_ilike_scalar_one, vec![ diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 73a0cce9744f..91a4e187541b 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -132,8 +132,10 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .prefix_iter(v.len()) - .map(|haystack| starts_with(haystack, v, equals_kernel) != negate) + .prefix_bytes_iter(v.len()) + .map(|haystack| { + starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) .collect::>(), ) } else { @@ -146,9 +148,13 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .prefix_iter(v.len()) + .prefix_bytes_iter(v.len()) .map(|haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + starts_with_bytes( + haystack, + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate }) .collect::>(), ) @@ -162,8 +168,10 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .suffix_iter(v.len()) - .map(|haystack| starts_with(haystack, v, equals_kernel) != negate) + .suffix_bytes_iter(v.len()) + .map(|haystack| { + starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) .collect::>(), ) } else { @@ -176,9 +184,13 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .suffix_iter(v.len()) + .suffix_bytes_iter(v.len()) .map(|haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + starts_with_bytes( + haystack, + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate }) .collect::>(), ) @@ -195,16 +207,24 @@ impl<'a> Predicate<'a> { } } -/// This is faster than `str::starts_with` for small strings. -/// See for more details. -fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { +fn starts_with_bytes( + haystack: &[u8], + needle: &[u8], + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { if needle.len() > haystack.len() { false } else { - zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) + zip(haystack, needle).all(byte_eq_kernel) } } +/// This is faster than `str::starts_with` for small strings. +/// See for more details. +fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { + starts_with_bytes(haystack.as_bytes(), needle.as_bytes(), byte_eq_kernel) +} + /// This is faster than `str::ends_with` for small strings. /// See for more details. fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {