Skip to content

Commit

Permalink
Fix comments
Browse files Browse the repository at this point in the history
  • Loading branch information
xinlifoobar committed Aug 20, 2024
1 parent 3322905 commit cd5886f
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 80 deletions.
116 changes: 48 additions & 68 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,22 +279,6 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
T::Native::from_bytes_unchecked(b)
}

/// Returns the raw bytes of the value stored at position `idx`, without bounds checking.
///
/// # Safety
///
/// The caller must guarantee that `idx` is within the bounds of the array.
pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] {
    let raw_view = self.views.get_unchecked(idx);
    // The low 32 bits of a view are read as the value length.
    let value_len = *raw_view as u32 as usize;
    if value_len > 12 {
        // Values longer than 12 bytes are fetched from the data buffer named by the view.
        let decoded = ByteView::from(*raw_view);
        let start = decoded.offset as usize;
        return self
            .buffers
            .get_unchecked(decoded.buffer_index as usize)
            .get_unchecked(start..start + value_len);
    }
    // Values of at most 12 bytes are read straight out of the view itself.
    Self::inline_value(raw_view, value_len)
}

/// Returns the inline value of the view.
///
/// # Safety
Expand Down Expand Up @@ -326,6 +310,54 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
})
}

/// Returns an iterator over the prefix bytes of this array with respect to the prefix length.
/// If the prefix length is larger than the string length, it will return the empty string.
pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator<Item = &[u8]> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < prefix_len {
return &[] as &[u8];
}

if prefix_len <= 4 || len <= 12 {
unsafe { StringViewArray::inline_value(v, prefix_len) }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset..offset + prefix_len) }
}
})
}

/// Returns an iterator over the suffix bytes of this array with respect to the suffix length.
/// If the suffix length is larger than the string length, it will return the empty string.
pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator<Item = &[u8]> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < suffix_len {
return &[] as &[u8];
}

if len <= 12 {
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
}
})
}

/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
Expand Down Expand Up @@ -699,58 +731,6 @@ impl StringViewArray {
None => true,
})
}

/// Returns an iterator over the prefix bytes of this array with respect to the prefix length.
/// If the prefix length is larger than the string length, it will return the empty string.
pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator<Item = &str> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < prefix_len {
return "";
}

let b = if prefix_len <= 4 || len <= 12 {
unsafe { StringViewArray::inline_value(v, prefix_len) }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset..offset + prefix_len) }
};

unsafe { str::from_utf8_unchecked(b) }
})
}

/// Returns an iterator over the suffix bytes of this array with respect to the suffix length.
/// If the suffix length is larger than the string length, it will return the empty string.
pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator<Item = &str> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < suffix_len {
return "";
}

let b = if len <= 12 {
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
};

unsafe { str::from_utf8_unchecked(b) }
})
}
}

impl From<Vec<&str>> for StringViewArray {
Expand Down
21 changes: 21 additions & 0 deletions arrow-string/src/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,27 @@ mod tests {
vec![false, true, true, false, false, false, false, true, true, true, true]
);

// `😈` occupies four bytes in UTF-8, so this `%…` pattern exercises suffix matching
// across multi-byte characters. Only the two inputs ending in "Ssh😈klF" must match.
test_utf8_scalar!(
    test_utf8_array_like_multibyte,
    vec![
        "sdlkdfFooßsdfs",
        "sdlkdfFooSSdggs",
        "sdlkdfFoosssdsd",
        "FooS",
        "Foos",
        "ffooSS",
        "ffooß",
        "😃sadlksffofsSsh😈klF",
        "😱slgffoesSsh😈klF",
        "FFKoSS",
        "longer than 12 bytes FFKoSS",
    ],
    "%Ssh😈klF",
    like,
    vec![false, false, false, false, false, false, false, true, true, false, false]
);

test_utf8_scalar!(
test_utf8_array_ilike_scalar_one,
vec![
Expand Down
44 changes: 32 additions & 12 deletions arrow-string/src/predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,10 @@ impl<'a> Predicate<'a> {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.prefix_iter(v.len())
.map(|haystack| starts_with(haystack, v, equals_kernel) != negate)
.prefix_bytes_iter(v.len())
.map(|haystack| {
starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate
})
.collect::<Vec<_>>(),
)
} else {
Expand All @@ -146,9 +148,13 @@ impl<'a> Predicate<'a> {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.prefix_iter(v.len())
.prefix_bytes_iter(v.len())
.map(|haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
starts_with_bytes(
haystack,
v.as_bytes(),
equals_ignore_ascii_case_kernel,
) != negate
})
.collect::<Vec<_>>(),
)
Expand All @@ -162,8 +168,10 @@ impl<'a> Predicate<'a> {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.suffix_iter(v.len())
.map(|haystack| starts_with(haystack, v, equals_kernel) != negate)
.suffix_bytes_iter(v.len())
.map(|haystack| {
starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate
})
.collect::<Vec<_>>(),
)
} else {
Expand All @@ -176,9 +184,13 @@ impl<'a> Predicate<'a> {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.suffix_iter(v.len())
.suffix_bytes_iter(v.len())
.map(|haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
starts_with_bytes(
haystack,
v.as_bytes(),
equals_ignore_ascii_case_kernel,
) != negate
})
.collect::<Vec<_>>(),
)
Expand All @@ -195,16 +207,24 @@ impl<'a> Predicate<'a> {
}
}

/// Returns `true` when `haystack` begins with `needle`, where two bytes count as equal
/// if `byte_eq_kernel` returns `true` for the `(haystack_byte, needle_byte)` pair.
///
/// An empty `needle` matches every `haystack`; a `needle` longer than `haystack`
/// never matches.
fn starts_with_bytes(
    haystack: &[u8],
    needle: &[u8],
    byte_eq_kernel: impl Fn((&u8, &u8)) -> bool,
) -> bool {
    // `zip` stops at the shorter input, so an over-long needle has to be rejected up
    // front; otherwise only its first `haystack.len()` bytes would be compared.
    needle.len() <= haystack.len() && zip(haystack, needle).all(byte_eq_kernel)
}

/// String front-end for [`starts_with_bytes`]: compares the UTF-8 bytes of `haystack`
/// and `needle` pairwise using `byte_eq_kernel`.
///
/// This is faster than `str::starts_with` for small strings.
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
    let (hay_bytes, needle_bytes) = (haystack.as_bytes(), needle.as_bytes());
    starts_with_bytes(hay_bytes, needle_bytes, byte_eq_kernel)
}

/// This is faster than `str::ends_with` for small strings.
/// See <https://github.com/apache/arrow-rs/issues/6107> for more details.
fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {
Expand Down

0 comments on commit cd5886f

Please sign in to comment.