Skip to content

Commit a61f1dc

Browse files
RinChanNOWWW and alamb authored
Support casting StringArray/BinaryArray --> StringView / BinaryView (#5686)
* Support casting from byte array to byte view array. * Use new_unchecked. * Add safety justification comment. * Fix comments :facepalm: --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 08af471 commit a61f1dc

File tree

2 files changed

+139
-2
lines changed

2 files changed

+139
-2
lines changed

arrow-array/src/array/byte_view_array.rs

+12
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,18 @@ impl BinaryViewArray {
428428
}
429429
}
430430

431+
impl From<Vec<&[u8]>> for BinaryViewArray {
432+
fn from(v: Vec<&[u8]>) -> Self {
433+
Self::from_iter_values(v)
434+
}
435+
}
436+
437+
impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
438+
fn from(v: Vec<Option<&[u8]>>) -> Self {
439+
v.into_iter().collect()
440+
}
441+
}
442+
431443
/// A [`GenericByteViewArray`] that stores utf8 data
432444
///
433445
/// # Example

arrow-cast/src/cast/mod.rs

+127-2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ use crate::cast::dictionary::*;
4646
use crate::cast::list::*;
4747
use crate::cast::string::*;
4848

49+
use arrow_buffer::ScalarBuffer;
50+
use arrow_data::ByteView;
4951
use chrono::{NaiveTime, Offset, TimeZone, Utc};
5052
use std::cmp::Ordering;
5153
use std::sync::Arc;
@@ -119,6 +121,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
119121
| Utf8
120122
| LargeBinary
121123
| LargeUtf8
124+
| BinaryView
125+
| Utf8View
122126
| List(_)
123127
| LargeList(_)
124128
| FixedSizeList(_, _)
@@ -192,8 +196,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
192196
DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8
193197
}
194198

195-
(Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
196-
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true,
199+
(Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
200+
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
197201
(FixedSizeBinary(_), Binary | LargeBinary) => true,
198202
(
199203
Utf8 | LargeUtf8,
@@ -213,6 +217,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
213217
| Timestamp(Nanosecond, _)
214218
| Interval(_),
215219
) => true,
220+
(Utf8 | LargeUtf8, Utf8View) => true,
216221
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
217222
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
218223

@@ -611,6 +616,8 @@ pub fn cast_with_options(
611616
| Utf8
612617
| LargeBinary
613618
| LargeUtf8
619+
| BinaryView
620+
| Utf8View
614621
| List(_)
615622
| LargeList(_)
616623
| FixedSizeList(_, _)
@@ -1120,6 +1127,7 @@ pub fn cast_with_options(
11201127
let binary = BinaryArray::from(array.as_string::<i32>().clone());
11211128
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
11221129
}
1130+
Utf8View => cast_byte_to_view::<Utf8Type, StringViewType>(array),
11231131
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
11241132
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i32>(array, cast_options),
11251133
Time32(TimeUnit::Millisecond) => {
@@ -1179,6 +1187,7 @@ pub fn cast_with_options(
11791187
LargeBinary => Ok(Arc::new(LargeBinaryArray::from(
11801188
array.as_string::<i64>().clone(),
11811189
))),
1190+
Utf8View => cast_byte_to_view::<LargeUtf8Type, StringViewType>(array),
11821191
Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i64>(array, cast_options),
11831192
Time32(TimeUnit::Millisecond) => {
11841193
parse_string::<Time32MillisecondType, i64>(array, cast_options)
@@ -1226,6 +1235,7 @@ pub fn cast_with_options(
12261235
FixedSizeBinary(size) => {
12271236
cast_binary_to_fixed_size_binary::<i32>(array, *size, cast_options)
12281237
}
1238+
BinaryView => cast_byte_to_view::<BinaryType, BinaryViewType>(array),
12291239
_ => Err(ArrowError::CastError(format!(
12301240
"Casting from {from_type:?} to {to_type:?} not supported",
12311241
))),
@@ -1240,6 +1250,7 @@ pub fn cast_with_options(
12401250
FixedSizeBinary(size) => {
12411251
cast_binary_to_fixed_size_binary::<i64>(array, *size, cast_options)
12421252
}
1253+
BinaryView => cast_byte_to_view::<LargeBinaryType, BinaryViewType>(array),
12431254
_ => Err(ArrowError::CastError(format!(
12441255
"Casting from {from_type:?} to {to_type:?} not supported",
12451256
))),
@@ -2238,6 +2249,56 @@ where
22382249
Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
22392250
}
22402251

2252+
/// Helper function to cast from one `ByteArrayType` array to `ByteViewType` array.
2253+
fn cast_byte_to_view<FROM, V>(array: &dyn Array) -> Result<ArrayRef, ArrowError>
2254+
where
2255+
FROM: ByteArrayType,
2256+
FROM::Offset: OffsetSizeTrait + ToPrimitive,
2257+
V: ByteViewType,
2258+
{
2259+
let data = array.to_data();
2260+
assert_eq!(data.data_type(), &FROM::DATA_TYPE);
2261+
2262+
let len = array.len();
2263+
let str_values_buf = data.buffers()[1].clone();
2264+
let offsets = data.buffers()[0].typed_data::<FROM::Offset>();
2265+
2266+
let mut views_builder = BufferBuilder::<u128>::new(len);
2267+
for w in offsets.windows(2) {
2268+
let offset = w[0].to_u32().unwrap();
2269+
let end = w[1].to_u32().unwrap();
2270+
let value_buf = &str_values_buf[offset as usize..end as usize];
2271+
let length = end - offset;
2272+
2273+
if length <= 12 {
2274+
let mut view_buffer = [0; 16];
2275+
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
2276+
view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
2277+
views_builder.append(u128::from_le_bytes(view_buffer));
2278+
} else {
2279+
let view = ByteView {
2280+
length,
2281+
prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
2282+
buffer_index: 0,
2283+
offset,
2284+
};
2285+
views_builder.append(view.into());
2286+
}
2287+
}
2288+
2289+
assert_eq!(views_builder.len(), len);
2290+
2291+
// Safety: the input was a valid array so it valid UTF8 (if string). And
2292+
// all offsets were valid and we created the views correctly
2293+
Ok(Arc::new(unsafe {
2294+
GenericByteViewArray::<V>::new_unchecked(
2295+
ScalarBuffer::new(views_builder.finish(), 0, len),
2296+
vec![str_values_buf],
2297+
data.nulls().cloned(),
2298+
)
2299+
}))
2300+
}
2301+
22412302
#[cfg(test)]
22422303
mod tests {
22432304
use arrow_buffer::{Buffer, NullBuffer};
@@ -5044,6 +5105,70 @@ mod tests {
50445105
}
50455106
}
50465107

5108+
#[test]
fn test_string_to_view() {
    // Run the shared check once per offset width (i32 and i64).
    _test_string_to_view::<i32>();
    _test_string_to_view::<i64>();
}
5113+
5114+
fn _test_string_to_view<O>()
5115+
where
5116+
O: OffsetSizeTrait,
5117+
{
5118+
let data = vec![
5119+
Some("hello"),
5120+
Some("world"),
5121+
None,
5122+
Some("large payload over 12 bytes"),
5123+
Some("lulu"),
5124+
];
5125+
5126+
let string_array = GenericStringArray::<O>::from(data.clone());
5127+
5128+
assert!(can_cast_types(
5129+
string_array.data_type(),
5130+
&DataType::Utf8View
5131+
));
5132+
5133+
let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap();
5134+
assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
5135+
5136+
let expect_string_view_array = StringViewArray::from(data);
5137+
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
5138+
}
5139+
5140+
#[test]
fn test_bianry_to_view() {
    // Run the shared check once per offset width (i32 and i64).
    _test_binary_to_view::<i32>();
    _test_binary_to_view::<i64>();
}
5145+
5146+
fn _test_binary_to_view<O>()
5147+
where
5148+
O: OffsetSizeTrait,
5149+
{
5150+
let data: Vec<Option<&[u8]>> = vec![
5151+
Some(b"hello"),
5152+
Some(b"world"),
5153+
None,
5154+
Some(b"large payload over 12 bytes"),
5155+
Some(b"lulu"),
5156+
];
5157+
5158+
let binary_array = GenericBinaryArray::<O>::from(data.clone());
5159+
5160+
assert!(can_cast_types(
5161+
binary_array.data_type(),
5162+
&DataType::BinaryView
5163+
));
5164+
5165+
let binary_view_array = cast(&binary_array, &DataType::BinaryView).unwrap();
5166+
assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
5167+
5168+
let expect_binary_view_array = BinaryViewArray::from(data);
5169+
assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
5170+
}
5171+
50475172
#[test]
50485173
fn test_cast_from_f64() {
50495174
let f64_values: Vec<f64> = vec![

0 commit comments

Comments
 (0)