Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[arrow-cast] Support cast numeric to string view #6719

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,13 @@ impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
/// ```
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;

/// Lets a [`StringViewBuilder`] act as a [`std::fmt::Write`] sink.
///
/// Note that every `write_str` call appends its argument as a *separate*
/// element of the array under construction; fragments are not concatenated
/// into a single value.
impl std::fmt::Write for StringViewBuilder {
    /// Appends `fragment` as a new value and always reports success.
    fn write_str(&mut self, fragment: &str) -> std::fmt::Result {
        self.append_value(fragment);
        std::fmt::Result::Ok(())
    }
}

/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
///
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
Expand Down
120 changes: 97 additions & 23 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(Decimal128(_, _) | Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) |
// decimal to signed numeric
(Decimal128(_, _) | Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true,
// decimal to Utf8
(Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true,
// decimal to string
(Decimal128(_, _) | Decimal256(_, _), Utf8View | Utf8 | LargeUtf8) => true,
// Utf8 to decimal
(Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true,
(Struct(from_fields), Struct(to_fields)) => {
Expand Down Expand Up @@ -231,7 +231,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(Utf8 | LargeUtf8, Utf8View) => true,
(BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
(_, Utf8View | Utf8 | LargeUtf8) => from_type.is_primitive(),

(_, Binary | LargeBinary) => from_type.is_integer(),

Expand Down Expand Up @@ -917,6 +917,7 @@ pub fn cast_with_options(
Float64 => cast_decimal_to_float::<Decimal128Type, Float64Type, _>(array, |x| {
x as f64 / 10_f64.powi(*scale as i32)
}),
Utf8View => value_to_string_view(array, cast_options),
Utf8 => value_to_string::<i32>(array, cast_options),
LargeUtf8 => value_to_string::<i64>(array, cast_options),
Null => Ok(new_null_array(to_type, array.len())),
Expand Down Expand Up @@ -982,6 +983,7 @@ pub fn cast_with_options(
Float64 => cast_decimal_to_float::<Decimal256Type, Float64Type, _>(array, |x| {
x.to_f64().unwrap() / 10_f64.powi(*scale as i32)
}),
Utf8View => value_to_string_view(array, cast_options),
Utf8 => value_to_string::<i32>(array, cast_options),
LargeUtf8 => value_to_string::<i64>(array, cast_options),
Null => Ok(new_null_array(to_type, array.len())),
Expand Down Expand Up @@ -1462,6 +1464,9 @@ pub fn cast_with_options(
(BinaryView, _) => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
(from_type, Utf8View) if from_type.is_primitive() => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this also fixes the Timestamp -> Utf8View issue. It would be good to have tests for temporal -> Utf8View added to cover this case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After reviewing the code, I realized that the Timestamp -> Utf8View cast is not supported yet.

The main issue is that formatter.format.write (source) currently only works for types that implement DisplayIndex (source), whereas the temporal data types are implemented on top of DisplayIndexState (source).

I think this issue deserves a separate PR to handle the temporal -> string view casting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll file another PR today to cover the temporal -> Utf8View case unless someone beats me to it.

value_to_string_view(array, cast_options)
}
(from_type, LargeUtf8) if from_type.is_primitive() => {
value_to_string::<i64>(array, cast_options)
}
Expand Down Expand Up @@ -2485,12 +2490,11 @@ where

#[cfg(test)]
mod tests {
use super::*;
use arrow_buffer::{Buffer, IntervalDayTime, NullBuffer};
use chrono::NaiveDate;
use half::f16;

use super::*;

macro_rules! generate_cast_test_case {
($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => {
let output =
Expand Down Expand Up @@ -3708,6 +3712,40 @@ mod tests {
assert_eq!(10.0, c.value(3));
}

#[test]
fn test_cast_int_to_utf8view() {
    // Every signed integer width must be reported as castable to Utf8View.
    for from_type in [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ] {
        assert!(can_cast_types(&from_type, &DataType::Utf8View));
    }

    // Cast a concrete Int32 array (with a leading null) and inspect each slot.
    let input = Int32Array::from(vec![None, Some(8), Some(9), Some(10)]);
    let casted = cast(&input, &DataType::Utf8View).unwrap();
    assert_eq!(4, casted.len());
    assert_eq!(1, casted.null_count());

    let views = casted.as_string_view();
    assert!(views.is_null(0));
    assert_eq!("8", views.value(1));
    assert_eq!("9", views.value(2));
    assert_eq!("10", views.value(3));
}

#[test]
fn test_cast_float_to_utf8view() {
    // Every floating-point width must be reported as castable to Utf8View.
    for from_type in [DataType::Float16, DataType::Float32, DataType::Float64] {
        assert!(can_cast_types(&from_type, &DataType::Utf8View));
    }

    // Cast a concrete Float32 array (with a trailing null) and inspect each slot.
    let input = Float32Array::from(vec![Some(8.64), Some(9.81), None]);
    let casted = cast(&input, &DataType::Utf8View).unwrap();
    assert_eq!(3, casted.len());
    assert_eq!(1, casted.null_count());

    let views = casted.as_string_view();
    assert_eq!("8.64", views.value(0));
    assert_eq!("9.81", views.value(1));
    assert!(views.is_null(2));
}

#[test]
fn test_cast_utf8_to_i32() {
let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]);
Expand Down Expand Up @@ -9114,26 +9152,51 @@ mod tests {
}

#[test]
fn test_cast_decimal_to_utf8() {
fn test_cast_decimal_to_string() {
assert!(can_cast_types(
&DataType::Decimal128(10, 4),
&DataType::Utf8View
));
assert!(can_cast_types(
&DataType::Decimal256(38, 10),
&DataType::Utf8View
));

macro_rules! assert_decimal_values {
($array:expr) => {
let c = $array;
assert_eq!("1123.454", c.value(0));
assert_eq!("2123.456", c.value(1));
assert_eq!("-3123.453", c.value(2));
assert_eq!("-3123.456", c.value(3));
assert_eq!("0.000", c.value(4));
assert_eq!("0.123", c.value(5));
assert_eq!("1234.567", c.value(6));
assert_eq!("-1234.567", c.value(7));
assert!(c.is_null(8));
};
}

fn test_decimal_to_string<IN: ArrowPrimitiveType, OffsetSize: OffsetSizeTrait>(
output_type: DataType,
array: PrimitiveArray<IN>,
) {
let b = cast(&array, &output_type).unwrap();

assert_eq!(b.data_type(), &output_type);
let c = b.as_string::<OffsetSize>();

assert_eq!("1123.454", c.value(0));
assert_eq!("2123.456", c.value(1));
assert_eq!("-3123.453", c.value(2));
assert_eq!("-3123.456", c.value(3));
assert_eq!("0.000", c.value(4));
assert_eq!("0.123", c.value(5));
assert_eq!("1234.567", c.value(6));
assert_eq!("-1234.567", c.value(7));
assert!(c.is_null(8));
match b.data_type() {
DataType::Utf8View => {
let c = b.as_string_view();
assert_decimal_values!(c);
}
DataType::Utf8 | DataType::LargeUtf8 => {
let c = b.as_string::<OffsetSize>();
assert_decimal_values!(c);
}
_ => (),
}
}

let array128: Vec<Option<i128>> = vec![
Some(1123454),
Some(2123456),
Expand All @@ -9145,22 +9208,33 @@ mod tests {
Some(-123456789),
None,
];
let array256: Vec<Option<i256>> = array128
.iter()
.map(|num| num.map(i256::from_i128))
.collect();

let array256: Vec<Option<i256>> = array128.iter().map(|v| v.map(i256::from_i128)).collect();

test_decimal_to_string::<arrow_array::types::Decimal128Type, i32>(
test_decimal_to_string::<Decimal128Type, i32>(
DataType::Utf8View,
create_decimal_array(array128.clone(), 7, 3).unwrap(),
);
test_decimal_to_string::<Decimal128Type, i32>(
DataType::Utf8,
create_decimal_array(array128.clone(), 7, 3).unwrap(),
);
test_decimal_to_string::<arrow_array::types::Decimal128Type, i64>(
test_decimal_to_string::<Decimal128Type, i64>(
DataType::LargeUtf8,
create_decimal_array(array128, 7, 3).unwrap(),
);
test_decimal_to_string::<arrow_array::types::Decimal256Type, i32>(

test_decimal_to_string::<Decimal256Type, i32>(
DataType::Utf8View,
create_decimal256_array(array256.clone(), 7, 3).unwrap(),
);
test_decimal_to_string::<Decimal256Type, i32>(
DataType::Utf8,
create_decimal256_array(array256.clone(), 7, 3).unwrap(),
);
test_decimal_to_string::<arrow_array::types::Decimal256Type, i64>(
test_decimal_to_string::<Decimal256Type, i64>(
DataType::LargeUtf8,
create_decimal256_array(array256, 7, 3).unwrap(),
);
Expand Down
16 changes: 16 additions & 0 deletions arrow-cast/src/cast/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,22 @@ pub(crate) fn value_to_string<O: OffsetSizeTrait>(
Ok(Arc::new(builder.finish()))
}

/// Casts every value of `array` to its string representation and collects the
/// results into a string view array.
///
/// Non-null values are rendered with an [`ArrayFormatter`] built from
/// `options.format_options`; null slots are carried over as nulls, so the
/// output has the same length as `array`.
///
/// # Errors
///
/// Returns an [`ArrowError`] if a formatter cannot be created for `array` or
/// if formatting an individual value fails.
pub(crate) fn value_to_string_view(
    array: &dyn Array,
    options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
    let mut builder = StringViewBuilder::with_capacity(array.len());
    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
    let nulls = array.nulls();
    // A formatter may emit a single value through several `write_str` calls,
    // and writing straight into the builder would append one element per call.
    // Render into a reusable scratch buffer instead and append exactly once
    // per input row.
    let mut buffer = String::new();
    for i in 0..array.len() {
        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
            true => builder.append_null(),
            false => {
                buffer.clear();
                formatter.value(i).write(&mut buffer)?;
                builder.append_value(&buffer);
            }
        }
    }
    Ok(Arc::new(builder.finish()))
}

/// Parse UTF-8
pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
array: &dyn Array,
Expand Down
Loading