-
-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Add binary as numerical #19542
Changes from 9 commits
180db60
975cf92
d7f84e7
88bfaa5
54a59df
455dc6b
3c895a0
68dc7c3
7905f95
414169d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,8 +7,9 @@ use crate::array::*; | |
use crate::buffer::Buffer; | ||
use crate::datatypes::ArrowDataType; | ||
use crate::offset::{Offset, Offsets}; | ||
use crate::types::NativeType; | ||
use crate::types::{f16, NativeType}; | ||
|
||
/// Trait for parsing a data type from string | ||
pub(super) trait Parse { | ||
fn parse(val: &[u8]) -> Option<Self> | ||
where | ||
|
@@ -50,6 +51,52 @@ impl Parse for f64 { | |
} | ||
} | ||
|
||
/// Trait for casting bytes to a primitive type | ||
/// Both methods take the raw bytes of a single value and return an `Option`, so an | ||
/// implementation can signal bytes it cannot turn into `Self` with `None`. | ||
pub trait Cast { | ||
/// Builds `Self` from `val`, interpreting the bytes as little-endian. | ||
fn cast_le(val: &[u8]) -> Option<Self> | ||
where | ||
Self: Sized; | ||
/// Builds `Self` from `val`, interpreting the bytes as big-endian. | ||
fn cast_be(val: &[u8]) -> Option<Self> | ||
where | ||
Self: Sized; | ||
} | ||
// Implements `Cast` for a primitive type by forwarding to that type's `from_le_bytes` / | ||
// `from_be_bytes` constructors. | ||
macro_rules! impl_cast { | ||
($primitive_type:ident) => { | ||
impl Cast for $primitive_type { | ||
fn cast_le(val: &[u8]) -> Option<Self> { | ||
// `try_into().ok()?` returns `None` when the slice cannot be converted into the array `from_le_bytes` takes. | ||
Some($primitive_type::from_le_bytes(val.try_into().ok()?)) | ||
} | ||
|
||
fn cast_be(val: &[u8]) -> Option<Self> { | ||
// Same conversion check as `cast_le`, but the bytes are read big-endian. | ||
Some($primitive_type::from_be_bytes(val.try_into().ok()?)) | ||
} | ||
} | ||
}; | ||
} | ||
|
||
// Primitive types that receive their `Cast` implementation from `impl_cast!`. | ||
impl_cast!(i8); | ||
impl_cast!(i16); | ||
impl_cast!(i32); | ||
impl_cast!(i64); | ||
impl_cast!(i128); | ||
impl_cast!(u8); | ||
impl_cast!(u16); | ||
impl_cast!(u32); | ||
impl_cast!(u64); | ||
impl_cast!(u128); | ||
impl_cast!(f32); | ||
impl_cast!(f64); | ||
|
||
// `Cast` for `f16`: the bytes are first decoded as a `u16` through `u16::cast_le` / | ||
// `u16::cast_be`, and that value is handed to `f16::from_bits`. | ||
// NOTE(review): a reviewer comment on this change says f16 is not supported and can be | ||
// dropped; it presumably refers to this impl - confirm before relying on it. | ||
impl Cast for f16 { | ||
fn cast_le(val: &[u8]) -> Option<Self> { | ||
// `?` propagates `None` when the `u16` decode fails. | ||
Some(f16::from_bits(u16::cast_le(val)?)) | ||
} | ||
|
||
fn cast_be(val: &[u8]) -> Option<Self> { | ||
// `?` propagates `None` when the `u16` decode fails. | ||
Some(f16::from_bits(u16::cast_be(val)?)) | ||
} | ||
} | ||
|
||
/// Conversion of binary | ||
pub fn binary_to_large_binary( | ||
from: &BinaryArray<i32>, | ||
|
@@ -93,7 +140,8 @@ pub fn binary_to_utf8<O: Offset>( | |
} | ||
|
||
/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. | ||
pub(super) fn binary_to_primitive<O: Offset, T>( | ||
#[allow(dead_code)] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Given that this isn't casting anymore, and we shouldn't dispatch to this via cast, I think we should move this all much higher in the polars dependency chain. This should instead be implemented in |
||
pub(super) fn parse_binary_to_primitive<O: Offset, T>( | ||
from: &BinaryArray<O>, | ||
to: &ArrowDataType, | ||
) -> PrimitiveArray<T> | ||
|
@@ -105,7 +153,8 @@ where | |
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone()) | ||
} | ||
|
||
pub(super) fn binary_to_primitive_dyn<O: Offset, T>( | ||
#[allow(dead_code)] | ||
pub(super) fn parse_binary_to_primitive_dyn<O: Offset, T>( | ||
from: &dyn Array, | ||
to: &ArrowDataType, | ||
options: CastOptionsImpl, | ||
|
@@ -117,10 +166,55 @@ where | |
if options.partial { | ||
unimplemented!() | ||
} else { | ||
Ok(Box::new(binary_to_primitive::<O, T>(from, to))) | ||
Ok(Box::new(parse_binary_to_primitive::<O, T>(from, to))) | ||
} | ||
} | ||
|
||
/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. | ||
/// Each present value is decoded with `T::cast_le` when `is_little_endian` is `true` and | ||
/// with `T::cast_be` otherwise. The collected array is then passed through `.to(to.clone())`. | ||
/// # Panics | ||
/// Panics via `unimplemented!()` when `options.partial` is set. | ||
pub(super) fn cast_binary_to_primitive<O: Offset, T>( | ||
from: &BinaryArray<O>, | ||
to: &ArrowDataType, | ||
options: CastOptionsImpl, | ||
is_little_endian: bool, | ||
) -> PrimitiveArray<T> | ||
where | ||
T: NativeType + Cast, | ||
{ | ||
if options.partial { | ||
// Partial casting is not implemented for this path; reaching it panics. | ||
unimplemented!() | ||
} else { | ||
let iter = from.iter().map(|x| { | ||
// A missing input stays `None`; `and_then` also yields `None` when the decode returns `None`. | ||
x.and_then::<T, _>(|x| { | ||
if is_little_endian { | ||
T::cast_le(x) | ||
} else { | ||
T::cast_be(x) | ||
} | ||
}) | ||
}); | ||
|
||
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone()) | ||
} | ||
} | ||
|
||
/// Type-erased wrapper around `cast_binary_to_primitive`: downcasts `from`, forwards the | ||
/// remaining arguments unchanged, and returns the result boxed inside `Ok`. | ||
/// # Panics | ||
/// Panics if the downcast of `from` fails (`unwrap`), and wherever `cast_binary_to_primitive` panics. | ||
pub(super) fn cast_binary_to_primitive_dyn<O: Offset, T>( | ||
from: &dyn Array, | ||
to: &ArrowDataType, | ||
options: CastOptionsImpl, | ||
is_little_endian: bool, | ||
) -> PolarsResult<Box<dyn Array>> | ||
where | ||
T: NativeType + Cast, | ||
{ | ||
// The downcast target is inferred from the call below, which takes `&BinaryArray<O>`. | ||
let from = from.as_any().downcast_ref().unwrap(); | ||
Ok(Box::new(cast_binary_to_primitive::<O, T>( | ||
from, | ||
to, | ||
options, | ||
is_little_endian, | ||
))) | ||
} | ||
|
||
/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing. | ||
/// # Errors | ||
/// This function errors if the maximum key is smaller than the number of distinct elements | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,8 +11,8 @@ mod utf8_to; | |
pub use binary_to::*; | ||
#[cfg(feature = "dtype-decimal")] | ||
pub use binview_to::binview_to_decimal; | ||
use binview_to::binview_to_primitive_dyn; | ||
pub use binview_to::utf8view_to_utf8; | ||
use binview_to::parse_binview_to_primitive_dyn; | ||
pub use binview_to::{cast_binview_to_primitive_dyn, utf8view_to_utf8}; | ||
pub use boolean_to::*; | ||
pub use decimal_to::*; | ||
use dictionary_to::*; | ||
|
@@ -338,6 +338,16 @@ pub fn cast( | |
array.as_any().downcast_ref().unwrap(), | ||
) | ||
.boxed()), | ||
UInt8 => cast_binview_to_primitive_dyn::<u8>(array, to_type, options, true), | ||
UInt16 => cast_binview_to_primitive_dyn::<u16>(array, to_type, options, true), | ||
UInt32 => cast_binview_to_primitive_dyn::<u32>(array, to_type, options, true), | ||
UInt64 => cast_binview_to_primitive_dyn::<u64>(array, to_type, options, true), | ||
Int8 => cast_binview_to_primitive_dyn::<i8>(array, to_type, options, true), | ||
Int16 => cast_binview_to_primitive_dyn::<i16>(array, to_type, options, true), | ||
Int32 => cast_binview_to_primitive_dyn::<i32>(array, to_type, options, true), | ||
Int64 => cast_binview_to_primitive_dyn::<i64>(array, to_type, options, true), | ||
Float32 => cast_binview_to_primitive_dyn::<f32>(array, to_type, options, true), | ||
Float64 => cast_binview_to_primitive_dyn::<f64>(array, to_type, options, true), | ||
LargeList(inner) if matches!(inner.dtype, ArrowDataType::UInt8) => { | ||
let bin_array = view_to_binary::<i64>(array.as_any().downcast_ref().unwrap()); | ||
Ok(binary_to_list(&bin_array, to_type.clone()).boxed()) | ||
|
@@ -356,7 +366,6 @@ pub fn cast( | |
(LargeList(lhs), List(rhs)) if lhs == rhs => { | ||
Ok(cast_large_to_list(array.as_any().downcast_ref().unwrap(), to_type).boxed()) | ||
}, | ||
|
||
(_, List(to)) => { | ||
// cast primitive to list's primitive | ||
let values = cast(array, &to.dtype, options)?; | ||
|
@@ -394,16 +403,26 @@ pub fn cast( | |
match to_type { | ||
BinaryView => Ok(arr.to_binview().boxed()), | ||
LargeUtf8 => Ok(binview_to::utf8view_to_utf8::<i64>(arr).boxed()), | ||
UInt8 => binview_to_primitive_dyn::<u8>(&arr.to_binview(), to_type, options), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And then we can remove all this from the cast kernels as well. |
||
UInt16 => binview_to_primitive_dyn::<u16>(&arr.to_binview(), to_type, options), | ||
UInt32 => binview_to_primitive_dyn::<u32>(&arr.to_binview(), to_type, options), | ||
UInt64 => binview_to_primitive_dyn::<u64>(&arr.to_binview(), to_type, options), | ||
Int8 => binview_to_primitive_dyn::<i8>(&arr.to_binview(), to_type, options), | ||
Int16 => binview_to_primitive_dyn::<i16>(&arr.to_binview(), to_type, options), | ||
Int32 => binview_to_primitive_dyn::<i32>(&arr.to_binview(), to_type, options), | ||
Int64 => binview_to_primitive_dyn::<i64>(&arr.to_binview(), to_type, options), | ||
Float32 => binview_to_primitive_dyn::<f32>(&arr.to_binview(), to_type, options), | ||
Float64 => binview_to_primitive_dyn::<f64>(&arr.to_binview(), to_type, options), | ||
UInt8 => parse_binview_to_primitive_dyn::<u8>(&arr.to_binview(), to_type, options), | ||
UInt16 => { | ||
parse_binview_to_primitive_dyn::<u16>(&arr.to_binview(), to_type, options) | ||
}, | ||
UInt32 => { | ||
parse_binview_to_primitive_dyn::<u32>(&arr.to_binview(), to_type, options) | ||
}, | ||
UInt64 => { | ||
parse_binview_to_primitive_dyn::<u64>(&arr.to_binview(), to_type, options) | ||
}, | ||
Int8 => parse_binview_to_primitive_dyn::<i8>(&arr.to_binview(), to_type, options), | ||
Int16 => parse_binview_to_primitive_dyn::<i16>(&arr.to_binview(), to_type, options), | ||
Int32 => parse_binview_to_primitive_dyn::<i32>(&arr.to_binview(), to_type, options), | ||
Int64 => parse_binview_to_primitive_dyn::<i64>(&arr.to_binview(), to_type, options), | ||
Float32 => { | ||
parse_binview_to_primitive_dyn::<f32>(&arr.to_binview(), to_type, options) | ||
}, | ||
Float64 => { | ||
parse_binview_to_primitive_dyn::<f64>(&arr.to_binview(), to_type, options) | ||
}, | ||
Timestamp(time_unit, None) => { | ||
utf8view_to_naive_timestamp_dyn(array, time_unit.to_owned()) | ||
}, | ||
|
@@ -508,16 +527,16 @@ pub fn cast( | |
}, | ||
|
||
(LargeBinary, _) => match to_type { | ||
UInt8 => binary_to_primitive_dyn::<i64, u8>(array, to_type, options), | ||
UInt16 => binary_to_primitive_dyn::<i64, u16>(array, to_type, options), | ||
UInt32 => binary_to_primitive_dyn::<i64, u32>(array, to_type, options), | ||
UInt64 => binary_to_primitive_dyn::<i64, u64>(array, to_type, options), | ||
Int8 => binary_to_primitive_dyn::<i64, i8>(array, to_type, options), | ||
Int16 => binary_to_primitive_dyn::<i64, i16>(array, to_type, options), | ||
Int32 => binary_to_primitive_dyn::<i64, i32>(array, to_type, options), | ||
Int64 => binary_to_primitive_dyn::<i64, i64>(array, to_type, options), | ||
Float32 => binary_to_primitive_dyn::<i64, f32>(array, to_type, options), | ||
Float64 => binary_to_primitive_dyn::<i64, f64>(array, to_type, options), | ||
UInt8 => cast_binary_to_primitive_dyn::<i64, u8>(array, to_type, options, true), | ||
UInt16 => cast_binary_to_primitive_dyn::<i64, u16>(array, to_type, options, true), | ||
UInt32 => cast_binary_to_primitive_dyn::<i64, u32>(array, to_type, options, true), | ||
UInt64 => cast_binary_to_primitive_dyn::<i64, u64>(array, to_type, options, true), | ||
Int8 => cast_binary_to_primitive_dyn::<i64, i8>(array, to_type, options, true), | ||
Int16 => cast_binary_to_primitive_dyn::<i64, i16>(array, to_type, options, true), | ||
Int32 => cast_binary_to_primitive_dyn::<i64, i32>(array, to_type, options, true), | ||
Int64 => cast_binary_to_primitive_dyn::<i64, i64>(array, to_type, options, true), | ||
Float32 => cast_binary_to_primitive_dyn::<i64, f32>(array, to_type, options, true), | ||
Float64 => cast_binary_to_primitive_dyn::<i64, f64>(array, to_type, options, true), | ||
Binary => { | ||
binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) | ||
.map(|x| x.boxed()) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't support `f16`, so we can drop this.