Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add binary as numerical #19542

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 98 additions & 4 deletions crates/polars-arrow/src/compute/cast/binary_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ use crate::array::*;
use crate::buffer::Buffer;
use crate::datatypes::ArrowDataType;
use crate::offset::{Offset, Offsets};
use crate::types::NativeType;
use crate::types::{f16, NativeType};

/// Trait for parsing a data type from string
pub(super) trait Parse {
fn parse(val: &[u8]) -> Option<Self>
where
Expand Down Expand Up @@ -50,6 +51,52 @@ impl Parse for f64 {
}
}

/// Conversion of a raw byte slice into a single primitive value.
///
/// Both constructors return `None` when `val` cannot be turned into `Self`,
/// which lets callers map unconvertible values to nulls.
pub trait Cast {
    /// Builds `Self` from `val`, reading the bytes in little-endian order.
    fn cast_le(val: &[u8]) -> Option<Self>
    where
        Self: Sized;

    /// Builds `Self` from `val`, reading the bytes in big-endian order.
    fn cast_be(val: &[u8]) -> Option<Self>
    where
        Self: Sized;
}
macro_rules! impl_cast {
($primitive_type:ident) => {
impl Cast for $primitive_type {
fn cast_le(val: &[u8]) -> Option<Self> {
Some($primitive_type::from_le_bytes(val.try_into().ok()?))
}

fn cast_be(val: &[u8]) -> Option<Self> {
Some($primitive_type::from_be_bytes(val.try_into().ok()?))
}
}
};
}

impl_cast!(i8);
impl_cast!(i16);
impl_cast!(i32);
impl_cast!(i64);
impl_cast!(i128);
impl_cast!(u8);
impl_cast!(u16);
impl_cast!(u32);
impl_cast!(u64);
impl_cast!(u128);
impl_cast!(f32);
impl_cast!(f64);

impl Cast for f16 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't support f16, so we can drop this.

/// Reads `val` as a little-endian `u16` and reinterprets it as the bit
/// pattern of an `f16`; `None` when the `u16` conversion fails.
fn cast_le(val: &[u8]) -> Option<Self> {
    u16::cast_le(val).map(f16::from_bits)
}

/// Reads `val` as a big-endian `u16` and reinterprets it as the bit
/// pattern of an `f16`; `None` when the `u16` conversion fails.
fn cast_be(val: &[u8]) -> Option<Self> {
    u16::cast_be(val).map(f16::from_bits)
}
}

/// Conversion of binary
pub fn binary_to_large_binary(
from: &BinaryArray<i32>,
Expand Down Expand Up @@ -93,7 +140,8 @@ pub fn binary_to_utf8<O: Offset>(
}

/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
pub(super) fn binary_to_primitive<O: Offset, T>(
#[allow(dead_code)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that this isn't casting anymore, and we shouldn't dispatch to this via cast, I think we should move all of this much higher in the polars dependency chain.

This should instead be implemented in polars-ops. Only the expressions should call into this; lower down, it isn't something we should compile.

pub(super) fn parse_binary_to_primitive<O: Offset, T>(
from: &BinaryArray<O>,
to: &ArrowDataType,
) -> PrimitiveArray<T>
Expand All @@ -105,7 +153,8 @@ where
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
}

pub(super) fn binary_to_primitive_dyn<O: Offset, T>(
#[allow(dead_code)]
pub(super) fn parse_binary_to_primitive_dyn<O: Offset, T>(
from: &dyn Array,
to: &ArrowDataType,
options: CastOptionsImpl,
Expand All @@ -117,10 +166,55 @@ where
if options.partial {
unimplemented!()
} else {
Ok(Box::new(binary_to_primitive::<O, T>(from, to)))
Ok(Box::new(parse_binary_to_primitive::<O, T>(from, to)))
}
}

/// Reinterprets each value of a [`BinaryArray`] as the raw bytes of a `T` and collects
/// the results into a [`PrimitiveArray`] tagged with the `to` data type.
///
/// Null inputs stay null, and any value rejected by [`Cast`] becomes null as well.
/// `is_little_endian` selects [`Cast::cast_le`] when `true` and [`Cast::cast_be`] otherwise.
///
/// # Panics
/// Panics via `unimplemented!` when `options.partial` is set.
pub(super) fn cast_binary_to_primitive<O: Offset, T>(
    from: &BinaryArray<O>,
    to: &ArrowDataType,
    options: CastOptionsImpl,
    is_little_endian: bool,
) -> PrimitiveArray<T>
where
    T: NativeType + Cast,
{
    if options.partial {
        unimplemented!()
    }

    let values = from.iter().map(|slot| match (slot, is_little_endian) {
        (Some(raw), true) => T::cast_le(raw),
        (Some(raw), false) => T::cast_be(raw),
        (None, _) => None,
    });

    PrimitiveArray::<T>::from_trusted_len_iter(values).to(to.clone())
}

pub(super) fn cast_binary_to_primitive_dyn<O: Offset, T>(
from: &dyn Array,
to: &ArrowDataType,
options: CastOptionsImpl,
is_little_endian: bool,
) -> PolarsResult<Box<dyn Array>>
where
T: NativeType + Cast,
{
let from = from.as_any().downcast_ref().unwrap();
Ok(Box::new(cast_binary_to_primitive::<O, T>(
from,
to,
options,
is_little_endian,
)))
}

/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing.
/// # Errors
/// This function errors if the maximum key is smaller than the number of distinct elements
Expand Down
53 changes: 49 additions & 4 deletions crates/polars-arrow/src/compute/cast/binview_to.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use chrono::Datelike;
use polars_error::PolarsResult;

use super::Cast;
use crate::array::*;
use crate::compute::cast::binary_to::Parse;
use crate::compute::cast::CastOptionsImpl;
Expand Down Expand Up @@ -61,8 +62,8 @@ pub fn utf8view_to_utf8<O: Offset>(array: &Utf8ViewArray) -> Utf8Array<O> {
)
}
}
/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
pub(super) fn binview_to_primitive<T>(
/// Parses each value of a [`BinaryViewArray`] as text into a [`PrimitiveArray`], making any unparsable value a Null.
pub(super) fn parse_binview_to_primitive<T>(
from: &BinaryViewArray,
to: &ArrowDataType,
) -> PrimitiveArray<T>
Expand All @@ -74,7 +75,7 @@ where
PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
}

pub(super) fn binview_to_primitive_dyn<T>(
pub(super) fn parse_binview_to_primitive_dyn<T>(
from: &dyn Array,
to: &ArrowDataType,
options: CastOptionsImpl,
Expand All @@ -86,7 +87,51 @@ where
if options.partial {
unimplemented!()
} else {
Ok(Box::new(binview_to_primitive::<T>(from, to)))
Ok(Box::new(parse_binview_to_primitive::<T>(from, to)))
}
}

/// Reinterprets each value of a [`BinaryViewArray`] as the raw bytes of a `T` and collects
/// the results into a [`PrimitiveArray`] tagged with the `to` data type.
///
/// Null inputs stay null, and any value rejected by [`Cast`] becomes null as well.
/// `is_little_endian` selects [`Cast::cast_le`] when `true` and [`Cast::cast_be`] otherwise.
pub(super) fn cast_binview_to_primitive<T>(
    from: &BinaryViewArray,
    to: &ArrowDataType,
    is_little_endian: bool,
) -> PrimitiveArray<T>
where
    T: NativeType + Cast,
{
    let values = from.iter().map(|slot| match (slot, is_little_endian) {
        (Some(raw), true) => T::cast_le(raw),
        (Some(raw), false) => T::cast_be(raw),
        (None, _) => None,
    });

    PrimitiveArray::<T>::from_trusted_len_iter(values).to(to.clone())
}

/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
pub fn cast_binview_to_primitive_dyn<T>(
from: &dyn Array,
to: &ArrowDataType,
options: CastOptionsImpl,
is_little_endian: bool,
) -> PolarsResult<Box<dyn Array>>
where
T: NativeType + Cast,
{
let from = from.as_any().downcast_ref().unwrap();
if options.partial {
unimplemented!()
} else {
Ok(Box::new(cast_binview_to_primitive::<T>(
from,
to,
is_little_endian,
)))
}
}

Expand Down
65 changes: 42 additions & 23 deletions crates/polars-arrow/src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ mod utf8_to;
pub use binary_to::*;
#[cfg(feature = "dtype-decimal")]
pub use binview_to::binview_to_decimal;
use binview_to::binview_to_primitive_dyn;
pub use binview_to::utf8view_to_utf8;
use binview_to::parse_binview_to_primitive_dyn;
pub use binview_to::{cast_binview_to_primitive_dyn, utf8view_to_utf8};
pub use boolean_to::*;
pub use decimal_to::*;
use dictionary_to::*;
Expand Down Expand Up @@ -338,6 +338,16 @@ pub fn cast(
array.as_any().downcast_ref().unwrap(),
)
.boxed()),
UInt8 => cast_binview_to_primitive_dyn::<u8>(array, to_type, options, true),
UInt16 => cast_binview_to_primitive_dyn::<u16>(array, to_type, options, true),
UInt32 => cast_binview_to_primitive_dyn::<u32>(array, to_type, options, true),
UInt64 => cast_binview_to_primitive_dyn::<u64>(array, to_type, options, true),
Int8 => cast_binview_to_primitive_dyn::<i8>(array, to_type, options, true),
Int16 => cast_binview_to_primitive_dyn::<i16>(array, to_type, options, true),
Int32 => cast_binview_to_primitive_dyn::<i32>(array, to_type, options, true),
Int64 => cast_binview_to_primitive_dyn::<i64>(array, to_type, options, true),
Float32 => cast_binview_to_primitive_dyn::<f32>(array, to_type, options, true),
Float64 => cast_binview_to_primitive_dyn::<f64>(array, to_type, options, true),
LargeList(inner) if matches!(inner.dtype, ArrowDataType::UInt8) => {
let bin_array = view_to_binary::<i64>(array.as_any().downcast_ref().unwrap());
Ok(binary_to_list(&bin_array, to_type.clone()).boxed())
Expand All @@ -356,7 +366,6 @@ pub fn cast(
(LargeList(lhs), List(rhs)) if lhs == rhs => {
Ok(cast_large_to_list(array.as_any().downcast_ref().unwrap(), to_type).boxed())
},

(_, List(to)) => {
// cast primitive to list's primitive
let values = cast(array, &to.dtype, options)?;
Expand Down Expand Up @@ -394,16 +403,26 @@ pub fn cast(
match to_type {
BinaryView => Ok(arr.to_binview().boxed()),
LargeUtf8 => Ok(binview_to::utf8view_to_utf8::<i64>(arr).boxed()),
UInt8 => binview_to_primitive_dyn::<u8>(&arr.to_binview(), to_type, options),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And then we can remove all this from the cast kernels as well.

UInt16 => binview_to_primitive_dyn::<u16>(&arr.to_binview(), to_type, options),
UInt32 => binview_to_primitive_dyn::<u32>(&arr.to_binview(), to_type, options),
UInt64 => binview_to_primitive_dyn::<u64>(&arr.to_binview(), to_type, options),
Int8 => binview_to_primitive_dyn::<i8>(&arr.to_binview(), to_type, options),
Int16 => binview_to_primitive_dyn::<i16>(&arr.to_binview(), to_type, options),
Int32 => binview_to_primitive_dyn::<i32>(&arr.to_binview(), to_type, options),
Int64 => binview_to_primitive_dyn::<i64>(&arr.to_binview(), to_type, options),
Float32 => binview_to_primitive_dyn::<f32>(&arr.to_binview(), to_type, options),
Float64 => binview_to_primitive_dyn::<f64>(&arr.to_binview(), to_type, options),
UInt8 => parse_binview_to_primitive_dyn::<u8>(&arr.to_binview(), to_type, options),
UInt16 => {
parse_binview_to_primitive_dyn::<u16>(&arr.to_binview(), to_type, options)
},
UInt32 => {
parse_binview_to_primitive_dyn::<u32>(&arr.to_binview(), to_type, options)
},
UInt64 => {
parse_binview_to_primitive_dyn::<u64>(&arr.to_binview(), to_type, options)
},
Int8 => parse_binview_to_primitive_dyn::<i8>(&arr.to_binview(), to_type, options),
Int16 => parse_binview_to_primitive_dyn::<i16>(&arr.to_binview(), to_type, options),
Int32 => parse_binview_to_primitive_dyn::<i32>(&arr.to_binview(), to_type, options),
Int64 => parse_binview_to_primitive_dyn::<i64>(&arr.to_binview(), to_type, options),
Float32 => {
parse_binview_to_primitive_dyn::<f32>(&arr.to_binview(), to_type, options)
},
Float64 => {
parse_binview_to_primitive_dyn::<f64>(&arr.to_binview(), to_type, options)
},
Timestamp(time_unit, None) => {
utf8view_to_naive_timestamp_dyn(array, time_unit.to_owned())
},
Expand Down Expand Up @@ -508,16 +527,16 @@ pub fn cast(
},

(LargeBinary, _) => match to_type {
UInt8 => binary_to_primitive_dyn::<i64, u8>(array, to_type, options),
UInt16 => binary_to_primitive_dyn::<i64, u16>(array, to_type, options),
UInt32 => binary_to_primitive_dyn::<i64, u32>(array, to_type, options),
UInt64 => binary_to_primitive_dyn::<i64, u64>(array, to_type, options),
Int8 => binary_to_primitive_dyn::<i64, i8>(array, to_type, options),
Int16 => binary_to_primitive_dyn::<i64, i16>(array, to_type, options),
Int32 => binary_to_primitive_dyn::<i64, i32>(array, to_type, options),
Int64 => binary_to_primitive_dyn::<i64, i64>(array, to_type, options),
Float32 => binary_to_primitive_dyn::<i64, f32>(array, to_type, options),
Float64 => binary_to_primitive_dyn::<i64, f64>(array, to_type, options),
UInt8 => cast_binary_to_primitive_dyn::<i64, u8>(array, to_type, options, true),
UInt16 => cast_binary_to_primitive_dyn::<i64, u16>(array, to_type, options, true),
UInt32 => cast_binary_to_primitive_dyn::<i64, u32>(array, to_type, options, true),
UInt64 => cast_binary_to_primitive_dyn::<i64, u64>(array, to_type, options, true),
Int8 => cast_binary_to_primitive_dyn::<i64, i8>(array, to_type, options, true),
Int16 => cast_binary_to_primitive_dyn::<i64, i16>(array, to_type, options, true),
Int32 => cast_binary_to_primitive_dyn::<i64, i32>(array, to_type, options, true),
Int64 => cast_binary_to_primitive_dyn::<i64, i64>(array, to_type, options, true),
Float32 => cast_binary_to_primitive_dyn::<i64, f32>(array, to_type, options, true),
Float64 => cast_binary_to_primitive_dyn::<i64, f64>(array, to_type, options, true),
Binary => {
binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone())
.map(|x| x.boxed())
Expand Down
33 changes: 33 additions & 0 deletions crates/polars-ops/src/chunked_array/binary/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#[cfg(feature = "binary_encoding")]
use std::borrow::Cow;

use arrow::compute::cast::{cast_binview_to_primitive_dyn, CastOptionsImpl};
use arrow::with_match_primitive_type_full;
#[cfg(feature = "binary_encoding")]
use base64::engine::general_purpose;
#[cfg(feature = "binary_encoding")]
Expand Down Expand Up @@ -131,6 +133,37 @@ pub trait BinaryNameSpaceImpl: AsBinary {
.unwrap()
}
}

/// Reinterprets every binary value as the raw bytes of a `dtype` number and returns the
/// result as a [`Series`] that keeps this array's name.
///
/// `is_little_endian` is forwarded to `cast_binview_to_primitive_dyn` to select the
/// byte order.
///
/// # Errors
/// Returns an `InvalidOperation` error when the arrow physical type of `dtype` is not
/// primitive, and propagates any error returned while converting a chunk.
#[cfg(feature = "binary_encoding")]
// `from_*` names are normally reserved for constructors without `self`; this one converts
// the array it is called on, hence the lint override.
#[allow(clippy::wrong_self_convention)]
fn from_buffer(&self, dtype: &DataType, is_little_endian: bool) -> PolarsResult<Series> {
    let ca = self.as_binary();
    let arrow_type = dtype.to_arrow(CompatLevel::newest());

    let arrow::datatypes::PhysicalType::Primitive(ty) = arrow_type.to_physical_type() else {
        return Err(
            polars_err!(InvalidOperation:"unsupported data type in from_buffer. Only numerical types are allowed."),
        );
    };

    with_match_primitive_type_full!(ty, |$T| {
        let chunks = ca
            .chunks()
            .iter()
            .map(|chunk| {
                cast_binview_to_primitive_dyn::<$T>(
                    &**chunk,
                    &arrow_type,
                    CastOptionsImpl::default(),
                    is_little_endian,
                )
            })
            .collect::<PolarsResult<Vec<_>>>()?;

        // SAFETY: every chunk was produced for `arrow_type`, which is derived from `dtype`
        // above; presumably that is what `from_chunks_and_dtype_unchecked` requires.
        // TODO confirm against its documented contract.
        Ok(unsafe {
            Series::from_chunks_and_dtype_unchecked(ca.name().clone(), chunks, dtype)
        })
    })
}
}

impl BinaryNameSpaceImpl for BinaryChunked {}
9 changes: 9 additions & 0 deletions crates/polars-plan/src/dsl/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,13 @@ impl BinaryNameSpace {
self.0
.map_private(FunctionExpr::BinaryExpr(BinaryFunction::Base64Encode))
}

/// Builds an expression that applies `BinaryFunction::FromBuffer` with the requested
/// target `to_type` and byte order (`is_little_endian`) to this binary expression.
#[cfg(feature = "binary_encoding")]
pub fn from_buffer(self, to_type: DataType, is_little_endian: bool) -> Expr {
    let function = BinaryFunction::FromBuffer(to_type, is_little_endian);
    self.0.map_private(FunctionExpr::BinaryExpr(function))
}
}
Loading
Loading