diff --git a/crates/polars-compute/src/cast/binary_to.rs b/crates/polars-compute/src/cast/binary_to.rs index 071bdd85456c..8b137891791f 100644 --- a/crates/polars-compute/src/cast/binary_to.rs +++ b/crates/polars-compute/src/cast/binary_to.rs @@ -1,245 +1 @@ -use std::sync::Arc; -use arrow::array::*; -use arrow::buffer::Buffer; -use arrow::datatypes::ArrowDataType; -use arrow::offset::{Offset, Offsets}; -use arrow::types::NativeType; -use polars_error::PolarsResult; - -use super::CastOptionsImpl; - -pub(super) trait Parse { - fn parse(val: &[u8]) -> Option - where - Self: Sized; -} - -macro_rules! impl_parse { - ($primitive_type:ident) => { - impl Parse for $primitive_type { - fn parse(val: &[u8]) -> Option { - atoi_simd::parse_skipped(val).ok() - } - } - }; -} -impl_parse!(i8); -impl_parse!(i16); -impl_parse!(i32); -impl_parse!(i64); -impl_parse!(u8); -impl_parse!(u16); -impl_parse!(u32); -impl_parse!(u64); - -impl Parse for f32 { - fn parse(val: &[u8]) -> Option - where - Self: Sized, - { - fast_float2::parse(val).ok() - } -} -impl Parse for f64 { - fn parse(val: &[u8]) -> Option - where - Self: Sized, - { - fast_float2::parse(val).ok() - } -} - -/// Conversion of binary -pub fn binary_to_large_binary( - from: &BinaryArray, - to_dtype: ArrowDataType, -) -> BinaryArray { - let values = from.values().clone(); - BinaryArray::::new( - to_dtype, - from.offsets().into(), - values, - from.validity().cloned(), - ) -} - -/// Conversion of binary -pub fn binary_large_to_binary( - from: &BinaryArray, - to_dtype: ArrowDataType, -) -> PolarsResult> { - let values = from.values().clone(); - let offsets = from.offsets().try_into()?; - Ok(BinaryArray::::new( - to_dtype, - offsets, - values, - from.validity().cloned(), - )) -} - -/// Conversion to utf8 -pub fn binary_to_utf8( - from: &BinaryArray, - to_dtype: ArrowDataType, -) -> PolarsResult> { - Utf8Array::::try_new( - to_dtype, - from.offsets().clone(), - from.values().clone(), - from.validity().cloned(), - ) -} - -/// Casts a 
[`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub(super) fn binary_to_primitive( - from: &BinaryArray, - to: &ArrowDataType, -) -> PrimitiveArray -where - T: NativeType + Parse, -{ - let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); - - PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) -} - -pub(super) fn binary_to_primitive_dyn( - from: &dyn Array, - to: &ArrowDataType, - options: CastOptionsImpl, -) -> PolarsResult> -where - T: NativeType + Parse, -{ - let from = from.as_any().downcast_ref().unwrap(); - if options.partial { - unimplemented!() - } else { - Ok(Box::new(binary_to_primitive::(from, to))) - } -} - -/// Cast [`BinaryArray`] to [`DictionaryArray`], also known as packing. -/// # Errors -/// This function errors if the maximum key is smaller than the number of distinct elements -/// in the array. -pub fn binary_to_dictionary( - from: &BinaryArray, -) -> PolarsResult> { - let mut array = MutableDictionaryArray::>::new(); - array.reserve(from.len()); - array.try_extend(from.iter())?; - - Ok(array.into()) -} - -pub(super) fn binary_to_dictionary_dyn( - from: &dyn Array, -) -> PolarsResult> { - let values = from.as_any().downcast_ref().unwrap(); - binary_to_dictionary::(values).map(|x| Box::new(x) as Box) -} - -fn fixed_size_to_offsets(values_len: usize, fixed_size: usize) -> Offsets { - let offsets = (0..(values_len + 1)) - .step_by(fixed_size) - .map(|v| O::from_as_usize(v)) - .collect(); - // SAFETY: - // * every element is `>= 0` - // * element at position `i` is >= than element at position `i-1`. - unsafe { Offsets::new_unchecked(offsets) } -} - -/// Conversion of `FixedSizeBinary` to `Binary`. 
-pub fn fixed_size_binary_binary( - from: &FixedSizeBinaryArray, - to_dtype: ArrowDataType, -) -> BinaryArray { - let values = from.values().clone(); - let offsets = fixed_size_to_offsets(values.len(), from.size()); - BinaryArray::::new(to_dtype, offsets.into(), values, from.validity().cloned()) -} - -pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray { - let datatype = <[u8] as ViewType>::DATA_TYPE; - - // Fast path: all the views are inlineable - if from.size() <= View::MAX_INLINE_SIZE as usize { - // @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that - // prevents it from properly SIMD-ing this loop. It insists on memcpying while it should - // know that the size is really small. Dispatching over the `from.size()` and making it - // constant does make loop SIMD, but it does not actually speed anything up and the code it - // generates is still horrible. - // - // This is really slow, and I don't think it has to be. 
- - // SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before - let mut views = Vec::new(); - View::extend_with_inlinable_strided( - &mut views, - from.values().as_slice(), - from.size() as u8, - ); - let views = Buffer::from(views); - return BinaryViewArray::try_new(datatype, views, Arc::default(), from.validity().cloned()) - .unwrap(); - } - - const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize; - - let size = from.size(); - let num_bytes = from.len() * size; - let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER); - assert!(num_buffers < u32::MAX as usize); - - let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size; - // This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division - let split_point = num_elements_per_buffer * size; - - // This is zero-copy for the buffer since split just increases the data since - let mut buffer = from.values().clone(); - let mut buffers = Vec::with_capacity(num_buffers); - - if let Some(num_buffers) = num_buffers.checked_sub(1) { - for _ in 0..num_buffers { - let slice; - (slice, buffer) = buffer.split_at(split_point); - buffers.push(slice); - } - buffers.push(buffer); - } - - let mut iter = from.values_iter(); - let iter = iter.by_ref(); - let mut views = Vec::with_capacity(from.len()); - for buffer_idx in 0..num_buffers { - views.extend( - iter.take(num_elements_per_buffer) - .enumerate() - .map(|(i, slice)| { - // SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before - unsafe { - View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32) - } - }), - ); - } - let views = views.into(); - - BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap() -} - -/// Conversion of binary -pub fn binary_to_list(from: &BinaryArray, to_dtype: ArrowDataType) -> ListArray { - let values = from.values().clone(); - let values = PrimitiveArray::new(ArrowDataType::UInt8, values, None); - ListArray::::new( - to_dtype, - from.offsets().clone(), - 
values.boxed(), - from.validity().cloned(), - ) -} diff --git a/crates/polars-compute/src/cast/binview_to.rs b/crates/polars-compute/src/cast/binview_to.rs index 30fa10e4fa5d..53ac3787e0d1 100644 --- a/crates/polars-compute/src/cast/binview_to.rs +++ b/crates/polars-compute/src/cast/binview_to.rs @@ -9,7 +9,14 @@ use polars_error::PolarsResult; use super::binary_to::Parse; use super::temporal::EPOCH_DAYS_FROM_CE; -use super::CastOptionsImpl; +use super::{Cast, CastOptionsImpl}; +use crate::array::*; +#[cfg(feature = "dtype-decimal")] +use crate::compute::decimal::deserialize_decimal; +use crate::datatypes::{ArrowDataType, TimeUnit}; +use crate::offset::Offset; +use crate::temporal_conversions::EPOCH_DAYS_FROM_CE; +use crate::types::NativeType; pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; @@ -61,8 +68,8 @@ pub fn utf8view_to_utf8(array: &Utf8ViewArray) -> Utf8Array { ) } } -/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub(super) fn binview_to_primitive( +/// Parses each value of a [`BinaryViewArray`] as a string into a [`PrimitiveArray`], making any unparsable value a Null. +pub(super) fn parse_binview_to_primitive( from: &BinaryViewArray, to: &ArrowDataType, ) -> PrimitiveArray @@ -74,7 +81,7 @@ where PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } -pub(super) fn binview_to_primitive_dyn( +pub(super) fn parse_binview_to_primitive_dyn( from: &dyn Array, to: &ArrowDataType, options: CastOptionsImpl, @@ -86,7 +93,51 @@ where if options.partial { unimplemented!() } else { - Ok(Box::new(binview_to_primitive::(from, to))) + Ok(Box::new(parse_binview_to_primitive::(from, to))) + } +} + +/// Casts the bytes of each value of a [`BinaryViewArray`] to a [`PrimitiveArray`] (little- or big-endian, per `is_little_endian`), making any uncastable value a Null. 
+pub(super) fn cast_binview_to_primitive( + from: &BinaryViewArray, + to: &ArrowDataType, + is_little_endian: bool, +) -> PrimitiveArray +where + T: NativeType + Cast, +{ + let iter = from.iter().map(|x| { + x.and_then::(|x| { + if is_little_endian { + T::cast_le(x) + } else { + T::cast_be(x) + } + }) + }); + + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +/// Casts a [`BinaryViewArray`] (passed as `&dyn Array`) to a boxed [`PrimitiveArray`], making any uncastable value a Null. +pub fn cast_binview_to_primitive_dyn( + from: &dyn Array, + to: &ArrowDataType, + options: CastOptionsImpl, + is_little_endian: bool, +) -> PolarsResult> +where + T: NativeType + Cast, +{ + let from = from.as_any().downcast_ref().unwrap(); + if options.partial { + unimplemented!() + } else { + Ok(Box::new(cast_binview_to_primitive::( + from, + to, + is_little_endian, + ))) } } diff --git a/crates/polars-compute/src/cast/mod.rs b/crates/polars-compute/src/cast/mod.rs index beec08504795..c40b753f8791 100644 --- a/crates/polars-compute/src/cast/mod.rs +++ b/crates/polars-compute/src/cast/mod.rs @@ -11,8 +11,8 @@ mod utf8_to; pub use binary_to::*; #[cfg(feature = "dtype-decimal")] pub use binview_to::binview_to_decimal; -use binview_to::binview_to_primitive_dyn; -pub use binview_to::utf8view_to_utf8; +use binview_to::parse_binview_to_primitive_dyn; +pub use binview_to::{cast_binview_to_primitive_dyn, utf8view_to_utf8}; pub use boolean_to::*; pub use decimal_to::*; pub mod temporal; @@ -338,6 +338,16 @@ pub fn cast( array.as_any().downcast_ref().unwrap(), ) .boxed()), + UInt8 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + UInt16 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + UInt32 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + UInt64 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + Int8 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + Int16 => cast_binview_to_primitive_dyn::(array, to_type, options, 
true), + Int32 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + Int64 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + Float32 => cast_binview_to_primitive_dyn::(array, to_type, options, true), + Float64 => cast_binview_to_primitive_dyn::(array, to_type, options, true), LargeList(inner) if matches!(inner.dtype, ArrowDataType::UInt8) => { let bin_array = view_to_binary::(array.as_any().downcast_ref().unwrap()); Ok(binary_to_list(&bin_array, to_type.clone()).boxed()) @@ -356,7 +366,6 @@ pub fn cast( (LargeList(lhs), List(rhs)) if lhs == rhs => { Ok(cast_large_to_list(array.as_any().downcast_ref().unwrap(), to_type).boxed()) }, - (_, List(to)) => { // cast primitive to list's primitive let values = cast(array, &to.dtype, options)?; @@ -394,16 +403,26 @@ pub fn cast( match to_type { BinaryView => Ok(arr.to_binview().boxed()), LargeUtf8 => Ok(binview_to::utf8view_to_utf8::(arr).boxed()), - UInt8 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - UInt16 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - UInt32 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - UInt64 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - Int8 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - Int16 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - Int32 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - Int64 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - Float32 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), - Float64 => binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), + UInt8 => parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), + UInt16 => { + parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options) + }, + UInt32 => { + parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options) + }, + UInt64 => { + 
parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options) + }, + Int8 => parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), + Int16 => parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), + Int32 => parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), + Int64 => parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options), + Float32 => { + parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options) + }, + Float64 => { + parse_binview_to_primitive_dyn::(&arr.to_binview(), to_type, options) + }, Timestamp(time_unit, None) => { utf8view_to_naive_timestamp_dyn(array, time_unit.to_owned()) }, @@ -508,16 +527,16 @@ pub fn cast( }, (LargeBinary, _) => match to_type { - UInt8 => binary_to_primitive_dyn::(array, to_type, options), - UInt16 => binary_to_primitive_dyn::(array, to_type, options), - UInt32 => binary_to_primitive_dyn::(array, to_type, options), - UInt64 => binary_to_primitive_dyn::(array, to_type, options), - Int8 => binary_to_primitive_dyn::(array, to_type, options), - Int16 => binary_to_primitive_dyn::(array, to_type, options), - Int32 => binary_to_primitive_dyn::(array, to_type, options), - Int64 => binary_to_primitive_dyn::(array, to_type, options), - Float32 => binary_to_primitive_dyn::(array, to_type, options), - Float64 => binary_to_primitive_dyn::(array, to_type, options), + UInt8 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + UInt16 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + UInt32 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + UInt64 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + Int8 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + Int16 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + Int32 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + Int64 => cast_binary_to_primitive_dyn::(array, to_type, options, true), + Float32 => 
cast_binary_to_primitive_dyn::(array, to_type, options, true), + Float64 => cast_binary_to_primitive_dyn::(array, to_type, options, true), Binary => { binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) .map(|x| x.boxed()) diff --git a/crates/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs index b20f1d9e3e5a..ecb2228f94a1 100644 --- a/crates/polars-ops/src/chunked_array/binary/namespace.rs +++ b/crates/polars-ops/src/chunked_array/binary/namespace.rs @@ -1,6 +1,8 @@ #[cfg(feature = "binary_encoding")] use std::borrow::Cow; +use arrow::compute::cast::{cast_binview_to_primitive_dyn, CastOptionsImpl}; +use arrow::with_match_primitive_type_full; #[cfg(feature = "binary_encoding")] use base64::engine::general_purpose; #[cfg(feature = "binary_encoding")] @@ -127,6 +129,37 @@ pub trait BinaryNameSpaceImpl: AsBinary { .unwrap() } } + + #[cfg(feature = "binary_encoding")] + #[allow(clippy::wrong_self_convention)] + fn from_buffer(&self, dtype: &DataType, is_little_endian: bool) -> PolarsResult { + let ca = self.as_binary(); + let arrow_type = dtype.to_arrow(CompatLevel::newest()); + + match arrow_type.to_physical_type() { + arrow::datatypes::PhysicalType::Primitive(ty) => { + with_match_primitive_type_full!(ty, |$T| { + unsafe { + Ok(Series::from_chunks_and_dtype_unchecked( + ca.name().clone(), + ca.chunks().iter().map(|chunk| { + cast_binview_to_primitive_dyn::<$T>( + &**chunk, + &arrow_type, + CastOptionsImpl::default(), + is_little_endian, + ) + }).collect::>>()?, + dtype + )) + } + }) + }, + _ => Err( + polars_err!(InvalidOperation:"unsupported data type in from_buffer. 
Only numerical types are allowed."), + ), + } + } } impl BinaryNameSpaceImpl for BinaryChunked {} diff --git a/crates/polars-plan/src/dsl/binary.rs b/crates/polars-plan/src/dsl/binary.rs index 9091b1777b65..659d498b4388 100644 --- a/crates/polars-plan/src/dsl/binary.rs +++ b/crates/polars-plan/src/dsl/binary.rs @@ -64,4 +64,13 @@ impl BinaryNameSpace { self.0 .map_private(FunctionExpr::BinaryExpr(BinaryFunction::Base64Encode)) } + + #[cfg(feature = "binary_encoding")] + pub fn from_buffer(self, to_type: DataType, is_little_endian: bool) -> Expr { + self.0 + .map_private(FunctionExpr::BinaryExpr(BinaryFunction::FromBuffer( + to_type, + is_little_endian, + ))) + } } diff --git a/crates/polars-plan/src/dsl/function_expr/binary.rs b/crates/polars-plan/src/dsl/function_expr/binary.rs index 88f3ad71b545..34f05a7839ca 100644 --- a/crates/polars-plan/src/dsl/function_expr/binary.rs +++ b/crates/polars-plan/src/dsl/function_expr/binary.rs @@ -18,6 +18,8 @@ pub enum BinaryFunction { Base64Decode(bool), #[cfg(feature = "binary_encoding")] Base64Encode, + #[cfg(feature = "binary_encoding")] + FromBuffer(DataType, bool), Size, } @@ -31,6 +33,8 @@ impl BinaryFunction { HexDecode(_) | Base64Decode(_) => mapper.with_same_dtype(), #[cfg(feature = "binary_encoding")] HexEncode | Base64Encode => mapper.with_dtype(DataType::String), + #[cfg(feature = "binary_encoding")] + FromBuffer(dtype, _) => mapper.with_dtype(dtype.clone()), Size => mapper.with_dtype(DataType::UInt32), } } @@ -51,6 +55,8 @@ impl Display for BinaryFunction { Base64Decode(_) => "base64_decode", #[cfg(feature = "binary_encoding")] Base64Encode => "base64_encode", + #[cfg(feature = "binary_encoding")] + FromBuffer(_, _) => "from_buffer", Size => "size_bytes", }; write!(f, "bin.{s}") @@ -78,6 +84,8 @@ impl From for SpecialEq> { Base64Decode(strict) => map!(base64_decode, strict), #[cfg(feature = "binary_encoding")] Base64Encode => map!(base64_encode), + #[cfg(feature = "binary_encoding")] + FromBuffer(dtype, 
is_little_endian) => map!(from_buffer, &dtype, is_little_endian), Size => map!(size_bytes), } } @@ -141,6 +149,17 @@ pub(super) fn base64_encode(s: &Column) -> PolarsResult { Ok(ca.base64_encode().into()) } +#[cfg(feature = "binary_encoding")] +pub(super) fn from_buffer( + s: &Column, + dtype: &DataType, + is_little_endian: bool, +) -> PolarsResult { + let ca = s.binary()?; + ca.from_buffer(dtype, is_little_endian) + .map(|val| val.into()) +} + impl From for FunctionExpr { fn from(b: BinaryFunction) -> Self { FunctionExpr::BinaryExpr(b) diff --git a/crates/polars-python/src/expr/binary.rs b/crates/polars-python/src/expr/binary.rs index 7833c450af2a..6a495af133c5 100644 --- a/crates/polars-python/src/expr/binary.rs +++ b/crates/polars-python/src/expr/binary.rs @@ -1,5 +1,7 @@ +use polars::prelude::DataType; use pyo3::prelude::*; +use crate::prelude::Wrap; use crate::PyExpr; #[pymethods] @@ -40,6 +42,24 @@ impl PyExpr { self.inner.clone().binary().base64_encode().into() } + #[cfg(feature = "binary_encoding")] + #[allow(clippy::wrong_self_convention)] + fn from_buffer(&self, dtype: Wrap, kind: &str) -> PyResult { + use pyo3::exceptions::PyValueError; + + let is_little_endian = match kind.to_lowercase().as_str() { + "le" | "little-endian" | "little" => true, + "be" | "big-endian" | "big" => false, + _ => return Err(PyValueError::new_err(format!("invalid kind: {kind}"))), + }; + Ok(self + .inner + .clone() + .binary() + .from_buffer(dtype.0, is_little_endian) + .into()) + } + fn bin_size_bytes(&self) -> Self { self.inner.clone().binary().size_bytes().into() } diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index 92b4109f620b..ea6284cda9c9 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -125,6 +125,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"] Roll: TypeAlias = Literal["raise", "forward", 
"backward"] SerializationFormat: TypeAlias = Literal["binary", "json"] +Endianness: TypeAlias = Literal["little", "big"] SizeUnit: TypeAlias = Literal[ "b", "kb", diff --git a/py-polars/polars/expr/binary.py b/py-polars/polars/expr/binary.py index 62cf43180fd3..96afd1f440fe 100644 --- a/py-polars/polars/expr/binary.py +++ b/py-polars/polars/expr/binary.py @@ -1,14 +1,21 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from polars._utils.parse import parse_into_expression from polars._utils.various import scale_bytes from polars._utils.wrap import wrap_expr +from polars.datatypes import parse_into_dtype if TYPE_CHECKING: from polars import Expr - from polars._typing import IntoExpr, SizeUnit, TransferEncoding + from polars._typing import ( + Endianness, + IntoExpr, + PolarsDataType, + SizeUnit, + TransferEncoding, + ) class ExprBinaryNameSpace: @@ -289,3 +296,43 @@ def size(self, unit: SizeUnit = "b") -> Expr: sz = wrap_expr(self._pyexpr.bin_size_bytes()) sz = scale_bytes(sz, unit) return sz + + def from_buffer( + self, dtype: PolarsDataType | type[Any], endianness: Endianness = "little" + ) -> Expr: + r""" + Interpret a buffer as a numerical polars type. + + Parameters + ---------- + dtype : PolarsDataType | type[Any] + Which type to cast binary column to + endianness : {"big", "little"}, optional + Which endianness to use when interpreting bytes, by default "little" + + Returns + ------- + Expr + Expression of data type `dtype`. + Note that if binary array is too short value will be null. + If binary array is too long, remainder will be ignored. + + Examples + -------- + >>> df = pl.DataFrame({"data": [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"]}) + >>> df.with_columns( # doctest: +IGNORE_RESULT + ... casted=pl.col("data").bin.from_buffer(pl.Int32, "little"), + ... 
) + shape: (2, 2) + ┌─────────────────────┬────────┐ + │ data ┆ casted │ + │ --- ┆ --- │ + │ binary ┆ i32 │ + ╞═════════════════════╪════════╡ + │ b"\x05\x00\x00\x00" ┆ 5 │ + │ b"\x10\x00\x01\x00" ┆ 65552 │ + └─────────────────────┴────────┘ + """ + dtype = parse_into_dtype(dtype) + + return wrap_expr(self._pyexpr.from_buffer(dtype, endianness)) diff --git a/py-polars/tests/unit/operations/namespaces/test_binary.py b/py-polars/tests/unit/operations/namespaces/test_binary.py index e15ca2010817..01a820cba96d 100644 --- a/py-polars/tests/unit/operations/namespaces/test_binary.py +++ b/py-polars/tests/unit/operations/namespaces/test_binary.py @@ -1,5 +1,7 @@ from __future__ import annotations +import random +import struct from typing import TYPE_CHECKING import pytest @@ -164,3 +166,65 @@ def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None: df["data"].bin.size(unit).item(), # series ): assert sz == expected + + +@pytest.mark.parametrize( + ("dtype", "type_size", "struct_type"), + [ + (pl.Int8, 1, "b"), + (pl.UInt8, 1, "B"), + (pl.Int16, 2, "h"), + (pl.UInt16, 2, "H"), + (pl.Int32, 4, "i"), + (pl.UInt32, 4, "I"), + (pl.Int64, 8, "q"), + (pl.UInt64, 8, "Q"), + (pl.Float32, 4, "f"), + (pl.Float64, 8, "d"), + ], +) +def test_from_buffer( + dtype: pl.DataType, + type_size: int, + struct_type: str, +) -> None: + # Make test reproducible + random.seed(42) + + byte_arr = [random.randbytes(type_size) for _ in range(3)] + df = pl.DataFrame({"x": byte_arr}) + + for endianness in ["little", "big"]: + # So that mypy doesn't complain + struct_endianness = "<" if endianness == "little" else ">" + expected = [ + struct.unpack_from(f"{struct_endianness}{struct_type}", elem_bytes)[0] + for elem_bytes in byte_arr + ] + expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype}) + + result = df.select(pl.col("x").bin.from_buffer(dtype, endianness)) # type: ignore[arg-type] + + assert_frame_equal(result, expected_df) + + +def test_from_buffer_invalid() -> 
None: + # Fails because buffer has more than 4 bytes + df = pl.DataFrame({"x": [b"d3d3a"]}) + print(struct.unpack_from("