From f1c74fe6a46548c1acf956e6556f76a4b95f582b Mon Sep 17 00:00:00 2001 From: ritchie Date: Thu, 16 Nov 2023 12:03:46 +0100 Subject: [PATCH 1/2] feat: replace (parts) of lexical with atoi_simd --- Cargo.lock | 18 +++----- Cargo.toml | 1 + crates/polars-arrow/src/compute/cast/mod.rs | 33 +++----------- .../polars-arrow/src/compute/cast/utf8_to.rs | 45 ------------------- crates/polars-arrow/src/util/lexical.rs | 42 ----------------- crates/polars-arrow/src/util/mod.rs | 19 -------- crates/polars-io/Cargo.toml | 8 ++-- crates/polars-io/src/csv/buffer.rs | 8 ++-- 8 files changed, 20 insertions(+), 154 deletions(-) delete mode 100644 crates/polars-arrow/src/util/lexical.rs diff --git a/Cargo.lock b/Cargo.lock index b2e7f28fcbaa..0e4ecc756548 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -259,6 +259,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atoi_simd" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc41b65e01b6851bdcd2d741824e6b310d571396bf3915e31e4792034ee65126" + [[package]] name = "autocfg" version = "1.1.0" @@ -1907,15 +1913,6 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "lexical" -version = "6.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7aefb36fd43fef7003334742cbf77b243fcd36418a1d1bdd480d613a67968f6" -dependencies = [ - "lexical-core", -] - [[package]] name = "lexical-core" version = "0.8.5" @@ -2744,6 +2741,7 @@ version = "0.34.2" dependencies = [ "ahash", "async-trait", + "atoi_simd", "bytes", "chrono", "chrono-tz", @@ -2752,8 +2750,6 @@ dependencies = [ "futures", "home", "itoa", - "lexical", - "lexical-core", "memchr", "memmap2", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index 80e8d8b3eef6..8cfc8d8fe127 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ hex = "0.4.3" indexmap = { version = "2", features = ["std"] } itoa = "1.0.6" lexical-core = "0.8.5" +atoi_simd = "0.15" memchr = "2.6" multiversion = "0.7" ndarray = { version = "0.15", default-features = false } diff --git a/crates/polars-arrow/src/compute/cast/mod.rs b/crates/polars-arrow/src/compute/cast/mod.rs index 690b59f7c049..1569e6bb1d95 100644 --- a/crates/polars-arrow/src/compute/cast/mod.rs +++ b/crates/polars-arrow/src/compute/cast/mod.rs @@ -570,42 +570,19 @@ pub fn cast( }, (Utf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type, options), - UInt16 => utf8_to_primitive_dyn::(array, to_type, options), - UInt32 => utf8_to_primitive_dyn::(array, to_type, options), - UInt64 => utf8_to_primitive_dyn::(array, to_type, options), - Int8 => utf8_to_primitive_dyn::(array, to_type, options), - Int16 => utf8_to_primitive_dyn::(array, to_type, options), - Int32 => utf8_to_primitive_dyn::(array, to_type, options), - Int64 => utf8_to_primitive_dyn::(array, to_type, options), - Float32 => utf8_to_primitive_dyn::(array, to_type, options), - Float64 => utf8_to_primitive_dyn::(array, to_type, options), - Date32 => utf8_to_date32_dyn::(array), - Date64 => utf8_to_date64_dyn::(array), LargeUtf8 => Ok(Box::new(utf8_to_large_utf8( array.as_any().downcast_ref().unwrap(), ))), - Timestamp(time_unit, None) => { - utf8_to_naive_timestamp_dyn::(array, time_unit.to_owned()) - }, - Timestamp(time_unit, Some(time_zone)) => { - utf8_to_timestamp_dyn::(array, time_zone.clone(), time_unit.to_owned()) - }, _ => polars_bail!(InvalidOperation: "casting from {from_type:?} to {to_type:?} not supported", ), }, (LargeUtf8, _) => match to_type { - UInt8 => utf8_to_primitive_dyn::(array, to_type, options), - UInt16 => utf8_to_primitive_dyn::(array, to_type, options), - UInt32 => utf8_to_primitive_dyn::(array, to_type, options), - UInt64 => utf8_to_primitive_dyn::(array, to_type, options), - Int8 => utf8_to_primitive_dyn::(array, to_type, options), - Int16 => utf8_to_primitive_dyn::(array, to_type, options), - Int32 => utf8_to_primitive_dyn::(array, to_type, options), - Int64 => utf8_to_primitive_dyn::(array, to_type, options), - Float32 => utf8_to_primitive_dyn::(array, to_type, options), - Float64 => utf8_to_primitive_dyn::(array, to_type, options), + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => { + let binary = + utf8_to_binary::(array.as_any().downcast_ref().unwrap(), to_type.clone()); + cast(&binary, to_type, options) + }, Date32 => utf8_to_date32_dyn::(array), Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()).map(|x| x.boxed()), diff --git a/crates/polars-arrow/src/compute/cast/utf8_to.rs b/crates/polars-arrow/src/compute/cast/utf8_to.rs index 4437b021b135..79e970e82280 100644 --- a/crates/polars-arrow/src/compute/cast/utf8_to.rs +++ b/crates/polars-arrow/src/compute/cast/utf8_to.rs @@ -1,7 +1,6 @@ use chrono::Datelike; use polars_error::PolarsResult; -use super::CastOptions; use crate::array::*; use crate::datatypes::{ArrowDataType, TimeUnit}; use crate::offset::Offset; @@ -9,53 +8,9 @@ use crate::temporal_conversions::{ utf8_to_naive_timestamp as utf8_to_naive_timestamp_, utf8_to_timestamp as utf8_to_timestamp_, EPOCH_DAYS_FROM_CE, }; -use crate::types::NativeType; const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; -/// Casts a [`Utf8Array`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub fn utf8_to_primitive(from: &Utf8Array, to: &ArrowDataType) -> PrimitiveArray -where - T: NativeType + lexical_core::FromLexical, -{ - let iter = from - .iter() - .map(|x| x.and_then::(|x| lexical_core::parse(x.as_bytes()).ok())); - - PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) -} - -/// Casts a [`Utf8Array`] to a [`PrimitiveArray`] at best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. -pub fn partial_utf8_to_primitive( - from: &Utf8Array, - to: &ArrowDataType, -) -> PrimitiveArray -where - T: NativeType + lexical_core::FromLexical, -{ - let iter = from.iter().map(|x| { - x.and_then::(|x| lexical_core::parse_partial(x.as_bytes()).ok().map(|x| x.0)) - }); - - PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) -} - -pub(super) fn utf8_to_primitive_dyn( - from: &dyn Array, - to: &ArrowDataType, - options: CastOptions, -) -> PolarsResult> -where - T: NativeType + lexical_core::FromLexical, -{ - let from = from.as_any().downcast_ref().unwrap(); - if options.partial { - Ok(Box::new(partial_utf8_to_primitive::(from, to))) - } else { - Ok(Box::new(utf8_to_primitive::(from, to))) - } -} - /// Casts a [`Utf8Array`] to a Date32 primitive, making any uncastable value a Null. pub fn utf8_to_date32(from: &Utf8Array) -> PrimitiveArray { let iter = from.iter().map(|x| { diff --git a/crates/polars-arrow/src/util/lexical.rs b/crates/polars-arrow/src/util/lexical.rs deleted file mode 100644 index 047986cbbedd..000000000000 --- a/crates/polars-arrow/src/util/lexical.rs +++ /dev/null @@ -1,42 +0,0 @@ -/// Converts numeric type to a `String` -#[inline] -pub fn lexical_to_bytes(n: N) -> Vec { - let mut buf = Vec::::with_capacity(N::FORMATTED_SIZE_DECIMAL); - lexical_to_bytes_mut(n, &mut buf); - buf -} - -/// Converts numeric type to a `String` -#[inline] -pub fn lexical_to_bytes_mut(n: N, buf: &mut Vec) { - buf.clear(); - buf.reserve(N::FORMATTED_SIZE_DECIMAL); - unsafe { - // JUSTIFICATION - // Benefit - // Allows using the faster serializer lexical core and convert to string - // Soundness - // Length of buf is set as written length afterwards. lexical_core - // creates a valid string, so doesn't need to be checked. - let slice = std::slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.capacity()); - - // Safety: - // Omits an unneeded bound check as we just ensured that we reserved `N::FORMATTED_SIZE_DECIMAL` - #[cfg(debug_assertions)] - { - let len = lexical_core::write(n, slice).len(); - buf.set_len(len); - } - #[cfg(not(debug_assertions))] - { - let len = lexical_core::write_unchecked(n, slice).len(); - buf.set_len(len); - } - } -} - -/// Converts numeric type to a `String` -#[inline] -pub fn lexical_to_string(n: N) -> String { - unsafe { String::from_utf8_unchecked(lexical_to_bytes(n)) } -} diff --git a/crates/polars-arrow/src/util/mod.rs b/crates/polars-arrow/src/util/mod.rs index 54bc05d2c762..d446d9d5afe3 100644 --- a/crates/polars-arrow/src/util/mod.rs +++ b/crates/polars-arrow/src/util/mod.rs @@ -1,24 +1,5 @@ //! Misc utilities used in different places in the crate. -#[cfg(any( - feature = "compute", - feature = "io_csv_write", - feature = "io_csv_read", - feature = "io_json", - feature = "io_json_write", - feature = "compute_cast" -))] -mod lexical; -#[cfg(any( - feature = "compute", - feature = "io_csv_write", - feature = "io_csv_read", - feature = "io_json", - feature = "io_json_write", - feature = "compute_cast" -))] -pub use lexical::*; - #[cfg(feature = "benchmarks")] #[cfg_attr(docsrs, doc(cfg(feature = "benchmarks")))] pub mod bench_util; diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index cdbaa6a9dea6..fbfd308119be 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -19,6 +19,7 @@ polars-utils = { workspace = true } ahash = { workspace = true } arrow = { workspace = true } async-trait = { version = "0.1.59", optional = true } +atoi_simd = { workspace = true, optional = true } bytes = { version = "1.3" } chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } @@ -26,8 +27,6 @@ fast-float = { version = "0.2", optional = true } flate2 = { version = "1", optional = true, default-features = false } futures = { workspace = true, optional = true } itoa = { workspace = true, optional = true } -lexical = { version = "6", optional = true, default-features = false, features = ["std", "parse-integers"] } -lexical-core = { workspace = true, optional = true } memchr = { workspace = true } memmap = { package = "memmap2", version = "0.7" } num-traits = { workspace = true } @@ -60,8 +59,7 @@ default = ["decompress"] json = [ "polars-json", "simd-json", - "lexical", - "lexical-core", + "atoi_simd", "serde_json", "dtype-struct", "csv", @@ -73,7 +71,7 @@ ipc = ["arrow/io_ipc", "arrow/io_ipc_compression"] ipc_streaming = ["arrow/io_ipc", "arrow/io_ipc_compression"] # support for arrow avro parsing avro = ["arrow/io_avro", "arrow/io_avro_compression"] -csv = ["lexical", "polars-core/rows", "itoa", "ryu", "fast-float", "simdutf8"] +csv = ["atoi_simd", "polars-core/rows", "itoa", "ryu", "fast-float", "simdutf8"] decompress = ["flate2/rust_backend", "zstd"] decompress-fast = ["flate2/zlib-ng", "zstd"] dtype-categorical = ["polars-core/dtype-categorical"] diff --git a/crates/polars-io/src/csv/buffer.rs b/crates/polars-io/src/csv/buffer.rs index 7e785866a671..d922fdd9508a 100644 --- a/crates/polars-io/src/csv/buffer.rs +++ b/crates/polars-io/src/csv/buffer.rs @@ -34,25 +34,25 @@ impl PrimitiveParser for Float64Type { impl PrimitiveParser for UInt32Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi_simd::parse(bytes).ok() } } impl PrimitiveParser for UInt64Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi_simd::parse(bytes).ok() } } impl PrimitiveParser for Int32Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi_simd::parse(bytes).ok() } } impl PrimitiveParser for Int64Type { #[inline] fn parse(bytes: &[u8]) -> Option { - lexical::parse(bytes).ok() + atoi_simd::parse(bytes).ok() } } From 5457e9b1988ad6ab7d6e1374d927c13d1f70addd Mon Sep 17 00:00:00 2001 From: ritchie Date: Thu, 16 Nov 2023 13:00:28 +0100 Subject: [PATCH 2/2] prune all lexical --- Cargo.lock | 7 +- Cargo.toml | 2 +- crates/polars-arrow/Cargo.toml | 7 +- .../src/compute/cast/binary_to.rs | 68 ++++++---- crates/polars-arrow/src/compute/cast/mod.rs | 6 +- .../src/compute/cast/primitive_to.rs | 121 ++++++++++++------ crates/polars-io/Cargo.toml | 2 +- py-polars/Cargo.toml | 2 +- py-polars/src/conversion.rs | 8 +- 9 files changed, 149 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e4ecc756548..ebf542459484 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2627,6 +2627,7 @@ dependencies = [ "arrow-schema", "async-stream", "atoi", + "atoi_simd", "avro-schema", "bytemuck", "chrono", @@ -2637,6 +2638,7 @@ dependencies = [ "dyn-clone", "either", "ethnum", + "fast-float", "flate2", "foreign_vec", "futures", @@ -2644,7 +2646,7 @@ dependencies = [ "hashbrown 0.14.2", "hex", "indexmap 2.1.0", - "lexical-core", + "itoa", "lz4", "multiversion", "num-traits", @@ -2655,6 +2657,7 @@ dependencies = [ "regex", "regex-syntax 0.8.2", "rustc_version", + "ryu", "sample-arrow2", "sample-std", "sample-test", @@ -3063,8 +3066,8 @@ dependencies = [ "built", "ciborium", "either", + "itoa", "jemallocator", - "lexical-core", "libc", "mimalloc", "ndarray", diff --git a/Cargo.toml b/Cargo.toml index 8cfc8d8fe127..8a20df23850a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,8 +45,8 @@ hashbrown = { version = "0.14", features = ["rayon", "ahash"] } hex = "0.4.3" indexmap = { version = "2", features = ["std"] } itoa = "1.0.6" -lexical-core = "0.8.5" atoi_simd = "0.15" +fast-float = { version = "0.2" } memchr = "2.6" multiversion = "0.7" ndarray = { version = "0.15", default-features = false } diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index 56cc496fccbc..356a4d7157c7 100644 --- a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -33,7 +33,10 @@ simdutf8 = { workspace = true } ethnum = { workspace = true } # To efficiently cast numbers to strings -lexical-core = { workspace = true, optional = true } +atoi_simd = { workspace = true, optional = true } +fast-float = { workspace = true, optional = true } +itoa = { workspace = true, optional = true } +ryu = { workspace = true, optional = true } regex = { workspace = true, optional = true } regex-syntax = { version = "0.8", optional = true } @@ -134,7 +137,7 @@ compute_arithmetics = ["strength_reduce", "compute_arithmetics_decimal"] compute_bitwise = [] compute_boolean = [] compute_boolean_kleene = [] -compute_cast = ["lexical-core", "compute_take"] +compute_cast = ["compute_take", "ryu", "atoi_simd", "itoa", "fast-float"] compute_comparison = ["compute_take", "compute_boolean"] compute_concatenate = [] compute_filter = [] diff --git a/crates/polars-arrow/src/compute/cast/binary_to.rs b/crates/polars-arrow/src/compute/cast/binary_to.rs index 2f6ab05f51ab..da6791fd80b0 100644 --- a/crates/polars-arrow/src/compute/cast/binary_to.rs +++ b/crates/polars-arrow/src/compute/cast/binary_to.rs @@ -6,6 +6,47 @@ use crate::datatypes::ArrowDataType; use crate::offset::{Offset, Offsets}; use crate::types::NativeType; +pub(super) trait Parse { + fn parse(val: &[u8]) -> Option + where + Self: Sized; +} + +macro_rules! impl_parse { + ($primitive_type:ident) => { + impl Parse for $primitive_type { + fn parse(val: &[u8]) -> Option { + atoi_simd::parse(val).ok() + } + } + }; +} +impl_parse!(i8); +impl_parse!(i16); +impl_parse!(i32); +impl_parse!(i64); +impl_parse!(u8); +impl_parse!(u16); +impl_parse!(u32); +impl_parse!(u64); + +impl Parse for f32 { + fn parse(val: &[u8]) -> Option + where + Self: Sized, + { + fast_float::parse(val).ok() + } +} +impl Parse for f64 { + fn parse(val: &[u8]) -> Option + where + Self: Sized, + { + fast_float::parse(val).ok() + } +} + /// Conversion of binary pub fn binary_to_large_binary( from: &BinaryArray, @@ -61,32 +102,15 @@ pub fn binary_to_large_utf8( Utf8Array::::try_new(to_data_type, offsets, values, from.validity().cloned()) } -/// Casts a [`BinaryArray`] to a [`PrimitiveArray`] at best-effort using `lexical_core::parse_partial`, making any uncastable value as zero. -pub fn partial_binary_to_primitive( - from: &BinaryArray, - to: &ArrowDataType, -) -> PrimitiveArray -where - T: NativeType + lexical_core::FromLexical, -{ - let iter = from - .iter() - .map(|x| x.and_then::(|x| lexical_core::parse_partial(x).ok().map(|x| x.0))); - - PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) -} - /// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. -pub fn binary_to_primitive( +pub(super) fn binary_to_primitive( from: &BinaryArray, to: &ArrowDataType, ) -> PrimitiveArray where - T: NativeType + lexical_core::FromLexical, + T: NativeType + Parse, { - let iter = from - .iter() - .map(|x| x.and_then::(|x| lexical_core::parse(x).ok())); + let iter = from.iter().map(|x| x.and_then::(|x| T::parse(x))); PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) } @@ -97,11 +121,11 @@ pub(super) fn binary_to_primitive_dyn( options: CastOptions, ) -> PolarsResult> where - T: NativeType + lexical_core::FromLexical, + T: NativeType + Parse, { let from = from.as_any().downcast_ref().unwrap(); if options.partial { - Ok(Box::new(partial_binary_to_primitive::(from, to))) + unimplemented!() } else { Ok(Box::new(binary_to_primitive::(from, to))) } diff --git a/crates/polars-arrow/src/compute/cast/mod.rs b/crates/polars-arrow/src/compute/cast/mod.rs index 1569e6bb1d95..0944b4a504de 100644 --- a/crates/polars-arrow/src/compute/cast/mod.rs +++ b/crates/polars-arrow/src/compute/cast/mod.rs @@ -579,8 +579,10 @@ pub fn cast( }, (LargeUtf8, _) => match to_type { UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => { - let binary = - utf8_to_binary::(array.as_any().downcast_ref().unwrap(), to_type.clone()); + let binary = utf8_to_binary::( + array.as_any().downcast_ref().unwrap(), + ArrowDataType::LargeBinary, + ); cast(&binary, to_type, options) }, Date32 => utf8_to_date32_dyn::(array), diff --git a/crates/polars-arrow/src/compute/cast/primitive_to.rs b/crates/polars-arrow/src/compute/cast/primitive_to.rs index a1fab69ae0de..3db6cfa142f7 100644 --- a/crates/polars-arrow/src/compute/cast/primitive_to.rs +++ b/crates/polars-arrow/src/compute/cast/primitive_to.rs @@ -12,10 +12,64 @@ use crate::offset::{Offset, Offsets}; use crate::temporal_conversions::*; use crate::types::{days_ms, f16, months_days_ns, NativeType}; -/// Returns a [`BinaryArray`] where every element is the binary representation of the number. -pub fn primitive_to_binary( +pub(super) trait SerPrimitive { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized; +} + +macro_rules! impl_ser_primitive { + ($ptype:ident) => { + impl SerPrimitive for $ptype { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = itoa::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } + } + }; +} + +impl_ser_primitive!(i8); +impl_ser_primitive!(i16); +impl_ser_primitive!(i32); +impl_ser_primitive!(i64); +impl_ser_primitive!(u8); +impl_ser_primitive!(u16); +impl_ser_primitive!(u32); +impl_ser_primitive!(u64); + +impl SerPrimitive for f32 { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } +} + +impl SerPrimitive for f64 { + fn write(f: &mut Vec, val: Self) -> usize + where + Self: Sized, + { + let mut buffer = ryu::Buffer::new(); + let value = buffer.format(val); + f.extend_from_slice(value.as_bytes()); + value.len() + } +} + +fn primitive_to_values_and_offsets( from: &PrimitiveArray, -) -> BinaryArray { +) -> (Vec, Offsets) { let mut values: Vec = Vec::with_capacity(from.len()); let mut offsets: Vec = Vec::with_capacity(from.len() + 1); offsets.push(O::default()); @@ -23,14 +77,8 @@ pub fn primitive_to_binary( let mut offset: usize = 0; unsafe { - for x in from.values().iter() { - values.reserve(offset + T::FORMATTED_SIZE_DECIMAL); - - let bytes = std::slice::from_raw_parts_mut( - values.as_mut_ptr().add(offset), - values.capacity() - offset, - ); - let len = lexical_core::write_unchecked(*x, bytes).len(); + for &x in from.values().iter() { + let len = T::write(&mut values, x); offset += len; offsets.push(O::from_as_usize(offset)); @@ -39,19 +87,29 @@ pub fn primitive_to_binary( values.shrink_to_fit(); // Safety: offsets _are_ monotonically increasing let offsets = unsafe { Offsets::new_unchecked(offsets) }; - BinaryArray::::new( - BinaryArray::::default_data_type(), - offsets.into(), - values.into(), - from.validity().cloned(), - ) + + (values, offsets) } } +/// Returns a [`BinaryArray`] where every element is the binary representation of the number. +pub(super) fn primitive_to_binary( + from: &PrimitiveArray, +) -> BinaryArray { + let (values, offsets) = primitive_to_values_and_offsets(from); + + BinaryArray::::new( + BinaryArray::::default_data_type(), + offsets.into(), + values.into(), + from.validity().cloned(), + ) +} + pub(super) fn primitive_to_binary_dyn(from: &dyn Array) -> PolarsResult> where O: Offset, - T: NativeType + lexical_core::ToLexical, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(primitive_to_binary::(from))) @@ -81,32 +139,11 @@ where } /// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. -pub fn primitive_to_utf8( +pub(super) fn primitive_to_utf8( from: &PrimitiveArray, ) -> Utf8Array { - let mut values: Vec = Vec::with_capacity(from.len()); - let mut offsets: Vec = Vec::with_capacity(from.len() + 1); - offsets.push(O::default()); - - let mut offset: usize = 0; - + let (values, offsets) = primitive_to_values_and_offsets(from); unsafe { - for x in from.values().iter() { - values.reserve(offset + T::FORMATTED_SIZE_DECIMAL); - - let bytes = std::slice::from_raw_parts_mut( - values.as_mut_ptr().add(offset), - values.capacity() - offset, - ); - let len = lexical_core::write_unchecked(*x, bytes).len(); - - offset += len; - offsets.push(O::from_as_usize(offset)); - } - values.set_len(offset); - values.shrink_to_fit(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }; Utf8Array::::new_unchecked( Utf8Array::::default_data_type(), offsets.into(), @@ -119,7 +156,7 @@ pub fn primitive_to_utf8( pub(super) fn primitive_to_utf8_dyn(from: &dyn Array) -> PolarsResult> where O: Offset, - T: NativeType + lexical_core::ToLexical, + T: NativeType + SerPrimitive, { let from = from.as_any().downcast_ref().unwrap(); Ok(Box::new(primitive_to_utf8::(from))) diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index fbfd308119be..b174d4f73dd1 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -23,7 +23,7 @@ atoi_simd = { workspace = true, optional = true } bytes = { version = "1.3" } chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } -fast-float = { version = "0.2", optional = true } +fast-float = { workspace = true, optional = true } flate2 = { version = "1", optional = true, default-features = false } futures = { workspace = true, optional = true } itoa = { workspace = true, optional = true } diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index b31d26438d2b..61bad167f34c 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -20,7 +20,7 @@ polars-utils = { workspace = true } ahash = { workspace = true } ciborium = { workspace = true } either = { workspace = true } -lexical-core = { workspace = true } +itoa = { workspace = true } libc = "0.2" ndarray = { workspace = true } numpy = { version = "0.20", default-features = false } diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index ecb3cbf83582..0b6b4ab00e54 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -191,7 +191,13 @@ fn decimal_to_digits(v: i128, buf: &mut [u128; 3]) -> usize { // safety: transmute is safe as there are 48 bytes in 3 128bit ints // and the minimal alignment of u8 fits u16 let buf = unsafe { std::mem::transmute::<&mut [u128; 3], &mut [u8; 48]>(buf) }; - let len = lexical_core::write(v, buf).len(); + let mut buffer = itoa::Buffer::new(); + let value = buffer.format(v); + let len = value.len(); + for (dst, src) in buf.iter_mut().zip(value.as_bytes().iter()) { + *dst = *src + } + let ptr = buf.as_mut_ptr() as *mut i128; unsafe { // this is safe because we know that the buffer is exactly 48 bytes long