From 393beabb586f5d35103fd967f769c92aba2c2929 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 24 Sep 2024 16:22:33 +0200 Subject: [PATCH 1/3] refactor: Preserve scalar in min/max_horizontal --- .../src/chunked_array/ops/min_max_binary.rs | 48 ++++++++++++------- crates/polars-core/src/frame/mod.rs | 34 +++++-------- crates/polars-python/src/dataframe/general.rs | 4 +- 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs index bc33f088b1f9..28e7c491095b 100644 --- a/crates/polars-core/src/chunked_array/ops/min_max_binary.rs +++ b/crates/polars-core/src/chunked_array/ops/min_max_binary.rs @@ -31,31 +31,45 @@ where arity::binary_elementwise_values(left, right, op) } -pub(crate) fn min_max_binary_series( - left: &Series, - right: &Series, +pub(crate) fn min_max_binary_columns( + left: &Column, + right: &Column, min: bool, -) -> PolarsResult { +) -> PolarsResult { if left.dtype().to_physical().is_numeric() && left.null_count() == 0 && right.null_count() == 0 && left.len() == right.len() { - let (lhs, rhs) = coerce_lhs_rhs(left, right)?; - let logical = lhs.dtype(); - let lhs = lhs.to_physical_repr(); - let rhs = rhs.to_physical_repr(); + match (left, right) { + (Column::Series(left), Column::Series(right)) => { + let (lhs, rhs) = coerce_lhs_rhs(left, right)?; + let logical = lhs.dtype(); + let lhs = lhs.to_physical_repr(); + let rhs = rhs.to_physical_repr(); - with_match_physical_numeric_polars_type!(lhs.dtype(), |$T| { - let a: &ChunkedArray<$T> = lhs.as_ref().as_ref().as_ref(); - let b: &ChunkedArray<$T> = rhs.as_ref().as_ref().as_ref(); + with_match_physical_numeric_polars_type!(lhs.dtype(), |$T| { + let a: &ChunkedArray<$T> = lhs.as_ref().as_ref().as_ref(); + let b: &ChunkedArray<$T> = rhs.as_ref().as_ref().as_ref(); - if min { - min_binary(a, b).into_series().cast(logical) - } else { - max_binary(a, b).into_series().cast(logical) - } - }) + if min { + min_binary(a, b).into_series().cast(logical) + } else { + max_binary(a, b).into_series().cast(logical) + } + }) + .map(Column::from) + }, + _ => { + let mask = if min { + left.lt(right)? + } else { + left.gt(right)? + }; + + left.zip_with(&mask, right) + }, + } } else { let mask = if min { left.lt(right)? & left.is_not_null() | right.is_null() diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index a741ad846351..1b9e7c72f123 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -37,7 +37,7 @@ use crate::chunked_array::cast::CastOptions; #[cfg(feature = "row_hash")] use crate::hashing::_df_rows_to_hashes_threaded_vertical; #[cfg(feature = "zip_with")] -use crate::prelude::min_max_binary::min_max_binary_series; +use crate::prelude::min_max_binary::min_max_binary_columns; use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort}; use crate::series::IsSorted; use crate::POOL; @@ -2584,24 +2584,19 @@ impl DataFrame { /// Aggregate the column horizontally to their min values. #[cfg(feature = "zip_with")] - pub fn min_horizontal(&self) -> PolarsResult> { - let min_fn = |acc: &Series, s: &Series| min_max_binary_series(acc, s, true); + pub fn min_horizontal(&self) -> PolarsResult> { + let min_fn = |acc: &Column, s: &Column| min_max_binary_columns(acc, s, true); match self.columns.len() { 0 => Ok(None), - 1 => Ok(Some( - self.columns[0].clone().as_materialized_series().clone(), - )), - 2 => min_fn( - self.columns[0].as_materialized_series(), - self.columns[1].as_materialized_series(), - ) - .map(Some), + 1 => Ok(Some(self.columns[0].clone())), + 2 => min_fn(&self.columns[0], &self.columns[1]).map(Some), _ => { // the try_reduce_with is a bit slower in parallelism, // but I don't think it matters here as we parallelize over columns, not over elements POOL.install(|| { - self.par_materialized_column_iter() + self.columns + .par_iter() .map(|s| Ok(Cow::Borrowed(s))) .try_reduce_with(|l, r| min_fn(&l, &r).map(Cow::Owned)) // we can unwrap the option, because we are certain there is a column @@ -2615,22 +2610,19 @@ impl DataFrame { /// Aggregate the column horizontally to their max values. #[cfg(feature = "zip_with")] - pub fn max_horizontal(&self) -> PolarsResult> { - let max_fn = |acc: &Series, s: &Series| min_max_binary_series(acc, s, false); + pub fn max_horizontal(&self) -> PolarsResult> { + let max_fn = |acc: &Column, s: &Column| min_max_binary_columns(acc, s, false); match self.columns.len() { 0 => Ok(None), - 1 => Ok(Some(self.columns[0].as_materialized_series().clone())), - 2 => max_fn( - self.columns[0].as_materialized_series(), - self.columns[1].as_materialized_series(), - ) - .map(Some), + 1 => Ok(Some(self.columns[0].clone())), + 2 => max_fn(&self.columns[0], &self.columns[1]).map(Some), _ => { // the try_reduce_with is a bit slower in parallelism, // but I don't think it matters here as we parallelize over columns, not over elements POOL.install(|| { - self.par_materialized_column_iter() + self.columns + .par_iter() .map(|s| Ok(Cow::Borrowed(s))) .try_reduce_with(|l, r| max_fn(&l, &r).map(Cow::Owned)) // we can unwrap the option, because we are certain there is a column diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index 78727ffefd33..5df89ed423c7 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -460,12 +460,12 @@ impl PyDataFrame { pub fn max_horizontal(&self) -> PyResult> { let s = self.df.max_horizontal().map_err(PyPolarsErr::from)?; - Ok(s.map(|s| s.into())) + Ok(s.map(|s| s.take_materialized_series().into())) } pub fn min_horizontal(&self) -> PyResult> { let s = self.df.min_horizontal().map_err(PyPolarsErr::from)?; - Ok(s.map(|s| s.into())) + Ok(s.map(|s| s.take_materialized_series().into())) } pub fn sum_horizontal(&self, ignore_nulls: bool) -> PyResult> { From 0c4328ab7b2754f39831fc2e0e71febd2c08506c Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 24 Sep 2024 16:46:50 +0200 Subject: [PATCH 2/3] refactor: Preserve scalar in sort_with and in compare --- crates/polars-core/src/frame/column/mod.rs | 71 +++------------------- crates/polars-core/src/frame/mod.rs | 2 +- 2 files changed, 11 insertions(+), 62 deletions(-) diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index cb88a9946ca4..491f2ea4ed49 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -17,6 +17,7 @@ use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; mod arithmetic; mod scalar; +mod compare; /// A column within a [`DataFrame`]. /// @@ -990,69 +991,17 @@ impl Column { // @scalar-opt self.as_materialized_series().estimated_size() } -} - -impl ChunkCompareEq<&Column> for Column { - type Item = PolarsResult; - - /// Create a boolean mask by checking for equality. - #[inline] - fn equal(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .equal(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for equality. - #[inline] - fn equal_missing(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .equal_missing(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for inequality. - #[inline] - fn not_equal(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .not_equal(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for inequality. - #[inline] - fn not_equal_missing(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .not_equal_missing(rhs.as_materialized_series()) - } -} -impl ChunkCompareIneq<&Column> for Column { - type Item = PolarsResult; - - /// Create a boolean mask by checking if self > rhs. - #[inline] - fn gt(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .gt(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self >= rhs. - #[inline] - fn gt_eq(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .gt_eq(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self < rhs. - #[inline] - fn lt(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .lt(rhs.as_materialized_series()) - } + pub(crate) fn sort_with(&self, options: SortOptions) -> PolarsResult { + match self { + Column::Series(s) => s.sort_with(options).map(Self::from), + Column::Scalar(s) => { + // This makes this function throw the same errors as Series::sort_with + _ = s.as_single_value_series().sort_with(options)?; - /// Create a boolean mask by checking if self <= rhs. - #[inline] - fn lt_eq(&self, rhs: &Column) -> Self::Item { - self.as_materialized_series() - .lt_eq(rhs.as_materialized_series()) + Ok(self.clone()) + }, + } } } diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 1b9e7c72f123..998b82eddc78 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1870,7 +1870,7 @@ impl DataFrame { let df = df.as_single_chunk_par(); let mut take = match (by_column.len(), has_struct) { (1, false) => { - let s = &by_column[0].as_materialized_series(); + let s = &by_column[0]; let options = SortOptions { descending: sort_options.descending[0], nulls_last: sort_options.nulls_last[0], From 395fdb0efe4255d31f08e891455240b0c38034a4 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 24 Sep 2024 16:47:13 +0200 Subject: [PATCH 3/3] add missing file --- .../polars-core/src/frame/column/compare.rs | 86 +++++++++++++++++++ crates/polars-core/src/frame/column/mod.rs | 2 +- 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 crates/polars-core/src/frame/column/compare.rs diff --git a/crates/polars-core/src/frame/column/compare.rs b/crates/polars-core/src/frame/column/compare.rs new file mode 100644 index 000000000000..fdb792d60074 --- /dev/null +++ b/crates/polars-core/src/frame/column/compare.rs @@ -0,0 +1,86 @@ +use polars_error::PolarsResult; + +use super::{BooleanChunked, ChunkCompareEq, ChunkCompareIneq, ChunkExpandAtIndex, Column, Series}; + +macro_rules! column_element_wise_broadcasting { + ($lhs:expr, $rhs:expr, $op:expr) => { + match ($lhs, $rhs) { + (Column::Series(lhs), Column::Series(rhs)) => $op(lhs, rhs), + (Column::Series(lhs), Column::Scalar(rhs)) => $op(lhs, &rhs.as_single_value_series()), + (Column::Scalar(lhs), Column::Series(rhs)) => $op(&lhs.as_single_value_series(), rhs), + (Column::Scalar(lhs), Column::Scalar(rhs)) => { + $op(&lhs.as_single_value_series(), &rhs.as_single_value_series()).map(|ca| { + if ca.len() == 0 { + ca + } else { + ca.new_from_index(0, lhs.len()) + } + }) + }, + } + }; +} + +impl ChunkCompareEq<&Column> for Column { + type Item = PolarsResult; + + /// Create a boolean mask by checking for equality. + #[inline] + fn equal(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!(self, rhs, >::equal) + } + + /// Create a boolean mask by checking for equality. + #[inline] + fn equal_missing(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!( + self, + rhs, + >::equal_missing + ) + } + + /// Create a boolean mask by checking for inequality. + #[inline] + fn not_equal(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!(self, rhs, >::not_equal) + } + + /// Create a boolean mask by checking for inequality. + #[inline] + fn not_equal_missing(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!( + self, + rhs, + >::not_equal_missing + ) + } +} + +impl ChunkCompareIneq<&Column> for Column { + type Item = PolarsResult; + + /// Create a boolean mask by checking if self > rhs. + #[inline] + fn gt(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!(self, rhs, >::gt) + } + + /// Create a boolean mask by checking if self >= rhs. + #[inline] + fn gt_eq(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!(self, rhs, >::gt_eq) + } + + /// Create a boolean mask by checking if self < rhs. + #[inline] + fn lt(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!(self, rhs, >::lt) + } + + /// Create a boolean mask by checking if self <= rhs. + #[inline] + fn lt_eq(&self, rhs: &Column) -> PolarsResult { + column_element_wise_broadcasting!(self, rhs, >::lt_eq) + } +} diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index 491f2ea4ed49..3a6343415a6a 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -16,8 +16,8 @@ use crate::utils::{slice_offsets, Container}; use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; mod arithmetic; -mod scalar; mod compare; +mod scalar; /// A column within a [`DataFrame`]. ///