Use `split_at` in `split_and_flatten` instead of slicing every chunk twice.

* Add `DataFrame::split_at` and `Container::split_at` (implemented for
  `DataFrame`, `ChunkedArray` and `Series`).
* `ChunkedArray::split_at` now copies metadata onto both halves: `SORTED` and
  `FAST_EXPLODE_LIST` are always requested; `MIN_VALUE` / `MAX_VALUE` only when
  the array is flagged sorted and the boundary slot is not null.
* `chunkops::split_at` iterates with `chunks.iter()`; the remaining hunks are
  formatting-only (blank lines, line wrapping, a missing `;`, a needless `&`).

Review notes:
* Boundary null check: `Bitmap::get` yields the validity bit (`true` = valid).
  The first revision of this patch used that bit directly as "has nulls", which
  inverted the condition, and it unwrapped `get(0)` / computed `len() - 1`,
  which panics or underflows on an empty boundary chunk. Both are fixed below:
  anything other than `Some(true)` counts as a null boundary.
* The second chunkops.rs hunk header is `+173,64` to match the corrected body;
  the `index` line still carries the original blob ids.
* NOTE(review): this patch text was rebuilt from a whitespace-collapsed copy.
  Blank context lines, indentation and generic arguments were restored from the
  hunk line counts and surrounding usage -- confirm against the target tree
  before applying.

diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs
index 19ea2c18c0b6..40d8bf6285b9 100644
--- a/crates/polars-arrow/src/array/mod.rs
+++ b/crates/polars-arrow/src/array/mod.rs
@@ -162,7 +162,7 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static {
     #[must_use]
     fn sliced(&self, offset: usize, length: usize) -> Box<dyn Array> {
         if length == 0 {
-            return new_empty_array(self.data_type().clone())
+            return new_empty_array(self.data_type().clone());
         }
         let mut new = self.to_boxed();
         new.slice(offset, length);
diff --git a/crates/polars-core/src/chunked_array/ops/chunkops.rs b/crates/polars-core/src/chunked_array/ops/chunkops.rs
index b3d10e883187..8a147e0fe123 100644
--- a/crates/polars-core/src/chunked_array/ops/chunkops.rs
+++ b/crates/polars-core/src/chunked_array/ops/chunkops.rs
@@ -17,7 +17,7 @@ pub(crate) fn split_at(
     let (raw_offset, _) = slice_offsets(offset, 0, own_length);
 
     let mut remaining_offset = raw_offset;
-    let mut iter = chunks.into_iter();
+    let mut iter = chunks.iter();
 
     for chunk in &mut iter {
         let chunk_len = chunk.len();
@@ -173,17 +173,64 @@ impl<T: PolarsDataType> ChunkedArray<T> {
         }
     }
 
-    /// Slice the array. The chunks are reallocated the underlying data slices are zero copy.
+    /// Split the array. The chunks are reallocated; the underlying data slices are zero copy.
     ///
     /// When offset is negative it will be counted from the end of the array.
     /// This method will never error,
     /// and will slice the best match when offset, or length is out of bounds
     pub fn split_at(&self, offset: i64) -> (Self, Self) {
-        // The len: 0 special cases ensure we release memory.
         // A normal slice, slice the buffers and thus keep the whole memory allocated.
         let (l, r) = split_at(&self.chunks, offset, self.len());
-        let out_l = unsafe { self.copy_with_chunks(l) };
-        let out_r = unsafe { self.copy_with_chunks(r) };
+        let mut out_l = unsafe { self.copy_with_chunks(l) };
+        let mut out_r = unsafe { self.copy_with_chunks(r) };
+
+        use MetadataProperties as P;
+        let mut properties_l = P::SORTED | P::FAST_EXPLODE_LIST;
+        let mut properties_r = P::SORTED | P::FAST_EXPLODE_LIST;
+
+        let is_ascending = self.is_sorted_ascending_flag();
+        let is_descending = self.is_sorted_descending_flag();
+
+        if is_ascending || is_descending {
+            // A set validity bit marks a *valid* slot. Anything other than
+            // `Some(true)` (a cleared bit, or an empty chunk) is treated as a
+            // null boundary, so min/max are only propagated when the edge slot
+            // is known to hold a value.
+            let has_nulls_at_start = self.null_count() != 0
+                && self
+                    .chunks()
+                    .first()
+                    .unwrap()
+                    .as_ref()
+                    .validity()
+                    .map_or(false, |bm| bm.get(0) != Some(true));
+
+            if !has_nulls_at_start {
+                // The first slot is the minimum (ascending) or maximum (descending).
+                properties_l.set(P::MIN_VALUE, is_ascending);
+                properties_l.set(P::MAX_VALUE, is_descending);
+            }
+
+            let has_nulls_at_end = self.null_count() != 0
+                && self
+                    .chunks()
+                    .last()
+                    .unwrap()
+                    .as_ref()
+                    .validity()
+                    .map_or(false, |bm| {
+                        bm.len().checked_sub(1).and_then(|i| bm.get(i)) != Some(true)
+                    });
+
+            if !has_nulls_at_end {
+                // The last slot is the maximum (ascending) or minimum (descending).
+                properties_r.set(P::MIN_VALUE, is_descending);
+                properties_r.set(P::MAX_VALUE, is_ascending);
+            }
+        }
+        out_l.copy_metadata(self, properties_l);
+        out_r.copy_metadata(self, properties_r);
+
         (out_l, out_r)
     }
 
diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs
index 52e41cf27c9f..6418b3ab3277 100644
--- a/crates/polars-core/src/frame/mod.rs
+++ b/crates/polars-core/src/frame/mod.rs
@@ -2233,6 +2233,14 @@ impl DataFrame {
         unsafe { DataFrame::new_no_checks(col) }
     }
 
+    /// Split [`DataFrame`] at the given `offset`.
+    pub fn split_at(&self, offset: i64) -> (Self, Self) {
+        let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
+        let a = unsafe { DataFrame::new_no_checks(a) };
+        let b = unsafe { DataFrame::new_no_checks(b) };
+        (a, b)
+    }
+
     pub fn clear(&self) -> Self {
         let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
         unsafe { DataFrame::new_no_checks(col) }
diff --git a/crates/polars-core/src/series/implementations/binary.rs b/crates/polars-core/src/series/implementations/binary.rs
index 63d4a25eff9a..042e58fcf813 100644
--- a/crates/polars-core/src/series/implementations/binary.rs
+++ b/crates/polars-core/src/series/implementations/binary.rs
@@ -124,7 +124,6 @@ impl SeriesTrait for SeriesWrap<BinaryChunked> {
         (a.into_series(), b.into_series())
     }
 
-
     fn append(&mut self, other: &Series) -> PolarsResult<()> {
         polars_ensure!(self.0.dtype() == other.dtype(), append);
         // todo! add object
diff --git a/crates/polars-core/src/series/implementations/binary_offset.rs b/crates/polars-core/src/series/implementations/binary_offset.rs
index a16db7e1911e..b0ac481f682c 100644
--- a/crates/polars-core/src/series/implementations/binary_offset.rs
+++ b/crates/polars-core/src/series/implementations/binary_offset.rs
@@ -87,7 +87,6 @@ impl SeriesTrait for SeriesWrap<BinaryOffsetChunked> {
         (a.into_series(), b.into_series())
     }
 
-
     fn append(&mut self, other: &Series) -> PolarsResult<()> {
         polars_ensure!(self.0.dtype() == other.dtype(), append);
         // todo! add object
diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs
index bee6a4771df3..2b6777eda58f 100644
--- a/crates/polars-core/src/series/implementations/boolean.rs
+++ b/crates/polars-core/src/series/implementations/boolean.rs
@@ -145,7 +145,6 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
         (a.into_series(), b.into_series())
     }
 
-
     fn append(&mut self, other: &Series) -> PolarsResult<()> {
         polars_ensure!(self.0.dtype() == other.dtype(), append);
         self.0.append(other.as_ref().as_ref());
diff --git a/crates/polars-core/src/series/implementations/categorical.rs b/crates/polars-core/src/series/implementations/categorical.rs
index 425f24cd39e9..97ac4be0031a 100644
--- a/crates/polars-core/src/series/implementations/categorical.rs
+++ b/crates/polars-core/src/series/implementations/categorical.rs
@@ -161,7 +161,6 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
         (a, b)
     }
 
-
     fn append(&mut self, other: &Series) -> PolarsResult<()> {
         polars_ensure!(self.0.dtype() == other.dtype(), append);
         self.0.append(other.categorical().unwrap())
diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs
index 767c9886629d..42f60bd06c4e 100644
--- a/crates/polars-core/src/series/implementations/datetime.rs
+++ b/crates/polars-core/src/series/implementations/datetime.rs
@@ -174,7 +174,12 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
     }
     fn split_at(&self, offset: i64) -> (Series, Series) {
         let (a, b) = self.0.split_at(offset);
-        (a.into_datetime(self.0.time_unit(), self.0.time_zone().clone()).into_series(), b.into_datetime(self.0.time_unit(), self.0.time_zone().clone()).into_series())
+        (
+            a.into_datetime(self.0.time_unit(), self.0.time_zone().clone())
+                .into_series(),
+            b.into_datetime(self.0.time_unit(), self.0.time_zone().clone())
+                .into_series(),
+        )
     }
 
     fn mean(&self) -> Option<f64> {
diff --git a/crates/polars-core/src/series/implementations/decimal.rs b/crates/polars-core/src/series/implementations/decimal.rs
index 76096dab64b4..bfc79836b618 100644
--- a/crates/polars-core/src/series/implementations/decimal.rs
+++ b/crates/polars-core/src/series/implementations/decimal.rs
@@ -196,9 +196,11 @@ impl SeriesTrait for SeriesWrap<DecimalChunked> {
 
     fn split_at(&self, offset: i64) -> (Series, Series) {
         let (a, b) = self.0.split_at(offset);
-        let a = a.into_decimal_unchecked(self.0.precision(), self.0.scale())
+        let a = a
+            .into_decimal_unchecked(self.0.precision(), self.0.scale())
             .into_series();
-        let b = b.into_decimal_unchecked(self.0.precision(), self.0.scale())
+        let b = b
+            .into_decimal_unchecked(self.0.precision(), self.0.scale())
             .into_series();
         (a, b)
     }
diff --git a/crates/polars-core/src/series/implementations/null.rs b/crates/polars-core/src/series/implementations/null.rs
index c084952424db..564a8f93669d 100644
--- a/crates/polars-core/src/series/implementations/null.rs
+++ b/crates/polars-core/src/series/implementations/null.rs
@@ -240,20 +240,20 @@ impl SeriesTrait for NullChunked {
     }
 
     fn split_at(&self, offset: i64) -> (Series, Series) {
-        let (l, r) = chunkops::split_at(&self.chunks(), offset, self.len());
-        (NullChunked {
-            name: self.name.clone(),
-            length: l.iter().map(|arr|arr.len() as IdxSize).sum(),
-            chunks: l,
-        }
+        let (l, r) = chunkops::split_at(self.chunks(), offset, self.len());
+        (
+            NullChunked {
+                name: self.name.clone(),
+                length: l.iter().map(|arr| arr.len() as IdxSize).sum(),
+                chunks: l,
+            }
+            .into_series(),
+            NullChunked {
+                name: self.name.clone(),
+                length: r.iter().map(|arr| arr.len() as IdxSize).sum(),
+                chunks: r,
+            }
             .into_series(),
-        NullChunked {
-            name: self.name.clone(),
-            length: r.iter().map(|arr|arr.len() as IdxSize).sum(),
-            chunks: r,
-        }
-            .into_series(),
-
         )
     }
 
diff --git a/crates/polars-core/src/series/implementations/struct_.rs b/crates/polars-core/src/series/implementations/struct_.rs
index c723033d82c0..d9ed03948fce 100644
--- a/crates/polars-core/src/series/implementations/struct_.rs
+++ b/crates/polars-core/src/series/implementations/struct_.rs
@@ -126,14 +126,12 @@ impl SeriesTrait for SeriesWrap<StructChunked> {
         out.into_series()
     }
 
-
     fn split_at(&self, offset: i64) -> (Series, Series) {
         let (a, b): (Vec<_>, Vec<_>) = self.0.fields().iter().map(|s| s.split_at(offset)).unzip();
 
         let a = StructChunked::new(self.name(), &a).unwrap();
         let b = StructChunked::new(self.name(), &b).unwrap();
         (a.into_series(), b.into_series())
-
     }
 
     fn append(&mut self, other: &Series) -> PolarsResult<()> {
diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs
index 313fb55b1b50..fd38c4bad099 100644
--- a/crates/polars-core/src/utils/mod.rs
+++ b/crates/polars-core/src/utils/mod.rs
@@ -108,6 +108,8 @@ pub fn _split_offsets(len: usize, n: usize) -> Vec<(usize, usize)> {
 pub trait Container: Clone {
     fn slice(&self, offset: i64, len: usize) -> Self;
 
+    fn split_at(&self, offset: i64) -> (Self, Self);
+
     fn len(&self) -> usize;
 
     fn iter_chunks(&self) -> impl Iterator<Item = Self>;
@@ -122,6 +124,10 @@ impl Container for DataFrame {
         DataFrame::slice(self, offset, len)
     }
 
+    fn split_at(&self, offset: i64) -> (Self, Self) {
+        DataFrame::split_at(self, offset)
+    }
+
     fn len(&self) -> usize {
         self.height()
     }
@@ -144,6 +150,10 @@ impl<T: PolarsDataType> Container for ChunkedArray<T> {
         ChunkedArray::slice(self, offset, len)
     }
 
+    fn split_at(&self, offset: i64) -> (Self, Self) {
+        ChunkedArray::split_at(self, offset)
+    }
+
     fn len(&self) -> usize {
         ChunkedArray::len(self)
     }
@@ -167,6 +177,10 @@ impl Container for Series {
         self.0.slice(offset, len)
     }
 
+    fn split_at(&self, offset: i64) -> (Self, Self) {
+        self.0.split_at(offset)
+    }
+
     fn len(&self) -> usize {
         self.0.len()
     }
@@ -258,9 +272,9 @@ pub fn split_and_flatten<C: Container>(container: &C, target: usize) -> Vec<C> {
                 continue 'new_chunk;
             }
 
-            // TODO! use `split` operation here. That saves a null count.
-            out.push(chunk.slice(0, chunk_size));
-            chunk = chunk.slice(chunk_size as i64, h - chunk_size);
+            let (a, b) = chunk.split_at(chunk_size as i64);
+            out.push(a);
+            chunk = b;
         }
     }
     out