Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Make DataFrame a Vec of Column instead of Series #18664

Merged
merged 42 commits into from
Sep 14, 2024
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
2b082c7
start of the massacre
coastalwhite Sep 10, 2024
b027465
finish polars-plan
coastalwhite Sep 11, 2024
135c92f
finish polars-expr
coastalwhite Sep 11, 2024
e116e8c
finish polars-mem-engine
coastalwhite Sep 11, 2024
8f0595f
finish polars-pipe
coastalwhite Sep 11, 2024
1d15a85
finish polars-stream
coastalwhite Sep 11, 2024
43fb45e
finish polars-lazy
coastalwhite Sep 11, 2024
d58f393
finish polars-sql
coastalwhite Sep 11, 2024
563d5fc
finished polars-python
coastalwhite Sep 11, 2024
12ab77c
Tiny fixes to make the tests pass again
coastalwhite Sep 11, 2024
977e9d0
fix many of the rust tests
coastalwhite Sep 11, 2024
52405af
fix failed rebase
coastalwhite Sep 11, 2024
5faaaa9
clippy and format
coastalwhite Sep 11, 2024
95be5d7
fix more clippy issues
coastalwhite Sep 11, 2024
5617a00
feature gate struct_
coastalwhite Sep 11, 2024
da65351
fix doc building issues
coastalwhite Sep 11, 2024
21e6c7e
feature gate more things
coastalwhite Sep 11, 2024
bc5f655
even more feature gating
coastalwhite Sep 11, 2024
0ecd61d
add polars_utils dev dependency
coastalwhite Sep 11, 2024
b86c33b
idk
coastalwhite Sep 11, 2024
5afc13e
fix without debug assertions
coastalwhite Sep 11, 2024
7fb53df
fix miri in CI
coastalwhite Sep 11, 2024
94d2a7c
fmt
coastalwhite Sep 11, 2024
e8c5088
fix test
coastalwhite Sep 11, 2024
4ae63d5
fix many doc issues
coastalwhite Sep 12, 2024
5b2c6aa
add docs and actually create scalar columns
coastalwhite Sep 12, 2024
487d417
fix clippy
coastalwhite Sep 12, 2024
2d4a0da
start on a lot of the column optimizations
coastalwhite Sep 12, 2024
e797f70
scalar-opt StructArray::from_series
coastalwhite Sep 13, 2024
b4b4d52
format
coastalwhite Sep 13, 2024
6d3c40e
extract str_value
coastalwhite Sep 13, 2024
fcfd947
scalar-opt Column arithmetic
coastalwhite Sep 13, 2024
3b591a8
scalar-opt Column casting
coastalwhite Sep 13, 2024
16d1d63
further scalar-opt removals
coastalwhite Sep 13, 2024
18a34cb
more scalar-opt removals
coastalwhite Sep 13, 2024
733421d
fmt
coastalwhite Sep 13, 2024
c9b2723
fix small errors
coastalwhite Sep 13, 2024
801dfe0
fmt
coastalwhite Sep 13, 2024
834ec50
feature gate datetime
coastalwhite Sep 13, 2024
8198662
remove resize constraint
coastalwhite Sep 13, 2024
581f94d
fix doc comment
coastalwhite Sep 13, 2024
015ca79
remove widely used materializations
coastalwhite Sep 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/bitmap/bitmask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fn nth_set_bit_u32(w: u32, n: u32) -> Option<u32> {
// We use this by setting the first argument to 1 << n, which means the
// first n-1 zero bits of it will spread to the first n-1 one bits of w,
// after which the one bit will exactly get copied to the nth one bit of w.
#[cfg(target_feature = "bmi2")]
#[cfg(all(not(miri), target_feature = "bmi2"))]
{
if n >= 32 {
return None;
Expand All @@ -28,7 +28,7 @@ fn nth_set_bit_u32(w: u32, n: u32) -> Option<u32> {
Some(nth_set_bit.trailing_zeros())
}

#[cfg(not(target_feature = "bmi2"))]
#[cfg(any(miri, not(target_feature = "bmi2")))]
{
// Each block of 2/4/8/16 bits contains how many set bits there are in that block.
let set_per_2 = w - ((w >> 1) & 0x55555555);
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ fn cast_single_to_struct(
new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype));
}

StructChunked::from_series(name, &new_fields).map(|ca| ca.into_series())
StructChunked::from_series(name, new_fields.iter()).map(|ca| ca.into_series())
}

impl<T> ChunkedArray<T>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ impl CategoricalChunked {

let mut counts = groups.group_count();
counts.rename(PlSmallStr::from_static("counts"));
let cols = vec![values.into_series(), counts.into_series()];
let cols = vec![values.into_series().into(), counts.into_series().into()];
let df = unsafe { DataFrame::new_no_checks(cols) };
df.sort(
["counts"],
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-core/src/chunked_array/ndarray.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ impl DataFrame {
///
/// ```rust
/// use polars_core::prelude::*;
/// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_series();
/// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_series();
/// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_column();
/// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_column();
///
/// let df = DataFrame::new(vec![a, b]).unwrap();
/// let ndarray = df.to_ndarray::<Float64Type>(IndexOrder::Fortran).unwrap();
Expand All @@ -108,7 +108,7 @@ impl DataFrame {
let columns = self.get_columns();
POOL.install(|| {
columns.par_iter().enumerate().try_for_each(|(col_idx, s)| {
let s = s.cast(&N::get_dtype())?;
let s = s.as_materialized_series().cast(&N::get_dtype())?;
let s = match s.dtype() {
DataType::Float32 => {
let ca = s.f32().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/fill_null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ impl Series {
/// ```rust
/// # use polars_core::prelude::*;
/// fn example() -> PolarsResult<()> {
/// let s = Series::new("some_missing".into(), &[Some(1), None, Some(2)]);
/// let s = Column::new("some_missing".into(), &[Some(1), None, Some(2)]);
///
/// let filled = s.fill_null(FillNullStrategy::Forward(None))?;
/// assert_eq!(Vec::from(filled.i32()?), &[Some(1), Some(1), Some(2)]);
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/ops/full.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ impl ListChunked {
#[cfg(feature = "dtype-struct")]
impl ChunkFullNull for StructChunked {
fn full_null(name: PlSmallStr, length: usize) -> StructChunked {
let s = vec![Series::new_null(PlSmallStr::EMPTY, length)];
StructChunked::from_series(name, &s)
let s = [Series::new_null(PlSmallStr::EMPTY, length)];
StructChunked::from_series(name, s.iter())
.unwrap()
.with_outer_validity(Some(Bitmap::new_zeroed(length)))
}
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ pub trait ChunkVar {
/// fn filter_all_ones(df: &DataFrame) -> PolarsResult<DataFrame> {
/// let mask = df
/// .column("column_a")?
/// .as_materialized_series()
/// .equal(1)?;
///
/// df.filter(&mask)
Expand Down Expand Up @@ -384,7 +385,7 @@ pub trait ChunkSort<T: PolarsDataType> {
#[allow(unused_variables)]
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
_options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
polars_bail!(opq = arg_sort_multiple, T::get_dtype());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ impl PartialOrd for CompareRow<'_> {
/// Similar to .argsort() then .slice(0, k) but with a more efficient implementation.
pub fn _arg_bottom_k(
k: usize,
by_column: &[Series],
by_column: &[Column],
sort_options: &mut SortMultipleOptions,
) -> PolarsResult<NoNull<IdxCa>> {
let from_n_rows = by_column[0].len();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::utils::_split_offsets;

pub(crate) fn args_validate<T: PolarsDataType>(
ca: &ChunkedArray<T>,
other: &[Series],
other: &[Column],
param_value: &[bool],
param_name: &str,
) -> PolarsResult<()> {
Expand All @@ -25,7 +25,7 @@ pub(crate) fn args_validate<T: PolarsDataType>(

pub(crate) fn arg_sort_multiple_impl<T: NullOrderCmp + Send + Copy>(
mut vals: Vec<(IdxSize, T)>,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
let nulls_last = &options.nulls_last;
Expand All @@ -36,7 +36,7 @@ pub(crate) fn arg_sort_multiple_impl<T: NullOrderCmp + Send + Copy>(

let compare_inner: Vec<_> = by
.iter()
.map(|s| s.into_total_ord_inner())
.map(|s| s.as_materialized_series().into_total_ord_inner())
.collect_trusted();

let first_descending = descending[0];
Expand Down Expand Up @@ -198,7 +198,7 @@ pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult<RowsEncoded> {
}

pub fn _get_rows_encoded(
by: &[Series],
by: &[Column],
descending: &[bool],
nulls_last: &[bool],
) -> PolarsResult<RowsEncoded> {
Expand All @@ -209,6 +209,7 @@ pub fn _get_rows_encoded(
let mut fields = Vec::with_capacity(by.len());

for ((by, desc), null_last) in by.iter().zip(descending).zip(nulls_last) {
let by = by.as_materialized_series();
let arr = _get_rows_encoded_compat_array(by)?;
let sort_field = EncodingField {
descending: *desc,
Expand Down Expand Up @@ -236,7 +237,7 @@ pub fn _get_rows_encoded(

pub fn _get_rows_encoded_ca(
name: PlSmallStr,
by: &[Series],
by: &[Column],
descending: &[bool],
nulls_last: &[bool],
) -> PolarsResult<BinaryOffsetChunked> {
Expand All @@ -245,7 +246,7 @@ pub fn _get_rows_encoded_ca(
}

pub fn _get_rows_encoded_arr(
by: &[Series],
by: &[Column],
descending: &[bool],
nulls_last: &[bool],
) -> PolarsResult<BinaryArray<i64>> {
Expand All @@ -261,7 +262,7 @@ pub fn _get_rows_encoded_ca_unordered(
}

pub(crate) fn argsort_multiple_row_fmt(
by: &[Series],
by: &[Column],
mut descending: Vec<bool>,
mut nulls_last: Vec<bool>,
parallel: bool,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ impl CategoricalChunked {

pub(crate) fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
if self.uses_lexical_ordering() {
Expand Down Expand Up @@ -177,15 +177,15 @@ mod test {
SortMultipleOptions::default().with_order_descending_multi([false, false]),
)?;
let out = out.column("cat")?;
let cat = out.categorical()?;
let cat = out.as_materialized_series().categorical()?;
assert_order(cat, &["a", "a", "b", "c"]);

let out = df.sort(
["vals", "cat"],
SortMultipleOptions::default().with_order_descending_multi([false, false]),
)?;
let out = out.column("cat")?;
let cat = out.categorical()?;
let cat = out.as_materialized_series().categorical()?;
assert_order(cat, &["b", "c", "a", "a"]);
}
Ok(())
Expand Down
32 changes: 19 additions & 13 deletions crates/polars-core/src/chunked_array/ops/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ where

fn arg_sort_multiple_numeric<T: PolarsNumericType>(
ca: &ChunkedArray<T>,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
args_validate(ca, by, &options.descending, "descending")?;
Expand Down Expand Up @@ -294,7 +294,7 @@ where
/// We assume that all numeric `Series` are of the same type; if not, it will panic.
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
arg_sort_multiple_numeric(self, by, options)
Expand Down Expand Up @@ -349,7 +349,7 @@ impl ChunkSort<StringType> for StringChunked {
///
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
self.as_binary().arg_sort_multiple(by, options)
Expand Down Expand Up @@ -427,7 +427,7 @@ impl ChunkSort<BinaryType> for BinaryChunked {

fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
args_validate(self, by, &options.descending, "descending")?;
Expand Down Expand Up @@ -574,7 +574,7 @@ impl ChunkSort<BinaryOffsetType> for BinaryOffsetChunked {
/// uphold this contract. If not, it will panic.
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
args_validate(self, by, &options.descending, "descending")?;
Expand All @@ -599,7 +599,7 @@ impl StructChunked {
pub(crate) fn arg_sort(&self, options: SortOptions) -> IdxCa {
let bin = _get_rows_encoded_ca(
self.name().clone(),
&[self.clone().into_series()],
&[self.clone().into_column()],
&[options.descending],
&[options.nulls_last],
)
Expand Down Expand Up @@ -692,7 +692,7 @@ impl ChunkSort<BooleanType> for BooleanChunked {
}
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
let mut vals = Vec::with_capacity(self.len());
Expand Down Expand Up @@ -724,7 +724,7 @@ pub(crate) fn convert_sort_column_multi_sort(s: &Series) -> PolarsResult<Series>
.iter()
.map(convert_sort_column_multi_sort)
.collect::<PolarsResult<Vec<_>>>()?;
let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?;
out.zip_outer_validity(ca);
out.into_series()
},
Expand Down Expand Up @@ -754,14 +754,16 @@ pub fn _broadcast_bools(n_cols: usize, values: &mut Vec<bool>) {
}

pub(crate) fn prepare_arg_sort(
columns: Vec<Series>,
columns: Vec<Column>,
sort_options: &mut SortMultipleOptions,
) -> PolarsResult<(Series, Vec<Series>)> {
) -> PolarsResult<(Column, Vec<Column>)> {
let n_cols = columns.len();

let mut columns = columns
.iter()
.map(Column::as_materialized_series)
.map(convert_sort_column_multi_sort)
.map(|s| s.map(Column::from))
.collect::<PolarsResult<Vec<_>>>()?;

_broadcast_bools(n_cols, &mut sort_options.descending);
Expand Down Expand Up @@ -881,11 +883,15 @@ mod test {
PlSmallStr::from_static("c"),
&["a", "b", "c", "d", "e", "f", "g", "h"],
);
let df = DataFrame::new(vec![a.into_series(), b.into_series(), c.into_series()])?;
let df = DataFrame::new(vec![
a.into_series().into(),
b.into_series().into(),
c.into_series().into(),
])?;

let out = df.sort(["a", "b", "c"], SortMultipleOptions::default())?;
assert_eq!(
Vec::from(out.column("b")?.i64()?),
Vec::from(out.column("b")?.as_series().unwrap().i64()?),
&[
Some(0),
Some(2),
Expand All @@ -905,7 +911,7 @@ mod test {
)
.into_series();
let b = Int32Chunked::new(PlSmallStr::from_static("b"), &[5, 4, 2, 3, 4, 5]).into_series();
let df = DataFrame::new(vec![a, b])?;
let df = DataFrame::new(vec![a.into(), b.into()])?;

let out = df.sort(["a", "b"], SortMultipleOptions::default())?;
let expected = df!(
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/zip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ impl ChunkZip<StructType> for StructChunked {
.map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs))
.collect::<PolarsResult<Vec<_>>>()?;

let mut out = StructChunked::from_series(self.name().clone(), &fields)?;
let mut out = StructChunked::from_series(self.name().clone(), fields.iter())?;

// Zip the validities.
if (l.null_count + r.null_count) > 0 {
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/random.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ impl DataFrame {
match n.get(0) {
Some(n) => self.sample_n_literal(n as usize, with_replacement, shuffle, seed),
None => {
let new_cols = self.columns.iter().map(Series::clear).collect_trusted();
let new_cols = self.columns.iter().map(Column::clear).collect_trusted();
Ok(unsafe { DataFrame::new_no_checks(new_cols) })
},
}
Expand Down Expand Up @@ -238,7 +238,7 @@ impl DataFrame {
self.sample_n_literal(n, with_replacement, shuffle, seed)
},
None => {
let new_cols = self.columns.iter().map(Series::clear).collect_trusted();
let new_cols = self.columns.iter().map(Column::clear).collect_trusted();
Ok(unsafe { DataFrame::new_no_checks(new_cols) })
},
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/struct_/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ use crate::prelude::StructChunked;

impl DataFrame {
pub fn into_struct(self, name: PlSmallStr) -> StructChunked {
StructChunked::from_series(name, &self.columns).expect("same invariants")
StructChunked::from_columns(name, &self.columns).expect("same invariants")
}
}
Loading