Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Make DataFrame a Vec of Column instead of Series #18664

Merged
merged 42 commits into from
Sep 14, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
2b082c7
start of the massacre
coastalwhite Sep 10, 2024
b027465
finish polars-plan
coastalwhite Sep 11, 2024
135c92f
finish polars-expr
coastalwhite Sep 11, 2024
e116e8c
finish polars-mem-engine
coastalwhite Sep 11, 2024
8f0595f
finish polars-pipe
coastalwhite Sep 11, 2024
1d15a85
finish polars-stream
coastalwhite Sep 11, 2024
43fb45e
finish polars-lazy
coastalwhite Sep 11, 2024
d58f393
finish polars-sql
coastalwhite Sep 11, 2024
563d5fc
finished polars-python
coastalwhite Sep 11, 2024
12ab77c
Tiny fixes to make the tests pass again
coastalwhite Sep 11, 2024
977e9d0
fix many of the rust tests
coastalwhite Sep 11, 2024
52405af
fix failed rebase
coastalwhite Sep 11, 2024
5faaaa9
clippy and format
coastalwhite Sep 11, 2024
95be5d7
fix more clippy issues
coastalwhite Sep 11, 2024
5617a00
feature gate struct_
coastalwhite Sep 11, 2024
da65351
fix doc building issues
coastalwhite Sep 11, 2024
21e6c7e
feature gate more things
coastalwhite Sep 11, 2024
bc5f655
even more feature gating
coastalwhite Sep 11, 2024
0ecd61d
add polars_utils dev dependency
coastalwhite Sep 11, 2024
b86c33b
idk
coastalwhite Sep 11, 2024
5afc13e
fix without debug assertions
coastalwhite Sep 11, 2024
7fb53df
fix miri in CI
coastalwhite Sep 11, 2024
94d2a7c
fmt
coastalwhite Sep 11, 2024
e8c5088
fix test
coastalwhite Sep 11, 2024
4ae63d5
fix many doc issues
coastalwhite Sep 12, 2024
5b2c6aa
add docs and actually create scalar columns
coastalwhite Sep 12, 2024
487d417
fix clippy
coastalwhite Sep 12, 2024
2d4a0da
start on a lot of the column optimizations
coastalwhite Sep 12, 2024
e797f70
scalar-opt StructArray::from_series
coastalwhite Sep 13, 2024
b4b4d52
format
coastalwhite Sep 13, 2024
6d3c40e
extract str_value
coastalwhite Sep 13, 2024
fcfd947
scalar-opt Column arithmetic
coastalwhite Sep 13, 2024
3b591a8
scalar-opt Column casting
coastalwhite Sep 13, 2024
16d1d63
further scalar-opt removals
coastalwhite Sep 13, 2024
18a34cb
more scalar-opt removals
coastalwhite Sep 13, 2024
733421d
fmt
coastalwhite Sep 13, 2024
c9b2723
fix small errors
coastalwhite Sep 13, 2024
801dfe0
fmt
coastalwhite Sep 13, 2024
834ec50
feature gate datetime
coastalwhite Sep 13, 2024
8198662
remove resize constraint
coastalwhite Sep 13, 2024
581f94d
fix doc comment
coastalwhite Sep 13, 2024
015ca79
remove widely used materializations
coastalwhite Sep 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ fn cast_single_to_struct(
new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype));
}

StructChunked::from_series(name, &new_fields).map(|ca| ca.into_series())
StructChunked::from_series(name, new_fields.iter()).map(|ca| ca.into_series())
}

impl<T> ChunkedArray<T>
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/ops/full.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ impl ListChunked {
#[cfg(feature = "dtype-struct")]
impl ChunkFullNull for StructChunked {
fn full_null(name: PlSmallStr, length: usize) -> StructChunked {
let s = vec![Series::new_null(PlSmallStr::EMPTY, length)];
StructChunked::from_series(name, &s)
let s = [Series::new_null(PlSmallStr::EMPTY, length)];
StructChunked::from_series(name, s.iter())
.unwrap()
.with_outer_validity(Some(Bitmap::new_zeroed(length)))
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,7 @@ pub(crate) fn convert_sort_column_multi_sort(s: &Series) -> PolarsResult<Series>
.iter()
.map(convert_sort_column_multi_sort)
.collect::<PolarsResult<Vec<_>>>()?;
let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?;
out.zip_outer_validity(ca);
out.into_series()
},
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/zip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ impl ChunkZip<StructType> for StructChunked {
.map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs))
.collect::<PolarsResult<Vec<_>>>()?;

let mut out = StructChunked::from_series(self.name().clone(), &fields)?;
let mut out = StructChunked::from_series(self.name().clone(), fields.iter())?;

// Zip the validities.
if (l.null_count + r.null_count) > 0 {
Expand Down
4 changes: 1 addition & 3 deletions crates/polars-core/src/chunked_array/struct_/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use crate::prelude::StructChunked;

impl DataFrame {
pub fn into_struct(self, name: PlSmallStr) -> StructChunked {
// @scalar-opt
let series = self.materialized_column_iter().cloned().collect::<Vec<_>>();
StructChunked::from_series(name, &series).expect("same invariants")
StructChunked::from_columns(name, &self.columns).expect("same invariants")
}
}
51 changes: 26 additions & 25 deletions crates/polars-core/src/chunked_array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,24 @@ use crate::utils::Container;

pub type StructChunked = ChunkedArray<StructType>;

fn constructor(name: PlSmallStr, fields: &[Series]) -> PolarsResult<StructChunked> {
fn constructor<'a, I: ExactSizeIterator<Item = &'a Series> + Clone>(
name: PlSmallStr,
fields: I,
) -> PolarsResult<StructChunked> {
// Different chunk lengths: rechunk and recurse.
if !fields.iter().map(|s| s.n_chunks()).all_equal() {
let fields = fields.iter().map(|s| s.rechunk()).collect::<Vec<_>>();
return constructor(name, &fields);
if !fields.clone().map(|s| s.n_chunks()).all_equal() {
let fields = fields.map(|s| s.rechunk()).collect::<Vec<_>>();
return constructor(name, fields.iter());
}

let n_chunks = fields[0].n_chunks();
let dtype = DataType::Struct(fields.iter().map(|s| s.field().into_owned()).collect());
let n_chunks = fields.clone().next().unwrap().n_chunks();
let dtype = DataType::Struct(fields.clone().map(|s| s.field().into_owned()).collect());
let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest());

let chunks = (0..n_chunks)
.map(|c_i| {
let fields = fields
.iter()
.clone()
.map(|field| field.chunks()[c_i].clone())
.collect::<Vec<_>>();

Expand All @@ -55,30 +58,28 @@ fn constructor(name: PlSmallStr, fields: &[Series]) -> PolarsResult<StructChunke
},
// Different chunk lengths: rechunk and recurse.
Err(_) => {
let fields = fields.iter().map(|s| s.rechunk()).collect::<Vec<_>>();
constructor(name, &fields)
let fields = fields.map(|s| s.rechunk()).collect::<Vec<_>>();
constructor(name, fields.iter())
},
}
}

impl StructChunked {
pub fn from_columns(name: PlSmallStr, fields: &[Column]) -> PolarsResult<Self> {
// @scalar-opt!
let series = fields
.iter()
.map(|c| c.as_materialized_series().clone())
.collect::<Vec<_>>();
Self::from_series(name, &series)
Self::from_series(name, fields.iter().map(|c| c.as_materialized_series()))
}

pub fn from_series(name: PlSmallStr, fields: &[Series]) -> PolarsResult<Self> {
pub fn from_series<'a, I: ExactSizeIterator<Item = &'a Series> + Clone>(
name: PlSmallStr,
fields: I,
) -> PolarsResult<Self> {
let mut names = PlHashSet::with_capacity(fields.len());
let first_len = fields.first().map(|s| s.len()).unwrap_or(0);
let first_len = fields.clone().next().map(|s| s.len()).unwrap_or(0);
let mut max_len = first_len;

let mut all_equal_len = true;
let mut is_empty = false;
for s in fields {
for s in fields.clone() {
let s_len = s.len();
max_len = std::cmp::max(max_len, s_len);

Expand Down Expand Up @@ -117,10 +118,10 @@ impl StructChunked {
);
}
}
constructor(name, &new_fields)
} else if fields.is_empty() {
let fields = &[Series::new_null(PlSmallStr::EMPTY, 0)];
constructor(name, fields)
constructor(name, new_fields.iter())
} else if fields.len() == 0 {
let fields = [Series::new_null(PlSmallStr::EMPTY, 0)];
constructor(name, fields.iter())
} else {
constructor(name, fields)
}
Expand Down Expand Up @@ -184,7 +185,7 @@ impl StructChunked {
})
.collect::<PolarsResult<Vec<_>>>()?;

let mut out = Self::from_series(self.name().clone(), &new_fields)?;
let mut out = Self::from_series(self.name().clone(), new_fields.iter())?;
if self.null_count > 0 {
out.zip_outer_validity(self);
}
Expand Down Expand Up @@ -240,7 +241,7 @@ impl StructChunked {
}
})
.collect::<PolarsResult<Vec<_>>>()?;
let mut out = Self::from_series(self.name().clone(), &fields)?;
let mut out = Self::from_series(self.name().clone(), fields.iter())?;
if self.null_count > 0 {
out.zip_outer_validity(self);
}
Expand Down Expand Up @@ -285,7 +286,7 @@ impl StructChunked {
.iter()
.map(func)
.collect::<PolarsResult<Vec<_>>>()?;
Self::from_series(self.name().clone(), &fields).map(|mut ca| {
Self::from_series(self.name().clone(), fields.iter()).map(|mut ca| {
if self.null_count > 0 {
// SAFETY: we don't change types/ lengths.
unsafe {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/frame/row/av_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ impl<'a> AnyValueBufferTrusted<'a> {
s
})
.collect::<Vec<_>>();
StructChunked::from_series(PlSmallStr::EMPTY, &v)
StructChunked::from_series(PlSmallStr::EMPTY, v.iter())
.unwrap()
.into_series()
},
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/serde/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ impl<'de> Deserialize<'de> for Series {
#[cfg(feature = "dtype-struct")]
DataType::Struct(_) => {
let values: Vec<Series> = map.next_value()?;
let ca = StructChunked::from_series(name.clone(), &values).unwrap();
let ca = StructChunked::from_series(name.clone(), values.iter()).unwrap();
let mut s = ca.into_series();
s.rename(name);
Ok(s)
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/series/any_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ fn any_values_to_struct(
series_fields.push(s)
}

let mut out = StructChunked::from_series(PlSmallStr::EMPTY, &series_fields)?;
let mut out = StructChunked::from_series(PlSmallStr::EMPTY, series_fields.iter())?;
if has_outer_validity {
let mut validity = MutableBitmap::new();
validity.extend_constant(values.len(), true);
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ impl Series {
.iter()
.map(|s| s.to_physical_repr().into_owned())
.collect();
let mut ca = StructChunked::from_series(self.name().clone(), &fields).unwrap();
let mut ca = StructChunked::from_series(self.name().clone(), fields.iter()).unwrap();

if arr.null_count() > 0 {
ca.zip_outer_validity(arr);
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/series/ops/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ impl Series {
.iter()
.map(|fld| Series::full_null(fld.name().clone(), size, fld.dtype()))
.collect::<Vec<_>>();
let ca = StructChunked::from_series(name, &fields).unwrap();
let ca = StructChunked::from_series(name, fields.iter()).unwrap();

if !fields.is_empty() {
ca.with_outer_validity(Some(Bitmap::new_zeroed(size)))
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-expr/src/expressions/aggregation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ impl PartitionedAggregation for AggregationExpr {
};
let mut count_s = series.agg_valid_count(groups);
count_s.rename(PlSmallStr::from_static("__POLARS_COUNT"));
Ok(StructChunked::from_series(new_name, &[agg_s, count_s])
Ok(StructChunked::from_series(new_name, [agg_s, count_s].iter())
.unwrap()
.into_series())
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/chunked_array/array/to_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub trait ToStruct: AsArray {
.collect::<PolarsResult<Vec<_>>>()
})?;

StructChunked::from_series(ca.name().clone(), &fields)
StructChunked::from_series(ca.name().clone(), fields.iter())
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/chunked_array/hist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ where
let out = fields.pop().unwrap();
out.with_name(ca.name().clone())
} else {
StructChunked::from_series(ca.name().clone(), &fields)
StructChunked::from_series(ca.name().clone(), fields.iter())
.unwrap()
.into_series()
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/chunked_array/list/to_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ pub trait ToStruct: AsList {
.collect::<PolarsResult<Vec<_>>>()
})?;

StructChunked::from_series(ca.name().clone(), &fields)
StructChunked::from_series(ca.name().clone(), fields.iter())
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/chunked_array/strings/extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ pub(super) fn extract_groups(
if n_fields == 1 {
return StructChunked::from_series(
ca.name().clone(),
&[Series::new_null(ca.name().clone(), ca.len())],
[Series::new_null(ca.name().clone(), ca.len())].iter(),
)
.map(|ca| ca.into_series());
}
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-ops/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,10 @@ mod tests {

let expected_series = StructChunked::from_series(
"".into(),
&[
[
Series::new("a".into(), &[None, Some(1), Some(2), None]),
Series::new("b".into(), &[None, Some("hello"), Some("goodbye"), None]),
],
].iter(),
)
.unwrap()
.with_outer_validity_chunked(BooleanChunked::new("".into(), [false, true, true, false]))
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/chunked_array/strings/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ where
})
.collect::<Vec<_>>();

StructChunked::from_series(ca.name().clone(), &fields)
StructChunked::from_series(ca.name().clone(), fields.iter())
}

pub fn split_helper<'a, F, I>(ca: &'a StringChunked, by: &'a StringChunked, op: F) -> ListChunked
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-ops/src/frame/join/merge_sorted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ fn merge_series(lhs: &Series, rhs: &Series, merge_indicator: &[bool]) -> PolarsR
.zip(rhs.fields_as_series())
.map(|(lhs, rhs)| merge_series(lhs, &rhs, merge_indicator))
.collect::<PolarsResult<Vec<_>>>()?;
StructChunked::from_series(PlSmallStr::EMPTY, &new_fields)
StructChunked::from_series(PlSmallStr::EMPTY, new_fields.iter())
.unwrap()
.into_series()
},
Expand Down
12 changes: 3 additions & 9 deletions crates/polars-ops/src/frame/pivot/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,15 +233,9 @@ fn pivot_impl(
already exists in the DataFrame. Please rename it prior to calling `pivot`.")
}
// @scalar-opt
let columns_struct = StructChunked::from_series(
column.clone(),
&fields
.iter()
.map(|c| c.as_materialized_series().clone())
.collect::<Vec<_>>(),
)
.unwrap()
.into_series();
let columns_struct = StructChunked::from_columns(column.clone(), fields)
.unwrap()
.into_series();
let mut binding = pivot_df.clone();
let pivot_df = unsafe { binding.with_column_unchecked(columns_struct) };
pivot_impl_single_column(
Expand Down
7 changes: 1 addition & 6 deletions crates/polars-ops/src/frame/pivot/positioning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -484,14 +484,9 @@ pub(super) fn compute_row_idx(
}
} else {
let binding = pivot_df.select(index.iter().cloned())?;
// @scalar-opt
let fields = binding.get_columns();
let fields = fields
.iter()
.map(|c| c.as_materialized_series().clone())
.collect::<Vec<_>>();
let index_struct_series =
StructChunked::from_series(PlSmallStr::from_static("placeholder"), &fields)?
StructChunked::from_columns(PlSmallStr::from_static("placeholder"), fields)?
.into_series();
let index_agg = unsafe { index_struct_series.agg_first(groups) };
let index_agg_physical = index_agg.to_physical_repr();
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-ops/src/series/ops/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ fn map_cats(
},
});

let outvals = vec![
let outvals = [
brk_vals.finish().into_series(),
bld.finish()
._with_fast_unique(label_has_value.iter().all(bool::clone))
.into_series(),
];
Ok(StructChunked::from_series(out_name, &outvals)?.into_series())
Ok(StructChunked::from_series(out_name, outvals.iter())?.into_series())
} else {
Ok(bld
.drain_iter_and_finish(s_iter.map(|opt| {
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-plan/src/dsl/function_expr/struct_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ pub(super) fn rename_fields(s: &Column, names: Arc<[PlSmallStr]>) -> PolarsResul
s
})
.collect::<Vec<_>>();
let mut out = StructChunked::from_series(ca.name().clone(), &fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?;
out.zip_outer_validity(ca);
Ok(out.into_column())
}
Expand All @@ -193,7 +193,7 @@ pub(super) fn prefix_fields(s: &Column, prefix: &str) -> PolarsResult<Column> {
s
})
.collect::<Vec<_>>();
let mut out = StructChunked::from_series(ca.name().clone(), &fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?;
out.zip_outer_validity(ca);
Ok(out.into_column())
}
Expand All @@ -210,7 +210,7 @@ pub(super) fn suffix_fields(s: &Column, suffix: &str) -> PolarsResult<Column> {
s
})
.collect::<Vec<_>>();
let mut out = StructChunked::from_series(ca.name().clone(), &fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), fields.iter())?;
out.zip_outer_validity(ca);
Ok(out.into_column())
}
Expand Down Expand Up @@ -245,7 +245,7 @@ pub(super) fn with_fields(args: &[Column]) -> PolarsResult<Column> {
}

let new_fields = fields.into_values().cloned().collect::<Vec<_>>();
let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?;
out.zip_outer_validity(ca);
Ok(out.into_column())
}
2 changes: 1 addition & 1 deletion crates/polars-plan/src/dsl/name.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ impl ExprNameNameSpace {
fd
})
.collect::<Vec<_>>();
let mut out = StructChunked::from_series(s.name().clone(), &fields)?;
let mut out = StructChunked::from_series(s.name().clone(), fields.iter())?;
out.zip_outer_validity(s);
Ok(Some(out.into_column()))
},
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-python/src/map/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ fn iterator_to_struct<'a>(
.collect::<Vec<_>>()
});

Ok(StructChunked::from_series(name, &fields)
Ok(StructChunked::from_series(name, fields.iter())
.unwrap()
.into_series()
.into())
Expand Down
Loading