Skip to content

Commit

Permalink
refactor: Make DataFrame a Vec of Column instead of Series (#18664)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Sep 14, 2024
1 parent 29cdb17 commit 962b576
Show file tree
Hide file tree
Showing 249 changed files with 4,605 additions and 2,124 deletions.
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/bitmap/bitmask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fn nth_set_bit_u32(w: u32, n: u32) -> Option<u32> {
// We use this by setting the first argument to 1 << n, which means the
// first n-1 zero bits of it will spread to the first n-1 one bits of w,
// after which the one bit will exactly get copied to the nth one bit of w.
#[cfg(target_feature = "bmi2")]
#[cfg(all(not(miri), target_feature = "bmi2"))]
{
if n >= 32 {
return None;
Expand All @@ -28,7 +28,7 @@ fn nth_set_bit_u32(w: u32, n: u32) -> Option<u32> {
Some(nth_set_bit.trailing_zeros())
}

#[cfg(not(target_feature = "bmi2"))]
#[cfg(any(miri, not(target_feature = "bmi2")))]
{
// Each block of 2/4/8/16 bits contains how many set bits there are in that block.
let set_per_2 = w - ((w >> 1) & 0x55555555);
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ fn cast_single_to_struct(
new_fields.push(Series::full_null(fld.name.clone(), length, &fld.dtype));
}

StructChunked::from_series(name, &new_fields).map(|ca| ca.into_series())
StructChunked::from_series(name, new_fields.iter()).map(|ca| ca.into_series())
}

impl<T> ChunkedArray<T>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ impl CategoricalChunked {

let mut counts = groups.group_count();
counts.rename(PlSmallStr::from_static("counts"));
let cols = vec![values.into_series(), counts.into_series()];
let cols = vec![values.into_series().into(), counts.into_series().into()];
let df = unsafe { DataFrame::new_no_checks(cols) };
df.sort(
["counts"],
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/logical/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ impl LogicalType for DatetimeChunked {
fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
self.0
.get_any_value(i)
.map(|av| av.as_datetime(self.time_unit(), self.time_zone()))
.map(|av| av.as_datetime(self.time_unit(), self.time_zone().as_ref()))
}

unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
self.0
.get_any_value_unchecked(i)
.as_datetime(self.time_unit(), self.time_zone())
.as_datetime(self.time_unit(), self.time_zone().as_ref())
}

fn cast_with_options(
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-core/src/chunked_array/ndarray.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ impl DataFrame {
///
/// ```rust
/// use polars_core::prelude::*;
/// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_series();
/// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_series();
/// let a = UInt32Chunked::new("a".into(), &[1, 2, 3]).into_column();
/// let b = Float64Chunked::new("b".into(), &[10., 8., 6.]).into_column();
///
/// let df = DataFrame::new(vec![a, b]).unwrap();
/// let ndarray = df.to_ndarray::<Float64Type>(IndexOrder::Fortran).unwrap();
Expand All @@ -108,7 +108,7 @@ impl DataFrame {
let columns = self.get_columns();
POOL.install(|| {
columns.par_iter().enumerate().try_for_each(|(col_idx, s)| {
let s = s.cast(&N::get_dtype())?;
let s = s.as_materialized_series().cast(&N::get_dtype())?;
let s = match s.dtype() {
DataType::Float32 => {
let ca = s.f32().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/any_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ pub(crate) unsafe fn arr_to_any_value<'a>(
DataType::Datetime(tu, tz) => {
let arr = &*(arr as *const dyn Array as *const Int64Array);
let v = arr.value_unchecked(idx);
AnyValue::Datetime(v, *tu, tz)
AnyValue::Datetime(v, *tu, tz.as_ref())
},
#[cfg(feature = "dtype-date")]
DataType::Date => {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/fill_null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ impl Series {
/// ```rust
/// # use polars_core::prelude::*;
/// fn example() -> PolarsResult<()> {
/// let s = Series::new("some_missing".into(), &[Some(1), None, Some(2)]);
/// let s = Column::new("some_missing".into(), &[Some(1), None, Some(2)]);
///
/// let filled = s.fill_null(FillNullStrategy::Forward(None))?;
/// assert_eq!(Vec::from(filled.i32()?), &[Some(1), Some(1), Some(2)]);
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/ops/full.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ impl ListChunked {
#[cfg(feature = "dtype-struct")]
impl ChunkFullNull for StructChunked {
fn full_null(name: PlSmallStr, length: usize) -> StructChunked {
let s = vec![Series::new_null(PlSmallStr::EMPTY, length)];
StructChunked::from_series(name, &s)
let s = [Series::new_null(PlSmallStr::EMPTY, length)];
StructChunked::from_series(name, s.iter())
.unwrap()
.with_outer_validity(Some(Bitmap::new_zeroed(length)))
}
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ pub trait ChunkVar {
/// fn filter_all_ones(df: &DataFrame) -> PolarsResult<DataFrame> {
/// let mask = df
/// .column("column_a")?
/// .as_materialized_series()
/// .equal(1)?;
///
/// df.filter(&mask)
Expand Down Expand Up @@ -384,7 +385,7 @@ pub trait ChunkSort<T: PolarsDataType> {
#[allow(unused_variables)]
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
_options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
polars_bail!(opq = arg_sort_multiple, T::get_dtype());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ impl PartialOrd for CompareRow<'_> {
/// Similar to .argsort() then .slice(0, k) but with a more efficient implementation.
pub fn _arg_bottom_k(
k: usize,
by_column: &[Series],
by_column: &[Column],
sort_options: &mut SortMultipleOptions,
) -> PolarsResult<NoNull<IdxCa>> {
let from_n_rows = by_column[0].len();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::utils::_split_offsets;

pub(crate) fn args_validate<T: PolarsDataType>(
ca: &ChunkedArray<T>,
other: &[Series],
other: &[Column],
param_value: &[bool],
param_name: &str,
) -> PolarsResult<()> {
Expand All @@ -25,7 +25,7 @@ pub(crate) fn args_validate<T: PolarsDataType>(

pub(crate) fn arg_sort_multiple_impl<T: NullOrderCmp + Send + Copy>(
mut vals: Vec<(IdxSize, T)>,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
let nulls_last = &options.nulls_last;
Expand All @@ -36,7 +36,7 @@ pub(crate) fn arg_sort_multiple_impl<T: NullOrderCmp + Send + Copy>(

let compare_inner: Vec<_> = by
.iter()
.map(|s| s.into_total_ord_inner())
.map(|s| s.as_materialized_series().into_total_ord_inner())
.collect_trusted();

let first_descending = descending[0];
Expand Down Expand Up @@ -198,7 +198,7 @@ pub fn _get_rows_encoded_unordered(by: &[Series]) -> PolarsResult<RowsEncoded> {
}

pub fn _get_rows_encoded(
by: &[Series],
by: &[Column],
descending: &[bool],
nulls_last: &[bool],
) -> PolarsResult<RowsEncoded> {
Expand All @@ -209,6 +209,7 @@ pub fn _get_rows_encoded(
let mut fields = Vec::with_capacity(by.len());

for ((by, desc), null_last) in by.iter().zip(descending).zip(nulls_last) {
let by = by.as_materialized_series();
let arr = _get_rows_encoded_compat_array(by)?;
let sort_field = EncodingField {
descending: *desc,
Expand Down Expand Up @@ -236,7 +237,7 @@ pub fn _get_rows_encoded(

pub fn _get_rows_encoded_ca(
name: PlSmallStr,
by: &[Series],
by: &[Column],
descending: &[bool],
nulls_last: &[bool],
) -> PolarsResult<BinaryOffsetChunked> {
Expand All @@ -245,7 +246,7 @@ pub fn _get_rows_encoded_ca(
}

pub fn _get_rows_encoded_arr(
by: &[Series],
by: &[Column],
descending: &[bool],
nulls_last: &[bool],
) -> PolarsResult<BinaryArray<i64>> {
Expand All @@ -261,7 +262,7 @@ pub fn _get_rows_encoded_ca_unordered(
}

pub(crate) fn argsort_multiple_row_fmt(
by: &[Series],
by: &[Column],
mut descending: Vec<bool>,
mut nulls_last: Vec<bool>,
parallel: bool,
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-core/src/chunked_array/ops/sort/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ impl CategoricalChunked {
pub(crate) fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
if self.uses_lexical_ordering() {
Expand Down Expand Up @@ -177,15 +177,15 @@ mod test {
SortMultipleOptions::default().with_order_descending_multi([false, false]),
)?;
let out = out.column("cat")?;
let cat = out.categorical()?;
let cat = out.as_materialized_series().categorical()?;
assert_order(cat, &["a", "a", "b", "c"]);

let out = df.sort(
["vals", "cat"],
SortMultipleOptions::default().with_order_descending_multi([false, false]),
)?;
let out = out.column("cat")?;
let cat = out.categorical()?;
let cat = out.as_materialized_series().categorical()?;
assert_order(cat, &["b", "c", "a", "a"]);
}
Ok(())
Expand Down
32 changes: 19 additions & 13 deletions crates/polars-core/src/chunked_array/ops/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ where

fn arg_sort_multiple_numeric<T: PolarsNumericType>(
ca: &ChunkedArray<T>,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
args_validate(ca, by, &options.descending, "descending")?;
Expand Down Expand Up @@ -294,7 +294,7 @@ where
/// We assume that all numeric `Series` are of the same type; if not, it will panic.
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
arg_sort_multiple_numeric(self, by, options)
Expand Down Expand Up @@ -349,7 +349,7 @@ impl ChunkSort<StringType> for StringChunked {
///
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
self.as_binary().arg_sort_multiple(by, options)
Expand Down Expand Up @@ -427,7 +427,7 @@ impl ChunkSort<BinaryType> for BinaryChunked {

fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
args_validate(self, by, &options.descending, "descending")?;
Expand Down Expand Up @@ -574,7 +574,7 @@ impl ChunkSort<BinaryOffsetType> for BinaryOffsetChunked {
/// uphold this contract. If not, it will panic.
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
args_validate(self, by, &options.descending, "descending")?;
Expand All @@ -599,7 +599,7 @@ impl StructChunked {
pub(crate) fn arg_sort(&self, options: SortOptions) -> IdxCa {
let bin = _get_rows_encoded_ca(
self.name().clone(),
&[self.clone().into_series()],
&[self.clone().into_column()],
&[options.descending],
&[options.nulls_last],
)
Expand Down Expand Up @@ -692,7 +692,7 @@ impl ChunkSort<BooleanType> for BooleanChunked {
}
fn arg_sort_multiple(
&self,
by: &[Series],
by: &[Column],
options: &SortMultipleOptions,
) -> PolarsResult<IdxCa> {
let mut vals = Vec::with_capacity(self.len());
Expand Down Expand Up @@ -724,7 +724,7 @@ pub(crate) fn convert_sort_column_multi_sort(s: &Series) -> PolarsResult<Series>
.iter()
.map(convert_sort_column_multi_sort)
.collect::<PolarsResult<Vec<_>>>()?;
let mut out = StructChunked::from_series(ca.name().clone(), &new_fields)?;
let mut out = StructChunked::from_series(ca.name().clone(), new_fields.iter())?;
out.zip_outer_validity(ca);
out.into_series()
},
Expand Down Expand Up @@ -754,14 +754,16 @@ pub fn _broadcast_bools(n_cols: usize, values: &mut Vec<bool>) {
}

pub(crate) fn prepare_arg_sort(
columns: Vec<Series>,
columns: Vec<Column>,
sort_options: &mut SortMultipleOptions,
) -> PolarsResult<(Series, Vec<Series>)> {
) -> PolarsResult<(Column, Vec<Column>)> {
let n_cols = columns.len();

let mut columns = columns
.iter()
.map(Column::as_materialized_series)
.map(convert_sort_column_multi_sort)
.map(|s| s.map(Column::from))
.collect::<PolarsResult<Vec<_>>>()?;

_broadcast_bools(n_cols, &mut sort_options.descending);
Expand Down Expand Up @@ -881,11 +883,15 @@ mod test {
PlSmallStr::from_static("c"),
&["a", "b", "c", "d", "e", "f", "g", "h"],
);
let df = DataFrame::new(vec![a.into_series(), b.into_series(), c.into_series()])?;
let df = DataFrame::new(vec![
a.into_series().into(),
b.into_series().into(),
c.into_series().into(),
])?;

let out = df.sort(["a", "b", "c"], SortMultipleOptions::default())?;
assert_eq!(
Vec::from(out.column("b")?.i64()?),
Vec::from(out.column("b")?.as_series().unwrap().i64()?),
&[
Some(0),
Some(2),
Expand All @@ -905,7 +911,7 @@ mod test {
)
.into_series();
let b = Int32Chunked::new(PlSmallStr::from_static("b"), &[5, 4, 2, 3, 4, 5]).into_series();
let df = DataFrame::new(vec![a, b])?;
let df = DataFrame::new(vec![a.into(), b.into()])?;

let out = df.sort(["a", "b"], SortMultipleOptions::default())?;
let expected = df!(
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/ops/zip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ impl ChunkZip<StructType> for StructChunked {
.map(|(lhs, rhs)| lhs.zip_with_same_type(&mask, &rhs))
.collect::<PolarsResult<Vec<_>>>()?;

let mut out = StructChunked::from_series(self.name().clone(), &fields)?;
let mut out = StructChunked::from_series(self.name().clone(), fields.iter())?;

// Zip the validities.
if (l.null_count + r.null_count) > 0 {
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/chunked_array/random.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ impl DataFrame {
match n.get(0) {
Some(n) => self.sample_n_literal(n as usize, with_replacement, shuffle, seed),
None => {
let new_cols = self.columns.iter().map(Series::clear).collect_trusted();
let new_cols = self.columns.iter().map(Column::clear).collect_trusted();
Ok(unsafe { DataFrame::new_no_checks(new_cols) })
},
}
Expand Down Expand Up @@ -238,7 +238,7 @@ impl DataFrame {
self.sample_n_literal(n, with_replacement, shuffle, seed)
},
None => {
let new_cols = self.columns.iter().map(Series::clear).collect_trusted();
let new_cols = self.columns.iter().map(Column::clear).collect_trusted();
Ok(unsafe { DataFrame::new_no_checks(new_cols) })
},
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/struct_/frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ use crate::prelude::StructChunked;

impl DataFrame {
pub fn into_struct(self, name: PlSmallStr) -> StructChunked {
StructChunked::from_series(name, &self.columns).expect("same invariants")
StructChunked::from_columns(name, &self.columns).expect("same invariants")
}
}
Loading

0 comments on commit 962b576

Please sign in to comment.