Skip to content

Commit

Permalink
fix: Properly split struct columns
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 29, 2024
1 parent 4dc17d9 commit 9c3ae21
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 39 deletions.
13 changes: 1 addition & 12 deletions crates/polars-core/src/frame/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,7 @@ impl DataFrame {
let columns = self
.get_columns()
.iter()
.map(|s| match s.dtype() {
#[cfg(feature = "dtype-struct")]
DataType::Struct(_) => {
let mut ca = s.struct_().unwrap().clone();
for field in ca.fields_mut().iter_mut() {
*field = field.replace_with_chunk(field.chunks()[i].clone())
}
ca.update_chunks(0);
ca.into_series()
},
_ => s.replace_with_chunk(s.chunks()[i].clone()),
})
.map(|s| s.select_chunk(i))
.collect::<Vec<_>>();

DataFrame::new_no_checks(columns)
Expand Down
7 changes: 4 additions & 3 deletions crates/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,9 +452,10 @@ impl DataFrame {
/// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
/// This may lead to more peak memory consumption.
pub fn as_single_chunk_par(&mut self) -> &mut Self {
if self.columns.iter().any(|s| s.n_chunks() > 1) {
self.columns = self._apply_columns_par(&|s| s.rechunk());
}
self.as_single_chunk();
// if self.columns.iter().any(|s| s.n_chunks() > 1) {
// self.columns = self._apply_columns_par(&|s| s.rechunk());
// }
self
}

Expand Down
47 changes: 23 additions & 24 deletions crates/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,30 +194,29 @@ impl Series {
ca.chunks_mut()
}

/// Create a `Series` of the same data type with all chunks replaced.
/// # Safety
/// These chunks should align with the data-type
pub unsafe fn replace_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
let mut new = self.clear();
// Assign mut so we go through arc only once.
let mut_new = new._get_inner_mut();
*mut_new.chunks_mut() = chunks;
mut_new.compute_len();
new
}

/// Create a `Series` of the same data type with all chunks replaced.
/// # Safety
/// This chunk should align with the data-type
pub unsafe fn replace_with_chunk(&self, chunk: ArrayRef) -> Self {
let mut new = self.clear();
// Assign mut so we go through arc only once.
let mut_new = new._get_inner_mut();
let chunks = mut_new.chunks_mut();
chunks.clear();
chunks.push(chunk);
mut_new.compute_len();
new
pub fn select_chunk(&self, i: usize) -> Self {
match self.dtype() {
#[cfg(feature = "dtype-struct")]
DataType::Struct(_) => {
let mut ca = self.struct_().unwrap().clone();
for field in ca.fields_mut().iter_mut() {
*field = field.select_chunk(i)
}
ca.update_chunks(0);
ca.into_series()
},
_ => {
let mut new = self.clear();
// Assign mut so we go through arc only once.
let mut_new = new._get_inner_mut();
let chunks = unsafe { mut_new.chunks_mut() };
let chunk = self.chunks()[i].clone();
chunks.clear();
chunks.push(chunk);
mut_new.compute_len();
new
},
}
}

pub fn is_sorted_flag(&self) -> IsSorted {
Expand Down
7 changes: 7 additions & 0 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,3 +927,10 @@ def test_struct_filter_chunked_16498() -> None:

def test_struct_field_dynint_nullable_16243() -> None:
pl.select(pl.lit(None).fill_null(pl.struct(42)))


def test_struct_split_16536() -> None:
df = pl.DataFrame({"struct": [{"a": {"a": {"a": 1}}}], "list": [[1]], "int": [1]})

df = pl.concat([df, df, df, df], rechunk=False)
assert df.filter(pl.col("int") == 1).shape == (4, 3)

0 comments on commit 9c3ae21

Please sign in to comment.