diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index 65706ff40b52..5d3a277651dd 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,6 +1,5 @@ use arrow::bitmap::MutableBitmap; use arrow::compute::cast::utf8view_to_utf8; -#[cfg(feature = "dtype-array")] use arrow::compute::take::take_unchecked; use polars_utils::vec::PushUnchecked; @@ -25,7 +24,7 @@ impl ChunkExplode for ListChunked { let offsets = listarr.offsets().as_slice(); let mut values = listarr.values().clone(); - let mut s = if ca._can_fast_explode() { + let (mut s, offsets) = if ca._can_fast_explode() { // ensure that the value array is sliced // as a list only slices its offsets on a slice operation @@ -39,13 +38,16 @@ impl ChunkExplode for ListChunked { values = unsafe { values.sliced_unchecked(start, len) }; } // SAFETY: inner_dtype should be correct - unsafe { - Series::from_chunks_and_dtype_unchecked( - self.name(), - vec![values], - &self.inner_dtype().to_physical(), - ) - } + ( + unsafe { + Series::from_chunks_and_dtype_unchecked( + self.name(), + vec![values], + &self.inner_dtype().to_physical(), + ) + }, + offsets_buf, + ) } else { // during tests // test that this code branch is not hit with list arrays that could be fast exploded @@ -63,16 +65,61 @@ impl ChunkExplode for ListChunked { panic!("could have fast exploded") } } - - // SAFETY: inner_dtype should be correct - let values = unsafe { - Series::from_chunks_and_dtype_unchecked( - self.name(), - vec![values], - &self.inner_dtype().to_physical(), - ) - }; - values.explode_by_offsets(offsets) + if listarr.null_count() == 0 { + // SAFETY: inner_dtype should be correct + let values = unsafe { + Series::from_chunks_and_dtype_unchecked( + self.name(), + vec![values], + &self.inner_dtype().to_physical(), + ) + }; + (values.explode_by_offsets(offsets), offsets_buf) + } else { + // we have already ensure that validity is not none. + let validity = listarr.validity().unwrap(); + + let mut indices = + MutablePrimitiveArray::::with_capacity(*offsets_buf.last() as usize); + let mut new_offsets = Vec::with_capacity(listarr.len() + 1); + let mut current_offset = 0i64; + let mut iter = offsets.iter(); + if let Some(mut previous) = iter.next().copied() { + new_offsets.push(current_offset); + iter.enumerate().for_each(|(i, &offset)| { + let len = offset - previous; + let start = previous as IdxSize; + let end = offset as IdxSize; + // SAFETY: we are within bounds + if unsafe { validity.get_bit_unchecked(i) } { + // explode expects null value if sublist is empty. + if len == 0 { + indices.push_null(); + } else { + indices.extend_trusted_len_values(start..end); + } + current_offset += len; + } else { + indices.push_null(); + } + previous = offset; + new_offsets.push(current_offset); + }) + } + // SAFETY: the indices we generate are in bounds + let chunk = unsafe { take_unchecked(values.as_ref(), &indices.into()) }; + // SAFETY: inner_dtype should be correct + let s = unsafe { + Series::from_chunks_and_dtype_unchecked( + self.name(), + vec![chunk], + &self.inner_dtype().to_physical(), + ) + }; + // SAFETY: monotonically increasing + let new_offsets = unsafe { OffsetsBuffer::new_unchecked(new_offsets.into()) }; + (s, new_offsets) + } }; debug_assert_eq!(s.name(), self.name()); // restore logical type @@ -80,7 +127,7 @@ impl ChunkExplode for ListChunked { s = s.cast_unchecked(&self.inner_dtype()).unwrap(); } - Ok((s, offsets_buf)) + Ok((s, offsets)) } } diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 50e52e5ed3b2..5936c729873a 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -416,3 +416,31 @@ def test_df_explode_with_array() -> None: }, ) assert_frame_equal(df.explode("arr", "list"), expected_by_arr_and_list) + + +def test_explode_nullable_list() -> None: + df = pl.DataFrame({"layout1": [None, [1, 2]], "b": [False, True]}).with_columns( + layout2=pl.when(pl.col("b")).then([1, 2]), + ) + + explode_df = df.explode("layout1", "layout2") + expected_df = pl.DataFrame( + { + "layout1": [None, 1, 2], + "b": [False, True, True], + "layout2": [None, 1, 2], + } + ) + assert_frame_equal(explode_df, expected_df) + + explode_expr = df.select( + pl.col("layout1").explode(), + pl.col("layout2").explode(), + ) + expected_df = pl.DataFrame( + { + "layout1": [None, 1, 2], + "layout2": [None, 1, 2], + } + ) + assert_frame_equal(explode_expr, expected_df)