From cb40bbd30e69f06ed6b0b4f87b2409b0f4c9846c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 28 May 2024 20:18:31 +0200 Subject: [PATCH] fix: Don't panic on hashing nested list types (#16555) --- crates/polars-core/src/series/mod.rs | 11 ++++++++--- crates/polars-plan/src/logical_plan/lit.rs | 10 +++++++--- py-polars/tests/unit/test_cse.py | 6 ++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index d0e6d4ffae53..687f4544b4d4 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -146,9 +146,14 @@ impl Hash for Wrap { fn hash(&self, state: &mut H) { let rs = RandomState::with_seeds(0, 0, 0, 0); let mut h = vec![]; - self.0.vec_hash(rs, &mut h).unwrap(); - let h = h.into_iter().fold(0, |a: u64, b| a.wrapping_add(b)); - h.hash(state) + if self.0.vec_hash(rs, &mut h).is_ok() { + let h = h.into_iter().fold(0, |a: u64, b| a.wrapping_add(b)); + h.hash(state) + } else { + self.len().hash(state); + self.null_count().hash(state); + self.dtype().hash(state); + } } } diff --git a/crates/polars-plan/src/logical_plan/lit.rs b/crates/polars-plan/src/logical_plan/lit.rs index 67dfb72528d8..9cdebad91911 100644 --- a/crates/polars-plan/src/logical_plan/lit.rs +++ b/crates/polars-plan/src/logical_plan/lit.rs @@ -4,6 +4,7 @@ use std::hash::{Hash, Hasher}; use polars_core::export::chrono::{Duration as ChronoDuration, NaiveDate, NaiveDateTime}; use polars_core::prelude::*; use polars_core::utils::materialize_dyn_int; +use polars_utils::hashing::hash_to_partition; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -453,9 +454,12 @@ impl Hash for LiteralValue { let len = s.len(); len.hash(state); s.null_count().hash(state); - // Hash 5 first values. Still a poor hash, but it removes the pathological clashes. - for i in 0..std::cmp::min(5, len) { - s.get(i).unwrap().hash(state); + const RANDOM: u64 = 0x2c194fa5df32a367; + let mut rng = (len as u64) ^ RANDOM; + for _ in 0..5 { + let idx = hash_to_partition(rng, len); + s.get(idx).unwrap().hash(state); + rng = rng.rotate_right(17).wrapping_add(RANDOM); } }, LiteralValue::Range { diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index 833d73dad470..8e8f8057cd35 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -768,3 +768,9 @@ def test_cse_series_collision_16138(capfd: Any, monkeypatch: Any) -> None: } captured = capfd.readouterr().err assert "3 CSE" in captured + + +def test_nested_cache_no_panic_16553() -> None: + assert pl.LazyFrame().select(a=[[[1]]]).collect(comm_subexpr_elim=True).to_dict( + as_series=False + ) == {"a": [[[[1]]]]}