diff --git a/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs b/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs index 6b8f0f65eb64..99d414d13440 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/string_cache.rs @@ -5,6 +5,7 @@ use std::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; use hashbrown::hash_table::Entry; use hashbrown::HashTable; use once_cell::sync::Lazy; +use polars_error::{polars_ensure, PolarsResult}; use polars_utils::aliases::PlRandomState; use polars_utils::pl_str::PlSmallStr; @@ -190,8 +191,9 @@ impl SCacheInner { } #[inline] - pub(crate) fn get_current_payloads(&self) -> &[PlSmallStr] { - &self.payloads + pub(crate) fn get_current_payloads(&self, uuid: u32) -> PolarsResult<&[PlSmallStr]> { + polars_ensure!(self.uuid == uuid, ComputeError: "trying to perform operations with values from different global string cache version"); + Ok(&self.payloads) } } diff --git a/crates/polars-core/src/chunked_array/ops/row_encode.rs b/crates/polars-core/src/chunked_array/ops/row_encode.rs index e60cab0de122..4c8436d194ae 100644 --- a/crates/polars-core/src/chunked_array/ops/row_encode.rs +++ b/crates/polars-core/src/chunked_array/ops/row_encode.rs @@ -1,4 +1,5 @@ use arrow::compute::utils::combine_validities_and_many; +use arrow::legacy::trusted_len::TrustedLenPush; use polars_row::{ convert_columns, RowEncodingCategoricalContext, RowEncodingContext, RowEncodingOptions, RowsEncoded, @@ -70,7 +71,7 @@ pub fn encode_rows_vertical_par_unordered_broadcast_nulls( )) } -pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option { +pub fn get_row_encoding_dictionary(dtype: &DataType) -> PolarsResult> { match dtype { DataType::Boolean | DataType::UInt8 @@ -91,7 +92,7 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option None, + | DataType::Duration(_) => Ok(None), DataType::Unknown(_) => panic!("Unsupported in row encoding"), @@ -100,7 +101,7 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option { - Some(RowEncodingContext::Decimal(precision.unwrap_or(38))) + Ok(Some(RowEncodingContext::Decimal(precision.unwrap_or(38)))) }, #[cfg(feature = "dtype-array")] @@ -111,24 +112,25 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option { + RevMapping::Global(map, _, uuid) => { let num_known_categories = map.keys().max().copied().map_or(0, |m| m + 1); // @TODO: This should probably be cached. - let lexical_sort_idxs = - matches!(ordering, CategoricalOrdering::Lexical).then(|| { - let read_map = crate::STRING_CACHE.read_map(); - let payloads = read_map.get_current_payloads(); - assert!(payloads.len() >= num_known_categories as usize); - - let mut idxs = (0..num_known_categories).collect::>(); - idxs.sort_by_key(|&k| payloads[k as usize].as_str()); - let mut sort_idxs = vec![0; num_known_categories as usize]; - for (i, idx) in idxs.into_iter().enumerate_u32() { - sort_idxs[idx as usize] = i; - } - sort_idxs - }); + let lexical_sort_idxs = if matches!(ordering, CategoricalOrdering::Lexical) { + let read_map = crate::STRING_CACHE.read_map(); + let payloads = read_map.get_current_payloads(*uuid)?; + assert!(payloads.len() >= num_known_categories as usize); + + let mut idxs = (0..num_known_categories).collect::>(); + idxs.sort_by_key(|&k| payloads[k as usize].as_str()); + let mut sort_idxs = vec![0; num_known_categories as usize]; + for (i, idx) in idxs.into_iter().enumerate_u32() { + sort_idxs[idx as usize] = i; + } + Some(sort_idxs) + } else { + None + }; (num_known_categories, lexical_sort_idxs) }, @@ -157,14 +159,14 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option { let mut out = Vec::new(); for (i, f) in fs.iter().enumerate() { - if let Some(dict) = get_row_encoding_dictionary(f.dtype()) { + if let Some(dict) = get_row_encoding_dictionary(f.dtype())? { out.reserve(fs.len()); out.extend(std::iter::repeat_n(None, i)); out.push(Some(dict)); @@ -173,16 +175,18 @@ pub fn get_row_encoding_dictionary(dtype: &DataType) -> Option PolarsResult { let by = by.as_materialized_series(); let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed(); let opt = RowEncodingOptions::new_unsorted(); - let dict = get_row_encoding_dictionary(by.dtype()); + let dict = get_row_encoding_dictionary(by.dtype())?; cols.push(arr); opts.push(opt); @@ -241,7 +245,7 @@ pub fn _get_rows_encoded( let by = by.as_materialized_series(); let arr = by.to_physical_repr().rechunk().chunks()[0].to_boxed(); let opt = RowEncodingOptions::new_sorted(*desc, *null_last); - let dict = get_row_encoding_dictionary(by.dtype()); + let dict = get_row_encoding_dictionary(by.dtype())?; cols.push(arr); opts.push(opt); diff --git a/crates/polars-expr/src/groups/row_encoded.rs b/crates/polars-expr/src/groups/row_encoded.rs index 1b90a8254bbe..30cebe1180b0 100644 --- a/crates/polars-expr/src/groups/row_encoded.rs +++ b/crates/polars-expr/src/groups/row_encoded.rs @@ -42,7 +42,8 @@ impl RowEncodedHashGrouper { let dicts = self .key_schema .iter() - .map(|(_, dt)| get_row_encoding_dictionary(dt)) + // @TODO: Get this unwrap out of here. That is a bit difficult to do. + .map(|(_, dt)| get_row_encoding_dictionary(dt).unwrap()) .collect::>(); let fields = vec![RowEncodingOptions::new_unsorted(); key_dtypes.len()]; let key_columns = diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs index e5e506c5ac04..1615eaac2f96 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs @@ -77,7 +77,7 @@ impl Eval { let mut dicts = Vec::with_capacity(self.key_columns_expr.len()); for phys_e in self.key_columns_expr.iter() { let s = phys_e.evaluate(chunk, &context.execution_state)?; - dicts.push(get_row_encoding_dictionary(s.dtype())); + dicts.push(get_row_encoding_dictionary(s.dtype())?); let s = s.to_physical_repr().into_owned(); let s = prepare_key(&s, chunk); keys_columns.push(s.to_arrow(0, CompatLevel::newest())); diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 5c17243c4475..61c6c4a39b80 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -263,6 +263,8 @@ impl AggHashTable { .iter_values() .take(self.num_keys) .map(get_row_encoding_dictionary) + // @TODO: get this unwrap out of here. + .map(|v| v.unwrap()) .collect::>(); let fields = vec![Default::default(); self.num_keys]; let key_columns = diff --git a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs index f9a3e901b1a0..c7686ac9b4c3 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/generic_build.rs @@ -142,7 +142,7 @@ impl GenericBuild { let s = phys_e.evaluate(chunk, &context.execution_state)?; let arr = s.to_physical_repr().rechunk().array_ref(0).clone(); self.join_columns.push(arr); - dicts.push(get_row_encoding_dictionary(s.dtype())); + dicts.push(get_row_encoding_dictionary(s.dtype())?); } let rows_encoded = polars_row::convert_columns_no_order( self.join_columns[0].len(), // @NOTE: does not work for ZFS diff --git a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs index 73287f518555..71349b6152ac 100644 --- a/crates/polars-pipe/src/executors/sinks/joins/row_values.rs +++ b/crates/polars-pipe/src/executors/sinks/joins/row_values.rs @@ -60,7 +60,7 @@ impl RowValues { names.push(s.name().to_string()); } self.join_columns_material.push(s.array_ref(0).clone()); - dicts.push(get_row_encoding_dictionary(s.dtype())); + dicts.push(get_row_encoding_dictionary(s.dtype())?); } // We determine the indices of the columns that have to be removed diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index 175a6b707430..2ad333919ce8 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -143,7 +143,7 @@ impl SortSinkMultiple { let (_, dtype) = schema.get_at_index(*i).unwrap(); get_row_encoding_dictionary(dtype) }) - .collect::>(); + .collect::>>()?; polars_ensure!(sort_idx.iter().collect::>().len() == sort_idx.len(), ComputeError: "only supports sorting by unique columns"); diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index a3a80505596f..4aec0724a322 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -550,7 +550,8 @@ impl PySeries { let dicts = dtypes .iter() .map(|(_, dtype)| get_row_encoding_dictionary(&dtype.0)) - .collect::>(); + .collect::>>() + .map_err(PyPolarsErr::from)?; // Get the BinaryOffset array. let arr = self.series.rechunk(); diff --git a/py-polars/tests/unit/test_row_encoding.py b/py-polars/tests/unit/test_row_encoding.py index 89363c295b6b..2fb119214356 100644 --- a/py-polars/tests/unit/test_row_encoding.py +++ b/py-polars/tests/unit/test_row_encoding.py @@ -373,3 +373,14 @@ def test_null( .to_series(), s, ) + + +def test_wrong_version_categorical_20364() -> None: + with pl.StringCache(): + s = pl.Series("s", ["a"], pl.Categorical(ordering="lexical")) + + with pytest.raises( + pl.exceptions.ComputeError, + match="trying to perform operations with values from different global string cache version", + ): + s.to_frame()._row_encode([(False, False, False)])