Skip to content

Commit 447681c

Browse files
Revert "feat(rust): Add RLE to RLE_DICTIONARY encoder (#15959)"
This reverts commit 6730a72.
1 parent acb601d commit 447681c

File tree

14 files changed: 136 additions (+136) and 274 deletions (-274).

crates/polars-arrow/src/compute/cast/binary_to.rs

-1
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,6 @@ pub fn binary_to_dictionary<O: Offset, K: DictionaryKey>(
139139
from: &BinaryArray<O>,
140140
) -> PolarsResult<DictionaryArray<K>> {
141141
let mut array = MutableDictionaryArray::<K, MutableBinaryArray<O>>::new();
142-
array.reserve(from.len());
143142
array.try_extend(from.iter())?;
144143

145144
Ok(array.into())

crates/polars-arrow/src/compute/cast/binview_to.rs

-2
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,6 @@ pub(super) fn binview_to_dictionary<K: DictionaryKey>(
2121
from: &BinaryViewArray,
2222
) -> PolarsResult<DictionaryArray<K>> {
2323
let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<[u8]>>::new();
24-
array.reserve(from.len());
2524
array.try_extend(from.iter())?;
2625

2726
Ok(array.into())
@@ -31,7 +30,6 @@ pub(super) fn utf8view_to_dictionary<K: DictionaryKey>(
3130
from: &Utf8ViewArray,
3231
) -> PolarsResult<DictionaryArray<K>> {
3332
let mut array = MutableDictionaryArray::<K, MutableBinaryViewArray<str>>::new();
34-
array.reserve(from.len());
3533
array.try_extend(from.iter())?;
3634

3735
Ok(array.into())

crates/polars-arrow/src/compute/cast/primitive_to.rs

-1
Original file line number | Diff line number | Diff line change
@@ -318,7 +318,6 @@ pub fn primitive_to_dictionary<T: NativeType + Eq + Hash, K: DictionaryKey>(
318318
let mut array = MutableDictionaryArray::<K, _>::try_empty(MutablePrimitiveArray::<T>::from(
319319
from.data_type().clone(),
320320
))?;
321-
array.reserve(from.len());
322321
array.try_extend(iter)?;
323322

324323
Ok(array.into())

crates/polars-arrow/src/compute/cast/utf8_to.rs

-1
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@ pub fn utf8_to_dictionary<O: Offset, K: DictionaryKey>(
2727
from: &Utf8Array<O>,
2828
) -> PolarsResult<DictionaryArray<K>> {
2929
let mut array = MutableDictionaryArray::<K, MutableUtf8Array<O>>::new();
30-
array.reserve(from.len());
3130
array.try_extend(from.iter())?;
3231

3332
Ok(array.into())

crates/polars-io/src/parquet/write/writer.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ where
102102
WriteOptions {
103103
write_statistics: self.statistics,
104104
compression: self.compression,
105-
version: Version::V1,
105+
version: Version::V2,
106106
data_pagesize_limit: self.data_page_size,
107107
}
108108
}

crates/polars-parquet/src/arrow/write/dictionary.rs

+35 -6
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
use arrow::array::{Array, BinaryViewArray, DictionaryArray, DictionaryKey, Utf8ViewArray};
22
use arrow::bitmap::{Bitmap, MutableBitmap};
33
use arrow::datatypes::{ArrowDataType, IntegerType};
4+
use num_traits::ToPrimitive;
45
use polars_error::{polars_bail, PolarsResult};
56

67
use super::binary::{
@@ -15,19 +16,23 @@ use super::primitive::{
1516
use super::{binview, nested, Nested, WriteOptions};
1617
use crate::arrow::read::schema::is_nullable;
1718
use crate::arrow::write::{slice_nested_leaf, utils};
18-
use crate::parquet::encoding::hybrid_rle::encode;
19+
use crate::parquet::encoding::hybrid_rle::encode_u32;
1920
use crate::parquet::encoding::Encoding;
2021
use crate::parquet::page::{DictPage, Page};
2122
use crate::parquet::schema::types::PrimitiveType;
2223
use crate::parquet::statistics::{serialize_statistics, ParquetStatistics};
23-
use crate::write::DynIter;
24+
use crate::write::{to_nested, DynIter, ParquetType};
2425

2526
pub(crate) fn encode_as_dictionary_optional(
2627
array: &dyn Array,
27-
nested: &[Nested],
2828
type_: PrimitiveType,
2929
options: WriteOptions,
3030
) -> Option<PolarsResult<DynIter<'static, PolarsResult<Page>>>> {
31+
let nested = to_nested(array, &ParquetType::PrimitiveType(type_.clone()))
32+
.ok()?
33+
.pop()
34+
.unwrap();
35+
3136
let dtype = Box::new(array.data_type().clone());
3237

3338
let len_before = array.len();
@@ -47,11 +52,35 @@ pub(crate) fn encode_as_dictionary_optional(
4752
if (array.values().len() as f64) / (len_before as f64) > 0.75 {
4853
return None;
4954
}
55+
if array.values().len().to_u16().is_some() {
56+
let array = arrow::compute::cast::cast(
57+
array,
58+
&ArrowDataType::Dictionary(
59+
IntegerType::UInt16,
60+
Box::new(array.values().data_type().clone()),
61+
false,
62+
),
63+
Default::default(),
64+
)
65+
.unwrap();
66+
67+
let array = array
68+
.as_any()
69+
.downcast_ref::<DictionaryArray<u16>>()
70+
.unwrap();
71+
return Some(array_to_pages(
72+
array,
73+
type_,
74+
&nested,
75+
options,
76+
Encoding::RleDictionary,
77+
));
78+
}
5079

5180
Some(array_to_pages(
5281
array,
5382
type_,
54-
nested,
83+
&nested,
5584
options,
5685
Encoding::RleDictionary,
5786
))
@@ -87,15 +116,15 @@ fn serialize_keys_values<K: DictionaryKey>(
87116
buffer.push(num_bits as u8);
88117

89118
// followed by the encoded indices.
90-
Ok(encode::<u32, _, _>(buffer, keys, num_bits)?)
119+
Ok(encode_u32(buffer, keys, num_bits)?)
91120
} else {
92121
let num_bits = utils::get_bit_width(keys.clone().max().unwrap_or(0) as u64);
93122

94123
// num_bits as a single byte
95124
buffer.push(num_bits as u8);
96125

97126
// followed by the encoded indices.
98-
Ok(encode::<u32, _, _>(buffer, keys, num_bits)?)
127+
Ok(encode_u32(buffer, keys, num_bits)?)
99128
}
100129
}
101130

crates/polars-parquet/src/arrow/write/mod.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,7 @@ pub fn array_to_pages(
219219
// Only take this path for primitive columns
220220
if matches!(nested.first(), Some(Nested::Primitive(_, _, _))) {
221221
if let Some(result) =
222-
encode_as_dictionary_optional(primitive_array, nested, type_.clone(), options)
222+
encode_as_dictionary_optional(primitive_array, type_.clone(), options)
223223
{
224224
return result;
225225
}

crates/polars-parquet/src/arrow/write/nested/mod.rs

+5 -5
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ use polars_error::PolarsResult;
66
pub use rep::num_values;
77

88
use super::Nested;
9-
use crate::parquet::encoding::hybrid_rle::encode;
9+
use crate::parquet::encoding::hybrid_rle::encode_u32;
1010
use crate::parquet::read::levels::get_bit_width;
1111
use crate::parquet::write::Version;
1212

@@ -41,12 +41,12 @@ fn write_rep_levels(buffer: &mut Vec<u8>, nested: &[Nested], version: Version) -
4141
match version {
4242
Version::V1 => {
4343
write_levels_v1(buffer, |buffer: &mut Vec<u8>| {
44-
encode::<u32, _, _>(buffer, levels, num_bits)?;
44+
encode_u32(buffer, levels, num_bits)?;
4545
Ok(())
4646
})?;
4747
},
4848
Version::V2 => {
49-
encode::<u32, _, _>(buffer, levels, num_bits)?;
49+
encode_u32(buffer, levels, num_bits)?;
5050
},
5151
}
5252

@@ -65,10 +65,10 @@ fn write_def_levels(buffer: &mut Vec<u8>, nested: &[Nested], version: Version) -
6565

6666
match version {
6767
Version::V1 => write_levels_v1(buffer, move |buffer: &mut Vec<u8>| {
68-
encode::<u32, _, _>(buffer, levels, num_bits)?;
68+
encode_u32(buffer, levels, num_bits)?;
6969
Ok(())
7070
}),
71-
Version::V2 => Ok(encode::<u32, _, _>(buffer, levels, num_bits)?),
71+
Version::V2 => Ok(encode_u32(buffer, levels, num_bits)?),
7272
}
7373
}
7474

crates/polars-parquet/src/arrow/write/utils.rs

+3 -3
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ use polars_error::*;
44

55
use super::{Version, WriteOptions};
66
use crate::parquet::compression::CompressionOptions;
7-
use crate::parquet::encoding::hybrid_rle::encode;
7+
use crate::parquet::encoding::hybrid_rle::encode_bool;
88
use crate::parquet::encoding::Encoding;
99
use crate::parquet::metadata::Descriptor;
1010
use crate::parquet::page::{DataPage, DataPageHeader, DataPageHeaderV1, DataPageHeaderV2};
@@ -14,7 +14,7 @@ use crate::parquet::statistics::ParquetStatistics;
1414
fn encode_iter_v1<I: Iterator<Item = bool>>(buffer: &mut Vec<u8>, iter: I) -> PolarsResult<()> {
1515
buffer.extend_from_slice(&[0; 4]);
1616
let start = buffer.len();
17-
encode::<bool, _, _>(buffer, iter, 1)?;
17+
encode_bool(buffer, iter)?;
1818
let end = buffer.len();
1919
let length = end - start;
2020

@@ -25,7 +25,7 @@ fn encode_iter_v1<I: Iterator<Item = bool>>(buffer: &mut Vec<u8>, iter: I) -> Po
2525
}
2626

2727
fn encode_iter_v2<I: Iterator<Item = bool>>(writer: &mut Vec<u8>, iter: I) -> PolarsResult<()> {
28-
Ok(encode::<bool, _, _>(writer, iter, 1)?)
28+
Ok(encode_bool(writer, iter)?)
2929
}
3030

3131
fn encode_iter<I: Iterator<Item = bool>>(

0 commit comments

Comments
 (0)