Skip to content

Commit ce8363a

Browse files
authored
Set the default size of BitWriter for DeltaBitPackEncoder to 1MB (#5776)
1 parent c498eb7 commit ce8363a

File tree

2 files changed

+10
-21
lines changed

2 files changed

+10
-21
lines changed

parquet/src/encodings/encoding/mod.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -249,7 +249,7 @@ impl<T: DataType> Encoder<T> for RleValueEncoder<T> {
249249
// DELTA_BINARY_PACKED encoding
250250

251251
const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32;
252-
const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024;
252+
const DEFAULT_BIT_WRITER_SIZE: usize = 1024 * 1024;
253253
const DEFAULT_NUM_MINI_BLOCKS: usize = 4;
254254

255255
/// Delta bit packed encoder.
@@ -313,7 +313,7 @@ impl<T: DataType> DeltaBitPackEncoder<T> {
313313

314314
DeltaBitPackEncoder {
315315
page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE),
316-
bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE),
316+
bit_writer: BitWriter::new(DEFAULT_BIT_WRITER_SIZE),
317317
total_values: 0,
318318
first_value: 0,
319319
current_value: 0, // current value to keep adding deltas

parquet/src/util/bit_util.rs

+8-19
Original file line number | Diff line number | Diff line change
@@ -172,9 +172,9 @@ pub struct BitWriter {
172172
}
173173

174174
impl BitWriter {
175-
pub fn new(max_bytes: usize) -> Self {
175+
pub fn new(initial_capacity: usize) -> Self {
176176
Self {
177-
buffer: Vec::with_capacity(max_bytes),
177+
buffer: Vec::with_capacity(initial_capacity),
178178
buffered_values: 0,
179179
bit_offset: 0,
180180
}
@@ -304,12 +304,7 @@ impl BitWriter {
304304
/// `offset + num_bytes`. Also that if size of `T` is larger than `num_bytes`, extra
305305
/// higher ordered bytes will be ignored.
306306
#[inline]
307-
pub fn put_aligned_offset<T: AsBytes>(
308-
&mut self,
309-
val: T,
310-
num_bytes: usize,
311-
offset: usize,
312-
) {
307+
pub fn put_aligned_offset<T: AsBytes>(&mut self, val: T, num_bytes: usize, offset: usize) {
313308
let slice = val.as_bytes();
314309
let len = num_bytes.min(slice.len());
315310
self.buffer[offset..offset + len].copy_from_slice(&slice[..len])
@@ -405,8 +400,8 @@ impl BitReader {
405400
self.load_buffered_values()
406401
}
407402

408-
let mut v = trailing_bits(self.buffered_values, self.bit_offset + num_bits)
409-
>> self.bit_offset;
403+
let mut v =
404+
trailing_bits(self.buffered_values, self.bit_offset + num_bits) >> self.bit_offset;
410405
self.bit_offset += num_bits;
411406

412407
if self.bit_offset >= 64 {
@@ -571,8 +566,7 @@ impl BitReader {
571566
false => num_values,
572567
};
573568

574-
let end_bit_offset =
575-
self.byte_offset * 8 + values_to_read * num_bits + self.bit_offset;
569+
let end_bit_offset = self.byte_offset * 8 + values_to_read * num_bits + self.bit_offset;
576570

577571
self.byte_offset = end_bit_offset / 8;
578572
self.bit_offset = end_bit_offset % 8;
@@ -585,11 +579,7 @@ impl BitReader {
585579
}
586580

587581
/// Reads up to `num_bytes` to `buf` returning the number of bytes read
588-
pub(crate) fn get_aligned_bytes(
589-
&mut self,
590-
buf: &mut Vec<u8>,
591-
num_bytes: usize,
592-
) -> usize {
582+
pub(crate) fn get_aligned_bytes(&mut self, buf: &mut Vec<u8>, num_bytes: usize) -> usize {
593583
// Align to byte offset
594584
self.byte_offset = self.get_byte_offset();
595585
self.bit_offset = 0;
@@ -998,8 +988,7 @@ mod tests {
998988
.collect();
999989

1000990
// Generic values used to check against actual values read from `get_batch`.
1001-
let expected_values: Vec<T> =
1002-
values.iter().map(|v| from_le_slice(v.as_bytes())).collect();
991+
let expected_values: Vec<T> = values.iter().map(|v| from_le_slice(v.as_bytes())).collect();
1003992

1004993
(0..total).for_each(|i| writer.put_value(values[i], num_bits));
1005994

0 commit comments

Comments
 (0)