Skip to content

Commit a417f01

Browse files
committed
Write Bloom filters between row groups instead of the end
This allows Bloom filters to not be saved in memory, which can save significant space when writing long files
1 parent 759767b commit a417f01

File tree

2 files changed

+11
-5
lines changed

2 files changed

+11
-5
lines changed

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,8 @@ impl<W: Write + Send> ArrowWriter<W> {
259259
for chunk in in_progress.close()? {
260260
chunk.append_to_row_group(&mut row_group_writer)?;
261261
}
262-
row_group_writer.close()?;
262+
let row_group_metadata = row_group_writer.close()?;
263+
self.writer.write_bloom_filters(&mut [row_group_metadata.to_thrift()])?;
263264
Ok(())
264265
}
265266

parquet/src/file/writer.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -274,13 +274,19 @@ impl<W: Write + Send> SerializedFileWriter<W> {
274274
}
275275

276276
/// Serialize all the bloom filter to the file
277-
fn write_bloom_filters(&mut self, row_groups: &mut [RowGroup]) -> Result<()> {
277+
pub fn write_bloom_filters(&mut self, row_groups: &mut [RowGroup]) -> Result<()> {
278278
// iter row group
279279
// iter each column
280280
// write bloom filter to the file
281-
for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() {
281+
for row_group in row_groups.iter_mut() {
282+
let row_group_idx: u16 = row_group
283+
.ordinal
284+
.expect("Missing row group ordinal")
285+
.try_into()
286+
.expect("Negative row group ordinal");
287+
let row_group_idx = row_group_idx as usize;
282288
for (column_idx, column_chunk) in row_group.columns.iter_mut().enumerate() {
283-
match &self.bloom_filters[row_group_idx][column_idx] {
289+
match self.bloom_filters[row_group_idx][column_idx].take() {
284290
Some(bloom_filter) => {
285291
let start_offset = self.buf.bytes_written();
286292
bloom_filter.write(&mut self.buf)?;
@@ -338,7 +344,6 @@ impl<W: Write + Send> SerializedFileWriter<W> {
338344
.map(|v| v.to_thrift())
339345
.collect::<Vec<_>>();
340346

341-
self.write_bloom_filters(&mut row_groups)?;
342347
// Write column indexes and offset indexes
343348
self.write_column_indexes(&mut row_groups)?;
344349
self.write_offset_indexes(&mut row_groups)?;

0 commit comments

Comments
 (0)