Skip to content

Commit acdd27a

Browse files
adriangbetseidlalamb
authored
Fix writing of invalid Parquet ColumnIndex when row group contains null pages (#6319)
* Fix writing of invalid Parquet ColumnIndex when row group contains null pages Co-authored-by: Ed Seidl <[email protected]> * fix lint * more rusty Co-authored-by: Ed Seidl <[email protected]> * re-enable tests --------- Co-authored-by: Ed Seidl <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
1 parent 69e5e5f commit acdd27a

File tree

2 files changed

+73
-14
lines changed

2 files changed

+73
-14
lines changed

parquet/src/file/metadata/writer.rs

Lines changed: 69 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -450,19 +450,57 @@ mod tests {
450450
true,
451451
)]));
452452

453-
let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(2)]));
453+
// build row groups / pages that exercise different combinations of nulls and values
454+
// note that below we set the row group and page sizes to 4 and 2 respectively
455+
// so that these "groupings" make sense
456+
let a: ArrayRef = Arc::new(Int32Array::from(vec![
457+
// a row group that has all values
458+
Some(i32::MIN),
459+
Some(-1),
460+
Some(1),
461+
Some(i32::MAX),
462+
// a row group with a page of all nulls and a page of all values
463+
None,
464+
None,
465+
Some(2),
466+
Some(3),
467+
// a row group that has all null pages
468+
None,
469+
None,
470+
None,
471+
None,
472+
// a row group having 1 page with all values and 1 page with some nulls
473+
Some(4),
474+
Some(5),
475+
None,
476+
Some(6),
477+
// a row group having 1 page with all nulls and 1 page with some nulls
478+
None,
479+
None,
480+
Some(7),
481+
None,
482+
// a row group having all pages with some nulls
483+
None,
484+
Some(8),
485+
Some(9),
486+
None,
487+
]));
454488

455489
let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap();
456490

457-
let writer_props = match write_page_index {
458-
true => WriterProperties::builder()
459-
.set_statistics_enabled(EnabledStatistics::Page)
460-
.build(),
461-
false => WriterProperties::builder()
462-
.set_statistics_enabled(EnabledStatistics::Chunk)
463-
.build(),
491+
let writer_props_builder = match write_page_index {
492+
true => WriterProperties::builder().set_statistics_enabled(EnabledStatistics::Page),
493+
false => WriterProperties::builder().set_statistics_enabled(EnabledStatistics::Chunk),
464494
};
465495

496+
// tune the size or pages to the data above
497+
// to make sure we exercise code paths where all items in a page are null, etc.
498+
let writer_props = writer_props_builder
499+
.set_max_row_group_size(4)
500+
.set_data_page_row_count_limit(2)
501+
.set_write_batch_size(2)
502+
.build();
503+
466504
let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(writer_props)).unwrap();
467505
writer.write(&batch).unwrap();
468506
writer.close().unwrap();
@@ -610,6 +648,29 @@ mod tests {
610648
decoded_metadata.num_row_groups()
611649
);
612650

651+
// check that the mins and maxes are what we expect for each page
652+
// also indirectly checking that the pages were written out as we expected them to be laid out
653+
// (if they're not, or something gets refactored in the future that breaks that assumption,
654+
// this test may have to drop down to a lower level and create metadata directly instead of relying on
655+
// writing an entire file)
656+
let column_indexes = metadata.metadata.column_index().unwrap();
657+
assert_eq!(column_indexes.len(), 6);
658+
// make sure each row group has 2 pages by checking the first column
659+
// page counts for each column for each row group, should all be the same and there should be
660+
// 12 pages in total across 6 row groups / 1 column
661+
let mut page_counts = vec![];
662+
for row_group in column_indexes {
663+
for column in row_group {
664+
match column {
665+
Index::INT32(column_index) => {
666+
page_counts.push(column_index.indexes.len());
667+
}
668+
_ => panic!("unexpected column index type"),
669+
}
670+
}
671+
}
672+
assert_eq!(page_counts, vec![2; 6]);
673+
613674
metadata
614675
.metadata
615676
.row_groups()

parquet/src/file/page_index/index.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -230,16 +230,14 @@ impl<T: ParquetValueType> NativeIndex<T> {
230230
let min_values = self
231231
.indexes
232232
.iter()
233-
.map(|x| x.min_bytes().map(|x| x.to_vec()))
234-
.collect::<Option<Vec<_>>>()
235-
.unwrap_or_else(|| vec![vec![]; self.indexes.len()]);
233+
.map(|x| x.min_bytes().unwrap_or(&[]).to_vec())
234+
.collect::<Vec<_>>();
236235

237236
let max_values = self
238237
.indexes
239238
.iter()
240-
.map(|x| x.max_bytes().map(|x| x.to_vec()))
241-
.collect::<Option<Vec<_>>>()
242-
.unwrap_or_else(|| vec![vec![]; self.indexes.len()]);
239+
.map(|x| x.max_bytes().unwrap_or(&[]).to_vec())
240+
.collect::<Vec<_>>();
243241

244242
let null_counts = self
245243
.indexes

0 commit comments

Comments
 (0)