Skip to content

Commit 73a0c26

Browse files
authored
Return None when Parquet page indexes are not present in file (#6639)
* return none for missing page indexes
* return option from page index read functions
* update docs
1 parent f033e4f commit 73a0c26

File tree

6 files changed

+42
-55
lines changed

6 files changed

+42
-55
lines changed

parquet/src/arrow/arrow_reader/mod.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -3584,9 +3584,7 @@ mod tests {
35843584
.unwrap();
35853585
// Although `Vec<Vec<PageLocation>>` of each row group is empty,
35863586
// we should read the file successfully.
3587-
// FIXME: this test will fail when metadata parsing returns `None` for missing page
3588-
// indexes. https://github.com/apache/arrow-rs/issues/6447
3589-
assert!(builder.metadata().offset_index().unwrap()[0].is_empty());
3587+
assert!(builder.metadata().offset_index().is_none());
35903588
let reader = builder.build().unwrap();
35913589
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
35923590
assert_eq!(batches.len(), 1);

parquet/src/arrow/arrow_writer/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1743,7 +1743,7 @@ mod tests {
17431743
"Expected a dictionary page"
17441744
);
17451745

1746-
let offset_indexes = read_offset_indexes(&file, column).unwrap();
1746+
let offset_indexes = read_offset_indexes(&file, column).unwrap().unwrap();
17471747

17481748
let page_locations = offset_indexes[0].page_locations.clone();
17491749

parquet/src/arrow/async_reader/mod.rs

+2-4
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,6 @@ mod tests {
928928
use crate::arrow::schema::parquet_to_arrow_schema_and_fields;
929929
use crate::arrow::ArrowWriter;
930930
use crate::file::metadata::ParquetMetaDataReader;
931-
use crate::file::page_index::index_reader;
932931
use crate::file::properties::WriterProperties;
933932
use arrow::compute::kernels::cmp::eq;
934933
use arrow::error::Result as ArrowResult;
@@ -1566,12 +1565,11 @@ mod tests {
15661565
let data = Bytes::from(std::fs::read(path).unwrap());
15671566

15681567
let metadata = ParquetMetaDataReader::new()
1568+
.with_page_indexes(true)
15691569
.parse_and_finish(&data)
15701570
.unwrap();
15711571

1572-
let offset_index =
1573-
index_reader::read_offset_indexes(&data, metadata.row_group(0).columns())
1574-
.expect("reading offset index");
1572+
let offset_index = metadata.offset_index().expect("reading offset index")[0].clone();
15751573

15761574
let mut metadata_builder = metadata.into_builder();
15771575
let mut row_groups = metadata_builder.take_row_groups();

parquet/src/file/metadata/reader.rs

-15
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,6 @@ impl ParquetMetaDataReader {
303303

304304
// Get bounds needed for page indexes (if any are present in the file).
305305
let Some(range) = self.range_for_page_index() else {
306-
self.empty_page_indexes();
307306
return Ok(());
308307
};
309308

@@ -477,20 +476,6 @@ impl ParquetMetaDataReader {
477476
Ok(())
478477
}
479478

480-
/// Set the column_index and offset_indexes to empty `Vec` for backwards compatibility
481-
///
482-
/// See <https://github.com/apache/arrow-rs/pull/6451> for details
483-
fn empty_page_indexes(&mut self) {
484-
let metadata = self.metadata.as_mut().unwrap();
485-
let num_row_groups = metadata.num_row_groups();
486-
if self.column_index {
487-
metadata.set_column_index(Some(vec![vec![]; num_row_groups]));
488-
}
489-
if self.offset_index {
490-
metadata.set_offset_index(Some(vec![vec![]; num_row_groups]));
491-
}
492-
}
493-
494479
fn range_for_page_index(&self) -> Option<Range<usize>> {
495480
// sanity check
496481
self.metadata.as_ref()?;

parquet/src/file/page_index/index_reader.rs

+26-22
Original file line numberDiff line numberDiff line change
@@ -43,35 +43,37 @@ pub(crate) fn acc_range(a: Option<Range<usize>>, b: Option<Range<usize>>) -> Opt
4343
///
4444
/// Returns a vector of `index[column_number]`.
4545
///
46-
/// Returns an empty vector if this row group does not contain a
47-
/// [`ColumnIndex`].
46+
/// Returns `None` if this row group does not contain a [`ColumnIndex`].
4847
///
4948
/// See [Page Index Documentation] for more details.
5049
///
5150
/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
5251
pub fn read_columns_indexes<R: ChunkReader>(
5352
reader: &R,
5453
chunks: &[ColumnChunkMetaData],
55-
) -> Result<Vec<Index>, ParquetError> {
54+
) -> Result<Option<Vec<Index>>, ParquetError> {
5655
let fetch = chunks
5756
.iter()
5857
.fold(None, |range, c| acc_range(range, c.column_index_range()));
5958

6059
let fetch = match fetch {
6160
Some(r) => r,
62-
None => return Ok(vec![Index::NONE; chunks.len()]),
61+
None => return Ok(None),
6362
};
6463

6564
let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?;
6665
let get = |r: Range<usize>| &bytes[(r.start - fetch.start)..(r.end - fetch.start)];
6766

68-
chunks
69-
.iter()
70-
.map(|c| match c.column_index_range() {
71-
Some(r) => decode_column_index(get(r), c.column_type()),
72-
None => Ok(Index::NONE),
73-
})
74-
.collect()
67+
Some(
68+
chunks
69+
.iter()
70+
.map(|c| match c.column_index_range() {
71+
Some(r) => decode_column_index(get(r), c.column_type()),
72+
None => Ok(Index::NONE),
73+
})
74+
.collect(),
75+
)
76+
.transpose()
7577
}
7678

7779
/// Reads [`OffsetIndex`], per-page [`PageLocation`] for all columns of a row
@@ -116,35 +118,37 @@ pub fn read_pages_locations<R: ChunkReader>(
116118
///
117119
/// Returns a vector of `offset_index[column_number]`.
118120
///
119-
/// Returns an empty vector if this row group does not contain an
120-
/// [`OffsetIndex`].
121+
/// Returns `None` if this row group does not contain an [`OffsetIndex`].
121122
///
122123
/// See [Page Index Documentation] for more details.
123124
///
124125
/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
125126
pub fn read_offset_indexes<R: ChunkReader>(
126127
reader: &R,
127128
chunks: &[ColumnChunkMetaData],
128-
) -> Result<Vec<OffsetIndexMetaData>, ParquetError> {
129+
) -> Result<Option<Vec<OffsetIndexMetaData>>, ParquetError> {
129130
let fetch = chunks
130131
.iter()
131132
.fold(None, |range, c| acc_range(range, c.offset_index_range()));
132133

133134
let fetch = match fetch {
134135
Some(r) => r,
135-
None => return Ok(vec![]),
136+
None => return Ok(None),
136137
};
137138

138139
let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?;
139140
let get = |r: Range<usize>| &bytes[(r.start - fetch.start)..(r.end - fetch.start)];
140141

141-
chunks
142-
.iter()
143-
.map(|c| match c.offset_index_range() {
144-
Some(r) => decode_offset_index(get(r)),
145-
None => Err(general_err!("missing offset index")),
146-
})
147-
.collect()
142+
Some(
143+
chunks
144+
.iter()
145+
.map(|c| match c.offset_index_range() {
146+
Some(r) => decode_offset_index(get(r)),
147+
None => Err(general_err!("missing offset index")),
148+
})
149+
.collect(),
150+
)
151+
.transpose()
148152
}
149153

150154
pub(crate) fn decode_offset_index(data: &[u8]) -> Result<OffsetIndexMetaData, ParquetError> {

parquet/src/file/serialized_reader.rs

+12-10
Original file line numberDiff line numberDiff line change
@@ -1223,8 +1223,8 @@ mod tests {
12231223
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
12241224
let metadata = reader.metadata();
12251225
assert_eq!(metadata.num_row_groups(), 0);
1226-
assert_eq!(metadata.column_index().unwrap().len(), 0);
1227-
assert_eq!(metadata.offset_index().unwrap().len(), 0);
1226+
assert!(metadata.column_index().is_none());
1227+
assert!(metadata.offset_index().is_none());
12281228

12291229
// false, true predicate
12301230
let test_file = get_test_file("alltypes_tiny_pages.parquet");
@@ -1236,8 +1236,8 @@ mod tests {
12361236
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
12371237
let metadata = reader.metadata();
12381238
assert_eq!(metadata.num_row_groups(), 0);
1239-
assert_eq!(metadata.column_index().unwrap().len(), 0);
1240-
assert_eq!(metadata.offset_index().unwrap().len(), 0);
1239+
assert!(metadata.column_index().is_none());
1240+
assert!(metadata.offset_index().is_none());
12411241

12421242
// false, false predicate
12431243
let test_file = get_test_file("alltypes_tiny_pages.parquet");
@@ -1249,8 +1249,8 @@ mod tests {
12491249
let reader = SerializedFileReader::new_with_options(test_file, read_options)?;
12501250
let metadata = reader.metadata();
12511251
assert_eq!(metadata.num_row_groups(), 0);
1252-
assert_eq!(metadata.column_index().unwrap().len(), 0);
1253-
assert_eq!(metadata.offset_index().unwrap().len(), 0);
1252+
assert!(metadata.column_index().is_none());
1253+
assert!(metadata.offset_index().is_none());
12541254
Ok(())
12551255
}
12561256

@@ -1340,13 +1340,15 @@ mod tests {
13401340
let columns = metadata.row_group(0).columns();
13411341
let reversed: Vec<_> = columns.iter().cloned().rev().collect();
13421342

1343-
let a = read_columns_indexes(&test_file, columns).unwrap();
1344-
let mut b = read_columns_indexes(&test_file, &reversed).unwrap();
1343+
let a = read_columns_indexes(&test_file, columns).unwrap().unwrap();
1344+
let mut b = read_columns_indexes(&test_file, &reversed)
1345+
.unwrap()
1346+
.unwrap();
13451347
b.reverse();
13461348
assert_eq!(a, b);
13471349

1348-
let a = read_offset_indexes(&test_file, columns).unwrap();
1349-
let mut b = read_offset_indexes(&test_file, &reversed).unwrap();
1350+
let a = read_offset_indexes(&test_file, columns).unwrap().unwrap();
1351+
let mut b = read_offset_indexes(&test_file, &reversed).unwrap().unwrap();
13501352
b.reverse();
13511353
assert_eq!(a, b);
13521354
}

0 commit comments

Comments
 (0)