From e6c819f926a1664902c6f7d3f96b84f87304a21e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 24 Sep 2024 16:04:29 -0700 Subject: [PATCH 1/4] workaround for missing page indexes --- parquet/src/arrow/arrow_reader/mod.rs | 27 +++++++-------------------- parquet/src/file/metadata/reader.rs | 25 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 253625117c2b..9f56b2d25bd9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -34,9 +34,7 @@ use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; -use crate::file::footer; -use crate::file::metadata::ParquetMetaData; -use crate::file::page_index::index_reader; +use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; use crate::schema::types::SchemaDescriptor; @@ -382,23 +380,9 @@ impl ArrowReaderMetadata { /// `Self::metadata` is missing the page index, this function will attempt /// to load the page index by making an object store request. pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { - let mut metadata = footer::parse_metadata(reader)?; - if options.page_index { - let column_index = metadata - .row_groups() - .iter() - .map(|rg| index_reader::read_columns_indexes(reader, rg.columns())) - .collect::>>()?; - metadata.set_column_index(Some(column_index)); - - let offset_index = metadata - .row_groups() - .iter() - .map(|rg| index_reader::read_offset_indexes(reader, rg.columns())) - .collect::>>()?; - - metadata.set_offset_index(Some(offset_index)) - } + let metadata = ParquetMetaDataReader::new() + .with_page_indexes(options.page_index) + .parse_and_finish(reader)?; Self::try_new(Arc::new(metadata), options) } @@ -3496,8 +3480,11 @@ mod tests { ArrowReaderOptions::new().with_page_index(true), ) .unwrap(); + // Although `Vec>` of each row group is empty, // we should read the file successfully. + // FIXME: this test will fail when metadata parsing returns `None` for missing page + // indexes. https://github.com/apache/arrow-rs/issues/6447 assert!(builder.metadata().offset_index().unwrap()[0].is_empty()); let reader = builder.build().unwrap(); let batches = reader.collect::, _>>().unwrap(); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 333e0fd6e90d..f3132f92d700 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -201,6 +201,7 @@ impl ParquetMetaDataReader { // need for more data. This is not it's intended use. The plan is to add a NeedMoreData // value to the enum, but this would be a breaking change. This will be done as // 54.0.0 draws nearer. + // https://github.com/apache/arrow-rs/issues/6447 Err(ParquetError::IndexOutOfBound(needed, _)) => { // If reader is the same length as `file_size` then presumably there is no more to // read, so return an EOF error. @@ -247,13 +248,18 @@ impl ParquetMetaDataReader { )); } - // TODO(ets): what is the correct behavior for missing page indexes? MetadataLoader would - // leave them as `None`, while the parser in `index_reader::read_columns_indexes` returns a - // vector of empty vectors. - // I think it's best to leave them as `None`. + // FIXME: there are differing implementations in the case where page indexes are missing + // from the file. `MetadataLoader` will leave them as `None`, while the parser in + // `index_reader::read_columns_indexes` returns a vector of empty vectors. + // It is best for this function to replicate the latter behavior for now, but in the future + // the two paths to retrieve metadata should be made consistent. Note that this is only + // an issue if the user requested page indexes, so there is no need to provide empty + // vectors in `try_parse_sized()`. + // https://github.com/apache/arrow-rs/issues/6447 // Get bounds needed for page indexes (if any are present in the file). let Some(range) = self.range_for_page_index() else { + self.empty_page_indexes(); return Ok(()); }; @@ -412,6 +418,17 @@ impl ParquetMetaDataReader { Ok(()) } + fn empty_page_indexes(&mut self) { + let metadata = self.metadata.as_mut().unwrap(); + let num_row_groups = metadata.num_row_groups(); + if self.column_index { + metadata.set_column_index(Some(vec![vec![]; num_row_groups])); + } + if self.offset_index { + metadata.set_offset_index(Some(vec![vec![]; num_row_groups])); + } + } + fn range_for_page_index(&self) -> Option> { // sanity check self.metadata.as_ref()?; From abfba527e17e74d1e46de44246e23ed67ebccb05 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 24 Sep 2024 16:07:27 -0700 Subject: [PATCH 2/4] remove empty line --- parquet/src/arrow/arrow_reader/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 9f56b2d25bd9..a109851f72b6 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3480,7 +3480,6 @@ mod tests { ArrowReaderOptions::new().with_page_index(true), ) .unwrap(); - // Although `Vec>` of each row group is empty, // we should read the file successfully. // FIXME: this test will fail when metadata parsing returns `None` for missing page From 98a7ca60c1105bac11058b5063085f9d5bee9178 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 25 Sep 2024 16:15:15 -0700 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Andrew Lamb --- parquet/src/file/metadata/reader.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index f3132f92d700..2afde1ed9e91 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -251,8 +251,8 @@ impl ParquetMetaDataReader { // FIXME: there are differing implementations in the case where page indexes are missing // from the file. `MetadataLoader` will leave them as `None`, while the parser in // `index_reader::read_columns_indexes` returns a vector of empty vectors. - // It is best for this function to replicate the latter behavior for now, but in the future - // the two paths to retrieve metadata should be made consistent. Note that this is only + // It is best for this function to replicate the latter behavior for now, but in a future + // breaking release, the two paths to retrieve metadata should be made consistent. Note that this is only // an issue if the user requested page indexes, so there is no need to provide empty // vectors in `try_parse_sized()`. // https://github.com/apache/arrow-rs/issues/6447 @@ -418,6 +418,9 @@ impl ParquetMetaDataReader { Ok(()) } + /// Set the column_index and offset_indexes to empty `Vec` for backwards compatibility + /// + /// See for details fn empty_page_indexes(&mut self) { let metadata = self.metadata.as_mut().unwrap(); let num_row_groups = metadata.num_row_groups(); From 4d725323991d508f4cb4b7bfd43406f654d67db7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 25 Sep 2024 19:31:04 -0400 Subject: [PATCH 4/4] fmt --- parquet/src/file/metadata/reader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 2afde1ed9e91..4a5a1bc93c4f 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -419,7 +419,7 @@ impl ParquetMetaDataReader { } /// Set the column_index and offset_indexes to empty `Vec` for backwards compatibility - /// + /// /// See for details fn empty_page_indexes(&mut self) { let metadata = self.metadata.as_mut().unwrap();