Skip to content

Commit b38ccf7

Browse files
committed
Add ParquetMetadataWriter to allow ad-hoc encoding of ParquetMetadata
1 parent 3bc9987 commit b38ccf7

File tree

5 files changed

+500
-103
lines changed

5 files changed

+500
-103
lines changed

parquet/src/arrow/async_reader/metadata.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ use crate::errors::{ParquetError, Result};
2020
use crate::file::footer::{decode_footer, decode_metadata};
2121
use crate::file::metadata::ParquetMetaData;
2222
use crate::file::page_index::index::Index;
23-
use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index};
23+
use crate::file::page_index::index_reader::{
24+
acc_range, decode_column_index, decode_page_locations,
25+
};
2426
use bytes::Bytes;
2527
use futures::future::BoxFuture;
2628
use futures::FutureExt;
@@ -177,7 +179,9 @@ impl<F: MetadataFetch> MetadataLoader<F> {
177179
x.columns()
178180
.iter()
179181
.map(|c| match c.offset_index_range() {
180-
Some(r) => decode_offset_index(&data[r.start - offset..r.end - offset]),
182+
Some(r) => {
183+
decode_page_locations(&data[r.start - offset..r.end - offset])
184+
}
181185
None => Err(general_err!("missing offset index")),
182186
})
183187
.collect::<Result<Vec<_>>>()

parquet/src/file/metadata/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
8686
///
8787
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
8888
/// [`parse_metadata`]: crate::file::footer::parse_metadata
89-
#[derive(Debug, Clone)]
89+
#[derive(Debug, Clone, PartialEq)]
9090
pub struct ParquetMetaData {
9191
/// File level metadata
9292
file_metadata: FileMetaData,
@@ -222,7 +222,7 @@ pub type FileMetaDataPtr = Arc<FileMetaData>;
222222
/// File level metadata for a Parquet file.
223223
///
224224
/// Includes the version of the file, metadata, number of rows, schema, and column orders
225-
#[derive(Debug, Clone)]
225+
#[derive(Debug, Clone, PartialEq)]
226226
pub struct FileMetaData {
227227
version: i32,
228228
num_rows: i64,

parquet/src/file/page_index/index.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,38 @@ impl<T: ParquetValueType> NativeIndex<T> {
168168
boundary_order: index.boundary_order,
169169
})
170170
}
171+
172+
pub(crate) fn to_thrift(&self) -> ColumnIndex {
173+
let min_values = self
174+
.indexes
175+
.iter()
176+
.map(|x| x.min_bytes().map(|x| x.to_vec()))
177+
.collect::<Option<Vec<_>>>()
178+
.unwrap_or_else(|| vec![vec![]; self.indexes.len()]);
179+
180+
let max_values = self
181+
.indexes
182+
.iter()
183+
.map(|x| x.max_bytes().map(|x| x.to_vec()))
184+
.collect::<Option<Vec<_>>>()
185+
.unwrap_or_else(|| vec![vec![]; self.indexes.len()]);
186+
187+
let null_counts = self
188+
.indexes
189+
.iter()
190+
.map(|x| x.null_count())
191+
.collect::<Option<Vec<_>>>();
192+
193+
ColumnIndex::new(
194+
self.indexes.iter().map(|x| x.min().is_none()).collect(),
195+
min_values,
196+
max_values,
197+
self.boundary_order,
198+
null_counts,
199+
None,
200+
None,
201+
)
202+
}
171203
}
172204

173205
#[cfg(test)]

0 commit comments

Comments
 (0)