Skip to content

Commit faf0b9e

Browse files
committed
Example of reading and writing parquet metadata outside the file
1 parent 9be0eb5 commit faf0b9e

File tree

2 files changed

+111
-0
lines changed

2 files changed

+111
-0
lines changed

parquet/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ object_store = ["dep:object_store", "async"]
115115
# Group Zstd dependencies
116116
zstd = ["dep:zstd", "zstd-sys"]
117117

118+
119+
[[example]]
120+
name = "external_metadata"
121+
required-features = ["arrow", "async"]
122+
path = "./examples/external_metadata.rs"
123+
118124
[[example]]
119125
name = "read_parquet"
120126
required-features = ["arrow"]

parquet/examples/external_metadata.rs

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::path::Path;
19+
use arrow_array::RecordBatch;
20+
use arrow_cast::pretty::{pretty_format_batches};
21+
use parquet::file::metadata::ParquetMetaData;
22+
23+
/// This example demonstrates advanced usage of Parquet metadata.
24+
///
25+
/// This is designed to show how to store Parquet metadata somewhere other than
26+
/// the Parquet file itself, and how to use that metadata to read the file. This
27+
/// can be used, for example, to store metadata for parquet files on remote
28+
/// object storage (e.g. S3) in a local file, use a query engine like
29+
/// DataFusion to figure out which files to read, and then read the files with a
30+
/// single object store request.
31+
///
32+
/// Specifically it:
33+
/// 1. Reads the metadata of a Parquet file
34+
/// 2. Removes some column statistics from the metadata (to make them smaller)
35+
/// 3. Stores the metadata in a separate file
36+
/// 4. Reads the metadata from the separate file and uses that to read the Parquet file
37+
///
38+
/// Without this API, to implement the functionality you need to implement
39+
/// a conversion to/from some other structs that can be serialized/deserialized.
40+
41+
#[tokio::main(flavor = "current_thread")]
42+
async fn main() -> parquet::errors::Result<()> {
43+
let testdata = arrow::util::test_util::parquet_test_data();
44+
let parquet_path = format!("{testdata}/alltypes_plain.parquet");
45+
let metadata_path = "thift_metadata.dat"; // todo tempdir for now use local file to inspect it
46+
47+
let metadata = get_metadata_from_parquet_file(&parquet_path).await;
48+
let metadata = prepare_metadata(metadata);
49+
write_metadata_to_file(metadata, &metadata_path);
50+
51+
// now read the metadata from the file and use it to read the Parquet file
52+
let metadata = read_metadata_from_file(&metadata_path);
53+
let batches = read_parquet_file_with_metadata(&parquet_path, metadata);
54+
55+
// display the results
56+
let batches_string = pretty_format_batches(&batches).unwrap()
57+
.to_string();
58+
let batches_lines :Vec<_> = batches_string
59+
.split('\n')
60+
.collect();
61+
62+
assert_eq!(batches_lines,
63+
vec!["todo"]
64+
);
65+
66+
Ok(())
67+
}
68+
69+
/// Reads the metadata from a parquet file
70+
async fn get_metadata_from_parquet_file(file: impl AsRef<Path>) -> ParquetMetaData {
71+
todo!();
72+
}
73+
74+
/// modifies the metadata to reduce its size
75+
fn prepare_metadata(metadata: ParquetMetaData) -> ParquetMetaData {
76+
todo!();
77+
}
78+
79+
/// writes the metadata to a file
80+
///
81+
/// The data is stored using the same thrift format as the Parquet file metadata
82+
fn write_metadata_to_file(metadata: ParquetMetaData, file: impl AsRef<Path>) {
83+
todo!();
84+
}
85+
86+
/// Reads the metadata from a file
87+
///
88+
/// This function reads the format written by `write_metadata_to_file`
89+
fn read_metadata_from_file(file: impl AsRef<Path>) -> ParquetMetaData {
90+
todo!();
91+
}
92+
93+
/// Reads the Parquet file using the metadata
94+
///
95+
/// This shows how to read the Parquet file using previously read metadata
96+
/// instead of the metadata in the Parquet file itself. This avoids an IO /
97+
/// having to fetch and decode the metadata from the Parquet file before
98+
/// beginning to read it.
99+
///
100+
/// In this example, we read the results as Arrow record batches
101+
fn read_parquet_file_with_metadata(file: impl AsRef<Path>, metadata: ParquetMetaData) -> Vec<RecordBatch>{
102+
todo!();
103+
}
104+
105+

0 commit comments

Comments
 (0)