@@ -156,13 +156,16 @@ pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
156
156
/// defined by [`parquet.thrift`].
157
157
///
158
158
/// # Overview
159
+ /// The fields of this structure are:
159
160
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (see [`Self::file_metadata`])
160
161
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
161
162
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
162
163
///
163
164
/// This structure is read by the various readers in this crate or can be read
164
165
/// directly from a file using the [`ParquetMetaDataReader`] struct.
165
166
///
167
+ /// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
168
+ ///
166
169
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
167
170
#[ derive( Debug , Clone , PartialEq ) ]
168
171
pub struct ParquetMetaData {
@@ -204,6 +207,11 @@ impl ParquetMetaData {
204
207
}
205
208
}
206
209
210
+ /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
211
+ pub fn into_builder ( self ) -> ParquetMetaDataBuilder {
212
+ self . into ( )
213
+ }
214
+
207
215
/// Returns file metadata as reference.
208
216
pub fn file_metadata ( & self ) -> & FileMetaData {
209
217
& self . file_metadata
@@ -290,6 +298,117 @@ impl ParquetMetaData {
290
298
}
291
299
}
292
300
301
+ /// A builder for creating / manipulating [`ParquetMetaData`]
302
+ ///
303
+ /// # Example creating a new [`ParquetMetaData`]
304
+ ///
305
+ ///```no_run
306
+ /// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
307
+ /// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
308
+ /// // Create a new builder given the file metadata
309
+ /// let file_metadata = get_file_metadata();
310
+ /// // Create a row group
311
+ /// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
312
+ /// .set_num_rows(100)
313
+ /// // ... (A real row group needs more than just the number of rows)
314
+ /// .build()
315
+ /// .unwrap();
316
+ /// // Create the final metadata
317
+ /// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
318
+ /// .add_row_group(row_group)
319
+ /// .build();
320
+ /// ```
321
+ ///
322
+ /// # Example modifying an existing [`ParquetMetaData`]
323
+ /// ```no_run
324
+ /// # use parquet::file::metadata::ParquetMetaData;
325
+ /// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
326
+ /// // Modify the metadata so only the last RowGroup remains
327
+ /// let metadata: ParquetMetaData = load_metadata();
328
+ /// let mut builder = metadata.into_builder();
329
+ ///
330
+ /// // Take existing row groups to modify
331
+ /// let mut row_groups = builder.take_row_groups();
332
+ /// let last_row_group = row_groups.pop().unwrap();
333
+ ///
334
+ /// let metadata = builder
335
+ /// .add_row_group(last_row_group)
336
+ /// .build();
337
+ /// ```
338
+ pub struct ParquetMetaDataBuilder ( ParquetMetaData ) ;
339
+
340
+ impl ParquetMetaDataBuilder {
341
+ /// Create a new builder from a file metadata, with no row groups
342
+ pub fn new ( file_meta_data : FileMetaData ) -> Self {
343
+ Self ( ParquetMetaData :: new ( file_meta_data, vec ! [ ] ) )
344
+ }
345
+
346
+ /// Create a new builder from an exising ParquetMetaData
347
+ pub fn new_from_metadata ( metadata : ParquetMetaData ) -> Self {
348
+ Self ( metadata)
349
+ }
350
+
351
+ /// Adds a row group to the metadata
352
+ pub fn add_row_group ( mut self , row_group : RowGroupMetaData ) -> Self {
353
+ self . 0 . row_groups . push ( row_group) ;
354
+ self
355
+ }
356
+
357
+ /// Sets all the row groups to the specified list
358
+ pub fn set_row_groups ( mut self , row_groups : Vec < RowGroupMetaData > ) -> Self {
359
+ self . 0 . row_groups = row_groups;
360
+ self
361
+ }
362
+
363
+ /// Takes ownership of the row groups in this builder, and clears the list
364
+ /// of row groups.
365
+ ///
366
+ /// This can be used for more efficient creation of a new ParquetMetaData
367
+ /// from an existing one.
368
+ pub fn take_row_groups ( & mut self ) -> Vec < RowGroupMetaData > {
369
+ std:: mem:: take ( & mut self . 0 . row_groups )
370
+ }
371
+
372
+ /// Return a reference to the current row groups
373
+ pub fn row_groups ( & self ) -> & [ RowGroupMetaData ] {
374
+ & self . 0 . row_groups
375
+ }
376
+
377
+ /// Sets the column index
378
+ pub fn set_column_index ( mut self , column_index : Option < ParquetColumnIndex > ) -> Self {
379
+ self . 0 . column_index = column_index;
380
+ self
381
+ }
382
+
383
+ /// Returns the current column index from the builder, replacing it with `None`
384
+ pub fn take_column_index ( & mut self ) -> Option < ParquetColumnIndex > {
385
+ std:: mem:: take ( & mut self . 0 . column_index )
386
+ }
387
+
388
+ /// Sets the offset index
389
+ pub fn set_offset_index ( mut self , offset_index : Option < ParquetOffsetIndex > ) -> Self {
390
+ self . 0 . offset_index = offset_index;
391
+ self
392
+ }
393
+
394
+ /// Returns the current offset index from the builder, replacing it with `None`
395
+ pub fn take_offset_index ( & mut self ) -> Option < ParquetOffsetIndex > {
396
+ std:: mem:: take ( & mut self . 0 . offset_index )
397
+ }
398
+
399
+ /// Creates a new ParquetMetaData from the builder
400
+ pub fn build ( self ) -> ParquetMetaData {
401
+ let Self ( metadata) = self ;
402
+ metadata
403
+ }
404
+ }
405
+
406
+ impl From < ParquetMetaData > for ParquetMetaDataBuilder {
407
+ fn from ( meta_data : ParquetMetaData ) -> Self {
408
+ Self ( meta_data)
409
+ }
410
+ }
411
+
293
412
/// Alias for the `KeyValue` type defined in `crate::format`.
pub type KeyValue = crate :: format:: KeyValue ;
294
413
295
414
/// Reference counted pointer for [`FileMetaData`].
@@ -566,12 +685,27 @@ impl RowGroupMetaDataBuilder {
566
685
self
567
686
}
568
687
688
+ /// Takes ownership of the the column metadata in this builder, and clears
689
+ /// the list of columns.
690
+ ///
691
+ /// This can be used for more efficient creation of a new RowGroupMetaData
692
+ /// from an existing one.
693
+ pub fn take_columns ( & mut self ) -> Vec < ColumnChunkMetaData > {
694
+ std:: mem:: take ( & mut self . 0 . columns )
695
+ }
696
+
569
697
/// Sets column metadata for this row group.
570
698
pub fn set_column_metadata ( mut self , value : Vec < ColumnChunkMetaData > ) -> Self {
571
699
self . 0 . columns = value;
572
700
self
573
701
}
574
702
703
+ /// Adds a column metadata to this row group
704
+ pub fn add_column_metadata ( mut self , value : ColumnChunkMetaData ) -> Self {
705
+ self . 0 . columns . push ( value) ;
706
+ self
707
+ }
708
+
575
709
/// Sets ordinal for this row group.
576
710
pub fn set_ordinal ( mut self , value : i16 ) -> Self {
577
711
self . 0 . ordinal = Some ( value) ;
@@ -1672,7 +1806,9 @@ mod tests {
1672
1806
. unwrap ( ) ;
1673
1807
let row_group_meta_with_stats = vec ! [ row_group_meta_with_stats] ;
1674
1808
1675
- let parquet_meta = ParquetMetaData :: new ( file_metadata. clone ( ) , row_group_meta_with_stats) ;
1809
+ let parquet_meta = ParquetMetaDataBuilder :: new ( file_metadata. clone ( ) )
1810
+ . set_row_groups ( row_group_meta_with_stats)
1811
+ . build ( ) ;
1676
1812
let base_expected_size = 2312 ;
1677
1813
1678
1814
assert_eq ! ( parquet_meta. memory_size( ) , base_expected_size) ;
@@ -1692,14 +1828,13 @@ mod tests {
1692
1828
offset_index. append_unencoded_byte_array_data_bytes ( Some ( 10 ) ) ;
1693
1829
let offset_index = offset_index. build_to_thrift ( ) ;
1694
1830
1695
- let parquet_meta = ParquetMetaData :: new_with_page_index (
1696
- file_metadata,
1697
- row_group_meta,
1698
- Some ( vec ! [ vec![ Index :: BOOLEAN ( native_index) ] ] ) ,
1699
- Some ( vec ! [ vec![
1831
+ let parquet_meta = ParquetMetaDataBuilder :: new ( file_metadata)
1832
+ . set_row_groups ( row_group_meta)
1833
+ . set_column_index ( Some ( vec ! [ vec![ Index :: BOOLEAN ( native_index) ] ] ) )
1834
+ . set_offset_index ( Some ( vec ! [ vec![
1700
1835
OffsetIndexMetaData :: try_new( offset_index) . unwrap( )
1701
- ] ] ) ,
1702
- ) ;
1836
+ ] ] ) )
1837
+ . build ( ) ;
1703
1838
1704
1839
let bigger_expected_size = 2816 ;
1705
1840
// more set fields means more memory usage
0 commit comments