@@ -27,14 +27,30 @@ use crate::file::{metadata::*, reader::ChunkReader, FOOTER_SIZE, PARQUET_MAGIC};
27
27
28
28
use crate :: schema:: types:: { self , SchemaDescriptor } ;
29
29
30
- /// Layout of Parquet file
30
+ /// Reads the [ParquetMetaData] from the footer of the parquet file.
31
+ ///
32
+ /// # Layout of Parquet file
33
+ /// ```text
31
34
/// +---------------------------+-----+---+
32
35
/// | Rest of file | B | A |
33
36
/// +---------------------------+-----+---+
34
- /// where A: parquet footer, B: parquet metadata.
37
+ /// ```
38
+ /// where
39
+ /// * `A`: parquet footer which stores the length of the metadata.
40
+ /// * `B`: parquet metadata.
41
+ ///
42
+ /// # I/O
43
+ ///
44
+ /// This method first reads the last 8 bytes of the file via
45
+ /// [`ChunkReader::get_read`] to get the parquet footer which contains the
46
+ /// metadata length.
47
+ ///
48
+ /// It then issues a second read via [`ChunkReader::get_bytes`] to fetch the encoded
49
+ /// metadata.
35
50
///
36
- /// The reader first reads DEFAULT_FOOTER_SIZE bytes from the end of the file.
37
- /// If it is not enough according to the length indicated in the footer, it reads more bytes.
51
+ /// # See Also
52
+ /// * [`decode_metadata`] for decoding the metadata from the bytes.
53
+ /// * [`decode_footer`] for decoding the metadata length from the footer.
38
54
pub fn parse_metadata < R : ChunkReader > ( chunk_reader : & R ) -> Result < ParquetMetaData > {
39
55
// check file is large enough to hold footer
40
56
let file_size = chunk_reader. len ( ) ;
@@ -65,7 +81,13 @@ pub fn parse_metadata<R: ChunkReader>(chunk_reader: &R) -> Result<ParquetMetaDat
65
81
decode_metadata ( chunk_reader. get_bytes ( start, metadata_len) ?. as_ref ( ) )
66
82
}
67
83
68
- /// Decodes [`ParquetMetaData`] from the provided bytes
84
+ /// Decodes [`ParquetMetaData`] from the provided bytes.
85
+ ///
86
+ /// Typically this is used to decode the metadata from the end of a parquet
87
+ /// file. The format of `buf` is the Thrift compact binary protocol, as specified
88
+ /// by the [Parquet Spec].
89
+ ///
90
+ /// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
69
91
pub fn decode_metadata ( buf : & [ u8 ] ) -> Result < ParquetMetaData > {
70
92
// TODO: row group filtering
71
93
let mut prot = TCompactSliceInputProtocol :: new ( buf) ;
@@ -90,7 +112,17 @@ pub fn decode_metadata(buf: &[u8]) -> Result<ParquetMetaData> {
90
112
Ok ( ParquetMetaData :: new ( file_metadata, row_groups) )
91
113
}
92
114
93
- /// Decodes the footer returning the metadata length in bytes
115
+ /// Decodes the Parquet footer, returning the metadata length in bytes.
116
+ ///
117
+ /// A parquet footer is 8 bytes long and has the following layout:
118
+ /// * 4 bytes for the metadata length
119
+ /// * 4 bytes for the magic bytes 'PAR1'
120
+ ///
121
+ /// ```text
122
+ /// +-----+--------+
123
+ /// | len | 'PAR1' |
124
+ /// +-----+--------+
125
+ /// ```
94
126
pub fn decode_footer ( slice : & [ u8 ; FOOTER_SIZE ] ) -> Result < usize > {
95
127
// check this is indeed a parquet file
96
128
if slice[ 4 ..] != PARQUET_MAGIC {
0 commit comments