18
18
use std:: { ops:: Range , sync:: Arc } ;
19
19
20
20
use crate :: arrow:: arrow_reader:: ArrowReaderOptions ;
21
- use crate :: arrow:: async_reader:: AsyncFileReader ;
21
+ use crate :: arrow:: async_reader:: { AsyncFileReader , MetadataSuffixFetch } ;
22
22
use crate :: errors:: { ParquetError , Result } ;
23
23
use crate :: file:: metadata:: { ParquetMetaData , ParquetMetaDataReader } ;
24
24
use bytes:: Bytes ;
25
25
use futures:: { future:: BoxFuture , FutureExt , TryFutureExt } ;
26
- use object_store:: { path:: Path , ObjectMeta , ObjectStore } ;
26
+ use object_store:: { path:: Path , ObjectStore } ;
27
+ use object_store:: { GetOptions , GetRange } ;
27
28
use tokio:: runtime:: Handle ;
28
29
29
30
/// Reads Parquet files in object storage using [`ObjectStore`].
@@ -45,29 +46,29 @@ use tokio::runtime::Handle;
45
46
/// println!("Found Blob with {}B at {}", meta.size, meta.location);
46
47
///
47
48
/// // Show Parquet metadata
48
- /// let reader = ParquetObjectReader::new(storage_container, meta);
49
+ /// let reader = ParquetObjectReader::new(storage_container, meta.location).with_file_size(meta.size );
49
50
/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
50
51
/// print_parquet_metadata(&mut stdout(), builder.metadata());
51
52
/// # }
52
53
/// ```
53
54
#[ derive( Clone , Debug ) ]
54
55
pub struct ParquetObjectReader {
55
56
store : Arc < dyn ObjectStore > ,
56
- meta : ObjectMeta ,
57
+ path : Path ,
58
+ file_size : Option < usize > ,
57
59
metadata_size_hint : Option < usize > ,
58
60
preload_column_index : bool ,
59
61
preload_offset_index : bool ,
60
62
runtime : Option < Handle > ,
61
63
}
62
64
63
65
impl ParquetObjectReader {
64
- /// Creates a new [`ParquetObjectReader`] for the provided [`ObjectStore`] and [`ObjectMeta`]
65
- ///
66
- /// [`ObjectMeta`] can be obtained using [`ObjectStore::list`] or [`ObjectStore::head`]
67
- pub fn new ( store : Arc < dyn ObjectStore > , meta : ObjectMeta ) -> Self {
66
+ /// Creates a new [`ParquetObjectReader`] for the provided [`ObjectStore`] and [`Path`].
67
+ pub fn new ( store : Arc < dyn ObjectStore > , path : Path ) -> Self {
68
68
Self {
69
69
store,
70
- meta,
70
+ path,
71
+ file_size : None ,
71
72
metadata_size_hint : None ,
72
73
preload_column_index : false ,
73
74
preload_offset_index : false ,
@@ -84,6 +85,22 @@ impl ParquetObjectReader {
84
85
}
85
86
}
86
87
88
+ /// Provide the byte size of this file.
89
+ ///
90
+ /// If provided, the file size will ensure that only bounded range requests are used. If file
91
+ /// size is not provided, the reader will use suffix range requests to fetch the metadata.
92
+ ///
93
+ /// Providing this size up front is an important optimization to avoid extra calls when the
94
+ /// underlying store does not support suffix range requests.
95
+ ///
96
+ /// The file size can be obtained using [`ObjectStore::list`] or [`ObjectStore::head`].
97
+ pub fn with_file_size ( self , file_size : usize ) -> Self {
98
+ Self {
99
+ file_size : Some ( file_size) ,
100
+ ..self
101
+ }
102
+ }
103
+
87
104
/// Load the Column Index as part of [`Self::get_metadata`]
88
105
pub fn with_preload_column_index ( self , preload_column_index : bool ) -> Self {
89
106
Self {
@@ -125,7 +142,7 @@ impl ParquetObjectReader {
125
142
{
126
143
match & self . runtime {
127
144
Some ( handle) => {
128
- let path = self . meta . location . clone ( ) ;
145
+ let path = self . path . clone ( ) ;
129
146
let store = Arc :: clone ( & self . store ) ;
130
147
handle
131
148
. spawn ( async move { f ( & store, & path) . await } )
@@ -138,13 +155,27 @@ impl ParquetObjectReader {
138
155
)
139
156
. boxed ( )
140
157
}
141
- None => f ( & self . store , & self . meta . location )
142
- . map_err ( |e| e. into ( ) )
143
- . boxed ( ) ,
158
+ None => f ( & self . store , & self . path ) . map_err ( |e| e. into ( ) ) . boxed ( ) ,
144
159
}
145
160
}
146
161
}
147
162
163
+ impl MetadataSuffixFetch for & mut ParquetObjectReader {
164
+ fn fetch_suffix ( & mut self , suffix : usize ) -> BoxFuture < ' _ , Result < Bytes > > {
165
+ let options = GetOptions {
166
+ range : Some ( GetRange :: Suffix ( suffix) ) ,
167
+ ..Default :: default ( )
168
+ } ;
169
+ self . spawn ( |store, path| {
170
+ async move {
171
+ let resp = store. get_opts ( path, options) . await ?;
172
+ Ok :: < _ , ParquetError > ( resp. bytes ( ) . await ?)
173
+ }
174
+ . boxed ( )
175
+ } )
176
+ }
177
+ }
178
+
148
179
impl AsyncFileReader for ParquetObjectReader {
149
180
fn get_bytes ( & mut self , range : Range < usize > ) -> BoxFuture < ' _ , Result < Bytes > > {
150
181
self . spawn ( |store, path| store. get_range ( path, range) )
@@ -165,13 +196,16 @@ impl AsyncFileReader for ParquetObjectReader {
165
196
// `Self::get_bytes`.
166
197
fn get_metadata ( & mut self ) -> BoxFuture < ' _ , Result < Arc < ParquetMetaData > > > {
167
198
Box :: pin ( async move {
168
- let file_size = self . meta . size ;
169
- let metadata = ParquetMetaDataReader :: new ( )
199
+ let metadata_reader = ParquetMetaDataReader :: new ( )
170
200
. with_column_indexes ( self . preload_column_index )
171
201
. with_offset_indexes ( self . preload_offset_index )
172
- . with_prefetch_hint ( self . metadata_size_hint )
173
- . load_and_finish ( self , file_size)
174
- . await ?;
202
+ . with_prefetch_hint ( self . metadata_size_hint ) ;
203
+ let metadata = if let Some ( file_size) = self . file_size {
204
+ metadata_reader. load_and_finish ( self , file_size) . await ?
205
+ } else {
206
+ metadata_reader. load_via_suffix_and_finish ( self ) . await ?
207
+ } ;
208
+
175
209
Ok ( Arc :: new ( metadata) )
176
210
} )
177
211
}
@@ -181,7 +215,6 @@ impl AsyncFileReader for ParquetObjectReader {
181
215
options : & ' a ArrowReaderOptions ,
182
216
) -> BoxFuture < ' a , Result < Arc < ParquetMetaData > > > {
183
217
Box :: pin ( async move {
184
- let file_size = self . meta . size ;
185
218
let metadata = ParquetMetaDataReader :: new ( )
186
219
. with_column_indexes ( self . preload_column_index )
187
220
. with_offset_indexes ( self . preload_offset_index )
@@ -191,7 +224,11 @@ impl AsyncFileReader for ParquetObjectReader {
191
224
let metadata =
192
225
metadata. with_decryption_properties ( options. file_decryption_properties . as_ref ( ) ) ;
193
226
194
- let metadata = metadata. load_and_finish ( self , file_size) . await ?;
227
+ let metadata = if let Some ( file_size) = self . file_size {
228
+ metadata. load_and_finish ( self , file_size) . await ?
229
+ } else {
230
+ metadata. load_via_suffix_and_finish ( self ) . await ?
231
+ } ;
195
232
196
233
Ok ( Arc :: new ( metadata) )
197
234
} )
@@ -231,7 +268,22 @@ mod tests {
231
268
#[ tokio:: test]
232
269
async fn test_simple ( ) {
233
270
let ( meta, store) = get_meta_store ( ) . await ;
234
- let object_reader = ParquetObjectReader :: new ( store, meta) ;
271
+ let object_reader =
272
+ ParquetObjectReader :: new ( store, meta. location ) . with_file_size ( meta. size ) ;
273
+
274
+ let builder = ParquetRecordBatchStreamBuilder :: new ( object_reader)
275
+ . await
276
+ . unwrap ( ) ;
277
+ let batches: Vec < _ > = builder. build ( ) . unwrap ( ) . try_collect ( ) . await . unwrap ( ) ;
278
+
279
+ assert_eq ! ( batches. len( ) , 1 ) ;
280
+ assert_eq ! ( batches[ 0 ] . num_rows( ) , 8 ) ;
281
+ }
282
+
283
+ #[ tokio:: test]
284
+ async fn test_simple_without_file_length ( ) {
285
+ let ( meta, store) = get_meta_store ( ) . await ;
286
+ let object_reader = ParquetObjectReader :: new ( store, meta. location ) ;
235
287
236
288
let builder = ParquetRecordBatchStreamBuilder :: new ( object_reader)
237
289
. await
@@ -247,7 +299,8 @@ mod tests {
247
299
let ( mut meta, store) = get_meta_store ( ) . await ;
248
300
meta. location = Path :: from ( "I don't exist.parquet" ) ;
249
301
250
- let object_reader = ParquetObjectReader :: new ( store, meta) ;
302
+ let object_reader =
303
+ ParquetObjectReader :: new ( store, meta. location ) . with_file_size ( meta. size ) ;
251
304
// Cannot use unwrap_err as ParquetRecordBatchStreamBuilder: !Debug
252
305
match ParquetRecordBatchStreamBuilder :: new ( object_reader) . await {
253
306
Ok ( _) => panic ! ( "expected failure" ) ,
@@ -280,7 +333,9 @@ mod tests {
280
333
281
334
let initial_actions = num_actions. load ( Ordering :: Relaxed ) ;
282
335
283
- let reader = ParquetObjectReader :: new ( store, meta) . with_runtime ( rt. handle ( ) . clone ( ) ) ;
336
+ let reader = ParquetObjectReader :: new ( store, meta. location )
337
+ . with_file_size ( meta. size )
338
+ . with_runtime ( rt. handle ( ) . clone ( ) ) ;
284
339
285
340
let builder = ParquetRecordBatchStreamBuilder :: new ( reader) . await . unwrap ( ) ;
286
341
let batches: Vec < _ > = builder. build ( ) . unwrap ( ) . try_collect ( ) . await . unwrap ( ) ;
@@ -306,7 +361,9 @@ mod tests {
306
361
307
362
let ( meta, store) = get_meta_store ( ) . await ;
308
363
309
- let reader = ParquetObjectReader :: new ( store, meta) . with_runtime ( rt. handle ( ) . clone ( ) ) ;
364
+ let reader = ParquetObjectReader :: new ( store, meta. location )
365
+ . with_file_size ( meta. size )
366
+ . with_runtime ( rt. handle ( ) . clone ( ) ) ;
310
367
311
368
let current_id = std:: thread:: current ( ) . id ( ) ;
312
369
@@ -329,7 +386,9 @@ mod tests {
329
386
330
387
let ( meta, store) = get_meta_store ( ) . await ;
331
388
332
- let mut reader = ParquetObjectReader :: new ( store, meta) . with_runtime ( rt. handle ( ) . clone ( ) ) ;
389
+ let mut reader = ParquetObjectReader :: new ( store, meta. location )
390
+ . with_file_size ( meta. size )
391
+ . with_runtime ( rt. handle ( ) . clone ( ) ) ;
333
392
334
393
rt. shutdown_background ( ) ;
335
394
0 commit comments