@@ -34,12 +34,15 @@ use itertools::Itertools;
 use parquet::{
     arrow::ArrowWriter,
     basic::Encoding,
-    file::properties::{WriterProperties, WriterPropertiesBuilder},
+    file::{
+        properties::{WriterProperties, WriterPropertiesBuilder},
+        FOOTER_SIZE,
+    },
     format::SortingColumn,
     schema::types::ColumnPath,
 };
 use rand::distributions::DistString;
-use tracing::error;
+use tracing::{error, trace};
 
 use crate::{
     cli::Options,
@@ -218,7 +221,10 @@ impl<'a> Stream<'a> {
 
         dir.flatten()
             .map(|file| file.path())
-            .filter(|file| file.extension().is_some_and(|ext| ext.eq("parquet")))
+            .filter(|file| {
+                file.extension().is_some_and(|ext| ext.eq("parquet"))
+                    && std::fs::metadata(file).is_ok_and(|meta| meta.len() > FOOTER_SIZE as u64)
+            })
             .collect()
     }
 
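The new predicate means the staged-file listing in `Stream` skips any file that is too small to hold a parquet footer (`parquet::file::FOOTER_SIZE` is 8 bytes: a 4-byte metadata length followed by the `PAR1` magic), since such a file cannot be valid parquet. A minimal standalone sketch of the same check, using a hypothetical `is_valid_parquet_candidate` helper:

```rust
use std::path::Path;

use parquet::file::FOOTER_SIZE;

/// Hypothetical standalone version of the new filter predicate: a staged file
/// is only treated as a parquet candidate if it has the `.parquet` extension
/// and is larger than the 8-byte parquet footer.
fn is_valid_parquet_candidate(file: &Path) -> bool {
    file.extension().is_some_and(|ext| ext.eq("parquet"))
        && std::fs::metadata(file).is_ok_and(|meta| meta.len() > FOOTER_SIZE as u64)
}
```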
@@ -350,24 +356,32 @@ impl<'a> Stream<'a> {
                 .build();
             schemas.push(merged_schema.clone());
             let schema = Arc::new(merged_schema);
-            let parquet_file = OpenOptions::new()
+            let mut part_path = parquet_path.to_owned();
+            part_path.set_extension("part");
+            let mut part_file = OpenOptions::new()
                 .create(true)
                 .append(true)
-                .open(&parquet_path)
+                .open(&part_path)
                 .map_err(|_| StagingError::Create)?;
-            let mut writer = ArrowWriter::try_new(&parquet_file, schema.clone(), Some(props))?;
+            let mut writer = ArrowWriter::try_new(&mut part_file, schema.clone(), Some(props))?;
             for ref record in record_reader.merged_iter(schema, time_partition.cloned()) {
                 writer.write(record)?;
             }
-
             writer.close()?;
-            if parquet_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 {
+
+            if part_file.metadata().unwrap().len() < parquet::file::FOOTER_SIZE as u64 {
                 error!(
                     "Invalid parquet file {:?} detected for stream {}, removing it",
-                    &parquet_path, &self.stream_name
+                    &part_path, &self.stream_name
                 );
-                remove_file(parquet_path).unwrap();
+                remove_file(part_path).unwrap();
             } else {
+                trace!("Parquet file successfully constructed");
+                if let Err(e) = std::fs::rename(&part_path, &parquet_path) {
+                    error!(
+                        "Couldn't rename part file: {part_path:?} -> {parquet_path:?}, error = {e}"
+                    );
+                }
                 for file in arrow_files {
                     // warn!("file-\n{file:?}\n");
                     let file_size = file.metadata().unwrap().len();
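The write path now stages output in a `<name>.part` file and only renames it to the final `.parquet` path after the writer has been closed and the size check has passed, so a crash or error mid-write can no longer leave a truncated `.parquet` file for the listing above to pick up. A minimal sketch of the same pattern with simplified error handling and a hypothetical `write_via_part_file` helper (assuming the `arrow_array` crate is available alongside `parquet`):

```rust
use std::error::Error;
use std::fs::{remove_file, rename, File};
use std::path::Path;

use arrow_array::RecordBatch;
use parquet::{arrow::ArrowWriter, file::FOOTER_SIZE};

/// Write `batch` to `<parquet_path>.part`, then publish it under the final
/// name only if the result is at least large enough to contain a footer.
fn write_via_part_file(parquet_path: &Path, batch: &RecordBatch) -> Result<(), Box<dyn Error>> {
    let mut part_path = parquet_path.to_owned();
    part_path.set_extension("part");

    let part_file = File::create(&part_path)?;
    let mut writer = ArrowWriter::try_new(&part_file, batch.schema(), None)?;
    writer.write(batch)?;
    writer.close()?;

    if part_file.metadata()?.len() < FOOTER_SIZE as u64 {
        // Too small to hold a parquet footer: discard instead of publishing it.
        remove_file(&part_path)?;
    } else {
        // Same-directory rename, so readers only ever see the finished file.
        rename(&part_path, parquet_path)?;
    }
    Ok(())
}
```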