@@ -5,8 +5,9 @@ use std::sync::Arc;
5
5
use arrow_array:: { Array , Int32Array , Int64Array , MapArray , RecordBatch , StringArray , StructArray } ;
6
6
use arrow_select:: filter:: filter_record_batch;
7
7
use chrono:: { DateTime , Utc } ;
8
+ use delta_kernel:: engine:: arrow_data:: ArrowEngineData ;
8
9
use delta_kernel:: expressions:: { Scalar , StructData } ;
9
- use delta_kernel:: { Expression , ExpressionHandler } ;
10
+ use delta_kernel:: { Expression , ExpressionEvaluator , ExpressionHandler } ;
10
11
use indexmap:: IndexMap ;
11
12
use object_store:: path:: Path ;
12
13
use object_store:: ObjectMeta ;
@@ -34,27 +35,44 @@ pub(crate) trait PartitionsExt {
34
35
35
36
impl PartitionsExt for IndexMap<&str, Scalar> {
    /// Builds a Hive-style partition path from this map.
    ///
    /// Every entry becomes a `key=value` segment, where the value part is
    /// whatever `serialize_encoded` yields for the scalar; the segments are
    /// joined with `/` in the map's iteration order. An empty map produces an
    /// empty string.
    fn hive_partition_path(&self) -> String {
        let mut segments = Vec::with_capacity(self.len());
        for (name, value) in self.iter() {
            segments.push(format!("{name}={}", value.serialize_encoded()));
        }
        segments.join("/")
    }
}
47
44
48
45
impl PartitionsExt for IndexMap<String, Scalar> {
    /// Builds a Hive-style partition path from this map.
    ///
    /// Every entry becomes a `key=value` segment, where the value part is
    /// whatever `serialize_encoded` yields for the scalar; the segments are
    /// joined with `/` in the map's iteration order. An empty map produces an
    /// empty string.
    fn hive_partition_path(&self) -> String {
        let mut segments = Vec::with_capacity(self.len());
        for (name, value) in self.iter() {
            segments.push(format!("{name}={}", value.serialize_encoded()));
        }
        segments.join("/")
    }
}
53
+
54
+ impl PartitionsExt for StructData {
55
+ fn hive_partition_path ( & self ) -> String {
56
+ self . fields ( )
51
57
. iter ( )
52
- . map ( |( k, v) | {
53
- let encoded = v. serialize_encoded ( ) ;
54
- format ! ( "{k}={encoded}" )
55
- } )
56
- . collect :: < Vec < _ > > ( ) ;
57
- fields. join ( "/" )
58
+ . zip ( self . values ( ) . iter ( ) )
59
+ . map ( |( k, v) | format ! ( "{}={}" , k. name( ) , v. serialize_encoded( ) ) )
60
+ . collect :: < Vec < _ > > ( )
61
+ . join ( "/" )
62
+ }
63
+ }
64
+
65
+ pub trait StructDataExt {
66
+ fn get ( & self , key : & str ) -> Option < & Scalar > ;
67
+ }
68
+
69
+ impl StructDataExt for StructData {
70
+ fn get ( & self , key : & str ) -> Option < & Scalar > {
71
+ self . fields ( )
72
+ . iter ( )
73
+ . zip ( self . values ( ) . iter ( ) )
74
+ . find ( |( k, _) | k. name ( ) == key)
75
+ . map ( |( _, v) | v)
58
76
}
59
77
}
60
78
@@ -134,13 +152,10 @@ impl DeletionVectorView<'_> {
134
152
/// functionality, e.g. parsing partition values.
135
153
#[ derive( Debug , PartialEq ) ]
136
154
pub struct LogicalFile < ' a > {
137
- path : & ' a StringArray ,
138
- /// The on-disk size of this data file in bytes
139
- size : & ' a Int64Array ,
140
- /// Last modification time of the file in milliseconds since the epoch.
141
- modification_time : & ' a Int64Array ,
155
+ data : Arc < RecordBatch > ,
142
156
/// The partition values for this logical file.
143
157
partition_values : & ' a MapArray ,
158
+ partition_values_parsed : Option < & ' a StructArray > ,
144
159
/// Struct containing all available statistics for the columns in this file.
145
160
stats : & ' a StructArray ,
146
161
/// Array containing the deletion vector data.
@@ -155,7 +170,7 @@ pub struct LogicalFile<'a> {
155
170
impl LogicalFile < ' _ > {
156
171
/// Path to the files storage location.
157
172
pub fn path ( & self ) -> Cow < ' _ , str > {
158
- percent_decode_str ( self . path . value ( self . index ) ) . decode_utf8_lossy ( )
173
+ percent_decode_str ( pick :: < StringArray > ( & self . data , 0 ) . value ( self . index ) ) . decode_utf8_lossy ( )
159
174
}
160
175
161
176
/// An object store [`Path`] to the file.
@@ -173,12 +188,12 @@ impl LogicalFile<'_> {
173
188
174
189
/// File size stored on disk.
175
190
pub fn size ( & self ) -> i64 {
176
- self . size . value ( self . index )
191
+ pick :: < Int64Array > ( & self . data , 1 ) . value ( self . index )
177
192
}
178
193
179
194
/// Last modification time of the file.
180
195
pub fn modification_time ( & self ) -> i64 {
181
- self . modification_time . value ( self . index )
196
+ pick :: < Int64Array > ( & self . data , 2 ) . value ( self . index )
182
197
}
183
198
184
199
/// Datetime of the last modification time of the file.
@@ -191,6 +206,14 @@ impl LogicalFile<'_> {
191
206
) )
192
207
}
193
208
209
+ pub fn partition_values_scalar ( & self ) -> Option < StructData > {
210
+ self . partition_values_parsed
211
+ . and_then ( |arr| match Scalar :: from_array ( arr, self . index ) {
212
+ Some ( Scalar :: Struct ( s) ) => Some ( s) ,
213
+ _ => None ,
214
+ } )
215
+ }
216
+
194
217
/// The partition values for this logical file.
195
218
pub fn partition_values ( & self ) -> DeltaResult < PartitionValues < ' _ > > {
196
219
if self . partition_fields . is_empty ( ) {
@@ -296,11 +319,13 @@ impl LogicalFile<'_> {
296
319
deletion_timestamp : Some ( Utc :: now ( ) . timestamp_millis ( ) ) ,
297
320
extended_file_metadata : Some ( true ) ,
298
321
size : Some ( self . size ( ) ) ,
299
- partition_values : self . partition_values ( ) . ok ( ) . map ( |pv| {
300
- pv. iter ( )
322
+ partition_values : self . partition_values_scalar ( ) . map ( |pv| {
323
+ pv. fields ( )
324
+ . iter ( )
325
+ . zip ( pv. values ( ) . iter ( ) )
301
326
. map ( |( k, v) | {
302
327
(
303
- k. to_string ( ) ,
328
+ k. name ( ) . to_owned ( ) ,
304
329
if v. is_null ( ) {
305
330
None
306
331
} else {
@@ -335,9 +360,8 @@ impl<'a> TryFrom<&LogicalFile<'a>> for ObjectMeta {
335
360
/// Helper for processing data from the materialized Delta log.
336
361
pub struct FileStatsAccessor < ' a > {
337
362
partition_fields : PartitionFields < ' a > ,
338
- paths : & ' a StringArray ,
363
+ data : Arc < RecordBatch > ,
339
364
sizes : & ' a Int64Array ,
340
- modification_times : & ' a Int64Array ,
341
365
stats : & ' a StructArray ,
342
366
deletion_vector : Option < DeletionVector < ' a > > ,
343
367
partition_values : & ' a MapArray ,
@@ -346,15 +370,57 @@ pub struct FileStatsAccessor<'a> {
346
370
pointer : usize ,
347
371
}
348
372
373
+ lazy_static:: lazy_static! {
374
+ static ref FILE_SCHEMA : StructType = StructType :: new( [
375
+ StructField :: new( "path" , DataType :: STRING , false ) ,
376
+ StructField :: new( "size" , DataType :: LONG , false ) ,
377
+ ] ) ;
378
+ static ref FILE_PICKER : Arc <dyn ExpressionEvaluator > = ARROW_HANDLER . get_evaluator(
379
+ Arc :: new( FILE_SCHEMA . clone( ) ) ,
380
+ Expression :: struct_from( [
381
+ Expression :: column( [ "add" , "path" ] ) ,
382
+ Expression :: column( [ "add" , "size" ] ) ,
383
+ Expression :: column( [ "add" , "modificationTime" ] )
384
+ ] ) ,
385
+ DataType :: struct_type( [
386
+ StructField :: new( "path" , DataType :: STRING , false ) ,
387
+ StructField :: new( "size" , DataType :: LONG , false ) ,
388
+ StructField :: new( "modification_time" , DataType :: LONG , false ) ,
389
+ ] ) ,
390
+ ) ;
391
+ }
392
+
393
+ fn pick < ' a , T : Array + ' static > ( data : & ' a RecordBatch , idx : usize ) -> & ' a T {
394
+ data. column ( idx)
395
+ . as_any ( )
396
+ . downcast_ref :: < T > ( )
397
+ . ok_or_else ( || {
398
+ DeltaTableError :: generic ( format ! (
399
+ "expected column '{}' to be of type '{}'" ,
400
+ idx,
401
+ std:: any:: type_name:: <T >( )
402
+ ) )
403
+ } )
404
+ . unwrap ( )
405
+ }
406
+
349
407
impl < ' a > FileStatsAccessor < ' a > {
350
408
pub ( crate ) fn try_new (
351
409
data : & ' a RecordBatch ,
352
410
metadata : & ' a Metadata ,
353
411
schema : & ' a StructType ,
354
412
) -> DeltaResult < Self > {
355
- let paths = extract_and_cast :: < StringArray > ( data, "add.path" ) ?;
413
+ let file_data = FILE_PICKER . evaluate ( & ArrowEngineData :: new ( data. clone ( ) ) ) ?;
414
+ let result = file_data
415
+ . into_any ( )
416
+ . downcast :: < ArrowEngineData > ( )
417
+ . map_err ( |_| {
418
+ DeltaTableError :: generic ( "failed to downcast evaluator result to ArrowEngineData." )
419
+ } ) ?
420
+ . record_batch ( )
421
+ . clone ( ) ;
422
+
356
423
let sizes = extract_and_cast :: < Int64Array > ( data, "add.size" ) ?;
357
- let modification_times = extract_and_cast :: < Int64Array > ( data, "add.modificationTime" ) ?;
358
424
let stats = extract_and_cast :: < StructArray > ( data, "add.stats_parsed" ) ?;
359
425
let partition_values = extract_and_cast :: < MapArray > ( data, "add.partitionValues" ) ?;
360
426
let partition_values_parsed =
@@ -398,9 +464,8 @@ impl<'a> FileStatsAccessor<'a> {
398
464
399
465
Ok ( Self {
400
466
partition_fields,
401
- paths ,
467
+ data : Arc :: new ( result ) ,
402
468
sizes,
403
- modification_times,
404
469
stats,
405
470
deletion_vector,
406
471
partition_values,
@@ -418,10 +483,9 @@ impl<'a> FileStatsAccessor<'a> {
418
483
) ) ) ;
419
484
}
420
485
Ok ( LogicalFile {
421
- path : self . paths ,
422
- size : self . sizes ,
423
- modification_time : self . modification_times ,
486
+ data : self . data . clone ( ) ,
424
487
partition_values : self . partition_values ,
488
+ partition_values_parsed : self . partition_values_parsed . clone ( ) ,
425
489
partition_fields : self . partition_fields . clone ( ) ,
426
490
stats : self . stats ,
427
491
deletion_vector : self . deletion_vector . clone ( ) ,
@@ -444,30 +508,6 @@ impl<'a> Iterator for FileStatsAccessor<'a> {
444
508
}
445
509
}
446
510
447
- pub struct LogDataIterator < ' a > {
448
- data : & ' a RecordBatch ,
449
- pointer : usize ,
450
- }
451
-
452
- impl < ' a > LogDataIterator < ' a > {
453
- pub ( crate ) fn new ( data : & ' a RecordBatch ) -> Self {
454
- Self { data, pointer : 0 }
455
- }
456
-
457
- pub ( crate ) fn len ( & self ) -> usize {
458
- self . data . num_rows ( )
459
- }
460
-
461
- pub fn path ( & self ) -> & str {
462
- let paths = self
463
- . data
464
- . column_by_name ( "path" )
465
- . and_then ( |c| c. as_any ( ) . downcast_ref :: < StringArray > ( ) )
466
- . unwrap ( ) ;
467
- paths. value ( self . pointer )
468
- }
469
- }
470
-
471
511
/// Provides semantic access to the log data.
472
512
///
473
513
/// This is a helper struct that provides access to the log data in a more semantic way
@@ -896,8 +936,8 @@ mod tests {
896
936
897
937
assert_eq ! ( json_action. path( ) , struct_action. path( ) ) ;
898
938
assert_eq ! (
899
- json_action. partition_values ( ) . unwrap ( ) ,
900
- struct_action. partition_values ( ) . unwrap ( )
939
+ json_action. partition_values_scalar ( ) ,
940
+ struct_action. partition_values_scalar ( )
901
941
) ;
902
942
// assert_eq!(
903
943
// json_action.max_values().unwrap(),
0 commit comments