@@ -61,6 +61,7 @@ use datafusion_physical_expr::{
61
61
62
62
use async_trait:: async_trait;
63
63
use futures:: { future, stream, StreamExt , TryStreamExt } ;
64
+ use itertools:: Itertools ;
64
65
use object_store:: ObjectStore ;
65
66
66
67
/// Configuration for creating a [`ListingTable`]
@@ -438,6 +439,112 @@ impl ListingOptions {
438
439
439
440
self . format . infer_schema ( state, & store, & files) . await
440
441
}
442
+
443
+ /// Infers the partition columns stored in `LOCATION` and compares
444
+ /// them with the columns provided in `PARTITIONED BY` to help prevent
445
+ /// accidental corrupts of partitioned tables.
446
+ ///
447
+ /// Allows specifying partial partitions.
448
+ pub async fn validate_partitions (
449
+ & self ,
450
+ state : & SessionState ,
451
+ table_path : & ListingTableUrl ,
452
+ ) -> Result < ( ) > {
453
+ if self . table_partition_cols . is_empty ( ) {
454
+ return Ok ( ( ) ) ;
455
+ }
456
+
457
+ if !table_path. is_collection ( ) {
458
+ return plan_err ! (
459
+ "Can't create a partitioned table backed by a single file, \
460
+ perhaps the URL is missing a trailing slash?"
461
+ ) ;
462
+ }
463
+
464
+ let inferred = self . infer_partitions ( state, table_path) . await ?;
465
+
466
+ // no partitioned files found on disk
467
+ if inferred. is_empty ( ) {
468
+ return Ok ( ( ) ) ;
469
+ }
470
+
471
+ let table_partition_names = self
472
+ . table_partition_cols
473
+ . iter ( )
474
+ . map ( |( col_name, _) | col_name. clone ( ) )
475
+ . collect_vec ( ) ;
476
+
477
+ if inferred. len ( ) < table_partition_names. len ( ) {
478
+ return plan_err ! (
479
+ "Inferred partitions to be {:?}, but got {:?}" ,
480
+ inferred,
481
+ table_partition_names
482
+ ) ;
483
+ }
484
+
485
+ // match prefix to allow creating tables with partial partitions
486
+ for ( idx, col) in table_partition_names. iter ( ) . enumerate ( ) {
487
+ if & inferred[ idx] != col {
488
+ return plan_err ! (
489
+ "Inferred partitions to be {:?}, but got {:?}" ,
490
+ inferred,
491
+ table_partition_names
492
+ ) ;
493
+ }
494
+ }
495
+
496
+ Ok ( ( ) )
497
+ }
498
+
499
+ /// Infer the partitioning at the given path on the provided object store.
500
+ /// For performance reasons, it doesn't read all the files on disk
501
+ /// and therefore may fail to detect invalid partitioning.
502
+ async fn infer_partitions (
503
+ & self ,
504
+ state : & SessionState ,
505
+ table_path : & ListingTableUrl ,
506
+ ) -> Result < Vec < String > > {
507
+ let store = state. runtime_env ( ) . object_store ( table_path) ?;
508
+
509
+ // only use 10 files for inference
510
+ // This can fail to detect inconsistent partition keys
511
+ // A DFS traversal approach of the store can help here
512
+ let files: Vec < _ > = table_path
513
+ . list_all_files ( state, store. as_ref ( ) , & self . file_extension )
514
+ . await ?
515
+ . take ( 10 )
516
+ . try_collect ( )
517
+ . await ?;
518
+
519
+ let stripped_path_parts = files. iter ( ) . map ( |file| {
520
+ table_path
521
+ . strip_prefix ( & file. location )
522
+ . unwrap ( )
523
+ . collect_vec ( )
524
+ } ) ;
525
+
526
+ let partition_keys = stripped_path_parts
527
+ . map ( |path_parts| {
528
+ path_parts
529
+ . into_iter ( )
530
+ . rev ( )
531
+ . skip ( 1 ) // get parents only; skip the file itself
532
+ . rev ( )
533
+ . map ( |s| s. split ( '=' ) . take ( 1 ) . collect ( ) )
534
+ . collect_vec ( )
535
+ } )
536
+ . collect_vec ( ) ;
537
+
538
+ match partition_keys. into_iter ( ) . all_equal_value ( ) {
539
+ Ok ( v) => Ok ( v) ,
540
+ Err ( None ) => Ok ( vec ! [ ] ) ,
541
+ Err ( Some ( diff) ) => {
542
+ let mut sorted_diff = [ diff. 0 , diff. 1 ] ;
543
+ sorted_diff. sort ( ) ;
544
+ plan_err ! ( "Found mixed partition values on disk {:?}" , sorted_diff)
545
+ }
546
+ }
547
+ }
441
548
}
442
549
443
550
/// Reads data from one or more files via an
0 commit comments