@@ -29,13 +29,11 @@ use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType};
 use crate::expressions::PhysicalSortExpr;
 use crate::limit::LimitStream;
 use crate::metrics::{
-    BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
+    BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, SpillMetrics,
 };
 use crate::projection::{make_with_child, update_expr, ProjectionExec};
 use crate::sorts::streaming_merge::StreamingMergeBuilder;
-use crate::spill::{
-    get_record_batch_memory_size, read_spill_as_stream, spill_record_batches,
-};
+use crate::spill::{get_record_batch_memory_size, InProgressSpillFile, SpillManager};
 use crate::stream::RecordBatchStreamAdapter;
 use crate::topk::TopK;
 use crate::{
@@ -50,7 +48,7 @@ use arrow::array::{
 use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn};
 use arrow::datatypes::{DataType, SchemaRef};
 use arrow::row::{RowConverter, SortField};
-use datafusion_common::{internal_err, Result};
+use datafusion_common::{internal_datafusion_err, internal_err, Result};
 use datafusion_execution::disk_manager::RefCountedTempFile;
 use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
 use datafusion_execution::runtime_env::RuntimeEnv;
@@ -65,23 +63,14 @@ struct ExternalSorterMetrics {
     /// metrics
     baseline: BaselineMetrics,
 
-    /// count of spills during the execution of the operator
-    spill_count: Count,
-
-    /// total spilled bytes during the execution of the operator
-    spilled_bytes: Count,
-
-    /// total spilled rows during the execution of the operator
-    spilled_rows: Count,
+    spill_metrics: SpillMetrics,
 }
 
 impl ExternalSorterMetrics {
     fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
         Self {
             baseline: BaselineMetrics::new(metrics, partition),
-            spill_count: MetricBuilder::new(metrics).spill_count(partition),
-            spilled_bytes: MetricBuilder::new(metrics).spilled_bytes(partition),
-            spilled_rows: MetricBuilder::new(metrics).spilled_rows(partition),
+            spill_metrics: SpillMetrics::new(metrics, partition),
         }
     }
 }
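For readers unfamiliar with the consolidated metrics type, the sketch below shows how `SpillMetrics` can be constructed and read. It is not part of this patch; it only uses calls that appear elsewhere in the diff (`SpillMetrics::new`, and the `spill_file_count` / `spilled_bytes` / `spilled_rows` counters read via `.value()`), and the helper function name is hypothetical.

```rust
use crate::metrics::{ExecutionPlanMetricsSet, SpillMetrics};

// Hypothetical helper: registers spill metrics for one partition and reads the
// three counters that used to be separate `Count` fields on ExternalSorterMetrics.
fn report_spill_activity(metrics: &ExecutionPlanMetricsSet, partition: usize) {
    let spill_metrics = SpillMetrics::new(metrics, partition);

    println!(
        "spill files: {}, spilled bytes: {}, spilled rows: {}",
        spill_metrics.spill_file_count.value(),
        spill_metrics.spilled_bytes.value(),
        spill_metrics.spilled_rows.value(),
    );
}
```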
@@ -230,9 +219,14 @@ struct ExternalSorter {
     /// if `Self::in_mem_batches` are sorted
     in_mem_batches_sorted: bool,
 
-    /// If data has previously been spilled, the locations of the
-    /// spill files (in Arrow IPC format)
-    spills: Vec<RefCountedTempFile>,
+    /// During external sorting, in-memory intermediate data will be appended to
+    /// this file incrementally. Once finished, this file will be moved to [`Self::finished_spill_files`].
+    in_progress_spill_file: Option<InProgressSpillFile>,
+    /// If data has previously been spilled, the locations of the spill files (in
+    /// Arrow IPC format)
+    /// Within the same spill file, the data might be chunked into multiple batches,
+    /// and ordered by sort keys.
+    finished_spill_files: Vec<RefCountedTempFile>,
 
     // ========================================================================
     // EXECUTION RESOURCES:
@@ -244,6 +238,7 @@ struct ExternalSorter {
     runtime: Arc<RuntimeEnv>,
     /// Reservation for in_mem_batches
     reservation: MemoryReservation,
+    spill_manager: SpillManager,
 
     /// Reservation for the merging of in-memory batches. If the sort
     /// might spill, `sort_spill_reservation_bytes` will be
@@ -278,15 +273,23 @@ impl ExternalSorter {
             MemoryConsumer::new(format!("ExternalSorterMerge[{partition_id}]"))
                 .register(&runtime.memory_pool);
 
+        let spill_manager = SpillManager::new(
+            Arc::clone(&runtime),
+            metrics.spill_metrics.clone(),
+            Arc::clone(&schema),
+        );
+
         Self {
             schema,
             in_mem_batches: vec![],
             in_mem_batches_sorted: false,
-            spills: vec![],
+            in_progress_spill_file: None,
+            finished_spill_files: vec![],
             expr: expr.into(),
             metrics,
             fetch,
             reservation,
+            spill_manager,
             merge_reservation,
             runtime,
             batch_size,
@@ -320,7 +323,7 @@ impl ExternalSorter {
     }
 
     fn spilled_before(&self) -> bool {
-        !self.spills.is_empty()
+        !self.finished_spill_files.is_empty()
     }
 
     /// Returns the final sorted output of all batches inserted via
@@ -348,11 +351,11 @@ impl ExternalSorter {
                 self.sort_or_spill_in_mem_batches(true).await?;
             }
 
-            for spill in self.spills.drain(..) {
+            for spill in self.finished_spill_files.drain(..) {
                 if !spill.path().exists() {
                     return internal_err!("Spill file {:?} does not exist", spill.path());
                 }
-                let stream = read_spill_as_stream(spill, Arc::clone(&self.schema), 2)?;
+                let stream = self.spill_manager.read_spill_as_stream(spill)?;
                 streams.push(stream);
             }
@@ -379,46 +382,69 @@ impl ExternalSorter {
 
     /// How many bytes have been spilled to disk?
     fn spilled_bytes(&self) -> usize {
-        self.metrics.spilled_bytes.value()
+        self.metrics.spill_metrics.spilled_bytes.value()
     }
 
     /// How many rows have been spilled to disk?
     fn spilled_rows(&self) -> usize {
-        self.metrics.spilled_rows.value()
+        self.metrics.spill_metrics.spilled_rows.value()
     }
 
     /// How many spill files have been created?
     fn spill_count(&self) -> usize {
-        self.metrics.spill_count.value()
+        self.metrics.spill_metrics.spill_file_count.value()
     }
 
-    /// Writes any `in_memory_batches` to a spill file and clears
-    /// the batches. The contents of the spill file are sorted.
+    /// When calling, all `in_mem_batches` must be sorted (*), and then all of them will
+    /// be appended to the in-progress spill file.
     ///
-    /// Returns the amount of memory freed.
-    async fn spill(&mut self) -> Result<usize> {
+    /// (*) 'Sorted' here means globally sorted for all buffered batches when the
+    /// memory limit is reached, instead of partially sorted within the batch.
+    async fn spill_append(&mut self) -> Result<()> {
+        assert!(self.in_mem_batches_sorted);
+
         // we could always get a chance to free some memory as long as we are holding some
         if self.in_mem_batches.is_empty() {
-            return Ok(0);
+            return Ok(());
+        }
+
+        // Lazily initialize the in-progress spill file
+        if self.in_progress_spill_file.is_none() {
+            self.in_progress_spill_file =
+                Some(self.spill_manager.create_in_progress_file("Sorting")?);
         }
 
         self.organize_stringview_arrays()?;
 
         debug!("Spilling sort data of ExternalSorter to disk whilst inserting");
 
-        let spill_file = self.runtime.disk_manager.create_tmp_file("Sorting")?;
         let batches = std::mem::take(&mut self.in_mem_batches);
-        let (spilled_rows, spilled_bytes) = spill_record_batches(
-            &batches,
-            spill_file.path().into(),
-            Arc::clone(&self.schema),
-        )?;
-        let used = self.reservation.free();
-        self.metrics.spill_count.add(1);
-        self.metrics.spilled_bytes.add(spilled_bytes);
-        self.metrics.spilled_rows.add(spilled_rows);
-        self.spills.push(spill_file);
-        Ok(used)
+        self.reservation.free();
+
+        let in_progress_file = self.in_progress_spill_file.as_mut().ok_or_else(|| {
+            internal_datafusion_err!("In-progress spill file should be initialized")
+        })?;
+
+        for batch in batches {
+            in_progress_file.append_batch(&batch)?;
+        }
+
+        Ok(())
+    }
+
+    /// Finishes the in-progress spill file and moves it to the finished spill files.
+    async fn spill_finish(&mut self) -> Result<()> {
+        let mut in_progress_file =
+            self.in_progress_spill_file.take().ok_or_else(|| {
+                internal_datafusion_err!("Should be called after `spill_append`")
+            })?;
+        let spill_file = in_progress_file.finish()?;
+
+        if let Some(spill_file) = spill_file {
+            self.finished_spill_files.push(spill_file);
+        }
+
+        Ok(())
     }
 
     /// Reconstruct `self.in_mem_batches` to organize the payload buffers of each
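The two new methods above define an append/finish lifecycle for a spill file. The following is a minimal sketch (not part of the patch) of how one globally sorted run ends up in a single file; it relies only on the `SpillManager` / `InProgressSpillFile` calls visible in this diff (`create_in_progress_file`, `append_batch`, `finish`), while the free-standing helper function itself is hypothetical.

```rust
use arrow::record_batch::RecordBatch;
use datafusion_common::Result;
use datafusion_execution::disk_manager::RefCountedTempFile;

use crate::spill::{InProgressSpillFile, SpillManager};

// Hypothetical helper mirroring `spill_append` + `spill_finish`: every batch of
// one sorted run is appended to the same in-progress file, which is sealed once.
fn spill_sorted_run(
    spill_manager: &SpillManager,
    sorted_batches: &[RecordBatch],
) -> Result<Option<RefCountedTempFile>> {
    let mut in_progress: InProgressSpillFile =
        spill_manager.create_in_progress_file("Sorting")?;
    for batch in sorted_batches {
        // Appends go into the same file; no new file is created per call.
        in_progress.append_batch(batch)?;
    }
    // `finish` seals the file; it returns `None` when nothing was appended.
    in_progress.finish()
}
```

Compared with the removed `spill_record_batches` path, the run no longer has to be written in one shot: the file can stay open across repeated memory-pressure events and still count as a single spill file.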
@@ -515,6 +541,7 @@ impl ExternalSorter {
         // `self.in_mem_batches` is already taken away by the sort_stream, now it is empty.
         // We'll gradually collect the sorted stream into self.in_mem_batches, or directly
         // write sorted batches to disk when the memory is insufficient.
+        let mut spilled = false;
         while let Some(batch) = sorted_stream.next().await {
             let batch = batch?;
             let sorted_size = get_reserved_byte_for_record_batch(&batch);
@@ -523,7 +550,8 @@
                 // already in memory, so it's okay to combine it with previously
                 // sorted batches, and spill together.
                 self.in_mem_batches.push(batch);
-                self.spill().await?; // reservation is freed in spill()
+                self.spill_append().await?; // reservation is freed in spill()
+                spilled = true;
             } else {
                 self.in_mem_batches.push(batch);
                 self.in_mem_batches_sorted = true;
@@ -540,7 +568,12 @@
         if (self.reservation.size() > before / 2) || force_spill {
             // We have not freed more than 50% of the memory, so we have to spill to
             // free up more memory
-            self.spill().await?;
+            self.spill_append().await?;
+            spilled = true;
+        }
+
+        if spilled {
+            self.spill_finish().await?;
         }
 
         // Reserve headroom for next sort/merge
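The `spilled` flag added above is what ties the change together: within one call to `sort_or_spill_in_mem_batches`, `spill_append` may run several times as memory fills up, but `spill_finish` runs at most once, so the whole sorted run lands in a single spill file (which is also why the spill-count bounds in the test hunk below drop). A rough sketch of the resulting call pattern, with a hypothetical driver function standing in for the real stream-draining loop:

```rust
// Hypothetical driver for one sorted run (not part of the patch): appends may
// happen several times under memory pressure, the finish happens exactly once.
async fn spill_one_run(sorter: &mut ExternalSorter) -> datafusion_common::Result<()> {
    // First chunk of the sorted run hits the memory limit and is appended.
    sorter.spill_append().await?;
    // A later chunk of the same run is appended to the same in-progress file.
    sorter.spill_append().await?;
    // Sealing the file moves it into `finished_spill_files` as ONE spill file,
    // so the spill-file count grows per run, not per append.
    sorter.spill_finish().await?;
    Ok(())
}
```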
@@ -1489,7 +1522,14 @@ mod tests {
         // batches.
         // The number of spills is roughly calculated as:
        // `number_of_batches / (sort_spill_reservation_bytes / batch_size)`
-        assert!((12..=18).contains(&spill_count));
+
+        // If this assertion fail with large spill count, make sure the following
+        // case does not happen:
+        // During external sorting, one sorted run should be spilled to disk in a
+        // single file, due to memory limit we might need to append to the file
+        // multiple times to spill all the data. Make sure we're not writing each
+        // appending as a separate file.
+        assert!((4..=8).contains(&spill_count));
         assert!((15000..=20000).contains(&spilled_rows));
         assert!((900000..=1000000).contains(&spilled_bytes));