20
20
use std:: sync:: Arc ;
21
21
22
22
use arrow:: {
23
- array:: { ArrayRef , Int32Array } ,
23
+ array:: { as_string_array , ArrayRef , Int32Array , StringArray } ,
24
24
compute:: SortOptions ,
25
25
record_batch:: RecordBatch ,
26
26
} ;
@@ -29,6 +29,7 @@ use datafusion::physical_plan::expressions::PhysicalSortExpr;
29
29
use datafusion:: physical_plan:: sorts:: sort:: SortExec ;
30
30
use datafusion:: physical_plan:: { collect, ExecutionPlan } ;
31
31
use datafusion:: prelude:: { SessionConfig , SessionContext } ;
32
+ use datafusion_common:: cast:: as_int32_array;
32
33
use datafusion_execution:: memory_pool:: GreedyMemoryPool ;
33
34
use datafusion_physical_expr:: expressions:: col;
34
35
use datafusion_physical_expr_common:: sort_expr:: LexOrdering ;
@@ -42,12 +43,17 @@ const KB: usize = 1 << 10;
42
43
#[ cfg_attr( tarpaulin, ignore) ]
43
44
async fn test_sort_10k_mem ( ) {
44
45
for ( batch_size, should_spill) in [ ( 5 , false ) , ( 20000 , true ) , ( 500000 , true ) ] {
45
- SortTest :: new ( )
46
+ let ( input , collected ) = SortTest :: new ( )
46
47
. with_int32_batches ( batch_size)
48
+ . with_sort_columns ( vec ! [ "x" ] )
47
49
. with_pool_size ( 10 * KB )
48
50
. with_should_spill ( should_spill)
49
51
. run ( )
50
52
. await ;
53
+
54
+ let expected = partitions_to_sorted_vec ( & input) ;
55
+ let actual = batches_to_vec ( & collected) ;
56
+ assert_eq ! ( expected, actual, "failure in @ batch_size {batch_size:?}" ) ;
51
57
}
52
58
}
53
59
@@ -57,29 +63,123 @@ async fn test_sort_100k_mem() {
57
63
for ( batch_size, should_spill) in
58
64
[ ( 5 , false ) , ( 10000 , false ) , ( 20000 , true ) , ( 1000000 , true ) ]
59
65
{
60
- SortTest :: new ( )
66
+ let ( input , collected ) = SortTest :: new ( )
61
67
. with_int32_batches ( batch_size)
68
+ . with_sort_columns ( vec ! [ "x" ] )
69
+ . with_pool_size ( 100 * KB )
70
+ . with_should_spill ( should_spill)
71
+ . run ( )
72
+ . await ;
73
+
74
+ let expected = partitions_to_sorted_vec ( & input) ;
75
+ let actual = batches_to_vec ( & collected) ;
76
+ assert_eq ! ( expected, actual, "failure in @ batch_size {batch_size:?}" ) ;
77
+ }
78
+ }
79
+
80
+ #[ tokio:: test]
81
+ #[ cfg_attr( tarpaulin, ignore) ]
82
+ async fn test_sort_strings_100k_mem ( ) {
83
+ for ( batch_size, should_spill) in
84
+ [ ( 5 , false ) , ( 1000 , false ) , ( 10000 , true ) , ( 20000 , true ) ]
85
+ {
86
+ let ( input, collected) = SortTest :: new ( )
87
+ . with_utf8_batches ( batch_size)
88
+ . with_sort_columns ( vec ! [ "x" ] )
62
89
. with_pool_size ( 100 * KB )
63
90
. with_should_spill ( should_spill)
64
91
. run ( )
65
92
. await ;
93
+
94
+ let mut input = input
95
+ . iter ( )
96
+ . flat_map ( |p| p. iter ( ) )
97
+ . map ( |b| {
98
+ let array = b. column ( 0 ) ;
99
+ as_string_array ( array)
100
+ . iter ( )
101
+ . map ( |s| s. unwrap ( ) . to_string ( ) )
102
+ } )
103
+ . flatten ( )
104
+ . collect :: < Vec < String > > ( ) ;
105
+ input. sort_unstable ( ) ;
106
+ let actual = collected
107
+ . iter ( )
108
+ . map ( |b| {
109
+ let array = b. column ( 0 ) ;
110
+ as_string_array ( array)
111
+ . iter ( )
112
+ . map ( |s| s. unwrap ( ) . to_string ( ) )
113
+ } )
114
+ . flatten ( )
115
+ . collect :: < Vec < String > > ( ) ;
116
+ assert_eq ! ( input, actual) ;
117
+ }
118
+ }
119
+
120
+ #[ tokio:: test]
121
+ #[ cfg_attr( tarpaulin, ignore) ]
122
+ async fn test_sort_multi_columns_100k_mem ( ) {
123
+ for ( batch_size, should_spill) in
124
+ [ ( 5 , false ) , ( 1000 , false ) , ( 10000 , true ) , ( 20000 , true ) ]
125
+ {
126
+ let ( input, collected) = SortTest :: new ( )
127
+ . with_int32_utf8_batches ( batch_size)
128
+ . with_sort_columns ( vec ! [ "x" , "y" ] )
129
+ . with_pool_size ( 100 * KB )
130
+ . with_should_spill ( should_spill)
131
+ . run ( )
132
+ . await ;
133
+
134
+ fn record_batch_to_vec ( b : & RecordBatch ) -> Vec < ( i32 , String ) > {
135
+ let mut rows: Vec < _ > = Vec :: new ( ) ;
136
+ let i32_array = as_int32_array ( b. column ( 0 ) ) . unwrap ( ) ;
137
+ let string_array = as_string_array ( b. column ( 1 ) ) ;
138
+ for i in 0 ..b. num_rows ( ) {
139
+ let str = string_array. value ( i) . to_string ( ) ;
140
+ let i32 = i32_array. value ( i) ;
141
+ rows. push ( ( i32, str) ) ;
142
+ }
143
+ rows
144
+ }
145
+ let mut input = input
146
+ . iter ( )
147
+ . flat_map ( |p| p. iter ( ) )
148
+ . map ( record_batch_to_vec)
149
+ . flatten ( )
150
+ . collect :: < Vec < ( i32 , String ) > > ( ) ;
151
+ input. sort_unstable ( ) ;
152
+ let actual = collected
153
+ . iter ( )
154
+ . map ( record_batch_to_vec)
155
+ . flatten ( )
156
+ . collect :: < Vec < ( i32 , String ) > > ( ) ;
157
+ assert_eq ! ( input, actual) ;
66
158
}
67
159
}
68
160
69
161
#[ tokio:: test]
70
162
async fn test_sort_unlimited_mem ( ) {
71
163
for ( batch_size, should_spill) in [ ( 5 , false ) , ( 20000 , false ) , ( 1000000 , false ) ] {
72
- SortTest :: new ( )
164
+ let ( input , collected ) = SortTest :: new ( )
73
165
. with_int32_batches ( batch_size)
166
+ . with_sort_columns ( vec ! [ "x" ] )
74
167
. with_pool_size ( usize:: MAX )
75
168
. with_should_spill ( should_spill)
76
169
. run ( )
77
170
. await ;
171
+
172
+ let expected = partitions_to_sorted_vec ( & input) ;
173
+ let actual = batches_to_vec ( & collected) ;
174
+ assert_eq ! ( expected, actual, "failure in @ batch_size {batch_size:?}" ) ;
78
175
}
79
176
}
177
+
80
178
#[ derive( Debug , Default ) ]
81
179
struct SortTest {
82
180
input : Vec < Vec < RecordBatch > > ,
181
+ /// The names of the columns to sort by
182
+ sort_columns : Vec < String > ,
83
183
/// GreedyMemoryPool size, if specified
84
184
pool_size : Option < usize > ,
85
185
/// If true, expect the sort to spill
@@ -91,12 +191,29 @@ impl SortTest {
91
191
Default :: default ( )
92
192
}
93
193
194
+ fn with_sort_columns ( mut self , sort_columns : Vec < & str > ) -> Self {
195
+ self . sort_columns = sort_columns. iter ( ) . map ( |s| s. to_string ( ) ) . collect ( ) ;
196
+ self
197
+ }
198
+
94
199
/// Create batches of int32 values of rows
95
200
fn with_int32_batches ( mut self , rows : usize ) -> Self {
96
201
self . input = vec ! [ make_staggered_i32_batches( rows) ] ;
97
202
self
98
203
}
99
204
205
+ /// Create batches of utf8 values of rows
206
+ fn with_utf8_batches ( mut self , rows : usize ) -> Self {
207
+ self . input = vec ! [ make_staggered_utf8_batches( rows) ] ;
208
+ self
209
+ }
210
+
211
+ /// Create batches of int32 and utf8 values of rows
212
+ fn with_int32_utf8_batches ( mut self , rows : usize ) -> Self {
213
+ self . input = vec ! [ make_staggered_i32_utf8_batches( rows) ] ;
214
+ self
215
+ }
216
+
100
217
/// specify that this test should use a memory pool of the specified size
101
218
fn with_pool_size ( mut self , pool_size : usize ) -> Self {
102
219
self . pool_size = Some ( pool_size) ;
@@ -110,7 +227,7 @@ impl SortTest {
110
227
111
228
/// Sort the input using SortExec, check the spill and memory pool
112
229
/// expectations, and return the input partitions and the sorted output batches
113
- async fn run ( & self ) {
230
+ async fn run ( & self ) -> ( Vec < Vec < RecordBatch > > , Vec < RecordBatch > ) {
114
231
let input = self . input . clone ( ) ;
115
232
let first_batch = input
116
233
. iter ( )
@@ -119,16 +236,21 @@ impl SortTest {
119
236
. expect ( "at least one batch" ) ;
120
237
let schema = first_batch. schema ( ) ;
121
238
122
- let sort = LexOrdering :: new ( vec ! [ PhysicalSortExpr {
123
- expr: col( "x" , & schema) . unwrap( ) ,
124
- options: SortOptions {
125
- descending: false ,
126
- nulls_first: true ,
127
- } ,
128
- } ] ) ;
239
+ let sort_ordering = LexOrdering :: new (
240
+ self . sort_columns
241
+ . iter ( )
242
+ . map ( |c| PhysicalSortExpr {
243
+ expr : col ( c, & schema) . unwrap ( ) ,
244
+ options : SortOptions {
245
+ descending : false ,
246
+ nulls_first : true ,
247
+ } ,
248
+ } )
249
+ . collect ( ) ,
250
+ ) ;
129
251
130
252
let exec = MemorySourceConfig :: try_new_exec ( & input, schema, None ) . unwrap ( ) ;
131
- let sort = Arc :: new ( SortExec :: new ( sort , exec) ) ;
253
+ let sort = Arc :: new ( SortExec :: new ( sort_ordering , exec) ) ;
132
254
133
255
let session_config = SessionConfig :: new ( ) ;
134
256
let session_ctx = if let Some ( pool_size) = self . pool_size {
@@ -153,9 +275,6 @@ impl SortTest {
153
275
let task_ctx = session_ctx. task_ctx ( ) ;
154
276
let collected = collect ( sort. clone ( ) , task_ctx) . await . unwrap ( ) ;
155
277
156
- let expected = partitions_to_sorted_vec ( & input) ;
157
- let actual = batches_to_vec ( & collected) ;
158
-
159
278
if self . should_spill {
160
279
assert_ne ! (
161
280
sort. metrics( ) . unwrap( ) . spill_count( ) . unwrap( ) ,
@@ -175,7 +294,8 @@ impl SortTest {
175
294
0 ,
176
295
"The sort should have returned all memory used back to the memory pool"
177
296
) ;
178
- assert_eq ! ( expected, actual, "failure in @ pool_size {self:?}" ) ;
297
+
298
+ ( input, collected)
179
299
}
180
300
}
181
301
@@ -203,3 +323,63 @@ fn make_staggered_i32_batches(len: usize) -> Vec<RecordBatch> {
203
323
}
204
324
batches
205
325
}
326
+
327
+ /// Return randomly sized record batches in a field named 'x' of type `Utf8`
328
+ /// with randomized content
329
+ fn make_staggered_utf8_batches ( len : usize ) -> Vec < RecordBatch > {
330
+ let mut rng = rand:: thread_rng ( ) ;
331
+ let max_batch = 1024 ;
332
+
333
+ let mut batches = vec ! [ ] ;
334
+ let mut remaining = len;
335
+ while remaining != 0 {
336
+ let to_read = rng. gen_range ( 0 ..=remaining. min ( max_batch) ) ;
337
+ remaining -= to_read;
338
+
339
+ batches. push (
340
+ RecordBatch :: try_from_iter ( vec ! [ (
341
+ "x" ,
342
+ Arc :: new( StringArray :: from_iter_values(
343
+ ( 0 ..to_read) . map( |_| format!( "test_string_{}" , rng. gen :: <u32 >( ) ) ) ,
344
+ ) ) as ArrayRef ,
345
+ ) ] )
346
+ . unwrap ( ) ,
347
+ )
348
+ }
349
+ batches
350
+ }
351
+
352
+ /// Return randomly sized record batches in a field named 'x' of type `Int32`
353
+ /// with randomized i32 content and a field named 'y' of type `Utf8`
354
+ /// with randomized content
355
+ fn make_staggered_i32_utf8_batches ( len : usize ) -> Vec < RecordBatch > {
356
+ let mut rng = rand:: thread_rng ( ) ;
357
+ let max_batch = 1024 ;
358
+
359
+ let mut batches = vec ! [ ] ;
360
+ let mut remaining = len;
361
+ while remaining != 0 {
362
+ let to_read = rng. gen_range ( 0 ..=remaining. min ( max_batch) ) ;
363
+ remaining -= to_read;
364
+
365
+ batches. push (
366
+ RecordBatch :: try_from_iter ( vec ! [
367
+ (
368
+ "x" ,
369
+ Arc :: new( Int32Array :: from_iter_values(
370
+ ( 0 ..to_read) . map( |_| rng. gen ( ) ) ,
371
+ ) ) as ArrayRef ,
372
+ ) ,
373
+ (
374
+ "y" ,
375
+ Arc :: new( StringArray :: from_iter_values(
376
+ ( 0 ..to_read) . map( |_| format!( "test_string_{}" , rng. gen :: <u32 >( ) ) ) ,
377
+ ) ) as ArrayRef ,
378
+ ) ,
379
+ ] )
380
+ . unwrap ( ) ,
381
+ )
382
+ }
383
+
384
+ batches
385
+ }
0 commit comments