@@ -170,11 +170,16 @@ impl Dataset {
170
170
171
171
#[ derive( Debug , Clone ) ]
172
172
pub struct ColumnDescr {
173
- // Column name
173
+ /// Column name
174
174
name : String ,
175
175
176
- // Data type of this column
176
+ /// Data type of this column
177
177
column_type : DataType ,
178
+
179
+ /// The maximum number of distinct values in this column.
180
+ ///
181
+ /// See [`ColumnDescr::with_max_num_distinct`] for more information
182
+ max_num_distinct : Option < usize > ,
178
183
}
179
184
180
185
impl ColumnDescr {
@@ -183,8 +188,18 @@ impl ColumnDescr {
183
188
Self {
184
189
name : name. to_string ( ) ,
185
190
column_type,
191
+ max_num_distinct : None ,
186
192
}
187
193
}
194
+
195
+ /// Set the maximum number of distinct values in this column.
196
+ ///
197
+ /// By default (no maximum set), the number of distinct values is randomly
198
+ /// selected between 1 and the number of rows; a maximum caps that random count.
199
+ pub fn with_max_num_distinct ( mut self , num_distinct : usize ) -> Self {
200
+ self . max_num_distinct = Some ( num_distinct) ;
201
+ self
202
+ }
188
203
}
189
204
190
205
/// Record batch generator
@@ -199,20 +214,15 @@ struct RecordBatchGenerator {
199
214
}
200
215
201
216
macro_rules! generate_string_array {
202
- ( $SELF: ident, $NUM_ROWS: ident, $BATCH_GEN_RNG: ident, $ARRAY_GEN_RNG: ident, $OFFSET_TYPE: ty) => { {
217
+ ( $SELF: ident, $NUM_ROWS: ident, $MAX_NUM_DISTINCT : expr , $ BATCH_GEN_RNG: ident, $ARRAY_GEN_RNG: ident, $OFFSET_TYPE: ty) => { {
203
218
let null_pct_idx = $BATCH_GEN_RNG. gen_range( 0 ..$SELF. candidate_null_pcts. len( ) ) ;
204
219
let null_pct = $SELF. candidate_null_pcts[ null_pct_idx] ;
205
220
let max_len = $BATCH_GEN_RNG. gen_range( 1 ..50 ) ;
206
- let num_distinct_strings = if $NUM_ROWS > 1 {
207
- $BATCH_GEN_RNG. gen_range( 1 ..$NUM_ROWS)
208
- } else {
209
- $NUM_ROWS
210
- } ;
211
221
212
222
let mut generator = StringArrayGenerator {
213
223
max_len,
214
224
num_strings: $NUM_ROWS,
215
- num_distinct_strings,
225
+ num_distinct_strings: $MAX_NUM_DISTINCT ,
216
226
null_pct,
217
227
rng: $ARRAY_GEN_RNG,
218
228
} ;
@@ -222,19 +232,14 @@ macro_rules! generate_string_array {
222
232
}
223
233
224
234
macro_rules! generate_primitive_array {
225
- ( $SELF: ident, $NUM_ROWS: ident, $BATCH_GEN_RNG: ident, $ARRAY_GEN_RNG: ident, $DATA_TYPE: ident) => {
235
+ ( $SELF: ident, $NUM_ROWS: ident, $MAX_NUM_DISTINCT : expr , $ BATCH_GEN_RNG: ident, $ARRAY_GEN_RNG: ident, $DATA_TYPE: ident) => {
226
236
paste:: paste! { {
227
237
let null_pct_idx = $BATCH_GEN_RNG. gen_range( 0 ..$SELF. candidate_null_pcts. len( ) ) ;
228
238
let null_pct = $SELF. candidate_null_pcts[ null_pct_idx] ;
229
- let num_distinct_primitives = if $NUM_ROWS > 1 {
230
- $BATCH_GEN_RNG. gen_range( 1 ..$NUM_ROWS)
231
- } else {
232
- $NUM_ROWS
233
- } ;
234
239
235
240
let mut generator = PrimitiveArrayGenerator {
236
241
num_primitives: $NUM_ROWS,
237
- num_distinct_primitives,
242
+ num_distinct_primitives: $MAX_NUM_DISTINCT ,
238
243
null_pct,
239
244
rng: $ARRAY_GEN_RNG,
240
245
} ;
@@ -264,7 +269,7 @@ impl RecordBatchGenerator {
264
269
let mut arrays = Vec :: with_capacity ( self . columns . len ( ) ) ;
265
270
for col in self . columns . iter ( ) {
266
271
let array = self . generate_array_of_type (
267
- col. column_type . clone ( ) ,
272
+ col,
268
273
num_rows,
269
274
& mut rng,
270
275
array_gen_rng. clone ( ) ,
@@ -285,16 +290,28 @@ impl RecordBatchGenerator {
285
290
286
291
fn generate_array_of_type (
287
292
& self ,
288
- data_type : DataType ,
293
+ col : & ColumnDescr ,
289
294
num_rows : usize ,
290
295
batch_gen_rng : & mut ThreadRng ,
291
296
array_gen_rng : StdRng ,
292
297
) -> ArrayRef {
293
- match data_type {
298
+ let num_distinct = if num_rows > 1 {
299
+ batch_gen_rng. gen_range ( 1 ..num_rows)
300
+ } else {
301
+ num_rows
302
+ } ;
303
+ // Cap the randomly chosen count at the column's `max_num_distinct`, if one is set
304
+ let max_num_distinct = col
305
+ . max_num_distinct
306
+ . map ( |max| num_distinct. min ( max) )
307
+ . unwrap_or ( num_distinct) ;
308
+
309
+ match col. column_type {
294
310
DataType :: Int8 => {
295
311
generate_primitive_array ! (
296
312
self ,
297
313
num_rows,
314
+ max_num_distinct,
298
315
batch_gen_rng,
299
316
array_gen_rng,
300
317
i8
@@ -304,6 +321,7 @@ impl RecordBatchGenerator {
304
321
generate_primitive_array ! (
305
322
self ,
306
323
num_rows,
324
+ max_num_distinct,
307
325
batch_gen_rng,
308
326
array_gen_rng,
309
327
i16
@@ -313,6 +331,7 @@ impl RecordBatchGenerator {
313
331
generate_primitive_array ! (
314
332
self ,
315
333
num_rows,
334
+ max_num_distinct,
316
335
batch_gen_rng,
317
336
array_gen_rng,
318
337
i32
@@ -322,6 +341,7 @@ impl RecordBatchGenerator {
322
341
generate_primitive_array ! (
323
342
self ,
324
343
num_rows,
344
+ max_num_distinct,
325
345
batch_gen_rng,
326
346
array_gen_rng,
327
347
i64
@@ -331,6 +351,7 @@ impl RecordBatchGenerator {
331
351
generate_primitive_array ! (
332
352
self ,
333
353
num_rows,
354
+ max_num_distinct,
334
355
batch_gen_rng,
335
356
array_gen_rng,
336
357
u8
@@ -340,6 +361,7 @@ impl RecordBatchGenerator {
340
361
generate_primitive_array ! (
341
362
self ,
342
363
num_rows,
364
+ max_num_distinct,
343
365
batch_gen_rng,
344
366
array_gen_rng,
345
367
u16
@@ -349,6 +371,7 @@ impl RecordBatchGenerator {
349
371
generate_primitive_array ! (
350
372
self ,
351
373
num_rows,
374
+ max_num_distinct,
352
375
batch_gen_rng,
353
376
array_gen_rng,
354
377
u32
@@ -358,6 +381,7 @@ impl RecordBatchGenerator {
358
381
generate_primitive_array ! (
359
382
self ,
360
383
num_rows,
384
+ max_num_distinct,
361
385
batch_gen_rng,
362
386
array_gen_rng,
363
387
u64
@@ -367,6 +391,7 @@ impl RecordBatchGenerator {
367
391
generate_primitive_array ! (
368
392
self ,
369
393
num_rows,
394
+ max_num_distinct,
370
395
batch_gen_rng,
371
396
array_gen_rng,
372
397
f32
@@ -376,19 +401,34 @@ impl RecordBatchGenerator {
376
401
generate_primitive_array ! (
377
402
self ,
378
403
num_rows,
404
+ max_num_distinct,
379
405
batch_gen_rng,
380
406
array_gen_rng,
381
407
f64
382
408
)
383
409
}
384
410
DataType :: Utf8 => {
385
- generate_string_array ! ( self , num_rows, batch_gen_rng, array_gen_rng, i32 )
411
+ generate_string_array ! (
412
+ self ,
413
+ num_rows,
414
+ max_num_distinct,
415
+ batch_gen_rng,
416
+ array_gen_rng,
417
+ i32
418
+ )
386
419
}
387
420
DataType :: LargeUtf8 => {
388
- generate_string_array ! ( self , num_rows, batch_gen_rng, array_gen_rng, i64 )
421
+ generate_string_array ! (
422
+ self ,
423
+ num_rows,
424
+ max_num_distinct,
425
+ batch_gen_rng,
426
+ array_gen_rng,
427
+ i64
428
+ )
389
429
}
390
430
_ => {
391
- panic ! ( "Unsupported data generator type: {data_type}" )
431
+ panic ! ( "Unsupported data generator type: {}" , col . column_type )
392
432
}
393
433
}
394
434
}
@@ -413,14 +453,8 @@ mod test {
413
453
// - Their rows num should be same and between [16, 32]
414
454
let config = DatasetGeneratorConfig {
415
455
columns : vec ! [
416
- ColumnDescr {
417
- name: "a" . to_string( ) ,
418
- column_type: DataType :: Utf8 ,
419
- } ,
420
- ColumnDescr {
421
- name: "b" . to_string( ) ,
422
- column_type: DataType :: UInt32 ,
423
- } ,
456
+ ColumnDescr :: new( "a" , DataType :: Utf8 ) ,
457
+ ColumnDescr :: new( "b" , DataType :: UInt32 ) ,
424
458
] ,
425
459
rows_num_range : ( 16 , 32 ) ,
426
460
sort_keys_set : vec ! [ vec![ "b" . to_string( ) ] ] ,
0 commit comments