@@ -9,6 +9,7 @@ use vortex_array::variants::PrimitiveArrayTrait;
9
9
use vortex_array:: { Array , ToCanonical } ;
10
10
use vortex_dtype:: { NativePType , match_each_integer_ptype} ;
11
11
use vortex_error:: { VortexExpect , VortexUnwrap } ;
12
+ use vortex_mask:: AllOr ;
12
13
use vortex_scalar:: PValue ;
13
14
14
15
use crate :: sample:: sample;
@@ -196,7 +197,12 @@ where
196
197
let value_count = validity. true_count ( ) ;
197
198
198
199
// Initialize loop state
199
- let head = array. as_slice :: < T > ( ) [ 0 ] ;
200
+ let head_idx = validity
201
+ . first ( )
202
+ . vortex_expect ( "All null masks have been handled before" ) ;
203
+ let buffer = array. buffer :: < T > ( ) ;
204
+ let head = buffer[ head_idx] ;
205
+
200
206
let mut loop_state = LoopState {
201
207
min : head,
202
208
max : head,
@@ -205,43 +211,55 @@ where
205
211
} else {
206
212
HashMap :: with_hasher ( FxBuildHasher )
207
213
} ,
208
- distinct_values_count : if count_distinct_values { 0 } else { u32:: MAX } ,
209
214
prev : head,
210
215
runs : 1 ,
211
216
} ;
212
217
213
- let values = array. buffer :: < T > ( ) ;
214
- let mask = validity. to_boolean_buffer ( ) ;
215
-
216
- let mut offset = 0 ;
217
- for chunk in values. as_slice ( ) . chunks ( 64 ) {
218
- let validity = mask. slice ( offset, chunk. len ( ) ) ;
219
- offset += chunk. len ( ) ;
220
-
221
- if chunk. len ( ) < 64 {
222
- // Final iteration, run naive loop
223
- inner_loop_naive ( chunk, count_distinct_values, & validity, & mut loop_state) ;
224
- break ;
225
- }
226
-
227
- let set_bits = validity. count_set_bits ( ) ;
228
-
229
- match set_bits {
230
- // All nulls -> no stats to update
231
- 0 => continue ,
232
- // Inner loop for when validity check can be elided
233
- 64 => inner_loop_nonnull (
234
- chunk. try_into ( ) . vortex_unwrap ( ) ,
218
+ let sliced = buffer. slice ( head_idx..array. len ( ) ) ;
219
+ let mut chunks = sliced. as_slice ( ) . array_chunks :: < 64 > ( ) ;
220
+ match validity. boolean_buffer ( ) {
221
+ AllOr :: All => {
222
+ for chunk in & mut chunks {
223
+ inner_loop_nonnull ( chunk, count_distinct_values, & mut loop_state)
224
+ }
225
+ let remainder = chunks. remainder ( ) ;
226
+ inner_loop_naive (
227
+ remainder,
235
228
count_distinct_values,
229
+ & BooleanBuffer :: new_set ( remainder. len ( ) ) ,
236
230
& mut loop_state,
237
- ) ,
238
- // Inner loop for when we need to check validity
239
- _ => inner_loop_nullable (
240
- chunk. try_into ( ) . vortex_unwrap ( ) ,
231
+ ) ;
232
+ }
233
+ AllOr :: None => unreachable ! ( "All invalid arrays have been handled before" ) ,
234
+ AllOr :: Some ( v) => {
235
+ let mask = v. slice ( head_idx, array. len ( ) - head_idx) ;
236
+ let mut offset = 0 ;
237
+ for chunk in & mut chunks {
238
+ let validity = mask. slice ( offset, 64 ) ;
239
+ offset += 64 ;
240
+
241
+ match validity. count_set_bits ( ) {
242
+ // All nulls -> no stats to update
243
+ 0 => continue ,
244
+ // Inner loop for when validity check can be elided
245
+ 64 => inner_loop_nonnull ( chunk, count_distinct_values, & mut loop_state) ,
246
+ // Inner loop for when we need to check validity
247
+ _ => inner_loop_nullable (
248
+ chunk,
249
+ count_distinct_values,
250
+ & validity,
251
+ & mut loop_state,
252
+ ) ,
253
+ }
254
+ }
255
+ // Final iteration, run naive loop
256
+ let remainder = chunks. remainder ( ) ;
257
+ inner_loop_naive (
258
+ remainder,
241
259
count_distinct_values,
242
- & validity ,
260
+ & mask . slice ( offset , remainder . len ( ) ) ,
243
261
& mut loop_state,
244
- ) ,
262
+ ) ;
245
263
}
246
264
}
247
265
@@ -257,7 +275,11 @@ where
257
275
} ;
258
276
259
277
let runs = loop_state. runs ;
260
- let distinct_values_count = loop_state. distinct_values_count ;
278
+ let distinct_values_count = if count_distinct_values {
279
+ loop_state. distinct_values . len ( ) . try_into ( ) . vortex_unwrap ( )
280
+ } else {
281
+ u32:: MAX
282
+ } ;
261
283
262
284
let typed = TypedStats {
263
285
min : loop_state. min ,
@@ -289,7 +311,6 @@ struct LoopState<T> {
289
311
max : T ,
290
312
prev : T ,
291
313
runs : u32 ,
292
- distinct_values_count : u32 ,
293
314
distinct_values : HashMap < T , u32 , FxBuildHasher > ,
294
315
}
295
316
@@ -305,7 +326,6 @@ fn inner_loop_nonnull<T: PrimInt + Hash>(
305
326
306
327
if count_distinct_values {
307
328
* state. distinct_values . entry ( value) . or_insert ( 0 ) += 1 ;
308
- state. distinct_values_count = state. distinct_values . len ( ) . try_into ( ) . vortex_unwrap ( ) ;
309
329
}
310
330
311
331
if value != state. prev {
@@ -329,8 +349,6 @@ fn inner_loop_nullable<T: PrimInt + Hash>(
329
349
330
350
if count_distinct_values {
331
351
* state. distinct_values . entry ( value) . or_insert ( 0 ) += 1 ;
332
- state. distinct_values_count =
333
- state. distinct_values . len ( ) . try_into ( ) . vortex_unwrap ( ) ;
334
352
}
335
353
336
354
if value != state. prev {
@@ -355,8 +373,6 @@ fn inner_loop_naive<T: PrimInt + Hash>(
355
373
356
374
if count_distinct_values {
357
375
* state. distinct_values . entry ( value) . or_insert ( 0 ) += 1 ;
358
- state. distinct_values_count =
359
- state. distinct_values . len ( ) . try_into ( ) . vortex_unwrap ( ) ;
360
376
}
361
377
362
378
if value != state. prev {
@@ -376,6 +392,8 @@ mod tests {
376
392
use vortex_array:: validity:: Validity ;
377
393
use vortex_buffer:: { Buffer , buffer} ;
378
394
395
+ use crate :: CompressorStats ;
396
+ use crate :: integer:: IntegerStats ;
379
397
use crate :: integer:: stats:: typed_int_stats;
380
398
381
399
#[ test]
@@ -413,4 +431,16 @@ mod tests {
413
431
let stats = typed_int_stats :: < u8 > ( & array, true ) ;
414
432
assert_eq ! ( stats. distinct_values_count, 64 ) ;
415
433
}
434
+
435
/// Checks the integer statistics of an array whose first slot is marked
/// invalid: the expected counts below cover only the two trailing values.
#[test]
fn test_integer_stats_leading_nulls() {
    // Three stored values; the validity flags mark slot 0 as invalid and
    // slots 1 and 2 as valid.
    let values = buffer![0, 1, 2];
    let validity = Validity::from_iter([false, true, true]);
    let array = PrimitiveArray::new(values, validity);

    let computed = IntegerStats::generate(&array);

    // Two valid entries and one null.
    assert_eq!(computed.value_count, 2);
    assert_eq!(computed.null_count, 1);
    // The two valid entries differ, so the expected average run length is 1
    // and both values count as distinct.
    assert_eq!(computed.average_run_length, 1);
    assert_eq!(computed.distinct_values_count, 2);
}
416
446
}
0 commit comments