Skip to content

Commit f8a6d00

Browse files
committed
Implement lazy computation of the null count when it is expensive to calculate eagerly.
1 parent 0e99e3a commit f8a6d00

File tree

3 files changed

+73
-15
lines changed

3 files changed

+73
-15
lines changed

arrow-buffer/src/buffer/null.rs

+71-13
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,66 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::sync::atomic::{AtomicI64, Ordering};
19+
1820
use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
1921
use crate::buffer::BooleanBuffer;
2022
use crate::{Buffer, MutableBuffer};
2123

24+
const UNINITIALIZED_NULL_COUNT: i64 = -1;
25+
26+
#[derive(Debug)]
27+
pub enum NullCount {
28+
Eager(usize),
29+
Lazy(AtomicI64),
30+
}
31+
32+
impl Clone for NullCount {
33+
fn clone(&self) -> Self {
34+
match self {
35+
Self::Eager(v) => Self::Eager(*v),
36+
Self::Lazy(v) => {
37+
let v = v.load(Ordering::Relaxed);
38+
Self::Lazy(AtomicI64::new(v))
39+
}
40+
}
41+
}
42+
}
43+
2244
/// A [`BooleanBuffer`] used to encode validity for arrow arrays
2345
///
2446
/// As per the [Arrow specification], array validity is encoded in a packed bitmask with a
2547
/// `true` value indicating the corresponding slot is not null, and `false` indicating
2648
/// that it is null.
2749
///
2850
/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
29-
#[derive(Debug, Clone, Eq, PartialEq)]
51+
#[derive(Debug, Clone)]
3052
pub struct NullBuffer {
3153
buffer: BooleanBuffer,
32-
null_count: usize,
54+
null_count: NullCount,
55+
}
56+
57+
impl PartialEq for NullBuffer {
58+
fn eq(&self, other: &Self) -> bool {
59+
self.buffer == other.buffer
60+
}
3361
}
3462

63+
impl Eq for NullBuffer {}
64+
3565
impl NullBuffer {
3666
/// Create a new [`NullBuffer`] computing the null count
3767
pub fn new(buffer: BooleanBuffer) -> Self {
38-
let null_count = buffer.len() - buffer.count_set_bits();
68+
// Computing the null count is expensive, so defer it until it is first requested.
69+
let null_count = NullCount::Lazy(AtomicI64::new(UNINITIALIZED_NULL_COUNT));
3970
Self { buffer, null_count }
4071
}
4172

4273
/// Create a new [`NullBuffer`] of length `len` where all values are null
4374
pub fn new_null(len: usize) -> Self {
4475
Self {
4576
buffer: BooleanBuffer::new_unset(len),
46-
null_count: len,
77+
null_count: NullCount::Eager(len),
4778
}
4879
}
4980

@@ -53,7 +84,7 @@ impl NullBuffer {
5384
pub fn new_valid(len: usize) -> Self {
5485
Self {
5586
buffer: BooleanBuffer::new_set(len),
56-
null_count: 0,
87+
null_count: NullCount::Eager(0),
5788
}
5889
}
5990

@@ -63,7 +94,10 @@ impl NullBuffer {
6394
///
6495
/// `buffer` must contain `null_count` `0` bits
6596
pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
66-
Self { buffer, null_count }
97+
Self {
98+
buffer,
99+
null_count: NullCount::Eager(null_count),
100+
}
67101
}
68102

69103
/// Computes the union of the nulls in two optional [`NullBuffer`]
@@ -81,9 +115,12 @@ impl NullBuffer {
81115

82116
/// Returns true if all nulls in `other` also exist in self
83117
pub fn contains(&self, other: &NullBuffer) -> bool {
84-
if other.null_count == 0 {
85-
return true;
118+
if let NullCount::Eager(v) = &other.null_count {
119+
if *v == 0 {
120+
return true;
121+
}
86122
}
123+
87124
let lhs = self.inner().bit_chunks().iter_padded();
88125
let rhs = other.inner().bit_chunks().iter_padded();
89126
lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
@@ -106,9 +143,17 @@ impl NullBuffer {
106143
crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
107144
}
108145
}
146+
147+
let null_count = if let NullCount::Eager(v) = &self.null_count {
148+
NullCount::Eager(v * count)
149+
} else {
150+
// TODO: consider loading the atomic and reusing an already computed result instead of resetting it
151+
NullCount::Lazy(AtomicI64::new(UNINITIALIZED_NULL_COUNT))
152+
};
153+
109154
Self {
110155
buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
111-
null_count: self.null_count * count,
156+
null_count,
112157
}
113158
}
114159

@@ -131,9 +176,20 @@ impl NullBuffer {
131176
}
132177

133178
/// Returns the null count for this [`NullBuffer`]
134-
#[inline]
135179
pub fn null_count(&self) -> usize {
136-
self.null_count
180+
match &self.null_count {
181+
NullCount::Eager(v) => *v,
182+
NullCount::Lazy(v) => {
183+
let cached_null_count = v.load(Ordering::Acquire);
184+
if cached_null_count != UNINITIALIZED_NULL_COUNT {
185+
return cached_null_count as usize;
186+
}
187+
188+
let computed_null_count = self.buffer.len() - self.buffer.count_set_bits();
189+
v.store(computed_null_count as i64, Ordering::Release);
190+
computed_null_count
191+
}
192+
}
137193
}
138194

139195
/// Returns `true` if the value at `idx` is not null
@@ -189,8 +245,10 @@ impl NullBuffer {
189245
&self,
190246
f: F,
191247
) -> Result<(), E> {
192-
if self.null_count == self.len() {
193-
return Ok(());
248+
if let NullCount::Eager(v) = &self.null_count {
249+
if *v == self.len() {
250+
return Ok(());
251+
}
194252
}
195253
self.valid_indices().try_for_each(f)
196254
}

0 commit comments

Comments
 (0)