Skip to content

Commit b91b316

Browse files
committed
add boolean row selection implementation
1 parent 56525ef commit b91b316

File tree

2 files changed

+321
-0
lines changed

2 files changed

+321
-0
lines changed
Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::ops::Range;
19+
20+
use arrow_array::{Array, BooleanArray};
21+
use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer};
22+
use arrow_data::bit_iterator::BitIndexIterator;
23+
24+
use super::{RowSelection, RowSelector};
25+
26+
/// A selection of rows, similar to [`RowSelection`], but based on a boolean array
27+
#[derive(Debug, Clone, PartialEq, Eq)]
28+
pub struct BooleanRowSelection {
29+
selector: BooleanBuffer,
30+
}
31+
32+
impl BooleanRowSelection {
33+
/// Create a new [`BooleanRowSelection] from a list of [`BooleanArray`].
34+
pub fn from_filters(filters: &[BooleanArray]) -> Self {
35+
let arrays: Vec<&dyn Array> = filters.iter().map(|x| x as &dyn Array).collect();
36+
let result = arrow_select::concat::concat(&arrays).unwrap().into_data();
37+
let (boolean_array, _null) = BooleanArray::from(result).into_parts();
38+
BooleanRowSelection {
39+
selector: boolean_array,
40+
}
41+
}
42+
43+
/// Create a new [`BooleanRowSelection`] with all rows unselected
44+
pub fn new_unselected(row_count: usize) -> Self {
45+
let buffer = BooleanBuffer::new_unset(row_count);
46+
47+
BooleanRowSelection { selector: buffer }
48+
}
49+
50+
/// Create a new [`BooleanRowSelection`] with all rows selected
51+
pub fn new_selected(row_count: usize) -> Self {
52+
let buffer = BooleanBuffer::new_set(row_count);
53+
54+
BooleanRowSelection { selector: buffer }
55+
}
56+
57+
/// Returns a new [`BooleanRowSelection`] that selects the inverse of this [`BooleanRowSelection`].
58+
pub fn as_inverted(&self) -> Self {
59+
let buffer = !&self.selector;
60+
BooleanRowSelection { selector: buffer }
61+
}
62+
63+
/// Returns the number of rows in this [`BooleanRowSelection`].
64+
pub fn len(&self) -> usize {
65+
self.selector.len()
66+
}
67+
68+
/// Returns the number of rows selected by this [`BooleanRowSelection`].
69+
pub fn row_count(&self) -> usize {
70+
self.selector.count_set_bits()
71+
}
72+
73+
/// Create a new [`BooleanRowSelection`] from a list of consecutive ranges.
74+
pub fn from_consecutive_ranges(
75+
ranges: impl Iterator<Item = Range<usize>>,
76+
total_rows: usize,
77+
) -> Self {
78+
let mut buffer = BooleanBufferBuilder::new(total_rows);
79+
let mut last_end = 0;
80+
81+
for range in ranges {
82+
let len = range.end - range.start;
83+
if len == 0 {
84+
continue;
85+
}
86+
87+
if range.start > last_end {
88+
buffer.append_n(range.start - last_end, false);
89+
}
90+
buffer.append_n(len, true);
91+
last_end = range.end;
92+
}
93+
94+
if last_end != total_rows {
95+
buffer.append_n(total_rows - last_end, false);
96+
}
97+
98+
BooleanRowSelection {
99+
selector: buffer.finish(),
100+
}
101+
}
102+
103+
/// Compute the union of two [`BooleanRowSelection`]
104+
/// For example:
105+
/// self: NNYYYYNNYYNYN
106+
/// other: NYNNNNNNN
107+
///
108+
/// returned: NYYYYYNNYYNYN
109+
#[must_use]
110+
pub fn union(&self, other: &Self) -> Self {
111+
// use arrow::compute::kernels::boolean::or;
112+
113+
let union_selectors = &self.selector | &other.selector;
114+
115+
BooleanRowSelection {
116+
selector: union_selectors,
117+
}
118+
}
119+
120+
/// Compute the intersection of two [`BooleanRowSelection`]
121+
/// For example:
122+
/// self: NNYYYYNNYYNYN
123+
/// other: NYNNNNNNY
124+
///
125+
/// returned: NNNNNNNNYYNYN
126+
#[must_use]
127+
pub fn intersection(&self, other: &Self) -> Self {
128+
let intersection_selectors = &self.selector & &other.selector;
129+
130+
BooleanRowSelection {
131+
selector: intersection_selectors,
132+
}
133+
}
134+
135+
/// Combines this [`BooleanRowSelection`] with another using logical AND on the selected bits.
136+
///
137+
/// Unlike [`intersection`], the `other` [`BooleanRowSelection`] must have exactly as many set bits as `self`.
138+
/// This method will keep only the bits in `self` that are also set in `other`
139+
/// at the positions corresponding to `self`'s set bits.
140+
pub fn and_then(&self, other: &Self) -> Self {
141+
// Ensure that 'other' has exactly as many set bits as 'self'
142+
debug_assert_eq!(
143+
self.row_count(),
144+
other.len(),
145+
"The 'other' selection must have exactly as many set bits as 'self'."
146+
);
147+
148+
if self.len() == other.len() {
149+
// fast path if the two selections are the same length
150+
// common if this is the first predicate
151+
debug_assert_eq!(self.row_count(), self.len());
152+
return self.intersection(other);
153+
}
154+
155+
let mut buffer = MutableBuffer::from_len_zeroed(self.len());
156+
buffer.copy_from_slice(self.selector.values());
157+
let mut builder = BooleanBufferBuilder::new_from_buffer(buffer, self.len());
158+
159+
// Create iterators for 'self' and 'other' bits
160+
let mut other_bits = other.selector.iter();
161+
162+
for bit_idx in self.true_iter() {
163+
let predicate = other_bits
164+
.next()
165+
.expect("Mismatch in set bits between self and other");
166+
if !predicate {
167+
builder.set_bit(bit_idx, false);
168+
}
169+
}
170+
171+
BooleanRowSelection {
172+
selector: builder.finish(),
173+
}
174+
}
175+
176+
/// Returns an iterator over the indices of the set bits in this [`BooleanRowSelection`]
177+
pub fn true_iter(&self) -> BitIndexIterator<'_> {
178+
self.selector.set_indices()
179+
}
180+
181+
/// Returns `true` if this [`BooleanRowSelection`] selects any rows
182+
pub fn selects_any(&self) -> bool {
183+
self.true_iter().next().is_some()
184+
}
185+
186+
/// Returns a new [`BooleanRowSelection`] that selects the rows in this [`BooleanRowSelection`] from `offset` to `offset + len`
187+
pub fn slice(&self, offset: usize, len: usize) -> BooleanArray {
188+
BooleanArray::new(self.selector.slice(offset, len), None)
189+
}
190+
}
191+
192+
impl From<Vec<RowSelector>> for BooleanRowSelection {
193+
fn from(selection: Vec<RowSelector>) -> Self {
194+
let selection = RowSelection::from(selection);
195+
RowSelection::into(selection)
196+
}
197+
}
198+
199+
impl From<RowSelection> for BooleanRowSelection {
200+
fn from(selection: RowSelection) -> Self {
201+
let total_rows = selection.row_count();
202+
let mut builder = BooleanBufferBuilder::new(total_rows);
203+
204+
for selector in selection.iter() {
205+
if selector.skip {
206+
builder.append_n(selector.row_count, false);
207+
} else {
208+
builder.append_n(selector.row_count, true);
209+
}
210+
}
211+
212+
BooleanRowSelection {
213+
selector: builder.finish(),
214+
}
215+
}
216+
}
217+
218+
impl From<&BooleanRowSelection> for RowSelection {
219+
fn from(selection: &BooleanRowSelection) -> Self {
220+
let array = BooleanArray::new(selection.selector.clone(), None);
221+
RowSelection::from_filters(&[array])
222+
}
223+
}
224+
225+
#[cfg(test)]
226+
mod tests {
227+
use rand::Rng;
228+
229+
use super::*;
230+
231+
fn generate_random_row_selection(total_rows: usize, selection_ratio: f64) -> BooleanArray {
232+
let mut rng = rand::thread_rng();
233+
let bools: Vec<bool> = (0..total_rows)
234+
.map(|_| rng.gen_bool(selection_ratio))
235+
.collect();
236+
BooleanArray::from(bools)
237+
}
238+
239+
#[test]
fn test_boolean_row_selection_round_trip() {
    // Converting to a `RowSelection` and back must reproduce the same mask
    // for empty, sparse, dense and full selections alike.
    const TOTAL_ROWS: usize = 1_000;

    for ratio in [0.0, 0.1, 0.5, 0.9, 1.0].iter().copied() {
        let mask = generate_random_row_selection(TOTAL_ROWS, ratio);
        let original = BooleanRowSelection::from_filters(&[mask]);

        let as_row_selection = RowSelection::from(&original);
        let round_tripped = BooleanRowSelection::from(as_row_selection);

        assert_eq!(original, round_tripped);
    }
}
250+
251+
#[test]
fn test_boolean_union_intersection() {
    // Union and intersection on the bitmap must agree with the same
    // operations on the equivalent `RowSelection`.
    const TOTAL_ROWS: usize = 1_000;
    let random_selection = |ratio: f64| {
        BooleanRowSelection::from_filters(&[generate_random_row_selection(TOTAL_ROWS, ratio)])
    };

    let base_boolean = random_selection(0.1);
    let base_rows = RowSelection::from(&base_boolean);

    for ratio in [0.0, 0.1, 0.5, 0.9, 1.0].iter().copied() {
        let boolean = random_selection(ratio);
        let rows = RowSelection::from(&boolean);

        let expected_union = BooleanRowSelection::from(rows.union(&base_rows));
        assert_eq!(boolean.union(&base_boolean), expected_union);

        let expected_intersection = BooleanRowSelection::from(rows.intersection(&base_rows));
        assert_eq!(boolean.intersection(&base_boolean), expected_intersection);
    }
}
278+
279+
#[test]
fn test_boolean_selection_and_then() {
    let selection_of =
        |bits: Vec<bool>| BooleanRowSelection::from_filters(&[BooleanArray::from(bits)]);

    // Initial mask: 001011010101 (six rows selected)
    let self_selection = selection_of(vec![
        false, false, true, false, true, true, false, true, false, true, false, true,
    ]);

    // Predicate mask, one bit per selected row of the initial mask: 001101
    let other_selection = selection_of(vec![false, false, true, true, false, true]);

    // Expected result: 000001010001
    let expected_selection = selection_of(vec![
        false, false, false, false, false, true, false, true, false, false, false, true,
    ]);

    assert_eq!(
        self_selection.and_then(&other_selection),
        expected_selection
    );
}
303+
304+
#[test]
#[should_panic(
    expected = "The 'other' selection must have exactly as many set bits as 'self'."
)]
fn test_and_then_mismatched_set_bits() {
    // 'self' selects two of its three rows
    let self_selection =
        BooleanRowSelection::from_filters(&[BooleanArray::from(vec![true, true, false])]);

    // 'other' covers three rows, which does not match the two rows selected by 'self'
    let other_selection =
        BooleanRowSelection::from_filters(&[BooleanArray::from(vec![true, false, false])]);

    // This should panic
    let _ = self_selection.and_then(&other_selection);
}
319+
}

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
3838
use crate::file::reader::{ChunkReader, SerializedPageReader};
3939
use crate::schema::types::SchemaDescriptor;
4040

41+
#[cfg(feature = "experimental")]
42+
mod boolean_selection;
4143
mod filter;
4244
mod selection;
4345
pub mod statistics;

0 commit comments

Comments
 (0)