Skip to content

Commit 1f017ff

Browse files
committed
add boolean row selection implementation
1 parent b91b316 commit 1f017ff

File tree

4 files changed

+87
-49
lines changed

4 files changed

+87
-49
lines changed

parquet/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ harness = false
225225
[[bench]]
226226
name = "row_selector"
227227
harness = false
228-
required-features = ["arrow"]
228+
required-features = ["arrow", "experimental"]
229229

230230
[lib]
231231
bench = false

parquet/benches/row_selector.rs

+76-37
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
use arrow_array::BooleanArray;
1919
use criterion::*;
20-
use parquet::arrow::arrow_reader::RowSelection;
20+
use parquet::arrow::arrow_reader::{BooleanRowSelection, RowSelection};
2121
use rand::Rng;
2222

2323
/// Generates a random boolean filter mask with (approximately) the specified selection ratio; callers convert it into a selection via `from_filters`.
@@ -40,47 +40,86 @@ fn generate_random_row_selection(total_rows: usize, selection_ratio: f64) -> Boo
4040

4141
fn criterion_benchmark(c: &mut Criterion) {
4242
let total_rows = 300_000;
43-
let selection_ratio = 1.0 / 3.0;
43+
let selection_ratios = [0.000_01, 0.001, 0.1, 0.3];
4444

45-
// Generate two random RowSelections with approximately 1/3 of the rows selected.
46-
let row_selection_a =
47-
RowSelection::from_filters(&[generate_random_row_selection(total_rows, selection_ratio)]);
48-
let row_selection_b =
49-
RowSelection::from_filters(&[generate_random_row_selection(total_rows, selection_ratio)]);
45+
for ratio in selection_ratios {
46+
let slice_selection_a =
47+
RowSelection::from_filters(&[generate_random_row_selection(total_rows, ratio)]);
48+
let slice_selection_b =
49+
RowSelection::from_filters(&[generate_random_row_selection(total_rows, ratio)]);
5050

51-
// Benchmark the intersection of the two RowSelections.
52-
c.bench_function("intersection", |b| {
53-
b.iter(|| {
54-
let intersection = row_selection_a.intersection(&row_selection_b);
55-
criterion::black_box(intersection);
56-
})
57-
});
51+
let boolean_selection_a = BooleanRowSelection::from(slice_selection_a.clone());
52+
let boolean_selection_b = BooleanRowSelection::from(slice_selection_b.clone());
5853

59-
c.bench_function("union", |b| {
60-
b.iter(|| {
61-
let union = row_selection_a.union(&row_selection_b);
62-
criterion::black_box(union);
63-
})
64-
});
54+
// Benchmark the intersection of the two RowSelections.
55+
c.bench_function(&format!("slice intersection {}", ratio), |b| {
56+
b.iter(|| {
57+
let intersection = slice_selection_a.intersection(&slice_selection_b);
58+
criterion::black_box(intersection);
59+
})
60+
});
6561

66-
c.bench_function("from_filters", |b| {
67-
let boolean_array = generate_random_row_selection(total_rows, selection_ratio);
68-
b.iter(|| {
69-
let array = boolean_array.clone();
70-
let selection = RowSelection::from_filters(&[array]);
71-
criterion::black_box(selection);
72-
})
73-
});
62+
c.bench_function(&format!("boolean intersection {}", ratio), |b| {
63+
b.iter(|| {
64+
let intersection = boolean_selection_a.intersection(&boolean_selection_b);
65+
criterion::black_box(intersection);
66+
})
67+
});
7468

75-
c.bench_function("and_then", |b| {
76-
let selected = row_selection_a.row_count();
77-
let sub_selection =
78-
RowSelection::from_filters(&[generate_random_row_selection(selected, selection_ratio)]);
79-
b.iter(|| {
80-
let result = row_selection_a.and_then(&sub_selection);
81-
criterion::black_box(result);
82-
})
83-
});
69+
c.bench_function(&format!("slice union {}", ratio), |b| {
70+
b.iter(|| {
71+
let union = slice_selection_a.union(&slice_selection_b);
72+
criterion::black_box(union);
73+
})
74+
});
75+
76+
c.bench_function(&format!("boolean union {}", ratio), |b| {
77+
b.iter(|| {
78+
let union = boolean_selection_a.union(&boolean_selection_b);
79+
criterion::black_box(union);
80+
})
81+
});
82+
83+
c.bench_function(&format!("slice from_filters {}", ratio), |b| {
84+
let boolean_array = generate_random_row_selection(total_rows, ratio);
85+
b.iter(|| {
86+
let array = boolean_array.clone();
87+
let selection = RowSelection::from_filters(&[array]);
88+
criterion::black_box(selection);
89+
})
90+
});
91+
92+
c.bench_function(&format!("boolean from_filters {}", ratio), |b| {
93+
let boolean_array = generate_random_row_selection(total_rows, ratio);
94+
b.iter(|| {
95+
let array = boolean_array.clone();
96+
let selection = BooleanRowSelection::from_filters(&[array]);
97+
criterion::black_box(selection);
98+
})
99+
});
100+
101+
c.bench_function(&format!("slice and_then {}", ratio), |b| {
102+
let selected = slice_selection_a.row_count();
103+
let sub_selection =
104+
RowSelection::from_filters(&[generate_random_row_selection(selected, ratio)]);
105+
b.iter(|| {
106+
let result = slice_selection_a.and_then(&sub_selection);
107+
criterion::black_box(result);
108+
})
109+
});
110+
111+
c.bench_function(&format!("boolean and_then {}", ratio), |b| {
112+
let selected = boolean_selection_a.row_count();
113+
let sub_selection =
114+
BooleanRowSelection::from_filters(&[generate_random_row_selection(
115+
selected, ratio,
116+
)]);
117+
b.iter(|| {
118+
let result = boolean_selection_a.and_then(&sub_selection);
119+
criterion::black_box(result);
120+
})
121+
});
122+
}
84123
}
85124

86125
criterion_group!(benches, criterion_benchmark);

parquet/src/arrow/arrow_reader/boolean_selection.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,9 @@ impl BooleanRowSelection {
152152
return self.intersection(other);
153153
}
154154

155-
let mut buffer = MutableBuffer::from_len_zeroed(self.len());
156-
buffer.copy_from_slice(self.selector.values());
157-
let mut builder = BooleanBufferBuilder::new_from_buffer(buffer, self.len());
155+
let mut buffer = MutableBuffer::from_len_zeroed(self.selector.inner().len());
156+
buffer.copy_from_slice(self.selector.inner().as_slice());
157+
let mut builder = BooleanBufferBuilder::new_from_buffer(buffer, self.selector.len());
158158

159159
// Create iterators for 'self' and 'other' bits
160160
let mut other_bits = other.selector.iter();

parquet/src/arrow/arrow_reader/mod.rs

+7-8
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,6 @@
2020
use std::collections::VecDeque;
2121
use std::sync::Arc;
2222

23-
use arrow_array::cast::AsArray;
24-
use arrow_array::Array;
25-
use arrow_array::{RecordBatch, RecordBatchReader};
26-
use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef};
27-
use arrow_select::filter::prep_null_mask_filter;
28-
pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
29-
pub use selection::{RowSelection, RowSelector};
30-
3123
pub use crate::arrow::array_reader::RowGroups;
3224
use crate::arrow::array_reader::{build_array_reader, ArrayReader};
3325
use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
@@ -37,6 +29,13 @@ use crate::errors::{ParquetError, Result};
3729
use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
3830
use crate::file::reader::{ChunkReader, SerializedPageReader};
3931
use crate::schema::types::SchemaDescriptor;
32+
use arrow_array::{cast::AsArray, Array, RecordBatch, RecordBatchReader};
33+
use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef};
34+
use arrow_select::filter::prep_null_mask_filter;
35+
#[cfg(feature = "experimental")]
36+
pub use boolean_selection::BooleanRowSelection;
37+
pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
38+
pub use selection::{RowSelection, RowSelector};
4039

4140
#[cfg(feature = "experimental")]
4241
mod boolean_selection;

0 commit comments

Comments
 (0)