Skip to content

Commit c597e4a

Browse files
committed
add boolean selection
1 parent b91b316 commit c597e4a

File tree

4 files changed

+167
-68
lines changed

4 files changed

+167
-68
lines changed

parquet/Cargo.toml

+81-20
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,14 @@ edition = { workspace = true }
2929
rust-version = "1.70.0"
3030

3131
[target.'cfg(target_arch = "wasm32")'.dependencies]
32-
ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] }
32+
ahash = { version = "0.8", default-features = false, features = [
33+
"compile-time-rng",
34+
] }
3335

3436
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
35-
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
37+
ahash = { version = "0.8", default-features = false, features = [
38+
"runtime-rng",
39+
] }
3640

3741
[dependencies]
3842
arrow-array = { workspace = true, optional = true }
@@ -49,25 +53,53 @@ object_store = { version = "0.11.0", default-features = false, optional = true }
4953
bytes = { version = "1.1", default-features = false, features = ["std"] }
5054
thrift = { version = "0.17", default-features = false }
5155
snap = { version = "1.0", default-features = false, optional = true }
52-
brotli = { version = "7.0", default-features = false, features = ["std"], optional = true }
53-
flate2 = { version = "1.0", default-features = false, features = ["rust_backend"], optional = true }
54-
lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true }
56+
brotli = { version = "7.0", default-features = false, features = [
57+
"std",
58+
], optional = true }
59+
flate2 = { version = "1.0", default-features = false, features = [
60+
"rust_backend",
61+
], optional = true }
62+
lz4_flex = { version = "0.11", default-features = false, features = [
63+
"std",
64+
"frame",
65+
], optional = true }
5566
zstd = { version = "0.13", optional = true, default-features = false }
5667
chrono = { workspace = true }
5768
num = { version = "0.4", default-features = false }
5869
num-bigint = { version = "0.4", default-features = false }
59-
base64 = { version = "0.22", default-features = false, features = ["std", ], optional = true }
60-
clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true }
61-
serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
62-
serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true }
70+
base64 = { version = "0.22", default-features = false, features = [
71+
"std",
72+
], optional = true }
73+
clap = { version = "4.1", default-features = false, features = [
74+
"std",
75+
"derive",
76+
"env",
77+
"help",
78+
"error-context",
79+
"usage",
80+
], optional = true }
81+
serde = { version = "1.0", default-features = false, features = [
82+
"derive",
83+
], optional = true }
84+
serde_json = { version = "1.0", default-features = false, features = [
85+
"std",
86+
], optional = true }
6387
seq-macro = { version = "0.3", default-features = false }
64-
futures = { version = "0.3", default-features = false, features = ["std"], optional = true }
65-
tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] }
88+
futures = { version = "0.3", default-features = false, features = [
89+
"std",
90+
], optional = true }
91+
tokio = { version = "1.0", optional = true, default-features = false, features = [
92+
"macros",
93+
"rt",
94+
"io-util",
95+
] }
6696
hashbrown = { version = "0.14", default-features = false }
6797
twox-hash = { version = "1.6", default-features = false }
6898
paste = { version = "1.0" }
6999
half = { version = "2.1", default-features = false, features = ["num-traits"] }
70-
sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] }
100+
sysinfo = { version = "0.32.0", optional = true, default-features = false, features = [
101+
"system",
102+
] }
71103
crc32fast = { version = "1.4.2", optional = true, default-features = false }
72104

73105
[dev-dependencies]
@@ -76,14 +108,34 @@ criterion = { version = "0.5", default-features = false }
76108
snap = { version = "1.0", default-features = false }
77109
tempfile = { version = "3.0", default-features = false }
78110
brotli = { version = "7.0", default-features = false, features = ["std"] }
79-
flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] }
80-
lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"] }
111+
flate2 = { version = "1.0", default-features = false, features = [
112+
"rust_backend",
113+
] }
114+
lz4_flex = { version = "0.11", default-features = false, features = [
115+
"std",
116+
"frame",
117+
] }
81118
zstd = { version = "0.13", default-features = false }
82119
serde_json = { version = "1.0", features = ["std"], default-features = false }
83-
arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", "json"] }
84-
tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "io-util", "fs"] }
85-
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
86-
object_store = { version = "0.11.0", default-features = false, features = ["azure"] }
120+
arrow = { workspace = true, features = [
121+
"ipc",
122+
"test_utils",
123+
"prettyprint",
124+
"json",
125+
] }
126+
tokio = { version = "1.0", default-features = false, features = [
127+
"macros",
128+
"rt",
129+
"io-util",
130+
"fs",
131+
] }
132+
rand = { version = "0.8", default-features = false, features = [
133+
"std",
134+
"std_rng",
135+
] }
136+
object_store = { version = "0.11.0", default-features = false, features = [
137+
"azure",
138+
] }
87139

88140
# TODO: temporary to fix parquet wasm build
89141
# upstream issue: https://github.com/gyscos/zstd-rs/issues/269
@@ -101,7 +153,16 @@ default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"]
101153
# Enable lz4
102154
lz4 = ["lz4_flex"]
103155
# Enable arrow reader/writer APIs
104-
arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"]
156+
arrow = [
157+
"base64",
158+
"arrow-array",
159+
"arrow-buffer",
160+
"arrow-cast",
161+
"arrow-data",
162+
"arrow-schema",
163+
"arrow-select",
164+
"arrow-ipc",
165+
]
105166
# Enable CLI tools
106167
cli = ["json", "base64", "clap", "arrow-csv", "serde"]
107168
# Enable JSON APIs
@@ -225,7 +286,7 @@ harness = false
225286
[[bench]]
226287
name = "row_selector"
227288
harness = false
228-
required-features = ["arrow"]
289+
required-features = ["arrow", "experimental"]
229290

230291
[lib]
231292
bench = false

parquet/benches/row_selector.rs

+76-37
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
use arrow_array::BooleanArray;
1919
use criterion::*;
20-
use parquet::arrow::arrow_reader::RowSelection;
20+
use parquet::arrow::arrow_reader::{BooleanRowSelection, RowSelection};
2121
use rand::Rng;
2222

2323
/// Generates a random RowSelection with a specified selection ratio.
@@ -40,47 +40,86 @@ fn generate_random_row_selection(total_rows: usize, selection_ratio: f64) -> Boo
4040

4141
fn criterion_benchmark(c: &mut Criterion) {
4242
let total_rows = 300_000;
43-
let selection_ratio = 1.0 / 3.0;
43+
let selection_ratios = [0.000_01, 0.001, 0.1, 0.3];
4444

45-
// Generate two random RowSelections with approximately 1/3 of the rows selected.
46-
let row_selection_a =
47-
RowSelection::from_filters(&[generate_random_row_selection(total_rows, selection_ratio)]);
48-
let row_selection_b =
49-
RowSelection::from_filters(&[generate_random_row_selection(total_rows, selection_ratio)]);
45+
for ratio in selection_ratios {
46+
let slice_selection_a =
47+
RowSelection::from_filters(&[generate_random_row_selection(total_rows, ratio)]);
48+
let slice_selection_b =
49+
RowSelection::from_filters(&[generate_random_row_selection(total_rows, ratio)]);
5050

51-
// Benchmark the intersection of the two RowSelections.
52-
c.bench_function("intersection", |b| {
53-
b.iter(|| {
54-
let intersection = row_selection_a.intersection(&row_selection_b);
55-
criterion::black_box(intersection);
56-
})
57-
});
51+
let boolean_selection_a = BooleanRowSelection::from(slice_selection_a.clone());
52+
let boolean_selection_b = BooleanRowSelection::from(slice_selection_b.clone());
5853

59-
c.bench_function("union", |b| {
60-
b.iter(|| {
61-
let union = row_selection_a.union(&row_selection_b);
62-
criterion::black_box(union);
63-
})
64-
});
54+
// Benchmark the intersection of the two RowSelections.
55+
c.bench_function(&format!("slice intersection {}", ratio), |b| {
56+
b.iter(|| {
57+
let intersection = slice_selection_a.intersection(&slice_selection_b);
58+
criterion::black_box(intersection);
59+
})
60+
});
6561

66-
c.bench_function("from_filters", |b| {
67-
let boolean_array = generate_random_row_selection(total_rows, selection_ratio);
68-
b.iter(|| {
69-
let array = boolean_array.clone();
70-
let selection = RowSelection::from_filters(&[array]);
71-
criterion::black_box(selection);
72-
})
73-
});
62+
c.bench_function(&format!("boolean intersection {}", ratio), |b| {
63+
b.iter(|| {
64+
let intersection = boolean_selection_a.intersection(&boolean_selection_b);
65+
criterion::black_box(intersection);
66+
})
67+
});
7468

75-
c.bench_function("and_then", |b| {
76-
let selected = row_selection_a.row_count();
77-
let sub_selection =
78-
RowSelection::from_filters(&[generate_random_row_selection(selected, selection_ratio)]);
79-
b.iter(|| {
80-
let result = row_selection_a.and_then(&sub_selection);
81-
criterion::black_box(result);
82-
})
83-
});
69+
c.bench_function(&format!("slice union {}", ratio), |b| {
70+
b.iter(|| {
71+
let union = slice_selection_a.union(&slice_selection_b);
72+
criterion::black_box(union);
73+
})
74+
});
75+
76+
c.bench_function(&format!("boolean union {}", ratio), |b| {
77+
b.iter(|| {
78+
let union = boolean_selection_a.union(&boolean_selection_b);
79+
criterion::black_box(union);
80+
})
81+
});
82+
83+
c.bench_function(&format!("slice from_filters {}", ratio), |b| {
84+
let boolean_array = generate_random_row_selection(total_rows, ratio);
85+
b.iter(|| {
86+
let array = boolean_array.clone();
87+
let selection = RowSelection::from_filters(&[array]);
88+
criterion::black_box(selection);
89+
})
90+
});
91+
92+
c.bench_function(&format!("boolean from_filters {}", ratio), |b| {
93+
let boolean_array = generate_random_row_selection(total_rows, ratio);
94+
b.iter(|| {
95+
let array = boolean_array.clone();
96+
let selection = BooleanRowSelection::from_filters(&[array]);
97+
criterion::black_box(selection);
98+
})
99+
});
100+
101+
c.bench_function(&format!("slice and_then {}", ratio), |b| {
102+
let selected = slice_selection_a.row_count();
103+
let sub_selection =
104+
RowSelection::from_filters(&[generate_random_row_selection(selected, ratio)]);
105+
b.iter(|| {
106+
let result = slice_selection_a.and_then(&sub_selection);
107+
criterion::black_box(result);
108+
})
109+
});
110+
111+
c.bench_function(&format!("boolean and_then {}", ratio), |b| {
112+
let selected = boolean_selection_a.row_count();
113+
let sub_selection =
114+
BooleanRowSelection::from_filters(&[generate_random_row_selection(
115+
selected, ratio,
116+
)]);
117+
b.iter(|| {
118+
let result = boolean_selection_a.and_then(&sub_selection);
119+
criterion::black_box(result);
120+
})
121+
});
122+
}
84123
}
85124

86125
criterion_group!(benches, criterion_benchmark);

parquet/src/arrow/arrow_reader/boolean_selection.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,9 @@ impl BooleanRowSelection {
152152
return self.intersection(other);
153153
}
154154

155-
let mut buffer = MutableBuffer::from_len_zeroed(self.len());
156-
buffer.copy_from_slice(self.selector.values());
157-
let mut builder = BooleanBufferBuilder::new_from_buffer(buffer, self.len());
155+
let mut buffer = MutableBuffer::from_len_zeroed(self.selector.inner().len());
156+
buffer.copy_from_slice(self.selector.inner().as_slice());
157+
let mut builder = BooleanBufferBuilder::new_from_buffer(buffer, self.selector.len());
158158

159159
// Create iterators for 'self' and 'other' bits
160160
let mut other_bits = other.selector.iter();

parquet/src/arrow/arrow_reader/mod.rs

+7-8
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,6 @@
2020
use std::collections::VecDeque;
2121
use std::sync::Arc;
2222

23-
use arrow_array::cast::AsArray;
24-
use arrow_array::Array;
25-
use arrow_array::{RecordBatch, RecordBatchReader};
26-
use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef};
27-
use arrow_select::filter::prep_null_mask_filter;
28-
pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
29-
pub use selection::{RowSelection, RowSelector};
30-
3123
pub use crate::arrow::array_reader::RowGroups;
3224
use crate::arrow::array_reader::{build_array_reader, ArrayReader};
3325
use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
@@ -37,6 +29,13 @@ use crate::errors::{ParquetError, Result};
3729
use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
3830
use crate::file::reader::{ChunkReader, SerializedPageReader};
3931
use crate::schema::types::SchemaDescriptor;
32+
use arrow_array::{cast::AsArray, Array, RecordBatch, RecordBatchReader};
33+
use arrow_schema::{ArrowError, DataType as ArrowType, Schema, SchemaRef};
34+
use arrow_select::filter::prep_null_mask_filter;
35+
#[cfg(feature = "experimental")]
36+
pub use boolean_selection::BooleanRowSelection;
37+
pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
38+
pub use selection::{RowSelection, RowSelector};
4039

4140
#[cfg(feature = "experimental")]
4241
mod boolean_selection;

0 commit comments

Comments
 (0)