From 822760c0df669118a0d4239acf84209c8d1f2cd4 Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Thu, 10 Apr 2025 22:49:38 +0800
Subject: [PATCH 01/21] Add benchmark for parquet reader with row_filter and
 project settings

---
 parquet/Cargo.toml                         |   4 +
 parquet/benches/arrow_reader_row_filter.rs | 316 +++++++++++++++++++++
 2 files changed, 320 insertions(+)
 create mode 100644 parquet/benches/arrow_reader_row_filter.rs

diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 2f31a290e398..80b398df21de 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -211,6 +211,10 @@ name = "arrow_statistics"
 required-features = ["arrow"]
 harness = false
 
+[[bench]]
+name = "arrow_reader_row_filter"
+required-features = ["arrow"]
+harness = false
 
 [[bench]]
 name = "compression"
diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
new file mode 100644
index 000000000000..788ca3e533b7
--- /dev/null
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -0,0 +1,316 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for evaluating row filters and projections on a Parquet file.
+//!
+//! This benchmark creates a Parquet file in memory with 100K rows and four columns:
+//! - int64: sequential integers
+//! - float64: floating-point values (derived from the integers)
+//! - utf8View: string values where about half are non-empty,
+//! and a few rows (every 10Kth row) are the constant "const"
+//! - ts: timestamp values (using, e.g., a millisecond epoch)
+//!
+//! It then applies several filter functions and projections, benchmarking the read-back speed.
+//!
+//! Filters tested:
+//! - A string filter: `utf8View <> ''` (non-empty)
+//! - A string filter: `utf8View = 'const'` (selective)
+//! - An integer non-selective filter (e.g. even numbers)
+//! - An integer selective filter (e.g. `int64 = 0`)
+//! - A timestamp filter (e.g. `ts > threshold`)
+//!
+//! Projections tested:
+//! - All 4 columns.
+//! - All columns except the one used for the filter.
+//!
+//! To run the benchmark, use `cargo bench --bench arrow_reader_row_filter`.
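+//!
+//! The predicate wiring used throughout this file, sketched schematically
+//! (illustrative only; `schema_descr` and `filter_fn` stand in for the values
+//! built below from the real file metadata and filter functions):
+//!
+//! ```ignore
+//! let pred_mask = ProjectionMask::roots(schema_descr, vec![2]);
+//! let filter = ArrowPredicateFn::new(pred_mask, move |batch| Ok(filter_fn(&batch)));
+//! let row_filter = RowFilter::new(vec![Box::new(filter)]);
+//! ```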
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use std::sync::Arc;
+use tempfile::NamedTempFile;
+
+use arrow::array::{
+    ArrayRef, BooleanArray, BooleanBuilder, Float64Array, Int64Array, TimestampMillisecondArray,
+};
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use arrow::record_batch::RecordBatch;
+use arrow_array::builder::StringViewBuilder;
+use arrow_array::{Array, StringViewArray};
+use parquet::arrow::arrow_reader::{
+    ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions, RowFilter,
+};
+use parquet::arrow::{ArrowWriter, ProjectionMask};
+use parquet::file::properties::WriterProperties;
+
+/// Create a RecordBatch with 100K rows and four columns.
+fn make_record_batch() -> RecordBatch {
+    let num_rows = 100_000;
+
+    // int64 column: sequential numbers 0..num_rows
+    let int_values: Vec<i64> = (0..num_rows as i64).collect();
+    let int_array = Arc::new(Int64Array::from(int_values)) as ArrayRef;
+
+    // float64 column: derived from int64 (e.g., multiplied by 0.1)
+    let float_values: Vec<f64> = (0..num_rows).map(|i| i as f64 * 0.1).collect();
+    let float_array = Arc::new(Float64Array::from(float_values)) as ArrayRef;
+
+    // utf8View column: even rows get non-empty strings; odd rows get an empty string;
+    // every 10Kth even row is "const" to be selective.
+    let mut string_view_builder = StringViewBuilder::with_capacity(100_000);
+    for i in 0..num_rows {
+        if i % 2 == 0 {
+            if i % 10_000 == 0 {
+                string_view_builder.append_value("const");
+            } else {
+                string_view_builder.append_value("nonempty");
+            }
+        } else {
+            string_view_builder.append_value("");
+        }
+    }
+    let utf8_view_array = Arc::new(string_view_builder.finish()) as ArrayRef;
+
+    // Timestamp column: using milliseconds from an epoch (simply using the row index)
+    let ts_values: Vec<i64> = (0..num_rows as i64).collect();
+    let ts_array = Arc::new(TimestampMillisecondArray::from(ts_values)) as ArrayRef;
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("int64", DataType::Int64, false),
+        Field::new("float64", DataType::Float64, false),
+        Field::new("utf8View", DataType::Utf8View, false),
+        Field::new(
+            "ts",
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            false,
+        ),
+    ]));
+
+    RecordBatch::try_new(
+        schema,
+        vec![int_array, float_array, utf8_view_array, ts_array],
+    )
+    .unwrap()
+}
+
+/// Writes the record batch to a temporary Parquet file.
+fn write_parquet_file() -> NamedTempFile {
+    let batch = make_record_batch();
+    let schema = batch.schema();
+    let props = WriterProperties::builder().build();
+
+    let file = tempfile::Builder::new()
+        .suffix(".parquet")
+        .tempfile()
+        .unwrap();
+    {
+        let file_reopen = file.reopen().unwrap();
+        let mut writer = ArrowWriter::try_new(file_reopen, schema.clone(), Some(props)).unwrap();
+        // Write the entire batch as a single row group.
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+    }
+    file
+}
+
+/// Filter function: returns a BooleanArray with true when utf8View <> "".
+fn filter_utf8_view_nonempty(batch: &RecordBatch) -> BooleanArray {
+    let array = batch
+        .column(batch.schema().index_of("utf8View").unwrap())
+        .as_any()
+        .downcast_ref::<StringViewArray>()
+        .unwrap();
+    let mut builder = BooleanBuilder::with_capacity(array.len());
+    for i in 0..array.len() {
+        let keep = array.value(i) != "";
+        builder.append_value(keep);
+    }
+    builder.finish()
+}
+
+/// Filter function: returns a BooleanArray with true when utf8View == "const".
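+///
+/// (With the data generated above, that is only every 10Kth even row, i.e.
+/// 10 of the 100K rows, so this predicate is highly selective.)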
+fn filter_utf8_view_const(batch: &RecordBatch) -> BooleanArray {
+    let array = batch
+        .column(batch.schema().index_of("utf8View").unwrap())
+        .as_any()
+        .downcast_ref::<StringViewArray>()
+        .unwrap();
+    let mut builder = BooleanBuilder::with_capacity(array.len());
+    for i in 0..array.len() {
+        let keep = array.value(i) == "const";
+        builder.append_value(keep);
+    }
+    builder.finish()
+}
+
+/// Integer non-selective filter: returns true for even numbers.
+fn filter_int64_even(batch: &RecordBatch) -> BooleanArray {
+    let array = batch
+        .column(batch.schema().index_of("int64").unwrap())
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let mut builder = BooleanBuilder::with_capacity(array.len());
+    for i in 0..array.len() {
+        let keep = array.value(i) % 2 == 0;
+        builder.append_value(keep);
+    }
+    builder.finish()
+}
+
+/// Integer selective filter: returns true only when int64 equals 0.
+fn filter_int64_eq_zero(batch: &RecordBatch) -> BooleanArray {
+    let array = batch
+        .column(batch.schema().index_of("int64").unwrap())
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let mut builder = BooleanBuilder::with_capacity(array.len());
+    for i in 0..array.len() {
+        let keep = array.value(i) == 0;
+        builder.append_value(keep);
+    }
+    builder.finish()
+}
+
+/// Timestamp filter: returns true when ts > threshold (using 50_000 as example threshold).
+fn filter_timestamp_gt(batch: &RecordBatch) -> BooleanArray {
+    let array = batch
+        .column(batch.schema().index_of("ts").unwrap())
+        .as_any()
+        .downcast_ref::<TimestampMillisecondArray>()
+        .unwrap();
+    let threshold = 50_000;
+    let mut builder = BooleanBuilder::with_capacity(array.len());
+    for i in 0..array.len() {
+        let keep = array.value(i) > threshold;
+        builder.append_value(keep);
+    }
+    builder.finish()
+}
+
+#[derive(Clone)]
+enum FilterType {
+    Utf8ViewNonEmpty,
+    Utf8ViewConst,
+    Int64Even,
+    Int64EqZero,
+    TimestampGt,
+}
+
+impl std::fmt::Display for FilterType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FilterType::Utf8ViewNonEmpty => write!(f, "utf8View <> ''"),
+            FilterType::Utf8ViewConst => write!(f, "utf8View = 'const'"),
+            FilterType::Int64Even => write!(f, "int64 even"),
+            FilterType::Int64EqZero => write!(f, "int64 = 0"),
+            FilterType::TimestampGt => write!(f, "ts > 50_000"),
+        }
+    }
+}
+
+fn benchmark_filters_and_projections(c: &mut Criterion) {
+    let parquet_file = write_parquet_file();
+
+    // Define filter functions associated with each FilterType.
+    let filter_funcs: Vec<(FilterType, fn(&RecordBatch) -> BooleanArray)> = vec![
+        (FilterType::Utf8ViewNonEmpty, filter_utf8_view_nonempty),
+        (FilterType::Utf8ViewConst, filter_utf8_view_const),
+        (FilterType::Int64Even, filter_int64_even),
+        (FilterType::Int64EqZero, filter_int64_eq_zero),
+        (FilterType::TimestampGt, filter_timestamp_gt),
+    ];
+
+    let mut group = c.benchmark_group("arrow_reader_row_filter");
+
+    // Iterate by value (Copy is available for FilterType and fn pointers)
+    for (filter_type, filter_fn) in filter_funcs.into_iter() {
+        for proj_case in ["all_columns", "exclude_filter_column"].iter() {
+            // Define indices for all columns: [0: "int64", 1: "float64", 2: "utf8View", 3: "ts"]
+            let all_indices = vec![0, 1, 2, 3];
+
+            // For the output projection, conditionally exclude the filter column.
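+            // (Excluding it means the filter column is decoded only while the
+            // predicate is evaluated, not a second time for the output.)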
+            let output_projection: Vec<usize> = if *proj_case == "all_columns" {
+                all_indices.clone()
+            } else {
+                all_indices
+                    .into_iter()
+                    .filter(|i| match filter_type {
+                        FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => *i != 2, // Exclude "utf8" (index 2)
+                        FilterType::Int64Even | FilterType::Int64EqZero => *i != 0, // Exclude "int64" (index 0)
+                        FilterType::TimestampGt => *i != 3, // Exclude "ts" (index 3)
+                    })
+                    .collect()
+            };
+
+            // For predicate pushdown, define a projection that includes the column required for the predicate.
+            let predicate_projection: Vec<usize> = match filter_type {
+                FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => vec![2],
+                FilterType::Int64Even | FilterType::Int64EqZero => vec![0],
+                FilterType::TimestampGt => vec![3],
+            };
+
+            // Create a benchmark id combining filter type and projection case.
+            let bench_id = BenchmarkId::new(
+                format!("filter_case: {} project_case: {}", filter_type, proj_case),
+                "",
+            );
+            group.bench_function(bench_id, |b| {
+                b.iter(|| {
+                    // Reopen the Parquet file for each iteration.
+                    let file = parquet_file.reopen().unwrap();
+                    let options = ArrowReaderOptions::new().with_page_index(true);
+                    let builder = ArrowReaderBuilder::try_new_with_options(file, options).unwrap();
+                    let file_metadata = builder.metadata().file_metadata().clone();
+                    // Build the projection mask from the output projection (clone to avoid move)
+                    let mask = ProjectionMask::roots(
+                        file_metadata.schema_descr(),
+                        output_projection.clone(),
+                    );
+
+                    // Build the predicate mask from the predicate projection (clone to avoid move)
+                    let pred_mask = ProjectionMask::roots(
+                        file_metadata.schema_descr(),
+                        predicate_projection.clone(),
+                    );
+
+                    // Copy the filter function pointer.
+                    let f = filter_fn;
+                    // Wrap the filter function in a closure to satisfy the expected signature.
+                    let filter =
+                        ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| Ok(f(&batch)));
+                    let row_filter = RowFilter::new(vec![Box::new(filter)]);
+
+                    // Build the reader with row filter and output projection.
+                    let reader = builder
+                        .with_row_filter(row_filter)
+                        .with_projection(mask)
+                        .build()
+                        .unwrap();
+
+                    // Collect result batches, unwrapping errors.
+                    let _result: Vec<RecordBatch> = reader.map(|r| r.unwrap()).collect();
+                });
+            });
+        }
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, benchmark_filters_and_projections);
+criterion_main!(benches);

From 31a544fd9bc5506d8c52014a4d7c922680ec1b08 Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Thu, 10 Apr 2025 23:03:51 +0800
Subject: [PATCH 02/21] fix clippy

---
 parquet/benches/arrow_reader_row_filter.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index 788ca3e533b7..5284275c97bf 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -21,7 +21,7 @@
 //! - int64: sequential integers
 //! - float64: floating-point values (derived from the integers)
 //! - utf8View: string values where about half are non-empty,
-//! and a few rows (every 10Kth row) are the constant "const"
+//!   and a few rows (every 10Kth row) are the constant "const"
 //! - ts: timestamp values (using, e.g., a millisecond epoch)
 //!
 //! It then applies several filter functions and projections, benchmarking the read-back speed.
@@ -135,7 +135,7 @@ fn filter_utf8_view_nonempty(batch: &RecordBatch) -> BooleanArray {
         .unwrap();
     let mut builder = BooleanBuilder::with_capacity(array.len());
     for i in 0..array.len() {
-        let keep = array.value(i) != "";
+        let keep = !array.value(i).is_empty();
         builder.append_value(keep);
     }
     builder.finish()
@@ -227,7 +227,8 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
     let parquet_file = write_parquet_file();
 
     // Define filter functions associated with each FilterType.
-    let filter_funcs: Vec<(FilterType, fn(&RecordBatch) -> BooleanArray)> = vec![
+    type FilterFn = fn(&RecordBatch) -> BooleanArray;
+    let filter_funcs: Vec<(FilterType, FilterFn)> = vec![
         (FilterType::Utf8ViewNonEmpty, filter_utf8_view_nonempty),
         (FilterType::Utf8ViewConst, filter_utf8_view_const),
         (FilterType::Int64Even, filter_int64_even),

From b16428db3fdeebf5085dde4486e987dbcfaa5ed4 Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Fri, 11 Apr 2025 17:15:32 +0800
Subject: [PATCH 03/21] change benchmark to use async read to trigger the page
 cache

---
 7401.patch                                 | 391 +++++++++++++++++++++
 parquet/Cargo.toml                         |   5 +-
 parquet/benches/arrow_reader_row_filter.rs |  88 ++---
 3 files changed, 441 insertions(+), 43 deletions(-)
 create mode 100644 7401.patch
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 80b398df21de..eda58e751a62 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -74,8 +74,7 @@ ring = { version = "0.17", default-features = false, features = ["std"], optiona
 
 [dev-dependencies]
 base64 = { version = "0.22", default-features = false, features = ["std"] }
-criterion = { version = "0.5", default-features = false }
-snap = { version = "1.0", default-features = false }
+criterion = { version = "0.5", default-features = false, features = ["async_futures"] }
 tempfile = { version = "3.0", default-features = false }
 brotli = { version = "7.0", default-features = false, features = ["std"] }
 flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] }
@@ -213,7 +212,7 @@ harness = false
 
 [[bench]]
 name = "arrow_reader_row_filter"
-required-features = ["arrow"]
+required-features = ["arrow", "async"]
 harness = false
 
 [[bench]]
diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index 5284275c97bf..af07636e49de 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -50,11 +50,13 @@ use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
 use arrow::record_batch::RecordBatch;
 use arrow_array::builder::StringViewBuilder;
 use arrow_array::{Array, StringViewArray};
-use parquet::arrow::arrow_reader::{
-    ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions, RowFilter,
-};
-use parquet::arrow::{ArrowWriter, ProjectionMask};
+use criterion::async_executor::FuturesExecutor;
+use futures::TryStreamExt;
+use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter};
+use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask};
 use parquet::file::properties::WriterProperties;
+use tokio::fs::File;
+use tokio::runtime::Runtime;
 
 /// Create a RecordBatch with 100K rows and four columns.
 fn make_record_batch() -> RecordBatch {
@@ -226,6 +228,8 @@ impl std::fmt::Display for FilterType {
 fn benchmark_filters_and_projections(c: &mut Criterion) {
     let parquet_file = write_parquet_file();
 
+    let runtime = Runtime::new().unwrap(); // Create a new Tokio runtime
+
     // Define filter functions associated with each FilterType.
     type FilterFn = fn(&RecordBatch) -> BooleanArray;
     let filter_funcs: Vec<(FilterType, FilterFn)> = vec![
@@ -270,47 +274,51 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
                 format!("filter_case: {} project_case: {}", filter_type, proj_case),
                 "",
             );
+
             group.bench_function(bench_id, |b| {
- let file = parquet_file.reopen().unwrap(); - let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ArrowReaderBuilder::try_new_with_options(file, options).unwrap(); - let file_metadata = builder.metadata().file_metadata().clone(); - // Build the projection mask from the output projection (clone to avoid move) - let mask = ProjectionMask::roots( - file_metadata.schema_descr(), - output_projection.clone(), - ); - - // Build the predicate mask from the predicate projection (clone to avoid move) - let pred_mask = ProjectionMask::roots( - file_metadata.schema_descr(), - predicate_projection.clone(), - ); - - // Copy the filter function pointer. - let f = filter_fn; - // Wrap the filter function in a closure to satisfy the expected signature. - let filter = - ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| Ok(f(&batch))); - let row_filter = RowFilter::new(vec![Box::new(filter)]); - - // Build the reader with row filter and output projection. - let reader = builder - .with_row_filter(row_filter) - .with_projection(mask) - .build() - .unwrap(); - - // Collect result batches, unwrapping errors. - let _result: Vec = reader.map(|r| r.unwrap()).collect(); + b.to_async(FuturesExecutor).iter(|| async { + runtime.block_on(async { + // Reopen the Parquet file for each iteration. + let file = File::open(parquet_file.path()).await.unwrap(); + + // Create a async parquet reader builder with batch_size. + let options = ArrowReaderOptions::new().with_page_index(true); + + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(file, options) + .await + .unwrap() + .with_batch_size(8192); + + let file_metadata = builder.metadata().file_metadata().clone(); + + let mask = ProjectionMask::roots( + file_metadata.schema_descr(), + output_projection.clone(), + ); + + let pred_mask = ProjectionMask::roots( + file_metadata.schema_descr(), + predicate_projection.clone(), + ); + + let f = filter_fn; + let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| { + Ok(f(&batch)) + }); + let stream = builder + .with_projection(mask) + .with_row_filter(RowFilter::new(vec![Box::new(filter)])) + .build() + .unwrap(); + + // Collect the results into a vector of RecordBatches. 
+                        stream.try_collect::<Vec<_>>().await.unwrap();
+                    })
                 });
             });
         }
     }
-
-    group.finish();
 }
 
 criterion_group!(benches, benchmark_filters_and_projections);
 criterion_main!(benches);

From 1aacd01ee29cbdff5e7ab7ce9edf4093b9116978 Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Fri, 11 Apr 2025 17:16:40 +0800
Subject: [PATCH 04/21] fix

---
 7401.patch | 391 -----------------------------------------------------
 1 file changed, 391 deletions(-)
 delete mode 100644 7401.patch
From 768826e2e7776108e6fdb99ca3c7c27939526b17 Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Fri, 11 Apr 2025 17:48:43 +0800
Subject: [PATCH 05/21] fix

---
 parquet/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index eda58e751a62..1d2737a0c629 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -75,6 +75,7 @@ ring = { version = "0.17", default-features = false, features = ["std"], optiona
 [dev-dependencies]
 base64 = { version = "0.22", default-features = false, features = ["std"] }
 criterion = { version = "0.5", default-features = false, features = ["async_futures"] }
+snap = { version = "1.0", default-features = false }
 tempfile = { version = "3.0", default-features = false }
 brotli = { version = "7.0", default-features = false, features = ["std"] }
 flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] }

From f624b91c8ad18ffa486c3110b8ef29ed4cc4cd27 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 11 Apr 2025 09:47:21 -0400
Subject: [PATCH 06/21] Update comments, add background

---
 parquet/benches/arrow_reader_row_filter.rs | 53 +++++++++++++++------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index af07636e49de..da89279065e5 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -17,26 +17,28 @@
 
 //! Benchmark for evaluating row filters and projections on a Parquet file.
 //!
-//! This benchmark creates a Parquet file in memory with 100K rows and four columns:
-//! - int64: sequential integers
-//! - float64: floating-point values (derived from the integers)
-//! - utf8View: string values where about half are non-empty,
-//!   and a few rows (every 10Kth row) are the constant "const"
-//! - ts: timestamp values (using, e.g., a millisecond epoch)
+//! # Background:
 //!
-//! It then applies several filter functions and projections, benchmarking the read-back speed.
+//! As described in [Efficient Filter Pushdown in Parquet], evaluating
+//! pushdown filters is a two step process:
 //!
-//! Filters tested:
-//! - A string filter: `utf8View <> ''` (non-empty)
-//! - A string filter: `utf8View = 'const'` (selective)
-//! - An integer non-selective filter (e.g. even numbers)
-//! - An integer selective filter (e.g. `int64 = 0`)
-//! - A timestamp filter (e.g. `ts > threshold`)
+//! 1. Build a filter mask by decoding and evaluating filter functions on
+//!    the filter column(s).
 //!
-//! Projections tested:
-//! - All 4 columns.
-//! - All columns except the one used for the filter.
+//! 2. Decode the rows that match the filter mask from the projected columns.
 //!
+//! The performance of this process depends on several factors, including:
+//!
+//! 1. How many rows are selected as well as how well clustered the results
+//!    are, where the representation of the filter mask is important.
+//! 2. Whether the same column is used for both filtering and projection, as
+//!    the columns that appear in both filtering and projection are decoded twice.
+//!
+//! This benchmark helps measure the performance of these operations.
+//!
+//! [Efficient Filter Pushdown in Parquet]: https://datafusion.apache.org/blog/2025/03/21/parquet-pushdown/
+//!
+//! # To run:
 //! To run the benchmark, use `cargo bench --bench arrow_reader_row_filter`.
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -59,6 +61,12 @@ use tokio::fs::File;
 use tokio::runtime::Runtime;
 
 /// Create a RecordBatch with 100K rows and four columns.
+///
+/// - int64: sequential integers
+/// - float64: floating-point values (derived from the integers)
+/// - utf8View: string values where about half are non-empty,
+///   and a few rows (every 10Kth row) are the constant "const"
+/// - ts: timestamp values (using, e.g., a millisecond epoch)
 fn make_record_batch() -> RecordBatch {
     let num_rows = 100_000;
 
@@ -204,6 +212,12 @@ fn filter_timestamp_gt(batch: &RecordBatch) -> BooleanArray {
     builder.finish()
 }
 
+/// Filters tested:
+/// - A string filter: `utf8View <> ''` (non-empty)
+/// - A string filter: `utf8View = 'const'` (selective)
+/// - An integer non-selective filter (e.g. even numbers)
+/// - An integer selective filter (e.g. `int64 = 0`)
+/// - A timestamp filter (e.g. `ts > threshold`)
 #[derive(Clone)]
 enum FilterType {
     Utf8ViewNonEmpty,
@@ -225,6 +239,13 @@ impl std::fmt::Display for FilterType {
     }
 }
 
+/// This benchmark tests the performance of row filters and projections
+///
+/// Tests combinations of FilterType and ProjectionType
+///
+/// Projections tested:
+/// - All 4 columns.
+/// - All columns except the one used for the filter.
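+///
+/// For example, `int64 = 0` with the `exclude_filter_column` case projects
+/// only [float64, utf8View, ts], while the predicate mask covers just the
+/// int64 column.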
 fn benchmark_filters_and_projections(c: &mut Criterion) {
     let parquet_file = write_parquet_file();
 

From 6c28e447fa5d6d9919273be3f68771537bd1a5b3 Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Fri, 11 Apr 2025 22:28:58 +0800
Subject: [PATCH 07/21] incrementally addressing the comments

---
 parquet/benches/arrow_reader_row_filter.rs | 289 ++++++++++-----------
 1 file changed, 134 insertions(+), 155 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index af07636e49de..69718d0d7160 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -18,97 +18,136 @@
 //! Benchmark for evaluating row filters and projections on a Parquet file.
 //!
 //! This benchmark creates a Parquet file in memory with 100K rows and four columns:
-//! - int64: sequential integers
-//! - float64: floating-point values (derived from the integers)
-//! - utf8View: string values where about half are non-empty,
-//!   and a few rows (every 10Kth row) are the constant "const"
-//! - ts: timestamp values (using, e.g., a millisecond epoch)
-//!
-//! It then applies several filter functions and projections, benchmarking the read-back speed.
+//! - int64: random integers generated using a fixed seed (range: 0..100)
+//! - float64: random floating-point values generated using a fixed seed (range: 0.0..100.0)
+//! - utf8View: random strings (with some empty values and the constant "const").
+//!   Randomly produces short strings (3-12 bytes) and long strings (13-20 bytes).
+//! - ts: sequential timestamps in milliseconds
 //!
 //! Filters tested:
-//! - A string filter: `utf8View <> ''` (non-empty)
-//! - A string filter: `utf8View = 'const'` (selective)
-//! - An integer non-selective filter (e.g. even numbers)
-//! - An integer selective filter (e.g. `int64 = 0`)
-//! - A timestamp filter (e.g. `ts > threshold`)
+//! - utf8View <> '' (non-selective, ~80% of rows)
+//! - utf8View = 'const' (selective, ~5% of rows)
+//! - int64 = 0 (selective)
+//! - ts > 50_000 (non-selective, ~50% of rows)
 //!
 //! Projections tested:
-//! - All 4 columns.
-//! - All columns except the one used for the filter.
+//! - All columns.
+//! - All columns except the one used for filtering.
 //!
 //! To run the benchmark, use `cargo bench --bench arrow_reader_row_filter`.
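+//!
+//! (The quoted selectivities follow from the generator below: with
+//! null_density = 0.2, a draw under 20 yields "", a draw in 20..25 yields
+//! "const", so `utf8View <> ''` keeps ~80% of rows and `utf8View = 'const'`
+//! ~5%; `ts > 50_000` keeps half of the 100K sequential timestamps.)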
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use rand::{rngs::StdRng, Rng, SeedableRng};
 use std::sync::Arc;
 use tempfile::NamedTempFile;
 
-use arrow::array::{
-    ArrayRef, BooleanArray, BooleanBuilder, Float64Array, Int64Array, TimestampMillisecondArray,
-};
-use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
-use arrow::record_batch::RecordBatch;
+use arrow::array::{ArrayRef, Float64Array, Int64Array, TimestampMillisecondArray};
+use arrow::compute::kernels::cmp::{eq, gt, neq};
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
+use arrow::record_batch::{RecordBatch, RecordBatchOptions};
 use arrow_array::builder::StringViewBuilder;
-use arrow_array::{Array, StringViewArray};
-use criterion::async_executor::FuturesExecutor;
+use arrow_array::StringViewArray;
+use arrow_cast::pretty::pretty_format_batches;
 use futures::TryStreamExt;
 use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter};
 use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask};
 use parquet::file::properties::WriterProperties;
 use tokio::fs::File;
-use tokio::runtime::Runtime;
-
-/// Create a RecordBatch with 100K rows and four columns.
-fn make_record_batch() -> RecordBatch {
-    let num_rows = 100_000;
-
-    // int64 column: sequential numbers 0..num_rows
-    let int_values: Vec<i64> = (0..num_rows as i64).collect();
-    let int_array = Arc::new(Int64Array::from(int_values)) as ArrayRef;
-
-    // float64 column: derived from int64 (e.g., multiplied by 0.1)
-    let float_values: Vec<f64> = (0..num_rows).map(|i| i as f64 * 0.1).collect();
-    let float_array = Arc::new(Float64Array::from(float_values)) as ArrayRef;
-
-    // utf8View column: even rows get non-empty strings; odd rows get an empty string;
-    // every 10Kth even row is "const" to be selective.
- let mut string_view_builder = StringViewBuilder::with_capacity(100_000); - for i in 0..num_rows { - if i % 2 == 0 { - if i % 10_000 == 0 { - string_view_builder.append_value("const"); - } else { - string_view_builder.append_value("nonempty"); +fn create_random_array( + field: &Field, + size: usize, + null_density: f32, + _true_density: f32, +) -> arrow::error::Result { + match field.data_type() { + DataType::Int64 => { + let mut rng = StdRng::seed_from_u64(42); + let values: Vec = (0..size).map(|_| rng.random_range(0..100)).collect(); + Ok(Arc::new(Int64Array::from(values)) as ArrayRef) + } + DataType::Float64 => { + let mut rng = StdRng::seed_from_u64(43); + let values: Vec = (0..size).map(|_| rng.random_range(0.0..100.0)).collect(); + Ok(Arc::new(Float64Array::from(values)) as ArrayRef) + } + DataType::Utf8View => { + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(44); + for _ in 0..size { + let choice = rng.random_range(0..100); + if choice < (null_density * 100.0) as u32 { + builder.append_value(""); + } else if choice < 25 { + builder.append_value("const"); + } else { + let is_long = rng.random_range(0.5); + let len = if is_long { + rng.random_range(13..21) + } else { + rng.random_range(3..12) + }; + let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let s: String = (0..len) + .map(|_| { + let idx = rng.random_range(0..charset.len()); + charset[idx] as char + }) + .collect(); + builder.append_value(&s); + } } - } else { - string_view_builder.append_value(""); + Ok(Arc::new(builder.finish()) as ArrayRef) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + let values: Vec = (0..size as i64).collect(); + Ok(Arc::new(TimestampMillisecondArray::from(values)) as ArrayRef) } + _ => unimplemented!("Field type not supported in create_random_array"), } - let utf8_view_array = Arc::new(string_view_builder.finish()) as ArrayRef; +} - // Timestamp column: using milliseconds from an epoch (simply using the row index) - let ts_values: Vec = (0..num_rows as i64).collect(); - let ts_array = Arc::new(TimestampMillisecondArray::from(ts_values)) as ArrayRef; +pub fn create_random_batch( + schema: SchemaRef, + size: usize, + null_density: f32, + true_density: f32, +) -> arrow::error::Result { + let columns = schema + .fields() + .iter() + .map(|field| create_random_array(field, size, null_density, true_density)) + .collect::>>()?; + RecordBatch::try_new_with_options( + schema, + columns, + &RecordBatchOptions::new().with_match_field_names(false), + ) +} - let schema = Arc::new(Schema::new(vec![ +fn make_record_batch() -> RecordBatch { + let num_rows = 100_000; + let fields = vec![ Field::new("int64", DataType::Int64, false), Field::new("float64", DataType::Float64, false), - Field::new("utf8View", DataType::Utf8View, false), + Field::new("utf8View", DataType::Utf8View, true), Field::new( "ts", DataType::Timestamp(TimeUnit::Millisecond, None), false, ), - ])); - - RecordBatch::try_new( - schema, - vec![int_array, float_array, utf8_view_array, ts_array], - ) - .unwrap() + ]; + let schema = Arc::new(Schema::new(fields)); + let batch = create_random_batch(schema, num_rows, 0.2, 0.5).unwrap(); + + println!("Batch created with {} rows", num_rows); + println!( + "First 100 rows:\n{}", + pretty_format_batches(&[batch.clone().slice(0, 100)]).unwrap() + ); + batch } -/// Writes the record batch to a temporary Parquet file. 
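The fixed seeds (42, 43, 44) are what make benchmark runs comparable: every invocation regenerates identical column data. A minimal sketch of that property, assuming the same rand 0.9 API (`StdRng`, `seed_from_u64`, `random_range`) used above:

```rust
use rand::{rngs::StdRng, Rng, SeedableRng};

fn main() {
    // Two generators seeded identically produce identical streams, so
    // every benchmark run reads back exactly the same generated data.
    let mut a = StdRng::seed_from_u64(42);
    let mut b = StdRng::seed_from_u64(42);
    let xs: Vec<i64> = (0..5).map(|_| a.random_range(0..100)).collect();
    let ys: Vec<i64> = (0..5).map(|_| b.random_range(0..100)).collect();
    assert_eq!(xs, ys);
}
```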
fn write_parquet_file() -> NamedTempFile { let batch = make_record_batch(); let schema = batch.schema(); @@ -121,94 +160,47 @@ fn write_parquet_file() -> NamedTempFile { { let file_reopen = file.reopen().unwrap(); let mut writer = ArrowWriter::try_new(file_reopen, schema.clone(), Some(props)).unwrap(); - // Write the entire batch as a single row group. writer.write(&batch).unwrap(); writer.close().unwrap(); } file } -/// Filter function: returns a BooleanArray with true when utf8View <> "". -fn filter_utf8_view_nonempty(batch: &RecordBatch) -> BooleanArray { - let array = batch - .column(batch.schema().index_of("utf8View").unwrap()) - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = BooleanBuilder::with_capacity(array.len()); - for i in 0..array.len() { - let keep = !array.value(i).is_empty(); - builder.append_value(keep); - } - builder.finish() +// Use Arrow compute kernels for filtering. +// Returns a BooleanArray where true indicates the row satisfies the condition. +fn filter_utf8_view_nonempty( + batch: &RecordBatch, +) -> arrow::error::Result { + let array = batch.column(batch.schema().index_of("utf8View").unwrap()); + let string_view_scalar = StringViewArray::new_scalar(""); + // Compare with empty string + let not_equals_empty = neq(array, &string_view_scalar)?; + Ok(not_equals_empty) } -/// Filter function: returns a BooleanArray with true when utf8View == "const". -fn filter_utf8_view_const(batch: &RecordBatch) -> BooleanArray { - let array = batch - .column(batch.schema().index_of("utf8View").unwrap()) - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = BooleanBuilder::with_capacity(array.len()); - for i in 0..array.len() { - let keep = array.value(i) == "const"; - builder.append_value(keep); - } - builder.finish() +fn filter_utf8_view_const(batch: &RecordBatch) -> arrow::error::Result { + let array = batch.column(batch.schema().index_of("utf8View").unwrap()); + let string_view_scalar = StringViewArray::new_scalar("const"); + let eq_const = eq(array, &string_view_scalar)?; + Ok(eq_const) } - -/// Integer non-selective filter: returns true for even numbers. -fn filter_int64_even(batch: &RecordBatch) -> BooleanArray { - let array = batch - .column(batch.schema().index_of("int64").unwrap()) - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = BooleanBuilder::with_capacity(array.len()); - for i in 0..array.len() { - let keep = array.value(i) % 2 == 0; - builder.append_value(keep); - } - builder.finish() +fn filter_int64_eq_zero(batch: &RecordBatch) -> arrow::error::Result { + let array = batch.column(batch.schema().index_of("int64").unwrap()); + let eq_zero = eq(array, &Int64Array::new_scalar(0))?; + Ok(eq_zero) } -/// Integer selective filter: returns true only when int64 equals 0. -fn filter_int64_eq_zero(batch: &RecordBatch) -> BooleanArray { - let array = batch - .column(batch.schema().index_of("int64").unwrap()) - .as_any() - .downcast_ref::() - .unwrap(); - let mut builder = BooleanBuilder::with_capacity(array.len()); - for i in 0..array.len() { - let keep = array.value(i) == 0; - builder.append_value(keep); - } - builder.finish() -} - -/// Timestamp filter: returns true when ts > threshold (using 50_000 as example threshold). 
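The rewritten filters above replace the per-row `BooleanBuilder` loops with Arrow's vectorized comparison kernels. The kernels take `Datum` arguments, so a `Scalar` built with `new_scalar` broadcasts a single value against an entire column; a self-contained sketch (the example values are made up):

```rust
use arrow::compute::kernels::cmp::eq;
use arrow_array::StringViewArray;

fn main() -> Result<(), arrow::error::ArrowError> {
    let col = StringViewArray::from(vec!["const", "x", "const"]);
    // The scalar is broadcast against every row; the kernel returns a
    // BooleanArray mask in one vectorized pass.
    let mask = eq(&col, &StringViewArray::new_scalar("const"))?;
    assert_eq!(mask.true_count(), 2);
    Ok(())
}
```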
-fn filter_timestamp_gt(batch: &RecordBatch) -> BooleanArray { - let array = batch - .column(batch.schema().index_of("ts").unwrap()) - .as_any() - .downcast_ref::() - .unwrap(); - let threshold = 50_000; - let mut builder = BooleanBuilder::with_capacity(array.len()); - for i in 0..array.len() { - let keep = array.value(i) > threshold; - builder.append_value(keep); - } - builder.finish() +fn filter_timestamp_gt(batch: &RecordBatch) -> arrow::error::Result { + let array = batch.column(batch.schema().index_of("ts").unwrap()); + // For Timestamp arrays, use ScalarValue::TimestampMillisecond. + let gt_thresh = gt(array, &TimestampMillisecondArray::new_scalar(50_000))?; + Ok(gt_thresh) } #[derive(Clone)] enum FilterType { Utf8ViewNonEmpty, Utf8ViewConst, - Int64Even, Int64EqZero, TimestampGt, } @@ -218,7 +210,6 @@ impl std::fmt::Display for FilterType { match self { FilterType::Utf8ViewNonEmpty => write!(f, "utf8View <> ''"), FilterType::Utf8ViewConst => write!(f, "utf8View = 'const'"), - FilterType::Int64Even => write!(f, "int64 even"), FilterType::Int64EqZero => write!(f, "int64 = 0"), FilterType::TimestampGt => write!(f, "ts > 50_000"), } @@ -228,62 +219,53 @@ impl std::fmt::Display for FilterType { fn benchmark_filters_and_projections(c: &mut Criterion) { let parquet_file = write_parquet_file(); - let runtime = Runtime::new().unwrap(); // Create a new Tokio runtime - - // Define filter functions associated with each FilterType. - type FilterFn = fn(&RecordBatch) -> BooleanArray; + type FilterFn = fn(&RecordBatch) -> arrow::error::Result; let filter_funcs: Vec<(FilterType, FilterFn)> = vec![ (FilterType::Utf8ViewNonEmpty, filter_utf8_view_nonempty), (FilterType::Utf8ViewConst, filter_utf8_view_const), - (FilterType::Int64Even, filter_int64_even), (FilterType::Int64EqZero, filter_int64_eq_zero), (FilterType::TimestampGt, filter_timestamp_gt), ]; let mut group = c.benchmark_group("arrow_reader_row_filter"); - // Iterate by value (Copy is available for FilterType and fn pointers) for (filter_type, filter_fn) in filter_funcs.into_iter() { for proj_case in ["all_columns", "exclude_filter_column"].iter() { - // Define indices for all columns: [0: "int64", 1: "float64", 2: "utf8View", 3: "ts"] let all_indices = vec![0, 1, 2, 3]; - // For the output projection, conditionally exclude the filter column. let output_projection: Vec = if *proj_case == "all_columns" { all_indices.clone() } else { all_indices .into_iter() .filter(|i| match filter_type { - FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => *i != 2, // Exclude "utf8" (index 2) - FilterType::Int64Even | FilterType::Int64EqZero => *i != 0, // Exclude "int64" (index 0) - FilterType::TimestampGt => *i != 3, // Exclude "ts" (index 3) + FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => *i != 2, + FilterType::Int64EqZero => *i != 0, + FilterType::TimestampGt => *i != 3, }) .collect() }; - // For predicate pushdown, define a projection that includes the column required for the predicate. let predicate_projection: Vec = match filter_type { FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => vec![2], - FilterType::Int64Even | FilterType::Int64EqZero => vec![0], + FilterType::Int64EqZero => vec![0], FilterType::TimestampGt => vec![3], }; - // Create a benchmark id combining filter type and projection case. 
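One wrinkle in the string generator above is the long/short coin flip, which the "Fix bool random" commit below corrects: `random_range` expects a range, while a bare probability needs `random_bool`. A sketch under the rand 0.9 API:

```rust
use rand::{rngs::StdRng, Rng, SeedableRng};

fn main() {
    let mut rng = StdRng::seed_from_u64(44);
    // `random_bool(0.5)` is a fair coin flip; `random_range(0.5)` does
    // not compile because the argument must be a range such as `13..21`.
    let is_long = rng.random_bool(0.5);
    let len = if is_long {
        rng.random_range(13..21) // long string: 13-20 bytes
    } else {
        rng.random_range(3..12) // short string: 3-11 bytes
    };
    assert!((3..21).contains(&len));
}
```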
let bench_id = BenchmarkId::new( format!("filter_case: {} project_case: {}", filter_type, proj_case), "", ); group.bench_function(bench_id, |b| { - b.to_async(FuturesExecutor).iter(|| async { - runtime.block_on(async { - // Reopen the Parquet file for each iteration. + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + b.iter(|| { + rt.block_on(async { let file = File::open(parquet_file.path()).await.unwrap(); - - // Create a async parquet reader builder with batch_size. let options = ArrowReaderOptions::new().with_page_index(true); - let builder = ParquetRecordBatchStreamBuilder::new_with_options(file, options) .await @@ -291,12 +273,10 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { .with_batch_size(8192); let file_metadata = builder.metadata().file_metadata().clone(); - let mask = ProjectionMask::roots( file_metadata.schema_descr(), output_projection.clone(), ); - let pred_mask = ProjectionMask::roots( file_metadata.schema_descr(), predicate_projection.clone(), @@ -304,7 +284,7 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { let f = filter_fn; let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| { - Ok(f(&batch)) + Ok(f(&batch).unwrap()) }); let stream = builder .with_projection(mask) @@ -312,7 +292,6 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { .build() .unwrap(); - // Collect the results into a vector of RecordBatches. stream.try_collect::>().await.unwrap(); }) }); From 69a2617e97fca04b47f41221437a5b1d7a79554a Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 11 Apr 2025 22:33:57 +0800 Subject: [PATCH 08/21] Fix bool random --- parquet/benches/arrow_reader_row_filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 69718d0d7160..a3ac9537fa9f 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -81,7 +81,7 @@ fn create_random_array( } else if choice < 25 { builder.append_value("const"); } else { - let is_long = rng.random_range(0.5); + let is_long = rng.random_bool(0.5); let len = if is_long { rng.random_range(13..21) } else { From 6a378182e60cd6c6a61a34cc6dd86147c17ebd29 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 11 Apr 2025 10:38:51 -0400 Subject: [PATCH 09/21] fixup --- parquet/benches/arrow_reader_row_filter.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 547a2e33dbf5..a5f1b6c30a8f 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -147,7 +147,6 @@ pub fn create_random_batch( ) } ->>>>>>> 69a2617 fn make_record_batch() -> RecordBatch { let num_rows = 100_000; let fields = vec![ @@ -220,12 +219,6 @@ fn filter_timestamp_gt(batch: &RecordBatch) -> arrow::error::Result ''` (non-empty) -/// - A string filter: `utf8View = 'const'` (selective) -/// - An integer non-selective filter (e.g. even numbers) -/// - An integer selective filter (e.g. `int64 = 0`) -/// - A timestamp filter (e.g. `ts > threshold`) #[derive(Clone)] enum FilterType { Utf8ViewNonEmpty, @@ -245,13 +238,6 @@ impl std::fmt::Display for FilterType { } } -/// This benchmark tests the performance of row filters and projections -/// -/// Tests combinations of FilterType and ProjectionType -/// -/// Projections tested: -/// - All 4 columns. 
-/// - All columns except the one used for the filter. fn benchmark_filters_and_projections(c: &mut Criterion) { let parquet_file = write_parquet_file(); From 2f6ccbb2609ce76f561c468e0b45c6af4f0cdca1 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 11 Apr 2025 23:01:18 +0800 Subject: [PATCH 10/21] Add fn switch and project enum --- parquet/benches/arrow_reader_row_filter.rs | 151 ++++++++++++--------- 1 file changed, 90 insertions(+), 61 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index a3ac9537fa9f..ab9e25390315 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -25,10 +25,10 @@ //! - ts: sequential timestamps in milliseconds //! //! Filters tested: -//! - utf8View <> '' (no selective) %80 -//! - utf8View = 'const' (selective) %5 +//! - utf8View <> '' (non-selective) +//! - utf8View = 'const' (selective) //! - int64 = 0 (selective) -//! - ts > 50_000 (no selective) %50 +//! - ts > 50_000 (non-selective) //! //! Projections tested: //! - All columns. @@ -41,7 +41,7 @@ use rand::{rngs::StdRng, Rng, SeedableRng}; use std::sync::Arc; use tempfile::NamedTempFile; -use arrow::array::{ArrayRef, Float64Array, Int64Array, TimestampMillisecondArray}; +use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, TimestampMillisecondArray}; use arrow::compute::kernels::cmp::{eq, gt, neq}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; @@ -54,6 +54,7 @@ use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMas use parquet::file::properties::WriterProperties; use tokio::fs::File; +/// Create a random array for a given field. fn create_random_array( field: &Field, size: usize, @@ -77,6 +78,7 @@ fn create_random_array( for _ in 0..size { let choice = rng.random_range(0..100); if choice < (null_density * 100.0) as u32 { + // Use empty string to represent a null value. builder.append_value(""); } else if choice < 25 { builder.append_value("const"); @@ -107,6 +109,7 @@ fn create_random_array( } } +/// Create a random RecordBatch from the given schema. pub fn create_random_batch( schema: SchemaRef, size: usize, @@ -125,6 +128,7 @@ pub fn create_random_batch( ) } +/// Create a RecordBatch with 100K rows and four columns. fn make_record_batch() -> RecordBatch { let num_rows = 100_000; let fields = vec![ @@ -148,6 +152,7 @@ fn make_record_batch() -> RecordBatch { batch } +/// Write the RecordBatch to a temporary Parquet file. fn write_parquet_file() -> NamedTempFile { let batch = make_record_batch(); let schema = batch.schema(); @@ -166,37 +171,7 @@ fn write_parquet_file() -> NamedTempFile { file } -// Use Arrow compute kernels for filtering. -// Returns a BooleanArray where true indicates the row satisfies the condition. 
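A detail worth noting in the generator above: logical nulls are encoded as empty strings ("Use empty string to represent a null value"). That keeps `utf8View <> ''` a plain true/false comparison; with real nulls, the comparison kernels would propagate null rather than produce false, as this sketch with illustrative values shows:

```rust
use arrow::compute::kernels::cmp::neq;
use arrow_array::{Array, StringViewArray};

fn main() -> Result<(), arrow::error::ArrowError> {
    // A true null compares to null, not to false.
    let with_null = StringViewArray::from(vec![Some("a"), None]);
    let mask = neq(&with_null, &StringViewArray::new_scalar(""))?;
    assert!(mask.is_null(1));
    Ok(())
}
```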
-fn filter_utf8_view_nonempty( - batch: &RecordBatch, -) -> arrow::error::Result { - let array = batch.column(batch.schema().index_of("utf8View").unwrap()); - let string_view_scalar = StringViewArray::new_scalar(""); - // Compare with empty string - let not_equals_empty = neq(array, &string_view_scalar)?; - Ok(not_equals_empty) -} - -fn filter_utf8_view_const(batch: &RecordBatch) -> arrow::error::Result { - let array = batch.column(batch.schema().index_of("utf8View").unwrap()); - let string_view_scalar = StringViewArray::new_scalar("const"); - let eq_const = eq(array, &string_view_scalar)?; - Ok(eq_const) -} -fn filter_int64_eq_zero(batch: &RecordBatch) -> arrow::error::Result { - let array = batch.column(batch.schema().index_of("int64").unwrap()); - let eq_zero = eq(array, &Int64Array::new_scalar(0))?; - Ok(eq_zero) -} - -fn filter_timestamp_gt(batch: &RecordBatch) -> arrow::error::Result { - let array = batch.column(batch.schema().index_of("ts").unwrap()); - // For Timestamp arrays, use ScalarValue::TimestampMillisecond. - let gt_thresh = gt(array, &TimestampMillisecondArray::new_scalar(50_000))?; - Ok(gt_thresh) -} - +/// FilterType encapsulates the different filter comparisons. #[derive(Clone)] enum FilterType { Utf8ViewNonEmpty, @@ -216,46 +191,95 @@ impl std::fmt::Display for FilterType { } } +impl FilterType { + /// Filters the given batch according to self using Arrow compute kernels. + /// Returns a BooleanArray where true indicates that the row satisfies the condition. + fn filter_batch(&self, batch: &RecordBatch) -> arrow::error::Result { + match self { + FilterType::Utf8ViewNonEmpty => { + let array = batch.column(batch.schema().index_of("utf8View").unwrap()); + let string_view_scalar = StringViewArray::new_scalar(""); + let not_equals_empty = neq(array, &string_view_scalar)?; + Ok(not_equals_empty) + } + FilterType::Utf8ViewConst => { + let array = batch.column(batch.schema().index_of("utf8View").unwrap()); + let string_view_scalar = StringViewArray::new_scalar("const"); + let eq_const = eq(array, &string_view_scalar)?; + Ok(eq_const) + } + FilterType::Int64EqZero => { + let array = batch.column(batch.schema().index_of("int64").unwrap()); + let eq_zero = eq(array, &Int64Array::new_scalar(0))?; + Ok(eq_zero) + } + FilterType::TimestampGt => { + let array = batch.column(batch.schema().index_of("ts").unwrap()); + let gt_thresh = gt(array, &TimestampMillisecondArray::new_scalar(50_000))?; + Ok(gt_thresh) + } + } + } +} + +/// ProjectionCase defines the projection mode. 
+#[derive(Clone)] +enum ProjectionCase { + AllColumns, + ExcludeFilterColumn, +} + +impl std::fmt::Display for ProjectionCase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ProjectionCase::AllColumns => write!(f, "all_columns"), + ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), + } + } +} + fn benchmark_filters_and_projections(c: &mut Criterion) { let parquet_file = write_parquet_file(); - type FilterFn = fn(&RecordBatch) -> arrow::error::Result; - let filter_funcs: Vec<(FilterType, FilterFn)> = vec![ - (FilterType::Utf8ViewNonEmpty, filter_utf8_view_nonempty), - (FilterType::Utf8ViewConst, filter_utf8_view_const), - (FilterType::Int64EqZero, filter_int64_eq_zero), - (FilterType::TimestampGt, filter_timestamp_gt), + let filter_types: Vec = vec![ + FilterType::Utf8ViewNonEmpty, + FilterType::Utf8ViewConst, + FilterType::Int64EqZero, + FilterType::TimestampGt, + ]; + + let projection_cases = vec![ + ProjectionCase::AllColumns, + ProjectionCase::ExcludeFilterColumn, ]; let mut group = c.benchmark_group("arrow_reader_row_filter"); - for (filter_type, filter_fn) in filter_funcs.into_iter() { - for proj_case in ["all_columns", "exclude_filter_column"].iter() { + for filter_type in filter_types.iter().cloned() { + for proj_case in &projection_cases { + // All column indices: [0: int64, 1: float64, 2: utf8View, 3: ts] let all_indices = vec![0, 1, 2, 3]; - - let output_projection: Vec = if *proj_case == "all_columns" { - all_indices.clone() - } else { - all_indices + let filter_col = match filter_type { + FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => 2, + FilterType::Int64EqZero => 0, + FilterType::TimestampGt => 3, + }; + let output_projection: Vec = match proj_case { + ProjectionCase::AllColumns => all_indices.clone(), + ProjectionCase::ExcludeFilterColumn => all_indices .into_iter() - .filter(|i| match filter_type { - FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => *i != 2, - FilterType::Int64EqZero => *i != 0, - FilterType::TimestampGt => *i != 3, - }) - .collect() + .filter(|i| *i != filter_col) + .collect(), }; - + // For predicate pushdown, include the filter column. 
let predicate_projection: Vec = match filter_type { FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => vec![2], FilterType::Int64EqZero => vec![0], FilterType::TimestampGt => vec![3], }; - let bench_id = BenchmarkId::new( - format!("filter_case: {} project_case: {}", filter_type, proj_case), - "", - ); + let bench_id = + BenchmarkId::new(format!("filter: {} proj: {}", filter_type, proj_case), ""); group.bench_function(bench_id, |b| { let rt = tokio::runtime::Builder::new_multi_thread() @@ -263,6 +287,9 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { .build() .unwrap(); b.iter(|| { + // Clone filter_type inside the closure to avoid moving it + let filter_type_inner = filter_type.clone(); + rt.block_on(async { let file = File::open(parquet_file.path()).await.unwrap(); let options = ArrowReaderOptions::new().with_page_index(true); @@ -282,10 +309,12 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { predicate_projection.clone(), ); - let f = filter_fn; let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| { - Ok(f(&batch).unwrap()) + // Clone filter_type within the closure + let filter_type_inner = filter_type_inner.clone(); + Ok(filter_type_inner.filter_batch(&batch).unwrap()) }); + let stream = builder .with_projection(mask) .with_row_filter(RowFilter::new(vec![Box::new(filter)])) From d0a656b8d2a16b594a7df52ba6f21ea0e5f3b14c Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 11 Apr 2025 23:13:26 +0800 Subject: [PATCH 11/21] Fix clippy --- parquet/benches/arrow_reader_row_filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 83c323ff69ff..5d7e50e0912b 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -45,7 +45,7 @@ //! - int64: random integers generated using a fixed seed (range: 0..100) //! - float64: random floating-point values generated using a fixed seed (range: 0.0..100.0) //! - utf8View: random strings (with some empty values and the constant "const"). -//! Randomly produces short strings (3-12 bytes) and long strings (13-20 bytes). +//! Randomly produces short strings (3-12 bytes) and long strings (13-20 bytes). //! - ts: sequential timestamps in milliseconds //! //! Filters tested: From 67480b9a4428effcaf8b2f6f97650563669a1e02 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sat, 12 Apr 2025 14:38:54 +0800 Subject: [PATCH 12/21] Address comment --- parquet/benches/arrow_reader_row_filter.rs | 513 +++++++++++++++------ 1 file changed, 378 insertions(+), 135 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 5d7e50e0912b..12fd02e23c93 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -20,63 +20,77 @@ //! # Background: //! //! As described in [Efficient Filter Pushdown in Parquet], evaluating -//! pushdown filters is a two step process: +//! pushdown filters is a two-step process: //! //! 1. Build a filter mask by decoding and evaluating filter functions on //! the filter column(s). //! //! 2. Decode the rows that match the filter mask from the projected columns. //! -//! The performance of this process depending on several factors, including: -//! -//! 1. How many rows are selected as well and how well clustered the results -//! are, where the representation of the filter mask is important. -//! 2. 
If the same column is used for both filtering and projection, as the -//! columns that appear in both filtering and projection are decoded twice. +//! The performance depends on factors such as the number of rows selected, +//! the clustering of results (which affects the efficiency of the filter mask), +//! and whether the same column is used for both filtering and projection. //! //! This benchmark helps measure the performance of these operations. //! //! [Efficient Filter Pushdown in Parquet]: https://datafusion.apache.org/blog/2025/03/21/parquet-pushdown/ //! -//! # To run: -//! To run the benchmark, use `cargo bench --bench bench_filter_projection`. -//! -//! This benchmark creates a Parquet file in memory with 100K rows and four columns: -//! - int64: random integers generated using a fixed seed (range: 0..100) -//! - float64: random floating-point values generated using a fixed seed (range: 0.0..100.0) -//! - utf8View: random strings (with some empty values and the constant "const"). -//! Randomly produces short strings (3-12 bytes) and long strings (13-20 bytes). -//! - ts: sequential timestamps in milliseconds +//! The benchmark creates an in-memory Parquet file with 100K rows and ten columns. +//! The first four columns are: +//! - int64: random integers (range: 0..100) generated with a fixed seed. +//! - float64: random floating-point values (range: 0.0..100.0) generated with a fixed seed. +//! - utf8View: random strings with some empty values and occasional constant "const" values. +//! - ts: sequential timestamps in milliseconds. //! -//! Filters tested: -//! - utf8View <> '' (non-selective) -//! - utf8View = 'const' (selective) -//! - int64 = 0 (selective) -//! - ts > 50_000 (non-selective) +//! The following six columns (for filtering) are generated to mimic different +//! filter selectivity and clustering patterns: +//! - pt: for Point Lookup – exactly one row is set to "unique_point", all others are random strings. +//! - sel: for Selective Unclustered – exactly 1% of rows (those with i % 100 == 0) are "selected". +//! - mod_clustered: for Moderately Selective Clustered – in each 10K-row block, the first 10 rows are "mod_clustered". +//! - mod_unclustered: for Moderately Selective Unclustered – exactly 10% of rows (those with i % 10 == 1) are "mod_unclustered". +//! - unsel_unclustered: for Unselective Unclustered – exactly 99% of rows (those with i % 100 != 0) are "unsel_unclustered". +//! - unsel_clustered: for Unselective Clustered – in each 10K-row block, rows with an offset >= 1000 are "unsel_clustered". //! -//! Projections tested: -//! - All columns. -//! - All columns except the one used for filtering. - -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; -use rand::{rngs::StdRng, Rng, SeedableRng}; -use std::sync::Arc; -use tempfile::NamedTempFile; +//! As a side note, an additional composite benchmark is provided which demonstrates +//! the performance when applying two filters simultaneously (i.e. chaining row selectors). 
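Clustering matters because the reader turns the Boolean mask into `RowSelection` ranges: contiguous matches collapse into a few long selections, while scattered matches produce many short ones. A rough way to see a mask's clustering (the `true_runs` helper is illustrative, not part of this patch; nulls count as false):

```rust
use arrow::array::{Array, BooleanArray};

/// Illustrative helper: number of contiguous `true` runs in a mask.
/// Clustered filters yield few, long runs (cheap to decode selectively);
/// unclustered filters yield many short runs.
fn true_runs(mask: &BooleanArray) -> usize {
    let mut runs = 0;
    let mut prev = false;
    for i in 0..mask.len() {
        let cur = mask.is_valid(i) && mask.value(i);
        if cur && !prev {
            runs += 1;
        }
        prev = cur;
    }
    runs
}

fn main() {
    // Same 50% selectivity, very different clustering.
    let clustered = BooleanArray::from(vec![true, true, true, false, false, false]);
    let scattered = BooleanArray::from(vec![true, false, true, false, true, false]);
    assert_eq!(true_runs(&clustered), 1);
    assert_eq!(true_runs(&scattered), 3);
}
```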
use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, TimestampMillisecondArray}; use arrow::compute::kernels::cmp::{eq, gt, neq}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; -use arrow::record_batch::{RecordBatch, RecordBatchOptions}; +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; +use arrow::record_batch::RecordBatch; use arrow_array::builder::StringViewBuilder; use arrow_array::StringViewArray; use arrow_cast::pretty::pretty_format_batches; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter}; use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; use parquet::file::properties::WriterProperties; +use rand::{rngs::StdRng, Rng, SeedableRng}; +use std::sync::Arc; +use tempfile::NamedTempFile; use tokio::fs::File; -/// Create a random array for a given field. +/// Generates a random string (either short: 3–11 bytes or long: 13–20 bytes) with 50% probability. +/// This is used to fill non-selected rows in the filter columns. +fn random_string(rng: &mut StdRng) -> String { + let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let is_long = rng.random_bool(0.5); + let len = if is_long { + rng.random_range(13..21) + } else { + rng.random_range(3..12) + }; + (0..len) + .map(|_| charset[rng.random_range(0..charset.len())] as char) + .collect() +} + +/// Create a random array for a given field, generating data with fixed seed reproducibility. +/// - For Int64, random integers in [0, 100). +/// - For Float64, random floats in [0.0, 100.0). +/// - For Utf8View, a mix of empty strings, the constant "const", and random strings. +/// - For Timestamp, sequential timestamps in milliseconds. fn create_random_array( field: &Field, size: usize, @@ -100,25 +114,11 @@ fn create_random_array( for _ in 0..size { let choice = rng.random_range(0..100); if choice < (null_density * 100.0) as u32 { - // Use empty string to represent a null value. builder.append_value(""); } else if choice < 25 { builder.append_value("const"); } else { - let is_long = rng.random_bool(0.5); - let len = if is_long { - rng.random_range(13..21) - } else { - rng.random_range(3..12) - }; - let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - let s: String = (0..len) - .map(|_| { - let idx = rng.random_range(0..charset.len()); - charset[idx] as char - }) - .collect(); - builder.append_value(&s); + builder.append_value(random_string(&mut rng)); } } Ok(Arc::new(builder.finish()) as ArrayRef) @@ -131,28 +131,103 @@ fn create_random_array( } } -/// Create a random RecordBatch from the given schema. -pub fn create_random_batch( - schema: SchemaRef, - size: usize, - null_density: f32, - true_density: f32, -) -> arrow::error::Result { - let columns = schema - .fields() - .iter() - .map(|field| create_random_array(field, size, null_density, true_density)) - .collect::>>()?; - RecordBatch::try_new_with_options( - schema, - columns, - &RecordBatchOptions::new().with_match_field_names(false), - ) +/// Create the "pt" column: one random index is set to "unique_point", the remaining rows are filled with random strings. 
+fn create_filter_array_pt(size: usize) -> ArrayRef { + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(100); + let unique_index = rng.random_range(0..size); + for i in 0..size { + if i == unique_index { + builder.append_value("unique_point"); + } else { + builder.append_value(random_string(&mut rng)); + } + } + Arc::new(builder.finish()) as ArrayRef } -/// Create a RecordBatch with 100K rows and four columns. -fn make_record_batch() -> RecordBatch { - let num_rows = 100_000; +/// Create the "sel" column: exactly 1% of rows (those with index % 100 == 0) are set to "selected", +/// while the other 99% of rows are filled with random strings. +fn create_filter_array_sel(size: usize) -> ArrayRef { + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(101); + for i in 0..size { + if i % 100 == 0 { + builder.append_value("selected"); + } else { + builder.append_value(random_string(&mut rng)); + } + } + Arc::new(builder.finish()) as ArrayRef +} + +/// Create the "mod_clustered" column: in each 10,000-row block, the first 10 rows are set to "mod_clustered" +/// (simulating a clustered filter with 10 rows per block), and the rest are filled with random strings. +fn create_filter_array_mod_clustered(size: usize) -> ArrayRef { + let mut builder = StringViewBuilder::with_capacity(size); + let block_size = 10_000; + let mut rng = StdRng::seed_from_u64(102); + for i in 0..size { + if (i % block_size) < 10 { + builder.append_value("mod_clustered"); + } else { + builder.append_value(random_string(&mut rng)); + } + } + Arc::new(builder.finish()) as ArrayRef +} + +/// Create the "mod_unclustered" column: exactly 10% of rows (those with index % 10 == 1) +/// are set to "mod_unclustered", while the remaining rows receive random strings. +fn create_filter_array_mod_unclustered(size: usize) -> ArrayRef { + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(103); + for i in 0..size { + if i % 10 == 1 { + builder.append_value("mod_unclustered"); + } else { + builder.append_value(random_string(&mut rng)); + } + } + Arc::new(builder.finish()) as ArrayRef +} + +/// Create the "unsel_unclustered" column: exactly 99% of rows (those with index % 100 != 0) +/// are set to "unsel_unclustered", and the remaining 1% get random strings. +fn create_filter_array_unsel_unclustered(size: usize) -> ArrayRef { + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(104); + for i in 0..size { + if i % 100 != 0 { + builder.append_value("unsel_unclustered"); + } else { + builder.append_value(random_string(&mut rng)); + } + } + Arc::new(builder.finish()) as ArrayRef +} + +/// Create the "unsel_clustered" column: in each 10,000-row block, rows with an offset >= 1000 +/// are set to "unsel_clustered" (representing a clustered filter selecting 90% of the rows), +/// while rows with offset < 1000 are filled with random strings. +fn create_filter_array_unsel_clustered(size: usize) -> ArrayRef { + let mut builder = StringViewBuilder::with_capacity(size); + let block_size = 10_000; + let mut rng = StdRng::seed_from_u64(105); + for i in 0..size { + if (i % block_size) >= 1000 { + builder.append_value("unsel_clustered"); + } else { + builder.append_value(random_string(&mut rng)); + } + } + Arc::new(builder.finish()) as ArrayRef +} + +/// Create an extended RecordBatch with 100K rows and ten columns. 
+/// The schema includes the original four columns and the six additional filter columns, +/// whose names have been updated to use "clustered" and "unclustered" as appropriate. +fn create_extended_batch(size: usize) -> RecordBatch { let fields = vec![ Field::new("int64", DataType::Int64, false), Field::new("float64", DataType::Float64, false), @@ -162,10 +237,69 @@ fn make_record_batch() -> RecordBatch { DataType::Timestamp(TimeUnit::Millisecond, None), false, ), + Field::new("pt", DataType::Utf8View, true), + Field::new("sel", DataType::Utf8View, true), + Field::new("mod_clustered", DataType::Utf8View, true), + Field::new("mod_unclustered", DataType::Utf8View, true), + Field::new("unsel_unclustered", DataType::Utf8View, true), + Field::new("unsel_clustered", DataType::Utf8View, true), ]; let schema = Arc::new(Schema::new(fields)); - let batch = create_random_batch(schema, num_rows, 0.2, 0.5).unwrap(); + let int64_array = + create_random_array(&Field::new("int64", DataType::Int64, false), size, 0.0, 0.0).unwrap(); + let float64_array = create_random_array( + &Field::new("float64", DataType::Float64, false), + size, + 0.0, + 0.0, + ) + .unwrap(); + let utf8_array = create_random_array( + &Field::new("utf8View", DataType::Utf8View, true), + size, + 0.2, + 0.5, + ) + .unwrap(); + let ts_array = create_random_array( + &Field::new( + "ts", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + size, + 0.0, + 0.0, + ) + .unwrap(); + + let pt_array = create_filter_array_pt(size); + let sel_array = create_filter_array_sel(size); + let mod_clustered_array = create_filter_array_mod_clustered(size); + let mod_unclustered_array = create_filter_array_mod_unclustered(size); + let unsel_unclustered_array = create_filter_array_unsel_unclustered(size); + let unsel_clustered_array = create_filter_array_unsel_clustered(size); + + let arrays: Vec = vec![ + int64_array, + float64_array, + utf8_array, + ts_array, + pt_array, + sel_array, + mod_clustered_array, + mod_unclustered_array, + unsel_unclustered_array, + unsel_clustered_array, + ]; + RecordBatch::try_new(schema, arrays).unwrap() +} + +/// Create a RecordBatch with 100K rows and print a summary (first 100 rows) to the console. +fn make_record_batch() -> RecordBatch { + let num_rows = 100_000; + let batch = create_extended_batch(num_rows); println!("Batch created with {} rows", num_rows); println!( "First 100 rows:\n{}", @@ -174,12 +308,11 @@ fn make_record_batch() -> RecordBatch { batch } -/// Write the RecordBatch to a temporary Parquet file. +/// Write the RecordBatch to a temporary Parquet file and return the file handle. fn write_parquet_file() -> NamedTempFile { let batch = make_record_batch(); let schema = batch.schema(); let props = WriterProperties::builder().build(); - let file = tempfile::Builder::new() .suffix(".parquet") .tempfile() @@ -193,99 +326,156 @@ fn write_parquet_file() -> NamedTempFile { file } +/// ProjectionCase defines the projection mode for the benchmark: +/// either projecting all columns or excluding the column that is used for filtering. +#[derive(Clone)] +enum ProjectionCase { + AllColumns, + ExcludeFilterColumn, +} + +impl std::fmt::Display for ProjectionCase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ProjectionCase::AllColumns => write!(f, "all_columns"), + ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), + } + } +} + /// FilterType encapsulates the different filter comparisons. 
+/// The variants correspond to the different filter patterns. #[derive(Clone)] enum FilterType { Utf8ViewNonEmpty, Utf8ViewConst, Int64EqZero, TimestampGt, + PointLookup, + SelectiveUnclustered, + ModeratelySelectiveClustered, + ModeratelySelectiveUnclustered, + UnselectiveUnclustered, + UnselectiveClustered, } impl std::fmt::Display for FilterType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - FilterType::Utf8ViewNonEmpty => write!(f, "utf8View <> ''"), - FilterType::Utf8ViewConst => write!(f, "utf8View = 'const'"), - FilterType::Int64EqZero => write!(f, "int64 = 0"), - FilterType::TimestampGt => write!(f, "ts > 50_000"), - } + use FilterType::*; + let s = match self { + Utf8ViewNonEmpty => "utf8View <> ''", + Utf8ViewConst => "utf8View = 'const'", + Int64EqZero => "int64 = 0", + TimestampGt => "ts > 50_000", + PointLookup => "Point Lookup", + SelectiveUnclustered => "1% Unclustered Filter", + ModeratelySelectiveClustered => "10% Clustered Filter", + ModeratelySelectiveUnclustered => "10% Unclustered Filter", + UnselectiveUnclustered => "99% Unclustered Filter", + UnselectiveClustered => "90% Clustered Filter", + }; + write!(f, "{}", s) } } impl FilterType { - /// Filters the given batch according to self using Arrow compute kernels. - /// Returns a BooleanArray where true indicates that the row satisfies the condition. + /// Applies the specified filter on the given record batch, returning a BooleanArray mask. + /// Each filter uses its dedicated column and checks equality against a fixed string. fn filter_batch(&self, batch: &RecordBatch) -> arrow::error::Result { + use FilterType::*; match self { - FilterType::Utf8ViewNonEmpty => { + Utf8ViewNonEmpty => { let array = batch.column(batch.schema().index_of("utf8View").unwrap()); - let string_view_scalar = StringViewArray::new_scalar(""); - let not_equals_empty = neq(array, &string_view_scalar)?; - Ok(not_equals_empty) + let scalar = StringViewArray::new_scalar(""); + neq(array, &scalar) } - FilterType::Utf8ViewConst => { + Utf8ViewConst => { let array = batch.column(batch.schema().index_of("utf8View").unwrap()); - let string_view_scalar = StringViewArray::new_scalar("const"); - let eq_const = eq(array, &string_view_scalar)?; - Ok(eq_const) + let scalar = StringViewArray::new_scalar("const"); + eq(array, &scalar) } - FilterType::Int64EqZero => { + Int64EqZero => { let array = batch.column(batch.schema().index_of("int64").unwrap()); - let eq_zero = eq(array, &Int64Array::new_scalar(0))?; - Ok(eq_zero) + eq(array, &Int64Array::new_scalar(0)) } - FilterType::TimestampGt => { + TimestampGt => { let array = batch.column(batch.schema().index_of("ts").unwrap()); - let gt_thresh = gt(array, &TimestampMillisecondArray::new_scalar(50_000))?; - Ok(gt_thresh) + gt(array, &TimestampMillisecondArray::new_scalar(50_000)) + } + PointLookup => { + let array = batch.column(batch.schema().index_of("pt").unwrap()); + let scalar = StringViewArray::new_scalar("unique_point"); + eq(array, &scalar) + } + SelectiveUnclustered => { + let array = batch.column(batch.schema().index_of("sel").unwrap()); + let scalar = StringViewArray::new_scalar("selected"); + eq(array, &scalar) + } + ModeratelySelectiveClustered => { + let array = batch.column(batch.schema().index_of("mod_clustered").unwrap()); + let scalar = StringViewArray::new_scalar("mod_clustered"); + eq(array, &scalar) + } + ModeratelySelectiveUnclustered => { + let array = batch.column(batch.schema().index_of("mod_unclustered").unwrap()); + let scalar = 
StringViewArray::new_scalar("mod_unclustered"); + eq(array, &scalar) + } + UnselectiveUnclustered => { + let array = batch.column(batch.schema().index_of("unsel_unclustered").unwrap()); + let scalar = StringViewArray::new_scalar("unsel_unclustered"); + eq(array, &scalar) + } + UnselectiveClustered => { + let array = batch.column(batch.schema().index_of("unsel_clustered").unwrap()); + let scalar = StringViewArray::new_scalar("unsel_clustered"); + eq(array, &scalar) } } } } -/// ProjectionCase defines the projection mode. -#[derive(Clone)] -enum ProjectionCase { - AllColumns, - ExcludeFilterColumn, -} - -impl std::fmt::Display for ProjectionCase { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - ProjectionCase::AllColumns => write!(f, "all_columns"), - ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), - } - } -} - +/// Benchmark filters and projections by reading the Parquet file. +/// This benchmark iterates over all individual filter types and two projection cases. +/// It measures the time to read and filter the Parquet file according to each scenario. fn benchmark_filters_and_projections(c: &mut Criterion) { let parquet_file = write_parquet_file(); - - let filter_types: Vec = vec![ + let filter_types = vec![ FilterType::Utf8ViewNonEmpty, FilterType::Utf8ViewConst, FilterType::Int64EqZero, FilterType::TimestampGt, + FilterType::PointLookup, + FilterType::SelectiveUnclustered, + FilterType::ModeratelySelectiveClustered, + FilterType::ModeratelySelectiveUnclustered, + FilterType::UnselectiveUnclustered, + FilterType::UnselectiveClustered, ]; - let projection_cases = vec![ ProjectionCase::AllColumns, ProjectionCase::ExcludeFilterColumn, ]; - let mut group = c.benchmark_group("arrow_reader_row_filter"); - for filter_type in filter_types.iter().cloned() { + for filter_type in filter_types.clone() { for proj_case in &projection_cases { - // All column indices: [0: int64, 1: float64, 2: utf8View, 3: ts] - let all_indices = vec![0, 1, 2, 3]; + // All indices corresponding to the 10 columns. + let all_indices = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + // Determine the filter column index based on the filter type. let filter_col = match filter_type { FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => 2, FilterType::Int64EqZero => 0, FilterType::TimestampGt => 3, + FilterType::PointLookup => 4, + FilterType::SelectiveUnclustered => 5, + FilterType::ModeratelySelectiveClustered => 6, + FilterType::ModeratelySelectiveUnclustered => 7, + FilterType::UnselectiveUnclustered => 8, + FilterType::UnselectiveClustered => 9, }; + // For the projection, either select all columns or exclude the filter column. let output_projection: Vec = match proj_case { ProjectionCase::AllColumns => all_indices.clone(), ProjectionCase::ExcludeFilterColumn => all_indices @@ -293,25 +483,16 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { .filter(|i| *i != filter_col) .collect(), }; - // For predicate pushdown, include the filter column. 
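Since every `FilterType` above reads exactly one column, the predicate's `ProjectionMask` can name just that root column: only it is decoded while the filter mask is built, and the output projection is applied afterwards. A condensed sketch (the `masks` helper and its `schema_descr` argument are illustrative, not from this patch):

```rust
use parquet::arrow::ProjectionMask;
use parquet::schema::types::SchemaDescriptor;

/// Illustrative: a narrow mask for predicate evaluation plus a full
/// mask for the output projection.
fn masks(schema_descr: &SchemaDescriptor, filter_col: usize) -> (ProjectionMask, ProjectionMask) {
    let pred_mask = ProjectionMask::roots(schema_descr, vec![filter_col]);
    let output_mask = ProjectionMask::all();
    (pred_mask, output_mask)
}
```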
- let predicate_projection: Vec = match filter_type { - FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => vec![2], - FilterType::Int64EqZero => vec![0], - FilterType::TimestampGt => vec![3], - }; let bench_id = BenchmarkId::new(format!("filter: {} proj: {}", filter_type, proj_case), ""); - group.bench_function(bench_id, |b| { let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); b.iter(|| { - // Clone filter_type inside the closure to avoid moving it let filter_type_inner = filter_type.clone(); - rt.block_on(async { let file = File::open(parquet_file.path()).await.unwrap(); let options = ArrowReaderOptions::new().with_page_index(true); @@ -320,29 +501,21 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { .await .unwrap() .with_batch_size(8192); - let file_metadata = builder.metadata().file_metadata().clone(); let mask = ProjectionMask::roots( file_metadata.schema_descr(), output_projection.clone(), ); - let pred_mask = ProjectionMask::roots( - file_metadata.schema_descr(), - predicate_projection.clone(), - ); - + let pred_mask = + ProjectionMask::roots(file_metadata.schema_descr(), vec![filter_col]); let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| { - // Clone filter_type within the closure - let filter_type_inner = filter_type_inner.clone(); Ok(filter_type_inner.filter_batch(&batch).unwrap()) }); - let stream = builder .with_projection(mask) .with_row_filter(RowFilter::new(vec![Box::new(filter)])) .build() .unwrap(); - stream.try_collect::>().await.unwrap(); }) }); @@ -351,5 +524,75 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { } } -criterion_group!(benches, benchmark_filters_and_projections); +/// Benchmark composite filters by applying two filters simultaneously. +/// This benchmark creates a composite row filter that ANDs two predicates: +/// one on the "sel" column (exactly 1% selected) and one on the "mod_clustered" column +/// (first 10 rows in each 10K block), then measures the performance of the combined filtering. +fn benchmark_composite_filters(c: &mut Criterion) { + let parquet_file = write_parquet_file(); + let mut group = c.benchmark_group("composite_filter"); + + // For composite filtering, we choose: + // - Filter1: SelectiveUnclustered on column "sel" (index 5) + // - Filter2: ModeratelySelectiveClustered on column "mod_clustered" (index 6) + // These filters are applied sequentially (logical AND). + let filter1_col = 5; + let filter2_col = 6; + let bench_id = BenchmarkId::new("Composite Filter: sel AND mod_clustered", ""); + group.bench_function(bench_id, |b| { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + b.iter(|| { + rt.block_on(async { + let file = File::open(parquet_file.path()).await.unwrap(); + let options = ArrowReaderOptions::new().with_page_index(true); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(file, options) + .await + .unwrap() + .with_batch_size(8192); + let file_metadata = builder.metadata().file_metadata().clone(); + // For projection, we select all columns. + let all_indices = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let mask = ProjectionMask::roots(file_metadata.schema_descr(), all_indices.clone()); + let pred_mask1 = + ProjectionMask::roots(file_metadata.schema_descr(), vec![filter1_col]); + let pred_mask2 = + ProjectionMask::roots(file_metadata.schema_descr(), vec![filter2_col]); + + // Create first filter: applies the "sel" filter. 
+ let filter1 = ArrowPredicateFn::new(pred_mask1, move |batch: RecordBatch| { + let scalar = StringViewArray::new_scalar("selected"); + eq( + batch.column(batch.schema().index_of("sel").unwrap()), + &scalar, + ) + }); + // Create second filter: applies the "mod_clustered" filter. + let filter2 = ArrowPredicateFn::new(pred_mask2, move |batch: RecordBatch| { + let scalar = StringViewArray::new_scalar("mod_clustered"); + eq( + batch.column(batch.schema().index_of("mod_clustered").unwrap()), + &scalar, + ) + }); + let composite_filter = RowFilter::new(vec![Box::new(filter1), Box::new(filter2)]); + + let stream = builder + .with_projection(mask) + .with_row_filter(composite_filter) + .build() + .unwrap(); + stream.try_collect::>().await.unwrap(); + }) + }); + }); +} + +criterion_group!( + benches, + benchmark_filters_and_projections, + benchmark_composite_filters +); criterion_main!(benches); From 16bc1bfaff7092b05667d820a29d9e8b56cd317b Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sat, 12 Apr 2025 14:49:28 +0800 Subject: [PATCH 13/21] Add float(half set) and int(full set) change --- parquet/benches/arrow_reader_row_filter.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 12fd02e23c93..2f06e07fb897 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -349,7 +349,8 @@ impl std::fmt::Display for ProjectionCase { enum FilterType { Utf8ViewNonEmpty, Utf8ViewConst, - Int64EqZero, + Int64GTZero, + Float64GTHalf, TimestampGt, PointLookup, SelectiveUnclustered, @@ -365,7 +366,8 @@ impl std::fmt::Display for FilterType { let s = match self { Utf8ViewNonEmpty => "utf8View <> ''", Utf8ViewConst => "utf8View = 'const'", - Int64EqZero => "int64 = 0", + Int64GTZero => "int64 > 0", + Float64GTHalf => "float64 > 50.0", TimestampGt => "ts > 50_000", PointLookup => "Point Lookup", SelectiveUnclustered => "1% Unclustered Filter", @@ -394,9 +396,13 @@ impl FilterType { let scalar = StringViewArray::new_scalar("const"); eq(array, &scalar) } - Int64EqZero => { + Int64GTZero => { let array = batch.column(batch.schema().index_of("int64").unwrap()); - eq(array, &Int64Array::new_scalar(0)) + gt(array, &Int64Array::new_scalar(0)) + } + Float64GTHalf => { + let array = batch.column(batch.schema().index_of("float64").unwrap()); + gt(array, &Float64Array::new_scalar(50.0)) } TimestampGt => { let array = batch.column(batch.schema().index_of("ts").unwrap()); @@ -444,7 +450,8 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { let filter_types = vec![ FilterType::Utf8ViewNonEmpty, FilterType::Utf8ViewConst, - FilterType::Int64EqZero, + FilterType::Int64GTZero, + FilterType::Float64GTHalf, FilterType::TimestampGt, FilterType::PointLookup, FilterType::SelectiveUnclustered, @@ -466,7 +473,8 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { // Determine the filter column index based on the filter type. 
let filter_col = match filter_type { FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => 2, - FilterType::Int64EqZero => 0, + FilterType::Int64GTZero => 0, + FilterType::Float64GTHalf => 1, FilterType::TimestampGt => 3, FilterType::PointLookup => 4, FilterType::SelectiveUnclustered => 5, From 7638c41032a1a7310f912571b2eb171a546db5ce Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sun, 13 Apr 2025 23:28:32 +0800 Subject: [PATCH 14/21] Address comments --- parquet/benches/arrow_reader_row_filter.rs | 501 +++++++-------------- 1 file changed, 158 insertions(+), 343 deletions(-) diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 2f06e07fb897..e2b67b7c8904 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -51,11 +51,10 @@ //! - unsel_unclustered: for Unselective Unclustered – exactly 99% of rows (those with i % 100 != 0) are "unsel_unclustered". //! - unsel_clustered: for Unselective Clustered – in each 10K-row block, rows with an offset >= 1000 are "unsel_clustered". //! -//! As a side note, an additional composite benchmark is provided which demonstrates -//! the performance when applying two filters simultaneously (i.e. chaining row selectors). use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, TimestampMillisecondArray}; -use arrow::compute::kernels::cmp::{eq, gt, neq}; +use arrow::compute::and; +use arrow::compute::kernels::cmp::{eq, gt, lt, neq}; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; use arrow_array::builder::StringViewBuilder; @@ -71,8 +70,8 @@ use std::sync::Arc; use tempfile::NamedTempFile; use tokio::fs::File; -/// Generates a random string (either short: 3–11 bytes or long: 13–20 bytes) with 50% probability. -/// This is used to fill non-selected rows in the filter columns. +/// Generates a random string. Has a 50% chance to generate a short string (3–11 characters) +/// or a long string (13–20 characters). fn random_string(rng: &mut StdRng) -> String { let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; let is_long = rng.random_bool(0.5); @@ -86,120 +85,34 @@ fn random_string(rng: &mut StdRng) -> String { .collect() } -/// Create a random array for a given field, generating data with fixed seed reproducibility. -/// - For Int64, random integers in [0, 100). -/// - For Float64, random floats in [0.0, 100.0). -/// - For Utf8View, a mix of empty strings, the constant "const", and random strings. -/// - For Timestamp, sequential timestamps in milliseconds. 
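The "float(half set) and int(full set)" change above follows from the value ranges: with int64 drawn uniformly from 0..100, `int64 > 0` keeps about 99% of rows, and `float64 > 50.0` keeps about half. A quick illustrative check (not part of the patch):

```rust
use rand::{rngs::StdRng, Rng, SeedableRng};

fn main() {
    // Same distribution as the benchmark's int64 column.
    let mut rng = StdRng::seed_from_u64(42);
    let ints: Vec<i64> = (0..100_000).map(|_| rng.random_range(0..100)).collect();
    let kept = ints.iter().filter(|&&v| v > 0).count() as f64 / 100_000.0;
    // Only v == 0 is excluded, i.e. roughly 1 in 100 values.
    assert!((kept - 0.99).abs() < 0.01);
}
```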
-fn create_random_array( - field: &Field, - size: usize, - null_density: f32, - _true_density: f32, -) -> arrow::error::Result { - match field.data_type() { - DataType::Int64 => { - let mut rng = StdRng::seed_from_u64(42); - let values: Vec = (0..size).map(|_| rng.random_range(0..100)).collect(); - Ok(Arc::new(Int64Array::from(values)) as ArrayRef) - } - DataType::Float64 => { - let mut rng = StdRng::seed_from_u64(43); - let values: Vec = (0..size).map(|_| rng.random_range(0.0..100.0)).collect(); - Ok(Arc::new(Float64Array::from(values)) as ArrayRef) - } - DataType::Utf8View => { - let mut builder = StringViewBuilder::with_capacity(size); - let mut rng = StdRng::seed_from_u64(44); - for _ in 0..size { - let choice = rng.random_range(0..100); - if choice < (null_density * 100.0) as u32 { - builder.append_value(""); - } else if choice < 25 { - builder.append_value("const"); - } else { - builder.append_value(random_string(&mut rng)); - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - let values: Vec = (0..size as i64).collect(); - Ok(Arc::new(TimestampMillisecondArray::from(values)) as ArrayRef) - } - _ => unimplemented!("Field type not supported in create_random_array"), - } -} - -/// Create the "pt" column: one random index is set to "unique_point", the remaining rows are filled with random strings. -fn create_filter_array_pt(size: usize) -> ArrayRef { - let mut builder = StringViewBuilder::with_capacity(size); - let mut rng = StdRng::seed_from_u64(100); +/// Creates an int64 array of a given size with random integers in [0, 100). +/// Then, it overwrites a single random index with 9999 to serve as the unique value for point lookup. +fn create_int64_array(size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(42); + let mut values: Vec = (0..size).map(|_| rng.random_range(0..100)).collect(); let unique_index = rng.random_range(0..size); - for i in 0..size { - if i == unique_index { - builder.append_value("unique_point"); - } else { - builder.append_value(random_string(&mut rng)); - } - } - Arc::new(builder.finish()) as ArrayRef -} - -/// Create the "sel" column: exactly 1% of rows (those with index % 100 == 0) are set to "selected", -/// while the other 99% of rows are filled with random strings. -fn create_filter_array_sel(size: usize) -> ArrayRef { - let mut builder = StringViewBuilder::with_capacity(size); - let mut rng = StdRng::seed_from_u64(101); - for i in 0..size { - if i % 100 == 0 { - builder.append_value("selected"); - } else { - builder.append_value(random_string(&mut rng)); - } - } - Arc::new(builder.finish()) as ArrayRef + values[unique_index] = 9999; // Unique value for point lookup + Arc::new(Int64Array::from(values)) as ArrayRef } -/// Create the "mod_clustered" column: in each 10,000-row block, the first 10 rows are set to "mod_clustered" -/// (simulating a clustered filter with 10 rows per block), and the rest are filled with random strings. 
-fn create_filter_array_mod_clustered(size: usize) -> ArrayRef { - let mut builder = StringViewBuilder::with_capacity(size); - let block_size = 10_000; - let mut rng = StdRng::seed_from_u64(102); - for i in 0..size { - if (i % block_size) < 10 { - builder.append_value("mod_clustered"); - } else { - builder.append_value(random_string(&mut rng)); - } - } - Arc::new(builder.finish()) as ArrayRef -} - -/// Create the "mod_unclustered" column: exactly 10% of rows (those with index % 10 == 1) -/// are set to "mod_unclustered", while the remaining rows receive random strings. -fn create_filter_array_mod_unclustered(size: usize) -> ArrayRef { - let mut builder = StringViewBuilder::with_capacity(size); - let mut rng = StdRng::seed_from_u64(103); - for i in 0..size { - if i % 10 == 1 { - builder.append_value("mod_unclustered"); - } else { - builder.append_value(random_string(&mut rng)); - } - } - Arc::new(builder.finish()) as ArrayRef +/// Creates a float64 array of a given size with random floats in [0.0, 100.0). +fn create_float64_array(size: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(43); + let values: Vec = (0..size).map(|_| rng.random_range(0.0..100.0)).collect(); + Arc::new(Float64Array::from(values)) as ArrayRef } -/// Create the "unsel_unclustered" column: exactly 99% of rows (those with index % 100 != 0) -/// are set to "unsel_unclustered", and the remaining 1% get random strings. -fn create_filter_array_unsel_unclustered(size: usize) -> ArrayRef { +/// Creates a utf8View array of a given size with random strings. +/// Now, this column is used in one filter case. +fn create_utf8_view_array(size: usize, null_density: f32) -> ArrayRef { let mut builder = StringViewBuilder::with_capacity(size); - let mut rng = StdRng::seed_from_u64(104); - for i in 0..size { - if i % 100 != 0 { - builder.append_value("unsel_unclustered"); + let mut rng = StdRng::seed_from_u64(44); + for _ in 0..size { + let choice = rng.random_range(0..100); + if choice < (null_density * 100.0) as u32 { + builder.append_value(""); + } else if choice < 25 { + builder.append_value("const"); } else { builder.append_value(random_string(&mut rng)); } @@ -207,27 +120,15 @@ fn create_filter_array_unsel_unclustered(size: usize) -> ArrayRef { Arc::new(builder.finish()) as ArrayRef } -/// Create the "unsel_clustered" column: in each 10,000-row block, rows with an offset >= 1000 -/// are set to "unsel_clustered" (representing a clustered filter selecting 90% of the rows), -/// while rows with offset < 1000 are filled with random strings. -fn create_filter_array_unsel_clustered(size: usize) -> ArrayRef { - let mut builder = StringViewBuilder::with_capacity(size); - let block_size = 10_000; - let mut rng = StdRng::seed_from_u64(105); - for i in 0..size { - if (i % block_size) >= 1000 { - builder.append_value("unsel_clustered"); - } else { - builder.append_value(random_string(&mut rng)); - } - } - Arc::new(builder.finish()) as ArrayRef +/// Creates a ts (timestamp) array of a given size. Each value is computed as i % 10_000, +/// which simulates repeating blocks (each block of 10,000) to model clustered patterns. +fn create_ts_array(size: usize) -> ArrayRef { + let values: Vec = (0..size).map(|i| (i % 10_000) as i64).collect(); + Arc::new(TimestampMillisecondArray::from(values)) as ArrayRef } -/// Create an extended RecordBatch with 100K rows and ten columns. 
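The new `create_ts_array` is what bakes clustering into the data: with `ts = i % 10_000`, any range predicate selects the same contiguous run inside every 10K-row block. A sketch of the arithmetic (the `ts >= 9_000` threshold is an illustrative choice, not taken from the patch):

```rust
fn main() {
    let num_rows = 100_000usize;
    // ts repeats 0..10_000 in each block, so `ts >= 9_000` matches one
    // contiguous run of 1_000 rows per block: 10 clustered runs in all,
    // i.e. a 10% clustered selection.
    let ts: Vec<i64> = (0..num_rows).map(|i| (i % 10_000) as i64).collect();
    let matched = ts.iter().filter(|&&v| v >= 9_000).count();
    assert_eq!(matched, 10_000);
}
```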
-/// The schema includes the original four columns and the six additional filter columns,
-/// whose names have been updated to use "clustered" and "unclustered" as appropriate.
-fn create_extended_batch(size: usize) -> RecordBatch {
+/// Creates a RecordBatch with 100K rows and 4 columns: int64, float64, utf8View, and ts.
+fn create_record_batch(size: usize) -> RecordBatch {
     let fields = vec![
         Field::new("int64", DataType::Int64, false),
         Field::new("float64", DataType::Float64, false),
@@ -237,80 +138,26 @@ fn create_extended_batch(size: usize) -> RecordBatch {
             DataType::Timestamp(TimeUnit::Millisecond, None),
             false,
         ),
-        Field::new("pt", DataType::Utf8View, true),
-        Field::new("sel", DataType::Utf8View, true),
-        Field::new("mod_clustered", DataType::Utf8View, true),
-        Field::new("mod_unclustered", DataType::Utf8View, true),
-        Field::new("unsel_unclustered", DataType::Utf8View, true),
-        Field::new("unsel_clustered", DataType::Utf8View, true),
     ];
     let schema = Arc::new(Schema::new(fields));
 
-    let int64_array =
-        create_random_array(&Field::new("int64", DataType::Int64, false), size, 0.0, 0.0).unwrap();
-    let float64_array = create_random_array(
-        &Field::new("float64", DataType::Float64, false),
-        size,
-        0.0,
-        0.0,
-    )
-    .unwrap();
-    let utf8_array = create_random_array(
-        &Field::new("utf8View", DataType::Utf8View, true),
-        size,
-        0.2,
-        0.5,
-    )
-    .unwrap();
-    let ts_array = create_random_array(
-        &Field::new(
-            "ts",
-            DataType::Timestamp(TimeUnit::Millisecond, None),
-            false,
-        ),
-        size,
-        0.0,
-        0.0,
-    )
-    .unwrap();
-
-    let pt_array = create_filter_array_pt(size);
-    let sel_array = create_filter_array_sel(size);
-    let mod_clustered_array = create_filter_array_mod_clustered(size);
-    let mod_unclustered_array = create_filter_array_mod_unclustered(size);
-    let unsel_unclustered_array = create_filter_array_unsel_unclustered(size);
-    let unsel_clustered_array = create_filter_array_unsel_clustered(size);
+    let int64_array = create_int64_array(size);
+    let float64_array = create_float64_array(size);
+    let utf8_array = create_utf8_view_array(size, 0.2);
+    let ts_array = create_ts_array(size);
 
-    let arrays: Vec<ArrayRef> = vec![
-        int64_array,
-        float64_array,
-        utf8_array,
-        ts_array,
-        pt_array,
-        sel_array,
-        mod_clustered_array,
-        mod_unclustered_array,
-        unsel_unclustered_array,
-        unsel_clustered_array,
-    ];
+    let arrays: Vec<ArrayRef> = vec![int64_array, float64_array, utf8_array, ts_array];
     RecordBatch::try_new(schema, arrays).unwrap()
 }
 
-/// Create a RecordBatch with 100K rows and print a summary (first 100 rows) to the console.
-fn make_record_batch() -> RecordBatch {
-    let num_rows = 100_000;
-    let batch = create_extended_batch(num_rows);
-    println!("Batch created with {} rows", num_rows);
+/// Writes the RecordBatch to a temporary Parquet file and returns the file handle.
+fn write_parquet_file() -> NamedTempFile {
+    let batch = create_record_batch(100_000);
+    println!("Batch created with {} rows", 100_000);
     println!(
         "First 100 rows:\n{}",
         pretty_format_batches(&[batch.clone().slice(0, 100)]).unwrap()
     );
-    batch
-}
-
-/// Write the RecordBatch to a temporary Parquet file and return the file handle.
-fn write_parquet_file() -> NamedTempFile {
-    let batch = make_record_batch();
     let schema = batch.schema();
     let props = WriterProperties::builder().build();
     let file = tempfile::Builder::new()
@@ -347,96 +194,139 @@ impl std::fmt::Display for ProjectionCase {
 
 /// The variants correspond to the different filter patterns.
#[derive(Clone)]
 enum FilterType {
-    Utf8ViewNonEmpty,
-    Utf8ViewConst,
-    Int64GTZero,
-    Float64GTHalf,
-    TimestampGt,
+    /// Here are the 6 filter types:
+    /// ┌───────────────┐ ┌───────────────┐
+    /// │ │ │ │
+    /// │ │ │ ... │
+    /// │ │ │ │
+    /// │ │ │ │
+    /// │ ... │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │ │
+    /// │ │ │ ... │
+    /// │ │ │ │
+    /// │ │ │ │
+    /// └───────────────┘ └───────────────┘
+    ///
+    /// "Point Lookup": selects a single row
+    /// (1 RowSelection of 1 row)
+    ///
+    /// ┌───────────────┐ ┌───────────────┐
+    /// │ ... │ │ │
+    /// │ │ │ │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │
+    /// │ │ │ ... │
+    /// │ │ │ │
+    /// │ │ │ │
+    /// │ ... │ │ │
+    /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │ │
+    /// └───────────────┘ └───────────────┘
+    /// selective (1%) unclustered filter
+    /// (~1,000 RowSelections of ~1 row each)
+    ///
+    ///
+    /// ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐
+    /// │ ... │ │ │ │ │ │ │
+    /// │ │ │ │ │ │ │ │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ ... │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ │ │
+    /// │ │ │ │ │ ... │ │ │
+    /// │ │ │ ... │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ ... │ │ │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │ │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// └───────────────┘ └───────────────┘ └───────────────┘ └───────────────┘
+    /// moderately selective (10%) unclustered filter moderately selective (10%) clustered filter
+    /// (10000 RowSelection of 10 rows each) (10 RowSelections of 10,000 rows each)
+    /// ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ ... │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ ... │ │ │
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ └───────────────┘ └───────────────┘
+    /// └───────────────┘ └───────────────┘
+    /// unselective (99%) unclustered filter unselective (90%) clustered filter
+    /// (99,000 RowSelections of 10 rows each) (99 RowSelection of 10,000 rows each)
     PointLookup,
     SelectiveUnclustered,
     ModeratelySelectiveClustered,
     ModeratelySelectiveUnclustered,
     UnselectiveUnclustered,
     UnselectiveClustered,
+    /// The following are Composite and Utf8ViewNonEmpty filters, which are additional to the above 6 filters.
+    Composite,
+    Utf8ViewNonEmpty,
 }
 
 impl std::fmt::Display for FilterType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use FilterType::*;
         let s = match self {
-            Utf8ViewNonEmpty => "utf8View <> ''",
-            Utf8ViewConst => "utf8View = 'const'",
-            Int64GTZero => "int64 > 0",
-            Float64GTHalf => "float64 > 50.0",
-            TimestampGt => "ts > 50_000",
-            PointLookup => "Point Lookup",
-            SelectiveUnclustered => "1% Unclustered Filter",
-            ModeratelySelectiveClustered => "10% Clustered Filter",
-            ModeratelySelectiveUnclustered => "10% Unclustered Filter",
-            UnselectiveUnclustered => "99% Unclustered Filter",
-            UnselectiveClustered => "90% Clustered Filter",
+            FilterType::PointLookup => "int64 == 9999",
+            FilterType::SelectiveUnclustered => "float64 > 99.0",
+            FilterType::ModeratelySelectiveClustered => "ts >= 9000",
+            FilterType::ModeratelySelectiveUnclustered => "int64 > 90",
+            FilterType::UnselectiveUnclustered => "float64 < 99.0",
+            FilterType::UnselectiveClustered => "ts < 9000",
+            FilterType::Composite => "float64 > 99.0 AND ts >= 9000",
+            FilterType::Utf8ViewNonEmpty => "utf8View <> ''",
        };
        write!(f, "{}", s)
    }
 }
 
 impl FilterType {
-    /// Applies the specified filter on the given record batch, returning a BooleanArray mask.
-    /// Each filter uses its dedicated column and checks equality against a fixed string.
+    /// Applies the specified filter on the given RecordBatch and returns a BooleanArray mask.
     fn filter_batch(&self, batch: &RecordBatch) -> arrow::error::Result<BooleanArray> {
-        use FilterType::*;
         match self {
-            Utf8ViewNonEmpty => {
-                let array = batch.column(batch.schema().index_of("utf8View").unwrap());
-                let scalar = StringViewArray::new_scalar("");
-                neq(array, &scalar)
-            }
-            Utf8ViewConst => {
-                let array = batch.column(batch.schema().index_of("utf8View").unwrap());
-                let scalar = StringViewArray::new_scalar("const");
-                eq(array, &scalar)
-            }
-            Int64GTZero => {
+            // Point Lookup on int64 column
+            FilterType::PointLookup => {
                 let array = batch.column(batch.schema().index_of("int64").unwrap());
-                gt(array, &Int64Array::new_scalar(0))
+                let scalar = Int64Array::new_scalar(9999);
+                eq(array, &scalar)
             }
-            Float64GTHalf => {
+            // Selective Unclustered on float64 column: float64 > 99.0
+            FilterType::SelectiveUnclustered => {
                 let array = batch.column(batch.schema().index_of("float64").unwrap());
-                gt(array, &Float64Array::new_scalar(50.0))
+                let scalar = Float64Array::new_scalar(99.0);
+                gt(array, &scalar)
             }
-            TimestampGt => {
+            // Moderately Selective Clustered on ts column: ts >= 9000 (implemented as > 8999)
+            FilterType::ModeratelySelectiveClustered => {
                 let array = batch.column(batch.schema().index_of("ts").unwrap());
-                gt(array, &TimestampMillisecondArray::new_scalar(50_000))
-            }
-            PointLookup => {
-                let array = batch.column(batch.schema().index_of("pt").unwrap());
-                let scalar = StringViewArray::new_scalar("unique_point");
-                eq(array, &scalar)
+                gt(array, &TimestampMillisecondArray::new_scalar(8999))
             }
-            SelectiveUnclustered => {
-                let array = batch.column(batch.schema().index_of("sel").unwrap());
-                let scalar = StringViewArray::new_scalar("selected");
-                eq(array, &scalar)
+            // Moderately Selective Unclustered on int64 column: int64 > 90
+            FilterType::ModeratelySelectiveUnclustered => {
+                let array = batch.column(batch.schema().index_of("int64").unwrap());
+                let scalar = Int64Array::new_scalar(90);
+                gt(array, &scalar)
             }
-            ModeratelySelectiveClustered => {
-                let array = batch.column(batch.schema().index_of("mod_clustered").unwrap());
-                let scalar = 
StringViewArray::new_scalar("mod_clustered");
-                eq(array, &scalar)
+            // Unselective Unclustered on float64 column: float64 < 99.0 (NOT (float64 > 99.0))
+            FilterType::UnselectiveUnclustered => {
+                let array = batch.column(batch.schema().index_of("float64").unwrap());
+                lt(array, &Float64Array::new_scalar(99.0))
             }
-            ModeratelySelectiveUnclustered => {
-                let array = batch.column(batch.schema().index_of("mod_unclustered").unwrap());
-                let scalar = StringViewArray::new_scalar("mod_unclustered");
-                eq(array, &scalar)
+            // Unselective Clustered on ts column: ts < 9000
+            FilterType::UnselectiveClustered => {
+                let array = batch.column(batch.schema().index_of("ts").unwrap());
+                lt(array, &TimestampMillisecondArray::new_scalar(9000))
             }
-            UnselectiveUnclustered => {
-                let array = batch.column(batch.schema().index_of("unsel_unclustered").unwrap());
-                let scalar = StringViewArray::new_scalar("unsel_unclustered");
-                eq(array, &scalar)
+            // Composite filter: logical AND of (float64 > 99.0) and (ts >= 9000)
+            FilterType::Composite => {
+                let mask1 = FilterType::SelectiveUnclustered.filter_batch(batch)?;
+                let mask2 = FilterType::ModeratelySelectiveClustered.filter_batch(batch)?;
+                and(&mask1, &mask2)
             }
-            UnselectiveClustered => {
-                let array = batch.column(batch.schema().index_of("unsel_clustered").unwrap());
-                let scalar = StringViewArray::new_scalar("unsel_clustered");
-                eq(array, &scalar)
+            // Utf8ViewNonEmpty: selects rows where the utf8View column is not an empty string.
+            FilterType::Utf8ViewNonEmpty => {
+                let array = batch.column(batch.schema().index_of("utf8View").unwrap());
+                let scalar = StringViewArray::new_scalar("");
+                neq(array, &scalar)
             }
         }
     }
@@ -448,17 +338,14 @@ impl FilterType {
 fn benchmark_filters_and_projections(c: &mut Criterion) {
     let parquet_file = write_parquet_file();
     let filter_types = vec![
-        FilterType::Utf8ViewNonEmpty,
-        FilterType::Utf8ViewConst,
-        FilterType::Int64GTZero,
-        FilterType::Float64GTHalf,
-        FilterType::TimestampGt,
         FilterType::PointLookup,
         FilterType::SelectiveUnclustered,
         FilterType::ModeratelySelectiveClustered,
         FilterType::ModeratelySelectiveUnclustered,
         FilterType::UnselectiveUnclustered,
         FilterType::UnselectiveClustered,
+        FilterType::Utf8ViewNonEmpty,
+        FilterType::Composite,
     ];
     let projection_cases = vec![
         ProjectionCase::AllColumns,
@@ -469,19 +356,17 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
     for filter_type in filter_types.clone() {
         for proj_case in &projection_cases {
             // All indices corresponding to the 4 columns.
-            let all_indices = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+            let all_indices = vec![0, 1, 2, 3];
             // Determine the filter column index based on the filter type.
let filter_col = match filter_type {
-                FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewConst => 2,
-                FilterType::Int64GTZero => 0,
-                FilterType::Float64GTHalf => 1,
-                FilterType::TimestampGt => 3,
-                FilterType::PointLookup => 4,
-                FilterType::SelectiveUnclustered => 5,
-                FilterType::ModeratelySelectiveClustered => 6,
-                FilterType::ModeratelySelectiveUnclustered => 7,
-                FilterType::UnselectiveUnclustered => 8,
-                FilterType::UnselectiveClustered => 9,
+                FilterType::PointLookup => 0,
+                FilterType::SelectiveUnclustered => 1,
+                FilterType::ModeratelySelectiveClustered => 3,
+                FilterType::ModeratelySelectiveUnclustered => 0,
+                FilterType::UnselectiveUnclustered => 1,
+                FilterType::UnselectiveClustered => 3,
+                FilterType::Composite => 1, // Use float64 column as representative for composite
+                FilterType::Utf8ViewNonEmpty => 2,
            };
             // For the projection, either select all columns or exclude the filter column.
             let output_projection: Vec<usize> = match proj_case {
@@ -532,75 +417,5 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
     }
 }
 
-/// Benchmark composite filters by applying two filters simultaneously.
-/// This benchmark creates a composite row filter that ANDs two predicates:
-/// one on the "sel" column (exactly 1% selected) and one on the "mod_clustered" column
-/// (first 10 rows in each 10K block), then measures the performance of the combined filtering.
-fn benchmark_composite_filters(c: &mut Criterion) {
-    let parquet_file = write_parquet_file();
-    let mut group = c.benchmark_group("composite_filter");
-
-    // For composite filtering, we choose:
-    // - Filter1: SelectiveUnclustered on column "sel" (index 5)
-    // - Filter2: ModeratelySelectiveClustered on column "mod_clustered" (index 6)
-    // These filters are applied sequentially (logical AND).
-    let filter1_col = 5;
-    let filter2_col = 6;
-    let bench_id = BenchmarkId::new("Composite Filter: sel AND mod_clustered", "");
-    group.bench_function(bench_id, |b| {
-        let rt = tokio::runtime::Builder::new_multi_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        b.iter(|| {
-            rt.block_on(async {
-                let file = File::open(parquet_file.path()).await.unwrap();
-                let options = ArrowReaderOptions::new().with_page_index(true);
-                let builder = ParquetRecordBatchStreamBuilder::new_with_options(file, options)
-                    .await
-                    .unwrap()
-                    .with_batch_size(8192);
-                let file_metadata = builder.metadata().file_metadata().clone();
-                // For projection, we select all columns.
-                let all_indices = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
-                let mask = ProjectionMask::roots(file_metadata.schema_descr(), all_indices.clone());
-                let pred_mask1 =
-                    ProjectionMask::roots(file_metadata.schema_descr(), vec![filter1_col]);
-                let pred_mask2 =
-                    ProjectionMask::roots(file_metadata.schema_descr(), vec![filter2_col]);
-
-                // Create first filter: applies the "sel" filter.
-                let filter1 = ArrowPredicateFn::new(pred_mask1, move |batch: RecordBatch| {
-                    let scalar = StringViewArray::new_scalar("selected");
-                    eq(
-                        batch.column(batch.schema().index_of("sel").unwrap()),
-                        &scalar,
-                    )
-                });
-                // Create second filter: applies the "mod_clustered" filter.
-                let filter2 = ArrowPredicateFn::new(pred_mask2, move |batch: RecordBatch| {
-                    let scalar = StringViewArray::new_scalar("mod_clustered");
-                    eq(
-                        batch.column(batch.schema().index_of("mod_clustered").unwrap()),
-                        &scalar,
-                    )
-                });
-                let composite_filter = RowFilter::new(vec![Box::new(filter1), Box::new(filter2)]);
-
-                let stream = builder
-                    .with_projection(mask)
-                    .with_row_filter(composite_filter)
-                    .build()
-                    .unwrap();
-                stream.try_collect::<Vec<_>>().await.unwrap();
-            })
-        });
-    });
-}
-
-criterion_group!(
-    benches,
-    benchmark_filters_and_projections,
-    benchmark_composite_filters
-);
+criterion_group!(benches, benchmark_filters_and_projections,);
 criterion_main!(benches);

From 9271cc939ff97a88689af4a7114e3c84647d4a0c Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Mon, 14 Apr 2025 13:24:48 +0800
Subject: [PATCH 15/21] Set compression

---
 parquet/benches/arrow_reader_row_filter.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index e2b67b7c8904..2d4b48bdc582 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -69,6 +69,7 @@ use rand::{rngs::StdRng, Rng, SeedableRng};
 use std::sync::Arc;
 use tempfile::NamedTempFile;
 use tokio::fs::File;
+use parquet::basic::Compression;
 
 /// Generates a random string. Has a 50% chance to generate a short string (3–11 characters)
 /// or a long string (13–20 characters).
@@ -159,7 +160,7 @@ fn write_parquet_file() -> NamedTempFile {
         pretty_format_batches(&[batch.clone().slice(0, 100)]).unwrap()
     );
     let schema = batch.schema();
-    let props = WriterProperties::builder().build();
+    let props = WriterProperties::builder().set_compression( Compression::SNAPPY).build();
     let file = tempfile::Builder::new()
         .suffix(".parquet")
         .tempfile()

From 8e00ac58a95c7a904dbe434dcb4ca0b688cf92ae Mon Sep 17 00:00:00 2001
From: zhuqi-lucas <821684824@qq.com>
Date: Mon, 14 Apr 2025 13:52:10 +0800
Subject: [PATCH 16/21] fix

---
 parquet/benches/arrow_reader_row_filter.rs | 43 ++++++++++++----------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index 2d4b48bdc582..d04855eaa54a 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -64,12 +64,12 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use futures::TryStreamExt;
 use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter};
 use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask};
+use parquet::basic::Compression;
 use parquet::file::properties::WriterProperties;
 use rand::{rngs::StdRng, Rng, SeedableRng};
 use std::sync::Arc;
 use tempfile::NamedTempFile;
 use tokio::fs::File;
-use parquet::basic::Compression;
 
 /// Generates a random string. Has a 50% chance to generate a short string (3–11 characters)
 /// or a long string (13–20 characters).
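The `random_string` helper documented above is introduced in an earlier patch and its body is not shown here. As a rough, hypothetical sketch of the behavior the doc comment describes (50% short, 50% long strings), assuming the rand 0.9 API used elsewhere in this file, it might look like:

```rust
use rand::{rngs::StdRng, Rng, SeedableRng};

/// Sketch only: 50/50 short (3–11 chars) or long (13–20 chars) lowercase string.
/// The character set and exact branching are assumptions, not the real helper.
fn random_string(rng: &mut StdRng) -> String {
    let len = if rng.random_range(0..2) == 0 {
        rng.random_range(3..12) // short string: 3–11 characters
    } else {
        rng.random_range(13..21) // long string: 13–20 characters
    };
    (0..len)
        .map(|_| char::from(rng.random_range(b'a'..=b'z')))
        .collect()
}

// Usage: let mut rng = StdRng::seed_from_u64(0); let s = random_string(&mut rng);
```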
@@ -160,7 +160,9 @@ fn write_parquet_file() -> NamedTempFile {
         pretty_format_batches(&[batch.clone().slice(0, 100)]).unwrap()
     );
     let schema = batch.schema();
-    let props = WriterProperties::builder().set_compression( Compression::SNAPPY).build();
+    let props = WriterProperties::builder()
+        .set_compression(Compression::SNAPPY)
+        .build();
     let file = tempfile::Builder::new()
         .suffix(".parquet")
         .tempfile()
@@ -286,35 +288,35 @@ impl FilterType {
         match self {
             // Point Lookup on int64 column
             FilterType::PointLookup => {
-                let array = batch.column(batch.schema().index_of("int64").unwrap());
+                let array = batch.column(batch.schema().index_of("int64")?);
                 let scalar = Int64Array::new_scalar(9999);
                 eq(array, &scalar)
             }
             // Selective Unclustered on float64 column: float64 > 99.0
             FilterType::SelectiveUnclustered => {
-                let array = batch.column(batch.schema().index_of("float64").unwrap());
+                let array = batch.column(batch.schema().index_of("float64")?);
                 let scalar = Float64Array::new_scalar(99.0);
                 gt(array, &scalar)
             }
             // Moderately Selective Clustered on ts column: ts >= 9000 (implemented as > 8999)
             FilterType::ModeratelySelectiveClustered => {
-                let array = batch.column(batch.schema().index_of("ts").unwrap());
+                let array = batch.column(batch.schema().index_of("ts")?);
                 gt(array, &TimestampMillisecondArray::new_scalar(8999))
             }
             // Moderately Selective Unclustered on int64 column: int64 > 90
             FilterType::ModeratelySelectiveUnclustered => {
-                let array = batch.column(batch.schema().index_of("int64").unwrap());
+                let array = batch.column(batch.schema().index_of("int64")?);
                 let scalar = Int64Array::new_scalar(90);
                 gt(array, &scalar)
             }
             // Unselective Unclustered on float64 column: float64 < 99.0 (NOT (float64 > 99.0))
             FilterType::UnselectiveUnclustered => {
-                let array = batch.column(batch.schema().index_of("float64").unwrap());
+                let array = batch.column(batch.schema().index_of("float64")?);
                 lt(array, &Float64Array::new_scalar(99.0))
             }
             // Unselective Clustered on ts column: ts < 9000
             FilterType::UnselectiveClustered => {
-                let array = batch.column(batch.schema().index_of("ts").unwrap());
+                let array = batch.column(batch.schema().index_of("ts")?);
                 lt(array, &TimestampMillisecondArray::new_scalar(9000))
             }
             // Composite filter: logical AND of (float64 > 99.0) and (ts >= 9000)
@@ -325,7 +327,7 @@ impl FilterType {
             }
             // Utf8ViewNonEmpty: selects rows where the utf8View column is not an empty string.
             FilterType::Utf8ViewNonEmpty => {
-                let array = batch.column(batch.schema().index_of("utf8View").unwrap());
+                let array = batch.column(batch.schema().index_of("utf8View")?);
                 let scalar = StringViewArray::new_scalar("");
                 neq(array, &scalar)
             }
@@ -360,21 +362,22 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
             let all_indices = vec![0, 1, 2, 3];
             // Determine the filter column index based on the filter type.
let filter_col = match filter_type {
-                FilterType::PointLookup => 0,
-                FilterType::SelectiveUnclustered => 1,
-                FilterType::ModeratelySelectiveClustered => 3,
-                FilterType::ModeratelySelectiveUnclustered => 0,
-                FilterType::UnselectiveUnclustered => 1,
-                FilterType::UnselectiveClustered => 3,
-                FilterType::Composite => 1, // Use float64 column as representative for composite
-                FilterType::Utf8ViewNonEmpty => 2,
+                FilterType::PointLookup => vec![0],
+                FilterType::SelectiveUnclustered => vec![1],
+                FilterType::ModeratelySelectiveClustered => vec![3],
+                FilterType::ModeratelySelectiveUnclustered => vec![0],
+                FilterType::UnselectiveUnclustered => vec![1],
+                FilterType::UnselectiveClustered => vec![3],
+                FilterType::Composite => vec![1, 3], // Use float64 column and ts column as representative for composite
+                FilterType::Utf8ViewNonEmpty => vec![2],
             };
-            // For the projection, either select all columns or exclude the filter column.
+
+            // For the projection, either select all columns or exclude the filter column(s).
             let output_projection: Vec<usize> = match proj_case {
                 ProjectionCase::AllColumns => all_indices.clone(),
                 ProjectionCase::ExcludeFilterColumn => all_indices
                     .into_iter()
-                    .filter(|i| *i != filter_col)
+                    .filter(|i| !filter_col.contains(i))
                     .collect(),
             };
 
@@ -401,7 +404,7 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
                     output_projection.clone(),
                 );
                 let pred_mask =
-                    ProjectionMask::roots(file_metadata.schema_descr(), vec![filter_col]);
+                    ProjectionMask::roots(file_metadata.schema_descr(), filter_col.clone());
                 let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| {
                     Ok(filter_type_inner.filter_batch(&batch).unwrap())
                 });

From 890519e31d711aa09990d5198295580e00ee8ef8 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 14 Apr 2025 13:08:13 -0400
Subject: [PATCH 17/21] Update comments

---
 parquet/benches/arrow_reader_row_filter.rs | 112 ++++++++++++++-------
 1 file changed, 75 insertions(+), 37 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index d04855eaa54a..cf41c16a00c9 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -58,7 +58,7 @@ use arrow::compute::kernels::cmp::{eq, gt, lt, neq};
 use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
 use arrow::record_batch::RecordBatch;
 use arrow_array::builder::StringViewBuilder;
-use arrow_array::StringViewArray;
+use arrow_array::{Array, StringViewArray};
 use arrow_cast::pretty::pretty_format_batches;
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use futures::TryStreamExt;
@@ -197,7 +197,8 @@ impl std::fmt::Display for ProjectionCase {
 /// The variants correspond to the different filter patterns.
 #[derive(Clone)]
 enum FilterType {
-    /// Here are the 6 filter types:
+    /// "Point Lookup": selects a single row
+    /// ```text
     /// ┌───────────────┐ ┌───────────────┐
     /// │ │ │ │
     /// │ │ │ ... │
     /// │ │ │ │
     /// │ │ │ │
     /// │ ... │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
     /// │ │ │ │
     /// │ │ │ ... │
     /// │ │ │ │
     /// │ │ │ │
     /// └───────────────┘ └───────────────┘
-    ///
-    /// "Point Lookup": selects a single row
+    /// ```
     /// (1 RowSelection of 1 row)
-    ///
+    PointLookup,
+    /// selective (1%) unclustered filter
+    /// ```text
     /// ┌───────────────┐ ┌───────────────┐
     /// │ ... 
│ │ │ /// │ │ │ │ @@ -224,45 +226,81 @@ enum FilterType { /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ /// │ │ │ │ /// └───────────────┘ └───────────────┘ - /// selective (1%) unclustered filter + /// ``` /// (1000 RowSelection of 10 rows each) - /// - /// - /// ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ - /// │ ... │ │ │ │ │ │ │ - /// │ │ │ │ │ │ │ │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ ... │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ │ │ - /// │ │ │ │ │ ... │ │ │ - /// │ │ │ ... │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │ ... │ │ │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │ │ │ │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// └───────────────┘ └───────────────┘ └───────────────┘ └───────────────┘ - /// moderately selective (10%) unclustered filter moderately selective (10%) clustered filter - /// (10000 RowSelection of 10 rows each) (10 RowSelections of 10,000 rows each) - /// ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ ... │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ ... │ │ │ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ - /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ └───────────────┘ └───────────────┘ - /// └───────────────┘ └───────────────┘ - /// unselective (99%) unclustered filter unselective (90%) clustered filter - /// (99,000 RowSelections of 10 rows each) (99 RowSelection of 10,000 rows each) - PointLookup, SelectiveUnclustered, + /// moderately selective (10%) clustered filter + /// ```text + /// ┌───────────────┐ ┌───────────────┐ + /// │ │ │ │ + /// │ │ │ │ + /// │ │ │ ... │ + /// │ │ │ │ + /// │ ... │ │ │ + /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// └───────────────┘ └───────────────┘ + /// ``` + /// (10 RowSelections of 10,000 rows each) ModeratelySelectiveClustered, + /// moderately selective (10%) clustered filter + /// ```text + /// ┌───────────────┐ ┌───────────────┐ + /// │ ... │ │ │ + /// │ │ │ │ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ + /// │ │ │ │ + /// │ │ │ ... │ + /// │ ... │ │ │ + /// │ │ │ │ + /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// └───────────────┘ └───────────────┘ + /// ``` + /// (10 RowSelections of 10,000 rows each) ModeratelySelectiveUnclustered, + /// unselective (99%) unclustered filter + /// ```text + /// ┌───────────────┐ ┌───────────────┐ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │ │ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│ + /// └───────────────┘ └───────────────┘ + /// ``` + /// (99,000 RowSelections of 10 rows each) UnselectiveUnclustered, + /// unselective (90%) clustered filter + /// ```text + /// ┌───────────────┐ ┌───────────────┐ + /// │ │ │ │ + /// │ │ │ │ + /// │ │ │ ... │ + /// │ │ │ │ + /// │ ... 
│ │ │ │
+    /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// │ │ │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒│
+    /// └───────────────┘ └───────────────┘
+    /// ```
+    /// (10 RowSelections of 9,000 rows each)
     UnselectiveClustered,
-    /// The following are Composite and Utf8ViewNonEmpty filters, which are additional to the above 6 filters.
+    /// [`Self::SelectiveUnclustered`] `AND`
+    /// [`Self::ModeratelySelectiveClustered`]
     Composite,
+    /// `utf8View <> ''` modeling [ClickBench] [Q21-Q27]
+    ///
+    /// [ClickBench]: https://github.com/ClickHouse/ClickBench
+    /// [Q21-Q27]: https://github.com/apache/datafusion/blob/b7177234e65cbbb2dcc04c252f6acd80bb026362/benchmarks/queries/clickbench/queries.sql#L22-L28
     Utf8ViewNonEmpty,
 }

From 7eb0476af3f2221147262ac91e62f74912ce172e Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 14 Apr 2025 13:12:43 -0400
Subject: [PATCH 18/21] refactor filter column indexes

---
 parquet/benches/arrow_reader_row_filter.rs | 27 ++++++++++++----------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index cf41c16a00c9..636372406689 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -371,6 +371,20 @@ impl FilterType {
         }
     }
+
+    /// Return the indexes in the batch's schema that are used for filtering.
+    fn filter_columns(&self) -> &'static [usize] {
+        match self {
+            FilterType::PointLookup => &[0],
+            FilterType::SelectiveUnclustered => &[1],
+            FilterType::ModeratelySelectiveClustered => &[3],
+            FilterType::ModeratelySelectiveUnclustered => &[0],
+            FilterType::UnselectiveUnclustered => &[1],
+            FilterType::UnselectiveClustered => &[3],
+            FilterType::Composite => &[1, 3], // Use float64 column and ts column as representative for composite
+            FilterType::Utf8ViewNonEmpty => &[2],
+        }
+    }
 }
 
 /// Benchmark filters and projections by reading the Parquet file.
@@ -398,18 +412,7 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
         for proj_case in &projection_cases {
             // All indices corresponding to the 4 columns.
             let all_indices = vec![0, 1, 2, 3];
-            // Determine the filter column index based on the filter type.
-            let filter_col = match filter_type {
-                FilterType::PointLookup => vec![0],
-                FilterType::SelectiveUnclustered => vec![1],
-                FilterType::ModeratelySelectiveClustered => vec![3],
-                FilterType::ModeratelySelectiveUnclustered => vec![0],
-                FilterType::UnselectiveUnclustered => vec![1],
-                FilterType::UnselectiveClustered => vec![3],
-                FilterType::Composite => vec![1, 3], // Use float64 column and ts column as representative for composite
-                FilterType::Utf8ViewNonEmpty => vec![2],
-            };
-
+            let filter_col = filter_type.filter_columns();
             // For the projection, either select all columns or exclude the filter column(s).
let output_projection: Vec<usize> = match proj_case {
                 ProjectionCase::AllColumns => all_indices.clone(),
                 ProjectionCase::ExcludeFilterColumn => all_indices
                     .into_iter()
                     .filter(|i| !filter_col.contains(i))
                     .collect(),
             };

From 22c7b39bf84513af2edcfffdbedb3b60366116d0 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 14 Apr 2025 13:34:01 -0400
Subject: [PATCH 19/21] Read from in memory buffer

---
 parquet/benches/arrow_reader_row_filter.rs | 78 ++++++++++++++++------
 1 file changed, 56 insertions(+), 22 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index 636372406689..41d41a3a6f9a 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -58,18 +58,21 @@ use arrow::compute::kernels::cmp::{eq, gt, lt, neq};
 use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
 use arrow::record_batch::RecordBatch;
 use arrow_array::builder::StringViewBuilder;
-use arrow_array::{Array, StringViewArray};
+use arrow_array::StringViewArray;
 use arrow_cast::pretty::pretty_format_batches;
+use bytes::Bytes;
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
-use futures::TryStreamExt;
+use futures::future::BoxFuture;
+use futures::{FutureExt, TryStreamExt};
 use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter};
+use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask};
 use parquet::basic::Compression;
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
 use parquet::file::properties::WriterProperties;
 use rand::{rngs::StdRng, Rng, SeedableRng};
+use std::ops::Range;
 use std::sync::Arc;
-use tempfile::NamedTempFile;
-use tokio::fs::File;
 
 /// Generates a random string. Has a 50% chance to generate a short string (3–11 characters)
 /// or a long string (13–20 characters).
@@ -151,8 +154,8 @@ fn create_record_batch(size: usize) -> RecordBatch {
     RecordBatch::try_new(schema, arrays).unwrap()
 }
 
-/// Writes the RecordBatch to a temporary Parquet file and returns the file handle.
-fn write_parquet_file() -> NamedTempFile {
+/// Writes the RecordBatch to an in-memory buffer, returning the buffer
+fn write_parquet_file() -> Vec<u8> {
     let batch = create_record_batch(100_000);
     println!("Batch created with {} rows", 100_000);
     println!(
@@ -163,17 +166,13 @@ fn write_parquet_file() -> NamedTempFile {
     let schema = batch.schema();
     let props = WriterProperties::builder()
         .set_compression(Compression::SNAPPY)
         .build();
-    let file = tempfile::Builder::new()
-        .suffix(".parquet")
-        .tempfile()
-        .unwrap();
+    let mut buffer = vec![];
     {
-        let file_reopen = file.reopen().unwrap();
-        let mut writer = ArrowWriter::try_new(file_reopen, schema.clone(), Some(props)).unwrap();
+        let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap();
         writer.write(&batch).unwrap();
         writer.close().unwrap();
     }
-    file
+    buffer
 }
 
 /// ProjectionCase defines the projection mode for the benchmark:
@@ -195,7 +194,7 @@ impl std::fmt::Display for ProjectionCase {
 
 /// FilterType encapsulates the different filter comparisons.
 /// The variants correspond to the different filter patterns.
-#[derive(Clone)]
+#[derive(Clone, Copy)]
 enum FilterType {
     /// "Point Lookup": selects a single row
     /// ```text
@@ -373,7 +372,7 @@ impl FilterType {
     }
 
     /// Return the indexes in the batch's schema that are used for filtering.
-    fn filter_columns(&self) -> &'static [usize] {
+    fn filter_projection(&self) -> &'static [usize] {
         match self {
             FilterType::PointLookup => &[0],
             FilterType::SelectiveUnclustered => &[1],
@@ -391,7 +390,8 @@ impl FilterType {
 /// This benchmark iterates over all individual filter types and two projection cases.
 /// It measures the time to read and filter the Parquet file according to each scenario.
 fn benchmark_filters_and_projections(c: &mut Criterion) {
-    let parquet_file = write_parquet_file();
+    // make the parquet file in memory that can be shared
+    let parquet_file = Bytes::from(write_parquet_file());
     let filter_types = vec![
         FilterType::PointLookup,
         FilterType::SelectiveUnclustered,
@@ -408,11 +408,11 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
     ];
     let mut group = c.benchmark_group("arrow_reader_row_filter");
 
-    for filter_type in filter_types.clone() {
+    for filter_type in filter_types {
         for proj_case in &projection_cases {
             // All indices corresponding to the 4 columns.
             let all_indices = vec![0, 1, 2, 3];
-            let filter_col = filter_type.filter_columns();
+            let filter_col = filter_type.filter_projection().to_vec();
             // For the projection, either select all columns or exclude the filter column(s).
             let output_projection: Vec<usize> = match proj_case {
                 ProjectionCase::AllColumns => all_indices.clone(),
@@ -430,12 +430,11 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
                     .build()
                     .unwrap();
                 b.iter(|| {
-                    let filter_type_inner = filter_type.clone();
                     rt.block_on(async {
-                        let file = File::open(parquet_file.path()).await.unwrap();
+                        let reader = MemoryAsyncReader::new(&parquet_file);
                         let options = ArrowReaderOptions::new().with_page_index(true);
                         let builder =
-                            ParquetRecordBatchStreamBuilder::new_with_options(file, options)
+                            ParquetRecordBatchStreamBuilder::new_with_options(reader, options)
                                 .await
                                 .unwrap()
                                 .with_batch_size(8192);
@@ -447,7 +446,7 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
                         let pred_mask =
                             ProjectionMask::roots(file_metadata.schema_descr(), filter_col.clone());
                         let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| {
-                            Ok(filter_type_inner.filter_batch(&batch).unwrap())
+                            Ok(filter_type.filter_batch(&batch).unwrap())
                         });
                         let stream = builder
                             .with_projection(mask)
@@ -462,5 +461,40 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
     }
 }
 
+/// Adapter to read asynchronously from in-memory bytes
+#[derive(Debug)]
+struct MemoryAsyncReader {
+    inner: Bytes,
+}
+
+impl MemoryAsyncReader {
+    fn new(inner: &Bytes) -> Self {
+        // clone of bytes is cheap -- increments a refcount
+        Self {
+            inner: inner.clone(),
+        }
+    }
+}
+
+impl AsyncFileReader for MemoryAsyncReader {
+    fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> {
+        let data = self.inner.slice(range.start as usize..range.end as usize);
+        async move { Ok(data) }.boxed()
+    }
+
+    fn get_metadata<'a>(
+        &'a mut self,
+        _options: Option<&'a ArrowReaderOptions>,
+    ) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> {
+        let inner = self.inner.clone();
+        async move {
+            let mut metadata_reader = ParquetMetaDataReader::new().with_page_indexes(true);
+            metadata_reader.try_parse(&inner)?;
+            metadata_reader.finish().map(Arc::new)
+        }
+        .boxed()
+    }
+}
+
 criterion_group!(benches, benchmark_filters_and_projections,);
 criterion_main!(benches);

From 5ae9b5811f90816967a33f6135a3b0078542ce16 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 14 Apr 2025 13:37:37 -0400
Subject: [PATCH 20/21] cleanup

---
 parquet/benches/arrow_reader_row_filter.rs | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index 41d41a3a6f9a..82060905196f 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -484,11 +484,12 @@ impl AsyncFileReader for MemoryAsyncReader {
 
     fn get_metadata<'a>(
         &'a mut self,
-        _options: Option<&'a ArrowReaderOptions>,
+        options: Option<&'a ArrowReaderOptions>,
     ) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> {
         let inner = self.inner.clone();
+        let page_index = options.map(|o| o.page_index()).unwrap_or(true);
         async move {
-            let mut metadata_reader = ParquetMetaDataReader::new().with_page_indexes(true);
+            let mut metadata_reader = ParquetMetaDataReader::new().with_page_indexes(page_index);
             metadata_reader.try_parse(&inner)?;
             metadata_reader.finish().map(Arc::new)
         }

From 1effe88d71d2f7c87d1f3021b6841c04b6856a67 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 14 Apr 2025 14:17:16 -0400
Subject: [PATCH 21/21] Test both sync and async readers

---
 parquet/benches/arrow_reader_row_filter.rs | 166 ++++++++++++++-------
 1 file changed, 115 insertions(+), 51 deletions(-)

diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs
index 82060905196f..e59ba59cc69d 100644
--- a/parquet/benches/arrow_reader_row_filter.rs
+++ b/parquet/benches/arrow_reader_row_filter.rs
@@ -63,8 +63,10 @@ use arrow_cast::pretty::pretty_format_batches;
 use bytes::Bytes;
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use futures::future::BoxFuture;
-use futures::{FutureExt, TryStreamExt};
+use futures::{FutureExt, StreamExt};
-use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter};
+use parquet::arrow::arrow_reader::{
+    ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowFilter,
+};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask};
 use parquet::basic::Compression;
@@ -194,7 +196,7 @@ impl std::fmt::Display for ProjectionCase {
 
 /// FilterType encapsulates the different filter comparisons.
 /// The variants correspond to the different filter patterns.
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Debug)]
 enum FilterType {
     /// "Point Lookup": selects a single row
     /// ```text
@@ -406,6 +408,12 @@ fn benchmark_filters_and_projections(c: &mut Criterion) {
         ProjectionCase::AllColumns,
         ProjectionCase::ExcludeFilterColumn,
     ];
+
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
     let mut group = c.benchmark_group("arrow_reader_row_filter");
 
     for filter_type in filter_types {
         for proj_case in &projection_cases {
             // All indices corresponding to the 4 columns.
             let all_indices = vec![0, 1, 2, 3];
             let filter_col = filter_type.filter_projection().to_vec();
             // For the projection, either select all columns or exclude the filter column(s).
             let output_projection: Vec<usize> = match proj_case {
                 ProjectionCase::AllColumns => all_indices.clone(),
                 ProjectionCase::ExcludeFilterColumn => all_indices
                     .into_iter()
                     .filter(|i| !filter_col.contains(i))
                     .collect(),
             };
 
-            let bench_id =
-                BenchmarkId::new(format!("filter: {} proj: {}", filter_type, proj_case), "");
+            let reader = InMemoryReader::try_new(&parquet_file).unwrap();
+            let metadata = Arc::clone(reader.metadata());
+
+            let schema_descr = metadata.file_metadata().schema_descr();
+            let projection_mask = ProjectionMask::roots(schema_descr, output_projection.clone());
+            let pred_mask = ProjectionMask::roots(schema_descr, filter_col.clone());
+
+            let benchmark_name = format!("{filter_type:?}/{proj_case}",);
+
+            // run the benchmark for the async reader
+            let bench_id = BenchmarkId::new(benchmark_name.clone(), "async");
+            let rt_captured = rt.handle().clone();
             group.bench_function(bench_id, |b| {
-                let rt = tokio::runtime::Builder::new_multi_thread()
-                    .enable_all()
-                    .build()
-                    .unwrap();
                 b.iter(|| {
-                    rt.block_on(async {
-                        let reader = MemoryAsyncReader::new(&parquet_file);
-                        let options = ArrowReaderOptions::new().with_page_index(true);
-                        let builder =
-                            ParquetRecordBatchStreamBuilder::new_with_options(reader, options)
-                                .await
-                                .unwrap()
-                                .with_batch_size(8192);
-                        let file_metadata = builder.metadata().file_metadata().clone();
-                        let mask = ProjectionMask::roots(
-                            file_metadata.schema_descr(),
-                            output_projection.clone(),
-                        );
-                        let pred_mask =
-                            ProjectionMask::roots(file_metadata.schema_descr(), filter_col.clone());
-                        let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| {
-                            Ok(filter_type.filter_batch(&batch).unwrap())
-                        });
-                        let stream = builder
-                            .with_projection(mask)
-                            .with_row_filter(RowFilter::new(vec![Box::new(filter)]))
-                            .build()
-                            .unwrap();
-                        stream.try_collect::<Vec<_>>().await.unwrap();
+                    let reader = reader.clone();
+                    let pred_mask = pred_mask.clone();
+                    let projection_mask = projection_mask.clone();
+                    // row filters are not Clone, so we must make a new one for each iteration
+                    let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| {
+                        Ok(filter_type.filter_batch(&batch).unwrap())
+                    });
+                    let row_filter = RowFilter::new(vec![Box::new(filter)]);
+
+                    rt_captured.block_on(async {
+                        benchmark_async_reader(reader, projection_mask, row_filter).await;
                     })
                 });
             });
+
+            // run the benchmark for the sync reader
+            let bench_id = BenchmarkId::new(benchmark_name, "sync");
+            group.bench_function(bench_id, |b| {
+                b.iter(|| {
+                    let reader = reader.clone();
+                    let pred_mask = pred_mask.clone();
+                    let projection_mask = projection_mask.clone();
+                    // row filters are not Clone, so we must make a new one for each iteration
+                    let filter = ArrowPredicateFn::new(pred_mask, move |batch: RecordBatch| {
+                        Ok(filter_type.filter_batch(&batch).unwrap())
+                    });
+                    let row_filter = RowFilter::new(vec![Box::new(filter)]);
+
+                    benchmark_sync_reader(reader, projection_mask, row_filter)
+                });
+            });
         }
     }
 }
 
+/// Use async API
+async fn benchmark_async_reader(
+    reader: InMemoryReader,
+    projection_mask: ProjectionMask,
+    row_filter: RowFilter,
+) {
+    let mut stream = 
ParquetRecordBatchStreamBuilder::new(reader)
+        .await
+        .unwrap()
+        .with_batch_size(8192)
+        .with_projection(projection_mask)
+        .with_row_filter(row_filter)
+        .build()
+        .unwrap();
+    while let Some(b) = stream.next().await {
+        b.unwrap(); // consume the batches, no buffering
+    }
+}
+
+/// Use sync API
+fn benchmark_sync_reader(
+    reader: InMemoryReader,
+    projection_mask: ProjectionMask,
+    row_filter: RowFilter,
+) {
+    let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner())
+        .unwrap()
+        .with_batch_size(8192)
+        .with_projection(projection_mask)
+        .with_row_filter(row_filter)
+        .build()
+        .unwrap();
+    for b in stream {
+        b.unwrap(); // consume the batches, no buffering
+    }
+}
+
+/// Adapter to read asynchronously from in-memory bytes and always loads the
+/// metadata with page indexes.
-#[derive(Debug)]
-struct MemoryAsyncReader {
+#[derive(Debug, Clone)]
+struct InMemoryReader {
     inner: Bytes,
+    metadata: Arc<ParquetMetaData>,
 }
 
-impl MemoryAsyncReader {
-    fn new(inner: &Bytes) -> Self {
-        // clone of bytes is cheap -- increments a refcount
-        Self {
-            inner: inner.clone(),
-        }
+impl InMemoryReader {
+    fn try_new(inner: &Bytes) -> parquet::errors::Result<Self> {
+        let mut metadata_reader = ParquetMetaDataReader::new().with_page_indexes(true);
+        metadata_reader.try_parse(inner)?;
+        let metadata = metadata_reader.finish().map(Arc::new)?;
+
+        Ok(Self {
+            // clone of bytes is cheap -- increments a refcount
+            inner: inner.clone(),
+            metadata,
+        })
+    }
+
+    fn metadata(&self) -> &Arc<ParquetMetaData> {
+        &self.metadata
+    }
+
+    fn into_inner(self) -> Bytes {
+        self.inner
     }
 }
 
-impl AsyncFileReader for MemoryAsyncReader {
+impl AsyncFileReader for InMemoryReader {
     fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> {
         let data = self.inner.slice(range.start as usize..range.end as usize);
         async move { Ok(data) }.boxed()
@@ -484,16 +554,10 @@ impl AsyncFileReader for MemoryAsyncReader {
 
     fn get_metadata<'a>(
         &'a mut self,
-        options: Option<&'a ArrowReaderOptions>,
+        _options: Option<&'a ArrowReaderOptions>,
     ) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> {
-        let inner = self.inner.clone();
-        let page_index = options.map(|o| o.page_index()).unwrap_or(true);
-        async move {
-            let mut metadata_reader = ParquetMetaDataReader::new().with_page_indexes(page_index);
-            metadata_reader.try_parse(&inner)?;
-            metadata_reader.finish().map(Arc::new)
-        }
-        .boxed()
+        let metadata = Arc::clone(&self.metadata);
+        async move { Ok(metadata) }.boxed()
     }
 }
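After the final patch, the benchmark writes Parquet to memory and reads it back through both the sync and async APIs with a `RowFilter`. For illustration, here is a condensed, self-contained sketch of the same sync read path. It is a hypothetical example, not part of the patch series; it reuses only the public APIs that appear in the diffs above, and the column name `v` and threshold are made up for the example:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array};
use arrow::compute::kernels::cmp::gt;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use bytes::Bytes;
use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
use parquet::arrow::{ArrowWriter, ProjectionMask};

fn main() -> parquet::errors::Result<()> {
    // Write a single-column Parquet file into an in-memory buffer.
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from_iter_values(0..100_i64)) as ArrayRef],
    )?;
    let mut buf = vec![];
    let mut writer = ArrowWriter::try_new(&mut buf, schema, None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read it back, keeping only rows where v > 90 via a RowFilter.
    let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buf))?;
    let pred_mask = ProjectionMask::roots(builder.metadata().file_metadata().schema_descr(), [0]);
    let filter = ArrowPredicateFn::new(pred_mask, |batch: RecordBatch| {
        gt(batch.column(0), &Int64Array::new_scalar(90))
    });
    let reader = builder
        .with_row_filter(RowFilter::new(vec![Box::new(filter)]))
        .build()?;
    for batch in reader {
        println!("{} rows passed the filter", batch?.num_rows());
    }
    Ok(())
}
```

The async path in the benchmark differs only in construction: it goes through `ParquetRecordBatchStreamBuilder` with the `InMemoryReader` adapter shown above, so metadata (including page indexes) is parsed once up front and each benchmark iteration merely clones the cheap `Bytes` handle.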