@@ -116,13 +116,12 @@ pub use writer::plan_to_parquet;
///
/// Supports the following optimizations:
///
- /// * Concurrent reads: Can read from one or more files in parallel as multiple
+ /// * Concurrent reads: reads from one or more files in parallel as multiple
/// partitions, including concurrently reading multiple row groups from a single
/// file.
///
- /// * Predicate push down: skips row groups and pages based on
- /// min/max/null_counts in the row group metadata, the page index and bloom
- /// filters.
+ /// * Predicate push down: skips row groups, pages, and rows based on metadata
+ /// and late materialization. See "Predicate Pushdown" below.
///
/// * Projection pushdown: reads and decodes only the columns required.
///
@@ -132,9 +131,8 @@ pub use writer::plan_to_parquet;
/// coalesce I/O operations, etc. See [`ParquetFileReaderFactory`] for more
/// details.
///
- /// * Schema adapters: read parquet files with different schemas into a unified
- /// table schema. This can be used to implement "schema evolution". See
- /// [`SchemaAdapterFactory`] for more details.
+ /// * Schema evolution: read parquet files with different schemas into a unified
+ /// table schema. See [`SchemaAdapterFactory`] for more details.
///
/// * metadata_size_hint: controls the number of bytes read from the end of the
/// file in the initial I/O when the default [`ParquetFileReaderFactory`] is used. If a
@@ -144,6 +142,29 @@ pub use writer::plan_to_parquet;
/// * User provided [`ParquetAccessPlan`]s to skip row groups and/or pages
/// based on external information. See "Implementing External Indexes" below
///
+ /// # Predicate Pushdown
+ ///
+ /// `ParquetExec` uses the provided [`PhysicalExpr`] predicate as a filter to
+ /// skip reading unnecessary data and improve query performance, using several techniques:
+ ///
+ /// * Row group pruning: skips entire row groups based on min/max statistics
+ /// found in [`ParquetMetaData`] and any Bloom filters that are present.
+ ///
+ /// * Page pruning: skips individual pages within a ColumnChunk using the
+ /// [Parquet PageIndex], if present.
+ ///
+ /// * Row filtering: skips rows within a page using a form of late
+ /// materialization. When possible, predicates are applied by the parquet
+ /// decoder *during* decode (see [`ArrowPredicate`] and [`RowFilter`] for more
+ /// details). This is only enabled if `ParquetScanOptions::pushdown_filters` is set to true.
+ ///
+ /// Note: If the predicate cannot be used to accelerate the scan, it is ignored
+ /// (no error is raised on predicate evaluation errors).
+ ///
+ /// [`ArrowPredicate`]: parquet::arrow::arrow_reader::ArrowPredicate
+ /// [`RowFilter`]: parquet::arrow::arrow_reader::RowFilter
+ /// [Parquet PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+ ///
/// # Implementing External Indexes
///
/// It is possible to restrict the row groups and selections within those row
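The "Predicate Pushdown" section added in the hunk above lends itself to a short usage sketch. The following is a minimal, hypothetical example of supplying a predicate when building a `ParquetExec`; the module paths, `FileScanConfig::new`, `PartitionedFile::new`, `ObjectStoreUrl::local_filesystem`, `ParquetExec::builder`, and the placeholder `lit(true)` predicate are assumptions based on the surrounding documentation, not part of this diff:

```rust
use std::sync::Arc;

use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::{FileScanConfig, ParquetExec};
use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_physical_expr::expressions::lit;

/// Build a ParquetExec that scans `file1.parquet` with a (trivial) predicate.
fn example_parquet_exec() -> ParquetExec {
    // Assumed file schema: a single Int64 column `a`.
    let file_schema =
        Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]));

    // Describe the file to scan: `file1.parquet`, 1234 bytes long.
    let config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), file_schema)
        .with_file(PartitionedFile::new("file1.parquet", 1234));

    // Attach the predicate. Row group and page pruning are attempted
    // automatically; row filtering additionally requires `pushdown_filters`
    // (see the sketch at the end of this diff).
    ParquetExec::builder(config)
        .with_predicate(lit(true))
        .build()
}
```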
@@ -199,10 +220,11 @@ pub use writer::plan_to_parquet;
/// applying predicates to metadata. The plan and projections are used to
/// determine what pages must be read.
///
- /// * Step 4: The stream begins reading data, fetching the required pages
- /// and incrementally decoding them.
+ /// * Step 4: The stream begins reading data, fetching the required parquet
+ /// pages, incrementally decoding them, and applying any row filters (see
+ /// [`Self::with_pushdown_filters`]).
///
- /// * Step 5: As each [`RecordBatch]` is read, it may be adapted by a
+ /// * Step 5: As each [`RecordBatch`] is read, it may be adapted by a
/// [`SchemaAdapter`] to match the table schema. By default missing columns are
/// filled with nulls, but this can be customized via [`SchemaAdapterFactory`].
///
@@ -268,13 +290,10 @@ impl ParquetExecBuilder {
}
}

- /// Set the predicate for the scan.
- ///
- /// The ParquetExec uses this predicate to filter row groups and data pages
- /// using the Parquet statistics and bloom filters.
+ /// Set the filter predicate when reading.
///
- /// If the predicate can not be used to prune the scan, it is ignored (no
- /// error is raised).
+ /// See the "Predicate Pushdown" section of the [`ParquetExec`] documentation
+ /// for more details.
pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
self.predicate = Some(predicate);
self
@@ -291,7 +310,7 @@ impl ParquetExecBuilder {
self
}

- /// Set the table parquet options that control how the ParquetExec reads.
+ /// Set the options for controlling how the ParquetExec reads parquet files.
///
/// See also [`Self::new_with_options`]
pub fn with_table_parquet_options(
@@ -480,11 +499,8 @@ impl ParquetExec {
self
}

- /// If true, any filter [`Expr`]s on the scan will converted to a
- /// [`RowFilter`](parquet::arrow::arrow_reader::RowFilter) in the
- /// `ParquetRecordBatchStream`. These filters are applied by the
- /// parquet decoder to skip unecessairly decoding other columns
- /// which would not pass the predicate. Defaults to false
+ /// If true, the predicate will be used during the parquet scan.
+ /// Defaults to false.
///
/// [`Expr`]: datafusion_expr::Expr
pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self {
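A short, hypothetical continuation of the earlier sketch showing the `with_pushdown_filters` setting documented in this hunk; it assumes the `example_parquet_exec` helper and imports defined in the sketch above:

```rust
fn example_with_row_filtering() -> ParquetExec {
    // Enable row filtering: with this set, the predicate is also evaluated by
    // the parquet decoder *during* decode (late materialization), in addition
    // to the row group and page pruning that happen by default.
    example_parquet_exec().with_pushdown_filters(true)
}
```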