@@ -117,6 +117,8 @@ impl<T> ArrowReaderBuilder<T> {
117
117
}
118
118
119
119
/// Only read data from the provided row group indexes
120
+ ///
121
+ /// This is also called row group filtering
120
122
pub fn with_row_groups ( self , row_groups : Vec < usize > ) -> Self {
121
123
Self {
122
124
row_groups : Some ( row_groups) ,
@@ -135,14 +137,41 @@ impl<T> ArrowReaderBuilder<T> {
135
137
/// Provide a [`RowSelection`] to filter out rows, and avoid fetching their
136
138
/// data into memory.
137
139
///
138
- /// Row group filtering is applied prior to this, and therefore rows from skipped
139
- /// row groups should not be included in the [`RowSelection`]
140
+ /// This feature is used to restrict which rows are decoded within row
141
+ /// groups, skipping ranges of rows that are not needed. Such selections
142
+ /// could be determined by evaluating predicates against the parquet page
143
+ /// [`Index`] or some other external information available to a query
144
+ /// engine.
140
145
///
141
- /// An example use case of this would be applying a selection determined by
142
- /// evaluating predicates against the [`Index`]
146
+ /// # Notes
143
147
///
144
- /// It is recommended to enable reading the page index if using this functionality, to allow
145
- /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]
148
+ /// Row group filtering (see [`Self::with_row_groups`]) is applied prior to
149
+ /// applying the row selection, and therefore rows from skipped row groups
150
+ /// should not be included in the [`RowSelection`] (see example below)
151
+ ///
152
+ /// It is recommended to enable reading the page index if using this
153
+ /// functionality, to allow more efficient skipping over data pages. See
154
+ /// [`ArrowReaderOptions::with_page_index`].
155
+ ///
156
+ /// # Example
157
+ ///
158
+ /// Given a parquet file with 3 row groups, and a row group filter of
159
+ /// `[0, 2]`, in order to only scan rows 50-100 in row group 2:
160
+ ///
161
+ /// ```text
162
+ /// Row Group 0, 1000 rows (selected)
163
+ /// Row Group 1, 1000 rows (skipped)
164
+ /// Row Group 2, 1000 rows (selected, but want to only scan rows 50-100)
165
+ /// ```
166
+ ///
167
+ /// You would pass the following [`RowSelection`]:
168
+ ///
169
+ /// ```text
170
+ /// Select 1000 (scan all rows in row group 0)
171
+ /// Skip 50, Select 50, Skip 900 (scan only rows 50-100 in row group 2)
172
+ /// ```
173
+ ///
174
+ /// Note there is no entry for the (entirely) skipped row group 1.
146
175
///
147
176
/// [`Index`]: crate::file::page_index::index::Index
148
177
pub fn with_row_selection ( self , selection : RowSelection ) -> Self {
0 commit comments