Skip to content

Commit fe071eb

Browse files
authored
Minor: Improve ArrowReaderBuilder::with_row_selection docs (#5824)
1 parent e4b28bd commit fe071eb

File tree

1 file changed

+35
-6
lines changed
  • parquet/src/arrow/arrow_reader

1 file changed

+35
-6
lines changed

parquet/src/arrow/arrow_reader/mod.rs

+35-6
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ impl<T> ArrowReaderBuilder<T> {
117117
}
118118

119119
/// Only read data from the provided row group indexes
120+
///
121+
/// This is also called row group filtering
120122
pub fn with_row_groups(self, row_groups: Vec<usize>) -> Self {
121123
Self {
122124
row_groups: Some(row_groups),
@@ -135,14 +137,41 @@ impl<T> ArrowReaderBuilder<T> {
135137
/// Provide a [`RowSelection`] to filter out rows, and avoid fetching their
136138
/// data into memory.
137139
///
138-
/// Row group filtering is applied prior to this, and therefore rows from skipped
139-
/// row groups should not be included in the [`RowSelection`]
140+
/// This feature is used to restrict which rows are decoded within row
141+
/// groups, skipping ranges of rows that are not needed. Such selections
142+
/// could be determined by evaluating predicates against the parquet page
143+
/// [`Index`] or some other external information available to a query
144+
/// engine.
140145
///
141-
/// An example use case of this would be applying a selection determined by
142-
/// evaluating predicates against the [`Index`]
146+
/// # Notes
143147
///
144-
/// It is recommended to enable reading the page index if using this functionality, to allow
145-
/// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`]
148+
/// Row group filtering (see [`Self::with_row_groups`]) is applied prior to
149+
/// applying the row selection, and therefore rows from skipped row groups
150+
/// should not be included in the [`RowSelection`] (see example below)
151+
///
152+
/// It is recommended to enable reading the page index if using this
153+
/// functionality, to allow more efficient skipping over data pages. See
154+
/// [`ArrowReaderOptions::with_page_index`].
155+
///
156+
/// # Example
157+
///
158+
/// Given a parquet file with 3 row groups, and a row group filter of
159+
/// `[0, 2]`, in order to only scan rows 50-100 in row group 2:
160+
///
161+
/// ```text
162+
/// Row Group 0, 1000 rows (selected)
163+
/// Row Group 1, 1000 rows (skipped)
164+
/// Row Group 2, 1000 rows (selected, but want to only scan rows 50-100)
165+
/// ```
166+
///
167+
/// You would pass the following [`RowSelection`]:
168+
///
169+
/// ```text
170+
/// Select 1000 (scan all rows in row group 0)
171+
/// Select 50-100 (scan rows 50-100 in row group 2)
172+
/// ```
173+
///
174+
/// Note there is no entry for the (entirely) skipped row group 1.
146175
///
147176
/// [`Index`]: crate::file::page_index::index::Index
148177
pub fn with_row_selection(self, selection: RowSelection) -> Self {

0 commit comments

Comments
 (0)