Skip to content

Commit d65240c

Browse files
authored
Refine ParquetRecordBatchReaderBuilder docs (#5774)
* Refine ParquetRecordBatchReaderBuilder docs * fix link * Suggest using new(), add example
1 parent b07fd5d commit d65240c

File tree

2 files changed

+51
-12
lines changed

2 files changed

+51
-12
lines changed

parquet/src/arrow/arrow_reader/mod.rs

+6-6
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,14 @@ use crate::file::page_index::index_reader;
4444
pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter};
4545
pub use selection::{RowSelection, RowSelector};
4646

47-
/// A generic builder for constructing sync or async arrow parquet readers. This is not intended
48-
/// to be used directly, instead you should use the specialization for the type of reader
49-
/// you wish to use
47+
/// Builder for constructing parquet readers into arrow.
5048
///
51-
/// * For a synchronous API - [`ParquetRecordBatchReaderBuilder`]
52-
/// * For an asynchronous API - [`ParquetRecordBatchStreamBuilder`]
49+
/// Most users should use one of the following specializations:
5350
///
54-
/// [`ParquetRecordBatchStreamBuilder`]: crate::arrow::async_reader::ParquetRecordBatchStreamBuilder
51+
/// * synchronous API: [`ParquetRecordBatchReaderBuilder::try_new`]
52+
/// * `async` API: [`ParquetRecordBatchStreamBuilder::new`]
53+
///
54+
/// [`ParquetRecordBatchStreamBuilder::new`]: crate::arrow::async_reader::ParquetRecordBatchStreamBuilder::new
5555
pub struct ArrowReaderBuilder<T> {
5656
pub(crate) input: T,
5757

parquet/src/arrow/async_reader/mod.rs

+45-6
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ impl ArrowReaderMetadata {
228228
/// breaking the pre-existing ParquetRecordBatchStreamBuilder API
229229
pub struct AsyncReader<T>(T);
230230

231-
/// A builder used to construct a [`ParquetRecordBatchStream`] for a parquet file
231+
/// A builder used to construct a [`ParquetRecordBatchStream`] for `async` reading of a parquet file
232232
///
233233
/// In particular, this handles reading the parquet file metadata, allowing consumers
234234
/// to use this information to select what specific columns, row groups, etc...
@@ -239,6 +239,37 @@ pub type ParquetRecordBatchStreamBuilder<T> = ArrowReaderBuilder<AsyncReader<T>>
239239

240240
impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T> {
241241
/// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file
242+
///
243+
/// # Example
244+
///
245+
/// ```
246+
/// # use std::fs::metadata;
247+
/// # use std::sync::Arc;
248+
/// # use bytes::Bytes;
249+
/// # use arrow_array::{Int32Array, RecordBatch};
250+
/// # use arrow_schema::{DataType, Field, Schema};
251+
/// # use parquet::arrow::arrow_reader::ArrowReaderMetadata;
252+
/// # use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder};
253+
/// # use tempfile::tempfile;
254+
/// # use futures::StreamExt;
255+
/// # #[tokio::main(flavor="current_thread")]
256+
/// # async fn main() {
257+
/// #
258+
/// # let mut file = tempfile().unwrap();
259+
/// # let schema = Arc::new(Schema::new(vec![Field::new("i32", DataType::Int32, false)]));
260+
/// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap();
261+
/// # let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap();
262+
/// # writer.write(&batch).unwrap();
263+
/// # writer.close().unwrap();
264+
/// // Open async file containing parquet data
265+
/// let mut file = tokio::fs::File::from_std(file);
266+
/// // Construct the reader
267+
/// let mut reader = ParquetRecordBatchStreamBuilder::new(file)
268+
/// .await.unwrap().build().unwrap();
269+
/// // Read batches
270+
/// let batch: RecordBatch = reader.next().await.unwrap().unwrap();
271+
/// # }
272+
/// ```
242273
pub async fn new(input: T) -> Result<Self> {
243274
Self::new_with_options(input, Default::default()).await
244275
}
@@ -253,7 +284,9 @@ impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T> {
253284
/// Create a [`ParquetRecordBatchStreamBuilder`] from the provided [`ArrowReaderMetadata`]
254285
///
255286
/// This allows loading metadata once and using it to create multiple builders with
256-
/// potentially different settings
287+
/// potentially different settings, that can be read in parallel.
288+
///
289+
/// # Example of reading from multiple streams in parallel
257290
///
258291
/// ```
259292
/// # use std::fs::metadata;
@@ -268,23 +301,29 @@ impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T> {
268301
/// # #[tokio::main(flavor="current_thread")]
269302
/// # async fn main() {
270303
/// #
271-
/// let mut file = tempfile().unwrap();
304+
/// # let mut file = tempfile().unwrap();
272305
/// # let schema = Arc::new(Schema::new(vec![Field::new("i32", DataType::Int32, false)]));
273306
/// # let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap();
274307
/// # let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap();
275308
/// # writer.write(&batch).unwrap();
276309
/// # writer.close().unwrap();
277-
/// #
310+
/// // open file with parquet data
278311
/// let mut file = tokio::fs::File::from_std(file);
312+
/// // load metadata once
279313
/// let meta = ArrowReaderMetadata::load_async(&mut file, Default::default()).await.unwrap();
314+
/// // create two readers, a and b, from the same underlying file
315+
/// // without reading the metadata again
280316
/// let mut a = ParquetRecordBatchStreamBuilder::new_with_metadata(
281317
/// file.try_clone().await.unwrap(),
282318
/// meta.clone()
283319
/// ).build().unwrap();
284320
/// let mut b = ParquetRecordBatchStreamBuilder::new_with_metadata(file, meta).build().unwrap();
285321
///
286-
/// // Should be able to read from both in parallel
287-
/// assert_eq!(a.next().await.unwrap().unwrap(), b.next().await.unwrap().unwrap());
322+
/// // Can read batches from both readers in parallel
323+
/// assert_eq!(
324+
/// a.next().await.unwrap().unwrap(),
325+
/// b.next().await.unwrap().unwrap(),
326+
/// );
288327
/// # }
289328
/// ```
290329
pub fn new_with_metadata(input: T, metadata: ArrowReaderMetadata) -> Self {

0 commit comments

Comments
 (0)