@@ -530,8 +530,9 @@ impl ArrowColumnChunk {

/// Encodes [`ArrowLeafColumn`] to [`ArrowColumnChunk`]
///
- /// Note: This is a low-level interface for applications that require fine-grained control
- /// of encoding, see [`ArrowWriter`] for a higher-level interface
+ /// Note: This is a low-level interface for applications that require
+ /// fine-grained control of encoding (e.g. encoding using multiple threads);
+ /// see [`ArrowWriter`] for a higher-level interface
///
/// # Example: Encoding two Arrow Arrays in Parallel
/// ```
@@ -540,9 +541,9 @@ impl ArrowColumnChunk {
/// # use arrow_array::*;
/// # use arrow_schema::*;
/// # use parquet::arrow::ArrowSchemaConverter;
- /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers};
+ /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, ArrowColumnChunk, compute_leaves, get_column_writers};
/// # use parquet::file::properties::WriterProperties;
- /// # use parquet::file::writer::SerializedFileWriter;
+ /// # use parquet::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
/// #
/// let schema = Arc::new(Schema::new(vec![
///     Field::new("i32", DataType::Int32, false),
@@ -560,15 +561,20 @@ impl ArrowColumnChunk {
/// let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap();
///
/// // Spawn a worker thread for each column
- /// // This is for demonstration purposes, a thread-pool e.g. rayon or tokio, would be better
+ /// //
+ /// // Note: This is for demonstration purposes; a thread pool, e.g. rayon or tokio, would be better.
+ /// // The `map` produces an iterator of `(thread handle, send channel)` tuples.
/// let mut workers: Vec<_> = col_writers
///     .into_iter()
///     .map(|mut col_writer| {
///         let (send, recv) = std::sync::mpsc::channel::<ArrowLeafColumn>();
///         let handle = std::thread::spawn(move || {
+ ///             // receive Arrays to encode via the channel
///             for col in recv {
///                 col_writer.write(&col)?;
///             }
+ ///             // once the input is complete, close the writer
+ ///             // to return the newly created ArrowColumnChunk
///             col_writer.close()
///         });
///         (handle, send)
@@ -577,33 +583,40 @@ impl ArrowColumnChunk {
///
/// // Create parquet writer
/// let root_schema = parquet_schema.root_schema_ptr();
- /// let mut out = Vec::with_capacity(1024); // This could be a File
- /// let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone()).unwrap();
+ /// // write to memory in the example, but this could be a File
+ /// let mut out = Vec::with_capacity(1024);
+ /// let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone())
+ ///     .unwrap();
///
/// // Start row group
- /// let mut row_group = writer.next_row_group().unwrap();
+ /// let mut row_group_writer: SerializedRowGroupWriter<'_, _> = writer
+ ///     .next_row_group()
+ ///     .unwrap();
///
- /// // Columns to encode
+ /// // Create some example input columns to encode
/// let to_write = vec![
///     Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _,
///     Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _,
/// ];
///
- /// // Spawn work to encode columns
+ /// // Send the input columns to the workers
/// let mut worker_iter = workers.iter_mut();
/// for (arr, field) in to_write.iter().zip(&schema.fields) {
///     for leaves in compute_leaves(field, arr).unwrap() {
///         worker_iter.next().unwrap().1.send(leaves).unwrap();
///     }
/// }
///
- /// // Finish up parallel column encoding
+ /// // Wait for the workers to complete encoding, and append
+ /// // the resulting column chunks to the row group (and the file)
/// for (handle, send) in workers {
///     drop(send); // Drop send side to signal termination
- ///     let chunk = handle.join().unwrap().unwrap();
- ///     chunk.append_to_row_group(&mut row_group).unwrap();
+ ///     // wait for the worker to finish and return the completed chunk
+ ///     let chunk: ArrowColumnChunk = handle.join().unwrap().unwrap();
+ ///     chunk.append_to_row_group(&mut row_group_writer).unwrap();
/// }
- /// row_group.close().unwrap();
+ /// // Close the row group, which writes the data to the underlying file
+ /// row_group_writer.close().unwrap();
///
/// let metadata = writer.close().unwrap();
/// assert_eq!(metadata.num_rows, 3);
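For contrast with the low-level interface this diff documents, here is a minimal sketch of the higher-level [`ArrowWriter`] path the note refers to. It is not part of this change; it assumes the `parquet` and `arrow-array` crates and writes the same two columns on the calling thread:

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Float32Array, Int32Array, RecordBatch};
use parquet::arrow::ArrowWriter;

fn main() {
    // Same two columns as the parallel example above
    let batch = RecordBatch::try_from_iter([
        ("i32", Arc::new(Int32Array::from_iter_values([1, 2, 3])) as ArrayRef),
        ("f32", Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as ArrayRef),
    ])
    .unwrap();

    // Write to memory for the sketch; this could be a File
    let mut out = Vec::with_capacity(1024);

    // ArrowWriter performs schema conversion, leaf computation, encoding
    // and row group management internally, single-threaded
    let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap();
    writer.write(&batch).unwrap();
    let metadata = writer.close().unwrap();
    assert_eq!(metadata.num_rows, 3);
}
```

The trade-off is that the low-level API can encode each column chunk on its own thread, at the cost of managing the workers, channels, and row groups by hand as shown in the diff above.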