@@ -69,6 +69,29 @@ use tokio::io::{AsyncWrite, AsyncWriteExt};
69
69
/// It is implemented based on the sync writer [`ArrowWriter`] with an inner buffer.
70
70
/// The buffered data will be flushed to the writer provided by caller when the
71
71
/// buffer's threshold is exceeded.
72
+ ///
73
+ /// ## Memory Limiting
74
+ ///
75
+ /// The nature of parquet forces buffering of an entire row group before it can be flushed
76
+ /// to the underlying writer. This buffering may exceed the configured buffer size
77
+ /// of [`AsyncArrowWriter`]. Memory usage can be limited by prematurely flushing the row group,
78
+ /// although this will have implications for file size and query performance. See [`ArrowWriter`]
79
+ /// for more information.
80
+ ///
81
+ /// ```no_run
82
+ /// # use tokio::fs::File;
83
+ /// # use arrow_array::RecordBatch;
84
+ /// # use parquet::arrow::AsyncArrowWriter;
85
+ /// # async fn test() {
86
+ /// let mut writer: AsyncArrowWriter<File> = todo!();
87
+ /// let batch: RecordBatch = todo!();
88
+ /// writer.write(&batch).await.unwrap();
89
+ /// // Trigger an early flush if buffered size exceeds 1_000_000
90
+ /// if writer.in_progress_size() > 1_000_000 {
91
+ /// writer.flush().await.unwrap()
92
+ /// }
93
+ /// # }
94
+ /// ```
72
95
pub struct AsyncArrowWriter < W > {
73
96
/// Underlying sync writer
74
97
sync_writer : ArrowWriter < SharedBuffer > ,
@@ -86,13 +109,10 @@ pub struct AsyncArrowWriter<W> {
86
109
impl < W : AsyncWrite + Unpin + Send > AsyncArrowWriter < W > {
87
110
/// Try to create a new Async Arrow Writer.
88
111
///
89
- /// `buffer_size` determines the number of bytes to buffer before flushing
90
- /// to the underlying [`AsyncWrite`]
91
- ///
92
- /// The intermediate buffer will automatically be resized if necessary
93
- ///
94
- /// [`Self::write`] will flush this intermediate buffer if it is at least
95
- /// half full
112
+ /// `buffer_size` determines the minimum number of bytes to buffer before flushing
113
+ /// to the underlying [`AsyncWrite`]. However, the nature of writing parquet may
114
+ /// force buffering of data in excess of this within the underlying [`ArrowWriter`].
115
+ /// See the documentation on [`ArrowWriter`] for more details.
96
116
pub fn try_new (
97
117
writer : W ,
98
118
arrow_schema : SchemaRef ,
@@ -105,13 +125,10 @@ impl<W: AsyncWrite + Unpin + Send> AsyncArrowWriter<W> {
105
125
106
126
/// Try to create a new Async Arrow Writer with [`ArrowWriterOptions`].
107
127
///
108
- /// `buffer_size` determines the number of bytes to buffer before flushing
109
- /// to the underlying [`AsyncWrite`]
110
- ///
111
- /// The intermediate buffer will automatically be resized if necessary
112
- ///
113
- /// [`Self::write`] will flush this intermediate buffer if it is at least
114
- /// half full
128
+ /// `buffer_size` determines the minimum number of bytes to buffer before flushing
129
+ /// to the underlying [`AsyncWrite`]. However, the nature of writing parquet may
130
+ /// force buffering of data in excess of this within the underlying [`ArrowWriter`].
131
+ /// See the documentation on [`ArrowWriter`] for more details.
115
132
pub fn try_new_with_options (
116
133
writer : W ,
117
134
arrow_schema : SchemaRef ,
0 commit comments