@@ -31,9 +31,8 @@ use arrow_schema::{
 use arrow_string::like::starts_with;
 use bytes::Bytes;
 use fnv::FnvHashSet;
-use futures::channel::mpsc::{channel, Sender};
 use futures::future::BoxFuture;
-use futures::{try_join, FutureExt, SinkExt, StreamExt, TryFutureExt, TryStreamExt};
+use futures::{try_join, FutureExt, StreamExt, TryFutureExt, TryStreamExt};
 use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter, RowSelection};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask, PARQUET_FIELD_ID_META_KEY};
@@ -48,7 +47,6 @@ use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator;
 use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator;
 use crate::expr::{BoundPredicate, BoundReference};
 use crate::io::{FileIO, FileMetadata, FileRead};
-use crate::runtime::spawn;
 use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
 use crate::spec::{Datum, PrimitiveType, Schema};
 use crate::utils::available_parallelism;
@@ -130,62 +128,41 @@ pub struct ArrowReader {
 impl ArrowReader {
     /// Take a stream of FileScanTasks and reads all the files.
     /// Returns a stream of Arrow RecordBatches containing the data from the files
-    pub fn read(self, tasks: FileScanTaskStream) -> Result<ArrowRecordBatchStream> {
+    pub async fn read(self, tasks: FileScanTaskStream) -> Result<ArrowRecordBatchStream> {
         let file_io = self.file_io.clone();
         let batch_size = self.batch_size;
         let concurrency_limit_data_files = self.concurrency_limit_data_files;
         let row_group_filtering_enabled = self.row_group_filtering_enabled;
         let row_selection_enabled = self.row_selection_enabled;
 
-        let (tx, rx) = channel(concurrency_limit_data_files);
-        let mut channel_for_error = tx.clone();
-
-        spawn(async move {
-            let result = tasks
-                .map(|task| Ok((task, file_io.clone(), tx.clone())))
-                .try_for_each_concurrent(
-                    concurrency_limit_data_files,
-                    |(file_scan_task, file_io, tx)| async move {
-                        match file_scan_task {
-                            Ok(task) => {
-                                let file_path = task.data_file_path.to_string();
-
-                                spawn(async move {
-                                    Self::process_file_scan_task(
-                                        task,
-                                        batch_size,
-                                        file_io,
-                                        tx,
-                                        row_group_filtering_enabled,
-                                        row_selection_enabled,
-                                    )
-                                    .await
-                                })
-                                .await
-                                .map_err(|e| e.with_context("file_path", file_path))
-                            }
-                            Err(err) => Err(err),
-                        }
-                    },
-                )
-                .await;
+        let stream = tasks
+            .map_ok(move |task| {
+                let file_io = file_io.clone();
 
-            if let Err(error) = result {
-                let _ = channel_for_error.send(Err(error)).await;
-            }
-        });
+                Self::process_file_scan_task(
+                    task,
+                    batch_size,
+                    file_io,
+                    row_group_filtering_enabled,
+                    row_selection_enabled,
+                )
+            })
+            .map_err(|err| {
+                Error::new(ErrorKind::Unexpected, "file scan task generate failed").with_source(err)
+            })
+            .try_buffer_unordered(concurrency_limit_data_files)
+            .try_flatten_unordered(concurrency_limit_data_files);
 
-        return Ok(rx.boxed());
+        Ok(Box::pin(stream) as ArrowRecordBatchStream)
     }
 
     async fn process_file_scan_task(
         task: FileScanTask,
         batch_size: Option<usize>,
         file_io: FileIO,
-        mut tx: Sender<Result<RecordBatch>>,
         row_group_filtering_enabled: bool,
         row_selection_enabled: bool,
-    ) -> Result<()> {
+    ) -> Result<ArrowRecordBatchStream> {
         // Get the metadata for the Parquet file we need to read and build
         // a reader for the data within
         let parquet_file = file_io.new_input(&task.data_file_path)?;
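The hunk above drops the channel-plus-`spawn` fan-out in favor of a pure stream-combinator pipeline: `map_ok` turns each task into a future that opens a per-task stream, `try_buffer_unordered` bounds how many of those futures run at once, and `try_flatten_unordered` interleaves the resulting streams. Below is a minimal, self-contained sketch of that pattern against the `futures` crate alone; `process_task`, `Item`, and the limit of 2 are hypothetical stand-ins for `process_file_scan_task`, the record-batch results, and `concurrency_limit_data_files`.

```rust
use futures::executor::block_on;
use futures::stream::{self, BoxStream, StreamExt, TryStreamExt};

type Item = Result<u64, String>;

// Stand-in for `process_file_scan_task`: one "task" resolves to a
// fallible stream of items.
async fn process_task(id: u64) -> Result<BoxStream<'static, Item>, String> {
    Ok(stream::iter((0..3).map(move |i| Ok(id * 10 + i))).boxed())
}

fn main() {
    block_on(async {
        // A fallible stream of tasks, playing the role of `FileScanTaskStream`.
        let tasks = stream::iter((0..4).map(Ok::<u64, String>));

        let merged = tasks
            // Build (but do not yet await) the future that opens each task's stream.
            .map_ok(process_task)
            // Drive up to 2 of those futures concurrently, yielding each
            // per-task stream as soon as it is ready.
            .try_buffer_unordered(2)
            // Interleave items from up to 2 ready streams at a time.
            .try_flatten_unordered(2);

        // Any task error or item error surfaces as the next `Err` item
        // and short-circuits the collect.
        let results: Vec<u64> = merged.try_collect().await.unwrap();
        println!("{results:?}"); // ordering across tasks is nondeterministic
    });
}
```

One consequence worth noting: because errors now travel through the stream itself, the old `channel_for_error` side path has no equivalent here; a failure simply shows up as the next `Err` item.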
@@ -269,14 +246,15 @@ impl ArrowReader {
 
         // Build the batch stream and send all the RecordBatches that it generates
         // to the requester.
-        let mut record_batch_stream = record_batch_stream_builder.build()?;
-
-        while let Some(batch) = record_batch_stream.try_next().await? {
-            tx.send(record_batch_transformer.process_record_batch(batch))
-                .await?
-        }
-
-        Ok(())
+        let record_batch_stream =
+            record_batch_stream_builder
+                .build()?
+                .map(move |batch| match batch {
+                    Ok(batch) => record_batch_transformer.process_record_batch(batch),
+                    Err(err) => Err(err.into()),
+                });
+
+        Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream)
     }
 
     fn build_field_id_set_and_map(
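The second half of the change has `process_file_scan_task` return a boxed stream instead of pushing batches into a `tx` channel. Here is a sketch of that tail-end shape, where `Batch`, `ParquetError`, `CrateError`, `process`, and `BatchStream` are hypothetical stand-ins for `RecordBatch`, the parquet stream's error type, the crate's `Error`, `RecordBatchTransformer::process_record_batch`, and `ArrowRecordBatchStream`:

```rust
use std::pin::Pin;

use futures::executor::block_on;
use futures::stream::{self, Stream, StreamExt};

type Batch = Vec<i32>;

// Hypothetical error types: the source stream's error and the crate's.
#[derive(Debug)]
struct ParquetError(String);
#[derive(Debug)]
struct CrateError(String);

impl From<ParquetError> for CrateError {
    fn from(e: ParquetError) -> Self {
        CrateError(e.0)
    }
}

// Boxed-stream alias playing the role of `ArrowRecordBatchStream`.
type BatchStream = Pin<Box<dyn Stream<Item = Result<Batch, CrateError>> + Send>>;

// Stand-in for `RecordBatchTransformer::process_record_batch`.
fn process(batch: Batch) -> Result<Batch, CrateError> {
    Ok(batch.into_iter().map(|v| v + 1).collect())
}

fn build_stream() -> BatchStream {
    // A source stream whose error type differs from the crate's, like the
    // parquet reader's stream inside `process_file_scan_task`.
    let source = stream::iter(vec![Ok::<Batch, ParquetError>(vec![1, 2]), Ok(vec![3])]);

    // Transform `Ok` batches and convert `Err` into the crate error type:
    // the same shape as the `.map(move |batch| match batch { .. })` above.
    let mapped = source.map(|batch| match batch {
        Ok(batch) => process(batch),
        Err(err) => Err(err.into()),
    });

    // Coerce the concrete combinator type into the boxed alias.
    Box::pin(mapped) as BatchStream
}

fn main() {
    let batches = block_on(build_stream().collect::<Vec<_>>());
    println!("{batches:?}"); // [Ok([2, 3]), Ok([4])]
}
```

The `Box::pin(...) as BatchStream` cast performs the unsizing coercion from the concrete `Map` combinator to the `Pin<Box<dyn Stream ...>>` alias, mirroring `Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream)` in the diff.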