@@ -27,9 +27,8 @@ use arrow::{array::StructArray, error::ArrowError};
 
 use crate::arrow::array_reader::{build_array_reader, ArrayReader};
 use crate::arrow::schema::parquet_to_arrow_schema;
-use crate::arrow::schema::{
-    parquet_to_arrow_schema_by_columns, parquet_to_arrow_schema_by_root_columns,
-};
+use crate::arrow::schema::parquet_to_arrow_schema_by_columns;
+use crate::arrow::ProjectionMask;
 use crate::errors::Result;
 use crate::file::metadata::{KeyValue, ParquetMetaData};
 use crate::file::reader::FileReader;
@@ -44,15 +43,8 @@ pub trait ArrowReader {
     fn get_schema(&mut self) -> Result<Schema>;
 
     /// Read parquet schema and convert it into arrow schema.
-    /// This schema only includes columns identified by `column_indices`.
-    /// To select leaf columns (i.e. `a.b.c` instead of `a`), set `leaf_columns = true`
-    fn get_schema_by_columns<T>(
-        &mut self,
-        column_indices: T,
-        leaf_columns: bool,
-    ) -> Result<Schema>
-    where
-        T: IntoIterator<Item = usize>;
+    /// This schema only includes columns identified by `mask`.
+    fn get_schema_by_columns(&mut self, mask: ProjectionMask) -> Result<Schema>;
 
     /// Returns record batch reader from whole parquet file.
     ///
@@ -64,19 +56,17 @@ pub trait ArrowReader {
     fn get_record_reader(&mut self, batch_size: usize) -> Result<Self::RecordReader>;
 
     /// Returns record batch reader whose record batch contains columns identified by
-    /// `column_indices`.
+    /// `mask`.
     ///
     /// # Arguments
     ///
-    /// `column_indices`: The columns that should be included in record batches.
+    /// `mask`: The columns that should be included in record batches.
     /// `batch_size`: Please refer to `get_record_reader`.
-    fn get_record_reader_by_columns<T>(
+    fn get_record_reader_by_columns(
         &mut self,
-        column_indices: T,
+        mask: ProjectionMask,
         batch_size: usize,
-    ) -> Result<Self::RecordReader>
-    where
-        T: IntoIterator<Item = usize>;
+    ) -> Result<Self::RecordReader>;
 }
 
 #[derive(Debug, Clone, Default)]
@@ -118,59 +108,34 @@ impl ArrowReader for ParquetFileArrowReader {
         parquet_to_arrow_schema(file_metadata.schema_descr(), self.get_kv_metadata())
     }
 
-    fn get_schema_by_columns<T>(
-        &mut self,
-        column_indices: T,
-        leaf_columns: bool,
-    ) -> Result<Schema>
-    where
-        T: IntoIterator<Item = usize>,
-    {
+    fn get_schema_by_columns(&mut self, mask: ProjectionMask) -> Result<Schema> {
         let file_metadata = self.file_reader.metadata().file_metadata();
-        if leaf_columns {
-            parquet_to_arrow_schema_by_columns(
-                file_metadata.schema_descr(),
-                column_indices,
-                self.get_kv_metadata(),
-            )
-        } else {
-            parquet_to_arrow_schema_by_root_columns(
-                file_metadata.schema_descr(),
-                column_indices,
-                self.get_kv_metadata(),
-            )
-        }
+        parquet_to_arrow_schema_by_columns(
+            file_metadata.schema_descr(),
+            mask,
+            self.get_kv_metadata(),
+        )
     }
 
     fn get_record_reader(
         &mut self,
         batch_size: usize,
     ) -> Result<ParquetRecordBatchReader> {
-        let column_indices = 0..self
-            .file_reader
-            .metadata()
-            .file_metadata()
-            .schema_descr()
-            .num_columns();
-
-        self.get_record_reader_by_columns(column_indices, batch_size)
+        self.get_record_reader_by_columns(ProjectionMask::all(), batch_size)
     }
 
-    fn get_record_reader_by_columns<T>(
+    fn get_record_reader_by_columns(
         &mut self,
-        column_indices: T,
+        mask: ProjectionMask,
         batch_size: usize,
-    ) -> Result<ParquetRecordBatchReader>
-    where
-        T: IntoIterator<Item = usize>,
-    {
+    ) -> Result<ParquetRecordBatchReader> {
         let array_reader = build_array_reader(
             self.file_reader
                 .metadata()
                 .file_metadata()
                 .schema_descr_ptr(),
             Arc::new(self.get_schema()?),
-            column_indices,
+            mask,
             Box::new(self.file_reader.clone()),
         )?;
 
@@ -296,7 +261,7 @@ mod tests {
         IntervalDayTimeArrayConverter, LargeUtf8ArrayConverter, Utf8ArrayConverter,
     };
     use crate::arrow::schema::add_encoded_arrow_schema_to_metadata;
-    use crate::arrow::ArrowWriter;
+    use crate::arrow::{ArrowWriter, ProjectionMask};
    use crate::basic::{ConvertedType, Encoding, Repetition, Type as PhysicalType};
    use crate::column::writer::get_typed_column_writer_mut;
    use crate::data_type::{
@@ -351,12 +316,14 @@ mod tests {
         let parquet_file_reader =
             get_test_reader("parquet/generated_simple_numerics/blogs.parquet");
 
-        let max_len = parquet_file_reader.metadata().file_metadata().num_rows() as usize;
+        let file_metadata = parquet_file_reader.metadata().file_metadata();
+        let max_len = file_metadata.num_rows() as usize;
 
+        let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [2]);
         let mut arrow_reader = ParquetFileArrowReader::new(parquet_file_reader);
 
         let mut record_batch_reader = arrow_reader
-            .get_record_reader_by_columns(vec![2], 60)
+            .get_record_reader_by_columns(mask, 60)
             .expect("Failed to read into array!");
 
         // Verify that the schema was correctly parsed
@@ -1040,8 +1007,11 @@ mod tests {
         // (see: ARROW-11452)
         let testdata = arrow::util::test_util::parquet_test_data();
         let path = format!("{}/nested_structs.rust.parquet", testdata);
-        let parquet_file_reader =
-            SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap();
+        let file = File::open(&path).unwrap();
+        let parquet_file_reader = SerializedFileReader::try_from(file).unwrap();
+        let file_metadata = parquet_file_reader.metadata().file_metadata();
+        let schema = file_metadata.schema_descr_ptr();
+
         let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_file_reader));
         let record_batch_reader = arrow_reader
             .get_record_reader(60)
@@ -1051,12 +1021,11 @@ mod tests {
             batch.unwrap();
         }
 
+        let mask = ProjectionMask::leaves(&schema, [3, 8, 10]);
         let projected_reader = arrow_reader
-            .get_record_reader_by_columns(vec![3, 8, 10], 60)
-            .unwrap();
-        let projected_schema = arrow_reader
-            .get_schema_by_columns(vec![3, 8, 10], true)
+            .get_record_reader_by_columns(mask.clone(), 60)
             .unwrap();
+        let projected_schema = arrow_reader.get_schema_by_columns(mask).unwrap();
 
         let expected_schema = Schema::new(vec![
             Field::new(
@@ -1139,8 +1108,11 @@ mod tests {
         }
 
         let file_reader = Arc::new(SerializedFileReader::new(file).unwrap());
+        let file_metadata = file_reader.metadata().file_metadata();
+        let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]);
+
         let mut batch = ParquetFileArrowReader::new(file_reader);
-        let reader = batch.get_record_reader_by_columns(vec![0], 1024).unwrap();
+        let reader = batch.get_record_reader_by_columns(mask, 1024).unwrap();
 
         let expected_schema = arrow::datatypes::Schema::new(vec![Field::new(
             "group",
@@ -1178,7 +1150,7 @@ mod tests {
         let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader));
 
         let mut record_batch_reader = arrow_reader
-            .get_record_reader_by_columns(vec![0], 10)
+            .get_record_reader_by_columns(ProjectionMask::all(), 10)
             .unwrap();
 
         let error = record_batch_reader.next().unwrap().unwrap_err();
@@ -1414,10 +1386,13 @@ mod tests {
         let path = format!("{}/alltypes_plain.parquet", testdata);
         let file = File::open(&path).unwrap();
         let reader = SerializedFileReader::try_from(file).unwrap();
-        let expected_rows = reader.metadata().file_metadata().num_rows() as usize;
+        let file_metadata = reader.metadata().file_metadata();
+        let expected_rows = file_metadata.num_rows() as usize;
+        let schema = file_metadata.schema_descr_ptr();
 
         let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(reader));
-        let batch_reader = arrow_reader.get_record_reader_by_columns([], 2).unwrap();
+        let mask = ProjectionMask::leaves(&schema, []);
+        let batch_reader = arrow_reader.get_record_reader_by_columns(mask, 2).unwrap();
 
         let mut total_rows = 0;
         for maybe_batch in batch_reader {
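Taken together, these changes collapse the old `column_indices`/`leaf_columns` parameters into a single `ProjectionMask` argument: `ProjectionMask::leaves` selects leaf columns by index against the file's `SchemaDescriptor`, and `ProjectionMask::all()` keeps every column. A minimal end-to-end sketch of the new API, mirroring the test code in this diff; the `data.parquet` path and the column index `0` are placeholders, and this assumes a parquet crate build that includes this change:

```rust
use std::fs::File;
use std::sync::Arc;

use parquet::arrow::{ArrowReader, ParquetFileArrowReader, ProjectionMask};
use parquet::file::reader::SerializedFileReader;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder path; any local parquet file works.
    let file = File::open("data.parquet")?;
    let file_reader = Arc::new(SerializedFileReader::new(file)?);

    // Build a mask selecting leaf column 0 against the parquet schema;
    // ProjectionMask::all() would select every column instead.
    let file_metadata = file_reader.metadata().file_metadata();
    let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]);

    let mut arrow_reader = ParquetFileArrowReader::new(file_reader);

    // Batches of up to 1024 rows containing only the masked columns.
    let reader = arrow_reader.get_record_reader_by_columns(mask, 1024)?;
    for batch in reader {
        println!("read batch with {} rows", batch?.num_rows());
    }
    Ok(())
}
```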