1
- use std:: sync:: Arc ;
2
-
3
1
use crate :: {
4
2
schema:: { DataType , Schema , StructField } ,
5
3
DeltaResult ,
6
4
} ;
7
5
8
6
pub trait ColumnarBatch {
7
+ type Column : ColumnVector ;
8
+
9
9
/// Get the schema of the batch.
10
10
fn schema ( & self ) -> DeltaResult < Schema > ;
11
11
12
12
/// Get the column at the specified index.
13
- fn column ( & self , index : usize ) -> & dyn ColumnVector ;
13
+ fn column ( & self , index : usize ) -> Self :: Column ;
14
14
15
15
/// Number of rows in the batch.
16
16
fn size ( & self ) -> usize ;
17
17
18
- fn with_column (
19
- & self ,
20
- index : usize ,
21
- field : StructField ,
22
- column : Arc < dyn ColumnVector > ,
23
- ) -> DeltaResult < Self >
24
- where
25
- Self : Sized ;
18
+ // fn with_column(
19
+ // &self,
20
+ // index: usize,
21
+ // field: StructField,
22
+ // column: & dyn ColumnVector,
23
+ // ) -> DeltaResult<Self>
24
+ // where
25
+ // Self: Sized;
26
26
27
27
fn with_deleted_column_at ( & self , index : usize ) -> DeltaResult < Self >
28
28
where
29
29
Self : Sized ;
30
30
31
- fn with_schema ( & self , schema : Schema ) -> DeltaResult < Self >
32
- where
33
- Self : Sized ;
31
+ // fn with_schema(&self, schema: Schema) -> DeltaResult<Self>
32
+ // where
33
+ // Self: Sized;
34
34
35
35
fn slice ( & self , offset : usize , length : usize ) -> DeltaResult < Self >
36
36
where
37
37
Self : Sized ;
38
38
39
- fn rows ( & self ) -> Box < dyn Iterator < Item = & dyn Row > > ;
39
+ fn rows ( & self ) -> Box < dyn Iterator < Item = Box < dyn Row < Column = Self :: Column > > > > ;
40
40
}
41
41
42
42
// TODO: do all these methods do bounds checking? Should we offer alternative
@@ -50,43 +50,54 @@ pub trait ColumnVector {
50
50
fn is_null ( & self , i : usize ) -> bool ;
51
51
fn get_i32 ( & self , i : usize ) -> DeltaResult < Option < i32 > > ;
52
52
fn get_string ( & self , i : usize ) -> DeltaResult < Option < & str > > ;
53
- fn get_struct ( & self , i : usize ) -> DeltaResult < Option < & dyn Row > > ;
54
- fn get_array ( & self , i : usize ) -> DeltaResult < Option < & dyn ArrayValue > > ;
55
- fn get_map ( & self , i : usize ) -> DeltaResult < Option < & dyn MapValue > > ;
53
+ fn get_struct ( & self , i : usize ) -> DeltaResult < Option < Box < dyn Row < Column = Self > > > > ;
54
+ fn get_array ( & self , i : usize ) -> DeltaResult < Option < Box < dyn ArrayValue < Column = Self > > > > ;
55
+ fn get_map ( & self , i : usize ) -> DeltaResult < Option < Box < dyn MapValue < Column = Self > > > > ;
56
56
}
57
57
58
58
pub trait Row {
59
+ type Column : ColumnVector ;
60
+
59
61
fn schema ( & self ) -> DeltaResult < Schema > ;
60
62
fn is_null ( & self , i : usize ) -> bool ;
61
63
fn get_i32 ( & self , i : usize ) -> DeltaResult < Option < i32 > > ;
62
64
fn get_string ( & self , i : usize ) -> DeltaResult < Option < & str > > ;
63
- fn get_struct ( & self , i : usize ) -> DeltaResult < Option < & dyn Row > > ;
64
- fn get_array ( & self , i : usize ) -> DeltaResult < Option < & dyn ArrayValue > > ;
65
- fn get_map ( & self , i : usize ) -> DeltaResult < Option < & dyn MapValue > > ;
65
+ fn get_struct ( & self , i : usize ) -> DeltaResult < Option < Box < dyn Row < Column = Self :: Column > > > > ;
66
+ fn get_array (
67
+ & self ,
68
+ i : usize ,
69
+ ) -> DeltaResult < Option < Box < dyn ArrayValue < Column = Self :: Column > > > > ;
70
+ fn get_map ( & self , i : usize ) -> DeltaResult < Option < Box < dyn MapValue < Column = Self :: Column > > > > ;
66
71
}
67
72
68
73
// Based on: https://github.com/delta-io/delta/pull/2087
69
74
70
75
pub trait ArrayValue {
76
+ type Column : ColumnVector ;
77
+
71
78
/// Return the number of elements in the array
72
79
fn size ( & self ) -> usize ;
73
80
74
81
/// Get the elements in the array
75
- fn elements ( & self ) -> & dyn ColumnVector ;
82
+ fn elements ( & self ) -> Self :: Column ;
76
83
}
77
84
78
85
pub trait MapValue {
86
+ type Column : ColumnVector ;
87
+
79
88
/// Return the number of elements in the map
80
89
fn size ( & self ) -> usize ;
81
90
82
91
/// Get the keys in the map
83
- fn keys ( & self ) -> & dyn ColumnVector ;
92
+ fn keys ( & self ) -> Self :: Column ;
84
93
85
94
/// Get the values in the map
86
- fn values ( & self ) -> & dyn ColumnVector ;
95
+ fn values ( & self ) -> Self :: Column ;
87
96
}
88
97
89
- mod arrow {
98
+ pub mod arrow {
99
+ use std:: sync:: Arc ;
100
+
90
101
use arrow_array:: cast:: AsArray ;
91
102
use arrow_array:: types:: Int32Type ;
92
103
use arrow_array:: Array as ArrowArray ;
@@ -99,33 +110,67 @@ mod arrow {
99
110
use super :: * ;
100
111
101
112
impl ColumnarBatch for RecordBatch {
113
+ type Column = Arc < dyn ArrowArray > ;
114
+
102
115
fn schema ( & self ) -> DeltaResult < Schema > {
103
116
Ok ( self . schema ( ) . as_ref ( ) . try_into ( ) ?)
104
117
}
105
118
106
- fn column ( & self , index : usize ) -> & dyn ColumnVector {
107
- & self . column ( index) . as_ref ( ) as & dyn ColumnVector
119
+ fn column ( & self , index : usize ) -> Self :: Column {
120
+ self . column ( index) . clone ( )
121
+ }
122
+
123
+ fn size ( & self ) -> usize {
124
+ self . num_rows ( )
125
+ }
126
+
127
+ fn with_deleted_column_at ( & self , index : usize ) -> DeltaResult < Self >
128
+ where
129
+ Self : Sized ,
130
+ {
131
+ let indices = ( 0 ..self . num_columns ( ) )
132
+ . filter ( |i| * i != index)
133
+ . collect :: < Vec < _ > > ( ) ;
134
+ RecordBatch :: project ( & self , & indices) . map_err ( |err| Error :: Arrow ( err) )
135
+ }
136
+
137
+ fn slice ( & self , offset : usize , length : usize ) -> DeltaResult < Self >
138
+ where
139
+ Self : Sized ,
140
+ {
141
+ Ok ( RecordBatch :: slice ( self , offset, length) )
142
+ }
143
+
144
+ fn rows ( & self ) -> Box < dyn Iterator < Item = Box < dyn Row < Column = Self :: Column > > > > {
145
+ let batch = self . clone ( ) ;
146
+ Box :: new ( ( 0 ..self . size ( ) ) . into_iter ( ) . map ( move |i| {
147
+ let row = Box :: new ( ArrowRow {
148
+ batch : batch. clone ( ) ,
149
+ row_index : i,
150
+ } ) ;
151
+ row as Box < dyn Row < Column = Self :: Column > >
152
+ } ) )
108
153
}
109
154
}
110
155
111
- impl ColumnVector for dyn ArrowArray {
156
+ impl ColumnVector for Arc < dyn ArrowArray > {
112
157
/// Convert the Arrow data type of this column into the kernel `DataType`.
fn data_type(&self) -> DeltaResult<DataType> {
    let arrow_type = self.as_ref().data_type();
    Ok(arrow_type.try_into()?)
}
115
160
116
161
fn size ( & self ) -> usize {
117
162
self . len ( )
118
163
}
119
164
120
165
fn is_null ( & self , i : usize ) -> bool {
121
- self . is_null ( i)
166
+ self . as_ref ( ) . is_null ( i)
122
167
}
123
168
124
169
/// Get the i32 value at the specified index.
125
170
///
126
171
/// This will panic if the column is not boolean or if the index is out of bounds.
127
172
fn get_i32 ( & self , i : usize ) -> DeltaResult < Option < i32 > > {
128
- if self . is_null ( i) {
173
+ if self . as_ref ( ) . is_null ( i) {
129
174
Ok ( None )
130
175
} else {
131
176
Ok ( Some ( self . as_primitive :: < Int32Type > ( ) . value ( i) ) )
@@ -136,10 +181,10 @@ mod arrow {
136
181
///
137
182
/// This will panic if the column is not string or if the index is out of bounds.
138
183
fn get_string ( & self , i : usize ) -> DeltaResult < Option < & str > > {
139
- if self . is_null ( i) {
184
+ if self . as_ref ( ) . is_null ( i) {
140
185
Ok ( None )
141
186
} else {
142
- match self . data_type ( ) {
187
+ match self . as_ref ( ) . data_type ( ) {
143
188
ArrowDataType :: Utf8 => Ok ( Some ( self . as_string :: < i32 > ( ) . value ( i) ) ) ,
144
189
ArrowDataType :: LargeUtf8 => Ok ( Some ( self . as_string :: < i64 > ( ) . value ( i) ) ) ,
145
190
_ => panic ! ( "get_string called on non-string column" ) ,
@@ -150,79 +195,94 @@ mod arrow {
150
195
/// Get the struct value at the specified index.
151
196
///
152
197
/// This will panic if the column is not struct or if the index is out of bounds.
153
- fn get_struct ( & self , i : usize ) -> DeltaResult < Option < & dyn Row > > {
154
- if self . is_null ( i) {
198
+ fn get_struct ( & self , i : usize ) -> DeltaResult < Option < Box < dyn Row < Column = Self > > > > {
199
+ if self . as_ref ( ) . is_null ( i) {
155
200
Ok ( None )
156
201
} else {
157
202
let batch = self
158
203
. as_struct_opt ( )
159
- . expect ( "get_struct called on non-struct column" ) ;
204
+ . expect ( "get_struct called on non-struct column" )
205
+ . clone ( ) ;
160
206
let row = ArrowRow {
161
207
batch,
162
208
row_index : i,
163
209
} ;
164
- Ok ( Some ( & row) )
210
+ Ok ( Some ( Box :: new ( row) ) )
165
211
}
166
212
}
167
213
168
214
/// Get the array value at the specified index.
169
215
///
170
216
/// This will panic if the column is not array or if the index is out of bounds.
171
- fn get_array ( & self , i : usize ) -> DeltaResult < Option < & dyn ArrayValue > > {
172
- if self . is_null ( i) {
217
+ fn get_array ( & self , i : usize ) -> DeltaResult < Option < Box < dyn ArrayValue < Column = Self > > > > {
218
+ if self . as_ref ( ) . is_null ( i) {
173
219
Ok ( None )
174
220
} else {
175
- let batch = self
176
- . as_list_opt ( )
177
- . expect ( "get_array called on non-array column" ) ;
178
- let array = ArrowArray {
179
- array : batch. value ( i) ,
221
+ let sub_array = match self . as_ref ( ) . data_type ( ) {
222
+ ArrowDataType :: List ( _) => self . as_list_opt :: < i32 > ( ) . unwrap ( ) . value ( i) ,
223
+ ArrowDataType :: LargeList ( _) => self . as_list_opt :: < i32 > ( ) . unwrap ( ) . value ( i) ,
224
+ _ => panic ! ( "get_array called on non-array column" ) ,
180
225
} ;
181
- Ok ( Some ( & array) )
226
+ Ok ( Some ( Box :: new ( ArrowArraySlice ( sub_array) ) ) )
227
+ }
228
+ }
229
+
230
+ fn get_map ( & self , i : usize ) -> DeltaResult < Option < Box < dyn MapValue < Column = Self > > > > {
231
+ if self . as_ref ( ) . is_null ( i) {
232
+ Ok ( None )
233
+ } else {
234
+ let arr = self . as_map ( ) . value ( i) ;
235
+ let map_array = ArrowMapValue {
236
+ keys : arr. column ( 0 ) . clone ( ) ,
237
+ values : arr. column ( 1 ) . clone ( ) ,
238
+ } ;
239
+ Ok ( Some ( Box :: new ( map_array) ) )
182
240
}
183
241
}
184
242
}
185
243
186
- trait ArrowTabular {
187
- fn schema ( & self ) -> & ArrowSchema ;
188
- fn column ( & self , index : usize ) -> & dyn ArrowArray ;
244
+ pub trait ArrowTabular {
245
+ fn schema ( & self ) -> Arc < ArrowSchema > ;
246
+ fn column ( & self , index : usize ) -> & Arc < dyn ArrowArray > ;
189
247
}
190
248
191
249
impl ArrowTabular for RecordBatch {
192
- fn schema ( & self ) -> & ArrowSchema {
193
- self . schema ( ) . as_ref ( )
250
+ fn schema ( & self ) -> Arc < ArrowSchema > {
251
+ self . schema ( )
194
252
}
195
253
196
- fn column ( & self , index : usize ) -> & dyn ArrowArray {
197
- self . column ( index) . as_ref ( )
254
+ fn column ( & self , index : usize ) -> & Arc < dyn ArrowArray > {
255
+ & self . column ( index)
198
256
}
199
257
}
200
258
201
259
impl ArrowTabular for StructArray {
202
- fn schema ( & self ) -> & ArrowSchema {
203
- self . schema ( )
260
+ fn schema ( & self ) -> Arc < ArrowSchema > {
261
+ Arc :: new ( ArrowSchema :: new ( self . fields ( ) . clone ( ) ) )
204
262
}
205
263
206
- fn column ( & self , index : usize ) -> & dyn ArrowArray {
207
- self . column ( index) . as_ref ( )
264
+ fn column ( & self , index : usize ) -> & Arc < dyn ArrowArray > {
265
+ & self . column ( index)
208
266
}
209
267
}
210
268
211
269
/// A reference to a row in a RecordBatch or StructArray.
212
- pub struct ArrowRow < ' a , T : ArrowTabular > {
213
- batch : & ' a T ,
270
+ pub struct ArrowRow < T : ArrowTabular > {
271
+ batch : T ,
214
272
row_index : usize ,
215
273
}
216
274
217
- impl < ' a , T : ArrowTabular > Row for ArrowRow < ' a , T > {
275
+ impl < T : ArrowTabular > Row for ArrowRow < T > {
276
+ type Column = Arc < dyn ArrowArray > ;
277
+
218
278
fn schema ( & self ) -> DeltaResult < Schema > {
219
- ArrowTabular :: schema ( self . batch )
279
+ ArrowTabular :: schema ( & self . batch )
220
280
. try_into ( )
221
281
. map_err ( |err| Error :: Arrow ( err) )
222
282
}
223
283
224
284
fn is_null ( & self , i : usize ) -> bool {
225
- self . batch . column ( i) . is_null ( self . row_index )
285
+ ArrowArray :: is_null ( self . batch . column ( i) , self . row_index )
226
286
}
227
287
228
288
fn get_i32 ( & self , i : usize ) -> DeltaResult < Option < i32 > > {
@@ -233,24 +293,57 @@ mod arrow {
233
293
self . batch . column ( i) . get_string ( self . row_index )
234
294
}
235
295
236
- fn get_struct ( & self , i : usize ) -> DeltaResult < Option < & dyn Row > > {
296
+ fn get_struct ( & self , i : usize ) -> DeltaResult < Option < Box < dyn Row < Column = Self :: Column > > > > {
237
297
self . batch . column ( i) . get_struct ( self . row_index )
238
298
}
239
299
240
- fn get_array ( & self , i : usize ) -> DeltaResult < Option < & dyn ArrayValue > > {
300
+ fn get_array (
301
+ & self ,
302
+ i : usize ,
303
+ ) -> DeltaResult < Option < Box < dyn ArrayValue < Column = Self :: Column > > > > {
241
304
self . batch . column ( i) . get_array ( self . row_index )
242
305
}
243
306
244
- fn get_map ( & self , i : usize ) -> DeltaResult < Option < & dyn MapValue > > {
307
+ fn get_map (
308
+ & self ,
309
+ i : usize ,
310
+ ) -> DeltaResult < Option < Box < dyn MapValue < Column = Self :: Column > > > > {
245
311
self . batch . column ( i) . get_map ( self . row_index )
246
312
}
247
313
}
248
314
249
- pub struct ArrowArraySlice < ' a > {
250
- batch : & ' a dyn ArrowArray ,
251
- offset : usize ,
252
- length : usize ,
315
+ pub struct ArrowArraySlice ( Arc < dyn ArrowArray > ) ;
316
+
317
+ impl ArrayValue for ArrowArraySlice {
318
+ type Column = Arc < dyn ArrowArray > ;
319
+
320
+ fn size ( & self ) -> usize {
321
+ self . 0 . len ( )
322
+ }
323
+
324
+ fn elements ( & self ) -> Self :: Column {
325
+ self . 0 . clone ( )
326
+ }
327
+ }
328
+
329
+ pub struct ArrowMapValue {
330
+ keys : Arc < dyn ArrowArray > ,
331
+ values : Arc < dyn ArrowArray > ,
253
332
}
254
333
255
-
334
+ impl MapValue for ArrowMapValue {
335
+ type Column = Arc < dyn ArrowArray > ;
336
+
337
+ fn size ( & self ) -> usize {
338
+ self . keys . len ( )
339
+ }
340
+
341
+ fn keys ( & self ) -> Self :: Column {
342
+ self . keys . clone ( )
343
+ }
344
+
345
+ fn values ( & self ) -> Self :: Column {
346
+ self . values . clone ( )
347
+ }
348
+ }
256
349
}
0 commit comments