@@ -42,7 +42,7 @@ use crate::error::Result;
42
42
use crate :: execution:: context:: SessionState ;
43
43
use crate :: physical_plan:: { ExecutionPlan , Statistics } ;
44
44
45
- use arrow_schema:: { DataType , Field , Schema } ;
45
+ use arrow_schema:: { DataType , Field , FieldRef , Schema } ;
46
46
use datafusion_common:: file_options:: file_type:: FileType ;
47
47
use datafusion_common:: { internal_err, not_impl_err, GetExt } ;
48
48
use datafusion_expr:: Expr ;
@@ -235,20 +235,26 @@ pub fn file_type_to_format(
235
235
}
236
236
}
237
237
238
+ /// Create a new field with the specified data type, copying the other
239
+ /// properties from the input field
240
+ fn field_with_new_type ( field : & FieldRef , new_type : DataType ) -> FieldRef {
241
+ Arc :: new ( field. as_ref ( ) . clone ( ) . with_data_type ( new_type) )
242
+ }
243
+
238
244
/// Transform a schema to use view types for Utf8 and Binary
245
+ ///
246
+ /// See [parquet::ParquetFormat::force_view_types] for details
239
247
pub fn transform_schema_to_view ( schema : & Schema ) -> Schema {
240
248
let transformed_fields: Vec < Arc < Field > > = schema
241
249
. fields
242
250
. iter ( )
243
251
. map ( |field| match field. data_type ( ) {
244
- DataType :: Utf8 | DataType :: LargeUtf8 => Arc :: new (
245
- Field :: new ( field. name ( ) , DataType :: Utf8View , field. is_nullable ( ) )
246
- . with_metadata ( field. metadata ( ) . to_owned ( ) ) ,
247
- ) ,
248
- DataType :: Binary | DataType :: LargeBinary => Arc :: new (
249
- Field :: new ( field. name ( ) , DataType :: BinaryView , field. is_nullable ( ) )
250
- . with_metadata ( field. metadata ( ) . to_owned ( ) ) ,
251
- ) ,
252
+ DataType :: Utf8 | DataType :: LargeUtf8 => {
253
+ field_with_new_type ( field, DataType :: Utf8View )
254
+ }
255
+ DataType :: Binary | DataType :: LargeBinary => {
256
+ field_with_new_type ( field, DataType :: BinaryView )
257
+ }
252
258
_ => field. clone ( ) ,
253
259
} )
254
260
. collect ( ) ;
@@ -274,6 +280,7 @@ pub(crate) fn coerce_file_schema_to_view_type(
274
280
( f. name ( ) , dt)
275
281
} )
276
282
. collect ( ) ;
283
+
277
284
if !transform {
278
285
return None ;
279
286
}
@@ -283,14 +290,13 @@ pub(crate) fn coerce_file_schema_to_view_type(
283
290
. iter ( )
284
291
. map (
285
292
|field| match ( table_fields. get ( field. name ( ) ) , field. data_type ( ) ) {
286
- ( Some ( DataType :: Utf8View ) , DataType :: Utf8 )
287
- | ( Some ( DataType :: Utf8View ) , DataType :: LargeUtf8 ) => Arc :: new (
288
- Field :: new ( field. name ( ) , DataType :: Utf8View , field. is_nullable ( ) ) ,
289
- ) ,
290
- ( Some ( DataType :: BinaryView ) , DataType :: Binary )
291
- | ( Some ( DataType :: BinaryView ) , DataType :: LargeBinary ) => Arc :: new (
292
- Field :: new ( field. name ( ) , DataType :: BinaryView , field. is_nullable ( ) ) ,
293
- ) ,
293
+ ( Some ( DataType :: Utf8View ) , DataType :: Utf8 | DataType :: LargeUtf8 ) => {
294
+ field_with_new_type ( field, DataType :: Utf8View )
295
+ }
296
+ (
297
+ Some ( DataType :: BinaryView ) ,
298
+ DataType :: Binary | DataType :: LargeBinary ,
299
+ ) => field_with_new_type ( field, DataType :: BinaryView ) ,
294
300
_ => field. clone ( ) ,
295
301
} ,
296
302
)
@@ -302,6 +308,78 @@ pub(crate) fn coerce_file_schema_to_view_type(
302
308
) )
303
309
}
304
310
311
+ /// Transform a schema so that any binary types are strings
312
+ pub fn transform_binary_to_string ( schema : & Schema ) -> Schema {
313
+ let transformed_fields: Vec < Arc < Field > > = schema
314
+ . fields
315
+ . iter ( )
316
+ . map ( |field| match field. data_type ( ) {
317
+ DataType :: Binary => field_with_new_type ( field, DataType :: Utf8 ) ,
318
+ DataType :: LargeBinary => field_with_new_type ( field, DataType :: LargeUtf8 ) ,
319
+ DataType :: BinaryView => field_with_new_type ( field, DataType :: Utf8View ) ,
320
+ _ => field. clone ( ) ,
321
+ } )
322
+ . collect ( ) ;
323
+ Schema :: new_with_metadata ( transformed_fields, schema. metadata . clone ( ) )
324
+ }
325
+
326
+ /// If the table schema uses a string type, coerce the file schema to use a string type.
327
+ ///
328
+ /// See [parquet::ParquetFormat::binary_as_string] for details
329
+ pub ( crate ) fn coerce_file_schema_to_string_type (
330
+ table_schema : & Schema ,
331
+ file_schema : & Schema ,
332
+ ) -> Option < Schema > {
333
+ let mut transform = false ;
334
+ let table_fields: HashMap < _ , _ > = table_schema
335
+ . fields
336
+ . iter ( )
337
+ . map ( |f| ( f. name ( ) , f. data_type ( ) ) )
338
+ . collect ( ) ;
339
+ let transformed_fields: Vec < Arc < Field > > = file_schema
340
+ . fields
341
+ . iter ( )
342
+ . map (
343
+ |field| match ( table_fields. get ( field. name ( ) ) , field. data_type ( ) ) {
344
+ // table schema uses string type, coerce the file schema to use string type
345
+ (
346
+ Some ( DataType :: Utf8 ) ,
347
+ DataType :: Binary | DataType :: LargeBinary | DataType :: BinaryView ,
348
+ ) => {
349
+ transform = true ;
350
+ field_with_new_type ( field, DataType :: Utf8 )
351
+ }
352
+ // table schema uses large string type, coerce the file schema to use large string type
353
+ (
354
+ Some ( DataType :: LargeUtf8 ) ,
355
+ DataType :: Binary | DataType :: LargeBinary | DataType :: BinaryView ,
356
+ ) => {
357
+ transform = true ;
358
+ field_with_new_type ( field, DataType :: LargeUtf8 )
359
+ }
360
+ // table schema uses string view type, coerce the file schema to use view type
361
+ (
362
+ Some ( DataType :: Utf8View ) ,
363
+ DataType :: Binary | DataType :: LargeBinary | DataType :: BinaryView ,
364
+ ) => {
365
+ transform = true ;
366
+ field_with_new_type ( field, DataType :: Utf8View )
367
+ }
368
+ _ => field. clone ( ) ,
369
+ } ,
370
+ )
371
+ . collect ( ) ;
372
+
373
+ if !transform {
374
+ None
375
+ } else {
376
+ Some ( Schema :: new_with_metadata (
377
+ transformed_fields,
378
+ file_schema. metadata . clone ( ) ,
379
+ ) )
380
+ }
381
+ }
382
+
305
383
#[ cfg( test) ]
306
384
pub ( crate ) mod test_util {
307
385
use std:: ops:: Range ;
0 commit comments