Skip to content

Commit 6348dc3

Browse files
authored
Accept parquet schemas without explicitly required Map keys (#5630)
* Accept parquet schemas without explicitly required Map keys * Map key is always non-nullable * Add comment * Add test for incorrect_map_schema.parquet * Test map data * Bump submodule * Clippy and code comment suggestion
1 parent bbd85ed commit 6348dc3

File tree

4 files changed

+80
-4
lines changed

4 files changed

+80
-4
lines changed

parquet/src/arrow/arrow_reader/mod.rs

+40
Original file line numberDiff line numberDiff line change
@@ -1436,6 +1436,46 @@ mod tests {
14361436
assert_eq!(row_count, 300);
14371437
}
14381438

1439+
#[test]
1440+
fn test_read_incorrect_map_schema_file() {
1441+
let testdata = arrow::util::test_util::parquet_test_data();
1442+
// see https://github.com/apache/parquet-testing/pull/47
1443+
let path = format!("{testdata}/incorrect_map_schema.parquet");
1444+
let file = File::open(path).unwrap();
1445+
let mut record_reader = ParquetRecordBatchReader::try_new(file, 32).unwrap();
1446+
1447+
let batch = record_reader.next().unwrap().unwrap();
1448+
assert_eq!(batch.num_rows(), 1);
1449+
1450+
let expected_schema = Schema::new(Fields::from(vec![Field::new(
1451+
"my_map",
1452+
ArrowDataType::Map(
1453+
Arc::new(Field::new(
1454+
"key_value",
1455+
ArrowDataType::Struct(Fields::from(vec![
1456+
Field::new("key", ArrowDataType::Utf8, false),
1457+
Field::new("value", ArrowDataType::Utf8, true),
1458+
])),
1459+
false,
1460+
)),
1461+
false,
1462+
),
1463+
true,
1464+
)]));
1465+
assert_eq!(batch.schema().as_ref(), &expected_schema);
1466+
1467+
assert_eq!(batch.num_rows(), 1);
1468+
assert_eq!(batch.column(0).null_count(), 0);
1469+
assert_eq!(
1470+
batch.column(0).as_map().keys().as_ref(),
1471+
&StringArray::from(vec!["parent", "name"])
1472+
);
1473+
assert_eq!(
1474+
batch.column(0).as_map().values().as_ref(),
1475+
&StringArray::from(vec!["another", "report"])
1476+
);
1477+
}
1478+
14391479
/// Parameters for single_column_reader_test
14401480
#[derive(Clone)]
14411481
struct TestOptions {

parquet/src/arrow/schema/complex.rs

+15-3
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,16 @@ impl Visitor {
286286
let map_key = &map_key_value.get_fields()[0];
287287
let map_value = &map_key_value.get_fields()[1];
288288

289-
if map_key.get_basic_info().repetition() != Repetition::REQUIRED {
290-
return Err(arrow_err!("Map keys must be required"));
289+
match map_key.get_basic_info().repetition() {
290+
Repetition::REPEATED => {
291+
return Err(arrow_err!("Map keys cannot be repeated"));
292+
}
293+
Repetition::REQUIRED | Repetition::OPTIONAL => {
294+
// Relaxed check for having repetition REQUIRED as there exists
295+
// parquet writers and files that do not conform to this standard.
296+
// This allows us to consume a broader range of existing files even
297+
// if they are out of spec.
298+
}
291299
}
292300

293301
if map_value.get_basic_info().repetition() == Repetition::REPEATED {
@@ -346,7 +354,11 @@ impl Visitor {
346354
// Need both columns to be projected
347355
match (maybe_key, maybe_value) {
348356
(Some(key), Some(value)) => {
349-
let key_field = Arc::new(convert_field(map_key, &key, arrow_key));
357+
let key_field = Arc::new(
358+
convert_field(map_key, &key, arrow_key)
359+
// The key is always non-nullable (#5630)
360+
.with_nullable(false),
361+
);
350362
let value_field = Arc::new(convert_field(map_value, &value, arrow_value));
351363
let field_metadata = match arrow_map {
352364
Some(field) => field.metadata().clone(),

parquet/src/arrow/schema/mod.rs

+24
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,12 @@ mod tests {
10181018
OPTIONAL int32 value;
10191019
}
10201020
}
1021+
REQUIRED group my_map4 (MAP) {
1022+
REPEATED group map {
1023+
OPTIONAL binary key (UTF8);
1024+
REQUIRED int32 value;
1025+
}
1026+
}
10211027
}
10221028
";
10231029

@@ -1075,6 +1081,24 @@ mod tests {
10751081
));
10761082
}
10771083

1084+
// // Map<String, Integer> (non-compliant nullable key)
1085+
// group my_map (MAP_KEY_VALUE) {
1086+
// repeated group map {
1087+
// optional binary key (UTF8);
1088+
// required int32 value;
1089+
// }
1090+
// }
1091+
{
1092+
arrow_fields.push(Field::new_map(
1093+
"my_map4",
1094+
"map",
1095+
Field::new("key", DataType::Utf8, false), // The key is always non-nullable (#5630)
1096+
Field::new("value", DataType::Int32, false),
1097+
false,
1098+
false,
1099+
));
1100+
}
1101+
10781102
let parquet_group_type = parse_message_type(message_type).unwrap();
10791103

10801104
let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));

0 commit comments

Comments
 (0)