Skip to content

Commit 30762e8

Browse files
Encode UUID as FixedLenByteArray in parquet_derive (#5773)
* fix uuid derive
* fix byte array length handling
* test lengths
* fmt
1 parent dfe0f26 commit 30762e8

File tree

3 files changed: +96 -35 lines changed

parquet_derive/src/parquet_field.rs

+90 -35
Original file line number | Diff line number | Diff line change
@@ -136,19 +136,19 @@ impl Field {
136136
Type::Option(_) => unimplemented!("Unsupported nesting encountered"),
137137
Type::Reference(_, ref second_type)
138138
| Type::Vec(ref second_type)
139-
| Type::Array(ref second_type)
139+
| Type::Array(ref second_type, _)
140140
| Type::Slice(ref second_type) => match **second_type {
141141
Type::TypePath(_) => Some(self.optional_definition_levels()),
142142
_ => unimplemented!("Unsupported nesting encountered"),
143143
},
144144
},
145145
Type::Reference(_, ref first_type)
146146
| Type::Vec(ref first_type)
147-
| Type::Array(ref first_type)
147+
| Type::Array(ref first_type, _)
148148
| Type::Slice(ref first_type) => match **first_type {
149149
Type::TypePath(_) => None,
150150
Type::Vec(ref second_type)
151-
| Type::Array(ref second_type)
151+
| Type::Array(ref second_type, _)
152152
| Type::Slice(ref second_type) => match **second_type {
153153
Type::TypePath(_) => None,
154154
Type::Reference(_, ref third_type) => match **third_type {
@@ -161,7 +161,7 @@ impl Field {
161161
match **second_type {
162162
Type::TypePath(_) => Some(self.optional_definition_levels()),
163163
Type::Vec(ref third_type)
164-
| Type::Array(ref third_type)
164+
| Type::Array(ref third_type, _)
165165
| Type::Slice(ref third_type) => match **third_type {
166166
Type::TypePath(_) => Some(self.optional_definition_levels()),
167167
Type::Reference(_, ref fourth_type) => match **fourth_type {
@@ -316,25 +316,23 @@ impl Field {
316316
let logical_type = self.ty.logical_type();
317317
let repetition = self.ty.repetition();
318318
let converted_type = self.ty.converted_type();
319+
let length = self.ty.length();
320+
321+
let mut builder = quote! {
322+
ParquetType::primitive_type_builder(#field_name, #physical_type)
323+
.with_logical_type(#logical_type)
324+
.with_repetition(#repetition)
325+
};
319326

320327
if let Some(converted_type) = converted_type {
321-
quote! {
322-
fields.push(ParquetType::primitive_type_builder(#field_name, #physical_type)
323-
.with_logical_type(#logical_type)
324-
.with_repetition(#repetition)
325-
.with_converted_type(#converted_type)
326-
.build().unwrap().into()
327-
)
328-
}
329-
} else {
330-
quote! {
331-
fields.push(ParquetType::primitive_type_builder(#field_name, #physical_type)
332-
.with_logical_type(#logical_type)
333-
.with_repetition(#repetition)
334-
.build().unwrap().into()
335-
)
336-
}
328+
builder = quote! { #builder.with_converted_type(#converted_type) };
329+
}
330+
331+
if let Some(length) = length {
332+
builder = quote! { #builder.with_length(#length) };
337333
}
334+
335+
quote! { fields.push(#builder.build().unwrap().into()) }
338336
}
339337

340338
fn option_into_vals(&self) -> proc_macro2::TokenStream {
@@ -394,7 +392,7 @@ impl Field {
394392
quote! { rec.#field_name.signed_duration_since(::chrono::NaiveDate::from_ymd(1970, 1, 1)).num_days() as i32 }
395393
}
396394
Some(ThirdPartyType::Uuid) => {
397-
quote! { (&rec.#field_name.to_string()[..]).into() }
395+
quote! { rec.#field_name.as_bytes().to_vec().into() }
398396
}
399397
_ => {
400398
if self.is_a_byte_buf {
@@ -430,7 +428,7 @@ impl Field {
430428
}
431429
}
432430
Some(ThirdPartyType::Uuid) => {
433-
quote! { ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap() }
431+
quote! { ::uuid::Uuid::from_bytes(vals[i].data().try_into().unwrap()) }
434432
}
435433
_ => match &self.ty {
436434
Type::TypePath(_) => match self.ty.last_part().as_str() {
@@ -469,7 +467,7 @@ impl Field {
469467
#[allow(clippy::large_enum_variant)]
470468
#[derive(Debug, PartialEq)]
471469
enum Type {
472-
Array(Box<Type>),
470+
Array(Box<Type>, syn::Expr),
473471
Option(Box<Type>),
474472
Slice(Box<Type>),
475473
Vec(Box<Type>),
@@ -542,7 +540,7 @@ impl Type {
542540
Type::TypePath(_) => parent_ty.unwrap_or(ty),
543541
Type::Option(ref first_type)
544542
| Type::Vec(ref first_type)
545-
| Type::Array(ref first_type)
543+
| Type::Array(ref first_type, _)
546544
| Type::Slice(ref first_type)
547545
| Type::Reference(_, ref first_type) => {
548546
Type::leaf_type_recursive_helper(first_type, Some(ty))
@@ -560,7 +558,7 @@ impl Type {
560558
Type::TypePath(ref type_) => type_,
561559
Type::Option(ref first_type)
562560
| Type::Vec(ref first_type)
563-
| Type::Array(ref first_type)
561+
| Type::Array(ref first_type, _)
564562
| Type::Slice(ref first_type)
565563
| Type::Reference(_, ref first_type) => match **first_type {
566564
Type::TypePath(ref type_) => type_,
@@ -607,7 +605,7 @@ impl Type {
607605
let leaf_type = self.leaf_type_recursive();
608606

609607
match leaf_type {
610-
Type::Array(ref first_type) => {
608+
Type::Array(ref first_type, _length) => {
611609
if let Type::TypePath(_) = **first_type {
612610
if last_part == "u8" {
613611
return BasicType::FIXED_LEN_BYTE_ARRAY;
@@ -638,17 +636,38 @@ impl Type {
638636
}
639637
"f32" => BasicType::FLOAT,
640638
"f64" => BasicType::DOUBLE,
641-
"String" | "str" | "Uuid" => BasicType::BYTE_ARRAY,
639+
"String" | "str" => BasicType::BYTE_ARRAY,
640+
"Uuid" => BasicType::FIXED_LEN_BYTE_ARRAY,
642641
f => unimplemented!("{} currently is not supported", f),
643642
}
644643
}
645644

645+
fn length(&self) -> Option<syn::Expr> {
646+
let last_part = self.last_part();
647+
let leaf_type = self.leaf_type_recursive();
648+
649+
// `[u8; N]` => Some(N)
650+
if let Type::Array(ref first_type, length) = leaf_type {
651+
if let Type::TypePath(_) = **first_type {
652+
if last_part == "u8" {
653+
return Some(length.clone());
654+
}
655+
}
656+
}
657+
658+
match last_part.trim() {
659+
// Uuid => [u8; 16] => Some(16)
660+
"Uuid" => Some(syn::parse_quote!(16)),
661+
_ => None,
662+
}
663+
}
664+
646665
fn logical_type(&self) -> proc_macro2::TokenStream {
647666
let last_part = self.last_part();
648667
let leaf_type = self.leaf_type_recursive();
649668

650669
match leaf_type {
651-
Type::Array(ref first_type) => {
670+
Type::Array(ref first_type, _length) => {
652671
if let Type::TypePath(_) = **first_type {
653672
if last_part == "u8" {
654673
return quote! { None };
@@ -789,7 +808,7 @@ impl Type {
789808

790809
fn from_type_array(f: &syn::Field, ta: &syn::TypeArray) -> Self {
791810
let inner_type = Type::from_type(f, ta.elem.as_ref());
792-
Type::Array(Box::new(inner_type))
811+
Type::Array(Box::new(inner_type), ta.len.clone())
793812
}
794813

795814
fn from_type_slice(f: &syn::Field, ts: &syn::TypeSlice) -> Self {
@@ -1091,6 +1110,7 @@ mod test {
10911110
a_fix_byte_buf: [u8; 10],
10921111
a_complex_option: ::std::option::Option<&Vec<u8>>,
10931112
a_complex_vec: &::std::vec::Vec<&Option<u8>>,
1113+
a_uuid: ::uuid::Uuid,
10941114
}
10951115
};
10961116

@@ -1110,7 +1130,42 @@ mod test {
11101130
BasicType::BYTE_ARRAY,
11111131
BasicType::FIXED_LEN_BYTE_ARRAY,
11121132
BasicType::BYTE_ARRAY,
1113-
BasicType::INT32
1133+
BasicType::INT32,
1134+
BasicType::FIXED_LEN_BYTE_ARRAY,
1135+
]
1136+
)
1137+
}
1138+
1139+
#[test]
1140+
fn test_type_length() {
1141+
let snippet: proc_macro2::TokenStream = quote! {
1142+
struct LotsOfInnerTypes {
1143+
a_buf: ::std::vec::Vec<u8>,
1144+
a_number: i32,
1145+
a_verbose_option: ::std::option::Option<bool>,
1146+
a_silly_string: String,
1147+
a_fix_byte_buf: [u8; 10],
1148+
a_complex_option: ::std::option::Option<&Vec<u8>>,
1149+
a_complex_vec: &::std::vec::Vec<&Option<u8>>,
1150+
a_uuid: ::uuid::Uuid,
1151+
}
1152+
};
1153+
1154+
let fields = extract_fields(snippet);
1155+
let converted_fields: Vec<_> = fields.iter().map(Type::from).collect();
1156+
let lengths: Vec<_> = converted_fields.iter().map(|ty| ty.length()).collect();
1157+
1158+
assert_eq!(
1159+
lengths,
1160+
vec![
1161+
None,
1162+
None,
1163+
None,
1164+
None,
1165+
Some(syn::parse_quote!(10)),
1166+
None,
1167+
None,
1168+
Some(syn::parse_quote!(16)),
11141169
]
11151170
)
11161171
}
@@ -1328,8 +1383,8 @@ mod test {
13281383
let when = Field::from(&fields[0]);
13291384
assert_eq!(when.writer_snippet().to_string(),(quote!{
13301385
{
1331-
let vals : Vec<_> = records.iter().map(|rec| (&rec.unique_id.to_string()[..]).into() ).collect();
1332-
if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() {
1386+
let vals : Vec<_> = records.iter().map(|rec| rec.unique_id.as_bytes().to_vec().into() ).collect();
1387+
if let ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) = column_writer.untyped() {
13331388
typed.write_batch(&vals[..], None, None) ?;
13341389
} else {
13351390
panic!("Schema and struct disagree on type for {}" , stringify!{ unique_id })
@@ -1349,7 +1404,7 @@ mod test {
13491404
}
13501405
}).collect();
13511406

1352-
if let ColumnWriter::ByteArrayColumnWriter(ref mut typed) = column_writer.untyped() {
1407+
if let ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) = column_writer.untyped() {
13531408
typed.write_batch(&vals[..], Some(&definition_levels[..]), None) ?;
13541409
} else {
13551410
panic!("Schema and struct disagree on type for {}" , stringify!{ maybe_unique_id })
@@ -1371,13 +1426,13 @@ mod test {
13711426
assert_eq!(when.reader_snippet().to_string(),(quote!{
13721427
{
13731428
let mut vals = Vec::new();
1374-
if let ColumnReader::ByteArrayColumnReader(mut typed) = column_reader {
1429+
if let ColumnReader::FixedLenByteArrayColumnReader(mut typed) = column_reader {
13751430
typed.read_records(num_records, None, None, &mut vals)?;
13761431
} else {
13771432
panic!("Schema and struct disagree on type for {}", stringify!{ unique_id });
13781433
}
13791434
for (i, r) in &mut records[..num_records].iter_mut().enumerate() {
1380-
r.unique_id = ::uuid::Uuid::parse_str(vals[i].data().convert()).unwrap();
1435+
r.unique_id = ::uuid::Uuid::from_bytes(vals[i].data().try_into().unwrap());
13811436
}
13821437
}
13831438
}).to_string());

parquet_derive_test/Cargo.toml

+1
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,4 @@ rust-version = { workspace = true }
3232
parquet = { workspace = true }
3333
parquet_derive = { path = "../parquet_derive", default-features = false }
3434
chrono = { workspace = true }
35+
uuid = { version = "1", features = ["v4"] }

parquet_derive_test/src/lib.rs

+5
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ struct ACompleteRecord<'a> {
4242
pub borrowed_maybe_a_string: &'a Option<String>,
4343
pub borrowed_maybe_a_str: &'a Option<&'a str>,
4444
pub now: chrono::NaiveDateTime,
45+
pub uuid: uuid::Uuid,
4546
pub byte_vec: Vec<u8>,
4647
pub maybe_byte_vec: Option<Vec<u8>>,
4748
pub borrowed_byte_vec: &'a [u8],
@@ -61,6 +62,7 @@ struct APartiallyCompleteRecord {
6162
pub double: f64,
6263
pub now: chrono::NaiveDateTime,
6364
pub date: chrono::NaiveDate,
65+
pub uuid: uuid::Uuid,
6466
pub byte_vec: Vec<u8>,
6567
}
6668

@@ -105,6 +107,7 @@ mod tests {
105107
OPTIONAL BINARY borrowed_maybe_a_string (STRING);
106108
OPTIONAL BINARY borrowed_maybe_a_str (STRING);
107109
REQUIRED INT64 now (TIMESTAMP_MILLIS);
110+
REQUIRED FIXED_LEN_BYTE_ARRAY (16) uuid (UUID);
108111
REQUIRED BINARY byte_vec;
109112
OPTIONAL BINARY maybe_byte_vec;
110113
REQUIRED BINARY borrowed_byte_vec;
@@ -144,6 +147,7 @@ mod tests {
144147
borrowed_maybe_a_string: &maybe_a_string,
145148
borrowed_maybe_a_str: &maybe_a_str,
146149
now: chrono::Utc::now().naive_local(),
150+
uuid: uuid::Uuid::new_v4(),
147151
byte_vec: vec![0x65, 0x66, 0x67],
148152
maybe_byte_vec: Some(vec![0x88, 0x89, 0x90]),
149153
borrowed_byte_vec: &borrowed_byte_vec,
@@ -179,6 +183,7 @@ mod tests {
179183
double: std::f64::NAN,
180184
now: chrono::Utc::now().naive_local(),
181185
date: chrono::naive::NaiveDate::from_ymd_opt(2015, 3, 14).unwrap(),
186+
uuid: uuid::Uuid::new_v4(),
182187
byte_vec: vec![0x65, 0x66, 0x67],
183188
}];
184189

0 commit comments

Comments
 (0)