From d0fd1ef9fea16dc6b6eea6fac451b225123613bb Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Wed, 3 Apr 2024 14:43:26 +0200 Subject: [PATCH 1/5] Add `UnionArray::into_parts` --- arrow-array/src/array/union_array.rs | 104 +++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index e3e637247537..c60d4f92d6bf 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -319,6 +319,39 @@ impl UnionArray { fields, } } + + /// Deconstruct this array into its constituent parts + /// + /// # Example + /// + /// ``` + /// # use arrow_array::types::Int32Type; + /// # use arrow_array::builder::UnionBuilder; + /// # fn main() -> Result<(), arrow_schema::ArrowError> { + /// let mut builder = UnionBuilder::new_dense(); + /// builder.append::("a", 1).unwrap(); + /// let union_array = builder.build()?; + /// let (data_type, type_ids, offsets, fields) = union_array.into_parts(); + /// # Ok(()) + /// # } + /// ``` + #[allow(clippy::type_complexity)] + pub fn into_parts( + self, + ) -> ( + DataType, + ScalarBuffer, + Option>, + Vec>, + ) { + let Self { + data_type, + type_ids, + offsets, + fields, + } = self; + (data_type, type_ids, offsets, fields) + } } impl From for UnionArray { @@ -505,6 +538,7 @@ impl std::fmt::Debug for UnionArray { mod tests { use super::*; + use crate::array::Int8Type; use crate::builder::UnionBuilder; use crate::cast::AsArray; use crate::types::{Float32Type, Float64Type, Int32Type, Int64Type}; @@ -1201,4 +1235,74 @@ mod tests { assert_eq!(v.len(), 1); assert_eq!(v.as_string::().value(0), "baz"); } + + #[test] + fn into_parts() { + let mut builder = UnionBuilder::new_dense(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("a", 3).unwrap(); + let dense_union = builder.build().unwrap(); + + let field = [ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int8, false), + ]; + let field_type_ids = [0, 1]; + let (data_type, type_ids, offsets, fields) = dense_union.into_parts(); + assert_eq!( + data_type, + DataType::Union( + UnionFields::new(field_type_ids, field.clone()), + UnionMode::Dense + ) + ); + assert_eq!(type_ids, [0, 1, 0]); + assert!(offsets.is_some()); + assert_eq!(offsets.as_ref().unwrap(), &[0, 0, 1]); + assert_eq!(fields.len(), 2); + + let result = UnionArray::try_new( + &[0, 1], + type_ids.into_inner(), + offsets.map(ScalarBuffer::into_inner), + field + .clone() + .into_iter() + .zip(fields.into_iter().flatten()) + .collect(), + ); + assert!(result.is_ok()); + assert_eq!(result.unwrap().len(), 3); + + let mut builder = UnionBuilder::new_sparse(); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.append::("a", 3).unwrap(); + let sparse_union = builder.build().unwrap(); + + let (data_type, type_ids, offsets, fields) = sparse_union.into_parts(); + assert_eq!( + data_type, + DataType::Union( + UnionFields::new(field_type_ids, field.clone()), + UnionMode::Sparse + ) + ); + assert_eq!(type_ids, [0, 1, 0]); + assert!(offsets.is_none()); + assert_eq!(fields.len(), 2); + + let result = UnionArray::try_new( + &[0, 1], + type_ids.into_inner(), + offsets.map(ScalarBuffer::into_inner), + field + .into_iter() + .zip(fields.into_iter().flatten()) + .collect(), + ); + assert!(result.is_ok()); + assert_eq!(result.unwrap().len(), 3); + } } From dd210d9afa6f7f0063e6e0512d7561b6af25c765 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 4 Apr 2024 12:09:05 +0200 Subject: [PATCH 2/5] Return `UnionFields` and `UnionMode` instead of `DataType` --- arrow-array/src/array/union_array.rs | 32 +++++++++++++++------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index c60d4f92d6bf..4603f70d3ee4 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -331,7 +331,7 @@ impl UnionArray { /// let mut builder = UnionBuilder::new_dense(); /// builder.append::("a", 1).unwrap(); /// let union_array = builder.build()?; - /// let (data_type, type_ids, offsets, fields) = union_array.into_parts(); + /// let (union_fields, union_mode, type_ids, offsets, fields) = union_array.into_parts(); /// # Ok(()) /// # } /// ``` @@ -339,7 +339,8 @@ impl UnionArray { pub fn into_parts( self, ) -> ( - DataType, + UnionFields, + UnionMode, ScalarBuffer, Option>, Vec>, @@ -350,7 +351,12 @@ impl UnionArray { offsets, fields, } = self; - (data_type, type_ids, offsets, fields) + match data_type { + DataType::Union(union_fields, union_mode) => { + (union_fields, union_mode, type_ids, offsets, fields) + } + _ => unreachable!(), + } } } @@ -1249,14 +1255,12 @@ mod tests { Field::new("b", DataType::Int8, false), ]; let field_type_ids = [0, 1]; - let (data_type, type_ids, offsets, fields) = dense_union.into_parts(); + let (union_fields, union_mode, type_ids, offsets, fields) = dense_union.into_parts(); assert_eq!( - data_type, - DataType::Union( - UnionFields::new(field_type_ids, field.clone()), - UnionMode::Dense - ) + union_fields, + UnionFields::new(field_type_ids, field.clone()) ); + assert_eq!(union_mode, UnionMode::Dense); assert_eq!(type_ids, [0, 1, 0]); assert!(offsets.is_some()); assert_eq!(offsets.as_ref().unwrap(), &[0, 0, 1]); @@ -1281,14 +1285,12 @@ mod tests { builder.append::("a", 3).unwrap(); let sparse_union = builder.build().unwrap(); - let (data_type, type_ids, offsets, fields) = sparse_union.into_parts(); + let (union_fields, union_mode, type_ids, offsets, fields) = sparse_union.into_parts(); assert_eq!( - data_type, - DataType::Union( - UnionFields::new(field_type_ids, field.clone()), - UnionMode::Sparse - ) + union_fields, + UnionFields::new(field_type_ids, field.clone()) ); + assert_eq!(union_mode, UnionMode::Sparse); assert_eq!(type_ids, [0, 1, 0]); assert!(offsets.is_none()); assert_eq!(fields.len(), 2); From 23e6019cf988ec5cb8a223e293405101352378fe Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 4 Apr 2024 12:50:09 +0200 Subject: [PATCH 3/5] Add `into_parts` test with custom type ids --- arrow-array/src/array/union_array.rs | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 4603f70d3ee4..bda1ef67010f 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -1307,4 +1307,60 @@ mod tests { assert!(result.is_ok()); assert_eq!(result.unwrap().len(), 3); } + + #[test] + fn into_parts_custom_type_ids() { + const TYPE_IDS: [i8; 3] = [8, 4, 9]; + let data_type = DataType::Union( + UnionFields::new( + TYPE_IDS, + [ + Field::new("strings", DataType::Utf8, false), + Field::new("integers", DataType::Int32, false), + Field::new("floats", DataType::Float64, false), + ], + ), + UnionMode::Dense, + ); + let string_array = StringArray::from(vec!["foo", "bar", "baz"]); + let int_array = Int32Array::from(vec![5, 6, 4]); + let float_array = Float64Array::from(vec![10.0]); + let type_ids = Buffer::from_vec(vec![4_i8, 8, 4, 8, 9, 4, 8]); + let value_offsets = Buffer::from_vec(vec![0_i32, 0, 1, 1, 0, 2, 2]); + let data = ArrayData::builder(data_type) + .len(7) + .buffers(vec![type_ids, value_offsets]) + .child_data(vec![ + string_array.into_data(), + int_array.into_data(), + float_array.into_data(), + ]) + .build() + .unwrap(); + let array = UnionArray::from(data); + + let (union_fields, union_mode, type_ids, offsets, mut fields) = array.into_parts(); + assert_eq!(union_mode, UnionMode::Dense); + let result = UnionArray::try_new( + &TYPE_IDS, + type_ids.into_inner(), + offsets.map(ScalarBuffer::into_inner), + union_fields + .iter() + .map(|(type_id, field)| { + ( + (*Arc::clone(field)).clone(), + fields[type_id as usize].take().unwrap(), + ) + }) + .collect(), + ); + assert!(result.is_ok()); + let array = result.unwrap(); + assert_eq!(array.len(), 7); + let (_, _, _, _, fields) = array.into_parts(); + for type_id in TYPE_IDS { + assert!(fields.get(type_id as usize).is_some_and(Option::is_some)) + } + } } From 8eff10f20cc7ed310528ab4c6e0f5ece69da9164 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Mon, 8 Apr 2024 13:08:36 +0200 Subject: [PATCH 4/5] Change `into_parts` output to better match `try_new` --- arrow-array/src/array/union_array.rs | 96 +++++++++++++++------------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index bda1ef67010f..0a85fa6c3917 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -23,6 +23,7 @@ use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode}; /// Contains the `UnionArray` type. /// use std::any::Any; +use std::collections::HashMap; use std::sync::Arc; /// An array of [values of varying types](https://arrow.apache.org/docs/format/Columnar.html#union-layout) @@ -325,13 +326,25 @@ impl UnionArray { /// # Example /// /// ``` + /// # use arrow_array::array::UnionArray; /// # use arrow_array::types::Int32Type; /// # use arrow_array::builder::UnionBuilder; + /// # use arrow_buffer::ScalarBuffer; /// # fn main() -> Result<(), arrow_schema::ArrowError> { /// let mut builder = UnionBuilder::new_dense(); /// builder.append::("a", 1).unwrap(); /// let union_array = builder.build()?; - /// let (union_fields, union_mode, type_ids, offsets, fields) = union_array.into_parts(); + /// + /// // Deconstruct into parts + /// let (union_mode, type_ids, offsets, field_type_ids, fields) = union_array.into_parts(); + /// + /// // Reconstruct from parts + /// let union_array = UnionArray::try_new( + /// &field_type_ids, + /// type_ids.into_inner(), + /// offsets.map(ScalarBuffer::into_inner), + /// fields, + /// ); /// # Ok(()) /// # } /// ``` @@ -339,11 +352,11 @@ impl UnionArray { pub fn into_parts( self, ) -> ( - UnionFields, UnionMode, ScalarBuffer, Option>, - Vec>, + Vec, + Vec<(Field, ArrayRef)>, ) { let Self { data_type, @@ -353,7 +366,21 @@ impl UnionArray { } = self; match data_type { DataType::Union(union_fields, union_mode) => { - (union_fields, union_mode, type_ids, offsets, fields) + let union_fields = union_fields.iter().collect::>(); + let (field_type_ids, fields) = fields + .into_iter() + .enumerate() + .flat_map(|(type_id, array_ref)| { + array_ref.map(|array_ref| { + let type_id = type_id as i8; + ( + type_id, + ((*Arc::clone(union_fields[&type_id])).clone(), array_ref), + ) + }) + }) + .unzip(); + (union_mode, type_ids, offsets, field_type_ids, fields) } _ => unreachable!(), } @@ -1254,27 +1281,26 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int8, false), ]; - let field_type_ids = [0, 1]; - let (union_fields, union_mode, type_ids, offsets, fields) = dense_union.into_parts(); + let (union_mode, type_ids, offsets, field_type_ids, fields) = dense_union.into_parts(); + assert_eq!(union_mode, UnionMode::Dense); + assert_eq!(field_type_ids, [0, 1]); assert_eq!( - union_fields, - UnionFields::new(field_type_ids, field.clone()) + field.to_vec(), + fields + .iter() + .cloned() + .map(|(field, _)| field) + .collect::>() ); - assert_eq!(union_mode, UnionMode::Dense); assert_eq!(type_ids, [0, 1, 0]); assert!(offsets.is_some()); assert_eq!(offsets.as_ref().unwrap(), &[0, 0, 1]); - assert_eq!(fields.len(), 2); let result = UnionArray::try_new( - &[0, 1], + &field_type_ids, type_ids.into_inner(), offsets.map(ScalarBuffer::into_inner), - field - .clone() - .into_iter() - .zip(fields.into_iter().flatten()) - .collect(), + fields, ); assert!(result.is_ok()); assert_eq!(result.unwrap().len(), 3); @@ -1285,24 +1311,16 @@ mod tests { builder.append::("a", 3).unwrap(); let sparse_union = builder.build().unwrap(); - let (union_fields, union_mode, type_ids, offsets, fields) = sparse_union.into_parts(); - assert_eq!( - union_fields, - UnionFields::new(field_type_ids, field.clone()) - ); + let (union_mode, type_ids, offsets, field_type_ids, fields) = sparse_union.into_parts(); assert_eq!(union_mode, UnionMode::Sparse); assert_eq!(type_ids, [0, 1, 0]); assert!(offsets.is_none()); - assert_eq!(fields.len(), 2); let result = UnionArray::try_new( - &[0, 1], + &field_type_ids, type_ids.into_inner(), offsets.map(ScalarBuffer::into_inner), - field - .into_iter() - .zip(fields.into_iter().flatten()) - .collect(), + fields, ); assert!(result.is_ok()); assert_eq!(result.unwrap().len(), 3); @@ -1310,10 +1328,10 @@ mod tests { #[test] fn into_parts_custom_type_ids() { - const TYPE_IDS: [i8; 3] = [8, 4, 9]; + let mut set_field_type_ids: [i8; 3] = [8, 4, 9]; let data_type = DataType::Union( UnionFields::new( - TYPE_IDS, + set_field_type_ids, [ Field::new("strings", DataType::Utf8, false), Field::new("integers", DataType::Int32, false), @@ -1339,28 +1357,18 @@ mod tests { .unwrap(); let array = UnionArray::from(data); - let (union_fields, union_mode, type_ids, offsets, mut fields) = array.into_parts(); + let (union_mode, type_ids, offsets, field_type_ids, fields) = array.into_parts(); assert_eq!(union_mode, UnionMode::Dense); + set_field_type_ids.sort(); + assert_eq!(field_type_ids, set_field_type_ids); let result = UnionArray::try_new( - &TYPE_IDS, + &field_type_ids, type_ids.into_inner(), offsets.map(ScalarBuffer::into_inner), - union_fields - .iter() - .map(|(type_id, field)| { - ( - (*Arc::clone(field)).clone(), - fields[type_id as usize].take().unwrap(), - ) - }) - .collect(), + fields, ); assert!(result.is_ok()); let array = result.unwrap(); assert_eq!(array.len(), 7); - let (_, _, _, _, fields) = array.into_parts(); - for type_id in TYPE_IDS { - assert!(fields.get(type_id as usize).is_some_and(Option::is_some)) - } } } From 106deb8274c4140a3bd5c6ff7cb3590b47938739 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 9 Apr 2024 10:25:42 +0100 Subject: [PATCH 5/5] Remove UnionMode --- arrow-array/src/array/union_array.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 0a85fa6c3917..22d4cf90a092 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -336,7 +336,7 @@ impl UnionArray { /// let union_array = builder.build()?; /// /// // Deconstruct into parts - /// let (union_mode, type_ids, offsets, field_type_ids, fields) = union_array.into_parts(); + /// let (type_ids, offsets, field_type_ids, fields) = union_array.into_parts(); /// /// // Reconstruct from parts /// let union_array = UnionArray::try_new( @@ -352,7 +352,6 @@ impl UnionArray { pub fn into_parts( self, ) -> ( - UnionMode, ScalarBuffer, Option>, Vec, @@ -365,7 +364,7 @@ impl UnionArray { fields, } = self; match data_type { - DataType::Union(union_fields, union_mode) => { + DataType::Union(union_fields, _) => { let union_fields = union_fields.iter().collect::>(); let (field_type_ids, fields) = fields .into_iter() @@ -380,7 +379,7 @@ impl UnionArray { }) }) .unzip(); - (union_mode, type_ids, offsets, field_type_ids, fields) + (type_ids, offsets, field_type_ids, fields) } _ => unreachable!(), } @@ -1281,8 +1280,7 @@ mod tests { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int8, false), ]; - let (union_mode, type_ids, offsets, field_type_ids, fields) = dense_union.into_parts(); - assert_eq!(union_mode, UnionMode::Dense); + let (type_ids, offsets, field_type_ids, fields) = dense_union.into_parts(); assert_eq!(field_type_ids, [0, 1]); assert_eq!( field.to_vec(), @@ -1311,8 +1309,7 @@ mod tests { builder.append::("a", 3).unwrap(); let sparse_union = builder.build().unwrap(); - let (union_mode, type_ids, offsets, field_type_ids, fields) = sparse_union.into_parts(); - assert_eq!(union_mode, UnionMode::Sparse); + let (type_ids, offsets, field_type_ids, fields) = sparse_union.into_parts(); assert_eq!(type_ids, [0, 1, 0]); assert!(offsets.is_none()); @@ -1357,8 +1354,7 @@ mod tests { .unwrap(); let array = UnionArray::from(data); - let (union_mode, type_ids, offsets, field_type_ids, fields) = array.into_parts(); - assert_eq!(union_mode, UnionMode::Dense); + let (type_ids, offsets, field_type_ids, fields) = array.into_parts(); set_field_type_ids.sort(); assert_eq!(field_type_ids, set_field_type_ids); let result = UnionArray::try_new(