From d96d0b14581482a45d5073bb2ec2e2d8d64dbfcf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 25 Feb 2025 10:50:14 -0500 Subject: [PATCH 1/5] feat: ChunkedArray uses into_canonical for to_canonical --- vortex-array/src/arrays/chunked/canonical.rs | 486 +++++++++---------- 1 file changed, 238 insertions(+), 248 deletions(-) diff --git a/vortex-array/src/arrays/chunked/canonical.rs b/vortex-array/src/arrays/chunked/canonical.rs index c9526fb700..2d28d17ad2 100644 --- a/vortex-array/src/arrays/chunked/canonical.rs +++ b/vortex-array/src/arrays/chunked/canonical.rs @@ -1,24 +1,14 @@ -use arrow_buffer::BooleanBufferBuilder; -use vortex_buffer::BufferMut; -use vortex_dtype::{DType, NativePType, Nullability, PType, StructDType, match_each_native_ptype}; -use vortex_error::{VortexExpect, VortexResult, vortex_err}; - -use crate::array::ArrayCanonicalImpl; -use crate::arrays::chunked::ChunkedArray; -use crate::arrays::extension::ExtensionArray; -use crate::arrays::null::NullArray; -use crate::arrays::primitive::PrimitiveArray; -use crate::arrays::struct_::StructArray; -use crate::arrays::{BoolArray, ListArray, VarBinViewArray}; -use crate::builders::ArrayBuilder; -use crate::compute::{scalar_at, slice, try_cast}; -use crate::validity::Validity; -use crate::{Array, ArrayRef, ArrayVariants, Canonical, ToCanonical}; +use vortex_error::VortexResult; + +use super::ChunkedArray; +use crate::builders::{builder_with_capacity, ArrayBuilder}; +use crate::{Array as _, ArrayCanonicalImpl, Canonical}; impl ArrayCanonicalImpl for ChunkedArray { fn _to_canonical(&self) -> VortexResult { - let validity = Validity::copy_from_array(self)?; - try_canonicalize_chunks(self.chunks(), validity, self.dtype()) + let mut builder = builder_with_capacity(self.dtype(), self.len()); + self.append_to_builder(builder.as_mut())?; + builder.finish().to_canonical() } fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> { @@ -29,235 +19,235 @@ impl ArrayCanonicalImpl for ChunkedArray { } } -pub(crate) fn try_canonicalize_chunks( - chunks: &[ArrayRef], - validity: Validity, - dtype: &DType, -) -> VortexResult { - match dtype { - // Structs can have their internal field pointers swizzled to push the chunking down - // one level internally without copying or decompressing any data. - DType::Struct(struct_dtype, _) => { - let struct_array = swizzle_struct_chunks(chunks, validity, struct_dtype)?; - Ok(Canonical::Struct(struct_array)) - } - - // Extension arrays are containers that wraps an inner storage array with some metadata. - // We delegate to the canonical format of the internal storage array for every chunk, and - // push the chunking down into the inner storage array. - // - // Input: - // ------ - // - // ChunkedArray - // / \ - // / \ - // ExtensionArray ExtensionArray - // | | - // storage storage - // - // - // Output: - // ------ - // - // ExtensionArray - // | - // ChunkedArray - // / \ - // storage storage - // - DType::Extension(ext_dtype) => { - // Recursively apply canonicalization and packing to the storage array backing - // each chunk of the extension array. - let storage_chunks: Vec = chunks - .iter() - // Extension-typed arrays can be compressed into something that is not an - // ExtensionArray, so we should canonicalize each chunk into ExtensionArray first. - .map(|chunk| { - chunk - .clone() - .as_extension_typed() - .vortex_expect("Chunk could not be downcast to ExtensionArrayTrait") - .storage_data() - }) - .collect(); - let storage_dtype = ext_dtype.storage_dtype().clone(); - let chunked_storage = - ChunkedArray::try_new(storage_chunks, storage_dtype)?.into_array(); - - Ok(Canonical::Extension(ExtensionArray::new( - ext_dtype.clone(), - chunked_storage, - ))) - } - - DType::List(..) => { - // TODO(joe): improve performance, use a listview, once it exists - - let list = pack_lists(chunks, validity, dtype)?; - Ok(Canonical::List(list)) - } - - DType::Bool(_) => { - let bool_array = pack_bools(chunks, validity)?; - Ok(Canonical::Bool(bool_array)) - } - DType::Primitive(ptype, _) => { - match_each_native_ptype!(ptype, |$P| { - let prim_array = pack_primitives::<$P>(chunks, validity)?; - Ok(Canonical::Primitive(prim_array)) - }) - } - DType::Utf8(_) => { - let varbin_array = pack_views(chunks, dtype, validity)?; - Ok(Canonical::VarBinView(varbin_array)) - } - DType::Binary(_) => { - let varbin_array = pack_views(chunks, dtype, validity)?; - Ok(Canonical::VarBinView(varbin_array)) - } - DType::Null => { - let len = chunks.iter().map(|chunk| chunk.len()).sum(); - let null_array = NullArray::new(len); - Ok(Canonical::Null(null_array)) - } - } -} - -fn pack_lists(chunks: &[ArrayRef], validity: Validity, dtype: &DType) -> VortexResult { - let len: usize = chunks.iter().map(|c| c.len()).sum(); - let mut offsets = BufferMut::::with_capacity(len + 1); - offsets.push(0); - let mut elements = Vec::new(); - let elem_dtype = dtype - .as_list_element() - .vortex_expect("ListArray must have List dtype"); - - for chunk in chunks { - let chunk = chunk.to_list()?; - // TODO: handle i32 offsets if they fit. - let offsets_arr = try_cast( - chunk.offsets(), - &DType::Primitive(PType::I64, Nullability::NonNullable), - )? - .to_primitive()?; - - let first_offset_value: usize = usize::try_from(&scalar_at(&offsets_arr, 0)?)?; - let last_offset_value: usize = - usize::try_from(&scalar_at(&offsets_arr, offsets_arr.len() - 1)?)?; - elements.push(slice( - chunk.elements(), - first_offset_value, - last_offset_value, - )?); - - let adjustment_from_previous = *offsets - .last() - .ok_or_else(|| vortex_err!("List offsets must have at least one element"))?; - offsets.extend( - offsets_arr - .as_slice::() - .iter() - .skip(1) - .map(|off| off + adjustment_from_previous - first_offset_value as i64), - ); - } - let chunked_elements = ChunkedArray::try_new(elements, elem_dtype.clone())?.into_array(); - let offsets = PrimitiveArray::new(offsets.freeze(), Validity::NonNullable); - - ListArray::try_new(chunked_elements, offsets.into_array(), validity) -} - -/// Swizzle the pointers within a ChunkedArray of StructArrays to instead be a single -/// StructArray, where the Array for each Field is a ChunkedArray. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn swizzle_struct_chunks( - chunks: &[ArrayRef], - validity: Validity, - struct_dtype: &StructDType, -) -> VortexResult { - let len = chunks.iter().map(|chunk| chunk.len()).sum(); - let mut field_arrays = Vec::new(); - - for (field_idx, field_dtype) in struct_dtype.fields().enumerate() { - let field_chunks = chunks - .iter() - .map(|c| { - c.as_struct_typed() - .vortex_expect("Chunk was not a StructArray") - .maybe_null_field_by_idx(field_idx) - .vortex_expect("Invalid chunked array") - }) - .collect::>(); - let field_array = ChunkedArray::try_new(field_chunks, field_dtype.clone())?; - field_arrays.push(field_array.into_array()); - } - - StructArray::try_new(struct_dtype.names().clone(), field_arrays, len, validity) -} - -/// Builds a new [BoolArray] by repacking the values from the chunks in a single contiguous array. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn pack_bools(chunks: &[ArrayRef], validity: Validity) -> VortexResult { - let len = chunks.iter().map(|chunk| chunk.len()).sum(); - let mut buffer = BooleanBufferBuilder::new(len); - for chunk in chunks { - let chunk = chunk.to_bool()?; - buffer.append_buffer(chunk.boolean_buffer()); - } - Ok(BoolArray::new(buffer.finish(), validity)) -} - -/// Builds a new [PrimitiveArray] by repacking the values from the chunks into a single -/// contiguous array. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn pack_primitives( - chunks: &[ArrayRef], - validity: Validity, -) -> VortexResult { - let total_len = chunks.iter().map(|a| a.len()).sum(); - let mut buffer = BufferMut::with_capacity(total_len); - for chunk in chunks { - let chunk = chunk.to_primitive()?; - buffer.extend_from_slice(chunk.as_slice::()); - } - Ok(PrimitiveArray::new(buffer.freeze(), validity)) -} - -/// Builds a new [VarBinViewArray] by repacking the values from the chunks into a single -/// contiguous array. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn pack_views( - chunks: &[ArrayRef], - dtype: &DType, - validity: Validity, -) -> VortexResult { - let total_len = chunks.iter().map(|a| a.len()).sum(); - let mut views = BufferMut::with_capacity(total_len); - let mut buffers = Vec::new(); - for chunk in chunks { - let buffers_offset = u32::try_from(buffers.len())?; - let canonical_chunk = chunk.to_varbinview()?; - buffers.extend(canonical_chunk.buffers().iter().cloned()); - - views.extend( - canonical_chunk - .views() - .iter() - .map(|view| view.offset_view(buffers_offset)), - ); - } - - VarBinViewArray::try_new(views.freeze(), buffers, dtype.clone(), validity) -} +// pub(crate) fn try_canonicalize_chunks( +// chunks: &[ArrayRef], +// validity: Validity, +// dtype: &DType, +// ) -> VortexResult { +// match dtype { +// // Structs can have their internal field pointers swizzled to push the chunking down +// // one level internally without copying or decompressing any data. +// DType::Struct(struct_dtype, _) => { +// let struct_array = swizzle_struct_chunks(chunks, validity, struct_dtype)?; +// Ok(Canonical::Struct(struct_array)) +// } + +// // Extension arrays are containers that wraps an inner storage array with some metadata. +// // We delegate to the canonical format of the internal storage array for every chunk, and +// // push the chunking down into the inner storage array. +// // +// // Input: +// // ------ +// // +// // ChunkedArray +// // / \ +// // / \ +// // ExtensionArray ExtensionArray +// // | | +// // storage storage +// // +// // +// // Output: +// // ------ +// // +// // ExtensionArray +// // | +// // ChunkedArray +// // / \ +// // storage storage +// // +// DType::Extension(ext_dtype) => { +// // Recursively apply canonicalization and packing to the storage array backing +// // each chunk of the extension array. +// let storage_chunks: Vec = chunks +// .iter() +// // Extension-typed arrays can be compressed into something that is not an +// // ExtensionArray, so we should canonicalize each chunk into ExtensionArray first. +// .map(|chunk| { +// chunk +// .clone() +// .as_extension_typed() +// .vortex_expect("Chunk could not be downcast to ExtensionArrayTrait") +// .storage_data() +// }) +// .collect(); +// let storage_dtype = ext_dtype.storage_dtype().clone(); +// let chunked_storage = +// ChunkedArray::try_new(storage_chunks, storage_dtype)?.into_array(); + +// Ok(Canonical::Extension(ExtensionArray::new( +// ext_dtype.clone(), +// chunked_storage, +// ))) +// } + +// DType::List(..) => { +// // TODO(joe): improve performance, use a listview, once it exists + +// let list = pack_lists(chunks, validity, dtype)?; +// Ok(Canonical::List(list)) +// } + +// DType::Bool(_) => { +// let bool_array = pack_bools(chunks, validity)?; +// Ok(Canonical::Bool(bool_array)) +// } +// DType::Primitive(ptype, _) => { +// match_each_native_ptype!(ptype, |$P| { +// let prim_array = pack_primitives::<$P>(chunks, validity)?; +// Ok(Canonical::Primitive(prim_array)) +// }) +// } +// DType::Utf8(_) => { +// let varbin_array = pack_views(chunks, dtype, validity)?; +// Ok(Canonical::VarBinView(varbin_array)) +// } +// DType::Binary(_) => { +// let varbin_array = pack_views(chunks, dtype, validity)?; +// Ok(Canonical::VarBinView(varbin_array)) +// } +// DType::Null => { +// let len = chunks.iter().map(|chunk| chunk.len()).sum(); +// let null_array = NullArray::new(len); +// Ok(Canonical::Null(null_array)) +// } +// } +// } + +// fn pack_lists(chunks: &[ArrayRef], validity: Validity, dtype: &DType) -> VortexResult { +// let len: usize = chunks.iter().map(|c| c.len()).sum(); +// let mut offsets = BufferMut::::with_capacity(len + 1); +// offsets.push(0); +// let mut elements = Vec::new(); +// let elem_dtype = dtype +// .as_list_element() +// .vortex_expect("ListArray must have List dtype"); + +// for chunk in chunks { +// let chunk = chunk.to_list()?; +// // TODO: handle i32 offsets if they fit. +// let offsets_arr = try_cast( +// chunk.offsets(), +// &DType::Primitive(PType::I64, Nullability::NonNullable), +// )? +// .to_primitive()?; + +// let first_offset_value: usize = usize::try_from(&scalar_at(&offsets_arr, 0)?)?; +// let last_offset_value: usize = +// usize::try_from(&scalar_at(&offsets_arr, offsets_arr.len() - 1)?)?; +// elements.push(slice( +// chunk.elements(), +// first_offset_value, +// last_offset_value, +// )?); + +// let adjustment_from_previous = *offsets +// .last() +// .ok_or_else(|| vortex_err!("List offsets must have at least one element"))?; +// offsets.extend( +// offsets_arr +// .as_slice::() +// .iter() +// .skip(1) +// .map(|off| off + adjustment_from_previous - first_offset_value as i64), +// ); +// } +// let chunked_elements = ChunkedArray::try_new(elements, elem_dtype.clone())?.into_array(); +// let offsets = PrimitiveArray::new(offsets.freeze(), Validity::NonNullable); + +// ListArray::try_new(chunked_elements, offsets.into_array(), validity) +// } + +// /// Swizzle the pointers within a ChunkedArray of StructArrays to instead be a single +// /// StructArray, where the Array for each Field is a ChunkedArray. +// /// +// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have +// /// been checked to have the same DType already. +// fn swizzle_struct_chunks( +// chunks: &[ArrayRef], +// validity: Validity, +// struct_dtype: &StructDType, +// ) -> VortexResult { +// let len = chunks.iter().map(|chunk| chunk.len()).sum(); +// let mut field_arrays = Vec::new(); + +// for (field_idx, field_dtype) in struct_dtype.fields().enumerate() { +// let field_chunks = chunks +// .iter() +// .map(|c| { +// c.as_struct_typed() +// .vortex_expect("Chunk was not a StructArray") +// .maybe_null_field_by_idx(field_idx) +// .vortex_expect("Invalid chunked array") +// }) +// .collect::>(); +// let field_array = ChunkedArray::try_new(field_chunks, field_dtype.clone())?; +// field_arrays.push(field_array.into_array()); +// } + +// StructArray::try_new(struct_dtype.names().clone(), field_arrays, len, validity) +// } + +// /// Builds a new [BoolArray] by repacking the values from the chunks in a single contiguous array. +// /// +// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have +// /// been checked to have the same DType already. +// fn pack_bools(chunks: &[ArrayRef], validity: Validity) -> VortexResult { +// let len = chunks.iter().map(|chunk| chunk.len()).sum(); +// let mut buffer = BooleanBufferBuilder::new(len); +// for chunk in chunks { +// let chunk = chunk.to_bool()?; +// buffer.append_buffer(chunk.boolean_buffer()); +// } +// Ok(BoolArray::new(buffer.finish(), validity)) +// } + +// /// Builds a new [PrimitiveArray] by repacking the values from the chunks into a single +// /// contiguous array. +// /// +// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have +// /// been checked to have the same DType already. +// fn pack_primitives( +// chunks: &[ArrayRef], +// validity: Validity, +// ) -> VortexResult { +// let total_len = chunks.iter().map(|a| a.len()).sum(); +// let mut buffer = BufferMut::with_capacity(total_len); +// for chunk in chunks { +// let chunk = chunk.to_primitive()?; +// buffer.extend_from_slice(chunk.as_slice::()); +// } +// Ok(PrimitiveArray::new(buffer.freeze(), validity)) +// } + +// /// Builds a new [VarBinViewArray] by repacking the values from the chunks into a single +// /// contiguous array. +// /// +// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have +// /// been checked to have the same DType already. +// fn pack_views( +// chunks: &[ArrayRef], +// dtype: &DType, +// validity: Validity, +// ) -> VortexResult { +// let total_len = chunks.iter().map(|a| a.len()).sum(); +// let mut views = BufferMut::with_capacity(total_len); +// let mut buffers = Vec::new(); +// for chunk in chunks { +// let buffers_offset = u32::try_from(buffers.len())?; +// let canonical_chunk = chunk.to_varbinview()?; +// buffers.extend(canonical_chunk.buffers().iter().cloned()); + +// views.extend( +// canonical_chunk +// .views() +// .iter() +// .map(|view| view.offset_view(buffers_offset)), +// ); +// } + +// VarBinViewArray::try_new(views.freeze(), buffers, dtype.clone(), validity) +// } #[cfg(test)] mod tests { @@ -268,7 +258,6 @@ mod tests { use vortex_dtype::Nullability::NonNullable; use vortex_dtype::PType::I32; - use crate::ToCanonical; use crate::accessor::ArrayAccessor; use crate::array::Array; use crate::arrays::chunked::canonical::pack_views; @@ -276,6 +265,7 @@ mod tests { use crate::compute::{scalar_at, slice}; use crate::validity::Validity; use crate::variants::StructArrayTrait; + use crate::ToCanonical; fn stringview_array() -> VarBinViewArray { VarBinViewArray::from_iter_str(["foo", "bar", "baz", "quak"]) From 6c9fc3ce18459b982312d22680b8fd57cc69ece9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 25 Feb 2025 10:57:29 -0500 Subject: [PATCH 2/5] fix --- vortex-array/src/arrays/chunked/canonical.rs | 230 ------------------- 1 file changed, 230 deletions(-) diff --git a/vortex-array/src/arrays/chunked/canonical.rs b/vortex-array/src/arrays/chunked/canonical.rs index 2d28d17ad2..44cba81526 100644 --- a/vortex-array/src/arrays/chunked/canonical.rs +++ b/vortex-array/src/arrays/chunked/canonical.rs @@ -19,236 +19,6 @@ impl ArrayCanonicalImpl for ChunkedArray { } } -// pub(crate) fn try_canonicalize_chunks( -// chunks: &[ArrayRef], -// validity: Validity, -// dtype: &DType, -// ) -> VortexResult { -// match dtype { -// // Structs can have their internal field pointers swizzled to push the chunking down -// // one level internally without copying or decompressing any data. -// DType::Struct(struct_dtype, _) => { -// let struct_array = swizzle_struct_chunks(chunks, validity, struct_dtype)?; -// Ok(Canonical::Struct(struct_array)) -// } - -// // Extension arrays are containers that wraps an inner storage array with some metadata. -// // We delegate to the canonical format of the internal storage array for every chunk, and -// // push the chunking down into the inner storage array. -// // -// // Input: -// // ------ -// // -// // ChunkedArray -// // / \ -// // / \ -// // ExtensionArray ExtensionArray -// // | | -// // storage storage -// // -// // -// // Output: -// // ------ -// // -// // ExtensionArray -// // | -// // ChunkedArray -// // / \ -// // storage storage -// // -// DType::Extension(ext_dtype) => { -// // Recursively apply canonicalization and packing to the storage array backing -// // each chunk of the extension array. -// let storage_chunks: Vec = chunks -// .iter() -// // Extension-typed arrays can be compressed into something that is not an -// // ExtensionArray, so we should canonicalize each chunk into ExtensionArray first. -// .map(|chunk| { -// chunk -// .clone() -// .as_extension_typed() -// .vortex_expect("Chunk could not be downcast to ExtensionArrayTrait") -// .storage_data() -// }) -// .collect(); -// let storage_dtype = ext_dtype.storage_dtype().clone(); -// let chunked_storage = -// ChunkedArray::try_new(storage_chunks, storage_dtype)?.into_array(); - -// Ok(Canonical::Extension(ExtensionArray::new( -// ext_dtype.clone(), -// chunked_storage, -// ))) -// } - -// DType::List(..) => { -// // TODO(joe): improve performance, use a listview, once it exists - -// let list = pack_lists(chunks, validity, dtype)?; -// Ok(Canonical::List(list)) -// } - -// DType::Bool(_) => { -// let bool_array = pack_bools(chunks, validity)?; -// Ok(Canonical::Bool(bool_array)) -// } -// DType::Primitive(ptype, _) => { -// match_each_native_ptype!(ptype, |$P| { -// let prim_array = pack_primitives::<$P>(chunks, validity)?; -// Ok(Canonical::Primitive(prim_array)) -// }) -// } -// DType::Utf8(_) => { -// let varbin_array = pack_views(chunks, dtype, validity)?; -// Ok(Canonical::VarBinView(varbin_array)) -// } -// DType::Binary(_) => { -// let varbin_array = pack_views(chunks, dtype, validity)?; -// Ok(Canonical::VarBinView(varbin_array)) -// } -// DType::Null => { -// let len = chunks.iter().map(|chunk| chunk.len()).sum(); -// let null_array = NullArray::new(len); -// Ok(Canonical::Null(null_array)) -// } -// } -// } - -// fn pack_lists(chunks: &[ArrayRef], validity: Validity, dtype: &DType) -> VortexResult { -// let len: usize = chunks.iter().map(|c| c.len()).sum(); -// let mut offsets = BufferMut::::with_capacity(len + 1); -// offsets.push(0); -// let mut elements = Vec::new(); -// let elem_dtype = dtype -// .as_list_element() -// .vortex_expect("ListArray must have List dtype"); - -// for chunk in chunks { -// let chunk = chunk.to_list()?; -// // TODO: handle i32 offsets if they fit. -// let offsets_arr = try_cast( -// chunk.offsets(), -// &DType::Primitive(PType::I64, Nullability::NonNullable), -// )? -// .to_primitive()?; - -// let first_offset_value: usize = usize::try_from(&scalar_at(&offsets_arr, 0)?)?; -// let last_offset_value: usize = -// usize::try_from(&scalar_at(&offsets_arr, offsets_arr.len() - 1)?)?; -// elements.push(slice( -// chunk.elements(), -// first_offset_value, -// last_offset_value, -// )?); - -// let adjustment_from_previous = *offsets -// .last() -// .ok_or_else(|| vortex_err!("List offsets must have at least one element"))?; -// offsets.extend( -// offsets_arr -// .as_slice::() -// .iter() -// .skip(1) -// .map(|off| off + adjustment_from_previous - first_offset_value as i64), -// ); -// } -// let chunked_elements = ChunkedArray::try_new(elements, elem_dtype.clone())?.into_array(); -// let offsets = PrimitiveArray::new(offsets.freeze(), Validity::NonNullable); - -// ListArray::try_new(chunked_elements, offsets.into_array(), validity) -// } - -// /// Swizzle the pointers within a ChunkedArray of StructArrays to instead be a single -// /// StructArray, where the Array for each Field is a ChunkedArray. -// /// -// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -// /// been checked to have the same DType already. -// fn swizzle_struct_chunks( -// chunks: &[ArrayRef], -// validity: Validity, -// struct_dtype: &StructDType, -// ) -> VortexResult { -// let len = chunks.iter().map(|chunk| chunk.len()).sum(); -// let mut field_arrays = Vec::new(); - -// for (field_idx, field_dtype) in struct_dtype.fields().enumerate() { -// let field_chunks = chunks -// .iter() -// .map(|c| { -// c.as_struct_typed() -// .vortex_expect("Chunk was not a StructArray") -// .maybe_null_field_by_idx(field_idx) -// .vortex_expect("Invalid chunked array") -// }) -// .collect::>(); -// let field_array = ChunkedArray::try_new(field_chunks, field_dtype.clone())?; -// field_arrays.push(field_array.into_array()); -// } - -// StructArray::try_new(struct_dtype.names().clone(), field_arrays, len, validity) -// } - -// /// Builds a new [BoolArray] by repacking the values from the chunks in a single contiguous array. -// /// -// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -// /// been checked to have the same DType already. -// fn pack_bools(chunks: &[ArrayRef], validity: Validity) -> VortexResult { -// let len = chunks.iter().map(|chunk| chunk.len()).sum(); -// let mut buffer = BooleanBufferBuilder::new(len); -// for chunk in chunks { -// let chunk = chunk.to_bool()?; -// buffer.append_buffer(chunk.boolean_buffer()); -// } -// Ok(BoolArray::new(buffer.finish(), validity)) -// } - -// /// Builds a new [PrimitiveArray] by repacking the values from the chunks into a single -// /// contiguous array. -// /// -// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -// /// been checked to have the same DType already. -// fn pack_primitives( -// chunks: &[ArrayRef], -// validity: Validity, -// ) -> VortexResult { -// let total_len = chunks.iter().map(|a| a.len()).sum(); -// let mut buffer = BufferMut::with_capacity(total_len); -// for chunk in chunks { -// let chunk = chunk.to_primitive()?; -// buffer.extend_from_slice(chunk.as_slice::()); -// } -// Ok(PrimitiveArray::new(buffer.freeze(), validity)) -// } - -// /// Builds a new [VarBinViewArray] by repacking the values from the chunks into a single -// /// contiguous array. -// /// -// /// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -// /// been checked to have the same DType already. -// fn pack_views( -// chunks: &[ArrayRef], -// dtype: &DType, -// validity: Validity, -// ) -> VortexResult { -// let total_len = chunks.iter().map(|a| a.len()).sum(); -// let mut views = BufferMut::with_capacity(total_len); -// let mut buffers = Vec::new(); -// for chunk in chunks { -// let buffers_offset = u32::try_from(buffers.len())?; -// let canonical_chunk = chunk.to_varbinview()?; -// buffers.extend(canonical_chunk.buffers().iter().cloned()); - -// views.extend( -// canonical_chunk -// .views() -// .iter() -// .map(|view| view.offset_view(buffers_offset)), -// ); -// } - -// VarBinViewArray::try_new(views.freeze(), buffers, dtype.clone(), validity) -// } - #[cfg(test)] mod tests { use std::sync::Arc; From f474f95d553d746fbb2702d8c7cad4f0ffed1f92 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 25 Feb 2025 11:17:52 -0500 Subject: [PATCH 3/5] fix things --- rustfmt.toml | 1 + vortex-array/src/arrays/chunked/canonical.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rustfmt.toml b/rustfmt.toml index 9fccaf4276..62bd702f18 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -5,3 +5,4 @@ group_imports = "StdExternalCrate" unstable_features = true use_field_init_shorthand = true imports_granularity = "Module" +style_edition = "2024" diff --git a/vortex-array/src/arrays/chunked/canonical.rs b/vortex-array/src/arrays/chunked/canonical.rs index 44cba81526..044a458087 100644 --- a/vortex-array/src/arrays/chunked/canonical.rs +++ b/vortex-array/src/arrays/chunked/canonical.rs @@ -1,7 +1,7 @@ use vortex_error::VortexResult; use super::ChunkedArray; -use crate::builders::{builder_with_capacity, ArrayBuilder}; +use crate::builders::{ArrayBuilder, builder_with_capacity}; use crate::{Array as _, ArrayCanonicalImpl, Canonical}; impl ArrayCanonicalImpl for ChunkedArray { @@ -28,6 +28,7 @@ mod tests { use vortex_dtype::Nullability::NonNullable; use vortex_dtype::PType::I32; + use crate::ToCanonical; use crate::accessor::ArrayAccessor; use crate::array::Array; use crate::arrays::chunked::canonical::pack_views; @@ -35,7 +36,6 @@ mod tests { use crate::compute::{scalar_at, slice}; use crate::validity::Validity; use crate::variants::StructArrayTrait; - use crate::ToCanonical; fn stringview_array() -> VarBinViewArray { VarBinViewArray::from_iter_str(["foo", "bar", "baz", "quak"]) From 99897f316c65f5bd043e993523304b9fd677cbe1 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 25 Feb 2025 11:21:51 -0500 Subject: [PATCH 4/5] remove --- rustfmt.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/rustfmt.toml b/rustfmt.toml index 62bd702f18..9fccaf4276 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -5,4 +5,3 @@ group_imports = "StdExternalCrate" unstable_features = true use_field_init_shorthand = true imports_granularity = "Module" -style_edition = "2024" From 06023f0ecee4b47da0dbb95ddfd38c821aca35fb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 25 Feb 2025 11:52:26 -0500 Subject: [PATCH 5/5] zoom --- vortex-array/benches/chunk_array_builder.rs | 18 ++++++++---------- vortex-array/src/arrays/varbinview/mod.rs | 8 +------- vortex-array/src/stats/mod.rs | 13 +++++-------- 3 files changed, 14 insertions(+), 25 deletions(-) diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs index 3e78c5c8f7..b95c4e2acc 100644 --- a/vortex-array/benches/chunk_array_builder.rs +++ b/vortex-array/benches/chunk_array_builder.rs @@ -65,16 +65,14 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize, fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(false, len, chunk_count); - bencher - .with_inputs(|| chunks.clone()) - .bench_values(|chunk| { - let mut builder = VarBinViewBuilder::with_capacity( - DType::Utf8(chunk.dtype().nullability()), - len * chunk_count, - ); - chunk.append_to_builder(&mut builder).vortex_unwrap(); - builder.finish() - }) + bencher.with_inputs(|| chunks.clone()).bench_refs(|chunk| { + let mut builder = VarBinViewBuilder::with_capacity( + DType::Utf8(chunk.dtype().nullability()), + len * chunk_count, + ); + chunk.append_to_builder(&mut builder).vortex_unwrap(); + builder.finish() + }) } #[divan::bench(args = BENCH_ARGS)] diff --git a/vortex-array/src/arrays/varbinview/mod.rs b/vortex-array/src/arrays/varbinview/mod.rs index 423b19a313..05eb7980dc 100644 --- a/vortex-array/src/arrays/varbinview/mod.rs +++ b/vortex-array/src/arrays/varbinview/mod.rs @@ -456,13 +456,7 @@ impl ArrayStatisticsImpl for VarBinViewArray { impl ArrayCanonicalImpl for VarBinViewArray { fn _to_canonical(&self) -> VortexResult { - let nullable = self.dtype().is_nullable(); - let arrow_array = varbinview_as_arrow(self); - let vortex_array = ArrayRef::from_arrow(arrow_array, nullable); - - Ok(Canonical::VarBinView(VarBinViewArray::try_from_array( - vortex_array, - )?)) + Ok(Canonical::VarBinView(self.clone())) } fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> { diff --git a/vortex-array/src/stats/mod.rs b/vortex-array/src/stats/mod.rs index e116df48b1..007868d433 100644 --- a/vortex-array/src/stats/mod.rs +++ b/vortex-array/src/stats/mod.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use arrow_buffer::bit_iterator::BitIterator; use arrow_buffer::{BooleanBufferBuilder, MutableBuffer}; -use enum_iterator::{Sequence, all, cardinality}; +use enum_iterator::{Sequence, cardinality}; use itertools::Itertools; use log::debug; use num_enum::{IntoPrimitive, TryFromPrimitive}; @@ -327,13 +327,10 @@ pub trait Statistics { fn retain_only(&self, stats: &[Stat]); fn inherit(&self, parent: &dyn Statistics) { - for stat in all::() { - if let Some(s) = parent.get_stat(stat) { - // TODO(ngates): we may need a set_all(&[(Stat, Precision)]) method - // so we don't have to take lots of write locks. - // TODO(ngates): depending on statistic, this should choose the more precise one. - self.set_stat(stat, s); - } + let parent_stats_set = parent.stats_set(); + for (stat, value) in parent_stats_set.into_iter() { + // TODO(ngates): depending on statistic, this should choose the more precise one. + self.set_stat(stat, value); } } }