diff --git a/vortex-array/benches/chunk_array_builder.rs b/vortex-array/benches/chunk_array_builder.rs index 3e78c5c8f7..b95c4e2acc 100644 --- a/vortex-array/benches/chunk_array_builder.rs +++ b/vortex-array/benches/chunk_array_builder.rs @@ -65,16 +65,14 @@ fn chunked_opt_bool_into_canonical(bencher: Bencher, (len, chunk_count): (usize, fn chunked_varbinview_canonical_into(bencher: Bencher, (len, chunk_count): (usize, usize)) { let chunks = make_string_chunks(false, len, chunk_count); - bencher - .with_inputs(|| chunks.clone()) - .bench_values(|chunk| { - let mut builder = VarBinViewBuilder::with_capacity( - DType::Utf8(chunk.dtype().nullability()), - len * chunk_count, - ); - chunk.append_to_builder(&mut builder).vortex_unwrap(); - builder.finish() - }) + bencher.with_inputs(|| chunks.clone()).bench_refs(|chunk| { + let mut builder = VarBinViewBuilder::with_capacity( + DType::Utf8(chunk.dtype().nullability()), + len * chunk_count, + ); + chunk.append_to_builder(&mut builder).vortex_unwrap(); + builder.finish() + }) } #[divan::bench(args = BENCH_ARGS)] diff --git a/vortex-array/src/arrays/chunked/canonical.rs b/vortex-array/src/arrays/chunked/canonical.rs index c9526fb700..044a458087 100644 --- a/vortex-array/src/arrays/chunked/canonical.rs +++ b/vortex-array/src/arrays/chunked/canonical.rs @@ -1,24 +1,14 @@ -use arrow_buffer::BooleanBufferBuilder; -use vortex_buffer::BufferMut; -use vortex_dtype::{DType, NativePType, Nullability, PType, StructDType, match_each_native_ptype}; -use vortex_error::{VortexExpect, VortexResult, vortex_err}; +use vortex_error::VortexResult; -use crate::array::ArrayCanonicalImpl; -use crate::arrays::chunked::ChunkedArray; -use crate::arrays::extension::ExtensionArray; -use crate::arrays::null::NullArray; -use crate::arrays::primitive::PrimitiveArray; -use crate::arrays::struct_::StructArray; -use crate::arrays::{BoolArray, ListArray, VarBinViewArray}; -use crate::builders::ArrayBuilder; -use crate::compute::{scalar_at, slice, try_cast}; -use crate::validity::Validity; -use crate::{Array, ArrayRef, ArrayVariants, Canonical, ToCanonical}; +use super::ChunkedArray; +use crate::builders::{ArrayBuilder, builder_with_capacity}; +use crate::{Array as _, ArrayCanonicalImpl, Canonical}; impl ArrayCanonicalImpl for ChunkedArray { fn _to_canonical(&self) -> VortexResult { - let validity = Validity::copy_from_array(self)?; - try_canonicalize_chunks(self.chunks(), validity, self.dtype()) + let mut builder = builder_with_capacity(self.dtype(), self.len()); + self.append_to_builder(builder.as_mut())?; + builder.finish().to_canonical() } fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> { @@ -29,236 +19,6 @@ impl ArrayCanonicalImpl for ChunkedArray { } } -pub(crate) fn try_canonicalize_chunks( - chunks: &[ArrayRef], - validity: Validity, - dtype: &DType, -) -> VortexResult { - match dtype { - // Structs can have their internal field pointers swizzled to push the chunking down - // one level internally without copying or decompressing any data. - DType::Struct(struct_dtype, _) => { - let struct_array = swizzle_struct_chunks(chunks, validity, struct_dtype)?; - Ok(Canonical::Struct(struct_array)) - } - - // Extension arrays are containers that wraps an inner storage array with some metadata. - // We delegate to the canonical format of the internal storage array for every chunk, and - // push the chunking down into the inner storage array. - // - // Input: - // ------ - // - // ChunkedArray - // / \ - // / \ - // ExtensionArray ExtensionArray - // | | - // storage storage - // - // - // Output: - // ------ - // - // ExtensionArray - // | - // ChunkedArray - // / \ - // storage storage - // - DType::Extension(ext_dtype) => { - // Recursively apply canonicalization and packing to the storage array backing - // each chunk of the extension array. - let storage_chunks: Vec = chunks - .iter() - // Extension-typed arrays can be compressed into something that is not an - // ExtensionArray, so we should canonicalize each chunk into ExtensionArray first. - .map(|chunk| { - chunk - .clone() - .as_extension_typed() - .vortex_expect("Chunk could not be downcast to ExtensionArrayTrait") - .storage_data() - }) - .collect(); - let storage_dtype = ext_dtype.storage_dtype().clone(); - let chunked_storage = - ChunkedArray::try_new(storage_chunks, storage_dtype)?.into_array(); - - Ok(Canonical::Extension(ExtensionArray::new( - ext_dtype.clone(), - chunked_storage, - ))) - } - - DType::List(..) => { - // TODO(joe): improve performance, use a listview, once it exists - - let list = pack_lists(chunks, validity, dtype)?; - Ok(Canonical::List(list)) - } - - DType::Bool(_) => { - let bool_array = pack_bools(chunks, validity)?; - Ok(Canonical::Bool(bool_array)) - } - DType::Primitive(ptype, _) => { - match_each_native_ptype!(ptype, |$P| { - let prim_array = pack_primitives::<$P>(chunks, validity)?; - Ok(Canonical::Primitive(prim_array)) - }) - } - DType::Utf8(_) => { - let varbin_array = pack_views(chunks, dtype, validity)?; - Ok(Canonical::VarBinView(varbin_array)) - } - DType::Binary(_) => { - let varbin_array = pack_views(chunks, dtype, validity)?; - Ok(Canonical::VarBinView(varbin_array)) - } - DType::Null => { - let len = chunks.iter().map(|chunk| chunk.len()).sum(); - let null_array = NullArray::new(len); - Ok(Canonical::Null(null_array)) - } - } -} - -fn pack_lists(chunks: &[ArrayRef], validity: Validity, dtype: &DType) -> VortexResult { - let len: usize = chunks.iter().map(|c| c.len()).sum(); - let mut offsets = BufferMut::::with_capacity(len + 1); - offsets.push(0); - let mut elements = Vec::new(); - let elem_dtype = dtype - .as_list_element() - .vortex_expect("ListArray must have List dtype"); - - for chunk in chunks { - let chunk = chunk.to_list()?; - // TODO: handle i32 offsets if they fit. - let offsets_arr = try_cast( - chunk.offsets(), - &DType::Primitive(PType::I64, Nullability::NonNullable), - )? - .to_primitive()?; - - let first_offset_value: usize = usize::try_from(&scalar_at(&offsets_arr, 0)?)?; - let last_offset_value: usize = - usize::try_from(&scalar_at(&offsets_arr, offsets_arr.len() - 1)?)?; - elements.push(slice( - chunk.elements(), - first_offset_value, - last_offset_value, - )?); - - let adjustment_from_previous = *offsets - .last() - .ok_or_else(|| vortex_err!("List offsets must have at least one element"))?; - offsets.extend( - offsets_arr - .as_slice::() - .iter() - .skip(1) - .map(|off| off + adjustment_from_previous - first_offset_value as i64), - ); - } - let chunked_elements = ChunkedArray::try_new(elements, elem_dtype.clone())?.into_array(); - let offsets = PrimitiveArray::new(offsets.freeze(), Validity::NonNullable); - - ListArray::try_new(chunked_elements, offsets.into_array(), validity) -} - -/// Swizzle the pointers within a ChunkedArray of StructArrays to instead be a single -/// StructArray, where the Array for each Field is a ChunkedArray. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn swizzle_struct_chunks( - chunks: &[ArrayRef], - validity: Validity, - struct_dtype: &StructDType, -) -> VortexResult { - let len = chunks.iter().map(|chunk| chunk.len()).sum(); - let mut field_arrays = Vec::new(); - - for (field_idx, field_dtype) in struct_dtype.fields().enumerate() { - let field_chunks = chunks - .iter() - .map(|c| { - c.as_struct_typed() - .vortex_expect("Chunk was not a StructArray") - .maybe_null_field_by_idx(field_idx) - .vortex_expect("Invalid chunked array") - }) - .collect::>(); - let field_array = ChunkedArray::try_new(field_chunks, field_dtype.clone())?; - field_arrays.push(field_array.into_array()); - } - - StructArray::try_new(struct_dtype.names().clone(), field_arrays, len, validity) -} - -/// Builds a new [BoolArray] by repacking the values from the chunks in a single contiguous array. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn pack_bools(chunks: &[ArrayRef], validity: Validity) -> VortexResult { - let len = chunks.iter().map(|chunk| chunk.len()).sum(); - let mut buffer = BooleanBufferBuilder::new(len); - for chunk in chunks { - let chunk = chunk.to_bool()?; - buffer.append_buffer(chunk.boolean_buffer()); - } - Ok(BoolArray::new(buffer.finish(), validity)) -} - -/// Builds a new [PrimitiveArray] by repacking the values from the chunks into a single -/// contiguous array. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn pack_primitives( - chunks: &[ArrayRef], - validity: Validity, -) -> VortexResult { - let total_len = chunks.iter().map(|a| a.len()).sum(); - let mut buffer = BufferMut::with_capacity(total_len); - for chunk in chunks { - let chunk = chunk.to_primitive()?; - buffer.extend_from_slice(chunk.as_slice::()); - } - Ok(PrimitiveArray::new(buffer.freeze(), validity)) -} - -/// Builds a new [VarBinViewArray] by repacking the values from the chunks into a single -/// contiguous array. -/// -/// It is expected this function is only called from [try_canonicalize_chunks], and thus all chunks have -/// been checked to have the same DType already. -fn pack_views( - chunks: &[ArrayRef], - dtype: &DType, - validity: Validity, -) -> VortexResult { - let total_len = chunks.iter().map(|a| a.len()).sum(); - let mut views = BufferMut::with_capacity(total_len); - let mut buffers = Vec::new(); - for chunk in chunks { - let buffers_offset = u32::try_from(buffers.len())?; - let canonical_chunk = chunk.to_varbinview()?; - buffers.extend(canonical_chunk.buffers().iter().cloned()); - - views.extend( - canonical_chunk - .views() - .iter() - .map(|view| view.offset_view(buffers_offset)), - ); - } - - VarBinViewArray::try_new(views.freeze(), buffers, dtype.clone(), validity) -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/vortex-array/src/arrays/varbinview/mod.rs b/vortex-array/src/arrays/varbinview/mod.rs index 423b19a313..05eb7980dc 100644 --- a/vortex-array/src/arrays/varbinview/mod.rs +++ b/vortex-array/src/arrays/varbinview/mod.rs @@ -456,13 +456,7 @@ impl ArrayStatisticsImpl for VarBinViewArray { impl ArrayCanonicalImpl for VarBinViewArray { fn _to_canonical(&self) -> VortexResult { - let nullable = self.dtype().is_nullable(); - let arrow_array = varbinview_as_arrow(self); - let vortex_array = ArrayRef::from_arrow(arrow_array, nullable); - - Ok(Canonical::VarBinView(VarBinViewArray::try_from_array( - vortex_array, - )?)) + Ok(Canonical::VarBinView(self.clone())) } fn _append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> { diff --git a/vortex-array/src/stats/mod.rs b/vortex-array/src/stats/mod.rs index e116df48b1..007868d433 100644 --- a/vortex-array/src/stats/mod.rs +++ b/vortex-array/src/stats/mod.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use arrow_buffer::bit_iterator::BitIterator; use arrow_buffer::{BooleanBufferBuilder, MutableBuffer}; -use enum_iterator::{Sequence, all, cardinality}; +use enum_iterator::{Sequence, cardinality}; use itertools::Itertools; use log::debug; use num_enum::{IntoPrimitive, TryFromPrimitive}; @@ -327,13 +327,10 @@ pub trait Statistics { fn retain_only(&self, stats: &[Stat]); fn inherit(&self, parent: &dyn Statistics) { - for stat in all::() { - if let Some(s) = parent.get_stat(stat) { - // TODO(ngates): we may need a set_all(&[(Stat, Precision)]) method - // so we don't have to take lots of write locks. - // TODO(ngates): depending on statistic, this should choose the more precise one. - self.set_stat(stat, s); - } + let parent_stats_set = parent.stats_set(); + for (stat, value) in parent_stats_set.into_iter() { + // TODO(ngates): depending on statistic, this should choose the more precise one. + self.set_stat(stat, value); } } }