diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 0f95adacf10c..ae2479e81f2d 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::print_long_array; +use crate::array::{print_long_array, replace_nulls}; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; @@ -316,6 +316,10 @@ impl Array for BooleanArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_null_count(&self) -> usize { self.null_count() } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index a57abc5b1e71..4cb85e15db83 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{get_offsets, print_long_array}; +use crate::array::{get_offsets, print_long_array, replace_nulls}; use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; @@ -461,6 +461,10 @@ impl Array for GenericByteArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut sum = self.value_offsets.inner().inner().capacity(); sum += self.value_data.capacity(); diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 5b313913a7ef..ab2915625e3c 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::print_long_array; +use crate::array::{print_long_array, replace_nulls}; use crate::builder::{ArrayBuilder, GenericByteViewBuilder}; use crate::iterator::ArrayIter; use crate::types::bytes::ByteArrayNativeType; @@ -549,6 +549,10 @@ impl Array for GenericByteViewArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::(); sum += self.views.inner().capacity(); diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 6f27b383c0ea..7c325460c4a0 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::array::replace_nulls; use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder}; use crate::cast::AsArray; use crate::iterator::ArrayIter; @@ -728,6 +729,10 @@ impl Array for DictionaryArray { self.keys.nulls() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { match self.values.nulls() { None => self.nulls().cloned(), @@ -862,6 +867,10 @@ impl Array for TypedDictionaryArray<'_, K, V self.dictionary.nulls() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { self.dictionary.logical_nulls() } diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 1371e81e2650..693563375af2 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::print_long_array; +use crate::array::{print_long_array, replace_nulls}; use crate::iterator::FixedSizeBinaryIter; use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray, Scalar}; use arrow_buffer::buffer::NullBuffer; @@ -610,6 +610,10 @@ impl Array for FixedSizeBinaryArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut sum = self.value_data.capacity(); if let Some(n) = &self.nulls { diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 72855cef1f04..2e9b9d497e6c 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::print_long_array; +use crate::array::{print_long_array, replace_nulls}; use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder}; use crate::iterator::FixedSizeListIter; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType}; @@ -409,6 +409,10 @@ impl Array for FixedSizeListArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut size = self.values.get_buffer_memory_size(); if let Some(n) = self.nulls.as_ref() { diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 06d5ee4e142d..b187ddfefce5 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{get_offsets, make_array, print_long_array}; +use crate::array::{get_offsets, make_array, print_long_array, replace_nulls}; use crate::builder::{GenericListBuilder, PrimitiveBuilder}; use crate::{ iterator::GenericListArrayIter, new_empty_array, Array, ArrayAccessor, ArrayRef, @@ -493,6 +493,10 @@ impl Array for GenericListArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut size = self.values.get_buffer_memory_size(); size += self.value_offsets.inner().inner().capacity(); diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index bab686c3e47a..808ac26fc230 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -22,7 +22,7 @@ use std::any::Any; use std::ops::Add; use std::sync::Arc; -use crate::array::{make_array, print_long_array}; +use crate::array::{make_array, print_long_array, replace_nulls}; use crate::iterator::GenericListViewArrayIter; use crate::{new_empty_array, Array, ArrayAccessor, ArrayRef, FixedSizeListArray, OffsetSizeTrait}; @@ -334,6 +334,10 @@ impl Array for GenericListViewArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut size = self.values.get_buffer_memory_size(); size += self.value_offsets.inner().capacity(); diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index d40b8ee84518..45f841322a97 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::{get_offsets, print_long_array}; +use crate::array::{get_offsets, print_long_array, replace_nulls}; use crate::iterator::MapArrayIter; use crate::{make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; @@ -380,6 +380,10 @@ impl Array for MapArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut size = self.entries.get_buffer_memory_size(); size += self.value_offsets.inner().inner().capacity(); diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 7ca59680a6fe..b332587545ea 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -103,6 +103,8 @@ pub trait Array: std::fmt::Debug + Send + Sync { fn as_any(&self) -> &dyn Any; /// Returns the underlying data of this array + /// + /// See [`Self::into_data`] for a version that consumes self fn to_data(&self) -> ArrayData; /// Returns the underlying data of this array @@ -196,6 +198,28 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// use the slower [`Array::logical_nulls`] to obtain a computed mask. fn nulls(&self) -> Option<&NullBuffer>; + /// Replaces the nulls of this array. + /// + /// # Panics + /// Panics if the length of the null buffer is not equal to the length of the array. 
+ /// + /// # Example: + /// ``` + /// # use arrow_array::{Array, Int32Array}; + /// # use arrow_array::cast::AsArray; + /// # use arrow_buffer::NullBuffer; + /// // Create an array with values [1, null, 3, 4, 5] + /// let array = Int32Array::from(vec![Some(1), None, Some(3), Some(4), Some(5)]); + /// // Set the first, third, and fifth elements to null, others to valid + /// let nulls = Some(NullBuffer::from(vec![false, true, false, true, false])); + /// let array_with_nulls = array.with_nulls(nulls); + /// assert_eq!( + /// array_with_nulls.as_primitive(), + /// &Int32Array::from(vec![None, Some(0), None, Some(4), None]), + /// ); + /// ``` + fn with_nulls(self, nulls: Option) -> ArrayRef; + /// Returns a potentially computed [`NullBuffer`] that represents the logical /// null values of this array, if any. /// @@ -372,6 +396,10 @@ impl Array for ArrayRef { self.as_ref().nulls() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { self.as_ref().logical_nulls() } @@ -442,6 +470,10 @@ impl Array for &T { T::nulls(self) } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { T::logical_nulls(self) } @@ -843,6 +875,30 @@ where Ok(()) } +/// Helper function to replace the nulls in an array with a new null buffer +/// +/// Passing `None` removes any existing null buffer from the array +/// +/// See [`Array::with_nulls`] for more information +pub(crate) fn replace_nulls(array_data: ArrayData, nulls: Option) -> ArrayRef { + if let Some(nulls) = &nulls { + if nulls.len() != array_data.len() { + panic!( + "Null buffer length must be equal to the array length. 
\ + Expected: {}, got: {}", + array_data.len(), + nulls.len() + ); + } + } + + let data = array_data.into_builder().nulls(nulls); + + // SAFETY: + // Checked that the null buffer (if any) has the same length as the array + make_array(unsafe { data.build_unchecked() }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index 9a7a5ebe17fe..a218ee062571 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -17,6 +17,7 @@ //! Contains the `NullArray` type. +use crate::array::replace_nulls; use crate::builder::NullBuilder; use crate::{Array, ArrayRef}; use arrow_buffer::buffer::NullBuffer; @@ -113,6 +114,10 @@ impl Array for NullArray { None } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { (self.len != 0).then(|| NullBuffer::new_null(self.len)) } diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 7b0d6c5ca1b6..7c4fbd490274 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::array::print_long_array; +use crate::array::{print_long_array, replace_nulls}; use crate::builder::{BooleanBufferBuilder, BufferBuilder, PrimitiveBuilder}; use crate::iterator::PrimitiveIter; use crate::temporal_conversions::{ @@ -1160,6 +1160,10 @@ impl Array for PrimitiveArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_null_count(&self) -> usize { self.null_count() } diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index 81c8cdcea4d3..cbc68985f45a 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -22,6 +22,7 @@ use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuff use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field}; +use crate::array::replace_nulls; use crate::{ builder::StringRunBuilder, make_array, @@ -338,6 +339,10 @@ impl Array for RunArray { None } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { let len = self.len(); let nulls = self.values.logical_nulls()?; @@ -592,6 +597,10 @@ impl Array for TypedRunArray<'_, R, V> { self.run_array.nulls() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { self.run_array.logical_nulls() } diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 059bc0b5e65b..ba27253345b7 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::array::replace_nulls; use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -377,6 +378,10 @@ impl Array for StructArray { self.nulls.as_ref() } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn get_buffer_memory_size(&self) -> usize { let mut size = self.fields.iter().map(|a| a.get_buffer_memory_size()).sum(); if let Some(n) = self.nulls.as_ref() { diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 3c6da5a7b5c0..224931b590a5 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -16,6 +16,7 @@ // under the License. #![allow(clippy::enum_clike_unportable_variant)] +use crate::array::replace_nulls; use crate::{make_array, Array, ArrayRef}; use arrow_buffer::bit_chunk_iterator::{BitChunkIterator, BitChunks}; use arrow_buffer::buffer::NullBuffer; @@ -752,6 +753,10 @@ impl Array for UnionArray { None } + fn with_nulls(self, nulls: Option) -> ArrayRef { + replace_nulls(self.to_data(), nulls) + } + fn logical_nulls(&self) -> Option { let fields = match self.data_type() { DataType::Union(fields, _) => fields, diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index d1e3c35bfbde..3d6368bbed72 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -27,17 +27,16 @@ use arrow_schema::{ArrowError, DataType}; /// /// Typically used to implement NULLIF. 
pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { - let left_data = left.to_data(); - - if left_data.len() != right.len() { + if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform comparison operation on arrays of different length".to_string(), )); } - let len = left_data.len(); + let len = left.len(); - if len == 0 || left_data.data_type() == &DataType::Null { - return Ok(make_array(left_data)); + let left = make_array(left.to_data()); + if len == 0 || left.data_type() == &DataType::Null { + return Ok(left); } // left=0 (null) right=null output bitmap=null @@ -57,7 +56,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut valid_count = 0; let b = bitwise_bin_op_helper( @@ -89,11 +88,8 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result