Skip to content

Commit bbd85ed

Browse files
authored
Add ListView & LargeListView basic construction and validation (#5664)
* feat: list view basic construction and validation * fix: validate offset and sizes * chore: remove unused check * chore: add overflow checked * chore: lint
1 parent a20d2e5 commit bbd85ed

File tree

2 files changed

+154
-4
lines changed

2 files changed

+154
-4
lines changed

arrow-data/src/data.rs

+66-4
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ use crate::bit_iterator::BitSliceIterator;
2222
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
2323
use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
2424
use arrow_schema::{ArrowError, DataType, UnionMode};
25-
use std::mem;
2625
use std::ops::Range;
2726
use std::sync::Arc;
27+
use std::{mem, usize};
2828

2929
use crate::{equal, validate_binary_view, validate_string_view};
3030

@@ -929,6 +929,41 @@ impl ArrayData {
929929
Ok(())
930930
}
931931

932+
/// Does a cheap sanity check that the `self.len` values in `buffer` are valid
933+
/// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
934+
fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
935+
&self,
936+
values_length: usize,
937+
) -> Result<(), ArrowError> {
938+
let offsets: &[T] = self.typed_buffer(0, self.len)?;
939+
let sizes: &[T] = self.typed_buffer(1, self.len)?;
940+
for i in 0..values_length {
941+
let size = sizes[i].to_usize().ok_or_else(|| {
942+
ArrowError::InvalidArgumentError(format!(
943+
"Error converting size[{}] ({}) to usize for {}",
944+
i, sizes[i], self.data_type
945+
))
946+
})?;
947+
let offset = offsets[i].to_usize().ok_or_else(|| {
948+
ArrowError::InvalidArgumentError(format!(
949+
"Error converting offset[{}] ({}) to usize for {}",
950+
i, offsets[i], self.data_type
951+
))
952+
})?;
953+
if size
954+
.checked_add(offset)
955+
.expect("Offset and size have exceeded the usize boundary")
956+
> values_length
957+
{
958+
return Err(ArrowError::InvalidArgumentError(format!(
959+
"Size {} at index {} is larger than the remaining values for {}",
960+
size, i, self.data_type
961+
)));
962+
}
963+
}
964+
Ok(())
965+
}
966+
932967
/// Validates the layout of `child_data` ArrayData structures
933968
fn validate_child_data(&self) -> Result<(), ArrowError> {
934969
match &self.data_type {
@@ -942,6 +977,16 @@ impl ArrayData {
942977
self.validate_offsets::<i64>(values_data.len)?;
943978
Ok(())
944979
}
980+
DataType::ListView(field) => {
981+
let values_data = self.get_single_valid_child_data(field.data_type())?;
982+
self.validate_offsets_and_sizes::<i32>(values_data.len)?;
983+
Ok(())
984+
}
985+
DataType::LargeListView(field) => {
986+
let values_data = self.get_single_valid_child_data(field.data_type())?;
987+
self.validate_offsets_and_sizes::<i64>(values_data.len)?;
988+
Ok(())
989+
}
945990
DataType::FixedSizeList(field, list_size) => {
946991
let values_data = self.get_single_valid_child_data(field.data_type())?;
947992

@@ -1546,9 +1591,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
15461591
DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
15471592
DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
15481593
DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1549-
DataType::ListView(_) | DataType::LargeListView(_) => {
1550-
unimplemented!("ListView/LargeListView not implemented")
1551-
}
1594+
DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1595+
DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
15521596
DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
15531597
DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
15541598
DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
@@ -1661,6 +1705,24 @@ impl DataTypeLayout {
16611705
variadic: true,
16621706
}
16631707
}
1708+
1709+
/// Describes a list view type
1710+
pub fn new_list_view<T>() -> Self {
1711+
Self {
1712+
buffers: vec![
1713+
BufferSpec::FixedWidth {
1714+
byte_width: mem::size_of::<T>(),
1715+
alignment: mem::align_of::<T>(),
1716+
},
1717+
BufferSpec::FixedWidth {
1718+
byte_width: mem::size_of::<T>(),
1719+
alignment: mem::align_of::<T>(),
1720+
},
1721+
],
1722+
can_contain_null_mask: true,
1723+
variadic: true,
1724+
}
1725+
}
16641726
}
16651727

16661728
/// Layout specification for a single data type buffer

arrow/tests/array_validation.rs

+88
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,94 @@ fn test_validate_offsets_last_too_large() {
342342
.unwrap();
343343
}
344344

345+
/// Test that the list of type `data_type` generates correct offset and size out of bounds errors
346+
fn check_list_view_offsets_sizes<T: ArrowNativeType>(
347+
data_type: DataType,
348+
offsets: Vec<T>,
349+
sizes: Vec<T>,
350+
) {
351+
let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
352+
let offsets_buffer = Buffer::from_slice_ref(offsets);
353+
let sizes_buffer = Buffer::from_slice_ref(sizes);
354+
ArrayData::try_new(
355+
data_type,
356+
4,
357+
None,
358+
0,
359+
vec![offsets_buffer, sizes_buffer],
360+
vec![values.into_data()],
361+
)
362+
.unwrap();
363+
}
364+
365+
#[test]
#[should_panic(expected = "Size 3 at index 3 is larger than the remaining values for ListView")]
fn test_validate_list_view_offsets_sizes() {
    // The last entry (offset 2, size 3) reaches past the end of the child values.
    let list_type = DataType::ListView(Arc::new(Field::new("f", DataType::Int32, true)));
    let offsets = vec![0, 1, 1, 2];
    let sizes = vec![1, 1, 1, 3];
    check_list_view_offsets_sizes::<i32>(list_type, offsets, sizes);
}
375+
376+
#[test]
#[should_panic(
    expected = "Size 3 at index 3 is larger than the remaining values for LargeListView"
)]
fn test_validate_large_list_view_offsets_sizes() {
    // The last entry (offset 2, size 3) reaches past the end of the child values.
    let list_type = DataType::LargeListView(Arc::new(Field::new("f", DataType::Int32, true)));
    let offsets = vec![0, 1, 1, 2];
    let sizes = vec![1, 1, 1, 3];
    check_list_view_offsets_sizes::<i64>(list_type, offsets, sizes);
}
388+
389+
#[test]
#[should_panic(expected = "Error converting offset[1] (-1) to usize for ListView")]
fn test_validate_list_view_negative_offsets() {
    // The offset at index 1 is negative and cannot be converted to usize.
    let list_type = DataType::ListView(Arc::new(Field::new("f", DataType::Int32, true)));
    let offsets = vec![0, -1, 1, 2];
    let sizes = vec![1, 1, 1, 3];
    check_list_view_offsets_sizes::<i32>(list_type, offsets, sizes);
}
399+
400+
#[test]
#[should_panic(expected = "Error converting size[2] (-1) to usize for ListView")]
fn test_validate_list_view_negative_sizes() {
    // The size at index 2 is negative and cannot be converted to usize.
    let list_type = DataType::ListView(Arc::new(Field::new("f", DataType::Int32, true)));
    let offsets = vec![0, 1, 1, 2];
    let sizes = vec![1, 1, -1, 3];
    check_list_view_offsets_sizes::<i32>(list_type, offsets, sizes);
}
410+
411+
#[test]
#[should_panic(expected = "Error converting offset[1] (-1) to usize for LargeListView")]
fn test_validate_large_list_view_negative_offsets() {
    // The offset at index 1 is negative and cannot be converted to usize.
    let list_type = DataType::LargeListView(Arc::new(Field::new("f", DataType::Int32, true)));
    let offsets = vec![0, -1, 1, 2];
    let sizes = vec![1, 1, 1, 3];
    check_list_view_offsets_sizes::<i64>(list_type, offsets, sizes);
}
421+
422+
#[test]
#[should_panic(expected = "Error converting size[2] (-1) to usize for LargeListView")]
fn test_validate_large_list_view_negative_sizes() {
    // The size at index 2 is negative and cannot be converted to usize.
    let list_type = DataType::LargeListView(Arc::new(Field::new("f", DataType::Int32, true)));
    let offsets = vec![0, 1, 1, 2];
    let sizes = vec![1, 1, -1, 3];
    check_list_view_offsets_sizes::<i64>(list_type, offsets, sizes);
}
432+
345433
#[test]
346434
#[should_panic(
347435
expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"

0 commit comments

Comments
 (0)