Skip to content

Commit afcb132

Browse files
authored
fix: teach list builder to slice off trailing garbage (#2514)
ListArray is allowed to have a final offset less than the length of its elements. E.g.: ``` 1, 2, 3, 4, 5 ``` with the offsets: ``` 0, 2, 4 ``` the lists are: ``` [1, 2], [3, 4] ``` The element five is included in no lists.
1 parent dbb734c commit afcb132

File tree

2 files changed

+63
-10
lines changed

2 files changed

+63
-10
lines changed

vortex-array/src/builders/list.rs

+59-10
Original file line numberDiff line numberDiff line change
@@ -119,24 +119,34 @@ impl<O: OffsetPType> ArrayBuilder for ListBuilder<O> {
119119

120120
let list = array.to_canonical()?.into_list()?;
121121

122-
let offset = self.value_builder.len();
123-
self.value_builder.extend_from_array(list.elements())?;
122+
let cursor_usize = self.value_builder.len();
123+
let cursor = O::from_usize(cursor_usize).ok_or_else(|| {
124+
vortex_err!(
125+
"cannot convert length {} to type {:?}",
126+
cursor_usize,
127+
O::PTYPE
128+
)
129+
})?;
124130

125131
let offsets = binary_numeric(
126132
&try_cast(
127133
&slice(list.offsets(), 1, list.offsets().len())?,
128134
&DType::Primitive(O::PTYPE, NonNullable),
129135
)?,
130-
&ConstantArray::new(
131-
O::from_usize(offset).ok_or_else(|| {
132-
vortex_err!("cannot convert offset {} to type {:?}", offset, O::PTYPE)
133-
})?,
134-
list.len(),
135-
),
136+
&ConstantArray::new(cursor, list.len()),
136137
BinaryNumericOperator::Add,
137138
)?;
138139
self.index_builder.extend_from_array(&offsets)?;
139140

141+
if !list.is_empty() {
142+
let last_used_index = self.index_builder.values().last().vortex_expect("there must be at least one index because we just extended a non-zero list of offsets");
143+
self.value_builder.extend_from_array(&slice(
144+
list.elements(),
145+
0,
146+
last_used_index.as_() - cursor_usize,
147+
)?)?;
148+
}
149+
140150
Ok(())
141151
}
142152

@@ -162,15 +172,18 @@ mod tests {
162172
use std::sync::Arc;
163173

164174
use Nullability::{NonNullable, Nullable};
175+
use vortex_buffer::buffer;
165176
use vortex_dtype::PType::I32;
166177
use vortex_dtype::{DType, Nullability};
167178
use vortex_scalar::Scalar;
168179

169-
use crate::ToCanonical;
170180
use crate::array::Array;
171-
use crate::arrays::{ListArray, OffsetPType};
181+
use crate::arrays::{ChunkedArray, ListArray, OffsetPType};
172182
use crate::builders::ArrayBuilder;
173183
use crate::builders::list::ListBuilder;
184+
use crate::compute::scalar_at;
185+
use crate::validity::Validity;
186+
use crate::{IntoArray as _, ToCanonical};
174187

175188
#[test]
176189
fn test_empty() {
@@ -328,4 +341,40 @@ mod tests {
328341
test_extend_builder_gen::<u32>();
329342
test_extend_builder_gen::<u64>();
330343
}
344+
345+
#[test]
346+
pub fn test_array_with_gap() {
347+
let one_trailing_unused_element = ListArray::try_new(
348+
buffer![1, 2, 3, 4].into_array(),
349+
buffer![0, 3].into_array(),
350+
Validity::NonNullable,
351+
)
352+
.unwrap();
353+
354+
let second_array = ListArray::try_new(
355+
buffer![5, 6].into_array(),
356+
buffer![0, 2].into_array(),
357+
Validity::NonNullable,
358+
)
359+
.unwrap();
360+
361+
let chunked_list = ChunkedArray::try_new(
362+
vec![
363+
one_trailing_unused_element.clone().into_array(),
364+
second_array.clone().into_array(),
365+
],
366+
DType::List(Arc::new(DType::Primitive(I32, NonNullable)), NonNullable),
367+
);
368+
369+
let canon_values = chunked_list.unwrap().to_list().unwrap();
370+
371+
assert_eq!(
372+
scalar_at(&one_trailing_unused_element, 0).unwrap(),
373+
scalar_at(&canon_values, 0).unwrap()
374+
);
375+
assert_eq!(
376+
scalar_at(&second_array, 0).unwrap(),
377+
scalar_at(&canon_values, 1).unwrap()
378+
);
379+
}
331380
}

vortex-array/src/builders/primitive.rs

+4
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ impl<T: NativePType> PrimitiveBuilder<T> {
5454
}
5555
}
5656

57+
pub fn values(&self) -> &[T] {
58+
self.values.as_ref()
59+
}
60+
5761
/// Create a new handle to the next `len` uninitialized values in the builder.
5862
///
5963
/// All reads/writes through the handle to the values buffer or the validity buffer will operate

0 commit comments

Comments
 (0)