Skip to content

Commit 78762dc

Browse files
clide-stustvold
andauthored
Split arrow_cast::cast::list into its own submodule (#5537)
* Split up arrow_cast::list * Update arrow-cast/src/cast/mod.rs Co-authored-by: Raphael Taylor-Davies <[email protected]> * Fix failing tests --------- Co-authored-by: Clide Stefani <[email protected]> Co-authored-by: Raphael Taylor-Davies <[email protected]>
1 parent 161924e commit 78762dc

File tree

2 files changed

+174
-151
lines changed

2 files changed

+174
-151
lines changed

arrow-cast/src/cast/list.rs

+171
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::cast::*;
19+
20+
/// Helper function that takes a primitive array and casts to a (generic) list array.
21+
pub(crate) fn cast_values_to_list<O: OffsetSizeTrait>(
22+
array: &dyn Array,
23+
to: &FieldRef,
24+
cast_options: &CastOptions,
25+
) -> Result<ArrayRef, ArrowError> {
26+
let values = cast_with_options(array, to.data_type(), cast_options)?;
27+
let offsets = OffsetBuffer::from_lengths(std::iter::repeat(1).take(values.len()));
28+
let list = GenericListArray::<O>::new(to.clone(), offsets, values, None);
29+
Ok(Arc::new(list))
30+
}
31+
32+
/// Helper function that takes a primitive array and casts to a fixed size list array.
33+
pub(crate) fn cast_values_to_fixed_size_list(
34+
array: &dyn Array,
35+
to: &FieldRef,
36+
size: i32,
37+
cast_options: &CastOptions,
38+
) -> Result<ArrayRef, ArrowError> {
39+
let values = cast_with_options(array, to.data_type(), cast_options)?;
40+
let list = FixedSizeListArray::new(to.clone(), size, values, None);
41+
Ok(Arc::new(list))
42+
}
43+
44+
pub(crate) fn cast_fixed_size_list_to_list<OffsetSize>(
45+
array: &dyn Array,
46+
) -> Result<ArrayRef, ArrowError>
47+
where
48+
OffsetSize: OffsetSizeTrait,
49+
{
50+
let fixed_size_list: &FixedSizeListArray = array.as_fixed_size_list();
51+
let list: GenericListArray<OffsetSize> = fixed_size_list.clone().into();
52+
Ok(Arc::new(list))
53+
}
54+
55+
pub(crate) fn cast_list_to_fixed_size_list<OffsetSize>(
56+
array: &GenericListArray<OffsetSize>,
57+
field: &FieldRef,
58+
size: i32,
59+
cast_options: &CastOptions,
60+
) -> Result<ArrayRef, ArrowError>
61+
where
62+
OffsetSize: OffsetSizeTrait,
63+
{
64+
let cap = array.len() * size as usize;
65+
66+
let mut nulls = (cast_options.safe || array.null_count() != 0).then(|| {
67+
let mut buffer = BooleanBufferBuilder::new(array.len());
68+
match array.nulls() {
69+
Some(n) => buffer.append_buffer(n.inner()),
70+
None => buffer.append_n(array.len(), true),
71+
}
72+
buffer
73+
});
74+
75+
// Nulls in FixedSizeListArray take up space and so we must pad the values
76+
let values = array.values().to_data();
77+
let mut mutable = MutableArrayData::new(vec![&values], cast_options.safe, cap);
78+
// The end position in values of the last incorrectly-sized list slice
79+
let mut last_pos = 0;
80+
for (idx, w) in array.offsets().windows(2).enumerate() {
81+
let start_pos = w[0].as_usize();
82+
let end_pos = w[1].as_usize();
83+
let len = end_pos - start_pos;
84+
85+
if len != size as usize {
86+
if cast_options.safe || array.is_null(idx) {
87+
if last_pos != start_pos {
88+
// Extend with valid slices
89+
mutable.extend(0, last_pos, start_pos);
90+
}
91+
// Pad this slice with nulls
92+
mutable.extend_nulls(size as _);
93+
nulls.as_mut().unwrap().set_bit(idx, false);
94+
// Set last_pos to the end of this slice's values
95+
last_pos = end_pos
96+
} else {
97+
return Err(ArrowError::CastError(format!(
98+
"Cannot cast to FixedSizeList({size}): value at index {idx} has length {len}",
99+
)));
100+
}
101+
}
102+
}
103+
104+
let values = match last_pos {
105+
0 => array.values().slice(0, cap), // All slices were the correct length
106+
_ => {
107+
if mutable.len() != cap {
108+
// Remaining slices were all correct length
109+
let remaining = cap - mutable.len();
110+
mutable.extend(0, last_pos, last_pos + remaining)
111+
}
112+
make_array(mutable.freeze())
113+
}
114+
};
115+
116+
// Cast the inner values if necessary
117+
let values = cast_with_options(values.as_ref(), field.data_type(), cast_options)?;
118+
119+
// Construct the FixedSizeListArray
120+
let nulls = nulls.map(|mut x| x.finish().into());
121+
let array = FixedSizeListArray::new(field.clone(), size, values, nulls);
122+
Ok(Arc::new(array))
123+
}
124+
125+
/// Helper function that takes an Generic list container and casts the inner datatype.
126+
pub(crate) fn cast_list_values<O: OffsetSizeTrait>(
127+
array: &dyn Array,
128+
to: &FieldRef,
129+
cast_options: &CastOptions,
130+
) -> Result<ArrayRef, ArrowError> {
131+
let list = array.as_list::<O>();
132+
let values = cast_with_options(list.values(), to.data_type(), cast_options)?;
133+
Ok(Arc::new(GenericListArray::<O>::new(
134+
to.clone(),
135+
list.offsets().clone(),
136+
values,
137+
list.nulls().cloned(),
138+
)))
139+
}
140+
141+
/// Cast the container type of List/Largelist array along with the inner datatype
142+
pub(crate) fn cast_list<I: OffsetSizeTrait, O: OffsetSizeTrait>(
143+
array: &dyn Array,
144+
field: &FieldRef,
145+
cast_options: &CastOptions,
146+
) -> Result<ArrayRef, ArrowError> {
147+
let list = array.as_list::<I>();
148+
let values = list.values();
149+
let offsets = list.offsets();
150+
let nulls = list.nulls().cloned();
151+
152+
if !O::IS_LARGE && values.len() > i32::MAX as usize {
153+
return Err(ArrowError::ComputeError(
154+
"LargeList too large to cast to List".into(),
155+
));
156+
}
157+
158+
// Recursively cast values
159+
let values = cast_with_options(values, field.data_type(), cast_options)?;
160+
let offsets: Vec<_> = offsets.iter().map(|x| O::usize_as(x.as_usize())).collect();
161+
162+
// Safety: valid offsets and checked for overflow
163+
let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
164+
165+
Ok(Arc::new(GenericListArray::<O>::new(
166+
field.clone(),
167+
offsets,
168+
values,
169+
nulls,
170+
)))
171+
}

arrow-cast/src/cast.rs arrow-cast/src/cast/mod.rs

+3-151
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
//! assert_eq!(7.0, c.value(2));
3838
//! ```
3939
40+
mod list;
41+
use crate::cast::list::*;
42+
4043
use chrono::{NaiveTime, Offset, TimeZone, Utc};
4144
use std::cmp::Ordering;
4245
use std::sync::Arc;
@@ -3051,30 +3054,6 @@ where
30513054
Ok(Arc::new(b.finish()))
30523055
}
30533056

3054-
/// Helper function that takes a primitive array and casts to a (generic) list array.
3055-
fn cast_values_to_list<O: OffsetSizeTrait>(
3056-
array: &dyn Array,
3057-
to: &FieldRef,
3058-
cast_options: &CastOptions,
3059-
) -> Result<ArrayRef, ArrowError> {
3060-
let values = cast_with_options(array, to.data_type(), cast_options)?;
3061-
let offsets = OffsetBuffer::from_lengths(std::iter::repeat(1).take(values.len()));
3062-
let list = GenericListArray::<O>::new(to.clone(), offsets, values, None);
3063-
Ok(Arc::new(list))
3064-
}
3065-
3066-
/// Helper function that takes a primitive array and casts to a fixed size list array.
3067-
fn cast_values_to_fixed_size_list(
3068-
array: &dyn Array,
3069-
to: &FieldRef,
3070-
size: i32,
3071-
cast_options: &CastOptions,
3072-
) -> Result<ArrayRef, ArrowError> {
3073-
let values = cast_with_options(array, to.data_type(), cast_options)?;
3074-
let list = FixedSizeListArray::new(to.clone(), size, values, None);
3075-
Ok(Arc::new(list))
3076-
}
3077-
30783057
/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
30793058
/// offset size so re-encoding offset is unnecessary.
30803059
fn cast_binary_to_string<O: OffsetSizeTrait>(
@@ -3217,133 +3196,6 @@ where
32173196
Ok(Arc::new(GenericByteArray::<TO>::from(array_data)))
32183197
}
32193198

3220-
fn cast_fixed_size_list_to_list<OffsetSize>(array: &dyn Array) -> Result<ArrayRef, ArrowError>
3221-
where
3222-
OffsetSize: OffsetSizeTrait,
3223-
{
3224-
let fixed_size_list: &FixedSizeListArray = array.as_fixed_size_list();
3225-
let list: GenericListArray<OffsetSize> = fixed_size_list.clone().into();
3226-
Ok(Arc::new(list))
3227-
}
3228-
3229-
fn cast_list_to_fixed_size_list<OffsetSize>(
3230-
array: &GenericListArray<OffsetSize>,
3231-
field: &FieldRef,
3232-
size: i32,
3233-
cast_options: &CastOptions,
3234-
) -> Result<ArrayRef, ArrowError>
3235-
where
3236-
OffsetSize: OffsetSizeTrait,
3237-
{
3238-
let cap = array.len() * size as usize;
3239-
3240-
let mut nulls = (cast_options.safe || array.null_count() != 0).then(|| {
3241-
let mut buffer = BooleanBufferBuilder::new(array.len());
3242-
match array.nulls() {
3243-
Some(n) => buffer.append_buffer(n.inner()),
3244-
None => buffer.append_n(array.len(), true),
3245-
}
3246-
buffer
3247-
});
3248-
3249-
// Nulls in FixedSizeListArray take up space and so we must pad the values
3250-
let values = array.values().to_data();
3251-
let mut mutable = MutableArrayData::new(vec![&values], cast_options.safe, cap);
3252-
// The end position in values of the last incorrectly-sized list slice
3253-
let mut last_pos = 0;
3254-
for (idx, w) in array.offsets().windows(2).enumerate() {
3255-
let start_pos = w[0].as_usize();
3256-
let end_pos = w[1].as_usize();
3257-
let len = end_pos - start_pos;
3258-
3259-
if len != size as usize {
3260-
if cast_options.safe || array.is_null(idx) {
3261-
if last_pos != start_pos {
3262-
// Extend with valid slices
3263-
mutable.extend(0, last_pos, start_pos);
3264-
}
3265-
// Pad this slice with nulls
3266-
mutable.extend_nulls(size as _);
3267-
nulls.as_mut().unwrap().set_bit(idx, false);
3268-
// Set last_pos to the end of this slice's values
3269-
last_pos = end_pos
3270-
} else {
3271-
return Err(ArrowError::CastError(format!(
3272-
"Cannot cast to FixedSizeList({size}): value at index {idx} has length {len}",
3273-
)));
3274-
}
3275-
}
3276-
}
3277-
3278-
let values = match last_pos {
3279-
0 => array.values().slice(0, cap), // All slices were the correct length
3280-
_ => {
3281-
if mutable.len() != cap {
3282-
// Remaining slices were all correct length
3283-
let remaining = cap - mutable.len();
3284-
mutable.extend(0, last_pos, last_pos + remaining)
3285-
}
3286-
make_array(mutable.freeze())
3287-
}
3288-
};
3289-
3290-
// Cast the inner values if necessary
3291-
let values = cast_with_options(values.as_ref(), field.data_type(), cast_options)?;
3292-
3293-
// Construct the FixedSizeListArray
3294-
let nulls = nulls.map(|mut x| x.finish().into());
3295-
let array = FixedSizeListArray::new(field.clone(), size, values, nulls);
3296-
Ok(Arc::new(array))
3297-
}
3298-
3299-
/// Helper function that takes a generic list container and casts the inner datatype.
3300-
fn cast_list_values<O: OffsetSizeTrait>(
3301-
array: &dyn Array,
3302-
to: &FieldRef,
3303-
cast_options: &CastOptions,
3304-
) -> Result<ArrayRef, ArrowError> {
3305-
let list = array.as_list::<O>();
3306-
let values = cast_with_options(list.values(), to.data_type(), cast_options)?;
3307-
Ok(Arc::new(GenericListArray::<O>::new(
3308-
to.clone(),
3309-
list.offsets().clone(),
3310-
values,
3311-
list.nulls().cloned(),
3312-
)))
3313-
}
3314-
3315-
/// Cast the container type of List/LargeList array along with the inner datatype
3316-
fn cast_list<I: OffsetSizeTrait, O: OffsetSizeTrait>(
3317-
array: &dyn Array,
3318-
field: &FieldRef,
3319-
cast_options: &CastOptions,
3320-
) -> Result<ArrayRef, ArrowError> {
3321-
let list = array.as_list::<I>();
3322-
let values = list.values();
3323-
let offsets = list.offsets();
3324-
let nulls = list.nulls().cloned();
3325-
3326-
if !O::IS_LARGE && values.len() > i32::MAX as usize {
3327-
return Err(ArrowError::ComputeError(
3328-
"LargeList too large to cast to List".into(),
3329-
));
3330-
}
3331-
3332-
// Recursively cast values
3333-
let values = cast_with_options(values, field.data_type(), cast_options)?;
3334-
let offsets: Vec<_> = offsets.iter().map(|x| O::usize_as(x.as_usize())).collect();
3335-
3336-
// Safety: valid offsets and checked for overflow
3337-
let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
3338-
3339-
Ok(Arc::new(GenericListArray::<O>::new(
3340-
field.clone(),
3341-
offsets,
3342-
values,
3343-
nulls,
3344-
)))
3345-
}
3346-
33473199
#[cfg(test)]
33483200
mod tests {
33493201
use arrow_buffer::{Buffer, NullBuffer};

0 commit comments

Comments
 (0)