Skip to content

Commit b69d89b

Browse files
authored
Improve expression performance (#2273)
* Only simplify once in the scanner * Manually implement return_dtype instead of expensive Canonical::Empty (FLUP: we should add `return_nullability()` expr too) * Remove `Field::Index` weirdness. * More consistent APIs for Struct{Scalar,Array,DType}
1 parent 857486b commit b69d89b

File tree

53 files changed

+357
-428
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+357
-428
lines changed

bench-vortex/src/clickbench.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ pub async fn register_vortex_files(
184184
for st in sts.into_iter() {
185185
let struct_dtype = st.dtype().as_struct().unwrap();
186186
let names = struct_dtype.names().iter();
187-
let types = struct_dtype.dtypes();
187+
let types = struct_dtype.fields();
188188

189189
for (field_name, field_type) in names.zip(types) {
190190
let val = arrays_map.entry(field_name.clone()).or_default();

bench-vortex/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ impl CompressionRunStats {
241241

242242
self.compressed_sizes
243243
.iter()
244-
.zip_eq(st.names().iter().zip_eq(st.dtypes()))
244+
.zip_eq(st.names().iter().zip_eq(st.fields()))
245245
.map(
246246
|(&size, (column_name, column_type))| CompressionRunResults {
247247
dataset_name: dataset_name.clone(),

bench-vortex/src/tpch/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ async fn register_vortex_file(
224224
for st in sts.into_iter() {
225225
let struct_dtype = st.dtype().as_struct().unwrap();
226226
let names = struct_dtype.names().iter();
227-
let types = struct_dtype.dtypes();
227+
let types = struct_dtype.fields();
228228

229229
for (field_name, field_type) in names.zip(types) {
230230
let val = arrays_map.entry(field_name.clone()).or_default();

encodings/sparse/src/variants.rs

+16-23
Original file line numberDiff line numberDiff line change
@@ -61,29 +61,22 @@ impl Utf8ArrayTrait for SparseArray {}
6161
impl BinaryArrayTrait for SparseArray {}
6262

6363
impl StructArrayTrait for SparseArray {
64-
fn maybe_null_field_by_idx(&self, idx: usize) -> Option<Array> {
65-
let new_patches = self
66-
.patches()
67-
.map_values_opt(|values| {
68-
values
69-
.as_struct_array()
70-
.and_then(|s| s.maybe_null_field_by_idx(idx))
71-
})
72-
.vortex_expect("field array length should equal struct array length")?;
73-
let scalar = StructScalar::try_from(&self.fill_scalar())
74-
.ok()?
75-
.field_by_idx(idx)?;
76-
77-
Some(
78-
SparseArray::try_new_from_patches(
79-
new_patches,
80-
self.len(),
81-
self.indices_offset(),
82-
scalar,
83-
)
84-
.ok()?
85-
.into_array(),
86-
)
64+
fn maybe_null_field_by_idx(&self, idx: usize) -> VortexResult<Array> {
65+
let new_patches = self.patches().map_values(|values| {
66+
values
67+
.as_struct_array()
68+
.vortex_expect("Expected struct array")
69+
.maybe_null_field_by_idx(idx)
70+
})?;
71+
let scalar = StructScalar::try_from(&self.fill_scalar())?.field_by_idx(idx)?;
72+
73+
Ok(SparseArray::try_new_from_patches(
74+
new_patches,
75+
self.len(),
76+
self.indices_offset(),
77+
scalar,
78+
)?
79+
.into_array())
8780
}
8881

8982
fn project(&self, projection: &[FieldName]) -> VortexResult<Array> {

fuzz/fuzz_targets/file_io.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,6 @@ fn has_nullable_struct(dtype: &DType) -> bool {
116116
dtype.is_nullable()
117117
|| dtype
118118
.as_struct()
119-
.map(|sdt| sdt.dtypes().any(|dtype| has_nullable_struct(&dtype)))
119+
.map(|sdt| sdt.fields().any(|dtype| has_nullable_struct(&dtype)))
120120
.unwrap_or(false)
121121
}

pyvortex/src/dtype/struct_.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ impl PyStructDType {
2828
};
2929

3030
let mut fields = Vec::with_capacity(dtype.names().len());
31-
for dtype in dtype.dtypes() {
31+
for dtype in dtype.fields() {
3232
fields.push(PyDType::init(self_.py(), dtype)?);
3333
}
3434
Ok(fields)

pyvortex/src/encoding/builtins/struct_.rs

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use pyo3::exceptions::PyKeyError;
21
use pyo3::{pyclass, pymethods, Bound, PyRef, PyResult};
32
use vortex::array::StructEncoding;
43
use vortex::variants::StructArrayTrait;
@@ -22,10 +21,7 @@ impl PyStructEncoding {
2221

2322
/// Returns the given field of the struct array.
2423
pub fn field<'py>(self_: PyRef<'py, Self>, name: &str) -> PyResult<Bound<'py, PyArray>> {
25-
let field = self_
26-
.as_array_ref()
27-
.maybe_null_field_by_name(name)
28-
.ok_or_else(|| PyKeyError::new_err(format!("Field name not found: {}", name)))?;
24+
let field = self_.as_array_ref().maybe_null_field_by_name(name)?;
2925
PyArray::init(self_.py(), field)
3026
}
3127
}

pyvortex/src/python_repr.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ impl Display for DTypePythonRepr<'_> {
5555
"struct({{{}}}, nullable={})",
5656
st.names()
5757
.iter()
58-
.zip(st.dtypes())
58+
.zip(st.fields())
5959
.map(|(n, dt)| format!("\"{}\": {}", n, dt.python_repr()))
6060
.join(", "),
6161
n.python_repr()

pyvortex/src/scalar/struct_.rs

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
use pyo3::exceptions::PyKeyError;
21
use pyo3::{pyclass, pymethods, IntoPy, PyObject, PyRef, PyResult};
32
use vortex::scalar::StructScalar;
43

@@ -18,9 +17,7 @@ impl PyStructScalar {
1817
/// Return the child scalar with the given field name.
1918
pub fn field(self_: PyRef<'_, Self>, name: &str) -> PyResult<PyObject> {
2019
let scalar = self_.as_scalar_ref();
21-
let child = scalar
22-
.field_by_name(name)
23-
.ok_or_else(|| PyKeyError::new_err(format!("Field not found {}", name)))?;
20+
let child = scalar.field(name)?;
2421
Ok(PyVortex(&child).into_py(self_.py()))
2522
}
2623
}

vortex-array/src/array/arbitrary.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Resu
5959
DType::Binary(n) => random_bytes(u, *n, chunk_len),
6060
DType::Struct(sdt, n) => {
6161
let first_array = sdt
62-
.dtypes()
62+
.fields()
6363
.next()
6464
.map(|d| random_array(u, &d, chunk_len))
6565
.transpose()?;
@@ -73,7 +73,7 @@ fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Resu
7373
.into_iter()
7474
.map(Ok)
7575
.chain(
76-
sdt.dtypes()
76+
sdt.fields()
7777
.skip(1)
7878
.map(|d| random_array(u, &d, Some(resolved_len))),
7979
)

vortex-array/src/array/chunked/canonical.rs

+10-6
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,16 @@ fn swizzle_struct_chunks(
175175
let len = chunks.iter().map(|chunk| chunk.len()).sum();
176176
let mut field_arrays = Vec::new();
177177

178-
for (field_idx, field_dtype) in struct_dtype.dtypes().enumerate() {
179-
let field_chunks = chunks.iter().map(|c| c.as_struct_array()
180-
.vortex_expect("Chunk was not a StructArray")
181-
.maybe_null_field_by_idx(field_idx)
182-
.ok_or_else(|| vortex_err!("All chunks must have same dtype; missing field at index {}, current chunk dtype: {}", field_idx, c.dtype()))
183-
).collect::<VortexResult<Vec<_>>>()?;
178+
for (field_idx, field_dtype) in struct_dtype.fields().enumerate() {
179+
let field_chunks = chunks
180+
.iter()
181+
.map(|c| {
182+
c.as_struct_array()
183+
.vortex_expect("Chunk was not a StructArray")
184+
.maybe_null_field_by_idx(field_idx)
185+
.vortex_expect("Invalid chunked array")
186+
})
187+
.collect::<Vec<_>>();
184188
let field_array = ChunkedArray::try_new(field_chunks, field_dtype.clone())?;
185189
field_arrays.push(field_array.into_array());
186190
}

vortex-array/src/array/chunked/variants.rs

+11-13
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use std::sync::Arc;
22

3-
use itertools::Itertools;
4-
use vortex_dtype::{DType, Field, FieldName};
3+
use vortex_dtype::{DType, FieldName};
54
use vortex_error::{vortex_err, vortex_panic, VortexExpect, VortexResult};
65

76
use crate::array::chunked::ChunkedArray;
@@ -65,17 +64,22 @@ impl Utf8ArrayTrait for ChunkedArray {}
6564
impl BinaryArrayTrait for ChunkedArray {}
6665

6766
impl StructArrayTrait for ChunkedArray {
68-
fn maybe_null_field_by_idx(&self, idx: usize) -> Option<Array> {
67+
fn maybe_null_field_by_idx(&self, idx: usize) -> VortexResult<Array> {
6968
let mut chunks = Vec::with_capacity(self.nchunks());
7069
for chunk in self.chunks() {
7170
chunks.push(
7271
chunk
7372
.as_struct_array()
74-
.and_then(|s| s.maybe_null_field_by_idx(idx))?,
73+
.ok_or_else(|| vortex_err!("Chunk was not a StructArray"))?
74+
.maybe_null_field_by_idx(idx)?,
7575
);
7676
}
7777

78-
let projected_dtype = self.dtype().as_struct().map(|s| s.field_dtype(idx))?.ok()?;
78+
let projected_dtype = self
79+
.dtype()
80+
.as_struct()
81+
.ok_or_else(|| vortex_err!("Not a struct dtype"))?
82+
.field_by_index(idx)?;
7983
let chunked = ChunkedArray::try_new(chunks, projected_dtype.clone())
8084
.unwrap_or_else(|err| {
8185
vortex_panic!(
@@ -85,7 +89,7 @@ impl StructArrayTrait for ChunkedArray {
8589
)
8690
})
8791
.into_array();
88-
Some(chunked)
92+
Ok(chunked)
8993
}
9094

9195
fn project(&self, projection: &[FieldName]) -> VortexResult<Array> {
@@ -103,13 +107,7 @@ impl StructArrayTrait for ChunkedArray {
103107
.dtype()
104108
.as_struct()
105109
.ok_or_else(|| vortex_err!("Not a struct dtype"))?
106-
.project(
107-
projection
108-
.iter()
109-
.map(|f| Field::Name(f.clone()))
110-
.collect_vec()
111-
.as_slice(),
112-
)?;
110+
.project(projection)?;
113111
ChunkedArray::try_new(
114112
chunks,
115113
DType::Struct(Arc::new(projected_dtype), self.dtype().nullability()),

vortex-array/src/array/constant/variants.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ impl Utf8ArrayTrait for ConstantArray {}
7474
impl BinaryArrayTrait for ConstantArray {}
7575

7676
impl StructArrayTrait for ConstantArray {
77-
fn maybe_null_field_by_idx(&self, idx: usize) -> Option<Array> {
77+
fn maybe_null_field_by_idx(&self, idx: usize) -> VortexResult<Array> {
7878
self.scalar()
7979
.as_struct()
8080
.field_by_idx(idx)

vortex-array/src/array/struct_/mod.rs

+13-28
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ use std::fmt::{Debug, Display};
22
use std::sync::Arc;
33

44
use serde::{Deserialize, Serialize};
5-
use vortex_dtype::{DType, Field, FieldName, FieldNames, StructDType};
6-
use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, VortexResult};
5+
use vortex_dtype::{DType, FieldName, FieldNames, StructDType};
6+
use vortex_error::{vortex_bail, vortex_err, VortexExpect as _, VortexResult};
77
use vortex_mask::Mask;
88

99
use crate::encoding::encoding_ids;
@@ -51,9 +51,8 @@ impl StructArray {
5151

5252
pub fn children(&self) -> impl Iterator<Item = Array> + '_ {
5353
(0..self.nfields()).map(move |idx| {
54-
self.maybe_null_field_by_idx(idx).unwrap_or_else(|| {
55-
vortex_panic!("Field {} not found, nfields: {}", idx, self.nfields())
56-
})
54+
self.maybe_null_field_by_idx(idx)
55+
.vortex_expect("never out of bounds")
5756
})
5857
}
5958

@@ -138,7 +137,7 @@ impl StructArray {
138137
names.push(self.names()[idx].clone());
139138
children.push(
140139
self.maybe_null_field_by_idx(idx)
141-
.ok_or_else(|| vortex_err!(OutOfBounds: idx, 0, self.nfields()))?,
140+
.vortex_expect("never out of bounds"),
142141
);
143142
}
144143

@@ -160,25 +159,13 @@ impl VariantsVTable<StructArray> for StructEncoding {
160159
}
161160

162161
impl StructArrayTrait for StructArray {
163-
fn maybe_null_field_by_idx(&self, idx: usize) -> Option<Array> {
164-
Some(
165-
self.field_info(&Field::Index(idx))
166-
.map(|field_info| {
167-
self.as_ref()
168-
.child(
169-
idx,
170-
&field_info
171-
.dtype
172-
.value()
173-
.vortex_expect("FieldInfo could not access dtype"),
174-
self.len(),
175-
)
176-
.unwrap_or_else(|e| {
177-
vortex_panic!(e, "StructArray: field {} not found", idx)
178-
})
179-
})
180-
.unwrap_or_else(|e| vortex_panic!(e, "StructArray: field {} not found", idx)),
181-
)
162+
fn maybe_null_field_by_idx(&self, idx: usize) -> VortexResult<Array> {
163+
let dtype = self
164+
.dtype()
165+
.as_struct()
166+
.vortex_expect("Not a struct dtype")
167+
.field_by_index(idx)?;
168+
self.child(idx, &dtype, self.len())
182169
}
183170

184171
fn project(&self, projection: &[FieldName]) -> VortexResult<Array> {
@@ -210,9 +197,7 @@ impl ValidityVTable<StructArray> for StructEncoding {
210197
impl VisitorVTable<StructArray> for StructEncoding {
211198
fn accept(&self, array: &StructArray, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> {
212199
for (idx, name) in array.names().iter().enumerate() {
213-
let child = array
214-
.maybe_null_field_by_idx(idx)
215-
.ok_or_else(|| vortex_err!(OutOfBounds: idx, 0, array.nfields()))?;
200+
let child = array.maybe_null_field_by_idx(idx)?;
216201
visitor.visit_child(name.as_ref(), &child)?;
217202
}
218203

vortex-array/src/arrow/dtype.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ pub fn infer_schema(dtype: &DType) -> VortexResult<Schema> {
114114
}
115115

116116
let mut builder = SchemaBuilder::with_capacity(struct_dtype.names().len());
117-
for (field_name, field_dtype) in struct_dtype.names().iter().zip(struct_dtype.dtypes()) {
117+
for (field_name, field_dtype) in struct_dtype.names().iter().zip(struct_dtype.fields()) {
118118
builder.push(FieldRef::from(Field::new(
119119
field_name.to_string(),
120120
infer_data_type(&field_dtype)?,
@@ -149,7 +149,7 @@ pub fn infer_data_type(dtype: &DType) -> VortexResult<DataType> {
149149
DType::Binary(_) => DataType::BinaryView,
150150
DType::Struct(struct_dtype, _) => {
151151
let mut fields = Vec::with_capacity(struct_dtype.names().len());
152-
for (field_name, field_dt) in struct_dtype.names().iter().zip(struct_dtype.dtypes()) {
152+
for (field_name, field_dt) in struct_dtype.names().iter().zip(struct_dtype.fields()) {
153153
fields.push(FieldRef::from(Field::new(
154154
field_name.to_string(),
155155
infer_data_type(&field_dt)?,

vortex-array/src/builders/struct_.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ impl StructBuilder {
2626
capacity: usize,
2727
) -> Self {
2828
let builders = struct_dtype
29-
.dtypes()
29+
.fields()
3030
.map(|dt| builder_with_capacity(&dt, capacity))
3131
.collect();
3232

vortex-array/src/patches.rs

-17
Original file line numberDiff line numberDiff line change
@@ -362,23 +362,6 @@ impl Patches {
362362
}
363363
Ok(Self::new(self.array_len, self.indices, values))
364364
}
365-
366-
pub fn map_values_opt<F>(self, f: F) -> VortexResult<Option<Self>>
367-
where
368-
F: FnOnce(Array) -> Option<Array>,
369-
{
370-
let Some(values) = f(self.values) else {
371-
return Ok(None);
372-
};
373-
if self.indices.len() == values.len() {
374-
vortex_bail!(
375-
"map_values must preserve length: expected {} received {}",
376-
self.indices.len(),
377-
values.len()
378-
)
379-
}
380-
Ok(Some(Self::new(self.array_len, self.indices, values)))
381-
}
382365
}
383366

384367
/// Filter patches with the provided mask (in flattened space).

0 commit comments

Comments
 (0)