Skip to content

Commit 94034be

Browse files
authored
Improve lpad udf by using a GenericStringBuilder (#11987)
* Update LPAD scalar function to support Utf8View * Lpad code improvements and benchmark. * Improved use of GenericStringBuilder.
1 parent 02bfefe commit 94034be

File tree

3 files changed

+244
-115
lines changed

3 files changed

+244
-115
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,8 @@ required-features = ["string_expressions"]
146146
harness = false
147147
name = "upper"
148148
required-features = ["string_expressions"]
149+
150+
[[bench]]
151+
harness = false
152+
name = "pad"
153+
required-features = ["unicode_expressions"]

datafusion/functions/benches/pad.rs

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
19+
use arrow::datatypes::Int64Type;
20+
use arrow::util::bench_util::{
21+
create_string_array_with_len, create_string_view_array_with_len,
22+
};
23+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
24+
use datafusion_expr::ColumnarValue;
25+
use datafusion_functions::unicode::{lpad, rpad};
26+
use rand::distributions::{Distribution, Uniform};
27+
use rand::Rng;
28+
use std::sync::Arc;
29+
30+
struct Filter<Dist> {
31+
dist: Dist,
32+
}
33+
34+
impl<T, Dist> Distribution<T> for Filter<Dist>
35+
where
36+
Dist: Distribution<T>,
37+
{
38+
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> T {
39+
self.dist.sample(rng)
40+
}
41+
}
42+
43+
pub fn create_primitive_array<T>(
44+
size: usize,
45+
null_density: f32,
46+
len: usize,
47+
) -> PrimitiveArray<T>
48+
where
49+
T: ArrowPrimitiveType<Native = i64>,
50+
{
51+
let dist = Filter {
52+
dist: Uniform::new_inclusive::<i64, i64>(0, len as i64),
53+
};
54+
55+
let mut rng = rand::thread_rng();
56+
(0..size)
57+
.map(|_| {
58+
if rng.gen::<f32>() < null_density {
59+
None
60+
} else {
61+
Some(rng.sample(&dist))
62+
}
63+
})
64+
.collect()
65+
}
66+
67+
fn create_args<O: OffsetSizeTrait>(
68+
size: usize,
69+
str_len: usize,
70+
use_string_view: bool,
71+
) -> Vec<ColumnarValue> {
72+
let length_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.0, str_len));
73+
74+
if !use_string_view {
75+
let string_array =
76+
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
77+
let fill_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
78+
79+
vec![
80+
ColumnarValue::Array(string_array),
81+
ColumnarValue::Array(Arc::clone(&length_array) as ArrayRef),
82+
ColumnarValue::Array(fill_array),
83+
]
84+
} else {
85+
let string_array =
86+
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
87+
let fill_array =
88+
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
89+
90+
vec![
91+
ColumnarValue::Array(string_array),
92+
ColumnarValue::Array(Arc::clone(&length_array) as ArrayRef),
93+
ColumnarValue::Array(fill_array),
94+
]
95+
}
96+
}
97+
98+
fn criterion_benchmark(c: &mut Criterion) {
99+
for size in [1024, 2048] {
100+
let mut group = c.benchmark_group("lpad function");
101+
102+
let args = create_args::<i32>(size, 32, false);
103+
group.bench_function(BenchmarkId::new("utf8 type", size), |b| {
104+
b.iter(|| criterion::black_box(lpad().invoke(&args).unwrap()))
105+
});
106+
107+
let args = create_args::<i64>(size, 32, false);
108+
group.bench_function(BenchmarkId::new("largeutf8 type", size), |b| {
109+
b.iter(|| criterion::black_box(lpad().invoke(&args).unwrap()))
110+
});
111+
112+
let args = create_args::<i32>(size, 32, true);
113+
group.bench_function(BenchmarkId::new("stringview type", size), |b| {
114+
b.iter(|| criterion::black_box(lpad().invoke(&args).unwrap()))
115+
});
116+
117+
group.finish();
118+
119+
let mut group = c.benchmark_group("rpad function");
120+
121+
let args = create_args::<i32>(size, 32, false);
122+
group.bench_function(BenchmarkId::new("utf8 type", size), |b| {
123+
b.iter(|| criterion::black_box(rpad().invoke(&args).unwrap()))
124+
});
125+
126+
let args = create_args::<i64>(size, 32, false);
127+
group.bench_function(BenchmarkId::new("largeutf8 type", size), |b| {
128+
b.iter(|| criterion::black_box(rpad().invoke(&args).unwrap()))
129+
});
130+
//
131+
// let args = create_args::<i32>(size, 32, true);
132+
// group.bench_function(BenchmarkId::new("stringview type", size), |b| {
133+
// b.iter(|| criterion::black_box(rpad().invoke(&args).unwrap()))
134+
// });
135+
136+
group.finish();
137+
}
138+
}
139+
140+
// Register `criterion_benchmark` in the `benches` group and hand that group to
// `criterion_main!`.
criterion_group!(benches, criterion_benchmark);
141+
criterion_main!(benches);

0 commit comments

Comments
 (0)