Skip to content

Commit 5abef41

Browse files
Rachelint and alamb
authored
Improve benchmark for ltrim (#12513)
* complete benchmark for ltrim. * improve benchmarks. * remove unused param. * fix bench. * refactor to remove repeated code. * fix clippy. * Update datafusion/functions/benches/ltrim.rs Co-authored-by: Andrew Lamb <[email protected]> * improve code and add more comments. * fix clippy. --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent f514e12 commit 5abef41

File tree

1 file changed

+206
-17
lines changed

1 file changed

+206
-17
lines changed

datafusion/functions/benches/ltrim.rs

Lines changed: 206 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,32 +17,221 @@
1717

1818
extern crate criterion;
1919

20-
use arrow::array::{ArrayRef, StringArray};
21-
use criterion::{black_box, criterion_group, criterion_main, Criterion};
20+
use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
21+
use criterion::{
22+
black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
23+
Criterion, SamplingMode,
24+
};
2225
use datafusion_common::ScalarValue;
23-
use datafusion_expr::ColumnarValue;
26+
use datafusion_expr::{ColumnarValue, ScalarUDF};
2427
use datafusion_functions::string;
25-
use std::sync::Arc;
28+
use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng};
29+
use std::{fmt, sync::Arc};
2630

27-
fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
28-
let iter =
29-
std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size);
30-
let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
31+
pub fn seedable_rng() -> StdRng {
32+
StdRng::seed_from_u64(42)
33+
}
34+
35+
#[derive(Clone, Copy)]
36+
pub enum StringArrayType {
37+
Utf8View,
38+
Utf8,
39+
LargeUtf8,
40+
}
41+
42+
impl fmt::Display for StringArrayType {
43+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44+
match self {
45+
StringArrayType::Utf8View => f.write_str("string_view"),
46+
StringArrayType::Utf8 => f.write_str("string"),
47+
StringArrayType::LargeUtf8 => f.write_str("large_string"),
48+
}
49+
}
50+
}
51+
52+
/// returns an array of strings, and `characters` as a ScalarValue
53+
pub fn create_string_array_and_characters(
54+
size: usize,
55+
characters: &str,
56+
trimmed: &str,
57+
remaining_len: usize,
58+
string_array_type: StringArrayType,
59+
) -> (ArrayRef, ScalarValue) {
60+
let rng = &mut seedable_rng();
61+
62+
// Create `size` rows:
63+
// - 10% rows will be `None`
64+
// - Other 90% will be strings with same `remaining_len` lengths
65+
// We will build the string array on it later.
66+
let string_iter = (0..size).map(|_| {
67+
if rng.gen::<f32>() < 0.1 {
68+
None
69+
} else {
70+
let mut value = trimmed.as_bytes().to_vec();
71+
let generated = rng.sample_iter(&Alphanumeric).take(remaining_len);
72+
value.extend(generated);
73+
Some(String::from_utf8(value).unwrap())
74+
}
75+
});
76+
77+
// Build the target `string array` and `characters` according to `string_array_type`
78+
match string_array_type {
79+
StringArrayType::Utf8View => (
80+
Arc::new(string_iter.collect::<StringViewArray>()),
81+
ScalarValue::Utf8View(Some(characters.to_string())),
82+
),
83+
StringArrayType::Utf8 => (
84+
Arc::new(string_iter.collect::<StringArray>()),
85+
ScalarValue::Utf8(Some(characters.to_string())),
86+
),
87+
StringArrayType::LargeUtf8 => (
88+
Arc::new(string_iter.collect::<LargeStringArray>()),
89+
ScalarValue::LargeUtf8(Some(characters.to_string())),
90+
),
91+
}
92+
}
93+
94+
/// Create args for the ltrim benchmark
95+
/// Inputs:
96+
/// - size: rows num of the test array
97+
/// - characters: the characters we need to trim
98+
/// - trimmed: the part in the testing string that will be trimmed
99+
/// - remaining_len: the len of the remaining part of testing string after trimming
100+
/// - string_array_type: the method used to store the testing strings
101+
///
102+
/// Outputs:
103+
/// - testing string array
104+
/// - trimmed characters
105+
///
106+
fn create_args(
107+
size: usize,
108+
characters: &str,
109+
trimmed: &str,
110+
remaining_len: usize,
111+
string_array_type: StringArrayType,
112+
) -> Vec<ColumnarValue> {
113+
let (string_array, pattern) = create_string_array_and_characters(
114+
size,
115+
characters,
116+
trimmed,
117+
remaining_len,
118+
string_array_type,
119+
);
31120
vec![
32-
ColumnarValue::Array(array),
33-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))),
121+
ColumnarValue::Array(string_array),
122+
ColumnarValue::Scalar(pattern),
34123
]
35124
}
36125

126+
#[allow(clippy::too_many_arguments)]
127+
fn run_with_string_type<M: Measurement>(
128+
group: &mut BenchmarkGroup<'_, M>,
129+
ltrim: &ScalarUDF,
130+
size: usize,
131+
len: usize,
132+
characters: &str,
133+
trimmed: &str,
134+
remaining_len: usize,
135+
string_type: StringArrayType,
136+
) {
137+
let args = create_args(size, characters, trimmed, remaining_len, string_type);
138+
group.bench_function(
139+
format!(
140+
"{string_type} [size={size}, len_before={len}, len_after={remaining_len}]",
141+
),
142+
|b| b.iter(|| black_box(ltrim.invoke(&args))),
143+
);
144+
}
145+
146+
#[allow(clippy::too_many_arguments)]
147+
fn run_one_group(
148+
c: &mut Criterion,
149+
group_name: &str,
150+
ltrim: &ScalarUDF,
151+
string_types: &[StringArrayType],
152+
size: usize,
153+
len: usize,
154+
characters: &str,
155+
trimmed: &str,
156+
remaining_len: usize,
157+
) {
158+
let mut group = c.benchmark_group(group_name);
159+
group.sampling_mode(SamplingMode::Flat);
160+
group.sample_size(10);
161+
162+
for string_type in string_types {
163+
run_with_string_type(
164+
&mut group,
165+
ltrim,
166+
size,
167+
len,
168+
characters,
169+
trimmed,
170+
remaining_len,
171+
*string_type,
172+
);
173+
}
174+
175+
group.finish();
176+
}
177+
37178
fn criterion_benchmark(c: &mut Criterion) {
38179
let ltrim = string::ltrim();
39-
for char in ["\"", "Header:"] {
40-
for size in [1024, 4096, 8192] {
41-
let args = create_args(size, char);
42-
c.bench_function(&format!("ltrim {}: {}", char, size), |b| {
43-
b.iter(|| black_box(ltrim.invoke(&args)))
44-
});
45-
}
180+
let characters = ",!()";
181+
182+
let string_types = [
183+
StringArrayType::Utf8View,
184+
StringArrayType::Utf8,
185+
StringArrayType::LargeUtf8,
186+
];
187+
for size in [1024, 4096, 8192] {
188+
// len=12, trimmed_len=4, len_after_ltrim=8
189+
let len = 12;
190+
let trimmed = characters;
191+
let remaining_len = len - trimmed.len();
192+
run_one_group(
193+
c,
194+
"INPUT LEN <= 12",
195+
&ltrim,
196+
&string_types,
197+
size,
198+
len,
199+
characters,
200+
trimmed,
201+
remaining_len,
202+
);
203+
204+
// len=64, trimmed_len=4, len_after_ltrim=60
205+
let len = 64;
206+
let trimmed = characters;
207+
let remaining_len = len - trimmed.len();
208+
run_one_group(
209+
c,
210+
"INPUT LEN > 12, OUTPUT LEN > 12",
211+
&ltrim,
212+
&string_types,
213+
size,
214+
len,
215+
characters,
216+
trimmed,
217+
remaining_len,
218+
);
219+
220+
// len=64, trimmed_len=56, len_after_ltrim=8
221+
let len = 64;
222+
let trimmed = characters.repeat(15);
223+
let remaining_len = len - trimmed.len();
224+
run_one_group(
225+
c,
226+
"INPUT LEN > 12, OUTPUT LEN <= 12",
227+
&ltrim,
228+
&string_types,
229+
size,
230+
len,
231+
characters,
232+
&trimmed,
233+
remaining_len,
234+
);
46235
}
47236
}
48237

0 commit comments

Comments
 (0)