|
17 | 17 |
|
18 | 18 | extern crate criterion;
|
19 | 19 |
|
20 |
| -use arrow::array::{ArrayRef, StringArray}; |
21 |
| -use criterion::{black_box, criterion_group, criterion_main, Criterion}; |
| 20 | +use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray}; |
| 21 | +use criterion::{ |
| 22 | + black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup, |
| 23 | + Criterion, SamplingMode, |
| 24 | +}; |
22 | 25 | use datafusion_common::ScalarValue;
|
23 |
| -use datafusion_expr::ColumnarValue; |
| 26 | +use datafusion_expr::{ColumnarValue, ScalarUDF}; |
24 | 27 | use datafusion_functions::string;
|
25 |
| -use std::sync::Arc; |
| 28 | +use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; |
| 29 | +use std::{fmt, sync::Arc}; |
26 | 30 |
|
27 |
| -fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> { |
28 |
| - let iter = |
29 |
| - std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size); |
30 |
| - let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef; |
| 31 | +pub fn seedable_rng() -> StdRng { |
| 32 | + StdRng::seed_from_u64(42) |
| 33 | +} |
| 34 | + |
/// Selects which Arrow string array (and matching scalar variant) the
/// benchmark input is built with.
#[derive(Clone, Copy)]
pub enum StringArrayType {
    /// Input is collected into a `StringViewArray`.
    Utf8View,
    /// Input is collected into a `StringArray`.
    Utf8,
    /// Input is collected into a `LargeStringArray`.
    LargeUtf8,
}

impl fmt::Display for StringArrayType {
    /// Writes the short label that identifies this array type in benchmark names.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            Self::Utf8View => "string_view",
            Self::Utf8 => "string",
            Self::LargeUtf8 => "large_string",
        };
        f.write_str(label)
    }
}
| 51 | + |
| 52 | +/// returns an array of strings, and `characters` as a ScalarValue |
| 53 | +pub fn create_string_array_and_characters( |
| 54 | + size: usize, |
| 55 | + characters: &str, |
| 56 | + trimmed: &str, |
| 57 | + remaining_len: usize, |
| 58 | + string_array_type: StringArrayType, |
| 59 | +) -> (ArrayRef, ScalarValue) { |
| 60 | + let rng = &mut seedable_rng(); |
| 61 | + |
| 62 | + // Create `size` rows: |
| 63 | + // - 10% rows will be `None` |
| 64 | + // - Other 90% will be strings with same `remaining_len` lengths |
| 65 | + // We will build the string array on it later. |
| 66 | + let string_iter = (0..size).map(|_| { |
| 67 | + if rng.gen::<f32>() < 0.1 { |
| 68 | + None |
| 69 | + } else { |
| 70 | + let mut value = trimmed.as_bytes().to_vec(); |
| 71 | + let generated = rng.sample_iter(&Alphanumeric).take(remaining_len); |
| 72 | + value.extend(generated); |
| 73 | + Some(String::from_utf8(value).unwrap()) |
| 74 | + } |
| 75 | + }); |
| 76 | + |
| 77 | + // Build the target `string array` and `characters` according to `string_array_type` |
| 78 | + match string_array_type { |
| 79 | + StringArrayType::Utf8View => ( |
| 80 | + Arc::new(string_iter.collect::<StringViewArray>()), |
| 81 | + ScalarValue::Utf8View(Some(characters.to_string())), |
| 82 | + ), |
| 83 | + StringArrayType::Utf8 => ( |
| 84 | + Arc::new(string_iter.collect::<StringArray>()), |
| 85 | + ScalarValue::Utf8(Some(characters.to_string())), |
| 86 | + ), |
| 87 | + StringArrayType::LargeUtf8 => ( |
| 88 | + Arc::new(string_iter.collect::<LargeStringArray>()), |
| 89 | + ScalarValue::LargeUtf8(Some(characters.to_string())), |
| 90 | + ), |
| 91 | + } |
| 92 | +} |
| 93 | + |
| 94 | +/// Create args for the ltrim benchmark |
| 95 | +/// Inputs: |
| 96 | +/// - size: rows num of the test array |
| 97 | +/// - characters: the characters we need to trim |
| 98 | +/// - trimmed: the part in the testing string that will be trimmed |
| 99 | +/// - remaining_len: the len of the remaining part of testing string after trimming |
| 100 | +/// - string_array_type: the method used to store the testing strings |
| 101 | +/// |
| 102 | +/// Outputs: |
| 103 | +/// - testing string array |
| 104 | +/// - trimmed characters |
| 105 | +/// |
| 106 | +fn create_args( |
| 107 | + size: usize, |
| 108 | + characters: &str, |
| 109 | + trimmed: &str, |
| 110 | + remaining_len: usize, |
| 111 | + string_array_type: StringArrayType, |
| 112 | +) -> Vec<ColumnarValue> { |
| 113 | + let (string_array, pattern) = create_string_array_and_characters( |
| 114 | + size, |
| 115 | + characters, |
| 116 | + trimmed, |
| 117 | + remaining_len, |
| 118 | + string_array_type, |
| 119 | + ); |
31 | 120 | vec![
|
32 |
| - ColumnarValue::Array(array), |
33 |
| - ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))), |
| 121 | + ColumnarValue::Array(string_array), |
| 122 | + ColumnarValue::Scalar(pattern), |
34 | 123 | ]
|
35 | 124 | }
|
36 | 125 |
|
| 126 | +#[allow(clippy::too_many_arguments)] |
| 127 | +fn run_with_string_type<M: Measurement>( |
| 128 | + group: &mut BenchmarkGroup<'_, M>, |
| 129 | + ltrim: &ScalarUDF, |
| 130 | + size: usize, |
| 131 | + len: usize, |
| 132 | + characters: &str, |
| 133 | + trimmed: &str, |
| 134 | + remaining_len: usize, |
| 135 | + string_type: StringArrayType, |
| 136 | +) { |
| 137 | + let args = create_args(size, characters, trimmed, remaining_len, string_type); |
| 138 | + group.bench_function( |
| 139 | + format!( |
| 140 | + "{string_type} [size={size}, len_before={len}, len_after={remaining_len}]", |
| 141 | + ), |
| 142 | + |b| b.iter(|| black_box(ltrim.invoke(&args))), |
| 143 | + ); |
| 144 | +} |
| 145 | + |
| 146 | +#[allow(clippy::too_many_arguments)] |
| 147 | +fn run_one_group( |
| 148 | + c: &mut Criterion, |
| 149 | + group_name: &str, |
| 150 | + ltrim: &ScalarUDF, |
| 151 | + string_types: &[StringArrayType], |
| 152 | + size: usize, |
| 153 | + len: usize, |
| 154 | + characters: &str, |
| 155 | + trimmed: &str, |
| 156 | + remaining_len: usize, |
| 157 | +) { |
| 158 | + let mut group = c.benchmark_group(group_name); |
| 159 | + group.sampling_mode(SamplingMode::Flat); |
| 160 | + group.sample_size(10); |
| 161 | + |
| 162 | + for string_type in string_types { |
| 163 | + run_with_string_type( |
| 164 | + &mut group, |
| 165 | + ltrim, |
| 166 | + size, |
| 167 | + len, |
| 168 | + characters, |
| 169 | + trimmed, |
| 170 | + remaining_len, |
| 171 | + *string_type, |
| 172 | + ); |
| 173 | + } |
| 174 | + |
| 175 | + group.finish(); |
| 176 | +} |
| 177 | + |
37 | 178 | fn criterion_benchmark(c: &mut Criterion) {
|
38 | 179 | let ltrim = string::ltrim();
|
39 |
| - for char in ["\"", "Header:"] { |
40 |
| - for size in [1024, 4096, 8192] { |
41 |
| - let args = create_args(size, char); |
42 |
| - c.bench_function(&format!("ltrim {}: {}", char, size), |b| { |
43 |
| - b.iter(|| black_box(ltrim.invoke(&args))) |
44 |
| - }); |
45 |
| - } |
| 180 | + let characters = ",!()"; |
| 181 | + |
| 182 | + let string_types = [ |
| 183 | + StringArrayType::Utf8View, |
| 184 | + StringArrayType::Utf8, |
| 185 | + StringArrayType::LargeUtf8, |
| 186 | + ]; |
| 187 | + for size in [1024, 4096, 8192] { |
| 188 | + // len=12, trimmed_len=4, len_after_ltrim=8 |
| 189 | + let len = 12; |
| 190 | + let trimmed = characters; |
| 191 | + let remaining_len = len - trimmed.len(); |
| 192 | + run_one_group( |
| 193 | + c, |
| 194 | + "INPUT LEN <= 12", |
| 195 | + <rim, |
| 196 | + &string_types, |
| 197 | + size, |
| 198 | + len, |
| 199 | + characters, |
| 200 | + trimmed, |
| 201 | + remaining_len, |
| 202 | + ); |
| 203 | + |
| 204 | + // len=64, trimmed_len=4, len_after_ltrim=60 |
| 205 | + let len = 64; |
| 206 | + let trimmed = characters; |
| 207 | + let remaining_len = len - trimmed.len(); |
| 208 | + run_one_group( |
| 209 | + c, |
| 210 | + "INPUT LEN > 12, OUTPUT LEN > 12", |
| 211 | + <rim, |
| 212 | + &string_types, |
| 213 | + size, |
| 214 | + len, |
| 215 | + characters, |
| 216 | + trimmed, |
| 217 | + remaining_len, |
| 218 | + ); |
| 219 | + |
| 220 | + // len=64, trimmed_len=56, len_after_ltrim=8 |
| 221 | + let len = 64; |
| 222 | + let trimmed = characters.repeat(15); |
| 223 | + let remaining_len = len - trimmed.len(); |
| 224 | + run_one_group( |
| 225 | + c, |
| 226 | + "INPUT LEN > 12, OUTPUT LEN <= 12", |
| 227 | + <rim, |
| 228 | + &string_types, |
| 229 | + size, |
| 230 | + len, |
| 231 | + characters, |
| 232 | + &trimmed, |
| 233 | + remaining_len, |
| 234 | + ); |
46 | 235 | }
|
47 | 236 | }
|
48 | 237 |
|
|
0 commit comments