Skip to content

Commit 6a036ae

Browse files
authored
Speed up chr UDF (~4x faster) (#14700)
* add chr bench
* speed up chr
* 1 byte assumption
1 parent 481515e commit 6a036ae

File tree

3 files changed: +90 additions, -20 deletions

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,11 @@ harness = false
108108
name = "encoding"
109109
required-features = ["encoding_expressions"]
110110

111+
[[bench]]
112+
harness = false
113+
name = "chr"
114+
required-features = ["string_expressions"]
115+
111116
[[bench]]
112117
harness = false
113118
name = "uuid"

datafusion/functions/benches/chr.rs

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use arrow::{array::PrimitiveArray, datatypes::Int64Type, util::test_util::seedable_rng};
21+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
22+
use datafusion_expr::ColumnarValue;
23+
use datafusion_functions::string::chr;
24+
use rand::Rng;
25+
26+
use std::sync::Arc;
27+
28+
fn criterion_benchmark(c: &mut Criterion) {
29+
let cot_fn = chr();
30+
let size = 1024;
31+
let input: PrimitiveArray<Int64Type> = {
32+
let null_density = 0.2;
33+
let mut rng = seedable_rng();
34+
(0..size)
35+
.map(|_| {
36+
if rng.gen::<f32>() < null_density {
37+
None
38+
} else {
39+
Some(rng.gen_range::<i64, _>(1i64..10_000))
40+
}
41+
})
42+
.collect()
43+
};
44+
let input = Arc::new(input);
45+
let args = vec![ColumnarValue::Array(input)];
46+
c.bench_function("chr", |b| {
47+
b.iter(|| black_box(cot_fn.invoke_batch(&args, size).unwrap()))
48+
});
49+
}
50+
51+
criterion_group!(benches, criterion_benchmark);
52+
criterion_main!(benches);

datafusion/functions/src/string/chr.rs

Lines changed: 33 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ use std::any::Any;
1919
use std::sync::Arc;
2020

2121
use arrow::array::ArrayRef;
22-
use arrow::array::StringArray;
22+
use arrow::array::GenericStringBuilder;
2323
use arrow::datatypes::DataType;
2424
use arrow::datatypes::DataType::Int64;
2525
use arrow::datatypes::DataType::Utf8;
@@ -36,26 +36,39 @@ use datafusion_macros::user_doc;
3636
pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
3737
let integer_array = as_int64_array(&args[0])?;
3838

39-
// first map is the iterator, second is for the `Option<_>`
40-
let result = integer_array
41-
.iter()
42-
.map(|integer: Option<i64>| {
43-
integer
44-
.map(|integer| {
45-
if integer == 0 {
46-
exec_err!("null character not permitted.")
47-
} else {
48-
match core::char::from_u32(integer as u32) {
49-
Some(integer) => Ok(integer.to_string()),
50-
None => {
51-
exec_err!("requested character too large for encoding.")
52-
}
39+
let mut builder = GenericStringBuilder::<i32>::with_capacity(
40+
integer_array.len(),
41+
// 1 byte per character, assuming that is the common case
42+
integer_array.len(),
43+
);
44+
45+
let mut buf = [0u8; 4];
46+
47+
for integer in integer_array {
48+
match integer {
49+
Some(integer) => {
50+
if integer == 0 {
51+
return exec_err!("null character not permitted.");
52+
} else {
53+
match core::char::from_u32(integer as u32) {
54+
Some(c) => {
55+
builder.append_value(c.encode_utf8(&mut buf));
56+
}
57+
None => {
58+
return exec_err!(
59+
"requested character too large for encoding."
60+
);
5361
}
5462
}
55-
})
56-
.transpose()
57-
})
58-
.collect::<Result<StringArray>>()?;
63+
}
64+
}
65+
None => {
66+
builder.append_null();
67+
}
68+
}
69+
}
70+
71+
let result = builder.finish();
5972

6073
Ok(Arc::new(result) as ArrayRef)
6174
}
@@ -70,7 +83,7 @@ pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
7083
| chr(Int64(128640)) |
7184
+--------------------+
7285
| 🚀 |
73-
+--------------------+
86+
+--------------------+
7487
```"#,
7588
standard_argument(name = "expression", prefix = "String"),
7689
related_udf(name = "ascii")

0 commit comments

Comments
 (0)