Skip to content

Commit 7e91bc8

Browse files
committed
Faster reverse() string function for ASCII-only case
1 parent e9a77e0 commit 7e91bc8

File tree

4 files changed

+139
-96
lines changed

4 files changed

+139
-96
lines changed

datafusion/functions/benches/character_length.rs

+2-54
Original file line numberDiff line numberDiff line change
@@ -17,62 +17,10 @@
1717

1818
extern crate criterion;
1919

20-
use arrow::array::{StringArray, StringViewArray};
2120
use criterion::{black_box, criterion_group, criterion_main, Criterion};
22-
use datafusion_expr::ColumnarValue;
23-
use rand::distributions::Alphanumeric;
24-
use rand::{rngs::StdRng, Rng, SeedableRng};
25-
use std::sync::Arc;
21+
use helper::gen_string_array;
2622

27-
/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
28-
/// 4096 rows, each row containing a string with 128 random characters.
29-
/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
30-
fn gen_string_array(
31-
n_rows: usize,
32-
str_len_chars: usize,
33-
null_density: f32,
34-
utf8_density: f32,
35-
is_string_view: bool, // false -> StringArray, true -> StringViewArray
36-
) -> Vec<ColumnarValue> {
37-
let mut rng = StdRng::seed_from_u64(42);
38-
let rng_ref = &mut rng;
39-
40-
let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
41-
let corpus_char_count = corpus.chars().count();
42-
43-
let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
44-
for _ in 0..n_rows {
45-
let rand_num = rng_ref.gen::<f32>(); // [0.0, 1.0)
46-
if rand_num < null_density {
47-
output_string_vec.push(None);
48-
} else if rand_num < null_density + utf8_density {
49-
// Generate random UTF8 string
50-
let mut generated_string = String::with_capacity(str_len_chars);
51-
for _ in 0..str_len_chars {
52-
let idx = rng_ref.gen_range(0..corpus_char_count);
53-
let char = corpus.chars().nth(idx).unwrap();
54-
generated_string.push(char);
55-
}
56-
output_string_vec.push(Some(generated_string));
57-
} else {
58-
// Generate random ASCII-only string
59-
let value = rng_ref
60-
.sample_iter(&Alphanumeric)
61-
.take(str_len_chars)
62-
.collect();
63-
let value = String::from_utf8(value).unwrap();
64-
output_string_vec.push(Some(value));
65-
}
66-
}
67-
68-
if is_string_view {
69-
let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
70-
vec![ColumnarValue::Array(Arc::new(string_view_array))]
71-
} else {
72-
let string_array: StringArray = output_string_vec.clone().into_iter().collect();
73-
vec![ColumnarValue::Array(Arc::new(string_array))]
74-
}
75-
}
23+
mod helper;
7624

7725
fn criterion_benchmark(c: &mut Criterion) {
7826
// All benches are single batch run with 8192 rows
datafusion/functions/benches/helper.rs

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{StringArray, StringViewArray};
19+
use datafusion_expr::ColumnarValue;
20+
use rand::distributions::Alphanumeric;
21+
use rand::{rngs::StdRng, Rng, SeedableRng};
22+
use std::sync::Arc;
23+
24+
/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
25+
/// 4096 rows, each row containing a string with 128 random characters.
26+
/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
27+
pub fn gen_string_array(
28+
n_rows: usize,
29+
str_len_chars: usize,
30+
null_density: f32,
31+
utf8_density: f32,
32+
is_string_view: bool, // false -> StringArray, true -> StringViewArray
33+
) -> Vec<ColumnarValue> {
34+
let mut rng = StdRng::seed_from_u64(42);
35+
let rng_ref = &mut rng;
36+
37+
let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
38+
let corpus_char_count = corpus.chars().count();
39+
40+
let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
41+
for _ in 0..n_rows {
42+
let rand_num = rng_ref.gen::<f32>(); // [0.0, 1.0)
43+
if rand_num < null_density {
44+
output_string_vec.push(None);
45+
} else if rand_num < null_density + utf8_density {
46+
// Generate random UTF8 string
47+
let mut generated_string = String::with_capacity(str_len_chars);
48+
for _ in 0..str_len_chars {
49+
let idx = rng_ref.gen_range(0..corpus_char_count);
50+
let char = corpus.chars().nth(idx).unwrap();
51+
generated_string.push(char);
52+
}
53+
output_string_vec.push(Some(generated_string));
54+
} else {
55+
// Generate random ASCII-only string
56+
let value = rng_ref
57+
.sample_iter(&Alphanumeric)
58+
.take(str_len_chars)
59+
.collect();
60+
let value = String::from_utf8(value).unwrap();
61+
output_string_vec.push(Some(value));
62+
}
63+
}
64+
65+
if is_string_view {
66+
let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
67+
vec![ColumnarValue::Array(Arc::new(string_view_array))]
68+
} else {
69+
let string_array: StringArray = output_string_vec.clone().into_iter().collect();
70+
vec![ColumnarValue::Array(Arc::new(string_array))]
71+
}
72+
}

datafusion/functions/benches/reverse.rs

+52-38
Original file line numberDiff line numberDiff line change
@@ -16,70 +16,84 @@
1616
// under the License.
1717

1818
extern crate criterion;
19+
mod helper;
1920

20-
use arrow::array::OffsetSizeTrait;
21-
use arrow::util::bench_util::{
22-
create_string_array_with_len, create_string_view_array_with_len,
23-
};
2421
use criterion::{black_box, criterion_group, criterion_main, Criterion};
25-
use datafusion_expr::ColumnarValue;
26-
use datafusion_functions::unicode;
27-
use std::sync::Arc;
28-
29-
fn create_args<O: OffsetSizeTrait>(
30-
size: usize,
31-
str_len: usize,
32-
force_view_types: bool,
33-
) -> Vec<ColumnarValue> {
34-
if force_view_types {
35-
let string_array =
36-
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
37-
38-
vec![ColumnarValue::Array(string_array)]
39-
} else {
40-
let string_array =
41-
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
42-
43-
vec![ColumnarValue::Array(string_array)]
44-
}
45-
}
22+
use helper::gen_string_array;
4623

4724
fn criterion_benchmark(c: &mut Criterion) {
48-
let reverse = unicode::reverse();
49-
for size in [1024, 4096] {
50-
let str_len = 8;
25+
// All benches are single batch run with 8192 rows
26+
let reverse = datafusion_functions::unicode::reverse();
5127

52-
let args = create_args::<i32>(size, str_len, true);
28+
const N_ROWS: usize = 8192;
29+
const NULL_DENSITY: f32 = 0.1;
30+
const UTF8_DENSITY_OF_ALL_ASCII: f32 = 0.0;
31+
const NORMAL_UTF8_DENSITY: f32 = 0.8;
32+
for str_len in [8, 32, 128, 4096] {
33+
// StringArray ASCII only
34+
let args_string_ascii = gen_string_array(
35+
N_ROWS,
36+
str_len,
37+
NULL_DENSITY,
38+
UTF8_DENSITY_OF_ALL_ASCII,
39+
false,
40+
);
5341
c.bench_function(
54-
format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(),
42+
&format!("reverse_StringArray_ascii_str_len_{}", str_len),
5543
|b| {
5644
b.iter(|| {
5745
// TODO use invoke_with_args
58-
black_box(reverse.invoke_batch(&args, str_len))
46+
black_box(reverse.invoke_batch(&args_string_ascii, N_ROWS))
5947
})
6048
},
6149
);
6250

63-
let str_len = 32;
51+
// StringArray UTF8
52+
let args_string_utf8 =
53+
gen_string_array(N_ROWS, str_len, NULL_DENSITY, NORMAL_UTF8_DENSITY, false);
54+
c.bench_function(
55+
&format!(
56+
"reverse_StringArray_utf8_density_{}_str_len_{}",
57+
NORMAL_UTF8_DENSITY, str_len
58+
),
59+
|b| {
60+
b.iter(|| {
61+
// TODO use invoke_with_args
62+
black_box(reverse.invoke_batch(&args_string_utf8, N_ROWS))
63+
})
64+
},
65+
);
6466

65-
let args = create_args::<i32>(size, str_len, true);
67+
// StringViewArray ASCII only
68+
let args_string_view_ascii = gen_string_array(
69+
N_ROWS,
70+
str_len,
71+
NULL_DENSITY,
72+
UTF8_DENSITY_OF_ALL_ASCII,
73+
true,
74+
);
6675
c.bench_function(
67-
format!("reverse_string_view [size={}, str_len={}]", size, str_len).as_str(),
76+
&format!("reverse_StringViewArray_ascii_str_len_{}", str_len),
6877
|b| {
6978
b.iter(|| {
7079
// TODO use invoke_with_args
71-
black_box(reverse.invoke_batch(&args, str_len))
80+
black_box(reverse.invoke_batch(&args_string_view_ascii, N_ROWS))
7281
})
7382
},
7483
);
7584

76-
let args = create_args::<i32>(size, str_len, false);
85+
// StringViewArray UTF8
86+
let args_string_view_utf8 =
87+
gen_string_array(N_ROWS, str_len, NULL_DENSITY, NORMAL_UTF8_DENSITY, true);
7788
c.bench_function(
78-
format!("reverse_string [size={}, str_len={}]", size, str_len).as_str(),
89+
&format!(
90+
"reverse_StringViewArray_utf8_density_{}_str_len_{}",
91+
NORMAL_UTF8_DENSITY, str_len
92+
),
7993
|b| {
8094
b.iter(|| {
8195
// TODO use invoke_with_args
82-
black_box(reverse.invoke_batch(&args, str_len))
96+
black_box(reverse.invoke_batch(&args_string_view_utf8, N_ROWS))
8397
})
8498
},
8599
);

datafusion/functions/src/unicode/reverse.rs

+13-4
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,21 @@ fn reverse_impl<'a, T: OffsetSizeTrait, V: StringArrayType<'a>>(
119119
) -> Result<ArrayRef> {
120120
let mut builder = GenericStringBuilder::<T>::with_capacity(string_array.len(), 1024);
121121

122-
let mut reversed = String::new();
122+
let mut string_buf = String::new();
123123
for string in string_array.iter() {
124124
if let Some(s) = string {
125-
reversed.extend(s.chars().rev());
126-
builder.append_value(&reversed);
127-
reversed.clear();
125+
if s.is_ascii() {
126+
// SAFETY: Since the original string was ASCII, reversing the bytes still results in valid UTF-8.
127+
let reversed = unsafe {
128+
// reverse bytes directly since ASCII characters are single bytes
129+
String::from_utf8_unchecked(s.bytes().rev().collect::<Vec<u8>>())
130+
};
131+
builder.append_value(&reversed);
132+
} else {
133+
string_buf.extend(s.chars().rev());
134+
builder.append_value(&string_buf);
135+
string_buf.clear();
136+
}
128137
} else {
129138
builder.append_null();
130139
}

0 commit comments

Comments
 (0)