Skip to content

Commit 435f959

Browse files
Omega359 and alamb authored
Update TO_DATE, TO_TIMESTAMP scalar functions to support LargeUtf8, Utf8View (#12929)
* Update to_date and to_timestamp* UDFs to support LargeUtf8 and Utf8View. Benchmark updated as well.
* datetime depends on string expressions until #12898 lands
* Update to reflect the StringArrayType move to a common path
* Update datafusion/functions/src/datetime/common.rs

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 90720c0 commit 435f959

File tree

7 files changed

+584
-190
lines changed

7 files changed

+584
-190
lines changed

datafusion/functions/benches/to_timestamp.rs

Lines changed: 169 additions & 74 deletions
Original file line number | Diff line number | Diff line change
@@ -20,27 +20,123 @@ extern crate criterion;
2020
use std::sync::Arc;
2121

2222
use arrow::array::builder::StringBuilder;
23-
use arrow::array::ArrayRef;
23+
use arrow::array::{ArrayRef, StringArray};
24+
use arrow::compute::cast;
25+
use arrow::datatypes::DataType;
2426
use criterion::{black_box, criterion_group, criterion_main, Criterion};
2527

2628
use datafusion_expr::ColumnarValue;
2729
use datafusion_functions::datetime::to_timestamp;
2830

31+
fn data() -> StringArray {
32+
let data: Vec<&str> = vec![
33+
"1997-01-31T09:26:56.123Z",
34+
"1997-01-31T09:26:56.123-05:00",
35+
"1997-01-31 09:26:56.123-05:00",
36+
"2023-01-01 04:05:06.789 -08",
37+
"1997-01-31T09:26:56.123",
38+
"1997-01-31 09:26:56.123",
39+
"1997-01-31 09:26:56",
40+
"1997-01-31 13:26:56",
41+
"1997-01-31 13:26:56+04:00",
42+
"1997-01-31",
43+
];
44+
45+
StringArray::from(data)
46+
}
47+
48+
/// Builds the four parallel string columns used by the
/// `to_timestamp_with_formats_*` benchmarks.
///
/// Returns `(inputs, format1, format2, format3)`, each holding ten rows.
/// `format1` is `%+` on every row, `format2` is `%c` on every row, and
/// `format3` is the row-specific format string listed next to its input
/// in the table below.
fn data_with_formats() -> (StringArray, StringArray, StringArray, StringArray) {
    // (input value, row-specific third format string)
    const ROWS: [(&str, &str); 10] = [
        ("1997-01-31T09:26:56.123Z", "%Y-%m-%dT%H:%M:%S%.f%Z"),
        ("1997-01-31T09:26:56.123-05:00", "%Y-%m-%dT%H:%M:%S%.f%z"),
        ("1997-01-31 09:26:56.123-05:00", "%Y-%m-%d %H:%M:%S%.f%Z"),
        ("2023-01-01 04:05:06.789 -08", "%Y-%m-%d %H:%M:%S%.f %#z"),
        ("1997-01-31T09:26:56.123", "%Y-%m-%dT%H:%M:%S%.f"),
        ("1997-01-31 09:26:56.123", "%Y-%m-%d %H:%M:%S%.f"),
        ("1997-01-31 09:26:56", "%Y-%m-%d %H:%M:%S"),
        ("1997-01-31 092656", "%Y-%m-%d %H%M%S"),
        ("1997-01-31 092656+04:00", "%Y-%m-%d %H%M%S%:z"),
        ("Sun Jul 8 00:34:60 2001", "%Y-%m-%d 00:00:00"),
    ];

    let mut values = StringBuilder::new();
    let mut first_formats = StringBuilder::with_capacity(2, 10);
    let mut second_formats = StringBuilder::with_capacity(2, 10);
    let mut third_formats = StringBuilder::with_capacity(2, 10);

    for &(value, third_format) in &ROWS {
        values.append_value(value);
        first_formats.append_value("%+");
        second_formats.append_value("%c");
        third_formats.append_value(third_format);
    }

    (
        values.finish(),
        first_formats.finish(),
        second_formats.finish(),
        third_formats.finish(),
    )
}
29111
fn criterion_benchmark(c: &mut Criterion) {
30-
c.bench_function("to_timestamp_no_formats", |b| {
31-
let mut inputs = StringBuilder::new();
32-
inputs.append_value("1997-01-31T09:26:56.123Z");
33-
inputs.append_value("1997-01-31T09:26:56.123-05:00");
34-
inputs.append_value("1997-01-31 09:26:56.123-05:00");
35-
inputs.append_value("2023-01-01 04:05:06.789 -08");
36-
inputs.append_value("1997-01-31T09:26:56.123");
37-
inputs.append_value("1997-01-31 09:26:56.123");
38-
inputs.append_value("1997-01-31 09:26:56");
39-
inputs.append_value("1997-01-31 13:26:56");
40-
inputs.append_value("1997-01-31 13:26:56+04:00");
41-
inputs.append_value("1997-01-31");
42-
43-
let string_array = ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef);
112+
c.bench_function("to_timestamp_no_formats_utf8", |b| {
113+
let string_array = ColumnarValue::Array(Arc::new(data()) as ArrayRef);
114+
115+
b.iter(|| {
116+
black_box(
117+
to_timestamp()
118+
.invoke(&[string_array.clone()])
119+
.expect("to_timestamp should work on valid values"),
120+
)
121+
})
122+
});
123+
124+
c.bench_function("to_timestamp_no_formats_largeutf8", |b| {
125+
let data = cast(&data(), &DataType::LargeUtf8).unwrap();
126+
let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);
127+
128+
b.iter(|| {
129+
black_box(
130+
to_timestamp()
131+
.invoke(&[string_array.clone()])
132+
.expect("to_timestamp should work on valid values"),
133+
)
134+
})
135+
});
136+
137+
c.bench_function("to_timestamp_no_formats_utf8view", |b| {
138+
let data = cast(&data(), &DataType::Utf8View).unwrap();
139+
let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);
44140

45141
b.iter(|| {
46142
black_box(
@@ -51,67 +147,66 @@ fn criterion_benchmark(c: &mut Criterion) {
51147
})
52148
});
53149

54-
c.bench_function("to_timestamp_with_formats", |b| {
55-
let mut inputs = StringBuilder::new();
56-
let mut format1_builder = StringBuilder::with_capacity(2, 10);
57-
let mut format2_builder = StringBuilder::with_capacity(2, 10);
58-
let mut format3_builder = StringBuilder::with_capacity(2, 10);
59-
60-
inputs.append_value("1997-01-31T09:26:56.123Z");
61-
format1_builder.append_value("%+");
62-
format2_builder.append_value("%c");
63-
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z");
64-
65-
inputs.append_value("1997-01-31T09:26:56.123-05:00");
66-
format1_builder.append_value("%+");
67-
format2_builder.append_value("%c");
68-
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z");
69-
70-
inputs.append_value("1997-01-31 09:26:56.123-05:00");
71-
format1_builder.append_value("%+");
72-
format2_builder.append_value("%c");
73-
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z");
74-
75-
inputs.append_value("2023-01-01 04:05:06.789 -08");
76-
format1_builder.append_value("%+");
77-
format2_builder.append_value("%c");
78-
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z");
79-
80-
inputs.append_value("1997-01-31T09:26:56.123");
81-
format1_builder.append_value("%+");
82-
format2_builder.append_value("%c");
83-
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f");
84-
85-
inputs.append_value("1997-01-31 09:26:56.123");
86-
format1_builder.append_value("%+");
87-
format2_builder.append_value("%c");
88-
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f");
89-
90-
inputs.append_value("1997-01-31 09:26:56");
91-
format1_builder.append_value("%+");
92-
format2_builder.append_value("%c");
93-
format3_builder.append_value("%Y-%m-%d %H:%M:%S");
94-
95-
inputs.append_value("1997-01-31 092656");
96-
format1_builder.append_value("%+");
97-
format2_builder.append_value("%c");
98-
format3_builder.append_value("%Y-%m-%d %H%M%S");
99-
100-
inputs.append_value("1997-01-31 092656+04:00");
101-
format1_builder.append_value("%+");
102-
format2_builder.append_value("%c");
103-
format3_builder.append_value("%Y-%m-%d %H%M%S%:z");
104-
105-
inputs.append_value("Sun Jul 8 00:34:60 2001");
106-
format1_builder.append_value("%+");
107-
format2_builder.append_value("%c");
108-
format3_builder.append_value("%Y-%m-%d 00:00:00");
150+
c.bench_function("to_timestamp_with_formats_utf8", |b| {
151+
let (inputs, format1, format2, format3) = data_with_formats();
152+
153+
let args = [
154+
ColumnarValue::Array(Arc::new(inputs) as ArrayRef),
155+
ColumnarValue::Array(Arc::new(format1) as ArrayRef),
156+
ColumnarValue::Array(Arc::new(format2) as ArrayRef),
157+
ColumnarValue::Array(Arc::new(format3) as ArrayRef),
158+
];
159+
b.iter(|| {
160+
black_box(
161+
to_timestamp()
162+
.invoke(&args.clone())
163+
.expect("to_timestamp should work on valid values"),
164+
)
165+
})
166+
});
167+
168+
c.bench_function("to_timestamp_with_formats_largeutf8", |b| {
169+
let (inputs, format1, format2, format3) = data_with_formats();
170+
171+
let args = [
172+
ColumnarValue::Array(
173+
Arc::new(cast(&inputs, &DataType::LargeUtf8).unwrap()) as ArrayRef
174+
),
175+
ColumnarValue::Array(
176+
Arc::new(cast(&format1, &DataType::LargeUtf8).unwrap()) as ArrayRef
177+
),
178+
ColumnarValue::Array(
179+
Arc::new(cast(&format2, &DataType::LargeUtf8).unwrap()) as ArrayRef
180+
),
181+
ColumnarValue::Array(
182+
Arc::new(cast(&format3, &DataType::LargeUtf8).unwrap()) as ArrayRef
183+
),
184+
];
185+
b.iter(|| {
186+
black_box(
187+
to_timestamp()
188+
.invoke(&args.clone())
189+
.expect("to_timestamp should work on valid values"),
190+
)
191+
})
192+
});
193+
194+
c.bench_function("to_timestamp_with_formats_utf8view", |b| {
195+
let (inputs, format1, format2, format3) = data_with_formats();
109196

110197
let args = [
111-
ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef),
112-
ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef),
113-
ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef),
114-
ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef),
198+
ColumnarValue::Array(
199+
Arc::new(cast(&inputs, &DataType::Utf8View).unwrap()) as ArrayRef
200+
),
201+
ColumnarValue::Array(
202+
Arc::new(cast(&format1, &DataType::Utf8View).unwrap()) as ArrayRef
203+
),
204+
ColumnarValue::Array(
205+
Arc::new(cast(&format2, &DataType::Utf8View).unwrap()) as ArrayRef
206+
),
207+
ColumnarValue::Array(
208+
Arc::new(cast(&format3, &DataType::Utf8View).unwrap()) as ArrayRef
209+
),
115210
];
116211
b.iter(|| {
117212
black_box(

0 commit comments

Comments
 (0)