Skip to content

Commit cdb042e

Browse files
authored
Faster timestamp parsing (~70-90% faster) (#3801)
* Faster timestamp parsing
* Faster timezone parsing
* More tests
* Review feedback
* Review feedback
* Fix test
* Format
1 parent de9f826 commit cdb042e

File tree

7 files changed

+371
-131
lines changed

7 files changed

+371
-131
lines changed

arrow-array/src/timezone.rs

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,34 @@
1818
//! Timezone for timestamp arrays
1919
2020
use arrow_schema::ArrowError;
21-
use chrono::format::{parse, Parsed, StrftimeItems};
2221
use chrono::FixedOffset;
2322
pub use private::{Tz, TzOffset};
2423

25-
/// Parses a fixed offset of the form "+09:00"
26-
fn parse_fixed_offset(tz: &str) -> Result<FixedOffset, ArrowError> {
27-
let mut parsed = Parsed::new();
28-
29-
if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%:z"))
30-
.and_then(|_| parsed.to_fixed_offset())
31-
{
32-
return Ok(fixed_offset);
24+
/// Parses a fixed offset of the form "+09:00", "-09" or "+0930"
25+
fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> {
26+
let bytes = tz.as_bytes();
27+
28+
let mut values = match bytes.len() {
29+
// [+-]XX:XX
30+
6 if bytes[3] == b':' => [bytes[1], bytes[2], bytes[4], bytes[5]],
31+
// [+-]XXXX
32+
5 => [bytes[1], bytes[2], bytes[3], bytes[4]],
33+
// [+-]XX
34+
3 => [bytes[1], bytes[2], b'0', b'0'],
35+
_ => return None,
36+
};
37+
values.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0'));
38+
if values.iter().any(|x| *x > 9) {
39+
return None;
3340
}
41+
let secs = (values[0] * 10 + values[1]) as i32 * 60 * 60
42+
+ (values[2] * 10 + values[3]) as i32 * 60;
3443

35-
if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%#z"))
36-
.and_then(|_| parsed.to_fixed_offset())
37-
{
38-
return Ok(fixed_offset);
44+
match bytes[0] {
45+
b'+' => FixedOffset::east_opt(secs),
46+
b'-' => FixedOffset::west_opt(secs),
47+
_ => None,
3948
}
40-
41-
Err(ArrowError::ParseError(format!(
42-
"Invalid timezone \"{tz}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX"
43-
)))
4449
}
4550

4651
#[cfg(feature = "chrono-tz")]
@@ -83,12 +88,11 @@ mod private {
8388
type Err = ArrowError;
8489

8590
fn from_str(tz: &str) -> Result<Self, Self::Err> {
86-
if tz.starts_with('+') || tz.starts_with('-') {
87-
Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?)))
88-
} else {
89-
Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| {
91+
match parse_fixed_offset(tz) {
92+
Some(offset) => Ok(Self(TzInner::Offset(offset))),
93+
None => Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| {
9094
ArrowError::ParseError(format!("Invalid timezone \"{tz}\": {e}"))
91-
})?)))
95+
})?))),
9296
}
9397
}
9498
}
@@ -261,13 +265,12 @@ mod private {
261265
type Err = ArrowError;
262266

263267
fn from_str(tz: &str) -> Result<Self, Self::Err> {
264-
if tz.starts_with('+') || tz.starts_with('-') {
265-
Ok(Self(parse_fixed_offset(tz)?))
266-
} else {
267-
Err(ArrowError::ParseError(format!(
268+
let offset = parse_fixed_offset(tz).ok_or_else(|| {
269+
ArrowError::ParseError(format!(
268270
"Invalid timezone \"{tz}\": only offset based timezones supported without chrono-tz feature"
269-
)))
270-
}
271+
))
272+
})?;
273+
Ok(Self(offset))
271274
}
272275
}
273276

arrow-cast/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,5 +48,10 @@ num = { version = "0.4", default-features = false, features = ["std"] }
4848
lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] }
4949

5050
[dev-dependencies]
51+
criterion = { version = "0.4", default-features = false }
5152

5253
[build-dependencies]
54+
55+
[[bench]]
56+
name = "parse_timestamp"
57+
harness = false

arrow-cast/benches/parse_timestamp.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow_cast::parse::string_to_timestamp_nanos;
19+
use criterion::*;
20+
21+
fn criterion_benchmark(c: &mut Criterion) {
22+
let timestamps = [
23+
"2020-09-08",
24+
"2020-09-08T13:42:29",
25+
"2020-09-08T13:42:29.190",
26+
"2020-09-08T13:42:29.190855",
27+
"2020-09-08T13:42:29.190855999",
28+
"2020-09-08T13:42:29+00:00",
29+
"2020-09-08T13:42:29.190+00:00",
30+
"2020-09-08T13:42:29.190855+00:00",
31+
"2020-09-08T13:42:29.190855999-05:00",
32+
"2020-09-08T13:42:29.190855Z",
33+
];
34+
35+
for timestamp in timestamps {
36+
let t = black_box(timestamp);
37+
c.bench_function(t, |b| {
38+
b.iter(|| string_to_timestamp_nanos(t).unwrap());
39+
});
40+
}
41+
}
42+
43+
criterion_group!(benches, criterion_benchmark);
44+
criterion_main!(benches);

arrow-cast/src/cast.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4914,7 +4914,7 @@ mod tests {
49144914
let err = cast_with_options(array, &to_type, &options).unwrap_err();
49154915
assert_eq!(
49164916
err.to_string(),
4917-
"Cast error: Error parsing 'Not a valid date' as timestamp"
4917+
"Parser error: Error parsing timestamp from 'Not a valid date': error parsing date"
49184918
);
49194919
}
49204920
}
@@ -7899,8 +7899,12 @@ mod tests {
78997899
]);
79007900

79017901
let array = Arc::new(valid) as ArrayRef;
7902-
let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)))
7903-
.unwrap();
7902+
let b = cast_with_options(
7903+
&array,
7904+
&DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)),
7905+
&CastOptions { safe: false },
7906+
)
7907+
.unwrap();
79047908

79057909
let c = b
79067910
.as_any()

0 commit comments

Comments
 (0)