Skip to content

Commit 6cd0917

Browse files
tustvold and alamb authored
Timezone aware timestamp parsing (#3794) (#3795)
* Timezone aware timestamp parsing (#3794) * Add further test * Update arrow-cast/src/parse.rs Co-authored-by: Andrew Lamb <[email protected]> --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 7fdd0d8 commit 6cd0917

File tree

1 file changed

+88
-45
lines changed

1 file changed

+88
-45
lines changed

arrow-cast/src/parse.rs

Lines changed: 88 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,7 @@ use arrow_array::ArrowPrimitiveType;
2020
use arrow_schema::ArrowError;
2121
use chrono::prelude::*;
2222

23-
/// Accepts a string in RFC3339 / ISO8601 standard format and some
24-
/// variants and converts it to a nanosecond precision timestamp.
25-
///
26-
/// Implements the `to_timestamp` function to convert a string to a
27-
/// timestamp, following the model of spark SQL’s to_`timestamp`.
23+
/// Accepts a string and parses it relative to the provided `timezone`
2824
///
2925
/// In addition to RFC3339 / ISO8601 standard timestamps, it also
3026
/// accepts strings that use a space ` ` to separate the date and time
@@ -38,36 +34,6 @@ use chrono::prelude::*;
3834
/// * `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
3935
/// * `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds
4036
/// * `1997-01-31` # close to RFC3339, only date no time
41-
//
42-
/// Internally, this function uses the `chrono` library for the
43-
/// datetime parsing
44-
///
45-
/// We hope to extend this function in the future with a second
46-
/// parameter to specifying the format string.
47-
///
48-
/// ## Timestamp Precision
49-
///
50-
/// Function uses the maximum precision timestamps supported by
51-
/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
52-
/// means the range of dates that timestamps can represent is ~1677 AD
53-
/// to 2262 AM
54-
///
55-
///
56-
/// ## Timezone / Offset Handling
57-
///
58-
/// Numerical values of timestamps are stored compared to offset UTC.
59-
///
60-
/// This function interprets strings without an explicit time zone as
61-
/// timestamps with offsets of the local time on the machine
62-
///
63-
/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as
64-
/// it has an explicit timezone specifier (“Z” for Zulu/UTC)
65-
///
66-
/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in
67-
/// the timezone of the machine. For example, if
68-
/// the system timezone is set to Americas/New_York (UTC-5) the
69-
/// timestamp will be interpreted as though it were
70-
/// `1997-01-31T09:26:56.123-05:00`
7137
///
7238
/// Some formats that are supported by PostgreSQL <https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-TIME-TABLE>
7339
/// are still not supported by chrono, like
@@ -76,12 +42,14 @@ use chrono::prelude::*;
7642
/// "2023-01-01 040506 +07:30:00",
7743
/// "2023-01-01 04:05:06.789 PST",
7844
/// "2023-01-01 04:05:06.789 -08",
79-
#[inline]
80-
pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
45+
pub fn string_to_datetime<T: TimeZone>(
46+
timezone: &T,
47+
s: &str,
48+
) -> Result<DateTime<T>, ArrowError> {
8149
// Fast path: RFC3339 timestamp (with a T)
8250
// Example: 2020-09-08T13:42:29.190855Z
8351
if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
84-
return Ok(ts.timestamp_nanos());
52+
return Ok(ts.with_timezone(timezone));
8553
}
8654

8755
// Implement quasi-RFC3339 support by trying to parse the
@@ -96,14 +64,14 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
9664

9765
for f in supported_formats.iter() {
9866
if let Ok(ts) = DateTime::parse_from_str(s, f) {
99-
return to_timestamp_nanos(ts.naive_utc());
67+
return Ok(ts.with_timezone(timezone));
10068
}
10169
}
10270

10371
// with an explicit Z, using ' ' as a separator
10472
// Example: 2020-09-08 13:42:29Z
10573
if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
106-
return to_timestamp_nanos(ts.naive_utc());
74+
return Ok(ts.with_timezone(timezone));
10775
}
10876

10977
// Support timestamps without an explicit timezone offset, again
@@ -112,34 +80,44 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
11280
// without a timezone specifier as a local time, using T as a separator
11381
// Example: 2020-09-08T13:42:29.190855
11482
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") {
115-
return to_timestamp_nanos(ts);
83+
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
84+
return Ok(DateTime::from_local(ts, offset));
85+
}
11686
}
11787

11888
// without a timezone specifier as a local time, using T as a
11989
// separator, no fractional seconds
12090
// Example: 2020-09-08T13:42:29
12191
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
122-
return Ok(ts.timestamp_nanos());
92+
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
93+
return Ok(DateTime::from_local(ts, offset));
94+
}
12395
}
12496

12597
// without a timezone specifier as a local time, using ' ' as a separator
12698
// Example: 2020-09-08 13:42:29.190855
12799
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") {
128-
return to_timestamp_nanos(ts);
100+
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
101+
return Ok(DateTime::from_local(ts, offset));
102+
}
129103
}
130104

131105
// without a timezone specifier as a local time, using ' ' as a
132106
// separator, no fractional seconds
133107
// Example: 2020-09-08 13:42:29
134108
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
135-
return Ok(ts.timestamp_nanos());
109+
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
110+
return Ok(DateTime::from_local(ts, offset));
111+
}
136112
}
137113

138114
// without a timezone specifier as a local time, only date
139115
// Example: 2020-09-08
140116
if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
141117
if let Some(ts) = dt.and_hms_opt(0, 0, 0) {
142-
return Ok(ts.timestamp_nanos());
118+
if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() {
119+
return Ok(DateTime::from_local(ts, offset));
120+
}
143121
}
144122
}
145123

@@ -153,6 +131,42 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
153131
)))
154132
}
155133

134+
/// Accepts a string in RFC3339 / ISO8601 standard format and some
135+
/// variants and converts it to a nanosecond precision timestamp.
136+
///
137+
/// See [`string_to_datetime`] for the full set of supported formats
138+
///
139+
/// Implements the `to_timestamp` function to convert a string to a
140+
/// timestamp, following the model of Spark SQL’s `to_timestamp`.
141+
///
142+
/// Internally, this function uses the `chrono` library for the
143+
/// datetime parsing
144+
///
145+
/// We hope to extend this function in the future with a second
146+
/// parameter specifying the format string.
147+
///
148+
/// ## Timestamp Precision
149+
///
150+
/// Function uses the maximum precision timestamps supported by
151+
/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
152+
/// means the range of dates that timestamps can represent is ~1677 AD
153+
/// to 2262 AD
154+
///
155+
/// ## Timezone / Offset Handling
156+
///
157+
/// Numerical values of timestamps are stored compared to offset UTC.
158+
///
159+
/// This function interprets strings without an explicit time zone as timestamps
160+
/// relative to UTC, see [`string_to_datetime`] for alternative semantics
161+
///
162+
/// For example, `1997-01-31 09:26:56.123Z`, `1997-01-31T09:26:56.123`,
163+
/// and `1997-01-31T14:26:56.123+05:00` will all be parsed as the same value
164+
///
165+
#[inline]
166+
pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
167+
to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc())
168+
}
169+
156170
/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates
157171
#[inline]
158172
fn to_timestamp_nanos(dt: NaiveDateTime) -> Result<i64, ArrowError> {
@@ -448,6 +462,7 @@ impl Parser for Date64Type {
448462
#[cfg(test)]
449463
mod tests {
450464
use super::*;
465+
use arrow_array::timezone::Tz;
451466

452467
#[test]
453468
fn string_to_timestamp_timezone() {
@@ -614,6 +629,34 @@ mod tests {
614629
naive_datetime.timestamp_nanos(),
615630
parse_timestamp("2020-09-08 13:42:29").unwrap()
616631
);
632+
633+
let tz: Tz = "+02:00".parse().unwrap();
634+
let date = string_to_datetime(&tz, "2020-09-08 13:42:29").unwrap();
635+
let utc = date.naive_utc().to_string();
636+
assert_eq!(utc, "2020-09-08 11:42:29");
637+
let local = date.naive_local().to_string();
638+
assert_eq!(local, "2020-09-08 13:42:29");
639+
640+
let date = string_to_datetime(&tz, "2020-09-08 13:42:29Z").unwrap();
641+
let utc = date.naive_utc().to_string();
642+
assert_eq!(utc, "2020-09-08 13:42:29");
643+
let local = date.naive_local().to_string();
644+
assert_eq!(local, "2020-09-08 15:42:29");
645+
646+
let dt =
647+
NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ")
648+
.unwrap();
649+
let local: Tz = "+08:00".parse().unwrap();
650+
651+
// Parsed as offset from UTC
652+
let date = string_to_datetime(&local, "2020-09-08T13:42:29Z").unwrap();
653+
assert_eq!(dt, date.naive_utc());
654+
assert_ne!(dt, date.naive_local());
655+
656+
// Parsed as offset from local
657+
let date = string_to_datetime(&local, "2020-09-08 13:42:29").unwrap();
658+
assert_eq!(dt, date.naive_local());
659+
assert_ne!(dt, date.naive_utc());
617660
}
618661

619662
#[test]

0 commit comments

Comments
 (0)