Skip to content

Commit d012bb2

Browse files
aprimadi and tustvold authored
Fix incorrect cast Timestamp with Timezone (#4201)
* Fix incorrect cast Timestamp with Timezone
* Fix incorrect cast timestamp with timezone
* Support chrono_tz Timezone
* Update arrow-cast/src/cast.rs
  Co-authored-by: Raphael Taylor-Davies <[email protected]>
* Update arrow-cast/src/cast.rs
  Co-authored-by: Raphael Taylor-Davies <[email protected]>
* Move chrono-tz timestamp test to arrow/tests
* Fix clippy and cargo fmt
* Fix clippy
---------
Co-authored-by: Raphael Taylor-Davies <[email protected]>
1 parent e1e1c79 commit d012bb2

File tree

2 files changed

+240
-3
lines changed

2 files changed

+240
-3
lines changed

arrow-cast/src/cast.rs

+148-3
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
//! assert_eq!(7.0, c.value(2));
3636
//! ```
3737
38-
use chrono::{NaiveTime, TimeZone, Timelike, Utc};
38+
use chrono::{NaiveTime, Offset, TimeZone, Timelike, Utc};
3939
use std::cmp::Ordering;
4040
use std::sync::Arc;
4141

@@ -1770,7 +1770,7 @@ pub fn cast_with_options(
17701770
tz.clone(),
17711771
)),
17721772

1773-
(Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => {
1773+
(Timestamp(from_unit, from_tz), Timestamp(to_unit, to_tz)) => {
17741774
let array = cast_with_options(array, &Int64, cast_options)?;
17751775
let time_array = array.as_primitive::<Int64Type>();
17761776
let from_size = time_unit_multiple(from_unit);
@@ -1792,8 +1792,52 @@ pub fn cast_with_options(
17921792
}
17931793
}
17941794
};
1795+
// Normalize timezone
1796+
let adjusted = match (from_tz, to_tz) {
1797+
// Only this case needs to be adjusted because we're casting from
1798+
// unknown time offset to some time offset, we want the time to be
1799+
// unchanged.
1800+
//
1801+
// i.e. Timestamp('2001-01-01T00:00', None) -> Timestamp('2001-01-01T00:00', '+0700')
1802+
(None, Some(to_tz)) => {
1803+
let to_tz: Tz = to_tz.parse()?;
1804+
match to_unit {
1805+
TimeUnit::Second => {
1806+
adjust_timestamp_to_timezone::<TimestampSecondType>(
1807+
converted,
1808+
&to_tz,
1809+
cast_options,
1810+
)?
1811+
}
1812+
TimeUnit::Millisecond => {
1813+
adjust_timestamp_to_timezone::<TimestampMillisecondType>(
1814+
converted,
1815+
&to_tz,
1816+
cast_options,
1817+
)?
1818+
}
1819+
TimeUnit::Microsecond => {
1820+
adjust_timestamp_to_timezone::<TimestampMicrosecondType>(
1821+
converted,
1822+
&to_tz,
1823+
cast_options,
1824+
)?
1825+
}
1826+
TimeUnit::Nanosecond => {
1827+
adjust_timestamp_to_timezone::<TimestampNanosecondType>(
1828+
converted,
1829+
&to_tz,
1830+
cast_options,
1831+
)?
1832+
}
1833+
}
1834+
}
1835+
_ => {
1836+
converted
1837+
}
1838+
};
17951839
Ok(make_timestamp_array(
1796-
&converted,
1840+
&adjusted,
17971841
to_unit.clone(),
17981842
to_tz.clone(),
17991843
))
@@ -3005,6 +3049,30 @@ fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
30053049
Ok(Arc::new(interval_array) as ArrayRef)
30063050
}
30073051

3052+
/// Re-bases timezone-less timestamp values so that they denote the same wall-clock
/// time in `to_tz`.
///
/// The caller in `cast_with_options` uses this only for the
/// `Timestamp(_, None) -> Timestamp(_, Some(tz))` case, where the displayed time is
/// meant to stay unchanged (e.g. `2001-01-01T00:00` stays `2001-01-01T00:00+07:00`).
///
/// * `array` - raw `i64` timestamp values, interpreted in the unit of `T`.
/// * `to_tz` - target timezone whose offset is looked up per value.
/// * `cast_options` - `safe` selects between the `unary_opt` path and the
///   `try_unary` path that reports a `CastError`.
///
/// Returns the adjusted values, or the error raised by the `try_unary` path.
fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
3053+
array: PrimitiveArray<Int64Type>,
3054+
to_tz: &Tz,
3055+
cast_options: &CastOptions,
3056+
) -> Result<PrimitiveArray<Int64Type>, ArrowError> {
3057+
// Per-value adjustment; yields `None` as soon as any step below fails.
let adjust = |o| {
3058+
// Decode the raw value into a date-time using the unit of `T`.
let local = as_datetime::<T>(o)?;
3059+
// Look up the offset `to_tz` applies at that local date-time.
// NOTE(review): `single()` presumably yields `None` for ambiguous or
// non-existent local times (DST transitions) - confirm against chrono.
let offset = to_tz.offset_from_local_datetime(&local).single()?;
3060+
// Subtract the offset and re-encode the result in the unit of `T`.
T::make_value(local - offset.fix())
3061+
};
3062+
let adjusted = if cast_options.safe {
3063+
// NOTE(review): `unary_opt` presumably turns `None` results into null
// entries instead of failing - confirm against arrow-array.
array.unary_opt::<_, Int64Type>(adjust)
3064+
} else {
3065+
// Non-safe mode: a value that cannot be adjusted becomes a `CastError`,
// which `?` propagates to the caller.
array.try_unary::<_, Int64Type, _>(|o| {
3066+
adjust(o).ok_or_else(|| {
3067+
ArrowError::CastError(
3068+
"Cannot cast timezone to different timezone".to_string(),
3069+
)
3070+
})
3071+
})?
3072+
};
3073+
Ok(adjusted)
3074+
}
3075+
30083076
/// Casts Utf8 to Boolean
30093077
fn cast_utf8_to_boolean<OffsetSize>(
30103078
from: &dyn Array,
@@ -5978,6 +6046,83 @@ mod tests {
59786046
assert!(b.is_err());
59796047
}
59806048

6049+
// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone))
//
// The timezone-less wall-clock time must be kept as-is and merely labelled with the
// fixed "+0700" offset; the nanosecond input is also narrowed to microseconds.
6050+
#[test]
6051+
fn test_cast_timestamp_with_timezone_1() {
6052+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
6053+
Some("2000-01-01T00:00:00.123456789"),
6054+
Some("2010-01-01T00:00:00.123456789"),
6055+
None,
6056+
]));
6057+
// Step 1: parse the strings into timestamps without a timezone.
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
6058+
let timestamp_array = cast(&string_array, &to_type).unwrap();
6059+
6060+
// Step 2: attach the "+0700" timezone while switching to microseconds.
let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into()));
6061+
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();
6062+
6063+
// Step 3: render back to strings; the time of day is unchanged, the fraction
// is cut to six digits and the "+07:00" suffix appears.
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
6064+
let result = string_array.as_string::<i32>();
6065+
assert_eq!("2000-01-01T00:00:00.123456+07:00", result.value(0));
6066+
assert_eq!("2010-01-01T00:00:00.123456+07:00", result.value(1));
6067+
// The null input stays null through every cast.
assert!(result.is_null(2));
6068+
}
6069+
6070+
// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None)
//
// Dropping the "+0700" timezone is expected to leave the stored instant untouched,
// so the rendered time moves from 07:00 (+07:00) to 00:00 without a suffix.
6071+
#[test]
6072+
fn test_cast_timestamp_with_timezone_2() {
6073+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
6074+
Some("2000-01-01T07:00:00.123456789"),
6075+
Some("2010-01-01T07:00:00.123456789"),
6076+
None,
6077+
]));
6078+
// Parse the strings straight into millisecond timestamps tagged "+0700".
let to_type = DataType::Timestamp(TimeUnit::Millisecond, Some("+0700".into()));
6079+
let timestamp_array = cast(&string_array, &to_type).unwrap();
6080+
6081+
// Check intermediate representation is correct
6082+
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
6083+
let result = string_array.as_string::<i32>();
6084+
assert_eq!("2000-01-01T07:00:00.123+07:00", result.value(0));
6085+
assert_eq!("2010-01-01T07:00:00.123+07:00", result.value(1));
6086+
assert!(result.is_null(2));
6087+
6088+
// Remove the timezone while widening the unit to nanoseconds.
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
6089+
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();
6090+
6091+
// The rendered time is seven hours earlier than the "+07:00" rendering above;
// only the millisecond precision kept by the first cast survives.
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
6092+
let result = string_array.as_string::<i32>();
6093+
assert_eq!("2000-01-01T00:00:00.123", result.value(0));
6094+
assert_eq!("2010-01-01T00:00:00.123", result.value(1));
6095+
assert!(result.is_null(2));
6096+
}
6097+
6098+
// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone))
//
// Switching between two fixed offsets ("+0700" -> "-08:00") is expected to keep the
// instant and only change how it is rendered: 07:00+07:00 becomes 16:00-08:00 on
// the previous day.
6099+
#[test]
6100+
fn test_cast_timestamp_with_timezone_3() {
6101+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
6102+
Some("2000-01-01T07:00:00.123456789"),
6103+
Some("2010-01-01T07:00:00.123456789"),
6104+
None,
6105+
]));
6106+
// Parse the strings into microsecond timestamps tagged "+0700".
let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into()));
6107+
let timestamp_array = cast(&string_array, &to_type).unwrap();
6108+
6109+
// Check intermediate representation is correct
6110+
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
6111+
let result = string_array.as_string::<i32>();
6112+
assert_eq!("2000-01-01T07:00:00.123456+07:00", result.value(0));
6113+
assert_eq!("2010-01-01T07:00:00.123456+07:00", result.value(1));
6114+
assert!(result.is_null(2));
6115+
6116+
// Re-tag with "-08:00" and narrow to whole seconds.
let to_type = DataType::Timestamp(TimeUnit::Second, Some("-08:00".into()));
6117+
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();
6118+
6119+
// The fractional part is gone (second precision) and the time is shown in
// the "-08:00" offset.
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
6120+
let result = string_array.as_string::<i32>();
6121+
assert_eq!("1999-12-31T16:00:00-08:00", result.value(0));
6122+
assert_eq!("2009-12-31T16:00:00-08:00", result.value(1));
6123+
assert!(result.is_null(2));
6124+
}
6125+
59816126
#[test]
59826127
fn test_cast_date64_to_timestamp() {
59836128
let array =

arrow/tests/array_cast.rs

+92
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use arrow_array::builder::{
1919
PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder,
2020
};
21+
use arrow_array::cast::AsArray;
2122
use arrow_array::types::{
2223
ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type,
2324
Int64Type, Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type,
@@ -64,6 +65,97 @@ fn test_cast_timestamp_to_string() {
6465
assert!(c.is_null(2));
6566
}
6667

68+
// See: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones for list of valid
69+
// timezones
70+
71+
// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone))
//
// Same scenario as the fixed-offset variant, but with the named zone
// "America/New_York": the offset attached to each value must follow the date
// (-05:00 in January, -04:00 in July) while the wall-clock time stays put.
72+
#[test]
73+
fn test_cast_timestamp_with_timezone_daylight_1() {
74+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
75+
// This is winter in New York so daylight saving is not in effect
76+
// UTC offset is -05:00
77+
Some("2000-01-01T00:00:00.123456789"),
78+
// This is summer in New York so daylight saving is in effect
79+
// UTC offset is -04:00
80+
Some("2010-07-01T00:00:00.123456789"),
81+
None,
82+
]));
83+
// Parse into timezone-less nanosecond timestamps first.
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
84+
let timestamp_array = cast(&string_array, &to_type).unwrap();
85+
86+
// Attach the named timezone while narrowing to microseconds.
let to_type =
87+
DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
88+
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();
89+
90+
// Midnight is preserved for both rows; only the offset suffix differs.
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
91+
let result = string_array.as_string::<i32>();
92+
assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0));
93+
assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1));
94+
assert!(result.is_null(2));
95+
}
96+
97+
// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None)
//
// Dropping the "America/New_York" timezone is expected to keep the instant, so the
// rendered time shifts by the date-dependent offset: +5h for the January row and
// +4h for the July row.
98+
#[test]
99+
fn test_cast_timestamp_with_timezone_daylight_2() {
100+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
101+
Some("2000-01-01T07:00:00.123456789"),
102+
Some("2010-07-01T07:00:00.123456789"),
103+
None,
104+
]));
105+
// Parse the strings straight into millisecond timestamps in New York time.
let to_type =
106+
DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into()));
107+
let timestamp_array = cast(&string_array, &to_type).unwrap();
108+
109+
// Check intermediate representation is correct
110+
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
111+
let result = string_array.as_string::<i32>();
112+
assert_eq!("2000-01-01T07:00:00.123-05:00", result.value(0));
113+
assert_eq!("2010-07-01T07:00:00.123-04:00", result.value(1));
114+
assert!(result.is_null(2));
115+
116+
// Remove the timezone while widening the unit to nanoseconds.
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
117+
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();
118+
119+
// 07:00-05:00 renders as 12:00 and 07:00-04:00 renders as 11:00.
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
120+
let result = string_array.as_string::<i32>();
121+
assert_eq!("2000-01-01T12:00:00.123", result.value(0));
122+
assert_eq!("2010-07-01T11:00:00.123", result.value(1));
123+
assert!(result.is_null(2));
124+
}
125+
126+
// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone))
//
// Converts between two named zones whose daylight-saving periods are opposite
// ("America/New_York" -> "Australia/Sydney"); the instant is expected to be kept
// and re-rendered with the target zone's date-dependent offset.
127+
#[test]
128+
fn test_cast_timestamp_with_timezone_daylight_3() {
129+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
130+
// Winter in New York, summer in Sydney
131+
// UTC offset is -05:00 (New York) and +11:00 (Sydney)
132+
Some("2000-01-01T00:00:00.123456789"),
133+
// Summer in New York, winter in Sydney
134+
// UTC offset is -04:00 (New York) and +10:00 (Sydney)
135+
Some("2010-07-01T00:00:00.123456789"),
136+
None,
137+
]));
138+
// Parse the strings into microsecond timestamps in New York time.
let to_type =
139+
DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
140+
let timestamp_array = cast(&string_array, &to_type).unwrap();
141+
142+
// Check intermediate representation is correct
143+
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
144+
let result = string_array.as_string::<i32>();
145+
assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0));
146+
assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1));
147+
assert!(result.is_null(2));
148+
149+
// Re-tag with the Sydney timezone and narrow to whole seconds.
let to_type = DataType::Timestamp(TimeUnit::Second, Some("Australia/Sydney".into()));
150+
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();
151+
152+
// New York midnight renders as 16:00+11:00 (January) and 14:00+10:00 (July);
// the fractional seconds are dropped by the second-precision target.
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
153+
let result = string_array.as_string::<i32>();
154+
assert_eq!("2000-01-01T16:00:00+11:00", result.value(0));
155+
assert_eq!("2010-07-01T14:00:00+10:00", result.value(1));
156+
assert!(result.is_null(2));
157+
}
158+
67159
#[test]
68160
#[cfg_attr(miri, ignore)] // running forever
69161
fn test_can_cast_types() {

0 commit comments

Comments
 (0)