Skip to content

Fix incorrect cast Timestamp with Timezone #4201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 148 additions & 3 deletions arrow-cast/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
//! assert_eq!(7.0, c.value(2));
//! ```

use chrono::{NaiveTime, TimeZone, Timelike, Utc};
use chrono::{NaiveTime, Offset, TimeZone, Timelike, Utc};
use std::cmp::Ordering;
use std::sync::Arc;

Expand Down Expand Up @@ -1770,7 +1770,7 @@ pub fn cast_with_options(
tz.clone(),
)),

(Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => {
(Timestamp(from_unit, from_tz), Timestamp(to_unit, to_tz)) => {
let array = cast_with_options(array, &Int64, cast_options)?;
let time_array = array.as_primitive::<Int64Type>();
let from_size = time_unit_multiple(from_unit);
Expand All @@ -1792,8 +1792,52 @@ pub fn cast_with_options(
}
}
};
// Normalize timezone
let adjusted = match (from_tz, to_tz) {
// Only this case needs to be adjusted because we're casting from
// unknown time offset to some time offset, we want the time to be
// unchanged.
//
// i.e. Timestamp('2001-01-01T00:00', None) -> Timestamp('2001-01-01T00:00', '+0700')
(None, Some(to_tz)) => {
let to_tz: Tz = to_tz.parse()?;
match to_unit {
TimeUnit::Second => {
adjust_timestamp_to_timezone::<TimestampSecondType>(
converted,
&to_tz,
cast_options,
)?
}
TimeUnit::Millisecond => {
adjust_timestamp_to_timezone::<TimestampMillisecondType>(
converted,
&to_tz,
cast_options,
)?
}
TimeUnit::Microsecond => {
adjust_timestamp_to_timezone::<TimestampMicrosecondType>(
converted,
&to_tz,
cast_options,
)?
}
TimeUnit::Nanosecond => {
adjust_timestamp_to_timezone::<TimestampNanosecondType>(
converted,
&to_tz,
cast_options,
)?
}
}
}
_ => {
converted
}
};
Ok(make_timestamp_array(
&converted,
&adjusted,
to_unit.clone(),
to_tz.clone(),
))
Expand Down Expand Up @@ -3005,6 +3049,30 @@ fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
Ok(Arc::new(interval_array) as ArrayRef)
}

fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
array: PrimitiveArray<Int64Type>,
to_tz: &Tz,
cast_options: &CastOptions,
) -> Result<PrimitiveArray<Int64Type>, ArrowError> {
let adjust = |o| {
let local = as_datetime::<T>(o)?;
let offset = to_tz.offset_from_local_datetime(&local).single()?;
T::make_value(local - offset.fix())
};
let adjusted = if cast_options.safe {
array.unary_opt::<_, Int64Type>(adjust)
} else {
array.try_unary::<_, Int64Type, _>(|o| {
adjust(o).ok_or_else(|| {
ArrowError::CastError(
"Cannot cast timezone to different timezone".to_string(),
)
})
})?
};
Ok(adjusted)
}

/// Casts Utf8 to Boolean
fn cast_utf8_to_boolean<OffsetSize>(
from: &dyn Array,
Expand Down Expand Up @@ -5978,6 +6046,83 @@ mod tests {
assert!(b.is_err());
}

// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone))
#[test]
fn test_cast_timestamp_with_timezone_1() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T00:00:00.123456789"),
Some("2010-01-01T00:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&string_array, &to_type).unwrap();

let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T00:00:00.123456+07:00", result.value(0));
assert_eq!("2010-01-01T00:00:00.123456+07:00", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None)
#[test]
fn test_cast_timestamp_with_timezone_2() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T07:00:00.123456789"),
Some("2010-01-01T07:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Millisecond, Some("+0700".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T07:00:00.123+07:00", result.value(0));
assert_eq!("2010-01-01T07:00:00.123+07:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T00:00:00.123", result.value(0));
assert_eq!("2010-01-01T00:00:00.123", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone))
#[test]
fn test_cast_timestamp_with_timezone_3() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T07:00:00.123456789"),
Some("2010-01-01T07:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T07:00:00.123456+07:00", result.value(0));
assert_eq!("2010-01-01T07:00:00.123456+07:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Second, Some("-08:00".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("1999-12-31T16:00:00-08:00", result.value(0));
assert_eq!("2009-12-31T16:00:00-08:00", result.value(1));
assert!(result.is_null(2));
}

#[test]
fn test_cast_date64_to_timestamp() {
let array =
Expand Down
92 changes: 92 additions & 0 deletions arrow/tests/array_cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use arrow_array::builder::{
PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder,
};
use arrow_array::cast::AsArray;
use arrow_array::types::{
ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type,
Int64Type, Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type,
Expand Down Expand Up @@ -64,6 +65,97 @@ fn test_cast_timestamp_to_string() {
assert!(c.is_null(2));
}

// See: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones for list of valid
// timezones

// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone))
#[test]
fn test_cast_timestamp_with_timezone_daylight_1() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
// This is winter in New York so daylight saving is not in effect
// UTC offset is -05:00
Some("2000-01-01T00:00:00.123456789"),
// This is summer in New York so daylight saving is in effect
// UTC offset is -04:00
Some("2010-07-01T00:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&string_array, &to_type).unwrap();

let to_type =
DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0));
assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None)
#[test]
fn test_cast_timestamp_with_timezone_daylight_2() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T07:00:00.123456789"),
Some("2010-07-01T07:00:00.123456789"),
None,
]));
let to_type =
DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T07:00:00.123-05:00", result.value(0));
assert_eq!("2010-07-01T07:00:00.123-04:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T12:00:00.123", result.value(0));
assert_eq!("2010-07-01T11:00:00.123", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone))
#[test]
fn test_cast_timestamp_with_timezone_daylight_3() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
// Winter in New York, summer in Sydney
// UTC offset is -05:00 (New York) and +11:00 (Sydney)
Some("2000-01-01T00:00:00.123456789"),
// Summer in New York, winter in Sydney
// UTC offset is -04:00 (New York) and +10:00 (Sydney)
Some("2010-07-01T00:00:00.123456789"),
None,
]));
let to_type =
DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0));
assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Second, Some("Australia/Sydney".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_string::<i32>();
assert_eq!("2000-01-01T16:00:00+11:00", result.value(0));
assert_eq!("2010-07-01T14:00:00+10:00", result.value(1));
assert!(result.is_null(2));
}

#[test]
#[cfg_attr(miri, ignore)] // running forever
fn test_can_cast_types() {
Expand Down