Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect cast Timestamp with Timezone #4201

Merged
merged 8 commits into from
May 12, 2023
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 243 additions & 3 deletions arrow-cast/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
//! assert_eq!(7.0, c.value(2));
//! ```

use chrono::{NaiveTime, TimeZone, Timelike, Utc};
use chrono::{NaiveTime, Offset, TimeZone, Timelike, Utc};
use std::cmp::Ordering;
use std::sync::Arc;

Expand Down Expand Up @@ -1770,7 +1770,7 @@ pub fn cast_with_options(
tz.clone(),
)),

(Timestamp(from_unit, _), Timestamp(to_unit, to_tz)) => {
(Timestamp(from_unit, from_tz), Timestamp(to_unit, to_tz)) => {
let array = cast_with_options(array, &Int64, cast_options)?;
let time_array = array.as_primitive::<Int64Type>();
let from_size = time_unit_multiple(from_unit);
Expand All @@ -1792,8 +1792,52 @@ pub fn cast_with_options(
}
}
};
// Normalize timezone
let adjusted = match (from_tz, to_tz) {
// Only this case needs to be adjusted because we're casting from
// unknown time offset to some time offset, we want the time to be
// unchanged.
//
// i.e. Timestamp('2001-01-01T00:00', None) -> Timestamp('2001-01-01T00:00', '+0700')
(None, Some(to_tz)) => {
let to_tz: Tz = to_tz.parse()?;
match to_unit {
TimeUnit::Second => {
adjust_timestamp_to_timezone::<TimestampSecondType>(
converted,
&to_tz,
&cast_options,
)?
}
TimeUnit::Millisecond => {
adjust_timestamp_to_timezone::<TimestampMillisecondType>(
converted,
&to_tz,
&cast_options,
)?
}
TimeUnit::Microsecond => {
adjust_timestamp_to_timezone::<TimestampMicrosecondType>(
converted,
&to_tz,
&cast_options,
)?
}
TimeUnit::Nanosecond => {
adjust_timestamp_to_timezone::<TimestampNanosecondType>(
converted,
&to_tz,
&cast_options,
)?
}
}
}
_ => {
tustvold marked this conversation as resolved.
Show resolved Hide resolved
converted
}
};
Ok(make_timestamp_array(
&converted,
&adjusted,
to_unit.clone(),
to_tz.clone(),
))
Expand Down Expand Up @@ -3005,6 +3049,34 @@ fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
Ok(Arc::new(interval_array) as ArrayRef)
}

fn adjust_timestamp_to_timezone<
T: ArrowTimestampType,
>(
array: PrimitiveArray<Int64Type>,
to_tz: &Tz,
cast_options: &CastOptions,
) -> Result<PrimitiveArray<Int64Type>, ArrowError> {
let adjusted = if cast_options.safe {
array.unary_opt::<_, Int64Type>(|o| {
let local = as_datetime::<T>(o)?;
let offset = to_tz.offset_from_local_datetime(&local).single()?;
T::make_value(local - offset.fix())
})
} else {
let error_fn = || {
ArrowError::CastError(
"Cannot cast timezone to different timezone".to_string(),
)
};
array.try_unary::<_, Int64Type, _>(|o| {
let local = as_datetime::<T>(o).ok_or_else(error_fn)?;
let offset = to_tz.offset_from_local_datetime(&local).single().ok_or_else(error_fn)?;
T::make_value(local - offset.fix()).ok_or_else(error_fn)
})?
};
aprimadi marked this conversation as resolved.
Show resolved Hide resolved
Ok(adjusted)
}

/// Casts Utf8 to Boolean
fn cast_utf8_to_boolean<OffsetSize>(
from: &dyn Array,
Expand Down Expand Up @@ -5978,6 +6050,174 @@ mod tests {
assert!(b.is_err());
}

// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone))
aprimadi marked this conversation as resolved.
Show resolved Hide resolved
#[test]
fn test_cast_timestamp_with_timezone_1() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T00:00:00.123456789"),
Some("2010-01-01T00:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&string_array, &to_type).unwrap();

let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
aprimadi marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!("2000-01-01T00:00:00.123456+07:00", result.value(0));
assert_eq!("2010-01-01T00:00:00.123456+07:00", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None)
#[test]
fn test_cast_timestamp_with_timezone_2() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T07:00:00.123456789"),
Some("2010-01-01T07:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Millisecond, Some("+0700".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T07:00:00.123+07:00", result.value(0));
assert_eq!("2010-01-01T07:00:00.123+07:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T00:00:00.123", result.value(0));
assert_eq!("2010-01-01T00:00:00.123", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone))
#[test]
fn test_cast_timestamp_with_timezone_3() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T07:00:00.123456789"),
Some("2010-01-01T07:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("+0700".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T07:00:00.123456+07:00", result.value(0));
assert_eq!("2010-01-01T07:00:00.123456+07:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Second, Some("-08:00".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("1999-12-31T16:00:00-08:00", result.value(0));
assert_eq!("2009-12-31T16:00:00-08:00", result.value(1));
assert!(result.is_null(2));
}

// See: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones for list of valid
// timezones

// Cast Timestamp(_, None) -> Timestamp(_, Some(timezone))
#[cfg(feature = "chrono-tz")]
aprimadi marked this conversation as resolved.
Show resolved Hide resolved
#[test]
fn test_cast_timestamp_with_timezone_daylight_1() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
// This is winter in New York so daylight saving is not in effect
// UTC offset is -05:00
Some("2000-01-01T00:00:00.123456789"),
// This is summer in New York so daylight saving is in effect
// UTC offset is -04:00
Some("2010-07-01T00:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&string_array, &to_type).unwrap();

let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0));
assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, None)
#[cfg(feature = "chrono-tz")]
#[test]
fn test_cast_timestamp_with_timezone_daylight_2() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
Some("2000-01-01T07:00:00.123456789"),
Some("2010-07-01T07:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T07:00:00.123-05:00", result.value(0));
assert_eq!("2010-07-01T07:00:00.123-04:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T12:00:00.123", result.value(0));
tustvold marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!("2010-07-01T11:00:00.123", result.value(1));
assert!(result.is_null(2));
}

// Cast Timestamp(_, Some(timezone)) -> Timestamp(_, Some(timezone))
#[cfg(feature = "chrono-tz")]
#[test]
fn test_cast_timestamp_with_timezone_daylight_3() {
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(vec![
// Winter in New York, summer in Sydney
// UTC offset is -05:00 (New York) and +11:00 (Sydney)
Some("2000-01-01T00:00:00.123456789"),
// Summer in New York, winter in Sydney
// UTC offset is -04:00 (New York) and +10:00 (Sydney)
Some("2010-07-01T00:00:00.123456789"),
None,
]));
let to_type = DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
let timestamp_array = cast(&string_array, &to_type).unwrap();

// Check intermediate representation is correct
let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T00:00:00.123456-05:00", result.value(0));
assert_eq!("2010-07-01T00:00:00.123456-04:00", result.value(1));
assert!(result.is_null(2));

let to_type = DataType::Timestamp(TimeUnit::Second, Some("Australia/Sydney".into()));
let timestamp_array = cast(&timestamp_array, &to_type).unwrap();

let string_array = cast(&timestamp_array, &DataType::Utf8).unwrap();
let result = string_array.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("2000-01-01T16:00:00+11:00", result.value(0));
assert_eq!("2010-07-01T14:00:00+10:00", result.value(1));
assert!(result.is_null(2));
}

#[test]
fn test_cast_date64_to_timestamp() {
let array =
Expand Down