diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 827729ca682e..e2d7a2492227 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -26,7 +26,7 @@ use arrow_schema::{ DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; -use chrono::{Duration, NaiveDate}; +use chrono::{Duration, NaiveDate, NaiveDateTime}; use half::f16; use std::marker::PhantomData; use std::ops::{Add, Sub}; @@ -311,19 +311,43 @@ pub trait ArrowTimestampType: ArrowTemporalType { fn get_time_unit() -> TimeUnit { Self::UNIT } + + /// Creates a ArrowTimestampType::Native from the provided [`NaiveDateTime`] + /// + /// See [`DataType::Timestamp`] for more information on timezone handling + fn make_value(naive: NaiveDateTime) -> Option; } impl ArrowTimestampType for TimestampSecondType { const UNIT: TimeUnit = TimeUnit::Second; + + fn make_value(naive: NaiveDateTime) -> Option { + Some(naive.timestamp()) + } } impl ArrowTimestampType for TimestampMillisecondType { const UNIT: TimeUnit = TimeUnit::Millisecond; + + fn make_value(naive: NaiveDateTime) -> Option { + let millis = naive.timestamp().checked_mul(1_000)?; + millis.checked_add(naive.timestamp_subsec_millis() as i64) + } } impl ArrowTimestampType for TimestampMicrosecondType { const UNIT: TimeUnit = TimeUnit::Microsecond; + + fn make_value(naive: NaiveDateTime) -> Option { + let micros = naive.timestamp().checked_mul(1_000_000)?; + micros.checked_add(naive.timestamp_subsec_micros() as i64) + } } impl ArrowTimestampType for TimestampNanosecondType { const UNIT: TimeUnit = TimeUnit::Nanosecond; + + fn make_value(naive: NaiveDateTime) -> Option { + let nanos = naive.timestamp().checked_mul(1_000_000_000)?; + nanos.checked_add(naive.timestamp_subsec_nanos() as i64) + } } impl IntervalYearMonthType { diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 372fcc1a3132..05b56a0e8d32 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -35,14 +35,14 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -use chrono::{NaiveTime, Timelike}; +use chrono::{NaiveTime, TimeZone, Timelike, Utc}; use std::cmp::Ordering; use std::sync::Arc; use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, - string_to_timestamp_nanos, + string_to_datetime, }; use arrow_array::{ builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, @@ -1233,16 +1233,16 @@ pub fn cast_with_options( cast_string_to_time64nanosecond::(array, cast_options) } Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Millisecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Timestamp(TimeUnit::Nanosecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz,cast_options) + cast_string_to_timestamp::(array, to_tz, cast_options) } Interval(IntervalUnit::YearMonth) => { cast_string_to_year_month_interval::(array, cast_options) @@ -2653,45 +2653,58 @@ fn cast_string_to_time64nanosecond( } /// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) -fn cast_string_to_timestamp< - Offset: OffsetSizeTrait, - TimestampType: ArrowTimestampType, ->( +fn cast_string_to_timestamp( array: &dyn Array, to_tz: &Option>, cast_options: &CastOptions, ) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let scale_factor = match TimestampType::UNIT { - TimeUnit::Second => 1_000_000_000, - TimeUnit::Millisecond => 1_000_000, - TimeUnit::Microsecond => 1_000, - TimeUnit::Nanosecond => 1, + let array = array.as_string::(); + let out: PrimitiveArray = match to_tz { + Some(tz) => { + let tz: Tz = tz.as_ref().parse()?; + cast_string_to_timestamp_impl(array, &tz, cast_options)? + } + None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?, }; + Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) +} - let array = if cast_options.safe { - let iter = string_array.iter().map(|v| { - v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor)) +fn cast_string_to_timestamp_impl< + O: OffsetSizeTrait, + T: ArrowTimestampType, + Tz: TimeZone, +>( + array: &GenericStringArray, + tz: &Tz, + cast_options: &CastOptions, +) -> Result, ArrowError> { + if cast_options.safe { + let iter = array.iter().map(|v| { + v.and_then(|v| { + let naive = string_to_datetime(tz, v).ok()?.naive_utc(); + T::make_value(naive) + }) }); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { - PrimitiveArray::::from_trusted_len_iter(iter) - .with_timezone_opt(to_tz.clone()) - } + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) }) } else { - let vec = string_array + let vec = array .iter() .map(|v| { - v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor)) - .transpose() + v.map(|v| { + let naive = string_to_datetime(tz, v)?.naive_utc(); + T::make_value(naive).ok_or_else(|| { + ArrowError::CastError(format!( + "Overflow converting {naive} to {:?}", + T::UNIT + )) + }) + }) + .transpose() }) .collect::>, _>>()?; @@ -2699,13 +2712,8 @@ fn cast_string_to_timestamp< // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { - PrimitiveArray::::from_trusted_len_iter(vec.iter()) - .with_timezone_opt(to_tz.clone()) - } - }; - - Ok(Arc::new(array) as ArrayRef) + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) }) + } } fn cast_string_to_year_month_interval( @@ -5018,6 +5026,14 @@ mod tests { } } + #[test] + fn test_cast_string_to_timestamp_overflow() { + let array = StringArray::from(vec!["9800-09-08T12:00:00.123456789"]); + let result = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); + let result = result.as_primitive::(); + assert_eq!(result.values(), &[247112596800]); + } + #[test] fn test_cast_string_to_date32() { let a1 = Arc::new(StringArray::from(vec![ @@ -8079,24 +8095,45 @@ mod tests { let array = Arc::new(valid) as ArrayRef; let b = cast_with_options( &array, - &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)), + &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.clone())), &CastOptions { safe: false }, ) .unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1672574706789000000, c.value(0)); - assert_eq!(1672571106789000000, c.value(1)); - assert_eq!(1672574706789000000, c.value(2)); - assert_eq!(1672574706789000000, c.value(3)); - assert_eq!(1672518906000000000, c.value(4)); - assert_eq!(1672518906000000000, c.value(5)); - assert_eq!(1672545906789000000, c.value(6)); - assert_eq!(1672545906000000000, c.value(7)); - assert_eq!(1672531200000000000, c.value(8)); + let tz = tz.as_ref().parse().unwrap(); + + let as_tz = |v: i64| { + as_datetime_with_timezone::(v, tz).unwrap() + }; + + let as_utc = |v: &i64| as_tz(*v).naive_utc().to_string(); + let as_local = |v: &i64| as_tz(*v).naive_local().to_string(); + + let values = b.as_primitive::().values(); + let utc_results: Vec<_> = values.iter().map(as_utc).collect(); + let local_results: Vec<_> = values.iter().map(as_local).collect(); + + // Absolute timestamps should be parsed preserving the same UTC instant + assert_eq!( + &utc_results[..6], + &[ + "2023-01-01 12:05:06.789".to_string(), + "2023-01-01 11:05:06.789".to_string(), + "2023-01-01 12:05:06.789".to_string(), + "2023-01-01 12:05:06.789".to_string(), + "2022-12-31 20:35:06".to_string(), + "2022-12-31 20:35:06".to_string(), + ] + ); + // Non-absolute timestamps should be parsed preserving the same local instant + assert_eq!( + &local_results[6..], + &[ + "2023-01-01 04:05:06.789".to_string(), + "2023-01-01 04:05:06".to_string(), + "2023-01-01 00:00:00".to_string() + ] + ) } test_tz("+00:00".into());