|
35 | 35 | //! assert_eq!(7.0, c.value(2));
|
36 | 36 | //! ```
|
37 | 37 |
|
38 |
| -use chrono::{NaiveTime, Timelike}; |
| 38 | +use chrono::{NaiveTime, TimeZone, Timelike, Utc}; |
39 | 39 | use std::cmp::Ordering;
|
40 | 40 | use std::sync::Arc;
|
41 | 41 |
|
42 | 42 | use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions};
|
43 | 43 | use crate::parse::{
|
44 | 44 | parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month,
|
45 |
| - string_to_timestamp_nanos, |
| 45 | + string_to_datetime, |
46 | 46 | };
|
47 | 47 | use arrow_array::{
|
48 | 48 | builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *,
|
@@ -1233,16 +1233,16 @@ pub fn cast_with_options(
|
1233 | 1233 | cast_string_to_time64nanosecond::<i64>(array, cast_options)
|
1234 | 1234 | }
|
1235 | 1235 | Timestamp(TimeUnit::Second, to_tz) => {
|
1236 |
| - cast_string_to_timestamp::<i64, TimestampSecondType>(array, to_tz,cast_options) |
| 1236 | + cast_string_to_timestamp::<i64, TimestampSecondType>(array, to_tz, cast_options) |
1237 | 1237 | }
|
1238 | 1238 | Timestamp(TimeUnit::Millisecond, to_tz) => {
|
1239 |
| - cast_string_to_timestamp::<i64, TimestampMillisecondType>(array, to_tz,cast_options) |
| 1239 | + cast_string_to_timestamp::<i64, TimestampMillisecondType>(array, to_tz, cast_options) |
1240 | 1240 | }
|
1241 | 1241 | Timestamp(TimeUnit::Microsecond, to_tz) => {
|
1242 |
| - cast_string_to_timestamp::<i64, TimestampMicrosecondType>(array, to_tz,cast_options) |
| 1242 | + cast_string_to_timestamp::<i64, TimestampMicrosecondType>(array, to_tz, cast_options) |
1243 | 1243 | }
|
1244 | 1244 | Timestamp(TimeUnit::Nanosecond, to_tz) => {
|
1245 |
| - cast_string_to_timestamp::<i64, TimestampNanosecondType>(array, to_tz,cast_options) |
| 1245 | + cast_string_to_timestamp::<i64, TimestampNanosecondType>(array, to_tz, cast_options) |
1246 | 1246 | }
|
1247 | 1247 | Interval(IntervalUnit::YearMonth) => {
|
1248 | 1248 | cast_string_to_year_month_interval::<i64>(array, cast_options)
|
@@ -2653,59 +2653,67 @@ fn cast_string_to_time64nanosecond<Offset: OffsetSizeTrait>(
|
2653 | 2653 | }
|
2654 | 2654 |
|
2655 | 2655 | /// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
|
2656 |
| -fn cast_string_to_timestamp< |
2657 |
| - Offset: OffsetSizeTrait, |
2658 |
| - TimestampType: ArrowTimestampType<Native = i64>, |
2659 |
| ->( |
| 2656 | +fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>( |
2660 | 2657 | array: &dyn Array,
|
2661 | 2658 | to_tz: &Option<Arc<str>>,
|
2662 | 2659 | cast_options: &CastOptions,
|
2663 | 2660 | ) -> Result<ArrayRef, ArrowError> {
|
2664 |
| - let string_array = array |
2665 |
| - .as_any() |
2666 |
| - .downcast_ref::<GenericStringArray<Offset>>() |
2667 |
| - .unwrap(); |
2668 |
| - |
2669 |
| - let scale_factor = match TimestampType::UNIT { |
2670 |
| - TimeUnit::Second => 1_000_000_000, |
2671 |
| - TimeUnit::Millisecond => 1_000_000, |
2672 |
| - TimeUnit::Microsecond => 1_000, |
2673 |
| - TimeUnit::Nanosecond => 1, |
| 2661 | + let array = array.as_string::<O>(); |
| 2662 | + let out: PrimitiveArray<T> = match to_tz { |
| 2663 | + Some(tz) => { |
| 2664 | + let tz: Tz = tz.as_ref().parse()?; |
| 2665 | + cast_string_to_timestamp_impl(array, &tz, cast_options)? |
| 2666 | + } |
| 2667 | + None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?, |
2674 | 2668 | };
|
| 2669 | + Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) |
| 2670 | +} |
2675 | 2671 |
|
2676 |
| - let array = if cast_options.safe { |
2677 |
| - let iter = string_array.iter().map(|v| { |
2678 |
| - v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor)) |
| 2672 | +fn cast_string_to_timestamp_impl< |
| 2673 | + O: OffsetSizeTrait, |
| 2674 | + T: ArrowTimestampType, |
| 2675 | + Tz: TimeZone, |
| 2676 | +>( |
| 2677 | + array: &GenericStringArray<O>, |
| 2678 | + tz: &Tz, |
| 2679 | + cast_options: &CastOptions, |
| 2680 | +) -> Result<PrimitiveArray<T>, ArrowError> { |
| 2681 | + if cast_options.safe { |
| 2682 | + let iter = array.iter().map(|v| { |
| 2683 | + v.and_then(|v| { |
| 2684 | + let naive = string_to_datetime(tz, v).ok()?.naive_utc(); |
| 2685 | + T::make_value(naive) |
| 2686 | + }) |
2679 | 2687 | });
|
2680 | 2688 | // Benefit:
|
2681 | 2689 | // 20% performance improvement
|
2682 | 2690 | // Soundness:
|
2683 | 2691 | // The iterator is trustedLen because it comes from an `StringArray`.
|
2684 | 2692 |
|
2685 |
| - unsafe { |
2686 |
| - PrimitiveArray::<TimestampType>::from_trusted_len_iter(iter) |
2687 |
| - .with_timezone_opt(to_tz.clone()) |
2688 |
| - } |
| 2693 | + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) }) |
2689 | 2694 | } else {
|
2690 |
| - let vec = string_array |
| 2695 | + let vec = array |
2691 | 2696 | .iter()
|
2692 | 2697 | .map(|v| {
|
2693 |
| - v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor)) |
2694 |
| - .transpose() |
| 2698 | + v.map(|v| { |
| 2699 | + let naive = string_to_datetime(tz, v)?.naive_utc(); |
| 2700 | + T::make_value(naive).ok_or_else(|| { |
| 2701 | + ArrowError::CastError(format!( |
| 2702 | + "Overflow converting {naive} to {:?}", |
| 2703 | + T::UNIT |
| 2704 | + )) |
| 2705 | + }) |
| 2706 | + }) |
| 2707 | + .transpose() |
2695 | 2708 | })
|
2696 | 2709 | .collect::<Result<Vec<Option<i64>>, _>>()?;
|
2697 | 2710 |
|
2698 | 2711 | // Benefit:
|
2699 | 2712 | // 20% performance improvement
|
2700 | 2713 | // Soundness:
|
2701 | 2714 | // The iterator is trustedLen because it comes from an `StringArray`.
|
2702 |
| - unsafe { |
2703 |
| - PrimitiveArray::<TimestampType>::from_trusted_len_iter(vec.iter()) |
2704 |
| - .with_timezone_opt(to_tz.clone()) |
2705 |
| - } |
2706 |
| - }; |
2707 |
| - |
2708 |
| - Ok(Arc::new(array) as ArrayRef) |
| 2715 | + Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) }) |
| 2716 | + } |
2709 | 2717 | }
|
2710 | 2718 |
|
2711 | 2719 | fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
|
@@ -5018,6 +5026,14 @@ mod tests {
|
5018 | 5026 | }
|
5019 | 5027 | }
|
5020 | 5028 |
|
| 5029 | + #[test] |
| 5030 | + fn test_cast_string_to_timestamp_overflow() { |
| 5031 | + let array = StringArray::from(vec!["9800-09-08T12:00:00.123456789"]); |
| 5032 | + let result = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); |
| 5033 | + let result = result.as_primitive::<TimestampSecondType>(); |
| 5034 | + assert_eq!(result.values(), &[247112596800]); |
| 5035 | + } |
| 5036 | + |
5021 | 5037 | #[test]
|
5022 | 5038 | fn test_cast_string_to_date32() {
|
5023 | 5039 | let a1 = Arc::new(StringArray::from(vec![
|
@@ -8079,24 +8095,45 @@ mod tests {
|
8079 | 8095 | let array = Arc::new(valid) as ArrayRef;
|
8080 | 8096 | let b = cast_with_options(
|
8081 | 8097 | &array,
|
8082 |
| - &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)), |
| 8098 | + &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.clone())), |
8083 | 8099 | &CastOptions { safe: false },
|
8084 | 8100 | )
|
8085 | 8101 | .unwrap();
|
8086 | 8102 |
|
8087 |
| - let c = b |
8088 |
| - .as_any() |
8089 |
| - .downcast_ref::<TimestampNanosecondArray>() |
8090 |
| - .unwrap(); |
8091 |
| - assert_eq!(1672574706789000000, c.value(0)); |
8092 |
| - assert_eq!(1672571106789000000, c.value(1)); |
8093 |
| - assert_eq!(1672574706789000000, c.value(2)); |
8094 |
| - assert_eq!(1672574706789000000, c.value(3)); |
8095 |
| - assert_eq!(1672518906000000000, c.value(4)); |
8096 |
| - assert_eq!(1672518906000000000, c.value(5)); |
8097 |
| - assert_eq!(1672545906789000000, c.value(6)); |
8098 |
| - assert_eq!(1672545906000000000, c.value(7)); |
8099 |
| - assert_eq!(1672531200000000000, c.value(8)); |
| 8103 | + let tz = tz.as_ref().parse().unwrap(); |
| 8104 | + |
| 8105 | + let as_tz = |v: i64| { |
| 8106 | + as_datetime_with_timezone::<TimestampNanosecondType>(v, tz).unwrap() |
| 8107 | + }; |
| 8108 | + |
| 8109 | + let as_utc = |v: &i64| as_tz(*v).naive_utc().to_string(); |
| 8110 | + let as_local = |v: &i64| as_tz(*v).naive_local().to_string(); |
| 8111 | + |
| 8112 | + let values = b.as_primitive::<TimestampNanosecondType>().values(); |
| 8113 | + let utc_results: Vec<_> = values.iter().map(as_utc).collect(); |
| 8114 | + let local_results: Vec<_> = values.iter().map(as_local).collect(); |
| 8115 | + |
| 8116 | + // Absolute timestamps should be parsed preserving the same UTC instant |
| 8117 | + assert_eq!( |
| 8118 | + &utc_results[..6], |
| 8119 | + &[ |
| 8120 | + "2023-01-01 12:05:06.789".to_string(), |
| 8121 | + "2023-01-01 11:05:06.789".to_string(), |
| 8122 | + "2023-01-01 12:05:06.789".to_string(), |
| 8123 | + "2023-01-01 12:05:06.789".to_string(), |
| 8124 | + "2022-12-31 20:35:06".to_string(), |
| 8125 | + "2022-12-31 20:35:06".to_string(), |
| 8126 | + ] |
| 8127 | + ); |
| 8128 | + // Non-absolute timestamps should be parsed preserving the same local instant |
| 8129 | + assert_eq!( |
| 8130 | + &local_results[6..], |
| 8131 | + &[ |
| 8132 | + "2023-01-01 04:05:06.789".to_string(), |
| 8133 | + "2023-01-01 04:05:06".to_string(), |
| 8134 | + "2023-01-01 00:00:00".to_string() |
| 8135 | + ] |
| 8136 | + ) |
8100 | 8137 | }
|
8101 | 8138 |
|
8102 | 8139 | test_tz("+00:00".into());
|
|
0 commit comments