Skip to content

Commit bc15cbd

Browse files
authored
Fix timestamp handling in cast kernel (#1936) (#4033) (#4034)
1 parent d946cc4 commit bc15cbd

File tree

2 files changed

+113
-52
lines changed

2 files changed

+113
-52
lines changed

arrow-array/src/types.rs

+25-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use arrow_schema::{
2626
DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
2727
DECIMAL_DEFAULT_SCALE,
2828
};
29-
use chrono::{Duration, NaiveDate};
29+
use chrono::{Duration, NaiveDate, NaiveDateTime};
3030
use half::f16;
3131
use std::marker::PhantomData;
3232
use std::ops::{Add, Sub};
@@ -311,19 +311,43 @@ pub trait ArrowTimestampType: ArrowTemporalType<Native = i64> {
311311
fn get_time_unit() -> TimeUnit {
312312
Self::UNIT
313313
}
314+
315+
/// Creates a ArrowTimestampType::Native from the provided [`NaiveDateTime`]
316+
///
317+
/// See [`DataType::Timestamp`] for more information on timezone handling
318+
fn make_value(naive: NaiveDateTime) -> Option<i64>;
314319
}
315320

316321
impl ArrowTimestampType for TimestampSecondType {
317322
const UNIT: TimeUnit = TimeUnit::Second;
323+
324+
fn make_value(naive: NaiveDateTime) -> Option<i64> {
325+
Some(naive.timestamp())
326+
}
318327
}
319328
impl ArrowTimestampType for TimestampMillisecondType {
320329
const UNIT: TimeUnit = TimeUnit::Millisecond;
330+
331+
fn make_value(naive: NaiveDateTime) -> Option<i64> {
332+
let millis = naive.timestamp().checked_mul(1_000)?;
333+
millis.checked_add(naive.timestamp_subsec_millis() as i64)
334+
}
321335
}
322336
impl ArrowTimestampType for TimestampMicrosecondType {
323337
const UNIT: TimeUnit = TimeUnit::Microsecond;
338+
339+
fn make_value(naive: NaiveDateTime) -> Option<i64> {
340+
let micros = naive.timestamp().checked_mul(1_000_000)?;
341+
micros.checked_add(naive.timestamp_subsec_micros() as i64)
342+
}
324343
}
325344
impl ArrowTimestampType for TimestampNanosecondType {
326345
const UNIT: TimeUnit = TimeUnit::Nanosecond;
346+
347+
fn make_value(naive: NaiveDateTime) -> Option<i64> {
348+
let nanos = naive.timestamp().checked_mul(1_000_000_000)?;
349+
nanos.checked_add(naive.timestamp_subsec_nanos() as i64)
350+
}
327351
}
328352

329353
impl IntervalYearMonthType {

arrow-cast/src/cast.rs

+88-51
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@
3535
//! assert_eq!(7.0, c.value(2));
3636
//! ```
3737
38-
use chrono::{NaiveTime, Timelike};
38+
use chrono::{NaiveTime, TimeZone, Timelike, Utc};
3939
use std::cmp::Ordering;
4040
use std::sync::Arc;
4141

4242
use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions};
4343
use crate::parse::{
4444
parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month,
45-
string_to_timestamp_nanos,
45+
string_to_datetime,
4646
};
4747
use arrow_array::{
4848
builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *,
@@ -1233,16 +1233,16 @@ pub fn cast_with_options(
12331233
cast_string_to_time64nanosecond::<i64>(array, cast_options)
12341234
}
12351235
Timestamp(TimeUnit::Second, to_tz) => {
1236-
cast_string_to_timestamp::<i64, TimestampSecondType>(array, to_tz,cast_options)
1236+
cast_string_to_timestamp::<i64, TimestampSecondType>(array, to_tz, cast_options)
12371237
}
12381238
Timestamp(TimeUnit::Millisecond, to_tz) => {
1239-
cast_string_to_timestamp::<i64, TimestampMillisecondType>(array, to_tz,cast_options)
1239+
cast_string_to_timestamp::<i64, TimestampMillisecondType>(array, to_tz, cast_options)
12401240
}
12411241
Timestamp(TimeUnit::Microsecond, to_tz) => {
1242-
cast_string_to_timestamp::<i64, TimestampMicrosecondType>(array, to_tz,cast_options)
1242+
cast_string_to_timestamp::<i64, TimestampMicrosecondType>(array, to_tz, cast_options)
12431243
}
12441244
Timestamp(TimeUnit::Nanosecond, to_tz) => {
1245-
cast_string_to_timestamp::<i64, TimestampNanosecondType>(array, to_tz,cast_options)
1245+
cast_string_to_timestamp::<i64, TimestampNanosecondType>(array, to_tz, cast_options)
12461246
}
12471247
Interval(IntervalUnit::YearMonth) => {
12481248
cast_string_to_year_month_interval::<i64>(array, cast_options)
@@ -2653,59 +2653,67 @@ fn cast_string_to_time64nanosecond<Offset: OffsetSizeTrait>(
26532653
}
26542654

26552655
/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
2656-
fn cast_string_to_timestamp<
2657-
Offset: OffsetSizeTrait,
2658-
TimestampType: ArrowTimestampType<Native = i64>,
2659-
>(
2656+
fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
26602657
array: &dyn Array,
26612658
to_tz: &Option<Arc<str>>,
26622659
cast_options: &CastOptions,
26632660
) -> Result<ArrayRef, ArrowError> {
2664-
let string_array = array
2665-
.as_any()
2666-
.downcast_ref::<GenericStringArray<Offset>>()
2667-
.unwrap();
2668-
2669-
let scale_factor = match TimestampType::UNIT {
2670-
TimeUnit::Second => 1_000_000_000,
2671-
TimeUnit::Millisecond => 1_000_000,
2672-
TimeUnit::Microsecond => 1_000,
2673-
TimeUnit::Nanosecond => 1,
2661+
let array = array.as_string::<O>();
2662+
let out: PrimitiveArray<T> = match to_tz {
2663+
Some(tz) => {
2664+
let tz: Tz = tz.as_ref().parse()?;
2665+
cast_string_to_timestamp_impl(array, &tz, cast_options)?
2666+
}
2667+
None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
26742668
};
2669+
Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
2670+
}
26752671

2676-
let array = if cast_options.safe {
2677-
let iter = string_array.iter().map(|v| {
2678-
v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor))
2672+
fn cast_string_to_timestamp_impl<
2673+
O: OffsetSizeTrait,
2674+
T: ArrowTimestampType,
2675+
Tz: TimeZone,
2676+
>(
2677+
array: &GenericStringArray<O>,
2678+
tz: &Tz,
2679+
cast_options: &CastOptions,
2680+
) -> Result<PrimitiveArray<T>, ArrowError> {
2681+
if cast_options.safe {
2682+
let iter = array.iter().map(|v| {
2683+
v.and_then(|v| {
2684+
let naive = string_to_datetime(tz, v).ok()?.naive_utc();
2685+
T::make_value(naive)
2686+
})
26792687
});
26802688
// Benefit:
26812689
// 20% performance improvement
26822690
// Soundness:
26832691
// The iterator is trustedLen because it comes from an `StringArray`.
26842692

2685-
unsafe {
2686-
PrimitiveArray::<TimestampType>::from_trusted_len_iter(iter)
2687-
.with_timezone_opt(to_tz.clone())
2688-
}
2693+
Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
26892694
} else {
2690-
let vec = string_array
2695+
let vec = array
26912696
.iter()
26922697
.map(|v| {
2693-
v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor))
2694-
.transpose()
2698+
v.map(|v| {
2699+
let naive = string_to_datetime(tz, v)?.naive_utc();
2700+
T::make_value(naive).ok_or_else(|| {
2701+
ArrowError::CastError(format!(
2702+
"Overflow converting {naive} to {:?}",
2703+
T::UNIT
2704+
))
2705+
})
2706+
})
2707+
.transpose()
26952708
})
26962709
.collect::<Result<Vec<Option<i64>>, _>>()?;
26972710

26982711
// Benefit:
26992712
// 20% performance improvement
27002713
// Soundness:
27012714
// The iterator is trustedLen because it comes from an `StringArray`.
2702-
unsafe {
2703-
PrimitiveArray::<TimestampType>::from_trusted_len_iter(vec.iter())
2704-
.with_timezone_opt(to_tz.clone())
2705-
}
2706-
};
2707-
2708-
Ok(Arc::new(array) as ArrayRef)
2715+
Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
2716+
}
27092717
}
27102718

27112719
fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
@@ -5018,6 +5026,14 @@ mod tests {
50185026
}
50195027
}
50205028

5029+
#[test]
5030+
fn test_cast_string_to_timestamp_overflow() {
5031+
let array = StringArray::from(vec!["9800-09-08T12:00:00.123456789"]);
5032+
let result = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap();
5033+
let result = result.as_primitive::<TimestampSecondType>();
5034+
assert_eq!(result.values(), &[247112596800]);
5035+
}
5036+
50215037
#[test]
50225038
fn test_cast_string_to_date32() {
50235039
let a1 = Arc::new(StringArray::from(vec![
@@ -8079,24 +8095,45 @@ mod tests {
80798095
let array = Arc::new(valid) as ArrayRef;
80808096
let b = cast_with_options(
80818097
&array,
8082-
&DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)),
8098+
&DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.clone())),
80838099
&CastOptions { safe: false },
80848100
)
80858101
.unwrap();
80868102

8087-
let c = b
8088-
.as_any()
8089-
.downcast_ref::<TimestampNanosecondArray>()
8090-
.unwrap();
8091-
assert_eq!(1672574706789000000, c.value(0));
8092-
assert_eq!(1672571106789000000, c.value(1));
8093-
assert_eq!(1672574706789000000, c.value(2));
8094-
assert_eq!(1672574706789000000, c.value(3));
8095-
assert_eq!(1672518906000000000, c.value(4));
8096-
assert_eq!(1672518906000000000, c.value(5));
8097-
assert_eq!(1672545906789000000, c.value(6));
8098-
assert_eq!(1672545906000000000, c.value(7));
8099-
assert_eq!(1672531200000000000, c.value(8));
8103+
let tz = tz.as_ref().parse().unwrap();
8104+
8105+
let as_tz = |v: i64| {
8106+
as_datetime_with_timezone::<TimestampNanosecondType>(v, tz).unwrap()
8107+
};
8108+
8109+
let as_utc = |v: &i64| as_tz(*v).naive_utc().to_string();
8110+
let as_local = |v: &i64| as_tz(*v).naive_local().to_string();
8111+
8112+
let values = b.as_primitive::<TimestampNanosecondType>().values();
8113+
let utc_results: Vec<_> = values.iter().map(as_utc).collect();
8114+
let local_results: Vec<_> = values.iter().map(as_local).collect();
8115+
8116+
// Absolute timestamps should be parsed preserving the same UTC instant
8117+
assert_eq!(
8118+
&utc_results[..6],
8119+
&[
8120+
"2023-01-01 12:05:06.789".to_string(),
8121+
"2023-01-01 11:05:06.789".to_string(),
8122+
"2023-01-01 12:05:06.789".to_string(),
8123+
"2023-01-01 12:05:06.789".to_string(),
8124+
"2022-12-31 20:35:06".to_string(),
8125+
"2022-12-31 20:35:06".to_string(),
8126+
]
8127+
);
8128+
// Non-absolute timestamps should be parsed preserving the same local instant
8129+
assert_eq!(
8130+
&local_results[6..],
8131+
&[
8132+
"2023-01-01 04:05:06.789".to_string(),
8133+
"2023-01-01 04:05:06".to_string(),
8134+
"2023-01-01 00:00:00".to_string()
8135+
]
8136+
)
81008137
}
81018138

81028139
test_tz("+00:00".into());

0 commit comments

Comments
 (0)