Skip to content

Commit 1575815

Browse files
mhilton authored and Weijun-H committed
preserve array type / timezone in date_bin and date_trunc functions (apache#7729)
* preserve array type in date_bin and date_trunc functions

  The result type of date_bin and date_trunc never includes any timezone information. Change this such that the timezone of the resulting array from these functions is copied from the input array.

* Update datafusion/expr/src/built_in_function.rs

  Co-authored-by: Alex Huang <[email protected]>

* fix: syntax error

* fix: datafusion-cli cargo update

* review suggestions

  Add some additional tests suggested in code reviews.

* fix formatting

---------

Co-authored-by: Alex Huang <[email protected]>
1 parent fbb691e commit 1575815

File tree

3 files changed

+355
-17
lines changed

3 files changed

+355
-17
lines changed

datafusion/expr/src/built_in_function.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -618,13 +618,20 @@ impl BuiltinScalarFunction {
618618
BuiltinScalarFunction::ConcatWithSeparator => Ok(Utf8),
619619
BuiltinScalarFunction::DatePart => Ok(Float64),
620620
BuiltinScalarFunction::DateBin | BuiltinScalarFunction::DateTrunc => {
621-
match input_expr_types[1] {
622-
Timestamp(Nanosecond, _) | Utf8 | Null => {
621+
match &input_expr_types[1] {
622+
Timestamp(Nanosecond, None) | Utf8 | Null => {
623623
Ok(Timestamp(Nanosecond, None))
624624
}
625-
Timestamp(Microsecond, _) => Ok(Timestamp(Microsecond, None)),
626-
Timestamp(Millisecond, _) => Ok(Timestamp(Millisecond, None)),
627-
Timestamp(Second, _) => Ok(Timestamp(Second, None)),
625+
Timestamp(Nanosecond, tz_opt) => {
626+
Ok(Timestamp(Nanosecond, tz_opt.clone()))
627+
}
628+
Timestamp(Microsecond, tz_opt) => {
629+
Ok(Timestamp(Microsecond, tz_opt.clone()))
630+
}
631+
Timestamp(Millisecond, tz_opt) => {
632+
Ok(Timestamp(Millisecond, tz_opt.clone()))
633+
}
634+
Timestamp(Second, tz_opt) => Ok(Timestamp(Second, tz_opt.clone())),
628635
_ => plan_err!(
629636
"The {self} function can only accept timestamp as the second arg."
630637
),

datafusion/physical-expr/src/datetime_expressions.rs

Lines changed: 287 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,8 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result<ColumnarValue> {
433433
granularity.as_str(),
434434
)
435435
})
436-
.collect::<Result<TimestampSecondArray>>()?;
436+
.collect::<Result<TimestampSecondArray>>()?
437+
.with_timezone_opt(tz_opt.clone());
437438
ColumnarValue::Array(Arc::new(array))
438439
}
439440
DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => {
@@ -449,7 +450,8 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result<ColumnarValue> {
449450
granularity.as_str(),
450451
)
451452
})
452-
.collect::<Result<TimestampMillisecondArray>>()?;
453+
.collect::<Result<TimestampMillisecondArray>>()?
454+
.with_timezone_opt(tz_opt.clone());
453455
ColumnarValue::Array(Arc::new(array))
454456
}
455457
DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => {
@@ -465,7 +467,25 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result<ColumnarValue> {
465467
granularity.as_str(),
466468
)
467469
})
468-
.collect::<Result<TimestampMicrosecondArray>>()?;
470+
.collect::<Result<TimestampMicrosecondArray>>()?
471+
.with_timezone_opt(tz_opt.clone());
472+
ColumnarValue::Array(Arc::new(array))
473+
}
474+
DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => {
475+
let parsed_tz = parse_tz(tz_opt)?;
476+
let array = as_timestamp_nanosecond_array(array)?;
477+
let array = array
478+
.iter()
479+
.map(|x| {
480+
_date_trunc(
481+
TimeUnit::Nanosecond,
482+
&x,
483+
parsed_tz,
484+
granularity.as_str(),
485+
)
486+
})
487+
.collect::<Result<TimestampNanosecondArray>>()?
488+
.with_timezone_opt(tz_opt.clone());
469489
ColumnarValue::Array(Arc::new(array))
470490
}
471491
_ => {
@@ -713,35 +733,39 @@ fn date_bin_impl(
713733
))
714734
}
715735
ColumnarValue::Array(array) => match array.data_type() {
716-
DataType::Timestamp(TimeUnit::Nanosecond, _) => {
736+
DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => {
717737
let array = as_timestamp_nanosecond_array(array)?
718738
.iter()
719739
.map(f_nanos)
720-
.collect::<TimestampNanosecondArray>();
740+
.collect::<TimestampNanosecondArray>()
741+
.with_timezone_opt(tz_opt.clone());
721742

722743
ColumnarValue::Array(Arc::new(array))
723744
}
724-
DataType::Timestamp(TimeUnit::Microsecond, _) => {
745+
DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => {
725746
let array = as_timestamp_microsecond_array(array)?
726747
.iter()
727748
.map(f_micros)
728-
.collect::<TimestampMicrosecondArray>();
749+
.collect::<TimestampMicrosecondArray>()
750+
.with_timezone_opt(tz_opt.clone());
729751

730752
ColumnarValue::Array(Arc::new(array))
731753
}
732-
DataType::Timestamp(TimeUnit::Millisecond, _) => {
754+
DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => {
733755
let array = as_timestamp_millisecond_array(array)?
734756
.iter()
735757
.map(f_millis)
736-
.collect::<TimestampMillisecondArray>();
758+
.collect::<TimestampMillisecondArray>()
759+
.with_timezone_opt(tz_opt.clone());
737760

738761
ColumnarValue::Array(Arc::new(array))
739762
}
740-
DataType::Timestamp(TimeUnit::Second, _) => {
763+
DataType::Timestamp(TimeUnit::Second, tz_opt) => {
741764
let array = as_timestamp_second_array(array)?
742765
.iter()
743766
.map(f_secs)
744-
.collect::<TimestampSecondArray>();
767+
.collect::<TimestampSecondArray>()
768+
.with_timezone_opt(tz_opt.clone());
745769

746770
ColumnarValue::Array(Arc::new(array))
747771
}
@@ -925,7 +949,9 @@ where
925949
mod tests {
926950
use std::sync::Arc;
927951

928-
use arrow::array::{ArrayRef, Int64Array, IntervalDayTimeArray, StringBuilder};
952+
use arrow::array::{
953+
as_primitive_array, ArrayRef, Int64Array, IntervalDayTimeArray, StringBuilder,
954+
};
929955

930956
use super::*;
931957

@@ -1051,6 +1077,125 @@ mod tests {
10511077
});
10521078
}
10531079

1080+
#[test]
1081+
fn test_date_trunc_timezones() {
1082+
let cases = vec![
1083+
(
1084+
vec![
1085+
"2020-09-08T00:00:00Z",
1086+
"2020-09-08T01:00:00Z",
1087+
"2020-09-08T02:00:00Z",
1088+
"2020-09-08T03:00:00Z",
1089+
"2020-09-08T04:00:00Z",
1090+
],
1091+
Some("+00".into()),
1092+
vec![
1093+
"2020-09-08T00:00:00Z",
1094+
"2020-09-08T00:00:00Z",
1095+
"2020-09-08T00:00:00Z",
1096+
"2020-09-08T00:00:00Z",
1097+
"2020-09-08T00:00:00Z",
1098+
],
1099+
),
1100+
(
1101+
vec![
1102+
"2020-09-08T00:00:00Z",
1103+
"2020-09-08T01:00:00Z",
1104+
"2020-09-08T02:00:00Z",
1105+
"2020-09-08T03:00:00Z",
1106+
"2020-09-08T04:00:00Z",
1107+
],
1108+
None,
1109+
vec![
1110+
"2020-09-08T00:00:00Z",
1111+
"2020-09-08T00:00:00Z",
1112+
"2020-09-08T00:00:00Z",
1113+
"2020-09-08T00:00:00Z",
1114+
"2020-09-08T00:00:00Z",
1115+
],
1116+
),
1117+
(
1118+
vec![
1119+
"2020-09-08T00:00:00Z",
1120+
"2020-09-08T01:00:00Z",
1121+
"2020-09-08T02:00:00Z",
1122+
"2020-09-08T03:00:00Z",
1123+
"2020-09-08T04:00:00Z",
1124+
],
1125+
Some("-02".into()),
1126+
vec![
1127+
"2020-09-07T02:00:00Z",
1128+
"2020-09-07T02:00:00Z",
1129+
"2020-09-08T02:00:00Z",
1130+
"2020-09-08T02:00:00Z",
1131+
"2020-09-08T02:00:00Z",
1132+
],
1133+
),
1134+
(
1135+
vec![
1136+
"2020-09-08T00:00:00+05",
1137+
"2020-09-08T01:00:00+05",
1138+
"2020-09-08T02:00:00+05",
1139+
"2020-09-08T03:00:00+05",
1140+
"2020-09-08T04:00:00+05",
1141+
],
1142+
Some("+05".into()),
1143+
vec![
1144+
"2020-09-08T00:00:00+05",
1145+
"2020-09-08T00:00:00+05",
1146+
"2020-09-08T00:00:00+05",
1147+
"2020-09-08T00:00:00+05",
1148+
"2020-09-08T00:00:00+05",
1149+
],
1150+
),
1151+
(
1152+
vec![
1153+
"2020-09-08T00:00:00+08",
1154+
"2020-09-08T01:00:00+08",
1155+
"2020-09-08T02:00:00+08",
1156+
"2020-09-08T03:00:00+08",
1157+
"2020-09-08T04:00:00+08",
1158+
],
1159+
Some("+08".into()),
1160+
vec![
1161+
"2020-09-08T00:00:00+08",
1162+
"2020-09-08T00:00:00+08",
1163+
"2020-09-08T00:00:00+08",
1164+
"2020-09-08T00:00:00+08",
1165+
"2020-09-08T00:00:00+08",
1166+
],
1167+
),
1168+
];
1169+
1170+
cases.iter().for_each(|(original, tz_opt, expected)| {
1171+
let input = original
1172+
.iter()
1173+
.map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
1174+
.collect::<TimestampNanosecondArray>()
1175+
.with_timezone_opt(tz_opt.clone());
1176+
let right = expected
1177+
.iter()
1178+
.map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
1179+
.collect::<TimestampNanosecondArray>()
1180+
.with_timezone_opt(tz_opt.clone());
1181+
let result = date_trunc(&[
1182+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("day".to_string()))),
1183+
ColumnarValue::Array(Arc::new(input)),
1184+
])
1185+
.unwrap();
1186+
if let ColumnarValue::Array(result) = result {
1187+
assert_eq!(
1188+
result.data_type(),
1189+
&DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone())
1190+
);
1191+
let left = as_primitive_array::<TimestampNanosecondType>(&result);
1192+
assert_eq!(left, &right);
1193+
} else {
1194+
panic!("unexpected column type");
1195+
}
1196+
});
1197+
}
1198+
10541199
#[test]
10551200
fn test_date_bin_single() {
10561201
use chrono::Duration;
@@ -1252,6 +1397,136 @@ mod tests {
12521397
);
12531398
}
12541399

1400+
#[test]
1401+
fn test_date_bin_timezones() {
1402+
let cases = vec![
1403+
(
1404+
vec![
1405+
"2020-09-08T00:00:00Z",
1406+
"2020-09-08T01:00:00Z",
1407+
"2020-09-08T02:00:00Z",
1408+
"2020-09-08T03:00:00Z",
1409+
"2020-09-08T04:00:00Z",
1410+
],
1411+
Some("+00".into()),
1412+
"1970-01-01T00:00:00Z",
1413+
vec![
1414+
"2020-09-08T00:00:00Z",
1415+
"2020-09-08T00:00:00Z",
1416+
"2020-09-08T00:00:00Z",
1417+
"2020-09-08T00:00:00Z",
1418+
"2020-09-08T00:00:00Z",
1419+
],
1420+
),
1421+
(
1422+
vec![
1423+
"2020-09-08T00:00:00Z",
1424+
"2020-09-08T01:00:00Z",
1425+
"2020-09-08T02:00:00Z",
1426+
"2020-09-08T03:00:00Z",
1427+
"2020-09-08T04:00:00Z",
1428+
],
1429+
None,
1430+
"1970-01-01T00:00:00Z",
1431+
vec![
1432+
"2020-09-08T00:00:00Z",
1433+
"2020-09-08T00:00:00Z",
1434+
"2020-09-08T00:00:00Z",
1435+
"2020-09-08T00:00:00Z",
1436+
"2020-09-08T00:00:00Z",
1437+
],
1438+
),
1439+
(
1440+
vec![
1441+
"2020-09-08T00:00:00Z",
1442+
"2020-09-08T01:00:00Z",
1443+
"2020-09-08T02:00:00Z",
1444+
"2020-09-08T03:00:00Z",
1445+
"2020-09-08T04:00:00Z",
1446+
],
1447+
Some("-02".into()),
1448+
"1970-01-01T00:00:00Z",
1449+
vec![
1450+
"2020-09-08T00:00:00Z",
1451+
"2020-09-08T00:00:00Z",
1452+
"2020-09-08T00:00:00Z",
1453+
"2020-09-08T00:00:00Z",
1454+
"2020-09-08T00:00:00Z",
1455+
],
1456+
),
1457+
(
1458+
vec![
1459+
"2020-09-08T00:00:00+05",
1460+
"2020-09-08T01:00:00+05",
1461+
"2020-09-08T02:00:00+05",
1462+
"2020-09-08T03:00:00+05",
1463+
"2020-09-08T04:00:00+05",
1464+
],
1465+
Some("+05".into()),
1466+
"1970-01-01T00:00:00+05",
1467+
vec![
1468+
"2020-09-08T00:00:00+05",
1469+
"2020-09-08T00:00:00+05",
1470+
"2020-09-08T00:00:00+05",
1471+
"2020-09-08T00:00:00+05",
1472+
"2020-09-08T00:00:00+05",
1473+
],
1474+
),
1475+
(
1476+
vec![
1477+
"2020-09-08T00:00:00+08",
1478+
"2020-09-08T01:00:00+08",
1479+
"2020-09-08T02:00:00+08",
1480+
"2020-09-08T03:00:00+08",
1481+
"2020-09-08T04:00:00+08",
1482+
],
1483+
Some("+08".into()),
1484+
"1970-01-01T00:00:00+08",
1485+
vec![
1486+
"2020-09-08T00:00:00+08",
1487+
"2020-09-08T00:00:00+08",
1488+
"2020-09-08T00:00:00+08",
1489+
"2020-09-08T00:00:00+08",
1490+
"2020-09-08T00:00:00+08",
1491+
],
1492+
),
1493+
];
1494+
1495+
cases
1496+
.iter()
1497+
.for_each(|(original, tz_opt, origin, expected)| {
1498+
let input = original
1499+
.iter()
1500+
.map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
1501+
.collect::<TimestampNanosecondArray>()
1502+
.with_timezone_opt(tz_opt.clone());
1503+
let right = expected
1504+
.iter()
1505+
.map(|s| Some(string_to_timestamp_nanos(s).unwrap()))
1506+
.collect::<TimestampNanosecondArray>()
1507+
.with_timezone_opt(tz_opt.clone());
1508+
let result = date_bin(&[
1509+
ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)),
1510+
ColumnarValue::Array(Arc::new(input)),
1511+
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
1512+
Some(string_to_timestamp_nanos(origin).unwrap()),
1513+
tz_opt.clone(),
1514+
)),
1515+
])
1516+
.unwrap();
1517+
if let ColumnarValue::Array(result) = result {
1518+
assert_eq!(
1519+
result.data_type(),
1520+
&DataType::Timestamp(TimeUnit::Nanosecond, tz_opt.clone())
1521+
);
1522+
let left = as_primitive_array::<TimestampNanosecondType>(&result);
1523+
assert_eq!(left, &right);
1524+
} else {
1525+
panic!("unexpected column type");
1526+
}
1527+
});
1528+
}
1529+
12551530
#[test]
12561531
fn to_timestamp_invalid_input_type() -> Result<()> {
12571532
// pass the wrong type of input array to to_timestamp and test

0 commit comments

Comments
 (0)