Skip to content

Commit e6eabca

Browse files
spencerkclarkkmuehlbauerdcherian
authored
Refactor datetime and timedelta encoding for increased robustness (#9498)
* Increase robustness of encode_cf_datetime * Implement robust units checking for lazy encoding * Remove dtype casting, which interferes with missing value encoding * Take the same approach for timedelta encoding * Fix typing * Add what's new entries * Remove unused function * Add note regarding rollback of integer overflow checking code * Fix dtype coding test failures * Fix what's new merge * Restore return types in tests * Fix order of assignment and warning Co-authored-by: Kai Mühlbauer <[email protected]> * Address review comments --------- Co-authored-by: Kai Mühlbauer <[email protected]> Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent bb539d8 commit e6eabca

File tree

3 files changed

+137
-108
lines changed

3 files changed

+137
-108
lines changed

doc/whats-new.rst

+15
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,20 @@ New Features
3232
By `Justus Magin <https://github.com/keewis>`_.
3333
- Added experimental support for coordinate transforms (not ready for public use yet!) (:pull:`9543`)
3434
By `Benoit Bovy <https://github.com/benbovy>`_.
35+
- Similar to our :py:class:`numpy.datetime64` encoding path, automatically
36+
modify the units when an integer dtype is specified during eager cftime
37+
encoding, but the specified units would not allow for an exact round trip
38+
(:pull:`9498`). By `Spencer Clark <https://github.com/spencerkclark>`_.
3539
- Support reading to `GPU memory with Zarr <https://zarr.readthedocs.io/en/stable/user-guide/gpu.html>`_ (:pull:`10078`).
3640
By `Deepak Cherian <https://github.com/dcherian>`_.
3741

3842
Breaking changes
3943
~~~~~~~~~~~~~~~~
44+
- Rolled back code that would attempt to catch integer overflow when encoding
45+
times with small integer dtypes (:issue:`8542`), since it was inconsistent
46+
with xarray's handling of standard integers, and interfered with encoding
47+
times with small integer dtypes and missing values (:pull:`9498`). By
48+
`Spencer Clark <https://github.com/spencerkclark>`_.
4049
- Warn instead of raise if phony_dims are detected when using h5netcdf-backend and ``phony_dims=None`` (:issue:`10049`, :pull:`10058`)
4150
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
4251

@@ -62,6 +71,12 @@ Bug fixes
6271
- Fix DataArray().drop_attrs(deep=False) and add support for attrs to
6372
DataArray()._replace(). (:issue:`10027`, :pull:`10030`). By `Jan
6473
Haacker <https://github.com/j-haacker>`_.
74+
- Fix bug preventing encoding times with missing values with small integer
75+
dtype (:issue:`9134`, :pull:`9498`). By `Spencer Clark
76+
<https://github.com/spencerkclark>`_.
77+
- More robustly raise an error when lazily encoding times and an integer dtype
78+
is specified with units that do not allow for an exact round trip
79+
(:pull:`9498`). By `Spencer Clark <https://github.com/spencerkclark>`_.
6580
- Prevent false resolution change warnings from being emitted when decoding
6681
timedeltas encoded with floating point values, and make it clearer how to
6782
silence this warning message in the case that it is rightfully emitted

xarray/coding/times.py

+83-68
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from xarray.compat.pdcompat import default_precision_timestamp, timestamp_as_unit
2424
from xarray.core import indexing
2525
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
26-
from xarray.core.duck_array_ops import array_all, array_any, asarray, ravel, reshape
26+
from xarray.core.duck_array_ops import array_all, asarray, ravel, reshape
2727
from xarray.core.formatting import first_n_items, format_timestamp, last_item
2828
from xarray.core.utils import attempt_import, emit_user_level_warning
2929
from xarray.core.variable import Variable
@@ -298,6 +298,19 @@ def _unpack_time_unit_and_ref_date(
298298
return time_unit, ref_date
299299

300300

301+
def _unpack_time_units_and_ref_date_cftime(units: str, calendar: str):
302+
# same as _unpack_netcdf_time_units but finalizes ref_date for
303+
# processing in encode_cf_datetime
304+
time_units, ref_date = _unpack_netcdf_time_units(units)
305+
ref_date = cftime.num2date(
306+
0,
307+
units=f"microseconds since {ref_date}",
308+
calendar=calendar,
309+
only_use_cftime_datetimes=True,
310+
)
311+
return time_units, ref_date
312+
313+
301314
def _decode_cf_datetime_dtype(
302315
data,
303316
units: str,
@@ -686,6 +699,7 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
686699
# todo: check, if this function works correctly wrt np.timedelta64
687700
unit_timedelta: Callable[[str], timedelta] | Callable[[str], np.timedelta64]
688701
zero_timedelta: timedelta | np.timedelta64
702+
unique_timedeltas = asarray(unique_timedeltas)
689703
if unique_timedeltas.dtype == np.dtype("O"):
690704
time_units = _NETCDF_TIME_UNITS_CFTIME
691705
unit_timedelta = _unit_timedelta_cftime
@@ -700,6 +714,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
700714
return "seconds"
701715

702716

717+
def _time_units_to_timedelta(units: str) -> timedelta:
718+
return timedelta(microseconds=_US_PER_TIME_DELTA[units])
719+
720+
703721
def infer_calendar_name(dates) -> CFCalendar:
704722
"""Given an array of datetimes, infer the CF calendar name"""
705723
if is_np_datetime_like(dates.dtype):
@@ -974,42 +992,6 @@ def _division(deltas, delta, floor):
974992
return num
975993

976994

977-
def _cast_to_dtype_if_safe(num: np.ndarray, dtype: np.dtype) -> np.ndarray:
978-
with warnings.catch_warnings():
979-
warnings.filterwarnings("ignore", message="overflow")
980-
cast_num = np.asarray(num, dtype=dtype)
981-
982-
if np.issubdtype(dtype, np.integer):
983-
if not array_all(num == cast_num):
984-
if np.issubdtype(num.dtype, np.floating):
985-
raise ValueError(
986-
f"Not possible to cast all encoded times from "
987-
f"{num.dtype!r} to {dtype!r} without losing precision. "
988-
f"Consider modifying the units such that integer values "
989-
f"can be used, or removing the units and dtype encoding, "
990-
f"at which point xarray will make an appropriate choice."
991-
)
992-
else:
993-
raise OverflowError(
994-
f"Not possible to cast encoded times from "
995-
f"{num.dtype!r} to {dtype!r} without overflow. Consider "
996-
f"removing the dtype encoding, at which point xarray will "
997-
f"make an appropriate choice, or explicitly switching to "
998-
"a larger integer dtype."
999-
)
1000-
else:
1001-
if array_any(np.isinf(cast_num)):
1002-
raise OverflowError(
1003-
f"Not possible to cast encoded times from {num.dtype!r} to "
1004-
f"{dtype!r} without overflow. Consider removing the dtype "
1005-
f"encoding, at which point xarray will make an appropriate "
1006-
f"choice, or explicitly switching to a larger floating point "
1007-
f"dtype."
1008-
)
1009-
1010-
return cast_num
1011-
1012-
1013995
def encode_cf_datetime(
1014996
dates: T_DuckArray, # type: ignore[misc]
1015997
units: str | None = None,
@@ -1032,6 +1014,27 @@ def encode_cf_datetime(
10321014
return _eagerly_encode_cf_datetime(dates, units, calendar, dtype)
10331015

10341016

1017+
def _infer_needed_units_numpy(ref_date, data_units):
1018+
needed_units, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
1019+
needed_units = _numpy_to_netcdf_timeunit(needed_units)
1020+
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
1021+
data_delta = _unit_timedelta_numpy(needed_units)
1022+
if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
1023+
needed_units = _infer_time_units_from_diff(ref_delta)
1024+
return needed_units
1025+
1026+
1027+
def _infer_needed_units_cftime(ref_date, data_units, calendar):
1028+
needed_units, data_ref_date = _unpack_time_units_and_ref_date_cftime(
1029+
data_units, calendar
1030+
)
1031+
ref_delta = abs(data_ref_date - ref_date)
1032+
data_delta = _time_units_to_timedelta(needed_units)
1033+
if (ref_delta % data_delta) > timedelta(seconds=0):
1034+
needed_units = _infer_time_units_from_diff(ref_delta)
1035+
return needed_units
1036+
1037+
10351038
def _eagerly_encode_cf_datetime(
10361039
dates: T_DuckArray, # type: ignore[misc]
10371040
units: str | None = None,
@@ -1049,6 +1052,7 @@ def _eagerly_encode_cf_datetime(
10491052
if calendar is None:
10501053
calendar = infer_calendar_name(dates)
10511054

1055+
raise_incompatible_units_error = False
10521056
try:
10531057
if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
10541058
# parse with cftime instead
@@ -1081,16 +1085,7 @@ def _eagerly_encode_cf_datetime(
10811085
time_deltas = dates_as_index - ref_date
10821086

10831087
# retrieve needed units to faithfully encode to int64
1084-
needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
1085-
needed_units = _numpy_to_netcdf_timeunit(needed_unit)
1086-
if data_units != units:
1087-
# this accounts for differences in the reference times
1088-
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
1089-
data_delta = np.timedelta64(1, needed_unit)
1090-
if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
1091-
needed_units = _infer_time_units_from_diff(ref_delta)
1092-
1093-
# needed time delta to encode faithfully to int64
1088+
needed_units = _infer_needed_units_numpy(ref_date, data_units)
10941089
needed_time_delta = _unit_timedelta_numpy(needed_units)
10951090

10961091
floor_division = np.issubdtype(dtype, np.integer) or dtype is None
@@ -1104,7 +1099,6 @@ def _eagerly_encode_cf_datetime(
11041099
f"Set encoding['dtype'] to floating point dtype to silence this warning."
11051100
)
11061101
elif np.issubdtype(dtype, np.integer) and allow_units_modification:
1107-
floor_division = True
11081102
new_units = f"{needed_units} since {format_timestamp(ref_date)}"
11091103
emit_user_level_warning(
11101104
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
@@ -1114,6 +1108,10 @@ def _eagerly_encode_cf_datetime(
11141108
)
11151109
units = new_units
11161110
time_delta = needed_time_delta
1111+
floor_division = True
1112+
elif np.issubdtype(dtype, np.integer) and not allow_units_modification:
1113+
new_units = f"{needed_units} since {format_timestamp(ref_date)}"
1114+
raise_incompatible_units_error = True
11171115

11181116
# get resolution of TimedeltaIndex and align time_delta
11191117
# todo: check, if this works in any case
@@ -1123,14 +1121,39 @@ def _eagerly_encode_cf_datetime(
11231121
num = reshape(num.values, dates.shape)
11241122

11251123
except (OutOfBoundsDatetime, OverflowError, ValueError):
1124+
time_units, ref_date = _unpack_time_units_and_ref_date_cftime(units, calendar)
1125+
time_delta_cftime = _time_units_to_timedelta(time_units)
1126+
needed_units = _infer_needed_units_cftime(ref_date, data_units, calendar)
1127+
needed_time_delta_cftime = _time_units_to_timedelta(needed_units)
1128+
1129+
if (
1130+
np.issubdtype(dtype, np.integer)
1131+
and time_delta_cftime > needed_time_delta_cftime
1132+
):
1133+
new_units = f"{needed_units} since {format_cftime_datetime(ref_date)}"
1134+
if allow_units_modification:
1135+
emit_user_level_warning(
1136+
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
1137+
f"Serializing with units {new_units!r} instead. "
1138+
f"Set encoding['dtype'] to floating point dtype to serialize with units {units!r}. "
1139+
f"Set encoding['units'] to {new_units!r} to silence this warning ."
1140+
)
1141+
units = new_units
1142+
else:
1143+
raise_incompatible_units_error = True
1144+
11261145
num = _encode_datetime_with_cftime(dates, units, calendar)
11271146
# do it now only for cftime-based flow
11281147
# we already covered for this in pandas-based flow
11291148
num = cast_to_int_if_safe(num)
11301149

1131-
if dtype is not None:
1132-
# todo: check, if this is really needed for all dtypes
1133-
num = _cast_to_dtype_if_safe(num, dtype)
1150+
if raise_incompatible_units_error:
1151+
raise ValueError(
1152+
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
1153+
f"Consider setting encoding['dtype'] to a floating point dtype to serialize with "
1154+
f"units {units!r}. Consider setting encoding['units'] to {new_units!r} to "
1155+
f"serialize with an integer dtype."
1156+
)
11341157

11351158
return num, units, calendar
11361159

@@ -1243,14 +1266,17 @@ def _eagerly_encode_cf_timedelta(
12431266
time_delta = needed_time_delta
12441267
time_delta = time_delta.astype(f"=m8[{deltas_unit}]")
12451268
floor_division = True
1269+
elif np.issubdtype(dtype, np.integer) and not allow_units_modification:
1270+
raise ValueError(
1271+
f"Timedeltas can't be serialized faithfully to int64 with requested units {units!r}. "
1272+
f"Consider setting encoding['dtype'] to a floating point dtype to serialize with "
1273+
f"units {units!r}. Consider setting encoding['units'] to {needed_units!r} to "
1274+
f"serialize with an integer dtype."
1275+
)
12461276

12471277
num = _division(time_deltas, time_delta, floor_division)
12481278
num = reshape(num.values, timedeltas.shape)
12491279

1250-
if dtype is not None:
1251-
# todo: check, if this is needed for all dtypes
1252-
num = _cast_to_dtype_if_safe(num, dtype)
1253-
12541280
return num, units
12551281

12561282

@@ -1328,20 +1354,15 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
13281354

13291355
units = encoding.pop("units", None)
13301356
calendar = encoding.pop("calendar", None)
1331-
dtype = encoding.pop("dtype", None)
1357+
dtype = encoding.get("dtype", None)
13321358

13331359
# in the case of packed data we need to encode into
13341360
# float first, the correct dtype will be established
13351361
# via CFScaleOffsetCoder/CFMaskCoder
1336-
set_dtype_encoding = None
13371362
if "add_offset" in encoding or "scale_factor" in encoding:
1338-
set_dtype_encoding = dtype
13391363
dtype = data.dtype if data.dtype.kind == "f" else "float64"
13401364
(data, units, calendar) = encode_cf_datetime(data, units, calendar, dtype)
13411365

1342-
# retain dtype for packed data
1343-
if set_dtype_encoding is not None:
1344-
safe_setitem(encoding, "dtype", set_dtype_encoding, name=name)
13451366
safe_setitem(attrs, "units", units, name=name)
13461367
safe_setitem(attrs, "calendar", calendar, name=name)
13471368

@@ -1393,22 +1414,16 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
13931414
if np.issubdtype(variable.data.dtype, np.timedelta64):
13941415
dims, data, attrs, encoding = unpack_for_encoding(variable)
13951416

1396-
dtype = encoding.pop("dtype", None)
1417+
dtype = encoding.get("dtype", None)
13971418

13981419
# in the case of packed data we need to encode into
13991420
# float first, the correct dtype will be established
14001421
# via CFScaleOffsetCoder/CFMaskCoder
1401-
set_dtype_encoding = None
14021422
if "add_offset" in encoding or "scale_factor" in encoding:
1403-
set_dtype_encoding = dtype
14041423
dtype = data.dtype if data.dtype.kind == "f" else "float64"
14051424

14061425
data, units = encode_cf_timedelta(data, encoding.pop("units", None), dtype)
14071426

1408-
# retain dtype for packed data
1409-
if set_dtype_encoding is not None:
1410-
safe_setitem(encoding, "dtype", set_dtype_encoding, name=name)
1411-
14121427
safe_setitem(attrs, "units", units, name=name)
14131428

14141429
return Variable(dims, data, attrs, encoding, fastpath=True)

0 commit comments

Comments
 (0)