25
25
from xarray .core .formatting import first_n_items , format_timestamp , last_item
26
26
from xarray .core .pdcompat import nanosecond_precision_timestamp
27
27
from xarray .core .pycompat import is_duck_dask_array
28
+ from xarray .core .utils import emit_user_level_warning
28
29
from xarray .core .variable import Variable
29
30
30
31
try :
@@ -122,6 +123,18 @@ def _netcdf_to_numpy_timeunit(units: str) -> str:
122
123
}[units ]
123
124
124
125
126
+ def _numpy_to_netcdf_timeunit (units : str ) -> str :
127
+ return {
128
+ "ns" : "nanoseconds" ,
129
+ "us" : "microseconds" ,
130
+ "ms" : "milliseconds" ,
131
+ "s" : "seconds" ,
132
+ "m" : "minutes" ,
133
+ "h" : "hours" ,
134
+ "D" : "days" ,
135
+ }[units ]
136
+
137
+
125
138
def _ensure_padded_year (ref_date : str ) -> str :
126
139
# Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
127
140
# are ambiguous (is it YMD or DMY?). This can lead to some very odd
@@ -171,6 +184,20 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
171
184
return delta_units , ref_date
172
185
173
186
187
def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
    """Split CF time units into ``(time_units, ref_date)``.

    Same as _unpack_netcdf_time_units, but additionally finalizes the
    reference date into a timezone-naive pandas Timestamp so it is ready
    for processing in encode_cf_datetime.
    """
    time_units, raw_ref_date = _unpack_netcdf_time_units(units)
    # TODO: the strict enforcement of nanosecond precision Timestamps can be
    # relaxed when addressing GitHub issue #7493.
    ref_date = nanosecond_precision_timestamp(raw_ref_date)
    if ref_date.tz is not None:
        # A timezone-aware reference date is normalized to UTC and made
        # timezone-naive (GH 2649).
        ref_date = ref_date.tz_convert(None)
    return time_units, ref_date
174
201
def _decode_cf_datetime_dtype (
175
202
data , units : str , calendar : str , use_cftime : bool | None
176
203
) -> np .dtype :
@@ -222,8 +249,8 @@ def _decode_datetime_with_pandas(
222
249
"pandas."
223
250
)
224
251
225
- delta , ref_date = _unpack_netcdf_time_units (units )
226
- delta = _netcdf_to_numpy_timeunit (delta )
252
+ time_units , ref_date = _unpack_netcdf_time_units (units )
253
+ time_units = _netcdf_to_numpy_timeunit (time_units )
227
254
try :
228
255
# TODO: the strict enforcement of nanosecond precision Timestamps can be
229
256
# relaxed when addressing GitHub issue #7493.
@@ -237,8 +264,8 @@ def _decode_datetime_with_pandas(
237
264
warnings .filterwarnings ("ignore" , "invalid value encountered" , RuntimeWarning )
238
265
if flat_num_dates .size > 0 :
239
266
# avoid size 0 datetimes GH1329
240
- pd .to_timedelta (flat_num_dates .min (), delta ) + ref_date
241
- pd .to_timedelta (flat_num_dates .max (), delta ) + ref_date
267
+ pd .to_timedelta (flat_num_dates .min (), time_units ) + ref_date
268
+ pd .to_timedelta (flat_num_dates .max (), time_units ) + ref_date
242
269
243
270
# To avoid integer overflow when converting to nanosecond units for integer
244
271
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
@@ -251,9 +278,12 @@ def _decode_datetime_with_pandas(
251
278
252
279
# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
253
280
# works much faster when dealing with integers (GH 1399).
254
- flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA [delta ]).astype (
255
- np .int64
256
- )
281
+ # properly handle NaN/NaT to prevent casting NaN to int
282
+ nan = np .isnan (flat_num_dates ) | (flat_num_dates == np .iinfo (np .int64 ).min )
283
+ flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA [time_units ]
284
+ flat_num_dates_ns_int = np .zeros_like (flat_num_dates , dtype = np .int64 )
285
+ flat_num_dates_ns_int [nan ] = np .iinfo (np .int64 ).min
286
+ flat_num_dates_ns_int [~ nan ] = flat_num_dates [~ nan ].astype (np .int64 )
257
287
258
288
# Use pd.to_timedelta to safely cast integer values to timedeltas,
259
289
# and add those to a Timestamp to safely produce a DatetimeIndex. This
@@ -364,6 +394,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
364
394
return "seconds"
365
395
366
396
397
def _time_units_to_timedelta64(units: str) -> np.timedelta64:
    """Return one step of *units* (a netCDF unit name, e.g. ``"hours"``)
    as a nanosecond-resolution ``np.timedelta64``."""
    numpy_unit = _netcdf_to_numpy_timeunit(units)
    one_step = np.timedelta64(1, numpy_unit)
    return one_step.astype("timedelta64[ns]")
367
401
def infer_calendar_name (dates ) -> CFCalendar :
368
402
"""Given an array of datetimes, infer the CF calendar name"""
369
403
if is_np_datetime_like (dates .dtype ):
@@ -572,9 +606,12 @@ def _should_cftime_be_used(
572
606
573
607
574
608
def _cleanup_netcdf_time_units (units : str ) -> str :
575
- delta , ref_date = _unpack_netcdf_time_units (units )
609
+ time_units , ref_date = _unpack_netcdf_time_units (units )
610
+ time_units = time_units .lower ()
611
+ if not time_units .endswith ("s" ):
612
+ time_units = f"{ time_units } s"
576
613
try :
577
- units = f"{ delta } since { format_timestamp (ref_date )} "
614
+ units = f"{ time_units } since { format_timestamp (ref_date )} "
578
615
except (OutOfBoundsDatetime , ValueError ):
579
616
# don't worry about reifying the units if they're out of bounds or
580
617
# formatted badly
@@ -633,62 +670,93 @@ def encode_cf_datetime(
633
670
"""
634
671
dates = np .asarray (dates )
635
672
673
+ data_units = infer_datetime_units (dates )
674
+
636
675
if units is None :
637
- units = infer_datetime_units ( dates )
676
+ units = data_units
638
677
else :
639
678
units = _cleanup_netcdf_time_units (units )
640
679
641
680
if calendar is None :
642
681
calendar = infer_calendar_name (dates )
643
682
644
- delta , _ref_date = _unpack_netcdf_time_units (units )
645
683
try :
646
684
if not _is_standard_calendar (calendar ) or dates .dtype .kind == "O" :
647
685
# parse with cftime instead
648
686
raise OutOfBoundsDatetime
649
687
assert dates .dtype == "datetime64[ns]"
650
688
651
- delta_units = _netcdf_to_numpy_timeunit (delta )
652
- time_delta = np .timedelta64 (1 , delta_units ).astype ("timedelta64[ns]" )
653
-
654
- # TODO: the strict enforcement of nanosecond precision Timestamps can be
655
- # relaxed when addressing GitHub issue #7493.
656
- ref_date = nanosecond_precision_timestamp (_ref_date )
689
+ time_units , ref_date = _unpack_time_units_and_ref_date (units )
690
+ time_delta = _time_units_to_timedelta64 (time_units )
657
691
658
- # If the ref_date Timestamp is timezone-aware, convert to UTC and
659
- # make it timezone-naive (GH 2649).
660
- if ref_date .tz is not None :
661
- ref_date = ref_date .tz_convert (None )
692
+ # retrieve needed units to faithfully encode to int64
693
+ needed_units , data_ref_date = _unpack_time_units_and_ref_date (data_units )
694
+ if data_units != units :
695
+ # this accounts for differences in the reference times
696
+ ref_delta = abs (data_ref_date - ref_date ).to_timedelta64 ()
697
+ if ref_delta > np .timedelta64 (0 , "ns" ):
698
+ needed_units = _infer_time_units_from_diff (ref_delta )
662
699
663
700
# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
664
701
# an OverflowError is raised if the ref_date is too far away from
665
702
# dates to be encoded (GH 2272).
666
703
dates_as_index = pd .DatetimeIndex (dates .ravel ())
667
704
time_deltas = dates_as_index - ref_date
668
705
669
- # Use floor division if time_delta evenly divides all differences
670
- # to preserve integer dtype if possible (GH 4045).
671
- if np .all (time_deltas % time_delta == np .timedelta64 (0 , "ns" )):
672
- num = time_deltas // time_delta
706
+ # needed time delta to encode faithfully to int64
707
+ needed_time_delta = _time_units_to_timedelta64 (needed_units )
708
+ if time_delta <= needed_time_delta :
709
+ # calculate int64 floor division
710
+ # to preserve integer dtype if possible (GH 4045, GH7817).
711
+ num = time_deltas // time_delta .astype (np .int64 )
712
+ num = num .astype (np .int64 , copy = False )
673
713
else :
714
+ emit_user_level_warning (
715
+ f"Times can't be serialized faithfully with requested units { units !r} . "
716
+ f"Resolution of { needed_units !r} needed. "
717
+ f"Serializing timeseries to floating point."
718
+ )
674
719
num = time_deltas / time_delta
675
720
num = num .values .reshape (dates .shape )
676
721
677
722
except (OutOfBoundsDatetime , OverflowError , ValueError ):
678
723
num = _encode_datetime_with_cftime (dates , units , calendar )
724
+ # do it now only for cftime-based flow
725
+ # we already covered for this in pandas-based flow
726
+ num = cast_to_int_if_safe (num )
679
727
680
- num = cast_to_int_if_safe (num )
681
728
return (num , units , calendar )
682
729
683
730
684
731
def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarray, str]:
    """Encode an array of timedeltas as numbers of *units*.

    Prefers exact int64 encoding; when the requested units are too coarse
    to represent the data faithfully, emits a user-level warning and falls
    back to floating point. Returns ``(num, units)``.
    """
    data_units = infer_timedelta_units(timedeltas)
    if units is None:
        units = data_units

    time_delta = _time_units_to_timedelta64(units)
    time_deltas = pd.TimedeltaIndex(timedeltas.ravel())

    # Determine the resolution actually needed to encode faithfully to int64.
    needed_units = data_units
    if data_units != units:
        needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna()))
    needed_time_delta = _time_units_to_timedelta64(needed_units)

    if time_delta <= needed_time_delta:
        # int64 floor division preserves an integer dtype when possible.
        num = time_deltas // time_delta.astype(np.int64)
        num = num.astype(np.int64, copy=False)
    else:
        emit_user_level_warning(
            f"Timedeltas can't be serialized faithfully with requested units {units!r}. "
            f"Resolution of {needed_units!r} needed. "
            f"Serializing timedeltas to floating point."
        )
        num = time_deltas / time_delta
    num = num.values.reshape(timedeltas.shape)
    return (num, units)
693
761
694
762
@@ -702,9 +770,10 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
702
770
) or contains_cftime_datetimes (variable ):
703
771
dims , data , attrs , encoding = unpack_for_encoding (variable )
704
772
705
- (data , units , calendar ) = encode_cf_datetime (
706
- data , encoding .pop ("units" , None ), encoding .pop ("calendar" , None )
707
- )
773
+ units = encoding .pop ("units" , None )
774
+ calendar = encoding .pop ("calendar" , None )
775
+ (data , units , calendar ) = encode_cf_datetime (data , units , calendar )
776
+
708
777
safe_setitem (attrs , "units" , units , name = name )
709
778
safe_setitem (attrs , "calendar" , calendar , name = name )
710
779
0 commit comments