From 02e2baed7769bb62620cfa198f8e4fc302ab145b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Nov 2023 10:18:07 -0800 Subject: [PATCH] BUG: resolution inference with NaT ints/floats/strings (#55981) --- pandas/_libs/lib.pyx | 1 + pandas/_libs/tslib.pyx | 34 +++++++++++-------- pandas/_libs/tslibs/strptime.pyx | 2 +- pandas/tests/tslibs/test_array_to_datetime.py | 18 ++++++++++ 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f2ff500ae068e..67e0224b64d7f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2636,6 +2636,7 @@ def maybe_convert_objects(ndarray[object] objects, tsobj = convert_to_tsobject(val, None, None, 0, 0) tsobj.ensure_reso(NPY_FR_ns) except OutOfBoundsDatetime: + # e.g. test_out_of_s_bounds_datetime64 seen.object_ = True break else: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 41d8a3cfad8eb..b5f448aa32fad 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -450,12 +450,12 @@ cpdef array_to_datetime( Returns ------- np.ndarray - May be datetime64[ns] or object dtype + May be datetime64[creso_unit] or object dtype tzinfo or None """ cdef: Py_ssize_t i, n = values.size - object val, tz + object val ndarray[int64_t] iresult npy_datetimestruct dts bint utc_convert = bool(utc) @@ -467,7 +467,7 @@ cpdef array_to_datetime( _TSObject _ts float tz_offset set out_tzoffset_vals = set() - tzinfo tz_out = None + tzinfo tz, tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) NPY_DATETIMEUNIT item_reso bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -522,15 +522,14 @@ cpdef array_to_datetime( elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition - item_reso = NPY_FR_ns - state.update_creso(item_reso) - if infer_reso: - creso = state.creso if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: - # we now need to parse this as if unit='ns' + item_reso = NPY_FR_ns + state.update_creso(item_reso) + if infer_reso: + creso = state.creso iresult[i] = cast_from_unit(val, "ns", out_reso=creso) state.found_other = True @@ -552,6 +551,16 @@ cpdef array_to_datetime( _ts = convert_str_to_tsobject( val, None, dayfirst=dayfirst, yearfirst=yearfirst ) + + if _ts.value == NPY_NAT: + # e.g. "NaT" string or empty string, we do not consider + # this as either tzaware or tznaive. See + # test_to_datetime_with_empty_str_utc_false_format_mixed + # We also do not update resolution inference based on this, + # see test_infer_with_nat_int_float_str + iresult[i] = _ts.value + continue + item_reso = _ts.creso state.update_creso(item_reso) if infer_reso: @@ -562,12 +571,7 @@ cpdef array_to_datetime( iresult[i] = _ts.value tz = _ts.tzinfo - if _ts.value == NPY_NAT: - # e.g. "NaT" string or empty string, we do not consider - # this as either tzaware or tznaive. See - # test_to_datetime_with_empty_str_utc_false_format_mixed - pass - elif tz is not None: + if tz is not None: # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() @@ -640,7 +644,7 @@ cpdef array_to_datetime( # Otherwise we can use the single reso that we encountered and avoid # a second pass. abbrev = npy_unit_to_abbrev(state.creso) - result = iresult.view(f"M8[{abbrev}]") + result = iresult.view(f"M8[{abbrev}]").reshape(result.shape) return result, tz_out diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index bc7028997c697..d8926d14ae7e5 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -240,7 +240,7 @@ cdef _get_format_regex(str fmt): cdef class DatetimeParseState: - def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns): + def __cinit__(self, NPY_DATETIMEUNIT creso): # found_tz and found_naive are specifically about datetime/Timestamp # objects with and without tzinfos attached. self.found_tz = False diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index dbf81013662e7..15e34c68c4d2f 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -82,6 +82,24 @@ def test_infer_heterogeneous(self): assert tz is None tm.assert_numpy_array_equal(result, expected[::-1]) + @pytest.mark.parametrize( + "item", [float("nan"), NaT.value, float(NaT.value), "NaT", ""] + ) + def test_infer_with_nat_int_float_str(self, item): + # floats/ints get inferred to nanos *unless* they are NaN/iNaT, + # similar NaT string gets treated like NaT scalar (ignored for resolution) + dt = datetime(2023, 11, 15, 15, 5, 6) + + arr = np.array([dt, item], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([dt, np.datetime64("NaT")], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + result2, tz2 = tslib.array_to_datetime(arr[::-1], creso=creso_infer) + assert tz2 is None + tm.assert_numpy_array_equal(result2, expected[::-1]) + class TestArrayToDatetimeWithTZResolutionInference: def test_array_to_datetime_with_tz_resolution(self):