Commit
Merge branch 'main' into enh_cumsum_for_np_str
rhshadrach authored Feb 18, 2025
2 parents 2fd9779 + d4dff29 commit 85093a6
Showing 43 changed files with 272 additions and 198 deletions.
doc/source/development/contributing_gitpod.rst (2 changes: 1 addition & 1 deletion)

@@ -109,7 +109,7 @@ development experience:

* `VSCode rst extension <https://marketplace.visualstudio.com/items?itemName=lextudio.restructuredtext>`_
* `Markdown All in One <https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one>`_
-* `VSCode Gitlens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
+* `VSCode GitLens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
* `VSCode Git Graph extension <https://marketplace.visualstudio.com/items?itemName=mhutchie.git-graph>`_

Development workflow with Gitpod
doc/source/whatsnew/v2.3.0.rst (13 changes: 12 additions & 1 deletion)

@@ -37,7 +37,8 @@ Other enhancements
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`, :issue:`60633`)
+- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)

.. ---------------------------------------------------------------------------
@@ -53,6 +54,16 @@ These are bug fixes that might have notable behavior changes.
notable_bug_fix1
^^^^^^^^^^^^^^^^

+.. _whatsnew_230.api_changes:
+
+API changes
+~~~~~~~~~~~
+
+- When enabling the ``future.infer_string`` option: Index set operations (like
+  union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or
+  empty ``Index`` with object dtype when determining the dtype of the resulting
+  Index (:issue:`60797`)
+
.. ---------------------------------------------------------------------------
.. _whatsnew_230.deprecations:

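For context on the reduction entries above, a minimal sketch of the new ``StringDtype`` behavior (illustrative values; assumes pandas 2.3 with these enhancements):

```python
import pandas as pd

# Cumulative reductions and sum on a StringDtype column (GH 60633, GH 59853).
s = pd.Series(["b", "a", "c"], dtype="string")

print(list(s.cumsum()))   # ["b", "ba", "bac"]  -- cumulative concatenation
print(list(s.cummin()))   # ["b", "a", "a"]     -- lexicographic running minimum
print(list(s.cummax()))   # ["b", "b", "c"]     -- lexicographic running maximum
print(s.sum())            # "bac"               -- concatenation of all elements
```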
doc/source/whatsnew/v3.0.0.rst (4 changes: 4 additions & 0 deletions)

@@ -361,6 +361,9 @@ Other API changes
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
- when comparing the indexes in :func:`testing.assert_series_equal`, check_exact defaults to True if an :class:`Index` is of integer dtypes. (:issue:`57386`)
+- Index set operations (like union or intersection) will now ignore the dtype of
+  an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining
+  the dtype of the resulting Index (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.deprecations:
@@ -667,6 +670,7 @@ Conversion
- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
- Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
- Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
+- Bug in :meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes` removing timezone information for objects with :class:`ArrowDtype` (:issue:`60237`)
- Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)

Strings
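To make the set-operation entry concrete, a minimal sketch of the documented change (a sketch under assumed pandas 3.0 defaults, where a list of strings infers a string dtype; comments show expected, not captured, output):

```python
import pandas as pd

# The default empty constructor gives an object-dtype Index; under the
# change above (GH 60797) it no longer forces union/intersection results
# to object dtype.
empty = pd.Index([])
strings = pd.Index(["a", "b"])  # inferred string dtype

print(empty.union(strings).dtype)  # expected: str, no longer object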
pandas/_libs/algos.pyx (28 changes: 1 addition & 27 deletions)

@@ -818,33 +818,7 @@ def is_monotonic(const numeric_object_t[:] arr, bint timelike):
    if timelike and <int64_t>arr[0] == NPY_NAT:
        return False, False, False

-    if numeric_object_t is not object:
-        with nogil:
-            prev = arr[0]
-            for i in range(1, n):
-                cur = arr[i]
-                if timelike and <int64_t>cur == NPY_NAT:
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                if cur < prev:
-                    is_monotonic_inc = 0
-                elif cur > prev:
-                    is_monotonic_dec = 0
-                elif cur == prev:
-                    is_unique = 0
-                else:
-                    # cur or prev is NaN
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                if not is_monotonic_inc and not is_monotonic_dec:
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                prev = cur
-    else:
-        # object-dtype, identical to above except we cannot use `with nogil`
+    with nogil(numeric_object_t is not object):
        prev = arr[0]
        for i in range(1, n):
            cur = arr[i]
pandas/_libs/hashtable_func_helper.pxi.in (15 changes: 1 addition & 14 deletions)

@@ -415,20 +415,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):

    modes = np.empty(nkeys, dtype=values.dtype)

-    if htfunc_t is not object:
-        with nogil:
-            for k in range(nkeys):
-                count = counts[k]
-                if count == max_count:
-                    j += 1
-                elif count > max_count:
-                    max_count = count
-                    j = 0
-                else:
-                    continue
-
-                modes[j] = keys[k]
-    else:
+    with nogil(htfunc_t is not object):
        for k in range(nkeys):
            count = counts[k]
            if count == max_count:
pandas/_libs/internals.pyx (6 changes: 3 additions & 3 deletions)

@@ -502,7 +502,7 @@ def get_concat_blkno_indexers(list blknos_list not None):
@cython.boundscheck(False)
@cython.wraparound(False)
def get_blkno_indexers(
-    int64_t[:] blknos, bint group=True
+    const int64_t[:] blknos, bint group=True
) -> list[tuple[int, slice | np.ndarray]]:
"""
Enumerate contiguous runs of integers in ndarray.
@@ -596,8 +596,8 @@ def get_blkno_placements(blknos, group: bool = True):
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef update_blklocs_and_blknos(
-    ndarray[intp_t, ndim=1] blklocs,
-    ndarray[intp_t, ndim=1] blknos,
+    const intp_t[:] blklocs,
+    const intp_t[:] blknos,
    Py_ssize_t loc,
    intp_t nblocks,
):
pandas/_libs/join.pyx (17 changes: 10 additions & 7 deletions)

@@ -225,7 +225,10 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,

@cython.wraparound(False)
@cython.boundscheck(False)
-cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) noexcept nogil:
+cdef void _get_result_indexer(
+    const intp_t[::1] sorter,
+    intp_t[::1] indexer,
+) noexcept nogil:
"""NOTE: overwrites indexer with the result to avoid allocating another array"""
cdef:
Py_ssize_t i, n, idx
@@ -681,8 +684,8 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
from pandas._libs.hashtable cimport Int64HashTable


-def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
-                                 ndarray[numeric_t] right_values,
+def asof_join_backward_on_X_by_Y(const numeric_t[:] left_values,
+                                 const numeric_t[:] right_values,
                                 const int64_t[:] left_by_values,
                                 const int64_t[:] right_by_values,
                                 bint allow_exact_matches=True,
@@ -752,8 +755,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
return left_indexer, right_indexer


-def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
-                                ndarray[numeric_t] right_values,
+def asof_join_forward_on_X_by_Y(const numeric_t[:] left_values,
+                                const numeric_t[:] right_values,
                                const int64_t[:] left_by_values,
                                const int64_t[:] right_by_values,
                                bint allow_exact_matches=1,
@@ -824,8 +827,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
return left_indexer, right_indexer


-def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
-                                ndarray[numeric_t] right_values,
+def asof_join_nearest_on_X_by_Y(const numeric_t[:] left_values,
+                                const numeric_t[:] right_values,
                                const int64_t[:] left_by_values,
                                const int64_t[:] right_by_values,
                                bint allow_exact_matches=True,
pandas/_libs/lib.pyx (6 changes: 2 additions & 4 deletions)

@@ -981,16 +981,14 @@ def get_level_sorter(

@cython.boundscheck(False)
@cython.wraparound(False)
-def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
+def count_level_2d(const uint8_t[:, :] mask,
                   const intp_t[:] labels,
                   Py_ssize_t max_bin,
                   ):
    cdef:
-        Py_ssize_t i, j, k, n
+        Py_ssize_t i, j, k = mask.shape[1], n = mask.shape[0]
        ndarray[int64_t, ndim=2] counts

-    n, k = (<object>mask).shape
-
    counts = np.zeros((n, max_bin), dtype="i8")
    with nogil:
        for i in range(n):
pandas/_libs/reshape.pyx (22 changes: 1 addition & 21 deletions)

@@ -40,27 +40,7 @@ def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask,
    cdef:
        Py_ssize_t i, j, w, nulls, s, offset

-    if numeric_object_t is not object:
-        # evaluated at compile-time
-        with nogil:
-            for i in range(stride):
-
-                nulls = 0
-                for j in range(length):
-
-                    for w in range(width):
-
-                        offset = j * width + w
-
-                        if mask[offset]:
-                            s = i * width + w
-                            new_values[j, s] = values[offset - nulls, i]
-                            new_mask[j, s] = 1
-                        else:
-                            nulls += 1
-
-    else:
-        # object-dtype, identical to above but we cannot use nogil
+    with nogil(numeric_object_t is not object):
        for i in range(stride):

            nulls = 0
pandas/core/dtypes/cast.py (2 changes: 1 addition & 1 deletion)

@@ -1113,7 +1113,7 @@ def convert_dtypes(
else:
inferred_dtype = input_array.dtype

if dtype_backend == "pyarrow":
if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype):
from pandas.core.arrays.arrow.array import to_pyarrow_type
from pandas.core.arrays.string_ import StringDtype

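A short sketch of the behavior this added ``ArrowDtype`` guard fixes (GH 60237); assumes pyarrow is installed, and the comments show expected output:

```python
import pandas as pd

# A timezone-aware column converted to the pyarrow backend...
ser = pd.Series(pd.date_range("2020-01-01", periods=2, tz="US/Pacific"))
converted = ser.convert_dtypes(dtype_backend="pyarrow")
print(converted.dtype)  # timestamp[ns, tz=US/Pacific][pyarrow]

# ...must keep its timezone on a repeated call: the guard above skips
# re-inference for values that already have an ArrowDtype.
print(converted.convert_dtypes(dtype_backend="pyarrow").dtype)
```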
pandas/core/indexes/base.py (31 changes: 30 additions & 1 deletion)

@@ -19,7 +19,10 @@

import numpy as np

-from pandas._config import get_option
+from pandas._config import (
+    get_option,
+    using_string_dtype,
+)

from pandas._libs import (
NaT,
@@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

+        if using_string_dtype():
+            # special case: if left or right is a zero-length RangeIndex or
+            # Index[object], those can be created by the default empty constructors
+            # -> for that case ignore this dtype and always return the other
+            # (https://github.com/pandas-dev/pandas/pull/60797)
+            from pandas.core.indexes.range import RangeIndex
+
+            if len(self) == 0 and (
+                isinstance(self, RangeIndex) or self.dtype == np.object_
+            ):
+                return target_dtype
+            if (
+                isinstance(target, Index)
+                and len(target) == 0
+                and (isinstance(target, RangeIndex) or target_dtype == np.object_)
+            ):
+                return self.dtype
+
# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
@@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index:

arr = self._values

+        if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
+            # special case: if we are an empty object-dtype Index, also
+            # take into account the inserted item for the resulting dtype
+            # (https://github.com/pandas-dev/pandas/pull/60797)
+            dtype = self._find_common_type_compat(item)
+            if dtype != self.dtype:
+                return self.astype(dtype).insert(loc, item)
+
try:
if isinstance(arr, ExtensionArray):
res_values = arr.insert(loc, item)
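A sketch of how the two special cases above surface through the public API (assumes pandas 2.3 with the option enabled; comments are expectations, not captured output):

```python
import pandas as pd

pd.set_option("future.infer_string", True)

# A zero-length object-dtype Index, as produced by the default empty
# constructor, no longer pins set-operation results to object dtype.
empty = pd.Index([])
print(empty.union(pd.Index(["a", "b"])).dtype)  # expected: str, not object

# insert() on an empty object-dtype Index likewise re-infers the dtype
# from the inserted item via _find_common_type_compat.
print(pd.Index([]).insert(0, "a").dtype)        # expected: str
```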
pandas/core/strings/accessor.py (16 changes: 14 additions & 2 deletions)

@@ -34,6 +34,7 @@
is_numeric_dtype,
is_object_dtype,
is_re,
+    is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
@@ -2102,7 +2103,9 @@ def slice_replace(self, start=None, stop=None, repl=None):
result = self._data.array._str_slice_replace(start, stop, repl)
return self._wrap_result(result)

-    def decode(self, encoding, errors: str = "strict"):
+    def decode(
+        self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None
+    ):
"""
Decode character string in the Series/Index using indicated encoding.
@@ -2116,6 +2119,12 @@ def decode(self, encoding, errors: str = "strict"):
errors : str, optional
Specifies the error handling scheme.
Possible values are those supported by :meth:`bytes.decode`.
+        dtype : str or dtype, optional
+            The dtype of the result. When not ``None``, must be either a string or
+            object dtype. When ``None``, the dtype of the result is determined by
+            ``pd.options.future.infer_string``.
+
+            .. versionadded:: 2.3.0
Returns
-------
@@ -2137,6 +2146,10 @@ def decode(self, encoding, errors: str = "strict"):
2 ()
dtype: object
"""
+        if dtype is not None and not is_string_dtype(dtype):
+            raise ValueError(f"dtype must be string or object, got {dtype=}")
+        if dtype is None and get_option("future.infer_string"):
+            dtype = "str"
# TODO: Add a similar _bytes interface.
if encoding in _cpython_optimized_decoders:
# CPython optimized implementation
@@ -2146,7 +2159,6 @@ def decode(self, encoding, errors: str = "strict"):
f = lambda x: decoder(x, errors)[0]
arr = self._data.array
result = arr._str_map(f)
dtype = "str" if get_option("future.infer_string") else None
return self._wrap_result(result, dtype=dtype)

@forbid_nonstring_types(["bytes"])
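A short sketch of the new ``dtype`` argument and the validation added above (assumes pandas 2.3; the error text mirrors the f-string in the diff):

```python
import pandas as pd

ser = pd.Series([b"cow", b"123", b"()"])

# Without dtype, the result dtype still follows future.infer_string.
print(ser.str.decode("ascii").dtype)  # object unless the option is enabled

# The new argument requests a string result explicitly.
print(ser.str.decode("ascii", dtype=pd.StringDtype()).dtype)  # string

# Anything that is not a string/object dtype is rejected up front.
try:
    ser.str.decode("ascii", dtype="int64")
except ValueError as err:
    print(err)  # dtype must be string or object, got dtype='int64'
```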
pandas/io/pytables.py (17 changes: 15 additions & 2 deletions)

@@ -4159,6 +4159,8 @@ def _create_axes(
ordered = data_converted.ordered
meta = "category"
metadata = np.asarray(data_converted.categories).ravel()
+            elif isinstance(blk.dtype, StringDtype):
+                meta = str(blk.dtype)

data, dtype_name = _get_data_and_dtype_name(data_converted)

@@ -4419,7 +4421,8 @@ def read_column(
errors=self.errors,
)
cvs = col_values[1]
-                return Series(cvs, name=column, copy=False)
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                return Series(cvs, name=column, copy=False, dtype=dtype)

raise KeyError(f"column [{column}] not found in the table")

@@ -4769,8 +4772,18 @@ def read(
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
if not (using_string_dtype() and values.dtype.kind == "O"):
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)

+            # If str / string dtype is stored in meta, use that.
+            converted = False
+            for column in cols_:
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                if dtype in ["str", "string"]:
+                    df[column] = df[column].astype(dtype)
+                    converted = True
+            # Otherwise try inference.
            if (
-                using_string_dtype()
+                not converted
+                and using_string_dtype()
                and isinstance(values, np.ndarray)
                and is_string_array(
                    values,
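A minimal sketch of the round-trip that the stored ``_meta`` attribute enables (assumes pandas 2.3 with PyTables installed; the file name is illustrative):

```python
import pandas as pd

# Writing a StringDtype column records "string" in the column's _meta
# attribute; reading restores the dtype instead of falling back to object.
df = pd.DataFrame({"a": pd.array(["x", "y"], dtype="string")})
df.to_hdf("example.h5", key="df", format="table")

roundtripped = pd.read_hdf("example.h5", key="df")
print(roundtripped["a"].dtype)  # string, not object
```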
pandas/tests/dtypes/test_concat.py (6 changes: 4 additions & 2 deletions)

@@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
_concat.concat_compat([arr[:2], arr[2:]], axis=1)


-def test_concat_series_between_empty_and_tzaware_series():
+def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
ser2 = Series(dtype=float)
@@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
data=[
(0.0, None),
],
-        index=pd.Index([tzaware_time], dtype=object),
+        index=[tzaware_time]
+        if using_infer_string
+        else pd.Index([tzaware_time], dtype=object),
columns=[0, 1],
dtype=float,
)