Backport of string changes for 2.3 release - part 2 #60013

Merged

34 commits
faed77b
String dtype: fix pyarrow-based IO + update tests (#59478)
jorisvandenbossche Aug 22, 2024
ac9bf9c
REF (string): avoid copy in StringArray factorize (#59551)
jbrockmendel Aug 22, 2024
8386795
String dtype: avoid surfacing pyarrow exception in binary operations …
jorisvandenbossche Aug 27, 2024
5783aa4
DOC: Add whatsnew for 2.3.0 (#59625)
jorisvandenbossche Aug 27, 2024
1833ccb
BUG (string): str.replace with negative n (#59628)
jbrockmendel Aug 27, 2024
62b474b
TST (string): fix xfailed groupby value_counts tests (#59632)
jbrockmendel Aug 28, 2024
972369f
REF (string): rename result converter methods (#59626)
jbrockmendel Aug 28, 2024
b350a97
TST (string) fix xfailed groupby tests (3) (#59642)
jbrockmendel Aug 28, 2024
3121121
REF (string): de-duplicate str_endswith, startswith (#59568)
jbrockmendel Aug 29, 2024
866a7f6
DEPR (string): non-bool na for obj.str.contains (#59615)
jbrockmendel Aug 31, 2024
b313cf5
TST (string dtype): fix and clean up arrow roundtrip tests (#59678)
jorisvandenbossche Sep 2, 2024
449a094
API (string): str.center with pyarrow-backed string dtype (#59624)
jbrockmendel Sep 2, 2024
63dbe97
REF (string): de-duplicate str_isfoo methods (#59705)
jbrockmendel Sep 4, 2024
2f4af6b
TST (string): copy/view tests (#59702)
jbrockmendel Sep 4, 2024
c807def
TST (string): more targeted xfails in test_string.py (#59703)
jbrockmendel Sep 4, 2024
553780a
REF (string): de-duplicate _str_contains (#59709)
jbrockmendel Sep 5, 2024
44325c1
BUG (string): ArrowStringArray.find corner cases (#59562)
jbrockmendel Sep 6, 2024
ccb90e3
String dtype: implement _get_common_dtype (#59682)
jorisvandenbossche Sep 6, 2024
79dd74d
TST/BUG (string dtype): Fix and adjust indexes string tests (#59544)
phofl Sep 9, 2024
743c682
TST (string dtype): Adjust indexing string tests (#59541)
phofl Sep 9, 2024
bf47ce6
TST (string dtype): adjust pandas/tests/reshape tests (#59762)
jorisvandenbossche Sep 9, 2024
74c6fac
BUG (string dtype): fix inplace mutation with copy=False in ensure_st…
jorisvandenbossche Sep 9, 2024
ca24b42
TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#5…
jorisvandenbossche Sep 9, 2024
418f890
BUG (string): Series.str.slice with negative step (#59724)
jbrockmendel Sep 10, 2024
26a0d56
String dtype: remove fallback Perfomance warnings for string methods …
jorisvandenbossche Sep 10, 2024
c8eadfd
REF (string): de-duplicate ArrowStringArray methods (#59555)
jbrockmendel Sep 11, 2024
37886a6
BUG/API (string dtype): return float dtype for series[str].rank() (#5…
jorisvandenbossche Sep 12, 2024
532b9a1
String dtype: fix isin() values handling for python storage (#59759)
jorisvandenbossche Sep 12, 2024
4ff2c68
String dtype: allow string dtype in query/eval with default numexpr e…
jorisvandenbossche Sep 16, 2024
2789338
String dtype: map builtin str alias to StringDtype (#59685)
jorisvandenbossche Sep 25, 2024
53ac224
String dtype: allow string dtype for non-raw apply with numba engine …
jorisvandenbossche Sep 25, 2024
ed78032
fixup rank test
jorisvandenbossche Oct 10, 2024
581582b
update tests
jorisvandenbossche Oct 10, 2024
1d1e3da
fix linting
jorisvandenbossche Oct 10, 2024
8 changes: 8 additions & 0 deletions doc/source/whatsnew/index.rst
@@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details,
see the `commit logs <https://github.com/pandas-dev/pandas/commits/>`_. For install and
upgrade instructions, see :ref:`install`.

Version 2.3
-----------

.. toctree::
:maxdepth: 2

v2.3.0

Version 2.2
-----------

180 changes: 180 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
@@ -0,0 +1,180 @@
.. _whatsnew_230:

What's new in 2.3.0 (Month XX, 2024)
------------------------------------

These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog
including other versions of pandas.

{{ header }}

.. ---------------------------------------------------------------------------

.. _whatsnew_230.upcoming_changes:

Upcoming changes in pandas 3.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


.. _whatsnew_230.enhancements:

Enhancements
~~~~~~~~~~~~

.. _whatsnew_230.enhancements.enhancement1:

enhancement1
^^^^^^^^^^^^


.. _whatsnew_230.enhancements.other:

Other enhancements
^^^^^^^^^^^^^^^^^^

-
-

.. ---------------------------------------------------------------------------
.. _whatsnew_230.notable_bug_fixes:

Notable bug fixes
~~~~~~~~~~~~~~~~~

These are bug fixes that might have notable behavior changes.

.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1:

notable_bug_fix1
^^^^^^^^^^^^^^^^

.. ---------------------------------------------------------------------------
.. _whatsnew_230.deprecations:

Deprecations
~~~~~~~~~~~~
- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_230.performance:

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
-
-

.. ---------------------------------------------------------------------------
.. _whatsnew_230.bug_fixes:

Bug fixes
~~~~~~~~~

Categorical
^^^^^^^^^^^
-
-

Datetimelike
^^^^^^^^^^^^
-
-

Timedelta
^^^^^^^^^
-
-

Timezones
^^^^^^^^^
-
-

Numeric
^^^^^^^
-
-

Conversion
^^^^^^^^^^
-
-

Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-

Interval
^^^^^^^^
-
-

Indexing
^^^^^^^^
-
-

Missing
^^^^^^^
-
-

MultiIndex
^^^^^^^^^^
-
-

I/O
^^^
-
-

Period
^^^^^^
-
-

Plotting
^^^^^^^^
-
-

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
-
-

Reshaping
^^^^^^^^^
-
-

Sparse
^^^^^^
-
-

ExtensionArray
^^^^^^^^^^^^^^
-
-

Styler
^^^^^^
-
-

Other
^^^^^
-
-

.. ---------------------------------------------------------------------------
.. _whatsnew_230.contributors:

Contributors
~~~~~~~~~~~~
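
The string-related bug fixes listed above align pyarrow-backed behavior with Python's own string semantics. As a hedged reference, the sketch below shows those semantics using only Python builtins — it does not exercise pandas itself, only the behavior the fixes are matching:

```python
# Pure-Python reference semantics for the fixes listed under "Strings".

# str.replace with an explicit count replaces only the first n matches;
# a negative count behaves like an unlimited replace.
assert "aaaa".replace("a", "b", 2) == "bbaa"
assert "aaaa".replace("a", "b", -1) == "bbbb"

# Slicing with a negative step walks the string backwards.
assert "abcdef"[::-2] == "fdb"

# str.center with an odd amount of padding is asymmetric; corner cases
# like this are where the pyarrow-backed `center` previously diverged.
assert "ab".center(5) == "  ab "
```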
4 changes: 4 additions & 0 deletions pandas/_libs/arrays.pyx
@@ -67,6 +67,10 @@ cdef class NDArrayBacked:
"""
Construct a new ExtensionArray `new_array` with `arr` as its _ndarray.

The returned array has the same dtype as self.

Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`.

This should round-trip:
self == self._from_backing_data(self._ndarray)
"""
5 changes: 4 additions & 1 deletion pandas/_libs/hashtable.pyx
@@ -33,7 +33,10 @@ from pandas._libs.khash cimport (
kh_python_hash_func,
khiter_t,
)
from pandas._libs.missing cimport checknull
from pandas._libs.missing cimport (
checknull,
is_matching_na,
)


def get_hashtable_trace_domain():
18 changes: 15 additions & 3 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -1121,11 +1121,13 @@ cdef class StringHashTable(HashTable):
const char **vecs
khiter_t k
bint use_na_value
bint non_null_na_value

if return_inverse:
labels = np.zeros(n, dtype=np.intp)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None
non_null_na_value = not checknull(na_value)

# assign pointers and pre-filter out missing (if ignore_na)
vecs = <const char **>malloc(n * sizeof(char *))
@@ -1134,7 +1136,12 @@ cdef class StringHashTable(HashTable):

if (ignore_na
and (not isinstance(val, str)
or (use_na_value and val == na_value))):
or (use_na_value and (
(non_null_na_value and val == na_value) or
(not non_null_na_value and is_matching_na(val, na_value)))
)
)
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), we can skip the actual value, and
# replace the label with na_sentinel directly
@@ -1400,18 +1407,23 @@ cdef class PyObjectHashTable(HashTable):
object val
khiter_t k
bint use_na_value

bint non_null_na_value
if return_inverse:
labels = np.empty(n, dtype=np.intp)
use_na_value = na_value is not None
non_null_na_value = not checknull(na_value)

for i in range(n):
val = values[i]
hash(val)

if ignore_na and (
checknull(val)
or (use_na_value and val == na_value)
or (use_na_value and (
(non_null_na_value and val == na_value) or
(not non_null_na_value and is_matching_na(val, na_value))
)
)
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them, and
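
The hashtable hunks above replace a plain `val == na_value` comparison with an `is_matching_na` check whenever the `na_value` is itself a null. A minimal Python sketch of why that is needed — `matches_na_value` here is a hypothetical, simplified stand-in for the Cython helper:

```python
import math

def matches_na_value(val, na_value):
    # Hypothetical simplified stand-in for is_matching_na: plain
    # `val == na_value` can never match NaN, because nan != nan.
    if isinstance(na_value, float) and math.isnan(na_value):
        return isinstance(val, float) and math.isnan(val)
    return val == na_value

nan = float("nan")
assert nan != nan                   # why the old equality check missed NaN
assert matches_na_value(nan, nan)   # the NA-aware check does match it
assert not matches_na_value("x", nan)
```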
28 changes: 22 additions & 6 deletions pandas/_libs/lib.pyx
@@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
Whether to ensure that a new array is returned. When True, a new array
is always returned. When False, a new array is only returned when needed
to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
@@ -753,7 +755,14 @@

if hasattr(arr, "to_numpy"):

if hasattr(arr, "dtype") and arr.dtype.kind in "mM":
if (
hasattr(arr, "dtype")
and arr.dtype.kind in "mM"
# TODO: we should add a custom ArrowExtensionArray.astype implementation
# that handles astype(str) specifically, avoiding ending up here and
# then we can remove the below check for `_pa_array` (for ArrowEA)
and not hasattr(arr, "_pa_array")
):
# dtype check to exclude DataFrame
# GH#41409 TODO: not a great place for this
out = arr.astype(str).astype(object)
@@ -765,10 +774,17 @@

result = np.asarray(arr, dtype="object")

if copy and (result is arr or np.shares_memory(arr, result)):
# GH#54654
result = result.copy()
elif not copy and result is arr:
if result is arr or np.may_share_memory(arr, result):
# if np.asarray(..) did not make a copy of the input arr, we still need
# to do that to avoid mutating the input array
# GH#54654: share_memory check is needed for rare cases where np.asarray
# returns a new object without making a copy of the actual data
if copy:
result = result.copy()
else:
already_copied = False
elif not copy and not result.flags.writeable:
# Weird edge case where result is a view
already_copied = False

if issubclass(arr.dtype.type, np.str_):
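
The `ensure_string_array` hunk above guards against `np.asarray` handing back the caller's own array, which would make the in-place stringification mutate the input. A small NumPy-only sketch of that aliasing hazard and the defensive copy (this illustrates the general mechanism, not pandas' exact code path):

```python
import numpy as np

arr = np.array(["a", "b"], dtype=object)

# np.asarray with a matching dtype returns the input itself, not a copy.
view = np.asarray(arr, dtype="object")
assert view is arr

# Writing through the result would mutate the caller's array, so a
# defensive copy is taken before values are modified in place.
result = view.copy() if (view is arr or np.may_share_memory(arr, view)) else view
result[0] = "z"
assert arr[0] == "a"  # the original input is left untouched
```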
2 changes: 1 addition & 1 deletion pandas/_testing/__init__.py
@@ -112,7 +112,7 @@

COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
if using_string_dtype():
STRING_DTYPES: list[Dtype] = [str, "U"]
STRING_DTYPES: list[Dtype] = ["U"]
else:
STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef]
COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
37 changes: 36 additions & 1 deletion pandas/conftest.py
@@ -1228,6 +1228,34 @@ def string_dtype(request):
return request.param


@pytest.fixture(
params=[
("python", pd.NA),
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
("python", np.nan),
],
ids=[
"string=string[python]",
"string=string[pyarrow]",
"string=str[pyarrow]",
"string=str[python]",
],
)
def string_dtype_no_object(request):
"""
Parametrized fixture for string dtypes.
* 'string[python]' (NA variant)
* 'string[pyarrow]' (NA variant)
* 'str' (NaN variant, with pyarrow)
* 'str' (NaN variant, without pyarrow)
"""
# need to instantiate the StringDtype here instead of in the params
# to avoid importing pyarrow during test collection
storage, na_value = request.param
return pd.StringDtype(storage, na_value)


@pytest.fixture(
params=[
"string[python]",
@@ -1266,7 +1294,13 @@ def string_storage(request):
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
("python", np.nan),
]
],
ids=[
"string=string[python]",
"string=string[pyarrow]",
"string=str[pyarrow]",
"string=str[python]",
],
)
def string_dtype_arguments(request):
"""
@@ -1297,6 +1331,7 @@ def dtype_backend(request):

# Alias so we can test with cartesian product of string_storage
string_storage2 = string_storage
string_dtype_arguments2 = string_dtype_arguments


@pytest.fixture(params=tm.BYTES_DTYPES)
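
The conftest hunks above pair each fixture param with a readable `ids` entry and defer dtype construction into the fixture body. A generic sketch of that pattern — the names below are illustrative only, and the real fixture builds `pd.StringDtype` objects lazily to avoid importing pyarrow at collection time:

```python
import pytest

# Parallel params/ids lists: each (storage, na_value) pair gets a readable
# test id instead of the default tuple repr. Names are illustrative only.
PARAMS = [
    ("python", None),
    ("pyarrow", None),
    ("pyarrow", float("nan")),
    ("python", float("nan")),
]
IDS = [
    "string=string[python]",
    "string=string[pyarrow]",
    "string=str[pyarrow]",
    "string=str[python]",
]

@pytest.fixture(params=PARAMS, ids=IDS)
def string_dtype_args(request):
    # Construct heavyweight objects here, not in `params`, so optional
    # dependencies are only imported when a test actually runs.
    storage, na_value = request.param
    return storage, na_value
```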
3 changes: 2 additions & 1 deletion pandas/core/_numba/extensions.py
@@ -49,7 +49,8 @@
@contextmanager
def set_numba_data(index: Index):
numba_data = index._data
if numba_data.dtype == object:
if numba_data.dtype in (object, "string"):
numba_data = np.asarray(numba_data)
if not lib.is_string_array(numba_data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
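
The `extensions.py` change above materializes string-dtype index values to a NumPy array before the column-name check, so non-object backings can still be validated. A hedged sketch of that guard — `check_column_names` is a hypothetical simplification, not pandas' actual helper:

```python
import numpy as np

def check_column_names(values):
    # Hypothetical simplification of the guard above: convert to a plain
    # object ndarray first so a non-object backing (e.g. string dtype)
    # can still be validated element by element.
    arr = np.asarray(values, dtype=object)
    if not all(isinstance(v, str) for v in arr):
        raise ValueError(
            "The numba engine only supports using string or numeric column names"
        )
    return arr

check_column_names(np.array(["a", "b"], dtype=object))  # passes the check
```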