Commit
Merge branch 'main' into enh_cumsum_for_np_str
rhshadrach authored Feb 18, 2025
2 parents 2fd9779 + d4dff29 commit 85093a6
Showing 43 changed files with 272 additions and 198 deletions.
doc/source/development/contributing_gitpod.rst (2 changes: 1 addition & 1 deletion)

@@ -109,7 +109,7 @@ development experience:

* `VSCode rst extension <https://marketplace.visualstudio.com/items?itemName=lextudio.restructuredtext>`_
* `Markdown All in One <https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one>`_
-* `VSCode Gitlens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
+* `VSCode GitLens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
* `VSCode Git Graph extension <https://marketplace.visualstudio.com/items?itemName=mhutchie.git-graph>`_

Development workflow with Gitpod
doc/source/whatsnew/v2.3.0.rst (13 changes: 12 additions & 1 deletion)

@@ -37,7 +37,8 @@ Other enhancements
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`, :issue:`60633`)
+- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)

.. ---------------------------------------------------------------------------
@@ -53,6 +54,16 @@ These are bug fixes that might have notable behavior changes.
notable_bug_fix1
^^^^^^^^^^^^^^^^

+.. _whatsnew_230.api_changes:
+
+API changes
+~~~~~~~~~~~
+
+- When enabling the ``future.infer_string`` option: Index set operations (like
+  union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or
+  empty ``Index`` with object dtype when determining the dtype of the resulting
+  Index (:issue:`60797`)
+
.. ---------------------------------------------------------------------------
.. _whatsnew_230.deprecations:

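For context on the reduction entries above, a minimal sketch of the new ``StringDtype`` behavior (illustrative values; assumes pandas 2.3 with these enhancements):

```python
import pandas as pd

# Cumulative reductions and sum on a StringDtype column (GH 60633, GH 59853).
s = pd.Series(["b", "a", "c"], dtype="string")

print(list(s.cumsum()))   # ["b", "ba", "bac"]  -- cumulative concatenation
print(list(s.cummin()))   # ["b", "a", "a"]     -- lexicographic running minimum
print(list(s.cummax()))   # ["b", "b", "c"]     -- lexicographic running maximum
print(s.sum())            # "bac"               -- concatenation of all elements
```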
doc/source/whatsnew/v3.0.0.rst (4 changes: 4 additions & 0 deletions)

@@ -361,6 +361,9 @@ Other API changes
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
- when comparing the indexes in :func:`testing.assert_series_equal`, check_exact defaults to True if an :class:`Index` is of integer dtypes. (:issue:`57386`)
+- Index set operations (like union or intersection) will now ignore the dtype of
+  an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining
+  the dtype of the resulting Index (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.deprecations:
@@ -667,6 +670,7 @@ Conversion
- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
- Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
- Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
+- Bug in :meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes` removing timezone information for objects with :class:`ArrowDtype` (:issue:`60237`)
- Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)

Strings
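To make the set-operation entry concrete, a minimal sketch of the documented change (a sketch under assumed pandas 3.0 defaults, where a list of strings infers a string dtype; comments show expected, not captured, output):

```python
import pandas as pd

# The default empty constructor gives an object-dtype Index; under the
# change above (GH 60797) it no longer forces union/intersection results
# to object dtype.
empty = pd.Index([])
strings = pd.Index(["a", "b"])  # inferred string dtype

print(empty.union(strings).dtype)  # expected: str, no longer object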
pandas/_libs/algos.pyx (28 changes: 1 addition & 27 deletions)

@@ -818,33 +818,7 @@ def is_monotonic(const numeric_object_t[:] arr, bint timelike):
    if timelike and <int64_t>arr[0] == NPY_NAT:
        return False, False, False

-    if numeric_object_t is not object:
-        with nogil:
-            prev = arr[0]
-            for i in range(1, n):
-                cur = arr[i]
-                if timelike and <int64_t>cur == NPY_NAT:
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                if cur < prev:
-                    is_monotonic_inc = 0
-                elif cur > prev:
-                    is_monotonic_dec = 0
-                elif cur == prev:
-                    is_unique = 0
-                else:
-                    # cur or prev is NaN
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                if not is_monotonic_inc and not is_monotonic_dec:
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                prev = cur
-    else:
-        # object-dtype, identical to above except we cannot use `with nogil`
+    with nogil(numeric_object_t is not object):
        prev = arr[0]
        for i in range(1, n):
            cur = arr[i]
pandas/_libs/hashtable_func_helper.pxi.in (15 changes: 1 addition & 14 deletions)

@@ -415,20 +415,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):

    modes = np.empty(nkeys, dtype=values.dtype)

-    if htfunc_t is not object:
-        with nogil:
-            for k in range(nkeys):
-                count = counts[k]
-                if count == max_count:
-                    j += 1
-                elif count > max_count:
-                    max_count = count
-                    j = 0
-                else:
-                    continue
-
-                modes[j] = keys[k]
-    else:
+    with nogil(htfunc_t is not object):
        for k in range(nkeys):
            count = counts[k]
            if count == max_count:
pandas/_libs/internals.pyx (6 changes: 3 additions & 3 deletions)

@@ -502,7 +502,7 @@ def get_concat_blkno_indexers(list blknos_list not None):
@cython.boundscheck(False)
@cython.wraparound(False)
def get_blkno_indexers(
-    int64_t[:] blknos, bint group=True
+    const int64_t[:] blknos, bint group=True
) -> list[tuple[int, slice | np.ndarray]]:
"""
Enumerate contiguous runs of integers in ndarray.
@@ -596,8 +596,8 @@ def get_blkno_placements(blknos, group: bool = True):
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef update_blklocs_and_blknos(
-    ndarray[intp_t, ndim=1] blklocs,
-    ndarray[intp_t, ndim=1] blknos,
+    const intp_t[:] blklocs,
+    const intp_t[:] blknos,
    Py_ssize_t loc,
    intp_t nblocks,
):
pandas/_libs/join.pyx (17 changes: 10 additions & 7 deletions)

@@ -225,7 +225,10 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,

@cython.wraparound(False)
@cython.boundscheck(False)
-cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) noexcept nogil:
+cdef void _get_result_indexer(
+    const intp_t[::1] sorter,
+    intp_t[::1] indexer,
+) noexcept nogil:
"""NOTE: overwrites indexer with the result to avoid allocating another array"""
cdef:
Py_ssize_t i, n, idx
@@ -681,8 +684,8 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
from pandas._libs.hashtable cimport Int64HashTable


-def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
-                                 ndarray[numeric_t] right_values,
+def asof_join_backward_on_X_by_Y(const numeric_t[:] left_values,
+                                 const numeric_t[:] right_values,
                                 const int64_t[:] left_by_values,
                                 const int64_t[:] right_by_values,
                                 bint allow_exact_matches=True,
@@ -752,8 +755,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
return left_indexer, right_indexer


-def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
-                                ndarray[numeric_t] right_values,
+def asof_join_forward_on_X_by_Y(const numeric_t[:] left_values,
+                                const numeric_t[:] right_values,
                                const int64_t[:] left_by_values,
                                const int64_t[:] right_by_values,
                                bint allow_exact_matches=1,
@@ -824,8 +827,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
return left_indexer, right_indexer


-def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
-                                ndarray[numeric_t] right_values,
+def asof_join_nearest_on_X_by_Y(const numeric_t[:] left_values,
+                                const numeric_t[:] right_values,
                                const int64_t[:] left_by_values,
                                const int64_t[:] right_by_values,
                                bint allow_exact_matches=True,
pandas/_libs/lib.pyx (6 changes: 2 additions & 4 deletions)

@@ -981,16 +981,14 @@ def get_level_sorter(

@cython.boundscheck(False)
@cython.wraparound(False)
-def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
+def count_level_2d(const uint8_t[:, :] mask,
                   const intp_t[:] labels,
                   Py_ssize_t max_bin,
                   ):
    cdef:
-        Py_ssize_t i, j, k, n
+        Py_ssize_t i, j, k = mask.shape[1], n = mask.shape[0]
        ndarray[int64_t, ndim=2] counts

-    n, k = (<object>mask).shape
-
    counts = np.zeros((n, max_bin), dtype="i8")
    with nogil:
        for i in range(n):
pandas/_libs/reshape.pyx (22 changes: 1 addition & 21 deletions)

@@ -40,27 +40,7 @@ def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask,
    cdef:
        Py_ssize_t i, j, w, nulls, s, offset

-    if numeric_object_t is not object:
-        # evaluated at compile-time
-        with nogil:
-            for i in range(stride):
-
-                nulls = 0
-                for j in range(length):
-
-                    for w in range(width):
-
-                        offset = j * width + w
-
-                        if mask[offset]:
-                            s = i * width + w
-                            new_values[j, s] = values[offset - nulls, i]
-                            new_mask[j, s] = 1
-                        else:
-                            nulls += 1
-
-    else:
-        # object-dtype, identical to above but we cannot use nogil
+    with nogil(numeric_object_t is not object):
        for i in range(stride):

            nulls = 0
pandas/core/dtypes/cast.py (2 changes: 1 addition & 1 deletion)

@@ -1113,7 +1113,7 @@ def convert_dtypes(
else:
inferred_dtype = input_array.dtype

if dtype_backend == "pyarrow":
if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype):
from pandas.core.arrays.arrow.array import to_pyarrow_type
from pandas.core.arrays.string_ import StringDtype

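A short sketch of the behavior this added ``ArrowDtype`` guard fixes (GH 60237); assumes pyarrow is installed, and the comments show expected output:

```python
import pandas as pd

# A timezone-aware column converted to the pyarrow backend...
ser = pd.Series(pd.date_range("2020-01-01", periods=2, tz="US/Pacific"))
converted = ser.convert_dtypes(dtype_backend="pyarrow")
print(converted.dtype)  # timestamp[ns, tz=US/Pacific][pyarrow]

# ...must keep its timezone on a repeated call: the guard above skips
# re-inference for values that already have an ArrowDtype.
print(converted.convert_dtypes(dtype_backend="pyarrow").dtype)
```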
pandas/core/indexes/base.py (31 changes: 30 additions & 1 deletion)

@@ -19,7 +19,10 @@

import numpy as np

-from pandas._config import get_option
+from pandas._config import (
+    get_option,
+    using_string_dtype,
+)

from pandas._libs import (
NaT,
@@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

+        if using_string_dtype():
+            # special case: if left or right is a zero-length RangeIndex or
+            # Index[object], those can be created by the default empty constructors
+            # -> for that case ignore this dtype and always return the other
+            # (https://github.com/pandas-dev/pandas/pull/60797)
+            from pandas.core.indexes.range import RangeIndex
+
+            if len(self) == 0 and (
+                isinstance(self, RangeIndex) or self.dtype == np.object_
+            ):
+                return target_dtype
+            if (
+                isinstance(target, Index)
+                and len(target) == 0
+                and (isinstance(target, RangeIndex) or target_dtype == np.object_)
+            ):
+                return self.dtype
+
# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
@@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index:

arr = self._values

+        if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
+            # special case: if we are an empty object-dtype Index, also
+            # take into account the inserted item for the resulting dtype
+            # (https://github.com/pandas-dev/pandas/pull/60797)
+            dtype = self._find_common_type_compat(item)
+            if dtype != self.dtype:
+                return self.astype(dtype).insert(loc, item)
+
try:
if isinstance(arr, ExtensionArray):
res_values = arr.insert(loc, item)
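A sketch of how the two special cases above surface through the public API (assumes pandas 2.3 with the option enabled; comments are expectations, not captured output):

```python
import pandas as pd

pd.set_option("future.infer_string", True)

# A zero-length object-dtype Index, as produced by the default empty
# constructor, no longer pins set-operation results to object dtype.
empty = pd.Index([])
print(empty.union(pd.Index(["a", "b"])).dtype)  # expected: str, not object

# insert() on an empty object-dtype Index likewise re-infers the dtype
# from the inserted item via _find_common_type_compat.
print(pd.Index([]).insert(0, "a").dtype)        # expected: str
```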
pandas/core/strings/accessor.py (16 changes: 14 additions & 2 deletions)

@@ -34,6 +34,7 @@
is_numeric_dtype,
is_object_dtype,
is_re,
+    is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
@@ -2102,7 +2103,9 @@ def slice_replace(self, start=None, stop=None, repl=None):
result = self._data.array._str_slice_replace(start, stop, repl)
return self._wrap_result(result)

-    def decode(self, encoding, errors: str = "strict"):
+    def decode(
+        self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None
+    ):
"""
Decode character string in the Series/Index using indicated encoding.
@@ -2116,6 +2119,12 @@ def decode(self, encoding, errors: str = "strict"):
errors : str, optional
Specifies the error handling scheme.
Possible values are those supported by :meth:`bytes.decode`.
+        dtype : str or dtype, optional
+            The dtype of the result. When not ``None``, must be either a string or
+            object dtype. When ``None``, the dtype of the result is determined by
+            ``pd.options.future.infer_string``.
+
+            .. versionadded:: 2.3.0
Returns
-------
@@ -2137,6 +2146,10 @@ def decode(self, encoding, errors: str = "strict"):
2 ()
dtype: object
"""
+        if dtype is not None and not is_string_dtype(dtype):
+            raise ValueError(f"dtype must be string or object, got {dtype=}")
+        if dtype is None and get_option("future.infer_string"):
+            dtype = "str"
# TODO: Add a similar _bytes interface.
if encoding in _cpython_optimized_decoders:
# CPython optimized implementation
@@ -2146,7 +2159,6 @@ def decode(self, encoding, errors: str = "strict"):
f = lambda x: decoder(x, errors)[0]
arr = self._data.array
result = arr._str_map(f)
dtype = "str" if get_option("future.infer_string") else None
return self._wrap_result(result, dtype=dtype)

@forbid_nonstring_types(["bytes"])
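A short sketch of the new ``dtype`` argument and the validation added above (assumes pandas 2.3; the error text mirrors the f-string in the diff):

```python
import pandas as pd

ser = pd.Series([b"cow", b"123", b"()"])

# Without dtype, the result dtype still follows future.infer_string.
print(ser.str.decode("ascii").dtype)  # object unless the option is enabled

# The new argument requests a string result explicitly.
print(ser.str.decode("ascii", dtype=pd.StringDtype()).dtype)  # string

# Anything that is not a string/object dtype is rejected up front.
try:
    ser.str.decode("ascii", dtype="int64")
except ValueError as err:
    print(err)  # dtype must be string or object, got dtype='int64'
```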
pandas/io/pytables.py (17 changes: 15 additions & 2 deletions)

@@ -4159,6 +4159,8 @@ def _create_axes(
ordered = data_converted.ordered
meta = "category"
metadata = np.asarray(data_converted.categories).ravel()
+            elif isinstance(blk.dtype, StringDtype):
+                meta = str(blk.dtype)

data, dtype_name = _get_data_and_dtype_name(data_converted)

@@ -4419,7 +4421,8 @@ def read_column(
errors=self.errors,
)
cvs = col_values[1]
-                return Series(cvs, name=column, copy=False)
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                return Series(cvs, name=column, copy=False, dtype=dtype)

raise KeyError(f"column [{column}] not found in the table")

@@ -4769,8 +4772,18 @@ def read(
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
if not (using_string_dtype() and values.dtype.kind == "O"):
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)

+            # If str / string dtype is stored in meta, use that.
+            converted = False
+            for column in cols_:
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                if dtype in ["str", "string"]:
+                    df[column] = df[column].astype(dtype)
+                    converted = True
+            # Otherwise try inference.
            if (
-                using_string_dtype()
+                not converted
+                and using_string_dtype()
                and isinstance(values, np.ndarray)
                and is_string_array(
                    values,
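A minimal sketch of the round-trip that the stored ``_meta`` attribute enables (assumes pandas 2.3 with PyTables installed; the file name is illustrative):

```python
import pandas as pd

# Writing a StringDtype column records "string" in the column's _meta
# attribute; reading restores the dtype instead of falling back to object.
df = pd.DataFrame({"a": pd.array(["x", "y"], dtype="string")})
df.to_hdf("example.h5", key="df", format="table")

roundtripped = pd.read_hdf("example.h5", key="df")
print(roundtripped["a"].dtype)  # string, not object
```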
pandas/tests/dtypes/test_concat.py (6 changes: 4 additions & 2 deletions)

@@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
_concat.concat_compat([arr[:2], arr[2:]], axis=1)


-def test_concat_series_between_empty_and_tzaware_series():
+def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
ser2 = Series(dtype=float)
@@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
data=[
(0.0, None),
],
-        index=pd.Index([tzaware_time], dtype=object),
+        index=[tzaware_time]
+        if using_infer_string
+        else pd.Index([tzaware_time], dtype=object),
columns=[0, 1],
dtype=float,
)