Skip to content

Commit

Permalink
API: ignore empty range/object dtype in Index setop operations (string dtype compat) (#60797)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Feb 17, 2025
1 parent 6bcd303 commit ee06e71
Show file tree
Hide file tree
Showing 26 changed files with 170 additions and 102 deletions.
10 changes: 10 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ These are bug fixes that might have notable behavior changes.
notable_bug_fix1
^^^^^^^^^^^^^^^^

.. _whatsnew_230.api_changes:

API changes
~~~~~~~~~~~

- When enabling the ``future.infer_string`` option: Index set operations (like
union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or
empty ``Index`` with object dtype when determining the dtype of the resulting
Index (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_230.deprecations:

Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,9 @@ Other API changes
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
- when comparing the indexes in :func:`testing.assert_series_equal`, ``check_exact`` defaults to ``True`` if an :class:`Index` is of integer dtype. (:issue:`57386`)
- Index set operations (like union or intersection) will now ignore the dtype of
an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining
the dtype of the resulting Index (:issue:`60797`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.deprecations:
Expand Down
31 changes: 30 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_string_dtype,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
"""
target_dtype, _ = infer_dtype_from(target)

if using_string_dtype():
# special case: if left or right is a zero-length RangeIndex or
# Index[object], those can be created by the default empty constructors
# -> for that case ignore this dtype and always return the other
# (https://github.com/pandas-dev/pandas/pull/60797)
from pandas.core.indexes.range import RangeIndex

if len(self) == 0 and (
isinstance(self, RangeIndex) or self.dtype == np.object_
):
return target_dtype
if (
isinstance(target, Index)
and len(target) == 0
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
):
return self.dtype

# special case: if one dtype is uint64 and the other a signed int, return object
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
# Now it's:
Expand Down Expand Up @@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index:

arr = self._values

if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
# special case: if we are an empty object-dtype Index, also
# take into account the inserted item for the resulting dtype
# (https://github.com/pandas-dev/pandas/pull/60797)
dtype = self._find_common_type_compat(item)
if dtype != self.dtype:
return self.astype(dtype).insert(loc, item)

try:
if isinstance(arr, ExtensionArray):
res_values = arr.insert(loc, item)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/dtypes/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
_concat.concat_compat([arr[:2], arr[2:]], axis=1)


def test_concat_series_between_empty_and_tzaware_series():
def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
ser2 = Series(dtype=float)
Expand All @@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
data=[
(0.0, None),
],
index=pd.Index([tzaware_time], dtype=object),
index=[tzaware_time]
if using_infer_string
else pd.Index([tzaware_time], dtype=object),
columns=[0, 1],
dtype=float,
)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -44,7 +42,6 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
7 changes: 1 addition & 6 deletions pandas/tests/frame/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,7 @@ def test_26395(indexer_al):
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame(
{"D": [0, 0, 2]},
index=["A", "B", "C"],
columns=pd.Index(["D"], dtype=object),
dtype=np.int64,
)
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

with pytest.raises(TypeError, match="Invalid value"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
result = df.dtypes
expected = Series(
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
index=Index(list("ABCDEFGH"), dtype=object),
index=list("ABCDEFGH"),
)
tm.assert_series_equal(result, expected)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ def test_insert_with_columns_dups(self):
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
exp = DataFrame(
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
columns=Index(["A", "A", "A"], dtype=object),
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
)
tm.assert_frame_equal(df, exp)

Expand Down
32 changes: 22 additions & 10 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,18 +144,32 @@ def test_setitem_different_dtype(self):
)
tm.assert_series_equal(result, expected)

def test_setitem_empty_columns(self):
# GH 13522
def test_setitem_overwrite_index(self):
# GH 13522 - assign the index as a column and then overwrite the values
# -> should not affect the index
df = DataFrame(index=["A", "B", "C"])
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(
data={"X": ["x", "y", "z"]},
index=["A", "B", "C"],
columns=Index(["X"], dtype=object),
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
)
tm.assert_frame_equal(df, exp)

def test_setitem_empty_columns(self):
# Starting from an empty DataFrame and setting a column should result
# in a default string dtype for the columns' Index
# https://github.com/pandas-dev/pandas/issues/60338

df = DataFrame()
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

df = DataFrame(columns=Index([]))
df["foo"] = [1, 2, 3]
expected = DataFrame({"foo": [1, 2, 3]})
tm.assert_frame_equal(df, expected)

def test_setitem_dt64_index_empty_columns(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
df = DataFrame(index=np.arange(len(rng)))
Expand All @@ -169,9 +183,7 @@ def test_setitem_timestamp_empty_columns(self):
df["now"] = Timestamp("20130101", tz="UTC")

expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3,
index=range(3),
columns=Index(["now"], dtype=object),
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
)
tm.assert_frame_equal(df, expected)

Expand Down Expand Up @@ -210,7 +222,7 @@ def test_setitem_period_preserves_dtype(self):
result = DataFrame([])
result["a"] = data

expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
expected = DataFrame({"a": data}, columns=["a"])

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -930,7 +942,7 @@ def test_setitem_scalars_no_index(self):
# GH#16823 / GH#17894
df = DataFrame()
df["foo"] = 1
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
expected = DataFrame(columns=["foo"]).astype(np.int64)
tm.assert_frame_equal(df, expected)

def test_setitem_newcol_tuple_key(self, float_frame):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/frame/methods/test_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
with pytest.raises(TypeError, match="supplying multiple axes"):
inp.dropna(how="all", axis=(0, 1), inplace=True)

def test_dropna_tz_aware_datetime(self, using_infer_string):
def test_dropna_tz_aware_datetime(self):
# GH13407

df = DataFrame()
if using_infer_string:
df.columns = df.columns.astype("str")
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
df["Time"] = [dt1]
Expand Down
34 changes: 31 additions & 3 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
tm.assert_frame_equal(res, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
@pytest.mark.parametrize(
"array, dtype",
[
Expand Down Expand Up @@ -781,3 +778,34 @@ def test_reset_index_false_index_name():
result_frame.reset_index()
expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
tm.assert_frame_equal(result_frame, expected_frame)


@pytest.mark.parametrize("columns", [None, Index([])])
def test_reset_index_with_empty_frame(columns):
    """Check ``reset_index`` on frames that start without any columns.

    Currently an empty DataFrame has a RangeIndex or object dtype Index for
    its columns, but when resetting the index we still want to end up with
    the default string dtype for the resulting columns.
    https://github.com/pandas-dev/pandas/issues/60338
    """
    # (index to reset, frame expected after reset_index)
    scenarios = [
        # empty flat index
        (Index([], name="foo"), DataFrame(columns=["foo"])),
        # non-empty flat index
        (Index([1, 2, 3], name="foo"), DataFrame({"foo": [1, 2, 3]})),
        # empty MultiIndex
        (
            MultiIndex.from_tuples([], names=["foo", "bar"]),
            DataFrame(columns=["foo", "bar"]),
        ),
        # non-empty MultiIndex
        (
            MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"]),
            DataFrame({"foo": [1, 2], "bar": [2, 3]}),
        ),
    ]

    for index, expected in scenarios:
        result = DataFrame(index=index, columns=columns).reset_index()
        tm.assert_frame_equal(result, expected)
3 changes: 0 additions & 3 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from numpy.ma import mrecords
import pytest

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
Expand Down Expand Up @@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self):
df = DataFrame({"value": dr})
assert str(df.iat[0, 0].tz) == "US/Eastern"

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_constructor_with_datetimes5(self):
# GH 7822
# preserve an index with a tz on dict construction
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
tm.assert_frame_equal(result, expected)

expected = DataFrame(df_index)
expected.columns = expected.columns.astype(object)
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/base_class/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_insert(self):

# test empty
null_index = Index([])
tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))

def test_insert_missing(self, nulls_fixture, using_infer_string):
# GH#22295
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/indexes/base_class/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
Expand All @@ -251,7 +250,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
expected = Index(vals, name=expected_name, dtype=expected_dtype)
expected = Index(vals, name=expected_name)
tm.assert_index_equal(union.sort_values(), expected.sort_values())

@pytest.mark.parametrize(
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/indexes/datetimes/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
assert isinstance(result, DatetimeIndex)
assert result.tz is timezone.utc

def test_datetimeindex_union_join_empty(self, sort):
def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
empty = Index([])

result = dti.union(empty, sort=sort)
expected = dti.astype("O")
tm.assert_index_equal(result, expected)
if using_infer_string:
assert isinstance(result, DatetimeIndex)
tm.assert_index_equal(result, dti)
else:
expected = dti.astype("O")
tm.assert_index_equal(result, expected)

result = dti.join(empty)
assert isinstance(result, DatetimeIndex)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string):
else:
msg = "slice indices must be integers or None or have an __index__ method"

if using_infer_string and (
index.dtype == "string" or index.dtype == "category"
):
msg = "loc must be an integer between"
if using_infer_string:
if index.dtype == "string" or index.dtype == "category":
msg = "loc must be an integer between"
elif index.dtype == "object" and len(index) == 0:
msg = "loc must be an integer between"
err = TypeError

with pytest.raises(err, match=msg):
index.insert(0.5, "foo")
Expand Down
Loading

0 comments on commit ee06e71

Please sign in to comment.