diff --git a/pandas/conftest.py b/pandas/conftest.py index 35a0d3b89400f..168ef5fed8fae 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2022,6 +2022,14 @@ def warn_copy_on_write() -> bool: ) +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if infer_string is enabled. + """ + return pd.options.future.infer_string + + warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 53ac7fbf40af1..618f69eb744e3 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -586,13 +586,15 @@ def test_strftime_dt64_days(self): # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) - def test_strftime_period_days(self): + def test_strftime_period_days(self, using_infer_string): period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], dtype="=U10", ) + if using_infer_string: + expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_strftime_dt64_microsecond_resolution(self): diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index af6b3910baec0..3d1082c3d040b 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,19 +31,16 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self): + def test_delitem_object_index(self, using_infer_string): # Index(dtype=object) - s = Series(1, index=["a"]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + s = Series(1, index=Index(["a"], dtype=dtype)) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=["a"])) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 479e74703bc0e..596a225c288b8 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -71,7 +71,7 @@ def test_getitem_unrecognized_scalar(self): def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" warn_msg = "Series.__getitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -363,7 +363,9 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = r"None of \[Index\(\['C'\], dtype='object'\)\] are in the \[index\]" + msg = ( + r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): ser[key] @@ -437,7 +439,7 @@ def test_getitem_boolean_empty(self): # GH#5877 # indexing with empty series - ser = Series(["A", "B"]) + ser = Series(["A", "B"], dtype=object) expected = Series(dtype=object, index=Index([], dtype="int64")) result = ser[Series([], dtype=object)] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 16c127e6ece7b..02d51d5119469 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -2,6 +2,7 @@ date, datetime, ) +from decimal import Decimal import numpy as np import pytest @@ -175,7 +176,8 @@ class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + # string index falls back to positional + msg = "index -11|-1 is out of bounds for axis 0 with size 10" warn_msg = "Series.__setitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -527,8 +529,12 @@ def test_setitem_empty_series_timestamp_preserves_dtype(self): Timedelta("9 days").to_pytimedelta(), ], ) - def test_append_timedelta_does_not_cast(self, td): + def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): # GH#22717 inserting a Timedelta should _not_ cast to int64 + if using_infer_string and not isinstance(td, Timedelta): + # TODO: GH#56010 + request.applymarker(pytest.mark.xfail(reason="inferred as string")) + expected = Series(["x", td], index=[0, "td"], dtype=object) ser = Series(["x"]) @@ -595,13 +601,21 @@ def test_setitem_enlarge_with_na( expected = Series(expected_values, dtype=target_dtype) tm.assert_series_equal(ser, expected) - def test_setitem_enlargement_object_none(self, nulls_fixture): + def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 ser = Series(["a", "b"]) ser[3] = nulls_fixture - expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3]) + dtype = ( + "string[pyarrow_numpy]" + if using_infer_string and not isinstance(nulls_fixture, Decimal) + else object + ) + expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype) tm.assert_series_equal(ser, expected) - assert ser[3] is nulls_fixture + if using_infer_string: + ser[3] is np.nan + else: + assert ser[3] is nulls_fixture def test_setitem_scalar_into_readonly_backing_data(): @@ -845,20 +859,28 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val): + def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).where(~mask, val) + else: + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val): + def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).putmask(mask, val) + else: + res = Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 7c1507ce423ad..c978481ca9988 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -230,6 +232,7 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 7d4ad99deab38..122e1927bebe9 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -75,7 +75,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc") + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -163,10 +163,12 @@ def test_astype_empty_constructor_equality(self, dtype): Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), ], ) - def test_astype_str_map(self, dtype, series): + def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 result = series.astype(dtype) expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -276,13 +278,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"]) + expected = Series(["2010-01-04"], dtype=object) tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"]) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -291,7 +293,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"]) + expected = Series(["1 days"], dtype=object) tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -338,7 +340,7 @@ def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"]) + expected = Series(["0.1"], dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -409,7 +411,7 @@ def test_astype_cast_object_int(self): tm.assert_series_equal(result, Series(np.arange(1, 5))) - def test_astype_unicode(self): + def test_astype_unicode(self, using_infer_string): # see GH#7758: A bit of magic is required to set # default encoding to utf-8 digits = string.digits @@ -426,12 +428,14 @@ def test_astype_unicode(self): item = "野菜食べないとやばい" ser = Series([item.encode()]) result = ser.astype(np.str_) - expected = Series([item]) + expected = Series([item], dtype=object) tm.assert_series_equal(result, expected) for ser in test_series: res = ser.astype(np.str_) expec = ser.map(str) + if using_infer_string: + expec = expec.astype(object) tm.assert_series_equal(res, expec) # Restore the former encoding @@ -527,12 +531,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object dtype to float64" + msg = r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 47659308cfcad..795b2eab82aca 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -53,7 +53,7 @@ def test_combine_first(self): # mixed types index = tm.makeStringIndex(20) floats = Series(np.random.default_rng(2).standard_normal(20), index=index) - strings = Series(tm.makeStringIndex(10), index=index[::2]) + strings = Series(tm.makeStringIndex(10), index=index[::2], dtype=object) combined = strings.combine_first(floats) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 0cd39140938c4..2b1bb4c31125f 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -186,6 +186,7 @@ def test_convert_dtypes( self, test_cases, params, + using_infer_string, ): data, maindtype, expected_default, expected_other = test_cases if ( @@ -219,6 +220,16 @@ def test_convert_dtypes( for spec, dtype in expected_other.items(): if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): expected_dtype = dtype + if ( + using_infer_string + and expected_default == "string" + and expected_dtype == object + and params[0] + and not params[1] + ): + # If we would convert with convert strings then infer_objects converts + # with the option + expected_dtype = "string[pyarrow_numpy]" expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 6d78ecd61cdcb..f86f6069a2ef3 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -83,7 +83,7 @@ def func(x): tm.assert_series_equal(result, expected) -def test_map_series_stringdtype(any_string_dtype): +def test_map_series_stringdtype(any_string_dtype, using_infer_string): # map test on StringDType, GH#40823 ser1 = Series( data=["cat", "dog", "rabbit"], @@ -98,6 +98,8 @@ def test_map_series_stringdtype(any_string_dtype): item = np.nan expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) + if using_infer_string and any_string_dtype == "object": + expected = expected.astype("string[pyarrow_numpy]") tm.assert_series_equal(result, expected) @@ -106,7 +108,7 @@ def test_map_series_stringdtype(any_string_dtype): "data, expected_dtype", [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], ) -def test_map_categorical_with_nan_values(data, expected_dtype): +def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -114,6 +116,8 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") + if using_infer_string and expected_dtype == object: + expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -133,11 +137,15 @@ def test_map_empty_integer_series_with_datetime_index(): @pytest.mark.parametrize("func", [str, lambda x: str(x)]) -def test_map_simple_str_callables_same_as_astype(string_series, func): +def test_map_simple_str_callables_same_as_astype( + string_series, func, using_infer_string +): # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype(str) + expected = string_series.astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) @@ -461,7 +469,7 @@ def test_map_box_period(): @pytest.mark.parametrize("na_action", [None, "ignore"]) -def test_map_categorical(na_action): +def test_map_categorical(na_action, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) s = Series(values, name="XX", index=list("abcdefg")) @@ -474,7 +482,7 @@ def test_map_categorical(na_action): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object + assert result.dtype == object if not using_infer_string else "string" @pytest.mark.parametrize( @@ -536,12 +544,14 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp): +def test_map_missing_mixed(vals, mapping, exp, using_infer_string): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) - - tm.assert_series_equal(result, Series(exp)) + exp = Series(exp) + if using_infer_string and mapping == {np.nan: "not NaN"}: + exp.iloc[-1] = np.nan + tm.assert_series_equal(result, exp) def test_map_scalar_on_date_time_index_aware_series(): diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 0923a2d42ce10..6f0c8d751a92a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -22,6 +24,9 @@ import pandas._testing as tm +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" +) def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 83adff08b758e..119654bd19b3f 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -8,6 +8,7 @@ Index, MultiIndex, Series, + array, ) import pandas._testing as tm @@ -45,22 +46,28 @@ def test_rename_by_series(self): expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") tm.assert_series_equal(result, expected) - def test_rename_set_name(self): + def test_rename_set_name(self, using_infer_string): ser = Series(range(4), index=list("abcd")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: result = ser.rename(name) assert result.name == name - tm.assert_numpy_array_equal(result.index.values, ser.index.values) + if using_infer_string: + tm.assert_extension_array_equal(result.index.values, ser.index.values) + else: + tm.assert_numpy_array_equal(result.index.values, ser.index.values) assert ser.name is None - def test_rename_set_name_inplace(self): + def test_rename_set_name_inplace(self, using_infer_string): ser = Series(range(3), index=list("abc")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: ser.rename(name, inplace=True) assert ser.name == name - exp = np.array(["a", "b", "c"], dtype=np.object_) - tm.assert_numpy_array_equal(ser.index.values, exp) + if using_infer_string: + exp = array(exp, dtype="string[pyarrow_numpy]") + tm.assert_extension_array_equal(ser.index.values, exp) + else: + tm.assert_numpy_array_equal(ser.index.values, exp) def test_rename_axis_supported(self): # Supporting axis for compatibility, detailed in GH-18589 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f08966c3816c0..fe0f79b766f72 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -389,6 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -719,6 +722,7 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 @@ -748,10 +752,12 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self): + def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - df["Test"] = df["Test"].replace([True], [np.nan]) + warn = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warn, match="Downcasting"): + df["Test"] = df["Test"].replace([True], [np.nan]) expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index db36221d8f510..bb5ad8cc5a25a 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -166,12 +166,20 @@ def test_reset_index_inplace_and_drop_ignore_name(self): ), ], ) -def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_series_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes + exp = "string" if using_infer_string else object expected = Series( - {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object} + { + "level_0": np.int64, + "level_1": np.float64, + "level_2": exp if dtype == object else dtype, + 0: object, + } ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 76ca05a60eb7a..1c17013d621c7 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -165,7 +165,7 @@ def test_to_csv_compression(self, s, encoding, compression): pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), ) - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) @@ -175,6 +175,8 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = s.copy() - expected.index = expected.index.astype(str) - + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index c38b2400f0f4e..3745c045078b7 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -38,6 +38,7 @@ def test_update(self, using_copy_on_write): expected = DataFrame( [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] ) + expected["c"] = expected["c"].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b835be6d8e501..e7233f005e427 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -204,9 +204,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting" + msg = "not all arguments converted during string formatting|mod not" - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, NotImplementedError), match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -491,14 +491,27 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - def test_comparisons(self): + def test_comparisons(self, using_infer_string): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + # TODO(3.0) GH56008 + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s == s2 + with tm.assert_produces_warning( + DeprecationWarning, match="comparison", check_stacklevel=False + ): + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s2 == s + else: + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 195c50969ffae..dd503827ba812 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -152,7 +152,7 @@ def test_scalar_extension_dtype(self, ea_scalar_and_dtype): assert ser.dtype == ea_dtype tm.assert_series_equal(ser, expected) - def test_constructor(self, datetime_series): + def test_constructor(self, datetime_series, using_infer_string): empty_series = Series() assert datetime_series.index._is_all_dates @@ -166,7 +166,7 @@ def test_constructor(self, datetime_series): # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ + assert mixed.dtype == np.object_ if not using_infer_string else "string" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -197,7 +197,7 @@ def test_constructor_index_ndim_gt_1_raises(self): Series([1, 3, 2], index=df) @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) - def test_constructor_empty(self, input_class): + def test_constructor_empty(self, input_class, using_infer_string): empty = Series() empty2 = Series(input_class()) @@ -228,7 +228,10 @@ def test_constructor_empty(self, input_class): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) - empty2 = Series("", index=range(3)) + if using_infer_string: + empty2 = Series("", index=range(3), dtype=object) + else: + empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) @@ -1439,7 +1442,7 @@ def test_constructor_dict_of_tuples(self): # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") - def test_fromDict(self): + def test_fromDict(self, using_infer_string): data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) @@ -1451,19 +1454,19 @@ def test_fromDict(self): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ + assert series.dtype == np.object_ if not using_infer_string else "string" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 - def test_fromValue(self, datetime_series): + def test_fromValue(self, datetime_series, using_infer_string): nans = Series(np.nan, index=datetime_series.index, dtype=np.float64) assert nans.dtype == np.float64 assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ + assert strings.dtype == np.object_ if not using_infer_string else "string" assert len(strings) == len(datetime_series) d = datetime.now() diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 25b34351627a1..040b1186980b2 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -142,6 +144,9 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) assert "\t" not in repr(ser) @@ -301,7 +306,7 @@ def __repr__(self) -> str: repr(ser) str(ser) - def test_categorical_repr(self): + def test_categorical_repr(self, using_infer_string): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" @@ -311,22 +316,38 @@ def test_categorical_repr(self): assert exp == a.__str__() a = Series(Categorical(["a", "b"] * 25)) - exp = ( - "0 a\n1 b\n" - " ..\n" - "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" - ) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, string): [a, b]" + ) + else: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" + ) with option_context("display.max_rows", 5): assert exp == repr(a) levs = list("abcdefghijklmnopqrstuvwxyz") a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) - exp = ( - "0 a\n1 b\n" - "dtype: category\n" - "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']" - ) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, string): [a < b < c < d ... w < x < y < z]" + ) + else: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... " + "'w' < 'x' < 'y' < 'z']" + ) assert exp == a.__str__() def test_categorical_series_repr(self): diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 2146e154dc7fa..166f52181fed4 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -146,7 +146,7 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_object(self): + def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -155,8 +155,14 @@ def test_logical_operators_int_dtype_with_object(self): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + s_0123 & s_abNd + else: + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -354,7 +360,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - def test_logical_ops_label_based(self): + def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -422,7 +428,17 @@ def test_logical_ops_label_based(self): tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - result = a[a | e] + warn = FutureWarning if using_infer_string else None + if using_infer_string: + import pyarrow as pa + + with tm.assert_produces_warning(warn, match="Operation between non"): + with pytest.raises( + pa.lib.ArrowNotImplementedError, match="has no kernel" + ): + result = a[a | e] + else: + result = a[a | e] tm.assert_series_equal(result, a[a]) # vs scalars diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index fbdf843a998bb..78a2819a49fc9 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -131,17 +131,22 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(using_array_manager): +def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric" + if using_infer_string: + msg = "does not support" + with pytest.raises(TypeError, match=msg): + ser.sum() + else: + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric|does not support" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric" + msg = r"Could not convert \['12'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() @@ -152,29 +157,30 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric" + msg = r"Could not convert \['J'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric" + msg = "Could not convert string 'J' to numeric|does not support" with pytest.raises(TypeError, match=msg): df["db"].mean() + msg = "Could not convert string 'J' to numeric|ufunc 'divide'" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median()