From 888d5cb3e07cdac27178570677192b31c1f1aeae Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 Nov 2024 15:14:10 +0000 Subject: [PATCH 01/10] test_startswith_endswith_validate_na --- pandas/tests/strings/test_find_replace.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 34a6377b5786f..30e6ebf0eed13 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -293,23 +293,12 @@ def test_startswith_endswith_validate_na(any_string_dtype): dtype=any_string_dtype, ) - dtype = ser.dtype - if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): - msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.startswith("kapow", na="baz") - msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser.str.endswith("bar", na="baz") - else: - # TODO(infer_string): don't surface pyarrow errors - import pyarrow as pa - - msg = "Could not convert 'baz' with type str: tried to convert to boolean" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.startswith("kapow", na="baz") - with pytest.raises(pa.lib.ArrowInvalid, match=msg): - ser.str.endswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.startswith("kapow", na="baz") + msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser.str.endswith("bar", na="baz") @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) From 181c7e5d5ff795f7941492d34aa5e2220cfe020a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 Nov 2024 16:50:37 +0000 Subject: [PATCH 02/10] test_pivot_index_is_none --- pandas/tests/reshape/test_pivot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d8a9acdc561fd..580842e919667 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2682,15 +2682,15 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_index_is_none(self): # GH#48293 - df = DataFrame({None: [1], "b": 2, "c": 3}) + df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object")) result = df.pivot(columns="b", index=None) expected = DataFrame({("c", 2): 3}, index=[1]) + expected.columns = expected.columns.set_levels( + expected.columns.levels[0].astype(object), level=0 + ) expected.columns.names = [None, "b"] tm.assert_frame_equal(result, expected) From 2cad97c4c3b0d6e71fe02ccb91d9493664169183 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 Nov 2024 18:39:42 +0000 Subject: [PATCH 03/10] test_pivot_columns_is_none --- pandas/tests/reshape/test_pivot.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 580842e919667..f51ee1d9f8014 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2664,18 +2664,21 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_columns_is_none(self): # GH#48293 - df = DataFrame({None: [1], "b": 2, "c": 3}) + df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object")) result = df.pivot(columns=None) expected = DataFrame({("b", 1): [2], ("c", 1): 3}) + expected.columns = expected.columns.set_levels( + expected.columns.levels[0].astype(object), level=0 + ) tm.assert_frame_equal(result, expected) result = df.pivot(columns=None, index="b") expected = DataFrame({("c", 1): 3}, index=Index([2], name="b")) + expected.columns = expected.columns.set_levels( + expected.columns.levels[0].astype(object), level=0 + ) tm.assert_frame_equal(result, expected) result = df.pivot(columns=None, index="b", values="c") From fc39d866865c009ea71af454b0fc661bcbbf4c8e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 Nov 2024 18:48:27 +0000 Subject: [PATCH 04/10] test_pivot_values_is_none --- pandas/tests/reshape/test_pivot.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f51ee1d9f8014..774e8c49e5168 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -2701,12 +2699,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_values_is_none(self): # GH#48293 - df = DataFrame({None: [1], "b": 2, "c": 3}) + df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object")) result = df.pivot(columns="b", index="c", values=None) expected = DataFrame( From 4aa160e21d126a623b7d8154b8cd664cf454efc3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 Nov 2024 22:35:08 +0000 Subject: [PATCH 05/10] test_get_dummies_with_str_dtype --- pandas/conftest.py | 4 + pandas/tests/strings/test_get_dummies.py | 99 +++++++++++++++++------- 2 files changed, 76 insertions(+), 27 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a..d09eec1b04660 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1435,6 +1435,10 @@ def any_string_dtype(request): return pd.StringDtype(storage, na_value) +# Generate cartesian product of any_string_dtype: +any_string_dtype2 = any_string_dtype + + @pytest.fixture(params=tm.DATETIME64_DTYPES) def datetime64_dtype(request): """ diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 3b989e284ca25..56569ac522ae8 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas import ( @@ -98,30 +96,77 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_get_dummies_with_str_dtype(any_string_dtype): +@pytest.mark.parametrize("use_string_repr", [True, False]) +def test_get_dummies_with_any_string_dtype( + request, any_string_dtype, any_string_dtype2, use_string_repr, using_infer_string +): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype=str) - expected = DataFrame( - [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], - columns=list("abc"), - dtype=str, - ) - tm.assert_frame_equal(result, expected) - - -# GH#47872 -@td.skip_if_no("pyarrow") -def test_get_dummies_with_pa_str_dtype(any_string_dtype): - s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - result = s.str.get_dummies("|", dtype="str[pyarrow]") - expected = DataFrame( - [ - ["true", "true", "false"], - ["true", "false", "true"], - ["false", "false", "false"], - ], - columns=list("abc"), - dtype="str[pyarrow]", - ) + test_ids = request.node.callspec.id.split("-") + series_dtype_id = test_ids[0][7:] + expected_dtype_id = test_ids[1][7:] + if expected_dtype_id == "object": + if "pyarrow" in series_dtype_id: + request.applymarker( + pytest.mark.xfail( + reason=("pyarrow.lib.ArrowTypeError: Expected integer, got bool"), + strict=True, + ) + ) + expected = DataFrame( + [ + [True, True, False], + [True, False, True], + [False, False, False], + ], + columns=list("abc"), + dtype=np.bool_, + ) + elif expected_dtype_id == "str[pyarrow]" and use_string_repr: + # data type 'str[pyarrow]' uses pandas.ArrowDtype instead + expected = DataFrame( + [ + ["true", "true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) + elif expected_dtype_id == "str[python]" and use_string_repr: + # data type 'str[python]' not understood" + expected_dtype_id = str + if using_infer_string: + expected = DataFrame( + [ + ["True", "True", "False"], + ["True", "False", "True"], + ["False", "False", "False"], + ], + columns=list("abc"), + dtype=expected_dtype_id, + ) + else: + expected = DataFrame( + [ + ["T", "T", "F"], + ["T", "F", "T"], + ["F", "F", "F"], + ], + columns=list("abc"), + dtype=expected_dtype_id, + ) + else: + expected = DataFrame( + [ + ["True", "True", "False"], + ["True", "False", "True"], + ["False", "False", "False"], + ], + columns=list("abc"), + dtype=any_string_dtype2, + ) + if use_string_repr: + result = s.str.get_dummies("|", dtype=expected_dtype_id) + else: + result = s.str.get_dummies("|", dtype=any_string_dtype2) tm.assert_frame_equal(result, expected) From 13a40ff6eb9cb64eae0099831821adfc59c2174d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 19 Nov 2024 14:06:03 +0000 Subject: [PATCH 06/10] test_with_prefix_default_category --- pandas/core/reshape/encoding.py | 12 +++++++++++- pandas/tests/reshape/test_from_dummies.py | 11 ++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 33ff182f5baee..7a81331e2b7a9 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -13,6 +13,10 @@ from pandas._libs import missing as libmissing from pandas._libs.sparse import IntIndex +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( is_integer_dtype, is_list_like, @@ -567,7 +571,13 @@ def from_dummies( ) else: data_slice = data_to_decode.loc[:, prefix_slice] - cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) + dtype = data.columns.dtype + if default_category: + default_category_dtype = infer_dtype_from_scalar(default_category[prefix])[ + 0 + ] + dtype = find_common_type([dtype, default_category_dtype]) + cats_array = data._constructor_sliced(cats, dtype=dtype) # get indices of True entries along axis=1 true_values = data_slice.idxmax(axis=1) indexer = data_slice.columns.get_indexer_for(true_values) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index da1930323f464..063bd22a0c511 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -330,14 +328,10 @@ def test_no_prefix_string_cats_contains_get_dummies_NaN_column(): ), ], ) -def test_no_prefix_string_cats_default_category( - default_category, expected, using_infer_string -): +def test_no_prefix_string_cats_default_category(default_category, expected): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) expected = DataFrame(expected) - if using_infer_string: - expected[""] = expected[""].astype("str") tm.assert_frame_equal(result, expected) @@ -364,7 +358,6 @@ def test_with_prefix_contains_get_dummies_NaN_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "default_category, expected", [ @@ -390,7 +383,7 @@ def test_with_prefix_contains_get_dummies_NaN_column(): ), pytest.param( {"col2": None, "col1": False}, - {"col1": ["a", "b", False], "col2": [None, "a", "c"]}, + {"col1": ["a", "b", False], "col2": Series([None, "a", "c"], dtype=object)}, id="default_category is a dict with bool and None values", ), pytest.param( From ce7cdb1bce00f1d2f404726a30659d8d02f1b85a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 19 Nov 2024 15:09:08 +0000 Subject: [PATCH 07/10] Revert "test_get_dummies_with_str_dtype" This reverts commit 4aa160e21d126a623b7d8154b8cd664cf454efc3. --- pandas/conftest.py | 4 - pandas/tests/strings/test_get_dummies.py | 99 +++++++----------------- 2 files changed, 27 insertions(+), 76 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d09eec1b04660..106518678df6a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1435,10 +1435,6 @@ def any_string_dtype(request): return pd.StringDtype(storage, na_value) -# Generate cartesian product of any_string_dtype: -any_string_dtype2 = any_string_dtype - - @pytest.fixture(params=tm.DATETIME64_DTYPES) def datetime64_dtype(request): """ diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 56569ac522ae8..3b989e284ca25 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -96,77 +98,30 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 -@pytest.mark.parametrize("use_string_repr", [True, False]) -def test_get_dummies_with_any_string_dtype( - request, any_string_dtype, any_string_dtype2, use_string_repr, using_infer_string -): +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) - test_ids = request.node.callspec.id.split("-") - series_dtype_id = test_ids[0][7:] - expected_dtype_id = test_ids[1][7:] - if expected_dtype_id == "object": - if "pyarrow" in series_dtype_id: - request.applymarker( - pytest.mark.xfail( - reason=("pyarrow.lib.ArrowTypeError: Expected integer, got bool"), - strict=True, - ) - ) - expected = DataFrame( - [ - [True, True, False], - [True, False, True], - [False, False, False], - ], - columns=list("abc"), - dtype=np.bool_, - ) - elif expected_dtype_id == "str[pyarrow]" and use_string_repr: - # data type 'str[pyarrow]' uses pandas.ArrowDtype instead - expected = DataFrame( - [ - ["true", "true", "false"], - ["true", "false", "true"], - ["false", "false", "false"], - ], - columns=list("abc"), - dtype="str[pyarrow]", - ) - elif expected_dtype_id == "str[python]" and use_string_repr: - # data type 'str[python]' not understood" - expected_dtype_id = str - if using_infer_string: - expected = DataFrame( - [ - ["True", "True", "False"], - ["True", "False", "True"], - ["False", "False", "False"], - ], - columns=list("abc"), - dtype=expected_dtype_id, - ) - else: - expected = DataFrame( - [ - ["T", "T", "F"], - ["T", "F", "T"], - ["F", "F", "F"], - ], - columns=list("abc"), - dtype=expected_dtype_id, - ) - else: - expected = DataFrame( - [ - ["True", "True", "False"], - ["True", "False", "True"], - ["False", "False", "False"], - ], - columns=list("abc"), - dtype=any_string_dtype2, - ) - if use_string_repr: - result = s.str.get_dummies("|", dtype=expected_dtype_id) - else: - result = s.str.get_dummies("|", dtype=any_string_dtype2) + result = s.str.get_dummies("|", dtype=str) + expected = DataFrame( + [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], + columns=list("abc"), + dtype=str, + ) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pa_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype="str[pyarrow]") + expected = DataFrame( + [ + ["true", "true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) tm.assert_frame_equal(result, expected) From 2e1d33c36ab6af60e20da432a4a5d5aad757c158 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 19 Nov 2024 15:09:56 +0000 Subject: [PATCH 08/10] Revert "test_pivot_values_is_none" This reverts commit fc39d866865c009ea71af454b0fc661bcbbf4c8e. --- pandas/tests/reshape/test_pivot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 774e8c49e5168..f51ee1d9f8014 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas.compat.numpy import np_version_gte1p25 import pandas as pd @@ -2699,9 +2701,12 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 - df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object")) + df = DataFrame({None: [1], "b": 2, "c": 3}) result = df.pivot(columns="b", index="c", values=None) expected = DataFrame( From eafca1cd6e360ddd30ea9a80d952bc90cfb4dc26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 19 Nov 2024 15:10:42 +0000 Subject: [PATCH 09/10] Revert "test_pivot_columns_is_none" This reverts commit 2cad97c4c3b0d6e71fe02ccb91d9493664169183. --- pandas/tests/reshape/test_pivot.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f51ee1d9f8014..580842e919667 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2664,21 +2664,18 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 - df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object")) + df = DataFrame({None: [1], "b": 2, "c": 3}) result = df.pivot(columns=None) expected = DataFrame({("b", 1): [2], ("c", 1): 3}) - expected.columns = expected.columns.set_levels( - expected.columns.levels[0].astype(object), level=0 - ) tm.assert_frame_equal(result, expected) result = df.pivot(columns=None, index="b") expected = DataFrame({("c", 1): 3}, index=Index([2], name="b")) - expected.columns = expected.columns.set_levels( - expected.columns.levels[0].astype(object), level=0 - ) tm.assert_frame_equal(result, expected) result = df.pivot(columns=None, index="b", values="c") From f44f64c5844fd9a7101fb5d866df3308a6b7e0f5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 19 Nov 2024 15:10:59 +0000 Subject: [PATCH 10/10] Revert "test_pivot_index_is_none" This reverts commit 181c7e5d5ff795f7941492d34aa5e2220cfe020a. --- pandas/tests/reshape/test_pivot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 580842e919667..d8a9acdc561fd 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2682,15 +2682,15 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 - df = DataFrame([[1, 2, 3]], columns=Index([None, "b", "c"], dtype="object")) + df = DataFrame({None: [1], "b": 2, "c": 3}) result = df.pivot(columns="b", index=None) expected = DataFrame({("c", 2): 3}, index=[1]) - expected.columns = expected.columns.set_levels( - expected.columns.levels[0].astype(object), level=0 - ) expected.columns.names = [None, "b"] tm.assert_frame_equal(result, expected)