Skip to content

Commit 3362822

Browse files
authored Dec 17, 2024
String dtype (2.3.x): avoid downcasting object to string in fillna/where/interpolate (#60183)
1 parent 6d9a2b4 commit 3362822

File tree

7 files changed

+57
-58
lines changed

7 files changed

+57
-58
lines changed
 

‎pandas/_libs/lib.pyi

+3
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def maybe_convert_objects(
8686
safe: bool = ...,
8787
convert_numeric: bool = ...,
8888
convert_non_numeric: Literal[False] = ...,
89+
convert_string: Literal[False] = ...,
8990
convert_to_nullable_dtype: Literal[False] = ...,
9091
dtype_if_all_nat: DtypeObj | None = ...,
9192
) -> npt.NDArray[np.object_ | np.number]: ...
@@ -97,6 +98,7 @@ def maybe_convert_objects(
9798
safe: bool = ...,
9899
convert_numeric: bool = ...,
99100
convert_non_numeric: bool = ...,
101+
convert_string: bool = ...,
100102
convert_to_nullable_dtype: Literal[True] = ...,
101103
dtype_if_all_nat: DtypeObj | None = ...,
102104
) -> ArrayLike: ...
@@ -108,6 +110,7 @@ def maybe_convert_objects(
108110
safe: bool = ...,
109111
convert_numeric: bool = ...,
110112
convert_non_numeric: bool = ...,
113+
convert_string: bool = ...,
111114
convert_to_nullable_dtype: bool = ...,
112115
dtype_if_all_nat: DtypeObj | None = ...,
113116
) -> ArrayLike: ...

‎pandas/_libs/lib.pyx

+6 -1
Original file line numberDiff line numberDiff line change
@@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
24982498
bint convert_numeric=True, # NB: different default!
24992499
bint convert_to_nullable_dtype=False,
25002500
bint convert_non_numeric=False,
2501+
bint convert_string=True,
25012502
object dtype_if_all_nat=None) -> "ArrayLike":
25022503
"""
25032504
Type inference function-- convert object array to proper dtype
@@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects,
27472748
dtype = StringDtype()
27482749
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
27492750

2750-
elif using_string_dtype() and is_string_array(objects, skipna=True):
2751+
elif (
2752+
convert_string
2753+
and using_string_dtype()
2754+
and is_string_array(objects, skipna=True)
2755+
):
27512756
from pandas.core.arrays.string_ import StringDtype
27522757

27532758
dtype = StringDtype(na_value=np.nan)

‎pandas/core/internals/blocks.py

+33 -5
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,12 @@ def _maybe_downcast(
563563
return blocks
564564

565565
nbs = extend_blocks(
566-
[blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
566+
[
567+
blk.convert(
568+
using_cow=using_cow, copy=not using_cow, convert_string=False
569+
)
570+
for blk in blocks
571+
]
567572
)
568573
if caller == "fillna":
569574
if len(nbs) != len(blocks) or not all(
@@ -636,6 +641,7 @@ def convert(
636641
*,
637642
copy: bool = True,
638643
using_cow: bool = False,
644+
convert_string: bool = True,
639645
) -> list[Block]:
640646
"""
641647
Attempt to coerce any object types to better types. Return a copy
@@ -648,7 +654,10 @@ def convert(
648654

649655
if self.ndim != 1 and self.shape[0] != 1:
650656
blocks = self.split_and_operate(
651-
Block.convert, copy=copy, using_cow=using_cow
657+
Block.convert,
658+
copy=copy,
659+
using_cow=using_cow,
660+
convert_string=convert_string,
652661
)
653662
if all(blk.dtype.kind == "O" for blk in blocks):
654663
# Avoid fragmenting the block if convert is a no-op
@@ -666,6 +675,7 @@ def convert(
666675
res_values = lib.maybe_convert_objects(
667676
values, # type: ignore[arg-type]
668677
convert_non_numeric=True,
678+
convert_string=convert_string,
669679
)
670680
refs = None
671681
if (
@@ -851,6 +861,7 @@ def replace(
851861
mask: npt.NDArray[np.bool_] | None = None,
852862
using_cow: bool = False,
853863
already_warned=None,
864+
convert_string=None,
854865
) -> list[Block]:
855866
"""
856867
replace the to_replace value with value, possible to create new
@@ -915,7 +926,11 @@ def replace(
915926
if get_option("future.no_silent_downcasting") is True:
916927
blocks = [blk]
917928
else:
918-
blocks = blk.convert(copy=False, using_cow=using_cow)
929+
blocks = blk.convert(
930+
copy=False,
931+
using_cow=using_cow,
932+
convert_string=convert_string or self.dtype != _dtype_obj,
933+
)
919934
if len(blocks) > 1 or blocks[0].dtype != blk.dtype:
920935
warnings.warn(
921936
# GH#54710
@@ -944,6 +959,7 @@ def replace(
944959
inplace=True,
945960
mask=mask,
946961
using_cow=using_cow,
962+
convert_string=convert_string,
947963
)
948964

949965
else:
@@ -958,6 +974,7 @@ def replace(
958974
inplace=True,
959975
mask=mask[i : i + 1],
960976
using_cow=using_cow,
977+
convert_string=convert_string,
961978
)
962979
)
963980
return blocks
@@ -970,6 +987,7 @@ def _replace_regex(
970987
inplace: bool = False,
971988
mask=None,
972989
using_cow: bool = False,
990+
convert_string: bool = True,
973991
already_warned=None,
974992
) -> list[Block]:
975993
"""
@@ -1029,7 +1047,9 @@ def _replace_regex(
10291047
)
10301048
already_warned.warned_already = True
10311049

1032-
nbs = block.convert(copy=False, using_cow=using_cow)
1050+
nbs = block.convert(
1051+
copy=False, using_cow=using_cow, convert_string=convert_string
1052+
)
10331053
opt = get_option("future.no_silent_downcasting")
10341054
if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
10351055
warnings.warn(
@@ -1068,6 +1088,8 @@ def replace_list(
10681088
values._replace(to_replace=src_list, value=dest_list, inplace=True)
10691089
return [blk]
10701090

1091+
convert_string = self.dtype != _dtype_obj
1092+
10711093
# Exclude anything that we know we won't contain
10721094
pairs = [
10731095
(x, y)
@@ -1152,6 +1174,7 @@ def replace_list(
11521174
inplace=inplace,
11531175
regex=regex,
11541176
using_cow=using_cow,
1177+
convert_string=convert_string,
11551178
)
11561179

11571180
if using_cow and i != src_len:
@@ -1174,7 +1197,9 @@ def replace_list(
11741197
nbs = []
11751198
for res_blk in result:
11761199
converted = res_blk.convert(
1177-
copy=True and not using_cow, using_cow=using_cow
1200+
copy=True and not using_cow,
1201+
using_cow=using_cow,
1202+
convert_string=convert_string,
11781203
)
11791204
if len(converted) > 1 or converted[0].dtype != res_blk.dtype:
11801205
warnings.warn(
@@ -1204,6 +1229,7 @@ def _replace_coerce(
12041229
inplace: bool = True,
12051230
regex: bool = False,
12061231
using_cow: bool = False,
1232+
convert_string: bool = True,
12071233
) -> list[Block]:
12081234
"""
12091235
Replace value corresponding to the given boolean array with another
@@ -1233,6 +1259,7 @@ def _replace_coerce(
12331259
inplace=inplace,
12341260
mask=mask,
12351261
using_cow=using_cow,
1262+
convert_string=convert_string,
12361263
)
12371264
else:
12381265
if value is None:
@@ -1256,6 +1283,7 @@ def _replace_coerce(
12561283
inplace=inplace,
12571284
mask=mask,
12581285
using_cow=using_cow,
1286+
convert_string=convert_string,
12591287
)
12601288

12611289
# ---------------------------------------------------------------------

‎pandas/tests/frame/methods/test_fillna.py

+5 -16
Original file line numberDiff line numberDiff line change
@@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string):
132132
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
133133
)
134134

135-
if using_infer_string:
136-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
137-
result = df.fillna({2: "foo"})
138-
else:
139-
result = df.fillna({2: "foo"})
135+
result = df.fillna({2: "foo"})
140136
expected = DataFrame(
141137
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
142138
)
139+
expected[2] = expected[2].astype("object")
143140
tm.assert_frame_equal(result, expected)
144141

145-
if using_infer_string:
146-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
147-
return_value = df.fillna({2: "foo"}, inplace=True)
148-
else:
149-
return_value = df.fillna({2: "foo"}, inplace=True)
142+
return_value = df.fillna({2: "foo"}, inplace=True)
150143
tm.assert_frame_equal(df, expected)
151144
assert return_value is None
152145

@@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string):
385378

386379
# empty block
387380
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
388-
if using_infer_string:
389-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
390-
result = df.fillna("nan")
391-
else:
392-
result = df.fillna("nan")
393-
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
381+
result = df.fillna("nan")
382+
expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object)
394383
tm.assert_frame_equal(result, expected)
395384

396385
@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])

‎pandas/tests/frame/methods/test_replace.py

+5 -32
Original file line numberDiff line numberDiff line change
@@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc):
281281
tm.assert_frame_equal(res3, expec)
282282
tm.assert_frame_equal(res4, expec)
283283

284-
def test_regex_replace_dict_nested_non_first_character(
285-
self, any_string_dtype, using_infer_string
286-
):
284+
def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
287285
# GH 25259
288286
dtype = any_string_dtype
289287
df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
290-
if using_infer_string and any_string_dtype == "object":
291-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
292-
result = df.replace({"a": "."}, regex=True)
293-
expected = DataFrame({"first": [".bc", "bc.", "c.b"]})
294-
295-
else:
296-
result = df.replace({"a": "."}, regex=True)
297-
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
288+
result = df.replace({"a": "."}, regex=True)
289+
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
298290
tm.assert_frame_equal(result, expected)
299291

300292
def test_regex_replace_dict_nested_gh4115(self):
@@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar):
429421
],
430422
)
431423
def test_regex_replace_string_types(
432-
self,
433-
data,
434-
to_replace,
435-
expected,
436-
frame_or_series,
437-
any_string_dtype,
438-
using_infer_string,
439-
request,
424+
self, data, to_replace, expected, frame_or_series, any_string_dtype
440425
):
441426
# GH-41333, GH-35977
442427
dtype = any_string_dtype
443428
obj = frame_or_series(data, dtype=dtype)
444-
if using_infer_string and any_string_dtype == "object":
445-
if len(to_replace) > 1 and isinstance(obj, DataFrame):
446-
request.node.add_marker(
447-
pytest.mark.xfail(
448-
reason="object input array that gets downcasted raises on "
449-
"second pass"
450-
)
451-
)
452-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
453-
result = obj.replace(to_replace, regex=True)
454-
dtype = "str"
455-
else:
456-
result = obj.replace(to_replace, regex=True)
429+
result = obj.replace(to_replace, regex=True)
457430
expected = frame_or_series(expected, dtype=dtype)
458431

459432
tm.assert_equal(result, expected)

‎pandas/tests/indexing/test_coercion.py

+5 -1
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key):
831831
raise ValueError
832832
return replacer
833833

834-
def test_replace_series(self, how, to_key, from_key, replacer):
834+
def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string):
835835
index = pd.Index([3, 4], name="xxx")
836836
obj = pd.Series(self.rep[from_key], index=index, name="yyy")
837837
obj = obj.astype(from_key)
@@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer):
856856
else:
857857
exp = pd.Series(self.rep[to_key], index=index, name="yyy")
858858

859+
if using_infer_string and exp.dtype == "string" and obj.dtype == object:
860+
# with infer_string, we disable the deprecated downcasting behavior
861+
exp = exp.astype(object)
862+
859863
msg = "Downcasting behavior in `replace`"
860864
warn = FutureWarning
861865
if (

‎pandas/tests/series/methods/test_replace.py

-3
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
import pandas as pd
97
import pandas._testing as tm
108
from pandas.core.arrays import IntervalArray
@@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val):
768766
expected = pd.Series([1, None], dtype=object)
769767
tm.assert_series_equal(result, expected)
770768

771-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
772769
def test_replace_change_dtype_series(self):
773770
# GH#25797
774771
df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object)

0 commit comments

Comments
 (0)
Please sign in to comment.