Skip to content

Commit c079337

Browse files
[backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/copy_view (#60245) (#60257)
1 parent 75a1007 commit c079337

File tree

5 files changed

+51
-61
lines changed

5 files changed

+51
-61
lines changed

pandas/_testing/__init__.py

+9-19
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
TYPE_CHECKING,
99
Callable,
1010
ContextManager,
11-
cast,
1211
)
1312
import warnings
1413

@@ -23,8 +22,6 @@
2322

2423
from pandas.compat import pa_version_under10p1
2524

26-
from pandas.core.dtypes.common import is_string_dtype
27-
2825
import pandas as pd
2926
from pandas import (
3027
ArrowDtype,
@@ -83,8 +80,8 @@
8380
with_csv_dialect,
8481
)
8582
from pandas.core.arrays import (
83+
ArrowExtensionArray,
8684
BaseMaskedArray,
87-
ExtensionArray,
8885
NumpyExtensionArray,
8986
)
9087
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -96,7 +93,6 @@
9693
NpDtype,
9794
)
9895

99-
from pandas.core.arrays import ArrowExtensionArray
10096

10197
UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
10298
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool:
530526
if isinstance(left, pd.core.arrays.IntervalArray):
531527
return shares_memory(left._left, right) or shares_memory(left._right, right)
532528

533-
if (
534-
isinstance(left, ExtensionArray)
535-
and is_string_dtype(left.dtype)
536-
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
537-
):
538-
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
539-
left = cast("ArrowExtensionArray", left)
540-
if (
541-
isinstance(right, ExtensionArray)
542-
and is_string_dtype(right.dtype)
543-
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
544-
):
545-
right = cast("ArrowExtensionArray", right)
529+
if isinstance(left, ArrowExtensionArray):
530+
if isinstance(right, ArrowExtensionArray):
531+
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
546532
left_pa_data = left._pa_array
547533
right_pa_data = right._pa_array
548534
left_buf1 = left_pa_data.chunk(0).buffers()[1]
549535
right_buf1 = right_pa_data.chunk(0).buffers()[1]
550-
return left_buf1 == right_buf1
536+
return left_buf1.address == right_buf1.address
537+
else:
538+
# if we have one ArrowExtensionArray and one other array, assume
539+
# they can only share memory if they share the same numpy buffer
540+
return np.shares_memory(left, right)
551541

552542
if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
553543
# By convention, we'll say these share memory if they share *either*

pandas/tests/copy_view/test_astype.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas.compat import HAS_PYARROW
97
from pandas.compat.pyarrow import pa_version_under12p0
108
import pandas.util._test_decorators as td
@@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write):
244242
)
245243

246244

247-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
248245
def test_convert_dtypes_infer_objects(using_copy_on_write):
249246
ser = Series(["a", "b", "c"])
250247
ser_orig = ser.copy()
@@ -256,30 +253,35 @@ def test_convert_dtypes_infer_objects(using_copy_on_write):
256253
)
257254

258255
if using_copy_on_write:
259-
assert np.shares_memory(get_array(ser), get_array(result))
256+
assert tm.shares_memory(get_array(ser), get_array(result))
260257
else:
261258
assert not np.shares_memory(get_array(ser), get_array(result))
262259

263260
result.iloc[0] = "x"
264261
tm.assert_series_equal(ser, ser_orig)
265262

266263

267-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
268-
def test_convert_dtypes(using_copy_on_write):
264+
def test_convert_dtypes(using_copy_on_write, using_infer_string):
269265
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
270266
df_orig = df.copy()
271267
df2 = df.convert_dtypes()
272268

273269
if using_copy_on_write:
274-
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
275-
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
276-
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
277-
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
270+
if using_infer_string and HAS_PYARROW:
271+
# TODO the default nullable string dtype still uses python storage
272+
# this should be changed to pyarrow if installed
273+
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
274+
else:
275+
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
276+
assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
277+
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
278+
assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
278279
else:
279280
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
280281
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
281282
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
282283
assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
283284

284285
df2.iloc[0, 0] = "x"
286+
df2.iloc[0, 1] = 10
285287
tm.assert_frame_equal(df, df_orig)

pandas/tests/copy_view/test_functions.py

-1
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
201201
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
202202

203203

204-
# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
205204
@pytest.mark.parametrize(
206205
"func",
207206
[

pandas/tests/copy_view/test_methods.py

+24-19
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas.compat import HAS_PYARROW
75
from pandas.errors import SettingWithCopyWarning
86

@@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write):
953951
tm.assert_frame_equal(df, df_orig)
954952

955953

956-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
957-
def test_infer_objects(using_copy_on_write):
958-
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
954+
def test_infer_objects(using_copy_on_write, using_infer_string):
955+
df = DataFrame(
956+
{"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
957+
)
959958
df_orig = df.copy()
960959
df2 = df.infer_objects()
961960

962961
if using_copy_on_write:
963962
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
964-
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
963+
if using_infer_string:
964+
assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
965+
else:
966+
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
965967

966968
else:
967969
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
@@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write):
975977
tm.assert_frame_equal(df, df_orig)
976978

977979

978-
@pytest.mark.xfail(
979-
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
980-
)
981-
def test_infer_objects_no_reference(using_copy_on_write):
980+
def test_infer_objects_no_reference(using_copy_on_write, using_infer_string):
982981
df = DataFrame(
983982
{
984983
"a": [1, 2],
985-
"b": "c",
984+
"b": Series(["x", "y"], dtype=object),
986985
"c": 1,
987986
"d": Series(
988987
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
989988
),
990-
"e": "b",
989+
"e": Series(["z", "w"], dtype=object),
991990
}
992991
)
993992
df = df.infer_objects()
@@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write):
10011000
df.iloc[0, 3] = Timestamp("2018-12-31")
10021001
if using_copy_on_write:
10031002
assert np.shares_memory(arr_a, get_array(df, "a"))
1004-
# TODO(CoW): Block splitting causes references here
1005-
assert not np.shares_memory(arr_b, get_array(df, "b"))
1003+
if using_infer_string:
1004+
# note that the underlying memory of arr_b has been copied anyway
1005+
# because of the assignment, but the EA is updated inplace so still
1006+
# appears the share memory
1007+
assert tm.shares_memory(arr_b, get_array(df, "b"))
1008+
else:
1009+
# TODO(CoW): Block splitting causes references here
1010+
assert not np.shares_memory(arr_b, get_array(df, "b"))
10061011
assert np.shares_memory(arr_d, get_array(df, "d"))
10071012

10081013

1009-
def test_infer_objects_reference(using_copy_on_write):
1014+
def test_infer_objects_reference(using_copy_on_write, using_infer_string):
10101015
df = DataFrame(
10111016
{
10121017
"a": [1, 2],
1013-
"b": "c",
1018+
"b": Series(["x", "y"], dtype=object),
10141019
"c": 1,
10151020
"d": Series(
10161021
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
@@ -1029,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write):
10291034
df.iloc[0, 3] = Timestamp("2018-12-31")
10301035
if using_copy_on_write:
10311036
assert not np.shares_memory(arr_a, get_array(df, "a"))
1032-
assert not np.shares_memory(arr_b, get_array(df, "b"))
1037+
if not using_infer_string or HAS_PYARROW:
1038+
assert not np.shares_memory(arr_b, get_array(df, "b"))
10331039
assert np.shares_memory(arr_d, get_array(df, "d"))
10341040

10351041

@@ -1184,15 +1190,14 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ
11841190
assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
11851191

11861192

1187-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
11881193
@pytest.mark.parametrize("decimals", [-1, 0, 1])
11891194
def test_round(using_copy_on_write, warn_copy_on_write, decimals):
11901195
df = DataFrame({"a": [1, 2], "b": "c"})
11911196
df_orig = df.copy()
11921197
df2 = df.round(decimals=decimals)
11931198

11941199
if using_copy_on_write:
1195-
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
1200+
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
11961201
# TODO: Make inplace by using out parameter of ndarray.round?
11971202
if decimals >= 0:
11981203
# Ensure lazy copy if no-op

pandas/tests/copy_view/test_replace.py

+6-12
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
6-
from pandas.compat import HAS_PYARROW
7-
84
from pandas import (
95
Categorical,
106
DataFrame,
@@ -14,7 +10,6 @@
1410
from pandas.tests.copy_view.util import get_array
1511

1612

17-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1813
@pytest.mark.parametrize(
1914
"replace_kwargs",
2015
[
@@ -31,15 +26,15 @@
3126
],
3227
)
3328
def test_replace(using_copy_on_write, replace_kwargs):
34-
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
29+
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
3530
df_orig = df.copy()
3631

3732
df_replaced = df.replace(**replace_kwargs)
3833

3934
if using_copy_on_write:
4035
if (df_replaced["b"] == df["b"]).all():
4136
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
42-
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
37+
assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
4338

4439
# mutating squeezed df triggers a copy-on-write for that column/block
4540
df_replaced.loc[0, "c"] = -1
@@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
6156
with tm.assert_cow_warning(warn_copy_on_write):
6257
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
6358
if using_copy_on_write:
64-
assert not np.shares_memory(arr, get_array(df, "a"))
59+
assert not tm.shares_memory(arr, get_array(df, "a"))
6560
assert df._mgr._has_no_reference(0)
6661
tm.assert_frame_equal(view, df_orig)
6762
else:
6863
assert np.shares_memory(arr, get_array(df, "a"))
6964

7065

71-
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
7266
def test_replace_regex_inplace(using_copy_on_write):
7367
df = DataFrame({"a": ["aaa", "bbb"]})
7468
arr = get_array(df, "a")
7569
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
7670
if using_copy_on_write:
7771
assert df._mgr._has_no_reference(0)
78-
assert np.shares_memory(arr, get_array(df, "a"))
72+
assert tm.shares_memory(arr, get_array(df, "a"))
7973

8074
df_orig = df.copy()
8175
df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
8276
tm.assert_frame_equal(df_orig, df)
83-
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
77+
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
8478

8579

8680
def test_replace_regex_inplace_no_op(using_copy_on_write):
@@ -362,7 +356,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value):
362356
arr = get_array(df, "a")
363357
df.replace(["c"], value, inplace=True)
364358
if using_copy_on_write or value is None:
365-
assert np.shares_memory(arr, get_array(df, "a"))
359+
assert tm.shares_memory(arr, get_array(df, "a"))
366360
else:
367361
# This could be inplace
368362
assert not np.shares_memory(arr, get_array(df, "a"))

0 commit comments

Comments
 (0)