Skip to content

TST (string dtype): resolve xfails in pandas/tests/copy_view #60245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 11 additions & 17 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@

from pandas.compat import pa_version_under10p1

from pandas.core.dtypes.common import is_string_dtype

import pandas as pd
from pandas import (
ArrowDtype,
Expand Down Expand Up @@ -77,8 +75,8 @@
with_csv_dialect,
)
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
ExtensionArray,
NumpyExtensionArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
Expand All @@ -92,7 +90,6 @@
NpDtype,
)

from pandas.core.arrays import ArrowExtensionArray

UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
Expand Down Expand Up @@ -512,24 +509,21 @@ def shares_memory(left, right) -> bool:
if isinstance(left, pd.core.arrays.IntervalArray):
return shares_memory(left._left, right) or shares_memory(left._right, right)

if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
if isinstance(left, ArrowExtensionArray):
if isinstance(right, ArrowExtensionArray):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
# breakpoint()
left = cast("ArrowExtensionArray", left)
right = cast("ArrowExtensionArray", right)
left_pa_data = left._pa_array
right_pa_data = right._pa_array
left_buf1 = left_pa_data.chunk(0).buffers()[1]
right_buf1 = right_pa_data.chunk(0).buffers()[1]
return left_buf1 == right_buf1
return left_buf1.address == right_buf1.address
else:
# if we have one ArrowExtensionArray and one other array, assume
# they can only share memory if they share the same numpy buffer
return np.shares_memory(left, right)

if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
# By convention, we'll say these share memory if they share *either*
Expand Down
23 changes: 12 additions & 11 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0

from pandas import (
Expand Down Expand Up @@ -206,7 +203,6 @@ def test_astype_arrow_timestamp():
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes_infer_objects():
ser = Series(["a", "b", "c"])
ser_orig = ser.copy()
Expand All @@ -217,20 +213,25 @@ def test_convert_dtypes_infer_objects():
convert_string=False,
)

assert np.shares_memory(get_array(ser), get_array(result))
assert tm.shares_memory(get_array(ser), get_array(result))
result.iloc[0] = "x"
tm.assert_series_equal(ser, ser_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes():
def test_convert_dtypes(using_infer_string):
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
df_orig = df.copy()
df2 = df.convert_dtypes()

assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
if using_infer_string:
# TODO the default nullable string dtype still uses python storage
# this should be changed to pyarrow if installed
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does this behavior change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the df starts out using the str dtype with pyarrow storage (if pyarrow is installed; I just realized this needs a HAS_PYARROW check), and convert_dtypes will convert this to the nullable string dtype but using python storage (because that is still the default storage in this case, but we should change that before 3.0).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK - so this is really just a 2.3 behavior nuance?

else:
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
df2.iloc[0, 0] = "x"
df2.iloc[0, 1] = 10
tm.assert_frame_equal(df, df_orig)
1 change: 0 additions & 1 deletion pandas/tests/copy_view/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ def test_concat_copy_keyword():
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))


# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize(
"func",
[
Expand Down
38 changes: 21 additions & 17 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

import pandas as pd
Expand Down Expand Up @@ -716,14 +714,18 @@ def test_head_tail(method):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_infer_objects():
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
def test_infer_objects(using_infer_string):
df = DataFrame(
{"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
)
df_orig = df.copy()
df2 = df.infer_objects()

assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
if using_infer_string and HAS_PYARROW:
assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
else:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 0] = 0
df2.iloc[0, 1] = "d"
Expand All @@ -732,19 +734,16 @@ def test_infer_objects():
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_infer_objects_no_reference():
def test_infer_objects_no_reference(using_infer_string):
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
),
"e": "b",
"e": Series(["z", "w"], dtype=object),
}
)
df = df.infer_objects()
Expand All @@ -757,16 +756,22 @@ def test_infer_objects_no_reference():
df.iloc[0, 1] = "d"
df.iloc[0, 3] = Timestamp("2018-12-31")
assert np.shares_memory(arr_a, get_array(df, "a"))
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
if using_infer_string and HAS_PYARROW:
# note that the underlying memory of arr_b has been copied anyway
# because of the assignment, but the EA is updated inplace so still
# appears to share memory
assert tm.shares_memory(arr_b, get_array(df, "b"))
else:
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
assert np.shares_memory(arr_d, get_array(df, "d"))


def test_infer_objects_reference():
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
Expand Down Expand Up @@ -904,14 +909,13 @@ def test_sort_values_inplace(obj, kwargs):
tm.assert_equal(view, obj_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("decimals", [-1, 0, 1])
def test_round(decimals):
df = DataFrame({"a": [1, 2], "b": "c"})
df_orig = df.copy()
df2 = df.round(decimals=decimals)

assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
# TODO: Make inplace by using out parameter of ndarray.round?
if decimals >= 0:
# Ensure lazy copy if no-op
Expand Down
14 changes: 4 additions & 10 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

from pandas import (
Categorical,
DataFrame,
Expand All @@ -13,7 +9,6 @@
from pandas.tests.copy_view.util import get_array


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"replace_kwargs",
[
Expand All @@ -30,14 +25,14 @@
],
)
def test_replace(replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
Expand All @@ -61,18 +56,17 @@ def test_replace_regex_inplace_refs():
tm.assert_frame_equal(view, df_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_replace_regex_inplace():
df = DataFrame({"a": ["aaa", "bbb"]})
arr = get_array(df, "a")
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
assert df._mgr._has_no_reference(0)
assert np.shares_memory(arr, get_array(df, "a"))
assert tm.shares_memory(arr, get_array(df, "a"))

df_orig = df.copy()
df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
tm.assert_frame_equal(df_orig, df)
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))


def test_replace_regex_inplace_no_op():
Expand Down
Loading