pandas-dev · jorisvandenbossche · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024 · WillAyd
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -21,8 +21,6 @@
 
 from pandas.compat import pa_version_under10p1
 
-from pandas.core.dtypes.common import is_string_dtype
-
 import pandas as pd
 from pandas import (
     ArrowDtype,
@@ -77,8 +75,8 @@
     with_csv_dialect,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
-    ExtensionArray,
     NumpyExtensionArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -92,7 +90,6 @@
         NpDtype,
     )
 
-    from pandas.core.arrays import ArrowExtensionArray
 
 UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -512,24 +509,21 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, pd.core.arrays.IntervalArray):
         return shares_memory(left._left, right) or shares_memory(left._right, right)
 
-    if (
-        isinstance(left, ExtensionArray)
-        and is_string_dtype(left.dtype)
-        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-    ):
-        # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
-        left = cast("ArrowExtensionArray", left)
-        if (
-            isinstance(right, ExtensionArray)
-            and is_string_dtype(right.dtype)
-            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-        ):
+    if isinstance(left, ArrowExtensionArray):
+        if isinstance(right, ArrowExtensionArray):
+            # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
+            # breakpoint()
+            left = cast("ArrowExtensionArray", left)
             right = cast("ArrowExtensionArray", right)
             left_pa_data = left._pa_array
             right_pa_data = right._pa_array
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
-            return left_buf1 == right_buf1
+            return left_buf1.address == right_buf1.address
+        else:
+            # if we have one one ArrowExtensionArray and one other array, assume
+            # they can only share memory if they share the same numpy buffer
+            return np.shares_memory(left, right)
 
     if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
         # By convention, we'll say these share memory if they share *either*

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -3,9 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
 
 from pandas import (
@@ -206,7 +203,6 @@ def test_astype_arrow_timestamp():
         assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_convert_dtypes_infer_objects():
     ser = Series(["a", "b", "c"])
     ser_orig = ser.copy()
@@ -217,20 +213,25 @@ def test_convert_dtypes_infer_objects():
         convert_string=False,
     )
 
-    assert np.shares_memory(get_array(ser), get_array(result))
+    assert tm.shares_memory(get_array(ser), get_array(result))
     result.iloc[0] = "x"
     tm.assert_series_equal(ser, ser_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_convert_dtypes():
+def test_convert_dtypes(using_infer_string):
     df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
-    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-    assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
-    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
-    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    if using_infer_string:
+        # TODO the default nullable string dtype still uses python storage
+        # this should be changed to pyarrow if installed
+        assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
+    assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
     df2.iloc[0, 0] = "x"
+    df2.iloc[0, 1] = 10
     tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
@@ -153,7 +153,6 @@ def test_concat_copy_keyword():
     assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
 
 
-# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "func",
     [

diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 
 import pandas as pd
@@ -716,14 +714,18 @@ def test_head_tail(method):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_infer_objects():
-    df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
+def test_infer_objects(using_infer_string):
+    df = DataFrame(
+        {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
+    )
     df_orig = df.copy()
     df2 = df.infer_objects()
 
     assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    if using_infer_string and HAS_PYARROW:
+        assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    else:
+        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
 
     df2.iloc[0, 0] = 0
     df2.iloc[0, 1] = "d"
@@ -732,19 +734,16 @@ def test_infer_objects():
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
-def test_infer_objects_no_reference():
+def test_infer_objects_no_reference(using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
             ),
-            "e": "b",
+            "e": Series(["z", "w"], dtype=object),
         }
     )
     df = df.infer_objects()
@@ -757,16 +756,22 @@ def test_infer_objects_no_reference():
     df.iloc[0, 1] = "d"
     df.iloc[0, 3] = Timestamp("2018-12-31")
     assert np.shares_memory(arr_a, get_array(df, "a"))
-    # TODO(CoW): Block splitting causes references here
-    assert not np.shares_memory(arr_b, get_array(df, "b"))
+    if using_infer_string and HAS_PYARROW:
+        # note that the underlying memory of arr_b has been copied anyway
+        # because of the assignment, but the EA is updated inplace so still
+        # appears the share memory
+        assert tm.shares_memory(arr_b, get_array(df, "b"))
+    else:
+        # TODO(CoW): Block splitting causes references here
+        assert not np.shares_memory(arr_b, get_array(df, "b"))
     assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
 def test_infer_objects_reference():
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
@@ -904,14 +909,13 @@ def test_sort_values_inplace(obj, kwargs):
     tm.assert_equal(view, obj_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize("decimals", [-1, 0, 1])
 def test_round(decimals):
     df = DataFrame({"a": [1, 2], "b": "c"})
     df_orig = df.copy()
     df2 = df.round(decimals=decimals)
 
-    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
     # TODO: Make inplace by using out parameter of ndarray.round?
     if decimals >= 0:
         # Ensure lazy copy if no-op

diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
@@ -1,10 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
 from pandas import (
     Categorical,
     DataFrame,
@@ -13,7 +9,6 @@
 from pandas.tests.copy_view.util import get_array
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "replace_kwargs",
     [
@@ -30,14 +25,14 @@
     ],
 )
 def test_replace(replace_kwargs):
-    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
     df_orig = df.copy()
 
     df_replaced = df.replace(**replace_kwargs)
 
     if (df_replaced["b"] == df["b"]).all():
         assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
-    assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+    assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
 
     # mutating squeezed df triggers a copy-on-write for that column/block
     df_replaced.loc[0, "c"] = -1
@@ -61,18 +56,17 @@ def test_replace_regex_inplace_refs():
     tm.assert_frame_equal(view, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_replace_regex_inplace():
     df = DataFrame({"a": ["aaa", "bbb"]})
     arr = get_array(df, "a")
     df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
     assert df._mgr._has_no_reference(0)
-    assert np.shares_memory(arr, get_array(df, "a"))
+    assert tm.shares_memory(arr, get_array(df, "a"))
 
     df_orig = df.copy()
     df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
     tm.assert_frame_equal(df_orig, df)
-    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
 
 def test_replace_regex_inplace_no_op():
-Original file line number
+Diff line change
@@ Expand Up / @@ -153,7 +153,6 @@ def test_concat_copy_keyword(): @@
         assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-    # @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
     @pytest.mark.parametrize(
         "func",
         [
@@ Expand Down @@