Skip to content

Commit b717abb

Browse files
BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array (#59756)
* BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array * update
1 parent 47b56ea commit b717abb

File tree

3 files changed

+39
-11
lines changed

3 files changed

+39
-11
lines changed

pandas/_libs/lib.pyx

+12-6
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
733733
convert_na_value : bool, default True
734734
If False, existing na values will be used unchanged in the new array.
735735
copy : bool, default True
736-
Whether to ensure that a new array is returned.
736+
Whether to ensure that a new array is returned. When True, a new array
737+
is always returned. When False, a new array is only returned when needed
738+
to avoid mutating the input array.
737739
skipna : bool, default True
738740
Whether or not to coerce nulls to their stringified form
739741
(e.g. if False, NaN becomes 'nan').
@@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(
762764

763765
result = np.asarray(arr, dtype="object")
764766

765-
if copy and (result is arr or np.shares_memory(arr, result)):
766-
# GH#54654
767-
result = result.copy()
768-
elif not copy and result is arr:
769-
already_copied = False
767+
if result is arr or np.may_share_memory(arr, result):
768+
# if np.asarray(..) did not make a copy of the input arr, we still need
769+
# to do that to avoid mutating the input array
770+
# GH#54654: share_memory check is needed for rare cases where np.asarray
771+
# returns a new object without making a copy of the actual data
772+
if copy:
773+
result = result.copy()
774+
else:
775+
already_copied = False
770776
elif not copy and not result.flags.writeable:
771777
# Weird edge case where result is a view
772778
already_copied = False

pandas/tests/copy_view/test_astype.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
from pandas.compat import HAS_PYARROW
99
from pandas.compat.pyarrow import pa_version_under12p0
10-
import pandas.util._test_decorators as td
1110

1211
from pandas import (
1312
DataFrame,
@@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
111110
tm.assert_frame_equal(df2, df_orig)
112111

113112

114-
def test_astype_string_copy_on_pickle_roundrip():
113+
def test_astype_str_copy_on_pickle_roundrip():
114+
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
115115
# https://github.com/pandas-dev/pandas/issues/54654
116116
# ensure_string_array may alter array inplace
117117
base = Series(np.array([(1, 2), None, 1], dtype="object"))
@@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip():
120120
tm.assert_series_equal(base, base_copy)
121121

122122

123-
@td.skip_if_no("pyarrow")
124-
def test_astype_string_read_only_on_pickle_roundrip():
123+
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
124+
# https://github.com/pandas-dev/pandas/issues/54654
125+
# ensure_string_array may alter array inplace
126+
base = Series(np.array([(1, 2), None, 1], dtype="object"))
127+
base_copy = pickle.loads(pickle.dumps(base))
128+
base_copy.astype(any_string_dtype)
129+
tm.assert_series_equal(base, base_copy)
130+
131+
132+
def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
125133
# https://github.com/pandas-dev/pandas/issues/54654
126134
# ensure_string_array may alter read-only array inplace
127135
base = Series(np.array([(1, 2), None, 1], dtype="object"))
128136
base_copy = pickle.loads(pickle.dumps(base))
129137
base_copy._values.flags.writeable = False
130-
base_copy.astype("string[pyarrow]")
138+
base_copy.astype(any_string_dtype)
131139
tm.assert_series_equal(base, base_copy)
132140

133141

pandas/tests/libs/test_lib.py

+14
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pickle
2+
13
import numpy as np
24
import pytest
35

@@ -283,3 +285,15 @@ def test_no_default_pickle():
283285
# GH#40397
284286
obj = tm.round_trip_pickle(lib.no_default)
285287
assert obj is lib.no_default
288+
289+
290+
def test_ensure_string_array_copy():
291+
# ensure the original array is not modified in case of copy=False with
292+
# pickle-roundtripped object dtype array
293+
# https://github.com/pandas-dev/pandas/issues/54654
294+
arr = np.array(["a", None], dtype=object)
295+
arr = pickle.loads(pickle.dumps(arr))
296+
result = lib.ensure_string_array(arr, copy=False)
297+
assert not np.shares_memory(arr, result)
298+
assert arr[1] is None
299+
assert result[1] is np.nan

0 commit comments

Comments
 (0)