Skip to content

Commit d1052cf

Browse files
BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array (#59756)
* BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array * update
1 parent 1fd7e25 commit d1052cf

File tree

3 files changed

+49
-6
lines changed

3 files changed

+49
-6
lines changed

pandas/_libs/lib.pyx

+14-5
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array(
736736
convert_na_value : bool, default True
737737
If False, existing na values will be used unchanged in the new array.
738738
copy : bool, default True
739-
Whether to ensure that a new array is returned.
739+
Whether to ensure that a new array is returned. When True, a new array
740+
is always returned. When False, a new array is only returned when needed
741+
to avoid mutating the input array.
740742
skipna : bool, default True
741743
Whether or not to coerce nulls to their stringified form
742744
(e.g. if False, NaN becomes 'nan').
@@ -765,10 +767,17 @@ cpdef ndarray[object] ensure_string_array(
765767

766768
result = np.asarray(arr, dtype="object")
767769

768-
if copy and (result is arr or np.shares_memory(arr, result)):
769-
# GH#54654
770-
result = result.copy()
771-
elif not copy and result is arr:
770+
if result is arr or np.may_share_memory(arr, result):
771+
# if np.asarray(..) did not make a copy of the input arr, we still need
772+
# to do that to avoid mutating the input array
773+
# GH#54654: share_memory check is needed for rare cases where np.asarray
774+
# returns a new object without making a copy of the actual data
775+
if copy:
776+
result = result.copy()
777+
else:
778+
already_copied = False
779+
elif not copy and not result.flags.writeable:
780+
# Weird edge case where result is a view
772781
already_copied = False
773782

774783
if issubclass(arr.dtype.type, np.str_):

pandas/tests/copy_view/test_astype.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,8 @@ def test_astype_string_and_object_update_original(
135135
tm.assert_frame_equal(df2, df_orig)
136136

137137

138-
def test_astype_string_copy_on_pickle_roundrip():
138+
def test_astype_str_copy_on_pickle_roundrip():
139+
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
139140
# https://github.com/pandas-dev/pandas/issues/54654
140141
# ensure_string_array may alter array inplace
141142
base = Series(np.array([(1, 2), None, 1], dtype="object"))
@@ -144,6 +145,25 @@ def test_astype_string_copy_on_pickle_roundrip():
144145
tm.assert_series_equal(base, base_copy)
145146

146147

148+
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
149+
# https://github.com/pandas-dev/pandas/issues/54654
150+
# ensure_string_array may alter array inplace
151+
base = Series(np.array([(1, 2), None, 1], dtype="object"))
152+
base_copy = pickle.loads(pickle.dumps(base))
153+
base_copy.astype(any_string_dtype)
154+
tm.assert_series_equal(base, base_copy)
155+
156+
157+
def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
158+
# https://github.com/pandas-dev/pandas/issues/54654
159+
# ensure_string_array may alter read-only array inplace
160+
base = Series(np.array([(1, 2), None, 1], dtype="object"))
161+
base_copy = pickle.loads(pickle.dumps(base))
162+
base_copy._values.flags.writeable = False
163+
base_copy.astype(any_string_dtype)
164+
tm.assert_series_equal(base, base_copy)
165+
166+
147167
def test_astype_dict_dtypes(using_copy_on_write):
148168
df = DataFrame(
149169
{"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}

pandas/tests/libs/test_lib.py

+14
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pickle
2+
13
import numpy as np
24
import pytest
35

@@ -283,3 +285,15 @@ def test_no_default_pickle():
283285
# GH#40397
284286
obj = tm.round_trip_pickle(lib.no_default)
285287
assert obj is lib.no_default
288+
289+
290+
def test_ensure_string_array_copy():
291+
# ensure the original array is not modified in case of copy=False with
292+
# pickle-roundtripped object dtype array
293+
# https://github.com/pandas-dev/pandas/issues/54654
294+
arr = np.array(["a", None], dtype=object)
295+
arr = pickle.loads(pickle.dumps(arr))
296+
result = lib.ensure_string_array(arr, copy=False)
297+
assert not np.shares_memory(arr, result)
298+
assert arr[1] is None
299+
assert result[1] is np.nan

0 commit comments

Comments
 (0)