Skip to content

Commit f97ee3a

Browse files
[backport 2.3.x] BUG(string dtype): Empty sum produces incorrect result (#60936) (#61625)
1 parent 4e20ce1 commit f97ee3a

File tree

5 files changed

+77
-1
lines changed

5 files changed

+77
-1
lines changed

pandas/core/arrays/base.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2386,7 +2386,14 @@ def _groupby_op(
23862386
if op.how not in ["any", "all"]:
23872387
# Fail early to avoid conversion to object
23882388
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
2389-
npvalues = self.to_numpy(object, na_value=np.nan)
2389+
2390+
arr = self
2391+
if op.how == "sum":
2392+
# https://github.com/pandas-dev/pandas/issues/60229
2393+
# All NA should result in the empty string.
2394+
if min_count == 0:
2395+
arr = arr.fillna("")
2396+
npvalues = arr.to_numpy(object, na_value=np.nan)
23902397
else:
23912398
raise NotImplementedError(
23922399
f"function is not implemented for this dtype: {self.dtype}"

pandas/tests/frame/test_reductions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,16 @@ def test_axis_1_empty(self, all_reductions, index):
846846
expected = Series([], index=index, dtype=expected_dtype)
847847
tm.assert_series_equal(result, expected)
848848

849+
@pytest.mark.parametrize("min_count", [0, 1])
def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
    """Check ``DataFrame.sum(axis=1)`` on a one-row frame holding only ``pd.NA``.

    The expected entry is the empty string only when ``skipna`` is truthy
    and ``min_count`` is 0; every other combination expects ``pd.NA``.
    """
    # https://github.com/pandas-dev/pandas/issues/60229
    frame = DataFrame({"a": [pd.NA]}, dtype=string_dtype_no_object)
    result = frame.sum(axis=1, skipna=skipna, min_count=min_count)

    if skipna and min_count == 0:
        expected_value = ""
    else:
        expected_value = pd.NA
    expected = Series([expected_value], dtype=string_dtype_no_object)

    tm.assert_series_equal(result, expected)
858+
849859
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
850860
@pytest.mark.parametrize("numeric_only", [None, True, False])
851861
def test_sum_prod_nanops(self, method, unit, numeric_only):

pandas/tests/groupby/test_reductions.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,20 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
710710
tm.assert_frame_equal(result, expected)
711711

712712

713+
@pytest.mark.parametrize("min_count", [0, 1])
def test_string_dtype_empty_sum(string_dtype_no_object, min_count):
    """Check groupby ``sum`` when the only value in the group is ``pd.NA``.

    With ``min_count=0`` the aggregated value is expected to be the empty
    string; otherwise it is expected to be ``pd.NA``.
    """
    # https://github.com/pandas-dev/pandas/issues/60229
    dtype = string_dtype_no_object
    frame = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
    result = frame.groupby("a").sum(min_count=min_count)

    if min_count == 0:
        expected_value = ""
    else:
        expected_value = pd.NA
    group_index = pd.Index(["x"], name="a", dtype=dtype)
    expected = DataFrame({"b": expected_value}, index=group_index, dtype=dtype)

    tm.assert_frame_equal(result, expected)
725+
726+
713727
def test_max_nan_bug():
714728
df = DataFrame(
715729
{

pandas/tests/resample/test_base.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,31 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method):
149149
assert result.index.freq == expected.index.freq
150150

151151

152+
@pytest.mark.parametrize("min_count", [0, 1])
def test_resample_empty_sum_string(string_dtype_no_object, min_count):
    """Check resample ``sum`` over a Series filled entirely with ``pd.NA``.

    Four timestamps spaced 10 seconds apart are resampled at "20s". Each of
    the two expected bins holds the empty string when ``min_count`` is 0 and
    ``pd.NA`` otherwise.
    """
    # https://github.com/pandas-dev/pandas/issues/60229
    dtype = string_dtype_no_object
    timestamps = [
        "2000-01-01 00:00:00",
        "2000-01-01 00:00:10",
        "2000-01-01 00:00:20",
        "2000-01-01 00:00:30",
    ]
    ser = Series(pd.NA, index=DatetimeIndex(timestamps), dtype=dtype)
    result = ser.resample("20s").sum(min_count=min_count)

    if min_count == 0:
        expected_value = ""
    else:
        expected_value = pd.NA
    expected_index = date_range(start="2000-01-01", freq="20s", periods=2)
    expected = Series(expected_value, index=expected_index, dtype=dtype)

    tm.assert_series_equal(result, expected)
175+
176+
152177
@all_ts
153178
@pytest.mark.parametrize(
154179
"freq",

pandas/tests/resample/test_resampler_grouper.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
526526
tm.assert_frame_equal(result, expected)
527527

528528

529+
@pytest.mark.parametrize("min_count", [0, 1])
def test_groupby_resample_empty_sum_string(
    string_dtype_no_object, test_frame, min_count
):
    """Check groupby-then-resample ``sum`` on a column of only ``pd.NA``.

    Column "B" of ``test_frame`` is replaced by an all-NA array, the frame is
    grouped by "A" and resampled at "40s". The expected "B" values are the
    empty string when ``min_count`` is 0 and ``pd.NA`` otherwise.
    """
    # https://github.com/pandas-dev/pandas/issues/60229
    dtype = string_dtype_no_object
    all_na = pd.array([pd.NA] * len(test_frame), dtype=dtype)
    frame = test_frame.assign(B=all_na)
    resampler = frame.groupby("A").resample("40s", include_groups=False)
    result = resampler.sum(min_count=min_count)

    first_stamp = pd.to_datetime("2000-01-01", unit="ns")
    index = pd.MultiIndex(
        levels=[[1, 2, 3], [first_stamp]],
        codes=[[0, 1, 2], [0, 0, 0]],
        names=["A", None],
    )
    if min_count == 0:
        expected_value = ""
    else:
        expected_value = pd.NA
    expected = DataFrame({"B": expected_value}, index=index, dtype=dtype)

    tm.assert_frame_equal(result, expected)
547+
548+
529549
def test_groupby_resample_with_list_of_keys():
530550
# GH 47362
531551
df = DataFrame(

0 commit comments

Comments
 (0)