ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods (#60752)
snitish authored Feb 3, 2025
1 parent f1441b2 commit a68048e
Showing 10 changed files with 405 additions and 51 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -59,9 +59,9 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept a ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
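
For context, the new keyword mirrors the NA semantics of the non-grouped reductions: NAs are skipped within each group by default, while ``skipna=False`` propagates them into the group's result. A minimal illustration (made-up data, not part of the diff):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

    df.groupby("key")["val"].min()              # default: a -> 1.0, b -> 3.0
    df.groupby("key")["val"].min(skipna=False)  # a -> NaN, b -> 3.0
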
5 changes: 5 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -13,6 +13,7 @@ def group_median_float64(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ..., # bint
skipna: bool = ...,
) -> None: ...
def group_cumprod(
out: np.ndarray, # float64_t[:, ::1]
@@ -76,6 +77,7 @@ def group_prod(
mask: np.ndarray | None,
result_mask: np.ndarray | None = ...,
min_count: int = ...,
skipna: bool = ...,
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
@@ -88,6 +90,7 @@ def group_var(
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ...,
name: str = ...,
skipna: bool = ...,
) -> None: ...
def group_skew(
out: np.ndarray, # float64_t[:, ::1]
@@ -183,6 +186,7 @@ def group_max(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
@@ -193,6 +197,7 @@ def group_min(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_idxmin_idxmax(
out: npt.NDArray[np.intp],
99 changes: 77 additions & 22 deletions pandas/_libs/groupby.pyx
@@ -62,7 +62,12 @@ cdef enum InterpolationEnumType:
INTERPOLATION_MIDPOINT


cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
cdef float64_t median_linear_mask(
float64_t* a,
int n,
uint8_t* mask,
bint skipna=True
) noexcept nogil:
cdef:
int i, j, na_count = 0
float64_t* tmp
@@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
na_count += 1

if na_count:
if na_count == n:
if na_count == n or not skipna:
return NaN

tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
cdef float64_t median_linear(
float64_t* a,
int n,
bint is_datetimelike=False
bint is_datetimelike=False,
bint skipna=True,
) noexcept nogil:
cdef:
int i, j, na_count = 0
@@ -125,7 +131,7 @@ cdef float64_t median_linear(
na_count += 1

if na_count:
if na_count == n:
if na_count == n or not skipna:
return NaN

tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
@@ -186,6 +192,7 @@ def group_median_float64(
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
@@ -229,7 +236,7 @@

for j in range(ngroups):
size = _counts[j + 1]
result = median_linear_mask(ptr, size, ptr_mask)
result = median_linear_mask(ptr, size, ptr_mask, skipna)
out[j, i] = result

if result != result:
Expand All @@ -244,7 +251,7 @@ def group_median_float64(
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size, is_datetimelike)
out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
ptr += size
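
In prose, the rule added above: the kernel returns NaN either when the whole group is NA, or when any NA is present and ``skipna`` is False; otherwise NAs are dropped before taking the median. A rough NumPy sketch of the same rule (hypothetical helper, not part of the commit; assumes float input with NaN as the NA marker):

    import numpy as np

    def median_linear_sketch(a, skipna=True):
        # NaN when all-NA, or when any NA is present and skipna=False
        na = np.isnan(a)
        if na.all() or (na.any() and not skipna):
            return np.nan
        # otherwise drop NAs and take the median of what remains
        return float(np.median(a[~na]))
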


@@ -804,17 +811,18 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
"""
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
int64float_t val
int64float_t val, nan_val
int64float_t[:, ::1] prodx
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
@@ -823,6 +831,7 @@
prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
nan_val = _get_na_val(<int64float_t>0, False)

with nogil:
for i in range(N):
@@ -836,12 +845,23 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, False)
isna_result = _treat_as_na(prodx[lab, j], False)

if not skipna and isna_result:
# If prod is already NA, no need to update it
continue

if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
prodx[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
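
The streaming rule here: once a group's product has been marked NA under ``skipna=False``, later values must not overwrite it, hence the early ``continue`` on ``isna_result``. A pure-Python sketch of that rule (hypothetical helper; the real kernel also handles masked arrays and ``min_count``):

    import numpy as np

    def grouped_prod_sketch(values, labels, ngroups, skipna=True):
        out = np.ones(ngroups)
        poisoned = np.zeros(ngroups, dtype=bool)  # group result already NA
        for val, lab in zip(values, labels):
            if lab < 0 or poisoned[lab]:
                continue                  # an NA result stays NA
            if np.isnan(val):
                if not skipna:
                    poisoned[lab] = True  # one NA poisons the group
                continue
            out[lab] *= val
        out[poisoned] = np.nan
        return out
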
@@ -862,14 +882,15 @@ def group_var(
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
str name="var",
bint skipna=True,
) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, ct, oldmean
floating[:, ::1] mean
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None
bint is_std = name == "std"
bint is_sem = name == "sem"

@@ -898,19 +919,34 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = out[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(out[lab, j], is_datetimelike)

if not skipna and isna_result:
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
elif not skipna:
nobs[lab, j] = 0
if uses_mask:
result_mask[lab, j] = True
else:
out[lab, j] = NAN

for i in range(ncounts):
for j in range(K):
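
``group_var`` accumulates with Welford's online update (``mean += (val - oldmean) / n``, then ``M2 += (val - mean) * (val - oldmean)``); the new ``skipna=False`` branch resets ``nobs`` and marks the group NA. A condensed sketch of the accumulation (hypothetical helper; the real kernel also derives ``std`` and ``sem`` from the same accumulator):

    import numpy as np

    def grouped_var_sketch(values, labels, ngroups, ddof=1, skipna=True):
        n = np.zeros(ngroups)
        mean = np.zeros(ngroups)
        m2 = np.zeros(ngroups)  # running sum of squared deviations
        poisoned = np.zeros(ngroups, dtype=bool)
        for val, lab in zip(values, labels):
            if lab < 0 or poisoned[lab]:
                continue
            if np.isnan(val):
                if not skipna:
                    poisoned[lab] = True
                continue
            n[lab] += 1
            delta = val - mean[lab]
            mean[lab] += delta / n[lab]
            m2[lab] += delta * (val - mean[lab])
        out = np.full(ngroups, np.nan)
        ok = ~poisoned & (n > ddof)
        out[ok] = m2[ok] / (n[ok] - ddof)
        return out
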
@@ -1164,7 +1200,7 @@ def group_mean(
mean_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

assert min_count == -1, "'min_count' only used in sum and prod"

Expand Down Expand Up @@ -1194,25 +1230,24 @@ def group_mean(
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
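
The NPY_NAT comments matter because datetimelike values accumulate as int64, and adding a value to NPY_NAT does not necessarily stay NPY_NAT, so the running result must be checked for NA before accumulating. At the API level the effect looks like this (illustrative data):

    import pandas as pd

    s = pd.Series(pd.to_datetime(["2025-01-01", None, "2025-01-03"]))

    s.groupby([0, 0, 1]).mean()              # group 0 skips the NaT
    s.groupby([0, 0, 1]).mean(skipna=False)  # group 0 becomes NaT
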
@@ -1806,6 +1841,7 @@ cdef group_min_max(
bint compute_max=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1833,6 +1869,8 @@ cdef group_min_max(
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
skipna : bool, default True
If True, ignore nans in `values`.
Notes
-----
@@ -1841,17 +1879,18 @@
"""
cdef:
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
numeric_t val
numeric_t val, nan_val
numeric_t[:, ::1] group_min_or_max
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry
bint isna_entry, isna_result

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
nan_val = _get_na_val(<numeric_t>0, is_datetimelike)

group_min_or_max = np.empty_like(out)
group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
@@ -1870,8 +1909,15 @@ cdef group_min_max(

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(group_min_or_max[lab, j],
is_datetimelike)

if not skipna and isna_result:
# If current min/max is already NA, it will always be NA
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1881,6 +1927,11 @@ cdef group_min_max(
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
group_min_or_max[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
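
Min/max follow the same propagation pattern: under ``skipna=False`` the first NA encountered pins the group's result at NA (via ``nan_val`` or the result mask), and the ``isna_result`` check keeps later values from reviving it. A sketch of the min case (hypothetical helper; the real kernel also handles masks, datetimelikes and ``min_count``):

    import numpy as np

    def grouped_min_sketch(values, labels, ngroups, skipna=True):
        out = np.full(ngroups, np.inf)
        seen = np.zeros(ngroups, dtype=bool)
        poisoned = np.zeros(ngroups, dtype=bool)
        for val, lab in zip(values, labels):
            if lab < 0 or poisoned[lab]:
                continue
            if np.isnan(val):
                if not skipna:
                    poisoned[lab] = True
                continue
            seen[lab] = True
            out[lab] = min(out[lab], val)
        out[~seen | poisoned] = np.nan  # empty or poisoned groups -> NaN
        return out
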
@@ -2012,6 +2063,7 @@ def group_max(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
Expand All @@ -2024,6 +2076,7 @@ def group_max(
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


@@ -2038,6 +2091,7 @@ def group_min(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
Expand All @@ -2050,6 +2104,7 @@ def group_min(
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


8 changes: 6 additions & 2 deletions pandas/core/_numba/kernels/min_max_.py
@@ -88,6 +88,7 @@ def grouped_min_max(
ngroups: int,
min_periods: int,
is_max: bool,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
Expand All @@ -97,13 +98,16 @@ def grouped_min_max(
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
continue

if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
if not skipna:
# If skipna is False and we encounter a NaN,
# both min and max of the group will be NaN
output[lab] = np.nan
continue

if nobs[lab] == 1:
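
Assuming the new keyword is forwarded to this kernel when ``engine="numba"`` is requested, as the signature change suggests (numba must be installed), the numba and Cython paths should agree:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": [0, 0, 1], "val": [2.0, np.nan, 5.0]})

    res_cython = df.groupby("key")["val"].max(skipna=False)
    res_numba = df.groupby("key")["val"].max(skipna=False, engine="numba")
    assert res_cython.equals(res_numba)  # key 0 -> NaN, key 1 -> 5.0
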
(diff truncated: 6 of the 10 changed files are not shown)
