Support melt(ignore_index=False) (#18080)
Additionally, refactors `melt` to avoid Series/DataFrame constructors by operating on columns and passing the result to `_from_data`.

```python
In [1]: import numpy as np, cudf

In [2]: df = cudf.DataFrame(np.ones((1000, 1000)))

In [3]: %timeit df.melt(id_vars=range(50, 300))   # this PR
1.35 s ± 12.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [3]: %timeit df.melt(id_vars=range(50, 300))  # branch-25.04
24.8 s ± 47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
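
For reference, a small usage sketch of the new `ignore_index` keyword (illustrative data, not taken from the PR; with `ignore_index=False` the original index is kept and repeated once per melted value column, mirroring pandas semantics):

```python
import cudf

df = cudf.DataFrame(
    {"id": [1, 2], "a": [3, 4], "b": [5, 6]},
    index=[10, 20],
)

# Default behavior: the melted frame gets a fresh RangeIndex.
melted_default = df.melt(id_vars="id")

# New in this PR: keep the original index, repeated once per value column.
melted_keep = df.melt(id_vars="id", ignore_index=False)
print(melted_keep.index.to_pandas())  # expected: Index([10, 20, 10, 20], dtype='int64')
```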

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #18080
mroeschke authored Feb 24, 2025
1 parent bcff1f7 commit 6590cc2
Showing 2 changed files with 51 additions and 28 deletions.
54 changes: 31 additions & 23 deletions python/cudf/cudf/core/reshape.py
@@ -14,11 +14,18 @@
from cudf.api.extensions import no_default
from cudf.api.types import is_scalar
from cudf.core._compat import PANDAS_LT_300
from cudf.core.column import ColumnBase, as_column, column_empty
from cudf.core.column import (
ColumnBase,
as_column,
column_empty,
concat_columns,
)
from cudf.core.column_accessor import ColumnAccessor
from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type

if TYPE_CHECKING:
from collections.abc import Hashable

from cudf._typing import DtypeObj

_AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1}
@@ -534,14 +541,14 @@ def concat(


def melt(
frame,
frame: cudf.DataFrame,
id_vars=None,
value_vars=None,
var_name=None,
value_name="value",
value_name: Hashable = "value",
col_level=None,
ignore_index: bool = True,
):
) -> cudf.DataFrame:
"""Unpivots a DataFrame from wide format to long format,
optionally leaving identifier variables set.
@@ -605,14 +612,12 @@
"""
if col_level is not None:
raise NotImplementedError("col_level != None is not supported yet.")
if ignore_index is not True:
raise NotImplementedError("ignore_index is currently not supported.")

# Arg cleaning

# id_vars
if id_vars is not None:
if cudf.api.types.is_scalar(id_vars):
if is_scalar(id_vars):
id_vars = [id_vars]
id_vars = list(id_vars)
missing = set(id_vars) - set(frame._column_names)
@@ -626,7 +631,7 @@

# value_vars
if value_vars is not None:
if cudf.api.types.is_scalar(value_vars):
if is_scalar(value_vars):
value_vars = [value_vars]
value_vars = list(value_vars)
missing = set(value_vars) - set(frame._column_names)
@@ -643,7 +648,7 @@
# Error for unimplemented support for datatype
if any(
isinstance(frame[col].dtype, cudf.CategoricalDtype)
for col in id_vars + value_vars
for col in itertools.chain(id_vars, value_vars)
):
raise NotImplementedError(
"Categorical columns are not yet supported for function"
@@ -668,15 +673,14 @@
N = len(frame)
K = len(value_vars)

def _tile(A, reps):
series_list = [A] * reps
def _tile(base_col: ColumnBase, reps: int) -> ColumnBase:
if reps > 0:
return cudf.Series._concat(objs=series_list, index=False)
return concat_columns([base_col] * reps)
else:
return cudf.Series([], dtype=A.dtype)
return column_empty(0, dtype=base_col.dtype)

# Step 1: tile id_vars
mdata = {col: _tile(frame[col], K) for col in id_vars}
mdata = {col: _tile(frame[col]._column, K) for col in id_vars}

# Step 2: add variable
nval = len(value_vars)
@@ -687,23 +691,27 @@ def _tile(A, reps):

if not value_vars:
# TODO: Use frame._data.label_dtype when it's more consistently set
var_data = cudf.Series(
value_vars, dtype=frame._data.to_pandas_index.dtype
var_data = column_empty(
0, dtype=cudf.dtype(frame._data.to_pandas_index.dtype)
)
else:
var_data = (
cudf.Series(value_vars)
.take(np.repeat(np.arange(nval, dtype=dtype), N))
.reset_index(drop=True)
var_data = as_column(value_vars).take(
as_column(np.repeat(np.arange(nval, dtype=dtype), N)),
check_bounds=False,
)
mdata[var_name] = var_data

# Step 3: add values
mdata[value_name] = cudf.Series._concat(
objs=[frame[val] for val in value_vars], index=False
mdata[value_name] = concat_columns(
[frame[val]._column for val in value_vars]
)

return cudf.DataFrame(mdata)
result = cudf.DataFrame._from_data(mdata)
if not ignore_index:
taker = np.tile(np.arange(len(frame)), frame.shape[1] - len(id_vars))
result.index = frame.index.take(taker)

return result


def get_dummies(
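
As a side note, the `ignore_index=False` path above rebuilds the index with a plain NumPy gather map: the row positions are tiled once per remaining value column and then passed to `frame.index.take`. A minimal standalone illustration (sizes chosen arbitrarily):

```python
import numpy as np

# 3 rows and 2 value columns left after removing the id_vars.
n_rows, n_value_cols = 3, 2
taker = np.tile(np.arange(n_rows), n_value_cols)
print(taker)  # [0 1 2 0 1 2] -> frame.index.take(taker) repeats the original labels
```
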
25 changes: 20 additions & 5 deletions python/cudf/cudf/tests/test_reshape.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.

import re
from itertools import chain
@@ -40,7 +40,10 @@
@pytest.mark.parametrize("num_rows", [1, 2, 100])
@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
@pytest.mark.parametrize("nulls", ["none", "some", "all"])
def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
@pytest.mark.parametrize("ignore_index", [True, False])
def test_melt(
nulls, num_id_vars, num_value_vars, num_rows, dtype, ignore_index
):
if dtype not in ["float32", "float64"] and nulls in ["some", "all"]:
pytest.skip(reason="nulls not supported in dtype: " + dtype)

@@ -72,10 +75,22 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):

gdf = cudf.from_pandas(pdf)

got = cudf.melt(frame=gdf, id_vars=id_vars, value_vars=value_vars)
got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars)
got = cudf.melt(
frame=gdf,
id_vars=id_vars,
value_vars=value_vars,
ignore_index=ignore_index,
)
got_from_melt_method = gdf.melt(
id_vars=id_vars, value_vars=value_vars, ignore_index=ignore_index
)

expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars)
expect = pd.melt(
frame=pdf,
id_vars=id_vars,
value_vars=value_vars,
ignore_index=ignore_index,
)

assert_eq(expect, got)

