Skip to content

Commit

Permalink
Merge pull request #17821 from rapidsai/branch-25.02
Browse files Browse the repository at this point in the history
Forward-merge branch-25.02 into branch-25.04
  • Loading branch information
GPUtester authored Jan 24, 2025
2 parents 573c730 + 00dca76 commit cce09a3
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 24 deletions.
14 changes: 8 additions & 6 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,13 +267,15 @@ def explode(self):
2 3 z
3 4 a
"""
data = {
name: col.copy(deep=True)
for name, col in zip(
self._column.dtype.fields, self._column.children
)
}
rangeindex = len(data) == 0
return cudf.DataFrame._from_data(
cudf.core.column_accessor.ColumnAccessor(
{
name: col.copy(deep=True)
for name, col in zip(
self._column.dtype.fields, self._column.children
)
}
data, rangeindex=rangeindex
)
)
5 changes: 5 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,6 +789,8 @@ def __init__(
label_dtype=label_dtype,
verify=False,
)
else:
self._data.rangeindex = True
elif isinstance(data, ColumnAccessor):
raise TypeError(
"Use cudf.DataFrame._from_data for constructing a DataFrame from "
Expand Down Expand Up @@ -1120,6 +1122,9 @@ def _init_from_dict_like(
data[col_name],
nan_as_null=nan_as_null,
)
elif columns is None:
self._data.rangeindex = True

self._data._level_names = (
tuple(columns.names)
if isinstance(columns, pd.Index)
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/io/avro.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.

import pylibcudf as plc

import cudf
from cudf._lib.column import Column
from cudf.core.column_accessor import ColumnAccessor
from cudf.utils import ioutils


Expand Down Expand Up @@ -54,4 +55,5 @@ def read_avro(
strict=True,
)
}
return cudf.DataFrame._from_data(data)
ca = ColumnAccessor(data, rangeindex=len(data) == 0)
return cudf.DataFrame._from_data(ca)
5 changes: 3 additions & 2 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from cudf._lib.column import Column
from cudf.api.types import is_hashable, is_scalar
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column_accessor import ColumnAccessor
from cudf.utils import ioutils
from cudf.utils.dtypes import (
_maybe_convert_to_default_type,
Expand Down Expand Up @@ -262,8 +263,8 @@ def read_csv(
strict=True,
)
}

df = cudf.DataFrame._from_data(data)
ca = ColumnAccessor(data, rangeindex=len(data) == 0)
df = cudf.DataFrame._from_data(ca)

if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from cudf._lib.column import Column
from cudf.api.types import is_list_like
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.index import _index_from_data
from cudf.utils import ioutils
from cudf.utils.dtypes import dtype_to_pylibcudf_type
Expand Down Expand Up @@ -267,7 +268,7 @@ def read_orc(
# When `columns=[]`, index needs to be
# established, but not the columns.
nrows = tbl_w_meta.tbl.num_rows()
data = {}
ca = ColumnAccessor({})
index = cudf.RangeIndex(nrows)
else:
names = tbl_w_meta.column_names(include_children=False)
Expand Down Expand Up @@ -367,8 +368,9 @@ def read_orc(
data.items(), child_name_values
)
}
ca = ColumnAccessor(data, rangeindex=len(data) == 0)

return cudf.DataFrame._from_data(data, index=index)
return cudf.DataFrame._from_data(ca, index=index)
else:
from pyarrow import orc

Expand Down
14 changes: 3 additions & 11 deletions python/cudf/cudf/tests/dataframe/test_io_serialization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
import contextlib
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
from io import BytesIO

import pandas as pd
Expand Down Expand Up @@ -35,18 +34,11 @@ def test_dataframe_parquet_roundtrip(index, write_index, empty):
metadata_equal = (
gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata
)
if empty and write_index is not False:
# https://github.com/rapidsai/cudf/issues/15372
ctx = pytest.raises(AssertionError)
else:
ctx = contextlib.nullcontext()
with ctx:
assert metadata_equal
assert metadata_equal

gpu_read = cudf.read_parquet(gpu_buf)
cpu_read = cudf.read_parquet(cpu_buf)
with ctx:
assert_eq(gpu_read, cpu_read)
assert_eq(gpu_read, cpu_read)


@pytest.mark.parametrize("preserve_index", [False, True, None])
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8811,7 +8811,9 @@ def test_dataframe_mode(df, numeric_only, dropna):

expected = pdf.mode(numeric_only=numeric_only, dropna=dropna)
actual = df.mode(numeric_only=numeric_only, dropna=dropna)

if len(actual.columns) == 0:
# pandas < 3.0 returns an Index[object] instead of RangeIndex
actual.columns = expected.columns
assert_eq(expected, actual, check_dtype=False)


Expand Down Expand Up @@ -11260,3 +11262,10 @@ def test_roundtrip_dataframe_plc_table(pdf):
expect = cudf.DataFrame.from_pandas(pdf)
actual = cudf.DataFrame.from_pylibcudf(*expect.to_pylibcudf())
assert_eq(expect, actual)


@pytest.mark.parametrize("data", [None, {}])
def test_empty_construction_rangeindex_columns(data):
    """Check the column labels of a DataFrame built from empty input.

    For both ``data=None`` and ``data={}``, ``DataFrame.columns`` must
    compare equal to ``pd.RangeIndex(0)`` under ``exact=True``, i.e. the
    index class is checked as well as the (empty) contents.
    """
    empty_frame = cudf.DataFrame(data=data)
    pd.testing.assert_index_equal(
        empty_frame.columns,
        pd.RangeIndex(0),
        exact=True,
    )

0 comments on commit cce09a3

Please sign in to comment.