Skip to content

Commit 902ba7c

Browse files
committed
Make .describe_categoricals["mapping"] a Column
1 parent 1b42205 commit 902ba7c

File tree

2 files changed

+22
-29
lines changed

2 files changed

+22
-29
lines changed

protocol/dataframe_protocol.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -170,21 +170,23 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
170170
pass
171171

172172
@property
173-
def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
173+
def describe_categorical(self) -> dict[bool, bool, Optional[Column]]:
174174
"""
175175
If the dtype is categorical, there are two options:
176176
177177
- There are only values in the data buffer.
178-
- The data buffer stores encoded values, while the (single)
179-
child column stores the categorical values themselves.
178+
- There is a separate non-categorical Column encoding categorical values.
180179
181180
Raises RuntimeError if the dtype is not categorical
182181
183182
Content of returned dict:
184183
185184
- "is_ordered" : bool, whether the ordering of dictionary indices is
186185
semantically meaningful.
187-
- "is_dictionary" : bool, whether the data is integer encoded
186+
- "is_dictionary" : bool, whether a mapping of
187+
categorical values to other objects exists
188+
- "mapping" : Column representing the mapping of indices to category values.
189+
None if not a dictionary-style categorical.
188190
189191
TBD: are there any other in-memory representations that are needed?
190192
"""
@@ -263,12 +265,12 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
263265
"""
264266
pass
265267

266-
def get_children(self) -> Iterable[Column]:
267-
"""
268-
Children columns underneath the column, each object in this iterator
269-
must adhere to the column specification.
270-
"""
271-
pass
268+
# def get_children(self) -> Iterable[Column]:
269+
# """
270+
# Children columns underneath the column, each object in this iterator
271+
# must adhere to the column specification.
272+
# """
273+
# pass
272274

273275

274276
class DataFrame:

protocol/pandas_implementation.py

+10-19
Original file line numberDiff line numberDiff line change
@@ -145,15 +145,14 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
145145
"""
146146
Convert a categorical column to a Series instance.
147147
"""
148-
ordered, is_dict = col.describe_categorical
148+
ordered, is_dict, mapping = col.describe_categorical
149149
if not is_dict:
150150
raise NotImplementedError('Non-dictionary categoricals not supported yet')
151151

152152
# If you want to cheat for testing (can't use `_col` in real-world code):
153153
# categories = col._col.values.categories.values
154154
# codes = col._col.values.codes
155-
categories_column, = col.get_children() # need to keep a reference to the child
156-
categories = convert_column_to_ndarray(categories_column)[0]
155+
categories = convert_column_to_ndarray(mapping)
157156
codes_buffer, codes_dtype = col.get_buffers()["data"]
158157
codes = buffer_to_ndarray(codes_buffer, codes_dtype)
159158
values = categories[codes]
@@ -457,20 +456,19 @@ def describe_categorical(self) -> Dict[str, Any]:
457456
- "is_ordered" : bool, whether the ordering of dictionary indices is
458457
semantically meaningful.
459458
- "is_dictionary" : bool, whether the data is integer encoded
460-
459+
- "mapping" : Column representing the mapping of indices to category values.
460+
None if not a dictionary-style categorical.
461461
"""
462462
if not self.dtype[0] == _DtypeKind.CATEGORICAL:
463463
raise TypeError("`describe_categorical only works on a column with "
464464
"categorical dtype!")
465465

466466
ordered = self._col.dtype.ordered
467467
is_dictionary = True
468-
# NOTE: this shows the children approach is better, transforming
469-
# `categories` to a "mapping" dict is inefficient
470-
codes = self._col.values.codes # ndarray, length `self.size`
471-
# categories.values is ndarray of length n_categories
472-
categories = self._col.values.categories.values
473-
return ordered, is_dictionary
468+
categories = _PandasColumn(self._col.dtype.categories.to_series())
469+
return {"is_ordered": ordered,
470+
"is_dictionary": is_dictionary,
471+
"mapping": categories}
474472

475473
@property
476474
def describe_null(self) -> Tuple[int, Any]:
@@ -692,14 +690,6 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
692690

693691
return buffer, dtype
694692

695-
def get_children(self):
696-
if self.dtype[0] == _DtypeKind.CATEGORICAL:
697-
if self.describe_categorical[1]:
698-
# return the categories as a child Column
699-
return (_PandasColumn(self._col.dtype.categories.to_series()),)
700-
else:
701-
return tuple()
702-
703693

704694
class _PandasDataFrame:
705695
"""
@@ -847,7 +837,8 @@ def test_categorical_dtype():
847837
assert col.null_count == 1
848838
assert col.describe_null == (2, -1) # sentinel value -1
849839
assert col.num_chunks() == 1
850-
assert col.describe_categorical == (False, True)
840+
assert col.describe_categorical["is_ordered"] == False
841+
assert col.describe_categorical["is_dictionary"] == True
851842

852843
df2 = from_dataframe(df)
853844
assert_dataframe_equal(df.__dataframe__(), df)

0 commit comments

Comments
 (0)