Skip to content

Commit 1b42205

Browse files
committed
Use a column to store categories
1 parent 5cbbfc1 commit 1b42205

File tree

2 files changed

+26
-21
lines changed

2 files changed

+26
-21
lines changed

protocol/dataframe_protocol.py

+9-11
Original file line numberDiff line numberDiff line change
@@ -175,18 +175,16 @@ def describe_categorical(self) -> dict[bool, bool, Optional[dict]]:
175175
If the dtype is categorical, there are two options:
176176
177177
- There are only values in the data buffer.
178-
- There is a separate dictionary-style encoding for categorical values.
178+
- The data buffer stores encoded values, while the (single)
179+
child column stores the categorical values themselves.
179180
180181
Raises TypeError if the dtype is not categorical
181182
182183
Content of returned dict:
183184
184185
- "is_ordered" : bool, whether the ordering of dictionary indices is
185186
semantically meaningful.
186-
- "is_dictionary" : bool, whether a dictionary-style mapping of
187-
categorical values to other objects exists
188-
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
189-
None if not a dictionary-style categorical.
187+
- "is_dictionary" : bool, whether the data is integer encoded
190188
191189
TBD: are there any other in-memory representations that are needed?
192190
"""
@@ -265,12 +263,12 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]],
265263
"""
266264
pass
267265

268-
# def get_children(self) -> Iterable[Column]:
269-
# """
270-
# Children columns underneath the column, each object in this iterator
271-
# must adhere to the column specification.
272-
# """
273-
# pass
266+
def get_children(self) -> Iterable[Column]:
267+
"""
268+
Children columns underneath the column, each object in this iterator
269+
must adhere to the column specification.
270+
"""
271+
pass
274272

275273

276274
class DataFrame:

protocol/pandas_implementation.py

+17-10
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,15 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
145145
"""
146146
Convert a categorical column to a Series instance.
147147
"""
148-
ordered, is_dict, mapping = col.describe_categorical
148+
ordered, is_dict = col.describe_categorical
149149
if not is_dict:
150150
raise NotImplementedError('Non-dictionary categoricals not supported yet')
151151

152152
# If you want to cheat for testing (can't use `_col` in real-world code):
153153
# categories = col._col.values.categories.values
154154
# codes = col._col.values.codes
155-
categories = np.asarray(list(mapping.values()))
155+
categories_column, = col.get_children() # need to keep a reference to the child
156+
categories = convert_column_to_ndarray(categories_column)[0]
156157
codes_buffer, codes_dtype = col.get_buffers()["data"]
157158
codes = buffer_to_ndarray(codes_buffer, codes_dtype)
158159
values = categories[codes]
@@ -446,18 +447,17 @@ def describe_categorical(self) -> Dict[str, Any]:
446447
If the dtype is categorical, there are two options:
447448
448449
- There are only values in the data buffer.
449-
- There is a separate dictionary-style encoding for categorical values.
450+
- The data buffer stores encoded values, while the (single)
451+
child column stores the categorical values themselves.
450452
451453
Raises TypeError if the dtype is not categorical
452454
453455
Content of returned dict:
454456
455457
- "is_ordered" : bool, whether the ordering of dictionary indices is
456458
semantically meaningful.
457-
- "is_dictionary" : bool, whether a dictionary-style mapping of
458-
categorical values to other objects exists
459-
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
460-
None if not a dictionary-style categorical.
459+
- "is_dictionary" : bool, whether the data is integer encoded
460+
461461
"""
462462
if not self.dtype[0] == _DtypeKind.CATEGORICAL:
463463
raise TypeError("`describe_categorical only works on a column with "
@@ -470,8 +470,7 @@ def describe_categorical(self) -> Dict[str, Any]:
470470
codes = self._col.values.codes # ndarray, length `self.size`
471471
# categories.values is ndarray of length n_categories
472472
categories = self._col.values.categories.values
473-
mapping = {ix: val for ix, val in enumerate(categories)}
474-
return ordered, is_dictionary, mapping
473+
return ordered, is_dictionary
475474

476475
@property
477476
def describe_null(self) -> Tuple[int, Any]:
@@ -693,6 +692,14 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
693692

694693
return buffer, dtype
695694

695+
def get_children(self):
696+
if self.dtype[0] == _DtypeKind.CATEGORICAL:
697+
if self.describe_categorical[1]:
698+
# return the categories as a child Column
699+
return (_PandasColumn(self._col.dtype.categories.to_series()),)
700+
else:
701+
return tuple()
702+
696703

697704
class _PandasDataFrame:
698705
"""
@@ -840,7 +847,7 @@ def test_categorical_dtype():
840847
assert col.null_count == 1
841848
assert col.describe_null == (2, -1) # sentinel value -1
842849
assert col.num_chunks() == 1
843-
assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5})
850+
assert col.describe_categorical == (False, True)
844851

845852
df2 = from_dataframe(df)
846853
assert_dataframe_equal(df.__dataframe__(), df)

0 commit comments

Comments
 (0)