@@ -145,14 +145,15 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
145
145
"""
146
146
Convert a categorical column to a Series instance.
147
147
"""
148
- ordered , is_dict , mapping = col .describe_categorical
148
+ ordered , is_dict = col .describe_categorical
149
149
if not is_dict :
150
150
raise NotImplementedError ('Non-dictionary categoricals not supported yet' )
151
151
152
152
# If you want to cheat for testing (can't use `_col` in real-world code):
153
153
# categories = col._col.values.categories.values
154
154
# codes = col._col.values.codes
155
- categories = np .asarray (list (mapping .values ()))
155
+ categories_column , = col .get_children () # need to keep a reference to the child
156
+ categories = convert_column_to_ndarray (categories_column )[0 ]
156
157
codes_buffer , codes_dtype = col .get_buffers ()["data" ]
157
158
codes = buffer_to_ndarray (codes_buffer , codes_dtype )
158
159
values = categories [codes ]
@@ -446,18 +447,17 @@ def describe_categorical(self) -> Dict[str, Any]:
446
447
If the dtype is categorical, there are two options:
447
448
448
449
- There are only values in the data buffer.
449
- - There is a separate dictionary-style encoding for categorical values.
450
+ - The data buffer stores encoded values, while the (single)
451
+ child column stores the categorical values themselves.
450
452
451
453
Raises TypeError if the dtype is not categorical
452
454
453
455
Content of returned dict:
454
456
455
457
- "is_ordered" : bool, whether the ordering of dictionary indices is
456
458
semantically meaningful.
457
- - "is_dictionary" : bool, whether a dictionary-style mapping of
458
- categorical values to other objects exists
459
- - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
460
- None if not a dictionary-style categorical.
459
+ - "is_dictionary" : bool, whether the data is integer encoded
460
+
461
461
"""
462
462
if not self .dtype [0 ] == _DtypeKind .CATEGORICAL :
463
463
raise TypeError ("`describe_categorical only works on a column with "
@@ -470,8 +470,7 @@ def describe_categorical(self) -> Dict[str, Any]:
470
470
codes = self ._col .values .codes # ndarray, length `self.size`
471
471
# categories.values is ndarray of length n_categories
472
472
categories = self ._col .values .categories .values
473
- mapping = {ix : val for ix , val in enumerate (categories )}
474
- return ordered , is_dictionary , mapping
473
+ return ordered , is_dictionary
475
474
476
475
@property
477
476
def describe_null (self ) -> Tuple [int , Any ]:
@@ -693,6 +692,14 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
693
692
694
693
return buffer , dtype
695
694
695
+ def get_children (self ):
696
+ if self .dtype [0 ] == _DtypeKind .CATEGORICAL :
697
+ if self .describe_categorical [1 ]:
698
+ # return the categories as a child Column
699
+ return (_PandasColumn (self ._col .dtype .categories .to_series ()),)
700
+ else :
701
+ return tuple ()
702
+
696
703
697
704
class _PandasDataFrame :
698
705
"""
@@ -840,7 +847,7 @@ def test_categorical_dtype():
840
847
assert col .null_count == 1
841
848
assert col .describe_null == (2 , - 1 ) # sentinel value -1
842
849
assert col .num_chunks () == 1
843
- assert col .describe_categorical == (False , True , { 0 : 1 , 1 : 2 , 2 : 5 } )
850
+ assert col .describe_categorical == (False , True )
844
851
845
852
df2 = from_dataframe (df )
846
853
assert_dataframe_equal (df .__dataframe__ (), df )
0 commit comments