@@ -145,15 +145,14 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
145
145
"""
146
146
Convert a categorical column to a Series instance.
147
147
"""
148
- ordered , is_dict = col .describe_categorical
148
+ # `describe_categorical` returns a dict; unpacking it directly would bind
+ # the key strings rather than the values, so look each field up by key.
+ categorical = col.describe_categorical
+ ordered = categorical["is_ordered"]
+ is_dict = categorical["is_dictionary"]
+ mapping = categorical["mapping"]
149
149
if not is_dict :
150
150
raise NotImplementedError ('Non-dictionary categoricals not supported yet' )
151
151
152
152
# If you want to cheat for testing (can't use `_col` in real-world code):
153
153
# categories = col._col.values.categories.values
154
154
# codes = col._col.values.codes
155
- categories_column , = col .get_children () # need to keep a reference to the child
156
- categories = convert_column_to_ndarray (categories_column )[0 ]
155
+ # The previous code took element [0] of convert_column_to_ndarray's result
+ # to get the array that is indexed by `codes` below; keep doing so
+ # (NOTE(review): helper not visible in this diff - confirm its return shape).
+ categories = convert_column_to_ndarray(mapping)[0]
157
156
codes_buffer , codes_dtype = col .get_buffers ()["data" ]
158
157
codes = buffer_to_ndarray (codes_buffer , codes_dtype )
159
158
values = categories [codes ]
@@ -457,20 +456,19 @@ def describe_categorical(self) -> Dict[str, Any]:
457
456
- "is_ordered" : bool, whether the ordering of dictionary indices is
458
457
semantically meaningful.
459
458
- "is_dictionary" : bool, whether the data is integer encoded
460
-
459
+ - "mapping" : Column representing the mapping of indices to category values.
460
+ None if not a dictionary-style categorical.
461
461
"""
462
462
if not self .dtype [0 ] == _DtypeKind .CATEGORICAL :
463
463
raise TypeError ("`describe_categorical only works on a column with "
464
464
"categorical dtype!" )
465
465
466
466
ordered = self ._col .dtype .ordered
467
467
is_dictionary = True
468
- # NOTE: this shows the children approach is better, transforming
469
- # `categories` to a "mapping" dict is inefficient
470
- codes = self ._col .values .codes # ndarray, length `self.size`
471
- # categories.values is ndarray of length n_categories
472
- categories = self ._col .values .categories .values
473
- return ordered , is_dictionary
468
+ categories = _PandasColumn (self ._col .dtype .categories .to_series ())
469
+ return {"is_ordered" : ordered ,
470
+ "is_dictionary" : is_dictionary ,
471
+ "mapping" : categories }
474
472
475
473
@property
476
474
def describe_null (self ) -> Tuple [int , Any ]:
@@ -692,14 +690,6 @@ def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
692
690
693
691
return buffer , dtype
694
692
695
- def get_children (self ):
696
- if self .dtype [0 ] == _DtypeKind .CATEGORICAL :
697
- if self .describe_categorical [1 ]:
698
- # return the categories as a child Column
699
- return (_PandasColumn (self ._col .dtype .categories .to_series ()),)
700
- else :
701
- return tuple ()
702
-
703
693
704
694
class _PandasDataFrame :
705
695
"""
@@ -847,7 +837,8 @@ def test_categorical_dtype():
847
837
assert col .null_count == 1
848
838
assert col .describe_null == (2 , - 1 ) # sentinel value -1
849
839
assert col .num_chunks () == 1
850
- assert col .describe_categorical == (False , True )
840
+ assert col .describe_categorical ["is_ordered" ] == False
841
+ assert col .describe_categorical ["is_dictionary" ] == True
851
842
852
843
df2 = from_dataframe (df )
853
844
assert_dataframe_equal (df .__dataframe__ (), df )
0 commit comments