@@ -62,8 +62,16 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
62
62
# We need a dict of columns here, with each column being a numpy array (at
63
63
# least for now, deal with non-numpy dtypes later).
64
64
columns = dict ()
65
+ _k = _DtypeKind
65
66
for name in df .column_names ():
66
- columns [name ] = convert_column_to_ndarray (df .get_column_by_name (name ))
67
+ col = df .get_column_by_name (name )
68
+ if col .dtype [0 ] in (_k .INT , _k .UINT , _k .FLOAT , _k .BOOL ):
69
+ # Simple numerical or bool dtype, turn into numpy array
70
+ columns [name ] = convert_column_to_ndarray (col )
71
+ elif col .dtype [0 ] == _k .CATEGORICAL :
72
+ columns [name ] = convert_categorical_column (col )
73
+ else :
74
+ raise NotImplementedError (f"Data type { col .dtype [0 ]} not handled yet" )
67
75
68
76
return pd .DataFrame (columns )
69
77
@@ -80,6 +88,7 @@ class _DtypeKind(enum.IntEnum):
80
88
81
89
def convert_column_to_ndarray (col : ColumnObject ) -> np .ndarray :
82
90
"""
91
+ Convert an int, uint, float or bool column to a numpy array
83
92
"""
84
93
if col .offset != 0 :
85
94
raise NotImplementedError ("column.offset > 0 not handled yet" )
@@ -117,6 +126,32 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
117
126
return x
118
127
119
128
129
def convert_categorical_column(col : ColumnObject) -> pd.Series:
    """
    Convert a categorical column to a Series instance.

    Parameters
    ----------
    col : ColumnObject
        Column whose ``describe_categorical`` property returns an
        ``(ordered, is_dictionary, mapping)`` tuple and whose
        ``describe_null`` property returns a ``(kind, value)`` tuple.

    Returns
    -------
    pd.Series
        Series of categorical dtype with the column's categories and
        orderedness; entries whose code equals the null sentinel are missing.

    Raises
    ------
    NotImplementedError
        If the column is not dictionary-encoded, or if its null
        representation is neither "non-nullable" (kind 0) nor "sentinel
        value" (kind 2).
    """
    ordered, is_dict, _ = col.describe_categorical
    if not is_dict:
        raise NotImplementedError('Non-dictionary categoricals not supported yet')

    # FIXME: this is cheating, can't use `_col` (just testing now)
    categories = col._col.values.categories.values
    codes = np.asarray(col._col.values.codes)

    # Deal with null values
    null_kind, sentinel = col.describe_null
    if null_kind == 2:  # sentinel value
        # `from_codes` treats -1 as "missing", so normalise whatever sentinel
        # the producer uses to -1 instead of indexing `categories` with it
        # (a negative sentinel would silently wrap around to the last
        # category, and fail outright when there are no categories).
        codes = np.where(codes == sentinel, -1, codes)
    elif null_kind != 0:  # 0 means non-nullable: nothing to null out
        raise NotImplementedError(
            f"Null kind {null_kind} not handled yet for categorical columns")

    cat = pd.Categorical.from_codes(codes, categories=categories, ordered=ordered)
    return pd.Series(cat)
153
+
154
+
120
155
def __dataframe__ (cls , nan_as_null : bool = False ) -> dict :
121
156
"""
122
157
The public method to attach to pd.DataFrame
@@ -324,13 +359,14 @@ def describe_categorical(self) -> Dict[str, Any]:
324
359
"categorical dtype!" )
325
360
326
361
ordered = self ._col .dtype .ordered
327
- is_dictionary = False
328
- # NOTE: this shows the children approach is better, transforming this
329
- # to a "mapping" dict would be inefficient
362
+ is_dictionary = True
363
+ # NOTE: this shows the children approach is better, transforming
364
+ # `categories` to a "mapping" dict is inefficient
330
365
codes = self ._col .values .codes # ndarray, length `self.size`
331
366
# categories.values is ndarray of length n_categories
332
- categories = self ._col .values .categories
333
- return ordered , is_dictionary , None
367
+ categories = self ._col .values .categories .values
368
+ mapping = {ix : val for ix , val in enumerate (categories )}
369
+ return ordered , is_dictionary , mapping
334
370
335
371
@property
336
372
def describe_null (self ) -> Tuple [int , Any ]:
@@ -402,7 +438,7 @@ def get_mask(self) -> _PandasBuffer:
402
438
403
439
Raises RuntimeError if null representation is not a bit or byte mask.
404
440
"""
405
- null = self .describe_null ()
441
+ null , value = self .describe_null
406
442
if null == 0 :
407
443
msg = "This column is non-nullable so does not have a mask"
408
444
elif null == 1 :
@@ -501,7 +537,7 @@ def test_noncontiguous_columns():
501
537
502
538
503
539
def test_categorical_dtype ():
504
- df = pd .DataFrame ({"A" : [1 , 2 , 3 , 1 ]})
540
+ df = pd .DataFrame ({"A" : [1 , 2 , 5 , 1 ]})
505
541
df ["B" ] = df ["A" ].astype ("category" )
506
542
df .at [1 , 'B' ] = np .nan # Set one item to null
507
543
@@ -511,15 +547,15 @@ def test_categorical_dtype():
511
547
assert col .null_count == 1
512
548
assert col .describe_null == (2 , - 1 ) # sentinel value -1
513
549
assert col .num_chunks () == 1
514
- assert col .describe_categorical == (False , False , None )
550
+ assert col .describe_categorical == (False , True , { 0 : 1 , 1 : 2 , 2 : 5 } )
515
551
516
552
df2 = from_dataframe (df )
517
553
tm .assert_frame_equal (df , df2 )
518
554
519
555
520
556
if __name__ == '__main__' :
557
+ test_categorical_dtype ()
521
558
test_float_only ()
522
559
test_mixed_intfloat ()
523
560
test_noncontiguous_columns ()
524
- test_categorical_dtype ()
525
561
0 commit comments