Skip to content

Commit 1b6ef4e

Browse files
committed
Illustrate issue with categorical dtype & get_data_buffer()
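This shows that the simple design doesn't fully work (see the FIXMEs in the diff): for a categorical column, `get_data_buffer()` returns the encoded codes as a bare Buffer, which carries no dtype information, so the consumer cannot interpret them. Instead, the `children` concept is needed. That way the categorical-encoded data values can be returned as a child Column rather than a Buffer, and the child's `Column.dtype` then provides the information needed to interpret the buffer backing the column.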
1 parent cfabb9f commit 1b6ef4e

File tree

1 file changed

+22
-9
lines changed

1 file changed

+22
-9
lines changed

protocol/pandas_implementation.py

+22-9
Original file line number | Diff line number | Diff line change
@@ -135,20 +135,24 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
135135
raise NotImplementedError('Non-dictionary categoricals not supported yet')
136136

137137
# FIXME: this is cheating, can't use `_col` (just testing now)
138-
categories = col._col.values.categories.values
139-
codes = col._col.values.codes
138+
# categories = col._col.values.categories.values
139+
# codes = col._col.values.codes
140+
categories = np.asarray(list(mapping.values()))
141+
codes = col.get_data_buffer() # this is broken; don't have dtype info for buffer
140142
values = categories[codes]
141143

142-
# Deal with null values
143-
null_kind = col.describe_null[0]
144-
if null_kind == 2: # sentinel value
145-
sentinel = col.describe_null[1]
146-
147144
# Seems like Pandas can only construct with non-null values, so need to
148145
# null out the nulls later
149146
cat = pd.Categorical(values, categories=categories, ordered=ordered)
150147
series = pd.Series(cat)
151-
series[codes == sentinel] = np.nan
148+
null_kind = col.describe_null[0]
149+
if null_kind == 2: # sentinel value
150+
sentinel = col.describe_null[1]
151+
series[codes == sentinel] = np.nan
152+
else:
153+
raise NotImplementedError("Only categorical columns with sentinel "
154+
"value supported at the moment")
155+
152156
return series
153157

154158

@@ -430,7 +434,16 @@ def get_data_buffer(self) -> _PandasBuffer:
430434
"""
431435
Return the buffer containing the data.
432436
"""
433-
return _PandasBuffer(self._col.to_numpy())
437+
_k = _DtypeKind
438+
if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
439+
buffer = _PandasBuffer(self._col.to_numpy())
440+
elif self.dtype[0] == _k.CATEGORICAL:
441+
# FIXME: losing the dtype info here - see `convert_categorical_column`
442+
buffer = _PandasBuffer(self._col.values.codes)
443+
else:
444+
raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
445+
446+
return buffer
434447

435448
def get_mask(self) -> _PandasBuffer:
436449
"""

0 commit comments

Comments
 (0)