Skip to content

Commit 9c2717b

Browse files
committed
Partial support for categorical dtypes - export works
1 parent c08ec10 commit 9c2717b

File tree

1 file changed

+43
-18
lines changed

1 file changed

+43
-18
lines changed

protocol/pandas_implementation.py

+43-18
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
8484
if col.offset != 0:
8585
raise NotImplementedError("column.offset > 0 not handled yet")
8686

87-
if col.describe_null not in (0, 1):
87+
if col.describe_null[0] not in (0, 1):
8888
raise NotImplementedError("Null values represented as masks or "
8989
"sentinel values not handled yet")
9090

@@ -230,19 +230,19 @@ def offset(self) -> int:
230230
return 0
231231

232232
@property
233-
def dtype(self) -> Tuple[int, int, str, str]:
233+
def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
234234
"""
235235
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
236236
237237
Kind :
238238
239-
- 0 : signed integer
240-
- 1 : unsigned integer
241-
- 2 : IEEE floating point
242-
- 20 : boolean
243-
- 21 : string (UTF-8)
244-
- 22 : datetime
245-
- 23 : categorical
239+
- INT = 0
240+
- UINT = 1
241+
- FLOAT = 2
242+
- BOOL = 20
243+
- STRING = 21 # UTF-8
244+
- DATETIME = 22
245+
- CATEGORICAL = 23
246246
247247
Bit-width : the number of bits as an integer
248248
Format string : data type description format string in Apache Arrow C
@@ -273,15 +273,25 @@ def dtype(self) -> Tuple[int, int, str, str]:
273273
# Note: 'c' (complex) not handled yet (not in array spec v1).
274274
# 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
275275
# datetime and timedelta both map to datetime (is timedelta handled?)
276-
_np_kinds = {'i': 0, 'u': 1, 'f': 2, 'b': 20, 'O': 21, 'U': 21,
277-
'M': 22, 'm': 22}
276+
_k = _DtypeKind
277+
_np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL,
278+
'U': _k.STRING,
279+
'M': _k.DATETIME, 'm': _k.DATETIME}
278280
kind = _np_kinds.get(dtype.kind, None)
279281
if kind is None:
280-
raise NotImplementedError("Data type {} not handled".format(dtype))
282+
# Not a NumPy dtype. Check if it's a categorical maybe
283+
if isinstance(dtype, pd.CategoricalDtype):
284+
kind = 23
285+
else:
286+
raise ValueError(f"Data type {dtype} not supported by exchange"
287+
"protocol")
288+
289+
if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL):
290+
raise NotImplementedError(f"Data type {dtype} not handled yet")
281291

282292
bitwidth = dtype.itemsize * 8
283293
format_str = dtype.str
284-
endianness = dtype.byteorder
294+
endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
285295
return (kind, bitwidth, format_str, endianness)
286296

287297

@@ -324,19 +334,26 @@ def describe_null(self) -> Tuple[int, Any]:
324334
325335
Value : if kind is "sentinel value", the actual value. None otherwise.
326336
"""
337+
_k = _DtypeKind
327338
kind = self.dtype[0]
328-
if kind == 2:
339+
value = None
340+
if kind == _k.FLOAT:
329341
null = 1 # np.nan
330-
elif kind == 22:
342+
elif kind == _k.DATETIME:
331343
null = 1 # np.datetime64('NaT')
332-
elif kind in (0, 1, 20):
344+
elif kind in (_k.INT, _k.UINT, _k.BOOL):
333345
# TODO: check if extension dtypes are used once support for them is
334346
# implemented in this protocol code
335347
null = 0 # integer and boolean dtypes are non-nullable
348+
elif kind == _k.CATEGORICAL:
349+
# Null values for categoricals are stored as `-1` sentinel values
350+
# in the category data (e.g., `col.values.codes` is int8 np.ndarray)
351+
null = 2
352+
value = -1
336353
else:
337-
raise NotImplementedError('TODO')
354+
raise NotImplementedError(f'Data type {self.dtype} not yet supported')
338355

339-
return null
356+
return null, value
340357

341358
@property
342359
def null_count(self) -> int:
@@ -469,8 +486,16 @@ def test_noncontiguous_columns():
469486
#df2 = from_dataframe(df)
470487

471488

489+
def test_categorical_dtype():
490+
df = pd.DataFrame({"A": [1, 2, 3, 1]})
491+
df["B"] = df["A"].astype("category")
492+
df.at[1, 'B'] = np.nan # Set one item to null
493+
df2 = from_dataframe(df)
494+
495+
472496
if __name__ == '__main__':
473497
test_float_only()
474498
test_mixed_intfloat()
475499
test_noncontiguous_columns()
500+
test_categorical_dtype()
476501

0 commit comments

Comments
 (0)