@@ -97,8 +97,12 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
97
97
raise NotImplementedError ("Null values represented as masks or "
98
98
"sentinel values not handled yet" )
99
99
100
+ _buffer , _dtype = col .get_data_buffer ()
101
+ return buffer_to_ndarray (_buffer , _dtype )
102
+
103
+
104
+ def buffer_to_ndarray (_buffer , _dtype ) -> np .ndarray :
100
105
# Handle the dtype
101
- _dtype = col .dtype
102
106
kind = _dtype [0 ]
103
107
bitwidth = _dtype [1 ]
104
108
_k = _DtypeKind
@@ -113,7 +117,6 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
113
117
114
118
# No DLPack yet, so need to construct a new ndarray from the data pointer
115
119
# and size in the buffer plus the dtype on the column
116
- _buffer = col .get_data_buffer ()
117
120
ctypes_type = np .ctypeslib .as_ctypes_type (column_dtype )
118
121
data_pointer = ctypes .cast (_buffer .ptr , ctypes .POINTER (ctypes_type ))
119
122
@@ -134,11 +137,12 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
134
137
if not is_dict :
135
138
raise NotImplementedError ('Non-dictionary categoricals not supported yet' )
136
139
137
- # FIXME: this is cheating, can't use `_col` (just testing now)
140
+ # If you want to cheat for testing (can't use `_col` in real-world code):
138
141
# categories = col._col.values.categories.values
139
142
# codes = col._col.values.codes
140
143
categories = np .asarray (list (mapping .values ()))
141
- codes = col .get_data_buffer () # this is broken; don't have dtype info for buffer
144
+ codes_buffer , codes_dtype = col .get_data_buffer ()
145
+ codes = buffer_to_ndarray (codes_buffer , codes_dtype )
142
146
values = categories [codes ]
143
147
144
148
# Seems like Pandas can only construct with non-null values, so need to
@@ -314,6 +318,12 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
314
318
and nested (list, struct, map, union) dtypes.
315
319
"""
316
320
dtype = self ._col .dtype
321
+ return self ._dtype_from_pandasdtype (dtype )
322
+
323
+ def _dtype_from_pandasdtype (self , dtype ) -> Tuple [enum .IntEnum , int , str , str ]:
324
+ """
325
+ See `self.dtype` for details
326
+ """
317
327
# Note: 'c' (complex) not handled yet (not in array spec v1).
318
328
# 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
319
329
# datetime and timedelta both map to datetime (is timedelta handled?)
@@ -430,20 +440,22 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn
430
440
"""
431
441
return (self ,)
432
442
433
def get_data_buffer(self) -> Tuple[_PandasBuffer, Tuple[enum.IntEnum, int, str, str]]:
    """
    Return the buffer containing the data, together with its dtype.

    The second element is a dtype tuple in the same format as
    ``self.dtype``; it describes the contents of the returned buffer.
    For categorical columns the buffer wraps the codes, so the dtype is
    derived from the codes array rather than taken from the column.

    Raises
    ------
    NotImplementedError
        If the column's dtype kind is not one of INT, UINT, FLOAT, BOOL
        or CATEGORICAL.
    """
    _k = _DtypeKind
    # Read `self.dtype` once and reuse it, rather than re-evaluating it
    # for every comparison and again for the return value.
    col_dtype = self.dtype
    kind = col_dtype[0]

    if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
        buffer = _PandasBuffer(self._col.to_numpy())
        dtype = col_dtype
    elif kind == _k.CATEGORICAL:
        # The buffer wraps the codes, so report the dtype of the codes.
        codes = self._col.values.codes
        buffer = _PandasBuffer(codes)
        dtype = self._dtype_from_pandasdtype(codes.dtype)
    else:
        raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

    return buffer, dtype
447
459
448
460
def get_mask (self ) -> _PandasBuffer :
449
461
"""
0 commit comments