From 9f82870143f2c93bf9c6ff7879c1936b14116542 Mon Sep 17 00:00:00 2001
From: Ralf Gommers
Date: Tue, 9 Feb 2021 21:29:17 +0100
Subject: [PATCH 01/13] Add a prototype of the dataframe interchange protocol

Related to requirements in gh-35. TBD (to be discussed) comments and design
decisions at the top of the file indicate topics for closer review/discussion.
---
 protocol/dataframe_protocol.py | 385 +++++++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+)
 create mode 100644 protocol/dataframe_protocol.py

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
new file mode 100644
index 00000000..f36fe2c3
--- /dev/null
+++ b/protocol/dataframe_protocol.py
@@ -0,0 +1,385 @@
+"""
+Specification for objects to be accessed, for the purpose of dataframe
+interchange between libraries, via the ``__dataframe__`` method on a library's
+data frame object.
+
+For guiding requirements, see https://github.com/data-apis/dataframe-api/pull/35
+
+Design decisions
+----------------
+
+**1. Use a separate column abstraction in addition to a dataframe interface.**
+
+Rationales:
+- This is how it works in R, Julia and Apache Arrow.
+- Semantically, most existing applications and users treat a column similarly to a 1-D array.
+- We should be able to connect a column to the array data interchange mechanism(s).
+
+Note that this does not imply a library must have such a public user-facing
+abstraction (e.g. ``pandas.Series``) - the column abstraction may be
+reachable only via ``__dataframe__``.
+
+**2. Use methods and properties on an opaque object rather than returning
+hierarchical dictionaries describing memory**
+
+This is better for implementations that may rely on, for example, lazy
+computation.
+
+**3. No row names. If a library uses row names, use a regular column for them.**
+
+See discussion at https://github.com/wesm/dataframe-protocol/pull/1/files#r394316241
+Optional row names are not a good idea, because people will assume they're present
+(see cuDF experience, forced to add because pandas has them).
+Requiring row names seems worse than leaving them out.
+
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, Optional, Sequence, Tuple
+
+
+class Buffer:
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+    """
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Buffer size in bytes
+        """
+        pass
+
+    @property
+    def ptr(self) -> int:
+        """
+        Pointer to start of the buffer as an integer
+        """
+        pass
+
+    def __dlpack__(self):
+        """
+        Produce DLPack capsule (see array API standard).
+
+        Raises:
+
+        - TypeError : if the buffer contains unsupported dtypes.
+        - NotImplementedError : if DLPack support is not implemented
+
+        Useful to have to connect to array libraries. Support optional because
+        it's not completely trivial to implement for a Python-only library.
+        """
+        raise NotImplementedError("__dlpack__")
+
+    def __array_interface__(self):
+        """
+        TBD: implement or not? Will work for all dtypes except bit masks.
+        """
+        raise NotImplementedError("__array_interface__")
+
+
+class Column:
+    """
+    A column object, with only the methods and properties required by the
+    interchange protocol defined.
+
+    A column can contain one or more chunks. Each chunk can contain either one
+    or two buffers - one data buffer and (depending on null representation) it
+    may have a mask buffer.
+
+    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
+         Instead, it seems to use "children" for both columns with a bit mask,
+         and for nested dtypes. Unclear whether this is elegant or confusing.
+         This design requires checking the null representation explicitly.
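+
+         For illustration only, explicit checking on the consumer side could
+         look like this sketch (using the ``describe_null`` and ``get_mask``
+         members defined further down)::
+
+             kind, value = col.describe_null
+             if kind in (2, 3):     # bit mask or byte mask
+                 mask = col.get_mask()
+             elif kind == 1:        # sentinel value
+                 sentinel = value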
+ + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + @property + def name(self) -> str: + pass + + @property + def size(self) -> Optional[int]: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + """ + pass + + @property + def offset(self) -> int: + """ + Offset of first element + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + def dtype(self) -> Tuple[int, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + + Kind : + + - 0 : signed integer + - 1 : unsigned integer + - 2 : IEEE floating point + - 20 : boolean + - 21 : string (UTF-8) + - 22 : datetime + - 23 : categorical + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. 
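+
+        Examples of dtype descriptions under this scheme (illustrative
+        values; the format strings follow the Apache Arrow C Data
+        Interface)::
+
+            (0, 64, "l", "=")     # int64
+            (2, 64, "g", "=")     # float64
+            (20, 1, "b", "=")     # boolean, bit-packed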
+        """
+        pass
+
+    @property
+    def describe_categorical(self) -> Dict[str, Any]:
+        """
+        If the dtype is categorical, there are two options:
+
+        - There are only values in the data buffer.
+        - There is a separate dictionary-style encoding for categorical values.
+
+        Raises RuntimeError if the dtype is not categorical.
+
+        Content of returned dict:
+
+        - "is_ordered" : bool, whether the ordering of dictionary indices is
+                         semantically meaningful.
+        - "is_dictionary" : bool, whether a dictionary-style mapping of
+                            categorical values to other objects exists
+        - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
+                      None if not a dictionary-style categorical.
+
+        TBD: are there any other in-memory representations that are needed?
+        """
+        pass
+
+    @property
+    def describe_null(self) -> Tuple[int, Any]:
+        """
+        Return the missing value (or "null") representation the column dtype
+        uses, as a tuple ``(kind, value)``.
+
+        Kind:
+
+        - 0 : NaN/NaT
+        - 1 : sentinel value
+        - 2 : bit mask
+        - 3 : byte mask
+
+        Value : if kind is "sentinel value", the actual value. None otherwise.
+        """
+        pass
+
+    @property
+    def null_count(self) -> Optional[int]:
+        """
+        Number of null elements, if known.
+
+        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
+        """
+        pass
+
+    def num_chunks(self) -> int:
+        """
+        Return the number of chunks the column consists of.
+        """
+        pass
+
+    def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
+        """
+        Return an iterator yielding the chunks.
+
+        See `DataFrame.get_chunks` for details on ``n_chunks``.
+        """
+        pass
+
+    def get_buffer(self) -> Buffer:
+        """
+        Return the buffer containing the data.
+        """
+        pass
+
+    def get_mask(self) -> Buffer:
+        """
+        Return the buffer containing the mask values indicating missing data.
+
+        Raises RuntimeError if null representation is not a bit or byte mask.
+        """
+        pass
+
+#    # NOTE: not needed unless one considers nested dtypes
+#    def get_children(self) -> Iterable[Column]:
+#        """
+#        Children columns underneath the column, each object in this iterator
+#        must adhere to the column specification
+#        """
+#        pass
+
+
+class DataFrame:
+    """
+    A data frame class, with only the methods required by the interchange
+    protocol defined.
+
+    A "data frame" represents an ordered collection of named columns.
+    A column's "name" must be a unique string.
+    Columns may be accessed by name or by position.
+
+    This could be a public data frame class, or an object with the methods and
+    attributes defined on this DataFrame class could be returned from the
+    ``__dataframe__`` method of a public data frame class in a library adhering
+    to the dataframe interchange protocol specification.
+    """
+    def __dataframe__(self, nan_as_null : bool = False) -> dict:
+        """
+        Produces a dictionary object following the dataframe protocol spec.
+        """
+        self._nan_as_null = nan_as_null
+        return {
+            "dataframe": self,  # DataFrame object adhering to the protocol
+            "version": 0        # Version number of the protocol
+        }
+
+    def num_columns(self) -> int:
+        """
+        Return the number of columns in the DataFrame.
+        """
+        pass
+
+    def num_rows(self) -> Optional[int]:
+        # TODO: not happy with Optional, but need to flag it may be expensive
+        #       why include it if it may be None - what do we expect consumers
+        #       to do here?
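+        #       (One option: a consumer that needs the count could fall back
+        #       to iterating over ``get_chunks()`` and summing the per-chunk
+        #       column sizes - noted as a possibility, not a decision.)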
+ """ + Return the number of rows in the DataFrame, if available + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of + """ + pass + + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + def select_columns(self, indices: Sequence[int]) -> DataFrame: + """ + Create a new DataFrame by selecting a subset of columns by index + """ + pass + + def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + """ + pass + + @property + def device(self) -> int: + """ + Device type the dataframe resides on. + + Uses device type codes matching DLPack: + + - 1 : CPU + - 2 : CUDA + - 3 : CPU pinned + - 4 : OpenCL + - 7 : Vulkan + - 8 : Metal + - 9 : Verilog + - 10 : ROCm + """ + pass From b201c68643c06d0cc78ca8717d929b5beec74aaf Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 10 Feb 2021 22:20:05 +0100 Subject: [PATCH 02/13] Remove Column.name, rename get_buffer to get_data_buffer --- protocol/dataframe_protocol.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index f36fe2c3..f612667b 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -117,9 +117,6 @@ class Column: doesn't need its own version or ``__column__`` protocol. """ - @property - def name(self) -> str: - pass @property def size(self) -> Optional[int]: @@ -247,7 +244,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: """ pass - def get_buffer(self) -> Buffer: + def get_data_buffer(self) -> Buffer: """ Return the buffer containing the data. """ From 61d84f35fb76f2db346991e4bab10e80aac5a2e9 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 11 Feb 2021 17:52:04 +0100 Subject: [PATCH 03/13] Remove __array_interface__ --- protocol/dataframe_protocol.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index f612667b..9e3d02b8 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -67,12 +67,6 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") - def __array_interface__(self): - """ - TBD: implement or not? Will work for all dtypes except bit masks. 
- """ - raise NotImplementedError("__array_interface__") - class Column: """ From dc3b373e13569338deb96038f17a8585ac9a5f90 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 11 Feb 2021 17:53:36 +0100 Subject: [PATCH 04/13] Add "non-nullable" to `Column.describe_null` --- protocol/dataframe_protocol.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 9e3d02b8..448dbc2c 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -206,10 +206,11 @@ def describe_null(self) -> Tuple[int, Any]: Kind: - - 0 : NaN/NaT - - 1 : sentinel value - - 2 : bit mask - - 3 : byte mask + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask Value : if kind is "sentinel value", the actual value. None otherwise. """ From cb338fd6b5edfacf101e932dfe67f7124f541685 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Mon, 1 Mar 2021 11:36:52 +0100 Subject: [PATCH 05/13] Address some review comments and add more docs --- protocol/dataframe_protocol.py | 82 +++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 17 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 448dbc2c..f7e4b31e 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -5,6 +5,32 @@ For guiding requirements, see https://github.com/data-apis/dataframe-api/pull/35 + +Concepts in this design +----------------------- + +1. A `Buffer` class. A *buffer* is a contiguous block of memory - this is the + only thing that actually maps to a 1-D array in a sense that it could be + converted to NumPy, CuPy, et al. +2. A `Column` class. A *column* has a name and a single dtype. It can consist + of multiple *chunks*. A single chunk of a column (which may be the whole + column if ``num_chunks == 1``) is modeled as again a `Column` instance, and + contains 1 data *buffer* and (optionally) one *mask* for missing data. +3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*. + It has a single device, and all its rows are the same length. It can consist + of multiple *chunks*. A single chunk of a data frame is modeled as + again a `DataFrame` instance. +4. A *mask* concept. A *mask* of a single-chunk column is a *buffer*. +5. A *chunk* concept. A *chunk* is a sub-dividing element that can be applied + to a *data frame* or a *column*. + +Note that the only way to access these objects is through a call to +``__dataframe__`` on a data frame object. This is NOT meant as public API; +only think of instances of the different classes here to describe the API of +what is returned by a call to ``__dataframe__``. They are the concepts needed +to capture the memory layout and data access of a data frame. + + Design decisions ---------------- @@ -31,12 +57,27 @@ (see cuDF experience, forced to add because pandas has them). Requiring row names seems worse than leaving them out. +Note that row labels could be added in the future - right now there's no clear +requirements for more complex row labels that cannot be represented by a single +column. That do exist, for example Modin has has table and tree-based row +labels. + """ class Buffer: """ Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. 
However, if the column that the buffer is
+    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+    implemented, then that dtype information will be contained in the return
+    value from ``__dlpack__``.
+
+    This distinction is useful to support both (a) data exchange via DLPack
+    on a buffer and (b) dtypes like variable-length strings, which do not
+    have a fixed number of bytes per element.
     """
 
     @property
@@ -67,6 +108,25 @@ def __dlpack__(self):
         """
         raise NotImplementedError("__dlpack__")
 
+    def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
+        """
+        Device type and device ID for where the data in the buffer resides.
+
+        Uses device type codes matching DLPack. Enum members are::
+
+            - CPU = 1
+            - CUDA = 2
+            - CPU_PINNED = 3
+            - OPENCL = 4
+            - VULKAN = 7
+            - METAL = 8
+            - VPI = 9
+            - ROCM = 10
+
+        Note: must be implemented even if ``__dlpack__`` is not.
+        """
+        pass
+
 
 class Column:
     """
@@ -339,6 +404,11 @@ class DataFrame:
     def __dataframe__(self, nan_as_null : bool = False) -> dict:
         """
         Produces a dictionary object following the dataframe protocol spec.
+
+        ``nan_as_null`` is a keyword intended for the consumer to tell the
+        producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+        It is intended for cases where the consumer does not support the bit
+        mask or byte mask that is the producer's native representation.
         """
         self._nan_as_null = nan_as_null
         return {
@@ -354,20 +419,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
         """
         pass
 
-    @property
-    def device(self) -> int:
-        """
-        Device type the dataframe resides on.
-
-        Uses device type codes matching DLPack:
-
-        - 1 : CPU
-        - 2 : CUDA
-        - 3 : CPU pinned
-        - 4 : OpenCL
-        - 7 : Vulkan
-        - 8 : Metal
-        - 9 : Verilog
-        - 10 : ROCm
-        """
-        pass

From 35d3c0d8a21c2925144abe197d4d84f247d04974 Mon Sep 17 00:00:00 2001
From: Ralf Gommers
Date: Wed, 3 Mar 2021 00:27:23 +0100
Subject: [PATCH 06/13] Add a Pandas implementation of the interchange
 protocol

---
 protocol/pandas_implementation.py | 450 ++++++++++++++++++++++++++++++
 1 file changed, 450 insertions(+)
 create mode 100644 protocol/pandas_implementation.py

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
new file mode 100644
index 00000000..b3cd9f1a
--- /dev/null
+++ b/protocol/pandas_implementation.py
@@ -0,0 +1,450 @@
+"""
+Implementation of the dataframe exchange protocol.

+Public API
+----------
+
+from_dataframe : construct a pandas.DataFrame from an input data frame which
+    implements the exchange protocol
+
+Notes
+-----
+
+- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
+  do in pure Python. It's more general but definitely less friendly than having
+  ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
+  ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
+  this is worth looking at again.
+
+"""
+
+import enum
+import collections.abc
+import ctypes
+from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
+
+import pandas as pd
+import numpy as np
+import pandas._testing as tm
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
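+# (A sketch of what such a typing protocol could look like - hypothetical,
+# not part of this implementation:
+#
+#     class SupportsDataFrame(Protocol):
+#         def __dataframe__(self, nan_as_null: bool = False) -> Any: ...
+#
+# `from_dataframe` below would then accept `df: SupportsDataFrame`.)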
+DataFrameObject = Any +ColumnObject = Any + + +def from_dataframe(df : DataFrameObject) -> pd.DataFrame: + """ + Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` + """ + # NOTE: commented out for roundtrip testing + # if isinstance(df, pd.DataFrame): + # return df + + if not hasattr(df, '__dataframe__'): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__()) + + +def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: + """ + Note: not all cases are handled yet, only ones that can be implemented with + only Pandas. Later, we need to implement/test support for categoricals, + bit/byte masks, chunk handling, etc. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError + + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later). + columns = dict() + for name in df.column_names(): + columns[name] = convert_column_to_ndarray(df.get_column_by_name(name)) + + return pd.DataFrame(columns) + + +def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: + """ + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + if col.describe_null not in (0, 1): + raise NotImplementedError("Null values represented as masks or " + "sentinel values not handled yet") + + # Handle the dtype + _dtype = col.dtype + kind = _dtype[0] + bitwidth = _dtype[1] + if _dtype[0] not in (0, 1, 2, 20): + raise RuntimeError("Not a boolean, integer or floating-point dtype") + + _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} + _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} + _floats = {32: np.float32, 64: np.float64} + _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + column_dtype = _np_dtypes[kind][bitwidth] + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + _buffer = col.get_data_buffer() + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) + + # NOTE: `x` does not own its memory, so the caller of this function must + # either make a copy or hold on to a reference of the column or + # buffer! (not done yet, this is pretty awful ...) + x = np.ctypeslib.as_array(data_pointer, + shape=(_buffer.bufsize // (bitwidth//8),)) + + return x + + +def __dataframe__(cls, nan_as_null : bool = False) -> dict: + """ + The public method to attach to pd.DataFrame + + We'll attach it via monkeypatching here for demo purposes. If Pandas adopt + the protocol, this will be a regular method on pandas.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + return _PandasDataFrame(cls, nan_as_null=nan_as_null) + + +# Monkeypatch the Pandas DataFrame class to support the interchange protocol +pd.DataFrame.__dataframe__ = __dataframe__ + + +# Implementation of interchange protocol +# -------------------------------------- + +class _PandasBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x : np.ndarray) -> None: + """ + Handle only regular columns (= numpy arrays) for now. 
+ """ + if not x.strides == (x.dtype.itemsize,): + # Array is not contiguous - is this possible? + raise RuntimeError("Design needs fixing - non-contiguous buffer") + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes + """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer + """ + return self._x.__array_interface__['data'][0] + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + """ + Device type and device ID for where the data in the buffer resides. + """ + class Device(enum.IntEnum): + CPU = 1 + + return (Device.CPU, None) + + +class _PandasColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain either one + or two buffers - one data buffer and (depending on null representation) it + may have a mask buffer. + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + + def __init__(self, column : pd.Series) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, pd.Series): + raise NotImplementedError("Columns of type {} not handled " + "yet".format(type(column))) + + # Store the column as a private attribute + self._col = column + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + return 0 + + @property + def dtype(self) -> Tuple[int, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + + Kind : + + - 0 : signed integer + - 1 : unsigned integer + - 2 : IEEE floating point + - 20 : boolean + - 21 : string (UTF-8) + - 22 : datetime + - 23 : categorical + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. 
+ """ + dtype = self._col.dtype + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + _np_kinds = {'i': 0, 'u': 1, 'f': 2, 'b': 20, 'O': 21, 'U': 21, + 'M': 22, 'm': 22} + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + raise NotImplementedError("Data type {} not handled".format(dtype)) + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder + return (kind, bitwidth, format_str, endianness) + + + @property + def describe_categorical(self) -> Dict[str, Any]: + """ + If the dtype is categorical, there are two options: + + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises RuntimeError if the dtype is not categorical + + Content of returned dict: + + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + + TBD: are there any other in-memory representations that are needed? + """ + raise NotImplementedError("TODO") + + @property + def describe_null(self) -> Tuple[int, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Kind: + + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + + Value : if kind is "sentinel value", the actual value. None otherwise. + """ + kind = self.dtype[0] + if kind == 2: + null = 1 # np.nan + elif kind == 22: + null = 1 # np.datetime64('NaT') + elif kind in (0, 1, 20): + # TODO: check if extension dtypes are used once support for them is + # implemented in this procotol code + null = 0 # integer and boolean dtypes are non-nullable + else: + raise NotImplementedError('TODO') + + return null + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.isna().sum() + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn']: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + return (self,) + + def get_data_buffer(self) -> _PandasBuffer: + """ + Return the buffer containing the data. + """ + return _PandasBuffer(self._col.to_numpy()) + + def get_mask(self) -> _PandasBuffer: + """ + Return the buffer containing the mask values indicating missing data. + + Raises RuntimeError if null representation is not a bit or byte mask. + """ + null = self.describe_null() + if null == 0: + msg = "This column is non-nullable so does not have a mask" + elif null == 1: + msg = "This column uses NaN as null so does not have a separate mask" + else: + raise NotImplementedError('See self.describe_null') + + raise RuntimeError(msg) + + +class _PandasDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. 
+ """ + def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self) -> Iterable[str]: + return self._df.columns.tolist() + + def get_column(self, i: int) -> _PandasColumn: + return _PandasColumn(self._df.iloc[:, i]) + + def get_column_by_name(self, name: str) -> _PandasColumn: + return _PandasColumn(self._df[name]) + + def get_columns(self) -> Iterable[_PandasColumn]: + return [_PandasColumn(self._df[name]) for name in self._df.columns] + + def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame': + if not isinstance(indices, collections.Sequence): + raise ValueError("`indices` is not a sequence") + + return _PandasDataFrame(self._df.iloc[:, indices]) + + def select_columns_by_name(self, names: Sequence[str]) -> '_PandasDataFrame': + if not isinstance(names, collections.Sequence): + raise ValueError("`names` is not a sequence") + + return _PandasDataFrame(self._df.xs(indices, axis='columns')) + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasDataFrame']: + """ + Return an iterator yielding the chunks. + """ + return (self,) + + +# Roundtrip testing +# ----------------- + +def test_float_only(): + df = pd.DataFrame(data=dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])) + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + +def test_mixed_intfloat(): + df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[3, 4, 5], + c=[1.5, 2.5, 3.5], d=[9, 10, 11])) + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + +if __name__ == '__main__': + test_float_only() + test_mixed_intfloat() + From 90b4f42630d058a5637f38d498a58c147299192c Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 17 Mar 2021 23:54:29 +0100 Subject: [PATCH 07/13] Minor change: add a test for strided columns --- protocol/pandas_implementation.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index b3cd9f1a..24b41188 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -26,6 +26,7 @@ import pandas as pd import numpy as np import pandas._testing as tm +import pytest # A typing protocol could be added later to let Mypy validate code using @@ -137,7 +138,10 @@ def __init__(self, x : np.ndarray) -> None: Handle only regular columns (= numpy arrays) for now. """ if not x.strides == (x.dtype.itemsize,): - # Array is not contiguous - is this possible? + # Array is not contiguous - this is possible to get in Pandas, + # there was some discussion on whether to support it. Som extra + # complexity for libraries that don't support it (e.g. Arrow), + # but would help with numpy-based libraries like Pandas. 
raise RuntimeError("Design needs fixing - non-contiguous buffer") # Store the numpy array in which the data resides as a private @@ -444,7 +448,18 @@ def test_mixed_intfloat(): tm.assert_frame_equal(df, df2) +def test_noncontiguous_columns(): + # Currently raises: TBD whether it should work or not, see code comment + # where the RuntimeError is raised. + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df = pd.DataFrame(arr) + assert df[0].to_numpy().strides == (24,) + pytest.raises(RuntimeError, from_dataframe, df) + #df2 = from_dataframe(df) + + if __name__ == '__main__': test_float_only() test_mixed_intfloat() + test_noncontiguous_columns() From c08ec1021aca9495fefb49906f3d480c434e3e29 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 15:49:37 +0200 Subject: [PATCH 08/13] Address some code review comments --- protocol/dataframe_protocol.py | 28 ++++++++++++++-------------- protocol/pandas_implementation.py | 13 ++++++++++++- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index f7e4b31e..816d648e 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -12,14 +12,14 @@ 1. A `Buffer` class. A *buffer* is a contiguous block of memory - this is the only thing that actually maps to a 1-D array in a sense that it could be converted to NumPy, CuPy, et al. -2. A `Column` class. A *column* has a name and a single dtype. It can consist +2. A `Column` class. A *column* has a single dtype. It can consist of multiple *chunks*. A single chunk of a column (which may be the whole column if ``num_chunks == 1``) is modeled as again a `Column` instance, and contains 1 data *buffer* and (optionally) one *mask* for missing data. -3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*. - It has a single device, and all its rows are the same length. It can consist - of multiple *chunks*. A single chunk of a data frame is modeled as - again a `DataFrame` instance. +3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*, + which are identified with names that are unique strings. All the data + frame's rows are the same length. It can consist of multiple *chunks*. A + single chunk of a data frame is modeled as again a `DataFrame` instance. 4. A *mask* concept. A *mask* of a single-chunk column is a *buffer*. 5. A *chunk* concept. A *chunk* is a sub-dividing element that can be applied to a *data frame* or a *column*. @@ -59,7 +59,7 @@ Note that row labels could be added in the future - right now there's no clear requirements for more complex row labels that cannot be represented by a single -column. That do exist, for example Modin has has table and tree-based row +column. These do exist, for example Modin has has table and tree-based row labels. 
""" @@ -194,19 +194,19 @@ def offset(self) -> int: pass @property - def dtype(self) -> Tuple[int, int, str, str]: + def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` Kind : - - 0 : signed integer - - 1 : unsigned integer - - 2 : IEEE floating point - - 20 : boolean - - 21 : string (UTF-8) - - 22 : datetime - - 23 : categorical + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 24b41188..c567560d 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -68,6 +68,16 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: return pd.DataFrame(columns) +class _DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: """ """ @@ -82,7 +92,8 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: _dtype = col.dtype kind = _dtype[0] bitwidth = _dtype[1] - if _dtype[0] not in (0, 1, 2, 20): + _k = _DtypeKind + if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): raise RuntimeError("Not a boolean, integer or floating-point dtype") _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} From 9c2717b1eb6cb9593e70ebbec11b8b3a53f948f5 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 16:44:09 +0200 Subject: [PATCH 09/13] Partial support for categorical dtypes - export works --- protocol/pandas_implementation.py | 61 ++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index c567560d..6c5a99e1 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -84,7 +84,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - if col.describe_null not in (0, 1): + if col.describe_null[0] not in (0, 1): raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") @@ -230,19 +230,19 @@ def offset(self) -> int: return 0 @property - def dtype(self) -> Tuple[int, int, str, str]: + def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` Kind : - - 0 : signed integer - - 1 : unsigned integer - - 2 : IEEE floating point - - 20 : boolean - - 21 : string (UTF-8) - - 22 : datetime - - 23 : categorical + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C @@ -273,15 +273,25 @@ def dtype(self) -> Tuple[int, int, str, str]: # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) 
- _np_kinds = {'i': 0, 'u': 1, 'f': 2, 'b': 20, 'O': 21, 'U': 21, - 'M': 22, 'm': 22} + _k = _DtypeKind + _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL, + 'U': _k.STRING, + 'M': _k.DATETIME, 'm': _k.DATETIME} kind = _np_kinds.get(dtype.kind, None) if kind is None: - raise NotImplementedError("Data type {} not handled".format(dtype)) + # Not a NumPy dtype. Check if it's a categorical maybe + if isinstance(dtype, pd.CategoricalDtype): + kind = 23 + else: + raise ValueError(f"Data type {dtype} not supported by exchange" + "protocol") + + if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): + raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 format_str = dtype.str - endianness = dtype.byteorder + endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '=' return (kind, bitwidth, format_str, endianness) @@ -324,19 +334,26 @@ def describe_null(self) -> Tuple[int, Any]: Value : if kind is "sentinel value", the actual value. None otherwise. """ + _k = _DtypeKind kind = self.dtype[0] - if kind == 2: + value = None + if kind == _k.FLOAT: null = 1 # np.nan - elif kind == 22: + elif kind == _k.DATETIME: null = 1 # np.datetime64('NaT') - elif kind in (0, 1, 20): + elif kind in (_k.INT, _k.UINT, _k.BOOL): # TODO: check if extension dtypes are used once support for them is # implemented in this procotol code null = 0 # integer and boolean dtypes are non-nullable + elif kind == _k.CATEGORICAL: + # Null values for categoricals are stored as `-1` sentinel values + # in the category date (e.g., `col.values.codes` is int8 np.ndarray) + null = 2 + value = -1 else: - raise NotImplementedError('TODO') + raise NotImplementedError(f'Data type {self.dtype} not yet supported') - return null + return null, value @property def null_count(self) -> int: @@ -469,8 +486,16 @@ def test_noncontiguous_columns(): #df2 = from_dataframe(df) +def test_categorical_dtype(): + df = pd.DataFrame({"A": [1, 2, 3, 1]}) + df["B"] = df["A"].astype("category") + df.at[1, 'B'] = np.nan # Set one item to null + df2 = from_dataframe(df) + + if __name__ == '__main__': test_float_only() test_mixed_intfloat() test_noncontiguous_columns() + test_categorical_dtype() From 552b7943f9110c0acc8e5fed86df9f1ead606c3a Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 17:20:58 +0200 Subject: [PATCH 10/13] Add describe_categorical support and a buffer `__repr__` --- protocol/pandas_implementation.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 6c5a99e1..f96b0f31 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -188,6 +188,11 @@ class Device(enum.IntEnum): return (Device.CPU, None) + def __repr__(self) -> str: + return 'PandasBuffer(' + str({'bufsize': self.bufsize, + 'ptr': self.ptr, + 'device': self.__dlpack_device__()[0].name} + ) + ')' class _PandasColumn: """ @@ -313,10 +318,19 @@ def describe_categorical(self) -> Dict[str, Any]: categorical values to other objects exists - "mapping" : dict, Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. - - TBD: are there any other in-memory representations that are needed? 
""" - raise NotImplementedError("TODO") + if not self.dtype[0] == _DtypeKind.CATEGORICAL: + raise TypeError("`describe_categorical only works on a column with " + "categorical dtype!") + + ordered = self._col.dtype.ordered + is_dictionary = False + # NOTE: this shows the children approach is better, transforming this + # to a "mapping" dict would be inefficient + codes = self._col.values.codes # ndarray, length `self.size` + # categories.values is ndarray of length n_categories + categories = self._col.values.categories + return ordered, is_dictionary, None @property def describe_null(self) -> Tuple[int, Any]: @@ -490,7 +504,17 @@ def test_categorical_dtype(): df = pd.DataFrame({"A": [1, 2, 3, 1]}) df["B"] = df["A"].astype("category") df.at[1, 'B'] = np.nan # Set one item to null + + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name('B') + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.null_count == 1 + assert col.describe_null == (2, -1) # sentinel value -1 + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, False, None) + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) if __name__ == '__main__': From cfabb9f17166bbdde82252f2167828ab1f1461aa Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 18:11:30 +0200 Subject: [PATCH 11/13] Make roundtripping with categorical dype work (with some cheating) --- protocol/pandas_implementation.py | 56 +++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index f96b0f31..1222260c 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -62,8 +62,16 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: # We need a dict of columns here, with each column being a numpy array (at # least for now, deal with non-numpy dtypes later). 
columns = dict() + _k = _DtypeKind for name in df.column_names(): - columns[name] = convert_column_to_ndarray(df.get_column_by_name(name)) + col = df.get_column_by_name(name) + if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + # Simple numerical or bool dtype, turn into numpy array + columns[name] = convert_column_to_ndarray(col) + elif col.dtype[0] == _k.CATEGORICAL: + columns[name] = convert_categorical_column(col) + else: + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") return pd.DataFrame(columns) @@ -80,6 +88,7 @@ class _DtypeKind(enum.IntEnum): def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: """ + Convert an int, uint, float or bool column to a numpy array """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") @@ -117,6 +126,32 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: return x +def convert_categorical_column(col : ColumnObject) -> pd.Series: + """ + Convert a categorical column to a Series instance + """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError('Non-dictionary categoricals not supported yet') + + # FIXME: this is cheating, can't use `_col` (just testing now) + categories = col._col.values.categories.values + codes = col._col.values.codes + values = categories[codes] + + # Deal with null values + null_kind = col.describe_null[0] + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + + # Seems like Pandas can only construct with non-null values, so need to + # null out the nulls later + cat = pd.Categorical(values, categories=categories, ordered=ordered) + series = pd.Series(cat) + series[codes == sentinel] = np.nan + return series + + def __dataframe__(cls, nan_as_null : bool = False) -> dict: """ The public method to attach to pd.DataFrame @@ -324,13 +359,14 @@ def describe_categorical(self) -> Dict[str, Any]: "categorical dtype!") ordered = self._col.dtype.ordered - is_dictionary = False - # NOTE: this shows the children approach is better, transforming this - # to a "mapping" dict would be inefficient + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient codes = self._col.values.codes # ndarray, length `self.size` # categories.values is ndarray of length n_categories - categories = self._col.values.categories - return ordered, is_dictionary, None + categories = self._col.values.categories.values + mapping = {ix: val for ix, val in enumerate(categories)} + return ordered, is_dictionary, mapping @property def describe_null(self) -> Tuple[int, Any]: @@ -402,7 +438,7 @@ def get_mask(self) -> _PandasBuffer: Raises RuntimeError if null representation is not a bit or byte mask. 
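+
+        (Illustrative note: consumers are expected to check
+        ``describe_null[0]`` first, and only call ``get_mask()`` for the
+        bit mask and byte mask kinds, 3 and 4.)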
""" - null = self.describe_null() + null, value = self.describe_null if null == 0: msg = "This column is non-nullable so does not have a mask" elif null == 1: @@ -501,7 +537,7 @@ def test_noncontiguous_columns(): def test_categorical_dtype(): - df = pd.DataFrame({"A": [1, 2, 3, 1]}) + df = pd.DataFrame({"A": [1, 2, 5, 1]}) df["B"] = df["A"].astype("category") df.at[1, 'B'] = np.nan # Set one item to null @@ -511,15 +547,15 @@ def test_categorical_dtype(): assert col.null_count == 1 assert col.describe_null == (2, -1) # sentinel value -1 assert col.num_chunks() == 1 - assert col.describe_categorical == (False, False, None) + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) df2 = from_dataframe(df) tm.assert_frame_equal(df, df2) if __name__ == '__main__': + test_categorical_dtype() test_float_only() test_mixed_intfloat() test_noncontiguous_columns() - test_categorical_dtype() From 1b6ef4e09d2cda9fcd871da0eb847cd4edbe7e6a Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 18:30:10 +0200 Subject: [PATCH 12/13] Illustrate issue with categorical dtype & get_data_buffer() This shows the simple design doesn't fully work (see the FIXMEs in the diff). Instead, the `children` concept is needed. That way the categorical encoded data values can be returned as a child Column rather than a Buffer, and hence there's the necessary Column.dtype to interpret the buffer backing the column. --- protocol/pandas_implementation.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 1222260c..e05a26c1 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -135,20 +135,24 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: raise NotImplementedError('Non-dictionary categoricals not supported yet') # FIXME: this is cheating, can't use `_col` (just testing now) - categories = col._col.values.categories.values - codes = col._col.values.codes + # categories = col._col.values.categories.values + # codes = col._col.values.codes + categories = np.asarray(list(mapping.values())) + codes = col.get_data_buffer() # this is broken; don't have dtype info for buffer values = categories[codes] - # Deal with null values - null_kind = col.describe_null[0] - if null_kind == 2: # sentinel value - sentinel = col.describe_null[1] - # Seems like Pandas can only construct with non-null values, so need to # null out the nulls later cat = pd.Categorical(values, categories=categories, ordered=ordered) series = pd.Series(cat) - series[codes == sentinel] = np.nan + null_kind = col.describe_null[0] + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + series[codes == sentinel] = np.nan + else: + raise NotImplementedError("Only categorical columns with sentinel " + "value supported at the moment") + return series @@ -430,7 +434,16 @@ def get_data_buffer(self) -> _PandasBuffer: """ Return the buffer containing the data. 
""" - return _PandasBuffer(self._col.to_numpy()) + _k = _DtypeKind + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + buffer = _PandasBuffer(self._col.to_numpy()) + elif self.dtype[0] == _k.CATEGORICAL: + # FIXME: losing the dtype info here - see `convert_categorical_column` + buffer = _PandasBuffer(self._col.values.codes) + else: + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + + return buffer def get_mask(self) -> _PandasBuffer: """ From 81ec86ea8bbb9a31243b488158abb48010490e46 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 18:35:17 +0200 Subject: [PATCH 13/13] Make the roundtripping for a categorical column work --- protocol/dataframe_protocol.py | 1 - protocol/pandas_implementation.py | 28 ++++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 816d648e..00cf5b12 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -313,7 +313,6 @@ def get_mask(self) -> Buffer: """ pass -# # NOTE: not needed unless one considers nested dtypes # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e05a26c1..e3e3e62e 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -97,8 +97,12 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") + _buffer, _dtype = col.get_data_buffer() + return buffer_to_ndarray(_buffer, _dtype) + + +def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray: # Handle the dtype - _dtype = col.dtype kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind @@ -113,7 +117,6 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: # No DLPack yet, so need to construct a new ndarray from the data pointer # and size in the buffer plus the dtype on the column - _buffer = col.get_data_buffer() ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) @@ -134,11 +137,12 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: if not is_dict: raise NotImplementedError('Non-dictionary categoricals not supported yet') - # FIXME: this is cheating, can't use `_col` (just testing now) + # If you want to cheat for testing (can't use `_col` in real-world code): # categories = col._col.values.categories.values # codes = col._col.values.codes categories = np.asarray(list(mapping.values())) - codes = col.get_data_buffer() # this is broken; don't have dtype info for buffer + codes_buffer, codes_dtype = col.get_data_buffer() + codes = buffer_to_ndarray(codes_buffer, codes_dtype) values = categories[codes] # Seems like Pandas can only construct with non-null values, so need to @@ -314,6 +318,12 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: and nested (list, struct, map, union) dtypes. """ dtype = self._col.dtype + return self._dtype_from_pandasdtype(dtype) + + def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: + """ + See `self.dtype` for details + """ # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) 
@@ -430,20 +440,22 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn """ return (self,) - def get_data_buffer(self) -> _PandasBuffer: + def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data. """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): buffer = _PandasBuffer(self._col.to_numpy()) + dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: - # FIXME: losing the dtype info here - see `convert_categorical_column` - buffer = _PandasBuffer(self._col.values.codes) + codes = self._col.values.codes + buffer = _PandasBuffer(codes) + dtype = self._dtype_from_pandasdtype(codes.dtype) else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") - return buffer + return buffer, dtype def get_mask(self) -> _PandasBuffer: """