From 9f82870143f2c93bf9c6ff7879c1936b14116542 Mon Sep 17 00:00:00 2001
From: Ralf Gommers
Date: Tue, 9 Feb 2021 21:29:17 +0100
Subject: [PATCH 01/13] Add a prototype of the dataframe interchange protocol

Related to requirements in gh-35. TBD (to be discussed) comments and design
decisions at the top of the file indicate topics for closer review/discussion.
---
 protocol/dataframe_protocol.py | 385 +++++++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+)
 create mode 100644 protocol/dataframe_protocol.py

diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
new file mode 100644
index 00000000..f36fe2c3
--- /dev/null
+++ b/protocol/dataframe_protocol.py
@@ -0,0 +1,385 @@
+"""
+Specification for objects to be accessed, for the purpose of dataframe
+interchange between libraries, via the ``__dataframe__`` method on a library's
+data frame object.
+
+For guiding requirements, see https://github.com/data-apis/dataframe-api/pull/35
+
+Design decisions
+----------------
+
+**1. Use a separate column abstraction in addition to a dataframe interface.**
+
+Rationales:
+- This is how it works in R, Julia and Apache Arrow.
+- Semantically, most existing applications and users treat a column similarly to a 1-D array.
+- We should be able to connect a column to the array data interchange mechanism(s).
+
+Note that this does not imply a library must have such a public user-facing
+abstraction (e.g. ``pandas.Series``) - the column abstraction may be
+reachable only via ``__dataframe__``.
+
+**2. Use methods and properties on an opaque object rather than returning
+hierarchical dictionaries describing memory**
+
+This is better for implementations that may rely on, for example, lazy
+computation.
+
+**3. No row names. If a library uses row names, use a regular column for them.**
+
+See discussion at https://github.com/wesm/dataframe-protocol/pull/1/files#r394316241
+Optional row names are not a good idea, because people will assume they're present
+(see cuDF experience, forced to add because pandas has them).
+Requiring row names seems worse than leaving them out.
+
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, Optional, Sequence, Tuple
+
+
+class Buffer:
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+    """
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Buffer size in bytes
+        """
+        pass
+
+    @property
+    def ptr(self) -> int:
+        """
+        Pointer to start of the buffer as an integer
+        """
+        pass
+
+    def __dlpack__(self):
+        """
+        Produce DLPack capsule (see array API standard).
+
+        Raises:
+
+        - TypeError : if the buffer contains unsupported dtypes.
+        - NotImplementedError : if DLPack support is not implemented
+
+        Useful to have to connect to array libraries. Support optional because
+        it's not completely trivial to implement for a Python-only library.
+        """
+        raise NotImplementedError("__dlpack__")
+
+    def __array_interface__(self):
+        """
+        TBD: implement or not? Will work for all dtypes except bit masks.
+        """
+        raise NotImplementedError("__array_interface__")
+
+
+class Column:
+    """
+    A column object, with only the methods and properties required by the
+    interchange protocol defined.
+
+    A column can contain one or more chunks. Each chunk can contain either one
+    or two buffers - one data buffer and (depending on null representation) it
+    may have a mask buffer.
+
+    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
+         Instead, it seems to use "children" for both columns with a bit mask,
+         and for nested dtypes. Unclear whether this is elegant or confusing.
+         This design requires checking the null representation explicitly.
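+
+         For illustration only, explicit checking on the consumer side could
+         look like this sketch (using the ``describe_null`` and ``get_mask``
+         members defined further down)::
+
+             kind, value = col.describe_null
+             if kind in (2, 3):     # bit mask or byte mask
+                 mask = col.get_mask()
+             elif kind == 1:        # sentinel value
+                 sentinel = value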
+ + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + @property + def name(self) -> str: + pass + + @property + def size(self) -> Optional[int]: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + """ + pass + + @property + def offset(self) -> int: + """ + Offset of first element + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + def dtype(self) -> Tuple[int, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + + Kind : + + - 0 : signed integer + - 1 : unsigned integer + - 2 : IEEE floating point + - 20 : boolean + - 21 : string (UTF-8) + - 22 : datetime + - 23 : categorical + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. 
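+
+        Examples of dtype descriptions under this scheme (illustrative
+        values; the format strings follow the Apache Arrow C Data
+        Interface)::
+
+            (0, 64, "l", "=")     # int64
+            (2, 64, "g", "=")     # float64
+            (20, 1, "b", "=")     # boolean, bit-packed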
+        """
+        pass
+
+    @property
+    def describe_categorical(self) -> Dict[str, Any]:
+        """
+        If the dtype is categorical, there are two options:
+
+        - There are only values in the data buffer.
+        - There is a separate dictionary-style encoding for categorical values.
+
+        Raises RuntimeError if the dtype is not categorical.
+
+        Content of returned dict:
+
+        - "is_ordered" : bool, whether the ordering of dictionary indices is
+                         semantically meaningful.
+        - "is_dictionary" : bool, whether a dictionary-style mapping of
+                            categorical values to other objects exists
+        - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
+                      None if not a dictionary-style categorical.
+
+        TBD: are there any other in-memory representations that are needed?
+        """
+        pass
+
+    @property
+    def describe_null(self) -> Tuple[int, Any]:
+        """
+        Return the missing value (or "null") representation the column dtype
+        uses, as a tuple ``(kind, value)``.
+
+        Kind:
+
+        - 0 : NaN/NaT
+        - 1 : sentinel value
+        - 2 : bit mask
+        - 3 : byte mask
+
+        Value : if kind is "sentinel value", the actual value. None otherwise.
+        """
+        pass
+
+    @property
+    def null_count(self) -> Optional[int]:
+        """
+        Number of null elements, if known.
+
+        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
+        """
+        pass
+
+    def num_chunks(self) -> int:
+        """
+        Return the number of chunks the column consists of.
+        """
+        pass
+
+    def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
+        """
+        Return an iterator yielding the chunks.
+
+        See `DataFrame.get_chunks` for details on ``n_chunks``.
+        """
+        pass
+
+    def get_buffer(self) -> Buffer:
+        """
+        Return the buffer containing the data.
+        """
+        pass
+
+    def get_mask(self) -> Buffer:
+        """
+        Return the buffer containing the mask values indicating missing data.
+
+        Raises RuntimeError if null representation is not a bit or byte mask.
+        """
+        pass
+
+#    # NOTE: not needed unless one considers nested dtypes
+#    def get_children(self) -> Iterable[Column]:
+#        """
+#        Children columns underneath the column, each object in this iterator
+#        must adhere to the column specification
+#        """
+#        pass
+
+
+class DataFrame:
+    """
+    A data frame class, with only the methods required by the interchange
+    protocol defined.
+
+    A "data frame" represents an ordered collection of named columns.
+    A column's "name" must be a unique string.
+    Columns may be accessed by name or by position.
+
+    This could be a public data frame class, or an object with the methods and
+    attributes defined on this DataFrame class could be returned from the
+    ``__dataframe__`` method of a public data frame class in a library adhering
+    to the dataframe interchange protocol specification.
+    """
+    def __dataframe__(self, nan_as_null : bool = False) -> dict:
+        """
+        Produces a dictionary object following the dataframe protocol spec.
+        """
+        self._nan_as_null = nan_as_null
+        return {
+            "dataframe": self,  # DataFrame object adhering to the protocol
+            "version": 0        # Version number of the protocol
+        }
+
+    def num_columns(self) -> int:
+        """
+        Return the number of columns in the DataFrame.
+        """
+        pass
+
+    def num_rows(self) -> Optional[int]:
+        # TODO: not happy with Optional, but need to flag it may be expensive
+        #       why include it if it may be None - what do we expect consumers
+        #       to do here?
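+        #       (One option: a consumer that needs the count could fall back
+        #       to iterating over ``get_chunks()`` and summing the per-chunk
+        #       column sizes - noted as a possibility, not a decision.)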
+ """ + Return the number of rows in the DataFrame, if available + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of + """ + pass + + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + def select_columns(self, indices: Sequence[int]) -> DataFrame: + """ + Create a new DataFrame by selecting a subset of columns by index + """ + pass + + def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + """ + pass + + @property + def device(self) -> int: + """ + Device type the dataframe resides on. + + Uses device type codes matching DLPack: + + - 1 : CPU + - 2 : CUDA + - 3 : CPU pinned + - 4 : OpenCL + - 7 : Vulkan + - 8 : Metal + - 9 : Verilog + - 10 : ROCm + """ + pass From b201c68643c06d0cc78ca8717d929b5beec74aaf Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 10 Feb 2021 22:20:05 +0100 Subject: [PATCH 02/13] Remove Column.name, rename get_buffer to get_data_buffer --- protocol/dataframe_protocol.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index f36fe2c3..f612667b 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -117,9 +117,6 @@ class Column: doesn't need its own version or ``__column__`` protocol. """ - @property - def name(self) -> str: - pass @property def size(self) -> Optional[int]: @@ -247,7 +244,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: """ pass - def get_buffer(self) -> Buffer: + def get_data_buffer(self) -> Buffer: """ Return the buffer containing the data. """ From 61d84f35fb76f2db346991e4bab10e80aac5a2e9 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 11 Feb 2021 17:52:04 +0100 Subject: [PATCH 03/13] Remove __array_interface__ --- protocol/dataframe_protocol.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index f612667b..9e3d02b8 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -67,12 +67,6 @@ def __dlpack__(self): """ raise NotImplementedError("__dlpack__") - def __array_interface__(self): - """ - TBD: implement or not? Will work for all dtypes except bit masks. 
- """ - raise NotImplementedError("__array_interface__") - class Column: """ From dc3b373e13569338deb96038f17a8585ac9a5f90 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 11 Feb 2021 17:53:36 +0100 Subject: [PATCH 04/13] Add "non-nullable" to `Column.describe_null` --- protocol/dataframe_protocol.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 9e3d02b8..448dbc2c 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -206,10 +206,11 @@ def describe_null(self) -> Tuple[int, Any]: Kind: - - 0 : NaN/NaT - - 1 : sentinel value - - 2 : bit mask - - 3 : byte mask + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask Value : if kind is "sentinel value", the actual value. None otherwise. """ From cb338fd6b5edfacf101e932dfe67f7124f541685 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Mon, 1 Mar 2021 11:36:52 +0100 Subject: [PATCH 05/13] Address some review comments and add more docs --- protocol/dataframe_protocol.py | 82 +++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 17 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 448dbc2c..f7e4b31e 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -5,6 +5,32 @@ For guiding requirements, see https://github.com/data-apis/dataframe-api/pull/35 + +Concepts in this design +----------------------- + +1. A `Buffer` class. A *buffer* is a contiguous block of memory - this is the + only thing that actually maps to a 1-D array in a sense that it could be + converted to NumPy, CuPy, et al. +2. A `Column` class. A *column* has a name and a single dtype. It can consist + of multiple *chunks*. A single chunk of a column (which may be the whole + column if ``num_chunks == 1``) is modeled as again a `Column` instance, and + contains 1 data *buffer* and (optionally) one *mask* for missing data. +3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*. + It has a single device, and all its rows are the same length. It can consist + of multiple *chunks*. A single chunk of a data frame is modeled as + again a `DataFrame` instance. +4. A *mask* concept. A *mask* of a single-chunk column is a *buffer*. +5. A *chunk* concept. A *chunk* is a sub-dividing element that can be applied + to a *data frame* or a *column*. + +Note that the only way to access these objects is through a call to +``__dataframe__`` on a data frame object. This is NOT meant as public API; +only think of instances of the different classes here to describe the API of +what is returned by a call to ``__dataframe__``. They are the concepts needed +to capture the memory layout and data access of a data frame. + + Design decisions ---------------- @@ -31,12 +57,27 @@ (see cuDF experience, forced to add because pandas has them). Requiring row names seems worse than leaving them out. +Note that row labels could be added in the future - right now there's no clear +requirements for more complex row labels that cannot be represented by a single +column. That do exist, for example Modin has has table and tree-based row +labels. + """ class Buffer: """ Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. 
However, if the column that the buffer is
+    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+    implemented, then that dtype information will be contained in the return
+    value from ``__dlpack__``.
+
+    This distinction is useful to support both (a) data exchange via DLPack
+    on a buffer and (b) dtypes like variable-length strings, which do not
+    have a fixed number of bytes per element.
     """
 
     @property
@@ -67,6 +108,25 @@ def __dlpack__(self):
         """
         raise NotImplementedError("__dlpack__")
 
+    def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]:
+        """
+        Device type and device ID for where the data in the buffer resides.
+
+        Uses device type codes matching DLPack. Enum members are::
+
+            - CPU = 1
+            - CUDA = 2
+            - CPU_PINNED = 3
+            - OPENCL = 4
+            - VULKAN = 7
+            - METAL = 8
+            - VPI = 9
+            - ROCM = 10
+
+        Note: must be implemented even if ``__dlpack__`` is not.
+        """
+        pass
+
 
 class Column:
     """
@@ -339,6 +404,11 @@ class DataFrame:
     def __dataframe__(self, nan_as_null : bool = False) -> dict:
         """
         Produces a dictionary object following the dataframe protocol spec.
+
+        ``nan_as_null`` is a keyword intended for the consumer to tell the
+        producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
+        It is intended for cases where the consumer does not support the bit
+        mask or byte mask that is the producer's native representation.
         """
         self._nan_as_null = nan_as_null
         return {
@@ -354,20 +419,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
         """
         pass
 
-    @property
-    def device(self) -> int:
-        """
-        Device type the dataframe resides on.
-
-        Uses device type codes matching DLPack:
-
-        - 1 : CPU
-        - 2 : CUDA
-        - 3 : CPU pinned
-        - 4 : OpenCL
-        - 7 : Vulkan
-        - 8 : Metal
-        - 9 : Verilog
-        - 10 : ROCm
-        """
-        pass

From 35d3c0d8a21c2925144abe197d4d84f247d04974 Mon Sep 17 00:00:00 2001
From: Ralf Gommers
Date: Wed, 3 Mar 2021 00:27:23 +0100
Subject: [PATCH 06/13] Add a Pandas implementation of the interchange
 protocol

---
 protocol/pandas_implementation.py | 450 ++++++++++++++++++++++++++++++
 1 file changed, 450 insertions(+)
 create mode 100644 protocol/pandas_implementation.py

diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
new file mode 100644
index 00000000..b3cd9f1a
--- /dev/null
+++ b/protocol/pandas_implementation.py
@@ -0,0 +1,450 @@
+"""
+Implementation of the dataframe exchange protocol.

+Public API
+----------
+
+from_dataframe : construct a pandas.DataFrame from an input data frame which
+    implements the exchange protocol
+
+Notes
+-----
+
+- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to
+  do in pure Python. It's more general but definitely less friendly than having
+  ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack
+  ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack),
+  this is worth looking at again.
+
+"""
+
+import enum
+import collections.abc
+import ctypes
+from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
+
+import pandas as pd
+import numpy as np
+import pandas._testing as tm
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
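+# (A sketch of what such a typing protocol could look like - hypothetical,
+# not part of this implementation:
+#
+#     class SupportsDataFrame(Protocol):
+#         def __dataframe__(self, nan_as_null: bool = False) -> Any: ...
+#
+# `from_dataframe` below would then accept `df: SupportsDataFrame`.)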
+DataFrameObject = Any +ColumnObject = Any + + +def from_dataframe(df : DataFrameObject) -> pd.DataFrame: + """ + Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__`` + """ + # NOTE: commented out for roundtrip testing + # if isinstance(df, pd.DataFrame): + # return df + + if not hasattr(df, '__dataframe__'): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__()) + + +def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: + """ + Note: not all cases are handled yet, only ones that can be implemented with + only Pandas. Later, we need to implement/test support for categoricals, + bit/byte masks, chunk handling, etc. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError + + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later). + columns = dict() + for name in df.column_names(): + columns[name] = convert_column_to_ndarray(df.get_column_by_name(name)) + + return pd.DataFrame(columns) + + +def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: + """ + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + if col.describe_null not in (0, 1): + raise NotImplementedError("Null values represented as masks or " + "sentinel values not handled yet") + + # Handle the dtype + _dtype = col.dtype + kind = _dtype[0] + bitwidth = _dtype[1] + if _dtype[0] not in (0, 1, 2, 20): + raise RuntimeError("Not a boolean, integer or floating-point dtype") + + _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} + _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} + _floats = {32: np.float32, 64: np.float64} + _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + column_dtype = _np_dtypes[kind][bitwidth] + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + _buffer = col.get_data_buffer() + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) + + # NOTE: `x` does not own its memory, so the caller of this function must + # either make a copy or hold on to a reference of the column or + # buffer! (not done yet, this is pretty awful ...) + x = np.ctypeslib.as_array(data_pointer, + shape=(_buffer.bufsize // (bitwidth//8),)) + + return x + + +def __dataframe__(cls, nan_as_null : bool = False) -> dict: + """ + The public method to attach to pd.DataFrame + + We'll attach it via monkeypatching here for demo purposes. If Pandas adopt + the protocol, this will be a regular method on pandas.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + return _PandasDataFrame(cls, nan_as_null=nan_as_null) + + +# Monkeypatch the Pandas DataFrame class to support the interchange protocol +pd.DataFrame.__dataframe__ = __dataframe__ + + +# Implementation of interchange protocol +# -------------------------------------- + +class _PandasBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x : np.ndarray) -> None: + """ + Handle only regular columns (= numpy arrays) for now. 
+ """ + if not x.strides == (x.dtype.itemsize,): + # Array is not contiguous - is this possible? + raise RuntimeError("Design needs fixing - non-contiguous buffer") + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes + """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer + """ + return self._x.__array_interface__['data'][0] + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + """ + Device type and device ID for where the data in the buffer resides. + """ + class Device(enum.IntEnum): + CPU = 1 + + return (Device.CPU, None) + + +class _PandasColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain either one + or two buffers - one data buffer and (depending on null representation) it + may have a mask buffer. + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + + def __init__(self, column : pd.Series) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, pd.Series): + raise NotImplementedError("Columns of type {} not handled " + "yet".format(type(column))) + + # Store the column as a private attribute + self._col = column + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + return 0 + + @property + def dtype(self) -> Tuple[int, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + + Kind : + + - 0 : signed integer + - 1 : unsigned integer + - 2 : IEEE floating point + - 20 : boolean + - 21 : string (UTF-8) + - 22 : datetime + - 23 : categorical + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. 
+ """ + dtype = self._col.dtype + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + _np_kinds = {'i': 0, 'u': 1, 'f': 2, 'b': 20, 'O': 21, 'U': 21, + 'M': 22, 'm': 22} + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + raise NotImplementedError("Data type {} not handled".format(dtype)) + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder + return (kind, bitwidth, format_str, endianness) + + + @property + def describe_categorical(self) -> Dict[str, Any]: + """ + If the dtype is categorical, there are two options: + + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises RuntimeError if the dtype is not categorical + + Content of returned dict: + + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + + TBD: are there any other in-memory representations that are needed? + """ + raise NotImplementedError("TODO") + + @property + def describe_null(self) -> Tuple[int, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Kind: + + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + + Value : if kind is "sentinel value", the actual value. None otherwise. + """ + kind = self.dtype[0] + if kind == 2: + null = 1 # np.nan + elif kind == 22: + null = 1 # np.datetime64('NaT') + elif kind in (0, 1, 20): + # TODO: check if extension dtypes are used once support for them is + # implemented in this procotol code + null = 0 # integer and boolean dtypes are non-nullable + else: + raise NotImplementedError('TODO') + + return null + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.isna().sum() + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn']: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + return (self,) + + def get_data_buffer(self) -> _PandasBuffer: + """ + Return the buffer containing the data. + """ + return _PandasBuffer(self._col.to_numpy()) + + def get_mask(self) -> _PandasBuffer: + """ + Return the buffer containing the mask values indicating missing data. + + Raises RuntimeError if null representation is not a bit or byte mask. + """ + null = self.describe_null() + if null == 0: + msg = "This column is non-nullable so does not have a mask" + elif null == 1: + msg = "This column uses NaN as null so does not have a separate mask" + else: + raise NotImplementedError('See self.describe_null') + + raise RuntimeError(msg) + + +class _PandasDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. 
+ """ + def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self) -> Iterable[str]: + return self._df.columns.tolist() + + def get_column(self, i: int) -> _PandasColumn: + return _PandasColumn(self._df.iloc[:, i]) + + def get_column_by_name(self, name: str) -> _PandasColumn: + return _PandasColumn(self._df[name]) + + def get_columns(self) -> Iterable[_PandasColumn]: + return [_PandasColumn(self._df[name]) for name in self._df.columns] + + def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame': + if not isinstance(indices, collections.Sequence): + raise ValueError("`indices` is not a sequence") + + return _PandasDataFrame(self._df.iloc[:, indices]) + + def select_columns_by_name(self, names: Sequence[str]) -> '_PandasDataFrame': + if not isinstance(names, collections.Sequence): + raise ValueError("`names` is not a sequence") + + return _PandasDataFrame(self._df.xs(indices, axis='columns')) + + def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasDataFrame']: + """ + Return an iterator yielding the chunks. + """ + return (self,) + + +# Roundtrip testing +# ----------------- + +def test_float_only(): + df = pd.DataFrame(data=dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])) + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + +def test_mixed_intfloat(): + df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[3, 4, 5], + c=[1.5, 2.5, 3.5], d=[9, 10, 11])) + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + +if __name__ == '__main__': + test_float_only() + test_mixed_intfloat() + From 90b4f42630d058a5637f38d498a58c147299192c Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 17 Mar 2021 23:54:29 +0100 Subject: [PATCH 07/13] Minor change: add a test for strided columns --- protocol/pandas_implementation.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index b3cd9f1a..24b41188 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -26,6 +26,7 @@ import pandas as pd import numpy as np import pandas._testing as tm +import pytest # A typing protocol could be added later to let Mypy validate code using @@ -137,7 +138,10 @@ def __init__(self, x : np.ndarray) -> None: Handle only regular columns (= numpy arrays) for now. """ if not x.strides == (x.dtype.itemsize,): - # Array is not contiguous - is this possible? + # Array is not contiguous - this is possible to get in Pandas, + # there was some discussion on whether to support it. Som extra + # complexity for libraries that don't support it (e.g. Arrow), + # but would help with numpy-based libraries like Pandas. 
raise RuntimeError("Design needs fixing - non-contiguous buffer") # Store the numpy array in which the data resides as a private @@ -444,7 +448,18 @@ def test_mixed_intfloat(): tm.assert_frame_equal(df, df2) +def test_noncontiguous_columns(): + # Currently raises: TBD whether it should work or not, see code comment + # where the RuntimeError is raised. + arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + df = pd.DataFrame(arr) + assert df[0].to_numpy().strides == (24,) + pytest.raises(RuntimeError, from_dataframe, df) + #df2 = from_dataframe(df) + + if __name__ == '__main__': test_float_only() test_mixed_intfloat() + test_noncontiguous_columns() From c08ec1021aca9495fefb49906f3d480c434e3e29 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 15:49:37 +0200 Subject: [PATCH 08/13] Address some code review comments --- protocol/dataframe_protocol.py | 28 ++++++++++++++-------------- protocol/pandas_implementation.py | 13 ++++++++++++- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index f7e4b31e..816d648e 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -12,14 +12,14 @@ 1. A `Buffer` class. A *buffer* is a contiguous block of memory - this is the only thing that actually maps to a 1-D array in a sense that it could be converted to NumPy, CuPy, et al. -2. A `Column` class. A *column* has a name and a single dtype. It can consist +2. A `Column` class. A *column* has a single dtype. It can consist of multiple *chunks*. A single chunk of a column (which may be the whole column if ``num_chunks == 1``) is modeled as again a `Column` instance, and contains 1 data *buffer* and (optionally) one *mask* for missing data. -3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*. - It has a single device, and all its rows are the same length. It can consist - of multiple *chunks*. A single chunk of a data frame is modeled as - again a `DataFrame` instance. +3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*, + which are identified with names that are unique strings. All the data + frame's rows are the same length. It can consist of multiple *chunks*. A + single chunk of a data frame is modeled as again a `DataFrame` instance. 4. A *mask* concept. A *mask* of a single-chunk column is a *buffer*. 5. A *chunk* concept. A *chunk* is a sub-dividing element that can be applied to a *data frame* or a *column*. @@ -59,7 +59,7 @@ Note that row labels could be added in the future - right now there's no clear requirements for more complex row labels that cannot be represented by a single -column. That do exist, for example Modin has has table and tree-based row +column. These do exist, for example Modin has has table and tree-based row labels. 
""" @@ -194,19 +194,19 @@ def offset(self) -> int: pass @property - def dtype(self) -> Tuple[int, int, str, str]: + def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` Kind : - - 0 : signed integer - - 1 : unsigned integer - - 2 : IEEE floating point - - 20 : boolean - - 21 : string (UTF-8) - - 22 : datetime - - 23 : categorical + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 24b41188..c567560d 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -68,6 +68,16 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: return pd.DataFrame(columns) +class _DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: """ """ @@ -82,7 +92,8 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: _dtype = col.dtype kind = _dtype[0] bitwidth = _dtype[1] - if _dtype[0] not in (0, 1, 2, 20): + _k = _DtypeKind + if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): raise RuntimeError("Not a boolean, integer or floating-point dtype") _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} From 9c2717b1eb6cb9593e70ebbec11b8b3a53f948f5 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 16:44:09 +0200 Subject: [PATCH 09/13] Partial support for categorical dtypes - export works --- protocol/pandas_implementation.py | 61 ++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index c567560d..6c5a99e1 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -84,7 +84,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") - if col.describe_null not in (0, 1): + if col.describe_null[0] not in (0, 1): raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") @@ -230,19 +230,19 @@ def offset(self) -> int: return 0 @property - def dtype(self) -> Tuple[int, int, str, str]: + def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` Kind : - - 0 : signed integer - - 1 : unsigned integer - - 2 : IEEE floating point - - 20 : boolean - - 21 : string (UTF-8) - - 22 : datetime - - 23 : categorical + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C @@ -273,15 +273,25 @@ def dtype(self) -> Tuple[int, int, str, str]: # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) 
- _np_kinds = {'i': 0, 'u': 1, 'f': 2, 'b': 20, 'O': 21, 'U': 21, - 'M': 22, 'm': 22} + _k = _DtypeKind + _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL, + 'U': _k.STRING, + 'M': _k.DATETIME, 'm': _k.DATETIME} kind = _np_kinds.get(dtype.kind, None) if kind is None: - raise NotImplementedError("Data type {} not handled".format(dtype)) + # Not a NumPy dtype. Check if it's a categorical maybe + if isinstance(dtype, pd.CategoricalDtype): + kind = 23 + else: + raise ValueError(f"Data type {dtype} not supported by exchange" + "protocol") + + if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): + raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 format_str = dtype.str - endianness = dtype.byteorder + endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '=' return (kind, bitwidth, format_str, endianness) @@ -324,19 +334,26 @@ def describe_null(self) -> Tuple[int, Any]: Value : if kind is "sentinel value", the actual value. None otherwise. """ + _k = _DtypeKind kind = self.dtype[0] - if kind == 2: + value = None + if kind == _k.FLOAT: null = 1 # np.nan - elif kind == 22: + elif kind == _k.DATETIME: null = 1 # np.datetime64('NaT') - elif kind in (0, 1, 20): + elif kind in (_k.INT, _k.UINT, _k.BOOL): # TODO: check if extension dtypes are used once support for them is # implemented in this procotol code null = 0 # integer and boolean dtypes are non-nullable + elif kind == _k.CATEGORICAL: + # Null values for categoricals are stored as `-1` sentinel values + # in the category date (e.g., `col.values.codes` is int8 np.ndarray) + null = 2 + value = -1 else: - raise NotImplementedError('TODO') + raise NotImplementedError(f'Data type {self.dtype} not yet supported') - return null + return null, value @property def null_count(self) -> int: @@ -469,8 +486,16 @@ def test_noncontiguous_columns(): #df2 = from_dataframe(df) +def test_categorical_dtype(): + df = pd.DataFrame({"A": [1, 2, 3, 1]}) + df["B"] = df["A"].astype("category") + df.at[1, 'B'] = np.nan # Set one item to null + df2 = from_dataframe(df) + + if __name__ == '__main__': test_float_only() test_mixed_intfloat() test_noncontiguous_columns() + test_categorical_dtype() From 552b7943f9110c0acc8e5fed86df9f1ead606c3a Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 17:20:58 +0200 Subject: [PATCH 10/13] Add describe_categorical support and a buffer `__repr__` --- protocol/pandas_implementation.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 6c5a99e1..f96b0f31 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -188,6 +188,11 @@ class Device(enum.IntEnum): return (Device.CPU, None) + def __repr__(self) -> str: + return 'PandasBuffer(' + str({'bufsize': self.bufsize, + 'ptr': self.ptr, + 'device': self.__dlpack_device__()[0].name} + ) + ')' class _PandasColumn: """ @@ -313,10 +318,19 @@ def describe_categorical(self) -> Dict[str, Any]: categorical values to other objects exists - "mapping" : dict, Python-level only (e.g. ``{int: str}``). None if not a dictionary-style categorical. - - TBD: are there any other in-memory representations that are needed? 
""" - raise NotImplementedError("TODO") + if not self.dtype[0] == _DtypeKind.CATEGORICAL: + raise TypeError("`describe_categorical only works on a column with " + "categorical dtype!") + + ordered = self._col.dtype.ordered + is_dictionary = False + # NOTE: this shows the children approach is better, transforming this + # to a "mapping" dict would be inefficient + codes = self._col.values.codes # ndarray, length `self.size` + # categories.values is ndarray of length n_categories + categories = self._col.values.categories + return ordered, is_dictionary, None @property def describe_null(self) -> Tuple[int, Any]: @@ -490,7 +504,17 @@ def test_categorical_dtype(): df = pd.DataFrame({"A": [1, 2, 3, 1]}) df["B"] = df["A"].astype("category") df.at[1, 'B'] = np.nan # Set one item to null + + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name('B') + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.null_count == 1 + assert col.describe_null == (2, -1) # sentinel value -1 + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, False, None) + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) if __name__ == '__main__': From cfabb9f17166bbdde82252f2167828ab1f1461aa Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 18:11:30 +0200 Subject: [PATCH 11/13] Make roundtripping with categorical dype work (with some cheating) --- protocol/pandas_implementation.py | 56 +++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index f96b0f31..1222260c 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -62,8 +62,16 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: # We need a dict of columns here, with each column being a numpy array (at # least for now, deal with non-numpy dtypes later). 
columns = dict() + _k = _DtypeKind for name in df.column_names(): - columns[name] = convert_column_to_ndarray(df.get_column_by_name(name)) + col = df.get_column_by_name(name) + if col.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + # Simple numerical or bool dtype, turn into numpy array + columns[name] = convert_column_to_ndarray(col) + elif col.dtype[0] == _k.CATEGORICAL: + columns[name] = convert_categorical_column(col) + else: + raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") return pd.DataFrame(columns) @@ -80,6 +88,7 @@ class _DtypeKind(enum.IntEnum): def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: """ + Convert an int, uint, float or bool column to a numpy array """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") @@ -117,6 +126,32 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: return x +def convert_categorical_column(col : ColumnObject) -> pd.Series: + """ + Convert a categorical column to a Series instance + """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError('Non-dictionary categoricals not supported yet') + + # FIXME: this is cheating, can't use `_col` (just testing now) + categories = col._col.values.categories.values + codes = col._col.values.codes + values = categories[codes] + + # Deal with null values + null_kind = col.describe_null[0] + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + + # Seems like Pandas can only construct with non-null values, so need to + # null out the nulls later + cat = pd.Categorical(values, categories=categories, ordered=ordered) + series = pd.Series(cat) + series[codes == sentinel] = np.nan + return series + + def __dataframe__(cls, nan_as_null : bool = False) -> dict: """ The public method to attach to pd.DataFrame @@ -324,13 +359,14 @@ def describe_categorical(self) -> Dict[str, Any]: "categorical dtype!") ordered = self._col.dtype.ordered - is_dictionary = False - # NOTE: this shows the children approach is better, transforming this - # to a "mapping" dict would be inefficient + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient codes = self._col.values.codes # ndarray, length `self.size` # categories.values is ndarray of length n_categories - categories = self._col.values.categories - return ordered, is_dictionary, None + categories = self._col.values.categories.values + mapping = {ix: val for ix, val in enumerate(categories)} + return ordered, is_dictionary, mapping @property def describe_null(self) -> Tuple[int, Any]: @@ -402,7 +438,7 @@ def get_mask(self) -> _PandasBuffer: Raises RuntimeError if null representation is not a bit or byte mask. 
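+
+        (Illustrative note: consumers are expected to check
+        ``describe_null[0]`` first, and only call ``get_mask()`` for the
+        bit mask and byte mask kinds, 3 and 4.)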
""" - null = self.describe_null() + null, value = self.describe_null if null == 0: msg = "This column is non-nullable so does not have a mask" elif null == 1: @@ -501,7 +537,7 @@ def test_noncontiguous_columns(): def test_categorical_dtype(): - df = pd.DataFrame({"A": [1, 2, 3, 1]}) + df = pd.DataFrame({"A": [1, 2, 5, 1]}) df["B"] = df["A"].astype("category") df.at[1, 'B'] = np.nan # Set one item to null @@ -511,15 +547,15 @@ def test_categorical_dtype(): assert col.null_count == 1 assert col.describe_null == (2, -1) # sentinel value -1 assert col.num_chunks() == 1 - assert col.describe_categorical == (False, False, None) + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) df2 = from_dataframe(df) tm.assert_frame_equal(df, df2) if __name__ == '__main__': + test_categorical_dtype() test_float_only() test_mixed_intfloat() test_noncontiguous_columns() - test_categorical_dtype() From 1b6ef4e09d2cda9fcd871da0eb847cd4edbe7e6a Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 18:30:10 +0200 Subject: [PATCH 12/13] Illustrate issue with categorical dtype & get_data_buffer() This shows the simple design doesn't fully work (see the FIXMEs in the diff). Instead, the `children` concept is needed. That way the categorical encoded data values can be returned as a child Column rather than a Buffer, and hence there's the necessary Column.dtype to interpret the buffer backing the column. --- protocol/pandas_implementation.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 1222260c..e05a26c1 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -135,20 +135,24 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: raise NotImplementedError('Non-dictionary categoricals not supported yet') # FIXME: this is cheating, can't use `_col` (just testing now) - categories = col._col.values.categories.values - codes = col._col.values.codes + # categories = col._col.values.categories.values + # codes = col._col.values.codes + categories = np.asarray(list(mapping.values())) + codes = col.get_data_buffer() # this is broken; don't have dtype info for buffer values = categories[codes] - # Deal with null values - null_kind = col.describe_null[0] - if null_kind == 2: # sentinel value - sentinel = col.describe_null[1] - # Seems like Pandas can only construct with non-null values, so need to # null out the nulls later cat = pd.Categorical(values, categories=categories, ordered=ordered) series = pd.Series(cat) - series[codes == sentinel] = np.nan + null_kind = col.describe_null[0] + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + series[codes == sentinel] = np.nan + else: + raise NotImplementedError("Only categorical columns with sentinel " + "value supported at the moment") + return series @@ -430,7 +434,16 @@ def get_data_buffer(self) -> _PandasBuffer: """ Return the buffer containing the data. 
""" - return _PandasBuffer(self._col.to_numpy()) + _k = _DtypeKind + if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + buffer = _PandasBuffer(self._col.to_numpy()) + elif self.dtype[0] == _k.CATEGORICAL: + # FIXME: losing the dtype info here - see `convert_categorical_column` + buffer = _PandasBuffer(self._col.values.codes) + else: + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + + return buffer def get_mask(self) -> _PandasBuffer: """ From 81ec86ea8bbb9a31243b488158abb48010490e46 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 7 Apr 2021 18:35:17 +0200 Subject: [PATCH 13/13] Make the roundtripping for a categorical column work --- protocol/dataframe_protocol.py | 1 - protocol/pandas_implementation.py | 28 ++++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 816d648e..00cf5b12 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -313,7 +313,6 @@ def get_mask(self) -> Buffer: """ pass -# # NOTE: not needed unless one considers nested dtypes # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e05a26c1..e3e3e62e 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -97,8 +97,12 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") + _buffer, _dtype = col.get_data_buffer() + return buffer_to_ndarray(_buffer, _dtype) + + +def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray: # Handle the dtype - _dtype = col.dtype kind = _dtype[0] bitwidth = _dtype[1] _k = _DtypeKind @@ -113,7 +117,6 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: # No DLPack yet, so need to construct a new ndarray from the data pointer # and size in the buffer plus the dtype on the column - _buffer = col.get_data_buffer() ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) data_pointer = ctypes.cast(_buffer.ptr, ctypes.POINTER(ctypes_type)) @@ -134,11 +137,12 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: if not is_dict: raise NotImplementedError('Non-dictionary categoricals not supported yet') - # FIXME: this is cheating, can't use `_col` (just testing now) + # If you want to cheat for testing (can't use `_col` in real-world code): # categories = col._col.values.categories.values # codes = col._col.values.codes categories = np.asarray(list(mapping.values())) - codes = col.get_data_buffer() # this is broken; don't have dtype info for buffer + codes_buffer, codes_dtype = col.get_data_buffer() + codes = buffer_to_ndarray(codes_buffer, codes_dtype) values = categories[codes] # Seems like Pandas can only construct with non-null values, so need to @@ -314,6 +318,12 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: and nested (list, struct, map, union) dtypes. """ dtype = self._col.dtype + return self._dtype_from_pandasdtype(dtype) + + def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: + """ + See `self.dtype` for details + """ # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) 
@@ -430,20 +440,22 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn """ return (self,) - def get_data_buffer(self) -> _PandasBuffer: + def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data. """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): buffer = _PandasBuffer(self._col.to_numpy()) + dtype = self.dtype elif self.dtype[0] == _k.CATEGORICAL: - # FIXME: losing the dtype info here - see `convert_categorical_column` - buffer = _PandasBuffer(self._col.values.codes) + codes = self._col.values.codes + buffer = _PandasBuffer(codes) + dtype = self._dtype_from_pandasdtype(codes.dtype) else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") - return buffer + return buffer, dtype def get_mask(self) -> _PandasBuffer: """