From 0448823f2f737439bebc7f7664aad21ae4335964 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 28 Jul 2022 13:22:46 -0700 Subject: [PATCH 1/4] REF: PandasColumn.describe_categorical return categories instead of mapping --- pandas/core/exchange/column.py | 13 ++++++++----- pandas/core/exchange/dataframe_protocol.py | 11 ++++++----- pandas/core/exchange/from_dataframe.py | 4 +--- pandas/tests/exchange/test_impl.py | 13 ++++++++----- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/pandas/core/exchange/column.py b/pandas/core/exchange/column.py index c2a1cfe766b22..c7553fad5cb3f 100644 --- a/pandas/core/exchange/column.py +++ b/pandas/core/exchange/column.py @@ -145,15 +145,18 @@ def describe_categorical(self): """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. - Raises RuntimeError if the dtype is not categorical + - There is a separate non-categorical Column encoding for categorical values. + + Raises TypeError if the dtype is not categorical + Content of returned dict: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - "is_dictionary" : bool, whether a dictionary-style mapping of categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. 
""" if not self.dtype[0] == DtypeKind.CATEGORICAL: raise TypeError( @@ -163,7 +166,7 @@ def describe_categorical(self): return { "is_ordered": self._col.cat.ordered, "is_dictionary": True, - "mapping": dict(enumerate(self._col.cat.categories)), + "categories": PandasColumn(pd.Series(self._col.cat.categories)), } @property diff --git a/pandas/core/exchange/dataframe_protocol.py b/pandas/core/exchange/dataframe_protocol.py index 367b906332741..51e02ab313214 100644 --- a/pandas/core/exchange/dataframe_protocol.py +++ b/pandas/core/exchange/dataframe_protocol.py @@ -110,7 +110,7 @@ class CategoricalDescription(TypedDict): is_dictionary: bool # Python-level only (e.g. ``{int: str}``). # None if not a dictionary-style categorical. - mapping: dict | None + categories: Column | None class Buffer(ABC): @@ -274,17 +274,18 @@ def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. + - There is a separate non-categorical Column encoding for categorical values. Raises TypeError if the dtype is not categorical Returns the dictionary with description on how to interpret the data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of + - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. TBD: are there any other in-memory representations that are needed? 
""" diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py index a33e47ba3b68e..07ee538011ae9 100644 --- a/pandas/core/exchange/from_dataframe.py +++ b/pandas/core/exchange/from_dataframe.py @@ -179,9 +179,7 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: if not categorical["is_dictionary"]: raise NotImplementedError("Non-dictionary categoricals not supported yet") - mapping = categorical["mapping"] - assert isinstance(mapping, dict), "Categorical mapping must be a dict" - categories = np.array(tuple(mapping[k] for k in sorted(mapping))) + categories = np.array(categorical["categories"]._col) buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] diff --git a/pandas/tests/exchange/test_impl.py b/pandas/tests/exchange/test_impl.py index e0e9fdce645d0..de5ea83a58a47 100644 --- a/pandas/tests/exchange/test_impl.py +++ b/pandas/tests/exchange/test_impl.py @@ -6,6 +6,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.exchange.column import PandasColumn from pandas.core.exchange.dataframe_protocol import ( ColumnNullType, DtypeKind, @@ -59,11 +60,13 @@ def test_categorical_dtype(data): assert col.null_count == 0 assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1) assert col.num_chunks() == 1 - assert col.describe_categorical == { - "is_ordered": data[1], - "is_dictionary": True, - "mapping": {0: "a", 1: "d", 2: "e", 3: "s", 4: "t"}, - } + desc_cat = col.describe_categorical + assert desc_cat["is_ordered"] == data[1] + assert desc_cat["is_dictionary"] is True + assert isinstance(desc_cat["categories"], PandasColumn) + tm.assert_series_equal( + desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"]) + ) tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) From 23a9ce3e180748c422cdd1f68a0604563d44d3b0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 28 Jul 2022 15:57:58 -0700 Subject: [PATCH 2/4] ignore typing --- 
pandas/core/exchange/from_dataframe.py | 2 +- pyright_reportGeneralTypeIssues.json | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/exchange/from_dataframe.py b/pandas/core/exchange/from_dataframe.py index 07ee538011ae9..80a629393abd2 100644 --- a/pandas/core/exchange/from_dataframe.py +++ b/pandas/core/exchange/from_dataframe.py @@ -179,7 +179,7 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: if not categorical["is_dictionary"]: raise NotImplementedError("Non-dictionary categoricals not supported yet") - categories = np.array(categorical["categories"]._col) + categories = np.array(categorical["categories"]._col) # type:ignore[union-attr] buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index c482aa32600fb..4318971cdbe96 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -36,6 +36,7 @@ "pandas/core/arrays/timedeltas.py", "pandas/core/computation/align.py", "pandas/core/construction.py", + "pandas/core/exchange/from_dataframe.py", "pandas/core/dtypes/cast.py", "pandas/core/dtypes/common.py", "pandas/core/dtypes/concat.py", From 5a2b5afa0fa17fc5356625243315621e87e7f3af Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Aug 2022 11:00:20 -0700 Subject: [PATCH 3/4] Change naming --- pyright_reportGeneralTypeIssues.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index 4318971cdbe96..cc24280958a8a 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -36,7 +36,6 @@ "pandas/core/arrays/timedeltas.py", "pandas/core/computation/align.py", "pandas/core/construction.py", - "pandas/core/exchange/from_dataframe.py", "pandas/core/dtypes/cast.py", "pandas/core/dtypes/common.py", "pandas/core/dtypes/concat.py", @@ -59,6 
+58,7 @@ "pandas/core/indexes/numeric.py", "pandas/core/indexes/period.py", "pandas/core/indexing.py", + "pandas/core/interchange/from_dataframe.py", "pandas/core/internals/api.py", "pandas/core/internals/array_manager.py", "pandas/core/internals/blocks.py", From a731b11e88c7988698d5141ab91b0da34b28d969 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 9 Aug 2022 10:54:33 -0700 Subject: [PATCH 4/4] Add back assert for typing --- pandas/core/interchange/from_dataframe.py | 6 +++++- pyright_reportGeneralTypeIssues.json | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 737f4737f2825..6e1b2de10e8e6 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import ( Buffer, Column, @@ -179,7 +180,10 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: if not categorical["is_dictionary"]: raise NotImplementedError("Non-dictionary categoricals not supported yet") - categories = np.array(categorical["categories"]._col) # type:ignore[union-attr] + cat_column = categorical["categories"] + # for mypy/pyright + assert isinstance(cat_column, PandasColumn), "categories must be a PandasColumn" + categories = np.array(cat_column._col) buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index cc24280958a8a..c482aa32600fb 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -58,7 +58,6 @@ "pandas/core/indexes/numeric.py", "pandas/core/indexes/period.py", "pandas/core/indexing.py", - "pandas/core/interchange/from_dataframe.py", "pandas/core/internals/api.py", 
"pandas/core/internals/array_manager.py", "pandas/core/internals/blocks.py",