From 2db1cc098d939b7d7b41e18ad8525d18ad20c1ad Mon Sep 17 00:00:00 2001 From: Sven Date: Mon, 11 Dec 2017 05:28:05 +1100 Subject: [PATCH] BUG: Categorical data fails to load from hdf when all columns are NaN (#18652) --- .gitignore | 1 + doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/pytables.py | 15 +++++++++++---- pandas/tests/io/test_pytables.py | 19 +++++++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index ff0a6aef47163..b1748ae72b8ba 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ doc/build/html/index.html doc/tmp.sv doc/source/styled.xlsx doc/source/templates/ +env/ diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 31902c98d0b6c..fbfbb403b2a17 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -126,6 +126,7 @@ I/O - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) - Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`) - Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`) +- Bug when reading NaN-only categorical columns in :class:`HDFStore` (:issue:`18413`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 74cd2ba7dc4d8..d73417f7b0c95 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2137,10 +2137,17 @@ def convert(self, values, nan_rep, encoding): # if we have stored a NaN in the categories # then strip it; in theory we could have BOTH # -1s in the codes and nulls :< - mask = isna(categories) - if mask.any(): - categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values + if categories is None: + # Handle case of NaN-only categorical columns in which case + # the categories are an empty array; when this is stored, + # pytables cannot write a zero-len array, so on readback + # the categories would be None and `read_hdf()` would fail. + categories = Index([], dtype=np.float64) + else: + mask = isna(categories) + if mask.any(): + categories = categories[~mask] + codes[codes != -1] -= mask.astype(int).cumsum().values self.data = Categorical.from_codes(codes, categories=categories, diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index f239b7fe7855d..85f24e794f12a 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4928,6 +4928,25 @@ def test_categorical_conversion(self): result = read_hdf(path, 'df', where='obsids=B') tm.assert_frame_equal(result, expected) + def test_categorical_nan_only_columns(self): + # GH18413 + # Check that read_hdf with categorical columns with NaN-only values can + # be read back. + df = pd.DataFrame({ + 'a': ['a', 'b', 'c', np.nan], + 'b': [np.nan, np.nan, np.nan, np.nan], + 'c': [1, 2, 3, 4], + 'd': pd.Series([None] * 4, dtype=object) + }) + df['a'] = df.a.astype('category') + df['b'] = df.b.astype('category') + df['d'] = df.b.astype('category') + expected = df + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table', data_columns=True) + result = read_hdf(path, 'df') + tm.assert_frame_equal(result, expected) + def test_duplicate_column_name(self): df = DataFrame(columns=["a", "a"], data=[[0, 0]])