Skip to content

Commit f7f5352

Browse files
xmnlab and jp-harvey authored
Add load_table array support for columnar method (#311)
* Basic working array load
* Fix columnar array load_table issue
* Add load_table tests for None and empty array

Co-authored-by: jp-harvey <[email protected]>
1 parent 91520f0 commit f7f5352

6 files changed

+384
-146
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ ENV/
9393
# pycharm
9494
.idea/
9595

96+
# vscode
97+
.vscode/
98+
9699
# Rope project settings
97100
.ropeproject
98101

pymapd/_pandas_loaders.py

+55-11
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626

2727

2828
def get_mapd_dtype(data):
29-
"Get the OmniSci type"
29+
"""Get the OmniSci type"""
30+
3031
if is_object_dtype(data):
3132
return get_mapd_type_from_object(data)
3233
else:
@@ -119,37 +120,80 @@ def build_input_columnar(
119120

120121
dfs = np.array_split(df, chunks)
121122
cols_array = []
123+
122124
for df in dfs:
123125
input_cols = []
124126

125127
colindex = 0
126128
for col in col_names:
127-
data = df[col]
129+
data = df.loc[:, [col]]
128130

129-
mapd_type = col_types[colindex][0]
131+
mapd_type = col_types[colindex].type
132+
is_array = col_types[colindex].is_array
133+
scale = col_types[colindex].scale
134+
has_nulls = data[col].hasnans
130135

131-
has_nulls = data.hasnans
132136
if has_nulls:
133-
nulls = data.isnull().values.tolist()
137+
nulls = data[col].isnull().values.tolist()
134138
else:
135139
nulls = [False] * len(df)
136140

141+
if is_array:
142+
# Expand the dataframe so each array item has
143+
# its own field in the dataframe.
144+
data = data.iloc[:, 0].apply(pd.Series)
145+
137146
if mapd_type in {'TIME', 'TIMESTAMP', 'DATE', 'BOOL'}:
138147
# requires a cast to integer
139-
data = thrift_cast(data, mapd_type, 0)
148+
for c in data:
149+
data.loc[:, c] = thrift_cast(
150+
data=data[c], mapd_type=mapd_type
151+
)
140152

141153
if mapd_type in ['DECIMAL']:
142154
# requires a calculation be done using the scale
143155
# then cast to int
144-
data = thrift_cast(data, mapd_type, col_types[colindex][1])
156+
for c in data:
157+
data.loc[:, c] = thrift_cast(
158+
data=data[c],
159+
mapd_type=mapd_type,
160+
scale=scale,
161+
is_array=is_array,
162+
)
145163

146164
if has_nulls:
147-
data = data.fillna(mapd_to_na[mapd_type])
165+
if not is_array:
166+
for c in data:
167+
data.loc[:, c] = data[c].fillna(mapd_to_na[mapd_type])
168+
169+
if is_array:
170+
data = data.apply(lambda x: [i for i in x.dropna()], axis=1)
171+
if has_nulls:
172+
data[nulls] = mapd_to_na[mapd_type]
148173

149174
if mapd_type not in ['FLOAT', 'DOUBLE', 'VARCHAR', 'STR']:
150-
data = data.astype('int64')
151-
# use .values so that indexes don't have to be serialized too
152-
kwargs = {mapd_to_slot[mapd_type]: data.values}
175+
if is_array:
176+
data = data.apply(
177+
lambda _array: [int(item) for item in _array]
178+
if isinstance(_array, list)
179+
else None
180+
)
181+
else:
182+
for c in data:
183+
data.loc[:, c] = data.loc[:, c].astype('int64')
184+
185+
# If this is an array column, we need the data to be a series
186+
# of TColumn objects of type mapd_type.
187+
if is_array:
188+
data = data.apply(
189+
lambda x: TColumn(
190+
data=TColumnData(**{mapd_to_slot[mapd_type]: x})
191+
)
192+
)
193+
kwargs = {'arr_col': data}
194+
else:
195+
kwargs = {mapd_to_slot[mapd_type]: data.iloc[:, 0].values}
196+
153197
input_cols.append(TColumn(data=TColumnData(**kwargs), nulls=nulls))
154198
colindex += 1
155199
cols_array.append(input_cols)

pymapd/connection.py

+31-30
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,8 @@ def __init__(
262262
proto = TBinaryProtocol.TBinaryProtocolAccelerated(transport)
263263
else:
264264
raise ValueError(
265-
"`protocol` should be one of",
266-
" ['http', 'https', 'binary'],",
265+
"`protocol` should be one of"
266+
" ['http', 'https', 'binary'],"
267267
" got {} instead".format(protocol),
268268
)
269269
self._user = user
@@ -433,7 +433,7 @@ def select_ipc_gpu(
433433
from cudf.core.dataframe import DataFrame # noqa
434434
except ImportError:
435435
raise ImportError(
436-
"The 'cudf' package is required for " "`select_ipc_gpu`"
436+
"The 'cudf' package is required for `select_ipc_gpu`"
437437
)
438438

439439
self.register_runtime_udfs()
@@ -771,37 +771,38 @@ def load_table_columnar(
771771
order to avoid loading inconsistent values into DATE column.
772772
"""
773773

774-
if isinstance(data, pd.DataFrame):
775-
table_details = self.get_table_details(table_name)
776-
# Validate that there are the same number of columns in the table
777-
# as there are in the dataframe. No point trying to load the data
778-
# if this is not the case
779-
if len(table_details) != len(data.columns):
780-
raise ValueError(
781-
'Number of columns in dataframe ({}) does not \
782-
match number of columns in OmniSci table \
783-
({})'.format(
784-
len(data.columns), len(table_details)
785-
)
786-
)
774+
if not isinstance(data, pd.DataFrame):
775+
raise TypeError('Unknown type {}'.format(type(data)))
787776

788-
col_names = (
789-
[i[0] for i in table_details]
790-
if col_names_from_schema
791-
else list(data)
777+
table_details = self.get_table_details(table_name)
778+
# Validate that there are the same number of columns in the table
779+
# as there are in the dataframe. No point trying to load the data
780+
# if this is not the case
781+
if len(table_details) != len(data.columns):
782+
raise ValueError(
783+
'Number of columns in dataframe ({}) does not \
784+
match number of columns in OmniSci table \
785+
({})'.format(
786+
len(data.columns), len(table_details)
787+
)
792788
)
793789

794-
col_types = [(i[1], i[4]) for i in table_details]
790+
col_names = (
791+
[i.name for i in table_details]
792+
if col_names_from_schema
793+
else list(data)
794+
)
795+
796+
col_types = table_details
797+
798+
input_cols = _pandas_loaders.build_input_columnar(
799+
data,
800+
preserve_index=preserve_index,
801+
chunk_size_bytes=chunk_size_bytes,
802+
col_types=col_types,
803+
col_names=col_names,
804+
)
795805

796-
input_cols = _pandas_loaders.build_input_columnar(
797-
data,
798-
preserve_index=preserve_index,
799-
chunk_size_bytes=chunk_size_bytes,
800-
col_types=col_types,
801-
col_names=col_names,
802-
)
803-
else:
804-
raise TypeError("Unknown type {}".format(type(data)))
805806
for cols in input_cols:
806807
self._client.load_table_binary_columnar(
807808
self._session, table_name, cols

tests/conftest.py

+13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import subprocess
22
import time
3+
from uuid import uuid4
4+
35
import pytest
46
from thrift.transport import TSocket, TTransport
57
from thrift.transport.TSocket import TTransportException
@@ -189,3 +191,14 @@ def _tests_table_no_nulls(n_samples):
189191
}
190192

191193
return pd.DataFrame(d)
194+
195+
196+
@pytest.fixture
197+
def tmp_table(con) -> str:
198+
table_name = 'table_{}'.format(uuid4().hex)
199+
con.execute("drop table if exists {};".format(table_name))
200+
201+
try:
202+
yield table_name
203+
finally:
204+
con.execute("drop table if exists {};".format(table_name))

tests/test_integration.py

+49-11
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
TOmniSciException.__hash__ = lambda x: id(x)
2626

2727

28+
def _cursor2df(cursor):
29+
col_names = [c.name for c in cursor.description]
30+
return pd.DataFrame(cursor.fetchall(), columns=col_names)
31+
32+
2833
@pytest.mark.usefixtures("mapd_server")
2934
class TestIntegration:
3035
def test_connect_binary(self):
@@ -666,17 +671,50 @@ def test_load_empty_table_arrow(self, con):
666671
self.check_empty_insert(result, data)
667672
con.execute("drop table if exists baz;")
668673

669-
def test_load_table_columnar(self, con):
670-
671-
con.execute("drop table if exists baz;")
672-
con.execute("create table baz (a int, b float, c text);")
673-
674-
df = pd.DataFrame(
675-
{"a": [1, 2, 3], "b": [1.1, 2.2, 3.3], "c": ['a', '2', '3']},
676-
columns=['a', 'b', 'c'],
677-
)
678-
con.load_table_columnar("baz", df)
679-
con.execute("drop table if exists baz;")
674+
@pytest.mark.parametrize(
675+
'df, table_fields',
676+
[
677+
(
678+
pd.DataFrame(
679+
{
680+
"a": [1, 2, 3],
681+
"b": [1.1, 2.2, 3.3],
682+
"c": ['a', '2', '3'],
683+
},
684+
),
685+
'a int, b float, c text',
686+
),
687+
(
688+
pd.DataFrame(
689+
[
690+
{'ary': [2, 3, 4]},
691+
{'ary': [4444]},
692+
{'ary': []},
693+
{'ary': None},
694+
{'ary': [2, 3, 4]},
695+
]
696+
),
697+
'ary INT[]',
698+
),
699+
(
700+
pd.DataFrame(
701+
[
702+
{'ary': [2, 3, 4], 'strtest': 'teststr'},
703+
{'ary': None, 'strtest': 'teststr'},
704+
{'ary': [4444], 'strtest': 'teststr'},
705+
{'ary': [], 'strtest': 'teststr'},
706+
{'ary': [2, 3, 4], 'strtest': 'teststr'},
707+
]
708+
),
709+
'ary INT[], strtest TEXT',
710+
),
711+
],
712+
)
713+
def test_load_table_columnar(self, con, tmp_table, df, table_fields):
714+
con.execute("create table {} ({});".format(tmp_table, table_fields))
715+
con.load_table_columnar(tmp_table, df)
716+
result = _cursor2df(con.execute('select * from {}'.format(tmp_table)))
717+
pd.testing.assert_frame_equal(df, result)
680718

681719
def test_load_infer(self, con):
682720

0 commit comments

Comments
 (0)