From 17afac9ff70a2b93ed70dc7bcce7beb9a53c2ece Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 3 Oct 2023 16:04:14 -0500 Subject: [PATCH] feat: support STRUCT data type with `Series.struct.field` to extract child fields (#71) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .kokoro/build.sh | 3 + bigframes/dataframe.py | 10 +- bigframes/dtypes.py | 63 ++++++++++++- bigframes/operations/base.py | 10 +- bigframes/operations/structs.py | 61 ++++++++++++ bigframes/series.py | 5 + noxfile.py | 2 +- tests/system/small/test_dataframe.py | 14 ++- tests/unit/test_dtypes.py | 64 +++++++++++++ .../pandas/core/arrays/__init__.py | 0 .../pandas/core/arrays/arrow/__init__.py | 0 .../pandas/core/arrays/arrow/accessors.py | 94 +++++++++++++++++++ .../bigframes_vendored/sklearn/__init__.py | 0 .../sklearn/ensemble/__init__.py | 0 .../bigframes_vendored/xgboost/__init__.py | 0 15 files changed, 318 insertions(+), 8 deletions(-) create mode 100644 bigframes/operations/structs.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py create mode 100644 third_party/bigframes_vendored/sklearn/__init__.py create mode 100644 third_party/bigframes_vendored/sklearn/ensemble/__init__.py create mode 100644 third_party/bigframes_vendored/xgboost/__init__.py diff --git a/.kokoro/build.sh b/.kokoro/build.sh index a0fa4bc787..58eaa7fedf 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -26,6 +26,9 @@ cd "${PROJECT_ROOT}" # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 +# Workaround https://github.com/pytest-dev/pytest/issues/9567 +export PY_IGNORE_IMPORTMISMATCH=1 + # Debug: show build environment env | grep KOKORO diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b8ae37ee6c..345afbe6e6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -160,7 +160,15 @@ def __init__( columns=columns, # type:ignore dtype=dtype, # type:ignore ) - if pd_dataframe.size < MAX_INLINE_DF_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_DF_SIZE + # TODO(swast): Workaround data types limitation in inline data. 
+ and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pandas.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 59d3007fab..46a7a1cb50 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -84,10 +84,10 @@ BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), + (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.float64, pd.Float64Dtype()), (ibis_dtypes.int64, pd.Int64Dtype()), (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), (ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))), ( @@ -100,6 +100,19 @@ pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } +IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { + ibis_dtypes.boolean: pa.bool_(), + ibis_dtypes.date: pa.date32(), + ibis_dtypes.float64: pa.float64(), + ibis_dtypes.int64: pa.int64(), + ibis_dtypes.string: pa.string(), + ibis_dtypes.time: pa.time64("us"), + ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), + ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), +} + +ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} + IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } @@ -148,11 +161,12 @@ def ibis_dtype_to_bigframes_dtype( # Special cases: Ibis supports variations on these types, but currently # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) - if isinstance(ibis_dtype, ibis_dtypes.Array) or isinstance( - ibis_dtype, ibis_dtypes.Struct - ): + if isinstance(ibis_dtype, ibis_dtypes.Array): return np.dtype("O") + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -164,6 +178,26 @@ def ibis_dtype_to_bigframes_dtype( ) +def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: + if isinstance(ibis_dtype, ibis_dtypes.Array): + return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pa.struct( + [ + (name, ibis_dtype_to_arrow_dtype(dtype)) + for name, dtype in ibis_dtype.fields.items() + ] + ) + + if ibis_dtype in IBIS_TO_ARROW: + return IBIS_TO_ARROW[ibis_dtype] + else: + raise ValueError( + f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" + ) + + def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: """Converts an Ibis expression to canonical type. @@ -187,6 +221,24 @@ def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: return table.select(*casted_columns) +def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: + if pa.types.is_struct(arrow_dtype): + struct_dtype = typing.cast(pa.StructType, arrow_dtype) + return ibis_dtypes.Struct.from_tuples( + [ + (field.name, arrow_dtype_to_ibis_dtype(field.type)) + for field in struct_dtype + ] + ) + + if arrow_dtype in ARROW_TO_IBIS: + return ARROW_TO_IBIS[arrow_dtype] + else: + raise ValueError( + f"Unexpected Arrow data type {arrow_dtype}. 
{constants.FEEDBACK_LINK}" + ) + + def bigframes_dtype_to_ibis_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> ibis_dtypes.DataType: @@ -202,6 +254,9 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. """ + if isinstance(bigframes_dtype, pd.ArrowDtype): + return arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) + type_string = str(bigframes_dtype) if type_string in BIGFRAMES_STRING_TO_BIGFRAMES: bigframes_dtype = BIGFRAMES_STRING_TO_BIGFRAMES[ diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index add6af57f4..51eaad18b9 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -86,7 +86,15 @@ def __init__( if pd_series.name is None: # to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1) - if pd_dataframe.size < MAX_INLINE_SERIES_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_SERIES_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pd.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py new file mode 100644 index 0000000000..80d51115d0 --- /dev/null +++ b/bigframes/operations/structs.py @@ -0,0 +1,61 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import ibis.expr.types as ibis_types + +import bigframes.dataframe +import bigframes.operations +import bigframes.operations.base +import bigframes.series +import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors + + +class StructField(bigframes.operations.UnaryOp): + def __init__(self, name_or_index: str | int): + self._name_or_index = name_or_index + + def _as_ibis(self, x: ibis_types.Value): + struct_value = typing.cast(ibis_types.StructValue, x) + if isinstance(self._name_or_index, str): + name = self._name_or_index + else: + name = struct_value.names[self._name_or_index] + return struct_value[name].name(name) + + +class StructAccessor( + bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor +): + __doc__ = vendoracessors.StructAccessor.__doc__ + + def field(self, name_or_index: str | int) -> bigframes.series.Series: + series = self._apply_unary_op(StructField(name_or_index)) + if isinstance(name_or_index, str): + name = name_or_index + else: + struct_field = self._dtype.pyarrow_dtype[name_or_index] + name = struct_field.name + return series.rename(name) + + def explode(self) -> bigframes.dataframe.DataFrame: + import bigframes.pandas + + pa_type = self._dtype.pyarrow_dtype + return bigframes.pandas.concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/bigframes/series.py b/bigframes/series.py index 47298d59f5..84d737210e 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -51,6 +51,7 @@ import bigframes.operations.base import bigframes.operations.datetimes as dt import bigframes.operations.strings as strings +import bigframes.operations.structs as structs import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] @@ -118,6 +119,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]: self._set_internal_query_job(self._compute_dry_run()) return self._query_job + @property + def struct(self) -> structs.StructAccessor: + return structs.StructAccessor(self._block) + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job diff --git a/noxfile.py b/noxfile.py index 81ed9c2b2c..a113e1fcde 100644 --- a/noxfile.py +++ b/noxfile.py @@ -362,7 +362,7 @@ def doctest(session: nox.sessions.Session): run_system( session=session, prefix_name="doctest", - extra_pytest_options=("--doctest-modules",), + extra_pytest_options=("--doctest-modules", "third_party"), test_folder="bigframes", check_cov=True, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a27d8fd487..2880932d10 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -917,7 +917,19 @@ def test_get_dtypes_array_struct(session): dtypes = df.dtypes pd.testing.assert_series_equal( dtypes, - pd.Series({"array_column": np.dtype("O"), "struct_column": np.dtype("O")}), + pd.Series( + { + "array_column": np.dtype("O"), + "struct_column": pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + } + ), ) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index bb8ae570dc..3baff2e1f5 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -85,6 +85,70 @@ def test_ibis_float32_raises_unexpected_datatype(): bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtypes.float32) +IBIS_ARROW_DTYPES = ( + (ibis_dtypes.boolean, pa.bool_()), 
+ (ibis_dtypes.date, pa.date32()), + (ibis_dtypes.Timestamp(), pa.timestamp("us")), + (ibis_dtypes.float64, pa.float64()), + ( + ibis_dtypes.Timestamp(timezone="UTC"), + pa.timestamp("us", tz="UTC"), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ("name", ibis_dtypes.string()), + ("version", ibis_dtypes.int64()), + ] + ), + pa.struct( + [ + ("name", pa.string()), + ("version", pa.int64()), + ] + ), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ( + "nested", + ibis_dtypes.Struct.from_tuples( + [ + ("field", ibis_dtypes.string()), + ] + ), + ), + ] + ), + pa.struct( + [ + ( + "nested", + pa.struct( + [ + ("field", pa.string()), + ] + ), + ), + ] + ), + ), +) + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_arrow_dtype_to_ibis_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_dtype) + assert result == ibis_dtype + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtype) + assert result == arrow_dtype + + @pytest.mark.parametrize( ["bigframes_dtype", "ibis_dtype"], [ diff --git a/third_party/bigframes_vendored/pandas/core/arrays/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000..8e3ea06a3d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,94 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arrays/arrow/accessors.py +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from bigframes import constants + + +class StructAccessor: + """ + Accessor object for structured data properties of the Series values. + """ + + def field(self, name_or_index: str | int): + """ + Extract a child field of a struct as a Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: Int64 + + Args: + name_or_index: + Name (str) or index (int) of the child field to extract. + + Returns: + Series: + The data corresponding to the selected child field. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def explode(self): + """ + Extract all child fields of a struct as a DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... 
{"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract all child fields. + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + + [3 rows x 2 columns] + + Returns: + DataFrame: + The data corresponding to all child fields. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/__init__.py b/third_party/bigframes_vendored/sklearn/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/sklearn/ensemble/__init__.py b/third_party/bigframes_vendored/sklearn/ensemble/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/xgboost/__init__.py b/third_party/bigframes_vendored/xgboost/__init__.py new file mode 100644 index 0000000000..e69de29bb2