From 3752760ea836d55fd8e977a84c22e6b903cad7d2 Mon Sep 17 00:00:00 2001
From: JB Lovland
Date: Mon, 15 Jan 2024 15:07:27 +0100
Subject: [PATCH] CLN: Lazy import pyarrow

---
 src/fmu/dataio/_objectdata_provider.py | 45 ++++++++++++--------------
 src/fmu/dataio/_utils.py               | 33 ++++++++-----------
 tests/conftest.py                      | 43 +++++++++++++-----------
 tests/test_units/test_table.py         |  5 +--
 4 files changed, 61 insertions(+), 65 deletions(-)

diff --git a/src/fmu/dataio/_objectdata_provider.py b/src/fmu/dataio/_objectdata_provider.py
index 3f779358b..0a6ee8232 100644
--- a/src/fmu/dataio/_objectdata_provider.py
+++ b/src/fmu/dataio/_objectdata_provider.py
@@ -99,13 +99,6 @@
 from ._definitions import ALLOWED_CONTENTS, STANDARD_TABLE_INDEX_COLUMNS, _ValidFormats
 from ._utils import generate_description, parse_timedata
 
-try:
-    import pyarrow as pa
-except ImportError:
-    HAS_PYARROW = False
-else:
-    HAS_PYARROW = True
-
 logger: Final = logging.getLogger(__name__)
 
 
@@ -304,21 +297,6 @@ def _derive_objectdata(self) -> dict:
             )
             result["spec"], result["bbox"] = self._derive_spec_bbox_dataframe()
 
-        elif HAS_PYARROW and isinstance(self.obj, pa.Table):
-            result["table_index"] = self._derive_index()
-
-            result["subtype"] = "ArrowTable"
-            result["classname"] = "table"
-            result["layout"] = "table"
-            result["efolder"] = "tables"
-            result["fmt"] = self.dataio.arrow_fformat
-            result["extension"] = self._validate_get_ext(
-                result["fmt"],
-                result["subtype"],
-                _ValidFormats().table,
-            )
-            result["spec"], result["bbox"] = self._derive_spec_bbox_arrowtable()
-
         elif isinstance(self.obj, dict):
             result["subtype"] = "JSON"
             result["classname"] = "dictionary"
@@ -333,9 +311,26 @@
             result["spec"], result["bbox"] = self._derive_spec_bbox_dict()
 
         else:
-            raise NotImplementedError(
-                "This data type is not (yet) supported: ", type(self.obj)
-            )
+            from pyarrow import Table
+
+            if isinstance(self.obj, Table):
+                result["table_index"] = self._derive_index()
+
+                result["subtype"] = "ArrowTable"
+                result["classname"] = "table"
+                result["layout"] = "table"
+                result["efolder"] = "tables"
+                result["fmt"] = self.dataio.arrow_fformat
+                result["extension"] = self._validate_get_ext(
+                    result["fmt"],
+                    result["subtype"],
+                    _ValidFormats().table,
+                )
+                result["spec"], result["bbox"] = self._derive_spec_bbox_arrowtable()
+            else:
+                raise NotImplementedError(
+                    "This data type is not (yet) supported: ", type(self.obj)
+                )
 
         # override efolder with forcefolder as exception!
         if self.dataio.forcefolder and not self.dataio.forcefolder.startswith("/"):
diff --git a/src/fmu/dataio/_utils.py b/src/fmu/dataio/_utils.py
index a735ccb7f..3cab7f19c 100644
--- a/src/fmu/dataio/_utils.py
+++ b/src/fmu/dataio/_utils.py
@@ -15,20 +15,11 @@
 from typing import Any, Final, Literal
 
 import pandas as pd
+import xtgeo
 import yaml
 from fmu.config import utilities as ut
 
-try:
-    import pyarrow as pa
-except ImportError:
-    HAS_PYARROW = False
-else:
-    HAS_PYARROW = True
-    from pyarrow import feather
-
-import xtgeo
-
 from . import _design_kw, _oyaml as oyaml
 
 logger: Final = logging.getLogger(__name__)
 
@@ -149,15 +140,19 @@ def export_file(
     elif filename.suffix == ".csv" and isinstance(obj, pd.DataFrame):
         includeindex = flag == "include_index"
         obj.to_csv(filename, index=includeindex)
-    elif filename.suffix == ".arrow" and HAS_PYARROW and isinstance(obj, pa.Table):
-        # comment taken from equinor/webviz_subsurface/smry2arrow.py
-
-        # Writing here is done through the feather import, but could also be done using
-        # pa.RecordBatchFileWriter.write_table() with a few pa.ipc.IpcWriteOptions(). It
-        # is convenient to use feather since it has ready configured defaults and the
-        # actual file format is the same
-        # (https://arrow.apache.org/docs/python/feather.html)
-        feather.write_feather(obj, dest=filename)
+    elif filename.suffix == ".arrow":
+        from pyarrow import Table
+
+        if isinstance(obj, Table):
+            from pyarrow import feather
+            # comment taken from equinor/webviz_subsurface/smry2arrow.py
+
+            # Writing here is done through the feather import, but could also be
+            # done using pa.RecordBatchFileWriter.write_table() with a few
+            # pa.ipc.IpcWriteOptions(). It is convenient to use feather since it
+            # has ready configured defaults and the actual file format is the same
+            # (https://arrow.apache.org/docs/python/feather.html)
+            feather.write_feather(obj, dest=filename)
     elif filename.suffix == ".json":
         with open(filename, "w") as stream:
             json.dump(obj, stream)
diff --git a/tests/conftest.py b/tests/conftest.py
index 1f2c9f40f..e19b07001 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,22 +9,14 @@
 from functools import wraps
 from pathlib import Path
 
+import fmu.dataio as dio
 import pandas as pd
 import pytest
 import xtgeo
 import yaml
 from fmu.config import utilities as ut
-from termcolor import cprint
-
-try:
-    import pyarrow as pa
-except ImportError:
-    HAS_PYARROW = False
-else:
-    HAS_PYARROW = True
-
-import fmu.dataio as dio
 from fmu.dataio.dataio import ExportData, read_metadata
+from termcolor import cprint
 
 logger = logging.getLogger(__name__)
 
@@ -478,12 +470,19 @@ def fixture_wellpicks():
 @pytest.fixture(name="arrowtable", scope="module", autouse=True)
 def fixture_arrowtable():
     """Create an arrow table instance."""
-    table = None
-    if HAS_PYARROW:
-        logger.info("Ran %s", inspect.currentframe().f_code.co_name)
-        dfr = pd.DataFrame({"COL1": [1, 2, 3, 4], "COL2": [99.0, 98.0, 97.0, 96.0]})
-        table = pa.Table.from_pandas(dfr)
-    return table
+    try:
+        from pyarrow import Table
+
+        return Table.from_pandas(
+            pd.DataFrame(
+                {
+                    "COL1": [1, 2, 3, 4],
+                    "COL2": [99.0, 98.0, 97.0, 96.0],
+                }
+            )
+        )
+    except ImportError:
+        return None
 
 
 @pytest.fixture(name="aggr_surfs_mean", scope="module", autouse=True)
@@ -554,8 +553,9 @@ def fixture_drogon_sum():
     Returns:
         pa.Table: table with summary data
     """
+    from pyarrow import feather
     path = ROOTPWD / "tests/data/drogon/tabular/summary.arrow"
-    return pa.feather.read_table(path)
+    return feather.read_table(path)
 
 
 @pytest.fixture(name="mock_volumes")
@@ -582,8 +582,13 @@ def fixture_drogon_volumes():
     Returns:
         pa.Table: table with summary data
     """
-    path = ROOTPWD / "tests/data/drogon/tabular/geogrid--vol.csv"
-    return pa.Table.from_pandas(pd.read_csv(path))
+    from pyarrow import Table
+
+    return Table.from_pandas(
+        pd.read_csv(
+            ROOTPWD / "tests/data/drogon/tabular/geogrid--vol.csv",
+        )
+    )
 
 
 # ======================================================================================
diff --git a/tests/test_units/test_table.py b/tests/test_units/test_table.py
index 5d4f05bdb..39e41d286 100644
--- a/tests/test_units/test_table.py
+++ b/tests/test_units/test_table.py
@@ -2,7 +2,6 @@
 """
 from pathlib import Path
 
-import pyarrow as pa
 import pytest
 from fmu.config.utilities import yaml_load
 from fmu.dataio import ExportData
@@ -83,9 +82,11 @@ def test_derive_summary_index_pyarrow(mock_summary, globalconfig2):
         mock_summary (pd.DataFrame): summary "like" dataframe
         globalconfig2 (dict): global variables dict
     """
+    from pyarrow import Table
+
    answer = ["DATE"]
     exd = ExportData(config=globalconfig2, content="timeseries")
-    path = exd.export(pa.Table.from_pandas(mock_summary), name="baretull")
+    path = exd.export(Table.from_pandas(mock_summary), name="baretull")
     assert_correct_table_index(path, answer)
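
Note (not part of the patch): every hunk above applies the same deferred-import idea: pyarrow is imported inside the code paths that actually touch Arrow tables, so fmu.dataio can be imported and used without pyarrow installed. Below is a minimal sketch of that pattern for illustration only; the function name and behaviour are hypothetical, not fmu.dataio API.

    from typing import Any


    def classify(obj: Any) -> str:
        """Illustrative only: label an object, importing pyarrow lazily."""
        try:
            # Deferred import: the dependency is only required when this
            # code path actually runs.
            from pyarrow import Table
        except ImportError:
            Table = None  # pyarrow absent, so Arrow tables cannot appear here

        if Table is not None and isinstance(obj, Table):
            return "ArrowTable"
        raise NotImplementedError("This data type is not (yet) supported: ", type(obj))

The arrowtable fixture in tests/conftest.py uses the try/except ImportError form shown here, while the production code imports pyarrow directly inside the branch that handles Arrow tables.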