Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Lazy import pyarrow #420

Merged
merged 1 commit into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 20 additions & 25 deletions src/fmu/dataio/_objectdata_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,6 @@
from ._definitions import ALLOWED_CONTENTS, STANDARD_TABLE_INDEX_COLUMNS, _ValidFormats
from ._utils import generate_description, parse_timedata

try:
import pyarrow as pa
except ImportError:
HAS_PYARROW = False
else:
HAS_PYARROW = True

logger: Final = logging.getLogger(__name__)


Expand Down Expand Up @@ -304,21 +297,6 @@ def _derive_objectdata(self) -> dict:
)
result["spec"], result["bbox"] = self._derive_spec_bbox_dataframe()

elif HAS_PYARROW and isinstance(self.obj, pa.Table):
result["table_index"] = self._derive_index()

result["subtype"] = "ArrowTable"
result["classname"] = "table"
result["layout"] = "table"
result["efolder"] = "tables"
result["fmt"] = self.dataio.arrow_fformat
result["extension"] = self._validate_get_ext(
result["fmt"],
result["subtype"],
_ValidFormats().table,
)
result["spec"], result["bbox"] = self._derive_spec_bbox_arrowtable()

elif isinstance(self.obj, dict):
result["subtype"] = "JSON"
result["classname"] = "dictionary"
Expand All @@ -333,9 +311,26 @@ def _derive_objectdata(self) -> dict:
result["spec"], result["bbox"] = self._derive_spec_bbox_dict()

else:
raise NotImplementedError(
"This data type is not (yet) supported: ", type(self.obj)
)
from pyarrow import Table

if isinstance(self.obj, Table):
result["table_index"] = self._derive_index()

result["subtype"] = "ArrowTable"
result["classname"] = "table"
result["layout"] = "table"
result["efolder"] = "tables"
result["fmt"] = self.dataio.arrow_fformat
result["extension"] = self._validate_get_ext(
result["fmt"],
result["subtype"],
_ValidFormats().table,
)
result["spec"], result["bbox"] = self._derive_spec_bbox_arrowtable()
else:
raise NotImplementedError(
"This data type is not (yet) supported: ", type(self.obj)
)

# override efolder with forcefolder as exception!
if self.dataio.forcefolder and not self.dataio.forcefolder.startswith("/"):
Expand Down
33 changes: 14 additions & 19 deletions src/fmu/dataio/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,11 @@
from typing import Any, Final, Literal

import pandas as pd
import xtgeo
import yaml

from fmu.config import utilities as ut

try:
import pyarrow as pa
except ImportError:
HAS_PYARROW = False
else:
HAS_PYARROW = True
from pyarrow import feather

import xtgeo

from . import _design_kw

logger: Final = logging.getLogger(__name__)
Expand Down Expand Up @@ -149,15 +140,19 @@ def export_file(
elif filename.suffix == ".csv" and isinstance(obj, pd.DataFrame):
includeindex = flag == "include_index"
obj.to_csv(filename, index=includeindex)
elif filename.suffix == ".arrow" and HAS_PYARROW and isinstance(obj, pa.Table):
# comment taken from equinor/webviz_subsurface/smry2arrow.py

# Writing here is done through the feather import, but could also be done using
# pa.RecordBatchFileWriter.write_table() with a few pa.ipc.IpcWriteOptions(). It
# is convenient to use feather since it has ready configured defaults and the
# actual file format is the same
# (https://arrow.apache.org/docs/python/feather.html)
feather.write_feather(obj, dest=filename)
elif filename.suffix == ".arrow":
from pyarrow import Table

if isinstance(obj, Table):
from pyarrow import feather
# comment taken from equinor/webviz_subsurface/smry2arrow.py

# Writing here is done through the feather import, but could also be
# done using pa.RecordBatchFileWriter.write_table() with a few
# pa.ipc.IpcWriteOptions(). It is convenient to use feather since it
# has ready configured defaults and the actual file format is the same
# (https://arrow.apache.org/docs/python/feather.html)
feather.write_feather(obj, dest=filename)
elif filename.suffix == ".json":
with open(filename, "w") as stream:
json.dump(obj, stream)
Expand Down
44 changes: 25 additions & 19 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,14 @@
from functools import wraps
from pathlib import Path

import fmu.dataio as dio
import pandas as pd
import pytest
import xtgeo
import yaml
from fmu.config import utilities as ut
from termcolor import cprint

try:
import pyarrow as pa
except ImportError:
HAS_PYARROW = False
else:
HAS_PYARROW = True

import fmu.dataio as dio
from fmu.dataio.dataio import ExportData, read_metadata
from termcolor import cprint

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -478,12 +470,19 @@ def fixture_wellpicks():
@pytest.fixture(name="arrowtable", scope="module", autouse=True)
def fixture_arrowtable():
    """Create a small pyarrow Table instance for table-export tests.

    pyarrow is an optional dependency, so it is imported lazily inside the
    fixture; when it is not installed the fixture yields ``None`` and
    dependent tests are expected to handle/skip that case.

    Returns:
        pyarrow.Table | None: a two-column (int, float) table, or ``None``
        when pyarrow is unavailable.
    """
    try:
        # Lazy import keeps pyarrow optional (the point of this PR).
        from pyarrow import Table

        return Table.from_pandas(
            pd.DataFrame(
                {
                    "COL1": [1, 2, 3, 4],
                    "COL2": [99.0, 98.0, 97.0, 96.0],
                }
            )
        )
    except ImportError:
        return None


@pytest.fixture(name="aggr_surfs_mean", scope="module", autouse=True)
Expand Down Expand Up @@ -554,8 +553,10 @@ def fixture_drogon_sum():
Returns:
pa.Table: table with summary data
"""
from pyarrow import feather

path = ROOTPWD / "tests/data/drogon/tabular/summary.arrow"
return pa.feather.read_table(path)
return feather.read_table(path)


@pytest.fixture(name="mock_volumes")
Expand All @@ -582,8 +583,13 @@ def fixture_drogon_volumes():
Returns:
pa.Table: table with summary data
"""
path = ROOTPWD / "tests/data/drogon/tabular/geogrid--vol.csv"
return pa.Table.from_pandas(pd.read_csv(path))
from pyarrow import Table

return Table.from_pandas(
pd.read_csv(
ROOTPWD / "tests/data/drogon/tabular/geogrid--vol.csv",
)
)


# ======================================================================================
Expand Down
5 changes: 3 additions & 2 deletions tests/test_units/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"""
from pathlib import Path

import pyarrow as pa
import pytest
from fmu.config.utilities import yaml_load
from fmu.dataio import ExportData
Expand Down Expand Up @@ -83,9 +82,11 @@ def test_derive_summary_index_pyarrow(mock_summary, globalconfig2):
mock_summary (pd.DataFrame): summary "like" dataframe
globalconfig2 (dict): global variables dict
"""
from pyarrow import Table

answer = ["DATE"]
exd = ExportData(config=globalconfig2, content="timeseries")
path = exd.export(pa.Table.from_pandas(mock_summary), name="baretull")
path = exd.export(Table.from_pandas(mock_summary), name="baretull")
assert_correct_table_index(path, answer)


Expand Down
Loading