Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Create ExistingDataProvider directly #560

Merged
merged 1 commit on Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 26 additions & 35 deletions src/fmu/dataio/providers/_objectdata.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -133,42 +133,30 @@ def objectdata_provider_factory(
metadata for.
"""
if meta_existing:
return ExistingDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)

meta_existing = {}
return ExistingDataProvider.from_metadata_dict(obj, dataio, meta_existing)
if isinstance(obj, xtgeo.RegularSurface):
return RegularSurfaceDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return RegularSurfaceDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Polygons):
return PolygonsDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return PolygonsDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Points):
return PointsDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return PointsDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Cube):
return CubeDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return CubeDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.Grid):
return CPGridDataProvider(obj=obj, dataio=dataio, meta_existing=meta_existing)
return CPGridDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, xtgeo.GridProperty):
return CPGridPropertyDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return CPGridPropertyDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, pd.DataFrame):
return DataFrameDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return DataFrameDataProvider(obj=obj, dataio=dataio)
if isinstance(obj, dict):
return DictionaryDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return DictionaryDataProvider(obj=obj, dataio=dataio)

from pyarrow import Table

if isinstance(obj, Table):
return ArrowTableDataProvider(
obj=obj, dataio=dataio, meta_existing=meta_existing
)
return ArrowTableDataProvider(obj=obj, dataio=dataio)

raise NotImplementedError("This data type is not (yet) supported: ", type(obj))
raise NotImplementedError(f"This data type is not currently supported: {type(obj)}")


@dataclass
Expand Down Expand Up @@ -345,30 +333,33 @@ def get_objectdata(self) -> DerivedObjectDescriptor:

@dataclass
class ExistingDataProvider(ObjectDataProvider):
"""These functions should never be called because derive_metadata will populate the
object data from existing metadata, by calling _derive_from_existing, and return
before calling them."""
"""These getters should never be called because metadata was derived a priori."""

obj: Any
obj: Inferrable

def get_spec(self) -> dict[str, Any]:
def get_spec(self) -> dict:
"""Derive data.spec from existing metadata."""
return self.meta_existing["spec"]
return self.metadata["spec"]

def get_bbox(self) -> dict[str, Any]:
def get_bbox(self) -> dict:
"""Derive data.bbox from existing metadata."""
return self.meta_existing["bbox"]
return self.metadata["bbox"]

def get_objectdata(self) -> DerivedObjectDescriptor:
"""Derive object data for existing metadata."""
return DerivedObjectDescriptor(
subtype=self.meta_existing["subtype"],
classname=self.meta_existing["class"],
layout=self.meta_existing["layout"],
subtype=self.metadata["subtype"],
classname=self.metadata["class"],
layout=self.metadata["layout"],
efolder=self.efolder,
fmt=self.meta_existing["format"],
fmt=self.fmt,
extension=self.extension,
spec=self.get_spec(),
bbox=self.get_bbox(),
table_index=None,
)

def derive_metadata(self) -> None:
"""Metadata has already been derived for this provider, and is already set from
instantiation, so override this method and do nothing."""
return
80 changes: 39 additions & 41 deletions src/fmu/dataio/providers/_objectdata_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Final, Literal, Optional, TypeVar
from typing import TYPE_CHECKING, Any, Dict, Final, Literal, Optional, TypeVar
from warnings import warn

from fmu.dataio import dataio, types
Expand All @@ -14,6 +14,10 @@
from fmu.dataio.datastructure._internal.internal import AllowedContent
from fmu.dataio.datastructure.meta import content

if TYPE_CHECKING:
from fmu.dataio.dataio import ExportData
from fmu.dataio.types import Inferrable

logger: Final = null_logger(__name__)

V = TypeVar("V")
Expand Down Expand Up @@ -126,21 +130,16 @@ class ObjectDataProvider(ABC):
# input fields
obj: types.Inferrable
dataio: dataio.ExportData
meta_existing: dict = field(default_factory=dict)

# result properties; the most important is metadata which IS the 'data' part in
# the resulting metadata. But other variables needed later are also given
# as instance properties in addition (for simplicity in other classes/functions)
bbox: dict = field(default_factory=dict)
metadata: dict = field(default_factory=dict)
name: str = field(default="")
classname: str = field(default="")
efolder: str = field(default="")
extension: str = field(default="")
fmt: str = field(default="")
layout: str = field(default="")
metadata: dict = field(default_factory=dict)
name: str = field(default="")
specs: dict = field(default_factory=dict)
subtype: str = field(default="")
time0: str | None = field(default=None)
time1: str | None = field(default=None)

Expand Down Expand Up @@ -255,35 +254,22 @@ def _derive_timedata(self) -> Optional[dict[str, str]]:
mode="json", exclude_none=True
)

def _derive_from_existing(self) -> None:
"""Derive from existing metadata."""

# do not change any items in 'data' block, as it may ruin e.g. stratigrapical
# setting (i.e. changing data.name is not allowed)
self.metadata = self.meta_existing["data"]
self.name = self.meta_existing["data"]["name"]

# derive the additional attributes needed later e.g. in Filedata provider:
relpath = Path(self.meta_existing["file"]["relative_path"])
if self.dataio.subfolder:
self.efolder = relpath.parent.parent.name
else:
self.efolder = relpath.parent.name
@abstractmethod
def get_spec(self) -> dict:
raise NotImplementedError

self.classname = self.meta_existing["class"]
self.extension = relpath.suffix
self.fmt = self.meta_existing["data"]["format"]
@abstractmethod
def get_bbox(self) -> dict:
raise NotImplementedError

self.time0, self.time1 = parse_timedata(self.meta_existing["data"])
@abstractmethod
def get_objectdata(self) -> DerivedObjectDescriptor:
raise NotImplementedError

def derive_metadata(self) -> None:
"""Main function here, will populate the metadata block for 'data'."""
logger.info("Derive all metadata for data object...")

if self.meta_existing:
self._derive_from_existing()
return

namedstratigraphy = self._derive_name_stratigraphy()
objres = self.get_objectdata()
if self.dataio.forcefolder and not self.dataio.forcefolder.startswith("/"):
Expand Down Expand Up @@ -341,14 +327,26 @@ def derive_metadata(self) -> None:
self.fmt = objres.fmt
logger.info("Derive all metadata for data object... DONE")

@abstractmethod
def get_spec(self) -> dict[str, Any]:
raise NotImplementedError

@abstractmethod
def get_bbox(self) -> dict[str, Any]:
raise NotImplementedError

@abstractmethod
def get_objectdata(self) -> DerivedObjectDescriptor:
raise NotImplementedError
@classmethod
def from_metadata_dict(
cls, obj: Inferrable, dataio: ExportData, meta_existing: dict
) -> ObjectDataProvider:
"""Instantiate from existing metadata."""

relpath = Path(meta_existing["file"]["relative_path"])
time0, time1 = parse_timedata(meta_existing["data"])

return cls(
obj=obj,
dataio=dataio,
metadata=meta_existing["data"],
name=meta_existing["data"]["name"],
classname=meta_existing["class"],
efolder=(
relpath.parent.parent.name if dataio.subfolder else relpath.parent.name
),
extension=relpath.suffix,
fmt=meta_existing["data"]["format"],
time0=time0,
time1=time1,
)
81 changes: 78 additions & 3 deletions tests/test_units/test_objectdataprovider_class.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
"""Test the _ObjectData class from the _objectdata.py module"""

import os

import pytest
from fmu.dataio import dataio
from fmu.dataio._definitions import ConfigurationError, ValidFormats
from fmu.dataio.providers._objectdata import objectdata_provider_factory
from fmu.dataio._metadata import MetaData
from fmu.dataio.providers._objectdata import (
ExistingDataProvider,
objectdata_provider_factory,
)
from fmu.dataio.providers._objectdata_xtgeo import RegularSurfaceDataProvider

from ..utils import inside_rms

# --------------------------------------------------------------------------------------
# RegularSurface
Expand Down Expand Up @@ -67,8 +77,10 @@ def test_objectdata_regularsurface_spec_bbox(regsurf, edataobj1):
def test_objectdata_regularsurface_derive_objectdata(regsurf, edataobj1):
"""Derive other properties."""

res = objectdata_provider_factory(regsurf, edataobj1).get_objectdata()
objdata = objectdata_provider_factory(regsurf, edataobj1)
assert isinstance(objdata, RegularSurfaceDataProvider)

res = objdata.get_objectdata()
assert res.subtype == "RegularSurface"
assert res.classname == "surface"
assert res.extension == ".gri"
Expand All @@ -81,5 +93,68 @@ def test_objectdata_regularsurface_derive_metadata(regsurf, edataobj1):
myobj.derive_metadata()
res = myobj.metadata
assert res["content"] == "depth"

assert res["alias"]


def test_objectdata_provider_factory_raises_on_unknown(edataobj1):
with pytest.raises(NotImplementedError, match="not currently supported"):
objectdata_provider_factory(object(), edataobj1)


def test_regsurf_preprocessed_observation(
fmurun_w_casemetadata, rmssetup, rmsglobalconfig, regsurf
):
"""Test generating pre-realization surfaces that comes to share/preprocessed.

Later, a fmu run will update this (merge metadata)
"""

@inside_rms
def _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf):
"""Run an export of a preprocessed surface inside RMS."""

os.chdir(rmssetup)
edata = dataio.ExportData(
config=rmsglobalconfig, # read from global config
fmu_context="preprocessed",
name="TopVolantis",
content="depth",
is_observation=True,
timedata=[[20240802, "moni"], [20200909, "base"]],
)
return edata, edata.export(regsurf)

def _run_case_fmu(fmurun_w_casemetadata, rmsglobalconfig, surfacepath):
"""Run FMU workflow, using the preprocessed data as case data.

When re-using metadata, the input object to dataio shall not be a XTGeo or
Pandas or ... instance, but just a file path (either as string or a pathlib.Path
object). This is because we want to avoid time and resources spent on double
reading e.g. a seismic cube, but rather trigger a file copy action instead.

But it requires that valid metadata for that file is found. The rule for
merging is currently defaulted to "preprocessed".
"""
os.chdir(fmurun_w_casemetadata)

casepath = fmurun_w_casemetadata.parent.parent
edata = dataio.ExportData(
config=rmsglobalconfig,
fmu_context="case",
content=None,
is_observation=True,
)
_ = edata.generate_metadata(
surfacepath,
casepath=casepath,
)
metaobj = MetaData(surfacepath, edata)
metaobj._populate_meta_objectdata()
assert isinstance(metaobj.objdata, ExistingDataProvider)
return metaobj

# run two stage process
edata, mysurf = _export_data_from_rms(rmssetup, rmsglobalconfig, regsurf)
metaobj = _run_case_fmu(fmurun_w_casemetadata, rmsglobalconfig, mysurf)
case_meta = metaobj.generate_export_metadata()
assert edata._metadata["data"] == case_meta["data"]
Loading