Log metrics #2

Merged
merged 10 commits on Nov 21, 2024
129 changes: 72 additions & 57 deletions mlflow/getml/__init__.py
@@ -1,18 +1,18 @@
import os
import logging

import os
import pathlib

from typing import Any, Literal, Union
from typing import Any, Union

import yaml

import mlflow
from mlflow import pyfunc
from mlflow.models import Model, ModelSignature, ModelInputExample
from mlflow.getml.autologging import autolog as _autolog
from mlflow.models import Model
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring
from mlflow.tracking.artifact_utils import _download_artifact_from_uri

from mlflow.utils.autologging_utils import autologging_integration
from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring
from mlflow.utils.environment import (
_CONDA_ENV_FILE_NAME,
_CONSTRAINTS_FILE_NAME,
@@ -34,14 +34,8 @@
_validate_and_copy_code_paths,
_validate_and_prepare_target_save_path,
)

from mlflow.utils.autologging_utils import (
autologging_integration
)
from mlflow.utils.requirements_utils import _get_pinned_requirement

from .autologging import autolog as _autolog

FLAVOR_NAME = "getml"

_logger = logging.getLogger(__name__)
@@ -67,31 +61,27 @@
The default Conda environment for MLflow Models produced by calls to
:func:`save_model()` and :func:`log_model()`.
"""
return _mlflow_conda_env(
additional_pip_deps=get_default_pip_requirements(include_cloudpickle)
)
return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements(include_cloudpickle))


def _ignore(pipeline_id: str, directory: str, files: list[str]):
    # shutil.copytree expects the ignore callable to return the names to *skip*.
    # Inside the "pipelines" directory, skip every pipeline except the one being saved.
    if "pipelines" in directory:
        return [f for f in files if f != pipeline_id]
    return []



def _copy_getml_engine_folders(getml_project_folder: pathlib.Path, pipeline_id: str, dst_path: str):
import shutil

dst_project_path = (pathlib.Path(dst_path) / "projects")

dst_project_path = pathlib.Path(dst_path) / "projects"

# copy the project's directory structure, keeping only what is really necessary
shutil.copytree(
src=str(getml_project_folder),
dst=dst_project_path,
ignore=lambda directory, files: _ignore(pipeline_id, directory, files)
ignore=lambda directory, files: _ignore(pipeline_id, directory, files),
)
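For reference, `shutil.copytree` calls the `ignore` callable once per visited directory with `(directory, names)` and skips the names it returns, which is why `_ignore` above must return only a list of names to drop. A minimal, self-contained sketch of that contract, using throwaway temp directories:

import pathlib
import shutil
import tempfile

src = pathlib.Path(tempfile.mkdtemp())
(src / "pipelines").mkdir()
(src / "pipelines" / "keep-me").touch()
(src / "pipelines" / "drop-me").touch()

dst = pathlib.Path(tempfile.mkdtemp()) / "copy"
# The callable receives (directory, names) and returns the names to skip.
shutil.copytree(src, dst, ignore=lambda d, names: [n for n in names if n == "drop-me"])

assert (dst / "pipelines" / "keep-me").exists()
assert not (dst / "pipelines" / "drop-me").exists()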





@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name="getml"))
def save_model(
@@ -115,7 +105,9 @@

current_user_home_dir = pathlib.Path.home()

getml_project_name = settings.get("project_name", getml.project.name) if settings else getml.project.name # type: ignore
getml_project_name = (
settings.get("project_name", getml.project.name) if settings else getml.project.name
) # type: ignore
if settings and (wd := settings.get("working_dir")):
if not pathlib.Path(wd).exists():
raise Exception(f"{wd} Working directory does not exists")
@@ -126,9 +118,7 @@
raise Exception("No default getML project directory")

assert getml_project_name
if not (
getml_project_folder := getml_working_dir / "projects" / getml_project_name
).exists():
if not (getml_project_folder := getml_working_dir / "projects" / getml_project_name).exists():
raise Exception(f"{getml_project_folder} does not exists")

if mlflow_model is None:
@@ -147,7 +137,7 @@
yaml.safe_dump(settings, stream=settings_file)

_copy_getml_engine_folders(getml_project_folder, getml_pipeline.id, path)
# copy files from project folder
# copy files from project folder
pyfunc.add_to_model(
mlflow_model,
loader_module="mlflow.getml",
@@ -216,7 +206,7 @@
"""Log an H2O model as an MLflow artifact for the current run.

Args:
h2o_model: H2O model to be saved.
getml_pipeline: getML pipeline to be saved.
artifact_path: Run-relative artifact path.
conda_env: {{ conda_env }}
code_paths: {{ code_paths }}
@@ -228,7 +218,7 @@
pip_requirements: {{ pip_requirements }}
extra_pip_requirements: {{ extra_pip_requirements }}
metadata: {{ metadata }}
kwargs: kwargs to pass to ``h2o.save_model`` method.
kwargs: kwargs to pass to ``getml.save_model`` method.

Returns:
A :py:class:`ModelInfo <mlflow.models.model.ModelInfo>` instance that contains the
@@ -248,47 +238,78 @@
**kwargs,
)
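A hedged usage sketch, not part of this diff: logging a fitted pipeline from an active run. The variable `pipe` is an assumption standing in for an already fitted getml.pipeline.Pipeline.

import mlflow

with mlflow.start_run():
    # `pipe` is assumed to be a fitted getml.pipeline.Pipeline
    model_info = mlflow.getml.log_model(
        getml_pipeline=pipe,
        artifact_path="model",
    )

print(model_info.model_uri)  # e.g. runs:/<run_id>/model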

class _GetMLModelWrapper:

class _GetMLModelWrapper:
def __init__(self, getml_pipeline):
self.getml_pipeline = getml_pipeline

def get_raw_model(self):
return self.getml_pipeline

def predict(self, data):
"""
{
"population": [],
"peripherals": {
"transaction": [],
...
}
}
"""
# TODO: validate incoming data
def _extract_roles_from_data_model():
...

def _validate():
...

self.getml_pipeline.check(data)
return self.getml_pipeline.predict(data)
import getml

self._validate_incoming_data(data)
roles = self._extract_roles_from_data_model()

population = getml.data.DataFrame.from_pandas(
data["population"], name="population", roles=roles["population"]
)

peripheral_frames = {}
for name, peripheral_df in data["peripheral"].items():
peripheral_frames[name] = getml.data.DataFrame.from_pandas(
peripheral_df, name=name, roles=roles["peripherals"][name]
)

container = getml.data.Container(population=population, peripheral=peripheral_frames)

return self.getml_pipeline.predict(container.full)

def _validate_incoming_data(self, data):
import pandas as pd

assert "population" in data
assert "peripheral" in data
assert isinstance(data["population"], pd.DataFrame)
assert isinstance(data["peripheral"], dict)

peripheral_names_in_data = []

for name, df in data["peripheral"].items():
assert isinstance(df, pd.DataFrame)
peripheral_names_in_data.append(name)

for peripheral_table in self.getml_pipeline.data_model.population.children:
if peripheral_table.name not in peripheral_names_in_data:
raise Exception(
f"Peripheral table '{peripheral_table.name}' is missing in the data"
)

def _extract_roles_from_data_model(self):
roles = {}
roles["peripherals"] = {}
roles["population"] = self.getml_pipeline.data_model.population.roles

for peripheral in self.getml_pipeline.data_model.population.children:
roles["peripherals"][peripheral.name] = peripheral.roles

return roles
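A hedged sketch of the payload `predict` expects from the wrapper above: one pandas population frame plus a dict of peripheral frames keyed by the table names in the pipeline's data model. All names and columns below are made up, and `getml_pipeline` is assumed to be a fitted pipeline.

import pandas as pd

population_df = pd.DataFrame({"id": [1, 2], "target": [0.0, 1.0]})
transactions_df = pd.DataFrame({"id": [1, 1, 2], "amount": [9.5, 3.2, 7.1]})

wrapper = _GetMLModelWrapper(getml_pipeline)  # assumed fitted pipeline
predictions = wrapper.predict(
    {
        "population": population_df,
        "peripheral": {"transaction": transactions_df},
    }
)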


def _load_model(path):
import getml
import shutil


with open(os.path.join(path, "getml.yaml")) as f:
getml_settings = yaml.safe_load(f.read())

getml_project_name = getml_settings["getml_project_name"]
getml_pipeline_id = getml_settings["pipeline_id"]
current_user_home_dir = pathlib.Path.home()
getml_project_path = current_user_home_dir / ".getML" / "projects" / getml_project_name
shutil.copytree(
src=os.path.join(path, "projects"),
dst=str(getml_project_path),
@@ -299,7 +320,6 @@
return getml.pipeline.load(getml_pipeline_id)
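A hedged sketch of consuming the logged model through the pyfunc flavor. The run ID is a placeholder, the table name is made up, and this assumes pyfunc's signature enforcement does not reject the dict-of-DataFrames payload that `_GetMLModelWrapper.predict` validates.

import mlflow
import pandas as pd

model = mlflow.pyfunc.load_model("runs:/<run_id>/model")  # placeholder URI
preds = model.predict(
    {
        "population": pd.DataFrame({"id": [1, 2]}),
        "peripheral": {"transaction": pd.DataFrame({"id": [1, 1], "amount": [9.5, 3.2]})},
    }
)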



def _load_pyfunc(path):
"""Load PyFunc implementation. Called by ``pyfunc.load_model``.

@@ -360,15 +380,10 @@
log_post_training_metrics=True,
):
return _autolog(
flavor_name = FLAVOR_NAME,
flavor_name=FLAVOR_NAME,
log_input_examples=log_input_examples,
log_model_signatures=log_model_signatures,
log_models=log_models,
log_datasets=log_datasets,
log_post_training_metrics=log_post_training_metrics,
)
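A hedged sketch of enabling the integration; this assumes the `_autolog` implementation patches pipeline fitting so that subsequent fits are captured in the active run.

import mlflow

mlflow.getml.autolog()

# From here on, fitting a getML pipeline is expected to be logged
# automatically, e.g.:
# with mlflow.start_run():
#     pipe.fit(container.train)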




