Log metrics #2

Merged
merged 10 commits on Nov 21, 2024
129 changes: 72 additions & 57 deletions mlflow/getml/__init__.py
@@ -1,18 +1,18 @@
import os
import logging

import os
import pathlib

from typing import Any, Literal, Union
from typing import Any, Union

import yaml

import mlflow
from mlflow import pyfunc
from mlflow.models import Model, ModelSignature, ModelInputExample
from mlflow.getml.autologging import autolog as _autolog
from mlflow.models import Model
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring
from mlflow.tracking.artifact_utils import _download_artifact_from_uri

from mlflow.utils.autologging_utils import autologging_integration
from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring
from mlflow.utils.environment import (
_CONDA_ENV_FILE_NAME,
_CONSTRAINTS_FILE_NAME,
@@ -34,14 +34,8 @@
_validate_and_copy_code_paths,
_validate_and_prepare_target_save_path,
)

from mlflow.utils.autologging_utils import (
autologging_integration
)
from mlflow.utils.requirements_utils import _get_pinned_requirement

from .autologging import autolog as _autolog

FLAVOR_NAME = "getml"

_logger = logging.getLogger(__name__)
@@ -67,31 +61,27 @@
The default Conda environment for MLflow Models produced by calls to
:func:`save_model()` and :func:`log_model()`.
"""
return _mlflow_conda_env(
additional_pip_deps=get_default_pip_requirements(include_cloudpickle)
)
return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements(include_cloudpickle))


def _ignore(pipeline_id: str, directory: str, files: list[str]):
    # shutil.copytree expects the ignore callable to return the names to *skip*.
    # Inside the "pipelines" directory, skip every pipeline except the one being saved.
    if "pipelines" in directory:
        return [f for f in files if f != pipeline_id]
    return []



def _copy_getml_engine_folders(getml_project_folder: pathlib.Path, pipeline_id: str, dst_path: str):
import shutil

dst_project_path = (pathlib.Path(dst_path) / "projects")

dst_project_path = pathlib.Path(dst_path) / "projects"

# copy the project's directory structure, keeping only what is really necessary
shutil.copytree(
src=str(getml_project_folder),
dst=dst_project_path,
ignore=lambda directory, files: _ignore(pipeline_id, directory, files)
ignore=lambda directory, files: _ignore(pipeline_id, directory, files),
)
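For reference, `shutil.copytree` calls the `ignore` callable once per visited directory with `(directory, names)` and skips the names it returns, which is why `_ignore` above must return only a list of names to drop. A minimal, self-contained sketch of that contract, using throwaway temp directories:

import pathlib
import shutil
import tempfile

src = pathlib.Path(tempfile.mkdtemp())
(src / "pipelines").mkdir()
(src / "pipelines" / "keep-me").touch()
(src / "pipelines" / "drop-me").touch()

dst = pathlib.Path(tempfile.mkdtemp()) / "copy"
# The callable receives (directory, names) and returns the names to skip.
shutil.copytree(src, dst, ignore=lambda d, names: [n for n in names if n == "drop-me"])

assert (dst / "pipelines" / "keep-me").exists()
assert not (dst / "pipelines" / "drop-me").exists()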





@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name="getml"))
def save_model(
@@ -115,7 +105,9 @@

current_user_home_dir = pathlib.Path.home()

getml_project_name = settings.get("project_name", getml.project.name) if settings else getml.project.name # type: ignore
getml_project_name = (
settings.get("project_name", getml.project.name) if settings else getml.project.name
) # type: ignore
if settings and (wd := settings.get("working_dir")):
if not pathlib.Path(wd).exists():
raise Exception(f"{wd} Working directory does not exists")
@@ -126,9 +118,7 @@
raise Exception("No default getML project directory")

assert getml_project_name
if not (
getml_project_folder := getml_working_dir / "projects" / getml_project_name
).exists():
if not (getml_project_folder := getml_working_dir / "projects" / getml_project_name).exists():
raise Exception(f"{getml_project_folder} does not exists")

if mlflow_model is None:
@@ -147,7 +137,7 @@
yaml.safe_dump(settings, stream=settings_file)

_copy_getml_engine_folders(getml_project_folder, getml_pipeline.id, path)
# copy files from project folder
# copy files from project folder
pyfunc.add_to_model(
mlflow_model,
loader_module="mlflow.getml",
@@ -216,7 +206,7 @@
"""Log an H2O model as an MLflow artifact for the current run.

Args:
h2o_model: H2O model to be saved.
getml_pipeline: getML pipeline to be saved.
artifact_path: Run-relative artifact path.
conda_env: {{ conda_env }}
code_paths: {{ code_paths }}
@@ -228,7 +218,7 @@
pip_requirements: {{ pip_requirements }}
extra_pip_requirements: {{ extra_pip_requirements }}
metadata: {{ metadata }}
kwargs: kwargs to pass to ``h2o.save_model`` method.
kwargs: kwargs to pass to ``getml.save_model`` method.

Returns:
A :py:class:`ModelInfo <mlflow.models.model.ModelInfo>` instance that contains the
@@ -248,47 +238,78 @@
**kwargs,
)
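A hedged usage sketch, not part of this diff: logging a fitted pipeline from an active run. The variable `pipe` is an assumption standing in for an already fitted getml.pipeline.Pipeline.

import mlflow

with mlflow.start_run():
    # `pipe` is assumed to be a fitted getml.pipeline.Pipeline
    model_info = mlflow.getml.log_model(
        getml_pipeline=pipe,
        artifact_path="model",
    )

print(model_info.model_uri)  # e.g. runs:/<run_id>/model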

class _GetMLModelWrapper:

class _GetMLModelWrapper:
def __init__(self, getml_pipeline):
self.getml_pipeline = getml_pipeline

def get_raw_model(self):
return self.getml_pipeline

def predict(self, data):
"""
{
"population": [],
"peripherals": {
"transaction": [],
...
}
}
"""
# TODO: validate incoming data
def _extract_roles_from_data_model():
...

def _validate():
...

self.getml_pipeline.check(data)
return self.getml_pipeline.predict(data)
import getml

self._validate_incoming_data(data)
roles = self._extract_roles_from_data_model()

population = getml.data.DataFrame.from_pandas(
data["population"], name="population", roles=roles["population"]
)

peripheral_frames = {}
for name, peripheral_df in data["peripheral"].items():
peripheral_frames[name] = getml.data.DataFrame.from_pandas(
peripheral_df, name=name, roles=roles["peripherals"][name]
)

container = getml.data.Container(population=population, peripheral=peripheral_frames)

return self.getml_pipeline.predict(container.full)

def _validate_incoming_data(self, data):
import pandas as pd

assert "population" in data
assert "peripheral" in data
assert isinstance(data["population"], pd.DataFrame)
assert isinstance(data["peripheral"], dict)

peripheral_names_in_data = []

for name, df in data["peripheral"].items():
assert isinstance(df, pd.DataFrame)
peripheral_names_in_data.append(name)

for peripheral_table in self.getml_pipeline.data_model.population.children:
if peripheral_table.name not in peripheral_names_in_data:
raise Exception(
f"Peripheral table '{peripheral_table.name}' is missing in the data"
)

def _extract_roles_from_data_model(self):
roles = {}
roles["peripherals"] = {}
roles["population"] = self.getml_pipeline.data_model.population.roles

for peripheral in self.getml_pipeline.data_model.population.children:
roles["peripherals"][peripheral.name] = peripheral.roles

return roles
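A hedged sketch of the payload `predict` expects from the wrapper above: one pandas population frame plus a dict of peripheral frames keyed by the table names in the pipeline's data model. All names and columns below are made up, and `getml_pipeline` is assumed to be a fitted pipeline.

import pandas as pd

population_df = pd.DataFrame({"id": [1, 2], "target": [0.0, 1.0]})
transactions_df = pd.DataFrame({"id": [1, 1, 2], "amount": [9.5, 3.2, 7.1]})

wrapper = _GetMLModelWrapper(getml_pipeline)  # assumed fitted pipeline
predictions = wrapper.predict(
    {
        "population": population_df,
        "peripheral": {"transaction": transactions_df},
    }
)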


def _load_model(path):
import getml
import shutil


with open(os.path.join(path, "getml.yaml")) as f:
getml_settings = yaml.safe_load(f.read())

getml_project_name = getml_settings["getml_project_name"]
getml_pipeline_id = getml_settings["pipeline_id"]
current_user_home_dir = pathlib.Path.home()
getml_project_path = current_user_home_dir / ".getML" / "projects" / getml_project_name
shutil.copytree(
src=os.path.join(path, "projects"),
dst=str(getml_project_path),
@@ -299,7 +320,6 @@
return getml.pipeline.load(getml_pipeline_id)
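A hedged sketch of consuming the logged model through the pyfunc flavor. The run ID is a placeholder, the table name is made up, and this assumes pyfunc's signature enforcement does not reject the dict-of-DataFrames payload that `_GetMLModelWrapper.predict` validates.

import mlflow
import pandas as pd

model = mlflow.pyfunc.load_model("runs:/<run_id>/model")  # placeholder URI
preds = model.predict(
    {
        "population": pd.DataFrame({"id": [1, 2]}),
        "peripheral": {"transaction": pd.DataFrame({"id": [1, 1], "amount": [9.5, 3.2]})},
    }
)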



def _load_pyfunc(path):
"""Load PyFunc implementation. Called by ``pyfunc.load_model``.

@@ -360,15 +380,10 @@
log_post_training_metrics=True,
):
return _autolog(
flavor_name = FLAVOR_NAME,
flavor_name=FLAVOR_NAME,
log_input_examples=log_input_examples,
log_model_signatures=log_model_signatures,
log_models=log_models,
log_datasets=log_datasets,
log_post_training_metrics=log_post_training_metrics,
)
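A hedged sketch of enabling the integration; this assumes the `_autolog` implementation patches pipeline fitting so that subsequent fits are captured in the active run.

import mlflow

mlflow.getml.autolog()

# From here on, fitting a getML pipeline is expected to be logged
# automatically, e.g.:
# with mlflow.start_run():
#     pipe.fit(container.train)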




