From c49aa569a29b8dc4b2be5b7f379e2f9c605f67a7 Mon Sep 17 00:00:00 2001
From: Ivar Grytten
Date: Wed, 4 Sep 2024 16:01:48 +0200
Subject: [PATCH] Refactored code for getting model from folder/github,
 supporting mlflow project file

---
 climate_health/api.py                     | 51 +---------
 climate_health/cli.py                     |  2 +-
 climate_health/external/external_model.py | 85 ++++++++++++++++++
 climate_health/external/mlflow.py         |  7 +-
 .../MLproject                             | 19 ++++
 .../config.yml                            |  6 ++
 .../future_data.csv                       | 11 ++
 .../mock_predictor_script.py              | 47 +++++++++
 .../naive_python_model.model              | Bin 0 -> 271 bytes
 .../predictions.csv                       |  3 +
 .../training_data.csv                     | 15 +++
 scripts/mlflow_test.py                    |  9 +-
 tests/external/test_external_models.py    | 19 +++-
 13 files changed, 220 insertions(+), 54 deletions(-)
 create mode 100644 external_models/naive_python_model_with_mlproject_file/MLproject
 create mode 100644 external_models/naive_python_model_with_mlproject_file/config.yml
 create mode 100644 external_models/naive_python_model_with_mlproject_file/future_data.csv
 create mode 100644 external_models/naive_python_model_with_mlproject_file/mock_predictor_script.py
 create mode 100644 external_models/naive_python_model_with_mlproject_file/naive_python_model.model
 create mode 100644 external_models/naive_python_model_with_mlproject_file/predictions.csv
 create mode 100644 external_models/naive_python_model_with_mlproject_file/training_data.csv

diff --git a/climate_health/api.py b/climate_health/api.py
index f23f26ff..4f23b182 100644
--- a/climate_health/api.py
+++ b/climate_health/api.py
@@ -1,30 +1,26 @@
 import logging
 import json
+
 from .assessment.forecast import forecast as do_forecast
-import os
-import shutil
 import zipfile
-from datetime import datetime
 from pathlib import Path
 from typing import Optional, List

 import numpy as np

 from .assessment.dataset_splitting import train_test_split_with_weather
-from .datatypes import HealthData, ClimateData, HealthPopulationData, SimpleClimateData, ClimateHealthData, FullData
+from .datatypes import HealthData, ClimateData, HealthPopulationData, SimpleClimateData, FullData
 from .dhis2_interface.json_parsing import predictions_to_datavalue, parse_disease_data, json_to_pandas, \
     parse_population_data
-from .external.external_model import get_model_from_yaml_file
+from .external.external_model import get_model_from_directory_or_github_url
 from .file_io.example_data_set import DataSetType, datasets
-# from .external.external_model import ExternalCommandLineModel, get_model_from_yaml_file
-from .geojson import geojson_to_shape, geojson_to_graph, NeighbourGraph
+from .geojson import geojson_to_graph, NeighbourGraph
 from .plotting.prediction_plot import plot_forecast_from_summaries
 from .predictor import get_model
 from .spatio_temporal_data.temporal_dataclass import DataSet
 import dataclasses

-from .time_period.date_util_wrapper import Week, delta_week, delta_month, Month
-import git
+from .time_period.date_util_wrapper import delta_month, Month
 from .transformations.covid_mask import mask_covid_data


@@ -131,43 +127,6 @@ def read_zip_folder(zip_file_path: str) -> PredictionData:
     # ...

-def get_model_from_directory_or_github_url(model_path, base_working_dir=Path('runs/')):
-    """
-    Gets the model and initializes a working directory with the code for the model
-    """
-    is_github = False
-    if isinstance(model_path, str) and model_path.startswith("https://github.com"):
-        dir_name = model_path.split("/")[-1].replace(".git", "")
-        model_name = dir_name
-        is_github = True
-    else:
-        model_name = Path(model_path).name
-
-    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    working_dir = base_working_dir / model_name / timestamp
-
-    if is_github:
-        working_dir.mkdir(parents=True)
-        git.Repo.clone_from(model_path, working_dir)
-    else:
-        # copy contents of model_path to working_dir
-        shutil.copytree(model_path, working_dir)
-
-    # assert that a config file exists
-    assert (working_dir / 'config.yml').exists(), f"config.yml file not found in {working_dir}"
-    return get_model_from_yaml_file(working_dir / 'config.yml', working_dir)
-
-
-def get_model_maybe_yaml(model_name):
-    if model_name.endswith(".yaml") or model_name.endswith(".yml"):
-        working_dir = Path(model_name).parent
-        model = get_model_from_yaml_file(model_name, working_dir)
-        return model, model.name
-    elif model_name.startswith("https://github.com"):
-        return get_model_from_directory_or_github_url(model_name), model_name
-    else:
-        return get_model(model_name), model_name
-


 def dhis_zip_flow(zip_file_path: str, out_json: Optional[str] = None, model_name=None, n_months=4,
                   docker_filename: Optional[str] = None) -> List[dict] | None:
diff --git a/climate_health/cli.py b/climate_health/cli.py
index f8eb0197..e90626cf 100644
--- a/climate_health/cli.py
+++ b/climate_health/cli.py
@@ -9,7 +9,7 @@
 import pandas as pd
 from cyclopts import App

-from climate_health.api import get_model_maybe_yaml, get_model_from_directory_or_github_url
+from climate_health.external.external_model import get_model_from_directory_or_github_url, get_model_maybe_yaml
 from climate_health.spatio_temporal_data.multi_country_dataset import MultiCountryDataSet
 from . import api
 from climate_health.dhis2_interface.ChapProgram import ChapPullPost
diff --git a/climate_health/external/external_model.py b/climate_health/external/external_model.py
index 2cf8197f..0e7165ac 100644
--- a/climate_health/external/external_model.py
+++ b/climate_health/external/external_model.py
@@ -1,13 +1,16 @@
 import logging
 import os.path
+import shutil
 import subprocess
 import sys
 import tempfile
+from datetime import datetime
 from hashlib import md5
 from pathlib import Path
 from typing import Protocol, Generic, TypeVar, Tuple, Optional

 import docker
+import git
 import numpy as np
 import pandas as pd
 import pandas.errors
@@ -17,6 +20,7 @@
 from climate_health._legacy_dataset import IsSpatioTemporalDataSet
 from climate_health.datatypes import ClimateHealthTimeSeries, ClimateData, HealthData, SummaryStatistics
 from climate_health.docker_helper_functions import create_docker_image, run_command_through_docker_container
+from climate_health.external.mlflow import ExternalMLflowModel
 from climate_health.geojson import NeighbourGraph
 from climate_health.runners.command_line_runner import CommandLineRunner
 from climate_health.runners.docker_runner import DockerImageRunner, DockerRunner
@@ -236,6 +240,36 @@ def prediction_summary(self, future_data: DataSet[FeatureType], n_samples=1000):
     def _provide_temp_file(self):
         return tempfile.NamedTemporaryFile(dir=self._working_dir, delete=False)

+    @classmethod
+    def from_mlproject_file(cls, mlproject_file):
+        working_dir = mlproject_file.parent
+        # read the MLproject yaml file into a dict
+        with open(mlproject_file, 'r') as file:
+            data = yaml.load(file, Loader=yaml.FullLoader)
+
+        name = data['name']
+        train_command = data['entry_points']['train']['command']
+        predict_command = data['entry_points']['predict']['command']
+        setup_command = None
+        data_type = data.get('data_type', None)
+        allowed_data_types = {'HealthData': HealthData}
+        data_type = allowed_data_types.get(data_type, None)
+
+        assert "docker_env" in data, "Only docker supported for now"
+        runner = DockerRunner(data['docker_env']['image'], working_dir)
+
+        model = cls(
+            name=name,
+            train_command=train_command,
+            predict_command=predict_command,
+            data_type=data_type,
+            setup_command=setup_command,
+            working_dir=working_dir,
+            adapters=data.get('adapters', None),
+            runner=runner,
+        )
+        return model


 # todo: remove this
@@ -327,3 +361,54 @@ def get_runner_from_yaml_file(yaml_file: str) -> Runner:

 def get_model_and_runner_from_yaml_file(yaml_file: str) -> Tuple[ExternalCommandLineModel, Runner]:
     return ExternalCommandLineModel.from_yaml_file(yaml_file), get_runner_from_yaml_file(yaml_file)
+
+
+
+def get_model_from_directory_or_github_url(model_path, base_working_dir=Path('runs/')):
+    """
+    Gets the model and initializes a working directory with the code for the model.
+    model_path can be a local directory or a GitHub URL.
+    """
+    is_github = False
+    if isinstance(model_path, str) and model_path.startswith("https://github.com"):
+        dir_name = model_path.split("/")[-1].replace(".git", "")
+        model_name = dir_name
+        is_github = True
+    else:
+        model_name = Path(model_path).name
+
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    working_dir = base_working_dir / model_name / timestamp
+
+    if is_github:
+        working_dir.mkdir(parents=True)
+        git.Repo.clone_from(model_path, working_dir)
+    else:
+        # copy contents of model_path to working_dir
+        shutil.copytree(model_path, working_dir)
+
+    # use config.yml if present, otherwise fall back to an MLproject file
+    if (working_dir / 'config.yml').exists():
+        return get_model_from_yaml_file(working_dir / 'config.yml', working_dir)
+    else:
+        assert (working_dir / 'MLproject').exists(), f"Neither config.yml nor MLproject file found in {working_dir}"
+        return get_model_from_mlproject_file(working_dir / 'MLproject')
+
+
+def get_model_from_mlproject_file(mlproject_file):
+    """Parses the MLproject file and returns the model.
+    Does not use MLflow's own project setup if a docker environment is specified.
+    """

+    with open(mlproject_file, 'r') as file:
+        config = yaml.load(file, Loader=yaml.FullLoader)
+    if "docker_env" in config:
+        return ExternalCommandLineModel.from_mlproject_file(mlproject_file)
+    else:
+        name = config["name"]
+        return ExternalMLflowModel(mlproject_file, name=name, working_dir=Path(mlproject_file).parent)
+
+
+def get_model_maybe_yaml(model_name):
+    model = get_model_from_directory_or_github_url(model_name)
+    return model, model.name
diff --git a/climate_health/external/mlflow.py b/climate_health/external/mlflow.py
index 6ba0fdbb..62bee4fe 100644
--- a/climate_health/external/mlflow.py
+++ b/climate_health/external/mlflow.py
@@ -19,7 +19,8 @@ class ExternalMLflowModel(Generic[FeatureType]):
     Wrapper around an mlflow model with commands for training and predicting
     """

-    def __init__(self, model_path: str, adapters=None, working_dir="./"):
+    def __init__(self, model_path: str, name: str=None, adapters=None, working_dir="./"):
+        self._name = name
         self.model_path = model_path
         self._adapters = adapters
         self._working_dir = working_dir
@@ -27,6 +28,10 @@ def __init__(self, model_path: str, adapters=None, working_dir="./"):
         self._model_file_name = Path(model_path).name + ".model"
         self.is_lagged = True


+    @property
+    def name(self):
+        return self._name
+
     def train(self, train_data: DataSet, extra_args=None):
         if extra_args is None:
diff --git a/external_models/naive_python_model_with_mlproject_file/MLproject b/external_models/naive_python_model_with_mlproject_file/MLproject
new file mode 100644
index 00000000..0505db9b
--- /dev/null
+++ b/external_models/naive_python_model_with_mlproject_file/MLproject
@@ -0,0 +1,19 @@
+name: naive_python
+
+docker_env:
+  image: python:3.10
+
+#python_env: python_env.yaml
+
+entry_points:
+  train:
+    parameters:
+      train_data: path
+      model_output_file: path
+    command: "python train.py {train_data} {model_output_file}"
+  predict:
+    parameters:
+      future_data: path
+      model: path
+      out_file: path
+    command: "python predict.py {future_data} {model} {out_file}"
diff --git a/external_models/naive_python_model_with_mlproject_file/config.yml b/external_models/naive_python_model_with_mlproject_file/config.yml
new file mode 100644
index 00000000..86919be5
--- /dev/null
+++ b/external_models/naive_python_model_with_mlproject_file/config.yml
@@ -0,0 +1,6 @@
+
+
+name: naive_python_model
+train_command: "python mock_predictor_script.py train {train_data} {model}"
+predict_command: "python mock_predictor_script.py predict {future_data} {model} {out_file}"
+data_type: HealthData
diff --git a/external_models/naive_python_model_with_mlproject_file/future_data.csv b/external_models/naive_python_model_with_mlproject_file/future_data.csv
new file mode 100644
index 00000000..d5a17e98
--- /dev/null
+++ b/external_models/naive_python_model_with_mlproject_file/future_data.csv
@@ -0,0 +1,11 @@
+,time_period,rainfall,mean_temperature,max_temperature,location,disease_cases
+0,2012-08,20,1,1,oslo,
+1,2012-09,20,1,1,oslo,
+2,2012-10,20,1,1,oslo,
+3,2012-11,20,1,1,oslo,
+4,2012-12,20,1,1,oslo,
+0,2012-08,1,100,1,bergen,
+1,2012-09,1,100,1,bergen,
+2,2012-10,1,100,1,bergen,
+3,2012-11,1,100,1,bergen,
+4,2012-12,1,100,1,bergen,
diff --git a/external_models/naive_python_model_with_mlproject_file/mock_predictor_script.py b/external_models/naive_python_model_with_mlproject_file/mock_predictor_script.py
new file mode 100644
index 00000000..eddac689
--- /dev/null
+++ b/external_models/naive_python_model_with_mlproject_file/mock_predictor_script.py
@@ -0,0 +1,47 @@
+import pickle
+
+import climate_health
+from climate_health.datatypes import ClimateData, ClimateHealthTimeSeries
+from climate_health.predictor.naive_predictor import NaivePredictor, MultiRegionNaivePredictor
+import typer
+
+from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
+
+app = typer.Typer()
+
+
+@app.command()
+def train(train_data_set: str, model_output_file: str):
+    predictor = MultiRegionNaivePredictor()
+    train_data = DataSet.from_csv(train_data_set, ClimateHealthTimeSeries)
+    predictor.train(train_data)
+
+    # pickle predictor
+    with open(model_output_file, 'wb') as f:
+        pickle.dump(predictor, f)
+
+
+@app.command()
+def predict(future_climate_data_set: str, model_file: str, output_file: str):
+    with open(model_file, 'rb') as f:
+        predictor = pickle.load(f)
+
+    future_climate_data = DataSet.from_csv(future_climate_data_set, ClimateData)
+    predictions = predictor.predict(future_climate_data)
+    print(predictions)
+    predictions.to_csv(output_file)
+
+
+@app.command()
+def predict_values(train_data_set: str, future_climate_data_set: str, output_file: str):
+    predictor = MultiRegionNaivePredictor()
+    train_data = DataSet.from_csv(train_data_set, ClimateHealthTimeSeries)
+    future_climate_data = DataSet.from_csv(future_climate_data_set, ClimateData)
+    predictor.train(train_data)
+    predictions = predictor.predict(future_climate_data)
+    print(predictions)
+    predictions.to_csv(output_file)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/external_models/naive_python_model_with_mlproject_file/naive_python_model.model b/external_models/naive_python_model_with_mlproject_file/naive_python_model.model
new file mode 100644
index 0000000000000000000000000000000000000000..2d10ce555b90b2e957683d9dcbcdc0e40bb10d35
GIT binary patch
literal 271
zcmX|+&q@O^5XM`(mV!|5;K4UoFFASWL1@97#UOowFq@9Mfo!rQ*}@(Kz4!-mb9_(V
zK(mqh9cGyC`|-g%7hlD6k`+s-RSN+{gP=phI){5DgLTY+I-vN!DPG?9I;b7i${Id~
z4@Z_h>FR}^sEkVy94MnqEqt&Y@#Me(9n>f!c=VZ?SLkXq6faEQb_0{vA#2BWa1I8F
zb1y-IOTFHCWP0#_VqG5`Po

literal 0
HcmV?d00001

diff --git a/external_models/naive_python_model_with_mlproject_file/predictions.csv b/external_models/naive_python_model_with_mlproject_file/predictions.csv
new file mode 100644
index 00000000..b670b122
--- /dev/null
+++ b/external_models/naive_python_model_with_mlproject_file/predictions.csv
@@ -0,0 +1,3 @@
+,time_period,disease_cases,location
+0,2012-08,1.0,bergen
+0,2012-08,20.0,oslo
diff --git a/external_models/naive_python_model_with_mlproject_file/training_data.csv b/external_models/naive_python_model_with_mlproject_file/training_data.csv
new file mode 100644
index 00000000..19b825d6
--- /dev/null
+++ b/external_models/naive_python_model_with_mlproject_file/training_data.csv
@@ -0,0 +1,15 @@
+,time_period,rainfall,mean_temperature,disease_cases,location
+0,2012-01,1,1,20,oslo
+1,2012-02,1,1,20,oslo
+2,2012-03,1,1,20,oslo
+3,2012-04,1,1,20,oslo
+4,2012-05,1,1,20,oslo
+5,2012-06,1,1,20,oslo
+6,2012-07,1,1,20,oslo
+0,2012-01,100,1,1,bergen
+1,2012-02,100,1,1,bergen
+2,2012-03,100,1,1,bergen
+3,2012-04,100,1,1,bergen
+4,2012-05,100,1,1,bergen
+5,2012-06,100,1,1,bergen
+6,2012-07,100,1,1,bergen
diff --git a/scripts/mlflow_test.py b/scripts/mlflow_test.py
index 5443cc3a..3400db18 100644
--- a/scripts/mlflow_test.py
+++ b/scripts/mlflow_test.py
@@ -1,16 +1,19 @@
+import sys
+
 from climate_health.data.datasets import ISIMIP_dengue_harmonized
 from climate_health.external.external_model import get_model_from_yaml_file
 from climate_health.assessment.prediction_evaluator import evaluate_model
 from climate_health.external.mlflow import ExternalMLflowModel

+"""
 model_name = 'config.yml'
 #working_dir = '../external_models/ewars_Plus/'
 working_dir = '../../chap_auto_ewars/'

 #model = get_model_from_yaml_file(working_dir + model_name, working_dir)
-model = ExternalMLflowModel(working_dir, working_dir="./")
-#model = ExternalMLflowModel("https://github.com/sandvelab/chap_auto_ewars", working_dir=working_dir)
+#model = ExternalMLflowModel(working_dir, working_dir="./")
+model = ExternalMLflowModel("https://github.com/sandvelab/chap_auto_ewars", working_dir=working_dir)


 dataset = ISIMIP_dengue_harmonized
 for country, data in dataset.items():
@@ -26,4 +29,4 @@
         print(e)
         continue

-
+"""
diff --git a/tests/external/test_external_models.py b/tests/external/test_external_models.py
index bc4525a3..85e3a3f8 100644
--- a/tests/external/test_external_models.py
+++ b/tests/external/test_external_models.py
@@ -4,15 +4,16 @@
 import pandas as pd
 import pytest
 import yaml

-from climate_health.api import get_model_from_directory_or_github_url
 from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
 from climate_health.datatypes import ClimateHealthTimeSeries,FullData

 logging.basicConfig(level=logging.INFO)
-from climate_health.external.external_model import get_model_from_yaml_file, run_command
+from climate_health.external.external_model import (get_model_from_yaml_file, run_command,
+                                                    ExternalCommandLineModel,
+                                                    get_model_from_directory_or_github_url)
 from ..data_fixtures import train_data, train_data_pop, future_climate_data
-from climate_health.util import conda_available
+from climate_health.util import conda_available, docker_available


 @pytest.mark.skipif(not conda_available(), reason='requires conda')
@@ -33,6 +34,18 @@ def test_python_model_from_folder(models_path, train_data, future_climate_data):
     assert results is not None

+@pytest.mark.skipif(not docker_available(), reason='Requires docker')
+def test_python_model_from_folder_with_mlproject_file(models_path):
+    path = models_path / 'naive_python_model_with_mlproject_file'
+    model = ExternalCommandLineModel.from_mlproject_file(path / 'MLproject')
+
+
+def test_model_from_string_acceptance(models_path):
+    model = get_model_from_directory_or_github_url(models_path / 'naive_python_model_with_mlproject_file')
+    model = get_model_from_directory_or_github_url(models_path / 'naive_python_model')
+    model = get_model_from_directory_or_github_url("https://github.com/knutdrand/external_rmodel_example.git")
+
+
 def get_dataset_from_yaml(yaml_path: Path, datatype=ClimateHealthTimeSeries):
     specs = yaml.load(yaml_path.read_text(), Loader=yaml.FullLoader)
    if 'demo_data' in specs:
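
Usage note (not part of the patch): a minimal sketch of how the refactored helpers are
meant to be called, based only on the functions and test fixtures added above. The model
directory and repository URL below are the ones used in the tests; any other path or URL
is the caller's own assumption.

    from climate_health.external.external_model import (
        get_model_from_directory_or_github_url,
        get_model_maybe_yaml,
    )

    # A local model directory containing either a config.yml or an MLproject file.
    # Its contents are copied into runs/<model_name>/<timestamp>/ before loading.
    model = get_model_from_directory_or_github_url(
        'external_models/naive_python_model_with_mlproject_file')

    # A GitHub URL is cloned into the same kind of working directory instead of copied.
    model = get_model_from_directory_or_github_url(
        'https://github.com/knutdrand/external_rmodel_example.git')

    # get_model_maybe_yaml now delegates to the same resolution logic and also returns the name.
    model, name = get_model_maybe_yaml(
        'https://github.com/knutdrand/external_rmodel_example.git')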