Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
knutdrand committed Sep 4, 2024
2 parents 0d771b7 + 27d6717 commit eea872f
Show file tree
Hide file tree
Showing 13 changed files with 230 additions and 54 deletions.
51 changes: 5 additions & 46 deletions climate_health/api.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,26 @@
import logging
import json

from .assessment.forecast import forecast as do_forecast
import os
import shutil
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Optional, List

import numpy as np

from .assessment.dataset_splitting import train_test_split_with_weather
from .datatypes import HealthData, ClimateData, HealthPopulationData, SimpleClimateData, ClimateHealthData, FullData
from .datatypes import HealthData, ClimateData, HealthPopulationData, SimpleClimateData, FullData
from .dhis2_interface.json_parsing import predictions_to_datavalue, parse_disease_data, json_to_pandas, \
parse_population_data
from .external.external_model import get_model_from_yaml_file
from .external.external_model import get_model_from_directory_or_github_url
from .file_io.example_data_set import DataSetType, datasets
# from .external.external_model import ExternalCommandLineModel, get_model_from_yaml_file
from .geojson import geojson_to_shape, geojson_to_graph, NeighbourGraph
from .geojson import geojson_to_graph, NeighbourGraph
from .plotting.prediction_plot import plot_forecast_from_summaries
from .predictor import get_model
from .spatio_temporal_data.temporal_dataclass import DataSet
import dataclasses

from .time_period.date_util_wrapper import Week, delta_week, delta_month, Month
import git
from .time_period.date_util_wrapper import delta_month, Month

from .transformations.covid_mask import mask_covid_data

Expand Down Expand Up @@ -131,43 +127,6 @@ def read_zip_folder(zip_file_path: str) -> PredictionData:

# ...

def get_model_from_directory_or_github_url(model_path, base_working_dir=Path('runs/')):
"""
Gets the model and initializes a working directory with the code for the model
"""
is_github = False
if isinstance(model_path, str) and model_path.startswith("https://github.com"):
dir_name = model_path.split("/")[-1].replace(".git", "")
model_name = dir_name
is_github = True
else:
model_name = Path(model_path).name

timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
working_dir = base_working_dir / model_name / timestamp

if is_github:
working_dir.mkdir(parents=True)
git.Repo.clone_from(model_path, working_dir)
else:
# copy contents of model_path to working_dir
shutil.copytree(model_path, working_dir)

# assert that a config file exists
assert (working_dir / 'config.yml').exists(), f"config.yml file not found in {working_dir}"
return get_model_from_yaml_file(working_dir / 'config.yml', working_dir)


def get_model_maybe_yaml(model_name):
if model_name.endswith(".yaml") or model_name.endswith(".yml"):
working_dir = Path(model_name).parent
model = get_model_from_yaml_file(model_name, working_dir)
return model, model.name
elif model_name.startswith("https://github.com"):
return get_model_from_directory_or_github_url(model_name), model_name
else:
return get_model(model_name), model_name


def dhis_zip_flow(zip_file_path: str, out_json: Optional[str] = None, model_name=None, n_months=4,
docker_filename: Optional[str] = None) -> List[dict] | None:
Expand Down
1 change: 0 additions & 1 deletion climate_health/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import numpy as np
import pandas as pd
from cyclopts import App
from climate_health.api import get_model_maybe_yaml, get_model_from_directory_or_github_url
from climate_health.spatio_temporal_data.multi_country_dataset import MultiCountryDataSet
from . import api
from climate_health.dhis2_interface.ChapProgram import ChapPullPost
Expand Down
96 changes: 96 additions & 0 deletions climate_health/external/external_model.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import logging
import os.path
import shutil
import subprocess
import sys
import tempfile
from datetime import datetime
from hashlib import md5
from pathlib import Path
from typing import Protocol, Generic, TypeVar, Tuple, Optional

import docker
import git
import numpy as np
import pandas as pd
import pandas.errors
Expand All @@ -17,6 +20,7 @@
from climate_health._legacy_dataset import IsSpatioTemporalDataSet
from climate_health.datatypes import ClimateHealthTimeSeries, ClimateData, HealthData, SummaryStatistics
from climate_health.docker_helper_functions import create_docker_image, run_command_through_docker_container
from climate_health.external.mlflow import ExternalMLflowModel
from climate_health.geojson import NeighbourGraph
from climate_health.runners.command_line_runner import CommandLineRunner
from climate_health.runners.docker_runner import DockerImageRunner, DockerRunner
Expand Down Expand Up @@ -236,6 +240,36 @@ def prediction_summary(self, future_data: DataSet[FeatureType], n_samples=1000):
def _provide_temp_file(self):
return tempfile.NamedTemporaryFile(dir=self._working_dir, delete=False)

@classmethod
def from_mlproject_file(cls, mlproject_file):
working_dir = mlproject_file.parent
# read yaml file into a dict
with open(mlproject_file, 'r') as file:
data = yaml.load(file, Loader=yaml.FullLoader)

name = data['name']
train_command = data['entry_points']['train']
predict_command = data['entry_points']['predict']
setup_command = None
data_type = data.get('data_type', None)
allowed_data_types = {'HealthData': HealthData}
data_type = allowed_data_types.get(data_type, None)

assert "docker_env" in data, "Only docker supported for now"
runner = DockerRunner(data['docker_env']['image'], working_dir)

model = cls(
name=name,
train_command=train_command,
predict_command=predict_command,
data_type=data_type,
setup_command=setup_command,
working_dir=working_dir,
# working_dir=Path(yaml_file).parent,
adapters=data.get('adapters', None),
runner=runner,
)
return model


# todo: remove this
Expand Down Expand Up @@ -327,3 +361,65 @@ def get_runner_from_yaml_file(yaml_file: str) -> Runner:

def get_model_and_runner_from_yaml_file(yaml_file: str) -> Tuple[ExternalCommandLineModel, Runner]:
return ExternalCommandLineModel.from_yaml_file(yaml_file), get_runner_from_yaml_file(yaml_file)



def get_model_from_directory_or_github_url(model_path, base_working_dir=Path('runs/')):
"""
Gets the model and initializes a working directory with the code for the model.
model_path can be a local directory or github url
"""
is_github = False
if isinstance(model_path, str) and model_path.startswith("https://github.com"):
dir_name = model_path.split("/")[-1].replace(".git", "")
model_name = dir_name
is_github = True
else:
model_name = Path(model_path).name

timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
working_dir = base_working_dir / model_name / timestamp

if is_github:
working_dir.mkdir(parents=True)
git.Repo.clone_from(model_path, working_dir)
else:
# copy contents of model_path to working_dir
shutil.copytree(model_path, working_dir)

# assert that a config file exists
assert (working_dir / 'config.yml').exists(), f"config.yml file not found in {working_dir}"
if (working_dir / 'config.yml').exists():
return get_model_from_yaml_file(working_dir / 'config.yml', working_dir)
else:
assert (working_dir / 'MLproject').exists(), f"MLproject file not found in {working_dir}"
return get_model_from_mlproject_file(working_dir / 'MLproject')


def get_model_from_mlproject_file(mlproject_file):
""" parses file and returns the model
Will not use MLflows project setup if docker is specified
"""

with open(mlproject_file, 'r') as file:
config = yaml.load(mlproject_file, Loader=yaml.FullLoader)
if "docker_env" in config:
return ExternalCommandLineModel.from_mlproject_file(mlproject_file)
else:
name = config["name"]
return ExternalMLflowModel(mlproject_file, name=name, working_dir=Path(mlproject_file).parent)


def get_model_maybe_yaml(model_name):
model = get_model_from_directory_or_github_url(model_name)
return model, model.name

from climate_health.predictor import get_model
if model_name.endswith(".yaml") or model_name.endswith(".yml"):
working_dir = Path(model_name).parent
model = get_model_from_yaml_file(model_name, working_dir)
return model, model.name
elif model_name.startswith("https://github.com"):
return get_model_from_directory_or_github_url(model_name), model_name
else:
return get_model(model_name), model_name
6 changes: 5 additions & 1 deletion climate_health/external/mlflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@ class ExternalMLflowModel(Generic[FeatureType]):
Wrapper around an mlflow model with commands for training and predicting
"""

def __init__(self, model_path: str, adapters=None, working_dir="./"):
def __init__(self, model_path: str, name: str=None, adapters=None, working_dir="./"):
self.model_path = model_path
self._adapters = adapters
self._working_dir = working_dir
self._location_mapping = None
self._model_file_name = Path(model_path).name + ".model"
self.is_lagged = True

@property
def name(self):
return self._name

def train(self, train_data: DataSet, extra_args=None):

if extra_args is None:
Expand Down
19 changes: 19 additions & 0 deletions external_models/naive_python_model_with_mlproject_file/MLproject
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
name: naive_python

docker_env:
image: python:3.10

#python_env: python_env.yaml

entry_points:
train:
parameters:
train_data: path
model_output_file: path
command: "python train.py {train_data} {model_output_file}"
predict:
parameters:
future_data: path
model: path
out_file: path
command: "python predict.py {future_data} {model} {out_file}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@


name: naive_python_model
train_command: "python mock_predictor_script.py train {train_data} {model}"
predict_command: "python mock_predictor_script.py predict {future_data} {model} {out_file}"
data_type: HealthData
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
,time_period,rainfall,mean_temperature,max_temperature,location,disease_cases
0,2012-08,20,1,1,oslo,
1,2012-09,20,1,1,oslo,
2,2012-10,20,1,1,oslo,
3,2012-11,20,1,1,oslo,
4,2012-12,20,1,1,oslo,
0,2012-08,1,100,1,bergen,
1,2012-09,1,100,1,bergen,
2,2012-10,1,100,1,bergen,
3,2012-11,1,100,1,bergen,
4,2012-12,1,100,1,bergen,
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pickle

import climate_health
from climate_health.datatypes import ClimateData, ClimateHealthTimeSeries
from climate_health.predictor.naive_predictor import NaivePredictor, MultiRegionNaivePredictor
import typer

from climate_health.spatio_temporal_data.temporal_dataclass import DataSet

app = typer.Typer()


@app.command()
def train(train_data_set: str, model_output_file: str):
predictor = MultiRegionNaivePredictor()
train_data = DataSet.from_csv(train_data_set, ClimateHealthTimeSeries)
predictor.train(train_data)

# pickle predictor
with open(model_output_file, 'wb') as f:
pickle.dump(predictor, f)


@app.command()
def predict(future_climate_data_set: str, model_file: str, output_file: str):
with open(model_file, 'rb') as f:
predictor = pickle.load(f)

future_climate_data = DataSet.from_csv(future_climate_data_set, ClimateData)
predictions = predictor.predict(future_climate_data)
print(predictions)
predictions.to_csv(output_file)


@app.command()
def predict_values(train_data_set: str, future_climate_data_set: str, output_file: str):
predictor = MultiRegionNaivePredictor()
train_data = DataSet.from_csv(train_data_set, ClimateHealthTimeSeries)
future_climate_data = DataSet.from_csv(future_climate_data_set, ClimateData)
predictor.train(train_data)
predictions = predictor.predict(future_climate_data)
print(predictions)
predictions.to_csv(output_file)


if __name__ == "__main__":
app()
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,time_period,disease_cases,location
0,2012-08,1.0,bergen
0,2012-08,20.0,oslo
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
,time_period,rainfall,mean_temperature,disease_cases,location
0,2012-01,1,1,20,oslo
1,2012-02,1,1,20,oslo
2,2012-03,1,1,20,oslo
3,2012-04,1,1,20,oslo
4,2012-05,1,1,20,oslo
5,2012-06,1,1,20,oslo
6,2012-07,1,1,20,oslo
0,2012-01,100,1,1,bergen
1,2012-02,100,1,1,bergen
2,2012-03,100,1,1,bergen
3,2012-04,100,1,1,bergen
4,2012-05,100,1,1,bergen
5,2012-06,100,1,1,bergen
6,2012-07,100,1,1,bergen
9 changes: 6 additions & 3 deletions scripts/mlflow_test.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import sys

from climate_health.data.datasets import ISIMIP_dengue_harmonized
from climate_health.external.external_model import get_model_from_yaml_file
from climate_health.assessment.prediction_evaluator import evaluate_model
from climate_health.external.mlflow import ExternalMLflowModel

"""
model_name = 'config.yml'
#working_dir = '../external_models/ewars_Plus/'
working_dir = '../../chap_auto_ewars/'
#model = get_model_from_yaml_file(working_dir + model_name, working_dir)
model = ExternalMLflowModel(working_dir, working_dir="./")
#model = ExternalMLflowModel("https://github.com/sandvelab/chap_auto_ewars", working_dir=working_dir)
#model = ExternalMLflowModel(working_dir, working_dir="./")
model = ExternalMLflowModel("https://github.com/sandvelab/chap_auto_ewars", working_dir=working_dir)
dataset = ISIMIP_dengue_harmonized
for country, data in dataset.items():
Expand All @@ -26,4 +29,4 @@
print(e)
continue

"""
Loading

0 comments on commit eea872f

Please sign in to comment.