Merge remote-tracking branch 'origin/dev' into dev

dhis2 · Sep 4, 2024 · eea872f · eea872f
2 parents 0d771b7 + 27d6717
commit eea872f
Show file tree

Hide file tree

Showing 13 changed files with 230 additions and 54 deletions.
diff --git a/climate_health/api.py b/climate_health/api.py
@@ -1,30 +1,26 @@
 import logging
 import json
+
 from .assessment.forecast import forecast as do_forecast
-import os
-import shutil
 import zipfile
-from datetime import datetime
 from pathlib import Path
 from typing import Optional, List
 
 import numpy as np
 
 from .assessment.dataset_splitting import train_test_split_with_weather
-from .datatypes import HealthData, ClimateData, HealthPopulationData, SimpleClimateData, ClimateHealthData, FullData
+from .datatypes import HealthData, ClimateData, HealthPopulationData, SimpleClimateData, FullData
 from .dhis2_interface.json_parsing import predictions_to_datavalue, parse_disease_data, json_to_pandas, \
     parse_population_data
-from .external.external_model import get_model_from_yaml_file
+from .external.external_model import get_model_from_directory_or_github_url
 from .file_io.example_data_set import DataSetType, datasets
-# from .external.external_model import ExternalCommandLineModel, get_model_from_yaml_file
-from .geojson import geojson_to_shape, geojson_to_graph, NeighbourGraph
+from .geojson import geojson_to_graph, NeighbourGraph
 from .plotting.prediction_plot import plot_forecast_from_summaries
 from .predictor import get_model
 from .spatio_temporal_data.temporal_dataclass import DataSet
 import dataclasses
 
-from .time_period.date_util_wrapper import Week, delta_week, delta_month, Month
-import git
+from .time_period.date_util_wrapper import delta_month, Month
 
 from .transformations.covid_mask import mask_covid_data
 
@@ -131,43 +127,6 @@ def read_zip_folder(zip_file_path: str) -> PredictionData:
 
 #    ...
 
-def get_model_from_directory_or_github_url(model_path, base_working_dir=Path('runs/')):
-    """
-    Gets the model and initializes a working directory with the code for the model
-    """
-    is_github = False
-    if isinstance(model_path, str) and model_path.startswith("https://github.com"):
-        dir_name = model_path.split("/")[-1].replace(".git", "")
-        model_name = dir_name
-        is_github = True
-    else:
-        model_name = Path(model_path).name
-
-    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    working_dir = base_working_dir / model_name / timestamp
-
-    if is_github:
-        working_dir.mkdir(parents=True)
-        git.Repo.clone_from(model_path, working_dir)
-    else:
-        # copy contents of model_path to working_dir
-        shutil.copytree(model_path, working_dir)
-
-    # assert that a config file exists
-    assert (working_dir / 'config.yml').exists(), f"config.yml file not found in {working_dir}"
-    return get_model_from_yaml_file(working_dir / 'config.yml', working_dir)
-
-
-def get_model_maybe_yaml(model_name):
-    if model_name.endswith(".yaml") or model_name.endswith(".yml"):
-        working_dir = Path(model_name).parent
-        model = get_model_from_yaml_file(model_name, working_dir)
-        return model, model.name
-    elif model_name.startswith("https://github.com"):
-        return get_model_from_directory_or_github_url(model_name), model_name
-    else:
-        return get_model(model_name), model_name
-
 
 def dhis_zip_flow(zip_file_path: str, out_json: Optional[str] = None, model_name=None, n_months=4,
                   docker_filename: Optional[str] = None) -> List[dict] | None:

diff --git a/climate_health/cli.py b/climate_health/cli.py
@@ -8,7 +8,6 @@
 import numpy as np
 import pandas as pd
 from cyclopts import App
-from climate_health.api import get_model_maybe_yaml, get_model_from_directory_or_github_url
 from climate_health.spatio_temporal_data.multi_country_dataset import MultiCountryDataSet
 from . import api
 from climate_health.dhis2_interface.ChapProgram import ChapPullPost

diff --git a/climate_health/external/external_model.py b/climate_health/external/external_model.py
@@ -1,13 +1,16 @@
 import logging
 import os.path
+import shutil
 import subprocess
 import sys
 import tempfile
+from datetime import datetime
 from hashlib import md5
 from pathlib import Path
 from typing import Protocol, Generic, TypeVar, Tuple, Optional
 
 import docker
+import git
 import numpy as np
 import pandas as pd
 import pandas.errors
@@ -17,6 +20,7 @@
 from climate_health._legacy_dataset import IsSpatioTemporalDataSet
 from climate_health.datatypes import ClimateHealthTimeSeries, ClimateData, HealthData, SummaryStatistics
 from climate_health.docker_helper_functions import create_docker_image, run_command_through_docker_container
+from climate_health.external.mlflow import ExternalMLflowModel
 from climate_health.geojson import NeighbourGraph
 from climate_health.runners.command_line_runner import CommandLineRunner
 from climate_health.runners.docker_runner import DockerImageRunner, DockerRunner
@@ -236,6 +240,36 @@ def prediction_summary(self, future_data: DataSet[FeatureType], n_samples=1000):
     def _provide_temp_file(self):
         return tempfile.NamedTemporaryFile(dir=self._working_dir, delete=False)
 
+    @classmethod
+    def from_mlproject_file(cls, mlproject_file):
+        working_dir = mlproject_file.parent
+        # read yaml file into a dict
+        with open(mlproject_file, 'r') as file:
+            data = yaml.load(file, Loader=yaml.FullLoader)
+
+        name = data['name']
+        train_command = data['entry_points']['train']
+        predict_command = data['entry_points']['predict']
+        setup_command = None
+        data_type = data.get('data_type', None)
+        allowed_data_types = {'HealthData': HealthData}
+        data_type = allowed_data_types.get(data_type, None)
+
+        assert "docker_env" in data, "Only docker supported for now"
+        runner = DockerRunner(data['docker_env']['image'], working_dir)
+
+        model = cls(
+            name=name,
+            train_command=train_command,
+            predict_command=predict_command,
+            data_type=data_type,
+            setup_command=setup_command,
+            working_dir=working_dir,
+            # working_dir=Path(yaml_file).parent,
+            adapters=data.get('adapters', None),
+            runner=runner,
+        )
+        return model
 
 
 # todo: remove this
@@ -327,3 +361,65 @@ def get_runner_from_yaml_file(yaml_file: str) -> Runner:
 
 def get_model_and_runner_from_yaml_file(yaml_file: str) -> Tuple[ExternalCommandLineModel, Runner]:
     return ExternalCommandLineModel.from_yaml_file(yaml_file), get_runner_from_yaml_file(yaml_file)
+
+
+
+def get_model_from_directory_or_github_url(model_path, base_working_dir=Path('runs/')):
+    """
+    Gets the model and initializes a working directory with the code for the model.
+    model_path can be a local directory or github url
+    """
+    is_github = False
+    if isinstance(model_path, str) and model_path.startswith("https://github.com"):
+        dir_name = model_path.split("/")[-1].replace(".git", "")
+        model_name = dir_name
+        is_github = True
+    else:
+        model_name = Path(model_path).name
+
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    working_dir = base_working_dir / model_name / timestamp
+
+    if is_github:
+        working_dir.mkdir(parents=True)
+        git.Repo.clone_from(model_path, working_dir)
+    else:
+        # copy contents of model_path to working_dir
+        shutil.copytree(model_path, working_dir)
+
+    # assert that a config file exists
+    assert (working_dir / 'config.yml').exists(), f"config.yml file not found in {working_dir}"
+    if (working_dir / 'config.yml').exists():
+        return get_model_from_yaml_file(working_dir / 'config.yml', working_dir)
+    else:
+        assert (working_dir / 'MLproject').exists(), f"MLproject file not found in {working_dir}"
+        return get_model_from_mlproject_file(working_dir / 'MLproject')
+
+
+def get_model_from_mlproject_file(mlproject_file):
+    """ parses file and returns the model
+    Will not use MLflows project setup if docker is specified
+    """
+
+    with open(mlproject_file, 'r') as file:
+        config = yaml.load(mlproject_file, Loader=yaml.FullLoader)
+    if "docker_env" in config:
+        return ExternalCommandLineModel.from_mlproject_file(mlproject_file)
+    else:
+        name = config["name"]
+        return ExternalMLflowModel(mlproject_file, name=name, working_dir=Path(mlproject_file).parent)
+
+
+def get_model_maybe_yaml(model_name):
+    model = get_model_from_directory_or_github_url(model_name)
+    return model, model.name
+
+    from climate_health.predictor import get_model
+    if model_name.endswith(".yaml") or model_name.endswith(".yml"):
+        working_dir = Path(model_name).parent
+        model = get_model_from_yaml_file(model_name, working_dir)
+        return model, model.name
+    elif model_name.startswith("https://github.com"):
+        return get_model_from_directory_or_github_url(model_name), model_name
+    else:
+        return get_model(model_name), model_name
diff --git a/climate_health/external/mlflow.py b/climate_health/external/mlflow.py
@@ -19,14 +19,18 @@ class ExternalMLflowModel(Generic[FeatureType]):
     Wrapper around an mlflow model with commands for training and predicting
     """
 
-    def __init__(self, model_path: str, adapters=None, working_dir="./"):
+    def __init__(self, model_path: str, name: str=None, adapters=None, working_dir="./"):
         self.model_path = model_path
         self._adapters = adapters
         self._working_dir = working_dir
         self._location_mapping = None
         self._model_file_name = Path(model_path).name + ".model"
         self.is_lagged = True
 
+    @property
+    def name(self):
+        return self._name
+
     def train(self, train_data: DataSet, extra_args=None):
 
         if extra_args is None:

diff --git a/external_models/naive_python_model_with_mlproject_file/MLproject b/external_models/naive_python_model_with_mlproject_file/MLproject
@@ -0,0 +1,19 @@
+name: naive_python
+
+docker_env:
+  image: python:3.10
+
+#python_env: python_env.yaml
+
+entry_points:
+  train:
+    parameters:
+      train_data: path
+      model_output_file: path
+    command: "python train.py {train_data} {model_output_file}"
+  predict:
+    parameters:
+      future_data: path
+      model: path
+      out_file: path
+    command: "python predict.py {future_data} {model} {out_file}"
diff --git a/external_models/naive_python_model_with_mlproject_file/config.yml b/external_models/naive_python_model_with_mlproject_file/config.yml
@@ -0,0 +1,6 @@
+
+
+name: naive_python_model
+train_command: "python mock_predictor_script.py train {train_data} {model}"
+predict_command: "python mock_predictor_script.py predict {future_data} {model} {out_file}"
+data_type: HealthData
diff --git a/external_models/naive_python_model_with_mlproject_file/future_data.csv b/external_models/naive_python_model_with_mlproject_file/future_data.csv
@@ -0,0 +1,11 @@
+,time_period,rainfall,mean_temperature,max_temperature,location,disease_cases
+0,2012-08,20,1,1,oslo,
+1,2012-09,20,1,1,oslo,
+2,2012-10,20,1,1,oslo,
+3,2012-11,20,1,1,oslo,
+4,2012-12,20,1,1,oslo,
+0,2012-08,1,100,1,bergen,
+1,2012-09,1,100,1,bergen,
+2,2012-10,1,100,1,bergen,
+3,2012-11,1,100,1,bergen,
+4,2012-12,1,100,1,bergen,
diff --git a/external_models/naive_python_model_with_mlproject_file/mock_predictor_script.py b/external_models/naive_python_model_with_mlproject_file/mock_predictor_script.py
@@ -0,0 +1,47 @@
+import pickle
+
+import climate_health
+from climate_health.datatypes import ClimateData, ClimateHealthTimeSeries
+from climate_health.predictor.naive_predictor import NaivePredictor, MultiRegionNaivePredictor
+import typer
+
+from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
+
+app = typer.Typer()
+
+
+@app.command()
+def train(train_data_set: str, model_output_file: str):
+    predictor = MultiRegionNaivePredictor()
+    train_data = DataSet.from_csv(train_data_set, ClimateHealthTimeSeries)
+    predictor.train(train_data)
+
+    # pickle predictor
+    with open(model_output_file, 'wb') as f:
+        pickle.dump(predictor, f)
+
+
+@app.command()
+def predict(future_climate_data_set: str, model_file: str, output_file: str):
+    with open(model_file, 'rb') as f:
+        predictor = pickle.load(f)
+
+    future_climate_data = DataSet.from_csv(future_climate_data_set, ClimateData)
+    predictions = predictor.predict(future_climate_data)
+    print(predictions)
+    predictions.to_csv(output_file)
+
+
+@app.command()
+def predict_values(train_data_set: str, future_climate_data_set: str, output_file: str):
+    predictor = MultiRegionNaivePredictor()
+    train_data = DataSet.from_csv(train_data_set, ClimateHealthTimeSeries)
+    future_climate_data = DataSet.from_csv(future_climate_data_set, ClimateData)
+    predictor.train(train_data)
+    predictions = predictor.predict(future_climate_data)
+    print(predictions)
+    predictions.to_csv(output_file)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/external_models/naive_python_model_with_mlproject_file/naive_python_model.model b/external_models/naive_python_model_with_mlproject_file/naive_python_model.model
diff --git a/external_models/naive_python_model_with_mlproject_file/predictions.csv b/external_models/naive_python_model_with_mlproject_file/predictions.csv
@@ -0,0 +1,3 @@
+,time_period,disease_cases,location
+0,2012-08,1.0,bergen
+0,2012-08,20.0,oslo
diff --git a/external_models/naive_python_model_with_mlproject_file/training_data.csv b/external_models/naive_python_model_with_mlproject_file/training_data.csv
@@ -0,0 +1,15 @@
+,time_period,rainfall,mean_temperature,disease_cases,location
+0,2012-01,1,1,20,oslo
+1,2012-02,1,1,20,oslo
+2,2012-03,1,1,20,oslo
+3,2012-04,1,1,20,oslo
+4,2012-05,1,1,20,oslo
+5,2012-06,1,1,20,oslo
+6,2012-07,1,1,20,oslo
+0,2012-01,100,1,1,bergen
+1,2012-02,100,1,1,bergen
+2,2012-03,100,1,1,bergen
+3,2012-04,100,1,1,bergen
+4,2012-05,100,1,1,bergen
+5,2012-06,100,1,1,bergen
+6,2012-07,100,1,1,bergen
diff --git a/scripts/mlflow_test.py b/scripts/mlflow_test.py
@@ -1,16 +1,19 @@
+import sys
+
 from climate_health.data.datasets import ISIMIP_dengue_harmonized
 from climate_health.external.external_model import get_model_from_yaml_file
 from climate_health.assessment.prediction_evaluator import evaluate_model
 from climate_health.external.mlflow import ExternalMLflowModel
 
+"""
 model_name = 'config.yml'
 #working_dir = '../external_models/ewars_Plus/'
 working_dir = '../../chap_auto_ewars/'
 
 #model = get_model_from_yaml_file(working_dir  + model_name, working_dir)
 
-model = ExternalMLflowModel(working_dir, working_dir="./")
-#model = ExternalMLflowModel("https://github.com/sandvelab/chap_auto_ewars", working_dir=working_dir)
+#model = ExternalMLflowModel(working_dir, working_dir="./")
+model = ExternalMLflowModel("https://github.com/sandvelab/chap_auto_ewars", working_dir=working_dir)
 
 dataset = ISIMIP_dengue_harmonized
 for country, data in dataset.items():
@@ -26,4 +29,4 @@
         print(e)
         continue
 
-
+"""