From 9ef243038883bf58ae897fb204346bd51392c2a1 Mon Sep 17 00:00:00 2001
From: Knut Rand
Date: Thu, 19 Sep 2024 13:30:41 +0200
Subject: [PATCH] synch

---
 .gitignore                                    |  1 +
 climate_health/assessment/forecast.py         | 28 +++-
 .../assessment/prediction_evaluator.py        | 85 ++++++++++++--
 climate_health/climate_data/gridded_data.py   | 40 ++++++++
 climate_health/climate_predictor.py           |  8 +-
 .../data/gluonts_adaptor/dataset.py           |  1 +
 .../external/models/flax_models/flax_model.py |  1 +
 climate_health/rest_api.py                    | 23 +----
 climate_health/rest_api_src/data_models.py    | 27 +++++
 .../rest_api_src/generate_rest_api.py         | 56 ++++++++++
 climate_health/runners/docker_runner.py       |  4 +-
 .../temporal_dataclass.py                     |  5 +-
 .../time_period/date_util_wrapper.py          | 15 ++-
 scripts/analyze_open_dengue.py                | 11 +++
 scripts/clean_laos_data2.py                   | 99 +++++++++++++++
 scripts/explorations/xarray_gee.py            | 28 +++++
 scripts/external_model_example.py             | 37 +++---
 scripts/prediction_example.py                 | 15 +++
 tests/test_climate_predictor.py               |  6 +-
 tests/test_forecast.py                        | 15 ++-
 tests/test_gridded_data.py                    | 18 ++++
 21 files changed, 466 insertions(+), 57 deletions(-)
 create mode 100644 climate_health/climate_data/gridded_data.py
 create mode 100644 climate_health/rest_api_src/data_models.py
 create mode 100644 climate_health/rest_api_src/generate_rest_api.py
 create mode 100644 scripts/analyze_open_dengue.py
 create mode 100644 scripts/clean_laos_data2.py
 create mode 100644 scripts/explorations/xarray_gee.py
 create mode 100644 scripts/prediction_example.py
 create mode 100644 tests/test_gridded_data.py

diff --git a/.gitignore b/.gitignore
index e83d6279..e30b3906 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,4 @@
 climate_health/web_interface/yarn-error.log*
 climate_health/web_interface/node_modules/
+/scripts/runs/

diff --git a/climate_health/assessment/forecast.py b/climate_health/assessment/forecast.py
index 996d95a5..ef162900 100644
--- a/climate_health/assessment/forecast.py
+++ b/climate_health/assessment/forecast.py
@@ -1,7 +1,13 @@
+import pandas as pd
+from matplotlib import pyplot as plt
+
 from climate_health.assessment.dataset_splitting import train_test_split_with_weather
+from climate_health.assessment.prediction_evaluator import Estimator, Predictor
+from climate_health.climate_predictor import MonthlyClimatePredictor
+from climate_health.data.gluonts_adaptor.dataset import ForecastAdaptor
 from climate_health.plotting.prediction_plot import plot_forecast_from_summaries
 from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
-from climate_health.time_period.date_util_wrapper import TimeDelta, Month
+from climate_health.time_period.date_util_wrapper import TimeDelta, Month, PeriodRange
 import logging

 logger = logging.getLogger(__name__)
@@ -41,3 +47,23 @@ def multi_forecast(model, dataset: DataSet, prediction_lenght: TimeDelta, pre_tr
         cur_dataset, _, _ = train_test_split_with_weather(cur_dataset, split_period)
     logger.info(f'Forecasting {prediction_lenght} months into the future on {len(datasets)} datasets')
     return (forecast(model, dataset, prediction_lenght) for dataset in datasets[::-1])
+
+
+def forecast_ahead(estimator: Estimator, dataset: DataSet, prediction_length: int):
+    '''
+    Train the estimator on the full dataset, then forecast prediction_length periods past its end
+    '''
+    logger.info(f'Forecasting {prediction_length} months into the future')
+    train_data = dataset
+    predictor = estimator.train(train_data)
+    return forecast_with_predicted_weather(predictor, train_data, prediction_length)
+
+
+def forecast_with_predicted_weather(predictor: Predictor, historic_data: DataSet, prediction_length: int):
+    prediction_range = PeriodRange.from_start_and_n_periods(
+        Month(historic_data.end_timestamp).to_string(), prediction_length)
+    climate_predictor = MonthlyClimatePredictor()
+    climate_predictor.train(historic_data)
+    future_weather = climate_predictor.predict(prediction_range)
+    predictions = predictor.predict(historic_data, future_weather)
+    return predictions
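Usage sketch for the new entry point (the estimator and dataset are borrowed from the tests added later in this patch, purely illustrative):

    from climate_health.assessment.forecast import forecast_ahead
    from climate_health.data.datasets import ISIMIP_dengue_harmonized
    from climate_health.predictor.naive_estimator import NaiveEstimator

    dataset = ISIMIP_dengue_harmonized['vietnam']
    # Trains on the full dataset, then predicts 3 periods past its end,
    # with future weather covariates from MonthlyClimatePredictor.
    predictions = forecast_ahead(NaiveEstimator(), dataset, prediction_length=3)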
diff --git a/climate_health/assessment/prediction_evaluator.py b/climate_health/assessment/prediction_evaluator.py
index 8e27cf35..e9dcc74f 100644
--- a/climate_health/assessment/prediction_evaluator.py
+++ b/climate_health/assessment/prediction_evaluator.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Protocol, TypeVar
+from typing import Protocol, TypeVar, Iterable, Dict

 from gluonts.evaluation import Evaluator
 from gluonts.model import Forecast
@@ -111,6 +111,26 @@ def train(self, data: DataSet) -> Predictor:


 def evaluate_model(estimator: Estimator, data: DataSet, prediction_length=3, n_test_sets=4, report_filename=None):
+    '''
+    Evaluate an estimator on a dataset by training it once and making predictions
+    on multiple held-out test sets with the same trained model
+
+    Parameters
+    ----------
+    estimator : Estimator
+        The estimator to train and evaluate
+    data : DataSet
+        The data to train and evaluate on
+    prediction_length : int
+        The number of periods to predict ahead
+    n_test_sets : int
+        The number of test sets to evaluate on
+    report_filename : str, optional
+        If given, forecast plots are written to this PDF file
+
+    Returns
+    -------
+    tuple
+        Summary and individual evaluation results
+    '''
     train, test_generator = train_test_generator(data, prediction_length, n_test_sets)
     predictor = estimator.train(data)
     truth_data = {
@@ -123,10 +143,42 @@ def evaluate_model(estimator: Estimator, data: DataSet, prediction_length=3, n_t
     evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
     results = evaluator(tss, forecast_list)
     return results
+
+
+def evaluate_multi_model(estimator: Estimator, data: list[DataSet], prediction_length=3, n_test_sets=4,
+                         report_base_name=None):
+    trains, test_generators = zip(*[train_test_generator(d, prediction_length, n_test_sets) for d in data])
+    predictor = estimator.multi_train(trains)
+    result_list = []
+    for i, (dataset, test_generator) in enumerate(zip(data, test_generators)):
+        truth_data = {
+            location: pd.DataFrame(dataset[location].disease_cases, index=dataset[location].time_period.to_period_index())
+            for location in dataset.keys()}
+        if report_base_name is not None:
+            _, plot_test_generator = train_test_generator(dataset, prediction_length, n_test_sets)
+            plot_forecasts(predictor, plot_test_generator, truth_data, f'{report_base_name}_{i}.pdf')
+        forecast_list, tss = _get_forecast_generators(predictor, test_generator, truth_data)
+        evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
+        results = evaluator(tss, forecast_list)
+        result_list.append(results)
+    return result_list
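The results tuple propagated by evaluate_model is the standard gluonts Evaluator pair of aggregate metrics and per-series metrics; a small consumption sketch (estimator and dataset names are illustrative):

    agg_metrics, item_metrics = evaluate_model(estimator, dataset,
                                               prediction_length=3, n_test_sets=4)
    print(agg_metrics['MSE'])     # dict of aggregate scores
    print(item_metrics.head())    # pandas DataFrame, one row per series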
 # forecasts = ((predictor.predict(*test_pair[:2]), test_pair[2]) for test_pair in test_generator)


-def _get_forecast_generators(predictor, test_generator, truth_data) -> tuple[list[Forecast], list[pd.DataFrame]]:
+def _get_forecast_generators(predictor: Predictor, test_generator: Iterable[tuple[DataSet, DataSet, DataSet]],
+                             truth_data: Dict[str, pd.DataFrame]) -> tuple[list[Forecast], list[pd.DataFrame]]:
+    '''
+    Get the forecast and truth data for a predictor and test generator.
+    One entry is a combination of prediction start period and location.
+
+    Parameters
+    ----------
+    predictor : Predictor
+        The predictor to evaluate
+    test_generator : Iterable[tuple[DataSet, DataSet, DataSet]]
+        The test generator to generate test data
+    truth_data : dict[str, pd.DataFrame]
+        The truth data for the locations
+
+    Returns
+    -------
+    tuple[list[Forecast], list[pd.DataFrame]]
+        Parallel lists of forecasts and the matching truth series
+    '''
     tss = []
     forecast_list = []
     for historic_data, future_data, _ in test_generator:
@@ -144,11 +196,14 @@ def _get_forecast_dict(predictor: Predictor, test_generator) -> dict[str, list[F

     forecast_dict = defaultdict(list)
     for historic_data, future_data, _ in test_generator:
+        assert len(future_data.period_range) > 0, \
+            f'Future data must have at least one period: {historic_data.period_range}, {future_data.period_range}'
         forecasts = predictor.predict(historic_data, future_data)
         for location, samples in forecasts.items():
             forecast_dict[location].append(ForecastAdaptor.from_samples(samples))
     return forecast_dict

+
 def get_forecast_df(predictor: Predictor, test_generator) -> pd.DataFrame:
     forecast_dict = _get_forecast_dict(predictor, test_generator)
     dfs = []
@@ -158,20 +213,17 @@ def get_forecast_df(predictor: Predictor, test_generator) -> pd.DataFrame:

     return forecast_df

+
 def plot_forecasts(predictors: list[Predictor], test_instance, truth, pdf_filename):
     forecast_dicts = [_get_forecast_dict(predictor, test_instance) for predictor in predictors]
     with PdfPages(pdf_filename) as pdf:
         for location in forecast_dicts[0].keys():
             _t = truth[location]
             for forecast_dict in forecast_dicts:
-                fig = plt.subplots(figsize=(8, 4),ncols=len(forecast_dict))
+                fig = plt.subplots(figsize=(8, 4), ncols=len(forecast_dict))
                 for i in range(len(forecast_dict[location])):
                     forecast = forecast_dict[location][i]
-
-
-
                     # plt.figure(figsize=(8, 4))  # Set the figure size
                     # t = _t[_t.index <= forecast.index[-1]]
                     # forecast.plot(show_label=True)
@@ -182,7 +234,6 @@ def plot_forecasts(predictors: list[Predictor], test_instance, truth, pdf_filena
     #         plt.close()  # Close the figure


-
 def plot_forecasts(predictor, test_instance, truth, pdf_filename):
     forecast_dict = _get_forecast_dict(predictor, test_instance)
     with PdfPages(pdf_filename) as pdf:
@@ -199,13 +250,29 @@ def plot_forecasts(predictor, test_instance, truth, pdf_filename):
             plt.close()  # Close the figure


+def plot_predictions(predictions: DataSet[Samples], truth: DataSet, pdf_filename):
+    truth_dict = {location: pd.DataFrame(truth[location].disease_cases,
+                                         index=truth[location].time_period.to_period_index())
+                  for location in truth.keys()}
+    with PdfPages(pdf_filename) as pdf:
+        for location, prediction in predictions.items():
+            prediction = ForecastAdaptor.from_samples(prediction)
+            t = truth_dict[location]
+            plt.figure(figsize=(8, 4))  # Set the figure size
+            # t = _t[_t.index <= prediction.index[-1]]
+            prediction.plot(show_label=True)
+            plt.plot(t[-150:].to_timestamp())
+            plt.title(location)
+            plt.legend()
+            pdf.savefig()
+            plt.close()  # Close the figure
+
+
 def plot_forecasts_list(predictor, test_instances, truth, pdf_filename):
     forecasts, tss = _get_forecast_generators(predictor, test_instances, truth)
     with PdfPages(pdf_filename) as pdf:
         for i, (forecast_entry, ts_entry) in enumerate(zip(forecasts, tss)):
             last_period = forecast_entry.index[-1]
             ts_entry = ts_entry[ts_entry.index <= last_period]
-            offset = ts_entry
             plt.figure(figsize=(8, 4))  # Set the figure size
             plt.plot(ts_entry[-150:].to_timestamp())
             forecast_entry.plot(show_label=True)
diff --git a/climate_health/climate_data/gridded_data.py b/climate_health/climate_data/gridded_data.py
new file mode 100644
index 00000000..460b9324
--- /dev/null
+++ b/climate_health/climate_data/gridded_data.py
@@ -0,0 +1,40 @@
+import ee
+import xarray
+from matplotlib import pyplot as plt
+from matplotlib.colors import Normalize
+
+from ..google_earth_engine.gee_raw import load_credentials
+import geopandas as gpd
+
+
+# Load the GeoJSON file using GeoPandas
+def get_gridded_data(polygons_filename):
+    gdf = gpd.read_file(polygons_filename)
+    # Get the bounding box of all polygons in the GeoJSON
+    lon1, lat1, lon2, lat2 = gdf.total_bounds
+    print(lon1, lat1, lon2, lat2)
+    credentials = load_credentials()
+    ee.Initialize(ee.ServiceAccountCredentials(credentials.account, key_data=credentials.private_key))
+    collection = ee.ImageCollection('ECMWF/ERA5_LAND/DAILY_AGGR').filterDate('2024-08-01', '2024-08-03').select('temperature_2m')
+    # lon1 = 28.8
+    # lon2 = 30.9
+    # lat1 = -2.9
+    # lat2 = -1.0
+    country_bounds = ee.Geometry.Rectangle(*gdf.total_bounds)  # lon1, lat1, lon2, lat2
+    projection = collection.first().select(0).projection()  # EPSG:4326
+    dataset = xarray.open_dataset(
+        collection,
+        engine='ee',
+        projection=projection,
+        geometry=country_bounds
+    )
+    ds = dataset
+    first_image = dataset.isel(time=0)
+    temp_d = first_image['temperature_2m']
+    temp_d.plot()
+    temp = temp_d.values
+    # plt.imshow(temp, extent=[ds.lon.min(), ds.lon.max(), ds.lat.min(), ds.lat.max()], origin='lower', cmap='viridis',
+    #            norm=Normalize())
+    # plt.imshow(temp, cmap='viridis')
+    gdf.boundary.plot(ax=plt.gca(), edgecolor='red', linewidth=1)
+    plt.show()
+    return temp
\ No newline at end of file

diff --git a/climate_health/climate_predictor.py b/climate_health/climate_predictor.py
index b725be6d..b76f096b 100644
--- a/climate_health/climate_predictor.py
+++ b/climate_health/climate_predictor.py
@@ -19,23 +19,23 @@ def _feature_matrix(self, time_period: PeriodRange):
         return time_period.month[:,None] == np.arange(1, 13)

     def train(self, train_data: DataSet[ClimateData]):
+        train_data = train_data.remove_field('disease_cases')
         for location, data in train_data.items():
-            data = data.data()
             self._cls = data.__class__
             x = self._feature_matrix(data.time_period)
             for field in dataclasses.fields(data):
-                if field.name == 'time_period':
+                if field.name in ('time_period',):
                     continue
                 y = getattr(data, field.name)
                 model = linear_model.LinearRegression()
-                model.fit(x, y[:,None])
+                model.fit(x, y[:, None])
                 self._models[location][field.name] = model

     def predict(self, time_period: PeriodRange):
         x = self._feature_matrix(time_period)
         prediction_dict = {}
         for location, models in self._models.items():
-            prediction_dict[location] = self._cls(time_period, **{field: model.predict(x) for field, model in models.items()})
+            prediction_dict[location] = self._cls(time_period, **{field: model.predict(x).ravel() for field, model in models.items()})
         return DataSet(prediction_dict)
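The reworked climate predictor fits one linear regression per field on month-of-year indicators; a round-trip sketch mirroring tests/test_climate_predictor.py (values are synthetic):

    import numpy as np
    from climate_health.climate_predictor import MonthlyClimatePredictor
    from climate_health.datatypes import ClimateData
    from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
    from climate_health.time_period.date_util_wrapper import Month, PeriodRange

    time_period = PeriodRange.from_time_periods(Month.parse('2020-01'), Month.parse('2020-12'))
    values = np.arange(len(time_period))
    data = DataSet({'oslo': ClimateData(time_period, values, values * 2, values * 3)})

    predictor = MonthlyClimatePredictor()
    predictor.train(data)
    future = PeriodRange.from_time_periods(Month.parse('2021-01'), Month.parse('2021-03'))
    prediction = predictor.predict(future)  # DataSet of per-month climate values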
diff --git a/climate_health/data/gluonts_adaptor/dataset.py b/climate_health/data/gluonts_adaptor/dataset.py
index da001618..7fd36592 100644
--- a/climate_health/data/gluonts_adaptor/dataset.py
+++ b/climate_health/data/gluonts_adaptor/dataset.py
@@ -1,3 +1,4 @@
+import warnings
 import dataclasses
 from pathlib import Path
 from typing import Iterable, TypeVar

diff --git a/climate_health/external/models/flax_models/flax_model.py b/climate_health/external/models/flax_models/flax_model.py
index f21bc6f8..512079de 100644
--- a/climate_health/external/models/flax_models/flax_model.py
+++ b/climate_health/external/models/flax_models/flax_model.py
@@ -36,6 +36,7 @@ class TrainState(train_state.TrainState):
 class FlaxModel:
     model: nn.Module  # = RNNModel()
     n_iter: int = 3000
+
     def __init__(self, rng_key: jax.random.PRNGKey = jax.random.PRNGKey(100), n_iter: int = None):
         self.rng_key = rng_key
         self._losses = []

diff --git a/climate_health/rest_api.py b/climate_health/rest_api.py
index 657f033b..2f5d9c20 100644
--- a/climate_health/rest_api.py
+++ b/climate_health/rest_api.py
@@ -1,8 +1,6 @@
 import json
-from contextlib import asynccontextmanager
 import logging
-from asyncio import CancelledError
-from typing import List, Union
+from typing import Union

 from fastapi import BackgroundTasks, UploadFile, HTTPException
 from pydantic import BaseModel
@@ -12,18 +10,14 @@
 from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware

-from climate_health.api import read_zip_folder, train_on_prediction_data
 from climate_health.api_types import RequestV1
-from climate_health.google_earth_engine.gee_era5 import Era5LandGoogleEarthEngine
 from climate_health.internal_state import Control, InternalState
 from climate_health.model_spec import ModelSpec, model_spec_from_model
 from climate_health.predictor import all_models
 from climate_health.predictor.feature_spec import Feature, all_features
+from climate_health.rest_api_src.data_models import FullPredictionResponse
 from climate_health.rest_api_src.worker_functions import train_on_zip_file, train_on_json_data
-from climate_health.training_control import TrainingControl
-from dotenv import load_dotenv, find_dotenv
-from climate_health.worker.background_tasks_worker import BGTaskWorker
 from climate_health.worker.rq_worker import RedisQueue

 logger = logging.getLogger(__name__)
@@ -97,23 +91,10 @@ def is_finished(self):
 # worker = BGTaskWorker(BackgroundTasks(), internal_state, state)
 worker = RedisQueue()

-
 def set_cur_response(response):
     state['response'] = response

-
-class PredictionResponse(BaseModel):
-    value: float
-    orgUnit: str
-    dataElement: str
-    period: str
-
-
-class FullPredictionResponse(BaseModel):
-    diseaseId: str
-    dataValues: List[PredictionResponse]
-
-
 @app.get('favicon.ico')
 async def favicon() -> FileResponse:
     return FileResponse('chap_icon.jpeg')

diff --git a/climate_health/rest_api_src/data_models.py b/climate_health/rest_api_src/data_models.py
new file mode 100644
index 00000000..93486d69
--- /dev/null
+++ b/climate_health/rest_api_src/data_models.py
@@ -0,0 +1,27 @@
+from typing import List
+
+from pydantic import BaseModel
+
+
+class PredictionBase(BaseModel):
+    orgUnit: str
+    dataElement: str
+    period: str
+
+
+class PredictionResponse(PredictionBase):
+    value: float
+
+
+class PredictionSampleResponse(PredictionBase):
+    values: list[float]
+
+
+class FullPredictionResponse(BaseModel):
+    diseaseId: str
+    dataValues: List[PredictionResponse]
+
+
+class FullPredictionSampleResponse(BaseModel):
+    diseaseId: str
+    dataValues: List[PredictionSampleResponse]
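These models nest one level: a response carries a diseaseId plus one entry per org unit and period; an illustrative construction (all field values made up):

    from climate_health.rest_api_src.data_models import FullPredictionResponse, PredictionResponse

    response = FullPredictionResponse(
        diseaseId='dengue',
        dataValues=[PredictionResponse(orgUnit='OU_123', dataElement='disease_cases',
                                       period='202401', value=12.5)])
    print(response)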
diff --git a/climate_health/rest_api_src/generate_rest_api.py b/climate_health/rest_api_src/generate_rest_api.py
new file mode 100644
index 00000000..773e685e
--- /dev/null
+++ b/climate_health/rest_api_src/generate_rest_api.py
@@ -0,0 +1,56 @@
+from typing import Optional
+
+from fastapi import FastAPI
+from starlette.middleware.cors import CORSMiddleware
+
+from climate_health.api_types import RequestV1
+from climate_health.assessment.prediction_evaluator import Predictor
+from climate_health.datatypes import Samples
+from climate_health.rest_api_src.data_models import PredictionSampleResponse
+from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
+
+
+def get_app():
+    app = FastAPI(
+        root_path="/v1"
+    )
+    origins = [
+        '*',  # Allow all origins
+        "http://localhost:3000",
+        "localhost:3000",
+    ]
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=origins,
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"]
+    )
+
+    return app
+
+
+def samples_to_json(samples_dataset: DataSet[Samples]):
+    data_values = []
+    for location, samples in samples_dataset.items():
+        for period, data in zip(samples.time_periods, samples.data):
+            # 'disease_cases' is a placeholder dataElement label
+            data_values.append(
+                PredictionSampleResponse(orgUnit=location, dataElement='disease_cases',
+                                         period=period.id(), values=data.tolist()))
+    return data_values
+
+
+def get_rest_api(estimator):
+    app = get_app()
+    predictors: dict[str, Predictor] = {}
+    predictions: dict[str, DataSet[Samples]] = {}
+
+    @app.post("/train")
+    def train(data: RequestV1, name: Optional[str] = None) -> dict:
+        name = name or f'model_{len(predictors)}'
+        predictors[name] = estimator.train(data)
+        return {'name': name}
+
+    @app.post("/predict")
+    def predict(model_name: str, data: RequestV1):
+        samples: DataSet[Samples] = predictors[model_name].predict(data)
+        return samples_to_json(samples)
+
+    return app
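Serving the generated app is then a one-liner; a sketch assuming the repo's NaiveEstimator and a standard uvicorn setup (uvicorn itself is an assumption here, any ASGI server works):

    import uvicorn
    from climate_health.predictor.naive_estimator import NaiveEstimator
    from climate_health.rest_api_src.generate_rest_api import get_rest_api

    app = get_rest_api(NaiveEstimator())  # /train and /predict are registered on app
    uvicorn.run(app, host='0.0.0.0', port=8000)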
diff --git a/climate_health/runners/docker_runner.py b/climate_health/runners/docker_runner.py
index 6c500e9b..756b90d9 100644
--- a/climate_health/runners/docker_runner.py
+++ b/climate_health/runners/docker_runner.py
@@ -2,7 +2,8 @@
 from ..docker_helper_functions import create_docker_image, run_command_through_docker_container
 from .runner import Runner
-
+import logging
+logger = logging.getLogger(__name__)

 class DockerImageRunner(Runner):
     """A runner based on a docker image (Dockerfile)"""
@@ -30,5 +31,6 @@ def __init__(self, docker_name: str, working_dir: str | Path):
         self._working_dir = working_dir

     def run_command(self, command):
+        logger.info(f'Running command {command} in docker container {self._docker_name} in {self._working_dir}')
         return run_command_through_docker_container(self._docker_name, self._working_dir, command)

diff --git a/climate_health/spatio_temporal_data/temporal_dataclass.py b/climate_health/spatio_temporal_data/temporal_dataclass.py
index 123d7ff9..56de6126 100644
--- a/climate_health/spatio_temporal_data/temporal_dataclass.py
+++ b/climate_health/spatio_temporal_data/temporal_dataclass.py
@@ -97,13 +97,16 @@ def start_timestamp(self) -> pd.Timestamp:
     def end_timestamp(self) -> pd.Timestamp:
         return self._data.time_period[-1].end_timestamp

+class Polygon:
+    pass
+

 class DataSet(Generic[FeaturesT]):
     '''
     Class representing severeal time series at different locations.
     '''

-    def __init__(self, data_dict: dict[str, FeaturesT]):
+    def __init__(self, data_dict: dict[str, FeaturesT], polygon_dict: dict[str, Polygon] = None):
         self._data_dict = {loc: TemporalDataclass(data)
                            if not isinstance(data, TemporalDataclass) else data
                            for loc, data in data_dict.items()}

diff --git a/climate_health/time_period/date_util_wrapper.py b/climate_health/time_period/date_util_wrapper.py
index 94695584..a229e386 100644
--- a/climate_health/time_period/date_util_wrapper.py
+++ b/climate_health/time_period/date_util_wrapper.py
@@ -207,7 +208,7 @@ def id(self):


 class Week(TimePeriod):
-    _used_attributes = ['year']
+    _used_attributes = []  # was ['year']
     _extension = relativedelta(weeks=1)

     @property
@@ -223,10 +224,12 @@ def __init__(self, date, *args, **kwargs):
             week_nr = args[0] if args else kwargs['week']
             self._date = self.__date_from_numbers(year, week_nr)
             self.week = week_nr
+            self.year = self._date.year
         else:
             if isinstance(date, TimeStamp):
                 date = date._date
             self.week = date.isocalendar()[1]
+            self.year = date.isocalendar()[0]
             self._date = date

     def __sub__(self, other: 'TimePeriod'):
@@ -467,8 +470,17 @@ def from_pandas(cls, periods: Iterable[pd.Period]):
         cls._check_consequtive(time_delta, time_periods)
         return cls.from_time_periods(time_periods[0], time_periods[-1])

+    @classmethod
+    def _check_consequtive_weeks(cls, time_periods, fill_missing=False):
+        # Unfinished sketch: should validate that weekly periods are consecutive,
+        # mirroring _check_consequtive below
+        period_range = pd.period_range(start=time_periods[0]._date, end=time_periods[-1]._date, freq='W')
+        raise NotImplementedError
+
     @classmethod
     def _check_consequtive(cls, time_delta, time_periods, fill_missing=False):
+        # if time_delta == delta_week:
+        #     return cls._check_consequtive_weeks(time_periods, fill_missing)
         is_consec = [p2 == p1 + time_delta for p1, p2 in zip(time_periods, time_periods[1:])]
         if not all(is_consec):
             if fill_missing:
@@ -515,6 +527,7 @@ def from_period_list(cls, fill_missing, periods):
         missing = cls._check_consequtive(delta, periods, fill_missing)
         ret = cls.from_time_periods(periods[0], periods[-1])
         if fill_missing:
+            assert len(ret) == len(missing) + len(periods), (len(ret), len(missing), len(periods), periods, missing)
             return ret, missing
         return ret
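With the Week changes above, both constructor branches now populate year; a quick illustration (Week(year, week) mirrors how parse_week builds periods from strings like '2014W05'):

    week = Week(2014, 5)
    assert (week.year, week.week) == (2014, 5)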
diff --git a/scripts/analyze_open_dengue.py b/scripts/analyze_open_dengue.py
new file mode 100644
index 00000000..9645b708
--- /dev/null
+++ b/scripts/analyze_open_dengue.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+filepath = '~/Downloads/Temporal_extract_V1_2_2.csv'
+df = pd.read_csv(filepath)
+print(df['S_res'].value_counts())
+print(df['T_res'].value_counts())
+
+spatial_filepath = '~/Downloads/Spatial_extract_V1_2_2.csv'
+dfS = pd.read_csv(spatial_filepath)
+print(dfS['S_res'].value_counts())
+print(dfS['T_res'].value_counts())

diff --git a/scripts/clean_laos_data2.py b/scripts/clean_laos_data2.py
new file mode 100644
index 00000000..74bf5dab
--- /dev/null
+++ b/scripts/clean_laos_data2.py
@@ -0,0 +1,99 @@
+import numpy as np
+import pandas as pd
+
+from climate_health.datatypes import ClimateData, ClimateHealthTimeSeries, FullData, ClimateHealthData
+from climate_health.file_io.cleaners import laos_data
+from climate_health.spatio_temporal_data.temporal_dataclass import DataSet
+from climate_health.time_period import TimePeriod
+
+filename = '/home/knut/Downloads/laodenguedata.csv'
+
+
+def parse_week(week):
+    # 'W5 2014' -> '2014W05'
+    week, year = week.split()
+    return f'{year}W{int(week[1:]):02d}'
+
+
+raw_df = pd.read_csv(filename)
+raw_df['Location'] = raw_df['Organisation unit']
+# Make a column for each unique value in the 'Data' column
+df = raw_df.pivot(index=['Period', 'Location'], columns='Data', values='Value')
+df = df.reset_index()
+df['Period'] = [parse_week(week) for week in df.Period]
+colnames = ['Climate-Rainfall', 'Climate-Temperature avg',
+            'NCLE: 7. Dengue cases (any)', 'Location', 'Period']
+true_colnames = ['rainfall', 'mean_temperature', 'disease_cases', 'location', 'time_period']
+df.rename(columns={colname: true_colname for colname, true_colname in zip(colnames, true_colnames)}, inplace=True)
+df = df.sort_values(by=['time_period', 'location'])
+if __name__ == '__main__':
+    dataset = DataSet.from_pandas(df, dataclass=ClimateHealthData, fill_missing=True)
+
+
+if False:
+    mapping = {'rainfall': 'gsiW9SgolNd',
+               'mean_temperature': 'VA05qvanuVs',
+               'max_temperature': 'ZH76qVQl5Mz'}
+
+    health_filename = '/home/knut/Downloads/dengue.csv'
+    df = laos_data(health_filename)
+    # df.to_csv('/home/knut/Downloads/dengue_clean.csv')
+    health = df
+    climate_filename = '/home/knut/Downloads/climate_monthly_perdataelement.csv'
+
+
+    def get_laos_climate(climate_filename):
+        climate_data = pd.read_csv(climate_filename)
+        df = climate_data
+        df = df.sort_values(by=['orgunit', 'year', 'month'])
+        periods = [f'{year}-{month}' for year, month
+                   in zip(climate_data['year'],
+                          climate_data['month'])]
+        climate_data['periodid'] = periods
+        # climate_data = climate_data.sort_values(by=['periodid'])
+        d = {name: df['value.' + mapping[name]].values for name in mapping.keys()}
+        new_df = pd.DataFrame(
+            d | {'time_period': climate_data['periodid'], 'location': climate_data['orgunit']})
+        spatio_temporal_dict = DataSet.from_pandas(
+            new_df, dataclass=ClimateData)
+        return spatio_temporal_dict.interpolate()
+
+
+    spatio_temporal_dict = get_laos_climate(climate_filename)
+    full_dict = {name: ClimateHealthTimeSeries.combine(health.get_location(name).data(),
+                                                       spatio_temporal_dict.get_location(name).data())
+                 for name in health.locations()}
+    data = DataSet(full_dict)
+    data.to_csv('/home/knut/Downloads/laos_data.csv')
+
+    laos_population = '''\
+    Vientiane Capital: ~820,000
+    Phongsali: ~177,000
+    Louangnamtha: ~176,000
+    Oudomxai: ~307,000
+    Bokeo: ~205,000
+    Louangphabang: ~431,000
+    Houaphan: ~294,000
+    Xainyabouli: ~381,000
+    Xiangkhouang: ~252,000
+    Vientiane: ~432,000
+    Bolikhamxai: ~275,000
+    Khammouan: ~415,000
+    Savannakhet: ~939,000
+    Salavan: ~396,000
+    Xekong: ~120,000
+    Champasak: ~694,000
+    Attapu: ~153,000
+    Xaisomboun: ~93,000'''
+    laos_population = {line.split(': ')[0]: int(line.split(': ~')[1].replace(',', '')) for line in
+                       laos_population.split('\n')}
+    data_dict = {name[3:]: data.data() for name, data in data.items()}
+    full_data = {name: FullData(d.time_period, d.rainfall, d.mean_temperature, d.disease_cases,
+                                np.full(len(d), laos_population[name]))
+                 for name, d in data_dict.items()}
+    full_data = DataSet(full_data)
+    full_data.to_csv('/home/knut/Data/laos_full_data.csv')
+    # data = {name: FullData.combine(health.get_location(name).data(), spatio_temporal_dict.get_location(name).data(), laos_population[name])
+    #         for name in health.locations()}
\ No newline at end of file
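A sanity check of the period normalization done by parse_week in clean_laos_data2.py (input strings follow the raw export's 'W<week> <year>' form):

    assert parse_week('W5 2014') == '2014W05'
    assert parse_week('W12 2015') == '2015W12'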
diff --git a/scripts/explorations/xarray_gee.py b/scripts/explorations/xarray_gee.py
new file mode 100644
index 00000000..a0a0f527
--- /dev/null
+++ b/scripts/explorations/xarray_gee.py
@@ -0,0 +1,28 @@
+import ee
+import xarray
+import pyproj  # installed with pip to avoid missing proj database error
+import numpy as np
+
+from climate_health.google_earth_engine.gee_raw import load_credentials
+
+# required: https://github.com/google/Xee
+service_account = 'dhis2-demo@dhis2-gis.iam.gserviceaccount.com'
+# credentials = ee.ServiceAccountCredentials(service_account, '/Users/mastermaps/DHIS2/dhis-google-auth.json')
+credentials = load_credentials()
+ee.Initialize(ee.ServiceAccountCredentials(credentials.account, key_data=credentials.private_key))
+# ee.Initialize(credentials, opt_url='https://earthengine-highvolume.googleapis.com')
+collection = ee.ImageCollection('ECMWF/ERA5_LAND/DAILY_AGGR').filterDate('2024-08-01', '2024-09-01').select('temperature_2m', 'total_precipitation_sum')
+lon1 = 28.8
+lon2 = 30.9
+lat1 = -2.9
+lat2 = -1.0
+rwanda_bounds = ee.Geometry.Rectangle(lon1, lat1, lon2, lat2)
+projection = collection.first().select(0).projection()  # EPSG:4326
+dataset = xarray.open_dataset(
+    collection,
+    engine='ee',
+    projection=projection,
+    geometry=rwanda_bounds
+)
+first_image = dataset.isel(time=0)
+temp = first_image['temperature_2m'].values
\ No newline at end of file

diff --git a/scripts/external_model_example.py b/scripts/external_model_example.py
index 8fc93eab..ffe462b8 100644
--- a/scripts/external_model_example.py
+++ b/scripts/external_model_example.py
@@ -1,19 +1,32 @@
+import pandas as pd
+
 from climate_health.assessment.prediction_evaluator import evaluate_model
 from climate_health.external.external_model import get_model_from_directory_or_github_url
 from climate_health.external.r_models import models_path
 from climate_health.file_io.example_data_set import datasets
+import logging

-model_names = {'deepar': models_path / 'deepar',
-               'naive_model': models_path / 'naive_python_model_with_mlproject_file',
-               'ewars': 'https://github.com/sandvelab/chap_auto_ewars'}

-dataset = datasets['ISIMIP_dengue_harmonized'].load()
-dataset = dataset['brazil']
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    model_names = {
+        # 'deepar': models_path / 'deepar',
+        'naive_model': models_path / 'naive_python_model_with_mlproject_file',
+        # 'ewars': 'https://github.com/sandvelab/chap_auto_ewars'
+    }

-all_results = {}
-for name, model_name in model_names.items():
-    model = get_model_from_directory_or_github_url(model_name)
-    results = evaluate_model(model, dataset, prediction_length=6, n_test_sets=12, report_filename=f'{name}_report.pdf')
-    all_results[name] = results
+    dataset = datasets['ISIMIP_dengue_harmonized'].load()
+    dataset = dataset['vietnam']
+    n_tests = 7
+    prediction_length = 6
+    all_results = {}
+    for name, model_name in model_names.items():
+        model = get_model_from_directory_or_github_url(model_name)
+        results = evaluate_model(model, dataset,
+                                 prediction_length=prediction_length,
+                                 n_test_sets=n_tests,
+                                 report_filename=f'{name}_{n_tests}_{prediction_length}_report.pdf')
+        all_results[name] = results

-print(all_results)
+    report_file = 'evaluation_report.csv'
+    df = pd.DataFrame([res[0] | {'model': name} for name, res in all_results.items()])
+    df.to_csv(report_file, mode='w', header=True)
diff --git a/scripts/prediction_example.py b/scripts/prediction_example.py
new file mode 100644
index 00000000..024580dc
--- /dev/null
+++ b/scripts/prediction_example.py
@@ -0,0 +1,15 @@
+from climate_health.assessment.forecast import forecast_ahead, forecast_with_predicted_weather
+from climate_health.assessment.prediction_evaluator import plot_predictions
+from climate_health.data.datasets import ISIMIP_dengue_harmonized
+from climate_health.external.external_model import get_model_from_directory_or_github_url
+from climate_health.external.r_models import models_path
+
+if __name__ == '__main__':
+    model_name = 'https://github.com/sandvelab/chap_auto_ewars'
+    # model_name = models_path / 'naive_python_model_with_mlproject_file'
+    estimator = get_model_from_directory_or_github_url(model_name)
+    dataset = ISIMIP_dengue_harmonized['vietnam']
+    predictor = estimator.train(dataset)
+    predictions = forecast_with_predicted_weather(predictor, dataset, 3)
+    plot_predictions(predictions, dataset, 'prediction_example.pdf')

diff --git a/tests/test_climate_predictor.py b/tests/test_climate_predictor.py
index 5bbc855d..fbaa645c 100644
--- a/tests/test_climate_predictor.py
+++ b/tests/test_climate_predictor.py
@@ -12,9 +12,8 @@ def climate_data():
     time_period = PeriodRange.from_time_periods(Month.parse('2020-01'), Month.parse('2020-12'))
     values = np.arange(len(time_period))
     return DataSet(
-        {'oslo': ClimateData(time_period, values, values*2, values*3),
-         'stockholm': ClimateData(time_period, values, values*2, values*3)})
-
+        {'oslo': ClimateData(time_period, values, values * 2, values * 3),
+         'stockholm': ClimateData(time_period, values, values * 2, values * 3)})


 def test_climate_predictor(climate_data):
@@ -24,4 +23,3 @@ def test_climate_predictor(climate_data):
     prediction = predictor.predict(time_period)
-

diff --git a/tests/test_forecast.py b/tests/test_forecast.py
index 6f8d38ec..5d1cc213 100644
--- a/tests/test_forecast.py
+++ b/tests/test_forecast.py
@@ -1,9 +1,11 @@
 import pytest

-from climate_health.assessment.forecast import forecast, multi_forecast
+from climate_health.assessment.forecast import forecast, multi_forecast, forecast_ahead
+from climate_health.data.datasets import ISIMIP_dengue_harmonized
 from climate_health.file_io.example_data_set import datasets
 from climate_health.plotting.prediction_plot import plot_forecast_from_summaries
 from climate_health.predictor import get_model
+from climate_health.predictor.naive_estimator import NaiveEstimator
 from climate_health.time_period.date_util_wrapper import delta_month


@@ -12,7 +14,7 @@ def test_forecast():
     model = get_model('HierarchicalStateModelD2')(num_warmup=20, num_samples=20)
     dataset = datasets['hydromet_5_filtered'].load()
-    predictions = forecast(model, dataset, 12*delta_month)
+    predictions = forecast(model, dataset, 12 * delta_month)
     for location, prediction in predictions.items():
         fig = plot_forecast_from_summaries(prediction.data(), dataset.get_location(location).data())
         fig.show()
@@ -22,8 +24,15 @@ def test_multi_forecast():
     model = get_model('HierarchicalStateModelD2')(num_warmup=20, num_samples=20)
     dataset = datasets['hydromet_5_filtered'].load()
-    predictions_list = list(multi_forecast(model, dataset, 48*delta_month, pre_train_delta=24*delta_month))
+    predictions_list = list(multi_forecast(model, dataset, 48 * delta_month, pre_train_delta=24 * delta_month))
     for location, true_data in dataset.items():
         local_predictions = [pred.get_location(location).data() for pred in predictions_list]
         fig = plot_forecast_from_summaries(local_predictions, true_data.data())
         fig.show()
+
+
+def test_forecast_ahead():
+    model = NaiveEstimator()
+    dataset = ISIMIP_dengue_harmonized['vietnam']
+    prediction_length = 3
+    forecast_ahead(model, dataset, prediction_length)
diff --git a/tests/test_gridded_data.py b/tests/test_gridded_data.py
new file mode 100644
index 00000000..fe4bc6a6
--- /dev/null
+++ b/tests/test_gridded_data.py
@@ -0,0 +1,18 @@
+import pytest
+
+from climate_health.google_earth_engine.gee_raw import load_credentials
+from climate_health.climate_data.gridded_data import get_gridded_data
+
+
+@pytest.fixture
+def credentials():
+    try:
+        return load_credentials()
+    except Exception:
+        pytest.skip("Credentials not found")
+
+
+@pytest.fixture
+def polygons_filename(data_path):
+    return data_path / 'philippines_polygons.json'
+
+
+def test_get_gridded_data(polygons_filename, credentials):
+    get_gridded_data(polygons_filename)
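Outside pytest, the gridded-data helper can be exercised directly; a sketch under assumptions (the GeoJSON path is hypothetical, and Earth Engine credentials must be configured for load_credentials to succeed):

    from climate_health.climate_data.gridded_data import get_gridded_data

    temp = get_gridded_data('example_data/philippines_polygons.json')  # hypothetical path
    print(temp.shape)  # 2D grid of temperature_2m values over the bounding box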