From 7dfdc7daaefae28468ef3b12703cd85cb40bd607 Mon Sep 17 00:00:00 2001 From: Knut Rand Date: Fri, 20 Sep 2024 11:27:52 +0200 Subject: [PATCH] feature: weekly predictions ahead in time --- climate_health/assessment/forecast.py | 16 ++-- .../assessment/prediction_evaluator.py | 7 +- climate_health/climate_predictor.py | 24 ++++-- .../temporal_dataclass.py | 5 +- climate_health/time_period/__init__.py | 2 +- .../time_period/date_util_wrapper.py | 71 +++++++++++++---- pytest.ini | 2 +- scripts/clean_laos_data2.py | 77 ++++++++++++------- scripts/evaluate_laos.py | 12 +++ scripts/prediction_example.py | 17 +++- tests/test_climate_predictor.py | 19 ++++- tests/test_forecast.py | 9 ++- tests/time_period/test_dateutil_wrapper.py | 2 +- 13 files changed, 194 insertions(+), 69 deletions(-) create mode 100644 scripts/evaluate_laos.py diff --git a/climate_health/assessment/forecast.py b/climate_health/assessment/forecast.py index ef162900..28f62750 100644 --- a/climate_health/assessment/forecast.py +++ b/climate_health/assessment/forecast.py @@ -3,7 +3,7 @@ from climate_health.assessment.dataset_splitting import train_test_split_with_weather from climate_health.assessment.prediction_evaluator import Estimator, Predictor -from climate_health.climate_predictor import MonthlyClimatePredictor +from climate_health.climate_predictor import MonthlyClimatePredictor, get_climate_predictor from climate_health.data.gluonts_adaptor.dataset import ForecastAdaptor from climate_health.plotting.prediction_plot import plot_forecast_from_summaries from climate_health.spatio_temporal_data.temporal_dataclass import DataSet @@ -60,10 +60,16 @@ def forecast_ahead(estimator: Estimator, dataset: DataSet, prediction_length: in def forecast_with_predicted_weather(predictor: Predictor, historic_data: DataSet, prediction_length: int, ): - prediction_range = PeriodRange.from_start_and_n_periods( - Month(historic_data.end_timestamp).to_string(), prediction_length) - climate_predictor = MonthlyClimatePredictor() - climate_predictor.train(historic_data) + delta = historic_data.period_range[0].time_delta + prediction_range = PeriodRange(historic_data.end_timestamp, + historic_data.end_timestamp + delta * prediction_length, + delta) + + #prediction_range = PeriodRange.from_start_and_n_periods( + # Month(historic_data.end_timestamp).to_string(), prediction_length) + #climate_predictor = MonthlyClimatePredictor() + #climate_predictor.train(historic_data) + climate_predictor= get_climate_predictor(historic_data) future_weather = climate_predictor.predict(prediction_range) predictions = predictor.predict(historic_data, future_weather) return predictions diff --git a/climate_health/assessment/prediction_evaluator.py b/climate_health/assessment/prediction_evaluator.py index e9dcc74f..64b9b46e 100644 --- a/climate_health/assessment/prediction_evaluator.py +++ b/climate_health/assessment/prediction_evaluator.py @@ -235,6 +235,7 @@ def plot_forecasts(predictors: list[Predictor], test_instance, truth, pdf_filena def plot_forecasts(predictor, test_instance, truth, pdf_filename): + forecast_dict = _get_forecast_dict(predictor, test_instance) with PdfPages(pdf_filename) as pdf: for location, forecasts in forecast_dict.items(): @@ -243,7 +244,8 @@ def plot_forecasts(predictor, test_instance, truth, pdf_filename): plt.figure(figsize=(8, 4)) # Set the figure size t = _t[_t.index <= forecast.index[-1]] forecast.plot(show_label=True) - plt.plot(t[-150:].to_timestamp()) + plotting_context = 52*6 + plt.plot(t[-plotting_context:].to_timestamp()) plt.title(location) plt.legend() pdf.savefig() @@ -260,7 +262,8 @@ def plot_predictions(predictions: DataSet[Samples], truth: DataSet, pdf_filename plt.figure(figsize=(8, 4)) # Set the figure size # t = _t[_t.index <= prediction.index[-1]] prediction.plot(show_label=True) - plt.plot(t[-150:].to_timestamp()) + context_length = 52*6 + plt.plot(t[-context_length:].to_timestamp()) plt.title(location) plt.legend() pdf.savefig() diff --git a/climate_health/climate_predictor.py b/climate_health/climate_predictor.py index b76f096b..804dcd75 100644 --- a/climate_health/climate_predictor.py +++ b/climate_health/climate_predictor.py @@ -6,9 +6,18 @@ from .datatypes import ClimateData from climate_health.spatio_temporal_data.temporal_dataclass import DataSet -from climate_health.time_period import PeriodRange +from climate_health.time_period import PeriodRange, Month, Week +def get_climate_predictor(train_data: DataSet[ClimateData]): + if isinstance(train_data.period_range[0], Month): + estimator = MonthlyClimatePredictor() + else: + assert isinstance(train_data.period_range[0], Week) + estimator = WeeklyClimatePredictor() + estimator.train(train_data) + return estimator + class MonthlyClimatePredictor: def __init__(self): @@ -16,7 +25,7 @@ def __init__(self): self._cls = None def _feature_matrix(self, time_period: PeriodRange): - return time_period.month[:,None] == np.arange(1, 13) + return time_period.month[:, None] == np.arange(1, 13) def train(self, train_data: DataSet[ClimateData]): train_data = train_data.remove_field('disease_cases') @@ -35,10 +44,13 @@ def predict(self, time_period: PeriodRange): x = self._feature_matrix(time_period) prediction_dict = {} for location, models in self._models.items(): - prediction_dict[location] = self._cls(time_period, **{field: model.predict(x).ravel() for field, model in models.items()}) + prediction_dict[location] = self._cls(time_period, **{field: model.predict(x).ravel() for field, model in + models.items()}) return DataSet(prediction_dict) - - - +class WeeklyClimatePredictor(MonthlyClimatePredictor): + def _feature_matrix(self, time_period: PeriodRange): + t = time_period.week[:, None] == np.arange(1, 53) + t[..., -1] |= time_period.week == 53 + return t diff --git a/climate_health/spatio_temporal_data/temporal_dataclass.py b/climate_health/spatio_temporal_data/temporal_dataclass.py index 56de6126..dd228842 100644 --- a/climate_health/spatio_temporal_data/temporal_dataclass.py +++ b/climate_health/spatio_temporal_data/temporal_dataclass.py @@ -296,4 +296,7 @@ def from_fields(cls, dataclass: type[TimeSeriesData], fields: dict[str, 'DataSet new_dict[location] = dataclass(period_range, **{field: fields[field][location].fill_to_range(start_timestamp, end_timestamp).value for field in field_names}) return cls(new_dict) - + def plot(self): + for location, data in self.items(): + df = data.to_pandas() + df.plot(x='time_period', title=location) diff --git a/climate_health/time_period/__init__.py b/climate_health/time_period/__init__.py index 0d216866..68900039 100644 --- a/climate_health/time_period/__init__.py +++ b/climate_health/time_period/__init__.py @@ -1,4 +1,4 @@ -from .date_util_wrapper import TimePeriod, Year, Month, Day, PeriodRange, delta_month, delta_week +from .date_util_wrapper import TimePeriod, Year, Month, Day, PeriodRange, delta_month, delta_week, Week #from ._legacy_implementation import TimePeriod, Year, Month, Day from .period_range import period_range as get_period_range get_period_range = PeriodRange.from_time_periods diff --git a/climate_health/time_period/date_util_wrapper.py b/climate_health/time_period/date_util_wrapper.py index a229e386..0e7f4dad 100644 --- a/climate_health/time_period/date_util_wrapper.py +++ b/climate_health/time_period/date_util_wrapper.py @@ -2,12 +2,14 @@ import logging from datetime import datetime from numbers import Number -from typing import Union, Iterable +from typing import Union, Iterable, Tuple +import dateutil import numpy as np import pandas as pd from dateutil.parser import parse from dateutil.relativedelta import relativedelta +from pytz import utc class DateUtilWrapper: @@ -25,6 +27,10 @@ def __getattr__(self, item: str): class TimeStamp(DateUtilWrapper): _used_attributes = ('year', 'month', 'day', '__str__', '__repr__') + @property + def week(self): + return self._date.isocalendar()[1] + def __init__(self, date: datetime): self._date = date @@ -60,7 +66,7 @@ def __sub__(self, other: 'TimeStamp'): return TimeDelta(relativedelta(self._date, other._date)) def _comparison(self, other: 'TimeStamp', func_name: str): - return getattr(self._date, func_name)(other._date) + return getattr(self._date.replace(tzinfo=utc), func_name)(other._date.replace(tzinfo=utc)) class TimePeriod: @@ -150,7 +156,7 @@ def time_delta(self) -> 'TimeDelta': @classmethod def parse(cls, text_repr: str): - if 'W' in text_repr: + if 'W' in text_repr or '/' in text_repr: return cls.parse_week(text_repr) try: year = int(text_repr) @@ -172,9 +178,15 @@ def from_pandas(cls, period: pd.Period): @classmethod def parse_week(cls, week: str): - year, weeknr = week.split('W') - print('########', week) - return Week(int(year), int(weeknr)) + if 'W' in week: + year, weeknr = week.split('W') + return Week(int(year), int(weeknr)) + elif '/' in week: + start, end = week.split('/') + start_date = dateutil.parser.parse(start) + end_date = dateutil.parser.parse(end) + assert relativedelta(end_date, start_date).days == 6, f'Week must be 7 days {start_date} {end_date}' + return Week(start_date) # type: ignore @property def start_timestamp(self): @@ -206,10 +218,20 @@ def id(self): return self._date.strftime('%Y%m%d') +class WeekNumbering: + @staticmethod + def get_week_info(date: datetime) -> Tuple[int, int, int]: + return date.isocalendar() + + @staticmethod + def get_date(year: int, week: int, day: int) -> datetime: + return datetime.strptime(f'{year}-W{week}-{day}', "%G-W%V-%w") + class Week(TimePeriod): _used_attributes = []#'year'] _extension = relativedelta(weeks=1) + _week_numbering = WeekNumbering @property def id(self): @@ -224,12 +246,15 @@ def __init__(self, date, *args, **kwargs): week_nr = args[0] if args else kwargs['week'] self._date = self.__date_from_numbers(year, week_nr) self.week = week_nr - self.year = self._date.year + self.year = year + #self.year = self._date.year else: if isinstance(date, TimeStamp): date = date._date - self.week = date.isocalendar()[1] - self.year = date.isocalendar()[0] + year, week, day = date.isocalendar() + self.week = week + self.year = year + self._date = date def __sub__(self, other: 'TimePeriod'): @@ -243,11 +268,18 @@ def __str__(self): __repr__ = __str__ - def __date_from_numbers(cls, year: int, week_nr: int): - return datetime.strptime(f'{year}-W{week_nr}-1', "%Y-W%W-%w") + def __date_from_numbers(self, year: int, week_nr: int): + date = self._week_numbering.get_date(year, week_nr, 1) + #date = datetime.strptime(f'{year}-W{week_nr}-1', "%Y-W%W-%w") + assert date.isocalendar()[:2] == (year, week_nr), (date.isocalendar()[:2], year, week_nr) + return date + + @classmethod + def _isocalendar_week_to_date(cls, year: int, week_nr: int, day: int): + return datetime.strptime(f'{year}-W{week_nr}-{day}', "%Y-W%V-%w") def topandas(self): - return self.__str__() + #return self.__str__() return pd.Period(self._date, freq='W-MON') @@ -365,6 +397,10 @@ def month(self): def year(self): return np.array([p.start_timestamp.year for p in self]) + @property + def week(self): + return np.array([p.start_timestamp.week for p in self]) + @property def delta(self): return self._time_delta @@ -418,7 +454,8 @@ def _period_class(self): raise ValueError(f'Unknown time delta {self._time_delta}') def __iter__(self): - return (self._period_class((self._start_timestamp + self._time_delta * i)._date) for i in range(len(self))) + return (self._period_class((self._start_timestamp + self._time_delta * i)._date) + for i in range(len(self))) def __getitem__(self, item: slice | int): ''' Slice by numeric index in the period range''' @@ -517,7 +554,10 @@ def from_ids(cls, ids: Iterable[str], fill_missing=False): @classmethod def from_start_and_n_periods(cls, start_period: pd.Period, n_periods: int): - period = TimePeriod.from_pandas(start_period) + if not isinstance(start_period, TimePeriod): + period = TimePeriod.from_pandas(start_period) + else: + period = start_period delta = period.time_delta return cls.from_time_periods(period, period + delta * (n_periods-1)) @@ -543,7 +583,8 @@ def searchsorted(self, period: TimePeriod, side='left'): if side not in ('left', 'right'): raise ValueError(f'Invalid side {side}') assert period.time_delta == self._time_delta, (period, self._time_delta) - n_steps = TimeDelta(relativedelta(period._date, self._start_timestamp._date)) // self._time_delta + n_steps = self._time_delta.n_periods(self._start_timestamp, period.start_timestamp) + # n_steps = TimeDelta(relativedelta(period._date, self._start_timestamp._date)) // self._time_delta if side == 'right': n_steps += 1 n_steps = min(max(0, n_steps), len(self)) # if period is outside diff --git a/pytest.ini b/pytest.ini index e5ad5956..4f9797aa 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] #norecursedirs = tests/data_wrangling tests/external tests/spatio_temporal_data tests/ -norecursedirs = tests/data_wrangling tests/spatio_temporal_data tests/ +norecursedirs = tests/data_wrangling tests/spatio_temporal_data tests/ .mypy_cache ignore = ['tests/test_meteostat_wrapper'] log_cli = True diff --git a/scripts/clean_laos_data2.py b/scripts/clean_laos_data2.py index 74bf5dab..5c2271be 100644 --- a/scripts/clean_laos_data2.py +++ b/scripts/clean_laos_data2.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +from matplotlib import pyplot as plt from climate_health.datatypes import ClimateData, ClimateHealthTimeSeries, FullData, ClimateHealthData from climate_health.file_io.cleaners import laos_data @@ -9,26 +10,69 @@ from climate_health.time_period import TimePeriod filname = '/home/knut/Downloads/laodenguedata.csv' + + def parse_week(week): week, year = week.split() print(week, year) weekstr = string.Formatter().format_field(int(week[1:]), '02') return f'{year}W{weekstr}' + raw_df = pd.read_csv(filname) # raw_df['Location'] = raw_df['Organisation unit'] # Make a column for each unique value in the 'Data' column df = raw_df.pivot(index=['Period', 'Location'], columns='Data', values='Value') df = df.reset_index() -df['Period']= [parse_week(week) for week in df.Period] +df['Period'] = [parse_week(week) for week in df.Period] colnames = ['Climate-Rainfall', 'Climate-Temperature avg', - 'NCLE: 7. Dengue cases (any)', 'Location', 'Period'] + 'NCLE: 7. Dengue cases (any)', 'Location', 'Period'] true_colnames = ['rainfall', 'mean_temperature', 'disease_cases', 'location', 'time_period'] df.rename(columns={colname: true_colname for colname, true_colname in zip(colnames, true_colnames)}, inplace=True) df = df.sort_values(by=['time_period', 'location']) + + +def add_population_data(data): + laos_population = '''\ + Vientiane Capital: ~820,000 + Phongsali: ~177,000 + Louangnamtha: ~176,000 + Oudomxai: ~307,000 + Bokeo: ~205,000 + Louangphabang: ~431,000 + Houaphan: ~294,000 + Xainyabouli: ~381,000 + Xiangkhouang: ~252,000 + Vientiane: ~432,000 + Bolikhamxai: ~275,000 + Khammouan: ~415,000 + Savannakhet: ~939,000 + Salavan: ~396,000 + Xekong: ~120,000 + Champasak: ~694,000 + Attapu: ~153,000 + Xaisomboun: ~93,000''' + laos_population = {line.split(': ')[0].strip(): int(line.split(': ~')[1].replace(',', '')) for line in + laos_population.split('\n')} + data_dict = {name[3:]: data for name, data in data.items()} + for name in data_dict.keys(): + if name not in laos_population: + print(f'{name} not in population data', laos_population.keys()) + full_data = {name: FullData(d.time_period, d.rainfall, d.mean_temperature, d.disease_cases, + np.full(len(d), laos_population[name])) + for name, d in data_dict.items()} + return DataSet(full_data) + + if __name__ == '__main__': - dataset = DataSet.from_pandas(df, dataclass=ClimateHealthData, fill_missing=True) + dataset = DataSet.from_pandas( + df, + dataclass=ClimateHealthData, + fill_missing=True) + dataset = add_population_data(dataset) + dataset.to_csv('/home/knut/Data/ch_data/weekly_laos_data.csv') + if False: @@ -68,32 +112,7 @@ def get_laos_climate(climate_filename): data = DataSet(full_dict) data.to_csv('/home/knut/Downloads/laos_data.csv') - laos_population = '''\ - Vientiane Capital: ~820,000 - Phongsali: ~177,000 - Louangnamtha: ~176,000 - Oudomxai: ~307,000 - Bokeo: ~205,000 - Louangphabang: ~431,000 - Houaphan: ~294,000 - Xainyabouli: ~381,000 - Xiangkhouang: ~252,000 - Vientiane: ~432,000 - Bolikhamxai: ~275,000 - Khammouan: ~415,000 - Savannakhet: ~939,000 - Salavan: ~396,000 - Xekong: ~120,000 - Champasak: ~694,000 - Attapu: ~153,000 - Xaisomboun: ~93,000''' - laos_population = {line.split(': ')[0]: int(line.split(': ~')[1].replace(',', '')) for line in - laos_population.split('\n')} - data_dict = {name[3:]: data.data() for name, data in data.items()} - full_data = {name: FullData(d.time_period, d.rainfall, d.mean_temperature, d.disease_cases, - np.full(len(d), laos_population[name])) - for name, d in data_dict.items()} full_data = DataSet(full_data) full_data.to_csv('/home/knut/Data/laos_full_data.csv') # data = {name: FullData.combine(health.get_location(name).data(), spatio_temporal_dict.get_location(name).data(), laos_population[name]) - # for name in health.locations()} \ No newline at end of file + # for name in health.locations()} diff --git a/scripts/evaluate_laos.py b/scripts/evaluate_laos.py new file mode 100644 index 00000000..fd722947 --- /dev/null +++ b/scripts/evaluate_laos.py @@ -0,0 +1,12 @@ +from climate_health.assessment.prediction_evaluator import evaluate_model +from climate_health.datatypes import FullData +from climate_health.external.external_model import get_model_from_directory_or_github_url +from climate_health.predictor.naive_estimator import NaiveEstimator +from climate_health.spatio_temporal_data.temporal_dataclass import DataSet +model_url = '/home/knut/Sources/chap_auto_ewars_weekly' +#model_url = 'https://github.com/sandvelab/chap_auto_ewars' +model = get_model_from_directory_or_github_url(model_url) +# model = NaiveEstimator() +dataset = DataSet.from_csv('/home/knut/Data/ch_data/weekly_laos_data.csv', FullData) +if __name__ == '__main__': + evaluate_model(model, dataset, prediction_length=12, n_test_sets=41, report_filename='laos_weekly_report.pdf') \ No newline at end of file diff --git a/scripts/prediction_example.py b/scripts/prediction_example.py index 024580dc..7af2f405 100644 --- a/scripts/prediction_example.py +++ b/scripts/prediction_example.py @@ -1,15 +1,24 @@ +import databricks.sdk.service.sql +import numpy as np + from climate_health.assessment.forecast import forecast_ahead, forecast_with_predicted_weather from climate_health.assessment.prediction_evaluator import plot_predictions from climate_health.data.datasets import ISIMIP_dengue_harmonized +from climate_health.datatypes import FullData, HealthData from climate_health.external.external_model import get_model_from_directory_or_github_url from climate_health.external.r_models import models_path +from climate_health.spatio_temporal_data.temporal_dataclass import DataSet if __name__ == '__main__': - model_name = 'https://github.com/sandvelab/chap_auto_ewars' + #model_name = 'https://github.com/sandvelab/chap_auto_ewars' + model_url = '/home/knut/Sources/chap_auto_ewars_weekly' #model_name = models_path / 'naive_python_model_with_mlproject_file' - estimator = get_model_from_directory_or_github_url(model_name) - dataset = ISIMIP_dengue_harmonized['vietnam'] + estimator = get_model_from_directory_or_github_url(model_url) + #dataset = ISIMIP_dengue_harmonized['vietnam'] + dataset = DataSet.from_csv('/home/knut/Data/ch_data/weekly_laos_data.csv', FullData) predictor = estimator.train(dataset) - predictions = forecast_with_predicted_weather(predictor, dataset, 3) + predictions = forecast_with_predicted_weather(predictor, dataset, 26) plot_predictions(predictions, dataset, 'prediction_example.pdf') + medians = DataSet({loc: HealthData(data.time_period, np.median(data.samples, axis=-1)) for loc, data in predictions.items()}) + medians.to_csv('laos_weekly_predictions.csv') diff --git a/tests/test_climate_predictor.py b/tests/test_climate_predictor.py index fbaa645c..ff520367 100644 --- a/tests/test_climate_predictor.py +++ b/tests/test_climate_predictor.py @@ -1,10 +1,11 @@ import numpy as np import pytest -from climate_health.climate_predictor import MonthlyClimatePredictor +from climate_health.climate_predictor import MonthlyClimatePredictor, WeeklyClimatePredictor from climate_health.datatypes import ClimateData from climate_health.spatio_temporal_data.temporal_dataclass import DataSet -from climate_health.time_period import PeriodRange, Month +from climate_health.time_period import PeriodRange, Month, Week + @pytest.fixture @@ -15,6 +16,14 @@ def climate_data(): {'oslo': ClimateData(time_period, values, values * 2, values * 3), 'stockholm': ClimateData(time_period, values, values * 2, values * 3)}) +@pytest.fixture +def weekly_climate_data(): + time_period = PeriodRange.from_time_periods(Week(2019, 1), Week(2020, 52)) + values = np.arange(len(time_period)) + return DataSet( + {'oslo': ClimateData(time_period, values, values * 2, values * 3), + 'stockholm': ClimateData(time_period, values, values * 2, values * 3)}) + def test_climate_predictor(climate_data): predictor = MonthlyClimatePredictor() @@ -22,4 +31,8 @@ def test_climate_predictor(climate_data): time_period = PeriodRange.from_time_periods(Month.parse('2021-01'), Month.parse('2021-12')) prediction = predictor.predict(time_period) - +def test_weekly_climate_predictor(weekly_climate_data): + predictor = WeeklyClimatePredictor() + predictor.train(weekly_climate_data) + time_period = PeriodRange.from_time_periods(Week(2021, 1), Week(2021, 52)) + prediction = predictor.predict(time_period) diff --git a/tests/test_forecast.py b/tests/test_forecast.py index 5d1cc213..b117f51c 100644 --- a/tests/test_forecast.py +++ b/tests/test_forecast.py @@ -33,6 +33,13 @@ def test_multi_forecast(): def test_forecast_ahead(): model = NaiveEstimator() - dataset = dataset = ISIMIP_dengue_harmonized['vietnam'] + dataset = ISIMIP_dengue_harmonized['vietnam'] + prediction_length = 3 + forecast_ahead(model, dataset, prediction_length) + + +def test_forecast_with_predicted_weather(): + model = NaiveEstimator() + dataset = ISIMIP_dengue_harmonized['vietnam'] prediction_length = 3 forecast_ahead(model, dataset, prediction_length) diff --git a/tests/time_period/test_dateutil_wrapper.py b/tests/time_period/test_dateutil_wrapper.py index 61417b98..301fac19 100644 --- a/tests/time_period/test_dateutil_wrapper.py +++ b/tests/time_period/test_dateutil_wrapper.py @@ -39,7 +39,7 @@ def test_init_week_with_numbers(): week = Week(2023, 2) assert isinstance(week, Week) assert week.start_timestamp == TimeStamp.parse('2023-01-09') - assert week.topandas() == '2023W2' # pd.Period('2023-01-09', freq='W-MON') + assert week.to_string() == '2023W2' # pd.Period('2023-01-09', freq='W-MON') def test_parse(period1):