Skip to content

Commit

Permalink
ts forecasting add strategy, data leak refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
v1docq committed Apr 15, 2024
1 parent f962d6f commit b67c687
Show file tree
Hide file tree
Showing 14 changed files with 180 additions and 117 deletions.
Original file line number Diff line number Diff line change
@@ -1,36 +1,8 @@
import os
import pandas as pd

from fedot_ind.api.utils.path_lib import PROJECT_PATH
from fedot_ind.tools.example_utils import read_results, create_comprasion_df

forecast_result_path = PROJECT_PATH + '/examples/automl_example/api_example/time_series/ts_forecasting/forecasts/'


def read_results(forecast_result_path):
    """Collect per-dataset forecast and metric CSVs from a results directory.

    Args:
        forecast_result_path: directory holding ``<dataset>_forecast.csv``
            and ``<dataset>_metrics.csv`` files.

    Returns:
        Tuple ``(df_forecast, df_metrics)`` of two lists of DataFrames; each
        frame is tagged with a ``dataset_name`` column derived from the
        prefix of its file name (text before the first underscore).
    """
    df_forecast = []
    df_metrics = []
    for file in os.listdir(forecast_result_path):
        df = pd.read_csv(os.path.join(forecast_result_path, file))
        # e.g. 'D1002_forecast.csv' -> dataset name 'D1002'
        df['dataset_name'] = file.split('_')[0]
        # idiomatic membership test instead of file.__contains__('forecast')
        if 'forecast' in file:
            df_forecast.append(df)
        else:
            df_metrics.append(df)
    return df_forecast, df_metrics


def create_comprasion_df(df, metric: str = 'rmse'):
    """Build a win/loss comparison table for one metric across datasets.

    Args:
        df: list of per-dataset metric DataFrames (as produced by
            ``read_results``); each has the metric name in an
            'Unnamed: 0' column.
        metric: metric row to keep, e.g. 'rmse' or 'smape'.

    Returns:
        DataFrame with a 'Difference_industrial' column (competitor result
        minus the 'industrial' result) and an 'industrial_Wins' verdict of
        'Win'/'Loose' per dataset.
    """
    df_full = pd.concat(df)
    # keep only the rows for the requested metric, then drop the label column
    df_full = df_full[df_full['Unnamed: 0'] == metric]
    df_full = df_full.drop('Unnamed: 0', axis=1)
    # NOTE(review): iloc[:, 1:2] spans a single column, so min(axis=1) is just
    # that column — confirm whether a wider competitor slice was intended.
    df_full['Difference_industrial'] = (df_full.iloc[:, 1:2].min(axis=1) - df_full['industrial'])
    # column-wise map instead of a per-row DataFrame.apply with axis=1
    df_full['industrial_Wins'] = df_full['Difference_industrial'].map(
        lambda diff: 'Win' if diff > 0 else 'Loose')
    return df_full


if __name__ == "__main__":
for metric in ['rmse', 'smape']:
df_forecast, df_metrics = read_results(forecast_result_path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,57 +5,61 @@

from fedot_ind.api.utils.path_lib import PROJECT_PATH
from fedot_ind.core.repository.constanst_repository import M4_FORECASTING_BENCH, M4_FORECASTING_LENGTH
from fedot_ind.tools.example_utils import industrial_forecasting_modelling_loop, compare_forecast_with_sota
from fedot_ind.tools.example_utils import industrial_forecasting_modelling_loop, compare_forecast_with_sota, \
read_results, create_comprasion_df

forecast_col = ['industrial', 'target', 'AG', 'NBEATS']
metric_col = ['industrial', 'AG', 'NBEATS']
benchmark = 'M4'
finetune = False

forecast_result_path = os.listdir(PROJECT_PATH +
'/examples/automl_example/api_example/time_series/ts_forecasting/forecasts/')
forecast_result_path = set([x.split('_')[0] for x in forecast_result_path])

df_forecast, df_metrics = read_results(PROJECT_PATH +
'/examples/automl_example/api_example/time_series/ts_forecasting/forecasts/')
df_comprasion = create_comprasion_df(df_metrics, 'rmse')

if __name__ == "__main__":
forecast_result_path = os.listdir(PROJECT_PATH +
'/examples/automl_example/api_example/time_series/ts_forecasting/forecasts/')
forecast_result_path = set([x.split('_')[0] for x in forecast_result_path])
forecast_col = ['industrial', 'target', 'AG', 'NBEATS']
metric_col = ['industrial', 'AG', 'NBEATS']
benchmark = 'M4'
finetune = False
initial_assumption = PipelineBuilder().add_node('eigen_basis',
params={'low_rank_approximation': False,
'rank_regularization': 'explained_dispersion'}).add_node(
'ar')
industrial_loss = df_comprasion[df_comprasion['industrial_Wins'] == 'Loose']['dataset_name'].values.tolist()

api_config = dict(problem='ts_forecasting',
metric='rmse',
timeout=5,
timeout=15,
with_tuning=False,
initial_assumption=initial_assumption,
industrial_strategy='forecasting_assumptions',
n_jobs=2,
logging_level=30)

for dataset_name in M4_FORECASTING_BENCH:
if dataset_name in forecast_result_path:
print('Already evaluated')
else:
try:
horizon = M4_FORECASTING_LENGTH[dataset_name[0]]
api_config.update(task_params={'forecast_length': horizon})
n_beats_forecast, n_beats_metrics, \
autogluon_forecast, autogluon_metrics = compare_forecast_with_sota(dataset_name=dataset_name,
horizon=horizon)
model, labels, metrics, target = industrial_forecasting_modelling_loop(dataset_name=dataset_name,
benchmark=benchmark,
horizon=horizon,
api_config=api_config,
finetune=finetune)

forecast = pd.DataFrame([labels,
target,
n_beats_forecast,
autogluon_forecast]).T
forecast.columns = forecast_col

metrics_comprasion = pd.concat([metrics,
autogluon_metrics,
n_beats_metrics]).T
metrics_comprasion.columns = metric_col
if dataset_name in industrial_loss:
print('Already evaluated, but with bad metrics')
horizon = M4_FORECASTING_LENGTH[dataset_name[0]]
api_config.update(task_params={'forecast_length': horizon})
api_config.update(output_folder=os.path.join(PROJECT_PATH, 'results_of_experiments',dataset_name))
n_beats_forecast, n_beats_metrics, \
autogluon_forecast, autogluon_metrics = compare_forecast_with_sota(dataset_name=dataset_name,
horizon=horizon)
model, labels, metrics, target = industrial_forecasting_modelling_loop(dataset_name=dataset_name,
benchmark=benchmark,
horizon=horizon,
api_config=api_config,
finetune=finetune)

forecast = pd.DataFrame([labels, target, n_beats_forecast, autogluon_forecast]).T
forecast.columns = forecast_col

metrics_comprasion = pd.concat([metrics, autogluon_metrics, n_beats_metrics]).T
metrics_comprasion.columns = metric_col

model.save_best_model()
model.save_optimization_history()

if metrics_comprasion.T[metrics_comprasion.T['rmse']
== metrics_comprasion.T.min(axis=0).values[0]].index[0] == 'industrial':
forecast.to_csv(f'./{dataset_name}_forecast.csv')
metrics_comprasion.to_csv(f'./{dataset_name}_metrics.csv')

except Exception as ex:
print(f'Skip {dataset_name}. Reason - {ex}')
elif dataset_name in forecast_result_path:
print('Already evaluated')
3 changes: 1 addition & 2 deletions fedot_ind/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,7 @@ def load(self, path):

def save_optimization_history(self, **kwargs):
    """Save the solver's optimization history to the output folder.

    Writes ``optimization_history.json`` under ``self.output_folder``.
    (Previous docstring — "Plot prediction of the model" — was a
    copy-paste error; this method saves history, it does not plot.)
    """
    self.solver.history.save(f"{self.output_folder}/optimization_history.json")

def save_best_model(self):
if self.condition_check.solver_is_fedot_class(self.solver):
Expand Down
10 changes: 6 additions & 4 deletions fedot_ind/api/utils/checkers_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.preprocessing import LabelEncoder

from fedot_ind.api.utils.data import check_multivariate_data
from fedot_ind.core.architecture.preprocessing.data_convertor import NumpyConverter
from fedot_ind.core.architecture.preprocessing.data_convertor import NumpyConverter, DataConverter
from fedot_ind.core.architecture.settings.computational import backend_methods as np
from fedot_ind.core.repository.constanst_repository import FEDOT_TASK

Expand All @@ -34,6 +34,7 @@ def __init__(self,
task_params=None):
self.logger = logging.getLogger(self.__class__.__name__)
self.input_data = input_data
self.data_convertor = DataConverter(data=self.input_data)
self.task = task
self.task_params = task_params
self.task_dict = FEDOT_TASK
Expand Down Expand Up @@ -71,7 +72,8 @@ def _init_input_data(self) -> None:
"""
is_multivariate_data = False
if isinstance(self.input_data, tuple):

if self.data_convertor.is_tuple:
X, y = self.input_data[0], self.input_data[1]
features, is_multivariate_data, target = self.__check_features_and_target(X, y)

Expand All @@ -81,15 +83,15 @@ def _init_input_data(self) -> None:
target = self.label_encoder.fit_transform(target)
else:
self.label_encoder = self.label_encoder

if is_multivariate_data:
self.input_data = InputData(idx=np.arange(len(X)),
features=features,
target=target,
task=self.task_dict[self.task],
data_type=DataTypesEnum.image)
elif self.task == 'ts_forecasting':
if type(self.input_data) is pd.DataFrame:
features_array = np.array(self.input_data.values)
features_array = self.data_convertor.convert_to_1d_array()
task = Task(TaskTypesEnum.ts_forecasting,
TsForecastingParams(forecast_length=self.task_params['forecast_length']))
features_array = features_array[:-self.task_params['forecast_length']]
Expand Down
28 changes: 24 additions & 4 deletions fedot_ind/api/utils/industrial_strategy.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
from copy import deepcopy

from fedot import Fedot

from fedot_ind.api.utils.checkers_collections import DataCheck
from fedot_ind.core.ensemble.kernel_ensemble import KernelEnsembler
from fedot_ind.core.ensemble.random_automl_forest import RAFensembler
from fedot_ind.core.repository.constanst_repository import BATCH_SIZE_FOR_FEDOT_WORKER, FEDOT_WORKER_NUM, \
FEDOT_WORKER_TIMEOUT_PARTITION, FEDOT_TUNING_METRICS, FEDOT_TUNER_STRATEGY
FEDOT_WORKER_TIMEOUT_PARTITION, FEDOT_TUNING_METRICS, FEDOT_TUNER_STRATEGY, FEDOT_TS_FORECASTING_ASSUMPTIONS

import numpy as np

Expand All @@ -22,9 +24,11 @@ def __init__(self, industrial_strategy_params,
self.industrial_strategy_params = industrial_strategy_params
self.industrial_strategy = industrial_strategy
self.industrial_strategy_fit = {'federated_automl': self._federated_strategy,
'kernel_automl': self._kernel_strategy}
'kernel_automl': self._kernel_strategy,
'forecasting_assumptions': self._forecasting_strategy}
self.industrial_strategy_predict = {'federated_automl': self._federated_predict,
'kernel_automl': self._kernel_predict}
'kernel_automl': self._kernel_predict,
'forecasting_assumptions': self._forecasting_predict}
self.config_dict = api_config
self.logger = logger
self.repo = IndustrialModels().setup_repository()
Expand All @@ -48,6 +52,16 @@ def _federated_strategy(self, input_data):
batch_size=batch_size)
self.logger.info(f'Number of AutoMl models in ensemble - {self.solver.n_splits}')

def _forecasting_strategy(self, input_data):
    """Fit one AutoML model per predefined TS-forecasting assumption.

    The configured timeout is split evenly across the candidate initial
    assumptions; the fitted models are stored in ``self.solver`` keyed by
    assumption name.
    """
    self.logger.info('TS forecasting algorithm was applied')
    # share the overall timeout budget equally between the assumptions
    self.config_dict['timeout'] = round(self.config_dict['timeout'] / 3)
    fitted_models = {}
    for name, assumption in FEDOT_TS_FORECASTING_ASSUMPTIONS.items():
        self.config_dict['initial_assumption'] = assumption.build()
        automl_model = Fedot(**self.config_dict)
        automl_model.fit(input_data)
        fitted_models[name] = automl_model
    self.solver = fitted_models

def _finetune_loop(self,
kernel_ensemble: dict,
kernel_data: dict,
Expand Down Expand Up @@ -94,8 +108,14 @@ def _federated_predict(self,
else:
return np.argmax(head_predict, axis=1)

def _forecasting_predict(self,
                         input_data,
                         mode: str = 'labels'):
    """Return a forecast from every fitted model, keyed by assumption name."""
    forecasts = {}
    for model_name, automl_model in self.solver.items():
        forecasts[model_name] = automl_model.predict(input_data, mode)
    return forecasts

def _kernel_predict(self,
input_data,
mode: str = 'labels'):
labels_dict = {k: v.predict(input_data, mode) for k, v in self.solver.items()}
labels_dict = {k: v.predict(input_data, mode).predict for k, v in self.solver.items()}
return labels_dict
25 changes: 23 additions & 2 deletions fedot_ind/core/operation/interfaces/industrial_model_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
(IndustrialCustomPreprocessingStrategy, MultiDimPreprocessingStrategy)
from fedot_ind.core.repository.model_repository import FORECASTING_MODELS, NEURAL_MODEL, SKLEARN_CLF_MODELS, \
SKLEARN_REG_MODELS
from scipy.stats import kurtosis
from scipy.stats import skew


class FedotNNClassificationStrategy(EvaluationStrategy):
Expand Down Expand Up @@ -130,6 +132,24 @@ def __init__(self, operation_type: str, params: Optional[OperationParameters] =
self.multi_dim_dispatcher.concat_func = np.vstack
self.ensemble_func = np.sum

def fit(self, train_data: InputData):
    """Fit the forecasting operation via the multi-dim dispatcher.

    For the 'glm' operation, a distribution family and link function are
    first chosen from the kurtosis/skewness of the training features and
    passed to the dispatcher as fit params.

    Args:
        train_data: training InputData; ``features`` is the raw series.

    Returns:
        Whatever ``multi_dim_dispatcher.fit`` returns for the converted data.
    """
    if self.operation_type == 'glm':
        # moments of the training features drive the family heuristic
        mean_kurtosis = kurtosis(train_data.features)
        mean_skew = skew(train_data.features)
        # NOTE(review): thresholds (kurtosis vs 3, skew sign) look like a
        # light-tailed/heavy-tailed split — confirm the intended statistics.
        if mean_kurtosis < 3 and mean_skew > 0:
            family = 'gamma'
            link = 'log'
        elif mean_kurtosis > 3 and mean_skew != 0:
            family = "inverse_gaussian"
            link = 'inverse_power'
        else:
            # symmetric / default case
            family = 'gaussian'
            link = 'identity'
        self.multi_dim_dispatcher.params_for_fit = {'family': family,
                                                    'link': link}
    train_data = self.multi_dim_dispatcher._convert_input_data(train_data)
    return self.multi_dim_dispatcher.fit(train_data)

def predict(self, trained_operation, predict_data: InputData, output_mode: str = 'labels') -> OutputData:
predict_data = self.multi_dim_dispatcher._convert_input_data(
predict_data, mode=self.multi_dim_dispatcher.mode)
Expand All @@ -140,8 +160,9 @@ def predict(self, trained_operation, predict_data: InputData, output_mode: str =
def predict_for_fit(self, trained_operation, predict_data: InputData, output_mode: str = 'labels') -> OutputData:
predict_data = self.multi_dim_dispatcher._convert_input_data(
predict_data, mode=self.multi_dim_dispatcher.mode)
predict_output = self.multi_dim_dispatcher.predict_for_fit(trained_operation, predict_data, output_mode='labels')
predict_output.predict = self.ensemble_func (predict_output.predict, axis=0)
predict_output = self.multi_dim_dispatcher.predict_for_fit(trained_operation, predict_data,
output_mode='labels')
predict_output.predict = self.ensemble_func(predict_output.predict, axis=0)
return predict_output


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,15 @@ def fit_one_sample(self, operation_implementation, train_data: InputData):
else:
sig = signature(operation_implementation.fit).parameters
if len(sig) > 1:
operation_implementation.fit(
train_data.features, train_data.target)
operation_implementation.fit(train_data.features, train_data.target)
else:
operation_implementation.fit(train_data)
return operation_implementation

def fit(self, train_data: InputData):
# init operation_impl model abstraction
try:
operation_implementation = self.operation_impl(
**self.params_for_fit.to_dict())
operation_implementation = self.operation_impl(**self.params_for_fit.to_dict())
except Exception:
operation_implementation = self.operation_impl(self.params_for_fit)
# Create model and data condition checker
Expand Down Expand Up @@ -338,8 +336,6 @@ def predict(self, trained_operation,
predict_data)
predict_output = self.multi_dim_dispatcher.predict(trained_operation, converted_predict_data,
output_mode=output_mode)
predict_output.predict = np.reshape(predict_output.predict, (len(trained_operation), -1))
predict_output.predict = self.ensemble_func(predict_output.predict, axis=0)
return predict_output

def predict_for_fit(self, trained_operation,
Expand All @@ -350,8 +346,6 @@ def predict_for_fit(self, trained_operation,
predict_output = self.multi_dim_dispatcher.predict_for_fit(trained_operation,
converted_predict_data,
output_mode=output_mode)
predict_output.predict = np.reshape(predict_output.predict, (len(trained_operation), -1))
predict_output.predict = self.ensemble_func(predict_output.predict, axis=0)
return predict_output


Expand Down
9 changes: 9 additions & 0 deletions fedot_ind/core/repository/constanst_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,14 @@ class FedotOperationConstant(Enum):
'ts_forecasting': PipelineBuilder().add_node('ar')
}

# Candidate initial pipelines tried by the 'forecasting_assumptions'
# industrial strategy: one AutoML run is started per entry, keyed by name.
FEDOT_TS_FORECASTING_ASSUMPTIONS = {
    'lagged_ridge': PipelineBuilder().add_node('lagged').add_node('ridge'),
    'eigen_ar': PipelineBuilder().add_node('eigen_basis',
                                           params={'low_rank_approximation': False,
                                                   'rank_regularization': 'explained_dispersion'}).add_node('ar'),
    'glm': PipelineBuilder().add_node('glm')
}

FEDOT_ENSEMBLE_ASSUMPTIONS = {
'classification': PipelineBuilder().add_node('logit'),
'regression': PipelineBuilder().add_node('treg')
Expand Down Expand Up @@ -626,6 +634,7 @@ class BenchmarkDatasets(Enum):
FEDOT_API_PARAMS = FedotOperationConstant.FEDOT_API_PARAMS.value
FEDOT_ENSEMBLE_ASSUMPTIONS = FedotOperationConstant.FEDOT_ENSEMBLE_ASSUMPTIONS.value
FEDOT_TUNER_STRATEGY = FedotOperationConstant.FEDOT_TUNER_STRATEGY.value
FEDOT_TS_FORECASTING_ASSUMPTIONS = FedotOperationConstant.FEDOT_TS_FORECASTING_ASSUMPTIONS.value

CPU_NUMBERS = ComputationalConstant.CPU_NUMBERS.value
BATCH_SIZE_FOR_FEDOT_WORKER = ComputationalConstant.BATCH_SIZE_FOR_FEDOT_WORKER.value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,9 @@ def transform_lagged_for_fit(self, input_data: InputData) -> OutputData:
input_data.features = input_data.features.squeeze()
new_input_data = copy(input_data)
forecast_length = new_input_data.task.task_params.forecast_length

# Correct window size parameter
self._check_and_correct_window_size(
new_input_data.features, forecast_length)
window_size = self.window_size
self._check_and_correct_window_size(new_input_data.features, forecast_length)
window_size = 3*forecast_length
new_idx, transformed_cols, new_target = transform_features_and_target_into_lagged(
input_data,
forecast_length,
Expand Down
Loading

0 comments on commit b67c687

Please sign in to comment.