From 918126616552a4ea4ade5c06bd28ae142a04b87e Mon Sep 17 00:00:00 2001
From: v1docq
Date: Tue, 8 Aug 2023 12:58:17 +0300
Subject: [PATCH] refactor experiments

---
 fedot_ind/api/exper.py                        |   4 +-
 fedot_ind/api/multi_ts_example.py             | 134 ++++++++++++------
 fedot_ind/api/rank_experiment.py              |   4 +-
 .../transformation/DataTransformer.py         |  25 ++--
 4 files changed, 103 insertions(+), 64 deletions(-)

diff --git a/fedot_ind/api/exper.py b/fedot_ind/api/exper.py
index bc8a5bf98..add74fabd 100644
--- a/fedot_ind/api/exper.py
+++ b/fedot_ind/api/exper.py
@@ -58,8 +58,8 @@
                           # 'wavelet_basis',
                           'data_driven_basis'
                       ],
-                      tuning_iterations=30,
-                      tuning_timeout=15.0,
+                      tuning_iterations=10,
+                      tuning_timeout=10.0,
                       use_cache=False,
                       timeout=10,
                       n_jobs=6,
diff --git a/fedot_ind/api/multi_ts_example.py b/fedot_ind/api/multi_ts_example.py
index 721f9354f..4ed3e1087 100644
--- a/fedot_ind/api/multi_ts_example.py
+++ b/fedot_ind/api/multi_ts_example.py
@@ -12,8 +12,8 @@
 from matplotlib import pyplot as plt
 
 from fedot_ind.core.architecture.preprocessing.DatasetLoader import DataLoader
-from fedot_ind.core.models.signal.RecurrenceExtractor import RecurrenceExtractor
-from fedot_ind.core.models.statistical.StatsExtractor import StatsExtractor
+from fedot_ind.core.models.recurrence.RecurrenceExtractor import RecurrenceExtractor
+from fedot_ind.core.models.quantile.quantile_extractor import QuantileExtractor
 from fedot_ind.core.operation.transformation.basis.data_driven import DataDrivenBasisImplementation
 from fedot_ind.core.operation.transformation.basis.fourier import FourierBasisImplementation
 from sklearn.decomposition import PCA
@@ -33,7 +33,10 @@ def init_input(X, y):
     return input_data
 
 
-def prepare_features(dataset_name, pca_n_components: float = 0.9):
+def prepare_features(dataset_name,
+                     pca_n_components: float = 0.95,
+                     feature_generator: list = ['statistical'],
+                     reduce_dimension: bool = False):
     train_data, test_data = DataLoader(dataset_name=dataset_name).load_data()
 
     train_target = np.array([float(i) for i in train_data[1]])
@@ -41,28 +44,34 @@ def prepare_features(dataset_name,
     train_input_data = init_input(train_data[0], train_target)
     test_input_data = init_input(test_data[0], test_target)
 
-    # extractor = StatsExtractor({'window_mode': False,
-    #                             'window_size': 10,
-    #                             'var_threshold': 0})
-    extractor = RecurrenceExtractor({'window_mode': False,
-                                     'min_signal_ratio': 0.3,
-                                     'max_signal_ratio': 0.8,
-                                     'rec_metric': 'euclidean'
-                                     })
-
-    pca = PCA(n_components=pca_n_components,
-              svd_solver='full')
-
-    extracted_features_train = extractor.transform(train_input_data)
-    train_size = extracted_features_train.features.shape
-    train_features = extracted_features_train.features.reshape(train_size[0], train_size[1] * train_size[2])
-    train_features = pca.fit_transform(train_features)
-
-    extracted_features_test = extractor.transform(test_input_data)
-    test_size = extracted_features_test.features.shape
-    test_features = extracted_features_test.features.reshape(test_size[0], test_size[1] * test_size[2])
-    test_features = pca.transform(test_features)
-    return train_features, train_target, test_features, test_target
+    generator_dict = {'statistical': QuantileExtractor({'window_mode': False,
+                                                        'window_size': 10,
+                                                        'var_threshold': 0}),
+                      'recurrence': RecurrenceExtractor({'window_mode': True,
+                                                         'min_signal_ratio': 0.7,
+                                                         'max_signal_ratio': 0.9,
+                                                         'rec_metric': 'cosine'
+                                                         })
+                      }
+    train_features_list, test_features_list = [], []
+    for extractor in feature_generator:
+        extractor = generator_dict[extractor]
+        extracted_features_train = extractor.transform(train_input_data)
+        train_size = extracted_features_train.predict.shape
+        train_features = extracted_features_train.predict
+
+        extracted_features_test = extractor.transform(test_input_data)
+        test_size = extracted_features_test.predict.shape
+        test_features = extracted_features_test.predict
+
+        if reduce_dimension:
+            pca = PCA(n_components=pca_n_components,
+                      svd_solver='full')
+            train_features = pca.fit_transform(train_features)
+            test_features = pca.transform(test_features)
+        train_features_list.append(train_features), test_features_list.append(test_features)
+
+    return train_features_list, train_target, test_features_list, test_target
 
 
 def calculate_metric(test_target, labels):
@@ -81,17 +90,17 @@ def calculate_metric(test_target, labels):
     return df
 
 
-def evaluate_baseline(train_features, train_target, test_features, test_target):
+def evaluate_baseline(train, train_target, test, test_target):
     node_scaling = PipelineNode('scaling')
     node_rfr = PipelineNode('lasso', nodes_from=[node_scaling])
     baseline_model = Pipeline(node_rfr)
-    input_fit = InputData(idx=np.arange(len(train_features)),
-                          features=train_features,
+    input_fit = InputData(idx=np.arange(len(train)),
+                          features=train,
                           target=train_target.reshape(-1, 1),
                           task=Task(TaskTypesEnum.regression),
                           data_type=DataTypesEnum.image)
-    input_predict = InputData(idx=np.arange(len(test_features)),
-                              features=test_features,
+    input_predict = InputData(idx=np.arange(len(test)),
+                              features=test,
                               target=test_target.reshape(-1, 1),
                               task=Task(TaskTypesEnum.regression),
                               data_type=DataTypesEnum.image)
@@ -111,21 +120,34 @@ def evaluate_baseline(train_features, train_target, test_features, test_target):
 
 if __name__ == "__main__":
     dataset_list = [
-        'AppliancesEnergy',
+        # 'Gazprom',
+        # 'AppliancesEnergy',
         # 'AustraliaRainfall',
-        # 'BeijingPM10Quality',
-        # 'BeijingPM25Quality',
-        # 'BenzeneConcentration',
+        # 'BeijingPM10Quality',
+        # 'BeijingPM25Quality',
+        # 'BenzeneConcentration',
         # 'HouseholdPowerConsumption1',
-        'HouseholdPowerConsumption2',
-        # 'IEEEPPG',
-        # 'FloodModeling1',
-        # 'FloodModeling2',
-        # 'FloodModeling3'
-        # 'LiveFuelMoistureContent',
+        # 'HouseholdPowerConsumption2',
+        # 'IEEEPPG',
+        # 'FloodModeling1',
+        'FloodModeling2',
+        'FloodModeling3',
+        'LiveFuelMoistureContent',
         'BIDMC32HR',
         'BIDMC32RR',
-        'BIDMC32SpO2'
+        'BIDMC32SpO2',
+        'DailyOilGasPrices',
+        'ElectricityPredictor',
+        'OccupancyDetectionLight',
+        'SolarRadiationAndalusia',
+        'TetuanEnergyConsumption',
+        'WindTurbinePower',
+        'ElectricMotorTemperature',
+        'LPGasMonitoringHomeActivity',
+        'GasSensorArrayAcetone',
+        'GasSensorArrayEthanol',
+        'WaveTensionData'
+
     ]
     ten_minutes = range(0, 3, 1)
     one_hour = ['1hr']
         except Exception:
             _ = 1
 
-        train_features, train_target, test_features, test_target = prepare_features(dataset_name=dataset_name)
-        metric_df_baseline = evaluate_baseline(train_features, train_target, test_features, test_target)
+        train_features, train_target, test_features, test_target = prepare_features(dataset_name=dataset_name,
+                                                                                    reduce_dimension=False,
+                                                                                    feature_generator=[
+                                                                                        'statistical'
+                                                                                        # , 'recurrence'
+                                                                                    ])
+        if len(train_features) > 1:
+            concatenate_train = np.concatenate(train_features, axis=1)
+            concatenate_test = np.concatenate(test_features, axis=1)
+            train_features.append(concatenate_train)
+            test_features.append(concatenate_test)
+        else:
+            concatenate_train = train_features[0]
+            concatenate_test = test_features[0]
+
+        for train, test in zip(train_features, test_features):
+            metric_df_baseline = evaluate_baseline(train, train_target, test, test_target)
+            print(metric_df_baseline)
         metric_df_baseline.to_csv(f'./{dataset_name}/baseline_metrics.csv')
-        for run in ten_minutes:
+        for run in one_hour:
             predictor = Fedot(problem='regression',
-                              timeout=10,
                               metric='rmse',
+                              timeout=60,
+                              early_stopping_timeout=30,
+                              logging_level=20,
                               n_jobs=6)
-            model = predictor.fit(features=train_features, target=train_target)
-            labels = predictor.predict(features=test_features)
+            model = predictor.fit(features=concatenate_train, target=train_target)
+            labels = predictor.predict(features=concatenate_test)
             metric_df = calculate_metric(test_target, labels)
             metric_df.to_csv(f'./{dataset_name}/metrics_run_{run}.csv')
             pipeline = predictor.current_pipeline
diff --git a/fedot_ind/api/rank_experiment.py b/fedot_ind/api/rank_experiment.py
index 1f975d79e..f7d5b67fa 100644
--- a/fedot_ind/api/rank_experiment.py
+++ b/fedot_ind/api/rank_experiment.py
@@ -6,7 +6,7 @@
 from sklearn.metrics import f1_score, roc_auc_score
 from fedot_ind.api.main import FedotIndustrial
 from fedot_ind.core.architecture.preprocessing.DatasetLoader import DataLoader
-from fedot_ind.core.models.statistical.StatsExtractor import StatsExtractor
+from fedot_ind.core.models.quantile.quantile_extractor import QuantileExtractor
 from fedot_ind.core.operation.transformation.basis.data_driven import DataDrivenBasisImplementation
 from sklearn.neural_network import MLPClassifier
 from sklearn.preprocessing import StandardScaler
@@ -115,7 +115,7 @@ def evaluate_model(feature_train, bss, test_data, model_type: str = 'MLP'):
         # 'DistalPhalanxOutlineCorrect'
     ]
 
-    stats_model = StatsExtractor({'window_mode': False, 'window_size': 5, 'use_cache': False, 'n_jobs': 4})
+    stats_model = QuantileExtractor({'window_mode': False, 'window_size': 5, 'use_cache': False, 'n_jobs': 4})
     for group in [
         datasets_bad_f1,
         datasets_good_f1,
diff --git a/fedot_ind/core/operation/transformation/DataTransformer.py b/fedot_ind/core/operation/transformation/DataTransformer.py
index 5058b50db..089fc010f 100644
--- a/fedot_ind/core/operation/transformation/DataTransformer.py
+++ b/fedot_ind/core/operation/transformation/DataTransformer.py
@@ -191,24 +191,23 @@ class TSTransformer:
     def __init__(self, time_series, min_signal_ratio, max_signal_ratio, rec_metric):
         self.time_series = time_series
         self.recurrence_matrix = None
-        self.threshold_baseline = [1, 5, 10, 15, 20, 25, 30]
+        self.threshold_baseline = [0.95, 0.7]  # similarity thresholds for the cosine metric
         self.min_signal_ratio = min_signal_ratio
         self.max_signal_ratio = max_signal_ratio
         self.rec_metric = rec_metric
 
     def ts_to_recurrence_matrix(self,
-                                eps=0.10,
-                                steps=None):
-        distance_matrix = pdist(metric=self.rec_metric, X=self.time_series[:, None])
-        distance_matrix = np.floor(distance_matrix / eps)
-        distance_matrix, steps = self.binarization(distance_matrix, threshold=steps)
-        distance_matrix[distance_matrix > steps] = steps
+                                threshold=None):
+        distance_matrix = pdist(metric=self.rec_metric, X=self.time_series.T)
+        distance_matrix = np.ones(shape=distance_matrix.shape[0]) - distance_matrix
+        distance_matrix = self.binarization(distance_matrix, threshold=threshold)
         self.recurrence_matrix = squareform(distance_matrix)
         return self.recurrence_matrix
 
     def binarization(self, distance_matrix, threshold):
         best_threshold_flag = False
         signal_ratio_list = []
+        recurrence_matrix = None
         if threshold is None:
             for threshold_baseline in self.threshold_baseline:
                threshold = threshold_baseline
@@ -219,20 +218,20 @@ def binarization(self, distance_matrix, threshold):
             if self.min_signal_ratio < signal_ratio < self.max_signal_ratio:
                 best_ratio = signal_ratio
-                distance_matrix = tmp_array
+                recurrence_matrix = tmp_array
                 best_threshold_flag = True
                 if signal_ratio > best_ratio:
-                    distance_matrix = tmp_array
+                    recurrence_matrix = tmp_array
             else:
                 signal_ratio_list.append(abs(self.max_signal_ratio - signal_ratio))
             del tmp_array
 
         if not best_threshold_flag:
-            threshold = self.threshold_baseline[signal_ratio_list.index(min(signal_ratio_list))]
-            distance_matrix[distance_matrix < threshold] = 0.0
-            distance_matrix[distance_matrix >= threshold] = 1.0
-        return distance_matrix, threshold
+            distance_matrix[distance_matrix < self.threshold_baseline[0]] = 0.0
+            distance_matrix[distance_matrix >= self.threshold_baseline[0]] = 1.0
+            recurrence_matrix = distance_matrix
+        return recurrence_matrix
 
     def get_recurrence_metrics(self):
         if self.recurrence_matrix is None:
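
Note on the reworked binarization: with rec_metric='cosine', pdist returns condensed
distances, so subtracting them from ones turns them into similarities, and binarization
now sweeps the fixed similarity thresholds in threshold_baseline, keeping the binarized
matrix whose share of recurrence points lands inside (min_signal_ratio, max_signal_ratio)
and falling back to the strictest threshold otherwise. A minimal self-contained sketch of
that behaviour (the helper name and the count_nonzero-based signal ratio are assumptions,
not code from the patch):

    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    def cosine_recurrence_matrix(time_series, thresholds=(0.95, 0.7),
                                 min_ratio=0.7, max_ratio=0.9):
        # Rows of time_series.T are time points; condensed cosine distances
        # between them become similarities after subtraction from 1.
        similarity = 1.0 - pdist(time_series.T, metric='cosine')
        for threshold in thresholds:
            binary = (similarity >= threshold).astype(float)
            signal_ratio = np.count_nonzero(binary) / binary.size
            if min_ratio < signal_ratio < max_ratio:
                return squareform(binary)  # square binary recurrence matrix
        # No threshold produced a ratio in the target band: use the strictest one.
        return squareform((similarity >= thresholds[0]).astype(float))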
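The reworked experiment loop in multi_ts_example.py follows the same multi-space idea:
one feature matrix per requested generator, an extra concatenated matrix when several
generators are active, a lasso baseline score for each space, and Fedot trained on the
concatenated space. A sketch of that flow, with hypothetical extractor objects assumed
to expose the same transform(...).predict interface as QuantileExtractor and
RecurrenceExtractor:

    import numpy as np

    def build_feature_spaces(generators, train_input, test_input):
        # One (n_samples, n_features) matrix per generator.
        train_sets = [g.transform(train_input).predict for g in generators]
        test_sets = [g.transform(test_input).predict for g in generators]
        if len(train_sets) > 1:
            # Joint space: feature-wise concatenation across generators.
            train_sets.append(np.concatenate(train_sets, axis=1))
            test_sets.append(np.concatenate(test_sets, axis=1))
        return train_sets, test_sets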