Skip to content

Commit

Permalink
All minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
v1docq committed Jul 24, 2023
1 parent 8aecd6e commit 1ca710d
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 150 deletions.
73 changes: 39 additions & 34 deletions fedot_ind/api/exper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,49 @@

if __name__ == "__main__":

# datasets_bad_f1 = [
# 'EOGVerticalSignal',
# 'ScreenType',
# 'CricketY',
# 'ElectricDevices',
# 'Lightning7'
# ]
datasets_bad_f1 = [
#'EOGVerticalSignal',
# 'ScreenType',
# 'CricketY',
# 'ElectricDevices',
'Lightning7'
]

# datasets_good_f1 = [
# 'Car',
# 'ECG5000',
# 'Phoneme',
# 'Meat',
datasets_good_f1 = [
'Car',
'ECG5000',
"Beef",
# 'Phoneme',
#'Meat',
# 'RefrigerationDevices'
# ]
]

datasets_good_roc = [
'Chinatown',
# 'Earthquakes',
# 'Ham',
# 'ECG200',
# 'MiddlePhalanxOutlineCorrect',
# 'MoteStrain',
# 'TwoLeadECG'
# 'Chinatown',
'Computers',
# 'Earthquakes',
'Ham',
'ECG200',
'ECGFiveDays'
# 'MiddlePhalanxOutlineCorrect',
# 'MoteStrain',
# 'TwoLeadECG'
]
# node_scaling = PipelineNode('scaling')
# node_final = PipelineNode('rf', nodes_from=[node_scaling])
# rf_model = Pipeline(node_final)

# datasets_bad_roc = [
# 'Lightning2',
# 'WormsTwoClass',
# 'DistalPhalanxOutlineCorrect'
# ]
datasets_bad_roc = [
'Lightning2',
# 'WormsTwoClass',
# 'DistalPhalanxOutlineCorrect'
]

for group in [
# datasets_bad_f1,
# datasets_good_f1,
datasets_bad_f1,
datasets_good_f1,
datasets_good_roc,
# datasets_bad_roc
datasets_bad_roc
]:

for dataset_name in group:
Expand All @@ -52,15 +58,14 @@
# 'wavelet_basis',
'data_driven_basis'
],
tuning_iterations=10,
tuning_timeout=2,
tuning_iterations=30,
tuning_timeout=15.0,
use_cache=False,
timeout=1,
n_jobs=2,
timeout=10,
n_jobs=6,
)

train_data, test_data = DataLoader(dataset_name=dataset_name).load_data()

model = industrial.fit(features=train_data[0], target=train_data[1])
labels = industrial.predict(features=test_data[0],
target=test_data[1])
Expand All @@ -70,5 +75,5 @@
metric_names=['f1', 'roc_auc'])
for pred, kind in zip([labels, probs], ['labels', 'probs']):
industrial.save_predict(predicted_data=pred, kind=kind)

industrial.save_metrics(metrics=metric)
_ = 1
126 changes: 89 additions & 37 deletions fedot_ind/api/rank_experiment.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,96 @@
import os

import numpy as np
import pandas as pd
from fedot.core.data.data import InputData
import matplotlib.pyplot as plt
from fedot.api.main import Fedot
from sklearn.metrics import f1_score, roc_auc_score
from fedot_ind.api.main import FedotIndustrial
from fedot_ind.core.architecture.preprocessing.DatasetLoader import DataLoader
from fedot_ind.core.models.statistical.StatsExtractor import StatsExtractor
from fedot_ind.core.operation.transformation.basis.data_driven import DataDrivenBasisImplementation
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler


def extract_features(train_data, bss, extractor=None):
    """Transform raw time series into a feature matrix via a basis decomposition.

    Args:
        train_data: tuple-like of (series, target); only ``train_data[0]`` is used.
        bss: basis-decomposition object exposing a ``_transform`` method
            (e.g. ``DataDrivenBasisImplementation``).
        extractor: optional object with a ``transform`` method applied to the
            decomposed basis. When None, falls back to the module-level
            ``stats_model`` — preserving the original behavior for existing callers.

    Returns:
        Tuple of (feature matrix, the same ``bss`` object, returned so the caller
        can reuse its fitted state on test data).
    """
    if extractor is None:
        # Backward-compatible fallback to the script's module-level extractor.
        extractor = stats_model
    basis_1d_raw = bss._transform(train_data[0])
    feature_train = extractor.transform(basis_1d_raw)
    return feature_train, bss


def evaluate_model(feature_train, bss, test_data, model_type: str = 'MLP'):
    """Fit a classifier on pre-extracted train features and score it on test data.

    Args:
        feature_train: feature matrix produced by ``extract_features``.
        bss: the (already-used) basis-decomposition object; reused here to
            transform the raw test series with the same fitted state.
        test_data: tuple-like of (series, target labels).
        model_type: 'MLP' for a fixed sklearn MLP; anything else uses Fedot AutoML.

    Returns:
        Tuple of (metric value, scaled test feature matrix).

    NOTE(review): reads module-level globals ``train_data`` (for fit labels) and
    ``stats_model`` (test feature extraction) — only safe when called from this
    script's __main__ loop; confirm before reusing elsewhere.
    """
    # Multiclass -> weighted F1; binary -> ROC AUC.
    if len(np.unique(test_data[1])) > 2:
        metric_name = 'f1'
    else:
        metric_name = 'roc_auc'

    if model_type == 'MLP':
        # Fixed-architecture baseline; random_state pinned for reproducibility.
        clf = MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=300, activation='relu', solver='adam',
                            random_state=42)
    else:
        clf = Fedot(
            # available_operations=['fast_ica', 'scaling','normalization',
            #                       'xgboost',
            #                       'rf',
            #                       'logit',
            #                       'mlp',
            #                       'knn',
            #                       'pca'],
            metric=metric_name, timeout=10, problem='classification', n_jobs=6)

    # Scaler is fit on train features only; the same transform is applied to test
    # features below to avoid train/test leakage.
    scaler = StandardScaler()
    scaler.fit(feature_train)
    feature_train = scaler.transform(feature_train)
    # NOTE(review): labels come from the global ``train_data``, not a parameter.
    clf.fit(feature_train, train_data[1])
    basis_1d_raw = bss._transform(test_data[0])
    test_feature = stats_model.transform(basis_1d_raw)
    test_feature = scaler.transform(test_feature)
    if len(np.unique(test_data[1])) > 2:
        metric = f1_score(test_data[1], clf.predict(test_feature), average='weighted')
    else:
        # NOTE(review): roc_auc_score is given hard predicted labels here;
        # sklearn expects probability scores (predict_proba) for a meaningful
        # AUC — confirm whether this degenerate AUC is intended.
        metric = roc_auc_score(test_data[1], clf.predict(test_feature), average='weighted')
    return metric, test_feature


# def visualise_and_save():
# for class_number in np.unique(train_data[1]):
# for basis_name, basis in zip(['basis_before_power_iterations', 'basis_after_power_iterations'],
# [basis_1d_raw, basis_1d_approx]):
# class_idx = np.where(train_data[1] == class_number)[0]
# class_slice = np.take(basis, class_idx, 0)
# pd.DataFrame(np.median(class_slice, axis=0)).T.plot()
# # plt.show()
# plt.savefig(f'{dataset_name}/{basis_name}_{class_number}_median_component.png', bbox_inches='tight')
# # plt.title(f'mean_{basis_name}_components_for_{class_number}_class')
# rank_distrib = pd.DataFrame([rank_distribution_befor, rank_distribution_after]).T
# rank_distrib.columns = ['HT_approach',
# 'Proposed_approach']
# rank_distrib.plot(kind='kde')
# # plt.show()
# rank_dispersion_ht = np.round(rank_distrib['HT_approach'].std(), 3)
# rank_dispersion_new = np.round(rank_distrib['Proposed_approach'].std(), 3)
# plt.savefig(f'{dataset_name}/rank_distrib. '
# f'Classical_rank_{low_rank_befor}_std_{rank_dispersion_ht}.'
# f'New_{low_rank_after}_std_{rank_dispersion_new}.png', bbox_inches='tight')
# rank_distrib['classes'] = train_data[1]


if __name__ == "__main__":

datasets_bad_f1 = [
# 'EOGVerticalSignal',
#'EOGVerticalSignal',
# 'ScreenType',
# 'CricketY',
# 'ElectricDevices',
'Lightning7'
]

datasets_good_f1 = [
# 'Car',
# 'ECG5000',
# "Beef"
'Car',
'ECG5000',
"Beef",
# 'Phoneme',
# 'Meat',
'Meat',
# 'RefrigerationDevices'
]

Expand All @@ -39,12 +105,16 @@
# 'MoteStrain',
# 'TwoLeadECG'
]
# node_scaling = PipelineNode('scaling')
# node_final = PipelineNode('rf', nodes_from=[node_scaling])
# rf_model = Pipeline(node_final)

datasets_bad_roc = [
'Lightning2',
# 'WormsTwoClass',
# 'DistalPhalanxOutlineCorrect'
]

stats_model = StatsExtractor({'window_mode': False, 'window_size': 5, 'use_cache': False, 'n_jobs': 4})
for group in [
datasets_bad_f1,
Expand All @@ -63,7 +133,7 @@
# 'wavelet_basis',
'data_driven_basis'
],
tuning_iterations=10,
tuning_iterations=30,
tuning_timeout=15,
use_cache=False,
timeout=5,
Expand All @@ -73,37 +143,19 @@
os.makedirs(f'./{dataset_name}')
except Exception:
_ = 1

train_data, test_data = DataLoader(dataset_name=dataset_name).load_data()
# bss = DataDrivenBasisImplementation({'sv_selector': 'median', 'window_size': 20})
# bss.low_rank_approximation = False
# train_feature, bss = extract_features(train_data, bss)
# f1_HT, test_feature = evaluate_model(train_feature, bss, test_data,model_type='Auto')

bss = DataDrivenBasisImplementation({'sv_selector': 'median', 'window_size': 20})
bss.low_rank_approximation = False
basis_1d_raw = bss._transform(train_data[0])
rank_distribution_befor = bss.rank_distribution
low_rank_befor = bss.SV_threshold
bss.low_rank_approximation = True
bss.SV_threshold = None
basis_1d_approx = bss._transform(train_data[0])
rank_distribution_after = bss.rank_distribution
low_rank_after = bss.SV_threshold

HT_feature = stats_model.transform(basis_1d_raw)
for class_number in np.unique(train_data[1]):
for basis_name, basis in zip(['basis_before_power_iterations', 'basis_after_power_iterations'],
[basis_1d_raw, basis_1d_approx]):
class_idx = np.where(train_data[1] == class_number)[0]
class_slice = np.take(basis, class_idx, 0)
pd.DataFrame(np.median(class_slice, axis=0)).T.plot()
# plt.show()
plt.savefig(f'{dataset_name}/{basis_name}_{class_number}_median_component.png', bbox_inches='tight')
# plt.title(f'mean_{basis_name}_components_for_{class_number}_class')
rank_distrib = pd.DataFrame([rank_distribution_befor, rank_distribution_after]).T
rank_distrib.columns = ['HT_approach',
'Proposed_approach']
rank_distrib.plot(kind='kde')
# plt.show()
rank_dispersion_ht = np.round(rank_distrib['HT_approach'].std(), 3)
rank_dispersion_new = np.round(rank_distrib['Proposed_approach'].std(), 3)
plt.savefig(f'{dataset_name}/rank_distrib. '
f'Classical_rank_{low_rank_befor}_std_{rank_dispersion_ht}.'
f'New_{low_rank_after}_std_{rank_dispersion_new}.png', bbox_inches='tight')
rank_distrib['classes'] = train_data[1]
train_feature, bss = extract_features(train_data, bss)
f1_PI, test_feature_PI = evaluate_model(train_feature, bss, test_data,model_type='Auto')
print(f'Dataset-{dataset_name}')
#print(f'HT_metric-{f1_HT}')
print(f'PI_metric-{f1_PI}')
_ = 1
7 changes: 6 additions & 1 deletion fedot_ind/api/utils/saver_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ def __init__(self, dataset_name: str, generator_name: str, output_dir: str = Non
self.logger = logging.getLogger(self.__class__.__name__)
self.save_method_dict = {'labels': self.save_labels,
'probs': self.save_probs,
'metrics': self.save_metrics}
'metrics': self.save_metrics,
'baseline_metrics': self.save_baseline_metrics
}

def __init_save_path(self, dataset_name, generator_name, output_dir):
if output_dir is None:
Expand Down Expand Up @@ -46,3 +48,6 @@ def save_metrics(self, metrics: dict):
df = pd.DataFrame(metrics, index=[0])
df.to_csv(os.path.join(self.path, 'metrics.csv'))

def save_baseline_metrics(self, metrics: dict):
    """Persist baseline-run metrics as a one-row CSV under ``self.path``.

    The metrics dict is written as a single-row DataFrame (index 0) to
    ``baseline_metrics.csv``, keeping baseline results separate from the
    main ``metrics.csv``.
    """
    target_file = os.path.join(self.path, 'baseline_metrics.csv')
    pd.DataFrame(metrics, index=[0]).to_csv(target_file)
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum
from golem.core.tuning.simultaneous import SimultaneousTuner

from fedot_ind.api.utils.saver_collections import ResultSaver
from fedot_ind.core.architecture.postprocessing.Analyzer import PerformanceAnalyzer
from fedot_ind.core.architecture.utils.utils import default_path_to_save_results
Expand Down Expand Up @@ -49,7 +48,7 @@ def __init__(self, params: Optional[OperationParameters] = None):
self.model_params = params.get('model_params')
self.dataset_name = params.get('dataset')
self.tuning_iters = params.get('tuning_iterations', 30)
self.tuning_timeout = params.get('tuning_timeout', 15)
self.tuning_timeout = params.get('tuning_timeout', 15.0)
self.output_folder = params.get('output_folder', default_path_to_save_results())

self.saver = ResultSaver(dataset_name=self.dataset_name,
Expand Down Expand Up @@ -125,7 +124,10 @@ def _build_pipeline(self):
for index, (basis, extractor) in enumerate(zip(self.branch_nodes, self.extractors)):
pipeline_builder.add_node(basis, branch_idx=index)
pipeline_builder.add_node(extractor, branch_idx=index)
pipeline_builder.join_branches('rf')
pipeline_builder.join_branches('mlp', params={'hidden_layer_sizes': (150, 100, 50),
'max_iter': 300,
'activation': 'relu',
'solver': 'adam', })

return pipeline_builder.build()

Expand Down Expand Up @@ -194,7 +196,18 @@ def fit(self, features,

metric = 'roc_auc' if train_data_preprocessed.num_classes == 2 else 'f1'
self.model_params.update({'metric': metric})
self.predictor = Fedot(**self.model_params)
self.predictor = Fedot(available_operations=['scaling',
'normalization',
'fast_ica',
'xgboost',
'rfr',
'rf',
'logit',
'mlp',
'knn',
'lgbm',
'pca']
, **self.model_params)

self.predictor.fit(train_data_preprocessed)

Expand All @@ -218,7 +231,7 @@ def predict(self, features: pd.DataFrame, target: np.array) -> dict:
target=test_data_preprocessed.target,
data_type=test_data_preprocessed.data_type,
task=test_data_preprocessed.task)
self.prediction_label_baseline = self.baseline_model.predict(self.test_data_preprocessed).predict
self.prediction_label_baseline = self.baseline_model.predict(self.test_data_preprocessed,'labels').predict
self.prediction_label = self.predictor.predict(self.test_data_preprocessed)
return self.prediction_label

Expand All @@ -228,7 +241,7 @@ def predict_proba(self, features, target) -> dict:
test_data_preprocessed = self.preprocessing_pipeline.root_node.predict(test_data)
self.test_data_preprocessed.predict = np.squeeze(test_data_preprocessed.predict)

self.prediction_proba_baseline = self.baseline_model.predict(self.test_data_preprocessed,'probs').predict
self.prediction_proba_baseline = self.baseline_model.predict(self.test_data_preprocessed, 'probs').predict
self.prediction_proba = self.predictor.predict_proba(self.test_data_preprocessed)
return self.prediction_proba

Expand Down
7 changes: 3 additions & 4 deletions fedot_ind/core/operation/transformation/basis/data_driven.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import math
import time
from multiprocessing import Pool
from typing import Tuple, TypeVar, Optional

Expand All @@ -11,8 +10,7 @@
from sklearn.metrics import f1_score, roc_auc_score
from tensorly.decomposition import parafac
from tqdm import tqdm

from fedot_ind.core.architecture.preprocessing import InputData
from fedot.core.data.data import InputData
from fedot_ind.core.operation.decomposition.matrix_decomposition.fast_svd import RSVDDecomposition

from fedot_ind.core.operation.transformation.basis.abstract_basis import BasisDecompositionImplementation
Expand All @@ -39,7 +37,8 @@ def __init__(self, params: Optional[OperationParameters] = None):
self.window_size = params.get('window_size')
self.basis = None
self.SV_threshold = None
self.sv_selector = params.get('sv_selector')
#self.sv_selector = params.get('sv_selector')
self.sv_selector = 'median'
self.svd_estimator = RSVDDecomposition()
self.low_rank_approximation = True
self.logging_params.update({'WS': self.window_size,
Expand Down
Loading

0 comments on commit 1ca710d

Please sign in to comment.