Commit 37161df: multivariate regression experiment
technocreep committed Jul 25, 2023 (1 parent: f8e9c0c)
Showing 11 changed files with 89 additions and 78 deletions.
9 changes: 6 additions & 3 deletions examples/fedot/fedot_ex.py
@@ -11,17 +11,20 @@
 from fedot.core.pipelines.pipeline_builder import PipelineBuilder
 from fedot.core.repository.operation_types_repository import get_operations_for_task
 from fedot.core.repository.tasks import TaskTypesEnum, Task
-from tests.integration.repository.test_repo import initialize_uni_data
+from tests.integration.repository.test_repo import initialize_multi_data, initialize_uni_data
 from fedot_ind.core.repository.initializer_industrial_models import IndustrialModels

 if __name__ == '__main__':
     np.random.seed(0)
     mode = 'tuning'
-    dataset_list = ['Adiac', 'ArrowHead', 'Mallat', 'Meat', 'Rock']
+    dataset_list = [
+        # 'Adiac', 'ArrowHead', 'Mallat', 'Meat', 'Rock',
+        'LiveFuelMoistureContent']
     # initialize industrial repository
     for dataset_name in dataset_list:
         with IndustrialModels():
-            train_data, test_data = initialize_uni_data(dataset_name)
+            train_data, test_data = initialize_multi_data(dataset_name)
+            # train_data, test_data = initialize_uni_data(dataset_name)

             task = Task(TaskTypesEnum.classification)
             industrial = get_operations_for_task(task=train_data.task, mode='data_operation', tags=["extractor", "basis"])
7 changes: 3 additions & 4 deletions fedot_ind/api/an_dec_exper.py
@@ -46,11 +46,11 @@
                                  # 'wavelet_basis',
                                  'data_driven_basis'
                              ],
-                             tuning_iterations=3,
-                             tuning_timeout=15,
+                             tuning_iterations=1,
+                             tuning_timeout=1,
                              # next are default for every solver
                              use_cache=False,
-                             timeout=2,
+                             timeout=1,
                              n_jobs=2)

 # use your own ts or let Industrial method generate_anomaly_ts produce it with anomalies
@@ -77,7 +77,6 @@
 predicted = industrial.predict(features=test_data[0], target=test_data[1])
 proba = industrial.predict_proba(features=test_data[0], target=test_data[1])

-# Doestn work for now :(
 industrial.get_metrics(target=test_data[1], metric_names=['f1', 'roc_auc'])

 print(classification_report(test_data[1], predicted))
17 changes: 13 additions & 4 deletions fedot_ind/api/exper.py
@@ -46,14 +46,15 @@

 industrial = FedotIndustrial(task='ts_classification',
                              dataset=dataset_name,
+                             # strategy='statistical',
                              strategy='fedot_preset',
                              branch_nodes=[
                                  # 'fourier_basis',
                                  # 'wavelet_basis',
                                  'data_driven_basis'
                              ],
-                             tuning_iterations=5,
-                             tuning_timeout=15,
+                             tuning_iterations=10,
+                             tuning_timeout=30,
                              use_cache=False,
                              timeout=1,
                              n_jobs=2,
@@ -62,8 +63,16 @@
 train_data, test_data = DataLoader(dataset_name=dataset_name).load_data()

 model = industrial.fit(features=train_data[0], target=train_data[1])
-labels = industrial.predict(features=test_data[0],
-                            target=test_data[1])
+
+labels = industrial.predict(features=test_data[0].iloc[:5, :],
+                            target=test_data[1][:5])
+
+labels = industrial.predict(features=test_data[0].iloc[:5, :],
+                            target=test_data[1][:5])
+
+labels = industrial.predict(features=test_data[0].iloc[:1, :],
+                            target=test_data[1][:1])

 probs = industrial.predict_proba(features=test_data[0],
                                  target=test_data[1])
 metric = industrial.get_metrics(target=test_data[1],
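Note that the predict calls above run on progressively smaller slices, down to a single row, which appears to exercise the new prediction cache and the single-sample reshape handled in the predict() hunk further below. A minimal numpy sketch of that edge case (the feature count of 24 is an assumption):

    import numpy as np

    batch = np.zeros((1, 24))           # one test sample, 24 features
    squeezed = np.squeeze(batch)        # squeeze drops the batch axis -> shape (24,)
    restored = squeezed.reshape(1, -1)  # shape (1, 24) again, as predict() restores it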
52 changes: 23 additions & 29 deletions fedot_ind/core/architecture/experiment/TimeSeriesClassifier.py
@@ -33,7 +33,6 @@ class TimeSeriesClassifier:
"""

def __init__(self, params: Optional[OperationParameters] = None):
self.test_predict_hash = None
self.strategy = params.get('strategy', 'statistical')
self.model_hyperparams = params.get('model_params')
self.generator_runner = params.get('generator_class')
@@ -45,7 +44,10 @@ def __init__(self, params: Optional[OperationParameters] = None):
                                          output_dir=self.output_folder)
         self.logger = logging.getLogger('TimeSeriesClassifier')
         self.datacheck = DataCheck()
+        self.cacher = DataCacher()
+
         self.prediction_proba = None
+        self.test_predict_hash = None
         self.prediction_label = None
         self.predictor = None
         self.y_train = None
@@ -97,34 +99,6 @@ def _fit_baseline_model(self, features: pd.DataFrame, target: np.ndarray, baseli
         self.logger.info(f'Baseline model has been fitted')
         return baseline_pipeline

-    def __predict_abstraction(self,
-                              test_features: Union[np.ndarray, pd.DataFrame],
-                              mode: str = 'labels'):
-        self.logger.info(f'Predicting with {self.strategy} generator')
-
-        # data_cacher = DataCacher()
-        # # get unique hash of input data
-        # predict_hash = data_cacher.hash_info(data=test_features,
-        #                                      obj_info_dict=self.__dict__)
-        # # compare it to existed hash
-        # if self.test_predict_hash == predict_hash:
-        #     pass
-
-        self.test_features = self.generator_runner.extract_features(train_features=test_features,
-                                                                    dataset_name=self.dataset_name)
-        self.test_features = self.datacheck.check_data(input_data=self.test_features, return_df=True)
-
-        if isinstance(self.predictor, Pipeline):
-            self.input_test_data = array_to_input_data(features_array=self.test_features, target_array=None)
-            prediction_label = self.predictor.predict(self.input_test_data, output_mode=mode).predict
-            return prediction_label
-        else:
-            if mode == 'labels':
-                prediction_label = self.predictor.predict(self.test_features)
-            else:
-                prediction_label = self.predictor.predict_proba(self.test_features)
-            return prediction_label

     def fit(self, features: Union[np.ndarray, pd.DataFrame],
             target: np.ndarray,
             **kwargs) -> object:
@@ -159,6 +133,26 @@ def predict_proba(self, features: np.ndarray, **kwargs) -> dict:
                                                           mode='probs', )
         return self.prediction_proba

+    def __predict_abstraction(self,
+                              test_features: Union[np.ndarray, pd.DataFrame],
+                              mode: str = 'labels'):
+        self.logger.info(f'Predicting with {self.strategy} generator')
+
+        self.test_features = self.generator_runner.extract_features(train_features=test_features,
+                                                                    dataset_name=self.dataset_name)
+        self.test_features = self.datacheck.check_data(input_data=self.test_features, return_df=True)
+
+        if isinstance(self.predictor, Pipeline):
+            self.input_test_data = array_to_input_data(features_array=self.test_features, target_array=None)
+            prediction_label = self.predictor.predict(self.input_test_data, output_mode=mode).predict
+            return prediction_label
+        else:
+            if mode == 'labels':
+                prediction_label = self.predictor.predict(self.test_features)
+            else:
+                prediction_label = self.predictor.predict_proba(self.test_features)
+            return prediction_label

     def get_metrics(self, target: Union[np.ndarray, pd.Series], metric_names: Union[str, List[str]]):
         analyzer = PerformanceAnalyzer()
         return analyzer.calculate_metrics(target=target,
@@ -212,28 +212,27 @@ def predict(self, features: pd.DataFrame, target: np.array) -> dict:
             target: numpy array with target values
         """
-        # data_cacher = DataCacher()
-        # # get unique hash of input data
-        # test_predict_hash = data_cacher.hash_info(data=features,
-        #                                           obj_info_dict=self.__dict__)
-        # # compare it to existed hash
-        # if self.test_predict_hash != test_predict_hash:
-        test_data = self._init_input_data(features, target)
-        test_data_preprocessed = self.preprocessing_pipeline.root_node.predict(test_data)
-
-        if test_data.features.shape[0] == 1:
-            test_data_preprocessed.predict = np.squeeze(test_data_preprocessed.predict).reshape(1, -1)
-        else:
-            test_data_preprocessed.predict = np.squeeze(test_data_preprocessed.predict)
-        self.test_data_preprocessed = InputData(idx=test_data_preprocessed.idx,
-                                                features=test_data_preprocessed.predict,
-                                                target=test_data_preprocessed.target,
-                                                data_type=test_data_preprocessed.data_type,
-                                                task=test_data_preprocessed.task)
-
-        # self.prediction_label_baseline = self.baseline_model.predict(self.test_data_preprocessed).predict
-        self.prediction_label = self.predictor.predict(self.test_data_preprocessed)
-        # self.test_predict_hash = test_predict_hash
+        data_cacher = DataCacher()
+        # get unique hash of input data
+        test_predict_hash = data_cacher.hash_info(data=features)
+        # compare it to existed hash
+        if self.test_predict_hash != test_predict_hash:
+            test_data = self._init_input_data(features, target)
+            test_data_preprocessed = self.preprocessing_pipeline.root_node.predict(test_data)
+
+            if test_data.features.shape[0] == 1:
+                test_data_preprocessed.predict = np.squeeze(test_data_preprocessed.predict).reshape(1, -1)
+            else:
+                test_data_preprocessed.predict = np.squeeze(test_data_preprocessed.predict)
+            self.test_data_preprocessed = InputData(idx=test_data_preprocessed.idx,
+                                                    features=test_data_preprocessed.predict,
+                                                    target=test_data_preprocessed.target,
+                                                    data_type=test_data_preprocessed.data_type,
+                                                    task=test_data_preprocessed.task)

+            # self.prediction_label_baseline = self.baseline_model.predict(self.test_data_preprocessed).predict
+            self.prediction_label = self.predictor.predict(self.test_data_preprocessed)
+            self.test_predict_hash = test_predict_hash

         return self.prediction_label
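A minimal sketch of the memoisation pattern this hunk introduces, assuming DataCacher.hash_info returns a stable digest for the given features; the helper name _run_full_predict is hypothetical:

    def predict(self, features, target):
        new_hash = DataCacher().hash_info(data=features)
        if self.test_predict_hash != new_hash:       # cache miss: preprocess and predict
            self.prediction_label = self._run_full_predict(features, target)
            self.test_predict_hash = new_hash        # remember what was predicted on
        return self.prediction_label                 # repeated call: reuse last labels

A second call with byte-identical features then skips the preprocessing pipeline entirely, which is what the repeated predict calls in exper.py above appear to test.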

4 changes: 1 addition & 3 deletions fedot_ind/core/models/BaseExtractor.py
@@ -22,9 +22,7 @@ class BaseExtractor(IndustrialCachableOperationImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
         self.current_window = None
-        # TODO: get back
-        self.n_processes = 2
-        # self.n_processes = math.ceil(cpu_count() * 0.7) if cpu_count() > 1 else 1
+        self.n_processes = math.ceil(cpu_count() * 0.7) if cpu_count() > 1 else 1
         self.data_type = DataTypesEnum.table
         self.use_cache = params.get('use_cache', False)

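This restores adaptive parallelism: on an 8-core machine, for example, math.ceil(8 * 0.7) gives 6 worker processes instead of the previously hard-coded 2.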
7 changes: 6 additions & 1 deletion fedot_ind/core/models/statistical/StatsExtractor.py
@@ -36,7 +36,10 @@ def _transform(self, input_data: InputData) -> np.array:
"""
Method for feature generation for all series
"""
input_data_squeezed = np.squeeze(input_data.features, 3)
try:
input_data_squeezed = np.squeeze(input_data.features, 3)
except ValueError:
input_data_squeezed = input_data.features
with Pool(self.n_processes) as p:
v = list(tqdm(p.imap(self.generate_features_from_ts, input_data_squeezed),
total=input_data.features.shape[0],
@@ -55,6 +58,8 @@ def _transform(self, input_data: InputData) -> np.array:
                                         columns=stat_features,
                                         n_components=n_components)
         # return predict
+        # feature space reduction factor
+        self.logger.info(f'Feature space reduced by a factor of {len(stat_features) * n_components / predict.shape[1]:.2f}')
         return predict.values

     def drop_features(self, predict: pd.DataFrame, columns: Index, n_components: int):
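The new try/except is what lets multivariate input through: np.squeeze(..., 3) assumes a 4-D univariate layout with a size-1 fourth axis, and numpy raises np.AxisError (a ValueError subclass) when that axis does not exist. A small sketch with assumed shapes:

    import numpy as np

    uni = np.zeros((10, 1, 100, 1))    # univariate: size-1 axis 3
    print(np.squeeze(uni, 3).shape)    # -> (10, 1, 100)

    multi = np.zeros((10, 3, 100))     # multivariate: no axis 3 at all
    try:
        np.squeeze(multi, 3)
    except ValueError:                 # AxisError subclasses ValueError
        print('multivariate features are used as-is')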
15 changes: 7 additions & 8 deletions fedot_ind/core/operation/IndustrialCachableOperation.py
@@ -40,16 +40,15 @@ def transform(self, input_data: InputData) -> OutputData:
        Method firstly tries to load result from cache. If unsuccessful, it starts to generate features
        """
         # TODO: get back to
-        # operation_parameters = self.params.to_dict()
-        # class_params = {k:v for k,v in self.__dict__.items() if k not in ['cacher',
-        #                                                                   'params',
-        #                                                                   'n_processes',
-        #                                                                   'logging_params']}
+        # class_params = {k: v for k, v in self.__dict__.items() if k not in ['cacher',
+        #                                                                     'data_type',
+        #                                                                     'params',
+        #                                                                     'n_processes',
+        #                                                                     'logging_params']}
         #
-        # operation_parameters.update(class_params)
         # hashed_info = self.cacher.hash_info(data=input_data.features,
-        #                                     operation_info=operation_parameters.__repr__())
+        #                                     operation_info=class_params.__repr__())
+        # '5258e575f6'
         # hashed_info = self.cacher.hash_info(data=input_data.features.tobytes(),
         #                                     operation_info=self.params.to_dict())

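The commented-out TODO sketches a cache key built from the raw feature bytes plus the operation parameters, so the same data with different parameters gets a different key. A minimal self-contained version of that idea; hashlib is an assumption here, and the project's DataCacher may hash differently:

    import hashlib
    import numpy as np

    def cache_key(features: np.ndarray, params: dict) -> str:
        # same data + same params -> same key; changing either invalidates the cache
        payload = features.tobytes() + repr(sorted(params.items())).encode()
        return hashlib.md5(payload).hexdigest()[:10]   # short digest like '5258e575f6'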
6 changes: 4 additions & 2 deletions fedot_ind/core/operation/transformation/basis/data_driven.py
@@ -77,8 +77,10 @@ def get_threshold(self, data, selector: str):
                      '0.25%': lambda x: np.quantile(x, 0.25)}

         svd_numbers = []
-        for signal in data:
-            svd_numbers.append(self._transform_one_sample(signal, svd_flag=True))
+        with tqdm(total=len(data), desc='SVD estimation') as pbar:
+            for signal in data:
+                svd_numbers.append(self._transform_one_sample(signal, svd_flag=True))
+                pbar.update(1)

         return math.ceil(selectors[selector](svd_numbers))

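A toy run of the selector logic: estimate a rank per signal, then aggregate with the chosen statistic (the per-signal ranks below are made up):

    import math
    import numpy as np

    svd_numbers = [2, 3, 4, 5, 9]       # hypothetical per-signal SVD ranks
    selectors = {'median': np.median,
                 '0.25%': lambda x: np.quantile(x, 0.25)}
    print(math.ceil(selectors['median'](svd_numbers)))  # -> 4
    print(math.ceil(selectors['0.25%'](svd_numbers)))   # -> 3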
6 changes: 5 additions & 1 deletion fedot_ind/core/operation/utils/cache.py
@@ -60,7 +60,11 @@ def load_data_from_cache(self, hashed_info: str):

         start = timeit.default_timer()
         file_path = os.path.join(self.cache_folder, hashed_info + '.npy')
-        data = np.load(file_path)
+        try:
+            data = np.load(file_path)
+        except FileNotFoundError:
+            self.logger.info('Cache not found')
+            raise FileNotFoundError(f'File {file_path} was not found')
         elapsed_time = round(timeit.default_timer() - start, 5)
         print(f'{self.data_type} of {type(data)} type is loaded from cache in {elapsed_time} sec')
         return data
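A hedged usage sketch of the resulting load-or-recompute flow for some input features; hash_info is taken from calls visible elsewhere in this diff, and the recompute helper is hypothetical:

    cacher = DataCacher()
    key = cacher.hash_info(data=features)               # digest of the input data
    try:
        result = cacher.load_data_from_cache(hashed_info=key)
    except FileNotFoundError:                           # cache miss
        result = extract_features(features)             # hypothetical recompute step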
1 change: 0 additions & 1 deletion fedot_ind/core/tuning/search_space.py
@@ -15,7 +15,6 @@

     'quantile_extractor':
         {'window_mode': (hp.choice, [[True, True]]),
-         # {'window_mode': (hp.choice, [[True, False]]),
          'window_size': (hp.choice, [[x for x in range(1, 50, 3)]]),
          'var_threshold': (hp.choice, [np.linspace(0, 0.02, 35)])},

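A minimal sketch of how such an entry could be materialised into a hyperopt search space, assuming the (hp.choice, [args]) tuples are unpacked as hp.choice(label, *args); hp.choice itself is real hyperopt API:

    import numpy as np
    from hyperopt import hp

    raw = {'window_mode': (hp.choice, [[True, True]]),
           'window_size': (hp.choice, [[x for x in range(1, 50, 3)]]),
           'var_threshold': (hp.choice, [list(np.linspace(0, 0.02, 35))])}
    space = {name: fn(name, *args) for name, (fn, args) in raw.items()}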
