diff --git a/fedot_ind/api/exper.py b/fedot_ind/api/exper.py
index 108105b4f..06273a7bb 100644
--- a/fedot_ind/api/exper.py
+++ b/fedot_ind/api/exper.py
@@ -20,10 +20,10 @@
 #     ]
 datasets_good_roc = [
-    'Chinatown',
+    # 'Chinatown',
     # 'Earthquakes',
     # 'Ham',
-    # 'ECG200',
+    'ECG200',
     # 'MiddlePhalanxOutlineCorrect',
     # 'MoteStrain',
     # 'TwoLeadECG'
@@ -52,8 +52,8 @@
             #     'wavelet_basis',
             'data_driven_basis'
         ],
-        tuning_iterations=10,
-        tuning_timeout=2,
+        tuning_iterations=5,
+        tuning_timeout=15,
         use_cache=False,
         timeout=1,
         n_jobs=2,
diff --git a/fedot_ind/core/architecture/experiment/TimeSeriesClassifierPreset.py b/fedot_ind/core/architecture/experiment/TimeSeriesClassifierPreset.py
index 0665ada11..1d105a251 100644
--- a/fedot_ind/core/architecture/experiment/TimeSeriesClassifierPreset.py
+++ b/fedot_ind/core/architecture/experiment/TimeSeriesClassifierPreset.py
@@ -15,6 +15,7 @@
 from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 from golem.core.tuning.simultaneous import SimultaneousTuner
+from golem.core.tuning.sequential import SequentialTuner

 from fedot_ind.api.utils.saver_collections import ResultSaver
 from fedot_ind.core.architecture.postprocessing.Analyzer import PerformanceAnalyzer
@@ -91,7 +92,7 @@ def _init_input_data(self, X: pd.DataFrame, y: np.ndarray) -> InputData:
             y: numpy array with target values

         Returns:
-            InputData object convinient for FEDOT framework
+            InputData object convenient for FEDOT framework

         """
         is_multivariate_data = self.__check_multivariate_data(X)
diff --git a/fedot_ind/core/architecture/settings/hyperparams.py b/fedot_ind/core/architecture/settings/hyperparams.py
index 057d57bc6..127446285 100644
--- a/fedot_ind/core/architecture/settings/hyperparams.py
+++ b/fedot_ind/core/architecture/settings/hyperparams.py
@@ -1,10 +1,6 @@
 import numpy as np


-def quantile(column, q: str):
-    return np.quantile(a=column, q=q)
-
-
 def softmax(w, theta=1.0) -> np.ndarray:
     """Takes a vector w of S N-element and returns a vector where each column
     of the vector sums to 1, with elements exponentially proportional to the
@@ -25,17 +21,6 @@ def softmax(w, theta=1.0) -> np.ndarray:
     return dist


-stat_methods_default = {
-    'mean_': np.mean,
-    'median_': np.median,
-    'std_': np.std,
-    'var_': np.var,
-    'q5_': quantile,
-    'q25_': quantile,
-    'q75_': quantile,
-    'q95_': quantile,
-}
-
 stat_methods_ensemble = {
     'MeanEnsemble': np.mean,
     'MedianEnsemble': np.median,
@@ -43,27 +28,3 @@ def softmax(w, theta=1.0) -> np.ndarray:
     'MaxEnsemble': np.max,
     'ProductEnsemble': np.prod
 }
-
-stat_methods_full = {
-    'mean_': np.mean,
-    'median_': np.median,
-    'lambda_less_zero': lambda x: x < 0.01,
-    'std_': np.std,
-    'var_': np.var,
-    'max': np.max,
-    'min': np.min,
-    'q5_': quantile,
-    'q25_': quantile,
-    'q75_': quantile,
-    'q95_': quantile,
-    'sum_': np.sum,
-    'dif_': np.diff
-}
-
-hyper_param_dict = {'statistical_methods': stat_methods_default,
-                    'statistical_methods_extra': stat_methods_full,
-                    'stat_methods_ensemble': stat_methods_ensemble}
-
-
-def select_hyper_param(param_name):
-    return hyper_param_dict[param_name]
diff --git a/fedot_ind/core/ensemble/static/RankEnsembler.py b/fedot_ind/core/ensemble/static/RankEnsembler.py
index 7c6e270cf..915ed7731 100644
--- a/fedot_ind/core/ensemble/static/RankEnsembler.py
+++ b/fedot_ind/core/ensemble/static/RankEnsembler.py
@@ -5,7 +5,7 @@
 from fedot_ind.core.architecture.postprocessing.Analyzer import PerformanceAnalyzer
 from fedot_ind.core.architecture.preprocessing.DatasetLoader import DataLoader
-from fedot_ind.core.architecture.settings.hyperparams import select_hyper_param
+from fedot_ind.core.architecture.settings.hyperparams import stat_methods_ensemble
 from fedot_ind.core.ensemble.BaseEnsembler import BaseEnsemble
@@ -32,7 +32,7 @@ def __init__(self, dataset_name: str, proba_dict, metric_dict):
         self.logger = logging.getLogger(self.__class__.__name__)
         self.best_ensemble_metric = 0

-        self.ensemble_strategy_dict = select_hyper_param('stat_methods_ensemble')
+        self.ensemble_strategy_dict = stat_methods_ensemble
         self.ensemble_strategy = self.ensemble_strategy_dict.keys()
         self.strategy_exclude_list = ['WeightedEnsemble']
diff --git a/fedot_ind/core/models/BaseExtractor.py b/fedot_ind/core/models/BaseExtractor.py
index 6da36eb9d..8ef722ed0 100644
--- a/fedot_ind/core/models/BaseExtractor.py
+++ b/fedot_ind/core/models/BaseExtractor.py
@@ -10,6 +10,7 @@
 from fedot_ind.core.metrics.metrics_implementation import *
 from fedot_ind.core.operation.IndustrialCachableOperation import IndustrialCachableOperationImplementation
+from fedot_ind.core.operation.transformation.extraction.statistical import stat_methods
 from fedot_ind.core.operation.utils.cache import DataCacher
@@ -21,7 +22,9 @@ class BaseExtractor(IndustrialCachableOperationImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
         self.current_window = None
-        self.n_processes = math.ceil(cpu_count() * 0.7) if cpu_count() > 1 else 1
+        # TODO: get back
+        self.n_processes = 2
+        # self.n_processes = math.ceil(cpu_count() * 0.7) if cpu_count() > 1 else 1
         self.data_type = DataTypesEnum.table
         self.use_cache = params.get('use_cache', False)
@@ -103,3 +106,30 @@ def extract_features(self, train_features: pd.DataFrame,
             return features
         else:
             return self.generate_features_from_ts(train_features, dataset_name)
+
+    @staticmethod
+    def get_statistical_features(time_series: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
+        """
+        Method for creating baseline statistical features for a given time series.
+
+        Args:
+            time_series: time series for which features are generated
+
+        Returns:
+            Row vector of statistical features in the form of a pandas DataFrame
+
+        """
+        names = []
+        vals = []
+        # flatten time series
+        if isinstance(time_series, (pd.DataFrame, pd.Series)):
+            time_series = time_series.values
+        time_series = time_series.flatten()
+
+        for name, method in stat_methods.items():
+            try:
+                vals.append(method(time_series))
+                names.append(name)
+            except ValueError:
+                continue
+        return pd.DataFrame([vals], columns=names)
diff --git a/fedot_ind/core/models/signal/SignalExtractor.py b/fedot_ind/core/models/signal/SignalExtractor.py
index 7bc714c8d..af817ba61 100644
--- a/fedot_ind/core/models/signal/SignalExtractor.py
+++ b/fedot_ind/core/models/signal/SignalExtractor.py
@@ -7,7 +7,7 @@
 from fedot_ind.core.metrics.metrics_implementation import *
 from fedot_ind.core.models.signal.WindowedFeaturesExtractor import WindowedFeatureExtractor
-from fedot_ind.core.operation.transformation.extraction.statistical import StatFeaturesExtractor
+# from fedot_ind.core.operation.transformation.extraction.statistical import StatFeaturesExtractor


 class SignalExtractor(WindowedFeatureExtractor):
@@ -17,7 +17,7 @@ class SignalExtractor(WindowedFeatureExtractor):
         use_cache: flag to use cache or not. Defined in Config_Classification.yaml
     Attributes:
         ts_samples_count (int): number of samples in time series
-        aggregator (StatFeaturesExtractor): class to aggregate features
+        # aggregator (StatFeaturesExtractor): class to aggregate features
         wavelet_extractor (WaveletExtractor): class to extract wavelet features
         wavelet (str): current wavelet type
         vis_flag (bool): flag to visualize or not
@@ -29,8 +29,8 @@ class SignalExtractor(WindowedFeatureExtractor):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
         self.ts_samples_count = None
-        self.aggregator = StatFeaturesExtractor()
-        self.wavelet_extractor = WaveletExtractor
+        # self.aggregator = StatFeaturesExtractor()
+        # self.wavelet_extractor = WaveletExtractor
         self.wavelet = params.get('wavelet')
         self.vis_flag = False
diff --git a/fedot_ind/core/models/signal/WindowedFeaturesExtractor.py b/fedot_ind/core/models/signal/WindowedFeaturesExtractor.py
index 5094b2b34..4c6677f27 100644
--- a/fedot_ind/core/models/signal/WindowedFeaturesExtractor.py
+++ b/fedot_ind/core/models/signal/WindowedFeaturesExtractor.py
@@ -22,4 +22,5 @@ def apply_window_for_stat_feature(ts_data: pd.DataFrame,
             df = feature_generator(slice_ts)
             df.columns = [x + f'_on_interval: {i} - {i + window_size}' for x in df.columns]
             tmp_list.append(df)
-    return tmp_list
\ No newline at end of file
+    return tmp_list
+
diff --git a/fedot_ind/core/models/statistical/StatsExtractor.py b/fedot_ind/core/models/statistical/StatsExtractor.py
index ae18ee1bc..24e2dd4b0 100644
--- a/fedot_ind/core/models/statistical/StatsExtractor.py
+++ b/fedot_ind/core/models/statistical/StatsExtractor.py
@@ -1,51 +1,93 @@
+from multiprocessing import Pool
 from typing import Optional

+import numpy as np
 import pandas as pd
 from fedot.core.data.data import InputData
 from fedot.core.operations.operation_parameters import OperationParameters
+from pandas import Index
+from tqdm import tqdm

 from fedot_ind.core.models.BaseExtractor import BaseExtractor
-from fedot_ind.core.operation.transformation.extraction.statistical import StatFeaturesExtractor


 class StatsExtractor(BaseExtractor):
     """Class responsible for quantile feature generator experiment.

-    Args:
-        window_mode: Flag for window mode. Defaults to False.
-        use_cache: Flag for cache usage. Defaults to False.
+
     Attributes:
         use_cache (bool): Flag for cache usage.
-        aggregator (StatFeaturesExtractor): StatFeaturesExtractor object.
-        vis_flag (bool): Flag for visualization.
-        train_feats (pd.DataFrame): Train features.
-        test_feats (pd.DataFrame): Test features.
""" def __init__(self, params: Optional[OperationParameters] = None): super().__init__(params) - self.aggregator = StatFeaturesExtractor() + self.var_threshold = params.get('var_threshold') self.window_mode = params.get('window_mode') self.window_size = params.get('window_size') - self.vis_flag = False - self.train_feats = None - self.test_feats = None - self.n_components = None - self.logging_params.update({'WS': self.window_size, 'WM': self.window_mode}) + self.logging_params.update({'Wsize': self.window_size, + 'Wmode': self.window_mode, + 'VarTh': self.var_threshold}) + self.relevant_features = None def fit(self, input_data: InputData): pass + def _transform(self, input_data: InputData) -> np.array: + """ + Method for feature generation for all series + """ + input_data_squeezed = np.squeeze(input_data.features, 3) + with Pool(self.n_processes) as p: + v = list(tqdm(p.imap(self.generate_features_from_ts, input_data_squeezed), + total=input_data.features.shape[0], + desc=f'{self.__class__.__name__} transform', + postfix=f'{self.logging_params}', + colour='green', + unit='ts', + ascii=False, + position=0, + leave=True) + ) + stat_features = v[0].columns + n_components = v[0].shape[0] + predict = self._clean_predict(np.array(v)) + predict = self.drop_features(predict, stat_features, n_components) + return predict.values + + def drop_features(self, predict: pd.DataFrame, columns: Index, n_components: int): + """ + Method for dropping features with low variance + """ + # Fill columns names for every extracted ts component + predict = pd.DataFrame(predict, + columns=[f'{col}{str(i)}' for i in range(1, n_components+1) for col in columns]) + + if self.relevant_features is None: + reduced_df, self.relevant_features = self.filter_by_var(predict, threshold=self.var_threshold) + return reduced_df + else: + return predict[self.relevant_features] + + def filter_by_var(self, data: pd.DataFrame, threshold: float): + cols = data.columns + filtrat = {} + + for col in cols: + if np.var(data[col].values) > threshold: + filtrat.update({col: data[col].values.flatten()}) + + return pd.DataFrame(filtrat), list(filtrat.keys()) + def extract_stats_features(self, ts): if self.window_mode: - aggregator = self.aggregator.create_baseline_features - list_of_stat_features_on_interval = self.apply_window_for_stat_feature(ts_data=ts, - feature_generator=aggregator, - window_size=self.window_size) - aggregation_df = pd.concat(list_of_stat_features_on_interval, axis=1) + # aggregator = self.aggregator.create_baseline_features + list_of_stat_features = self.apply_window_for_stat_feature(ts_data=ts.T if ts.shape[1] == 1 else ts, + feature_generator=self.get_statistical_features, + window_size=self.window_size) + aggregation_df = pd.concat(list_of_stat_features, axis=1) else: - aggregation_df = self.aggregator.create_baseline_features(ts) + aggregation_df = self.get_statistical_features(ts) return aggregation_df def generate_features_from_ts(self, @@ -84,14 +126,9 @@ def __get_feature_matrix(self, ts): ts_components = [pd.DataFrame(x) for x in ts.values.tolist()] if ts_components[0].shape[0] != 1: ts_components = [x.T for x in ts_components] - tmp_list = [] - for index, component in enumerate(ts_components): - aggregation_df = self.extract_stats_features(component) - tmp_list.append(aggregation_df) - aggregation_df = pd.concat(tmp_list, axis=0) - # tmp_list = [self.extract_stats_features(x) for x in ts_components] - # aggregation_df = pd.concat(tmp_list, axis=0) + tmp_list = [self.extract_stats_features(x) for x in 
+        aggregation_df = pd.concat(tmp_list, axis=0)
         return aggregation_df
diff --git a/fedot_ind/core/operation/IndustrialCachableOperation.py b/fedot_ind/core/operation/IndustrialCachableOperation.py
index e11f3a26b..e1b7b3786 100644
--- a/fedot_ind/core/operation/IndustrialCachableOperation.py
+++ b/fedot_ind/core/operation/IndustrialCachableOperation.py
@@ -40,11 +40,15 @@ def transform(self, input_data: InputData) -> OutputData:
         Method firstly tries to load result from cache. If unsuccessful, it starts to generate features
         """
         # TODO: get back to
-        # operation_parameters = [f'{key}:{value}' for key, value in self.params.to_dict().items()]
-        # class_params = list(self.__dir__())
-        # operational_info = operation_parameters + class_params
-        # hashed_info = self.cacher.hash_info(data=input_data.features.tobytes(),
-        #                                     operation_info=operational_info)
+        # operation_parameters = self.params.to_dict()
+        # class_params = {k: v for k, v in self.__dict__.items() if k not in ['cacher',
+        #                                                                     'params',
+        #                                                                     'n_processes',
+        #                                                                     'logging_params']}
+        #
+        # operation_parameters.update(class_params)
+        # hashed_info = self.cacher.hash_info(data=input_data.features,
+        #                                     operation_info=operation_parameters.__repr__())

         # hashed_info = self.cacher.hash_info(data=input_data.features.tobytes(),
         #                                     operation_info=self.params.to_dict())
diff --git a/fedot_ind/core/operation/transformation/basis/data_driven.py b/fedot_ind/core/operation/transformation/basis/data_driven.py
index a6acca95a..415ddd9af 100644
--- a/fedot_ind/core/operation/transformation/basis/data_driven.py
+++ b/fedot_ind/core/operation/transformation/basis/data_driven.py
@@ -1,24 +1,21 @@
 import math
-import time
 from multiprocessing import Pool
-from typing import Tuple, TypeVar, Optional
+from typing import Optional, Tuple, TypeVar

 import numpy as np
 import tensorly as tl
 from fedot.core.operations.operation_parameters import OperationParameters
 from pymonad.either import Either
 from pymonad.list import ListMonad
-from sklearn.metrics import f1_score, roc_auc_score
 from tensorly.decomposition import parafac
 from tqdm import tqdm

 from fedot_ind.core.architecture.preprocessing import InputData
 from fedot_ind.core.operation.decomposition.matrix_decomposition.fast_svd import bksvd
-
 from fedot_ind.core.operation.transformation.basis.abstract_basis import BasisDecompositionImplementation
 from fedot_ind.core.operation.transformation.data.hankel import HankelMatrix
-from fedot_ind.core.operation.transformation.regularization.spectrum import singular_value_hard_threshold, \
-    reconstruct_basis
+from fedot_ind.core.operation.transformation.regularization.spectrum import reconstruct_basis, \
+    singular_value_hard_threshold

 class_type = TypeVar("T", bound="DataDrivenBasis")
diff --git a/fedot_ind/core/operation/transformation/basis/fourier.py b/fedot_ind/core/operation/transformation/basis/fourier.py
index 9116c0696..4687b7b48 100644
--- a/fedot_ind/core/operation/transformation/basis/fourier.py
+++ b/fedot_ind/core/operation/transformation/basis/fourier.py
@@ -23,6 +23,8 @@ def __init__(self, params: Optional[OperationParameters] = None):
         self.threshold = params.get('threshold')
         self.basis = None

+        self.logging_params.update({'threshold': self.threshold})
+
     def low_pass(self, input_data):
         fourier_coef = np.fft.rfft(input_data)
         frequencies = np.fft.rfftfreq(input_data.size, d=2e-3 / input_data.size)
diff --git a/fedot_ind/core/operation/transformation/extraction/statistical.py b/fedot_ind/core/operation/transformation/extraction/statistical.py
index 622ae4978..4d99351f2 100644
--- a/fedot_ind/core/operation/transformation/extraction/statistical.py
+++ b/fedot_ind/core/operation/transformation/extraction/statistical.py
@@ -1,142 +1,67 @@
 from typing import Union

-import numpy as np
-import pandas as pd
-
-from fedot_ind.core.architecture.settings.hyperparams import select_hyper_param
-
-stat_methods = select_hyper_param('statistical_methods')
-stat_methods_extra = select_hyper_param('statistical_methods_extra')
-quantile_dict = {'q5_': 0.05,
-                 'q25_': 0.25,
-                 'q75_': 0.75,
-                 'q95_': 0.95}
-
-
-class StatFeaturesExtractor:
-    """Class for generating statistical features for a given time series.
-
-    """
-
-    @staticmethod
-    def create_baseline_features(feature_to_aggregation: Union[pd.DataFrame, np.ndarray]):
-        stat_list = []
-        column_name = []
-        feature_to_aggregation = pd.DataFrame(feature_to_aggregation)
-        if feature_to_aggregation.shape[0] != 1:
-            feature_to_aggregation = feature_to_aggregation.T
-        for method_name, method_func in stat_methods_extra.items():
-            try:
-                tmp = feature_to_aggregation.copy(deep=True)
-            except Exception:
-                tmp = feature_to_aggregation.copy()
-
-            if method_name.startswith('q'):
-                _ = []
-                for idx, row in tmp.iterrows():
-                    _.append(method_func(row, q=quantile_dict[method_name]))
-                tmp = np.array(_)
-                stat_list.append(tmp)
-                column_name.append(method_name)
-            elif method_name.startswith('l'):
-                tmp = tmp.apply(method_func, axis=1)
-                tmp = tmp.astype(int)
-                stat_list.append(tmp.sum(axis=1).values)
-                column_name.append(method_name)
-            elif method_name.startswith('d'):
-                tmp = tmp.apply(method_func, axis=1)
-                stat_list.append(tmp.apply(np.mean).values)
-                column_name.append(method_name + 'mean')
-                stat_list.append(tmp.apply(np.min).values)
-                column_name.append(method_name + 'min')
-                stat_list.append(tmp.apply(np.max).values)
-                column_name.append(method_name + 'max')
-            else:
-                stat_list.append(tmp.apply(method_func, axis=1).values)
-                column_name.append(method_name)
-
-            del tmp
-
-        df_points_stat = pd.DataFrame(stat_list)
-        df_points_stat = df_points_stat.T
-        df_points_stat.columns = column_name
-
-        return df_points_stat
-
-    @staticmethod
-    def create_features(feature_to_aggregation: Union[pd.DataFrame, np.ndarray]):
-        stat_list = []
-        column_name = []
-        feature_to_aggregation = pd.DataFrame(feature_to_aggregation)
-        for method_name, method_func in stat_methods.items():
-            tmp = feature_to_aggregation.copy()
-
-            if method_name.startswith('q'):
-                for col in tmp.columns:
-                    tmp[col] = method_func(tmp[col], q=quantile_dict[method_name])
-                tmp = tmp.drop_duplicates()
-            else:
-                tmp = pd.DataFrame(tmp.apply(method_func))
-                tmp = tmp.T
-
-            for feature in feature_to_aggregation.columns:
-                column_name.append(method_name + str(feature))
-
-            stat_list.append(tmp.values)
-
-        df_points_stat = pd.DataFrame(np.concatenate(stat_list, axis=1))
-        df_points_stat.columns = column_name
-        return df_points_stat
-
-    def _transform(self, X, intervals) -> np.ndarray:
-        """
-        Transform X for given intervals. Compute the mean, standard deviation and slope for given
-        intervals of input data X.
-
-        Args:
-            X: input data
-            intervals: list of intervals for which to compute the mean, standard deviation and slope
-
-        Returns:
-            Array of shape (len(X), 3 * len(intervals))
-        """
-        n_instances, _ = X.shape
-        n_intervals, _ = intervals.shape
-        transformed_x = np.empty(shape=(3 * n_intervals, n_instances), dtype=np.float32)
-        for j in range(n_intervals):
-            X_slice = X[:, intervals[j][0]: intervals[j][1]]
-            means = np.mean(X_slice, axis=1)
-            std_dev = np.std(X_slice, axis=1)
-            slope = _slope(X_slice, axis=1)
-            transformed_x[3 * j] = means
-            transformed_x[3 * j + 1] = std_dev
-            transformed_x[3 * j + 2] = slope
-
-        return transformed_x.T
-
-    @staticmethod
-    def _get_intervals(n_intervals: int,
-                       min_interval: int,
-                       series_length: int,
-                       rng) -> np.ndarray:
-        """Generate random intervals for given parameters.
-
-        Args:
-            n_intervals: Number of intervals to generate
-            min_interval: Minimum length of an interval
-            series_length: Length of the time series
-            rng: ...
-
-        Returns:
-            Array containing the intervals.
-
-        """
-        intervals = np.zeros((n_intervals, 2), dtype=int)
-        for j in range(n_intervals):
-            intervals[j][0] = rng.randint(series_length - min_interval)
-            length = rng.randint(series_length - intervals[j][0] - 1)
-            if length < min_interval:
-                length = min_interval
-            intervals[j][1] = intervals[j][0] + length
-        return intervals
-
+from fedot_ind.core.operation.transformation.extraction.statistical_methods import *
+
+stat_methods = {'mean_': np.mean,
+                'median_': np.median,
+                'std_': np.std,
+                'max_': np.max,
+                'min_': np.min,
+                'q5_': q5,
+                'q25_': q25,
+                'q75_': q75,
+                'q95_': q95,
+                'sum_': np.sum,
+                'dif_': diff,
+                'skewness_': skewness,
+                'kurtosis_': kurtosis,
+                'n_peaks_': n_peaks,
+                'slope_': slope,
+                'ben_corr_': ben_corr,
+                'interquartile_range_': interquartile_range,
+                'energy_': energy,
+                'cross_rate_': zero_crossing_rate,
+                'autocorrelation_': autocorrelation,
+                'base_entropy_': base_entropy,
+                'shannon_entropy_': shannon_entropy,
+                'ptp_amplitude_': ptp_amp,
+                'crest_factor_': crest_factor,
+                'mean_ema_': mean_ema,
+                'mean_moving_median_': mean_moving_median,
+                'hjorth_mobility_': hjorth_mobility,
+                'hjorth_complexity_': hjorth_complexity,
+                'hurst_exponent_': hurst_exponent,
+                'petrosian_fractal_dimension_': pfd,
+                }
+
+# class StatFeaturesExtractor:
+#     """Class for generating statistical features for a given time series.
+#
+#     """
+#
+#     @staticmethod
+#     def create_baseline_features(time_series: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
+#         """
+#         Method for creating baseline statistical features for a given time series.
+#
+#         Args:
+#             time_series: time series for which features are generated
+#
+#         Returns:
+#             Row vector of statistical features in the form of a pandas DataFrame
+#
+#         """
+#         names = []
+#         vals = []
+#         # flatten time series
+#         if isinstance(time_series, (pd.DataFrame, pd.Series)):
+#             time_series = time_series.values
+#         time_series = time_series.flatten()
+#
+#         for name, method in stat_methods.items():
+#             try:
+#                 vals.append(method(time_series))
+#                 names.append(name)
+#             except ValueError:
+#                 continue
+#         return pd.DataFrame([vals], columns=names)
diff --git a/fedot_ind/core/operation/transformation/extraction/statistical_methods.py b/fedot_ind/core/operation/transformation/extraction/statistical_methods.py
new file mode 100644
index 000000000..4e085f5bb
--- /dev/null
+++ b/fedot_ind/core/operation/transformation/extraction/statistical_methods.py
@@ -0,0 +1,262 @@
+import warnings
+
+import numpy as np
+import pandas as pd
+from scipy.signal import find_peaks
+from scipy.stats import entropy, linregress
+from sklearn.preprocessing import MinMaxScaler
+
+warnings.filterwarnings("ignore")
+
+
+def lambda_less_zero(array: np.array) -> int:
+    """Returns the number of elements smaller than 0.01."""
+    mask = np.array(list(map(lambda x: x < 0.01, array)), dtype=int)
+    return np.sum(mask)
+
+
+def q5(array: np.array) -> float:
+    return np.quantile(array, 0.05)
+
+
+def q25(array: np.array) -> float:
+    return np.quantile(array, 0.25)
+
+
+def q75(array: np.array) -> float:
+    return np.quantile(array, 0.75)
+
+
+def q95(array: np.array) -> float:
+    return np.quantile(array, 0.95)
+
+
+def diff(array: np.array) -> float:
+    return np.diff(array, n=len(array) - 1)[0]
+
+
+# Extra methods for statistical features extraction
+def skewness(array: np.array) -> float:
+    if not isinstance(array, pd.Series):
+        array = pd.Series(array)
+
+    return pd.Series.skew(array)
+
+
+def kurtosis(array: np.array) -> float:
+    if not isinstance(array, pd.Series):
+        array = pd.Series(array)
+    return pd.Series.kurtosis(array)
+
+
+def n_peaks(array: np.array) -> int:
+    # find_peaks returns a (peaks, properties) tuple; the peak indices are
+    # in the first element, so count those rather than the tuple itself
+    return len(find_peaks(array)[0])
+
+
+def slope(array: np.array) -> float:
+    return linregress(range(len(array)), array).slope
+
+
+def ben_corr(x):
+    """
+    Useful for anomaly detection applications [1][2]. Returns the correlation from first digit distribution when
+    compared to the Newcomb-Benford's Law distribution [3][4].
+
+    .. math::
+
+        P(d)=\\log_{10}\\left(1+\\frac{1}{d}\\right)
+
+    where :math:`P(d)` is the Newcomb-Benford distribution for :math:`d` that is the leading digit of the number
+    {1, 2, 3, 4, 5, 6, 7, 8, 9}.
+
+    .. rubric:: References
+
+    | [1] A Statistical Derivation of the Significant-Digit Law, Theodore P. Hill, Statistical Science, 1995
+    | [2] The significant-digit phenomenon, Theodore P. Hill, The American Mathematical Monthly, 1995
+    | [3] The law of anomalous numbers, Frank Benford, Proceedings of the American philosophical society, 1938
+    | [4] Note on the frequency of use of the different digits in natural numbers, Simon Newcomb, American Journal of
+    | mathematics, 1881
+
+    :param x: the time series to calculate the feature of
+    :type x: numpy.ndarray
+    :return: the value of this feature
+    :return type: float
+    """
+    x = np.asarray(x)
+
+    # retrieve first digit from data
+    x = np.array(
+        [int(str(np.format_float_scientific(i))[:1]) for i in np.abs(np.nan_to_num(x))]
+    )
+
+    # benford distribution
+    benford_distribution = np.array([np.log10(1 + 1 / n) for n in range(1, 10)])
+
+    data_distribution = np.array([(x == n).mean() for n in range(1, 10)])
+
+    # np.corrcoef outputs the normalized covariance (correlation) between benford_distribution and data_distribution.
+    # In this case returns a 2x2 matrix, the [0, 1] and [1, 1] are the values between the two arrays
+    return np.corrcoef(benford_distribution, data_distribution)[0, 1]
+
+
+def interquartile_range(array: np.array) -> float:
+    return q75(array) - q25(array)
+
+
+def energy(array: np.array) -> float:
+    return np.sum(np.power(array, 2)) / len(array)
+
+
+def autocorrelation(array: np.array) -> float:
+    """Lag-1 autocorrelation of the time series with its lagged version
+    """
+    lagged_ts = np.roll(array, 1)
+    return np.corrcoef(array, lagged_ts)[0, 1]
+
+
+def zero_crossing_rate(array: np.array) -> float:
+    """Returns the rate of sign-changes of the time series for a scaled version of it.
+    """
+    scaler = MinMaxScaler(feature_range=(-1, 1))
+    scaled_array = scaler.fit_transform(array.reshape(-1, 1)).flatten()
+    signs = np.sign(scaled_array)
+    signs[signs == 0] = -1
+    return np.sum((signs[1:] - signs[:-1]) != 0) / len(scaled_array)
+
+
+def shannon_entropy(array: np.array) -> float:
+    """Returns the Shannon Entropy of the time series.
+    """
+    p = np.unique(array, return_counts=True)[1] / len(array)
+    return -np.sum(p * np.log2(p))
+
+
+def base_entropy(array: np.array) -> float:
+    """Returns the entropy of the time series normalized to sum to one.
+    """
+    # Normalize the time series to sum up to 1
+    normalized_series = array / np.sum(array)
+    return entropy(normalized_series)
+
+
+def ptp_amp(array: np.array) -> float:
+    """Returns the peak-to-peak amplitude of the time series.
+    """
+    return np.ptp(array)
+
+
+def crest_factor(array: np.array) -> float:
+    """Returns the crest factor of the time series.
+    """
+    return np.max(np.abs(array)) / np.sqrt(np.mean(np.square(array)))
+
+
+def mean_ema(array: np.array) -> float:
+    """Returns the last value of the exponential moving average of the time series.
+    """
+    span = int(len(array) / 10)
+    if span in [0, 1]:
+        span = 2
+    return pd.Series(array).ewm(span=span).mean().iloc[-1]
+
+
+def mean_moving_median(array: np.array) -> float:
+    """Returns the mean of a rolling-median smoothing of the time series.
+    """
+    span = int(len(array) / 10)
+    if span in [0, 1]:
+        span = 2
+    return pd.Series(array).rolling(window=span, center=False).median().mean()
+
+
+def hjorth_mobility(array):
+    # Compute the first-order differential sequence
+    diff_sequence = np.diff(array)
+    # Calculate the mean power of the first-order differential sequence
+    M2 = np.sum(np.power(diff_sequence, 2)) / len(diff_sequence)
+    # Calculate the total power of the time series
+    TP = np.sum(np.power(array, 2)) / len(array)
+    # Calculate Hjorth mobility
+    mobility = np.sqrt(M2 / TP)
+    return mobility
+
+
+def hjorth_complexity(array):
+    # Compute the first-order differential sequence
+    diff_sequence = np.diff(array)
+    # Calculate the mean power of the first-order differential sequence
+    M2 = np.sum(np.power(diff_sequence, 2)) / len(diff_sequence)
+    # Calculate the total power of the time series
+    TP = np.sum(np.power(array, 2)) / len(array)
+    # Calculate the mean power of the second-order differential sequence
+    M4 = sum([(diff_sequence[i] - diff_sequence[i - 1]) ** 2 for i in range(1, len(diff_sequence))]) / len(
+        diff_sequence)
+    # Calculate Hjorth complexity
+    complexity = np.sqrt((M4 * TP) / (M2 * M2))
+    # complexity = (M4 * TP) / (M2 * M2)
+    return complexity
+
+
+def hurst_exponent(array):
+    """Estimates the Hurst exponent of the time series via rescaled-range (R/S) analysis.
+
+    Notes:
+        Author of this function is Xin Liu
+
+    Args:
+        array: time series values
+
+    Returns:
+        Estimated Hurst exponent
+
+    """
+    X = np.array(array)
+    N = X.size
+    T = np.arange(1, N + 1)
+    Y = np.cumsum(X)
+    Ave_T = Y / T
+
+    S_T = np.zeros(N)
+    R_T = np.zeros(N)
+
+    for i in range(N):
+        S_T[i] = np.std(X[:i + 1])
+        X_T = Y - T * Ave_T[i]
+        R_T[i] = np.ptp(X_T[:i + 1])
+
+    R_S = R_T / S_T
+    R_S = np.log(R_S)[1:]
+    n = np.log(T)[1:]
+    A = np.column_stack((n, np.ones(n.size)))
+    [m, c] = np.linalg.lstsq(A, R_S, rcond=None)[0]
+    H = m
+    return H
+
+
+def pfd(X, D=None):
+    """
+    The Petrosian fractal dimension (PFD) is a chaos-theory measure used to estimate EEG signal complexity.
+    Compute the Petrosian fractal dimension of a time series from either of the two
+    cases below:
+        1. X, the time series of type list (default)
+        2. D, the first order differential sequence of X (if D is provided,
+           recommended to speed up)
+
+    In case 1, D is computed using Numpy's difference function.
+
+    To speed up, it is recommended to compute D before calling this function
+    because D may also be used by other functions whereas computing it here
+    again will slow down.
+    """
+    if D is None:
+        D = np.diff(X)
+        D = D.tolist()
+    N_delta = 0  # number of sign changes in derivative of the signal
+    for i in range(1, len(D)):
+        if D[i] * D[i - 1] < 0:
+            N_delta += 1
+    n = len(X)
+    # parentheses matter: the denominator term is log10(n / (n + 0.4 * N_delta));
+    # without them the expression collapses to log10(1 + 0.4 * N_delta)
+    return np.log10(n) / (
+        np.log10(n) + np.log10(n / (n + 0.4 * N_delta))
+    )
+
+
+if __name__ == "__main__":
+    arr = np.array([1, 2, 3, 4, 5])
+    print(pfd(arr), slope(arr))
diff --git a/fedot_ind/core/repository/data/default_operation_params.json b/fedot_ind/core/repository/data/default_operation_params.json
index 3a14f7d8f..d5289b07e 100644
--- a/fedot_ind/core/repository/data/default_operation_params.json
+++ b/fedot_ind/core/repository/data/default_operation_params.json
@@ -155,8 +155,9 @@
     "threshold": 20000
   },
   "quantile_extractor": {
-    "window_size": 5,
-    "window_mode": false
+    "window_size": null,
+    "window_mode": false,
+    "var_threshold": 0.01
   },
   "recurrence_extractor": {
     "window_size": 20,
diff --git a/fedot_ind/core/tuning/search_space.py b/fedot_ind/core/tuning/search_space.py
index b38e05138..f14703509 100644
--- a/fedot_ind/core/tuning/search_space.py
+++ b/fedot_ind/core/tuning/search_space.py
@@ -4,8 +4,8 @@
 industrial_search_space = {
     'data_driven_basis': {
-        'sv_selector': (hp.choice, [['median', 'mean', '0.25%']]),
-        'window_size': (hp.choice, [[x for x in range(5, 50, 5)]])},
+        'sv_selector': (hp.choice, [['median', '0.75%', '0.25%']]),
+        'window_size': (hp.choice, [[x for x in range(5, 50, 5)]])},
     'wavelet_basis':
         {'n_components': (hp.uniformint, [2, 10]),
          'wavelet': (hp.choice, [['mexh', 'shan', 'morl', 'cmor', 'fbsp', 'db5', 'sym5']])},
@@ -13,9 +13,25 @@
         {'spectrum': (hp.choice, [['smoothed']]),
          'threshold': (hp.uniformint, [10000, 50000])},

-    'quantile_extractor':
-        {'window_mode': (hp.choice, [[True, False]]),
-         'window_size': (hp.choice, [[x for x in range(1, 50, 3)]])},
+    # 'quantile_extractor':
+    #     {'window_mode': (hp.choice, [[True, True]]),
+    #     {'window_mode': (hp.choice, [[True, False]]),
+    #      'window_size': (hp.choice, [[x for x in range(1, 50, 3)]]),
+    #      'var_threshold': (hp.choice, [[_ for _ in np.linspace(0, 0.02, 35)]])},
+
+    'quantile_extractor': {'nested_space': (hp.choice, [[
+        {
+            'window_mode': True,
+            'window_size': hp.choice('window_size_true', list(range(1, 50, 3))),
+            'var_threshold': hp.uniform('threshold_true', 0, 0.02)
+        },
+        {
+            'window_mode': False,
+            'window_size': None,
+            'var_threshold': hp.uniform('threshold_false', 0, 0.02)
+        }
+
+    ]])},
     'recurrence_extractor':
         {'win_mode': (hp.choice, [[True, False]]),
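Note on the new feature registry: `stat_methods` in statistical.py maps feature names to the callables added in statistical_methods.py, and `BaseExtractor.get_statistical_features` simply iterates over it, skipping any method that raises ValueError. A minimal sketch of that contract, not part of the patch (assumes fedot_ind with this change is importable; the random series and printout are purely illustrative):

import numpy as np

from fedot_ind.core.operation.transformation.extraction.statistical import stat_methods

ts = np.random.default_rng(42).normal(size=200)
row = {}
for name, method in stat_methods.items():
    try:
        row[name] = method(ts)  # each method maps a 1D series to a scalar
    except ValueError:
        continue  # mirrors get_statistical_features: failing methods are skipped
print(f'{len(row)} features, e.g. mean_ = {row["mean_"]:.3f}')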
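One remark on the search-space change: a flat space for 'quantile_extractor' would keep sampling window_size even when window_mode=False, where it is ignored; wrapping the two branches in a single hp.choice makes window_size conditional on window_mode=True. A standalone sketch of how hyperopt samples such a nested space (the stub objective is illustrative only and stands in for the real pipeline metric):

from hyperopt import fmin, hp, tpe

nested_space = hp.choice('nested_space', [
    {'window_mode': True,
     'window_size': hp.choice('window_size_true', list(range(1, 50, 3))),
     'var_threshold': hp.uniform('threshold_true', 0, 0.02)},
    {'window_mode': False,
     'window_size': None,
     'var_threshold': hp.uniform('threshold_false', 0, 0.02)},
])


def objective(params):
    # stub: a real objective would run StatsExtractor with these params
    # and return a validation loss to minimize
    return params['var_threshold']


best = fmin(objective, nested_space, algo=tpe.suggest, max_evals=20)
print(best)  # indices/values of the winning branch, e.g. {'nested_space': 0, ...}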