From 7e5240e417acd6c533414ff395db19736229eea3 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 14 Sep 2023 15:55:57 +0300 Subject: [PATCH] topological extractor refactoring --- fedot_ind/core/models/BaseExtractor.py | 11 +-- .../models/recurrence/RecurrenceExtractor.py | 2 +- .../topological/TopologicalExtractor.py | 73 ++++++++++---- .../{topological.py => topofeatures.py} | 29 ++---- .../transformation/data/kernel_matrix.py | 53 +++++++++++ .../point_cloud.py} | 94 +++---------------- 6 files changed, 137 insertions(+), 125 deletions(-) rename fedot_ind/core/models/topological/{topological.py => topofeatures.py} (91%) create mode 100644 fedot_ind/core/operation/transformation/data/kernel_matrix.py rename fedot_ind/core/operation/transformation/{DataTransformer.py => data/point_cloud.py} (64%) diff --git a/fedot_ind/core/models/BaseExtractor.py b/fedot_ind/core/models/BaseExtractor.py index 3b5c7f7f7..8a5aa24f2 100644 --- a/fedot_ind/core/models/BaseExtractor.py +++ b/fedot_ind/core/models/BaseExtractor.py @@ -3,6 +3,8 @@ from multiprocessing import cpu_count, Pool from typing import Optional +import numpy as np +import pandas as pd from fedot.core.data.data import InputData from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum @@ -38,12 +40,9 @@ def _transform(self, input_data: InputData) -> np.array: """ Method for feature generation for all series """ - if type(input_data) == InputData: - features = input_data.features - n_samples = input_data.features.shape[0] - else: - features = input_data - n_samples = input_data.shape[0] + features = input_data.features + n_samples = input_data.features.shape[0] + try: input_data_squeezed = np.squeeze(features, 3) except Exception: diff --git a/fedot_ind/core/models/recurrence/RecurrenceExtractor.py b/fedot_ind/core/models/recurrence/RecurrenceExtractor.py index e20bed16c..d5f40d8ad 100644 --- a/fedot_ind/core/models/recurrence/RecurrenceExtractor.py +++ b/fedot_ind/core/models/recurrence/RecurrenceExtractor.py @@ -7,7 +7,7 @@ from fedot_ind.core.operation.transformation.data.hankel import HankelMatrix from fedot_ind.core.metrics.metrics_implementation import * from fedot_ind.core.models.WindowedFeaturesExtractor import WindowedFeatureExtractor -from fedot_ind.core.operation.transformation.DataTransformer import TSTransformer +from fedot_ind.core.operation.transformation.data.kernel_matrix import TSTransformer from fedot_ind.core.models.recurrence.sequences import ReccurenceFeaturesExtractor diff --git a/fedot_ind/core/models/topological/TopologicalExtractor.py b/fedot_ind/core/models/topological/TopologicalExtractor.py index 29c3438e5..c70e5fa52 100644 --- a/fedot_ind/core/models/topological/TopologicalExtractor.py +++ b/fedot_ind/core/models/topological/TopologicalExtractor.py @@ -6,15 +6,20 @@ import pandas as pd from fedot.core.data.data import InputData, OutputData from fedot.core.operations.operation_parameters import OperationParameters +from fedot.core.pipelines.pipeline_builder import PipelineBuilder from gtda.time_series import takens_embedding_optimal_parameters from scipy import stats from tqdm import tqdm +from examples.fedot.fedot_ex import init_input_data +from fedot_ind.core.architecture.preprocessing.DatasetLoader import DataLoader from fedot_ind.core.models.BaseExtractor import BaseExtractor -from fedot_ind.core.models.topological.topological import AverageHoleLifetimeFeature, \ +from fedot_ind.core.models.topological.topofeatures import AverageHoleLifetimeFeature, \ AveragePersistenceLandscapeFeature, BettiNumbersSumFeature, HolesNumberFeature, MaxHoleLifeTimeFeature, \ PersistenceDiagramsExtractor, PersistenceEntropyFeature, RadiusAtMaxBNFeature, RelevantHolesNumber, \ SimultaneousAliveHolesFeature, SumHoleLifetimeFeature, TopologicalFeaturesExtractor +from fedot_ind.core.operation.transformation.data.point_cloud import TopologicalTransformation +from fedot_ind.core.repository.initializer_industrial_models import IndustrialModels sys.setrecursionlimit(1000000000) @@ -29,6 +34,11 @@ 'BettiNumbersSumFeature': BettiNumbersSumFeature(), 'RadiusAtMaxBNFeature': RadiusAtMaxBNFeature()} +PERSISTENCE_DIAGRAM_EXTRACTOR = PersistenceDiagramsExtractor(takens_embedding_dim=1, + takens_embedding_delay=2, + homology_dimensions=(0, 1), + parallel=False) + class TopologicalExtractor(BaseExtractor): """Class for extracting topological features from time series data. @@ -39,33 +49,53 @@ class TopologicalExtractor(BaseExtractor): def __init__(self, params: Optional[OperationParameters] = None): super().__init__(params) - self.filtered_features = None - self.feature_extractor = None - - def fit(self, input_data: InputData) -> OutputData: - pass - - def generate_topological_features(self, ts_data: pd.DataFrame) -> pd.DataFrame: + self.window_size = params.get('window_size', 10) + self.feature_extractor = TopologicalFeaturesExtractor( + persistence_diagram_extractor=PERSISTENCE_DIAGRAM_EXTRACTOR, + persistence_diagram_features=PERSISTENCE_DIAGRAM_FEATURES) + self.data_transformer = None + def __evaluate_persistence_params(self, ts_data): if self.feature_extractor is None: - te_dimension, te_time_delay = self.get_embedding_params_from_batch(ts_data=ts_data) persistence_diagram_extractor = PersistenceDiagramsExtractor(takens_embedding_dim=te_dimension, takens_embedding_delay=te_time_delay, - homology_dimensions=(0, 1), + homology_dimensions=(0, 1, 2), parallel=True) self.feature_extractor = TopologicalFeaturesExtractor( persistence_diagram_extractor=persistence_diagram_extractor, persistence_diagram_features=PERSISTENCE_DIAGRAM_FEATURES) - ts_data_transformed = self.feature_extractor.fit_transform(ts_data) + def fit(self, input_data: InputData) -> OutputData: + pass + + def _generate_features_from_ts(self, ts_data, persistence_params): + if self.data_transformer is None: + self.data_transformer = TopologicalTransformation( + persistence_params=persistence_params, + window_length=round(ts_data.shape[0] * 0.01 * self.window_size)) + + point_cloud = self.data_transformer.time_series_to_point_cloud(input_data=ts_data) + topological_features = self.feature_extractor.transform(point_cloud) + return topological_features + + def generate_topological_features(self, ts_data: np.array, + persistence_params: dict = None) -> pd.DataFrame: + + if persistence_params is not None: + self._evaluate_persistence_params(ts_data) + + if len(ts_data.shape) > 1: + topological_features = [self._generate_features_from_ts(component, persistence_params) + for component in ts_data] + for component_idx, feature_df in enumerate(topological_features): + feature_df.columns = [f'{col}_component_{component_idx}' for col in feature_df.columns] + return pd.concat(topological_features, axis=1) - if self.filtered_features is None: - self.filtered_features = ts_data_transformed.columns.tolist() - gc.collect() - return ts_data_transformed[self.filtered_features] + else: + return self._generate_features_from_ts(ts_data, persistence_params) def generate_features_from_ts(self, ts_data: pd.DataFrame, dataset_name: str = None): return self.generate_topological_features(ts_data=ts_data) @@ -104,4 +134,15 @@ def get_embedding_params_from_batch(self, ts_data: pd.DataFrame, method: str = ' @staticmethod def _mode(arr: list) -> int: - return int(stats.mode(arr)[0][0]) \ No newline at end of file + return int(stats.mode(arr)[0][0]) + + +# if __name__ == "__main__": +# train_data, test_data = DataLoader(dataset_name='Ham').load_data() +# with IndustrialModels(): +# pipeline = PipelineBuilder().add_node('data_driven_basis').add_node('topological_extractor').add_node( +# 'rf').build() +# input_data = init_input_data(train_data[0], train_data[1]) +# pipeline.fit(input_data) +# features = pipeline.predict(input_data) +# print(features) diff --git a/fedot_ind/core/models/topological/topological.py b/fedot_ind/core/models/topological/topofeatures.py similarity index 91% rename from fedot_ind/core/models/topological/topological.py rename to fedot_ind/core/models/topological/topofeatures.py index 544eb4574..e4f5ff931 100644 --- a/fedot_ind/core/models/topological/topological.py +++ b/fedot_ind/core/models/topological/topofeatures.py @@ -7,7 +7,6 @@ import pandas as pd from gtda.diagrams import BettiCurve, Filtering, PersistenceEntropy, PersistenceLandscape, Scaler from gtda.homology import VietorisRipsPersistence -from gtda.time_series import TakensEmbedding class PersistenceDiagramFeatureExtractor(ABC): @@ -19,7 +18,7 @@ def extract_feature_(self, persistence_diagram): pass def fit_transform(self, x_pd): - return np.array([self.extract_feature_(diagram) for diagram in x_pd]) + return self.extract_feature_(x_pd) class PersistenceDiagramsExtractor: @@ -49,11 +48,6 @@ def __init__(self, takens_embedding_dim: int, self.parallel_ = parallel self.n_job = None - def takens_embeddings_(self, data): - te = TakensEmbedding(dimension=self.takens_embedding_dim_, - time_delay=self.takens_embedding_delay_) - return te.fit_transform(data) - def persistence_diagrams_(self, x_embeddings): if self.parallel_: pool = ThreadPool() @@ -62,10 +56,7 @@ def persistence_diagrams_(self, x_embeddings): pool.join() return x_transformed else: - x_transformed = list() - for embedding in x_embeddings: - x_transformed.append(self.parallel_embed_(embedding)) - return x_transformed + return self.parallel_embed_(x_embeddings) def parallel_embed_(self, embedding): vr = VietorisRipsPersistence(metric='euclidean', homology_dimensions=self.homology_dimensions_, @@ -77,8 +68,7 @@ def parallel_embed_(self, embedding): persistence_diagrams = diagram_filter.fit_transform(persistence_diagrams) return persistence_diagrams[0] - def fit_transform(self, x): - x_embeddings = self.takens_embeddings_(x) + def transform(self, x_embeddings): x_persistence_diagrams = self.persistence_diagrams_(x_embeddings) return x_persistence_diagrams @@ -88,20 +78,21 @@ def __init__(self, persistence_diagram_extractor, persistence_diagram_features): self.persistence_diagram_extractor_ = persistence_diagram_extractor self.persistence_diagram_features_ = persistence_diagram_features - def fit_transform(self, x): + def transform(self, x): - x_pers_diag = self.persistence_diagram_extractor_.fit_transform(x) - tmp = [] + x_pers_diag = self.persistence_diagram_extractor_.transform(x) + feature_list = [] column_list = [] for feature_name, feature_model in self.persistence_diagram_features_.items(): try: x_features = feature_model.fit_transform(x_pers_diag) - tmp.append(x_features) - for dim in range(len(x_features.shape)): + feature_list.append(x_features) + for dim in range(len(x_features)): column_list.append('{}_{}'.format(feature_name, dim)) except Exception: continue - x_transformed = pd.DataFrame(data=np.hstack(tmp), columns=column_list) + x_transformed = pd.DataFrame(data=np.hstack(feature_list)).T + x_transformed.columns = column_list return x_transformed diff --git a/fedot_ind/core/operation/transformation/data/kernel_matrix.py b/fedot_ind/core/operation/transformation/data/kernel_matrix.py new file mode 100644 index 000000000..bf54da52f --- /dev/null +++ b/fedot_ind/core/operation/transformation/data/kernel_matrix.py @@ -0,0 +1,53 @@ +import numpy as np +from scipy.spatial.distance import pdist, squareform + + +class TSTransformer: + def __init__(self, time_series, min_signal_ratio, max_signal_ratio, rec_metric): + self.time_series = time_series + self.recurrence_matrix = None + self.threshold_baseline = [0.95, 0.7] # cosine + self.min_signal_ratio = min_signal_ratio + self.max_signal_ratio = max_signal_ratio + self.rec_metric = rec_metric + + def ts_to_recurrence_matrix(self, + threshold=None): + distance_matrix = pdist(metric=self.rec_metric, X=self.time_series.T) + distance_matrix = np.ones(shape=distance_matrix.shape[0]) - distance_matrix + distance_matrix = self.binarization(distance_matrix, threshold=threshold) + self.recurrence_matrix = squareform(distance_matrix) + return self.recurrence_matrix + + def binarization(self, distance_matrix, threshold): + best_threshold_flag = False + signal_ratio_list = [] + reccurence_matrix = None + if threshold is None: + for threshold_baseline in self.threshold_baseline: + threshold = threshold_baseline + tmp_array = np.copy(distance_matrix) + tmp_array[tmp_array < threshold_baseline] = 0.0 + tmp_array[tmp_array >= threshold_baseline] = 1.0 + signal_ratio = np.where(tmp_array == 0)[0].shape[0] / tmp_array.shape[0] + + if self.min_signal_ratio < signal_ratio < self.max_signal_ratio: + best_ratio = signal_ratio + reccurence_matrix = tmp_array + best_threshold_flag = True + if signal_ratio > best_ratio: + reccurence_matrix = tmp_array + else: + signal_ratio_list.append(abs(self.max_signal_ratio - signal_ratio)) + + del tmp_array + + if not best_threshold_flag: + distance_matrix[distance_matrix < self.threshold_baseline[0]] = 0.0 + distance_matrix[distance_matrix >= self.threshold_baseline[0]] = 1.0 + reccurence_matrix = distance_matrix + return reccurence_matrix + + def get_recurrence_metrics(self): + if self.recurrence_matrix is None: + return self.ts_to_recurrence_matrix() diff --git a/fedot_ind/core/operation/transformation/DataTransformer.py b/fedot_ind/core/operation/transformation/data/point_cloud.py similarity index 64% rename from fedot_ind/core/operation/transformation/DataTransformer.py rename to fedot_ind/core/operation/transformation/data/point_cloud.py index 089fc010f..ebd8c97cd 100644 --- a/fedot_ind/core/operation/transformation/DataTransformer.py +++ b/fedot_ind/core/operation/transformation/data/point_cloud.py @@ -1,10 +1,10 @@ from typing import Union - import numpy as np import pandas as pd from ripser import Rips, ripser from scipy import sparse -from scipy.spatial.distance import pdist, squareform +from fedot_ind.core.operation.transformation.data.hankel import HankelMatrix +from fedot.core.data.data import InputData, OutputData class TopologicalTransformation: @@ -26,7 +26,7 @@ class TopologicalTransformation: def __init__(self, time_series: Union[pd.Series, np.ndarray, list] = None, max_simplex_dim: int = None, - epsilon: int = None, + epsilon: int = 10, persistence_params: dict = None, window_length: int = None): self.time_series = time_series @@ -73,29 +73,13 @@ def __compute_persistence_landscapes(ts): ys = np.unique(dgm0[:, 1]) ys = ys[ys < np.inf] - @staticmethod - def rolling_window(array: np.array, window: int) -> np.array: - """Takes in an array and return array of rolling windows of specified length. - - Args: - array: Array to be rolled. - window: Length of the window. - - Returns: - Array of rolling windows. - - """ - shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) - strides = array.strides + (array.strides[-1],) - a_windowed = np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) - return a_windowed - - def time_series_to_point_cloud(self, array: np.array = None, + def time_series_to_point_cloud(self, + input_data: np.array = None, dimension_embed=2) -> np.array: """Convert a time series into a point cloud in the dimension specified by dimension_embed. Args: - array: Time series to be converted. + input_data: Time series to be converted. dimension_embed: dimension of Euclidean space in which to embed the time series into by taking windows of dimension_embed length, e.g. if the time series is ``[t_1,...,t_n]`` and dimension_embed is ``2``, then the point cloud would be ``[(t_0, t_1), (t_1, t_2),...,(t_(n-1), t_n)]`` @@ -106,20 +90,15 @@ def time_series_to_point_cloud(self, array: np.array = None, """ - assert len(self.time_series) >= dimension_embed, 'dimension_embed larger than length of time_series' - if self.__window_length is None: self.__window_length = dimension_embed - # compute point cloud - if array is None: - array = self.time_series - - point_cloud = self.rolling_window(array=array, window=dimension_embed) - return np.array(point_cloud) + trajectory_transformer = HankelMatrix(time_series=input_data, window_size=self.__window_length) + return trajectory_transformer.trajectory_matrix - def point_cloud_to_persistent_cohomology_ripser(self, point_cloud: np.array = None, - max_simplex_dim: int = None): + def point_cloud_to_persistent_cohomology_ripser(self, + point_cloud: np.array = None, + max_simplex_dim: int = 1): # ensure epsilon_range is a numpy array epsilon_range = self.epsilon_range @@ -185,54 +164,3 @@ def time_series_rolling_betti_ripser(self, ts): df_features.columns = cols df_features['Betti_sum'] = df_features.sum(axis=1) return df_features - - -class TSTransformer: - def __init__(self, time_series, min_signal_ratio, max_signal_ratio, rec_metric): - self.time_series = time_series - self.recurrence_matrix = None - self.threshold_baseline = [0.95, 0.7] #cosine - self.min_signal_ratio = min_signal_ratio - self.max_signal_ratio = max_signal_ratio - self.rec_metric = rec_metric - - def ts_to_recurrence_matrix(self, - threshold=None): - distance_matrix = pdist(metric=self.rec_metric, X=self.time_series.T) - distance_matrix = np.ones(shape=distance_matrix.shape[0])-distance_matrix - distance_matrix = self.binarization(distance_matrix, threshold=threshold) - self.recurrence_matrix = squareform(distance_matrix) - return self.recurrence_matrix - - def binarization(self, distance_matrix, threshold): - best_threshold_flag = False - signal_ratio_list = [] - reccurence_matrix = None - if threshold is None: - for threshold_baseline in self.threshold_baseline: - threshold = threshold_baseline - tmp_array = np.copy(distance_matrix) - tmp_array[tmp_array < threshold_baseline] = 0.0 - tmp_array[tmp_array >= threshold_baseline] = 1.0 - signal_ratio = np.where(tmp_array == 0)[0].shape[0] / tmp_array.shape[0] - - if self.min_signal_ratio < signal_ratio < self.max_signal_ratio: - best_ratio = signal_ratio - reccurence_matrix = tmp_array - best_threshold_flag = True - if signal_ratio > best_ratio: - reccurence_matrix = tmp_array - else: - signal_ratio_list.append(abs(self.max_signal_ratio - signal_ratio)) - - del tmp_array - - if not best_threshold_flag: - distance_matrix[distance_matrix < self.threshold_baseline[0]] = 0.0 - distance_matrix[distance_matrix >= self.threshold_baseline[0]] = 1.0 - reccurence_matrix = distance_matrix - return reccurence_matrix - - def get_recurrence_metrics(self): - if self.recurrence_matrix is None: - return self.ts_to_recurrence_matrix()