topological extractor refactoring

v1docq committed Sep 14, 2023
1 parent e63ba49 commit 7e5240e
Showing 6 changed files with 137 additions and 125 deletions.
11 changes: 5 additions & 6 deletions fedot_ind/core/models/BaseExtractor.py
@@ -3,6 +3,8 @@
from multiprocessing import cpu_count, Pool
from typing import Optional

import numpy as np
import pandas as pd
from fedot.core.data.data import InputData
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.dataset_types import DataTypesEnum
@@ -38,12 +40,9 @@ def _transform(self, input_data: InputData) -> np.array:
"""
Method for feature generation for all series
"""
if type(input_data) == InputData:
features = input_data.features
n_samples = input_data.features.shape[0]
else:
features = input_data
n_samples = input_data.shape[0]
features = input_data.features
n_samples = input_data.features.shape[0]

try:
input_data_squeezed = np.squeeze(features, 3)
except Exception:
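As a point of reference, a minimal standalone sketch (not part of the commit) of the shape handling the simplified _transform now assumes: features are always read from input_data.features, and a singleton fourth axis is squeezed away when present. The batch shape below is hypothetical.

import numpy as np

# Hypothetical batch: 32 samples, 1 channel, series length 100, singleton trailing axis.
features = np.random.rand(32, 1, 100, 1)

try:
    squeezed = np.squeeze(features, 3)  # drops axis 3 -> shape (32, 1, 100)
except ValueError:
    # axis 3 missing or not of size 1; this fallback is illustrative only
    squeezed = features

print(squeezed.shape)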
2 changes: 1 addition & 1 deletion fedot_ind/core/models/recurrence/RecurrenceExtractor.py
@@ -7,7 +7,7 @@
from fedot_ind.core.operation.transformation.data.hankel import HankelMatrix
from fedot_ind.core.metrics.metrics_implementation import *
from fedot_ind.core.models.WindowedFeaturesExtractor import WindowedFeatureExtractor
from fedot_ind.core.operation.transformation.DataTransformer import TSTransformer
from fedot_ind.core.operation.transformation.data.kernel_matrix import TSTransformer
from fedot_ind.core.models.recurrence.sequences import ReccurenceFeaturesExtractor


73 changes: 57 additions & 16 deletions fedot_ind/core/models/topological/TopologicalExtractor.py
@@ -6,15 +6,20 @@
import pandas as pd
from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.pipelines.pipeline_builder import PipelineBuilder
from gtda.time_series import takens_embedding_optimal_parameters
from scipy import stats
from tqdm import tqdm

from examples.fedot.fedot_ex import init_input_data
from fedot_ind.core.architecture.preprocessing.DatasetLoader import DataLoader
from fedot_ind.core.models.BaseExtractor import BaseExtractor
from fedot_ind.core.models.topological.topological import AverageHoleLifetimeFeature, \
from fedot_ind.core.models.topological.topofeatures import AverageHoleLifetimeFeature, \
AveragePersistenceLandscapeFeature, BettiNumbersSumFeature, HolesNumberFeature, MaxHoleLifeTimeFeature, \
PersistenceDiagramsExtractor, PersistenceEntropyFeature, RadiusAtMaxBNFeature, RelevantHolesNumber, \
SimultaneousAliveHolesFeature, SumHoleLifetimeFeature, TopologicalFeaturesExtractor
from fedot_ind.core.operation.transformation.data.point_cloud import TopologicalTransformation
from fedot_ind.core.repository.initializer_industrial_models import IndustrialModels

sys.setrecursionlimit(1000000000)

@@ -29,6 +34,11 @@
'BettiNumbersSumFeature': BettiNumbersSumFeature(),
'RadiusAtMaxBNFeature': RadiusAtMaxBNFeature()}

PERSISTENCE_DIAGRAM_EXTRACTOR = PersistenceDiagramsExtractor(takens_embedding_dim=1,
takens_embedding_delay=2,
homology_dimensions=(0, 1),
parallel=False)


class TopologicalExtractor(BaseExtractor):
"""Class for extracting topological features from time series data.
@@ -39,33 +49,53 @@ class TopologicalExtractor(BaseExtractor):

def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.filtered_features = None
self.feature_extractor = None

def fit(self, input_data: InputData) -> OutputData:
pass

def generate_topological_features(self, ts_data: pd.DataFrame) -> pd.DataFrame:
self.window_size = params.get('window_size', 10)
self.feature_extractor = TopologicalFeaturesExtractor(
persistence_diagram_extractor=PERSISTENCE_DIAGRAM_EXTRACTOR,
persistence_diagram_features=PERSISTENCE_DIAGRAM_FEATURES)
self.data_transformer = None

def __evaluate_persistence_params(self, ts_data):
if self.feature_extractor is None:

te_dimension, te_time_delay = self.get_embedding_params_from_batch(ts_data=ts_data)

persistence_diagram_extractor = PersistenceDiagramsExtractor(takens_embedding_dim=te_dimension,
takens_embedding_delay=te_time_delay,
homology_dimensions=(0, 1),
homology_dimensions=(0, 1, 2),
parallel=True)

self.feature_extractor = TopologicalFeaturesExtractor(
persistence_diagram_extractor=persistence_diagram_extractor,
persistence_diagram_features=PERSISTENCE_DIAGRAM_FEATURES)

ts_data_transformed = self.feature_extractor.fit_transform(ts_data)
def fit(self, input_data: InputData) -> OutputData:
pass

def _generate_features_from_ts(self, ts_data, persistence_params):
if self.data_transformer is None:
self.data_transformer = TopologicalTransformation(
persistence_params=persistence_params,
window_length=round(ts_data.shape[0] * 0.01 * self.window_size))

point_cloud = self.data_transformer.time_series_to_point_cloud(input_data=ts_data)
topological_features = self.feature_extractor.transform(point_cloud)
return topological_features

def generate_topological_features(self, ts_data: np.array,
persistence_params: dict = None) -> pd.DataFrame:

if persistence_params is not None:
self._evaluate_persistence_params(ts_data)

if len(ts_data.shape) > 1:
topological_features = [self._generate_features_from_ts(component, persistence_params)
for component in ts_data]
for component_idx, feature_df in enumerate(topological_features):
feature_df.columns = [f'{col}_component_{component_idx}' for col in feature_df.columns]
return pd.concat(topological_features, axis=1)

if self.filtered_features is None:
self.filtered_features = ts_data_transformed.columns.tolist()
gc.collect()
return ts_data_transformed[self.filtered_features]
else:
return self._generate_features_from_ts(ts_data, persistence_params)

def generate_features_from_ts(self, ts_data: pd.DataFrame, dataset_name: str = None):
return self.generate_topological_features(ts_data=ts_data)
@@ -104,4 +134,15 @@ def get_embedding_params_from_batch(self, ts_data: pd.DataFrame, method: str = '

@staticmethod
def _mode(arr: list) -> int:
return int(stats.mode(arr)[0][0])
return int(stats.mode(arr)[0][0])


# if __name__ == "__main__":
# train_data, test_data = DataLoader(dataset_name='Ham').load_data()
# with IndustrialModels():
# pipeline = PipelineBuilder().add_node('data_driven_basis').add_node('topological_extractor').add_node(
# 'rf').build()
# input_data = init_input_data(train_data[0], train_data[1])
# pipeline.fit(input_data)
# features = pipeline.predict(input_data)
# print(features)
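For orientation, a self-contained sketch (placeholder feature values, not the real persistence features) of how the new multivariate branch of generate_topological_features suffixes per-component columns and concatenates the frames:

import numpy as np
import pandas as pd

# Stand-in for self._generate_features_from_ts: returns one row of fake features for a
# single 1-D component. In the commit, the real values come from TopologicalFeaturesExtractor.
def fake_features_from_ts(component: np.ndarray) -> pd.DataFrame:
    return pd.DataFrame([{'HolesNumberFeature_0': float(component.mean()),
                          'MaxHoleLifeTimeFeature_0': float(component.std())}])

ts_data = np.random.rand(3, 100)  # hypothetical multivariate series: 3 components x 100 points

frames = [fake_features_from_ts(component) for component in ts_data]
for component_idx, feature_df in enumerate(frames):
    feature_df.columns = [f'{col}_component_{component_idx}' for col in feature_df.columns]
result = pd.concat(frames, axis=1)
print(result.columns.tolist())  # e.g. ['HolesNumberFeature_0_component_0', ...]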
fedot_ind/core/models/topological/topofeatures.py
@@ -7,7 +7,6 @@
import pandas as pd
from gtda.diagrams import BettiCurve, Filtering, PersistenceEntropy, PersistenceLandscape, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.time_series import TakensEmbedding


class PersistenceDiagramFeatureExtractor(ABC):
@@ -19,7 +18,7 @@ def extract_feature_(self, persistence_diagram):
pass

def fit_transform(self, x_pd):
return np.array([self.extract_feature_(diagram) for diagram in x_pd])
return self.extract_feature_(x_pd)


class PersistenceDiagramsExtractor:
@@ -49,11 +48,6 @@ def __init__(self, takens_embedding_dim: int,
self.parallel_ = parallel
self.n_job = None

def takens_embeddings_(self, data):
te = TakensEmbedding(dimension=self.takens_embedding_dim_,
time_delay=self.takens_embedding_delay_)
return te.fit_transform(data)

def persistence_diagrams_(self, x_embeddings):
if self.parallel_:
pool = ThreadPool()
@@ -62,10 +56,7 @@ def persistence_diagrams_(self, x_embeddings):
pool.join()
return x_transformed
else:
x_transformed = list()
for embedding in x_embeddings:
x_transformed.append(self.parallel_embed_(embedding))
return x_transformed
return self.parallel_embed_(x_embeddings)

def parallel_embed_(self, embedding):
vr = VietorisRipsPersistence(metric='euclidean', homology_dimensions=self.homology_dimensions_,
@@ -77,8 +68,7 @@ def parallel_embed_(self, embedding):
persistence_diagrams = diagram_filter.fit_transform(persistence_diagrams)
return persistence_diagrams[0]

def fit_transform(self, x):
x_embeddings = self.takens_embeddings_(x)
def transform(self, x_embeddings):
x_persistence_diagrams = self.persistence_diagrams_(x_embeddings)
return x_persistence_diagrams

@@ -88,20 +78,21 @@ def __init__(self, persistence_diagram_extractor, persistence_diagram_features):
self.persistence_diagram_extractor_ = persistence_diagram_extractor
self.persistence_diagram_features_ = persistence_diagram_features

def fit_transform(self, x):
def transform(self, x):

x_pers_diag = self.persistence_diagram_extractor_.fit_transform(x)
tmp = []
x_pers_diag = self.persistence_diagram_extractor_.transform(x)
feature_list = []
column_list = []
for feature_name, feature_model in self.persistence_diagram_features_.items():
try:
x_features = feature_model.fit_transform(x_pers_diag)
tmp.append(x_features)
for dim in range(len(x_features.shape)):
feature_list.append(x_features)
for dim in range(len(x_features)):
column_list.append('{}_{}'.format(feature_name, dim))
except Exception:
continue
x_transformed = pd.DataFrame(data=np.hstack(tmp), columns=column_list)
x_transformed = pd.DataFrame(data=np.hstack(feature_list)).T
x_transformed.columns = column_list
return x_transformed


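For reference, a minimal sketch of how the reworked TopologicalFeaturesExtractor.transform assembles its one-row feature frame; the per-feature arrays below are placeholders standing in for the outputs of the real feature models:

import numpy as np
import pandas as pd

# Hypothetical per-feature outputs; in the extractor, each feature model returns an
# array-like over homology dimensions for one persistence diagram.
feature_outputs = {'PersistenceEntropyFeature': np.array([0.81, 0.42]),
                   'HolesNumberFeature': np.array([5.0, 2.0])}

feature_list, column_list = [], []
for feature_name, x_features in feature_outputs.items():
    feature_list.append(x_features)
    for dim in range(len(x_features)):
        column_list.append('{}_{}'.format(feature_name, dim))

x_transformed = pd.DataFrame(data=np.hstack(feature_list)).T  # single row of stacked features
x_transformed.columns = column_list
print(x_transformed)  # columns: PersistenceEntropyFeature_0, PersistenceEntropyFeature_1, ...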
53 changes: 53 additions & 0 deletions fedot_ind/core/operation/transformation/data/kernel_matrix.py
@@ -0,0 +1,53 @@
import numpy as np
from scipy.spatial.distance import pdist, squareform


class TSTransformer:
def __init__(self, time_series, min_signal_ratio, max_signal_ratio, rec_metric):
self.time_series = time_series
self.recurrence_matrix = None
self.threshold_baseline = [0.95, 0.7] # cosine
self.min_signal_ratio = min_signal_ratio
self.max_signal_ratio = max_signal_ratio
self.rec_metric = rec_metric

def ts_to_recurrence_matrix(self,
threshold=None):
distance_matrix = pdist(metric=self.rec_metric, X=self.time_series.T)
distance_matrix = np.ones(shape=distance_matrix.shape[0]) - distance_matrix
distance_matrix = self.binarization(distance_matrix, threshold=threshold)
self.recurrence_matrix = squareform(distance_matrix)
return self.recurrence_matrix

def binarization(self, distance_matrix, threshold):
best_threshold_flag = False
signal_ratio_list = []
reccurence_matrix = None
if threshold is None:
for threshold_baseline in self.threshold_baseline:
threshold = threshold_baseline
tmp_array = np.copy(distance_matrix)
tmp_array[tmp_array < threshold_baseline] = 0.0
tmp_array[tmp_array >= threshold_baseline] = 1.0
signal_ratio = np.where(tmp_array == 0)[0].shape[0] / tmp_array.shape[0]

if self.min_signal_ratio < signal_ratio < self.max_signal_ratio:
best_ratio = signal_ratio
reccurence_matrix = tmp_array
best_threshold_flag = True
if signal_ratio > best_ratio:
reccurence_matrix = tmp_array
else:
signal_ratio_list.append(abs(self.max_signal_ratio - signal_ratio))

del tmp_array

if not best_threshold_flag:
distance_matrix[distance_matrix < self.threshold_baseline[0]] = 0.0
distance_matrix[distance_matrix >= self.threshold_baseline[0]] = 1.0
reccurence_matrix = distance_matrix
return reccurence_matrix

def get_recurrence_metrics(self):
if self.recurrence_matrix is None:
return self.ts_to_recurrence_matrix()
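A hypothetical usage sketch for the new TSTransformer (assumes fedot_ind with this commit is installed; the input shape and ratio bounds are illustrative guesses, since pdist compares the columns of time_series after the transpose):

import numpy as np
from fedot_ind.core.operation.transformation.data.kernel_matrix import TSTransformer

ts = np.random.rand(2, 150)  # hypothetical embedded series: each column becomes one observation
transformer = TSTransformer(time_series=ts,
                            min_signal_ratio=0.3,  # illustrative bounds on the share of zeros
                            max_signal_ratio=0.7,
                            rec_metric='cosine')   # matches the '# cosine' note on threshold_baseline
recurrence_matrix = transformer.ts_to_recurrence_matrix()
print(recurrence_matrix.shape)  # (150, 150)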