Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add window size selector #1237

Merged
merged 23 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion fedot/api/api_utils/api_composer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import gc
from copy import deepcopy
from typing import List, Optional, Sequence, Tuple, Union

from golem.core.log import default_log
Expand Down Expand Up @@ -97,7 +98,7 @@ def propose_and_fit_initial_assumption(self, train_data: InputData) -> Tuple[Seq

with self.timer.launch_assumption_fit():
fitted_assumption = \
assumption_handler.fit_assumption_and_check_correctness(initial_assumption[0],
assumption_handler.fit_assumption_and_check_correctness(deepcopy(initial_assumption[0]),
pipelines_cache=self.pipelines_cache,
preprocessing_cache=self.preprocessing_cache,
eval_n_jobs=self.params.n_jobs)
Expand Down
4 changes: 3 additions & 1 deletion fedot/core/composer/gp_composer/specific_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ def parameter_change_mutation(pipeline: Pipeline, requirements, graph_gen_params
node_mutation_probability = get_mutation_prob(mut_id=parameters.mutation_strength,
node=pipeline.root_node)
for node in pipeline.nodes:
if random() < node_mutation_probability:
lagged = node.operation.metadata.id in ('lagged', 'sparse_lagged', 'exog_ts')
do_mutation = random() < (node_mutation_probability * (0.5 if lagged else 1))
if do_mutation:
operation_name = node.operation.operation_type
current_params = node.parameters

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from copy import copy, deepcopy
from random import random
from typing import Optional, Union

import numpy as np
import pandas as pd

from fedot.utilities.window_size_selector import WindowSizeSelector, WindowSizeSelectorMethodsEnum
from golem.core.log import default_log
from scipy.ndimage import gaussian_filter
from sklearn.decomposition import TruncatedSVD
Expand Down Expand Up @@ -103,9 +106,9 @@ def transform_for_fit(self, input_data: InputData) -> OutputData:
self._update_column_types(output_data)
return output_data

def _check_and_correct_window_size(self, time_series: np.array, forecast_length: int):
def _check_and_correct_window_size(self, time_series: np.ndarray, forecast_length: int):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я про то, что этот метод вызывается в transform и transform_for_fit

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Окно подстраивается только если оно по дефолту не задано. Если задать его руками или изменить в процессе композиции/тюнинга, то оно не будет переопределяться.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

То есть сейчас это работает так:

  1. Дефолтное значение длины окна 0.
  2. Пусть у нас 3 фолда валидации
  3. Заходим в первый фолд, подбираем длину окна
  4. На следующих фолдах и при финальном обучении на всем ряду размер окна не подбираем?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Добавил тесты что при вычислении графа на фолдах и внутри тюнера окно не перенастраивается.

""" Method check if the length of the time series is not enough for
lagged transformation - clip it
lagged transformation

Args:
time_series: time series for transformation
Expand All @@ -114,10 +117,24 @@ def _check_and_correct_window_size(self, time_series: np.array, forecast_length:
Returns:

"""
max_allowed_window_size = max(1, len(time_series) - forecast_length - 1)

if self.window_size == 0:
selector = WindowSizeSelector(method=WindowSizeSelectorMethodsEnum.HAC, window_range=(5, 60))
new = int(selector.apply(time_series) * time_series.shape[0] * 0.01)
new = min(max_allowed_window_size, new)
self.log.message((f"Window size of lagged transformation was changed "
f"by WindowSizeSelector from {self.params.get('window_size')} to {new}"))
self.params.update(window_size=new)

# Maximum threshold
if self.window_size + forecast_length > len(time_series):
raise ValueError(f"Window size is to high ({self.window_size}) for provided data len {len(time_series)}")
if self.window_size > max_allowed_window_size:
new = int(random() * max_allowed_window_size)
new = min(new, max_allowed_window_size)
new = max(new, self.window_size_minimum)
self.log.info(("Window size of lagged transformation was changed "
f"from {self.params.get('window_size')} to {new}"))
self.params.update(window_size=new)

# Minimum threshold
if self.window_size < self.window_size_minimum:
Expand Down
2 changes: 1 addition & 1 deletion fedot/core/repository/data/default_operation_params.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"verbose": -1
},
"lagged": {
"window_size": 10
"window_size": 0
},
"diff_filter": {
"window_size": 3,
Expand Down
250 changes: 250 additions & 0 deletions fedot/utilities/window_size_selector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
import math
from enum import Enum, auto

import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from statsmodels.tsa.stattools import acf


class WindowSizeSelectorMethodsEnum(Enum):
    """Available algorithms for :class:`WindowSizeSelector`.

    DFF - dominant fourier frequency (whole-series based).
    HAC - highest autocorrelation (whole-series based).
    MWF - multi window finder (subsequence based).
    SSS - summary statistics subsequence (subsequence based).
    """

    DFF = 1
    HAC = 2
    MWF = 3
    SSS = 4


class WindowSizeSelector:
    """Class to select appropriate window size to catch periodicity for time series analysis.

    There are two groups of algorithms implemented:

    Whole-Series-Based (WSB):
        1. WindowSizeSelectorMethodsEnum.HAC - highest autocorrelation
        2. WindowSizeSelectorMethodsEnum.DFF - dominant fourier frequency

    Subsequence-based (SB):
        1. WindowSizeSelectorMethodsEnum.MWF - multi window finder
        2. WindowSizeSelectorMethodsEnum.SSS - summary statistics subsequence

    Args:
        method: by ``default``, it is WindowSizeSelectorMethodsEnum.DFF.
        window_range: % of time series length, by ``default`` it is (5, 50).

    Attributes:
        length_ts(int): length of the time_series.
        window_max(int): maximum window size in real values.
        window_min(int): minimum window size in real values.
        dict_methods(dict): dictionary with all implemented methods.

    Example:
        To find window size for a single time series::

            ts = np.random.rand(1000)
            ws_selector = WindowSizeSelector(method=WindowSizeSelectorMethodsEnum.HAC)
            window_size = ws_selector.get_window_size(time_series=ts)

        To find window size for multiple time series::

            ts = np.random.rand(1000, 10)
            ws_selector = WindowSizeSelector(method=WindowSizeSelectorMethodsEnum.HAC)
            window_size = ws_selector.apply(time_series=ts, average='median')

    Reference:
        (c) "Window Size Selection in Unsupervised Time Series Analytics: A Review and Benchmark. Arik Ermshaus,
        Patrick Schafer, and Ulf Leser. 2022"
    """

    def __init__(self,
                 method: WindowSizeSelectorMethodsEnum = WindowSizeSelectorMethodsEnum.DFF,
                 window_range: tuple = (5, 50)):

        if window_range[0] >= window_range[1]:
            raise ValueError('Upper bound of window range should be bigger than lower bound')
        if window_range[0] < 0:
            raise ValueError('Lower bound of window range should be bigger or equal to 0')
        if window_range[1] > 100:
            raise ValueError('Upper bound of window range should be lower or equal to 100')

        self.dict_methods = {WindowSizeSelectorMethodsEnum.HAC: self.autocorrelation,
                             WindowSizeSelectorMethodsEnum.DFF: self.dominant_fourier_frequency,
                             WindowSizeSelectorMethodsEnum.MWF: self.mwf,
                             WindowSizeSelectorMethodsEnum.SSS: self.summary_statistics_subsequence}
        self.wss_algorithm = method
        self.window_range = window_range
        self.window_max = None
        self.window_min = None
        self.length_ts = None

    def apply(self, time_series: np.ndarray, average: str = 'median') -> int:
        """Average the window size estimate over one or several time series.

        Args:
            time_series: 1d time series, or 2d array with one series per column
            average: 'mean' or 'median' to average window size over all time series

        Returns:
            window_size_selected: value which has been chosen as appropriate window size
        """
        methods = {'mean': np.mean, 'median': np.median}
        if time_series.ndim == 1:
            time_series = time_series.reshape((-1, 1))
        window_list = [self.get_window_size(time_series[:, i].ravel()) for i in range(time_series.shape[1])]
        return round(methods[average](window_list))

    def get_window_size(self, time_series: np.ndarray) -> int:
        """Main function to run WSS class over selected time series.

        Args:
            time_series: 1d time series to study

        Note:
            One of the reasons of ValueError is that time series size can be equal or smaller than 50.
            In that case try to initially set window_size min and max.

        Returns:
            window_size_selected: window size as % of the series length, clipped to ``window_range``
        """
        self.length_ts = len(time_series)

        self.window_max = int(round(self.length_ts * self.window_range[1] / 100))  # in real values
        self.window_min = int(round(self.length_ts * self.window_range[0] / 100))  # in real values

        window_size_selected = self.dict_methods[self.wss_algorithm](time_series=time_series)
        # Convert the selected size (real values) back to % of the series length
        # and clip it to the configured range.
        window_size_selected = round(window_size_selected * 100 / self.length_ts)
        window_size_selected = max(self.window_range[0], window_size_selected)
        window_size_selected = min(self.window_range[1], window_size_selected)
        return window_size_selected

    def dominant_fourier_frequency(self, time_series: np.ndarray) -> int:
        """
        Method to find dominant fourier frequency in time series and return appropriate window size. It is based on
        the assumption that the dominant frequency is the one with the highest magnitude in the Fourier transform. The
        window size is then the inverse of the dominant frequency.
        """
        fourier = np.fft.fft(time_series)
        frequencies = np.fft.fftfreq(time_series.shape[0], 1)

        magnitudes, window_sizes = [], []

        for coef, frequency in zip(fourier, frequencies):
            # Only positive frequencies with non-zero coefficients map to real window sizes.
            if coef and frequency > 0:
                window_size = int(1 / frequency)
                mag = math.sqrt(coef.real * coef.real + coef.imag * coef.imag)

                if self.window_min <= window_size < self.window_max:
                    window_sizes.append(window_size)
                    magnitudes.append(mag)
        if window_sizes and magnitudes:
            return window_sizes[np.argmax(magnitudes)]
        else:
            # No admissible frequency found in the allowed window range.
            return self.window_min

    def autocorrelation(self, time_series: np.ndarray) -> int:
        """Method to find the highest autocorrelation in time series and return appropriate window size. It is based on
        the assumption that the lag of highest autocorrelation coefficient corresponds to the window size that best
        captures the periodicity of the time series.
        """
        ts_len = time_series.shape[0]
        acf_values = acf(time_series, fft=True, nlags=int(ts_len / 2))

        peaks, _ = find_peaks(acf_values)
        peaks = peaks[np.logical_and(peaks >= self.window_min, peaks < self.window_max)]
        corrs = acf_values[peaks]

        if peaks.shape[0] == 0:  # if there is no peaks in range (window_min, window_max) return window_min
            return self.window_min
        return peaks[np.argmax(corrs)]

    def mwf(self, time_series: np.ndarray) -> int:
        """ Method to find the window size that minimizes the moving average residual. It is based on the assumption
        that the window size that best captures the periodicity of the time series is the one that minimizes the
        difference between the moving average and the time series.
        """
        window_sizes = list(range(self.window_min, self.window_max))
        if not window_sizes:
            # Degenerate range (possible for very short series): nothing to search over.
            return self.window_min

        all_averages = [self.movmean(time_series, w) for w in window_sizes]

        moving_avg_residuals = []
        for i in range(len(window_sizes)):
            # Truncate every average to the shortest one so they are comparable.
            moving_avg = all_averages[i][:len(all_averages[-1])]
            moving_avg_residuals.append(np.log(abs(moving_avg - moving_avg.mean()).sum()))

        b = (np.diff(np.sign(np.diff(moving_avg_residuals))) > 0).nonzero()[0] + 1  # local min

        if len(b) == 0:
            return self.window_min
        if len(b) < 3:
            return window_sizes[b[0]]

        # Average the first three local minima, each scaled by its harmonic index.
        reswin = np.array([window_sizes[b[i]] / (i + 1) for i in range(3)])
        return int(np.mean(reswin))

    def movmean(self, ts: np.ndarray, w: int) -> np.ndarray:
        """Fast moving average via cumulative sums.

        Args:
            ts: time series
            w: window length

        Returns:
            moving average of ``ts`` with window ``w`` (length ``len(ts) - w + 1``)
        """
        moving_avg = np.cumsum(ts, dtype=float)
        moving_avg[w:] = moving_avg[w:] - moving_avg[:-w]
        return moving_avg[w - 1:] / w

    def summary_statistics_subsequence(self, time_series: np.ndarray, threshold=.89) -> int:
        """Method to find the window size that maximizes the subsequence unsupervised similarity score (SUSS). It is
        based on the assumption that the window size that best captures the periodicity of the time series is the one
        that maximizes the similarity between subsequences of the time series.
        """
        ts_range = time_series.max() - time_series.min()
        if ts_range == 0:
            # Constant series has no periodicity; avoid division by zero in normalization.
            return self.window_min
        time_series = (time_series - time_series.min()) / ts_range

        ts_mean = np.mean(time_series)
        ts_std = np.std(time_series)
        ts_min_max = np.max(time_series) - np.min(time_series)

        stats = (ts_mean, ts_std, ts_min_max)

        max_score = self.suss_score(time_series=time_series, window_size=1, stats=stats)
        min_score = self.suss_score(time_series=time_series, window_size=time_series.shape[0] - 1, stats=stats)
        score_span = max_score - min_score
        if score_span == 0:
            # All window sizes score identically; no basis for a search.
            return self.window_min

        exp = 0

        # exponential search (to find window size interval)
        while True:
            window_size = 2 ** exp

            if window_size > self.window_max:
                break

            if window_size < self.window_min:
                exp += 1
                continue

            score = 1 - (self.suss_score(time_series, window_size, stats) - min_score) / score_span

            if score > threshold:
                break

            exp += 1

        lbound, ubound = max(self.window_min, 2 ** (exp - 1)), 2 ** exp + 1

        # binary search (to find window size in interval)
        while lbound <= ubound:
            window_size = int((lbound + ubound) / 2)
            score = 1 - (self.suss_score(time_series, window_size, stats) - min_score) / score_span

            if score < threshold:
                lbound = window_size + 1
            elif score > threshold:
                ubound = window_size - 1
            else:
                break

        return 2 * lbound

    def suss_score(self, time_series: np.ndarray, window_size: int, stats: tuple) -> float:
        """Distance between global summary statistics and rolling-window statistics.

        Args:
            time_series: normalized time series
            window_size: rolling window length
            stats: tuple of (mean, std, min-max range) of the whole series

        Returns:
            mean normalized euclidean distance between window stats and global stats
        """
        roll = pd.Series(time_series).rolling(window_size)
        ts_mean, ts_std, ts_min_max = stats

        roll_mean = roll.mean().to_numpy()[window_size:]
        roll_std = roll.std(ddof=0).to_numpy()[window_size:]
        roll_min = roll.min().to_numpy()[window_size:]
        roll_max = roll.max().to_numpy()[window_size:]

        X = np.array([
            roll_mean - ts_mean,
            roll_std - ts_std,
            (roll_max - roll_min) - ts_min_max
        ])

        # Normalize the distance by sqrt(window_size) so scores are comparable across sizes.
        X = np.sqrt(np.sum(np.square(X), axis=0)) / np.sqrt(window_size)

        return np.mean(X)
5 changes: 2 additions & 3 deletions test/unit/data_operations/test_data_operation_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np
import pandas as pd
import pytest

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
Expand Down Expand Up @@ -54,8 +53,8 @@ def test_lagged_with_invalid_params_fit_correctly():
pipeline = get_ts_pipeline(window_size)

# Fit it
with pytest.raises(ValueError):
pipeline.fit(ts_input)
pipeline.fit(ts_input)
assert 1 <= pipeline.nodes[-1].parameters['window_size'] <= len(time_series) - len_forecast


def test_ransac_with_invalid_params_fit_correctly():
Expand Down
Loading
Loading