Skip to content

Commit

Permalink
add riemann extractor. Fix mutations. Update result analysis. Update …
Browse files Browse the repository at this point in the history
…repos
  • Loading branch information
v1docq committed Mar 12, 2024
1 parent 4b271c1 commit c38342b
Show file tree
Hide file tree
Showing 27 changed files with 1,005 additions and 346 deletions.
366 changes: 366 additions & 0 deletions benchmark/feature_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pywt, librosa

from fedot_ind.api.utils.path_lib import PROJECT_PATH

# Named (start, stop) bounds for windows over an EEG sample; the per-entry
# notes are the original author's.
# NOTE(review): presumably passed as `window_sizes` to
# EEGFeatures.format_eeg_data, which uses them as positional row bounds — confirm at the call site.
eeg_windows = {
'10s': (4000, 6000), # Middle 10s
'30s': (2000, 8000), # Middle 30s
'50s': (0, 10000) # Entire sample (50s)
}

# Named (left, right) offsets for windows over a spectrogram.
# NOTE(review): presumably passed as `window_sizes` to
# SpectrogramFeatures.format_spectrogram_data, which adds them to the middle
# of the 600-wide label window — confirm at the call site.
spec_windows = {
'10m': (-300, 300), # Entire sample
'5m': (-150, 150),
'1m': (-30, 30),
'10s': (-5, 5),
'20s': (-10, 10),
'30s': (-15, 15),
'pre': (-300, -10),
'post': (10, 300)

}

# Named (start, stop) bounds for windows over an EEG-built spectrogram.
# NOTE(review): presumably passed as `window_sizes` to
# EEGBuiltSpectrogramFeatures.format_custom_spectrogram (positional row bounds,
# negative values count from the end) — confirm at the call site.
eeg_built_spec_windows = {
'50s': (0, 256), # Entire sample
'10s': (100, -100), # 10s
'pre': (0, 100),
'post': (-100, 256)
}


# Wavelet name handed to denoise() inside spectrogram_from_eeg; a falsy value
# (such as None) skips the denoising step.
USE_WAVELET = None

# The four montage labels (the same list is repeated locally in
# ReadData.read_eeg_built_spectrogram_data).
NAMES = ['LL', 'LP', 'RP', 'RR']

# EEG column names, one list of five per spectrogram channel;
# spectrogram_from_eeg takes the difference of each consecutive pair in a list.
FEATS = [['Fp1', 'F7', 'T3', 'T5', 'O1'],
['Fp1', 'F3', 'C3', 'P3', 'O1'],
['Fp2', 'F8', 'T4', 'T6', 'O2'],
['Fp2', 'F4', 'C4', 'P4', 'O2']]


# DENOISE FUNCTION
def maddest(d, axis=None):
    '''
    Returns the mean absolute deviation of the values around their mean

    Params
    ----------
    d : array-like
        Values to summarise
    axis : int or None
        Axis to reduce over; None reduces over every element

    Returns
    ----------
    A scalar when axis is None, otherwise an array with that axis removed
    '''
    d = np.asarray(d)
    # keepdims keeps the reduced axis so the centre broadcasts against `d`
    # for every axis, not only for axis=0 / axis=None.
    center = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - center), axis=axis)


def denoise(x, wavelet='haar', level=1):
    '''
    Returns a wavelet-denoised reconstruction of a 1-D signal

    Params
    ----------
    x : array-like
        Signal to denoise
    wavelet : str
        Wavelet name passed to pywt
    level : int
        Selects the coefficient band (counted from the end of the
        decomposition) from which the noise scale is estimated
    '''
    bands = pywt.wavedec(x, wavelet, mode="per")

    # Noise scale from the chosen band, then the threshold
    # sigma * sqrt(2 * ln(len(x))).
    noise_scale = (1 / 0.6745) * maddest(bands[-level])
    cutoff = noise_scale * np.sqrt(2 * np.log(len(x)))

    # The first band is kept untouched; every later band is hard-thresholded.
    cleaned = [bands[0]]
    for band in bands[1:]:
        cleaned.append(pywt.threshold(band, value=cutoff, mode='hard'))

    return pywt.waverec(cleaned, wavelet, mode='per')


def spectrogram_from_eeg(parquet_path, display=False):
    '''
    Builds a 4-channel mel-spectrogram image from one EEG parquet file

    The middle 10,000 rows of the recording are used. For each of the four
    column lists in FEATS, the differences between consecutive columns are
    converted to mel spectrograms and averaged into one output channel.

    Params
    ----------
    parquet_path : str
        Path of the EEG parquet file; it must contain the columns named in FEATS
    display : bool
        When true a matplotlib figure is opened; nothing is drawn on it
        inside this function

    Returns
    ----------
    float32 array of shape (128, 256, 4)
    '''
    # LOAD MIDDLE 50 SECONDS OF EEG SERIES
    eeg = pd.read_parquet(parquet_path)
    middle = (len(eeg) - 10_000) // 2
    eeg = eeg.iloc[middle:middle + 10_000]

    # VARIABLE TO HOLD SPECTROGRAM
    img = np.zeros((128, 256, 4), dtype='float32')

    if display:
        plt.figure(figsize=(10, 7))

    for k in range(4):
        cols = FEATS[k]

        for kk in range(4):
            # COMPUTE PAIR DIFFERENCES
            x = eeg[cols[kk]].values - eeg[cols[kk + 1]].values

            # FILL NANS: use the mean of the valid samples, or zeros when
            # there are none. The mean is only computed when it is needed so
            # an all-NaN signal does not trigger numpy's empty-slice warning.
            if np.isnan(x).mean() < 1:
                x = np.nan_to_num(x, nan=np.nanmean(x))
            else:
                x[:] = 0

            # DENOISE
            if USE_WAVELET:
                x = denoise(x, wavelet=USE_WAVELET)

            # RAW SPECTROGRAM
            mel_spec = librosa.feature.melspectrogram(
                y=x, sr=200, hop_length=len(x) // 256,
                n_fft=1024, n_mels=128, fmin=0, fmax=20, win_length=128)

            # LOG TRANSFORM, trimmed to a frame count that is a multiple of 32
            width = (mel_spec.shape[1] // 32) * 32
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:, :width]

            # Rescale the dB values (original note: standardize to -1 to 1)
            mel_spec_db = (mel_spec_db + 40) / 40
            img[:, :, k] += mel_spec_db

        # AVERAGE THE 4 MONTAGE DIFFERENCES
        img[:, :, k] /= 4.0

    return img

class ReadData():
    '''
    Reads the competition files for either the train or the test split

    Params
    ----------
    is_train : bool
        Selects the train files when true, the test files otherwise
    '''

    def __init__(self, is_train=True):
        self.is_train = is_train

    def _read_data(self, data_type, file_id):
        '''
        Returns the parquet file `<split>_<data_type>/<file_id>.parquet`
        found under the project data directory
        '''
        split = 'train' if self.is_train else 'test'
        path = (PROJECT_PATH
                + f"/data/hms-harmful-brain-activity-classification/{split}_{data_type}/{file_id}.parquet")
        return pd.read_parquet(path)

    def read_spectrogram_data(self, spectrogram_id):
        '''
        Returns a spectrogram file indexed by its `time` column
        '''
        return self._read_data('spectrograms', spectrogram_id).set_index('time')

    def read_eeg_data(self, eeg_id) -> pd.DataFrame:
        '''
        Returns the raw EEG file for `eeg_id`
        '''
        return self._read_data('eegs', eeg_id)

    def read_eeg_built_spectrogram_data(self, eeg_id) -> pd.DataFrame:
        '''
        Returns the EEG-built spectrogram of `eeg_id` as one wide dataframe

        Each of the four channels of the spectrogram array is transposed and
        its columns are prefixed with the montage label (`LL_`, `LP_`, `RP_`, `RR_`).
        '''
        montages = ['LL', 'LP', 'RP', 'RR']

        if self.is_train:
            # Train spectrograms are loaded from pre-computed .npy files.
            npy_path = PROJECT_PATH + f"/data/hms-harmful-brain-activity-classification/EEG_Spectrograms/{eeg_id}.npy"
            eeg_specs = np.load(npy_path)
        else:
            # NOTE(review): this branch reads from the hard-coded Kaggle input
            # directory rather than PROJECT_PATH — confirm this is intended.
            eeg_specs = spectrogram_from_eeg(
                f"/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/{eeg_id}.parquet")

        # Build every channel frame first and concatenate once.
        channels = [
            pd.DataFrame(eeg_specs[:, :, i]).T.add_prefix(f'{montage}_')
            for i, montage in enumerate(montages)
        ]
        return pd.concat(channels, axis=1)

    def read_train_data(self):
        '''
        Returns train.csv from the project data directory
        '''
        TRAIN_PATH = PROJECT_PATH + '/data/hms-harmful-brain-activity-classification/train.csv'
        return pd.read_csv(TRAIN_PATH)

    def read_test_data(self):
        '''
        Returns test.csv from the project data directory
        '''
        TEST_PATH = PROJECT_PATH + '/data/hms-harmful-brain-activity-classification/test.csv'
        # Read the path built above; it was previously computed and then
        # ignored in favour of a hard-coded Kaggle path.
        return pd.read_csv(TEST_PATH)

class FeatureEngineerData(ReadData):
    '''
    Per-column summary features, each returned as a one-row dataframe
    indexed by the row id taken from the metadata
    '''

    def __init__(self, metadata, is_train=True, row_id='label_id'):
        '''
        Params
        ----------
        metadata : dict
            Contains the information on the eeg ids and labels
        is_train : bool
            Selects the train files when true, the test files otherwise
        row_id : str
            Key of `metadata` whose value labels the produced feature row
        '''
        super().__init__(is_train=is_train)
        self.metadata = metadata

        self.row_id = metadata[row_id]

    def _pivot_stat(self, stat, name) -> pd.DataFrame:
        '''
        Turns a per-column statistic (a Series indexed by column name) into a
        single row indexed by `self.row_id`, with columns named `<name>_<column>`
        '''
        return (stat
                .reset_index()
                .set_axis(['var', name], axis=1)
                .assign(row_id=self.row_id)
                .pivot(columns='var', values=name, index='row_id')
                .add_prefix(f'{name}_')
                )

    def get_mean(self, df) -> pd.DataFrame:
        '''
        Returns the mean of every column
        '''
        return self._pivot_stat(df.mean(), 'mean')

    def get_max(self, df) -> pd.DataFrame:
        '''
        Returns the maximum of every column
        '''
        return self._pivot_stat(df.max(), 'max')

    def get_min(self, df) -> pd.DataFrame:
        '''
        Returns the minimum of every column
        '''
        # Uses df.min(); this method previously returned the maxima under
        # `min_` column names.
        return self._pivot_stat(df.min(), 'min')

    def get_corr(self, df) -> pd.DataFrame:
        '''
        Returns the correlation of an eeg file
        '''

        def apply_mask(corr):
            # Keep the upper triangle (diagonal included) so each pair of
            # columns appears once; masked cells become NaN and are dropped.
            mask = np.triu(np.ones_like(corr, dtype=bool))
            return corr.where(mask).unstack().dropna()

        return (df
                .corr()
                .pipe(apply_mask)
                .reset_index()
                .set_axis(['var_1', 'var_2', 'corr'], axis=1)
                # Drop the diagonal (a column paired with itself).
                .query("var_1 != var_2")
                .assign(
                    row_id=self.row_id,
                    label=lambda x: x.var_1 + "_" + x.var_2
                )
                .pivot(columns='label', values='corr', index='row_id')
                .add_prefix('cor_')
                )

    def filter_spectrogram_corr(self, corr_df) -> pd.DataFrame:
        '''
        Returns a dataframe with only the correlation across the same frequency
        '''
        def same_frequency(col):
            # Compares the 3rd and 5th underscore-separated parts of the name.
            parts = col.split('_')
            return parts[2] == parts[4]

        return corr_df[[col for col in corr_df.columns if same_frequency(col)]]

    def filter_eegspectrogram_corr(self, corr_df) -> pd.DataFrame:
        '''
        Not implemented: currently does nothing and returns None
        '''
        pass

    def get_std(self, df) -> pd.DataFrame:
        '''
        Returns the standard deviation of every column
        '''
        return self._pivot_stat(df.std(), 'std')

    def get_range(self, df) -> pd.DataFrame:
        '''
        Returns the range (maximum minus minimum) of every column
        '''
        return self._pivot_stat(df.max().sub(df.min()), 'range')


class EEGFeatures(FeatureEngineerData):
    '''
    Summary features computed over windows of a raw EEG file
    '''

    def get_offset(self):
        '''
        Returns the [left, right] row bounds of the labelled EEG segment,
        or [0, 10000] when the metadata has no `right_eeg_index`
        '''
        if self.metadata.get('right_eeg_index') is None:
            return [0, 10000]
        return [self.metadata['left_eeg_index'], self.metadata['right_eeg_index']]

    def format_eeg_data(self, window_sizes=None):
        '''
        Returns one row holding the features of every requested window

        Params
        ----------
        window_sizes : dict
            Maps a window name to its (left, right) positional bounds inside
            the labelled segment. When omitted or empty an empty dataframe
            is returned.
        '''
        windows = {} if window_sizes is None else window_sizes

        offset_range = self.get_offset()

        df = self.read_eeg_data(self.metadata['eeg_id']).iloc[offset_range[0]:offset_range[1]]

        frames = []
        for window in windows:
            left_index = windows[window][0]
            right_index = windows[window][1]
            frames.append(self.get_features(df.iloc[left_index:right_index], time_id=window))

        # pd.concat rejects an empty list, so the no-window case is explicit.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, axis=1)

    def get_features(self, df, time_id) -> pd.DataFrame:
        '''
        Returns mean, std, max, range and correlation features of `df`,
        with columns prefixed by `eeg_<time_id>_`
        '''
        return (
            pd.concat([
                self.get_mean(df),
                self.get_std(df),
                self.get_max(df),
                self.get_range(df),
                self.get_corr(df)
            ], axis=1).add_prefix(f"eeg_{time_id}_")
        )


class SpectrogramFeatures(FeatureEngineerData):
    '''
    Summary features computed over windows of a spectrogram file
    '''

    def get_offset(self):
        '''
        Returns `spectrogram_label_offset_seconds` from the metadata,
        or 0 when it is missing or None
        '''
        offset = self.metadata.get('spectrogram_label_offset_seconds')
        return 0 if offset is None else offset

    def format_spectrogram_data(self, window_sizes=None):
        '''
        Returns one row holding the features of every requested window

        Params
        ----------
        window_sizes : dict
            Maps a window name to (left, right) offsets that are added to the
            middle of the selected 600-wide range of the `time` index. When
            omitted or empty an empty dataframe is returned.
        '''
        windows = {} if window_sizes is None else window_sizes

        # Create a variable to make the code more readable
        offset = self.get_offset()

        # Read specific spectrogram window
        df = (self.read_spectrogram_data(self.metadata['spectrogram_id'])
              .loc[offset:offset + 600]
              .fillna(0)
              )

        # Creates the middle of the spectrogram
        middle = (offset + (600 + offset)) / 2

        frames = []
        for window in windows:
            left_index = windows[window][0]
            right_index = windows[window][1]
            frames.append(
                self.get_features(df.loc[middle + left_index:middle + right_index], time_id=window)
            )

        # pd.concat rejects an empty list, so the no-window case is explicit.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, axis=1)

    def get_features(self, df, time_id) -> pd.DataFrame:
        '''
        Returns mean, std, max, min and range features of `df`,
        with columns prefixed by `spec_<time_id>_`
        '''
        return (
            pd.concat([
                self.get_mean(df),
                self.get_std(df),
                self.get_max(df),
                self.get_min(df),
                self.get_range(df)
            ], axis=1).add_prefix(f"spec_{time_id}_")
        )


class EEGBuiltSpectrogramFeatures(FeatureEngineerData):
    '''
    Summary features computed over windows of an EEG-built spectrogram
    '''

    def format_custom_spectrogram(self, window_sizes=None):
        '''
        Returns one row holding the features of every requested window

        Params
        ----------
        window_sizes : dict
            Maps a window name to its (left, right) positional row bounds in
            the spectrogram dataframe. When omitted or empty an empty
            dataframe is returned.
        '''
        # The former default `{()}` was a set, which cannot be subscripted by
        # window name, so calling without arguments raised a TypeError.
        windows = {} if window_sizes is None else window_sizes

        df = self.read_eeg_built_spectrogram_data(self.metadata['eeg_id']).copy()

        frames = []
        for window in windows:
            left_index = windows[window][0]
            right_index = windows[window][1]
            frames.append(self.get_features(df.iloc[left_index:right_index], time_id=window))

        # pd.concat rejects an empty list, so the no-window case is explicit.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, axis=1)

    def get_features(self, df, time_id) -> pd.DataFrame:
        '''
        Returns mean, std, max, min and range features of `df`,
        with columns prefixed by `eegspec_<time_id>_`
        '''
        return (
            pd.concat([
                self.get_mean(df),
                self.get_std(df),
                self.get_max(df),
                self.get_min(df),
                self.get_range(df)
            ], axis=1).add_prefix(f"eegspec_{time_id}_")
        )
Loading

0 comments on commit c38342b

Please sign in to comment.