Add Functions for Evaluating Target Variables in Predictive Modeling #57

Open · wants to merge 10 commits into main
2 changes: 2 additions & 0 deletions .gitignore
@@ -172,3 +172,5 @@ dmypy.json
# Pyre type checker
.pyre/

# Temporary files
temp.py
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
@@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use

### Additional Features

Preferably, use the `_bibmon_tools.py` file to implement additional features.

### Testing New Functionalities

The first step in adding new functionality is to install the testing dependencies. To do this, run the following command:

```bash
pip install -r test/requirements.txt
```

After implementing the new functionality, run the tests to confirm that the code works correctly. To do this, run the following command:

```bash
pytest
```
4 changes: 2 additions & 2 deletions bibmon/__init__.py
@@ -4,12 +4,12 @@
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._load_data import load_tennessee_eastman, load_real_data, load_3w
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'load_tennessee_eastman', 'load_real_data', 'load_3w',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
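
Since `load_3w` is now exported at the package root, it can be imported like the existing loaders (its signature is not shown in this diff, so no arguments are assumed here):

```python
from bibmon import load_3w  # new public loader added by this PR
```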
147 changes: 147 additions & 0 deletions bibmon/_alarms.py
@@ -44,3 +44,150 @@ def detecOutlier(data, lim, count = False, count_limit = 1):
alarm = +1
return alarm

def detectStdDevOutlier(data, threshold=2):
"""
Detects outliers based on the standard deviation method.

Parameters
----------
data: array-like
The input data to be analyzed.
threshold: float, optional
The number of standard deviations from the mean to consider
as an outlier. Default is 2.

Returns
----------
alarm: ndarray
An array indicating the outliers (0 for values within
threshold * stddev, 1 for outliers).
"""
mean = np.nanmean(data)
std_dev = np.nanstd(data)
upper_limit = mean + threshold * std_dev
lower_limit = mean - threshold * std_dev
return np.where((data > upper_limit) | (data < lower_limit), 1, 0)
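
A minimal usage sketch for `detectStdDevOutlier`, assuming the alarm functions are imported directly from the (private) `bibmon._alarms` module and using synthetic data:

```python
import numpy as np
from bibmon._alarms import detectStdDevOutlier

data = np.array([10.0, 10.2, 9.9, 10.1, 25.0, 10.0])
alarm = detectStdDevOutlier(data, threshold=2)
print(alarm)  # [0 0 0 0 1 0] -- only the spike at 25.0 lies beyond 2 std devs
```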


def detectTrend(data, window_size=5):
"""
Detects trend in the data using a moving average.

Parameters
----------
data: array-like
The input data to be analyzed.
window_size: int
The size of the moving window to calculate the trend.

Returns
----------
alarm: ndarray
An array indicating the trend (1 for positive trend, -1 for negative trend, 0 for no trend).
"""
if len(data) < window_size:
return np.zeros(len(data))

moving_avg = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
trend = np.diff(moving_avg)
trend_alarm = np.where(trend > 0, 1, np.where(trend < 0, -1, 0))

    # Pad the beginning with zeros so the output has the same length as the
    # input (the moving average loses window_size-1 points and the diff one more)
    trend_alarm = np.pad(trend_alarm, (window_size, 0), mode='constant')

return trend_alarm
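
A similar sketch for `detectTrend` on a steadily increasing series (same import assumption as above):

```python
import numpy as np
from bibmon._alarms import detectTrend

data = np.arange(8, dtype=float)         # steadily increasing series
alarm = detectTrend(data, window_size=5)
print(alarm)  # zeros during the warm-up, then +1 once the rising trend appears
```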


def detectMovingWindowOutlier(data, window_size=10, count_limit=1):
"""
Detects outliers in a moving window.

Parameters
----------
data: array-like
The input data to be analyzed.
window_size: int
The size of the moving window to analyze.
count_limit: int
The maximum number of outliers allowed within the window.

Returns
----------
alarm: ndarray
An array indicating if the count of outliers exceeds
the count_limit within each window (1 for alarm, 0 for no alarm).
"""
alarm = np.zeros(len(data))
    for i in range(len(data) - window_size + 1):
        window = data[i:i + window_size]
        # a point counts as an outlier here when it lies above the window mean
        if np.count_nonzero(window > np.nanmean(window)) > count_limit:
            alarm[i + window_size - 1] = 1  # mark the last element of the window
return alarm
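
A sketch for `detectMovingWindowOutlier`; alarms are placed at the last index of each offending window:

```python
import numpy as np
from bibmon._alarms import detectMovingWindowOutlier

data = np.array([1.0, 1.0, 1.0, 5.0, 6.0, 1.0, 1.0, 1.0, 1.0, 1.0])
alarm = detectMovingWindowOutlier(data, window_size=5, count_limit=1)
print(alarm)  # 1s appear at the ends of windows containing both large values
```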

def detectBias(data, expected_value, threshold=0.1):
"""
Detects bias in the data by comparing the mean to an expected value.

Parameters
----------
data: array-like
The input data to be analyzed.
expected_value: float
The expected mean value to compare against.
threshold: float, optional
The threshold for deviation from the expected value. Default is 0.1.

Returns
----------
    alarm: ndarray
        A 0-d array equal to 1 if the mean deviates from the expected
        value by more than the threshold, 0 otherwise.
"""
mean_value = np.nanmean(data)
return np.where(np.abs(mean_value - expected_value) > threshold, 1, 0)
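
A sketch for `detectBias`, comparing the mean of a signal against an expected setpoint:

```python
import numpy as np
from bibmon._alarms import detectBias

data = np.array([1.2, 1.3, 1.25, 1.28])
alarm = detectBias(data, expected_value=1.0, threshold=0.1)
print(alarm)  # 1 -- the mean (~1.26) deviates from 1.0 by more than 0.1
```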


def detectNelsonRules(data,threshold=1):
"""
Detects anomalies in the data based on Nelson Rules 1, 2, and 3.

Parameters
----------
data: array-like
The input data to be analyzed.

Returns
----------
alarms: dict
A dictionary with alarms for each rule (1 for alarm, 0 for no alarm).
"""
mean_value = np.nanmean(data)
std_dev = np.nanstd(data)

    # rule 1: one point more than `threshold` standard deviations from the mean
rule_1_alarms = np.where(np.abs(data - mean_value) > threshold * std_dev, 1, 0)

    # rule 2: two points in a row more than `threshold` standard deviations from the mean
rule_2_alarms = np.zeros_like(data)
for i in range(1, len(data)):
if (np.abs(data[i] - mean_value) > threshold * std_dev) and (np.abs(data[i-1] - mean_value) > threshold * std_dev):
rule_2_alarms[i-1:i+1] = 1

    # rule 3: six points in a row steadily increasing or decreasing
    rule_3_alarms = np.zeros_like(data)
    inc_count, dec_count = 1, 1
    for i in range(1, len(data)):
        if data[i] > data[i-1]:
            inc_count += 1
            dec_count = 1
        elif data[i] < data[i-1]:
            dec_count += 1
            inc_count = 1
        else:
            inc_count, dec_count = 1, 1
        if inc_count >= 6 or dec_count >= 6:
            rule_3_alarms[i-5:i+1] = 1
alarms = {
'rule_1': rule_1_alarms,
'rule_2': rule_2_alarms,
'rule_3': rule_3_alarms
}

return alarms
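
A sketch for `detectNelsonRules` on a series containing a long increasing run (same import assumption; the exact rule-1 flags depend on the sample statistics):

```python
import numpy as np
from bibmon._alarms import detectNelsonRules

data = np.array([0.0, 0.1, -0.1, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
alarms = detectNelsonRules(data, threshold=1)
print(alarms['rule_3'])  # flags the run of consecutively increasing points
```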


130 changes: 130 additions & 0 deletions bibmon/_best_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression


def bestColumn_pearson_spearman(df):
"""
This function calculates the Pearson and Spearman correlation coefficients
between all columns in the dataframe. It identifies the column with the highest
average correlation (positive) with other columns using both correlation methods.

Pearson measures linear relationships, while Spearman measures monotonic relationships.

Returns:
dict: A dictionary containing the best column for each correlation method.
"""
pearson_corr = df.corr(method="pearson")
spearman_corr = df.corr(method="spearman")

    # Average correlation of each column with all columns (including itself);
    # the self-correlation of 1 inflates every column's mean equally, so the
    # ranking is unaffected and the top entry is the best column
    spearman_correlation_mean = spearman_corr.mean().sort_values(ascending=False)
    y_column_spearman = spearman_correlation_mean.index[0]

    pearson_correlation_mean = pearson_corr.mean().sort_values(ascending=False)
    y_column_pearson = pearson_correlation_mean.index[0]

    # return the best column for each correlation method
return {'pearson': y_column_pearson, 'spearman': y_column_spearman}
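
A sketch of `bestColumn_pearson_spearman` on synthetic data where two columns are strongly correlated:

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_pearson_spearman

rng = np.random.default_rng(0)
x = rng.normal(size=200)
df = pd.DataFrame({'a': x,
                   'b': 2 * x + rng.normal(scale=0.1, size=200),
                   'c': rng.normal(size=200)})
print(bestColumn_pearson_spearman(df))  # 'a' or 'b' for both methods
```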

def bestColumn_with_least_mae_or_r2(df):
"""
This function evaluates each column as a target variable for regression
and calculates the Mean Absolute Error (MAE) and R-squared (R²) scores
for predictions made by an XGBoost regressor.

It identifies which column minimizes MAE and maximizes R², providing insights
on the best target variable based on predictive performance.

Returns:
dict: A dictionary with sorted results for MAE and R².
"""
performance_mae = {}
performance_r2 = {}

    for column in df.columns:
        X = df.drop(column, axis=1)
        Y = df[column]

        Y = Y.fillna(Y.mean())

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        # XGBRegressor is used because it trains quickly, especially with GPU acceleration
        model = xgb.XGBRegressor(random_state=42, n_jobs=-1, max_depth=None, tree_method='gpu_hist', predictor='gpu_predictor')
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        performance_mae[column] = mae
        performance_r2[column] = r2


performance_mae_sorted = sorted(performance_mae.items(), key=lambda x: x[1], reverse=False)
performance_r2_sorted = sorted(performance_r2.items(), key=lambda x: x[1], reverse=True)

    return {'r2': performance_r2_sorted, 'mae': performance_mae_sorted}
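
A sketch of the MAE/R² ranking; it assumes an XGBoost build that supports the legacy `gpu_hist`/`gpu_predictor` options (on CPU-only machines, `tree_method='hist'` would be the usual substitute):

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_with_least_mae_or_r2

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(300, 4)), columns=['a', 'b', 'c', 'd'])
df['d'] = df['a'] + df['b']   # 'd' is easy to predict from the other columns
results = bestColumn_with_least_mae_or_r2(df)
print(results['r2'][0])       # likely ('d', <high R² score>)
```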

def bestColumn_feature_importance(df):

"""
    This function trains an XGBoost regressor with each column in turn as the
    target and averages each feature's importance across all of these models.
    This helps to identify which features contribute most, on average, to
    predicting the other columns.

Returns:
        list: A sorted list of (feature, average importance) pairs, highest first.
"""

    feature_importances = {}

    for column in df.columns:
        X = df.drop(column, axis=1)
        Y = df[column]

        Y = Y.fillna(Y.mean())

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        model = xgb.XGBRegressor(random_state=42, n_jobs=-1, max_depth=None, tree_method='gpu_hist', predictor='gpu_predictor')
        model.fit(X_train, Y_train)

        # Accumulate each feature's importance across all target choices;
        # averaging the normalized importances of a single model would be a
        # constant (they sum to 1), so aggregation must be done per feature
        for feature, importance in zip(X.columns, model.feature_importances_):
            feature_importances.setdefault(feature, []).append(importance)

    feature_importances_sorted = sorted(
        ((feature, np.mean(values)) for feature, values in feature_importances.items()),
        key=lambda x: x[1], reverse=True)

    return feature_importances_sorted
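
A sketch of the feature-importance ranking on the same kind of synthetic data (same GPU assumption as above):

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_feature_importance

rng = np.random.default_rng(7)
df = pd.DataFrame(rng.normal(size=(300, 4)), columns=['a', 'b', 'c', 'd'])
df['d'] = df['a'] + df['b']
for feature, importance in bestColumn_feature_importance(df):
    print(feature, importance)   # features sorted by average importance
```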


def bestColumn_mutual_information(df):

"""
    This function treats each column in turn as a target and computes the
    average mutual information between that column and the remaining
    features. Mutual information quantifies how much knowing one variable
    reduces uncertainty about another, helping to determine which column
    is most predictable from the rest.

Returns:
        list: A sorted list of (column, average mutual information) pairs, highest first.
"""

    mi_scores = {}

    for column in df.columns:
        X = df.drop(column, axis=1)
        Y = df[column]

        # mutual_info_regression does not accept NaNs, so impute with column means
        X = X.fillna(X.mean())
        Y = Y.fillna(Y.mean())

        mi = mutual_info_regression(X, Y)
        mi_scores[column] = mi.mean()

    mi_sorted = sorted(mi_scores.items(), key=lambda x: x[1], reverse=True)

return mi_sorted
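
A sketch of the mutual-information ranking; the quadratic relationship below is deliberately nonlinear, which correlation-based methods can miss but mutual information can capture:

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_mutual_information

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(300, 3)), columns=['a', 'b', 'c'])
df['c'] = df['a'] ** 2        # nonlinear dependence between 'a' and 'c'
print(bestColumn_mutual_information(df)[0])  # likely ('c', ...) or ('a', ...)
```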