Add Functions for Evaluating Target Variables in Predictive Modeling #57

Open · wants to merge 10 commits into main
2 changes: 2 additions & 0 deletions .gitignore
@@ -172,3 +172,5 @@ dmypy.json
# Pyre type checker
.pyre/

# Temporary files
temp.py
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
@@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use

### Additional Features

Preferably, use the `_bibmon_tools.py` file to implement additional features.

### Testing New Functionalities

The first step in adding new functionality is to install the testing dependencies. To do this, run the following command:

```bash
pip install -r test/requirements.txt
```

After implementing the new functionality, run the tests to confirm that the code works correctly. To do this, run the following command:

```bash
pytest
```
4 changes: 2 additions & 2 deletions bibmon/__init__.py
@@ -4,12 +4,12 @@
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._load_data import load_tennessee_eastman, load_real_data, load_3w
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'load_tennessee_eastman', 'load_real_data', 'load_3w',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
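
Since `load_3w` is now exported at the package root, it can be imported like the existing loaders (its signature is not shown in this diff, so no arguments are assumed here):

```python
from bibmon import load_3w  # new public loader added by this PR
```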
147 changes: 147 additions & 0 deletions bibmon/_alarms.py
@@ -44,3 +44,150 @@ def detecOutlier(data, lim, count = False, count_limit = 1):
alarm = +1
return alarm

def detectStdDevOutlier(data, threshold=2):
"""
Detects outliers based on the standard deviation method.

Parameters
----------
data: array-like
The input data to be analyzed.
threshold: float, optional
The number of standard deviations from the mean to consider
as an outlier. Default is 2.

Returns
----------
alarm: ndarray
An array indicating the outliers (0 for values within
threshold * stddev, 1 for outliers).
"""
mean = np.nanmean(data)
std_dev = np.nanstd(data)
upper_limit = mean + threshold * std_dev
lower_limit = mean - threshold * std_dev
return np.where((data > upper_limit) | (data < lower_limit), 1, 0)
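
A minimal usage sketch for `detectStdDevOutlier`, assuming the alarm functions are imported directly from the (private) `bibmon._alarms` module and using synthetic data:

```python
import numpy as np
from bibmon._alarms import detectStdDevOutlier

data = np.array([10.0, 10.2, 9.9, 10.1, 25.0, 10.0])
alarm = detectStdDevOutlier(data, threshold=2)
print(alarm)  # [0 0 0 0 1 0] -- only the spike at 25.0 lies beyond 2 std devs
```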


def detectTrend(data, window_size=5):
"""
Detects trend in the data using a moving average.

Parameters
----------
data: array-like
The input data to be analyzed.
window_size: int
The size of the moving window to calculate the trend.

Returns
----------
alarm: ndarray
An array indicating the trend (1 for positive trend, -1 for negative trend, 0 for no trend).
"""
if len(data) < window_size:
return np.zeros(len(data))

moving_avg = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
trend = np.diff(moving_avg)
trend_alarm = np.where(trend > 0, 1, np.where(trend < 0, -1, 0))

    # Pad the beginning with zeros so the output has the same length as the
    # input (the moving average loses window_size-1 points and the diff one more)
    trend_alarm = np.pad(trend_alarm, (window_size, 0), mode='constant')

return trend_alarm
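
A similar sketch for `detectTrend` on a steadily increasing series (same import assumption as above):

```python
import numpy as np
from bibmon._alarms import detectTrend

data = np.arange(8, dtype=float)         # steadily increasing series
alarm = detectTrend(data, window_size=5)
print(alarm)  # zeros during the warm-up, then +1 once the rising trend appears
```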


def detectMovingWindowOutlier(data, window_size=10, count_limit=1):
"""
Detects outliers in a moving window.

Parameters
----------
data: array-like
The input data to be analyzed.
window_size: int
The size of the moving window to analyze.
count_limit: int
The maximum number of outliers allowed within the window.

Returns
----------
alarm: ndarray
An array indicating if the count of outliers exceeds
the count_limit within each window (1 for alarm, 0 for no alarm).
"""
alarm = np.zeros(len(data))
    for i in range(len(data) - window_size + 1):
        window = data[i:i + window_size]
        # a point counts as an outlier here when it lies above the window mean
        if np.count_nonzero(window > np.nanmean(window)) > count_limit:
            alarm[i + window_size - 1] = 1  # mark the last element of the window
return alarm
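
A sketch for `detectMovingWindowOutlier`; alarms are placed at the last index of each offending window:

```python
import numpy as np
from bibmon._alarms import detectMovingWindowOutlier

data = np.array([1.0, 1.0, 1.0, 5.0, 6.0, 1.0, 1.0, 1.0, 1.0, 1.0])
alarm = detectMovingWindowOutlier(data, window_size=5, count_limit=1)
print(alarm)  # 1s appear at the ends of windows containing both large values
```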

def detectBias(data, expected_value, threshold=0.1):
"""
Detects bias in the data by comparing the mean to an expected value.

Parameters
----------
data: array-like
The input data to be analyzed.
expected_value: float
The expected mean value to compare against.
threshold: float, optional
The threshold for deviation from the expected value. Default is 0.1.

Returns
----------
    alarm: ndarray
        A 0-d array equal to 1 if the mean deviates from the expected
        value by more than the threshold, 0 otherwise.
"""
mean_value = np.nanmean(data)
return np.where(np.abs(mean_value - expected_value) > threshold, 1, 0)
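
A sketch for `detectBias`, comparing the mean of a signal against an expected setpoint:

```python
import numpy as np
from bibmon._alarms import detectBias

data = np.array([1.2, 1.3, 1.25, 1.28])
alarm = detectBias(data, expected_value=1.0, threshold=0.1)
print(alarm)  # 1 -- the mean (~1.26) deviates from 1.0 by more than 0.1
```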


def detectNelsonRules(data,threshold=1):
"""
Detects anomalies in the data based on Nelson Rules 1, 2, and 3.

Parameters
----------
data: array-like
The input data to be analyzed.

Returns
----------
alarms: dict
A dictionary with alarms for each rule (1 for alarm, 0 for no alarm).
"""
mean_value = np.nanmean(data)
std_dev = np.nanstd(data)

    # rule 1: one point more than `threshold` standard deviations from the mean
rule_1_alarms = np.where(np.abs(data - mean_value) > threshold * std_dev, 1, 0)

    # rule 2: two points in a row more than `threshold` standard deviations from the mean
rule_2_alarms = np.zeros_like(data)
for i in range(1, len(data)):
if (np.abs(data[i] - mean_value) > threshold * std_dev) and (np.abs(data[i-1] - mean_value) > threshold * std_dev):
rule_2_alarms[i-1:i+1] = 1

    # rule 3: six points in a row steadily increasing or decreasing
    rule_3_alarms = np.zeros_like(data)
    inc_count, dec_count = 1, 1
    for i in range(1, len(data)):
        if data[i] > data[i-1]:
            inc_count += 1
            dec_count = 1
        elif data[i] < data[i-1]:
            dec_count += 1
            inc_count = 1
        else:
            inc_count, dec_count = 1, 1
        if inc_count >= 6 or dec_count >= 6:
            rule_3_alarms[i-5:i+1] = 1
alarms = {
'rule_1': rule_1_alarms,
'rule_2': rule_2_alarms,
'rule_3': rule_3_alarms
}

return alarms
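
A sketch for `detectNelsonRules` on a series containing a long increasing run (same import assumption; the exact rule-1 flags depend on the sample statistics):

```python
import numpy as np
from bibmon._alarms import detectNelsonRules

data = np.array([0.0, 0.1, -0.1, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
alarms = detectNelsonRules(data, threshold=1)
print(alarms['rule_3'])  # flags the run of consecutively increasing points
```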


130 changes: 130 additions & 0 deletions bibmon/_best_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression


def bestColumn_pearson_spearman(df):
"""
This function calculates the Pearson and Spearman correlation coefficients
between all columns in the dataframe. It identifies the column with the highest
average correlation (positive) with other columns using both correlation methods.

Pearson measures linear relationships, while Spearman measures monotonic relationships.

Returns:
dict: A dictionary containing the best column for each correlation method.
"""
pearson_corr = df.corr(method="pearson")
spearman_corr = df.corr(method="spearman")

    # Average correlation of each column with all columns (including itself);
    # the self-correlation of 1 inflates every column's mean equally, so the
    # ranking is unaffected and the top entry is the best column
    spearman_correlation_mean = spearman_corr.mean().sort_values(ascending=False)
    y_column_spearman = spearman_correlation_mean.index[0]

    pearson_correlation_mean = pearson_corr.mean().sort_values(ascending=False)
    y_column_pearson = pearson_correlation_mean.index[0]

    # return the best column for each correlation method
return {'pearson': y_column_pearson, 'spearman': y_column_spearman}
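
A sketch of `bestColumn_pearson_spearman` on synthetic data where two columns are strongly correlated:

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_pearson_spearman

rng = np.random.default_rng(0)
x = rng.normal(size=200)
df = pd.DataFrame({'a': x,
                   'b': 2 * x + rng.normal(scale=0.1, size=200),
                   'c': rng.normal(size=200)})
print(bestColumn_pearson_spearman(df))  # 'a' or 'b' for both methods
```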

def bestColumn_with_least_mae_or_r2(df):
"""
This function evaluates each column as a target variable for regression
and calculates the Mean Absolute Error (MAE) and R-squared (R²) scores
for predictions made by an XGBoost regressor.

It identifies which column minimizes MAE and maximizes R², providing insights
on the best target variable based on predictive performance.

Returns:
dict: A dictionary with sorted results for MAE and R².
"""
performance_mae = {}
performance_r2 = {}

    for column in df.columns:
        X = df.drop(column, axis=1)
        Y = df[column]

        Y = Y.fillna(Y.mean())

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        # XGBRegressor is used because it trains quickly, especially with GPU acceleration
        model = xgb.XGBRegressor(random_state=42, n_jobs=-1, max_depth=None, tree_method='gpu_hist', predictor='gpu_predictor')
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        performance_mae[column] = mae
        performance_r2[column] = r2


performance_mae_sorted = sorted(performance_mae.items(), key=lambda x: x[1], reverse=False)
performance_r2_sorted = sorted(performance_r2.items(), key=lambda x: x[1], reverse=True)

    return {'r2': performance_r2_sorted, 'mae': performance_mae_sorted}
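
A sketch of the MAE/R² ranking; it assumes an XGBoost build that supports the legacy `gpu_hist`/`gpu_predictor` options (on CPU-only machines, `tree_method='hist'` would be the usual substitute):

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_with_least_mae_or_r2

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(300, 4)), columns=['a', 'b', 'c', 'd'])
df['d'] = df['a'] + df['b']   # 'd' is easy to predict from the other columns
results = bestColumn_with_least_mae_or_r2(df)
print(results['r2'][0])       # likely ('d', <high R² score>)
```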

def bestColumn_feature_importance(df):

"""
    This function trains an XGBoost regressor with each column in turn as the
    target and averages each feature's importance across all of these models.
    This helps to identify which features contribute most, on average, to
    predicting the other columns.

Returns:
        list: A sorted list of (feature, average importance) pairs, highest first.
"""

    feature_importances = {}

    for column in df.columns:
        X = df.drop(column, axis=1)
        Y = df[column]

        Y = Y.fillna(Y.mean())

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

        model = xgb.XGBRegressor(random_state=42, n_jobs=-1, max_depth=None, tree_method='gpu_hist', predictor='gpu_predictor')
        model.fit(X_train, Y_train)

        # Accumulate each feature's importance across all target choices;
        # averaging the normalized importances of a single model would be a
        # constant (they sum to 1), so aggregation must be done per feature
        for feature, importance in zip(X.columns, model.feature_importances_):
            feature_importances.setdefault(feature, []).append(importance)

    feature_importances_sorted = sorted(
        ((feature, np.mean(values)) for feature, values in feature_importances.items()),
        key=lambda x: x[1], reverse=True)

    return feature_importances_sorted
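
A sketch of the feature-importance ranking on the same kind of synthetic data (same GPU assumption as above):

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_feature_importance

rng = np.random.default_rng(7)
df = pd.DataFrame(rng.normal(size=(300, 4)), columns=['a', 'b', 'c', 'd'])
df['d'] = df['a'] + df['b']
for feature, importance in bestColumn_feature_importance(df):
    print(feature, importance)   # features sorted by average importance
```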


def bestColumn_mutual_information(df):

"""
    This function treats each column in turn as a target and computes the
    average mutual information between that column and the remaining
    features. Mutual information quantifies how much knowing one variable
    reduces uncertainty about another, helping to determine which column
    is most predictable from the rest.

Returns:
        list: A sorted list of (column, average mutual information) pairs, highest first.
"""

    mi_scores = {}

    for column in df.columns:
        X = df.drop(column, axis=1)
        Y = df[column]

        # mutual_info_regression does not accept NaNs, so impute with column means
        X = X.fillna(X.mean())
        Y = Y.fillna(Y.mean())

        mi = mutual_info_regression(X, Y)
        mi_scores[column] = mi.mean()

    mi_sorted = sorted(mi_scores.items(), key=lambda x: x[1], reverse=True)

return mi_sorted
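
A sketch of the mutual-information ranking; the quadratic relationship below is deliberately nonlinear, which correlation-based methods can miss but mutual information can capture:

```python
import numpy as np
import pandas as pd
from bibmon._best_columns import bestColumn_mutual_information

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(300, 3)), columns=['a', 'b', 'c'])
df['c'] = df['a'] ** 2        # nonlinear dependence between 'a' and 'c'
print(bestColumn_mutual_information(df)[0])  # likely ('c', ...) or ('a', ...)
```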