diff --git a/.gitignore b/.gitignore index 770002a..0126e61 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,5 @@ dmypy.json # Pyre type checker .pyre/ +# Temporary files +temp.py \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..7c2feb7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "editor.formatOnSave": false +} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e0a93e5..eac021d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use ### Additional Features -Preferably, use the `_bibmon_tools.py` file to implement additional features. \ No newline at end of file +Preferably, use the `_bibmon_tools.py` file to implement additional features. + +### Testing New Functionalities + +The first step to add new functionalities is to download the testing libraries. To do this, run the following command: + +```bash +pip install -r test/requirements.txt +``` + +After implementing the new functionalities, run the tests to ensure that the new code is working correctly. 
def detectStdDevOutlier(data, threshold=2):
    """
    Detects outliers based on the standard deviation method.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed. Any sequence convertible to a
        float array is accepted; NaN values are ignored when the mean
        and the standard deviation are computed.
    threshold: float, optional
        The number of standard deviations from the mean to consider
        as an outlier. Default is 2.

    Returns
    ----------
    alarm: ndarray
        An array indicating the outliers (0 for values within
        threshold * stddev, 1 for outliers).
    """
    # Coerce once so plain lists/tuples support the element-wise comparisons.
    values = np.asarray(data, dtype=float)
    mean = np.nanmean(values)
    std_dev = np.nanstd(values)
    upper_limit = mean + threshold * std_dev
    lower_limit = mean - threshold * std_dev
    return np.where((values > upper_limit) | (values < lower_limit), 1, 0)


def detectTrend(data, window_size=5):
    """
    Detects trend in the data using a moving average.

    The moving average over `window_size` points is differentiated; the
    sign of each difference is reported at the position of the newest
    point that entered the window.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    window_size: int
        The size of the moving window to calculate the trend.

    Returns
    ----------
    alarm: ndarray
        An array with the same length as `data` indicating the trend
        (1 for positive trend, -1 for negative trend, 0 for no trend).
        The first `window_size` positions are always 0 because no
        difference of moving averages exists for them.
    """
    values = np.asarray(data, dtype=float)
    n_points = len(values)

    # At least window_size + 1 points are needed to compare two averages.
    if n_points <= window_size:
        return np.zeros(n_points, dtype=int)

    kernel = np.ones(window_size) / window_size
    moving_avg = np.convolve(values, kernel, mode='valid')
    trend = np.diff(moving_avg)
    trend_alarm = np.where(trend > 0, 1, np.where(trend < 0, -1, 0))

    # `trend` has n_points - window_size entries, so window_size leading
    # zeros are required for the output to line up with the input.
    return np.pad(trend_alarm, (window_size, 0), mode='constant')


def detectMovingWindowOutlier(data, window_size=10, count_limit=1):
    """
    Detects outliers in a moving window.

    Inside each window a point is counted when it is strictly greater
    than the (NaN-ignoring) mean of that window.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    window_size: int
        The size of the moving window to analyze.
    count_limit: int
        The maximum number of counted points allowed within the window.

    Returns
    ----------
    alarm: ndarray
        An array indicating if the count exceeds the count_limit within
        each window (1 for alarm, 0 for no alarm). The alarm is placed
        on the last element of the offending window.
    """
    values = np.asarray(data, dtype=float)
    alarm = np.zeros(len(values))
    for end in range(window_size, len(values) + 1):
        window = values[end - window_size:end]
        if np.count_nonzero(window > np.nanmean(window)) > count_limit:
            alarm[end - 1] = 1  # Mark the last element in the window
    return alarm


def detectBias(data, expected_value, threshold=0.1):
    """
    Detects bias in the data by comparing the mean to an expected value.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    expected_value: float
        The expected mean value to compare against.
    threshold: float, optional
        The threshold for deviation from the expected value. Default is 0.1.

    Returns
    ----------
    alarm: ndarray
        A zero-dimensional array holding 1 when the absolute difference
        between the NaN-ignoring mean and `expected_value` exceeds the
        threshold, 0 otherwise.
    """
    mean_value = np.nanmean(np.asarray(data, dtype=float))
    return np.where(np.abs(mean_value - expected_value) > threshold, 1, 0)


def detectNelsonRules(data, threshold=1):
    """
    Detects anomalies in the data based on three run rules.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    threshold: float, optional
        Number of standard deviations from the mean used by rules 1
        and 2. Default is 1.

    Returns
    ----------
    alarms: dict
        A dictionary with one integer array per rule (1 for alarm,
        0 for no alarm), each with the same length as `data`:
        'rule_1' marks single points farther than threshold * stddev
        from the mean; 'rule_2' marks pairs of consecutive points that
        both satisfy rule 1; 'rule_3' marks every point belonging to a
        run of at least six strictly increasing or strictly decreasing
        points.
    """
    values = np.asarray(data, dtype=float)
    n_points = len(values)
    mean_value = np.nanmean(values)
    std_dev = np.nanstd(values)

    # rule 1 = 1 point exceeding the threshold
    exceeds = np.abs(values - mean_value) > threshold * std_dev
    rule_1_alarms = np.where(exceeds, 1, 0)

    # rule 2 = 2 points in a row exceeding the threshold
    rule_2_alarms = np.zeros(n_points, dtype=int)
    if n_points > 1:
        both = exceeds[1:] & exceeds[:-1]
        rule_2_alarms[:-1][both] = 1
        rule_2_alarms[1:][both] = 1

    # rule 3 = 6 points in a row increasing or decreasing,
    # i.e. 5 consecutive steps with the same non-zero sign.
    rule_3_alarms = np.zeros(n_points, dtype=int)
    run_length = 0
    direction = 0
    for i, step in enumerate(np.sign(np.diff(values)), start=1):
        if step != 0 and step == direction:
            run_length += 1
        elif step > 0 or step < 0:
            # A new monotonic run starts with this step.
            direction = step
            run_length = 1
        else:
            # Flat step or NaN breaks any run.
            direction = 0
            run_length = 0
        if run_length >= 5:
            rule_3_alarms[i - 5:i + 1] = 1

    alarms = {
        'rule_1': rule_1_alarms,
        'rule_2': rule_2_alarms,
        'rule_3': rule_3_alarms
    }

    return alarms
def _best_mean_correlation(corr):
    """Return the label whose mean correlation with the other columns is highest."""
    # The diagonal (self-correlation) is excluded so it cannot mask the ranking.
    mean_with_others = pd.Series(
        {column: corr[column].drop(labels=column).mean() for column in corr.columns}
    ).dropna()
    if mean_with_others.empty:
        raise ValueError(
            "At least two columns with defined correlations are required."
        )
    return mean_with_others.idxmax()


def _new_xgb_regressor(xgb_params=None):
    """Build the XGBoost regressor shared by the column-ranking functions."""
    # 'hist' runs on any installation; the previous hard-coded
    # 'gpu_hist'/'gpu_predictor' options fail without a CUDA build.
    params = {
        'random_state': 42,
        'n_jobs': -1,
        'max_depth': None,
        'tree_method': 'hist',
    }
    if xgb_params:
        params.update(xgb_params)
    return xgb.XGBRegressor(**params)


def bestColumn_pearson_spearman(df):
    """
    This function calculates the Pearson and Spearman correlation coefficients
    between all columns in the dataframe. It identifies the column with the highest
    average correlation (positive) with the other columns using both correlation
    methods.

    Pearson measures linear relationships, while Spearman measures monotonic relationships.

    Parameters:
        df (pandas.DataFrame): Data whose columns are compared.

    Returns:
        dict: A dictionary containing the best column for each correlation method
        (keys 'pearson' and 'spearman').

    Raises:
        ValueError: If no column has a defined correlation with another column
        (for example when the dataframe has a single column).
    """
    pearson_corr = df.corr(method="pearson")
    spearman_corr = df.corr(method="spearman")

    # return a dictionary with the best column for each correlation method
    return {
        'pearson': _best_mean_correlation(pearson_corr),
        'spearman': _best_mean_correlation(spearman_corr),
    }


def bestColumn_with_least_mae_or_r2(df, xgb_params=None):
    """
    This function evaluates each column as a target variable for regression
    and calculates the Mean Absolute Error (MAE) and R-squared (R²) scores
    for predictions made by an XGBoost regressor on a 20% hold-out split.

    It identifies which column minimizes MAE and maximizes R², providing insights
    on the best target variable based on predictive performance.

    Parameters:
        df (pandas.DataFrame): Data whose columns are evaluated as targets.
        xgb_params (dict, optional): Extra keyword arguments for
            ``xgboost.XGBRegressor`` that override the defaults.

    Returns:
        dict: A dictionary with sorted results for MAE and R²: key 'r2' holds
        (column, score) pairs sorted from highest to lowest R², key 'mae'
        holds (column, error) pairs sorted from lowest to highest MAE.
    """
    performance_mae = {}
    performance_r2 = {}

    for target in df.columns:
        X = df.drop(target, axis=1)
        Y = df[target]

        # Missing target values are replaced by the column mean.
        Y = Y.fillna(Y.mean())

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        model = _new_xgb_regressor(xgb_params)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        performance_mae[target] = mean_absolute_error(Y_test, Y_pred)
        performance_r2[target] = r2_score(Y_test, Y_pred)

    performance_mae_sorted = sorted(performance_mae.items(), key=lambda x: x[1], reverse=False)
    performance_r2_sorted = sorted(performance_r2.items(), key=lambda x: x[1], reverse=True)

    return {'r2': performance_r2_sorted, 'mae': performance_mae_sorted}


def bestColumn_feature_importance(df, xgb_params=None):
    """
    This function evaluates the importance of each column as a predictor by
    training one XGBoost regressor per target column and averaging, for every
    column, the importance it received in the models where it was a feature.

    Parameters:
        df (pandas.DataFrame): Data with at least two columns.
        xgb_params (dict, optional): Extra keyword arguments for
            ``xgboost.XGBRegressor`` that override the defaults.

    Returns:
        list: (column, average importance) pairs sorted from most to least
        important.

    Raises:
        ValueError: If the dataframe has fewer than two columns.
    """
    column_names = list(df.columns)
    if len(column_names) < 2:
        raise ValueError("At least two columns are required.")

    # Averaging a single model's importances over its features gives the
    # same value for every target when they are normalized, so the
    # importances are accumulated per feature across all models instead.
    importance_totals = {name: 0.0 for name in column_names}

    for target in column_names:
        X = df.drop(target, axis=1)
        Y = df[target]
        Y = Y.fillna(Y.mean())

        model = _new_xgb_regressor(xgb_params)
        model.fit(X, Y)

        for feature, importance in zip(X.columns, model.feature_importances_):
            importance_totals[feature] += float(importance)

    # Every column acts as a feature in all models except its own.
    models_per_feature = len(column_names) - 1
    feature_importances = {
        name: total / models_per_feature
        for name, total in importance_totals.items()
    }

    return sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)


def bestColumn_mutual_information(df, random_state=42):
    """
    This function calculates the mutual information scores between each column
    and the other columns in the dataframe. Mutual information quantifies the
    amount of information obtained about one variable through the other, helping
    to determine which features are most informative for predicting the target variable.

    Parameters:
        df (pandas.DataFrame): Data whose columns are scored.
        random_state (int, optional): Seed forwarded to
            ``mutual_info_regression`` so repeated calls give the same scores.

    Returns:
        list: (column, mean mutual information with the other columns) pairs
        sorted from highest to lowest score.
    """
    mi_scores = {}

    for target in df.columns:
        X = df.drop(target, axis=1)
        Y = df[target]

        mi = mutual_info_regression(X, Y, random_state=random_state)
        mi_scores[target] = mi.mean()

    return sorted(mi_scores.items(), key=lambda x: x[1], reverse=True)
def find_df_transitions(
    df: pd.DataFrame,
    threshold: float = 1,
    data_type: Literal["string", "number"] = "number",
    label: str = None,
) -> list[int]:
    """
    Finds transitions in a DataFrame. This can be used to find indices of interesting events in the data.

    The first value of the column is the initial reference. A row is a
    transition when it differs from the reference (by more than
    `threshold` for numbers, by inequality for strings); the reference is
    then updated to that row's value.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be analyzed.
    threshold: float, optional
        Minimum absolute difference from the reference value for a row to
        count as a transition. Used only if data_type is 'number'.
    data_type: str, optional
        Type of data to be analyzed. If 'number', the threshold will be used to detect transitions. If 'string', the function will look for changes in the values.
    label: str
        Name of the column to be analyzed. If None, no transition is reported.

    Returns
    ----------
    : list of ints
        Positional indices of the transitions.

    Raises
    ----------
    ValueError
        If data_type is neither 'number' nor 'string'.
    """

    if label is None:
        return []

    if data_type not in ("number", "string"):
        raise ValueError("data_type must be 'number' or 'string'.")

    # Extract the column once; per-row `.iloc` lookups are much slower.
    values = df[label].to_numpy()
    if len(values) == 0:
        return []

    is_number = data_type == "number"
    transitions = []
    reference = values[0]

    for position in range(1, len(values)):
        current = values[position]
        if is_number:
            changed = abs(current - reference) > threshold
        else:
            changed = current != reference
        if changed:
            transitions.append(position)
            reference = current

    return transitions

###############################################################################

def split_df_percentages(df: pd.DataFrame, percentages: list[float]) -> list[pd.DataFrame]:
    """
    Splits a DataFrame into multiple DataFrames according to the given percentages, the sum of percentages must equal 1.

    For example, if percentage = [0.6, 0.2, 0.2], the function will return a list with three DataFrames, the first one with 60% of the data, the second one with 20% and the third one with 20%.

    Split points are the rounded cumulative percentages and the last part
    always extends to the end of the data, so every row belongs to exactly
    one part even when the split cannot be done exactly.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be split.
    percentages: list of floats
        List of percentages to be used in the split.

    Returns
    ----------
    : list of pandas.DataFrames
        List with the split DataFrames, in the same order as `percentages`.

    Raises
    ----------
    ValueError
        If the percentages do not sum to 1 (within floating-point tolerance).
    """

    # Exact float equality rejects valid inputs such as [0.7, 0.2, 0.1].
    if abs(sum(percentages) - 1) > 1e-9:
        raise ValueError("The sum of the percentages must be 1.")

    total_rows = len(df)
    last_position = len(percentages) - 1
    split_dfs = []
    start = 0
    cumulative = 0.0

    for position, fraction in enumerate(percentages):
        cumulative += fraction
        if position == last_position:
            end = total_rows
        else:
            end = int(round(cumulative * total_rows))
            # Keep boundaries ordered and inside the frame.
            end = min(total_rows, max(start, end))
        split_dfs.append(df.iloc[start:end])
        start = end

    return split_dfs
class GridSearchModel:
    """
    Class used to encapsulate the grid search process for hyperparameter tuning.

    Parameters
    ----------
    model : estimator object
        The base model to perform grid search on (e.g., RandomForestRegressor).
        If not provided, a RandomForestRegressor is used.

    param_grid : dict, optional
        The grid of parameters to search over. If not provided, a default
        parameter grid for RandomForestRegressor will be used.

    scoring : string, optional
        The scoring method to use for evaluating the models.

    cv : int, optional
        Number of cross-validation folds.

    test_size : float, optional
        The fraction of data held out from the search, default is 0.01 (1%).

    Attributes
    ----------
    best_params_ : dict
        The best parameters found by the grid search.

    best_estimator_ : estimator object
        The estimator that was chosen by the grid search.

    X_test_, y_test_ : pandas objects
        The hold-out split produced by the most recent search.

    Methods
    ----------
    fit(df, target_column):
        Fits the grid search using the provided data.
    """

    def __init__(self, model=None, param_grid=None, scoring='neg_mean_absolute_error', cv=5, test_size=0.01):
        self.model = model if model is not None else RandomForestRegressor(n_jobs=-1, random_state=42)

        # Param_grid default
        default_param_grid = {
            'n_estimators': [10, 100],
            'max_depth': [None, 10],
            'min_samples_split': [2, 4],
        }
        self.param_grid = param_grid if param_grid is not None else default_param_grid

        self.scoring = scoring
        self.cv = cv
        self.test_size = test_size  # Fraction of the data held out from the search
        self.grid_search = None
        self.best_params_ = None
        self.best_estimator_ = None
        self.X_test_ = None
        self.y_test_ = None

    def _require_fitted(self):
        """Raise ValueError when no best estimator is available yet."""
        if self.best_estimator_ is None:
            raise ValueError("The model has not been fitted yet. Call the `fit` method first.")

    def _run_search(self, estimator, param_grid, df, target_column):
        """Split the data, run GridSearchCV on the training part and store the results."""
        X = df.drop(columns=[target_column])
        Y = df[target_column]

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=self.test_size, random_state=42
        )
        # Keep the hold-out split so callers can evaluate with `score`.
        self.X_test_ = X_test
        self.y_test_ = Y_test

        self.grid_search = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=self.scoring,
            cv=self.cv,
            n_jobs=-1
        )
        self.grid_search.fit(X_train, Y_train)

        self.best_params_ = self.grid_search.best_params_
        self.best_estimator_ = self.grid_search.best_estimator_

        return self.best_estimator_

    def fit(self, df: pd.DataFrame, target_column: str):
        """
        Fit the grid search model on the provided data.

        Parameters
        ----------
        df : pandas.DataFrame
            The dataset including features and target column.

        target_column : str
            The name of the column to be predicted (target variable).

        Returns
        ----------
        best_estimator : estimator object
            The estimator selected by the grid search.
        """
        return self._run_search(self.model, self.param_grid, df, target_column)

    def predict(self, X):
        """
        Make predictions using the best estimator found by grid search.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Input data for making predictions.

        Returns
        ----------
        y_pred : numpy.ndarray
            Predicted values.

        Raises
        ----------
        ValueError
            If no estimator has been fitted or loaded.
        """
        self._require_fitted()
        return self.best_estimator_.predict(X)

    def score(self, X_test, y_test):
        """
        Evaluate the best estimator's performance on the test set.

        Parameters
        ----------
        X_test : pandas.DataFrame or numpy.ndarray
            Test features.
        y_test : pandas.Series or numpy.ndarray
            True values for the test set.

        Returns
        ----------
        score : float
            The value returned by the best estimator's own `score` method.
            This is the estimator's default metric, not `self.scoring`.

        Raises
        ----------
        ValueError
            If no estimator has been fitted or loaded.
        """
        self._require_fitted()
        return self.best_estimator_.score(X_test, y_test)

    def get_best_params(self):
        """
        Get the best hyperparameters found by grid search.

        Returns
        ----------
        best_params : dict
            Dictionary of the best hyperparameters.

        Raises
        ----------
        ValueError
            If no grid search has been performed.
        """
        if self.best_params_ is None:
            raise ValueError("Grid search has not been performed yet. Call the `fit` method first.")
        return self.best_params_

    def save_model(self, filename):
        """
        Save the best model to a file.

        Parameters
        ----------
        filename : str
            Path where the model should be saved.

        Returns
        ----------
        None

        Raises
        ----------
        ValueError
            If no estimator has been fitted or loaded.
        """
        self._require_fitted()
        joblib.dump(self.best_estimator_, filename)

    def load_model(self, filename):
        """
        Load a model from a file and use it as the best estimator.

        No previous fit is required, so a saved model can be restored
        into a fresh instance.

        Parameters
        ----------
        filename : str
            Path to the saved model file.

        Returns
        ----------
        None
        """
        # NOTE: joblib files are pickle-based; only load files from trusted sources.
        self.best_estimator_ = joblib.load(filename)

    def plot_feature_importance(self, feature_names):
        """
        Plot the feature importance from the best estimator.

        Parameters
        ----------
        feature_names : list
            List of feature names in the same order as they appear in the dataset.

        Returns
        ----------
        None

        Raises
        ----------
        ValueError
            If no estimator has been fitted or loaded.
        """
        self._require_fitted()

        importances = self.best_estimator_.feature_importances_
        indices = np.argsort(importances)[::-1]  # most important first

        plt.figure(figsize=(10, 6))
        plt.title("Feature Importance")
        plt.bar(range(len(importances)), importances[indices], align='center')
        plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.show()

    def cross_val_score_summary(self, X, Y):
        """
        Get a summary of cross-validation scores for the best model.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data (features).
        Y : pandas.Series
            Target variable.

        Returns
        ----------
        summary : dict
            Dictionary with the mean ('mean_score'), the standard deviation
            ('std_dev') and the individual cross-validation scores ('scores').

        Raises
        ----------
        ValueError
            If no estimator has been fitted or loaded.
        """
        self._require_fitted()

        scores = cross_val_score(self.best_estimator_, X, Y, cv=self.cv, scoring=self.scoring)
        return {
            'mean_score': scores.mean(),
            'std_dev': scores.std(),
            'scores': scores
        }

    def xgboost_best(self, df: pd.DataFrame, target_column: str):
        """
        Perform grid search to find the best parameters for XGBRegressor.

        Parameters
        ----------
        df : pandas.DataFrame
            The dataset including features and target column.

        target_column : str
            The name of the column to be predicted (target variable).

        Returns
        ----------
        best_estimator : estimator object
            The XGBRegressor selected by the grid search.
        """
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [None, 6],
            'subsample': [0.1, 0.5],
            'colsample_bytree': [0.1, 0.5],
            'gamma': [0, 0.1],
            'reg_alpha': [0, 0.1],
            'reg_lambda': [0.1, 0.5]
        }

        # 'hist' works on CPU-only installations; the previous hard-coded
        # GPU options fail when XGBoost has no CUDA support.
        model = xgb.XGBRegressor(tree_method='hist', random_state=42)
        return self._run_search(model, param_grid, df, target_column)
def load_tennessee_eastman(train_id=0, test_id=0):
    """
    Load the 'Tennessee Eastman Process' benchmark data.

    Parameters
    ----------
    train_id: int, optional
        Identifier of the training data file (``d<train_id>.dat``,
        zero-padded to two digits).
    test_id: int, optional
        Identifier of the test data. No fault: 0. With faults: 1 to 20.

    Returns
    ----------
    train_df: pandas.DataFrame
        Training data.
    test_df: pandas.DataFrame
        Test data.
    """

    tags = [f"XMEAS({ii})" for ii in range(1, 42)] + [
        f"XMV({ii})" for ii in range(1, 12)
    ]

    # File names use two-digit identifiers, e.g. 'd00.dat' / 'd00_te.dat'.
    file_train = f"d{str(train_id).zfill(2)}.dat"
    file_test = f"d{str(test_id).zfill(2)}_te.dat"

    with pkg_resources.path(tennessee_eastman, file_train) as filepath:

        if file_train == "d00.dat":
            # This file is read as one text cell per line; each line holds
            # the whole series of one variable.
            raw = pd.read_csv(filepath, sep="\t", names=["0"])
            rows = [str(cell).strip() for cell in raw["0"]]

            train_df = pd.DataFrame()
            for position, tag in enumerate(tags):
                # split() without arguments tolerates any run of whitespace.
                train_df[tag] = [float(s) for s in rows[position].split()]

            train_df = b_tools.create_df_with_dates(train_df, freq="3min")

        else:
            train_df = b_tools.create_df_with_dates(
                pd.read_csv(filepath, sep=r"\s+", names=tags), freq="3min"
            )

    with pkg_resources.path(tennessee_eastman, file_test) as filepath:
        test_df = b_tools.create_df_with_dates(
            pd.read_csv(filepath, sep=r"\s+", names=tags),
            start="2020-02-01 00:00:00",
            freq="3min",
        )

    return train_df, test_df


###############################################################################


def load_real_data():
    """
    Load a sample of real process data.
    The variables have been anonymized for availability in the library.

    Returns
    ----------
    : pandas.DataFrame
        Process data.
    """

    with pkg_resources.path(real_process_data, "real_process_data.csv") as file:
        return pd.read_csv(file, index_col=0, parse_dates=True)


###############################################################################

AVAILABLE_3W_CLASSES = ["8"]


def load_3w(dataset_class: Literal["8"] = "8"):
    """
    Load the '3W-8' benchmark data.

    Parameters
    ----------
    dataset_class: string
        Identifier of the dataset class.
        Available classes: '8'.

    Returns
    ----------
    : pandas.DataFrame
        Process data.
    : object
        Dataset configuration as returned by
        ``three_w.tools.load_dataset_ini``.

    Raises
    ----------
    ValueError
        If dataset_class is not one of AVAILABLE_3W_CLASSES.
    """

    if dataset_class not in AVAILABLE_3W_CLASSES:
        raise ValueError(
            f"Dataset class not available. Available classes: {AVAILABLE_3W_CLASSES}"
        )

    with pkg_resources.path(
        three_w, "WELL-00019_20120601165020.parquet"
    ) as file, pkg_resources.path(three_w, "dataset.ini") as path:
        # NOTE(review): relies on the three_w package exposing its `tools`
        # submodule as an attribute - confirm in three_w/__init__.py.
        ini = three_w.tools.load_dataset_ini(path)
        return (
            pd.read_parquet(
                file,
                engine=ini.get("PARQUET_SETTINGS", "PARQUET_ENGINE"),
            ),
            ini,
        )
def time_based_windowing(self, df, train_or_test='train', window_size='30min', agg_func="max"):
    """
    Aggregates the data in consecutive, non-overlapping time windows
    determined by the timestamp index (a resampling operation).

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed.
    train_or_test: string, optional
        Indicates which step the data corresponds to.
        Not used by this method.
    window_size: string, optional
        Size of the time window (e.g., '30min' for 30 minutes).
    agg_func: string, optional
        Aggregation function to apply within each window. One of
        'mean', 'sum', 'max', 'min', 'median', 'std', 'first', 'last'.

    Returns
    ----------
    : pandas.DataFrame
        Processed data, with one row per time window.

    Raises
    ----------
    ValueError
        If the index is not datetime-like or agg_func is not supported.
    """
    supported = ('mean', 'sum', 'max', 'min', 'median', 'std', 'first', 'last')

    if not pd.api.types.is_datetime64_any_dtype(df.index):
        raise ValueError("DataFrame index must be a datetime type.")

    if agg_func not in supported:
        raise ValueError(
            "Unsupported aggregation function. Choose from "
            + ", ".join(f"'{name}'" for name in supported) + "."
        )

    # Each supported name is a method of the pandas resampler object.
    return getattr(df.resample(window_size), agg_func)()


def remove_constant_difference(self, df, train_or_test='train'):
    """
    Calculate the cumulative sum.
    Drop columns with constant difference.

    The difference is taken on the cumulative sum, so a column is
    dropped when its cumulative sum grows by the same amount at every
    step after the first row.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed.
    train_or_test: string, optional
        Indicates which step the data corresponds to.
        Not used by this method.

    Returns
    -------
    pandas.DataFrame
        Cumulative sum of the data without the columns containing
        constant difference.

    Raises
    ------
    TypeError
        If any column is not of a numeric type.
    """
    # NOTE(review): the returned frame holds cumulative sums, not the
    # original values - confirm this is the intended output.

    # Validate before cumsum so non-numeric data fails with a clear message.
    for column in df.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            raise TypeError(f"'{column}' is not numeric type.")

    cumulative_df = df.cumsum()
    columns_to_drop = []

    for column in cumulative_df.columns:
        diff = cumulative_df[column].diff()

        # Verify constant difference
        if len(diff.dropna()) > 1 and diff.nunique() == 1:
            columns_to_drop.append(column)

    return cumulative_df.drop(columns=columns_to_drop)


def differencing(self, df, train_or_test='train', lag=1):
    """
    Apply differencing to remove trend or seasonality.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed.
    train_or_test: string, optional
        Indicates which step the data corresponds to.
        Not used by this method.
    lag: int, optional
        Lag to be used for differencing.

    Returns
    ----------
    : pandas.DataFrame
        Differenced data. Every row containing a missing value is
        removed, which includes the rows without a lagged counterpart.
    """
    return df.diff(periods=lag).dropna()


def log_transform(self, df, train_or_test='train'):
    """
    Apply log transformation to reduce variability.

    The transformation is log(1 + x), computed with numpy.log1p.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be processed.
    train_or_test: string, optional
        Indicates which step the data corresponds to.
        Not used by this method.

    Returns
    ----------
    : pandas.DataFrame
        Processed data.
    """
    # The ufunc is applied to the whole frame and keeps index and columns.
    return np.log1p(df)
+ + Parameters + ---------- + df: pandas.DataFrame + Data to be processed. + train_or_test: string, optional + Indicates which step the data corresponds to. + + Returns + ---------- + : pandas.DataFrame + Processed data. + """ + return df.apply(lambda x: np.log1p(x)) \ No newline at end of file diff --git a/bibmon/three_w/LICENSE-CC-BY b/bibmon/three_w/LICENSE-CC-BY new file mode 100644 index 0000000..52bd145 --- /dev/null +++ b/bibmon/three_w/LICENSE-CC-BY @@ -0,0 +1,395 @@ +Attribution 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. 
+ Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution 4.0 International Public License ("Public License"). 
To the +extent this Public License may be interpreted as a contract, You are +granted the Licensed Rights in consideration of Your acceptance of +these terms and conditions, and the Licensor grants You such rights in +consideration of benefits the Licensor receives from making the +Licensed Material available under these terms and conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. 
Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + j. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. 
produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. 
You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. 
THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. 
For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + + +======================================================================= + +Creative Commons is not a party to its public +licenses. 
Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. \ No newline at end of file diff --git a/bibmon/three_w/README.md b/bibmon/three_w/README.md new file mode 100644 index 0000000..d37506f --- /dev/null +++ b/bibmon/three_w/README.md @@ -0,0 +1,83 @@ +[![CC BY 4.0][cc-by-shield]][cc-by] +[![Versioning][semver-shield]][semver] + +[cc-by]: http://creativecommons.org/licenses/by/4.0/ +[cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg +[semver]: https://semver.org +[semver-shield]: https://img.shields.io/badge/semver-2.0.0-blue + +# Table of Contents + +* [Introduction](#introduction) +* [Release Notes](#release-notes) + * [1.0.0](#100) + * [1.1.0](#110) + * [1.1.1](#111) + * [2.0.0](#200) + +# Introduction + +All 3W Dataset data files (files in the subdirectories of the [dataset](dataset) directory) are licensed under the [Creative Commons Attribution 4.0 International License][cc-by].
+ +# Release Notes + +Each subsection below contains release notes for a specific 3W Dataset version. Differences from the immediately previous version are highlighted. + +## 1.0.0 + +Release: July 1, 2019. + +This was the first published version, which is fully described in [this](https://doi.org/10.1016/j.petrol.2019.106223) paper. + +## 1.1.0 + +Release: December 30, 2022. + +Highlights: + +1. New instances were added as follows: + * 1 instance of event type 7. +1. Instances were removed due to issues identified as follows: + * 3 instances of event type 0; + * 1 instance of event type 5; + * 3 instances of event type 8, when compared to what is described in the paper **A realistic and public dataset with rare undesirable real events in oil wells** published in the **Journal of Petroleum Science and Engineering** (link [here](https://doi.org/10.1016/j.petrol.2019.106223)). +1. Normal periods of certain instances with anomalies were increased as possible. We tried to have instances with minimum normal periods of 1 hour; +1. Names of certain files with instances have changed due to increased normal periods; +1. Labels in some real instances were adjusted by experts; +1. All values of some variables in some real instances were corrected due to corrections in historian systems' tag configurations; +1. Certain variable values have undergone minimal change due to different rounding; +1. The 3W Dataset's main configuration file ([dataset.ini](dataset.ini)) was updated. + +## 1.1.1 + +Release: April 09, 2023. + +Highlights: + +1. Issue #60 was resolved; +1. Issue #65 was resolved; +1. Certain variable values have undergone minimal change due to different rounding; +1. The 3W Dataset's main configuration file ([dataset.ini](dataset.ini)) was updated. + +## 2.0.0 + +Release: July 25, 2024. + +Highlights: + +1. All instances are now saved in Parquet files (created with the `pyarrow` engine and `brotli` compression); +1.
Reduction in disk space occupied by the 3W Dataset of 3.15 GB (from 4.89 GB to 1.74 GB); +1. Real and simulated instances of type 9 were added; +1. Several instances of types 0, 3, 4, 5, 6 and 8 were added; +1. Another 24 real wells were covered with new real instances (now 42 real wells are covered); +1. Some real instances, mainly of type 1, were removed; +1. 1 variable was removed (`T-JUS-CKGL`); +1. Another 20 variables were added (there are now 27 variables); +1. Another label referring to well operational status was added; +1. Normal periods in several real instances with unwanted events were extended; +1. All labeling gaps in real instances were eliminated (all observations were labeled); +1. Conversions between measurement units in several instances were corrected; +1. Labels in several real instances were adjusted by experts; +1. All values of some variables in some real instances were corrected due to corrections in historian systems' tag configurations; +1. Certain variable values have undergone minimal change due to different rounding; +1. The 3W Dataset's main configuration file ([dataset.ini](dataset.ini)) was updated. \ No newline at end of file diff --git a/bibmon/three_w/WELL-00019_20120601165020.parquet b/bibmon/three_w/WELL-00019_20120601165020.parquet new file mode 100644 index 0000000..abce3e4 Binary files /dev/null and b/bibmon/three_w/WELL-00019_20120601165020.parquet differ diff --git a/bibmon/three_w/__init__.py b/bibmon/three_w/__init__.py new file mode 100644 index 0000000..fed9bd2 --- /dev/null +++ b/bibmon/three_w/__init__.py @@ -0,0 +1 @@ +from . import tools \ No newline at end of file diff --git a/bibmon/three_w/dataset.ini b/bibmon/three_w/dataset.ini new file mode 100644 index 0000000..d8e271f --- /dev/null +++ b/bibmon/three_w/dataset.ini @@ -0,0 +1,140 @@ +# 3W Dataset's main configuration file.
+# +# All settings inherent in the 3W Dataset that can be used by your +# consumers, including the 3W Toolkit, are maintained in this file. +# In this file, we use the configuration language supported by the +# configparser module. + +# Versions in general +# +[VERSION] +# 3W Dataset version (may be different than 3W Toolkit version) +DATASET = 2.0.0 + +# Parquet files properties in general +# +[PARQUET_FILE_PROPERTIES] +timestamp = Instant at which observation was generated +ABER-CKGL = Opening of the GLCK (gas lift choke) [%%] +ABER-CKP = Opening of the PCK (production choke) [%%] +ESTADO-DHSV = State of the DHSV (downhole safety valve) [0, 0.5, or 1] +ESTADO-M1 = State of the PMV (production master valve) [0, 0.5, or 1] +ESTADO-M2 = State of the AMV (annulus master valve) [0, 0.5, or 1] +ESTADO-PXO = State of the PXO (pig-crossover) valve [0, 0.5, or 1] +ESTADO-SDV-GL = State of the gas lift SDV (shutdown valve) [0, 0.5, or 1] +ESTADO-SDV-P = State of the production SDV (shutdown valve) [0, 0.5, or 1] +ESTADO-W1 = State of the PWV (production wing valve) [0, 0.5, or 1] +ESTADO-W2 = State of the AWV (annulus wing valve) [0, 0.5, or 1] +ESTADO-XO = State of the XO (crossover) valve [0, 0.5, or 1] +P-ANULAR = Pressure in the well annulus [Pa] +P-JUS-BS = Downstream pressure of the SP (service pump) [Pa] +P-JUS-CKGL = Downstream pressure of the GLCK (gas lift choke) [Pa] +P-JUS-CKP = Downstream pressure of the PCK (production choke) [Pa] +P-MON-CKGL = Upstream pressure of the GLCK (gas lift choke) [Pa] +P-MON-CKP = Upstream pressure of the PCK (production choke) [Pa] +P-MON-SDV-P = Upstream pressure of the production SDV (shutdown valve) [Pa] +P-PDG = Pressure at the PDG (permanent downhole gauge) [Pa] +PT-P = Downstream pressure of the PWV (production wing valve) in the production tube [Pa] +P-TPT = Pressure at the TPT (temperature and pressure transducer) [Pa] +QBS = Flow rate at the SP (service pump) [m3/s] +QGL = Gas lift flow rate [m3/s] +T-JUS-CKP = Downstream
temperature of the PCK (production choke) [oC] +T-MON-CKP = Upstream temperature of the PCK (production choke) [oC] +T-PDG = Temperature at the PDG (permanent downhole gauge) [oC] +T-TPT = Temperature at the TPT (temperature and pressure transducer) [oC] +class = Label of the observation +state = Well operational status + +# Common properties of all event types covered by the 3W Project +# +[EVENTS] +# Internal names of all event types +NAMES = NORMAL, ABRUPT_INCREASE_OF_BSW, SPURIOUS_CLOSURE_OF_DHSV, + SEVERE_SLUGGING, FLOW_INSTABILITY, RAPID_PRODUCTIVITY_LOSS, + QUICK_RESTRICTION_IN_PCK, SCALING_IN_PCK, + HYDRATE_IN_PRODUCTION_LINE, HYDRATE_IN_SERVICE_LINE +# Simulated and hand-drawn instances +EXTRA_INSTANCES_TRAINING = -1 +# Offset between an undesirable event's label and its transient label +TRANSIENT_OFFSET = 100 + +# This section defines default properties for a specific event type +# +[NORMAL] +LABEL = 0 +DESCRIPTION = Normal Operation + +# This section defines default properties for a specific event type +# +[ABRUPT_INCREASE_OF_BSW] +LABEL = 1 +DESCRIPTION = Abrupt Increase of BSW +TRANSIENT = True + +# This section defines default properties for a specific event type +# +[SPURIOUS_CLOSURE_OF_DHSV] +LABEL = 2 +DESCRIPTION = Spurious Closure of DHSV +TRANSIENT = True +WINDOW = 180 +STEP = 15 + +# This section defines default properties for a specific event type +# +[SEVERE_SLUGGING] +LABEL = 3 +DESCRIPTION = Severe Slugging +TRANSIENT = False + +# This section defines default properties for a specific event type +# +[FLOW_INSTABILITY] +LABEL = 4 +DESCRIPTION = Flow Instability +TRANSIENT = False + +# This section defines default properties for a specific event type +# +[RAPID_PRODUCTIVITY_LOSS] +LABEL = 5 +DESCRIPTION = Rapid Productivity Loss +TRANSIENT = True + +# This section defines default properties for a specific event type +# +[QUICK_RESTRICTION_IN_PCK] +LABEL = 6 +DESCRIPTION = Quick Restriction in PCK +TRANSIENT = True + +# This section defines 
default properties for a specific event type +# +[SCALING_IN_PCK] +LABEL = 7 +DESCRIPTION = Scaling in PCK +TRANSIENT = True + +# This section defines default properties for a specific event type +# +[HYDRATE_IN_PRODUCTION_LINE] +LABEL = 8 +DESCRIPTION = Hydrate in Production Line +TRANSIENT = True + +# This section defines default properties for a specific event type +# +[HYDRATE_IN_SERVICE_LINE] +LABEL = 9 +DESCRIPTION = Hydrate in Service Line +TRANSIENT = True + +# Settings related to the Parquet format +# +[PARQUET_SETTINGS] +# Used/chosen extension +PARQUET_EXTENSION = .parquet +# Used/chosen extension engine +PARQUET_ENGINE = pyarrow +# Used/chosen extension compression +PARQUET_COMPRESSION = brotli \ No newline at end of file diff --git a/bibmon/three_w/tools.py b/bibmon/three_w/tools.py new file mode 100644 index 0000000..a73ced2 --- /dev/null +++ b/bibmon/three_w/tools.py @@ -0,0 +1,99 @@ +import configparser +import pandas as pd +from .. import _preprocess as preproc +from .. import _bibmon_tools as b_tools +from typing import Literal, Tuple + + +def load_dataset_ini(dataset_ini_path) -> configparser.ConfigParser: + """ + Loads the dataset.ini file. + + Parameters + ---------- + dataset_ini_path: string + Path to the dataset.ini file. + Returns + ---------- + : configparser.ConfigParser + Configuration file. + """ + + config = configparser.ConfigParser() + config.read(dataset_ini_path) + + return config + + +############################################################################### + + +def split_dataset( + DataFrame: pd.DataFrame, + config_file: configparser.ConfigParser, + training_percentage: float = 0.8, + validation_percentage: float = 0.2, + split_on: Literal["transition_state", "error_state"] = "transition_state", +) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Splits a DataFrame into training, validation and test sets. 
+ + The split is done according to the transition_state or error_state, the sum of training and validation percentages must equal 1. + + Parameters + ---------- + DataFrame: pandas.DataFrame + Data to be split. + config_file: configparser.ConfigParser + Configuration file. + training_percentage: float + Percentage of the training set. + validation_percentage: float + Percentage of the validation set. + split_on: string + Split the data on the transition_state or error_state. + Returns + ---------- + : tuple of pandas.DataFrames + Training, validation and test sets. + """ + + if training_percentage + validation_percentage != 1: + raise ValueError( + "The sum of training and validation percentages must equals 1." + ) + + df_processed = preproc.PreProcess(f_pp=["ffill_nan"]).apply(DataFrame) + + transitions = b_tools.find_df_transitions(df_processed, 1, "number", "class") + + split_index = 0 + + for t in transitions: + if t == 0: + continue + + transition_class = int(df_processed.iloc[t]["class"]) + transient_offset = int(config_file.get("EVENTS", "TRANSIENT_OFFSET")) + + if transition_class > transient_offset and split_on == "transition_state": + split_index = t + break + if transition_class < transient_offset and split_on == "error_state": + split_index = t + break + + if split_index == 0: + raise ValueError("No split index found.") + + train_and_validation_df = df_processed.iloc[:split_index] + test_df = df_processed.iloc[split_index:] + + splitted_df = b_tools.split_df_percentages( + train_and_validation_df, [training_percentage, validation_percentage] + ) + + train_df = splitted_df[0] + validation_df = splitted_df[1] + + return train_df, validation_df, test_df diff --git a/requirements.txt b/requirements.txt index d157d87..bf549e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ pandas>=2.2.2 matplotlib>=3.9.0 seaborn>=0.13.2 statsmodels>=0.14.1 -optuna>=3.6.1 \ No newline at end of file +optuna>=3.6.1 +pyarrow>=17.0.0 \ No newline at end of 
file diff --git a/test/requirements.txt b/test/requirements.txt new file mode 100644 index 0000000..c714a35 --- /dev/null +++ b/test/requirements.txt @@ -0,0 +1 @@ +pytest>=8.x \ No newline at end of file diff --git a/test/test_tools.py b/test/test_tools.py index 03f0d5d..071bd33 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -9,6 +9,8 @@ import bibmon import pandas as pd +import bibmon.three_w + def test_complete_analysis(): # load data @@ -60,4 +62,41 @@ def test_complete_analysis(): fault_start = '2018-01-02 06:00:00', fault_end = '2018-01-02 09:00:00') - model.plot_importances() \ No newline at end of file + model.plot_importances() + + +def test_find_df_transitions(): + data = bibmon.load_real_data() + + transitions = bibmon._bibmon_tools.find_df_transitions(data, 1, "number", "tag101") + + assert transitions == [99, 101, 102, 103, 104, 106, 107, 108, 243] + +def test_split_df_percentages(): + data = bibmon.load_real_data() + + splitted = bibmon._bibmon_tools.split_df_percentages(data, [0.6, 0.2, 0.2]) + + assert splitted[0].shape[0] == 1901 + assert splitted[1].shape[0] == 633 + assert splitted[2].shape[0] == 633 + +def test_split_df_percentages_error(): + data = bibmon.load_real_data() + + try: + _ = bibmon._bibmon_tools.split_df_percentages(data, [0.6, 0.2]) + except ValueError: + assert True + +def test_3w_load_dataset_ini(): + config = bibmon.three_w.tools.load_dataset_ini("dataset.ini") + + assert config["VERSION"]["DATASET"] == "2.0.0" + +def test_3w_split_dataset(): + data, conf = bibmon.load_3w() + + train_df, validation_df, test_df = bibmon.three_w.tools.split_dataset(data, conf) + + assert (train_df.shape[0], validation_df.shape[0], test_df.shape[0]) == (85112, 21278, 136746) \ No newline at end of file