Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New Classification Model for Fault Detection in 3W Dataset #64

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c0095ad
Add ClassificationModel to bibmon package
yantavares Oct 13, 2024
4725395
Refactor complete_analysis function to include an optional algorithm …
yantavares Oct 13, 2024
16de5bd
Refactor complete_analysis function to include an optional algorithm …
yantavares Oct 13, 2024
160ff4e
Refactor complete_analysis function to include an optional algorithm …
yantavares Oct 13, 2024
142548a
Refactor complete_analysis function to include ClassificationModel in…
yantavares Oct 13, 2024
5328124
changed back (Was not a bug)
yantavares Oct 13, 2024
ad5e649
Update requirements.txt with new dependencies
yantavares Oct 13, 2024
4598a07
Refactor calculate_tipping_point function to remove filter_delay_in_s…
yantavares Oct 13, 2024
1a3ccc4
Refactor calculate_tipping_point function to remove unnecessary white…
yantavares Oct 13, 2024
c4c9e53
added new transient_start param
yantavares Oct 15, 2024
b78cdb1
removed redefining_limit for testing
yantavares Oct 16, 2024
827e37a
testing redefining limit for filter
yantavares Oct 16, 2024
51252de
Refactor redefine_limit condition in GenericModel class
yantavares Oct 17, 2024
ddc6736
Refactor plot_SPE function to include isValidation parameter
yantavares Oct 18, 2024
2361c7e
Remove testing dataset
yantavares Oct 18, 2024
7d26924
Refactor split_files method to include should_shuffle parameter
yantavares Oct 18, 2024
3f11f5e
Refactor label in GenericModel class to improve readability
yantavares Oct 19, 2024
286ada1
Refactor calculate_accuracy method to improve accuracy calculation
yantavares Oct 19, 2024
af0e895
Refactor calculate_accuracy method to remove unused code
yantavares Oct 19, 2024
3c58734
Refactor complete_analysis method to include use_val_limit parameter
yantavares Oct 19, 2024
3ead3db
removed new alarm limit
yantavares Oct 19, 2024
372872f
made cols variable to ensure it works outside of 3W as well
yantavares Oct 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ dmypy.json
# Pyre type checker
.pyre/

# Remove testing dataset
dataset/
3 changes: 2 additions & 1 deletion bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows
from ._classification_model import ClassificationModel

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'sklearnRegressor', 'PreProcess', 'ClassificationModel',
'load_tennessee_eastman', 'load_real_data',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
Expand Down
331 changes: 331 additions & 0 deletions bibmon/_classification_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
import os
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import random
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from typing import List, Any, Tuple
from sklearn.metrics import accuracy_score, f1_score, classification_report
from time import time


class ClassificationModel:
    """
    Train a user-supplied classifier on parquet files and predict a class column.

    Written for the 3W dataset, but the target column and the columns to
    discard are configurable.

    Attributes
    ----------
    dataset_path : str
        Directory where the parquet files are stored.
    model : Any
        Custom model passed to the class; must expose ``fit`` and ``predict``.
    scaler : StandardScaler
        Scaler fitted on the training features and reused at prediction time.
    col_to_predict : str
        Name of the target column.
    cols_to_drop : List[str]
        Columns removed from the feature matrix (in addition to the target).

    Methods
    -------
    load_and_prepare_data(file_path) -> Tuple[pd.DataFrame, pd.Series, pd.Series]
        Loads, cleans, and prepares data from a parquet file.
    split_files(files, train_size=0.7, should_shuffle=True) -> Tuple[List[str], List[str]]
        Splits the files into training and testing sets.
    load_data_from_files(files) -> Tuple[pd.DataFrame, pd.Series]
        Loads and concatenates data from a list of files.
    train_model(X_train, y_train) -> None
        Trains the provided model using the training data.
    predict_single_file(file_path) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]
        Predicts classes for a single file.
    plot_predictions_with_timestamp(timestamps, X, y_true, y_pred, y_axis) -> None
        Plots predictions and actual class values against timestamps.
    normalize_classes(classes, reference=None) -> List[float]
        Maps class labels to equally spaced positions in [0, 100].
    complete_analysis(y_axis, files, train_size=0.7, should_shuffle=True) -> None
        Performs the complete analysis from data loading to visualization.
    evaluate_model(y_true, y_pred) -> dict
        Computes accuracy, weighted F1 and the classification report.
    compare_models(models, train_files, test_files) -> pd.DataFrame
        Trains and evaluates several models on the same file split.
    """

    def __init__(self, dataset_path: str, model: Any, col_to_predict='class', cols_to_drop=None):
        """
        Initialize the ClassificationModel with a model and a data directory.

        Parameters
        ----------
        dataset_path : str
            Directory where the parquet files are stored.
        model : Any
            Custom model to be used for training and prediction.
        col_to_predict : str, optional
            Name of the column to predict (default is 'class').
        cols_to_drop : List[str], optional
            Columns to drop from the dataset. ``None`` (the default) means
            ``['state', 'timestamp']``.
        """
        self.dataset_path = dataset_path
        self.model = model
        self.scaler = StandardScaler()
        self.col_to_predict = col_to_predict
        # A fresh list per instance: a list literal in the signature would be
        # shared by every instance created with the default.
        if cols_to_drop is None:
            self.cols_to_drop = ['state', 'timestamp']
        else:
            self.cols_to_drop = list(cols_to_drop)

    def load_and_prepare_data(self, file_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
        """
        Load, clean, and prepare data from a parquet file.

        Rows without a timestamp and rows with a duplicated timestamp are
        removed, remaining missing values are replaced by 0, and rows are
        sorted by timestamp.

        Parameters
        ----------
        file_path : str
            Path of the parquet file to read.

        Returns
        -------
        Tuple[pd.DataFrame, pd.Series, pd.Series]
            Feature matrix, integer target column, and the timestamp column.
        """
        df = pd.read_parquet(file_path)
        print(f"Loaded {file_path} with {len(df)} rows.")

        # NOTE(review): this turns the index into a column; presumably the
        # parquet files carry "timestamp" as their index - confirm for
        # datasets other than 3W.
        df = df.reset_index()
        df = df.dropna(subset=["timestamp"]).drop_duplicates(subset="timestamp").fillna(0)
        df = df.sort_values(by="timestamp")

        timestamps = df["timestamp"]

        drop_cols = self.cols_to_drop + [self.col_to_predict]

        X = df.drop(drop_cols, axis=1)
        y = df[self.col_to_predict].astype(int)

        return X, y, timestamps

    def split_files(self, files: List[str], train_size: float = 0.7, should_shuffle=True) -> Tuple[List[str], List[str]]:
        """
        Split the files into training and testing sets.

        Parameters
        ----------
        files : List[str]
            Files to split. The list passed in is left untouched.
        train_size : float, optional
            Fraction of the files assigned to training (default is 0.7).
        should_shuffle : bool, optional
            Whether to shuffle the files before splitting (default is True).

        Returns
        -------
        Tuple[List[str], List[str]]
            Training files and testing files.
        """
        # Work on a copy so shuffling never reorders the caller's list.
        ordered = list(files)
        if should_shuffle:
            random.shuffle(ordered)
        split_idx = int(len(ordered) * train_size)
        return ordered[:split_idx], ordered[split_idx:]

    def load_data_from_files(self, files: List[str]) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Load and concatenate data from a list of files.

        Parameters
        ----------
        files : List[str]
            Parquet files to load.

        Returns
        -------
        Tuple[pd.DataFrame, pd.Series]
            Concatenated features and targets, both with a fresh 0..n-1 index.

        Raises
        ------
        ValueError
            If ``files`` is empty.
        """
        if not files:
            # pd.concat would raise a ValueError too, but with a message that
            # does not point at the real cause (e.g. a train_size that is too
            # small for the number of files).
            raise ValueError("No files provided to load data from.")

        X_list, y_list = [], []
        for file in files:
            X, y, _ = self.load_and_prepare_data(file)
            X_list.append(X)
            y_list.append(y)

        X_all = pd.concat(X_list, axis=0).reset_index(drop=True)
        y_all = pd.concat(y_list, axis=0).reset_index(drop=True)

        return X_all, y_all

    def train_model(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """
        Fit the scaler on the training features, then fit the model.

        Parameters
        ----------
        X_train : pd.DataFrame
            Training features.
        y_train : pd.Series
            Training targets.
        """
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)

    def predict_single_file(self, file_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series]:
        """
        Predict classes for a single file using the already fitted scaler and model.

        Parameters
        ----------
        file_path : str
            Path of the parquet file to predict on.

        Returns
        -------
        Tuple
            Unscaled features, true targets, predicted targets and timestamps.
        """
        X, y, timestamps = self.load_and_prepare_data(file_path)
        X_scaled = self.scaler.transform(X)
        y_pred = self.model.predict(X_scaled)

        return X, y, y_pred, timestamps

    def plot_predictions_with_timestamp(self, timestamps: pd.Series, X: pd.DataFrame,
                                        y_true: pd.Series, y_pred: pd.Series, y_axis: str) -> None:
        """
        Plot a feature together with actual and predicted classes over time.

        Parameters
        ----------
        timestamps : pd.Series
            Values for the x-axis.
        X : pd.DataFrame
            Feature matrix containing the column ``y_axis``.
        y_true : pd.Series
            Actual class labels.
        y_pred : pd.Series
            Predicted class labels.
        y_axis : str
            Name of the feature column drawn on the primary y-axis.
        """
        # Both series must share ONE label -> position mapping. Normalizing
        # them separately would place the same class at different heights
        # whenever the two series do not contain the same set of labels.
        all_labels = pd.concat([pd.Series(y_true), pd.Series(y_pred)], ignore_index=True)
        y_true_normalized = self.normalize_classes(y_true, reference=all_labels)
        y_pred_normalized = self.normalize_classes(y_pred, reference=all_labels)

        # Nudge actual markers up and predicted markers down by 1.5% of the
        # [0, 100] class axis so coinciding points stay distinguishable.
        offset = 100 * 0.015
        y_true_adjusted = [y + offset for y in y_true_normalized]
        y_pred_adjusted = [y - offset for y in y_pred_normalized]

        fig = make_subplots(specs=[[{"secondary_y": True}]])

        fig.add_trace(
            go.Scatter(
                x=timestamps, y=X[y_axis], mode='lines', name=y_axis, line=dict(color='#003366')
            ),
            secondary_y=False,
        )

        fig.add_trace(
            go.Scatter(
                x=timestamps, y=y_true_adjusted, mode='markers', name='Actual Class',
                opacity=0.8,
                marker=dict(size=12, symbol='circle'),
                text=[f"Class: {c}" for c in y_true],
                textposition="top center"
            ),
            secondary_y=True,
        )

        fig.add_trace(
            go.Scatter(
                x=timestamps, y=y_pred_adjusted, mode='markers', name='Predicted Class',
                opacity=0.8,
                marker=dict(size=10, symbol='x'),
                text=[f"Pred: {p}" for p in y_pred],
                textposition="bottom center"
            ),
            secondary_y=True,
        )

        fig.update_layout(
            title=f'{y_axis} with Actual vs Predicted Class',
            xaxis_title='Timestamp',
            yaxis_title=y_axis,
            legend=dict(x=0.5, y=1.11, orientation='h'),
            font=dict(size=18)
        )

        # One tick per class that appears in either series, at the same
        # positions used for the markers.
        unique_classes = sorted(all_labels.unique())
        tick_vals = self.normalize_classes(unique_classes)
        tick_text = [f'Class {c}' for c in unique_classes]

        fig.update_yaxes(
            title_text="Class Values", secondary_y=True,
            tickvals=tick_vals, ticktext=tick_text
        )

        fig.show()

    def normalize_classes(self, classes: np.ndarray, reference=None) -> List[float]:
        """
        Map class labels to equally spaced positions in [0, 100].

        The sorted distinct labels are assigned the values of
        ``np.linspace(0, 100, number_of_labels)``; a single distinct label
        therefore maps to 0.

        Parameters
        ----------
        classes : np.ndarray
            Class values to be normalized.
        reference : array-like, optional
            Labels that define the mapping. When omitted, the distinct values
            of ``classes`` are used. Pass a shared reference to place several
            series on the same scale.

        Returns
        -------
        List[float]
            Normalized position of every element of ``classes``, in order.

        Raises
        ------
        KeyError
            If ``reference`` is given and ``classes`` contains a label that is
            not part of it.
        """
        label_source = classes if reference is None else reference

        # Sorted distinct labels define the order along the axis.
        levels = sorted(pd.Series(label_source).unique())

        # Equally spaced positions between 0 and 100, one per label.
        positions = np.linspace(0, 100, len(levels))
        class_mapping = {level: float(pos) for level, pos in zip(levels, positions)}

        return [class_mapping[c] for c in classes]

    def complete_analysis(self, y_axis: str, files: List[str], train_size: float = 0.7, should_shuffle=True) -> None:
        """
        Perform a complete analysis by training the model, predicting, and visualizing results.

        Parameters
        ----------
        y_axis : str
            The feature to plot on the y-axis.
        files : List[str]
            List of parquet files to analyze.
        train_size : float, optional
            Proportion of files to use for training (default is 0.7).
        should_shuffle : bool, optional
            Whether to shuffle the files before splitting (default is True).
        """
        train_files, test_files = self.split_files(files, train_size, should_shuffle=should_shuffle)
        print(f"Training on {len(train_files)} files, Testing on {len(test_files)} files.")

        X_train, y_train = self.load_data_from_files(train_files)
        self.train_model(X_train, y_train)

        # Each test file gets its own report and its own figure.
        for file in test_files:
            X, y_true, y_pred, timestamps = self.predict_single_file(file)
            print(classification_report(y_true, y_pred, zero_division=0))
            self.plot_predictions_with_timestamp(timestamps, X, y_true, y_pred, y_axis)

    def evaluate_model(self, y_true: np.ndarray, y_pred: np.ndarray) -> dict:
        """
        Evaluate a model's performance using several metrics.

        Parameters
        ----------
        y_true : np.ndarray
            Ground truth (actual class labels).
        y_pred : np.ndarray
            Predicted class labels.

        Returns
        -------
        dict
            Keys ``"accuracy"``, ``"f1_score"`` (weighted average) and
            ``"report"`` (classification report as a dictionary).
        """
        return {
            "accuracy": accuracy_score(y_true, y_pred),
            # zero_division=0 matches the report below: classes that are never
            # predicted score 0 without emitting a warning.
            "f1_score": f1_score(y_true, y_pred, average="weighted", zero_division=0),
            "report": classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        }

    def compare_models(self, models: List[Tuple[str, Any]],
                       train_files: List[str],
                       test_files: List[str]) -> pd.DataFrame:
        """
        Train and evaluate multiple models to compare their performance.

        Each model in turn replaces ``self.model`` and refits ``self.scaler``;
        after the call the instance holds the last model of the list.

        Parameters
        ----------
        models : List[Tuple[str, Any]]
            List of tuples with model names and model objects.
        train_files : List[str]
            List of files used for training.
        test_files : List[str]
            List of files used for testing.

        Returns
        -------
        pd.DataFrame
            One row per model with the evaluation metrics, the model name and
            the training time in seconds.
        """
        results = []

        # Load and prepare the training data once for all models.
        X_train, y_train = self.load_data_from_files(train_files)

        for model_name, model in models:
            print(f"Training model: {model_name}")

            # Time the fit only, so "train_time" is not inflated by the
            # prediction and evaluation steps that follow.
            self.model = model
            start_time = time()
            self.train_model(X_train, y_train)
            train_time = time() - start_time

            # Collect predictions on the test files.
            y_true_all, y_pred_all = [], []
            for file in test_files:
                _, y_true, y_pred, _ = self.predict_single_file(file)
                y_true_all.extend(y_true)
                y_pred_all.extend(y_pred)

            # Evaluate the model's performance on all test rows together.
            metrics = self.evaluate_model(np.array(y_true_all), np.array(y_pred_all))
            metrics["model"] = model_name
            metrics["train_time"] = train_time

            results.append(metrics)

        # Create a DataFrame to compare results.
        results_df = pd.DataFrame(results)

        # Display the results.
        print("\nModel Comparison:")
        print(results_df[["model", "accuracy", "f1_score", "train_time"]])

        return results_df

if __name__ == "__main__":
    # Example run: train a random forest on the files of one well from the
    # './dataset/2' directory and plot the predictions for the held-out files.
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=42, class_weight='balanced')

    dataset_path = './dataset/2'

    # Initialize the classification model
    classification_model = ClassificationModel(dataset_path=dataset_path, model=model)

    # Get the list of parquet files. os.listdir returns entries in arbitrary
    # order, so sort them: with should_shuffle=False below, the train/test
    # split would otherwise depend on the filesystem.
    all_files = sorted(
        os.path.join(dataset_path, f)
        for f in os.listdir(dataset_path)
        if f.endswith(".parquet") and f.startswith("WELL-00011")
    )

    # Run the complete analysis
    classification_model.complete_analysis(y_axis='T-TPT', files=all_files, train_size=0.8, should_shuffle=False)
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ pandas>=2.2.2
matplotlib>=3.9.0
seaborn>=0.13.2
statsmodels>=0.14.1
optuna>=3.6.1
optuna>=3.6.1
pyarrow>=17.0.0
plotly>=5.24.1