Machine learning model for detecting fraudulent transactions
import numpy as np # supports multi-dimensional arrays and matrices
import pandas as pd # provides high-performance, easy-to-use data structures and data analysis tools
data = pd.read_csv('Fraud.csv')
Dataset source: https://drive.google.com/drive/folders/1kC2b0_rDrb5HpJFNXP3ZFlD2ePeMzZ9Y?usp=drive_link
data.head()
data.tail()
data.shape
data.isna().sum()
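As an optional sanity check, the column dtypes and summary statistics can be inspected as well (standard pandas calls, shown here as a small sketch):
data.info()      # column dtypes and non-null counts
data.describe()  # summary statistics for the numeric columns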
data.isFraud.value_counts()
data.isFlaggedFraud.value_counts()
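Because fraud is rare, the class proportions are more telling than the raw counts; a small sketch using the same columns:
print(data.isFraud.value_counts(normalize=True))         # fraction of fraudulent transactions
print(data.isFlaggedFraud.value_counts(normalize=True))  # fraction flagged by the existing rule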
data = data.drop(['nameOrig', 'nameDest'], axis=1)  # drop the account identifier columns, which are not used as features
data.shape
data.head()
from sklearn import preprocessing
The LabelEncoder object converts word labels into numbers: it assigns a unique integer to each category in the data.
label_encoder = preprocessing.LabelEncoder()
The 'type' column in the DataFrame data is replaced with the numerical labels generated by the LabelEncoder.
data['type'] = label_encoder.fit_transform(data['type'])
data.head()
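To see which integer was assigned to each transaction type, the encoder's classes_ attribute (populated by fit_transform) can be mapped back; a minimal sketch:
type_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(type_mapping)  # original transaction type -> integer code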
- X: data.loc[:, data.columns != 'isFraud'] selects all rows and every column except 'isFraud'; X represents the input features for training the model.
- y: the 'isFraud' column of data; y represents the target labels for training the model.
X, y = data.loc[:, data.columns != 'isFraud'], data['isFraud']
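A quick check that the split looks right; X should have one fewer column than data, and y should be a single column of 0/1 labels:
print(X.shape, y.shape)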
from sklearn.model_selection import train_test_split
StandardScaler is a preprocessing technique used to standardize features by removing the mean and scaling them to unit variance.
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)  # 60% train / 40% test split
The scaler is fit on the training set and then applied to the test set, so each feature ends up with a mean of approximately 0 and a standard deviation of approximately 1 in both sets.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
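To confirm the scaling behaved as expected (X_train is a NumPy array after fit_transform), a small sketch:
print(X_train.mean(axis=0).round(2))  # close to 0 for every feature
print(X_train.std(axis=0).round(2))   # close to 1 for every feature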
Create an instance of the Gaussian Naive Bayes model, train it on the training data, and use it to make predictions.
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gnb = GaussianNB()
gnb.fit(X_train, y_train)
Predict the response for the test dataset: .predict takes the test features as input and returns the predicted class labels.
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print ("Accuracy : ", metrics.accuracy_score(y_test, y_pred)*100)
- confusion_matrix: Tabulates the logistic regression model's predictions as counts of correct and incorrect classifications per class (visualized below).
- roc_curve: Visualizes the trade-off between the true positive rate (sensitivity) and the false positive rate (1-specificity) across different threshold values.
- AUC: The area under the ROC curve (AUC) quantifies the model's ability to discriminate between positive and negative cases.
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Plot confusion matrix
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.xticks([0, 1], ['Non-Fraud', 'Fraud'])
plt.yticks([0, 1], ['Non-Fraud', 'Fraud'])
plt.tight_layout()
for i in range(2):
    for j in range(2):
        plt.text(j, i, format(conf_matrix[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > conf_matrix.max() / 2. else "black")
plt.show()
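Recent scikit-learn versions can produce an equivalent plot in a single call; a minimal sketch, assuming scikit-learn >= 1.0:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=['Non-Fraud', 'Fraud'], cmap=plt.cm.Blues)
plt.show()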
# Compute ROC curve and AUC
# Use predicted probabilities rather than hard class labels so the ROC curve spans more than one threshold
y_score = classifier.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Plot Receiver Operating Characteristic curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
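With so few fraud cases, the precision-recall curve is often more informative than ROC; a sketch reusing the predicted probabilities y_score computed above:
from sklearn.metrics import precision_recall_curve, average_precision_score
precision, recall, _ = precision_recall_curve(y_test, y_score)
ap = average_precision_score(y_test, y_score)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='darkorange', lw=2, label='PR curve (AP = %0.2f)' % ap)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()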