forked from Kevin44774/Email-Spoofing-Detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
87 lines (78 loc) · 4.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
# Check if the vectorizers have been previously saved
# try:
# vectorizer_count = joblib.load('vectorizer_count.joblib')
# vectorizer_tfidf = joblib.load('vectorizer_tfidf.joblib')
# except FileNotFoundError:
# df = pd.read_csv('final_random_spam_filtering_dataset.csv')
# df['label'] = df.label.map({'ham':0,'spam':1})
# X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
#
# # Vectorize the text data using CountVectorizer and TfidfVectorizer
# vectorizer_count = CountVectorizer()
# vectorizer_tfidf = TfidfVectorizer()
# X_train_count = vectorizer_count.fit_transform(X_train)
# X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
#
# # Save the vectorizers for future use
# joblib.dump(vectorizer_count, 'vectorizer_count.joblib')
# joblib.dump(vectorizer_tfidf, 'vectorizer_tfidf.joblib')
vectorizer_count = joblib.load('vectorizer_count.joblib')
vectorizer_tfidf = joblib.load('vectorizer_tfidf.joblib')
# Load the models for future use
loaded_knn_count_model = joblib.load('models/knn_count_model.joblib')
loaded_lr_tfidf_model = joblib.load('models/lr_tfidf_model.joblib')
loaded_nb_count_model = joblib.load('models/nb_count_model.joblib')
loaded_knn_tfidf_model = joblib.load('models/knn_tfidf_model.joblib')
loaded_lr_count_model = joblib.load('models/lr_count_model.joblib')
loaded_nb_tfidf_model = joblib.load('models/nb_tfidf_model.joblib')
new_text = ["Congratulations!!! you won a 100000 $ lottery click here to claim "]
new_text_count = vectorizer_count.transform(new_text)
new_text_tfidf = vectorizer_tfidf.transform(new_text)
# Use the trained models to predict whether the new text is spam or ham
lr_count_prediction = loaded_lr_count_model.predict(new_text_count)
lr_tfidf_prediction = loaded_lr_tfidf_model.predict(new_text_tfidf)
nb_count_prediction = loaded_nb_count_model.predict(new_text_count)
nb_tfidf_prediction = loaded_nb_tfidf_model.predict(new_text_tfidf)
knn_count_prediction = loaded_knn_count_model.predict(new_text_count)
knn_tfidf_prediction = loaded_knn_tfidf_model.predict(new_text_tfidf)
lr_count_scores = loaded_lr_count_model.predict_proba(new_text_count)
lr_tfidf_scores = loaded_lr_tfidf_model.predict_proba(new_text_tfidf)
nb_count_scores = loaded_nb_count_model.predict_proba(new_text_count)
nb_tfidf_scores = loaded_nb_tfidf_model.predict_proba(new_text_tfidf)
knn_count_scores = loaded_knn_count_model.predict_proba(new_text_count)
knn_tfidf_scores = loaded_knn_tfidf_model.predict_proba(new_text_tfidf)
print('lr_count_prediction', lr_count_prediction)
print('lr_count_scores:', lr_count_scores)
print('lr_tfidf_prediction', lr_tfidf_prediction)
print('lr_tfidf_scores:', lr_tfidf_scores)
print('nb_count_prediction', nb_count_prediction)
print('nb_count_scores:', nb_count_scores)
print('nb_tfidf_prediction', nb_tfidf_prediction)
print('nb_tfidf_scores:', nb_tfidf_scores)
print('knn_count_prediction', knn_count_prediction)
print('knn_count_scores:', knn_count_scores)
print('knn_tfidf_prediction', knn_tfidf_prediction)
print('knn_tfidf_scores:', knn_tfidf_scores)
zeros_prob = []
ones_prob = []
lr_count_zero, lr_count_one = np.split(lr_count_scores[0], 2)
lr_tfidf_zero, lr_tfidf_one = np.split(lr_tfidf_scores[0], 2)
nb_count_zero, nb_count_one = np.split(nb_count_scores[0], 2)
nb_tfidf_zero, nb_tfidf_one = np.split(nb_tfidf_scores[0], 2)
knn_count_zero, knn_count_one = np.split(knn_count_scores[0], 2)
knn_tfidf_zero, knn_tfidf_one = np.split(knn_tfidf_scores[0], 2)
zeros_prob.extend([lr_count_zero, lr_tfidf_zero, nb_count_zero, nb_tfidf_zero, knn_count_zero, knn_tfidf_zero])
ones_prob.extend([lr_count_one, lr_tfidf_one, nb_count_one, nb_tfidf_one, knn_count_one, knn_tfidf_one])
zeros_prob = [round(score.tolist()[0], 2) for score in zeros_prob]
ones_prob = [round(label.tolist()[0], 2) for label in ones_prob]
accumulate_result = []
accumulate_result.extend([lr_count_prediction, lr_tfidf_prediction, nb_count_prediction, nb_tfidf_prediction,
knn_count_prediction, knn_tfidf_prediction])
accumulate_result = [round(label.tolist()[0], 2) for label in accumulate_result]
final_result = 'not spam' if accumulate_result.count(0) > accumulate_result.count(1) else 'spam'
print(zeros_prob, ones_prob, accumulate_result, final_result)