# mySpamClassifier.py
# Naive Bayes Classifier
# Retrieves and labels spam and ham documents from the spam and ham directories.
# The features are the 2000 most common words across all the documents.
# Each feature gets a say in deciding which label should be assigned to a given input value.
# Training starts by calculating the prior probability of each label, determined by the
# frequency of each label in the training set: e.g. with 60 spam and 40 ham files out of 100,
# spam has a 60% prior probability and ham has a 40% prior probability.
# Each feature's conditional probability is then multiplied into the prior to get a
# likelihood estimate for each label. The label with the highest likelihood estimate is
# assigned to the input value, e.g. a 39% estimate for spam and a 61% estimate for ham
# means the file is assigned "ham".
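# A worked example of that decision rule (the word probabilities here are made up
# purely for illustration): with priors P(spam)=0.6 and P(ham)=0.4 and a message
# containing the feature words "free" and "meeting",
#   P(spam) * P("free"|spam) * P("meeting"|spam) = 0.6 * 0.30 * 0.02 = 0.0036
#   P(ham)  * P("free"|ham)  * P("meeting"|ham)  = 0.4 * 0.05 * 0.20 = 0.0040
# so the larger product wins and the message is labeled "ham".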
import nltk
import os
import random
from nltk.corpus import stopwords

# All non-descriptive English words to filter out; requires nltk.download('stopwords').
# A set makes the membership tests below fast.
stopwords = set(stopwords.words('english'))
class mySpamClassifier:
    def __init__(self, spamFolder, hamFolder):
        self.spamFiles = os.listdir(spamFolder)
        self.hamFiles = os.listdir(hamFolder)
        # Build (featureWords, label) pairs and the running word list for each class.
        self.totalSpamFile, self.totalSpamWords = self._loadDocs(spamFolder, self.spamFiles, "spam")
        self.totalHamFile, self.totalHamWords = self._loadDocs(hamFolder, self.hamFiles, "ham")
        self.documents = self.totalSpamFile + self.totalHamFile
        random.shuffle(self.documents)  # spam and ham documents randomly interleaved
        certainIndex = int(len(self.documents) * 0.9)  # 90%/10% split point
        self.trainDocs = self.documents[:certainIndex]  # 90% of total documents
        self.testDocs = self.documents[certainIndex:]   # 10% of total documents
        self.totalWords = self.totalSpamWords + self.totalHamWords
        self.all_words = nltk.FreqDist(self.totalWords)  # frequency of every feature word
        # most_common() guarantees these really are the top 2000 most frequent words
        self.word_features = [w for w, count in self.all_words.most_common(2000)]

    def _loadDocs(self, folder, fileNames, label):
        labeledDocs = []
        allWords = []
        for doc in fileNames:
            with open(os.path.join(folder, doc), "r") as textFile:
                lines = textFile.readlines()
            words = [w for line in lines for w in line.split()]  # flattens lines into a simple list of words
            # Lowercase first, then omit stopwords, one-letter tokens, and non-alphabetic tokens.
            featureWords = [w.lower() for w in words
                            if w.lower() not in stopwords and len(w) > 1 and w.isalpha()]
            featureWords = list(set(featureWords))  # remove duplicates within a file
            labeledDocs.append((featureWords, label))  # assigns the label for the file
            allWords += featureWords  # adds words to the class total
        return labeledDocs, allWords
    def train(self):  # trains the classifier by estimating probabilities from the training docs
        self.numSpam = sum(1 for doc in self.trainDocs if doc[1] == "spam")
        self.numHam = len(self.trainDocs) - self.numSpam
        self.probSpam = self.numSpam / len(self.trainDocs)  # prior P(spam)
        self.probHam = self.numHam / len(self.trainDocs)    # prior P(ham)
        # Small epsilons stand in for zero counts so an unseen word cannot zero out a product.
        self.spamEp = 1 / (self.numSpam + 1)
        self.hamEp = 1 / (self.numHam + 1)
        # Count, for each feature word, how many training docs of each class contain it.
        # The counts are initialized once, before the document loop, so they accumulate.
        self.spamProb = {word: 0 for word in self.word_features}
        self.hamProb = {word: 0 for word in self.word_features}
        for doc in self.trainDocs:
            docWords = set(doc[0])  # set for fast membership tests
            for word in self.word_features:
                if word in docWords:
                    if doc[1] == "spam":
                        self.spamProb[word] += 1
                    else:
                        self.hamProb[word] += 1
        # Convert counts into conditional probabilities P(word | label).
        for word in self.spamProb:
            self.spamProb[word] = self.spamProb[word] / self.numSpam if self.spamProb[word] else self.spamEp
        for word in self.hamProb:
            self.hamProb[word] = self.hamProb[word] / self.numHam if self.hamProb[word] else self.hamEp
    def classify(self):  # labels each test doc as spam or ham based on feature probabilities
        self.classifiedList = []  # list where docs are paired with their predicted label
        for doc in self.testDocs:
            # Start each document from the priors, then multiply in P(word | label)
            # for every feature word the document contains.
            probS = self.probSpam
            probH = self.probHam
            for word in doc[0]:
                if word in self.spamProb:  # only the top-2000 feature words have estimates
                    probS *= self.spamProb[word]
                    probH *= self.hamProb[word]
            if probS > probH:
                self.classifiedList.append((doc, "spam"))
            else:
                self.classifiedList.append((doc, "ham"))
        return self.classifiedList
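    # Design note (not part of the original code): multiplying many probabilities
    # below 1 can underflow to 0.0 on long documents. A common refinement is to
    # compare summed log-probabilities instead, roughly:
    #   logS = math.log(self.probSpam) + sum(math.log(self.spamProb[w])
    #                                        for w in doc[0] if w in self.spamProb)
    # and likewise for ham, then pick the label with the larger sum.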
    def accuracy(self):  # calculates the fraction of test docs that were correctly classified
        countCorrect = 0
        for doc, predicted in self.classifiedList:
            if predicted == doc[1]:  # doc[1] is the true label assigned when the file was loaded
                countCorrect += 1
        return countCorrect / len(self.classifiedList)
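
# A minimal usage sketch, assuming "spam/" and "ham/" directories of plain-text
# emails sit alongside this script (the directory names are placeholders):
if __name__ == "__main__":
    classifier = mySpamClassifier("spam", "ham")
    classifier.train()
    classifier.classify()
    print("Accuracy:", classifier.accuracy())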