-
Notifications
You must be signed in to change notification settings - Fork 0
/
learn.py
126 lines (104 loc) · 4.13 KB
/
learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from __future__ import division
from testSpamFilter import StripTags, testFilter
import os
import re
import shelve
def _read_messages(directory):
    """Return the contents of every file in *directory* as a list of strings.

    *directory* must end with a path separator, because each file name is
    appended to it directly.
    """
    messages = []
    for filename in os.listdir(directory):
        # Read-only mode: the training mails are never modified, so there is
        # no reason to require write permission on them ("r+").
        with open(directory + filename, "r") as handle:
            messages.append(handle.read())
    return messages


def _word_frequencies(messages, max_len=10):
    """Count lower-cased words shorter than *max_len* over all *messages*.

    Each message is first passed through ``StripTags`` and then split into
    ``\\w+`` tokens.  Tokens produced by ``\\w+`` can never end in a
    punctuation mark, so no trailing-punctuation stripping is needed.
    """
    frequencies = {}
    for message in messages:
        for word in re.findall(r'\w+', StripTags(message)):
            if len(word) < max_len:
                key = word.lower()
                frequencies[key] = frequencies.get(key, 0) + 1
    return frequencies


def _write_frequencies(path, frequencies):
    """Write *frequencies* to *path*, one ``word count`` line per word, sorted by word."""
    with open(path, "w") as out:
        # sorted() works on Python 2 and 3; dict.keys().sort() only on Python 2.
        for word in sorted(frequencies):
            out.write("%-10s %d\n" % (word, frequencies[word]))


def learn(root="spamFilter/user/"):
    """Build word-frequency tables from the spam and ham folders under *root*.

    Reads every file in ``root + "spam/"`` and ``root + "ham/"``, counts the
    words of each class, and writes the counts to ``root + "spamDict"`` and
    ``root + "hamDict"``.  The message counts and the ham prior are stored in
    the shelve database ``root + "/db"`` under the keys ``"pOfHam"``,
    ``"lenHams"`` and ``"lenSpams"``.

    Args:
        root: Directory prefix holding the ``spam/`` and ``ham/`` folders.
            Paths are built by plain string concatenation, so it should end
            with a path separator.

    Returns:
        The add-one smoothed fraction of ham messages,
        ``(hams + 1) / (hams + spams + 2)``, as a float.

    Raises:
        OSError / IOError: if a folder or file cannot be read or written.
    """
    spams = _read_messages(root + "spam/")
    hams = _read_messages(root + "ham/")

    _write_frequencies(root + "spamDict", _word_frequencies(spams))
    _write_frequencies(root + "hamDict", _word_frequencies(hams))

    # Explicit float division so the result does not depend on the
    # module-level ``from __future__ import division``.
    constant = float(len(hams) + 1) / (len(hams) + len(spams) + 2)

    db = shelve.open(root + "/db")
    try:
        db["pOfHam"] = constant
        db["lenHams"] = len(hams)
        db["lenSpams"] = len(spams)
    finally:
        # Always release the database, even if a store fails.
        db.close()
    return constant