Skip to content

Commit 8b1e6e3

Browse files
committed
Added automatic data pre-processing, enhanced CSV support, automatic record counting
1 parent 2605807 commit 8b1e6e3

9 files changed

+203
-24
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,5 @@ phone.txt
4040
email.txt
4141
test.txt
4242
test.py
43+
test.csv
4344
spam_keywords.txt

bayes.db

0 Bytes
Binary file not shown.

bayes.py

-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ def register_mode(mode_class):
1717
register_mode(Status)
1818

1919
args = sys.argv
20-
print(args)
2120
usage = 'Usage: %s %s <mode specific args>' % (args[0], '|'.join(modes.keys()))
2221

2322
if (len(args) < 2):

spam.csv example_spam.csv

File renamed without changes.

findSeparator.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Separator characters recognised in dataset records, mapped to the names
# that callers compare against ("comma", "space", "semi-colon", ...).
_SEPARATOR_NAMES = {
    ' ': 'space',
    ',': 'comma',
    ';': 'semi-colon',
    ':': 'colon',
    '|': 'pipe',  # was mislabelled 'ampersand'; '|' is the pipe character
    '.': 'period',
}


def _separator_name(char):
    """Return the separator name for *char*, or "-1" if it is not a separator."""
    try:
        return _SEPARATOR_NAMES.get(char, "-1")
    except TypeError:
        # Unhashable items (e.g. a csv row that is a list) cannot be separators.
        return "-1"


def findSep(self):
    """Return the name of the first separator character found in the text.

    ``self`` may be an object exposing a ``text`` attribute (as the Learn
    mode passes itself) or the raw text itself, so callers holding a single
    record string can use the same function.

    Returns one of the names in ``_SEPARATOR_NAMES``, or the string "-1"
    when the text is empty or contains no recognised separator.
    """
    text = getattr(self, 'text', self)
    if not text:
        return "-1"
    for char in text:
        found = _separator_name(char)
        if found != "-1":
            return found
    return "-1"


def type(i):
    """Map a single character to its separator name ("-1" when unknown).

    NOTE: this name shadows the builtin ``type`` inside this module; it is
    kept only so existing references to ``findSeparator.type`` keep working.
    """
    return _separator_name(i)
21+

importfiles.py

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Characters treated as potential separators when scanning a record; this is
# the same character set the findSeparator module recognises.
_KNOWN_SEPARATORS = (' ', ',', ';', ':', '|', '.')
# Only these separators are used to split a record into label and message.
_FIELD_SEPARATORS = (',', ' ', ';')


def _first_separator(record):
    """Return the first known separator character in *record*, or None."""
    for char in record:
        if char in _KNOWN_SEPARATORS:
            return char
    return None


def _import_single(source, pars):
    """Shared implementation behind importsingleCSV and importsingleTXT.

    ``source.text`` is read as the records to import. A plain string is split
    into lines (line endings kept so words of adjacent records do not run
    together); any other iterable is assumed to yield record strings --
    NOTE(review): confirm against the caller that populates ``text``.
    """
    text = source.text
    records = text.splitlines(True) if isinstance(text, str) else text

    pieces = []
    count = 0
    for record in records:
        if pars:
            separator = _first_separator(record)
            if separator in _FIELD_SEPARATORS:
                # maxsplit=1 keeps the whole message even when it contains
                # the separator character again.
                pieces.append(record.split(separator, 1)[1])
        else:
            pieces.append(record)
        # Every record is counted, whether or not anything was extracted.
        count += 1
    return [count, "".join(pieces)]


def importsingleCSV(self, pars):
    """Collect the message part of every record in ``self.text``.

    When ``pars`` is truthy each record is split at its first separator
    (comma, space or semi-colon) and everything after it is kept; records
    whose first separator is something else contribute nothing. When
    ``pars`` is falsy the records are concatenated unchanged.

    Returns ``[count, file_contents]`` where ``count`` is the number of
    records seen and ``file_contents`` the concatenated text.
    """
    return _import_single(self, pars)


def importsingleTXT(self, pars):
    """Collect the message part of every record in ``self.text``.

    Behaves exactly like ``importsingleCSV``; see that function for the
    meaning of ``pars`` and of the returned ``[count, file_contents]`` pair.
    """
    return _import_single(self, pars)

learn.py

+130-22
Original file line numberDiff line numberDiff line change
@@ -4,44 +4,61 @@
44
from words import text_to_list
55
import csv
66
import codecs
7+
import fnmatch
8+
from findSeparator import findSep
79

810
class Learn(Mode):
911
ext=""
1012
def validate(self, args):
    """Validate the command line for the learn mode and load the data file.

    ``args`` is the full argv: program name, mode name, document type, file
    path and, optionally, a fifth value. The fifth value is still accepted
    so older invocations keep working, but the record count is now derived
    from the file itself.

    On success sets ``self.file``, ``self.ext``, ``self.text``,
    ``self.file_contents``, ``self.count`` and ``self.doc_type``.

    Raises ValueError (message starts with the usage string) when the
    argument count is wrong, the extension is not csv/txt, or the file
    cannot be read and parsed.
    """
    usage = 'Usage: %s learn <doc type> <file> <count>' % args[0]

    # Check the argument count before touching args[3]; otherwise a short
    # command line fails with IndexError instead of the usage message.
    if len(args) not in (4, 5):
        raise ValueError(usage)

    doc_type = args[2]
    self.file = args[3]
    self.text = ""
    self.ext = self.file[-3:]

    count = 0
    file_contents = None
    try:
        if self.ext == "csv":  # single spam/ham dataset file
            # 'with' guarantees the handle is closed even if parsing fails.
            with codecs.open(self.file, 'r', encoding='utf-8', errors='ignore') as f_open:
                self.text = list(csv.reader(f_open, delimiter=','))
            # First cell of the first row is probed for a spam/ham label.
            label_probe = [self.text[0][0]]
        elif self.ext == "txt":
            with open(self.file, 'r') as f_open:
                self.text = f_open.read()
            # Last five characters of the first token, e.g. "spam," or "ham,".
            label_probe = [self.text.split()[0][-5:]]
        else:
            raise ValueError('unsupported file extension "%s" (expected csv or txt)' % self.ext)

        # The first pattern that matches wins; an empty list means the file
        # does not start with a spam/ham label.
        pars = []
        for pattern in ('spam?', 'ham**', 'spam', 'ham'):
            pars = fnmatch.filter(label_probe, pattern)
            if pars:
                break

        vals = self.importsingleData(pars)
        count, file_contents = vals[0], vals[1]
    except Exception as e:
        raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (args[3], e))

    self.file_contents = file_contents
    self.count = count
    self.doc_type = doc_type
4562

4663
def execute(self):
4764
db = Db()
@@ -53,3 +70,94 @@ def execute(self):
5370

5471
def output(self, _):
    """Print a one-line summary of the learn run.

    Reports ``self.count`` and ``self.doc_type``; the second positional
    argument is ignored.
    """
    summary = "Processed %s documents of type '%s'" % (self.count, self.doc_type)
    print(summary)
73+
74+
# def importsingleCSV(self,pars):
75+
# file_contents = ""
76+
# count=0
77+
# data=""
78+
# data=self.text.split()
79+
# Ck =""
80+
# if (pars):
81+
# Ck = pars[-1:]
82+
83+
# ls = ['spam'+Ck,'ham'+Ck]
84+
# for word in range(0,len(data)):
85+
# punc= findSep(self)
86+
# if ((punc=="comma") or (punc=="semi-colon")):
87+
# if (data[word]!=ls[0]):
88+
# if (data[word]!=ls[1]):
89+
# #print(ls[0])
90+
# #print(data[word])
91+
# file_contents = file_contents +' '+ (data[word])
92+
# count=count+1
93+
# if (punc=="space"):
94+
# if (data[word]!='spam'):
95+
# if (data[word]!='ham'):
96+
# data=data.split(' ')
97+
# file_contents = file_contents +' '+ (data[word])
98+
# count=count+1
99+
100+
# else:
101+
# file_contents = file_contents + self.text
102+
# print(file_contents)
103+
# count=count+1
104+
# return [count,file_contents]
105+
106+
def importsingleData(self,pars):
    """Extract the learnable text and a record count from ``self.text``.

    ``pars`` is the list of label matches found at the start of the file
    (empty when the file does not start with a spam/ham label).

    For ``self.ext == 'txt'`` ``self.text`` is the file content as one
    string and the count is a word count; for any other extension
    ``self.text`` is read as a list of rows (lists of cells).

    Returns ``[count, file_contents]``.
    """
    if self.ext == 'txt':
        if not pars:
            # Unlabelled text: keep it verbatim and count its words.
            return [len(self.text.split()), self.text]

        words = self.text.split()
        # Labels are expected to carry the trailing character of the matched
        # token, e.g. "spam," / "ham,".
        # NOTE(review): when the match is the bare word "spam" or "ham" this
        # yields "spamm"/"hamm"; confirm whether bare labels should be
        # filtered too.
        suffix = pars[0][-1:]
        labels = ('spam' + suffix, 'ham' + suffix)

        # The separator depends only on self.text, so look it up once rather
        # than once per word.
        punc = findSep(self)
        if punc == "comma" or punc == "semi-colon":
            kept = [word for word in words if word not in labels]
        elif punc == "space":
            kept = words
        else:
            kept = []
        return [len(kept), "".join(' ' + word for word in kept)]

    rows = self.text
    if pars:
        # The second column holds the message; rows without one (blank
        # lines, label-only rows) are skipped instead of raising IndexError.
        messages = [row[1] for row in rows if len(row) > 1]
        return [len(messages), "".join(' ' + message for message in messages)]

    if not rows:
        return [0, ""]
    # Unlabelled data: only the cells of the first row are used.
    first_row = rows[0]
    return [len(first_row), "".join(' ' + cell for cell in first_row)]
162+
163+

mode.py

+2
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@ def execute(self):
77

88
def output(self):
    """Report the result of the mode; this base version always raises.

    NOTE(review): Learn.output is declared as ``output(self, _)`` with an
    extra positional argument -- confirm which signature callers use.
    """
    # No default behaviour is provided here: calling this raises
    # NotImplementedError unconditionally.
    raise NotImplementedError()
10+
11+

words.py

-1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,5 @@ def add_list_to_dict(l, d):
2525

2626
def text_to_list(text):
    """Split *text* into cleaned words.

    The stripped text is split on runs of non-word characters, each piece is
    passed through ``cleanUpWord``, and empty results are dropped. The
    return value is whatever ``filter`` produces (a lazy iterator on
    Python 3).
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    cleaned_words = map(cleanUpWord, re.split(r'\W+', text.strip()))
    return filter(lambda word : word and (len(word) > 0), cleaned_words)

0 commit comments

Comments
 (0)