4
4
from words import text_to_list
5
5
import csv
6
6
import codecs
7
+ import fnmatch
8
+ from findSeparator import findSep
7
9
8
10
class Learn (Mode ):
9
11
ext = ""
10
12
def validate(self, args):
    """Check the ``learn`` command-line arguments and load the training file.

    ``args`` must hold four or five items; ``args[0]`` is used in the usage
    message, ``args[2]`` is the document type and ``args[3]`` is the file
    path.  A fifth item is accepted but not read.  The file is parsed
    according to the last three characters of its name (``csv`` or
    ``txt``), its first label is matched against the ``spam``/``ham``
    patterns, and the text is extracted through ``importsingleData``.

    On success sets ``self.file``, ``self.ext``, ``self.text``,
    ``self.file_contents``, ``self.count`` and ``self.doc_type``.

    Raises:
        ValueError: if the number of arguments is wrong, or if the file
            cannot be read or parsed (the underlying error text is
            appended to the usage message).
    """
    usage = 'Usage: %s learn <doc type> <file> <count>' % args[0]

    # Check the argument count before touching args[3]; too few arguments
    # must produce the usage error rather than an IndexError.
    if len(args) not in (4, 5):
        raise ValueError(usage)

    doc_type = args[2]
    self.file = args[3]
    self.text = ""
    self.ext = self.file[-3:]

    try:
        if self.ext == "csv":
            # Single spam/ham dataset file; the rows are materialised while
            # the handle is open so it can be closed right away.
            with codecs.open(self.file, 'r', encoding='utf-8', errors='ignore') as f_open:
                self.text = list(csv.reader(f_open, delimiter=','))
            label = [self.text[0][0]]
        elif self.ext == "txt":
            with open(self.file, 'r') as f_open:
                self.text = f_open.read()
            # Only the last five characters of the first token are checked.
            label = [self.text.split()[0][-5:]]
        else:
            raise ValueError('unsupported file extension "%s"' % self.ext)

        # Keep the first pattern that matches the label; an empty list
        # means the file does not start with a spam/ham label.
        pars = []
        for pattern in ('spam?', 'ham**', 'spam', 'ham'):
            pars = fnmatch.filter(label, pattern)
            if pars:
                break

        vals = self.importsingleData(pars)
        count, file_contents = vals[0], vals[1]
    except Exception as e:
        raise ValueError(usage + '\n Unable to read specified file "%s", the error message was: %s' % (args[3], e))

    self.file_contents = file_contents
    self.count = count
    self.doc_type = doc_type
45
62
46
63
def execute (self ):
47
64
db = Db ()
@@ -53,3 +70,94 @@ def execute(self):
53
70
54
71
def output(self, _):
    """Print a one-line summary of the processed document count and type.

    The second argument is ignored.
    """
    summary = "Processed %s documents of type '%s'" % (self.count, self.doc_type)
    print(summary)
73
+
74
+ # def importsingleCSV(self,pars):
75
+ # file_contents = ""
76
+ # count=0
77
+ # data=""
78
+ # data=self.text.split()
79
+ # Ck =""
80
+ # if (pars):
81
+ # Ck = pars[-1:]
82
+
83
+ # ls = ['spam'+Ck,'ham'+Ck]
84
+ # for word in range(0,len(data)):
85
+ # punc= findSep(self)
86
+ # if ((punc=="comma") or (punc=="semi-colon")):
87
+ # if (data[word]!=ls[0]):
88
+ # if (data[word]!=ls[1]):
89
+ # #print(ls[0])
90
+ # #print(data[word])
91
+ # file_contents = file_contents +' '+ (data[word])
92
+ # count=count+1
93
+ # if (punc=="space"):
94
+ # if (data[word]!='spam'):
95
+ # if (data[word]!='ham'):
96
+ # data=data.split(' ')
97
+ # file_contents = file_contents +' '+ (data[word])
98
+ # count=count+1
99
+
100
+ # else:
101
+ # file_contents = file_contents + self.text
102
+ # print(file_contents)
103
+ # count=count+1
104
+ # return [count,file_contents]
105
+
106
def importsingleData(self, pars):
    """Extract the training text and an item count from ``self.text``.

    Reads ``self.ext`` and ``self.text``.  When ``self.ext`` is ``'txt'``,
    ``self.text`` is treated as one string; otherwise it is treated as a
    list of rows and indexed as ``row[1]``.

    Args:
        pars: list of matched label strings; an empty list means no
            spam/ham label was detected.

    Returns:
        A two-item list ``[count, file_contents]``.  Every collected piece
        of text is prefixed with a single space, except in the unlabelled
        text case where ``self.text`` is returned unchanged.
    """
    if self.ext == 'txt':
        words = self.text.split()
        if not pars:
            return [len(words), self.text]

        # The last character of the matched label is taken as the
        # separator glued to each label token.
        # NOTE(review): for a bare 'spam'/'ham' match this is 'm', so the
        # labels below become 'spamm'/'hamm' - confirm the intent.
        marker = pars[0][-1:]
        labels = ('spam' + marker, 'ham' + marker)

        # The separator does not change between words, so ask once instead
        # of once per word; skipped entirely when there are no words.
        punc = findSep(self) if words else None
        if punc in ("comma", "semi-colon"):
            kept = [word for word in words if word not in labels]
        elif punc == "space":
            kept = words
        else:
            kept = []
        return [len(kept), ''.join(' ' + word for word in kept)]

    rows = self.text
    if pars:
        # Column 1 holds the text.  Rows without it (csv.reader yields []
        # for blank lines) are skipped rather than aborting the import.
        texts = [row[1] for row in rows if len(row) > 1]
        return [len(texts), ''.join(' ' + text for text in texts)]

    # Unlabelled rows: only the cells of the first row are used.
    first_row = rows[0]
    return [len(first_row), ''.join(' ' + cell for cell in first_row)]
162
+
163
+
0 commit comments