Added automatic data pre-processing, enhanced CSV support, and automatic record counting

sierret · sierret · commit 8b1e6e337035 · 2020-06-14T00:29:24.000Z
diff --git a/.gitignore b/.gitignore
@@ -40,4 +40,5 @@ phone.txt
 email.txt
 test.txt
 test.py
+test.csv
 spam_keywords.txt
diff --git a/bayes.db b/bayes.db
diff --git a/bayes.py b/bayes.py
@@ -17,7 +17,6 @@ def register_mode(mode_class):
 		register_mode(Status)
 
 		args = sys.argv
-		print(args)
 		usage = 'Usage: %s %s <mode specific args>' % (args[0], '|'.join(modes.keys()))
 
 		if (len(args) < 2):
diff --git a/example_spam.csv b/example_spam.csv
diff --git a/findSeparator.py b/findSeparator.py
@@ -0,0 +1,21 @@
+def findSep(self):
+    text = self.text
+    data = list(text)
+    found="-1"
+    for x in data:
+        found=type(x)
+        if (found != "-1"):
+            break
+    return found
+
+def type(i):
+    switcher={
+                ' ':'space',
+                ',':'comma',
+                ';':'semi-colon',
+                ':':'colon',
+                '|':'ampersand',
+                '.':'period',
+             }
+    return switcher.get(i,"-1")
+
diff --git a/importfiles.py b/importfiles.py
@@ -0,0 +1,49 @@
+def importsingleCSV(self,pars):
+                file_contents = ""
+                count=0
+                data=""       
+                
+                for sen in self.text:
+                        if (pars):
+                                punc= findSep(sen)
+                                if (punc=="comma"):
+                                        data=sen.split(',')
+                                        file_contents = file_contents + (data[1])
+                                if (punc=="space"):
+                                        data=(sen).split(' ')
+                                        file_contents = file_contents + (data[1])
+                
+                                if (punc=="semi-colon"):
+                                        data=(sen).split(';')
+                                        file_contents = file_contents + (data[1])
+                        else:
+                                file_contents = file_contents + (sen) 
+                                print(data[0])
+                        count = count+1 
+                return [count,file_contents]
+        
+def importsingleTXT(self,pars):
+        file_contents = ""
+        count=0
+        data=""
+        
+                
+        
+        for sen in self.text:
+                if (pars):
+                        punc= findSep(sen)
+                        if (punc=="comma"):
+                                data=(sen).split(',')
+                                file_contents = file_contents + (data[1])
+                        if (punc=="space"):
+                                data=(sen).split(' ')
+                                file_contents = file_contents + (data[1])
+        
+                        if (punc=="semi-colon"):
+                                data=(sen).split(';')
+                                file_contents = file_contents + (data[1])
+                else:
+                        file_contents = file_contents + (sen) 
+
+                count=count+1
+        return [count,file_contents]
diff --git a/learn.py b/learn.py
@@ -4,44 +4,61 @@
 from words import text_to_list
 import csv
 import codecs
+import fnmatch
+from findSeparator import findSep
 
 class Learn(Mode):
         ext=""
         def validate(self, args):
                 valid_args = False
                 usage = 'Usage: %s learn <doc type> <file> <count>' % args[0]
-
-                if len(args) == 5:
+                count = 0
+                self.file = args[3]
+                self.text=""
+                self.ext = (args[3])[-3:]
+                if len(args) == 4 or len(args)==5:
                         doc_type = args[2]
-                        
                         file_contents = None
                         try:
-                                ext = (args[3])[-3:]
-                                if (ext=="csv"):
-                                        f_open = codecs.open('args[3]','r',encoding='utf-8',errors='ignore') 
-                                        text = list(csv.reader(f_open, delimiter=','))
-                                        file_contents = ""
-                                        for sen in text:
-                                                file_contents = file_contents + (sen[1])
-                                if (ext=="txt"):
-                                     file_contents = open(args[3], 'r').read()   
+                                
+                                if (self.ext=="csv"): #for single spam/ham dataset file
+                                        f_open = codecs.open(self.file,'r',encoding='utf-8',errors='ignore') 
+                                        self.text = list(csv.reader(f_open, delimiter=','))
+                                        typeCk = self.text 
+                                        #print(typeCk) 
+                                        Ck=[typeCk[0][0]]                                       
+
+                                if (self.ext=="txt"):
+                                        self.text = open(args[3], 'r').read()
+                                        typeCk = self.text.split()
+                                        Ck=[typeCk[0][-5:]]
+                                
+                                
+                                
+                                #print(Ck)
+                                filter1 = fnmatch.filter(Ck, 'spam?')
+                                filter2 = fnmatch.filter(Ck, 'ham**')
+                                filter3 = fnmatch.filter(Ck, 'spam')
+                                filter4 = fnmatch.filter(Ck, 'ham')
+                                pars = filter1 or filter2 or filter3 or filter4
+                                #print(pars)
+                                vals = self.importsingleData(pars)
+                                count,file_contents= vals[0],vals[1]
+                                                         
+                                
+                                        
                         except Exception as e:
                                 raise ValueError(usage + '\nUnable to read specified file "%s", the error message was: %s' % (args[3], e))
-
-                        count = 0
-                        try:
-                                count = int(args[4])
-                        except:
-                                raise ValueError(usage + '\nEnter an integer value for the "count" parameter')                  
-
+                 
+                        #print(file_contents)
                         self.file_contents = file_contents
                         self.count = count
                         self.doc_type = doc_type
-                        if (ext=="csv"):
-                                f_open.close()
+                        # if (ext=="csv"):
+                        #         f_open.close()
 
                 else:
-                        raise ValueError(usage)                         
+                        raise ValueError(usage)                          
 
         def execute(self):
                 db = Db()
@@ -53,3 +70,94 @@ def execute(self):
 
         def output(self, _):
                 print("Processed %s documents of type '%s'" % (self.count, self.doc_type))
+
+        # def importsingleCSV(self,pars):
+        #         file_contents = ""
+        #         count=0
+        #         data="" 
+        #         data=self.text.split()
+        #         Ck =""
+        #         if (pars):
+        #                 Ck = pars[-1:]
+        
+        #                 ls = ['spam'+Ck,'ham'+Ck]   
+        #                 for word in range(0,len(data)):
+        #                         punc= findSep(self)
+        #                         if ((punc=="comma") or (punc=="semi-colon")):
+        #                                 if (data[word]!=ls[0]):
+        #                                         if (data[word]!=ls[1]):
+        #                                                 #print(ls[0])
+        #                                                 #print(data[word])
+        #                                                 file_contents = file_contents +' '+ (data[word])
+        #                                                 count=count+1
+        #                         if (punc=="space"):
+        #                                 if (data[word]!='spam'):
+        #                                         if (data[word]!='ham'):
+        #                                                 data=data.split(' ')
+        #                                                 file_contents = file_contents +' '+ (data[word])
+        #                                                 count=count+1
+
+        #         else:
+        #                 file_contents = file_contents + self.text
+        #                 print(file_contents)
+        #                 count=count+1
+        #         return [count,file_contents]
+        
+        def importsingleData(self,pars):
+                file_contents = ""
+                count=0
+                if (self.ext=='txt'):
+                        data=self.text.split()                        
+                        if (pars):
+                                Ck = pars[0][-1:]
+                
+                                ls = ['spam'+Ck,'ham'+Ck,]   
+                                for word in range(0,len(self.text.split())):
+                                        punc= findSep(self)
+                                        if ((punc=="comma") or (punc=="semi-colon")):
+                                                if (data[word]!=ls[0]):
+                                                        if (data[word]!=ls[1]):
+                                                                #print(data[word]!=ls[0])
+                                                                #print(data[word])
+                                                                file_contents = file_contents +' '+ (data[word])
+                                                                count=count+1
+                                        if (punc=="space"):
+                                                
+                                                file_contents = file_contents +' '+ (data[word])
+                                                count=count+1
+
+                        else:
+                                file_contents = file_contents + self.text
+                                #print(file_contents)
+                                count=len(self.text.split())
+                else:
+                        data=self.text 
+                        #print(data)                      
+                        if (pars):
+                                Ck = pars[0][-1:]
+                
+                                ls = ['spam'+Ck,'ham'+Ck,]   
+                                for word in range(0,len(self.text)):
+                                                # if (data[word][1]!=ls[0]):
+                                                #         if (data[word]!=ls[1]):
+                                                                #print(data[word]!=ls[0])
+                                                                #print(data[word])
+                                                
+                                        file_contents = file_contents +' '+ (data[word][1])
+                                        
+                                        count=count+1
+                                        # if (punc=="space"):
+                                                
+                                        #         file_contents = file_contents +' '+ (data[word][1])
+                                        #         count=count+1
+
+                        else:
+                                count=len(data[0])
+                                for x in range(0,count):
+                                        file_contents = file_contents + ' '+  data[0][x]
+                                #print(file_contents)
+                                
+                        
+                return [count,file_contents]
+
+                
diff --git a/mode.py b/mode.py
@@ -7,3 +7,5 @@ def execute(self):
 
 	def output(self):
 		raise NotImplementedError()
+
+		
diff --git a/words.py b/words.py
@@ -25,6 +25,5 @@ def add_list_to_dict(l, d):
 
 def text_to_list(text):
 	k = cleanUpWord
-	print(k)
 	cleaned_words = map(cleanUpWord, re.split('\W+', text.strip()))
 	return filter(lambda word : word and (len(word) > 0), cleaned_words)