axello
diff --git a/mlab_mysql_import.py b/mlab_mysql_import.py
Lines changed: 31 additions & 16 deletions
@@ -9,6 +9,7 @@
 # 20120628      AX  removed testing for every line, added timing code, 
 # 20120629      AX  added loop over all arguments, exception handling, restructured code, moved processed files to archive or error folder
 # 20120708      AX  skip empty ip lines instead of error message
+# 20120708      RB  cleaned up some names and spelling; also, we don't want processed_files.log to clobber the downloader's processed_files.log, so we use overly descriptive names
 #
 # test: 
 # cd /DATA
@@ -20,6 +21,15 @@
 #       v move error files to error directory
 #       v log process and errors
 #       v skip empty ip lines instead of error message
+#
+#       Get the date from the filename, and look up the correct maxmind database
+#       then, insert the locId directly with the line in the mlab/{glasnost,ndt} database, preventing slow future updates
+#       on the other hand, all these updates might be extremely slow: TEST
+#
+#		todo : move all the utility functions into a separate file
+#		todo : move all the passwords into a separate file (which is NOT in the repo, AND is in the .gitignore list)
+
+
 
 import sys
 import re
@@ -39,7 +49,7 @@
 # PLEASE UPDATE THESE SETTINGS
 db_host = "localhost" # your host, usually localhost
 db_user = "root" # your username
-db_passwd = "rootpassword" # your password
+db_passwd = "" # your password
 db_name = "mlab" # name of the database
 db_tables = {"glasnost": "glasnost", "ndt": "ndt"} # a mapping from testname to tablename
 db_filetable = 'files'
@@ -54,8 +64,8 @@
 cleanDir    = baseDir + 'clean/'
 
 #files
-errorLog    = "error.log"
-processLog  = "processed_files.log"
+errorLog    = "mlab_mysql_import_error.log"
+processLog  = "mlab_mysql_import_processed_files.log"
 
 #################################################################
 #                                                               #
@@ -67,9 +77,10 @@ def usage():
   print "Usage: mlab_mysql_import3.py mlab_file1.csv [mlab_files.csv ...]"
   sys.exit(1)
 
+# This routine extracts the destination server of the mlab file. 
+# It assumes that the filename has the form like 20100210T000000Z-mlab3-dfw01-ndt-0000.tgz.csv
+#  
 def extract_destination(filename):
-  ''' This routine extracts the destination server of the mlab file. 
-      It assumes that the filename has the form like 20100210T000000Z-mlab3-dfw01-ndt-0000.tgz.csv '''
   # Split the filename and perform some tests if it conforms to our standard
   f_split = filename.split('-')
   if len(f_split) < 3:
@@ -80,37 +91,37 @@ def extract_destination(filename):
 
   return '.'.join(filename.split('-')[1:-1])
 
+# Returns the datetime contained in string.
 def extract_datetime(string):
-  ''' Returns the datetime contained in string '''
   # Extract the date
   date_match = re.search(r'\d{4}/\d{2}/\d{2}', string)
   if not date_match:
-    raise Exception('Error im import: line "', string, '" does not contain a valid date.')
+    raise Exception('Error in import: line "', string, '" does not contain a valid date.')
   # Extract the time
   time_match = re.search(r'\d{2}:\d{2}:\d{2}', string)
   if not time_match:
-    raise Exception('Error im import: line "', string, '" does not contain a valid time.')
+    raise Exception('Error in import: line "', string, '" does not contain a valid time.')
 
   try:
     return dparser.parse(date_match.group(0) + ' ' + time_match.group(0), fuzzy=True) 
   except ValueError:
-    raise ValueError, 'Error im import: line "' + string + '" does not contain a valid date and time.'
+    raise ValueError, 'Error in import: line "' + string + '" does not contain a valid date and time.'
 
+# Returns the first dotted-quad (IPv4-style) address contained in string.
 # Returns an empty string when the line contains 'cputime' or no ip address.
 def extract_ip(string):
   if re.search('cputime', string):
     return ''
-  ''' Returns the first valid ip address contained in string '''
   # Extract the ip address
   match = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', string)
   if not match:
     # no ip address on this line: return an empty string
     return ''
-    # raise Exception ('Error im import: line "', string, '" does not contain a valid ip address.')
+    # raise Exception ('Error in import: line "', string, '" does not contain a valid ip address.')
   return match.group(0)
 
+# Test if the entry already exists in the database
 def exists_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
-    ''' Test if the entry already exists in the database '''
     # Check if the entry exists already 
     sql = "SELECT COUNT(*) FROM " + db_table + " WHERE date = '" + test_datetime.isoformat() + "' AND destination = '" + destination +  "' AND  source = '" + source_ip + "' AND file_id = " + str(file_id) 
     cur.execute(sql)
@@ -120,26 +131,26 @@ def exists_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip
     else:
         return True
 
+# Insert a connection to the database without testing.
 def blunt_insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
-    ''' Insert a connection to the database without testing '''
     columns = ', '.join(['date', 'destination', 'source', 'file_id'])
     values = '"' + '", "'.join([test_datetime.isoformat(), destination, source_ip, str(file_id)]) + '"'
     sql = "INSERT INTO  " + db_table + " (" + columns + ") VALUES(" + values + ") "
     cur.execute(sql)
 
+# Insert a test connection into the database, if it does not already exist
 def insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
-    ''' Insert a test connection to the database, if it not already exists '''
     # Check if the entry exists already 
     sql = "SELECT COUNT(*) FROM " + db_table + " WHERE date = '" + test_datetime.isoformat() + "' AND destination = '" + destination +  "' AND  source = '" + source_ip + "' AND file_id = " + str(file_id) 
     cur.execute(sql)
 
-    # If not, then isert it
+    # If not, then insert it
     if cur.fetchone()[0] < 1:
         print 'Found new test performed on the', test_datetime, 'from ' + destination + ' -> ' + source_ip + '.' 
         blunt_insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip)
 
+# Returns the id of a filename in the filename table. Creates a new row if the filename does not exist. 
 def get_file_id(cur, filename):
-    ''' Returns the id of a filename in the filename table. Creates a new row if the filename does not exist. ''' 
     sql = "SELECT id FROM " + db_filetable + " WHERE filename ='" + filename + "'"
     cur.execute(sql)
     id = cur.fetchone()
@@ -150,6 +161,7 @@ def get_file_id(cur, filename):
         return get_file_id(cur, filename)
     return id[0]
 
+# do deduplication of connection strings
 def dedup(file_id, table, test_datetime, destination, source_ip):
     key = str(file_id) + table + str(test_datetime) + destination + source_ip
     if key in deduplookup:
@@ -219,6 +231,7 @@ def process_file(f, filename):
             f.write(pathname + '\n')
             f.write('Error handling file ' + filename + ' (' + str(e.args) + ')\n')
         print
+# This bit should probably be cleaned up.        
 #    except:
 #        sys.stderr.write('Process error ' + '\n')
     finally:
@@ -230,6 +243,7 @@ def process_file(f, filename):
 
     return failure
 
+# get the year and month of the test date from the archive filename
 def extract_archive_date(filename):
       m = re.match('^(\d{4})(\d{2})(\d{2})', filename)
       return (m.group(1),m.group(2))
@@ -240,6 +254,7 @@ def create_archive_dir(ym):
         os.makedirs(ym)
     return ym
 
+# move processed file to archive folder
 def move_archive(pathname):
     fname = os.path.basename(pathname)
     (year,month) = extract_archive_date(fname)