Skip to content

Commit eb6716d

Browse files
committed
Cleaned up comments, Ruben added some comments, added todos
1 parent 8c329ac commit eb6716d

File tree

2 files changed

+338
-16
lines changed

2 files changed

+338
-16
lines changed

mlab_mysql_import.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# 20120628 AX removed testing for every line, added timing code,
1010
# 20120629 AX added loop over all arguments, exception handling, restructured code, moved processed files to archive or error folder
1111
# 20120708 AX skip empty ip lines instead of raising an error message
12+
# 20120708 RB cleaned up some names and spelling; also, we don't want processed_files.log to clobber the downloader's processed_files.log, so we use overly descriptive names
1213
#
1314
# test:
1415
# cd /DATA
@@ -20,6 +21,15 @@
2021
# v move error files to the error directory
2122
# v log process and errors
2223
# v skip empty ip lines instead of raising an error message
24+
#
25+
# Get the date from the filename, and look up the correct maxmind database
26+
# then, insert the locId directly with the line in the mlab/{glasnost,ndt} database, preventing slow future updates
27+
# on the other hand, all these updates might be extremely slow: TEST
28+
#
29+
# todo : refactor all the utility functions into a separate file
30+
# todo : refactor all the passwords into a separate file (which is NOT in the repo, AND is in the .gitignore list)
31+
32+
2333

2434
import sys
2535
import re
@@ -39,7 +49,7 @@
3949
# PLEASE UPDATE THESE SETTINGS
4050
db_host = "localhost" # your host, usually localhost
4151
db_user = "root" # your username
42-
db_passwd = "rootpassword" # your password
52+
db_passwd = "" # your password
4353
db_name = "mlab" # name of the database
4454
db_tables = {"glasnost": "glasnost", "ndt": "ndt"} # a mapping from testname to tablename
4555
db_filetable = 'files'
@@ -54,8 +64,8 @@
5464
cleanDir = baseDir + 'clean/'
5565

5666
#files
57-
errorLog = "error.log"
58-
processLog = "processed_files.log"
67+
errorLog = "mlab_mysql_import_error.log"
68+
processLog = "mlab_mysql_import_processed_files.log"
5969

6070
#################################################################
6171
# #
@@ -67,9 +77,10 @@ def usage():
6777
print "Usage: mlab_mysql_import3.py mlab_file1.csv [mlab_files.csv ...]"
6878
sys.exit(1)
6979

80+
# This routine extracts the destination server of the mlab file.
81+
# It assumes that the filename has a form like 20100210T000000Z-mlab3-dfw01-ndt-0000.tgz.csv
82+
#
7083
def extract_destination(filename):
71-
''' This routine extracts the destination server of the mlab file.
72-
It assumes that the filename has the form like 20100210T000000Z-mlab3-dfw01-ndt-0000.tgz.csv '''
7384
# Split the filename and perform some tests if it conforms to our standard
7485
f_split = filename.split('-')
7586
if len(f_split) < 3:
@@ -80,37 +91,37 @@ def extract_destination(filename):
8091

8192
return '.'.join(filename.split('-')[1:-1])
8293

94+
# Returns the datetime contained in string.
8395
def extract_datetime(string):
84-
''' Returns the datetime contained in string '''
8596
# Extract the date
8697
date_match = re.search(r'\d{4}/\d{2}/\d{2}', string)
8798
if not date_match:
88-
raise Exception('Error im import: line "', string, '" does not contain a valid date.')
99+
raise Exception('Error in import: line "', string, '" does not contain a valid date.')
89100
# Extract the time
90101
time_match = re.search(r'\d{2}:\d{2}:\d{2}', string)
91102
if not time_match:
92-
raise Exception('Error im import: line "', string, '" does not contain a valid time.')
103+
raise Exception('Error in import: line "', string, '" does not contain a valid time.')
93104

94105
try:
95106
return dparser.parse(date_match.group(0) + ' ' + time_match.group(0), fuzzy=True)
96107
except ValueError:
97-
raise ValueError, 'Error im import: line "' + string + '" does not contain a valid date and time.'
108+
raise ValueError, 'Error in import: line "' + string + '" does not contain a valid date and time.'
98109

110+
# Returns the first valid ip address contained in string.
99111
# return with empty string when we encounter cputime, or no ip number
100112
def extract_ip(string):
101113
if re.search('cputime', string):
102114
return ''
103-
''' Returns the first valid ip address contained in string '''
104115
# Extract the ip address
105116
match = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', string)
106117
if not match:
107118
# ignore file
108119
return ''
109-
# raise Exception ('Error im import: line "', string, '" does not contain a valid ip address.')
120+
# raise Exception ('Error in import: line "', string, '" does not contain a valid ip address.')
110121
return match.group(0)
111122

123+
# Test if the entry already exists in the database
112124
def exists_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
113-
''' Test if the entry already exists in the database '''
114125
# Check if the entry exists already
115126
sql = "SELECT COUNT(*) FROM " + db_table + " WHERE date = '" + test_datetime.isoformat() + "' AND destination = '" + destination + "' AND source = '" + source_ip + "' AND file_id = " + str(file_id)
116127
cur.execute(sql)
@@ -120,26 +131,26 @@ def exists_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip
120131
else:
121132
return True
122133

134+
# Insert a connection to the database without testing.
123135
def blunt_insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
124-
''' Insert a connection to the database without testing '''
125136
columns = ', '.join(['date', 'destination', 'source', 'file_id'])
126137
values = '"' + '", "'.join([test_datetime.isoformat(), destination, source_ip, str(file_id)]) + '"'
127138
sql = "INSERT INTO " + db_table + " (" + columns + ") VALUES(" + values + ") "
128139
cur.execute(sql)
129140

141+
# Insert a test connection into the database, if it does not already exist
130142
def insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
131-
''' Insert a test connection to the database, if it not already exists '''
132143
# Check if the entry exists already
133144
sql = "SELECT COUNT(*) FROM " + db_table + " WHERE date = '" + test_datetime.isoformat() + "' AND destination = '" + destination + "' AND source = '" + source_ip + "' AND file_id = " + str(file_id)
134145
cur.execute(sql)
135146

136-
# If not, then isert it
147+
# If not, then insert it
137148
if cur.fetchone()[0] < 1:
138149
print 'Found new test performed on the', test_datetime, 'from ' + destination + ' -> ' + source_ip + '.'
139150
blunt_insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip)
140151

152+
# Returns the id of a filename in the filename table. Creates a new row if the filename does not exist.
141153
def get_file_id(cur, filename):
142-
''' Returns the id of a filename in the filename table. Creates a new row if the filename does not exist. '''
143154
sql = "SELECT id FROM " + db_filetable + " WHERE filename ='" + filename + "'"
144155
cur.execute(sql)
145156
id = cur.fetchone()
@@ -150,6 +161,7 @@ def get_file_id(cur, filename):
150161
return get_file_id(cur, filename)
151162
return id[0]
152163

164+
# do deduplication of connection strings
153165
def dedup(file_id, table, test_datetime, destination, source_ip):
154166
key = str(file_id) + table + str(test_datetime) + destination + source_ip
155167
if key in deduplookup:
@@ -219,6 +231,7 @@ def process_file(f, filename):
219231
f.write(pathname + '\n')
220232
f.write('Error handling file ' + filename + ' (' + str(e.args) + ')\n')
221233
print
234+
# This bit should probably be cleaned up.
222235
# except:
223236
# sys.stderr.write('Process error ' + '\n')
224237
finally:
@@ -230,6 +243,7 @@ def process_file(f, filename):
230243

231244
return failure
232245

246+
# get the test date from the archive filename
233247
def extract_archive_date(filename):
234248
m = re.match('^(\d{4})(\d{2})(\d{2})', filename)
235249
return (m.group(1),m.group(2))
@@ -240,6 +254,7 @@ def create_archive_dir(ym):
240254
os.makedirs(ym)
241255
return ym
242256

257+
# move processed file to archive folder
243258
def move_archive(pathname):
244259
fname = os.path.basename(pathname)
245260
(year,month) = extract_archive_date(fname)

0 commit comments

Comments
 (0)