Skip to content

Commit 91682ae

Browse files
committed
added maxmind.py and updated mlab_mysql_import to take advantage of it.
maxmind.py indexes one of the maxmind tables, to greatly speed up the locId lookup process.
1 parent eb6716d commit 91682ae

File tree

3 files changed

+175
-329
lines changed

3 files changed

+175
-329
lines changed

maxmind.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/python
2+
#
3+
# maxmind.py
4+
#
5+
# A class to represent a full maxmind database as a python hash, to make locId lookups much faster (hopefully)
6+
#
7+
# Algorithm:
8+
# set up two tables:
9+
# startips = list of all the startips, we binary search through this using bisect
10+
# ipranges = hash of all the ranges, where the key is the startip, and the endip is the value
11+
#
12+
# binary search through startips to find the closest startip of the target ip
13+
# get the endip of the range from the hash, and compare
14+
# return locId to caller
15+
#
16+
# Initials:
17+
# AX Axel Roest
18+
# RB Ruben Bloemgarten
19+
#
20+
# Version history
21+
# 20120710 AR first version
22+
#
23+
# ToDO:
24+
#
25+
26+
import sys
27+
import re
28+
import os
29+
import math
30+
import bisect
31+
from datetime import datetime
32+
import MySQLdb
33+
34+
#################################################################
35+
# #
36+
# settings #
37+
# #
38+
#################################################################
39+
40+
# Defaults
41+
42+
#################################################################
43+
# #
44+
# the meat #
45+
# #
46+
#################################################################
47+
48+
class MaxMind:
    """In-memory index of a maxmind Blocks table for fast locId lookups.

    Two parallel structures are built from the table:
      startips -- sorted list of all range start addresses, binary-searched
                  with bisect
      ipranges -- dict mapping each start address to [endIpNum, locId]

    lookup() bisects startips for the closest range start at or below the
    target ip, then checks the range end from ipranges and returns the locId.
    """

    def __init__(self, maxmind_db_host, maxmind_db_user, maxmind_db_passwd, maxmind_db_name, maxmind_table_name):
        """Connect to MySQL and index `maxmind_table_name` from `maxmind_db_name`."""
        global_start_time = datetime.now()
        maxmind_cursor = None  # so the finally clause is safe if connect() fails
        try:
            # Connect to the mysql database
            maxmind_db = MySQLdb.connect(host=maxmind_db_host,
                                         user=maxmind_db_user,
                                         passwd=maxmind_db_passwd,
                                         db=maxmind_db_name)
            maxmind_cursor = maxmind_db.cursor()
            self.loadTable(maxmind_cursor, maxmind_table_name)
        except Exception as e:
            print('Aborting maxmind due to error: ' + str(e))
            exit(1)
        finally:
            # The original closed the cursor unconditionally here, which raised
            # a NameError (masking the real error) whenever connect() failed.
            if maxmind_cursor is not None:
                maxmind_cursor.close()
            global_end_time = datetime.now()
            print('MaxMind: Read and indexed `' + maxmind_table_name + '` in ' + str(global_end_time - global_start_time) + ' seconds.')

    def loadTable(self, maxmind_cursor, maxmind_table_name):
        """Fetch the whole Blocks table (~3.5 million rows) and build the index."""
        sql = """SELECT startIpNum, endIpNum, locId FROM `{0}`""".format(maxmind_table_name)
        maxmind_cursor.execute(sql)
        result = maxmind_cursor.fetchall()
        # Initialize unconditionally so lookup() cannot hit an AttributeError
        # when the table is empty.
        self.startips = []
        self.ipranges = {}
        if result:
            # Cast the bisect keys to int so they are the same type as the
            # dict keys below (the driver may hand back longs or Decimals).
            self.startips = sorted(int(row[0]) for row in result)
            self.ipranges = {int(start): [int(end), int(loc)] for start, end, loc in result}

    def find_le(self, a, x):
        """Return the rightmost value in sorted list `a` that is <= x.

        Raises ValueError when every element of `a` is greater than x.
        """
        i = bisect.bisect_right(a, x)
        if i:
            return a[i - 1]
        raise ValueError

    def lookup(self, ipnumber):
        """Return the locId of the range containing `ipnumber`, or -1.

        Raises ValueError when ipnumber is below the lowest range start.
        """
        start = self.find_le(self.startips, ipnumber)
        endip, loc = self.ipranges[start]
        if ipnumber <= endip:
            return loc
        return -1

mlab_mysql_import.py

Lines changed: 75 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# 20120629 AX added loop over all arguments, exception handling, restructured code, moved processed files to archive or error folder
1111
# 20120708 AX skip empty ip lines instead of an error message
1212
# 20120708 RB cleaning some names and spelling, also we don't want processed_files.log to clobber the downloaders processed_files.log. So we should use overly descriptive names
13+
# 20120710 AX added locId lookup and added longip to insert query
1314
#
1415
# test:
1516
# cd /DATA
@@ -21,15 +22,14 @@
2122
# v move error files naar error directory
2223
# v log process and errors
2324
# v skip empty ip lines instead of an error message
25+
# v added locId lookup and added longip to insert query
2426
#
2527
# Get the date from the filename, and look up the correct maxmind database
2628
# then, insert the locId directly with the line in the mlab/{glasnost,ndt} database, preventing slow future updates
2729
# on the other hand, all these updates might be extremely slow: TEST
2830
#
29-
# todo : refactor all the utility functions in a separate file
30-
# todo : refactor all the passwords in a separate file (which is NOT in the repo, AND is in the .gitignore list
31-
32-
31+
# todo : refactor all the utility functions in a separate file
32+
# todo : refactor all the passwords in a separate file (which is NOT in the repo, AND is in the .gitignore list)
3333

3434
import sys
3535
import re
@@ -39,6 +39,8 @@
3939
import dateutil.parser as dparser
4040
import MySQLdb
4141
import shutil
42+
from maxmind import MaxMind
43+
import socket, struct
4244

4345
#################################################################
4446
# #
@@ -51,11 +53,12 @@
5153
db_user = "root" # your username
5254
db_passwd = "" # your password
5355
db_name = "mlab" # name of the database
54-
db_tables = {"glasnost": "glasnost", "ndt": "ndt"} # a mapping from testname to tablename
56+
db_tables = {"glasnost": "glasnost", "ndt": "ndt_test"} # a mapping from testname to tablename
5557
db_filetable = 'files'
5658

5759
# directories
5860
baseDir = '/DATA/mlab/'
61+
#baseDir = '/home/axel/mlab/'
5962
scratchDir = baseDir + 'scratch/'
6063
workDir = baseDir + 'work/'
6164
archiveDir = baseDir + 'archive/'
@@ -67,14 +70,26 @@
6770
errorLog = "mlab_mysql_import_error.log"
6871
processLog = "mlab_mysql_import_processed_files.log"
6972

73+
# default tables
74+
maxmind_table = 'Blocks_GeoLiteCity_Last'
75+
ndt_import = 'ndt_import'
7076
#################################################################
7177
# #
7278
# functions #
7379
# #
7480
#################################################################
7581

82+
# Convert a dotted-quad IPv4 string to its integer form.
def ip2long(ip):
    """Return *ip* ("a.b.c.d") as an unsigned 32-bit integer."""
    return struct.unpack("!L", socket.inet_aton(ip))[0]
86+
87+
def long2ip(l):
88+
return socket.inet_ntoa(struct.pack('!L', l))
89+
7690
def usage():
    # Explain the command line, then abort with a non-zero status.
    print("Usage: mlab_mysql_import.py [ -m maxmind_Blocks_Tablename ] mlab_file1.csv [mlab_files.csv ...]")
    print("Default: maxmind_Blocks_Tablename = `Blocks_GeoLiteCity_Last`")
    sys.exit(1)
7994

8095
# This routine extracts the destination server of the mlab file.
@@ -133,9 +148,13 @@ def exists_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip
133148

134149
# Insert a connection to the database without testing.
135150
def blunt_insert_dbentry(cur, file_id, db_table, test_datetime, destination, source_ip):
136-
columns = ', '.join(['date', 'destination', 'source', 'file_id'])
137-
values = '"' + '", "'.join([test_datetime.isoformat(), destination, source_ip, str(file_id)]) + '"'
151+
longip = ip2long(source_ip)
152+
# locid = 0
153+
locid = mm.lookup(longip) # lookup location id from ip number
154+
columns = ', '.join(['date', 'destination', 'source', 'file_id', 'longip', 'locId'])
155+
values = '"' + '", "'.join([test_datetime.isoformat(), destination, source_ip, str(file_id), str(longip), str(locid)]) + '"'
138156
sql = "INSERT INTO " + db_table + " (" + columns + ") VALUES(" + values + ") "
157+
# print sql
139158
cur.execute(sql)
140159

141160
# Insert a test connection to the database, if it not already exists
@@ -170,6 +189,26 @@ def dedup(file_id, table, test_datetime, destination, source_ip):
170189
deduplookup[key] = True
171190
return True
172191

192+
# For the rows staged in the temp table, fill in the location columns by
# joining locId against the matching maxmind Location table.
def lookup_locations(cur, destination):
    """Populate location columns of the staging table; return rows updated."""
    # The Location table shares its suffix with the Blocks table in use.
    location_table_name = maxmind_table.replace("Blocks", "Location")
    # NOTE(review): `destination` is unused -- the update always targets the
    # staging table (an earlier, commented-out version targeted `destination`
    # directly). Parameter kept for interface compatibility.
    # Use the ndt_import config variable instead of the hard-coded literal the
    # original had, so the staging-table name is defined in one place.
    sql = ('UPDATE mlab.`' + ndt_import + '` L, maxmind.`' + location_table_name +
           '` M SET L.country_code = M.country, L.region=M.region, L.city=M.city, '
           'L.postalCode=M.postalCode, L.latitude=M.latitude, L.longitude=M.longitude, '
           'L.metroCode=M.metroCode, L.areaCode=M.areaCode WHERE L.`locId` = M.`locId`')
    updated = cur.execute(sql)
    # update country from country_code later?
    return updated
200+
201+
# Empty the staging table before a new import run.
def clear_temp_table(cur):
    cur.execute('truncate table `{0}`'.format(ndt_import))
205+
206+
# Move the staged rows to the real table (either ndt_test or ndt).
def move_temp_table(cur, destination):
    """Copy all rows from the staging table into `destination`; return count."""
    columns = ('`created_at`, `date`, `destination`, `source`, `file_id`, '
               '`country_code`, `longip`, `locId`, `country`, `region`, `city`, '
               '`postalCode`, `latitude`, `longitude`, `metroCode`, `areaCode`')
    # Select the columns by name instead of SELECT * so the statement cannot
    # silently mis-map values if the staging table's column order ever drifts
    # from the destination's. assumes ndt_import uses these same column
    # names -- TODO confirm against the schema.
    sql = ('INSERT INTO `' + destination + '` (' + columns + ') '
           'SELECT ' + columns + ' FROM `' + ndt_import + '`')
    updated = cur.execute(sql)
    return updated
211+
173212
# returns True on error, False on correct processing
174213
def process_file(f, filename):
175214
start_time = datetime.now()
@@ -181,6 +220,7 @@ def process_file(f, filename):
181220
passwd = db_passwd,
182221
db = db_name)
183222
cur = db.cursor()
223+
clear_temp_table(cur)
184224

185225
# Find the destination server by investigating the filename
186226
destination = extract_destination(filename)
@@ -190,15 +230,15 @@ def process_file(f, filename):
190230
file_id = get_file_id(cur, filename)
191231
db.commit()
192232

193-
# Find the testsuite by investigating the filename
233+
# Find the testsuite (glasnost or ndt) by investigating the filename
194234
try:
195235
test = [test for test in db_tables.keys() if test in filename][0]
196236
except IndexError:
197237
sys.stderr.write('The filename ' + filename + ' does not contain a valid testname.')
198238
return 1
199239
# print "Found test suite " + test
200240

201-
# The filetest ALONE, takes 3 seconds with a 9 million records database, without indexes
241+
# The filetest ALONE, takes 3 seconds with a 9 million records database, without indexes
202242
# But falls back to less than half a second when indexing is turned on on the db
203243
filetest=True
204244
# Read the file line by line and import it into the database
@@ -215,9 +255,12 @@ def process_file(f, filename):
215255
filetest=False
216256
# test if we have already done it in this or last filetest
217257
if (dedup(file_id, db_tables[test], test_datetime, destination, source_ip)):
218-
blunt_insert_dbentry(cur, file_id, db_tables[test], test_datetime, destination, source_ip)
258+
# blunt_insert_dbentry(cur, file_id, db_tables[test], test_datetime, destination, source_ip)
259+
blunt_insert_dbentry(cur, file_id, ndt_import, test_datetime, destination, source_ip)
219260
end_time = datetime.now()
220261
print 'File done in ' + str(end_time - start_time)
262+
lookup_locations(cur, destination)
263+
move_temp_table(cur, db_tables[test])
221264
failure = False
222265
except Exception as inst:
223266
sys.stderr.write('Exception: '+str(inst.args) + '\n')
@@ -231,9 +274,6 @@ def process_file(f, filename):
231274
f.write(pathname + '\n')
232275
f.write('Error handling file ' + filename + ' (' + str(e.args) + ')\n')
233276
print
234-
# This bit should probably be cleaned up.
235-
# except:
236-
# sys.stderr.write('Process error ' + '\n')
237277
finally:
238278
# Commit and finish up
239279
sys.stderr.flush()
@@ -273,7 +313,11 @@ def move_archive(pathname):
273313

274314
parser = OptionParser()
275315
parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=False, help="don't print status messages to stdout")
316+
parser.add_option("-m", "--maxmind", dest="maxmind_table", default='', help="optional maxmind_table, if omitted we use 'Last'")
276317
(options, args) = parser.parse_args()
318+
if options.maxmind_table != '':
319+
maxmind_table = options.maxmind_table
320+
277321
if len(args) == 0:
278322
usage()
279323

@@ -295,17 +339,26 @@ def move_archive(pathname):
295339
#################################################################
296340
global_start_time = datetime.now()
297341

342+
# get instance of maxmind table
343+
print "using " + maxmind_table
344+
345+
mm = MaxMind(db_host, db_user, db_passwd, "maxmind",maxmind_table)
346+
347+
if not mm:
348+
sys.stderr.write('maxmind table does not exist: ' + maxmind_table + ' (' + str(e.args) + ')\n')
349+
exit(1)
350+
298351
# Iterate over ALL filenames
299352
for pathname in args:
300-
try:
301-
with open(pathname, 'r') as f:
302-
# Extract the basename of the filename, as the path is not of interest after this point
303-
filename = os.path.basename(pathname)
353+
try:
354+
with open(pathname, 'r') as f:
355+
# Extract the basename of the filename, as the path is not of interest after this point
356+
filename = os.path.basename(pathname)
304357
print "processing file " + filename,
305-
if (process_file(f, filename)):
306-
shutil.move(pathname,errorDir)
307-
else:
308-
move_archive(pathname)
358+
if (process_file(f, filename)):
359+
shutil.move(pathname,errorDir)
360+
else:
361+
move_archive(pathname)
309362
# file is automatically closed if needed
310363
except IOError as e:
311364
print 'Could not open file ' + pathname + '\nError: ' + str(e.args)

0 commit comments

Comments
 (0)