scrape_leaderboard.py (forked from rspeer/dominionstats)
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import os
import datetime
import httplib
import socket
import StringIO
import gzip
import bz2

import utils

output_directory = 'static/leaderboard/'
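
# Convert an HTTP date header value such as 'Thu, 10 Mar 2011 08:00:00 GMT'
# into a datetime.date.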
def date_from_http_header_time(http_header_time):
    return datetime.datetime.strptime(http_header_time, '%a, %d %b %Y %H:%M:%S %Z').date()
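
# Find the most recent YYYY-MM-DD.html.bz2 file already saved in the output
# directory and return its date; if there is none, return the day before the
# first day archived on bggdl, so scraping starts from 2011-03-11.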
def get_date_of_last_cached_leaderboard():
    filename_pattern = re.compile('^(?P<year>\d\d\d\d)-(?P<month>\d\d)-(?P<day>\d\d)\.html\.bz2$')
    filenames = os.listdir(output_directory)
    filenames.sort(reverse=True)
    for filename in filenames:
        match = filename_pattern.search(filename)
        if not match:
            continue
        return datetime.date(int(match.group('year')), int(match.group('month')), int(match.group('day')))
    # return the day before the first day on http://bggdl.square7.ch/leaderboard/
    return datetime.date(2011, 3, 10)
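
# Issue a HEAD request for the isotropic leaderboard and derive its date from
# the Last-Modified header. Returns None if the request fails.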
def get_date_of_current_isotropic_leaderboard():
    try:
        connection = httplib.HTTPConnection('dominion.isotropic.org', timeout=30)
        connection.request('HEAD', '/leaderboard/')
        response = connection.getresponse()
        headers = dict(response.getheaders())
        connection.close()
    except socket.error:
        return
    if response.status == 200:
        # just after midnight Pacific time, GMT will have the same calendar date as Pacific time,
        # so we can ignore the hour, minute, and second
        return date_from_http_header_time(headers['last-modified'])
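
# Write one day's leaderboard to <output_directory>/<date>.html.bz2. Gzipped
# HTML (from isotropic or bggdl) is un-gzipped and re-compressed as bz2;
# councilroom already serves bz2 files, which are written unchanged.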
def save_file(date, data, is_gzipped):
    if is_gzipped:
        f = gzip.GzipFile(fileobj=StringIO.StringIO(data))
        data = f.read()
        f.close()
        data = bz2.compress(data)
    f = open(output_directory + str(date) + '.html.bz2', 'wb')
    f.write(data)
    f.close()
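
# Download one leaderboard page and save it. Returns the HTTP status code of a
# completed request, or the strings 'socket error' / 'leaderboard updated' when
# the download cannot be used.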
def scrape_leaderboard(date, host, url, is_gzipped, assert_same_date):
    try:
        connection = httplib.HTTPConnection(host, timeout=30)
        connection.request('GET', url, headers={'Accept-Encoding': 'gzip'} if is_gzipped else {})
        response = connection.getresponse()
        data = response.read()
        connection.close()
    except socket.error:
        return 'socket error'
    if assert_same_date:
        headers = dict(response.getheaders())
        if date != date_from_http_header_time(headers['last-modified']):
            return 'leaderboard updated'
    if response.status == 200:
        save_file(date, data, is_gzipped)
    return response.status
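
# Per-source wrappers: isotropic serves only the current leaderboard (so its
# Last-Modified date must still match the requested date), while councilroom
# and bggdl serve archived copies addressed by date.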
def scrape_leaderboard_from_isotropic(date):
    return scrape_leaderboard(date, 'dominion.isotropic.org', '/leaderboard/', True, True)


def scrape_leaderboard_from_councilroom(date):
    return scrape_leaderboard(date, 'councilroom.com', '/static/leaderboard/' + str(date) + '.html.bz2', False, False)


def scrape_leaderboard_from_bggdl(date):
    return scrape_leaderboard(date, 'bggdl.square7.ch', '/leaderboard/leaderboard-' + str(date) + '.html', True, False)
def run_scrape_function_with_retries(scrape_function, date):
    num_attempts = 0
    while True:
        num_attempts += 1
        status = scrape_function(date)
        if status == 200:
            print 'successful'
            break
        elif status == 404:
            print 'file not found'
            break
        elif status == 'leaderboard updated':
            print 'the leaderboard was updated after this script was started, so re-run this script'
            break
        else:
            if num_attempts < 3:
                print 'retrying'
            else:
                print 'reached 3 attempts, aborting'
                break
    return status
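
# Starting the day after the last cached leaderboard, fetch each missing day:
# the current day comes from isotropic, earlier days from councilroom with
# bggdl as a fallback.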
def main():
    utils.ensure_exists(output_directory)

    date_of_last_cached_leaderboard = get_date_of_last_cached_leaderboard()
    print 'date of the last cached leaderboard is', date_of_last_cached_leaderboard

    date_of_current_isotropic_leaderboard = get_date_of_current_isotropic_leaderboard()
    if date_of_current_isotropic_leaderboard is None:
        print 'could not determine the date of the current isotropic leaderboard, so please try again later'
        return
    print 'date of the current isotropic leaderboard is', date_of_current_isotropic_leaderboard

    one_day_delta = datetime.timedelta(1)
    date = date_of_last_cached_leaderboard + one_day_delta

    while date <= date_of_current_isotropic_leaderboard:
        print
        print date

        if date == date_of_current_isotropic_leaderboard:
            print 'scraping from isotropic'
            status = run_scrape_function_with_retries(scrape_leaderboard_from_isotropic, date)
        else:
            print 'scraping from councilroom'
            status = run_scrape_function_with_retries(scrape_leaderboard_from_councilroom, date)
            if status != 200:
                print 'scraping from bggdl'
                status = run_scrape_function_with_retries(scrape_leaderboard_from_bggdl, date)

        if status == 200:
            pass
        elif status == 404:
            print 'file not found, so we will assume that it does not exist, and go to the next day'
        else:
            print 'please try again later'
            break

        date += one_day_delta


if __name__ == '__main__':
    main()