forked from rrenaud/dominionstats
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscrape.py
220 lines (180 loc) · 6.9 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/python
# taken from
# http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
import datetime
import glob
import logging
import os
import os.path
import subprocess
import sys
import tempfile
import time
import urllib
import utils
# if the size of the game log is less than this assume we got an error page
SMALL_FILE_SIZE = 5000
DEBUG = False
GOOD = 0
MISSING = 1
ERROR = 2
DOWNLOADED = 3
REPACKAGED = 4
# make I should just adopt the isotropic format for consistency?
ISOTROPIC_FORMAT = '%(year)d%(month)02d/%(day)02d/all.tar.bz2'
COUNCILROOM_FORMAT = '%(year)d%(month)02d%(day)02d/%(year)d%(month)02d%(day)02d.all.tar.bz2'
def FormatDate(fmt, cur_date):
return fmt % {
'year': cur_date.year, 'month': cur_date.month, 'day': cur_date.day
}
def IsotropicGamesCollectionUrl(cur_date):
host = 'http://dominion.isotropic.org/gamelog/'
return host + FormatDate(ISOTROPIC_FORMAT, cur_date)
def CouncilroomGamesCollectionUrl(cur_date):
host = 'http://councilroom.com/static/scrape_data/'
return host + FormatDate(COUNCILROOM_FORMAT, cur_date)
def RemoveSmallFileIfExists(fn):
if (os.path.exists(fn) and
os.stat(fn).st_size <= SMALL_FILE_SIZE):
print 'removing small existing file', fn
os.unlink(fn)
def download_date(str_date, cur_date, saved_games_bundle):
urls_by_priority = [
CouncilroomGamesCollectionUrl(cur_date),
IsotropicGamesCollectionUrl(cur_date),
]
for url in urls_by_priority:
if DEBUG:
print 'getting', saved_games_bundle, 'at', url
contents = urllib.urlopen(url).read()
if len(contents) > SMALL_FILE_SIZE:
if DEBUG:
print 'yay, success from', url, 'no more requests for', \
str_date, 'needed'
open(saved_games_bundle, 'w').write(contents)
return True
elif DEBUG:
print 'request to', url, 'failed to find large file'
return False
def unzip_date(directory, filename):
os.chdir(directory)
cmd = 'tar -xjvf %s >/dev/null 2>/dev/null'%filename
if DEBUG:
print cmd
ret = os.system(cmd)
if ret==0:
os.system('chmod -R 755 .')
code = True
else:
code = False
os.chdir('..')
return code
def repackage_filename(orig_archive_filename):
return orig_archive_filename.replace(".all.tar.bz2", ".bz2.tar")
def repackage_archive(filename):
""" Converts a .tar.bz2 file into a .bz2.tar file in the same directory.
Game archives are distributed as .tar.bz2 (a bzip2-compressed tar
archive). For speed of serving, we repackage them as .bz2.tar (a
tar archive of bzip2-compressed HTML files). The .bz2.tar file is
a good bit larger, but an individual file can be extracted,
decompressed, and served to a client in tenths of a second instead
of tens of seconds. At the same time, storage space is still
dramatically smaller than a raw folder of uncompressed (or even
compressed) HTML files.
"""
orig_dir = os.getcwd()
# Extract the existing file into a temporary folder
directory_name = tempfile.mkdtemp()
source_filename = os.path.abspath(filename)
try:
subprocess.check_call(["tar", "--auto-compress", "-C", directory_name,
"-xf", source_filename])
except subprocess.CalledProcessError, e:
# Not handling this yet, just re-raise
logging.warning("Unexpected return from tar >>{msg}<<".format(msg=e.output))
raise
# Compress all the game*.html files
os.chdir(directory_name)
game_files = glob.glob("game*.html")
if len(game_files) > 0:
try:
subprocess.check_call(["bzip2"] + game_files)
except subprocess.CalledProcessError, e: #(retcode, cmd, output=output)
# Not handling this yet, just re-raise
logging.warning("Unexpected return from bzip >>{msg}<<".format(msg=e.output))
raise
else:
os.chdir(orig_dir)
os.removedirs(directory_name)
return
# Tar the results back to the directory where the original file
# came from
dest_filename = repackage_filename(source_filename)
game_files = glob.glob("game*.html.bz2")
try:
subprocess.check_call(["tar", "--remove", "-cf", dest_filename+".part"] + game_files)
except subprocess.CalledProcessError, e: #(retcode, cmd, output=output)
# Not handling this yet, just re-raise
logging.warning("Unexpected return from tar >>{msg}<<".format(msg=e.output))
raise
os.rename(dest_filename + ".part", dest_filename)
os.chdir(orig_dir)
os.removedirs(directory_name)
def scrape_date(str_date, cur_date, passive=False):
#directory = str_date
games_short_name = str_date + '.all.tar.bz2'
saved_games_bundle = games_short_name
return_code = ERROR
if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE):
if DEBUG:
print 'skipping because exists', str_date, saved_games_bundle, \
'and not small (size=', os.stat(saved_games_bundle).st_size, ')'
return_code = GOOD
else:
RemoveSmallFileIfExists(saved_games_bundle)
if passive:
return_code = MISSING
elif not download_date(str_date, cur_date, saved_games_bundle):
return_code = ERROR
return_code = DOWNLOADED
# Repackage an existing file, if found
if utils.at_least_as_big_as(saved_games_bundle, SMALL_FILE_SIZE) and \
not os.path.exists(repackage_filename(saved_games_bundle)):
repackage_archive(saved_games_bundle)
return_code = REPACKAGED
return return_code
def scrape_games():
parser = utils.incremental_date_range_cmd_line_parser()
utils.ensure_exists('static/scrape_data')
os.chdir('static/scrape_data')
args = parser.parse_args()
last_month = ''
for cur_date in utils.daterange(datetime.date(2010, 10, 15),
datetime.date.today()):
str_date = time.strftime("%Y%m%d", cur_date.timetuple())
if not utils.includes_day(args, str_date):
if DEBUG:
print 'skipping', str_date, 'because not in cmd line arg daterange'
continue
mon = time.strftime("%b%y", cur_date.timetuple())
if mon != last_month:
print
print mon, cur_date.day*" ",
sys.stdout.flush()
last_month = mon
ret = scrape_date(str_date, cur_date, passive=args.passive)
if ret==DOWNLOADED:
print 'o',
elif ret==REPACKAGED:
print 'O',
elif ret==ERROR:
print '!',
elif ret==MISSING:
print '_',
else:
print '.',
sys.stdout.flush()
print
os.chdir('../..')
if __name__=='__main__':
scrape_games()