fetch-pages.py (forked from preshing/analyze-spec-benchmarks)
#!/usr/bin/env python3
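"""Download the SPEC CPU result index pages (CPU95, CPU2000, CPU2006 and
CPU2017) from www.spec.org and cache every linked result page under the
scraped/ directory, skipping anything that was already downloaded."""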
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from builtins import open
from future import standard_library
standard_library.install_aliases()
import multiprocessing
import os
import time
import urllib.error
import urllib.parse
import urllib.request
import lxml.html


def cachedFetch(url, localPath, verbose=True):
    # Download url to localPath unless a cached copy already exists.
    if os.path.exists(localPath):
        return 'Cached ' + url
    try:
        os.makedirs(os.path.split(localPath)[0])
    except OSError:
        pass
    if verbose:
        print('Fetching %s ...' % url)
    finished = False
    sleepTime = 1
    while not finished:
        try:
            response = urllib.request.urlopen(url)
            data = response.read()
            finished = True
        except Exception:
            # Retry with exponential backoff on any fetch error.
            time.sleep(sleepTime)
            sleepTime *= 2
            print('Hit a snag fetching the page, retrying...')
    with open(localPath, 'wb') as f:
        f.write(data)
    return 'Fetched ' + url
def cachedRead(url, localPath):
    # Return a file object for the cached copy, fetching it first if needed.
    cachedFetch(url, localPath)
    return open(localPath, 'rb')


def mpFetch(args):
    # Unpack a (url, localPath) tuple for use with multiprocessing.Pool.
    return cachedFetch(*args, verbose=False)
# Yield a (url, localPath) pair for every result page linked from the suite index pages.
def iterateAllPageURLs():
    with cachedRead('http://www.spec.org/cpu95/results/cpu95.html', os.path.join('scraped', 'cpu95.html')) as f:
        print('Scanning cpu95.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.asc') or link.lower().endswith('.html'):
                yield 'http://www.spec.org' + link, os.path.join('scraped', 'cpu95', link.split('/')[-1])
    with cachedRead('http://www.spec.org/cpu2000/results/cpu2000.html', os.path.join('scraped', 'cpu2000.html')) as f:
        print('Scanning cpu2000.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.asc'):
                yield 'http://www.spec.org/cpu2000/results/' + link, os.path.join('scraped', 'cpu2000', link.split('/')[-1])
    with cachedRead('http://www.spec.org/cpu2006/results/cpu2006.html', os.path.join('scraped', 'cpu2006.html')) as f:
        print('Scanning cpu2006.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.txt'):
                yield 'http://www.spec.org/cpu2006/results/' + link, os.path.join('scraped', 'cpu2006', link.split('/')[-1])
    with cachedRead('http://www.spec.org/cpu2017/results/cpu2017.html', os.path.join('scraped', 'cpu2017.html')) as f:
        print('Scanning cpu2017.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.txt'):
                yield 'http://www.spec.org/cpu2017/results/' + link, os.path.join('scraped', 'cpu2017', link.split('/')[-1])
if __name__ == '__main__':
    allPageURLs = list(iterateAllPageURLs())
    # Fetch every result page in parallel with 20 worker processes.
    pool = multiprocessing.Pool(20)
    i = 0
    for result in pool.imap_unordered(mpFetch, allPageURLs):
        i += 1
        print('%d/%d ... %s' % (i, len(allPageURLs), result))