fetch-pages.py (forked from preshing/analyze-spec-benchmarks)
#!/usr/bin/env python3
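"""Download the SPEC CPU result index pages (CPU95, CPU2000, CPU2006 and
CPU2017) from www.spec.org and cache every linked result page under the
scraped/ directory, skipping anything that was already downloaded."""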
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from builtins import open
from future import standard_library
standard_library.install_aliases()
import multiprocessing
import os
import time
import urllib.error
import urllib.parse
import urllib.request
import lxml.html


def cachedFetch(url, localPath, verbose=True):
    # Download url to localPath unless a cached copy already exists.
    if os.path.exists(localPath):
        return 'Cached ' + url
    try:
        os.makedirs(os.path.split(localPath)[0])
    except OSError:
        pass
    if verbose:
        print('Fetching %s ...' % url)
    finished = False
    sleepTime = 1
    while not finished:
        try:
            response = urllib.request.urlopen(url)
            data = response.read()
            finished = True
        except Exception:
            # Retry with exponential backoff on any fetch error.
            time.sleep(sleepTime)
            sleepTime *= 2
            print('Hit a snag fetching the page, retrying...')
    with open(localPath, 'wb') as f:
        f.write(data)
    return 'Fetched ' + url
def cachedRead(url, localPath):
    # Return a file object for the cached copy, fetching it first if needed.
    cachedFetch(url, localPath)
    return open(localPath, 'rb')


def mpFetch(args):
    # Unpack a (url, localPath) tuple for use with multiprocessing.Pool.
    return cachedFetch(*args, verbose=False)
# Yield a (url, localPath) pair for every result page linked from the suite index pages.
def iterateAllPageURLs():
    with cachedRead('http://www.spec.org/cpu95/results/cpu95.html', os.path.join('scraped', 'cpu95.html')) as f:
        print('Scanning cpu95.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.asc') or link.lower().endswith('.html'):
                yield 'http://www.spec.org' + link, os.path.join('scraped', 'cpu95', link.split('/')[-1])
    with cachedRead('http://www.spec.org/cpu2000/results/cpu2000.html', os.path.join('scraped', 'cpu2000.html')) as f:
        print('Scanning cpu2000.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.asc'):
                yield 'http://www.spec.org/cpu2000/results/' + link, os.path.join('scraped', 'cpu2000', link.split('/')[-1])
    with cachedRead('http://www.spec.org/cpu2006/results/cpu2006.html', os.path.join('scraped', 'cpu2006.html')) as f:
        print('Scanning cpu2006.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.txt'):
                yield 'http://www.spec.org/cpu2006/results/' + link, os.path.join('scraped', 'cpu2006', link.split('/')[-1])
    with cachedRead('http://www.spec.org/cpu2017/results/cpu2017.html', os.path.join('scraped', 'cpu2017.html')) as f:
        print('Scanning cpu2017.html ...')
        doc = lxml.html.parse(f)
        for elem, attr, link, pos in doc.getroot().iterlinks():
            if link.lower().endswith('.txt'):
                yield 'http://www.spec.org/cpu2017/results/' + link, os.path.join('scraped', 'cpu2017', link.split('/')[-1])
if __name__ == '__main__':
    allPageURLs = list(iterateAllPageURLs())
    # Fetch every result page in parallel with 20 worker processes.
    pool = multiprocessing.Pool(20)
    i = 0
    for result in pool.imap_unordered(mpFetch, allPageURLs):
        i += 1
        print('%d/%d ... %s' % (i, len(allPageURLs), result))