diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index bf31fbb..5a73541 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -1,11 +1,16 @@
+import signal
+
 from scrapy import log
 from scrapy.utils.decorator import inthread
 from scrapy.utils.misc import load_object
+from scrapy.exceptions import IgnoreRequest
 
 from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
 
 FALLBACK_HANDLER = 'scrapy.core.downloader.handlers.http.HttpDownloadHandler'
 
+class WebdriverTimeout(Exception):
+    pass
 
 class WebdriverDownloadHandler(object):
     """This download handler uses webdriver, deferred in a thread.
@@ -15,11 +20,37 @@ class WebdriverDownloadHandler(object):
     """
     def __init__(self, settings):
         self._enabled = settings.get('WEBDRIVER_BROWSER') is not None
+        self._timeout = settings.get('WEBDRIVER_TIMEOUT')
+        self._hang_timeout = settings.get('WEBDRIVER_HANG_TIMEOUT', None)
         self._fallback_handler = load_object(FALLBACK_HANDLER)(settings)
 
     def download_request(self, request, spider):
         """Return the result of the right download method for the request."""
         if self._enabled and isinstance(request, WebdriverRequest):
+
+            # set the signal handler for the SIGALRM event
+            if self._hang_timeout:
+
+                def alarm_handler(signum, frame):
+
+                    # kill the selenium webdriver process (with SIGTERM,
+                    # so that it kills both the primary process and the
+                    # process that it spawned)
+                    request.manager.webdriver.service.process.send_signal(signal.SIGTERM)
+
+                    # reset the defunct _webdriver attribute to its
+                    # original value of None, so that the next time it is
+                    # accessed it is recreated
+                    request.manager._webdriver = None
+
+                    # log an informative warning message
+                    msg = "WebDriver.get for '%s' took more than WEBDRIVER_HANG_TIMEOUT (%ss)" % \
+                        (request.url, self._hang_timeout)
+                    spider.log(msg, level=log.WARNING)
+
+                # bind the handler
+                signal.signal(signal.SIGALRM, alarm_handler)
+
             if isinstance(request, WebdriverActionRequest):
                 download = self._do_action_request
             else:
@@ -31,10 +62,52 @@ def download_request(self, request, spider):
     @inthread
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
-        log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
-        request.manager.webdriver.get(request.url)
-        return WebdriverResponse(request.url, request.manager.webdriver)
+        spider.log('Downloading %s with webdriver' % request.url, level=log.DEBUG)
+
+        # set a countdown timer for the webdriver.get
+        if self._hang_timeout:
+            signal.alarm(self._hang_timeout)
+
+        # if the get fails for any reason, set the webdriver attribute of the
+        # response to the exception that occurred
+        try:
+            request.manager.webdriver.get(request.url)
+
+        except Exception as exception:
+
+            # the request already failed, so cancel the alarm (this has no
+            # effect if the failure was caused by the alarm itself)
+            if self._hang_timeout:
+                spider.log('setting alarm to 0 on FAILURE', level=log.DEBUG)
+                signal.alarm(0)
+
+            # set page_source to blank so that WebdriverResponse doesn't complain
+            exception.page_source = ''
+
+            # log a nice error message
+            msg = 'Error while downloading %s with webdriver (%s)' % \
+                (request.url, exception)
+            spider.log(msg, level=log.ERROR)
+
+            # manager.webdriver is a @property, so accessing it here recreates
+            # the connection that the alarm handler tore down
+            webdriver = request.manager.webdriver
+            return WebdriverResponse(request.url, exception)
+
+        # if the get finishes in time, cancel the alarm and return a response
+        # with the webdriver attached
+        else:
+
+            # the request succeeded, so make sure the alarm never fires
+            if self._hang_timeout:
+                spider.log('setting alarm to 0 on SUCCESS', level=log.DEBUG)
+                signal.alarm(0)
+
+            # return the correct response
+            return WebdriverResponse(request.url, request.manager.webdriver)
+
     @inthread
     def _do_action_request(self, request, spider):
         """Perform an action on a previously webdriver-loaded page."""
diff --git a/scrapy_webdriver/manager.py b/scrapy_webdriver/manager.py
index ec496dc..3436031 100644
--- a/scrapy_webdriver/manager.py
+++ b/scrapy_webdriver/manager.py
@@ -19,6 +19,7 @@ def __init__(self, crawler):
         self._browser = crawler.settings.get('WEBDRIVER_BROWSER', None)
         self._user_agent = crawler.settings.get('USER_AGENT', None)
         self._options = crawler.settings.get('WEBDRIVER_OPTIONS', dict())
+        self._timeout = crawler.settings.get('WEBDRIVER_TIMEOUT', None)
         self._webdriver = None
         if isinstance(self._browser, basestring):
             if '.' in self._browser:
@@ -52,6 +53,8 @@ def webdriver(self):
                     options[cap_attr] = self._desired_capabilities
             self._webdriver = self._browser(**options)
             self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
+            if self._timeout:
+                self._webdriver.set_page_load_timeout(self._timeout)
         return self._webdriver
 
     def acquire(self, request):
diff --git a/scrapy_webdriver/middlewares.py b/scrapy_webdriver/middlewares.py
index 052c199..8581b02 100644
--- a/scrapy_webdriver/middlewares.py
+++ b/scrapy_webdriver/middlewares.py
@@ -1,9 +1,9 @@
 from scrapy.exceptions import IgnoreRequest, NotConfigured
+from scrapy import log
 
-from .http import WebdriverActionRequest, WebdriverRequest
+from .http import WebdriverActionRequest, WebdriverRequest, WebdriverResponse
 from .manager import WebdriverManager
 
-
 class WebdriverSpiderMiddleware(object):
     """This middleware coordinates concurrent webdriver access attempts."""
     def __init__(self, crawler):
@@ -57,3 +57,40 @@ def _process_requests(self, items_or_requests, start=False):
             if request is WebdriverRequest.WAITING:
                 continue  # Request has been enqueued, so drop it.
             yield request
+
+    def process_spider_exception(self, response, exception, spider):
+        """If there is an exception while parsing, feed the scrapy
+        scheduler the next request from the queue in the webdriver
+        manager.
+        """
+        if isinstance(response.request, WebdriverRequest):
+
+            # release the lock that was acquired for this URL
+            self.manager.release(response.request.url)
+
+            # get the next request
+            next_request = self.manager.acquire_next()
+
+            # only schedule it if the queue wasn't empty
+            if next_request is not WebdriverRequest.WAITING:
+                scheduler = self.manager.crawler.engine.slots[spider].scheduler
+                scheduler.enqueue_request(next_request.replace(dont_filter=True))
+
+
+class WebdriverDownloaderMiddleware(object):
+    """This middleware handles webdriver.get failures."""
+
+    def process_response(self, request, response, spider):
+
+        # if there is a downloading error in the WebdriverResponse,
+        # log a nice error message
+        if isinstance(response, WebdriverResponse):
+            if isinstance(response.webdriver, Exception):
+                msg = 'Error while downloading %s with webdriver (%s)' % \
+                    (request.url, response.webdriver)
+                spider.log(msg, level=log.ERROR)
+
+        # but always return the response: when there was an error, parse
+        # methods downstream will probably fail on it
+        return response
+
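For context, here is a minimal sketch of how a project might wire up the settings and middleware this patch touches. The setting names (`WEBDRIVER_BROWSER`, `WEBDRIVER_TIMEOUT`, `WEBDRIVER_HANG_TIMEOUT`) and the class paths come from the diff itself; the numeric values, the middleware priorities, and the `DOWNLOAD_HANDLERS` wiring are illustrative assumptions, not part of the patch:

```python
# settings.py -- illustrative sketch, not part of the patch.  The setting
# names are the ones the patch reads; values and priorities are assumptions.

WEBDRIVER_BROWSER = 'Firefox'   # any non-None value enables webdriver handling

# Selenium's own page-load timeout, applied via set_page_load_timeout()
# when the manager creates the webdriver (seconds).
WEBDRIVER_TIMEOUT = 30

# Watchdog for get() calls that hang past Selenium's own timeout.  Passed
# to signal.alarm(), so it must be an integer number of seconds.
WEBDRIVER_HANG_TIMEOUT = 60

DOWNLOAD_HANDLERS = {
    'http': 'scrapy_webdriver.download.WebdriverDownloadHandler',
    'https': 'scrapy_webdriver.download.WebdriverDownloadHandler',
}

SPIDER_MIDDLEWARES = {
    'scrapy_webdriver.middlewares.WebdriverSpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    # new in this patch: logs errors carried on WebdriverResponse.webdriver
    'scrapy_webdriver.middlewares.WebdriverDownloaderMiddleware': 500,
}
```

One design note on the watchdog: CPython delivers signals to the main thread and only allows `signal.signal()` to be called there, while `_download_request` runs in a worker thread via `@inthread`. That is presumably why the handler is bound in `download_request` (main thread) and works by SIGTERM-ing the browser process, which makes the blocked `get()` in the worker raise, rather than raising `WebdriverTimeout` directly from the handler, which would not interrupt the worker thread.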