From f8294e2bea759b4aaa6693606921a72aeead927d Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Tue, 21 May 2013 20:42:52 +0800
Subject: [PATCH] Added option to abort request on timeout

Fixes #3: requests stuck on downloading for a long time
---
 README.md                    |  3 +++
 scrapy_webdriver/download.py |  2 +-
 scrapy_webdriver/manager.py  | 13 +++++++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index aab12fa..fb34ffe 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,9 @@ Add something like this in your scrapy project settings:
         'service_args': ['--debug=true', '--load-images=false', '--webdriver-loglevel=debug']
     }
 
+    # If not set it will use the webdriver default, which seems to be infinite
+    WEBDRIVER_TIMEOUT = 10
+
 Usage
 =====
 
diff --git a/scrapy_webdriver/download.py b/scrapy_webdriver/download.py
index bf31fbb..9124f32 100644
--- a/scrapy_webdriver/download.py
+++ b/scrapy_webdriver/download.py
@@ -32,7 +32,7 @@ def download_request(self, request, spider):
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
         log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
-        request.manager.webdriver.get(request.url)
+        request.manager.get(request.url)
         return WebdriverResponse(request.url, request.manager.webdriver)
 
     @inthread
diff --git a/scrapy_webdriver/manager.py b/scrapy_webdriver/manager.py
index 75cc905..a511c15 100644
--- a/scrapy_webdriver/manager.py
+++ b/scrapy_webdriver/manager.py
@@ -1,6 +1,7 @@
 from collections import deque
 from threading import Lock
 
+from scrapy import log
 from scrapy.signals import engine_stopped
 from selenium import webdriver
 from scrapy_webdriver.http import WebdriverRequest, WebdriverActionRequest
@@ -19,6 +20,7 @@ def __init__(self, crawler):
         self._user_agent = crawler.settings.get('USER_AGENT', None)
         self._web_driver_options = crawler.settings.get('WEBDRIVER_OPTIONS',
                                                         dict())
+        self.timeout = crawler.settings.get("WEBDRIVER_TIMEOUT", 0)
         self._webdriver = None
         if isinstance(self._browser, basestring):
             self._browser = getattr(webdriver, self._browser)
@@ -64,6 +66,17 @@ def acquire(self, request):
            queue = self._wait_queue
        queue.append(request)
 
+    def get(self, url):
+        if self.timeout:
+            self.webdriver.set_page_load_timeout(self.timeout)
+            self.webdriver.set_script_timeout(self.timeout)
+            self.webdriver.implicitly_wait(self.timeout)
+        try:
+            self.webdriver.get(url)
+        except Exception as e:
+            message = "Unable to get url %s because of %s" % (url, e)
+            log.msg(message, level=log.ERROR)
+
     def acquire_next(self):
         """Return the next waiting request, if any.
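
--
Usage sketch, not part of the patch: with this change applied, a project can
cap page loads from its settings module. WEBDRIVER_BROWSER follows the
existing README example; WEBDRIVER_TIMEOUT is the setting this patch adds,
and the 10-second value is only an illustration.

    # settings.py (hypothetical project settings)
    WEBDRIVER_BROWSER = 'PhantomJS'

    # Cap page loads, script execution and implicit waits at 10 seconds.
    # Left unset (0), the manager keeps webdriver's own default, which
    # appears to be no timeout at all.
    WEBDRIVER_TIMEOUT = 10

When the cap is hit, selenium raises inside WebdriverManager.get(), which
catches the exception and logs it at ERROR level, so the download returns
instead of staying stuck indefinitely.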