Added option to abort request on timeout
Fixes brandicted#3 ("stuck on downloading for a long time")
samos123 committed May 21, 2013
1 parent 3b1f6e6 commit f8294e2
Showing 3 changed files with 17 additions and 1 deletion.
3 changes: 3 additions & 0 deletions README.md
@@ -48,6 +48,9 @@ Add something like this in your scrapy project settings:
     'service_args': ['--debug=true', '--load-images=false', '--webdriver-loglevel=debug']
 }
+
+# If not set, it will use the webdriver default, which seems to be infinite
+WEBDRIVER_TIMEOUT = 10
 
 Usage
 =====

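Taken together, the relevant block in a project's settings.py might look like the sketch below. Only WEBDRIVER_OPTIONS and WEBDRIVER_TIMEOUT appear in this diff; the WEBDRIVER_BROWSER name is an assumption based on the project's other setting names, and the 10-second value is just the README's example:

    # settings.py -- illustrative sketch, not taken verbatim from the repo
    WEBDRIVER_BROWSER = 'PhantomJS'   # assumed setting name, not shown in this diff
    WEBDRIVER_OPTIONS = {
        'service_args': ['--debug=true', '--load-images=false',
                         '--webdriver-loglevel=debug'],
    }
    # Abort page loads, script execution and element lookups after 10
    # seconds; if unset, the webdriver default (seemingly infinite) applies.
    WEBDRIVER_TIMEOUT = 10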
2 changes: 1 addition & 1 deletion scrapy_webdriver/download.py
@@ -32,7 +32,7 @@ def download_request(self, request, spider):
     def _download_request(self, request, spider):
         """Download a request URL using webdriver."""
         log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
-        request.manager.webdriver.get(request.url)
+        request.manager.get(request.url)
         return WebdriverResponse(request.url, request.manager.webdriver)
 
     @inthread
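This one-line change is what connects downloads to the new timeout logic: the handler previously called the Selenium driver directly, which by default blocks with no time limit, whereas going through the manager applies WEBDRIVER_TIMEOUT and logs failures instead of hanging. Paraphrasing the diff:

    # Old: raw Selenium call; can block indefinitely on a stuck page.
    request.manager.webdriver.get(request.url)
    # New: wrapped call; honors WEBDRIVER_TIMEOUT (see manager.py below).
    request.manager.get(request.url)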
13 changes: 13 additions & 0 deletions scrapy_webdriver/manager.py
@@ -1,6 +1,7 @@
 from collections import deque
 from threading import Lock
 
+from scrapy import log
 from scrapy.signals import engine_stopped
 from selenium import webdriver
 from scrapy_webdriver.http import WebdriverRequest, WebdriverActionRequest
@@ -19,6 +20,7 @@ def __init__(self, crawler):
         self._user_agent = crawler.settings.get('USER_AGENT', None)
         self._web_driver_options = crawler.settings.get('WEBDRIVER_OPTIONS',
                                                         dict())
+        self.timeout = crawler.settings.get("WEBDRIVER_TIMEOUT", 0)
         self._webdriver = None
         if isinstance(self._browser, basestring):
             self._browser = getattr(webdriver, self._browser)
@@ -64,6 +66,17 @@ def acquire(self, request):
         queue = self._wait_queue
         queue.append(request)
 
+    def get(self, url):
+        if self.timeout:
+            self.webdriver.set_page_load_timeout(self.timeout)
+            self.webdriver.set_script_timeout(self.timeout)
+            self.webdriver.implicitly_wait(self.timeout)
+        try:
+            self.webdriver.get(url)
+        except Exception as e:
+            message = "Unable to get url %s because of %s" % (url, e)
+            log.msg(message, level=log.ERROR)
+
     def acquire_next(self):
         """Return the next waiting request, if any.
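For reference, here is a minimal standalone sketch of the Selenium behavior the new get() relies on; PhantomJS and the URL are placeholders, and the same calls exist on any WebDriver:

    # timeout_sketch.py -- illustrative only; assumes PhantomJS is installed
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException

    driver = webdriver.PhantomJS()
    driver.set_page_load_timeout(10)  # abort page loads after 10 seconds
    driver.set_script_timeout(10)     # abort async script execution after 10 seconds
    driver.implicitly_wait(10)        # cap implicit element lookups at 10 seconds

    try:
        driver.get('http://example.com/very-slow-page')
    except TimeoutException as exc:
        # TimeoutException is the concrete error that the broad
        # `except Exception` in the manager's get() catches on timeout.
        print('Aborted: %s' % exc)
    finally:
        driver.quit()

Note that the manager's get() only logs the error rather than re-raising it, so _download_request still builds a WebdriverResponse from whatever the driver managed to load before the timeout.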
