Skip to content

Commit

Permalink
Fixed a bug WRT webdriver initialization and added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ncadou committed Jun 8, 2013
1 parent 388e161 commit 079ab6a
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 20 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Add something like this in your scrapy project settings:
}

WEBDRIVER_BROWSER = 'PhantomJS' # Or any other from selenium.webdriver
# or 'your_package.CustomWebdriverClass'
# or an actual class instead of a string.
# Optional passing of parameters to the webdriver
WEBDRIVER_OPTIONS = {
'service_args': ['--debug=true', '--load-images=false', '--webdriver-loglevel=debug']
Expand Down
34 changes: 19 additions & 15 deletions scrapy_webdriver/manager.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import inspect
from collections import deque
from threading import Lock

from scrapy.signals import engine_stopped
from selenium import webdriver
from scrapy_webdriver.http import WebdriverRequest, WebdriverActionRequest
from selenium import webdriver


class WebdriverManager(object):
Expand All @@ -17,12 +18,18 @@ def __init__(self, crawler):
self._wait_inpage_queue = deque()
self._browser = crawler.settings.get('WEBDRIVER_BROWSER', None)
self._user_agent = crawler.settings.get('USER_AGENT', None)
self._web_driver_options = crawler.settings.get('WEBDRIVER_OPTIONS',
dict())
self._options = crawler.settings.get('WEBDRIVER_OPTIONS', dict())
self._webdriver = None
if isinstance(self._browser, basestring):
self._browser = getattr(webdriver, self._browser)
elif self._browser is not None:
if '.' in self._browser:
    # Dotted path such as 'your_package.CustomWebdriverClass': split on the
    # LAST dot only. A maxsplit of 2 yields three parts for any path with
    # two or more dots ('pkg.sub.Class') and breaks the two-name unpack.
    module, browser = self._browser.rsplit('.', 1)
else:
    # Bare name: look the class up in selenium.webdriver.
    module, browser = 'selenium.webdriver', self._browser
# A non-empty fromlist makes __import__ return the named (sub)module itself
# instead of the top-level package.
module = __import__(module, fromlist=[browser])
self._browser = getattr(module, browser)
elif inspect.isclass(self._browser):
self._browser = self._browser
else:
self._webdriver = self._browser

@property
Expand All @@ -32,20 +39,17 @@ def _desired_capabilities(self):
capabilities[self.USER_AGENT_KEY] = self._user_agent
return capabilities or None

@classmethod
def valid_settings(cls, settings):
browser = settings.get('WEBDRIVER_BROWSER')
if isinstance(browser, basestring):
return getattr(webdriver, browser, None) is not None
else:
return browser is not None

@property
def webdriver(self):
"""Return the webdriver instance, instantiate it if necessary."""
if self._webdriver is None:
options = self._web_driver_options
options['desired_capabilities'] = self._desired_capabilities
short_arg_classes = (webdriver.Firefox, webdriver.Ie)
if issubclass(self._browser, short_arg_classes):
cap_attr = 'capabilities'
else:
cap_attr = 'desired_capabilities'
options = self._options
options[cap_attr] = self._desired_capabilities
self._webdriver = self._browser(**options)
self.crawler.signals.connect(self._cleanup, signal=engine_stopped)
return self._webdriver
Expand Down
9 changes: 5 additions & 4 deletions scrapy_webdriver/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ def __init__(self, crawler):

@classmethod
def from_crawler(cls, crawler):
if not WebdriverManager.valid_settings(crawler.settings):
raise NotConfigured('WEBDRIVER_BROWSER is misconfigured: %r'
% crawler.settings.get('WEBDRIVER_BROWSER'))
return cls(crawler)
try:
return cls(crawler)
except Exception as e:
raise NotConfigured('WEBDRIVER_BROWSER is misconfigured: %r (%r)'
% (crawler.settings.get('WEBDRIVER_BROWSER'), e))

def process_start_requests(self, start_requests, spider):
"""Return start requests, with some reordered by the manager.
Expand Down
7 changes: 6 additions & 1 deletion scrapy_webdriver/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,12 @@ def extract(self):


class _TextNode(object):
"""Works around webdriver XPath inability to select text nodes."""
"""Works around webdriver XPath inability to select text nodes.
It's a rather contrived element API implementation, it should probably
be expanded.
"""
JS_FIND_FIRST_TEXT_NODE = ('return arguments[0].firstChild '
'&& arguments[0].firstChild.nodeValue')

Expand Down
47 changes: 47 additions & 0 deletions scrapy_webdriver/tests/test_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from selenium import webdriver

from scrapy_webdriver.manager import WebdriverManager

# Baseline settings shared by the tests: the http and https download handlers
# both point at the webdriver download handler, and the webdriver spider
# middleware is registered with order 543.
BASE_SETTINGS = {
    'DOWNLOAD_HANDLERS': {
        'http': 'scrapy_webdriver.download.WebdriverDownloadHandler',
        'https': 'scrapy_webdriver.download.WebdriverDownloadHandler',
    },
    'SPIDER_MIDDLEWARES': {
        'scrapy_webdriver.middlewares.WebdriverSpiderMiddleware': 543,
    },
}


class TestManager:
    """Checks how WebdriverManager interprets the WEBDRIVER_BROWSER setting."""

    @classmethod
    def setup_class(cls):
        """Expose the module-level BASE_SETTINGS to the tests as `_settings`."""
        cls._settings = BASE_SETTINGS

    def settings(self, **options):
        """Return a shallow copy of the base settings overlaid with `options`."""
        merged = self._settings.copy()
        merged.update(**options)
        return merged

    def test_browser_config(self):
        """A name, a class and an instance are each accepted as the browser."""
        class TestBrowser(object):
            pass

        def manager_for(browser_setting):
            # Build a configured crawler around the given browser setting and
            # hand it to a fresh manager.
            values = self.settings(WEBDRIVER_BROWSER=browser_setting)
            crawler = Crawler(Settings(values=values))
            crawler.configure()
            return WebdriverManager(crawler)

        # A bare string name resolves to the matching selenium.webdriver class.
        by_name = manager_for('Firefox')
        assert issubclass(by_name._browser, webdriver.Firefox)

        # A class is kept as the browser class.
        by_class = manager_for(TestBrowser)
        assert issubclass(by_class._browser, TestBrowser)

        # Anything else is stored as the ready-made webdriver instance.
        by_instance = manager_for(TestBrowser())
        assert isinstance(by_instance._webdriver, TestBrowser)

0 comments on commit 079ab6a

Please sign in to comment.