From acb29620ae370b386c140b1cb7ebebf3c78459db Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Wed, 15 Jun 2016 12:39:52 +0300 Subject: [PATCH] Splash proxy support Do not set meta['proxy'] for splash requests, pass them to splash args instead. --- README.rst | 13 +++++++++++++ autologin/middleware.py | 15 +++++++++++++-- autologin/spiders.py | 3 ++- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index f2f7188..ba7b705 100644 --- a/README.rst +++ b/README.rst @@ -219,6 +219,19 @@ Response is JSON with a ``status`` field with the following possible values: ``cookies`` field, in ``Cookie.__dict__`` format. +Proxy support +------------- + +Proxies can be specified via ``HTTP_PROXY`` and ``HTTPS_PROXY`` keys +in ``settings`` argument. Username and password can be specified +as part of the proxy url (the format is ``protocol://username:password@url``). + +If you are using proxy with Splash, it is assumed that +you want to have Splash make requests via given proxy, +and not make a request **to** Splash via proxy. +``HTTP_PROXY`` is always used for Splash. + + Captcha support --------------- diff --git a/autologin/middleware.py b/autologin/middleware.py index c0e5490..22995e0 100644 --- a/autologin/middleware.py +++ b/autologin/middleware.py @@ -34,8 +34,11 @@ def get_cookiejar(response): return obj -class ProxyFromSettingsMiddleware(HttpProxyMiddleware): - """A middleware that sets proxy from settings file""" +class ProxyMiddleware(HttpProxyMiddleware): + """ A middleware that sets proxy from settings for non-splash requests, + and passes proxy splash args for splash requests. + This middleware must be placed **before** splash middleware. 
+ """ @classmethod def from_crawler(cls, crawler): @@ -48,8 +51,16 @@ def __init__(self, settings): ('http', settings.get('HTTP_PROXY')), ('https', settings.get('HTTPS_PROXY')), ] + self.splash_proxy = settings.get('HTTP_PROXY') for type_, url in proxies: if url: self.proxies[type_] = self._get_proxy(url, type_) if not self.proxies: raise NotConfigured + + def process_request(self, request, spider): + if 'splash' in request.meta: + if self.splash_proxy: + request.meta['splash']['args']['proxy'] = self.splash_proxy + else: + super(ProxyMiddleware, self).process_request(request, spider) diff --git a/autologin/spiders.py b/autologin/spiders.py index 0fdf03d..b4146d8 100644 --- a/autologin/spiders.py +++ b/autologin/spiders.py @@ -48,7 +48,8 @@ USER_AGENT = USER_AGENT, DOWNLOADER_MIDDLEWARES = { 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None, - 'autologin.middleware.ProxyFromSettingsMiddleware': 750, + # Placed before splash middleware + 'autologin.middleware.ProxyMiddleware': 720, }, ))