diff --git a/README.rst b/README.rst index f2f7188..ba7b705 100644 --- a/README.rst +++ b/README.rst @@ -219,6 +219,19 @@ Response is JSON with a ``status`` field with the following possible values: ``cookies`` field, in ``Cookie.__dict__`` format. +Proxy support +------------- + +Proxies can be specified via ``HTTP_PROXY`` and ``HTTPS_PROXY`` keys +in the ``settings`` argument. Username and password can be specified +as part of the proxy url (the format is ``protocol://username:password@url``). + +If you are using a proxy with Splash, it is assumed that +you want to have Splash make requests via the given proxy, +and not make a request **to** Splash via a proxy. +``HTTP_PROXY`` is always used for Splash. + + Captcha support --------------- diff --git a/autologin/middleware.py b/autologin/middleware.py index 88cf0d0..22995e0 100644 --- a/autologin/middleware.py +++ b/autologin/middleware.py @@ -34,8 +34,11 @@ def get_cookiejar(response): return obj -class ProxyFromSettingsMiddleware(HttpProxyMiddleware): - """A middleware that sets proxy from settings file""" +class ProxyMiddleware(HttpProxyMiddleware): + """ A middleware that sets proxy from settings for non-splash requests, + and passes proxy splash args for splash requests. + This middleware must be placed **before** splash middleware. 
+ """ @classmethod def from_crawler(cls, crawler): @@ -43,12 +46,21 @@ def from_crawler(cls, crawler): def __init__(self, settings): self.proxies = {} + self.auth_encoding = settings.get('HTTPPROXY_AUTH_ENCODING') proxies = [ ('http', settings.get('HTTP_PROXY')), ('https', settings.get('HTTPS_PROXY')), ] + self.splash_proxy = settings.get('HTTP_PROXY') for type_, url in proxies: if url: self.proxies[type_] = self._get_proxy(url, type_) if not self.proxies: raise NotConfigured + + def process_request(self, request, spider): + if 'splash' in request.meta: + if self.splash_proxy: + request.meta['splash'].setdefault('args', {})['proxy'] = self.splash_proxy + else: + super(ProxyMiddleware, self).process_request(request, spider) diff --git a/autologin/spiders.py b/autologin/spiders.py index 0fdf03d..b4146d8 100644 --- a/autologin/spiders.py +++ b/autologin/spiders.py @@ -48,7 +48,8 @@ USER_AGENT = USER_AGENT, DOWNLOADER_MIDDLEWARES = { 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None, - 'autologin.middleware.ProxyFromSettingsMiddleware': 750, + # Placed before splash middleware + 'autologin.middleware.ProxyMiddleware': 720, }, ))