Skip to content

Commit

Permalink
Merge pull request TeamHG-Memex#28 from TeamHG-Memex/fix-proxy
Browse files Browse the repository at this point in the history
Fix proxy support: authorization and Splash support
  • Loading branch information
lopuhin authored Jun 15, 2016
2 parents 2a7f981 + acb2962 commit 0a6814e
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 3 deletions.
13 changes: 13 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,19 @@ Response is JSON with a ``status`` field with the following possible values:
``cookies`` field, in ``Cookie.__dict__`` format.


Proxy support
-------------

Proxies can be specified via ``HTTP_PROXY`` and ``HTTPS_PROXY`` keys
in ``settings`` argument. Username and password can be specified
as part of the proxy url (the format is ``protocol://username:password@url``).

If you are using a proxy with Splash, it is assumed that
you want Splash to make its requests via the given proxy,
rather than making the request **to** Splash via the proxy.
``HTTP_PROXY`` is always used for Splash.


Captcha support
---------------

Expand Down
16 changes: 14 additions & 2 deletions autologin/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,33 @@ def get_cookiejar(response):
return obj


class ProxyFromSettingsMiddleware(HttpProxyMiddleware):
"""A middleware that sets proxy from settings file"""
class ProxyMiddleware(HttpProxyMiddleware):
    """Set the proxy from settings, treating Splash requests specially.

    For non-splash requests the proxy handling of the parent class is
    used, with the proxies taken from the ``HTTP_PROXY`` and
    ``HTTPS_PROXY`` settings. For splash requests (those with a
    ``splash`` key in ``request.meta``) the ``HTTP_PROXY`` value is
    passed as the ``proxy`` splash argument instead.

    This middleware must be placed **before** splash middleware.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def __init__(self, settings):
        """Read proxy configuration from ``settings``.

        Raises ``NotConfigured`` when neither ``HTTP_PROXY`` nor
        ``HTTPS_PROXY`` is set to a non-empty value.
        """
        self.proxies = {}
        self.auth_encoding = settings.get('HTTPPROXY_AUTH_ENCODING')
        proxies = [
            ('http', settings.get('HTTP_PROXY')),
            ('https', settings.get('HTTPS_PROXY')),
        ]
        # Splash always receives the HTTP proxy, whatever the target scheme.
        self.splash_proxy = settings.get('HTTP_PROXY')
        for type_, url in proxies:
            if url:
                # NOTE(review): _get_proxy is not defined in this class;
                # presumably inherited from HttpProxyMiddleware - confirm.
                self.proxies[type_] = self._get_proxy(url, type_)
        if not self.proxies:
            raise NotConfigured

    def process_request(self, request, spider):
        """Attach the proxy to ``request``.

        Splash requests get the proxy as a splash argument; all other
        requests are delegated to the parent implementation.
        """
        if 'splash' in request.meta:
            if self.splash_proxy:
                # 'args' may be absent from the splash options at this
                # point, so create it rather than raising KeyError.
                splash_args = request.meta['splash'].setdefault('args', {})
                splash_args['proxy'] = self.splash_proxy
        else:
            return super(ProxyMiddleware, self).process_request(
                request, spider)
3 changes: 2 additions & 1 deletion autologin/spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
USER_AGENT = USER_AGENT,
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
'autologin.middleware.ProxyFromSettingsMiddleware': 750,
# Placed before splash middleware
'autologin.middleware.ProxyMiddleware': 720,
},
))

Expand Down

0 comments on commit 0a6814e

Please sign in to comment.