diff --git a/scrapyjs/middleware.py b/scrapyjs/middleware.py
index c51c1d3..d418c72 100644
--- a/scrapyjs/middleware.py
+++ b/scrapyjs/middleware.py
@@ -32,6 +32,14 @@ def __init__(self, crawler, splash_base_url, slot_policy):
         self.splash_base_url = splash_base_url
         self.slot_policy = slot_policy

+    def get_splash_options(self, request, spider):
+        if request.meta.get("dont_proxy"):
+            return
+
+        spider_options = getattr(spider, "splash", {})
+        request_options = request.meta.get("splash")
+        return request_options or spider_options
+
     @classmethod
     def from_crawler(cls, crawler):
         splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url)
@@ -43,24 +51,26 @@ def from_crawler(cls, crawler):
         return cls(crawler, splash_base_url, slot_policy)

     def process_request(self, request, spider):
-        splash_options = request.meta.get('splash')
+        splash_options = self.get_splash_options(request, spider)
         if not splash_options:
             return
+        elif request.meta.get("_splash_processed"):
+            return
+

         if request.method != 'GET':
             log.msg("Currently only GET requests are supported by SplashMiddleware; %s "
                     "will be handled without Splash" % request, logging.WARNING)
             return request

         meta = request.meta
-        del meta['splash']
-        meta['_splash_processed'] = splash_options

         slot_policy = splash_options.get('slot_policy', self.slot_policy)
         self._set_download_slot(request, meta, slot_policy)

         args = splash_options.setdefault('args', {})
-        args.setdefault('url', request.url)
+        args['url'] = request.url
+
         body = json.dumps(args, ensure_ascii=False)

         if 'timeout' in args:
@@ -86,6 +96,7 @@ def process_request(self, request, spider):
         endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
         splash_base_url = splash_options.get('splash_url', self.splash_base_url)
         splash_url = urljoin(splash_base_url, endpoint)
+        meta['_splash_processed'] = True

         req_rep = request.replace(
             url=splash_url,
@@ -96,18 +107,16 @@ def process_request(self, request, spider):
             # are not respected.
             headers=Headers({'Content-Type': 'application/json'}),
         )
-
         self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
         return req_rep

     def process_response(self, request, response, spider):
-        splash_options = request.meta.get("_splash_processed")
+        splash_options = self.get_splash_options(request, spider)
         if splash_options:
             endpoint = splash_options['endpoint']
             self.crawler.stats.inc_value(
                 'splash/%s/response_count/%s' % (endpoint, response.status)
             )
-
         return response

     def _set_download_slot(self, request, meta, slot_policy):
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index aa84efd..363b876 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -153,3 +153,25 @@ def test_adjust_timeout():
     })
     req2 = mw.process_request(req2, None)
     assert req2.meta['download_timeout'] == 30
+
+
+def test_spider_attribute():
+    req_url = "http://scrapy.org"
+    req1 = scrapy.Request(req_url)
+
+    spider = scrapy.Spider("example")
+    spider.splash = {"args": {"images": 0}}
+
+    mw = _get_mw()
+    req1 = mw.process_request(req1, spider)
+    assert "_splash_processed" in req1.meta
+    assert "render.json" in req1.url
+    assert "url" in json.loads(req1.body)
+    assert json.loads(req1.body).get("url") == req_url
+    assert "images" in json.loads(req1.body)
+    assert req1.method == 'POST'
+
+    # a blank spider 'splash' attribute disables the middleware
+    spider.splash = {}
+    req2 = mw.process_request(req1, spider)
+    assert req2 is None
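
Usage sketch (not part of the patch): with this change, Splash options can come either from request.meta['splash'] or from a spider-level "splash" attribute, and get_splash_options() gives per-request options precedence via "request_options or spider_options", so per-request tweaks never require clearing the spider attribute; a truthy "dont_proxy" key in meta skips Splash entirely. A minimal spider illustrating the three cases follows. The spider name, URLs, and args values are illustrative assumptions, and the middleware is assumed to be enabled in the project settings (SPLASH_URL plus a DOWNLOADER_MIDDLEWARES entry for scrapyjs.SplashMiddleware) as described in the scrapyjs README.

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["http://scrapy.org"]

        # Spider-level defaults: get_splash_options() falls back to this
        # attribute for every request that has no meta['splash'] of its own.
        splash = {"args": {"images": 0}}

        def parse(self, response):
            # Per-request options override the spider attribute.
            yield scrapy.Request(
                "http://scrapy.org/download/",
                meta={"splash": {"args": {"images": 1, "wait": 0.5}}},
                callback=self.parse_other,
            )
            # dont_proxy makes get_splash_options() return None,
            # so this request bypasses Splash entirely.
            yield scrapy.Request(
                "http://scrapy.org/community/",
                meta={"dont_proxy": True},
                callback=self.parse_other,
            )

        def parse_other(self, response):
            pass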