Adding support for shared cookies for requests having the same crawlid #209

Status: Open
Wants to merge 5 commits into base: dev
193 changes: 193 additions & 0 deletions crawler/crawling/distributed_cookies.py
@@ -0,0 +1,193 @@
from __future__ import print_function
import sys
import redis
import tldextract
import jsonpickle

from scrapy import Item
from scrapy.exceptions import NotConfigured
from redis.exceptions import ConnectionError
from scrapy.http.cookies import CookieJar
from scrapy.downloadermiddlewares.cookies import CookiesMiddleware

from scutils.log_factory import LogFactory


class DistributedCookiesMiddleware(CookiesMiddleware):
'''
Enables sharing a single cookiejar across all requests that have the same crawlid
'''

def __init__(self, settings):
super(DistributedCookiesMiddleware, self).__init__()
self.setup(settings)

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COOKIES_ENABLED'):
raise NotConfigured
return cls(crawler.settings)

def setup(self, settings):
'''
Does the actual setup of the middleware
'''
self.extract = tldextract.TLDExtract()
self.debug = settings.getbool('COOKIES_DEBUG')

# set up the default sc logger
my_level = settings.get('SC_LOG_LEVEL', 'INFO')
my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
my_output = settings.get('SC_LOG_STDOUT', True)
my_json = settings.get('SC_LOG_JSON', False)
my_dir = settings.get('SC_LOG_DIR', 'logs')
my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
my_file = settings.get('SC_LOG_FILE', 'main.log')
my_backups = settings.get('SC_LOG_BACKUPS', 5)

self.distributed_cookies_timeout = settings.get('DISTRIBUTED_COOKIES_TIMEOUT', None)

self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir,
file=my_file, bytes=my_bytes, backups=my_backups)

# set up redis
self.redis_conn = redis.Redis(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], db=settings['REDIS_DB'])
try:
self.redis_conn.info()
except ConnectionError:
print("Could not connect to Redis")
# plugin is essential to functionality
sys.exit(1)

def process_request(self, request, spider):
if request.meta.get('dont_merge_cookies', False):
return

jar = self._get_cookiejar(request, spider)
cookies = self._get_request_cookies(jar, request)
for cookie in cookies:
jar.set_cookie_if_ok(cookie, request)

# set Cookie header
request.headers.pop('Cookie', None)
jar.add_cookie_header(request)
self._update_cookiejar(request, jar, spider)
self._debug_cookie(request, spider)

def process_response(self, request, response, spider):
if request.meta.get('dont_merge_cookies', False):
return response

# extract cookies from Set-Cookie and drop invalid/expired cookies
jar = self._get_cookiejar(request, spider)
jar.extract_cookies(response, request)
self._update_cookiejar(request, jar, spider)
self._debug_set_cookie(response, spider)

return response

def _get_cookiejar(self, request, spider):
'''
Retrieve the crawl job's cookiejar from Redis
'''
key = self._get_key(request, spider)
# redis-py may return keys as bytes (Python 3), so normalize them before comparing
self.cookiejar_keys = [k.decode('utf-8') if isinstance(k, bytes) else k for k in self.redis_conn.keys(key)]
if key not in self.cookiejar_keys:
return CookieJar()

encoded_cookiejar = self.redis_conn.get(key)

# Load and return the scrapy.http.cookies.CookieJar object decoded from JSON
return jsonpickle.loads(encoded_cookiejar)

def _update_cookiejar(self, request, cookiejar, spider):
'''
Update the crawl job's cookiejar in Redis
'''
key = self._get_key(request, spider)
encoded_cookiejar = jsonpickle.dumps(cookiejar)

# Store the cookiejar in Redis, with an expiration time if one is configured
if self.distributed_cookies_timeout:
self.redis_conn.psetex(key, self.distributed_cookies_timeout, encoded_cookiejar)
else:
self.redis_conn.set(key, encoded_cookiejar)

def _get_key(self, request, spider):
'''
Build the Redis key for the current crawl job's cookiejar
'''
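# e.g. "link:istresearch.com:abc123:cookiejar" (illustrative spider name, domain and crawlid)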
crawlid = request.meta['crawlid']
ex_res = self.extract(request.url)
domain = ex_res.domain
suffix = ex_res.suffix
key = "{sname}:{dom}.{suf}:{cid}:cookiejar".format(sname=spider.name, dom=domain, suf=suffix, cid=crawlid)
return key


class ClearCookiesMiddleware(object):
'''
Clears the cookies of a crawl job once that job yields an item
'''
def __init__(self, settings):
self.setup(settings)

def setup(self, settings):
'''
Does the actual setup of the middleware
'''
self.extract = tldextract.TLDExtract()

# set up the default sc logger
my_level = settings.get('SC_LOG_LEVEL', 'INFO')
my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
my_output = settings.get('SC_LOG_STDOUT', True)
my_json = settings.get('SC_LOG_JSON', False)
my_dir = settings.get('SC_LOG_DIR', 'logs')
my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
my_file = settings.get('SC_LOG_FILE', 'main.log')
my_backups = settings.get('SC_LOG_BACKUPS', 5)

self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir,
file=my_file, bytes=my_bytes, backups=my_backups)

# set up redis
self.redis_conn = redis.Redis(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], db=settings['REDIS_DB'])
try:
self.redis_conn.info()
except ConnectionError:
print("Could not connect to Redis")
# plugin is essential to functionality
sys.exit(1)

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_spider_output(self, response, result, spider):
'''
When an item is yielded, the cookiejar of its crawl job is cleared
'''
self.logger.debug("processing clean cookies middleware")
for x in result:
# only operate on items
if isinstance(x, Item):
self.logger.debug("found item")
key = self._get_key(x, spider)
self.logger.debug("found key : {}".format(key))

# Delete the cookiejar of the current crawl which yield the final item
self.redis_conn.delete(key)
self.logger.debug("deleted key : {}".format(key))
yield x

def _get_key(self, item, spider):
'''
Build the Redis key for the current crawl job's cookiejar
'''
crawlid = item.get('crawlid')
ex_res = self.extract(item.get('url'))
domain = ex_res.domain
suffix = ex_res.suffix
key = "{sname}:{dom}.{suf}:{cid}:cookiejar".format(sname=spider.name, dom=domain, suf=suffix, cid=crawlid)
return key
14 changes: 13 additions & 1 deletion crawler/crawling/settings.py
@@ -150,7 +150,10 @@
# depth management per crawl request
'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
'crawling.meta_passthrough_middleware.MetaPassthroughMiddleware': 100,
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101,
# delete the cookies in Redis that share the crawlid of the yielded item
# it is used together with distributed_cookies.DistributedCookiesMiddleware
# 'crawling.distributed_cookies.ClearCookiesMiddleware': 102
}

DOWNLOADER_MIDDLEWARES = {
@@ -162,8 +165,17 @@
# custom cookies to not persist across crawl requests
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
'crawling.custom_cookies.CustomCookiesMiddleware': 700,
# comment out custom_cookies and uncomment the line below to share cookies across a crawl job
# (works together with distributed_cookies.ClearCookiesMiddleware in the spider middlewares)
# 'crawling.distributed_cookies.DistributedCookiesMiddleware': 700,
}

# To use this, you must enable the crawling.distributed_cookies.DistributedCookiesMiddleware downloader middleware.
# The key and value (containing the cookies of a crawl job) are deleted after the
# timeout in milliseconds (the timeout is refreshed on each request and each returned response).
# If this setting is None, the cookies stay in Redis indefinitely.
# DISTRIBUTED_COOKIES_TIMEOUT = 1 * 1000 * 60 * 5 # 5 minutes

# Disable the built in logging in production
LOG_ENABLED = False

1 change: 1 addition & 0 deletions crawler/requirements.txt
@@ -8,6 +8,7 @@ funcsigs==1.0.2
future==0.16.0
idna==2.6
ipaddress==1.0.22 # Updated from 1.0.18
jsonpickle==1.0
kafka-python==1.4.3 # Updated from 1.3.4
kazoo==2.4.0
lxml==4.2.1 # Updated from 3.8.0
88 changes: 88 additions & 0 deletions crawler/tests/test_distributed_cookies.py
@@ -0,0 +1,88 @@
from unittest import TestCase
import mock
from mock import MagicMock
import tldextract
import jsonpickle
from crawling.distributed_cookies import DistributedCookiesMiddleware, ClearCookiesMiddleware
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy import Item, Field


class TestDistributedCookiesMiddleware(TestCase):

@mock.patch('crawling.distributed_cookies.DistributedCookiesMiddleware.setup')
def setUp(self, s):
self.dcm = DistributedCookiesMiddleware(MagicMock())
self.dcm.debug = False
self.dcm.logger = MagicMock()
self.dcm.logger.debug = MagicMock()
self.dcm.redis_conn = MagicMock()
self.dcm.redis_conn.keys = MagicMock(return_value=[])
self.dcm.redis_conn.get = MagicMock(return_value=None)
self.dcm.redis_conn.psetex = MagicMock()
self.dcm.redis_conn.set = MagicMock()
self.dcm.distributed_cookies_timeout = None
self.dcm.extract = tldextract.TLDExtract()

@mock.patch('crawling.distributed_cookies.DistributedCookiesMiddleware._update_cookiejar')
def _update_cookiejar(self, request, cookiejar, spider):
encoded_cookiejar = jsonpickle.dumps(cookiejar)
self.dcm.redis_conn.keys = MagicMock(return_value=[self.dcm._get_key(request, spider)])
self.dcm.redis_conn.get = MagicMock(return_value=encoded_cookiejar)
self.dcm._update_cookiejar(request, cookiejar, spider)

def test_dcm_middleware(self):
spider = Spider('foo')
request = Request('http://istresearch.com')
request.meta['crawlid'] = 'abc123'
assert self.dcm.process_request(request, spider) is None
assert 'Cookie' not in request.headers

self.dcm.distributed_cookies_timeout = 1000 # for testing all the lines in _update_cookiejar

headers = {'Set-Cookie': 'C1=value1; path=/'}
response = Response('http://istresearch.com', headers=headers)
assert self.dcm.process_response(request, response, spider) is response

request2 = Request('http://istresearch.com/sub1/')
request2.meta['crawlid'] = 'abc123'
assert self.dcm.process_request(request2, spider) is None
# self.assertEqual(request2.headers.get('Cookie'), 'C1=value1')


class TestClearCookiesMiddleware(TestCase):
@mock.patch('crawling.distributed_cookies.ClearCookiesMiddleware.setup')
def setUp(self, s):
self.ccm = ClearCookiesMiddleware(MagicMock())
self.ccm.logger = MagicMock()
self.ccm.logger.debug = MagicMock()
self.ccm.extract = tldextract.TLDExtract()
self.ccm.redis_conn = MagicMock()
self.ccm.redis_conn.keys = MagicMock(return_value=['foo:istresearch.com:abc123:cookiejar'])
self.ccm.redis_conn.delete = MagicMock()

def test_ccm_middleware(self):
class TestItem(Item):
crawlid = Field()
url = Field()

spider = Spider('foo')
response = MagicMock()

a = TestItem()
a['crawlid'] = 'abc123'
a['url'] = 'http://istresearch.com'

test_list = [
{},
a,
Request('http://istresearch.com')
]
yield_count = 0
for item in self.ccm.process_spider_output(response, test_list, spider):
if isinstance(item, Item):
self.assertEqual(a.get('crawlid'), item.get('crawlid'))
self.assertEqual(a.get('url'), item.get('url'))
yield_count += 1
self.assertEqual(yield_count, 3)
13 changes: 12 additions & 1 deletion docker/crawler/settings.py
@@ -157,7 +157,10 @@ def str2bool(v):
# depth management per crawl request
'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
'crawling.meta_passthrough_middleware.MetaPassthroughMiddleware': 100,
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101,
# delete the cookies in Redis that share the crawlid of the yielded item
# it is used together with distributed_cookies.DistributedCookiesMiddleware
# 'crawling.distributed_cookies.ClearCookiesMiddleware': 102
}

DOWNLOADER_MIDDLEWARES = {
@@ -169,8 +172,16 @@
# custom cookies to not persist across crawl requests
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
'crawling.custom_cookies.CustomCookiesMiddleware': 700,
# comment out custom_cookies and uncomment the line below to share cookies across a crawl job
# 'crawling.distributed_cookies.DistributedCookiesMiddleware': 700,
}

# To use this, you must enable the crawling.distributed_cookies.DistributedCookiesMiddleware downloader middleware.
# The key and value (containing the cookies of a crawl job) are deleted after the
# timeout in milliseconds (the timeout is refreshed on each request and each returned response).
# If this setting is None, the cookies stay in Redis indefinitely.
# DISTRIBUTED_COOKIES_TIMEOUT = 1 * 1000 * 60 * 5 # 5 minutes

# Disable the built in logging in production
LOG_ENABLED = str2bool(os.getenv('LOG_ENABLED', False))

7 changes: 7 additions & 0 deletions docs/topics/crawler/design.rst
@@ -60,6 +60,13 @@ custom_cookies.py

Enables long lived spiders to not cache the cookies received in the Spider cookie jar, yet pass cookies in all Requests. This prevents the spiders from caching response cookies and making subsequent requests with those cookies for a completely different crawl job.

distributed\_cookies.py
^^^^^^^^^^^^^^^^^^^^^^^

Caches the cookies of each crawl job in Redis so that they are shared by every request with the same ``crawlid``.
It also clears those cookies once an item with that ``crawlid`` is yielded.
Use either this middleware or custom_cookies, not both.
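
A minimal sketch of how the middlewares could be enabled, following the commented examples added to ``settings.py`` (only the relevant entries are shown; the priorities and the timeout value are the ones suggested there)::

    DOWNLOADER_MIDDLEWARES = {
        # disable the per-request cookie handling
        'crawling.custom_cookies.CustomCookiesMiddleware': None,
        # share one Redis-backed cookiejar per crawlid
        'crawling.distributed_cookies.DistributedCookiesMiddleware': 700,
    }

    SPIDER_MIDDLEWARES = {
        # drop the shared cookiejar once an item with that crawlid is yielded
        'crawling.distributed_cookies.ClearCookiesMiddleware': 102,
    }

    # optional: expire idle cookiejars after 5 minutes (the value is in milliseconds)
    DISTRIBUTED_COOKIES_TIMEOUT = 1 * 1000 * 60 * 5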

distributed\_scheduler.py
^^^^^^^^^^^^^^^^^^^^^^^^^

1 change: 1 addition & 0 deletions requirements.txt
@@ -22,6 +22,7 @@ future==0.16.0
idna==2.6
ipaddress==1.0.22 # Updated from 1.0.18
itsdangerous==0.24
jsonpickle==1.0
jsonschema==2.6.0
kafka-python==1.4.3 # Updated from 1.3.4
kazoo==2.4.0