Adding support for shared cookies for requests having the same crawlid #209

Status: Open
Wants to merge 5 commits into base: dev
193 changes: 193 additions & 0 deletions crawler/crawling/distributed_cookies.py
@@ -0,0 +1,193 @@
from __future__ import print_function
import sys
import redis
import tldextract
import jsonpickle

from scrapy import Item
from scrapy.exceptions import NotConfigured
from redis.exceptions import ConnectionError
from scrapy.http.cookies import CookieJar
from scrapy.downloadermiddlewares.cookies import CookiesMiddleware

from scutils.log_factory import LogFactory


class DistributedCookiesMiddleware(CookiesMiddleware):
'''
Enables sharing a single cookiejar across all requests that have the same crawlid
'''

def __init__(self, settings):
super(DistributedCookiesMiddleware, self).__init__()
self.setup(settings)

@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COOKIES_ENABLED'):
raise NotConfigured
return cls(crawler.settings)

def setup(self, settings):
'''
Does the actual setup of the middleware
'''
self.extract = tldextract.TLDExtract()
self.debug = settings.getbool('COOKIES_DEBUG')

# set up the default sc logger
my_level = settings.get('SC_LOG_LEVEL', 'INFO')
my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
my_output = settings.get('SC_LOG_STDOUT', True)
my_json = settings.get('SC_LOG_JSON', False)
my_dir = settings.get('SC_LOG_DIR', 'logs')
my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
my_file = settings.get('SC_LOG_FILE', 'main.log')
my_backups = settings.get('SC_LOG_BACKUPS', 5)

self.distributed_cookies_timeout = settings.get('DISTRIBUTED_COOKIES_TIMEOUT', None)

self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir,
file=my_file, bytes=my_bytes, backups=my_backups)

# set up redis
self.redis_conn = redis.Redis(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], db=settings['REDIS_DB'])
try:
self.redis_conn.info()
except ConnectionError:
print("Could not connect to Redis")
# plugin is essential to functionality
sys.exit(1)

def process_request(self, request, spider):
if request.meta.get('dont_merge_cookies', False):
return

jar = self._get_cookiejar(request, spider)
cookies = self._get_request_cookies(jar, request)
for cookie in cookies:
jar.set_cookie_if_ok(cookie, request)

# set Cookie header
request.headers.pop('Cookie', None)
jar.add_cookie_header(request)
self._update_cookiejar(request, jar, spider)
self._debug_cookie(request, spider)

def process_response(self, request, response, spider):
if request.meta.get('dont_merge_cookies', False):
return response

# extract cookies from Set-Cookie and drop invalid/expired cookies
jar = self._get_cookiejar(request, spider)
jar.extract_cookies(response, request)
self._update_cookiejar(request, jar, spider)
self._debug_set_cookie(response, spider)

return response

def _get_cookiejar(self, request, spider):
'''
Retrieve the crawl job's cookiejar from Redis
'''
key = self._get_key(request, spider)
# redis-py may return keys as bytes (Python 3), so normalize them before comparing
self.cookiejar_keys = [k.decode('utf-8') if isinstance(k, bytes) else k for k in self.redis_conn.keys(key)]
if key not in self.cookiejar_keys:
return CookieJar()

encoded_cookiejar = self.redis_conn.get(key)

# Load and return the scrapy.http.cookies.CookieJar object decoded from JSON
return jsonpickle.loads(encoded_cookiejar)

def _update_cookiejar(self, request, cookiejar, spider):
'''
Update the crawl job's cookiejar in Redis
'''
key = self._get_key(request, spider)
encoded_cookiejar = jsonpickle.dumps(cookiejar)

# Store the cookiejar in Redis, with an expiration time if one is configured
if self.distributed_cookies_timeout:
self.redis_conn.psetex(key, self.distributed_cookies_timeout, encoded_cookiejar)
else:
self.redis_conn.set(key, encoded_cookiejar)

def _get_key(self, request, spider):
'''
Build the Redis key for the current crawl job's cookiejar
'''
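# e.g. "link:istresearch.com:abc123:cookiejar" (illustrative spider name, domain and crawlid)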
crawlid = request.meta['crawlid']
ex_res = self.extract(request.url)
domain = ex_res.domain
suffix = ex_res.suffix
key = "{sname}:{dom}.{suf}:{cid}:cookiejar".format(sname=spider.name, dom=domain, suf=suffix, cid=crawlid)
return key


class ClearCookiesMiddleware(object):
'''
Clears the cookies of a crawl job once that job yields an item
'''
def __init__(self, settings):
self.setup(settings)

def setup(self, settings):
'''
Does the actual setup of the middleware
'''
self.extract = tldextract.TLDExtract()

# set up the default sc logger
my_level = settings.get('SC_LOG_LEVEL', 'INFO')
my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
my_output = settings.get('SC_LOG_STDOUT', True)
my_json = settings.get('SC_LOG_JSON', False)
my_dir = settings.get('SC_LOG_DIR', 'logs')
my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
my_file = settings.get('SC_LOG_FILE', 'main.log')
my_backups = settings.get('SC_LOG_BACKUPS', 5)

self.logger = LogFactory.get_instance(json=my_json, name=my_name, stdout=my_output, level=my_level, dir=my_dir,
file=my_file, bytes=my_bytes, backups=my_backups)

# set up redis
self.redis_conn = redis.Redis(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], db=settings['REDIS_DB'])
try:
self.redis_conn.info()
except ConnectionError:
print("Could not connect to Redis")
# plugin is essential to functionality
sys.exit(1)

@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_spider_output(self, response, result, spider):
'''
When an item is yielded, the cookiejar of its crawl job is cleared
'''
self.logger.debug("processing clean cookies middleware")
for x in result:
# only operate on items
if isinstance(x, Item):
self.logger.debug("found item")
key = self._get_key(x, spider)
self.logger.debug("found key : {}".format(key))

# Delete the cookiejar of the current crawl which yield the final item
self.redis_conn.delete(key)
self.logger.debug("deleted key : {}".format(key))
yield x

def _get_key(self, item, spider):
'''
Build the Redis key for the current crawl job's cookiejar
'''
crawlid = item.get('crawlid')
ex_res = self.extract(item.get('url'))
domain = ex_res.domain
suffix = ex_res.suffix
key = "{sname}:{dom}.{suf}:{cid}:cookiejar".format(sname=spider.name, dom=domain, suf=suffix, cid=crawlid)
return key
14 changes: 13 additions & 1 deletion crawler/crawling/settings.py
@@ -150,7 +150,10 @@
# depth management per crawl request
'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
'crawling.meta_passthrough_middleware.MetaPassthroughMiddleware': 100,
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101,
# delete the cookies in Redis that share the crawlid of the yielded item
# it is used together with distributed_cookies.DistributedCookiesMiddleware
# 'crawling.distributed_cookies.ClearCookiesMiddleware': 102
}

DOWNLOADER_MIDDLEWARES = {
@@ -162,8 +165,17 @@
# custom cookies to not persist across crawl requests
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
'crawling.custom_cookies.CustomCookiesMiddleware': 700,
# comment out custom_cookies and uncomment the line below to share cookies across a crawl job
# (works together with distributed_cookies.ClearCookiesMiddleware in the spider middlewares)
# 'crawling.distributed_cookies.DistributedCookiesMiddleware': 700,
}

# To use this, you must enable the crawling.distributed_cookies.DistributedCookiesMiddleware downloader middleware.
# The key and value (containing the cookies of a crawl job) are deleted after the
# timeout in milliseconds (the timeout is refreshed on each request and each returned response).
# If this setting is None, the cookies stay in Redis indefinitely.
# DISTRIBUTED_COOKIES_TIMEOUT = 1 * 1000 * 60 * 5 # 5 minutes

# Disable the built in logging in production
LOG_ENABLED = False

1 change: 1 addition & 0 deletions crawler/requirements.txt
@@ -8,6 +8,7 @@ funcsigs==1.0.2
future==0.16.0
idna==2.6
ipaddress==1.0.22 # Updated from 1.0.18
jsonpickle==1.0
kafka-python==1.4.3 # Updated from 1.3.4
kazoo==2.4.0
lxml==4.2.1 # Updated from 3.8.0
88 changes: 88 additions & 0 deletions crawler/tests/test_distributed_cookies.py
@@ -0,0 +1,88 @@
from unittest import TestCase
import mock
from mock import MagicMock
import tldextract
import jsonpickle
from crawling.distributed_cookies import DistributedCookiesMiddleware, ClearCookiesMiddleware
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy import Item, Field


class TestDistributedCookiesMiddleware(TestCase):

@mock.patch('crawling.distributed_cookies.DistributedCookiesMiddleware.setup')
def setUp(self, s):
self.dcm = DistributedCookiesMiddleware(MagicMock())
self.dcm.debug = False
self.dcm.logger = MagicMock()
self.dcm.logger.debug = MagicMock()
self.dcm.redis_conn = MagicMock()
self.dcm.redis_conn.keys = MagicMock(return_value=[])
self.dcm.redis_conn.get = MagicMock(return_value=None)
self.dcm.redis_conn.psetex = MagicMock()
self.dcm.redis_conn.set = MagicMock()
self.dcm.distributed_cookies_timeout = None
self.dcm.extract = tldextract.TLDExtract()

@mock.patch('crawling.distributed_cookies.DistributedCookiesMiddleware._update_cookiejar')
def _update_cookiejar(self, request, cookiejar, spider):
encoded_cookiejar = jsonpickle.dumps(cookiejar)
self.dcm.redis_conn.keys = MagicMock(return_value=[self.dcm._get_key(request, spider)])
self.dcm.redis_conn.get = MagicMock(return_value=encoded_cookiejar)
self.dcm._update_cookiejar(request, cookiejar, spider)

def test_dcm_middleware(self):
spider = Spider('foo')
request = Request('http://istresearch.com')
request.meta['crawlid'] = 'abc123'
assert self.dcm.process_request(request, spider) is None
assert 'Cookie' not in request.headers

self.dcm.distributed_cookies_timeout = 1000 # for testing all the lines in _update_cookiejar

headers = {'Set-Cookie': 'C1=value1; path=/'}
response = Response('http://istresearch.com', headers=headers)
assert self.dcm.process_response(request, response, spider) is response

request2 = Request('http://istresearch.com/sub1/')
request2.meta['crawlid'] = 'abc123'
assert self.dcm.process_request(request2, spider) is None
# self.assertEqual(request2.headers.get('Cookie'), 'C1=value1')


class TestClearCookiesMiddleware(TestCase):
@mock.patch('crawling.distributed_cookies.ClearCookiesMiddleware.setup')
def setUp(self, s):
self.ccm = ClearCookiesMiddleware(MagicMock())
self.ccm.logger = MagicMock()
self.ccm.logger.debug = MagicMock()
self.ccm.extract = tldextract.TLDExtract()
self.ccm.redis_conn = MagicMock()
self.ccm.redis_conn.keys = MagicMock(return_value=['foo:istresearch.com:abc123:cookiejar'])
self.ccm.redis_conn.delete = MagicMock()

def test_ccm_middleware(self):
class TestItem(Item):
crawlid = Field()
url = Field()

spider = Spider('foo')
response = MagicMock()

a = TestItem()
a['crawlid'] = 'abc123'
a['url'] = 'http://istresearch.com'

test_list = [
{},
a,
Request('http://istresearch.com')
]
yield_count = 0
for item in self.ccm.process_spider_output(response, test_list, spider):
if isinstance(item, Item):
self.assertEqual(a.get('crawlid'), item.get('crawlid'))
self.assertEqual(a.get('url'), item.get('url'))
yield_count += 1
self.assertEqual(yield_count, 3)
13 changes: 12 additions & 1 deletion docker/crawler/settings.py
@@ -157,7 +157,10 @@ def str2bool(v):
# depth management per crawl request
'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
'crawling.meta_passthrough_middleware.MetaPassthroughMiddleware': 100,
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101
'crawling.redis_stats_middleware.RedisStatsMiddleware': 101,
# delete the cookies in Redis that share the crawlid of the yielded item
# it is used together with distributed_cookies.DistributedCookiesMiddleware
# 'crawling.distributed_cookies.ClearCookiesMiddleware': 102
}

DOWNLOADER_MIDDLEWARES = {
@@ -169,8 +172,16 @@
# custom cookies to not persist across crawl requests
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
'crawling.custom_cookies.CustomCookiesMiddleware': 700,
# comment out custom_cookies and uncomment the line below to share cookies across a crawl job
# 'crawling.distributed_cookies.DistributedCookiesMiddleware': 700,
}

# To use this, you must enable the crawling.distributed_cookies.DistributedCookiesMiddleware downloader middleware.
# The key and value (containing the cookies of a crawl job) are deleted after the
# timeout in milliseconds (the timeout is refreshed on each request and each returned response).
# If this setting is None, the cookies stay in Redis indefinitely.
# DISTRIBUTED_COOKIES_TIMEOUT = 1 * 1000 * 60 * 5 # 5 minutes

# Disable the built in logging in production
LOG_ENABLED = str2bool(os.getenv('LOG_ENABLED', False))

7 changes: 7 additions & 0 deletions docs/topics/crawler/design.rst
@@ -60,6 +60,13 @@ custom_cookies.py

Enables long lived spiders to not cache the cookies received in the Spider cookie jar, yet pass cookies in all Requests. This prevents the spiders from caching response cookies and making subsequent requests with those cookies for a completely different crawl job.

distributed\_cookies.py
^^^^^^^^^^^^^^^^^^^^^^^

Caches the cookies of each crawl job in Redis so that they are shared by every request with the same ``crawlid``.
It also clears those cookies once an item with that ``crawlid`` is yielded.
Use either this middleware or custom_cookies, not both.
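
A minimal sketch of how the middlewares could be enabled, following the commented examples added to ``settings.py`` (only the relevant entries are shown; the priorities and the timeout value are the ones suggested there)::

    DOWNLOADER_MIDDLEWARES = {
        # disable the per-request cookie handling
        'crawling.custom_cookies.CustomCookiesMiddleware': None,
        # share one Redis-backed cookiejar per crawlid
        'crawling.distributed_cookies.DistributedCookiesMiddleware': 700,
    }

    SPIDER_MIDDLEWARES = {
        # drop the shared cookiejar once an item with that crawlid is yielded
        'crawling.distributed_cookies.ClearCookiesMiddleware': 102,
    }

    # optional: expire idle cookiejars after 5 minutes (the value is in milliseconds)
    DISTRIBUTED_COOKIES_TIMEOUT = 1 * 1000 * 60 * 5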

distributed\_scheduler.py
^^^^^^^^^^^^^^^^^^^^^^^^^

1 change: 1 addition & 0 deletions requirements.txt
@@ -22,6 +22,7 @@ future==0.16.0
idna==2.6
ipaddress==1.0.22 # Updated from 1.0.18
itsdangerous==0.24
jsonpickle==1.0
jsonschema==2.6.0
kafka-python==1.4.3 # Updated from 1.3.4
kazoo==2.4.0