Merge pull request #40 from scrapy-plugins/replace-bsddb3-with-dbm
[core change!] replace bsddb3 with default Python dbm
pawelmhm authored Sep 20, 2021
2 parents 88f2b26 + ad8b7db commit 7ed3e53
Showing 8 changed files with 68 additions and 69 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.5]
+        python-version: [3.5, 3.6, 3.7, 3.8, 3.9]

     steps:
     - uses: actions/checkout@v2
13 changes: 4 additions & 9 deletions README.rst
@@ -9,21 +9,16 @@ scrapy-deltafetch
    :target: https://codecov.io/gh/scrapy-plugins/scrapy-deltafetch

 This is a Scrapy spider middleware to ignore requests
-to pages containing items seen in previous crawls of the same spider,
-thus producing a "delta crawl" containing only new items.
+to pages seen in previous crawls of the same spider,
+thus producing a "delta crawl" containing only new requests.

 This also speeds up the crawl, by reducing the number of requests that need
 to be crawled, and processed (typically, item requests are the most CPU
 intensive).

 Requirements
 ============

-DeltaFetch middleware depends on Python's bsddb3_ package.
-
-On Ubuntu/Debian, you may need to install ``libdb-dev`` if it's not installed already.
+DeltaFetch middleware uses Python's dbm_ package to store request fingerprints.

-.. _bsddb3: https://pypi.python.org/pypi/bsddb3
+.. _dbm: https://docs.python.org/3/library/dbm.html


 Installation
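As a concrete illustration of the new README wording, here is a minimal sketch of the kind of dbm-backed "seen before?" lookup the middleware performs. The path, key, and value below are made up for the example; the real key is a request fingerprint, and the real database path is derived from the spider name:

    import dbm
    import os
    import tempfile

    # Illustrative path; the middleware builds its own '<spider name>.db' path.
    path = os.path.join(tempfile.gettempdir(), 'df_readme_demo')
    db = dbm.open(path, 'c')  # 'c': create the database if it does not exist

    fingerprint = b'a1b2c3'  # stand-in for a real request fingerprint
    if fingerprint in db:
        print('seen in a previous crawl; the request would be skipped')
    else:
        db[fingerprint] = b'seen'  # dbm stores bytes keys and values
    db.close()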
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,2 +1 @@
 scrapy>=1.1.0
-bsddb3
22 changes: 6 additions & 16 deletions scrapy_deltafetch/middleware.py
@@ -1,6 +1,7 @@
 import logging
 import os
 import time
+import dbm

 from scrapy.http import Request
 from scrapy.item import Item
@@ -26,12 +27,6 @@ class DeltaFetch(object):
"""

def __init__(self, dir, reset=False, stats=None):
dbmodule = None
try:
dbmodule = __import__('bsddb3').db
except ImportError:
raise NotConfigured('bsddb3 is required')
self.dbmodule = dbmodule
self.dir = dir
self.reset = reset
self.stats = stats
@@ -45,29 +40,25 @@ def from_crawler(cls, crawler):
         reset = s.getbool('DELTAFETCH_RESET')
         o = cls(dir, reset, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+        # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         return o

     def spider_opened(self, spider):
         if not os.path.exists(self.dir):
             os.makedirs(self.dir)
+        # TODO may be tricky, as there may be different paths on systems
         dbpath = os.path.join(self.dir, '%s.db' % spider.name)
         reset = self.reset or getattr(spider, 'deltafetch_reset', False)
-        flag = self.dbmodule.DB_TRUNCATE if reset else self.dbmodule.DB_CREATE
+        flag = 'n' if reset else 'c'
         try:
-            self.db = self.dbmodule.DB()
-            self.db.open(filename=dbpath,
-                         dbtype=self.dbmodule.DB_HASH,
-                         flags=flag)
+            self.db = dbm.open(dbpath, flag=flag)
         except Exception:
             logger.warning("Failed to open DeltaFetch database at %s, "
                            "trying to recreate it" % dbpath)
             if os.path.exists(dbpath):
                 os.remove(dbpath)
-            self.db = self.dbmodule.DB()
-            self.db.open(filename=dbpath,
-                         dbtype=self.dbmodule.DB_HASH,
-                         flags=self.dbmodule.DB_CREATE)
+            self.db = dbm.open(dbpath, 'c')

     def spider_closed(self, spider):
         self.db.close()
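For readers unfamiliar with the stdlib dbm flags used above: 'c' opens read-write and creates the database only if it is missing, while 'n' always creates a new, empty database, which is what implements a reset. A small sketch under an illustrative temp path:

    import dbm
    import os
    import tempfile

    path = os.path.join(tempfile.gettempdir(), 'df_flags_demo')

    db = dbm.open(path, 'n')  # 'n': start from a fresh, empty database
    db[b'k'] = b'v'
    db.close()

    db = dbm.open(path, 'c')  # 'c': reopen, keeping existing entries
    assert db.get(b'k') == b'v'
    db.close()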
@@ -90,5 +81,4 @@ def process_spider_output(self, response, result, spider):

     def _get_key(self, request):
         key = request.meta.get('deltafetch_key') or request_fingerprint(request)
-        # request_fingerprint() returns `hashlib.sha1().hexdigest()`, is a string
         return to_bytes(key)
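The to_bytes() call above matters because request_fingerprint() returns a str (a SHA-1 hexdigest), while the keys stored in the dbm database are bytes. A short sketch using the same Scrapy helpers the middleware imports; the URL is arbitrary:

    from scrapy import Request
    from scrapy.utils.python import to_bytes
    from scrapy.utils.request import request_fingerprint

    key = request_fingerprint(Request('https://example.com'))
    assert isinstance(key, str) and len(key) == 40  # sha1 hexdigest
    db_key = to_bytes(key)  # the form actually stored in the dbm database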
8 changes: 5 additions & 3 deletions setup.py
@@ -15,10 +15,12 @@
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
     ],
-    install_requires=['Scrapy>=1.1.0', 'bsddb3']
+    install_requires=['Scrapy>=1.1.0']
 )
31 changes: 31 additions & 0 deletions tests/benchmark.py
@@ -0,0 +1,31 @@
+import tempfile
+
+import mock
+from scrapy import Request, Spider
+from scrapy.statscollectors import StatsCollector
+from scrapy.utils.test import get_crawler
+
+from scrapy_deltafetch import DeltaFetch
+
+
+def benchmark_middleware(result):
+    spider_name = 'df_tests'
+    spider = Spider(spider_name)
+    temp_dir = tempfile.gettempdir()
+    crawler = get_crawler(Spider)
+    stats = StatsCollector(crawler)
+    mw = DeltaFetch(temp_dir, reset=False, stats=stats)
+    mw.spider_opened(spider)
+    response = mock.Mock()
+    response.request = Request('http://url',
+                               meta={'deltafetch_key': 'key'})
+
+    for x in mw.process_spider_output(response, result, spider):
+        pass
+
+def test_middleware(benchmark):
+    result = []
+    for x in range(50000):
+        request = Request(f'https://{x}')
+        result.append(request)
+    result = benchmark(benchmark_middleware, result)
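Assuming pytest-benchmark is installed (it is added to the test requirements below), a run along the lines of 'pytest tests/benchmark.py' should have the 'benchmark' fixture call benchmark_middleware repeatedly over the 50,000 prepared requests and report min/mean/max timing statistics.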
4 changes: 3 additions & 1 deletion tests/requirements-test.txt
@@ -1,3 +1,5 @@
 -r ../requirements.txt
 mock
-pytest
+pytest
+pytest-benchmark
56 changes: 18 additions & 38 deletions tests/test_deltafetch.py
@@ -1,6 +1,7 @@
 from unittest import TestCase, skipIf

 import os
+import dbm
 import mock
 import tempfile
 from scrapy import Request
@@ -16,14 +17,6 @@
 from scrapy_deltafetch.middleware import DeltaFetch


-dbmodule = None
-try:
-    dbmodule = __import__('bsddb3')
-except ImportError:
-    pass
-
-
-@skipIf(not dbmodule, "bsddb3 is not found on the system")
 class DeltaFetchTestCase(TestCase):

     mwcls = DeltaFetch
@@ -85,10 +78,7 @@ def test_spider_opened_new(self):
         assert os.path.isdir(self.temp_dir)
         assert os.path.exists(self.db_path)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert mw.db.keys() == []

     def test_spider_opened_existing(self):
         """Middleware should open and use existing and valid .db files."""
@@ -97,11 +87,11 @@ def test_spider_opened_existing(self):
         assert not hasattr(self.mwcls, 'db')
         mw.spider_opened(self.spider)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == [(b'test_key_1', b'test_v_1'),
-                                 (b'test_key_2', b'test_v_2')]
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        for k, v in [
+            (b'test_key_1', b'test_v_1'),
+            (b'test_key_2', b'test_v_2')
+        ]:
+            assert mw.db.get(k) == v

     def test_spider_opened_corrupt_dbfile(self):
         """Middleware should create a new .db if it cannot open it."""
@@ -116,51 +106,43 @@
         assert os.path.isdir(self.temp_dir)
         assert os.path.exists(self.db_path)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))

         # and db should be empty (it was re-created)
-        assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_CREATE
+        assert mw.db.keys() == []

     def test_spider_opened_existing_spider_reset(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
-        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
+        assert mw.db.keys() == []

     def test_spider_opened_reset_non_existing_db(self):
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
-        assert mw.db.fd()
-        # there's different logic for different bdb versions:
-        # it can fail when opening a non-existing db with truncate flag,
-        # then it should be caught and retried with rm & create flag
-        assert (mw.db.get_open_flags() == dbmodule.db.DB_CREATE or
-                mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE)
+        assert mw.db.get(b'random') is None

     def test_spider_opened_recreate(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
         assert not hasattr(self.mwcls, 'db')
         mw.spider_opened(self.spider)
         assert hasattr(mw, 'db')
-        assert isinstance(mw.db, type(dbmodule.db.DB()))
-        assert mw.db.items() == []
-        assert mw.db.get_type() == dbmodule.db.DB_HASH
-        assert mw.db.get_open_flags() == dbmodule.db.DB_TRUNCATE
+        assert mw.db.keys() == []

     def test_spider_closed(self):
         self._create_test_db()
         mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
         mw.spider_opened(self.spider)
-        assert mw.db.fd()
+        assert mw.db.get('random') is None
         mw.spider_closed(self.spider)
-        self.assertRaises(dbmodule.db.DBError, mw.db.fd)
+        with self.assertRaises(Exception):
+            # should fail because the database is closed
+            mw.db.get('random')

     def test_process_spider_output(self):
         self._create_test_db()
@@ -323,10 +305,8 @@ def test_get_key(self):
         self.assertEqual(mw._get_key(test_req3), b'dfkey1')

     def _create_test_db(self):
-        db = dbmodule.db.DB()
         # truncate test db if there were failed tests
-        db.open(self.db_path, dbmodule.db.DB_HASH,
-                dbmodule.db.DB_CREATE | dbmodule.db.DB_TRUNCATE)
+        db = dbm.open(self.db_path, 'n')
         db[b'test_key_1'] = b'test_v_1'
         db[b'test_key_2'] = b'test_v_2'
         db.close()
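A note on the middleware's "different paths on systems" TODO, which these test fixtures inherit: the dbm backend that opens a given path (dbm.gnu, dbm.ndbm, or dbm.dumb) varies by platform, and so do the on-disk file names; dbm.ndbm, for instance, appends '.db' to the path it is given, and dbm.dumb writes several files. The stdlib dbm.whichdb() helper reports the backend that owns a path; a sketch with an illustrative temp path:

    import dbm
    import os
    import tempfile

    path = os.path.join(tempfile.gettempdir(), 'df_whichdb_demo')
    db = dbm.open(path, 'n')
    db.close()
    # Module name of the owning backend, e.g. 'dbm.gnu' on many Linux hosts.
    print(dbm.whichdb(path))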
