
Commit 1a4e72c

add unit 6
1 parent 4e192c1 commit 1a4e72c

35 files changed (+412, -0 lines)

README.md (+1)

@@ -6,3 +6,4 @@ Crawling the Web with Scrapy
 * [Unit 3: Running Spiders in the Cloud](unit3/README.md)
 * [Unit 4: Handling HTML Forms](unit4/README.md)
 * [Unit 5: Scraping JavaScript based pages](unit5/README.md)
+* [Unit 6: Extending Scrapy](unit6/README.md)

unit6/README.md (+41)

@@ -0,0 +1,41 @@
Unit 6: Extending Scrapy
========================

This unit covers how to extend Scrapy's capabilities, either via item pipelines or middlewares.

**[Click here to view this unit's slides](https://docs.google.com/presentation/d/1cPGni3rAhE-vQoDdxOJSuXrqioHDvBQOpufDQNyYQOM/edit)**
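For orientation before diving into the sample projects, here is a minimal sketch of the two extension points this unit focuses on. The class names are hypothetical and the bodies only hint at the hooks Scrapy calls:

```python
from scrapy.exceptions import DropItem


class ExamplePipeline(object):
    """Item pipelines receive every item a spider yields."""

    def process_item(self, item, spider):
        # Return the item to keep it, or raise DropItem to discard it.
        if not item:
            raise DropItem('empty item')
        return item


class ExampleDownloaderMiddleware(object):
    """Downloader middlewares sit between the engine and the downloader."""

    def process_request(self, request, spider):
        # Return None to let the request proceed normally, or return a
        # Response object to short-circuit the download.
        return None
```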

## Sample Spiders

1. A project including a pipeline that drops items that don't have `tags`: [`p1_pipeline`](spiders/p1_pipeline)
2. A project including a pipeline that stores scraped data in MongoDB: [`p2_pipeline`](spiders/p2_pipeline)
3. A project with two spider middlewares: [`p3_spider_middleware`](spiders/p3_spider_middleware)


## Hands-on

#### 1. Pipeline
Build an item pipeline that stores the quotes from http://quotes.toscrape.com in a separate JSON Lines file per author:

* Albert Einstein → albert_einstein.jl
* Jane Austen → jane_austen.jl
* etc.

[Check out the project **once you're done**.](spiders/p4_pipeline_handson)


#### 2. Downloader Middleware
Build a downloader middleware to fetch and render pages using Selenium + PhantomJS instead of the Scrapy downloader.

* Make sure users can disable it either via settings or on a per-request basis (see the sketch after this section).

[Check out the project **once you're done**.](spiders/p5_downloader_middleware_handson)
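A minimal sketch of the per-request escape hatch (the `nojs` meta key matches the sample solution; everything else here is illustrative only):

```python
class SeleniumMiddleware(object):
    """Illustrative skeleton; see p5_downloader_middleware_handson for the full version."""

    def process_request(self, request, spider):
        # Requests carrying meta={'nojs': True} fall back to Scrapy's
        # regular downloader instead of being rendered with Selenium.
        if request.meta.get('nojs'):
            return None
        # ...otherwise fetch the page with Selenium and return an HtmlResponse
```

Disabling the middleware globally is typically handled in `from_crawler` by raising `NotConfigured` when a settings flag (e.g. `SELENIUM_ENABLED`) is off, which is what the sample solution does.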

## References

* [Scrapy Architecture](https://doc.scrapy.org/en/latest/topics/architecture.html)
* [Item pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html)
* [Spider middlewares](https://doc.scrapy.org/en/latest/topics/spider-middleware.html)
* [Downloader middlewares](https://doc.scrapy.org/en/latest/topics/downloader-middleware.html)
* [Scrapy Signals](https://doc.scrapy.org/en/latest/topics/signals.html)

unit6/spiders/p1_pipeline/db.json (+1)

@@ -0,0 +1 @@
{"_default": {"1": {"item": {"huw": "aa", "hua": 1}}, "2": {"item": {"huw": "aa", "hua": 2}}, "3": {"item": {"huw": "aa", "hua": 3}}}}

unit6/spiders/p1_pipeline/p1_pipeline/__init__.py

Whitespace-only changes.
unit6/spiders/p1_pipeline/p1_pipeline/pipelines.py (+8)

@@ -0,0 +1,8 @@
from scrapy.exceptions import DropItem


class DropNoTagsPipeline(object):
    def process_item(self, item, spider):
        # Drop any item that has no 'tags' field (or an empty one).
        if not item.get('tags', []):
            raise DropItem("item doesn't contain tags")
        return item
unit6/spiders/p1_pipeline/p1_pipeline/settings.py (+15)

@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

BOT_NAME = 'p1_pipeline'

SPIDER_MODULES = ['p1_pipeline.spiders']
NEWSPIDER_MODULE = 'p1_pipeline.spiders'

ROBOTSTXT_OBEY = True


# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'p1_pipeline.pipelines.DropNoTagsPipeline': 300,
}
unit6/spiders/p1_pipeline/p1_pipeline/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p1_pipeline/p1_pipeline/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
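To exercise the pipeline, one would run `scrapy crawl quotes-pagination -o quotes.jl` from the project directory; items that arrive without tags should then show up as dropped in the crawl log and in Scrapy's `item_dropped_count` stat.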

unit6/spiders/p1_pipeline/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p1_pipeline.settings

[deploy]
#url = http://localhost:6800/
project = p1_pipeline

unit6/spiders/p2_pipeline/items.json (+1; large diff not rendered)

unit6/spiders/p2_pipeline/p2_pipeline/__init__.py

Whitespace-only changes.
unit6/spiders/p2_pipeline/p2_pipeline/pipelines.py (+32)

@@ -0,0 +1,32 @@
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from scrapy.exceptions import DropItem


class MongoDbPipeline(object):
    collection_name = 'quotes'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        try:
            # insert_one() replaces Collection.insert(), which was
            # deprecated in PyMongo 3 and removed in PyMongo 4.
            self.db[self.collection_name].insert_one(dict(item))
            return item
        except DuplicateKeyError:
            raise DropItem('Duplicated item')

    def close_spider(self, spider):
        self.client.close()
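One caveat worth spelling out: `DuplicateKeyError` is only ever raised if the collection enforces a unique index, and nothing in this project creates one. A minimal sketch of how `open_spider` could add such an index on the quote text; the variant class name, the index itself, and the choice of `text` as the key are assumptions on top of the committed code:

```python
import pymongo


class MongoDbPipelineWithIndex(object):
    """Variant of MongoDbPipeline that makes DuplicateKeyError possible."""

    collection_name = 'quotes'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # Without a unique index, insert_one() never raises DuplicateKeyError
        # and duplicate quotes are stored silently.
        self.db[self.collection_name].create_index(
            [('text', pymongo.ASCENDING)], unique=True
        )
```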
unit6/spiders/p2_pipeline/p2_pipeline/settings.py (+14)

@@ -0,0 +1,14 @@
BOT_NAME = 'p2_pipeline'

SPIDER_MODULES = ['p2_pipeline.spiders']
NEWSPIDER_MODULE = 'p2_pipeline.spiders'

ROBOTSTXT_OBEY = True


ITEM_PIPELINES = {
    'p2_pipeline.pipelines.MongoDbPipeline': 300,
}

MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DATABASE = 'items'
unit6/spiders/p2_pipeline/p2_pipeline/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p2_pipeline/p2_pipeline/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
unit6/spiders/p2_pipeline/… (requirements file, +1)

@@ -0,0 +1 @@
tinydb

unit6/spiders/p2_pipeline/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p2_pipeline.settings

[deploy]
#url = http://localhost:6800/
project = p2_pipeline

unit6/spiders/p3_spider_middleware/p3_spider_middleware/__init__.py

Whitespace-only changes.
unit6/spiders/p3_spider_middleware/p3_spider_middleware/middlewares.py (+36)

@@ -0,0 +1,36 @@
import logging

logger = logging.getLogger(__name__)


class EmptyResponseException(Exception):
    pass


class AddUrlFieldMiddleware(object):
    """Adds a 'url' field to each item, containing the URL of the
    response from which the item was generated.
    """

    def process_spider_output(self, response, result, spider):
        for r in result:
            if isinstance(r, dict):
                if 'url' not in r:
                    r['url'] = response.url
            yield r


class IgnoreEmptyResponseMiddleware(object):
    """Ignores responses with empty bodies."""

    def process_spider_input(self, response, spider):
        if not response.text:
            raise EmptyResponseException()
        else:
            return None

    def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, EmptyResponseException):
            logger.info('Response from {} ignored due to empty body.'.format(response.url))
            spider.crawler.stats.inc_value('emptyresponse/response_ignored_count')
            return []
unit6/spiders/p3_spider_middleware/p3_spider_middleware/settings.py (+12)

@@ -0,0 +1,12 @@
BOT_NAME = 'p3_spider_middleware'

SPIDER_MODULES = ['p3_spider_middleware.spiders']
NEWSPIDER_MODULE = 'p3_spider_middleware.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

SPIDER_MIDDLEWARES = {
    'p3_spider_middleware.middlewares.IgnoreEmptyResponseMiddleware': 540,
    'p3_spider_middleware.middlewares.AddUrlFieldMiddleware': 543,
}
unit6/spiders/p3_spider_middleware/p3_spider_middleware/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p3_spider_middleware/p3_spider_middleware/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
unit6/spiders/p3_spider_middleware/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p3_spider_middleware.settings

[deploy]
#url = http://localhost:6800/
project = p3_spider_middleware

unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/__init__.py

Whitespace-only changes.
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/pipelines.py (+19)

@@ -0,0 +1,19 @@
import json


class SaveToFilesPipeline(object):

    def open_spider(self, spider):
        # One open file handle per author, keyed by file name.
        self.fps = {}

    def process_item(self, item, spider):
        author_name = item.get('author', '')
        filename = '{}.jl'.format('_'.join(author_name.lower().split()))
        if filename not in self.fps:
            self.fps[filename] = open(filename, 'w')
        self.fps[filename].write(json.dumps(item) + '\n')
        return item

    def close_spider(self, spider):
        for fp in self.fps.values():
            fp.close()
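A quick sanity check, assuming a crawl has already produced `albert_einstein.jl` (the file name follows from the pipeline above and the example in the unit README):

```python
import json

# Each line of a .jl (JSON Lines) file is an independent JSON document.
with open('albert_einstein.jl') as fp:
    quotes = [json.loads(line) for line in fp]

print(len(quotes), quotes[0]['text'])
```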
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/settings.py (+11)

@@ -0,0 +1,11 @@
BOT_NAME = 'p4_pipeline_handson'

SPIDER_MODULES = ['p4_pipeline_handson.spiders']
NEWSPIDER_MODULE = 'p4_pipeline_handson.spiders'

ROBOTSTXT_OBEY = True


ITEM_PIPELINES = {
    'p4_pipeline_handson.pipelines.SaveToFilesPipeline': 300,
}
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
unit6/spiders/p4_pipeline_handson/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p4_pipeline_handson.settings

[deploy]
#url = http://localhost:6800/
project = p4_pipeline_handson

unit6/spiders/p5_downloader_middleware_handson/p5_downloader_middleware_handson/__init__.py

Whitespace-only changes.
unit6/spiders/p5_downloader_middleware_handson/p5_downloader_middleware_handson/… (downloader middleware, +29)

@@ -0,0 +1,29 @@
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.exceptions import NotConfigured
import selenium.webdriver as webdriver


class SeleniumDownloaderMiddleware(object):

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    @classmethod
    def from_crawler(cls, crawler):
        # Check the setting before instantiating, so a disabled middleware
        # never spawns a PhantomJS process.
        if not crawler.settings.getbool('SELENIUM_ENABLED'):
            raise NotConfigured()
        m = cls()
        crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
        return m

    def process_request(self, request, spider):
        if request.meta.get('nojs'):
            # disable JS rendering on a per-request basis
            return
        self.driver.get(request.url)
        content = self.driver.page_source.encode('utf-8')
        return HtmlResponse(request.url, body=content, encoding='utf-8')

    def spider_closed(self, spider):
        # quit() shuts down the PhantomJS process, not just the current window.
        self.driver.quit()
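The project's settings module isn't part of the rendered diff above, so here is a hedged sketch of how this middleware might be wired up, assuming the class lives in the project's `middlewares.py`; the module path and the priority number are assumptions, not taken from the commit:

```python
# settings.py sketch (assumed paths and values, not part of the commit)

SELENIUM_ENABLED = True  # flip to False to disable the middleware globally

DOWNLOADER_MIDDLEWARES = {
    # Assumed module path; 543 is an arbitrary mid-range priority.
    'p5_downloader_middleware_handson.middlewares.SeleniumDownloaderMiddleware': 543,
}
```

Individual requests can still skip Selenium by carrying `meta={'nojs': True}`, which `process_request` checks before driving the browser.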
