
Commit 1a4e72c

add unit 6
1 parent 4e192c1 commit 1a4e72c

35 files changed (+412, -0 lines)

README.md (+1)

@@ -6,3 +6,4 @@ Crawling the Web with Scrapy
 * [Unit 3: Running Spiders in the Cloud](unit3/README.md)
 * [Unit 4: Handling HTML Forms](unit4/README.md)
 * [Unit 5: Scraping JavaScript based pages](unit5/README.md)
+* [Unit 6: Extending Scrapy](unit6/README.md)

unit6/README.md (+41)

@@ -0,0 +1,41 @@
Unit 6: Extending Scrapy
========================

This unit covers how to extend Scrapy's capabilities, either via item pipelines or middlewares.

**[Click here to view this unit's slides](https://docs.google.com/presentation/d/1cPGni3rAhE-vQoDdxOJSuXrqioHDvBQOpufDQNyYQOM/edit)**
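For orientation before diving into the sample projects, here is a minimal sketch of the two extension points this unit focuses on. The class names are hypothetical and the bodies only hint at the hooks Scrapy calls:

```python
from scrapy.exceptions import DropItem


class ExamplePipeline(object):
    """Item pipelines receive every item a spider yields."""

    def process_item(self, item, spider):
        # Return the item to keep it, or raise DropItem to discard it.
        if not item:
            raise DropItem('empty item')
        return item


class ExampleDownloaderMiddleware(object):
    """Downloader middlewares sit between the engine and the downloader."""

    def process_request(self, request, spider):
        # Return None to let the request proceed normally, or return a
        # Response object to short-circuit the download.
        return None
```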

## Sample Spiders

1. A project including a pipeline that drops items that don't have `tags`: [`p1_pipeline`](spiders/p1_pipeline)
2. A project including a pipeline that stores scraped data in MongoDB: [`p2_pipeline`](spiders/p2_pipeline)
3. A project with two spider middlewares: [`p3_spider_middleware`](spiders/p3_spider_middleware)


## Hands-on

#### 1. Pipeline
Build an item pipeline that stores the quotes from http://quotes.toscrape.com in a separate JSON Lines file per author:

* Albert Einstein → albert_einstein.jl
* Jane Austen → jane_austen.jl
* etc.

[Check out the project **once you're done**.](spiders/p4_pipeline_handson)


#### 2. Downloader Middleware
Build a downloader middleware to fetch and render pages using Selenium + PhantomJS instead of the Scrapy downloader.

* Make sure users can disable it either via settings or on a per-request basis (see the sketch after this section).

[Check out the project **once you're done**.](spiders/p5_downloader_middleware_handson)
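A minimal sketch of the per-request escape hatch (the `nojs` meta key matches the sample solution; everything else here is illustrative only):

```python
class SeleniumMiddleware(object):
    """Illustrative skeleton; see p5_downloader_middleware_handson for the full version."""

    def process_request(self, request, spider):
        # Requests carrying meta={'nojs': True} fall back to Scrapy's
        # regular downloader instead of being rendered with Selenium.
        if request.meta.get('nojs'):
            return None
        # ...otherwise fetch the page with Selenium and return an HtmlResponse
```

Disabling the middleware globally is typically handled in `from_crawler` by raising `NotConfigured` when a settings flag (e.g. `SELENIUM_ENABLED`) is off, which is what the sample solution does.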

## References

* [Scrapy Architecture](https://doc.scrapy.org/en/latest/topics/architecture.html)
* [Item pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html)
* [Spider middlewares](https://doc.scrapy.org/en/latest/topics/spider-middleware.html)
* [Downloader middlewares](https://doc.scrapy.org/en/latest/topics/downloader-middleware.html)
* [Scrapy Signals](https://doc.scrapy.org/en/latest/topics/signals.html)

unit6/spiders/p1_pipeline/db.json (+1)

@@ -0,0 +1 @@
{"_default": {"1": {"item": {"huw": "aa", "hua": 1}}, "2": {"item": {"huw": "aa", "hua": 2}}, "3": {"item": {"huw": "aa", "hua": 3}}}}

unit6/spiders/p1_pipeline/p1_pipeline/__init__.py

Whitespace-only changes.
unit6/spiders/p1_pipeline/p1_pipeline/pipelines.py (+8)

@@ -0,0 +1,8 @@
from scrapy.exceptions import DropItem


class DropNoTagsPipeline(object):
    def process_item(self, item, spider):
        # Drop any item that has no 'tags' field (or an empty one).
        if not item.get('tags', []):
            raise DropItem("item doesn't contain tags")
        return item
unit6/spiders/p1_pipeline/p1_pipeline/settings.py (+15)

@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

BOT_NAME = 'p1_pipeline'

SPIDER_MODULES = ['p1_pipeline.spiders']
NEWSPIDER_MODULE = 'p1_pipeline.spiders'

ROBOTSTXT_OBEY = True


# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'p1_pipeline.pipelines.DropNoTagsPipeline': 300,
}
unit6/spiders/p1_pipeline/p1_pipeline/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p1_pipeline/p1_pipeline/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
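To exercise the pipeline, one would run `scrapy crawl quotes-pagination -o quotes.jl` from the project directory; items that arrive without tags should then show up as dropped in the crawl log and in Scrapy's `item_dropped_count` stat.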

unit6/spiders/p1_pipeline/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p1_pipeline.settings

[deploy]
#url = http://localhost:6800/
project = p1_pipeline

unit6/spiders/p2_pipeline/items.json (+1; large diff not rendered)

unit6/spiders/p2_pipeline/p2_pipeline/__init__.py

Whitespace-only changes.
unit6/spiders/p2_pipeline/p2_pipeline/pipelines.py (+32)

@@ -0,0 +1,32 @@
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from scrapy.exceptions import DropItem


class MongoDbPipeline(object):
    collection_name = 'quotes'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        try:
            # insert_one() replaces Collection.insert(), which was
            # deprecated in PyMongo 3 and removed in PyMongo 4.
            self.db[self.collection_name].insert_one(dict(item))
            return item
        except DuplicateKeyError:
            raise DropItem('Duplicated item')

    def close_spider(self, spider):
        self.client.close()
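One caveat worth spelling out: `DuplicateKeyError` is only ever raised if the collection enforces a unique index, and nothing in this project creates one. A minimal sketch of how `open_spider` could add such an index on the quote text; the variant class name, the index itself, and the choice of `text` as the key are assumptions on top of the committed code:

```python
import pymongo


class MongoDbPipelineWithIndex(object):
    """Variant of MongoDbPipeline that makes DuplicateKeyError possible."""

    collection_name = 'quotes'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # Without a unique index, insert_one() never raises DuplicateKeyError
        # and duplicate quotes are stored silently.
        self.db[self.collection_name].create_index(
            [('text', pymongo.ASCENDING)], unique=True
        )
```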
unit6/spiders/p2_pipeline/p2_pipeline/settings.py (+14)

@@ -0,0 +1,14 @@
BOT_NAME = 'p2_pipeline'

SPIDER_MODULES = ['p2_pipeline.spiders']
NEWSPIDER_MODULE = 'p2_pipeline.spiders'

ROBOTSTXT_OBEY = True


ITEM_PIPELINES = {
    'p2_pipeline.pipelines.MongoDbPipeline': 300,
}

MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DATABASE = 'items'
unit6/spiders/p2_pipeline/p2_pipeline/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p2_pipeline/p2_pipeline/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
unit6/spiders/p2_pipeline/… (requirements file, +1)

@@ -0,0 +1 @@
tinydb

unit6/spiders/p2_pipeline/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p2_pipeline.settings

[deploy]
#url = http://localhost:6800/
project = p2_pipeline

unit6/spiders/p3_spider_middleware/p3_spider_middleware/__init__.py

Whitespace-only changes.
unit6/spiders/p3_spider_middleware/p3_spider_middleware/middlewares.py (+36)

@@ -0,0 +1,36 @@
import logging

logger = logging.getLogger(__name__)


class EmptyResponseException(Exception):
    pass


class AddUrlFieldMiddleware(object):
    """Adds a 'url' field to each item, containing the URL of the
    response from which the item was generated.
    """

    def process_spider_output(self, response, result, spider):
        for r in result:
            if isinstance(r, dict):
                if 'url' not in r:
                    r['url'] = response.url
            yield r


class IgnoreEmptyResponseMiddleware(object):
    """Ignores responses with empty bodies."""

    def process_spider_input(self, response, spider):
        if not response.text:
            raise EmptyResponseException()
        else:
            return None

    def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, EmptyResponseException):
            logger.info('Response from {} ignored due to empty body.'.format(response.url))
            spider.crawler.stats.inc_value('emptyresponse/response_ignored_count')
            return []
unit6/spiders/p3_spider_middleware/p3_spider_middleware/settings.py (+12)

@@ -0,0 +1,12 @@
BOT_NAME = 'p3_spider_middleware'

SPIDER_MODULES = ['p3_spider_middleware.spiders']
NEWSPIDER_MODULE = 'p3_spider_middleware.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

SPIDER_MIDDLEWARES = {
    'p3_spider_middleware.middlewares.IgnoreEmptyResponseMiddleware': 540,
    'p3_spider_middleware.middlewares.AddUrlFieldMiddleware': 543,
}
unit6/spiders/p3_spider_middleware/p3_spider_middleware/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p3_spider_middleware/p3_spider_middleware/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
unit6/spiders/p3_spider_middleware/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p3_spider_middleware.settings

[deploy]
#url = http://localhost:6800/
project = p3_spider_middleware

unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/__init__.py

Whitespace-only changes.
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/pipelines.py (+19)

@@ -0,0 +1,19 @@
import json


class SaveToFilesPipeline(object):

    def open_spider(self, spider):
        # One open file handle per author, keyed by file name.
        self.fps = {}

    def process_item(self, item, spider):
        author_name = item.get('author', '')
        filename = '{}.jl'.format('_'.join(author_name.lower().split()))
        if filename not in self.fps:
            self.fps[filename] = open(filename, 'w')
        self.fps[filename].write(json.dumps(item) + '\n')
        return item

    def close_spider(self, spider):
        for fp in self.fps.values():
            fp.close()
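A quick sanity check, assuming a crawl has already produced `albert_einstein.jl` (the file name follows from the pipeline above and the example in the unit README):

```python
import json

# Each line of a .jl (JSON Lines) file is an independent JSON document.
with open('albert_einstein.jl') as fp:
    quotes = [json.loads(line) for line in fp]

print(len(quotes), quotes[0]['text'])
```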
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/settings.py (+11)

@@ -0,0 +1,11 @@
BOT_NAME = 'p4_pipeline_handson'

SPIDER_MODULES = ['p4_pipeline_handson.spiders']
NEWSPIDER_MODULE = 'p4_pipeline_handson.spiders'

ROBOTSTXT_OBEY = True


ITEM_PIPELINES = {
    'p4_pipeline_handson.pipelines.SaveToFilesPipeline': 300,
}
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/spiders/__init__.py (+4)

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
unit6/spiders/p4_pipeline_handson/p4_pipeline_handson/spiders/… (quotes spider, +21)

@@ -0,0 +1,21 @@
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes-pagination"
    start_urls = [
        'http://quotes.toscrape.com',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css("li.next > a::attr(href)").extract_first()
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url, callback=self.parse)
unit6/spiders/p4_pipeline_handson/scrapy.cfg (+11)

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = p4_pipeline_handson.settings

[deploy]
#url = http://localhost:6800/
project = p4_pipeline_handson

unit6/spiders/p5_downloader_middleware_handson/p5_downloader_middleware_handson/__init__.py

Whitespace-only changes.
unit6/spiders/p5_downloader_middleware_handson/p5_downloader_middleware_handson/… (downloader middleware, +29)

@@ -0,0 +1,29 @@
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.exceptions import NotConfigured
import selenium.webdriver as webdriver


class SeleniumDownloaderMiddleware(object):

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    @classmethod
    def from_crawler(cls, crawler):
        # Check the setting before instantiating, so a disabled middleware
        # never spawns a PhantomJS process.
        if not crawler.settings.getbool('SELENIUM_ENABLED'):
            raise NotConfigured()
        m = cls()
        crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
        return m

    def process_request(self, request, spider):
        if request.meta.get('nojs'):
            # disable JS rendering on a per-request basis
            return
        self.driver.get(request.url)
        content = self.driver.page_source.encode('utf-8')
        return HtmlResponse(request.url, body=content, encoding='utf-8')

    def spider_closed(self, spider):
        # quit() shuts down the PhantomJS process, not just the current window.
        self.driver.quit()
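The project's settings module isn't part of the rendered diff above, so here is a hedged sketch of how this middleware might be wired up, assuming the class lives in the project's `middlewares.py`; the module path and the priority number are assumptions, not taken from the commit:

```python
# settings.py sketch (assumed paths and values, not part of the commit)

SELENIUM_ENABLED = True  # flip to False to disable the middleware globally

DOWNLOADER_MIDDLEWARES = {
    # Assumed module path; 543 is an arbitrary mid-range priority.
    'p5_downloader_middleware_handson.middlewares.SeleniumDownloaderMiddleware': 543,
}
```

Individual requests can still skip Selenium by carrying `meta={'nojs': True}`, which `process_request` checks before driving the browser.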
