diff --git a/quotesbot/items.py b/quotesbot/items.py
index 63a00a9..292c9ea 100644
--- a/quotesbot/items.py
+++ b/quotesbot/items.py
@@ -1,9 +1,7 @@
-# -*- coding: utf-8 -*-
-
 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://doc.scrapy.org/en/latest/topics/items.html
 
 import scrapy
 
diff --git a/quotesbot/pipelines.py b/quotesbot/pipelines.py
index 2aff2d7..09f26f5 100644
--- a/quotesbot/pipelines.py
+++ b/quotesbot/pipelines.py
@@ -1,11 +1,9 @@
-# -*- coding: utf-8 -*-
-
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
 
-class QuotesbotPipeline(object):
+class QuotesbotPipeline:
     def process_item(self, item, spider):
         return item
diff --git a/quotesbot/settings.py b/quotesbot/settings.py
index 250b253..9433cd6 100644
--- a/quotesbot/settings.py
+++ b/quotesbot/settings.py
@@ -1,13 +1,11 @@
-# -*- coding: utf-8 -*-
-
 # Scrapy settings for quotesbot project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
-# http://doc.scrapy.org/en/latest/topics/settings.html
-# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# https://doc.scrapy.org/en/latest/topics/settings.html
+# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 
 BOT_NAME = 'quotesbot'
 
@@ -16,75 +14,75 @@
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'quotesbot (+http://www.yourdomain.com)'
+# USER_AGENT = 'quotesbot (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
-#    'quotesbot.middlewares.MyCustomSpiderMiddleware': 543,
-#}
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#     'quotesbot.middlewares.MyCustomSpiderMiddleware': 543,
+# }
 
 # Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
 #    'quotesbot.middlewares.MyCustomDownloaderMiddleware': 543,
-#}
+# }
 
 # Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# ITEM_PIPELINES = {
 #    'quotesbot.pipelines.SomePipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings  # noqa
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/quotesbot/spiders/toscrape-css.py b/quotesbot/spiders/toscrape-css.py
index 555e204..259203c 100644
--- a/quotesbot/spiders/toscrape-css.py
+++ b/quotesbot/spiders/toscrape-css.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import scrapy
 
 
@@ -11,12 +10,11 @@ class ToScrapeCSSSpider(scrapy.Spider):
     def parse(self, response):
         for quote in response.css("div.quote"):
             yield {
-                'text': quote.css("span.text::text").extract_first(),
-                'author': quote.css("small.author::text").extract_first(),
-                'tags': quote.css("div.tags > a.tag::text").extract()
+                'text': quote.css("span.text::text").get(),
+                'author': quote.css("small.author::text").get(),
+                'tags': quote.css("div.tags > a.tag::text").getall()
             }
 
-        next_page_url = response.css("li.next > a::attr(href)").extract_first()
+        next_page_url = response.css("li.next > a::attr(href)").get()
         if next_page_url is not None:
-            yield scrapy.Request(response.urljoin(next_page_url))
-
+            yield response.follow(next_page_url)
diff --git a/quotesbot/spiders/toscrape-xpath.py b/quotesbot/spiders/toscrape-xpath.py
index 9599fd8..3bb42b1 100644
--- a/quotesbot/spiders/toscrape-xpath.py
+++ b/quotesbot/spiders/toscrape-xpath.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import scrapy
 
 
@@ -11,12 +10,12 @@ class ToScrapeSpiderXPath(scrapy.Spider):
     def parse(self, response):
         for quote in response.xpath('//div[@class="quote"]'):
             yield {
-                'text': quote.xpath('./span[@class="text"]/text()').extract_first(),
-                'author': quote.xpath('.//small[@class="author"]/text()').extract_first(),
-                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
+                'text': quote.xpath('./span[@class="text"]/text()').get(),
+                'author': quote.xpath('.//small[@class="author"]/text()').get(),
+                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').getall()
             }
 
-        next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
+        next_page_url = response.xpath('//li[@class="next"]/a/@href').get()
         if next_page_url is not None:
-            yield scrapy.Request(response.urljoin(next_page_url))
+            yield response.follow(next_page_url)
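
Note on the migration pattern above: extract_first() and extract() are superseded by the parsel-backed get() and getall(), scrapy.Request(response.urljoin(url)) collapses into response.follow(url) because follow() resolves relative URLs against the response itself, and the coding declarations and explicit object base class are Python 2 leftovers that Python 3 no longer needs. What follows is a minimal, self-contained sketch of the resulting idiom; the spider name and start URL are illustrative assumptions (the real spiders' start_urls sit outside this diff's hunks), while the selectors mirror the + lines of toscrape-css.py.

import scrapy


class QuotesSketchSpider(scrapy.Spider):
    # Hypothetical name and start URL, for illustration only.
    name = "quotes-sketch"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                # .get() returns the first match or None,
                # replacing the deprecated .extract_first()
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
                # .getall() returns a list of every match,
                # replacing the deprecated .extract()
                "tags": quote.css("div.tags > a.tag::text").getall(),
            }

        # response.follow() accepts the relative href directly and,
        # with no callback given, routes the next page back to parse()
        next_page_url = response.css("li.next > a::attr(href)").get()
        if next_page_url is not None:
            yield response.follow(next_page_url)

As a further simplification, response.follow() also accepts a link selector directly, e.g. response.follow(response.css("li.next > a")[0]), reading the href attribute itself.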