blog_crawler.py
# -*- coding: utf-8 -*-
import unicodedata

import lxml.html
import scrapy


class BlogSpider(scrapy.Spider):
    name = "blog_spider"

    # Crawl the paginated search results for the first 100 pages.
    base_url = "http://www.nomadicmatt.com/page/"
    start_urls = [base_url + str(i) + "/?s" for i in range(1, 101)]

    def parse(self, response):
        # Follow the link of every post listed on the results page.
        for href in response.css('.entry-title a::attr(href)'):
            yield scrapy.Request(href.extract(), callback=self.parse_page)

    def parse_page(self, response):
        paragraphs = []
        for par in response.css('.entry-content p'):
            # Strip the HTML tags and keep only the paragraph's text.
            this_text = lxml.html.fromstring(par.extract()).text_content().strip()
            # Normalize to ASCII: NFKD turns non-breaking spaces into plain
            # spaces; any remaining non-ASCII characters are backslash-escaped.
            this_text = unicodedata.normalize('NFKD', this_text)
            this_text = this_text.replace("\u2019", "'")  # curly apostrophe
            this_text = this_text.encode('ascii', errors='backslashreplace').decode('ascii')
            paragraphs.append(this_text)
        yield {'Text': ' '.join(paragraphs)}
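

# To try the spider without a full Scrapy project, it can be run in-process
# with CrawlerProcess (a minimal sketch, assuming Scrapy 2.x; the output
# filename "blog_text.json" is an arbitrary choice, not part of the original):
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Write each yielded {'Text': ...} item to a JSON feed.
        "FEEDS": {"blog_text.json": {"format": "json"}},
    })
    process.crawl(BlogSpider)
    process.start()  # blocks until the crawl finishes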