-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy pathspider_5_reddit.py
26 lines (23 loc) · 1.04 KB
/
spider_5_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import scrapy
from datetime import datetime
class RedditSpider(scrapy.Spider):
    """Spider that scrapes post listings from a fixed set of subreddits.

    Each listing entry (an element matching ``.thing``) is turned into one
    dict with the keys ``title``, ``link``, ``user_name``, ``user_url``,
    ``score`` and ``time``.
    """

    name = 'reddit'
    start_urls = [
        'http://reddit.com/r/programming',
        'http://reddit.com/r/python',
    ]

    # Format of the ``datetime`` attribute read from ``<time>`` elements.
    # The UTC offset is matched literally, so parsed values are naive.
    _TIME_FORMAT = '%Y-%m-%dT%H:%M:%S+00:00'

    def parse(self, response):
        """Yield one dict per non-stickied entry found in ``response``.

        Args:
            response: the downloaded listing page; only its ``css()``
                selector API is used.

        Yields:
            dict: the scraped fields of one entry. String fields are
            ``None`` when their selector matches nothing, ``score`` is an
            ``int`` (0 when no number is present) and ``time`` is a naive
            ``datetime`` or ``None`` when the entry has no timestamp.

        Raises:
            ValueError: if a timestamp is present but does not match
                ``_TIME_FORMAT``.
        """
        # ':not(.stickied)' avoids scraping announcements that are stuck
        # to the top of the listing.
        for thing in response.css('.thing:not(.stickied)'):
            # When the score is 0, reddit shows a bullet point instead of
            # a 0, so the regex finds nothing and we fall back to 0.
            score_text = thing.css('.score.unvoted::text').re_first(r'(\d+)')

            # extract_first() returns None when there is no match; passing
            # None to strptime() would raise TypeError and abort the page.
            raw_time = thing.css('time::attr(datetime)').extract_first()
            posted_at = (
                datetime.strptime(raw_time, self._TIME_FORMAT)
                if raw_time
                else None
            )

            yield {
                'title': thing.css('.title::text').extract_first(),
                'link': thing.css('.title > a::attr(href)').extract_first(),
                'user_name': thing.css('a.author::text').extract_first(),
                'user_url': thing.css('a.author::attr(href)').extract_first(),
                'score': int(score_text or 0),
                'time': posted_at,
            }