-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
125 lines (86 loc) · 3.6 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# This is a template for a Python scraper on Morph (https://morph.io)
# including some code snippets below that you should find helpful
# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
# You don't have to use the ScraperWiki and lxml libraries. You can use whatever libraries are installed
# on Morph for Python (https://github.com/openaustralia/morph-docker-python/blob/master/pip_requirements.txt). All that matters
# is that your final data is written to an SQLite database called data.sqlite in the current working directory,
# and that it has at least a table called data.
import scraperwiki
import lxml.html
import time
def scrapePage(url):
    """Download *url* with ``scraperwiki.scrape``, retrying on failure.

    Up to three attempts are made. An attempt that raises an exception, or
    that yields ``None``, counts as failed.

    Args:
        url: Address of the page to download.

    Returns:
        The value returned by ``scraperwiki.scrape`` on the first successful
        attempt, or ``None`` when every attempt failed (a message naming the
        URL is printed in that case).
    """
    max_attempts = 3
    for _ in range(max_attempts):
        try:
            html = scraperwiki.scrape(url)
        except Exception:
            # Deliberately best-effort: any scrape error just uses up one
            # attempt. ``Exception`` (not a bare ``except``) so Ctrl-C and
            # SystemExit still stop the script.
            continue
        if html is not None:
            return html
    # Single-argument parenthesised print works under both Python 2 and 3.
    print('Unable to scrape ' + url)
    return None
def extractTrack(html, mytrackType):
    """Parse one list page and save each entry found on it.

    Every ``div`` whose ``id`` starts with *mytrackType* and that contains a
    ``div.inner div.title h1`` is turned into one record and written with
    ``scraperwiki.sqlite.save``. Divs without that ``h1`` are skipped.

    Args:
        html: Page markup as returned by ``scrapePage``. ``None`` or an empty
            value (a failed download) is ignored and nothing is saved.
        mytrackType: Prefix of the ``id`` attribute to match; also stored in
            the record under ``"tracktype"``.

    Returns:
        None.

    Raises:
        IndexError: If the page has no ``head title`` element, or a matched
            entry has an ``h1`` but lacks the ``h2``, ``h3``, rank or
            paragraph element.
    """
    if not html:
        # scrapePage signals a failed download by returning None; there is
        # nothing to parse in that case.
        return

    root = lxml.html.fromstring(html)
    pagetitle = root.cssselect('head title')[0].text_content()
    entrySelector = "div[id^='{0}']".format(mytrackType)

    for el in root.cssselect(entrySelector):
        artistNodes = el.cssselect('div.inner div.title h1')
        if not artistNodes:
            continue

        record = {
            "pagetitle": pagetitle,
            "tracktype": mytrackType,
            "title": el.cssselect('div.inner div.title h2')[0].text_content(),
            "artist": artistNodes[0].text_content(),
            "publisher": el.cssselect("div.inner div.title h3")[0].text_content(),
            "rank": el.cssselect("div.inner div.review-content div.rank")[0].text_content(),
            "reviewtext": el.cssselect("div.inner div.review-content p")[0].text_content(),
            "scrape_date": time.strftime("%Y-%m-%d"),
        }
        # NOTE(review): the key is (scrape_date, rank) only. If different
        # lists scraped on the same day use the same rank text, later records
        # would presumably replace earlier ones -- confirm whether
        # "pagetitle" should be part of the key.
        scraperwiki.sqlite.save(unique_keys=["scrape_date", "rank"], data=record)
def extractPage(baseURL, mytrackType, lastPage=5):
    """Scrape pages 1..lastPage of a paginated list and save their entries.

    Args:
        baseURL: URL template containing a ``{0}`` placeholder that is filled
            with the page number.
        mytrackType: Track type passed through to ``extractTrack``.
        lastPage: Number of the last page to fetch (inclusive). Defaults to
            5, the previously hard-coded value.

    Returns:
        None.
    """
    for page in range(1, lastPage + 1):
        myhtml = scrapePage(baseURL.format(page))
        if myhtml is None:
            # scrapePage signals a failed download by returning None; skip
            # this page and carry on with the remaining ones.
            continue
        extractTrack(myhtml, mytrackType)
### Start extraction
# Each entry pairs a paginated Pitchfork staff-list URL template ({0} is the
# page number) with the track type handed to extractPage. Lists are processed
# in the order given.
staffLists = [
    ("http://pitchfork.com/features/staff-lists/9465-the-top-100-albums-of-2010-2014/{0}/", "album"),
    ("http://pitchfork.com/features/staff-lists/9555-the-100-best-tracks-of-2014/{0}/", "track"),
    ("http://pitchfork.com/features/staff-lists/9558-the-50-best-albums-of-2014/{0}/", "album"),
    ("http://pitchfork.com/features/staff-lists/9466-the-top-200-tracks-of-2010-2014/{0}/", "track"),
    ("http://pitchfork.com/features/staff-lists/9293-the-top-50-albums-of-2013/{0}/", "album"),
    ("http://pitchfork.com/features/staff-lists/9288-the-top-tracks-of-2013/{0}/", "track"),
]
for mybaseURL, listType in staffLists:
    extractPage(mybaseURL, listType)