"""
This grabs the phone data from GSMArena. It goes through a list of makers,
then goes through the list of phones by each maker, and then pulls out
whatever specs GSMArena lists for each phone.

The goal of this project is to enable more informed decisions about which
phone to buy. You can filter by which operating system a phone runs, when it
was released, whether it supports an SD card, the diagonal size of the
screen, and more. This relies entirely on the data from GSMArena and similar
websites.

Now we have all the details on a per-phone basis. Wonderful.

Not my finest work, but it serves the purpose.

TODO:
    YhG1s and cdunklau from #python suggested using Scrapy instead of
    BeautifulSoup + requests.
"""
import errno
import itertools
import logging
import os
import re
import urlparse
from collections import defaultdict

import bs4
import requests
import tld


# http://stackoverflow.com/a/600612/198348
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
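
# In Python 3, mkdir_p could be replaced by os.makedirs(path, exist_ok=True).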

URL = "http://www.gsmarena.com/makers.php3"
TLD_URL = tld.get_tld(URL)
USER_AGENT = ("Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/29.0.1547.49 Safari/537.36")
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html",
}
CACHE = "./cache"
# I wonder if there's a more clever way than using 'directory'
CACHE_MAKERS = "{0}/makers".format(CACHE)
CACHE_PHONES = "{0}/phones".format(CACHE)
mkdir_p(CACHE)
mkdir_p(CACHE_MAKERS)
mkdir_p(CACHE_PHONES)
logging.basicConfig(filename="{0}/error.log".format(CACHE))


def rget(url, directory=CACHE, **kwargs):
    # if cached, open the cached file instead of hitting the network
    urlpath = urlparse.urlparse(url).path.strip("/")
    urlpath = os.path.join(directory, urlpath)
    try:
        return open(urlpath).read()
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
    r = requests.get(url, headers=HEADERS, **kwargs)
    with open(urlpath, "w") as f:
        # I'm confused! YEAARGH isn't utf-8 automatic in requests?
        # http://stackoverflow.com/a/9942822/198348
        f.write(r.text.encode("utf-8").strip())
    return r.text
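
# Note: rget() caches unconditionally and never checks r.status_code, so an
# HTTP error page gets cached as well; delete files under ./cache to force a
# refetch.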


def cleanse(string):
    string = re.sub(u"\xa0", "", string)
    string = re.sub("\r\n", "\n", string)
    string = string.strip()
    return string


class Maker(object):
    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.results = []

    def __str__(self):
        return "<Maker({0},{1})>".format(self.name, self.url)

    def __repr__(self):
        return str(self)

    def get_phones(self):
        rtext = rget(self.url, directory=CACHE_MAKERS)
        soup = bs4.BeautifulSoup(rtext, "html.parser")
        phone_soups = soup.select("#main .makers > ul > li > a")
        phones = []
        for phone_soup in phone_soups:
            name = phone_soup.strong.text
            href = "http://{0}/{1}".format(TLD_URL, phone_soup.attrs["href"])
            phones.append(Phone(name=name, url=href, maker=self.name))
        self.phones = phones
        return phones


class Phone(object):
    def __init__(self, name="", url="", maker=""):
        self.name = name
        self.url = url
        self.maker = maker
        self.fields = {}
        self.description = ""
        self.canonical_name = "{0} {1}".format(self.maker, self.name)

    def __str__(self):
        return "<Phone({0},{1},{2})>".format(self.maker, self.name, self.url)

    def __repr__(self):
        return str(self)

    def get_fields(self):
        rtext = rget(self.url, directory=CACHE_PHONES)
        soup = bs4.BeautifulSoup(rtext, "html.parser")
        description = soup.select("#specs-list > p")
        self.description = description[0].text if description else None
        fields = {}
        sections_with_emptykey = defaultdict(int)
        tables = soup.select("table")
        for table in tables:
            title = table.th.text
            subvalues = itertools.izip(table.select(".ttl"),
                                       table.select(".nfo"))
            subdict = {}
            for key, value in subvalues:
                clean_key = cleanse(key.text)
                clean_value = cleanse(value.text)
                subdict[clean_key] = clean_value
                if not clean_key:
                    sections_with_emptykey[title] += 1
            fields[title] = subdict
        if sections_with_emptykey:
            logging.warning("{0}: Sections with empty keys\n{1}"
                            .format(self, sections_with_emptykey))
        self.fields = fields
        return fields
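
# get_fields() leaves phone.fields as a nested dict keyed by GSMArena's
# section titles, e.g. (illustrative shape, not guaranteed keys):
#   {"Display": {"Type": "...", "Size": "..."},
#    "Memory": {"Card slot": "...", "Internal": "..."}}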


def get_makers(url):
    rtext = rget(url)
    root_soup = bs4.BeautifulSoup(rtext, "html.parser")
    makers = root_soup.select("#main tr > td > a")
    results = []
    # every other element is a duplicate
    for maker in makers[::2]:
        href = "http://{0}/{1}".format(TLD_URL, maker.attrs["href"])
        name = maker.img.attrs["alt"]
        m = Maker(name, href)
        results.append(m)
    return results
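

# A minimal sketch of how the dumped mdict.json could be consumed afterwards,
# matching the filtering goal described in the module docstring. The section
# and field names below ("Memory", "Card slot") are assumptions about
# GSMArena's spec tables, not guaranteed keys; inspect mdict.json for the
# actual titles get_fields() stored.
def example_filter_sd_card(path=os.path.join(CACHE, "mdict.json")):
    import jsonpickle
    with open(path) as f:
        mdict = jsonpickle.decode(f.read())
    matches = []
    for phones in mdict.values():
        for phone in phones:
            card = phone.fields.get("Memory", {}).get("Card slot", "")
            if card and not card.startswith("No"):
                matches.append(phone.canonical_name)
    return matches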


if __name__ == "__main__":
    makers = get_makers(URL)
    mdict = {}
    print("#Brands")
    for maker in makers:
        print("Working on " + maker.name)
        mdict[maker.name] = maker.get_phones()
    print("#Phones")
    phones = list(itertools.chain(*mdict.values()))
    for index, phone in enumerate(phones):
        if index % 100 == 0:
            print("{:4} Working on {} {}".format(index, phone.maker,
                                                 phone.name))
        try:
            phone.get_fields()
        except Exception as e:
            logging.error("encountered exception for {0}".format(phone.name))
            logging.exception(e)
    import jsonpickle
    with open(os.path.join(CACHE, "mdict.json"), "w") as fm:
        mdict_json = jsonpickle.encode(mdict)
        fm.write(mdict_json)