# blekko.py -- forked from sampsyo/python-blekko.
"""Bindings for the Blekko search API."""
import urllib
import time
import threading
import json
# Scheme and host that every request URL is built from.
BASE_URL = 'http://blekko.com'
# Total number of attempts the _retries decorator makes before it lets a
# ServerError propagate.
MAX_RETRIES = 3
# Minimum interval enforced between calls to a rate-limited function.
RATE_LIMIT = 1.0 # Seconds.
# Factor by which the retry delay is multiplied after each failed attempt.
RETRIES_BACKOFF = 2
# Delay before the first retry.
RETRIES_DELAY = 3.0 # Seconds.
class _rate_limit(object):
    """A decorator that limits the rate at which the function may be
    called. Minimum interval is given by RATE_LIMIT. Thread-safe using
    locks.

    The decorated name becomes an instance of this class; calling it
    waits (if necessary) and then forwards all arguments to the wrapped
    function and returns its result.
    """

    def __init__(self, fun):
        self.fun = fun
        # Wall-clock timestamp of the most recent call; 0.0 means the
        # first call never has to wait.
        self.last_call = 0.0
        self.lock = threading.Lock()
        # Expose the wrapped function's identity for debugging and
        # introspection; fall back gracefully for callables without it.
        self.__name__ = getattr(fun, '__name__', type(self).__name__)
        self.__doc__ = getattr(fun, '__doc__', None)

    def __call__(self, *args, **kwargs):
        with self.lock:
            # Wait until RATE_LIMIT time has passed since last_call,
            # then update last_call.
            elapsed = time.time() - self.last_call
            if elapsed < RATE_LIMIT:
                # time.time() follows the system clock, so a backwards
                # clock adjustment makes `elapsed` negative. Clamp the
                # wait so we never sleep longer than one full interval.
                time.sleep(min(RATE_LIMIT - elapsed, RATE_LIMIT))
            self.last_call = time.time()

            # Call the original function.
            # NOTE(review): the call is made while the lock is held, so
            # concurrent callers are serialized -- confirm this matches
            # the intended behaviour.
            return self.fun(*args, **kwargs)
class BlekkoError(Exception):
    """Root of the exception hierarchy for this module.

    Catch this to handle any error raised deliberately by the bindings.
    """
class ServerError(BlekkoError):
    """Raised when the server denies a request for some reason.

    In this module that means any HTTP status other than 200.
    """
def _retries(func):
    """A decorator that retries `func` when it raises ServerError.

    `func` is attempted up to MAX_RETRIES times in total (always at
    least once). After a failed attempt the wrapper sleeps before trying
    again: RETRIES_DELAY seconds the first time, multiplied by
    RETRIES_BACKOFF after each further failure. The ServerError from the
    final attempt is re-raised to the caller. Any other exception
    propagates immediately without a retry.
    """
    def wrapper(*args, **kwargs):
        # Guarantee one attempt even if MAX_RETRIES is configured as 0
        # or a negative number; otherwise the call would be skipped and
        # None returned silently.
        attempts = max(MAX_RETRIES, 1)
        delay = RETRIES_DELAY
        for attempt in range(attempts):
            try:
                return func(*args, **kwargs)
            except ServerError:
                if attempt >= attempts - 1:
                    # Out of attempts: surface the last failure.
                    raise
                time.sleep(delay)
                delay = delay * RETRIES_BACKOFF
    return wrapper
@_retries
@_rate_limit
def _http_request(url):
    """Make a (rate-limited) request to the Blekko server and return the
    resulting data.

    `url` is the complete request URL. Returns the raw response body.
    Raises ServerError when the HTTP status is anything other than 200;
    the _retries decorator retries the request in that case.
    """
    response = urllib.urlopen(url)
    try:
        code = response.getcode()
        if code == 503:
            raise ServerError('server overloaded (503)')
        elif code != 200:
            raise ServerError('HTTP error {0}'.format(code))
        return response.read()
    finally:
        # Always release the connection, including on the error paths,
        # so retried requests do not accumulate open sockets.
        response.close()
class _MissingFieldError(AttributeError, KeyError):
    """Raised for a field that is absent from a ResponseObject.

    It is an AttributeError so that getattr() with a default, hasattr()
    and the copy/pickle protocols behave normally, and also a KeyError so
    that existing callers catching KeyError keep working.
    """


class ResponseObject(object):
    """An object wrapper for a dictionary providing attribute access to
    values in the underlying dictionary.

    The wrapped dictionary itself is available as the `data` attribute.
    """

    def __init__(self, data):
        self.data = data

    def __getattr__(self, key):
        # __getattr__ only runs after normal lookup has failed. Read
        # `data` straight from the instance dict: going through
        # `self.data` would re-enter this method forever on an instance
        # whose __init__ has not run (e.g. while being copied).
        try:
            data = self.__dict__['data']
        except KeyError:
            raise AttributeError(key)
        if key in data:
            return data[key]
        raise _MissingFieldError('no such field {0}'.format(repr(key)))

    def __repr__(self):
        return '{0}({1})'.format(type(self).__name__, self.data)
class Result(ResponseObject):
    """A single search result.

    Fields of the underlying dictionary are read as attributes.
    Available fields include url, url_title, snippet, rss, short_host,
    short_host_url, and display_url.
    """
class ResultSet(ResponseObject):
    """A set of search results. Behaves as an immutable sequence
    containing Result objects (accessible via iteration, subscripting
    or slicing). Additional available fields include q, noslash_q,
    total_num, num_elem_start, num_elem_end, nav_page_range_start,
    nav_page_range_end, tag_switches, sug_slash, and
    universal_total_results.
    """

    def _get_results(self):
        """Return the raw list of result dictionaries, or an empty list
        when the response has no 'RESULT' entry.
        """
        try:
            return self.data['RESULT']
        except KeyError:
            return []

    def __iter__(self):
        for result in self._get_results():
            yield Result(result)

    def __getitem__(self, index):
        results = self._get_results()
        if isinstance(index, slice):
            # A slice selects several raw entries; wrap each one rather
            # than wrapping the resulting list in a single Result.
            return [Result(item) for item in results[index]]
        return Result(results[index])

    def __len__(self):
        return len(self._get_results())
class Blekko(object):
    """Client for the Blekko web API.

    Wraps the search endpoint ('/ws/') and the page-statistics and
    inbound-link endpoints ('/api/pagestats', '/api/inbound'). Every
    request is identified by either an `auth` or a `source` key.
    """

    def __init__(self, auth=None, source=None):
        """Create an API object. Either `auth` or `source` must be
        provided to identify the application (use whichever was assigned
        to you by Blekko).

        Raises BlekkoError when neither is given.
        """
        if not auth and not source:
            raise BlekkoError('API key not provided')
        self.auth = auth
        self.source = source

    def _request(self, path, params):
        """Make a (rate-limited) request to the Blekko server and return
        the result data.

        `path` is appended to BASE_URL and `params` is a mapping of query
        parameters. The credential is added automatically: `auth` when it
        is set, otherwise `source`.
        """
        params = dict(params)  # Make a copy; never mutate the caller's.
        # Tolerate callers that pass locals() from inside a method.
        params.pop('self', None)
        if self.auth:
            params['auth'] = self.auth
        else:
            params['source'] = self.source
        query = urllib.urlencode(params)
        url = "{0}{1}?{2}".format(BASE_URL, path, query)
        return _http_request(url)

    def query(self, terms, page=0):
        """Perform a search and return a ResultSet object.

        `terms` is the query string (unicode is encoded as UTF-8) and
        `page` is sent as the `p` parameter.
        """
        if isinstance(terms, unicode):
            terms = terms.encode("utf-8")
        data = self._request('/ws/', {
            # The ' /json' suffix is appended to every query.
            'q': terms + ' /json',
            'p': page,
        })
        return ResultSet(json.loads(data))

    def pagestats(self, url):
        """Get page statistics for a URL and return a Result object.
        """
        data = self._request('/api/pagestats', {'url': url})
        return Result(json.loads(data))

    def inbound_hosts(self, to_host, limit_hosts=2500, skip_hosts=0,
                      sort_hosts="from_host_rank10_desc"):
        """Get inbound hosts for a given host and return a Result object.
        """
        data = self._request('/api/inbound', {
            'to_host': to_host,
            'limit_hosts': limit_hosts,
            'skip_hosts': skip_hosts,
            'sort_hosts': sort_hosts,
        })
        return Result(json.loads(data))

    def inbound_links(self, to_url, limit_links=2500, skip_links=0,
                      sort_links="from_pubdate_desc,from_rank10_desc"):
        """Get inbound links for a given url and return a Result object.
        """
        data = self._request('/api/inbound', {
            'to_url': to_url,
            'limit_links': limit_links,
            'skip_links': skip_links,
            'sort_links': sort_links,
        })
        return Result(json.loads(data))

    def hosts_links(self, from_host, to_host, limit_links=2500, skip_links=0,
                    sort_links="from_pubdate_desc,from_rank10_desc"):
        """Get links from one host to another and return a Result object.
        Presence or absence of leading "www." matters for the from_host field.
        """
        data = self._request('/api/inbound', {
            'from_host': from_host,
            'to_host': to_host,
            'limit_links': limit_links,
            'skip_links': skip_links,
            'sort_links': sort_links,
        })
        return Result(json.loads(data))