-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathopenanything.py
105 lines (88 loc) · 6.33 KB
/
openanything.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
'''OpenAnything: a kind and thoughtful library for HTTP web services
This program is part of 'Dive Into Python', a free Python book for
experienced programmers. Visit http://diveintopython.org/ for the
latest version.
'''
__author__ = 'Mark Pilgrim ([email protected])'
__version__ = '$Revision: 1.6 $'[11:-2]
__date__ = '$Date: 2004/04/16 21:16:24 $'
__copyright__ = 'Copyright (c) 2004 Mark Pilgrim'
__license__ = 'Python'
import urllib2, urlparse, gzip
from StringIO import StringIO
USER_AGENT = 'OpenAnything/%s +http://diveintopython.org/http_web_services/' % __version__
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):  # 11.7: Handling redirects - Example 11.11
    """Redirect handler that remembers which redirect status code was seen.

    Each override delegates to the base class and then copies the redirect
    status code onto the returned response as ``status``, which the base
    class does not store.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        """Handle a permanent redirect (update your address book!).

        Returns whatever the base class returns, with ``status`` set to
        ``code`` when that result is an actual response object.
        """
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        # The base class can decline the redirect by returning None (for
        # example when the response carries no Location/URI header). Pass
        # that through untouched so the opener can try its other handlers
        # instead of failing here with an AttributeError.
        if result is not None:
            result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a temporary redirect (should not update your address book).

        Returns whatever the base class returns, with ``status`` set to
        ``code`` when that result is an actual response object.
        """
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        # Same None pass-through as http_error_301.
        if result is not None:
            result.status = code
        return result
class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): # Section 11.6 Example 11.7
def http_error_default(self, req, fp, code, msg, headers): # python docs: base class turns all responses into HTTPError exceptions
result = urllib2.HTTPError(
req.get_full_url(), code, msg, headers, fp)
result.status = code
return result # subclass RETURNS the exception instead of raising
# would it have been cleaner to call base class's method and CATCH the exception?
def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    Resolution order for source:
      1. an object with a read attribute is returned unchanged;
      2. the string '-' yields standard input;
      3. a URL whose scheme is 'http' is opened with urllib2;
      4. anything the built-in open() accepts is opened as a file;
      5. otherwise str(source) is wrapped in a StringIO.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of
    a previous request).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    """
    if hasattr(source, 'read'):
        return source

    if source == '-':
        # The module-level imports do not include sys, so import it here;
        # without this the branch fails with a NameError.
        import sys
        return sys.stdin

    if urlparse.urlparse(source)[0] == 'http':  # 11.9: make sure you're dealing with an HTTP URL
        # open URL with urllib2
        request = urllib2.Request(source)  # 11.5 Example 11.4: first, create a Request object
        request.add_header('User-Agent', agent)  # 11.5: Setting the User-Agent
        if lastmodified:
            # 11.6: lets the server answer 304 when the data hasn't changed
            request.add_header('If-Modified-Since', lastmodified)
        if etag:
            # 11.6: request page hash (serves same purpose as Last-Modified)
            request.add_header('If-None-Match', etag)
        # 11.8: servers won't give you compressed data unless you tell
        # them you can handle it.
        request.add_header('Accept-encoding', 'gzip')
        # second, create a URL opener with the custom redirect and error
        # handlers defined in this module
        opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler())
        return opener.open(request)  # finally, use opener to open the Request

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError):
        pass

    # treat source as string
    return StringIO(str(source))
def fetch(source, etag=None, lastmodified=None, agent=USER_AGENT):  # 11.9: Putting it all together
    '''Fetch data and metadata from a URL, file, stream, or string

    source, etag, lastmodified and agent are passed straight through to
    openAnything().

    Returns a dictionary with these keys:
      'data'         -- everything read from the stream (gunzipped when the
                        stream reports a gzip Content-Encoding)
      'etag'         -- ETag header value; only set when the stream has headers
      'lastmodified' -- Last-Modified header value; only set when the stream
                        has headers
      'url'          -- the stream's url attribute, when it has one
      'status'       -- the stream's status attribute when it has one,
                        otherwise 200

    The stream is always closed before returning, even if reading fails.
    '''
    result = {}
    f = openAnything(source, etag, lastmodified, agent)
    try:
        result['data'] = f.read()
        if hasattr(f, 'headers'):  # 11.9: store headers for future invocations
            # save ETag, if the server sent one
            result['etag'] = f.headers.get('ETag')
            # save Last-Modified header, if the server sent one
            result['lastmodified'] = f.headers.get('Last-Modified')
            if f.headers.get('content-encoding') == 'gzip':  # 11.8: Handling compressed data
                # data came back gzip-compressed, decompress it.
                # GzipFile requires a file-like object, so the compressed
                # bytes are wrapped in an in-memory StringIO first.
                result['data'] = gzip.GzipFile(fileobj=StringIO(result['data'])).read()
        if hasattr(f, 'url'):
            result['url'] = f.url
            result['status'] = 200  # 11.9: default status is OK
        if hasattr(f, 'status'):
            result['status'] = f.status  # set by the handlers passed to the opener
        # Streams without a url attribute (files, strings, stdin) still get
        # the default status.
        result.setdefault('status', 200)
    finally:
        f.close()
    # The collected data and metadata must be handed back to the caller;
    # without this return the function always yields None.
    return result