-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcrawler.py
176 lines (151 loc) · 4.97 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import cookielib
import json
import os.path
import re
import sys
import urllib
import urllib2
class Util:
    """Helpers that build renren.com URLs and create local directories."""

    @staticmethod
    def to_profile_url(profile_id):
        """Build the profile page URL of an account.

        Args:
            profile_id: The profile id of the account.
        Returns:
            The profile page url(string).
        """
        # A static method has no ``self``; format the argument itself.
        return 'http://www.renren.com/%s' % (profile_id,)

    @staticmethod
    def to_albumlist_url(profile_id):
        """Build the URL of the page listing all albums of an account.

        Args:
            profile_id: The profile id of the account.
        Returns:
            The album list url(string).
        """
        return 'http://photo.renren.com/photo/%s/albumlist/v7' % (profile_id,)

    @staticmethod
    def photo_ajax_query_url(profile_id, album_id, page):
        """Build the ajax URL that returns one page of photos of an album.

        Args:
            profile_id: The profile id of the account.
            album_id: The album id.
            page: The page number(int) to query; the page size is fixed at 40.
        Returns:
            The ajax query url(string).
        """
        return ('http://photo.renren.com/photo/%s/album-%s/bypage/ajax/v7?'
                'page=%d&pageSize=40' % (profile_id, album_id, page))

    @staticmethod
    def create_dir_if_no_exist(directory):
        """Create ``directory`` (and missing parents) unless it already exists.

        Args:
            directory: Local directory path(string).
        Raises:
            OSError: If the directory cannot be created and does not exist.
        """
        # Try first and inspect afterwards: checking for existence before
        # creating leaves a window in which another process can create it.
        try:
            os.makedirs(directory)
        except OSError:
            if not os.path.isdir(directory):
                raise
class Log:
    """Counts downloaded photos and albums and prints progress messages."""

    def __init__(self):
        self.photo_num = 0
        self.album_num = 0

    def start(self):
        """Print the message announcing that the download begins."""
        print('Start to download photos.')

    def photo_increase(self):
        """Count one more photo and report progress on every 50th photo."""
        self.photo_num += 1
        if self.photo_num % 50:
            return
        print('Already download %s photos.' % self.photo_num)

    def album_increase(self):
        """Count one more album."""
        self.album_num += 1

    def summary(self):
        """Print the final totals of photos and albums between two borders."""
        border = '***************************************'
        # The blank after each number is intentional: it keeps the output
        # identical to what has always been printed for these two lines.
        report = [
            border,
            'Download photos successfully:',
            'Total Photos: %s ' % self.photo_num,
            'Total Albums: %s ' % self.album_num,
            border,
        ]
        print('\n'.join(report))
class Account:
    """Renren account class for www.renren.com.

    This class is responsible for logging in, crawling the album pages and
    downloading all photos to local disk.
    """

    # After a successful login the server redirects to the profile page; the
    # profile id is the numeric first path segment of that URL.
    _PROFILE_URL_RE = re.compile(
        r'http://www\.renren\.com/([0-9]+)(?:[/?#]|$)')
    # Album ids are embedded in the album list page as "albumId":"<digits>".
    _ALBUM_ID_RE = re.compile(r'"albumId":"([0-9]+)"')

    def __init__(self, name='', password=''):
        """Store the credentials and build a cookie-aware URL opener.

        Args:
            name: Login name(string), sent as the ``email`` form field.
            password: Login password(string).
        """
        self.name = name
        self.password = password
        self.is_login = False
        self.profile_id = ''
        # The cookie jar carries the session cookie from login() to the
        # later page requests made through the same opener.
        self.cookie_jar = cookielib.LWPCookieJar()
        self.opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookie_jar))
        urllib2.install_opener(self.opener)

    def login(self):
        """Login the account.

        On success ``is_login`` becomes True and ``profile_id`` is set to the
        id found in the URL the login request ends up at.

        Returns:
            True if the login succeeded, False if the credentials were
            rejected or the request failed.
        """
        # NOTE(review): the credentials are posted over plain HTTP; switch to
        # HTTPS if the endpoint supports it.
        params = {
            'domain': 'www.renren.com',
            'email': self.name,
            'password': self.password}
        request = urllib2.Request(
            'http://www.renren.com/PLogin.do',
            urllib.urlencode(params))
        # Deliberately best-effort: any failure while talking to the server
        # is reported and turned into a False result instead of a crash.
        try:
            self.openrate = self.opener.open(request)
            try:
                url = self.openrate.geturl()
            finally:
                self.openrate.close()
        except Exception as e:
            # Format the exception itself; its ``message`` attribute is
            # deprecated and frequently empty.
            print('Fail to login. %s' % e)
            return False

        matched = self._PROFILE_URL_RE.match(url)
        if matched is None:
            print('Account/Password incorrect.')
            return False
        print('Login successfuly.')
        self.is_login = True
        # Take the id from the match, not from the tail of the URL, so a
        # trailing path or query string cannot end up in the id.
        self.profile_id = matched.group(1)
        return True

    def get_html_content(self, url):
        """Fetch the html content for a given url.

        Args:
            url: The url(string) to crawl the content.
        Returns:
            The content decoded from UTF-8 into text (``unicode`` on
            Python 2). It is not re-encoded to a platform encoding, so
            non-ASCII pages stay intact and can be passed to ``json.loads``.
        Raises:
            UnicodeDecodeError: If the response body is not valid UTF-8.
            Errors raised by the opener for a failed request propagate.
        """
        request = urllib2.Request(url)
        self.openrate = self.opener.open(request)
        try:
            raw = self.openrate.read()
        finally:
            # Release the connection even when reading fails.
            self.openrate.close()
        return raw.decode('utf-8')

    @staticmethod
    def _parse_album_ids(html_content):
        """Extract all album ids from the album list page content."""
        found = Account._ALBUM_ID_RE.findall(html_content)
        # The ids are plain ASCII digits, so str() is a lossless conversion.
        return [str(album_id) for album_id in found]

    def get_album_ids(self):
        """Get all album ids from this account.

        Returns:
            A list of album ids(string), in the order they appear in the
            album list page.
        """
        albumlist_url = Util.to_albumlist_url(self.profile_id)
        return self._parse_album_ids(self.get_html_content(albumlist_url))

    def get_photo_urls_in_album(self, album_id, max_pages=99):
        """Get all photo urls in the album.

        Args:
            album_id: Album id(string) for this account.
            max_pages: Upper bound(int) on the number of result pages to
                request; guards against paging forever.
        Returns:
            A list of photo urls(string) in the album.
        """
        photo_urls = []
        for page in range(1, max_pages + 1):
            ajax_url = Util.photo_ajax_query_url(
                self.profile_id, album_id, page)
            data = json.loads(self.get_html_content(ajax_url))
            photo_list = data['photoList']
            # An empty page means the previous page was the last one.
            if not photo_list:
                break
            photo_urls.extend(photo['url'] for photo in photo_list)
        return photo_urls

    def download_photos(self, path):
        """Download all photos from this account to local disk.

        Every album gets its own sub directory named after the album id;
        photos are saved as 1.jpg, 2.jpg, ... in the order they are listed.

        Args:
            path: Local disk path(string).
        """
        log = Log()
        log.start()
        Util.create_dir_if_no_exist(path)
        for album_id in self.get_album_ids():
            photo_urls = self.get_photo_urls_in_album(album_id)
            album_path = os.path.join(path, str(album_id))
            log.album_increase()
            Util.create_dir_if_no_exist(album_path)
            for index, photo_url in enumerate(photo_urls, 1):
                file_path = os.path.join(album_path, '%d.jpg' % index)
                urllib.urlretrieve(photo_url, file_path)
                log.photo_increase()
        log.summary()
if __name__ == '__main__':
    # Read the login name, the password and the target directory from
    # standard input, one value per line and in that order.
    username, password, path = raw_input(), raw_input(), raw_input()
    account = Account(username, password)
    # login() already prints why it failed; exit with a non-zero status so
    # a calling shell or script can detect the failure.
    if not account.login():
        sys.exit(1)
    # Download photos.
    account.download_photos(path)