houseRentingSpider.py (forked from PeggyZWY/house-renting-spider)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
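"""Spider for Shanghai renting posts in Douban groups.

Reads settings from config.ini (login cookie, search keywords, title
blacklist, start time, sleep interval), searches several Douban renting
groups, stores new posts in a per-run SQLite database under results/,
and renders them into an HTML result page.
"""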
import sqlite3
import sys
import time
import datetime
import os
import requests
from bs4 import BeautifulSoup
import Config
class Utils(object):
    @staticmethod
    def isInBlackList(blacklist, toSearch):
        # Return True if toSearch contains any blacklisted substring.
        if not blacklist:
            return False
        for item in blacklist:
            if toSearch.find(item) != -1:
                return True
        return False
    @staticmethod
    def getTimeFromStr(timeStr):
        # '13:47:32', '2016-05-25' or '2016-05-25 13:47:32' are all
        # converted to a datetime so comparisons stay consistent.
        if '-' in timeStr and ':' in timeStr:
            return datetime.datetime.strptime(timeStr, "%Y-%m-%d %H:%M:%S")
        elif '-' in timeStr:
            return datetime.datetime.strptime(timeStr, "%Y-%m-%d")
        elif ':' in timeStr:
            date_today = datetime.date.today()
            date = datetime.datetime.strptime(timeStr, "%H:%M:%S")
            # date.replace(...) returns a new datetime on today's date.
            return date.replace(year=date_today.year, month=date_today.month, day=date_today.day)
        else:
            # Fall back to midnight today (a datetime, like the other branches).
            return datetime.datetime.combine(datetime.date.today(), datetime.time())
class Main(object):
def __init__(self, config):
self.config = config
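        # Browser-like request headers; the Douban login cookie comes from config.ini.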
self.douban_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,en-GB;q=0.2,zh-TW;q=0.2',
'Connection': 'keep-alive',
'DNT': '1',
'HOST': 'www.douban.com',
'Cookie': self.config.douban_cookie
}
def run(self):
        # Relies on the module-level 'spider' instance created under __main__.
        result_file_name = os.path.join('results', 'result_' + str(spider.file_time))
try:
            print 'Connecting to database... 打开数据库...'
            # create the per-run results database
conn = sqlite3.connect(result_file_name + '.sqlite')
conn.text_factory = str
cursor = conn.cursor()
cursor.execute(
'CREATE TABLE IF NOT EXISTS rent(id INTEGER PRIMARY KEY, title TEXT, url TEXT UNIQUE, itemtime timestamp, crawtime timestamp, source TEXT, keyword TEXT, note TEXT)')
search_list = self.config.key_search_word_list
custom_black_list = self.config.custom_black_list
start_time = Utils.getTimeFromStr(self.config.start_time)
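            # Build one search URL per monitored Shanghai renting group on Douban;
            # result pages advance in steps of 50 items.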
def urlList(page_number):
num_in_url = str(page_number * 50)
douban_url = ['https://www.douban.com/group/search?start=' + num_in_url +'&group=146409&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=523355&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=557646&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=383972&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=283855&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=76231&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=196844&cat=1013&sort=time&q=',
'https://www.douban.com/group/search?start=' + num_in_url +'&group=259227&cat=1013&sort=time&q=']
return douban_url
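            # Display names for the groups above, in the same order as urlList().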
douban_url_name = [u'上海租房', u'上海招聘,租房', u'上海租房(2)', u'上海合租族_魔都租房', u'上海租房@浦东租房', \
u'上海租房---房子是租来的,生活不是', u'上海租房@长宁租房/徐汇/静安租房', u'上海租房(不良中介勿扰)']
def crawl(i, douban_url, keyword, douban_headers):
url_link = douban_url[i] + keyword
print 'url_link: ', url_link
r = requests.get(url_link, headers=douban_headers)
if r.status_code == 200:
try:
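                    # Refresh the Cookie header from the response whenever
                    # the first group is queried.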
if i == 0:
self.douban_headers['Cookie'] = r.cookies.items()[0][1]
                    soup = BeautifulSoup(r.text, "lxml")
                    # find_all returns a list; an empty one means no paginator.
                    paginator = soup.find_all(attrs={'class': 'paginator'})
                    if (page_number != 0) and not paginator:
                        # Past the first page, no paginator means no more results.
                        spider.ok = False
                        return False
                    else:
try:
table = soup.find_all(attrs={'class': 'olt'})[0]
tr_count_for_this_page = 0
spider.ok = True
for tr in table.find_all('tr'):
td = tr.find_all('td')
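                                # td[0]: title link, td[1]: post time, td[2]: reply count.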
title_element = td[0].find_all('a')[0]
title_text = title_element.get('title')
# ignore items in blacklist
                                if Utils.isInBlackList(custom_black_list, title_text):
continue
time_text = td[1].get('title')
                                # Results are time-sorted: if a post on a later page is
                                # already older than start_time, the rest will be too.
                                if (page_number != 0) and (Utils.getTimeFromStr(time_text) < start_time):
                                    spider.ok = False
                                    break
                                # On the first page, just skip posts older than start_time.
                                if Utils.getTimeFromStr(time_text) < start_time:
                                    continue
                                link_text = title_element.get('href')
reply_count = td[2].find_all('span')[0].text
tr_count_for_this_page += 1
try:
cursor.execute(
'INSERT INTO rent(id, title, url, itemtime, crawtime, source, keyword, note) VALUES(NULL, ?, ?, ?, ?, ?, ?, ?)',
[title_text, link_text, Utils.getTimeFromStr(time_text),
datetime.datetime.now(), keyword,
douban_url_name[i], reply_count])
print 'add new data:', title_text, time_text, reply_count, link_text, keyword
except sqlite3.Error, e:
print 'data exists:', title_text, link_text, e # URL should be unique
except Exception, e:
print 'error match table:', e
except Exception, e:
print 'error match paginator:', e
spider.ok = False
return False
else:
                print 'request error for %s - status code: %s' % (url_link, r.status_code)
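                # Pause between requests so Douban does not rate-limit the spider.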
time.sleep(self.config.douban_sleep_time)
print 'The spider begins to work... 爬虫开始运行...'
            # urlList(0) is only used here to count the monitored groups.
            for i in range(len(urlList(0))):
                print 'start i ->', i
for j in range(len(search_list)):
spider.ok = True
page_number = 0
keyword = search_list[j]
                    print 'start i->j %s -> %s %s' % (i, j, keyword)
print '>>>>>>>>>> Search %s %s ...' % (douban_url_name[i].encode('utf-8'), keyword)
while spider.ok:
print 'i, j, page_number: ', i, j, page_number
douban_url = urlList(page_number)
crawl(i, douban_url, keyword, self.douban_headers)
page_number += 1
cursor.execute('SELECT * FROM rent ORDER BY itemtime DESC ,crawtime DESC')
values = cursor.fetchall()
# export to html file
            print 'The spider has finished working. Now writing the results to the HTML file. 爬虫运行结束。开始写入结果文件'
            with open(result_file_name + '.html', 'wb') as f:
                f.write('''<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>上海租房信息 | 豆瓣</title>
<link rel="stylesheet" type="text/css" href="../lib/resultPage.css">
</head>
<body>''')
                f.write('<h1>Shanghai Renting Information 上海租房信息 | </h1>')
                f.write('''
<a href="https://www.douban.com/" target="_blank">
<img src="https://img3.doubanio.com/f/shire/8977fa054324c4c7f565447b003ebf75e9b4f9c6/pics/nav/[email protected]" alt="豆瓣icon"/>
</a>
''')
                f.write('<table>')
                f.write(
                    '<tr><th>Index<br>索引</th><th>Title<br>标题</th><th>Posting Time<br>发帖时间</th><th>Crawling Time<br>抓取时间</th><th>Keyword<br>关键字</th><th>Group<br>来源</th><th>Number of replies<br>回复数</th></tr>')
                # Row layout: id, title, url, itemtime, crawtime,
                # source (search keyword), keyword (group name), note (reply count).
                for row in values:
                    f.write('<tr>')
                    for col in range(len(row)):
                        if col == 2:
                            # The URL column is folded into the title link instead.
                            continue
                        f.write('<td class="column%s">' % str(col))
                        if col == 1:
                            f.write('<a href="' + str(row[2]) + '" target="_blank">' + str(row[1]) + '</a>')
                        else:
                            f.write(str(row[col]))
                        f.write('</td>')
                    f.write('</tr>')
                f.write('</table>')
                f.write('<script type="text/javascript" src="../lib/resultPage.js"></script>')
                f.write('</body></html>')
cursor.close()
        except Exception, e:
            print 'Error:', e
finally:
conn.commit()
conn.close()
print '=============================================='
print 'Finished writing the result HTML. Please open "' + result_file_name + '.html" to check the result'
print '结果文件写入完毕。请打开"' + result_file_name + '.html"查看结果。'
class Spider(object):
def __init__(self):
this_file_dir = os.path.split(os.path.realpath(__file__))[0]
config_file_path = os.path.join(this_file_dir, 'config.ini')
self.ok = True
self.config = Config.Config(config_file_path)
        # Timestamp for result file names, e.g. 20160525_134732.
        # ('%X' is locale-dependent, so spell the time fields out explicitly.)
        FILETIMEFORMAT = '%Y%m%d_%H%M%S'
        self.file_time = time.strftime(FILETIMEFORMAT, time.localtime())
results_path = os.path.join(sys.path[0], 'results')
if not os.path.isdir(results_path):
os.makedirs(results_path)
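        # NOTE: run() writes to the relative 'results/' path, so launch the
        # script from its own directory for these paths to match.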
def run(self):
main = Main(self.config)
main.run()
if __name__ == '__main__':
spider = Spider()
spider.run()
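# Typical run (assuming config.ini sits next to this script and provides the
# douban_cookie, key_search_word_list, custom_black_list, start_time and
# douban_sleep_time settings that Config.Config exposes):
#
#   python houseRentingSpider.py
#
# Each run produces results/result_<timestamp>.sqlite and a matching .html file.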