# -*- coding: utf-8 -*-
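"""Crawler for dongqiudi.com.

Collects team info, the user ids found under article comment sections,
and user profiles, and stores them in a local sqlite database (data.db).
"""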
import requests
import json
import sqlite3
import time
import multiprocessing
import argparse
from util import get_user_info, get_comment_user, get_articles_id


def write_team_info():
    """
    Fetch the info of each team and store it in the local sqlite database
    (data.db -> team table).
    :return:
    """
    # Replace the Cookie below with a valid session of your own before running.
    headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
'Cookie': 'laravel_session=eyJpdiI6ImtucXJlaTdDdnlCWHJOaDl6Q3pnNlZkcUgxU0FpVE5IZDBuWGt1a3pha2c9IiwidmFsdWUiOiJNQ0ZzNXZla2hsaWtENEEraERQQW1adXFRdUdCUlBGV25MQ09SMW5jek1EV2xPaG5sV05VSGFHMUkxSDVEM1pBVWJsWFBZMUQ1SnRCQnREZlBrRUJ5dz09IiwibWFjIjoiOGE3NzY2YWE3NTlmYjIyODg5M2U4ZjBlMDc4NzU5NzgzYmM2NDIwOTY2MTU0NmI4Zjc5OTFjMWM5YmQ1YzZmMSJ9; expires=Sat, 12-May-2018 13:55:57 GMT; Max-Age=7200; path=/; domain=dongqiudi.com; httponly'
}
    BASE_URL = 'http://api.dongqiudi.com'
    # 1. Fetch the category tabs in the left sidebar.
    URL = '/catalogs'
    r = requests.get(url=BASE_URL + URL, headers=headers)
    tab_content = json.loads(r.text)
    # print(tab_content)
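    # Shape inferred from the loop below: the response is a JSON list of
    # category tabs, each a dict carrying at least an 'id' field.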
    # Collect everything here first, then write it to the database in one pass.
    team_data = list()
    for item in tab_content:
        # Skip the "hot" tabs.
        if item['id'] in (1, 9, 10, 11):
            continue
        URL = '/catalog/channels/'
        r = requests.get(url=BASE_URL + URL + str(item['id']), headers=headers)
        content = json.loads(r.text)
        team_data.append(content)
    # Store the data in sqlite3.
    # Connect to the SQLite database; the database file is data.db.
    # If the file does not exist, it is created in the current directory:
conn = sqlite3.connect('data.db')
    # Create a cursor:
    cursor = conn.cursor()
    # If the team table already exists, drop it.
    cursor.execute("select * from sqlite_master where type = 'table' and name = 'team'")
    # Fetch the query result.
    value = cursor.fetchall()
    if value:
        cursor.execute('DROP TABLE team')
    # Create the team table:
    cursor.execute('CREATE TABLE team (id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, league VARCHAR(20), '
                   'avatar VARCHAR(100), name VARCHAR(40), team_id VARCHAR(10), object_id VARCHAR(10))')
    # Insert the records:
    for league_data in team_data:
        league_name = league_data['title']
        for team in league_data['data']:
            # Parameterized query: building the SQL with str.format would
            # break on team names containing quotes and invites injection.
            cursor.execute("INSERT INTO team (league, avatar, name, team_id, object_id) VALUES (?, ?, ?, ?, ?)",
                           (league_name, team.get('avatar', ''), team.get('name', ''),
                            team.get('id', ''), team.get('object_id', '')))
        print('{} data finished.'.format(league_name))
    # Close the cursor:
    cursor.close()
    # Commit the transaction:
    conn.commit()
    # Close the connection:
    conn.close()


def write_article_comment_user(page_num, obtain_article=True, multi_process=False, if_continue=False):
    """
    Write the user ids found in the comment sections of the first page_num
    pages of articles into the article_comment_user table.
    :param page_num: number of article-list pages to cover
    :param obtain_article: whether to run part 1 to fetch the article list
    :param multi_process: whether to use a multiprocessing pool
    :param if_continue: whether to resume from the previous run
    :return:
    """
    tic = time.time()
    # --------------------- 1. Fetch the list of article ids ---------------------
    if obtain_article:
        article_id_dict = get_articles_id(page_num)
        article_id_list = list(article_id_dict.values())
        # flatten
        article_id_list = [y for x in article_id_list for y in x]
        print(f'Article id list obtained, there are {len(article_id_list)} articles in total.')
        with open('article_id.txt', 'wb') as F:
            F.write(str(article_id_list).encode(encoding='utf-8'))
    # Take the timestamp outside the if-branch: toc1 is needed below even
    # when part 1 is skipped (obtain_article=False).
    toc1 = time.time()
    print(f'Part 1 took {toc1 - tic} seconds.')
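    # article_id.txt persists the id list as a Python literal; part 2 below
    # reads it back with eval(), so parts 1 and 2 can run in separate sessions.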
    # --------------------- 2. Fetch the user ids under each article ---------------------
    # Database setup.
    conn = sqlite3.connect('data.db')
    cursor = conn.cursor()
    # If we are not resuming from a previous run, drop and recreate the table.
    if not if_continue:
        cursor.execute("select * from sqlite_master where type = 'table' and name = 'article_comment_user'")
        value = cursor.fetchall()
        if value:
            cursor.execute('DROP TABLE article_comment_user')
        cursor.execute('CREATE TABLE article_comment_user (id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, '
                       'article_id VARCHAR(20), user_id_set VARCHAR(10000))')
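    # Note on the schema: user_id_set stores str(set_of_ids) as text; it is
    # parsed back with eval() in write_user_list below.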
    user_id_list = list()
    count = 0
    total_users = 0
    with open('article_id.txt', 'rb') as F:
        d = F.readlines()
        article_id_list = eval(d[0])
    # On a fresh run crawled_article_id.txt may not exist yet; fall back to
    # an empty list instead of crashing.
    try:
        with open('crawled_article_id.txt', 'rb') as F:
            crawled_article_id_list = eval(F.readlines()[0])
    except FileNotFoundError:
        crawled_article_id_list = []
    article_id_list = list(set(article_id_list) - set(crawled_article_id_list))
    # Single process.
    if not multi_process:
        for article_id in article_id_list:
            user_set = get_comment_user(article_id)
            user_id_list.append([article_id, str(user_set)])
            crawled_article_id_list.append(article_id)
            count += 1
            total_users += len(user_set)
            if count % 20 == 0:
                print(f'{count} articles have been processed, {total_users} users collected so far.')
            if count % 1000 == 0:
                # Checkpoint: flush the buffered rows to the database and
                # record which article ids have been crawled.
                cursor.executemany("INSERT INTO article_comment_user (article_id, user_id_set) VALUES (?, ?)",
                                   user_id_list)
                conn.commit()
                user_id_list = list()
                with open('crawled_article_id.txt', 'wb') as F:
                    F.write(str(crawled_article_id_list).encode(encoding='utf-8'))
    else:
        # Multiprocessing. Note: calling result.get() immediately after
        # apply_async blocks on each task and serializes the work; iterating
        # over pool.imap keeps all eight workers busy instead.
        pool = multiprocessing.Pool(8)
        for article_id, user_set in zip(article_id_list, pool.imap(get_comment_user, article_id_list)):
            user_id_list.append([article_id, str(user_set)])
            count += 1
            total_users += len(user_set)
            if count % 10 == 0:
                print(f'{count} articles have been processed, {total_users} users collected so far.')
        pool.close()
        pool.join()
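    # Note: the pool size of 8 is the original author's choice; the work is
    # presumably network-bound, so the exact worker count matters little.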
    print(f'User id sets obtained, there are {total_users} users in total.')
    toc2 = time.time()
    print(f'Part 2 took {toc2 - toc1} seconds.')
    # --------------------- 3. Write the remaining rows to the database ---------------------
    cursor.executemany("INSERT INTO article_comment_user (article_id, user_id_set) VALUES (?, ?)", user_id_list)
    cursor.close()
    conn.commit()
    conn.close()
    toc3 = time.time()
    print(f'Part 3 took {toc3 - toc2} seconds.')


def write_user_list():
    """
    Write the list of user ids still to be crawled into user_id_set.txt.
    :return:
    """
    conn = sqlite3.connect('data.db')
    cursor = conn.cursor()
    cursor.execute('select * from article_comment_user')
    value = cursor.fetchall()
    user_id_set = set()
    for x in value:
        # x[2] holds str(set_of_ids); parse it back into a set.
        d = eval(x[2])
        user_id_set.update(d)
    print(f'There are {len(user_id_set)} users in total.')
    cursor.close()
    conn.close()
with open('user_id_set.txt', 'wb') as F:
F.write(str(list(user_id_set)).encode(encoding='utf-8'))
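    # user_id_set.txt now holds a Python-literal list of user ids, which
    # write_user_info reads back with eval().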


def write_user_info(begin, end):
    """
    Fetch profile info for the users with index in [begin, end) of
    user_id_set.txt and insert it into the user table. The user table is
    assumed to exist already; this script never creates it.
    :param begin: begin index
    :param end: end index
    :return:
    """
    tic = time.time()
    # 1. Read the user id list.
    conn = sqlite3.connect('data.db')
    cursor = conn.cursor()
    with open('user_id_set.txt', 'rb') as F:
        d = F.readlines()
        res = eval(d[0])
        user_id_set = set(res[begin: end])
    toc1 = time.time()
    print(f'Part 1 finished, took {toc1 - tic} seconds.')
    # 2. Fetch each user's info and buffer it for the user table.
    insert_data = list()
    count = 0
    for user_id in user_id_set:
        try:
            user_info = get_user_info(user_id)
        except Exception:
            # Skip users whose profile cannot be fetched.
            continue
        insert_data.append(user_info)
        count += 1
        if count % 200 == 0:
            print(f'{count} users have been processed.')
    cursor.executemany("INSERT INTO user (user_id, user_name, gender, created_at, region_id, region_phrase, "
                       "team_id, introduction, timeline_total, post_total, reply_total, up_total, following_total, "
                       "followers_total) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", insert_data)
cursor.close()
conn.commit()
conn.close()
    toc2 = time.time()
    print(f'Part 2 took {toc2 - toc1} seconds.')


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Parse arguments.')
parser.add_argument('--begin', type=int, default=0,
help='begin index')
parser.add_argument('--end', type=int, default=610803,
help='end index')
args = parser.parse_args()
    # Fetch the team info and write it to the team table.
write_team_info()
# write_article_comment_user(5000, obtain_article=False, multi_process=False, if_continue=True)
# write_user_list()
# write_user_info(args.begin, args.end)
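
# Intended pipeline, inferred from the commented-out calls above (uncomment
# one step at a time): write_team_info -> write_article_comment_user ->
# write_user_list -> write_user_info. The --begin/--end flags shard the user
# id list so profiles can be crawled in slices, e.g.:
#   python crawl.py --begin 0 --end 100000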