crawler_helper.py
# -*- coding:utf8 -*-
import requests
import logging
import time
import json
import os
import codecs
import re
import markdown
from bs4 import BeautifulSoup
import proxy_config
import proxy_helper
target_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Cache-Control': 'max-age=0',
# 'Connection': 'keep-alive',
# 'Cookie': 'ajs_user_id=null; ajs_group_id=null; _ga=GA1.2.517667667.1526523301; ajs_anonymous_id=%22513f05e7-2732-4124-aa74-3dfbee43c418%22; _mkto_trk=id:929-FJL-178&token:_mch-docker.com-1526523307626-79689; _gid=GA1.2.215116819.1526888391; mp_82c8a87cfaa9219dff0e89ef744d8357_mixpanel=%7B%22distinct_id%22%3A%20%221636be2346c860-00d080ff043e37-33657f07-13c680-1636be2346d3cd%22%2C%22mp_lib%22%3A%20%22Segment%3A%20web%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%7D',
'DNT': '1',
'Host': 'store.docker.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
def get_html(url, params=None, post_data=None):
    """
    fetch the html of a page through a random proxy
    :param url: the target url
    :param params: a dict of query parameters; the values must already be URL-encoded,
                   because they are concatenated onto the url as-is
    :param post_data: reserved for POST data (dict); currently unused, only GET is performed
    :return: the html text of the response
    """
# headers = proxy_config.target_headers
proxies = proxy_helper.get_available_random_proxy()
if params is not None:
url += "?"
for key in params.keys():
url += "%s=%s&" % (str(key), str(params[key]))
url = url[0:-1]
# print "url: [%s]" % url
# print "proxies: %s" % proxies
print "target url: [%s], proxies: [%s]" % (url, proxies)
# print "get_html: %s" % url
    response = requests.get(url, headers=None, proxies=proxies)
    # print "url: [%s]\nresponse.text: [%s]" % (url, response.text.encode("utf-8"))
    return response.text
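
# Usage sketch for get_html() (not called anywhere in this module): the URL and
# params below are hypothetical placeholders, and the param values must already
# be URL-encoded because get_html() concatenates them onto the url verbatim.
def _example_get_html():
    return get_html("https://store.docker.com/search", params={"q": "nginx", "page": "1"})
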
def get_json_data(url, params=None, post_data=None):
    """
    get JSON data by url and params (note: only for urls whose response body is JSON)
    :param url: the target url
    :param params: a dict of query parameters that have already been URL-encoded
    :param post_data: reserved for POST data (dict); currently unused, only GET is performed
    :return: a UTF-8 encoded JSON string
    """
json_data = get_html(url, params=params, post_data=post_data)
    # why a while loop?
    # because the url sometimes returns no response (network issues or server errors),
    # so the request is retried until a non-empty body comes back
# RetryTimes = crawler_config.retry_times
while json_data is None or json_data == "":
# RetryTimes -= 1
# if RetryTimes < 0:
# logging.error("Thread: [%s] Retry url: [%s], params: [%s] fail, RetryTimes: [%d]", thread_name, url, params, crawler_config.retry_times)
# json_data = None
# break
logging.warning("Retry url: [%s], params: [%s] post_data: [%s]", url, params, post_data)
        print "Warning: Retry %s" % url
time.sleep(2)
json_data = get_html(url, params=params, post_data=post_data)
# if json_data is None:
# thread_name = threading.current_thread
# logging.warning("Thread: [%s] Error url: [%s], params: [%s]", thread_name, url, params)
json_data = json_data.encode("utf-8")
return json_data
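
# Sketch of how get_json_data() is typically consumed: it returns a UTF-8 encoded
# JSON string, so the caller still needs json.loads() to obtain a dict or list.
# The URL here is a made-up placeholder, not an endpoint used by this project.
def _example_get_json_data():
    raw = get_json_data("https://example.com/api/repositories", params={"page": "1"})
    return json.loads(raw)
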
def check_json_format(raw_msg):
    """
    check whether a message can be decoded as JSON
    :param raw_msg: the raw message
    :return: True if raw_msg is a str containing valid JSON, otherwise False
    """
if isinstance(raw_msg, str):
try:
json.loads(raw_msg, encoding='utf-8')
except ValueError:
return False
return True
else:
return False
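
# Quick illustration of check_json_format(): only byte strings that parse as JSON
# pass; invalid JSON and non-str inputs (including unicode objects) return False.
def _example_check_json_format():
    assert check_json_format('{"name": "nginx"}')
    assert not check_json_format('not json')
    assert not check_json_format(123)
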
def find_and_create_dirs(dir_name):
    """
    ensure a directory exists, creating it (and any parents) if necessary
    :param dir_name: the directory path
    :return: the directory path
    """
    if not os.path.exists(dir_name):
os.makedirs(dir_name)
return dir_name
def write_object_to_file(file_name, target_object):
    """
    write the object to a file as JSON (an existing file will be overwritten)
    :param file_name: the path of the file to write
    :param target_object: the object to serialize
    :return: True on success, False otherwise
    """
dirname = os.path.dirname(file_name)
find_and_create_dirs(dirname)
try:
        # codecs.open keeps the handle unicode-aware, so output from ensure_ascii=False is written safely
        with codecs.open(file_name, "w", encoding="utf-8") as f:
            json.dump(target_object, f, ensure_ascii=False, indent=True, encoding="utf-8")
except Exception, e:
message = "Write [%s...] to file [%s] error: json.dump error" % (str(target_object)[0:10], file_name)
        logging.error("%s\n\t%s" % (message, str(e)))
return False
else:
# logging.info(get_time() + ": Write " + self.docker_save_path + doc_file_name + ".json")
logging.info("Write %s" % file_name)
return True
def read_object_from_file(file_name):
    """
    read a JSON object from file_name
    :param file_name: the path of the file to read
    :return: the decoded object, or None if the file is missing or not valid JSON
    """
    if not os.path.exists(file_name):
logging.error("Error read path: [%s]" % file_name)
return None
with open(file_name, 'r') as f:
try:
obj = json.load(f)
        except Exception:
            f.seek(0)  # json.load() may have consumed the file, rewind before logging a preview
            logging.error("Error json: [%s]" % f.read()[0:10])
            return None
return obj
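
# Round-trip sketch for the two JSON file helpers above; the path is a hypothetical
# example. write_object_to_file() creates any missing directories, and
# read_object_from_file() returns None when the file is absent or not valid JSON.
def _example_json_roundtrip():
    record = {"image": "nginx", "pull_count": 10}
    if write_object_to_file("./output/example/nginx.json", record):
        return read_object_from_file("./output/example/nginx.json")
    return None
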
def remove_links(text):
    """
    remove links, e-mail addresses and <br /> tags from the raw text
    :param text: the raw text
    :return: the cleaned text
    """
# remove links
links = re.findall('((http|ftp)s?://.*?(\n| |\0|\v|\t|\r|\f|\000|\b))', text)
for link in links:
url = link[0][:-1]
text = text.replace(url, '')
# remove emails
emails = re.findall('((\n| |\0|\v|\t|\r|\f|\000|\b)[0-9a-zA-Z.]+@[0-9a-zA-Z.]+\.[0-9a-zA-Z.]+(\n| |\0|\v|\t|\r|\f|\000|\b))', text)
for link in emails:
url = link[0][1:-1]
text = text.replace(url, '')
    # remove <br /> (a plain replace is enough, no regex needed)
    text = text.replace('<br />', '')
return text
def remove_blank_spaces(text):
    """
    remove newline characters and strip leading/trailing whitespace
    :param text: the raw text
    :return: the cleaned text
    """
    # remove \n (a plain replace is enough, no regex needed)
    text = text.replace('\n', '')
# remove special white space
text = str(text).strip(u' ')
text = str(text).strip()
return text
def remove_tags(html):
    """
    render markdown to html and strip <code>, <a>, <img> and <table> elements
    :param html: the markdown/html source
    :return: the remaining plain text
    """
    html = markdown.markdown(html, extensions=['markdown.extensions.extra'])
# print html
    soup = BeautifulSoup(html, 'html.parser')
    # drop code blocks, links, images and tables in a single pass
    for tag in soup.find_all(['code', 'a', 'img', 'table']):
        # print tag
        tag.decompose()
text = soup.get_text()
return text
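
# Minimal smoke test of the text-cleaning helpers; the sample text is made up
# purely for illustration. Run "python crawler_helper.py" to see the output.
if __name__ == "__main__":
    sample = ("Docs live at https://example.com/docs \n"
              "Contact: someone@example.com \n"
              "```\nprint 'this code block goes away'\n```\n"
              "Plain prose survives the cleaning.<br />")
    cleaned = remove_links(sample)
    cleaned = remove_tags(cleaned)
    cleaned = remove_blank_spaces(cleaned)
    print "cleaned sample: [%s]" % cleaned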