-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathip.py
113 lines (101 loc) · 3.18 KB
/
ip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import random
import requests
import time
import pymongo
from bs4 import BeautifulSoup
# 爬取代理的URL地址,选择的是西刺代理
url_ip = "http://www.xicidaili.com/nt/"
# 设定等待时间
set_timeout = 5
# 爬取代理的页数,2表示爬取2页的ip地址
num = 2
# 代理的使用次数
count_time = 5
# 构造headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
# 测试ip的URL
url_for_test = 'http://httpbin.org/ip'
def scrawl_xici_ip(num):
'''
爬取代理ip地址,代理的url是西刺代理
'''
ip_list = []
for num_page in range(1, num):
url = url_ip + str(num_page)
response = requests.get(url, headers=headers)
if response.status_code == 200:
content = response.text
soup = BeautifulSoup(content, 'lxml')
trs = soup.find_all('tr')
for i in range(1, len(trs)):
tr = trs[i]
tds = tr.find_all('td')
ip_item = tds[1].text + ':' + tds[2].text
# print(ip_item)
ip_list.append(ip_item)
ip_set = set(ip_list) # 去掉可能重复的ip
ip_list = list(ip_set)
time.sleep(count_time) # 等待5秒
return ip_list
def ip_test(url_for_test, ip_info):
'''
测试爬取到的ip,测试成功则存入MongoDB
'''
for ip_for_test in ip_info:
# 设置代理
proxies = {
'http': 'http://' + ip_for_test,
'https': 'http://' + ip_for_test,
}
print(proxies)
try:
response = requests.get(url_for_test, headers=headers, proxies=proxies, timeout=10)
if response.status_code == 200:
ip = {'ip': ip_for_test}
print(response.text)
print('测试通过')
write_to_MongoDB(ip)
except Exception as e:
print(e)
continue
def write_to_MongoDB(proxies):
'''
将测试通过的ip存入MongoDB
'''
client = pymongo.MongoClient()
db = client.PROXY
collection = db.proxies
result = collection.insert(proxies)
print(result)
print('存储MongoDB成功')
def get_random_ip():
'''
随机取出一个ip
'''
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.PROXY
collection = db.proxies
items = collection.find()
length = items.count()
ind = random.randint(0, length - 1)
useful_proxy = items[ind]['ip'].replace('\n', '')
proxy = {
'http': 'http://' + useful_proxy,
'https': 'http://' + useful_proxy,
}
response = requests.get(url_for_test, headers=headers, proxies=proxy, timeout=10)
if response.status_code == 200:
print('取出的ip为:' + useful_proxy)
else:
print('此{ip}已失效'.format(useful_proxy))
collection.remove(useful_proxy)
print('已经从MongoDB移除')
get_random_ip()
def main():
ip_info = []
ip_info = scrawl_xici_ip(2)
sucess_proxy = ip_test(url_for_test, ip_info)
get_random_ip()
if __name__ == '__main__':
get_random_ip()