
Commit d5dfc21 (initial commit): init proxypool


41 files changed: +757 / -0 lines

.gitignore

Whitespace-only changes.

README.md

Lines changed: 31 additions & 0 deletions
### ProxyPool

#### 1. Configuration

##### 1. Database configuration

Edit the database settings in `conf.py`, then create the `proxypool` table:

```
sql> create table proxypool(ip char(20), port char(5), time char(30));
```

##### 2. Install the Python libraries

```
pip3 install pymysql
pip3 install requests
pip3 install bs4
pip3 install lxml
```

#### 2. Running

Crawl proxy IPs:

```
python3 proxypool.py
```

Verify the stored IPs on a schedule:

```
python3 verify.py
```
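`verify.py` is referenced here but is not among the files shown in this commit, so the following is only a minimal sketch of what the scheduled verification step might look like. The table layout comes from the `create table` statement above; the test URL, the ten-minute interval, and the function name are assumptions.

```
import time

import pymysql
import requests

from conf import MYSQL_CONF


def verify_once():
    # Hypothetical pass: request a test URL through each stored proxy and
    # delete the rows whose proxies no longer answer.
    conn = pymysql.connect(**MYSQL_CONF)
    with conn.cursor() as cur:
        cur.execute('select ip, port from proxypool')
        for ip, port in cur.fetchall():
            proxy = {'http': 'http://%s:%s' % (ip.strip(), port.strip())}
            try:
                requests.get('http://www.baidu.com', proxies=proxy, timeout=10)
            except Exception:
                cur.execute('delete from proxypool where ip=%s and port=%s', (ip, port))
    conn.commit()
    conn.close()


if __name__ == '__main__':
    while True:
        verify_once()
        time.sleep(600)  # interval is an assumption, not from the source
```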

conf.py

Lines changed: 7 additions & 0 deletions
```
MYSQL_CONF = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'passwd': '123456',
    'db': 'proxypool'
}
```
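The keys above match pymysql's connection parameters (`passwd` and `db` are the older aliases pymysql accepts), so the dict can be unpacked straight into `pymysql.connect`. As an illustration, a helper like the following could store one crawled `ip:port` entry in the table from the README; the helper name is hypothetical:

```
import time

import pymysql

from conf import MYSQL_CONF


def save_proxy(entry):
    # entry is an 'ip:port' string, the format every spider below produces.
    ip, port = entry.split(':')
    conn = pymysql.connect(**MYSQL_CONF)
    with conn.cursor() as cur:
        cur.execute('insert into proxypool(ip, port, time) values (%s, %s, %s)',
                    (ip, port, time.strftime('%Y-%m-%d %X', time.localtime())))
    conn.commit()
    conn.close()
```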

proxy_spiders/__init__.py

Whitespace-only changes.

proxy_spiders/mimvp/__init__.py

Whitespace-only changes.

Sample digit images for the captcha recognizer (two per digit, under `icon/<digit>/`):

proxy_spiders/mimvp/icon/0/1.jpg (413 Bytes)
proxy_spiders/mimvp/icon/0/6.jpg (413 Bytes)
proxy_spiders/mimvp/icon/1/0.jpg (399 Bytes)
proxy_spiders/mimvp/icon/1/5.jpg (399 Bytes)
proxy_spiders/mimvp/icon/2/2.jpg (404 Bytes)
proxy_spiders/mimvp/icon/2/4.jpg (404 Bytes)
proxy_spiders/mimvp/icon/3/139.jpg (415 Bytes)
proxy_spiders/mimvp/icon/3/41.jpg (415 Bytes)
proxy_spiders/mimvp/icon/4/115.jpg (405 Bytes)
proxy_spiders/mimvp/icon/4/19.jpg (405 Bytes)
proxy_spiders/mimvp/icon/5/28.jpg (412 Bytes)
proxy_spiders/mimvp/icon/5/3.jpg (412 Bytes)
proxy_spiders/mimvp/icon/6/33.jpg (415 Bytes)
proxy_spiders/mimvp/icon/6/8.jpg (415 Bytes)
proxy_spiders/mimvp/icon/7/11.jpg (405 Bytes)
proxy_spiders/mimvp/icon/7/40.jpg (405 Bytes)
proxy_spiders/mimvp/icon/8/35.jpg (412 Bytes)
proxy_spiders/mimvp/icon/8/44.jpg (412 Bytes)
proxy_spiders/mimvp/icon/9/37.jpg (414 Bytes)
proxy_spiders/mimvp/icon/9/39.jpg (414 Bytes)

proxy_spiders/mimvp/mimvp_proxy.py

Lines changed: 56 additions & 0 deletions
```
import requests
from bs4 import BeautifulSoup
from PIL import Image
import re
from .recognize import CaptchaRecognize, convert_image
import logging
import time

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def mimvp_proxy():
    urls = ['http://proxy.mimvp.com/free.php?proxy=in_hp', 'http://proxy.mimvp.com/free.php?proxy=out_hp',
            'http://proxy.mimvp.com/free.php?proxy=in_tp', 'http://proxy.mimvp.com/free.php?proxy=out_tp']
    result = []
    imageRecognize = CaptchaRecognize()
    for url in urls:
        try:
            html = requests.get(url, headers=headers, timeout=30).text
            table = BeautifulSoup(html, 'lxml').find('div', id='list').find('tbody')
        except Exception as e:
            print('[%s][Spider][mimvp]Error!' % get_current_time(), logging.exception(e))
            continue
        # mimvp lists the IP as text but hides the port in a small image,
        # so pair each IP with the URL of its port image.
        table = re.findall(r'(\d+\.\d+\.\d+\.\d+).*?img src="(.*?)"', str(table))
        for item in table:
            try:
                ip = item[0]
                imgurl = 'http://proxy.mimvp.com/' + item[1].replace('amp;', '')
                image = getimage(imgurl)
                # Recognize the port digits from the image.
                port_str_list = imageRecognize.recognise(image)
                port = ''.join(guess[1] for guess in port_str_list)
                result.append(ip + ':' + port)
            except Exception:
                continue
    print('[%s][Spider][mimvp]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


def getimage(imgurl):
    # Download the port image to a temp file, then binarize it for recognition.
    with open('./temp.png', 'wb') as img:
        content = requests.get(imgurl, headers=headers, timeout=20).content
        img.write(content)
    image = Image.open('./temp.png')
    image = convert_image(image)
    return image
```
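Because the module imports `recognize` relatively and `loadSet` resolves paths from the repository top, a quick standalone check would run from the repo root; the invocation below is an assumption, not part of the commit:

```
# e.g. put this in a scratch script at the repo root and run it with python3
from proxy_spiders.mimvp.mimvp_proxy import mimvp_proxy

for entry in mimvp_proxy():
    print(entry)  # each entry is an 'ip:port' string
```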

proxy_spiders/mimvp/recognize.py

Lines changed: 129 additions & 0 deletions
```
# coding:utf-8

import os
from PIL import Image
import math


def convert_image(image):
    # Binarize: convert to grayscale, then map every pixel darker than 120
    # to black on a white background.
    image = image.convert('L')
    image2 = Image.new('L', image.size, 255)
    for x in range(image.size[0]):
        for y in range(image.size[1]):
            pix = image.getpixel((x, y))
            if pix < 120:
                image2.putpixel((x, y), 0)
    return image2


def cut_image(image):
    # Scan columns left to right; a run of columns containing black pixels
    # is one digit. Return one cropped image per digit.
    inletter = False
    foundletter = False
    letters = []
    start = 0
    end = 0
    for x in range(image.size[0]):
        for y in range(image.size[1]):
            pix = image.getpixel((x, y))
            if pix == 0:
                inletter = True
        if foundletter == False and inletter == True:
            foundletter = True
            start = x
        if foundletter == True and inletter == False:
            end = x
            letters.append((start, end))
            foundletter = False
        inletter = False
    images = []
    for letter in letters:
        img = image.crop((letter[0], 0, letter[1], image.size[1]))
        images.append(img)
    return images


def buildvector(image):
    # Flatten the pixel data into an {index: value} vector.
    result = {}
    count = 0
    for i in image.getdata():
        result[count] = i
        count += 1
    return result


class CaptchaRecognize:
    def __init__(self):
        self.letters = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.loadSet()

    def loadSet(self):
        # Load the labeled digit samples from icon/<digit>/ as vectors.
        self.imgset = []
        for letter in self.letters:
            temp = []
            for img in os.listdir('./proxy_spiders/mimvp/icon/%s' % (letter)):
                temp.append(buildvector(Image.open('./proxy_spiders/mimvp/icon/%s/%s' % (letter, img))))
            self.imgset.append({letter: temp})

    # Compute the magnitude of a vector.
    def magnitude(self, concordance):
        total = 0
        for word, count in concordance.items():
            total += count ** 2
        return math.sqrt(total)

    # Compute the cosine of the angle between two vectors.
    def relation(self, concordance1, concordance2):
        topvalue = 0
        for word, count in concordance1.items():
            if word in concordance2:
                topvalue += count * concordance2[word]
        return topvalue / (self.magnitude(concordance1) * self.magnitude(concordance2))

    def recognise(self, image):
        image = convert_image(image)
        self.images = cut_image(image)
        vectors = []
        for img in self.images:
            vectors.append(buildvector(img))
        result = []
        for vector in vectors:
            guess = []
            for image in self.imgset:
                for letter, temp in image.items():
                    # Score this digit as the average similarity over its samples.
                    relevance = 0
                    num = 0
                    for img in temp:
                        relevance += self.relation(vector, img)
                        num += 1
                    relevance = relevance / num
                    guess.append((relevance, letter))
            # Keep the best-scoring digit for this position.
            guess.sort(reverse=True)
            result.append(guess[0])
        return result


if __name__ == '__main__':
    # Cut digits out of captcha images and file each one under the digit
    # the recognizer guessed, for building or checking the sample set.
    imageRecognize = CaptchaRecognize()
    count = 0
    for imgfile in os.listdir('images'):
        image = Image.open('images/' + imgfile)
        result = imageRecognize.recognise(image)
        images = imageRecognize.images
        for index in range(len(result)):
            try:
                os.mkdir('result/' + result[index][1])
            except OSError:
                pass
            images[index].save('result/' + result[index][1] + '/' + str(count) + '.jpg')
            count += 1
    '''
    count = 0
    for imgfile in os.listdir('img'):
        image = Image.open('img/' + imgfile)
        image = convert_image(image)
        images = cut_image(image)
        for img in images:
            img.save('icon/%s.jpg' % count)
            count += 1
    '''
```
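`recognise` classifies each cut-out digit by the cosine similarity that `relation` computes between flat pixel vectors. A toy demonstration of that measure on hand-made vectors (all values invented purely for illustration):

```
import math


def magnitude(v):
    return math.sqrt(sum(x ** 2 for x in v.values()))


def relation(a, b):
    # Cosine similarity: dot product divided by the product of magnitudes.
    dot = sum(av * b[k] for k, av in a.items() if k in b)
    return dot / (magnitude(a) * magnitude(b))


# Two tiny "images" as {pixel_index: value} dicts, like buildvector() makes
# (0 = black ink, 255 = white background).
sample = {0: 0, 1: 255, 2: 0, 3: 255}
near_copy = {0: 0, 1: 255, 2: 0, 3: 250}
inverse = {0: 255, 1: 0, 2: 255, 3: 0}

print(relation(sample, near_copy))  # ~1.0: almost the same glyph
print(relation(sample, inverse))    # 0.0: no overlapping pixel pattern
```

Because the similarity is averaged over every stored sample of a digit before the best guess is kept, a single noisy sample in `icon/` degrades the score only slightly.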

proxy_spiders/mimvp/temp.png


proxy_spiders/spider_66ip.py

Lines changed: 41 additions & 0 deletions
```
import requests
import re
import logging
import time
import threading

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def crawl():
    # The 66ip API endpoint returns up to 600 proxies as plain text.
    urls = [
        'http://www.66ip.cn/nmtq.php?getnum=600&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=0&proxytype=0&api=66ip']
    result = []
    for pageurl in urls:
        try:
            html = requests.get(pageurl, headers=headers, timeout=30).text
        except Exception as e:
            print('[%s][Spider][66ip]Error:' % get_current_time(), logging.exception(e))
            continue
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)
    print('[%s][Spider][66ip]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


class SpiderIP66(threading.Thread):
    def __init__(self):
        super(SpiderIP66, self).__init__()

    def run(self):
        # Stash the crawl output so callers can read it after join().
        self.result = crawl()
```

proxy_spiders/spider_89ip.py

Lines changed: 40 additions & 0 deletions
```
import requests
import re
import logging
import time
import threading

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def crawl():
    # The 89ip extraction endpoint returns up to 1000 proxies as plain text.
    urls = ['http://www.89ip.cn/tiqu.php?sxb=&tqsl=1000&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1']
    result = []
    for pageurl in urls:
        try:
            html = requests.get(pageurl, headers=headers, timeout=30).text
        except Exception as e:
            print('[%s][Spider][89ip]Error:' % get_current_time(), logging.exception(e))
            continue
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)
    print('[%s][Spider][89ip]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


class SpiderIP89(threading.Thread):
    def __init__(self):
        super(SpiderIP89, self).__init__()

    def run(self):
        # Stash the crawl output so callers can read it after join().
        self.result = crawl()
```

proxy_spiders/spider_data5u.py

Lines changed: 48 additions & 0 deletions
```
import requests
import logging
import time
import threading
from bs4 import BeautifulSoup

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def crawl():
    urls = ['http://www.data5u.com/']
    result = []
    for url in urls:
        try:
            html = requests.get(url, headers=headers, timeout=30).text
            table = BeautifulSoup(html, 'lxml').find('div', {'class': 'wlist'}).find_all('ul', {"class": 'l2'})
        except Exception as e:
            print('[%s][Spider][data5u]Error:' % get_current_time(), logging.exception(e))
            continue
        # The first row is the table header, so skip it.
        for item in table[1:]:
            try:
                spans = item.find_all('span')
                ip = spans[0].get_text()
                port = spans[1].get_text()
            except Exception:
                continue
            line = ip + ':' + port
            result.append(line.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', ''))
    print('[%s][Spider][data5u]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


class SpiderData5u(threading.Thread):
    def __init__(self):
        super(SpiderData5u, self).__init__()

    def run(self):
        # Stash the crawl output so callers can read it after join().
        self.result = crawl()
```
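Each spider subclasses `threading.Thread` and leaves its crawl output in `self.result`, so a collector can run all sources in parallel and merge them after `join()`. `proxypool.py` is not among the files shown here; the following is only a sketch of how such a collector might tie the spiders together:

```
from proxy_spiders.spider_66ip import SpiderIP66
from proxy_spiders.spider_89ip import SpiderIP89
from proxy_spiders.spider_data5u import SpiderData5u


def collect():
    spiders = [SpiderIP66(), SpiderIP89(), SpiderData5u()]
    for spider in spiders:
        spider.start()
    for spider in spiders:
        spider.join()  # run() has set spider.result by the time join() returns
    # Merge and deduplicate the 'ip:port' strings from every source.
    proxies = set()
    for spider in spiders:
        proxies.update(spider.result)
    return sorted(proxies)


if __name__ == '__main__':
    print(len(collect()), 'unique proxies crawled')
```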
