
Commit d5dfc21 (initial commit): init proxypool


41 files changed: +757 / -0 lines

.gitignore

Whitespace-only changes.

README.md

Lines changed: 31 additions & 0 deletions
### ProxyPool

#### 1. Configuration

##### 1. Database configuration

Edit the database settings in `conf.py`, then create the `proxypool` table:

```
sql> create table proxypool(ip char(20), port char(5), time char(30));
```

##### 2. Install the Python libraries

```
pip3 install pymysql
pip3 install requests
pip3 install bs4
pip3 install lxml
```

#### 2. Running

Crawl proxy IPs:

```
python3 proxypool.py
```

Verify the stored IPs on a schedule:

```
python3 verify.py
```
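`verify.py` is referenced here but is not among the files shown in this commit, so the following is only a minimal sketch of what the scheduled verification step might look like. The table layout comes from the `create table` statement above; the test URL, the ten-minute interval, and the function name are assumptions.

```
import time

import pymysql
import requests

from conf import MYSQL_CONF


def verify_once():
    # Hypothetical pass: request a test URL through each stored proxy and
    # delete the rows whose proxies no longer answer.
    conn = pymysql.connect(**MYSQL_CONF)
    with conn.cursor() as cur:
        cur.execute('select ip, port from proxypool')
        for ip, port in cur.fetchall():
            proxy = {'http': 'http://%s:%s' % (ip.strip(), port.strip())}
            try:
                requests.get('http://www.baidu.com', proxies=proxy, timeout=10)
            except Exception:
                cur.execute('delete from proxypool where ip=%s and port=%s', (ip, port))
    conn.commit()
    conn.close()


if __name__ == '__main__':
    while True:
        verify_once()
        time.sleep(600)  # interval is an assumption, not from the source
```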

conf.py

Lines changed: 7 additions & 0 deletions
```
MYSQL_CONF = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'passwd': '123456',
    'db': 'proxypool'
}
```
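The keys above match pymysql's connection parameters (`passwd` and `db` are the older aliases pymysql accepts), so the dict can be unpacked straight into `pymysql.connect`. As an illustration, a helper like the following could store one crawled `ip:port` entry in the table from the README; the helper name is hypothetical:

```
import time

import pymysql

from conf import MYSQL_CONF


def save_proxy(entry):
    # entry is an 'ip:port' string, the format every spider below produces.
    ip, port = entry.split(':')
    conn = pymysql.connect(**MYSQL_CONF)
    with conn.cursor() as cur:
        cur.execute('insert into proxypool(ip, port, time) values (%s, %s, %s)',
                    (ip, port, time.strftime('%Y-%m-%d %X', time.localtime())))
    conn.commit()
    conn.close()
```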

proxy_spiders/__init__.py

Whitespace-only changes.

proxy_spiders/mimvp/__init__.py

Whitespace-only changes.

Sample digit images for the captcha recognizer (two per digit, under `icon/<digit>/`):

proxy_spiders/mimvp/icon/0/1.jpg (413 Bytes)
proxy_spiders/mimvp/icon/0/6.jpg (413 Bytes)
proxy_spiders/mimvp/icon/1/0.jpg (399 Bytes)
proxy_spiders/mimvp/icon/1/5.jpg (399 Bytes)
proxy_spiders/mimvp/icon/2/2.jpg (404 Bytes)
proxy_spiders/mimvp/icon/2/4.jpg (404 Bytes)
proxy_spiders/mimvp/icon/3/139.jpg (415 Bytes)
proxy_spiders/mimvp/icon/3/41.jpg (415 Bytes)
proxy_spiders/mimvp/icon/4/115.jpg (405 Bytes)
proxy_spiders/mimvp/icon/4/19.jpg (405 Bytes)
proxy_spiders/mimvp/icon/5/28.jpg (412 Bytes)
proxy_spiders/mimvp/icon/5/3.jpg (412 Bytes)
proxy_spiders/mimvp/icon/6/33.jpg (415 Bytes)
proxy_spiders/mimvp/icon/6/8.jpg (415 Bytes)
proxy_spiders/mimvp/icon/7/11.jpg (405 Bytes)
proxy_spiders/mimvp/icon/7/40.jpg (405 Bytes)
proxy_spiders/mimvp/icon/8/35.jpg (412 Bytes)
proxy_spiders/mimvp/icon/8/44.jpg (412 Bytes)
proxy_spiders/mimvp/icon/9/37.jpg (414 Bytes)
proxy_spiders/mimvp/icon/9/39.jpg (414 Bytes)

proxy_spiders/mimvp/mimvp_proxy.py

Lines changed: 56 additions & 0 deletions
```
import requests
from bs4 import BeautifulSoup
from PIL import Image
import re
from .recognize import CaptchaRecognize, convert_image
import logging
import time

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def mimvp_proxy():
    urls = ['http://proxy.mimvp.com/free.php?proxy=in_hp', 'http://proxy.mimvp.com/free.php?proxy=out_hp',
            'http://proxy.mimvp.com/free.php?proxy=in_tp', 'http://proxy.mimvp.com/free.php?proxy=out_tp']
    result = []
    imageRecognize = CaptchaRecognize()
    for url in urls:
        try:
            html = requests.get(url, headers=headers, timeout=30).text
            table = BeautifulSoup(html, 'lxml').find('div', id='list').find('tbody')
        except Exception as e:
            print('[%s][Spider][mimvp]Error!' % get_current_time(), logging.exception(e))
            continue
        # mimvp lists the IP as text but hides the port in a small image,
        # so pair each IP with the URL of its port image.
        table = re.findall(r'(\d+\.\d+\.\d+\.\d+).*?img src="(.*?)"', str(table))
        for item in table:
            try:
                ip = item[0]
                imgurl = 'http://proxy.mimvp.com/' + item[1].replace('amp;', '')
                image = getimage(imgurl)
                # Recognize the port digits from the image.
                port_str_list = imageRecognize.recognise(image)
                port = ''.join(guess[1] for guess in port_str_list)
                result.append(ip + ':' + port)
            except Exception:
                continue
    print('[%s][Spider][mimvp]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


def getimage(imgurl):
    # Download the port image to a temp file, then binarize it for recognition.
    with open('./temp.png', 'wb') as img:
        content = requests.get(imgurl, headers=headers, timeout=20).content
        img.write(content)
    image = Image.open('./temp.png')
    image = convert_image(image)
    return image
```
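Because the module imports `recognize` relatively and `loadSet` resolves paths from the repository top, a quick standalone check would run from the repo root; the invocation below is an assumption, not part of the commit:

```
# e.g. put this in a scratch script at the repo root and run it with python3
from proxy_spiders.mimvp.mimvp_proxy import mimvp_proxy

for entry in mimvp_proxy():
    print(entry)  # each entry is an 'ip:port' string
```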

proxy_spiders/mimvp/recognize.py

Lines changed: 129 additions & 0 deletions
```
# coding:utf-8

import os
from PIL import Image
import math


def convert_image(image):
    # Binarize: convert to grayscale, then map every pixel darker than 120
    # to black on a white background.
    image = image.convert('L')
    image2 = Image.new('L', image.size, 255)
    for x in range(image.size[0]):
        for y in range(image.size[1]):
            pix = image.getpixel((x, y))
            if pix < 120:
                image2.putpixel((x, y), 0)
    return image2


def cut_image(image):
    # Scan columns left to right; a run of columns containing black pixels
    # is one digit. Return one cropped image per digit.
    inletter = False
    foundletter = False
    letters = []
    start = 0
    end = 0
    for x in range(image.size[0]):
        for y in range(image.size[1]):
            pix = image.getpixel((x, y))
            if pix == 0:
                inletter = True
        if foundletter == False and inletter == True:
            foundletter = True
            start = x
        if foundletter == True and inletter == False:
            end = x
            letters.append((start, end))
            foundletter = False
        inletter = False
    images = []
    for letter in letters:
        img = image.crop((letter[0], 0, letter[1], image.size[1]))
        images.append(img)
    return images


def buildvector(image):
    # Flatten the pixel data into an {index: value} vector.
    result = {}
    count = 0
    for i in image.getdata():
        result[count] = i
        count += 1
    return result


class CaptchaRecognize:
    def __init__(self):
        self.letters = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.loadSet()

    def loadSet(self):
        # Load the labeled digit samples from icon/<digit>/ as vectors.
        self.imgset = []
        for letter in self.letters:
            temp = []
            for img in os.listdir('./proxy_spiders/mimvp/icon/%s' % (letter)):
                temp.append(buildvector(Image.open('./proxy_spiders/mimvp/icon/%s/%s' % (letter, img))))
            self.imgset.append({letter: temp})

    # Compute the magnitude of a vector.
    def magnitude(self, concordance):
        total = 0
        for word, count in concordance.items():
            total += count ** 2
        return math.sqrt(total)

    # Compute the cosine of the angle between two vectors.
    def relation(self, concordance1, concordance2):
        topvalue = 0
        for word, count in concordance1.items():
            if word in concordance2:
                topvalue += count * concordance2[word]
        return topvalue / (self.magnitude(concordance1) * self.magnitude(concordance2))

    def recognise(self, image):
        image = convert_image(image)
        self.images = cut_image(image)
        vectors = []
        for img in self.images:
            vectors.append(buildvector(img))
        result = []
        for vector in vectors:
            guess = []
            for image in self.imgset:
                for letter, temp in image.items():
                    # Score this digit as the average similarity over its samples.
                    relevance = 0
                    num = 0
                    for img in temp:
                        relevance += self.relation(vector, img)
                        num += 1
                    relevance = relevance / num
                    guess.append((relevance, letter))
            # Keep the best-scoring digit for this position.
            guess.sort(reverse=True)
            result.append(guess[0])
        return result


if __name__ == '__main__':
    # Cut digits out of captcha images and file each one under the digit
    # the recognizer guessed, for building or checking the sample set.
    imageRecognize = CaptchaRecognize()
    count = 0
    for imgfile in os.listdir('images'):
        image = Image.open('images/' + imgfile)
        result = imageRecognize.recognise(image)
        images = imageRecognize.images
        for index in range(len(result)):
            try:
                os.mkdir('result/' + result[index][1])
            except OSError:
                pass
            images[index].save('result/' + result[index][1] + '/' + str(count) + '.jpg')
            count += 1
    '''
    count = 0
    for imgfile in os.listdir('img'):
        image = Image.open('img/' + imgfile)
        image = convert_image(image)
        images = cut_image(image)
        for img in images:
            img.save('icon/%s.jpg' % count)
            count += 1
    '''
```
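`recognise` classifies each cut-out digit by the cosine similarity that `relation` computes between flat pixel vectors. A toy demonstration of that measure on hand-made vectors (all values invented purely for illustration):

```
import math


def magnitude(v):
    return math.sqrt(sum(x ** 2 for x in v.values()))


def relation(a, b):
    # Cosine similarity: dot product divided by the product of magnitudes.
    dot = sum(av * b[k] for k, av in a.items() if k in b)
    return dot / (magnitude(a) * magnitude(b))


# Two tiny "images" as {pixel_index: value} dicts, like buildvector() makes
# (0 = black ink, 255 = white background).
sample = {0: 0, 1: 255, 2: 0, 3: 255}
near_copy = {0: 0, 1: 255, 2: 0, 3: 250}
inverse = {0: 255, 1: 0, 2: 255, 3: 0}

print(relation(sample, near_copy))  # ~1.0: almost the same glyph
print(relation(sample, inverse))    # 0.0: no overlapping pixel pattern
```

Because the similarity is averaged over every stored sample of a digit before the best guess is kept, a single noisy sample in `icon/` degrades the score only slightly.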

proxy_spiders/mimvp/temp.png


proxy_spiders/spider_66ip.py

Lines changed: 41 additions & 0 deletions
```
import requests
import re
import logging
import time
import threading

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def crawl():
    # The 66ip API endpoint returns up to 600 proxies as plain text.
    urls = [
        'http://www.66ip.cn/nmtq.php?getnum=600&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=0&proxytype=0&api=66ip']
    result = []
    for pageurl in urls:
        try:
            html = requests.get(pageurl, headers=headers, timeout=30).text
        except Exception as e:
            print('[%s][Spider][66ip]Error:' % get_current_time(), logging.exception(e))
            continue
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)
    print('[%s][Spider][66ip]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


class SpiderIP66(threading.Thread):
    def __init__(self):
        super(SpiderIP66, self).__init__()

    def run(self):
        # Stash the crawl output so callers can read it after join().
        self.result = crawl()
```

proxy_spiders/spider_89ip.py

Lines changed: 40 additions & 0 deletions
```
import requests
import re
import logging
import time
import threading

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def crawl():
    # The 89ip extraction endpoint returns up to 1000 proxies as plain text.
    urls = ['http://www.89ip.cn/tiqu.php?sxb=&tqsl=1000&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1']
    result = []
    for pageurl in urls:
        try:
            html = requests.get(pageurl, headers=headers, timeout=30).text
        except Exception as e:
            print('[%s][Spider][89ip]Error:' % get_current_time(), logging.exception(e))
            continue
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)
    print('[%s][Spider][89ip]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


class SpiderIP89(threading.Thread):
    def __init__(self):
        super(SpiderIP89, self).__init__()

    def run(self):
        # Stash the crawl output so callers can read it after join().
        self.result = crawl()
```

proxy_spiders/spider_data5u.py

Lines changed: 48 additions & 0 deletions
```
import requests
import logging
import time
import threading
from bs4 import BeautifulSoup

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0"}


def get_current_time():
    timenow = time.strftime('%Y-%m-%d %X', time.localtime())
    return timenow


def crawl():
    urls = ['http://www.data5u.com/']
    result = []
    for url in urls:
        try:
            html = requests.get(url, headers=headers, timeout=30).text
            table = BeautifulSoup(html, 'lxml').find('div', {'class': 'wlist'}).find_all('ul', {"class": 'l2'})
        except Exception as e:
            print('[%s][Spider][data5u]Error:' % get_current_time(), logging.exception(e))
            continue
        # The first row is the table header, so skip it.
        for item in table[1:]:
            try:
                spans = item.find_all('span')
                ip = spans[0].get_text()
                port = spans[1].get_text()
            except Exception:
                continue
            line = ip + ':' + port
            result.append(line.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', ''))
    print('[%s][Spider][data5u]OK!' % get_current_time(), 'Crawled IP Count:', len(result))
    return result


class SpiderData5u(threading.Thread):
    def __init__(self):
        super(SpiderData5u, self).__init__()

    def run(self):
        # Stash the crawl output so callers can read it after join().
        self.result = crawl()
```
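Each spider subclasses `threading.Thread` and leaves its crawl output in `self.result`, so a collector can run all sources in parallel and merge them after `join()`. `proxypool.py` is not among the files shown here; the following is only a sketch of how such a collector might tie the spiders together:

```
from proxy_spiders.spider_66ip import SpiderIP66
from proxy_spiders.spider_89ip import SpiderIP89
from proxy_spiders.spider_data5u import SpiderData5u


def collect():
    spiders = [SpiderIP66(), SpiderIP89(), SpiderData5u()]
    for spider in spiders:
        spider.start()
    for spider in spiders:
        spider.join()  # run() has set spider.result by the time join() returns
    # Merge and deduplicate the 'ip:port' strings from every source.
    proxies = set()
    for spider in spiders:
        proxies.update(spider.result)
    return sorted(proxies)


if __name__ == '__main__':
    print(len(collect()), 'unique proxies crawled')
```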
