Skip to content

Commit

Permalink
Merge pull request #59 from bobobo80/bobo_dev
Browse files Browse the repository at this point in the history
redis及getFreeProxy修改
  • Loading branch information
jhao104 authored Aug 28, 2017
2 parents ba2f983 + 551b35a commit f4783d2
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 9 deletions.
18 changes: 15 additions & 3 deletions DB/RedisClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import json
import random
import redis

import sys

class RedisClient(object):
"""
Expand All @@ -33,7 +33,14 @@ def get(self):
:return:
"""
key = self.__conn.hgetall(name=self.name)
return random.choice(key.keys()) if key else None
# return random.choice(key.keys()) if key else None
# key.keys()在python3中返回dict_keys,不支持index,不能直接使用random.choice
# 另:python3中,redis返回为bytes,需要解码
rkey = random.choice(list(key.keys())) if key else None
if isinstance(rkey, bytes):
return rkey.decode('utf-8')
else:
return rkey
# return self.__conn.srandmember(name=self.name)

def put(self, key):
Expand Down Expand Up @@ -74,7 +81,12 @@ def inckey(self, key, value):
self.__conn.hincrby(self.name, key, value)

def getAll(self):
    """
    Return every key stored in the Redis hash ``self.name`` as text.

    The keys come from ``hgetall`` on the underlying connection. On
    Python 3 the client may return them as ``bytes``; if the connection
    was configured to decode responses they are already ``str``. Each key
    is therefore checked individually (the same ``isinstance`` test that
    ``get`` uses) instead of branching on the interpreter version.

    :return: list of keys; an empty list when the hash has no entries
    """
    keys = self.__conn.hgetall(self.name).keys()
    # Decode only what really is bytes: calling .decode() on a key that is
    # already str would raise AttributeError on Python 3.
    return [key.decode('utf-8') if isinstance(key, bytes) else key
            for key in keys]

def get_status(self):
Expand Down
24 changes: 18 additions & 6 deletions ProxyGetter/getFreeProxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ def freeProxySecond(proxy_number=100):
url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
proxy_number)
request = WebRequest()
html = request.get(url).content
# html = request.get(url).content
# content为未解码,text为解码后的字符串
html = request.get(url).text
for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html):
yield proxy

Expand Down Expand Up @@ -112,8 +114,18 @@ def freeProxyFifth():
page_url = url.format(page=page)
tree = getHtmlTree(page_url)
proxy_list = tree.xpath('//td[@class="ip"]')
# 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号
# 需要过滤掉<p style="display:none;">的内容
xpath_str = """.//*[not(contains(@style, 'display: none'))
and not(contains(@style, 'display:none'))
and not(contains(@class, 'port'))
]/text()
"""
for each_proxy in proxy_list:
yield ''.join(each_proxy.xpath('.//text()'))
# :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port
ip_addr = ''.join(each_proxy.xpath(xpath_str))
port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
yield '{}:{}'.format(ip_addr, port)


if __name__ == '__main__':
Expand All @@ -127,8 +139,8 @@ def freeProxyFifth():
# for e in gg.freeProxyThird():
# print e

for e in gg.freeProxyFourth():
print e
# for e in gg.freeProxyFourth():
# print(e)

# for e in gg.freeProxyFifth():
# print(e)
for e in gg.freeProxyFifth():
print(e)

0 comments on commit f4783d2

Please sign in to comment.