Skip to content

Commit

Permalink
Merge pull request #516 from jhao104/release-2.1.1
Browse files Browse the repository at this point in the history
Release 2.1.1
  • Loading branch information
jhao104 authored Nov 4, 2020
2 parents 171a5f1 + 97dc41a commit 76689de
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 102 deletions.
28 changes: 14 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,20 +190,20 @@ PROXY_FETCHER = [

目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)):

| 厂商名称 | 状态 | 更新速度 | 可用率 | 是否被墙 | 地址 |
| ----- | ---- | -------- | ------ | --------- | ----- |
| 无忧代理 | 可用 | 几分钟一次 | * | | [地址](http://www.data5u.com/free/index.html) |
| 66代理 | 可用 | 更新很慢 | * | | [地址](http://www.66ip.cn/) |
| 西刺代理 | 可用 | 几分钟一次 | * | | [地址](http://www.xicidaili.com)|
| 全网代理 | 可用 | 几分钟一次 | * | | [地址](http://www.goubanjia.com/)|
| ~~训代理~~ | 已关闭免费代理 | * | * | | [地址](http://www.xdaili.cn/)|
| 快代理 | 可用 |几分钟一次| * | | [地址](https://www.kuaidaili.com/)|
| 云代理 | 可用 |几分钟一次| * | | [地址](http://www.ip3366.net/)|
| IP海 | 可用 |几小时一次| * | | [地址](http://www.iphai.com/)|
| 免费IP代理库 | 可用 || * | | [地址](http://ip.jiangxianli.com/)|
| 中国IP地址 | 可用 |几分钟一次| * | | [地址](http://cn-proxy.com/)|
| Proxy List | 可用 |几分钟一次| * | | [地址](https://proxy-list.org/chinese/index.php)|
| ProxyList+ | 可用 |几分钟一次| * || [地址](https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1)|
| 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 |
| --------- | ---- | -------- | ------ | ----- | ------- |
| 无忧代理 | | | * | [地址](http://www.data5u.com/) | `freeProxy01` |
| 66代理 | | ★★ | * | [地址](http://www.66ip.cn/) | `freeProxy02` |
| ~~西刺代理~~ | ~~已关闭~~ | —— | —— | ~~[地址](http://www.xicidaili.com)~~| ~~`freeProxy03`~~|
| 全网代理 | | | * | [地址](http://www.goubanjia.com/)| `freeProxy04` |
| 快代理 || | * | [地址](https://www.kuaidaili.com/)| `freeProxy05` |
| 代理盒子 | | ★★★ | * | [地址](https://proxy.coderbusy.com/)| `freeProxy06` |
| 云代理 | | | * | [地址](http://www.ip3366.net/)| `freeProxy07` |
| ~~IP海~~ |~~已关闭~~| —— | —— | ~~[地址](http://www.iphai.com/)~~|~~`freeProxy08`~~|
| 免费代理库 | | | * | [地址](http://ip.jiangxianli.com/)| `freeProxy09` |
| 89代理 | | | * | [地址](http://cn-proxy.com/)| `freeProxy13` |
| 西拉代理 | | ★★ | * | [地址](https://proxy-list.org/chinese/index.php)| `freeProxy14` |


如果还有其他好的免费代理网站, 可以提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。

Expand Down
3 changes: 3 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ ChangeLog
------------------

1. Fix Bug `#493`_, 新增时区配置; (2020-08-12)
2. 修复 **66代理** 采集; (2020-11-04)
3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04)
4. 新增 **代理盒子** 免费源; (2020-11-04)

.. _#493: https://github.com/jhao104/proxy_pool/issues/493

Expand Down
112 changes: 32 additions & 80 deletions fetcher/proxyFetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,57 +54,22 @@ def freeProxy01():
print(e)

@staticmethod
def freeProxy02(count=20):
def freeProxy02():
"""
代理66 http://www.66ip.cn/
:param count: 提取数量
:return:
"""
urls = [
"http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=",
"http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s"
"tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip"
]

try:
import execjs
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Accept': '*/*',
'Connection': 'keep-alive',
'Accept-Language': 'zh-CN,zh;q=0.8'}
session = requests.session()
src = session.get("http://www.66ip.cn/", headers=headers).text
src = src.split("</script>")[0] + '}'
src = src.replace("<script>", "function test() {")
src = src.replace("while(z++)try{eval(", ';var num=10;while(z++)try{var tmp=')
src = src.replace(");break}", ";num--;if(tmp.search('cookie') != -1 | num<0){return tmp}}")
ctx = execjs.compile(src)
src = ctx.call("test")
src = src[src.find("document.cookie="): src.find("};if((")]
src = src.replace("document.cookie=", "")
src = "function test() {var window={}; return %s }" % src
cookie = execjs.compile(src).call('test')
js_cookie = cookie.split(";")[0].split("=")[-1]
except Exception as e:
print(e)
return
url = "http://www.66ip.cn/mo.php"

for url in urls:
try:
html = session.get(url.format(count), cookies={"__jsl_clearance": js_cookie}, headers=headers).text
ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}", html)
for ip in ips:
yield ip.strip()
except Exception as e:
print(e)
pass
resp = WebRequest().get(url, timeout=10)
proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})', resp.text)
for proxy in proxies:
yield proxy

@staticmethod
def freeProxy03(page_count=1):
"""
西刺代理 http://www.xicidaili.com
西刺代理 http://www.xicidaili.com 网站已关闭
:return:
"""
url_list = [
Expand All @@ -125,7 +90,7 @@ def freeProxy03(page_count=1):
@staticmethod
def freeProxy04():
"""
guobanjia http://www.goubanjia.com/
全网代理 http://www.goubanjia.com/
:return:
"""
url = "http://www.goubanjia.com/"
Expand All @@ -138,25 +103,22 @@ def freeProxy04():
and not(contains(@class, 'port'))
]/text()
"""

# port是class属性值加密得到
def _parse_port(port_element):
port_list = []
for letter in port_element:
port_list.append(str("ABCDEFGHIZ".find(letter)))
_port = "".join(port_list)
return int(_port) >> 0x3

for each_proxy in proxy_list:
try:
# :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port
ip_addr = ''.join(each_proxy.xpath(xpath_str))

# HTML中的port是随机数,真正的端口编码在class后面的字母中。
# 比如这个:
# <span class="port CFACE">9054</span>
# CFACE解码后对应的是3128。
port = 0
for _ in each_proxy.xpath(".//span[contains(@class, 'port')]"
"/attribute::class")[0]. \
replace("port ", ""):
port *= 10
port += (ord(_) - ord('A'))
port /= 8

port_str = each_proxy.xpath(".//span[contains(@class, 'port')]/@class")[0].split()[-1]
port = _parse_port(port_str.strip())
yield '{}:{}'.format(ip_addr, int(port))
except Exception as e:
except Exception:
pass

@staticmethod
Expand All @@ -183,15 +145,18 @@ def freeProxy05(page_count=1):
@staticmethod
def freeProxy06():
"""
码农代理 https://proxy.coderbusy.com/
代理盒子 https://proxy.coderbusy.com/
:return:
"""
urls = ['https://proxy.coderbusy.com/']
urls = ['https://proxy.coderbusy.com/zh-hans/ops/country/cn.html']
for url in urls:
tree = WebRequest().get(url).tree
proxy_list = tree.xpath('.//table//tr')
for tr in proxy_list[1:]:
yield ':'.join(tr.xpath('./td/text()')[0:2])
proxy = '{}:{}'.format("".join(tr.xpath("./td[1]/text()")).strip(),
"".join(tr.xpath("./td[2]//text()")).strip())
if proxy:
yield proxy

@staticmethod
def freeProxy07():
Expand Down Expand Up @@ -282,24 +247,6 @@ def freeProxy09(page_count=1):

@staticmethod
def freeProxy13(max_page=2):
"""
http://www.qydaili.com/free/?action=china&page=1
齐云代理
:param max_page:
:return:
"""
base_url = 'http://www.qydaili.com/free/?action=china&page='
for page in range(1, max_page + 1):
url = base_url + str(page)
r = WebRequest().get(url, timeout=10)
proxies = re.findall(
r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td.*?>(\d+)</td>',
r.text)
for proxy in proxies:
yield ':'.join(proxy)

@staticmethod
def freeProxy14(max_page=2):
"""
http://www.89ip.cn/index.html
89免费代理
Expand All @@ -317,7 +264,12 @@ def freeProxy14(max_page=2):
yield ':'.join(proxy)

@staticmethod
def freeProxy15():
def freeProxy14():
"""
http://www.xiladaili.com/
西拉代理
:return:
"""
urls = ['http://www.xiladaili.com/putong/',
"http://www.xiladaili.com/gaoni/",
"http://www.xiladaili.com/http/",
Expand Down
7 changes: 3 additions & 4 deletions setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,16 @@
# ###### config the proxy fetch function ######
PROXY_FETCHER = [
"freeProxy01",
# "freeProxy02",
"freeProxy02",
# "freeProxy03",
"freeProxy04",
"freeProxy05",
# "freeProxy06",
"freeProxy06",
"freeProxy07",
# "freeProxy08",
"freeProxy09",
"freeProxy13",
"freeProxy14",
"freeProxy15",
"freeProxy14"
]

# ############# proxy validator #################
Expand Down
9 changes: 5 additions & 4 deletions test/testProxyFetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
def testProxyFetcher():
conf = ConfigHandler()
proxy_getter_functions = conf.fetchers
proxy_counter = {_: 0 for _ in proxy_getter_functions}
for proxyGetter in proxy_getter_functions:
proxy_count = 0
for proxy in getattr(ProxyFetcher, proxyGetter.strip())():
if proxy:
print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,
proxy_count=proxy_count))
proxy_count += 1
print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
proxy_counter[proxyGetter] = proxy_counter.get(proxyGetter) + 1
for key, value in proxy_counter.items():
print(key, value)


if __name__ == '__main__':
Expand Down

0 comments on commit 76689de

Please sign in to comment.