Commit 2ba6dcb: Add files via upload (1 parent: 017196e)
File tree

6 files changed: +724 -0 lines changed

756 Bytes, binary file not shown.

补天/config/config.py

+13
@@ -0,0 +1,13 @@
# @author:九世
# @time:2019/7/8
# @file:config.py

COOKIE='This is your cookie'
PAGE=20 #Number of pages: set as many as you like for the public-welfare SRC; the exclusive SRC only has one page so it must be 1, and the same goes for the enterprise SRC
PROCESS=100 #Coroutine setting (number of targets handled per batch)
VERSION=0.1
ID={1:'/Reward/pub',2:'/Reward/corps',3:'/Reward/com'} #1 crawls the public-welfare SRC, 2 the exclusive SRC, 3 the enterprise SRC
SET_ID=1 #ID setting (pick 1, 2 or 3 from ID above)
WAIT=0.3 #Wait time when a captcha is encountered
URL='https://www.butian.net' #Butian base URL
LOOK_ID='https://www.butian.net/Loo/submit?cid={}' #URL used to look up a vendor's domain
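A minimal sketch of how these settings are consumed, assuming config.py is importable as the config.config module from the working directory and COOKIE has been replaced with a real 'name=value; name2=value2' cookie string (this mirrors the cookie parsing Butian.__init__ does in main.py):

# Sketch: read the settings and turn COOKIE into a dict that requests can send.
import config.config as cfg

cookies = {}
for pair in str(cfg.COOKIE).split(';'):
    key, value = pair.strip().split('=', 1)   # split only on the first '=' so values may contain '='
    cookies[key] = value

listing_url = cfg.URL + cfg.ID[cfg.SET_ID]    # e.g. https://www.butian.net/Reward/pub when SET_ID=1
print(listing_url, '-', cfg.PAGE, 'pages,', cfg.PROCESS, 'targets per batch')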

补天/doc/README.md

+16
@@ -0,0 +1,16 @@
## 补天 (Butian) Crawler ##

- [x] Crawl the project hall

~~- Fetch the title and headers information of the crawled URLs~~
- [x] Save results in txt format
- [x] Support manual captcha recognition, then continue crawling

## config.py settings ##

Usage steps: see the comments in config.py.

Note: apart from the public-welfare SRC, which has multiple pages, the exclusive and enterprise SRC can only be set to one page.

## Crawl results ##

![](https://s2.ax1x.com/2019/07/08/ZrRamd.png)
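The "save in txt format" item above refers to save.txt: main.py appends one line per vendor in the form '厂商名称':'NAME','url':'DOMAIN' ('厂商名称' means vendor name). A small sketch, using a hypothetical helper that is not part of the repo, for reading those lines back into a dict:

# Sketch: parse save.txt lines written by Butian.write_file back into {vendor name: domain}.
import re

pattern = re.compile(r"'厂商名称':'(.*?)','url':'(.*?)'")
vendors = {}
with open('save.txt', encoding='utf-8') as fh:
    for line in fh:
        match = pattern.search(line)
        if match:
            vendors[match.group(1)] = match.group(2)
print(len(vendors), 'vendors loaded')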

补天/main.py

+125
@@ -0,0 +1,125 @@
# @author:九世
# @file:main.py
# @time:2019/7/8

from gevent import monkey;monkey.patch_all()
from gevent.lock import RLock
from multiprocessing import Process
from bs4 import BeautifulSoup
import re
import config.config
import gevent
import requests

lock=RLock()
domain={}  # vendor name -> vendor domain, filled by caiji()

class Butian:
    def __init__(self):
        self.cookies={}
        self.company_id=[]
        self.look_id=config.config.LOOK_ID
        self.url=config.config.URL
        self.version=config.config.VERSION
        self.id=config.config.ID
        self.set_id=config.config.SET_ID
        self.cookie=config.config.COOKIE
        self.wait=config.config.WAIT
        self.page=config.config.PAGE
        self.process=config.config.PROCESS
        self.calc=0
        self.proces=[]
        self.request=requests.session()
        self.runid=[]
        # Turn the raw COOKIE string into a dict usable by requests.
        for cook in str(self.cookie).split(';'):
            key,value=cook.split('=',1)
            self.cookies[key]=value

    def banner(self):
        banner='''
,--.--------. .=-.-. ,---. .-._
_..---. .--.-. .-.-./==/, - , -\/==/_ /.--.' \ /==/ \ .-._
.' .'.-. \/==/ -|/=/ |\==\.-. - ,-./==|, | \==\-/\ \ |==|, \/ /, /
/==/- '=' /|==| ,||=| -| `--`\==\- \ |==| | /==/-|_\ | |==|- \| |
|==|-, ' |==|- | =/ | \==\_ \ |==|- | \==\, - \ |==| , | -|
|==| .=. \|==|, \/ - | |==|- | |==| ,| /==/ - ,| |==| - _ |
/==/- '=' ,|==|- , / |==|, | |==|- |/==/- /\ - \|==| /\ , |
|==| - //==/ , _ .' /==/ -/ /==/. /\==\ _.\=\.-'/==/, | |- |
`-._`.___,' `--`..---' `--`--` `--`-` `--` `--`./ `--`
version:{}
author:九世
github:https://github.com/422926799
'''.format(self.version)
        print(banner)

    def yzm_rz(self,rqt):
        # WAF captcha handler: the block page contains '网站当前访问量较大' ("the site is under heavy load").
        # Download the captcha image to yzm/yzm.jpg, ask the user to type it, then submit it to waf_verify.htm.
        if '网站当前访问量较大' in rqt.content.decode('utf-8'):
            print('[验证码] 验证码下载到yzm文件夹里,请手动输入验证码:{}'.format(rqt.url))  # "captcha saved to the yzm folder, please enter it manually"
            text = BeautifulSoup(rqt.text, 'html.parser').find('img').get('src')
            yzm_url = self.url + text
            wt = open('yzm/yzm.jpg', 'wb')
            download = self.request.get(url=yzm_url, headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'},cookies=self.cookies)
            wt.write(download.content)
            wt.close()
            user = input('验证码>')  # prompt: "captcha>"
            jg = self.url + '/waf_verify.htm?captcha={}'.format(user)
            rqt = self.request.get(url=jg, headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'},cookies=self.cookies)
            if 'waf' not in rqt.url:
                print('[+] 验证码已通过')  # "captcha accepted"

        return rqt

    def zhuaqu(self):
        # Crawl the project-hall listing pages and collect every company_id.
        urls=str(self.url)+str(self.id[self.set_id])
        for b in range(int(self.page)):
            rqt=self.request.post(url=urls,headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'},cookies=self.cookies,data={'s':'1','p':'{}'.format(b)})
            if 'waf' in rqt.url:
                rqt=self.yzm_rz(rqt)
            json=rqt.json()['data']['list']
            for id in json:
                self.company_id.append(id['company_id'])

    def caiji(self,urls):
        # Fetch one vendor's submit page and pull out its domain and name from the form placeholders
        # ('请输入厂商域名' = "enter the vendor domain", '漏洞属于哪个厂商' = "which vendor owns the vulnerability").
        rbt=self.request.get(url=urls,headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'},cookies=self.cookies)
        if 'waf' in rbt.url:
            rbt = self.yzm_rz(rbt)
        data=re.findall('placeholder="请输入厂商域名" value=".*"',rbt.text)
        name=re.findall('placeholder="漏洞属于哪个厂商" value=".*"',rbt.text)
        data_s=str(data[0]).replace('placeholder="请输入厂商域名" value="','').replace('"','')
        names_s=str(name[0]).replace('placeholder="漏洞属于哪个厂商" value="','').replace('"','')
        domain[names_s]=data_s

    def write_file(self):
        # Print every vendor/domain pair and append it to save.txt ('厂商名称' = vendor name).
        key=list(domain.keys())
        value=list(domain.values())
        with open('save.txt','a',encoding='utf-8') as save:
            for u in range(0,len(key)):
                print("'厂商名称':'{}','url':'{}'".format(key[u],value[u]))
                print("'厂商名称':'{}','url':'{}'".format(key[u], value[u]),file=save)

    def run(self,rw):
        # Worker entry point for one batch: spawn a greenlet per company id, wait for them, then dump the results.
        lock.acquire()
        for r in rw:
            urls=str(self.look_id).format(r)
            self.runid.append(gevent.spawn(self.caiji,urls))
        lock.release()
        gevent.joinall(self.runid)
        self.write_file()

    def run_process(self):
        # Collect the company ids, then hand them out in batches of PROCESS, one multiprocessing.Process per batch.
        self.banner()
        self.zhuaqu()
        for i in self.company_id:
            if self.calc==self.process:
                p=Process(target=self.run,args=(self.proces,))
                p.start()
                self.proces.clear()
                self.calc=0
            self.proces.append(i)
            self.calc+=1
        if len(self.proces)>0:
            p = Process(target=self.run, args=(self.proces,))
            p.start()

if __name__ == '__main__':
    obj=Butian()
    obj.run_process()
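The concurrency model in run_process is: company ids are grouped into batches of PROCESS, each full batch is handed to its own multiprocessing.Process, and inside that process run() spawns one gevent greenlet per id. A stripped-down sketch of just that batching pattern, with a stub worker standing in for caiji() and made-up ids instead of the ones zhuaqu() collects:

# Sketch: the batch-of-greenlets-per-process pattern used by run_process, with stub data.
from gevent import monkey; monkey.patch_all()
from multiprocessing import Process
import gevent

BATCH = 3                                    # plays the role of config.config.PROCESS

def fetch(cid):                              # stub standing in for Butian.caiji
    print('would fetch /Loo/submit?cid={}'.format(cid))

def run_batch(ids):                          # plays the role of Butian.run
    gevent.joinall([gevent.spawn(fetch, i) for i in ids])

if __name__ == '__main__':
    company_ids = list(range(1, 11))         # stand-in for the ids collected by zhuaqu()
    for start in range(0, len(company_ids), BATCH):
        p = Process(target=run_batch, args=(company_ids[start:start + BATCH],))
        p.start()
        p.join()                             # joined here only to keep the demo output ordered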
