#!/usr/bin/env python
# coding:utf-8
# Built by LandGrey
#
import re
import os
import ssl
import sys
import socket
import requests
import argparse
try:
    from HTMLParser import HTMLParser      # Python 2
    _unescape = HTMLParser().unescape
except ImportError:
    from html import unescape as _unescape  # Python 3
from requests.adapters import HTTPAdapter
from multiprocessing.dummy import Pool as ThreadPool
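# Ignore TLS certificate problems globally: scanned hosts frequently present
# self-signed or mismatched certificates, and we only want their titles.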
try:
    requests.packages.urllib3.disable_warnings()
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
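# Print one aligned result line: the target padded to 50 characters,
# followed by its page title or an error tag.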
def out_format(url, information):
    # Drop line breaks and tabs so each result stays on a single line.
    for char in ('\r', '\n', '\t'):
        information = information.replace(char, "")
    try:
        message = u"{target:50} {information}".format(target=url, information=information.strip())
    except Exception:
        try:
            message = "{target:50} {information}".format(target=url, information=information.strip())
        except Exception:
            message = "{target:50} {information}".format(target=url, information="NoInformation")
    try:
        print(message)
    except UnicodeError:
        print("{target:50} {information}".format(target=url, information="PrintUnicodeError"))
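# Decode HTML entities (e.g. &#20013;) in a candidate title back to text.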
def html_decoder(html_entries):
    try:
        return _unescape(html_entries)
    except Exception:
        return html_entries
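# Pull a title out of the page: prefer "document.title = '...'" assignments,
# then fall back to the <title> tag.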
def match_title(content):
    title = re.findall(r"document\.title[\s]*=[\s]*['\"](.*?)['\"]", content, re.I | re.M | re.S)
    if title:
        return title[0]
    title = re.findall(r"<title.*?>(.*?)</title>", content, re.I | re.M | re.S)
    if title:
        return title[0]
    return False
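# requests hands back raw bytes; try the encodings most common on the
# Chinese-language web until one of them decodes cleanly.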
def page_decode(url, html_content):
    for encoding in ("utf-8", "gbk", "gb2312", "big5"):
        try:
            return html_content.decode(encoding)
        except UnicodeError:
            continue
    return out_format(url, "DecodeHtmlError")
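# Fetch one target and report its title; if the page only holds a
# client-side redirect, follow it once and try again.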
def get_title(url):
    origin = url
    if "://" not in url:
        url = "http://" + url.strip()
    url = url.rstrip("/") + "/"
    # First try: fetch the page and extract a title directly.
    try:
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=1))
        s.mount('https://', HTTPAdapter(max_retries=1))
        req = s.get(url, headers=headers, verify=False, allow_redirects=True, timeout=20)
        html_content = req.content
        req.close()
    except requests.ConnectionError:
        return out_format(origin, "ConnectError")
    except requests.Timeout:
        return out_format(origin, "RequestTimeout")
    except socket.timeout:
        return out_format(origin, "SocketTimeout")
    except requests.RequestException:
        return out_format(origin, "RequestException")
    except Exception:
        return out_format(origin, "OtherException")
    html_content = page_decode(url, html_content)
    if html_content:
        title = match_title(html_content)
    else:
        return
    try:
        if title:
            # Numeric HTML entities (&#...;) in the title need decoding first.
            if re.findall(r"&#\d{3,};", title):
                title = html_decoder(title)
            return out_format(origin, title)
    except Exception:
        return out_format(origin, "FirstTitleError")
    # No title yet: look for a client-side redirect (meta refresh or JS jump).
    for pattern in patterns:
        jump = re.findall(pattern, html_content, re.I | re.M)
        if len(jump) == 1:
            if "://" in jump[0]:
                url = jump[0]
            else:
                url += jump[0]
            break
    # Second try: fetch the redirect target and extract its title.
    try:
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=1))
        s.mount('https://', HTTPAdapter(max_retries=1))
        req = s.get(url, headers=headers, verify=False, timeout=15)
        html_content = req.content
        req.close()
    except requests.ConnectionError:
        return out_format(origin, "ConnectError")
    except requests.Timeout:
        return out_format(origin, "RequestTimeout")
    except socket.timeout:
        return out_format(origin, "SocketTimeout")
    except requests.RequestException:
        return out_format(origin, "RequestException")
    except Exception:
        return out_format(origin, "OtherException")
    html_content = page_decode(url, html_content)
    if html_content:
        title = match_title(html_content)
    else:
        return
    try:
        if title:
            if re.findall(r"&#\d{3,};", title):
                title = html_decoder(title)
            return out_format(origin, title)
        else:
            return out_format(origin, "NoTitle")
    except Exception:
        return out_format(origin, "SecondTitleError")
if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "close",
    }
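    # Client-side redirect patterns: meta refresh plus common JavaScript jumps.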
    patterns = (
        r'<meta[\s]*http-equiv[\s]*=[\s]*[\'"]refresh[\'"][\s]*content[\s]*=[\s]*[\'"]\d+[\s]*;[\s]*url[\s]*=[\s]*(.*?)[\'"][\s]*/?>',
        r'window\.location[\s]*=[\s]*[\'"](.*?)[\'"][\s]*;',
        r'window\.location\.href[\s]*=[\s]*[\'"](.*?)[\'"][\s]*;',
        r'window\.location\.replace[\s]*\([\'"](.*?)[\'"]\)[\s]*;',
        r'window\.navigate[\s]*\([\'"](.*?)[\'"]\)',
        r'location\.href[\s]*=[\s]*[\'"](.*?)[\'"]',
    )
    urls = []
    parser = argparse.ArgumentParser(prog='owt.py', description="Obtain WebSite Title")
    parser.add_argument("-t", dest='target', default='urls.txt', help="target: a file of URLs (one per line) or a single URL")
    parser.add_argument("-x", dest='threads', default=4, type=int, help="number of concurrent threads")
    if len(sys.argv) == 1:
        sys.argv.append('-h')
    args = parser.parse_args()
    target = args.target
    threads = args.threads
    if os.path.isfile(target):
        with open(target, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    urls.append(line)
        try:
            pool = ThreadPool(threads)
            pool.map(get_title, urls)
            pool.close()
            pool.join()
        except KeyboardInterrupt:
            exit("[*] User abort")
    else:
        if "://" not in target:
            target = "http://" + target
        get_title(target)
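# Usage:
#   python owt.py -t urls.txt -x 8    # fetch titles for every URL in urls.txt, 8 threads
#   python owt.py -t example.com      # fetch the title of a single target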