subjsscan2.0
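# Overview (inferred from the code below): download a target page, save the
# JavaScript files it references, extract quoted/bracketed string literals into
# zifu.txt, concatenate each string onto the base URL to build suburl.txt, then
# request every candidate URL and record status code, response size, and title
# in an .xlsx workbook.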
import argparse
import os
import re
import threading
import requests
import time
import concurrent.futures
from datetime import datetime
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from tqdm import tqdm
from openpyxl import Workbook, load_workbook
def extract_strings(text):
    pattern = r"['\"{}\(\)\[\]]([A-Za-z0-9\\/\.]{1,100})['\"{}\(\)\[\]]"
    matches = re.findall(pattern, text)
    return matches
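# Illustrative example (hypothetical input, not from the original script):
# extract_strings('fetch("/api/user.js")') -> ['/api/user.js'], because the regex
# captures runs of letters, digits, '\', '/', and '.' enclosed in quotes,
# brackets, braces, or parentheses.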
def extract_title(text):
    soup = BeautifulSoup(text, "html.parser")
    title_tag = soup.find("title")
    if title_tag:
        return title_tag.text.strip()
    return ""
def save_to_file(strings, filename):
    existing_strings = set()
    # Load strings already present in zifu.txt so duplicates are not written again
    if os.path.isfile(filename):
        with open(filename, "r", encoding="utf-8") as file:
            for line in file:
                existing_strings.add(line.strip())
    # Keep only strings that are not yet in the file and append them
    new_strings = []
    for string in strings:
        if string not in existing_strings:
            new_strings.append(string)
            existing_strings.add(string)
    if new_strings:
        with open(filename, "a", encoding="utf-8") as file:
            for string in new_strings:
                file.write(string + "\n")
def download_file(url, folder_name):
    response = requests.get(url, verify=True)
    if response.status_code == 200:
        file_name = get_valid_filename(url.split("/")[-1].split("?")[0])
        file_path = os.path.join(folder_name, file_name)
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"Downloaded file: {file_name}")
        if file_name.endswith(".js"):
            # For JavaScript files, extract the matching strings and save them to zifu.txt;
            # errors="ignore" keeps the script running on files that are not valid UTF-8
            with open(file_path, "r", encoding="utf-8", errors="ignore") as js_file:
                content = js_file.read()
                strings = extract_strings(content)
                save_to_file(strings, os.path.join(folder_name, "zifu.txt"))
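# Example (hypothetical URL, for illustration): for
# "https://example.com/static/app.js?v=2" the saved file_name becomes "app.js"
# after splitting on "/" and "?" and sanitizing with get_valid_filename().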
def download_pages(url):
    # Create a folder named after the (sanitized) URL
    folder_name = get_valid_filename(url)
    os.makedirs(folder_name, exist_ok=True)
    response = requests.get(url, verify=True)
    if response.status_code == 200:
        content_type = response.headers.get("content-type", "")
        if "text/html" in content_type:
            soup = BeautifulSoup(response.content, "html.parser")
            text = soup.prettify()
            # Save the page content to a file
            file_name = "page.html"
            file_path = os.path.join(folder_name, file_name)
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(text)
            # Extract the matching strings and save them to zifu.txt
            strings = extract_strings(text)
            save_to_file(strings, os.path.join(folder_name, "zifu.txt"))
            # Download the JavaScript files referenced by <script src="..."> tags
            script_tags = soup.find_all("script")
            for script_tag in script_tags:
                if script_tag.get("src"):
                    script_url = urljoin(url, script_tag["src"])
                    download_file(script_url, folder_name)
            print(f"Page downloaded and saved to folder {folder_name}")
        else:
            print("Not an HTML response")
    else:
        print("Failed to fetch the URL")
def get_valid_filename(filename):
    # Replace characters that are not allowed in filenames
    filename = re.sub(r'[\\/:*?"<>|]', "_", filename)
    return filename
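# Example (hypothetical input, for illustration):
# get_valid_filename("https://example.com/admin") -> "https___example.com_admin"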
# openpyxl worksheets are not thread-safe; a module-level lock serializes appends
# coming from the worker threads
ws_lock = threading.Lock()

def process_url(suburl, wb):
    ws = wb.active
    try:
        response = requests.get(suburl, verify=True)
    except requests.RequestException:
        with ws_lock:
            ws.append([suburl, "Request failed"])
        return
    if response.status_code == 200:
        status_code = response.status_code
        content_length = len(response.content)
        title = extract_title(response.content)
        with ws_lock:
            ws.append([suburl, status_code, content_length, title])
    else:
        with ws_lock:
            ws.append([suburl, "Request failed"])
def main():
    parser = argparse.ArgumentParser(description="Download a URL and extract strings from it")
    parser.add_argument("-u", "--url", required=True, help="URL to download")
    args = parser.parse_args()
    url = args.url
    download_pages(url)
    print("Keyword extraction finished")
    time.sleep(5)  # Pause for 5 seconds
    folder_name = get_valid_filename(url)
    suburl_file = os.path.join(folder_name, "suburl.txt")
    # Append mode creates suburl.txt if it does not exist, so one branch handles both cases
    with open(suburl_file, "a", encoding="utf-8") as outfile:
        zifu_file = os.path.join(folder_name, "zifu.txt")
        if os.path.isfile(zifu_file):
            with open(zifu_file, "r", encoding="utf-8") as file:
                zifu_strings = file.read().splitlines()
            progress_bar = tqdm(zifu_strings, desc="Building suburls", unit="URL", bar_format="{l_bar}{bar}")
            for string in progress_bar:
                suburl = url + string
                outfile.write(suburl + "\n")
        else:
            print("zifu.txt not found")
    print("Waiting 2 seconds")
    time.sleep(2)
    # Name the workbook after the sanitized URL
    excel_file = os.path.join(folder_name, f"{folder_name}.xlsx")
    if not os.path.isfile(excel_file):
        wb = Workbook()
        ws = wb.active
        ws.append(["URL", "Status Code", "Response Size", "Title"])
    else:
        wb = load_workbook(excel_file)
        ws = wb.active
    with open(suburl_file, "r", encoding="utf-8") as file:
        urls = file.read().splitlines()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for suburl in urls:
            future = executor.submit(process_url, suburl, wb)
            futures.append(future)
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Requesting concatenated URLs", unit="URL"):
            pass
    wb.save(excel_file)
if __name__ == "__main__":
    main()
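# Example invocation (example.com is a placeholder target):
#   python subjsscan2.0 -u https://example.com/
# Third-party dependencies: requests, beautifulsoup4, tqdm, openpyxl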