import argparse
import datetime
import json
import os
import random
import re
import time
import signal
from urllib.parse import urljoin, urlparse
import requests
from urllib3.exceptions import LocationParseError
import colorama
from colorama import Fore, Style
from user_agents import user_agents # Assuming user_agents.py contains a list named `user_agents`
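# The import above assumes a sibling user_agents.py that exposes a plain list named
# `user_agents`. A minimal sketch of that module (the strings are illustrative only):
#
#     user_agents = [
#         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
#         "Mozilla/5.0 (X11; Linux x86_64; rv:115.0) Gecko/20100101 Firefox/115.0",
#     ]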
colorama.init(autoreset=True)
cyan = "\033[1;36;40m"
green = "\033[1;32;40m"
red = "\033[1;31;40m"
Y = '\033[1;33;40m'
main_logo = r'''
 _   _    ___    ___   ____   __   __
| \ | |  / _ \  |_ _| / ___|  \ \ / /
|  \| | | | | |  | |  \___ \   \ V /
| |\  | | |_| |  | |   ___) |   | |
|_| \_|  \___/  |___| |____/    |_|
'''
infoabt = '''
\033[95m ****************************\033[0m
\033[96m   github.com/noarche/Noisy   \033[0m
\033[94m     Traffic Obfuscation      \033[0m
\033[95m ****************************\033[0m
\033[96m   Build Date: August 10 2024\033[0m
 Select the config you want to run; Easy to blend is best. All other questions
 are optional to customize your experience and OK to skip if unsure.
\033[36m Since the update, Realistic-Links is the best config to use by default!\033[0m
\033[36m To filter links, enter a string to filter on. Not recommended. Leave blank to continue.\033[0m
\033[32m Domain only? Not recommended. Leave blank to continue.\033[0m
\033[32m Enter the minimum time to wait. Fastest is 0 (recommended: 0).\033[0m
\033[32m Enter the maximum time to wait. Fastest is 1 (recommended: 4).\033[0m
\033[92m A random number between these two numbers will be used.\033[0m
\033[96m Enter depth: the number of links to visit on a domain before switching. A random number between 0 and your input will be used each time. Leave blank to continue.\033[0m
'''
print(main_logo)
print(infoabt)
REQUEST_COUNTER = -1
SYS_RANDOM = random.SystemRandom()
total_bandwidth = 0
visited_responsive_urls = 0
def handle_interrupt(signum, frame):
    print(f"\n{Fore.RED}Process interrupted by user! Returning to main menu...{Style.RESET_ALL}")
    main()


def exit_script(signum, frame):
    print(f"\n{Fore.RED}Exiting the script.{Style.RESET_ALL}")
    exit(0)
class Crawler:
    class CrawlerTimedOut(Exception):
        # Raised by _browse_from_links once the configured "timeout" has elapsed
        pass

    def __init__(self):
        self._config = {}
        self._links = []
        self._start_time = None
        self._visit_counter = 0
        self._valid_string = None
        self._blacklist = set()
        self._domain_only = False

    def set_valid_string(self, valid_string):
        self._valid_string = valid_string

    def set_domain_only(self, domain_only):
        self._domain_only = domain_only
    def _filter_links(self):
        if self._valid_string:
            self._links = [link for link in self._links if self._valid_string in link]
        if self._domain_only:
            self._links = list(set(self._links))  # Remove duplicates
            # Only re-apply the string filter when one was actually set
            if self._valid_string:
                self._links = [link for link in self._links if self._valid_string in link]
    def _normalize_link(self, link, root_url):
        try:
            parsed_url = urlparse(link)
        except ValueError:
            return None
        parsed_root_url = urlparse(root_url)
        if link.startswith("//"):
            return "{}://{}{}".format(parsed_root_url.scheme, parsed_url.netloc, parsed_url.path)
        if not parsed_url.scheme:
            return urljoin(root_url, link)
        return link
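    # Example of the normalization above (URLs are illustrative, not from the configs):
    #   _normalize_link("//cdn.example.com/page2", "https://example.com/page")
    #       -> "https://cdn.example.com/page2"   (protocol-relative: the root scheme is reused)
    #   _normalize_link("/about", "https://example.com/page")
    #       -> "https://example.com/about"       (relative: resolved against the page via urljoin)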
    def _request(self, url):
        global total_bandwidth, visited_responsive_urls
        random_user_agent = random.choice(user_agents)  # Use a random user agent from user_agents.py
        headers = {"user-agent": random_user_agent}
        try:
            response = requests.get(url, headers=headers, timeout=4)
            if response:
                content_length = len(response.content)
                total_bandwidth += content_length
                visited_responsive_urls += 1
                print_responsive_link(url)
                print_bandwidth_usage(content_length)
            return response
        except requests.exceptions.RequestException:
            return None
    def _is_valid_url(self, url):
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except ValueError:
            return False

    def _is_blacklisted(self, url):
        return url in self._blacklist

    def _contains_unwanted_strings(self, url):
        unwanted_strings = ['.ico', '.png', '.jpg', '.webp', '.webm', '.js', '.pdf', '.doc', '.docx', '.svg', '.jpeg', '.json', '.onion', '.i2p', '.safetensors', '.rar', '.zip', '.gguf', '.ggml', '.shp', '.gif', '.avi', '.mp3', '.wav', '.mkv', '.mp4', '.m4a', '.flac', '.ogg', '.opus', '.avif', '.hc', '.tc', '.xyz', '.exe', '.msi', '.tar', '.7z', '.tif', '.css', '.csv']
        return any(unwanted_string in url for unwanted_string in unwanted_strings)

    def _remove_and_blacklist(self, url):
        self._blacklist.add(url)
        self._links = [link for link in self._links if link != url]
    def _should_accept_url(self, url):
        valid_string_condition = not self._valid_string or (self._valid_string and self._valid_string in url)
        return (
            url
            and self._is_valid_url(url)
            and not self._is_blacklisted(url)
            and not self._contains_unwanted_strings(url)
            and valid_string_condition
        )

    def _extract_urls(self, body, root_url):
        pattern = r"href=[\"'](?!#)(.*?)[\"'].*?"
        urls = re.findall(pattern, str(body))
        normalize_urls = [self._normalize_link(url, root_url) for url in urls]
        return list(filter(self._should_accept_url, normalize_urls))
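    # The href pattern above is a plain regex scrape, not a full HTML parse. Illustrative behaviour:
    #   <a href="https://example.com/next">  ->  captures "https://example.com/next"
    #   <a href="#top">                      ->  skipped by the (?!#) lookahead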
    def _browse_from_links(self, depth=0):
        max_depth = SYS_RANDOM.randint(1, self._config["max_depth"])
        is_depth_reached = depth >= max_depth
        if not self._links or is_depth_reached:
            return
        if self._is_timeout_reached():
            raise self.CrawlerTimedOut

        random_link = SYS_RANDOM.choice(self._links)
        try:
            response = self._request(random_link)
            if response:
                sub_page = response.content
                sub_links = self._extract_urls(sub_page, random_link)
                time.sleep(SYS_RANDOM.randrange(self._config["min_sleep"], self._config["max_sleep"]))
                if len(sub_links) > 1:
                    self._links = sub_links
                else:
                    self._remove_and_blacklist(random_link)
                self.save_links(sub_links)
                self._visit_counter += 1
                if self._visit_counter % 10 == 0:
                    self._filter_links()
        except (requests.exceptions.RequestException, UnicodeDecodeError):
            self._remove_and_blacklist(random_link)

        self._browse_from_links(depth + 1)
    def load_config_file(self, file_path):
        with open(file_path, "r") as config_file:
            config = json.load(config_file)
            self.set_config(config)
        self._filter_links()
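    # load_config_file() expects a JSON file in configs/ containing at least the keys
    # read elsewhere in this class. A minimal example (values are illustrative only):
    #
    #     {
    #         "max_depth": 25,
    #         "min_sleep": 0,
    #         "max_sleep": 4,
    #         "timeout": false,
    #         "root_urls": ["https://example.com"]
    #     }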
    def set_config(self, config):
        self._config = config

    def set_option(self, option, value):
        self._config[option] = value

    def _is_timeout_reached(self):
        is_timeout_set = self._config["timeout"] is not False
        end_time = self._start_time + datetime.timedelta(seconds=self._config["timeout"])
        is_timed_out = datetime.datetime.now() >= end_time
        return is_timeout_set and is_timed_out
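    # "timeout" in the config is either false (never stop) or a number of seconds,
    # e.g. "timeout": 3600 ends the crawl roughly an hour after crawl() started.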
    def crawl(self):
        self._start_time = datetime.datetime.now()
        while True:
            url = SYS_RANDOM.choice(self._config["root_urls"])
            try:
                response = self._request(url)
                if response:
                    body = response.content
                    self._links = self._extract_urls(body, url)
                    self._browse_from_links()
                else:
                    time.sleep(SYS_RANDOM.randrange(self._config["min_sleep"], self._config["max_sleep"]))
            except self.CrawlerTimedOut:
                # Configured timeout reached; stop crawling gracefully
                return
            except (requests.exceptions.RequestException, LocationParseError):
                continue

    def save_links(self, links):
        with open('output_links.txt', 'a') as file:
            for link in links:
                file.write(link + '\n')
def print_responsive_link(url):
    print(f"{Fore.GREEN}Responsive Link: {url}{Style.RESET_ALL}")


def print_bandwidth_usage(content_length=0):
    # _request() already adds content_length to total_bandwidth; only report the totals here
    print(f"{Fore.MAGENTA}Total Bandwidth Used: {total_bandwidth / (1024 * 1024):.2f} MB | Responsive URLs Visited: {visited_responsive_urls}{Style.RESET_ALL}", end='\r')
def load_configs():
    config_dir = 'configs'
    configs = [f for f in os.listdir(config_dir) if f.endswith('.json')]
    print(f"{Fore.YELLOW}Available Configurations:{Style.RESET_ALL}")
    for i, config in enumerate(configs):
        print(f"{Fore.CYAN}{i + 1}. {config}{Style.RESET_ALL}")
    return configs
def main():
    signal.signal(signal.SIGINT, exit_script)
    configs = load_configs()
    if not configs:
        print(f"{Fore.RED}No configuration files found in 'configs' directory.{Style.RESET_ALL}")
        return

    config_choice = input(f"{Fore.BLUE}Choose a configuration by number:{Style.RESET_ALL} ").strip()
    try:
        config_choice = int(config_choice) - 1
        if 0 <= config_choice < len(configs):
            selected_config = configs[config_choice]
        else:
            print(f"{Fore.RED}Invalid selection.{Style.RESET_ALL}")
            return
    except ValueError:
        print(f"{Fore.RED}Invalid input.{Style.RESET_ALL}")
        return

    config_path = os.path.join('configs', selected_config)
    crawler = Crawler()
    crawler.load_config_file(config_path)

    valid_string = input(f"{Fore.RED}Enter a valid string (e.g., .gov, .edu, .net) or leave blank to skip:{Style.RESET_ALL} ").strip()
    if valid_string:
        crawler.set_valid_string(valid_string)

    domain_only = input(f"{Fore.RED}Domain only mode? (0 for no, 1 for yes, default is no):{Style.RESET_ALL} ").strip()
    if domain_only == "1":
        crawler.set_domain_only(True)

    min_sleep = input(f"{Fore.GREEN}Enter min sleep time in seconds or leave blank to continue:{Style.RESET_ALL} ").strip()
    max_sleep = input(f"{Fore.GREEN}Enter max sleep time in seconds or leave blank to continue:{Style.RESET_ALL} ").strip()
    if min_sleep.isdigit() and max_sleep.isdigit():
        crawler.set_option("min_sleep", int(min_sleep))
        crawler.set_option("max_sleep", int(max_sleep))

    depth = input(f"{Fore.GREEN}Enter the maximum depth of hits:{Style.RESET_ALL} ").strip()
    if depth.isdigit():
        crawler.set_option("max_depth", int(depth))

    signal.signal(signal.SIGINT, handle_interrupt)
    crawler.crawl()


if __name__ == "__main__":
    main()
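# Example of running the script (assumes Python 3, a configs/ directory with at least
# one *.json config, and user_agents.py next to this file; the package names below are
# simply the usual PyPI names for the imports above):
#
#     pip install requests colorama
#     python noisy.py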
# Disclaimer:
# This code/script/application/program is solely for educational and learning purposes.
# All information, datasets, images, code, and materials are presented in good faith and
# intended for instructive use. However, noarche makes no representation or warranty,
# express or implied, regarding the accuracy, adequacy, validity, reliability, availability,
# or completeness of any data or associated materials.
# Under no circumstance shall noarche have any liability to you for any loss, damage, or
# misinterpretation arising due to the use of or reliance on the provided data. Your utilization
# of the code and your interpretations thereof are undertaken at your own discretion and risk.
#
# By executing this script/code/application, the user acknowledges and agrees that they have read,
# understood, and accepted the terms and conditions (or any other relevant documentation or
# policy) as provided by noarche.
#
# Visit https://github.com/noarche for more information.
#
# _.··._.·°°°·.°·..·°¯°·._.··._.·°¯°·.·° .·°°°°·.·°·._.··._
# ███╗   ██╗ ██████╗  █████╗ ██████╗  ██████╗██╗  ██╗███████╗
# ████╗  ██║██╔═══██╗██╔══██╗██╔══██╗██╔════╝██║  ██║██╔════╝
# ██╔██╗ ██║██║   ██║███████║██████╔╝██║     ███████║█████╗
# ██║╚██╗██║██║   ██║██╔══██║██╔══██╗██║     ██╔══██║██╔══╝
# ██║ ╚████║╚██████╔╝██║  ██║██║  ██║╚██████╗██║  ██║███████╗
# ╚═╝  ╚═══╝ ╚═════╝ ╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝╚═╝  ╚═╝╚══════╝
# °°°·._.··._.·°°°·.°·..·°¯°··°¯°·.·°.·°°°°·.·°·._.··._.·°°°