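"""pastecrawl.py

Crawl the Pastebin archive page, fetch each listed paste, tag it with
pluggable taggers plus whitelist keyword matches, skip pastes that
contain blacklisted words, and save the rest under ./crawls/. Pressing
Ctrl-C drops into an interactive prompt (see sigint_handler below).
"""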
import requests
import time
from bs4 import BeautifulSoup
import tagmaster
import signal
import inspect
import sys
import codecs
# ANSI Colors
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
# Should we exit? (checked before each paste fetch)
should_exit = False

# Raised when a blacklisted word is found in a paste
class BlacklistedWordException(Exception):
    pass

# Parse the blacklist: one word per line
with open('blacklist.txt', 'r') as f:
    blacklist = [line.strip() for line in f]
# Parse the whitelist: one word per line
with open('whitelist.txt', 'r') as f:
    whitelist = [line.strip() for line in f]
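# The expected list-file format (inferred from the parsing above and the
# matching loops below): one keyword per line; entries that are empty or
# start with '#' or a space are skipped when matching. A hypothetical
# blacklist.txt might look like:
#
#   # pastes we never want to store
#   spamword
#   adlink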
# Load all tagger packages in the default directory
tagmaster.collect_taggers('taggers')
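# tagmaster appears to be a project-local module rather than a PyPI
# package; judging by run_taggers() below, each tagger receives the raw
# paste text and yields tag strings.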
# Define a signal handler for SIGINT: pause the crawl and open a prompt
def sigint_handler(sig, frame):
    # Restore the original handler so a second Ctrl-C aborts the prompt
    signal.signal(signal.SIGINT, original_sigint)
    while True:
        sys.stdout.write('\r')
        sys.stdout.flush()
        print("")
        instruction = input("> ").strip()
        if instruction == "exit":
            sys.exit(1)
        elif instruction == "continue":
            break
        elif instruction == "taggers":
            # List the source file of every loaded tagger
            for tagger in tagmaster.taggers:
                print(bcolors.WARNING + "--> " + bcolors.ENDC, end='')
                print(inspect.getfile(tagger))
            print("")
        elif instruction == "reload":
            tagmaster.reload_taggers('taggers')
        elif instruction == "links":
            # List the links queued in the current crawl pass
            for link in links:
                print(bcolors.WARNING + "--> " + bcolors.ENDC, end='')
                print(link)
            print("")
        else:
            print("Unknown instruction.")
    # Re-register this handler before resuming the crawl
    signal.signal(signal.SIGINT, sigint_handler)
# Save the original signal handler
original_sigint = signal.getsignal(signal.SIGINT)
# Register it
signal.signal(signal.SIGINT, sigint_handler)
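# From here on, pressing Ctrl-C pauses crawling and opens the prompt
# defined above, e.g.:
#   > taggers    list the loaded tagger source files
#   > links      list the links queued in the current pass
#   > reload     reload tagger packages from ./taggers
#   > continue   resume crawling
#   > exit       quit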
# Crawling loop
while True:
    # Links to crawl this pass
    links = []
    try:
        # Fetch the archive page and parse it
        page = requests.get("http://pastebin.com/archive", timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        # Fetch the main table
        table = soup.find('table', {'class': 'maintable'})
        # Fetch the rows
        rows = table.findAll('tr')
        for tr in rows:
            # Fetch the columns
            cols = tr.findAll('td')
            for td in cols:
                # Fetch the anchors
                hrefs = td.findAll('a')
                for a in hrefs:
                    # Fetch the link target
                    link = a.get('href')
                    # Skip over navigation links back to the archive
                    if "archive" not in link:
                        links.append(link)
    except SystemExit:
        sys.exit(0)
    except Exception:
        print("Failed to fetch latest content. Potential timeout by service. This may indicate a temporary ban!")
        break
    # Crawl each link
    for link in links:
        if should_exit:
            sys.exit(0)
        print(bcolors.WARNING + "--> " + bcolors.ENDC, end='')
        print(link, end='')
        try:
            # Fetch the raw paste content
            crawl = requests.get("http://pastebin.com/raw" + link, timeout=30)
            # Lowercase the paste once for case-insensitive matching
            text_lower = crawl.text.lower()
            # The tag list
            tags = []
            # Run every tagger
            for tag in tagmaster.run_taggers(crawl.text):
                tags.append(tag)
            # Print any tags identified by taggers in a different color
            for word in tags:
                print(bcolors.OKBLUE + " [" + word + "] " + bcolors.ENDC, end='')
            # Loop through the whitelist
            for word in whitelist:
                # Skip over comments and empty lines
                if word and not word.startswith('#') and not word.startswith(' '):
                    # Case-insensitive containment check
                    if word.lower() in text_lower:
                        print(bcolors.OKGREEN + " [" + word + "] " + bcolors.ENDC, end='')
                        # Tag it
                        tags.append(word)
            # Loop through the blacklist
            for word in blacklist:
                # Skip over comments and empty lines
                if word and not word.startswith('#') and not word.startswith(' '):
                    # Case-insensitive containment check
                    if word.lower() in text_lower:
                        print(bcolors.FAIL + " [" + word + "] " + bcolors.ENDC, end='')
                        # Bail out early
                        raise BlacklistedWordException("Blacklisted word!")
            # Prepend an empty tag so the joined tag list starts with '_'
            if len(tags) > 0:
                tags.insert(0, '')
            # Write the crawl to a file named <timestamp>_<tag>_<tag>.txt
            with codecs.open('./crawls/' + str(int(time.time())) + '_'.join(tags) + ".txt", "w", encoding="utf-8") as f:
                f.write(crawl.text)
            # Sleep between pastes to stay polite
            time.sleep(10)
        except SystemExit:
            sys.exit(0)
        except BlacklistedWordException:
            # Blacklisted pastes are skipped without being saved
            pass
        except Exception as e:
            print(e)
        # Advance to the next line
        print("")
    # Sleep before refetching the archive
    time.sleep(10)
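# Expected layout (an assumption inferred from the paths above): run the
# script from a directory containing blacklist.txt, whitelist.txt, a
# taggers/ package directory, and an existing crawls/ output directory:
#   $ python3 pastecrawl.py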