Skip to content

Commit b3e11ea

Browse files
Keyword update
1 parent 7d06100 commit b3e11ea

File tree

1 file changed

+49
-7
lines changed

1 file changed

+49
-7
lines changed

scrape.py

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,15 @@
22
from datetime import datetime
33
import requests
44
from os import path, mkdir
5-
from progress.bar import Bar
5+
import argparse
6+
from sys import modules
7+
8+
try:
9+
from progress.bar import Bar
10+
11+
except ModuleNotFoundError:
12+
print("Make sure the progress module is installed.")
13+
exit(0)
614

715

816
def status(message):
@@ -12,6 +20,7 @@ def status(message):
1220
def main():
1321
status("Fetching latest pastes...")
1422

23+
# fetch latest 100 pastes
1524
current_request = requests.get("https://scrape.pastebin.com/api_scraping.php?limit=100")
1625
current_json = current_request.json()
1726

@@ -20,36 +29,69 @@ def main():
2029

2130
for entry in current_json:
2231
path_t = path.join("files", "{0}.txt".format(entry["key"]))
32+
2333
if path.isfile(path_t):
2434
skipped_pastes += 1
2535

2636
with Bar("Processing", max=len(current_json) - skipped_pastes, fill=">") as bar:
2737
for entry in current_json:
2838
path_t = path.join("files", "{0}.txt".format(entry["key"]))
39+
path_t_important = path.join("files_important", "{0}.txt".format(entry["key"]))
40+
2941
if path.isfile(path_t):
3042
continue
3143

3244
entry_request = requests.get("https://scrape.pastebin.com/api_scrape_item.php?i={0}"
3345
.format(entry["key"]))
3446

35-
f = open(path_t, "w+")
36-
f.write(entry_request.text)
37-
f.close()
47+
entry_file = open(path_t, "w+")
48+
entry_file.write(entry_request.text)
49+
entry_file.close()
50+
51+
if keywords is not None:
52+
for keyword in keywords:
53+
if keyword.upper() in entry_request.text:
54+
print(" [KEYWORD] Paste \'{0}\' contains keyword \'{1}\'".format(entry["key"], keyword))
55+
56+
entry_file = open(path_t_important, "w+")
57+
entry_file.write(entry_request.text)
58+
entry_file.close()
59+
60+
break
3861

3962
bar.next()
4063

41-
bar.finish()
64+
bar.finish()
4265

43-
if skipped_pastes is not 0:
44-
status("Skipped {0} previously fetched pastes".format(skipped_pastes))
66+
if skipped_pastes is not 0:
67+
status("Skipped {0} previously fetched pastes".format(skipped_pastes))
4568

4669
status("Hibernating for 60 seconds...")
4770
print()
4871
threading.Timer(60, main).start()
4972

5073

74+
# Script entry: prepare output directories, read the optional keyword list,
# then start the scrape loop.

# make sure the output directories exist
if not path.isdir("files"):
    status("No file directory found, creating...")
    mkdir("files")

if not path.isdir("files_important"):
    status("No important file directory found, creating...")
    mkdir("files_important")

# parse arguments
# `keywords` stays None when no keyword file is given; main() reads this
# module-level name and skips the keyword search in that case.
keywords = None

parser = argparse.ArgumentParser(description="A script to scrape pastebin.com with optional keyword search")
parser.add_argument("--keywords", "-k", help="A file containing keywords for the search")
args = parser.parse_args()

if args.keywords is not None:
    # One keyword per line. Strip the line terminators that readlines() would
    # keep (otherwise a keyword only matches when the paste has a line break
    # right after it) and drop blank lines, since an empty keyword is
    # contained in every paste.
    with open(args.keywords) as keyword_file:
        keywords = [line.strip() for line in keyword_file if line.strip()]

    status("Loaded {0} keywords".format(len(keywords)))

main()

0 commit comments

Comments
 (0)