forked from merwin-asm/OpenCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmongo_db.py
230 lines (154 loc) · 4.5 KB
/
mongo_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
"""
Open Crawler v 1.0.0 | Mongo - DB
"""
from pymongo.mongo_client import MongoClient
from rich import print
import atexit
import time
import json
import os
# Module-level connection state shared by every helper in this file.
# Both names hold None until connect_db() assigns them.
CLIENT = None  # the MongoClient created by connect_db()
DB = None  # the "Crawledsites" database taken from CLIENT
def connect_db(uri, pwd):
    """
    Initializes Connection With MongoDB

    uri : str - > The URI given by MongoDB; the literal text "<password>"
                  inside it is replaced with pwd
    pwd : str - > The password to connect

    On success the module globals CLIENT and DB are set, DB being the
    "Crawledsites" database of the client.
    On failure the error is printed and the process exits with status 1.
    """
    global CLIENT, DB

    uri = uri.replace("<password>", pwd)

    try:
        client = MongoClient(uri)
        # MongoClient connects lazily, so building it proves nothing about
        # the server; a ping forces a round trip before success is reported.
        client.admin.command("ping")
    except Exception as e:
        print("[red] [-] Error Occured While Connecting To Mongo DB [/red]")
        print(f"[red] [bold] \t\t\t\t Error : {e}[/bold] [/red]")
        # quit() is injected by the site module and is not always present;
        # SystemExit works everywhere and lets us signal a failing status.
        raise SystemExit(1)

    CLIENT = client
    DB = client.Crawledsites
    print("[spring_green1] [+] Connected To MongoDB [/spring_green1]")
def if_waiting(url):
    """
    Checks if a website is in the waiting list

    url : str - > website url to look up in the waitlist collection

    returns bool - > True only when a waitlist document for url exists and
                     carries a non-null "website" field; a failed lookup is
                     reported as False
    """
    try:
        entry = DB.waitlist.find_one({"website": url})
    except Exception:
        # Best-effort lookup: a database error counts as "not waiting".
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        return False

    # find_one yields None when nothing matches
    return entry is not None and entry.get("website") is not None
def _DB():
    """
    Accessor for the module-level database handle

    returns the current value of the global DB
    """
    return DB
def get_info():
    """
    Summarises the size of the main collections

    returns list of str - > [Crawledsites summary, waitlist summary]; each
                            summary holds the estimated document count and a
                            storage figure computed as count times a fixed
                            per-document byte size (257 and 618 respectively)
    """
    summaries = []
    for collection, bytes_per_doc in ((DB.Crawledsites, 257), (DB.waitlist, 618)):
        count = int(collection.estimated_document_count())
        summaries.append(f" Len : {count} | Storage : {count*bytes_per_doc} Bytes")
    return summaries
def get_last():
    """
    Last crawled site

    returns str - > the "website" field of the first Crawledsites document
                    when sorted by _id in descending order

    Nothing here handles an empty collection; the cursor lookup is left to
    fail in that case.
    """
    newest_first = DB.Crawledsites.find().sort("_id", -1)
    latest = newest_first[0]
    return latest["website"]
def get_crawl(website):
    """
    Get crawled info of a site

    website : str - > value matched against the "website" field

    returns dict - > a plain-dict copy of the matching Crawledsites document

    A missing record is not handled here; dict() is applied to whatever
    find_one returns.
    """
    record = DB.Crawledsites.find_one({"website": website})
    return dict(record)
def if_crawled(url):
    """
    Checks if a site was crawled

    url : str - > value matched against the "website" field

    returns (bool, time/None) - > (True, stored "time" value) when a record
                                  with a "time" field exists, otherwise
                                  (False, None); a failed lookup is also
                                  reported as (False, None)
    """
    try:
        record = DB.Crawledsites.find_one({"website": url})
    except Exception:
        # Best-effort lookup: a database error counts as "not crawled".
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        return False, None

    # find_one yields None when nothing matches; a record without a "time"
    # field is treated the same way, as it always was.
    if record is None or "time" not in record:
        return False, None

    return True, record["time"]
def update_crawl(website, time, mal, offn, ln, key, desc, recc):
    """
    Updates a crawl

    Every Crawledsites document whose "website" equals the given one is
    removed, then a single fresh document is inserted.

    Each argument is stored unchanged under the field of the same name.
    """
    crawled = DB.Crawledsites

    # Drop all earlier records of this site before writing the new one
    crawled.delete_many({"website": website})

    fresh = dict(
        website=website,
        time=time,
        mal=mal,
        offn=offn,
        ln=ln,
        key=key,
        desc=desc,
        recc=recc,
    )
    crawled.insert_one(fresh)
def save_crawl(website, time, mal, offn, ln, key, desc, recc):
    """
    Saves a crawl

    Inserts one new document into Crawledsites; existing documents for the
    same website are left untouched.

    Each argument is stored unchanged under the field of the same name.
    """
    record = dict(
        website=website,
        time=time,
        mal=mal,
        offn=offn,
        ln=ln,
        key=key,
        desc=desc,
        recc=recc,
    )
    DB.Crawledsites.insert_one(record)
def save_robots(website, robots):
    """
    Saves the disallowed entries of a site

    website : stored under the "website" field
    robots  : stored under the "restricted" field

    One new document is inserted into the Robots collection.
    """
    entry = {"website": website, "restricted": robots}
    DB.Robots.insert_one(entry)
def get_robots(website):
    """
    Gets the disallowed entries of a site from the database

    website : value matched against the "website" field of Robots

    returns the "restricted" field of the matching document

    A missing record is not handled here; the field is read from whatever
    find_one returns.
    """
    record = DB.Robots.find_one({"website": website})
    return record["restricted"]
def get_wait_list(num):
    """
    Gets websites to crawl and removes them from the waitlist

    num : int - > maximum number of waitlist entries to take; zero or a
                  negative number takes nothing

    returns list - > the waitlist documents that were taken (each one a
                     dict carrying a "website" field)
    """
    # MongoDB treats limit(0) as "no limit", which would hand back -- and
    # then delete -- the entire waitlist for a caller that asked for nothing.
    if num <= 0:
        return []

    wait = list(DB.waitlist.find().limit(num))

    if wait:
        # One $in delete replaces a delete_many round trip per document
        taken = [e["website"] for e in wait]
        DB.waitlist.delete_many({"website": {"$in": taken}})

    return wait
def write_to_wait_list(list_):
    """
    Writes to collection of websites to get crawled

    list_ : list - > website urls; duplicates and urls for which
                     if_waiting() is already True are skipped

    returns None; a failed insert is ignored
    """
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # urls are queued in the order they were given
    pending = [
        {"website": url}
        for url in dict.fromkeys(list_)
        if not if_waiting(url)
    ]

    if not pending:
        # insert_many rejects an empty list, so there is nothing to do
        return

    try:
        DB.waitlist.insert_many(pending)
    except Exception:
        # Queueing stays best-effort, as before: a failed insert must not
        # stop the caller. Exception (not a bare except) keeps Ctrl-C and
        # SystemExit working.
        pass
# Part of testings
if __name__ == "__main__":

    # Config File
    config_path = "config.json"

    # Load configs from config_path - > json
    try:
        with open(config_path, "r") as config_file:
            configs = json.load(config_file)
    except (OSError, ValueError):
        # Missing or unparsable config: re-run the configurator, then read
        # the file again. os.system reports failure through its return
        # value rather than an exception, so the status is checked to
        # decide whether the "python" fallback is needed.
        if os.system("python3 config.py") != 0:  # Re-configures
            os.system("python config.py")  # Re-configures

        # The path is kept in its own name, so this retry always opens the
        # file by name even when the first attempt got as far as parsing.
        with open(config_path, "r") as config_file:
            configs = json.load(config_file)

    ## Setting Up Configs
    MONGODB_PWD = configs["MONGODB_PWD"]
    MONGODB_URI = configs["MONGODB_URI"]

    connect_db(MONGODB_URI, MONGODB_PWD)

    # save_crawl("w1",1,0,1,3,4,5,5)
    # save_crawl("w2",4,0,1,3,4,5,5)
    # save_crawl("w3",10,0,1,3,4,5,5)
    # print(if_crawled("w"))
    # update_crawl("w",1,1,1,3,4,5,5)
    # print(get_last())
    # print(if_crawled("https://darkmash-org.github.io/"))
    # print(get_robots("www.bfi.org.uk"))
    # print(if_waiting("https://www.w3.org/blog/2015/01/"))