-
Notifications
You must be signed in to change notification settings - Fork 1
/
Crawl_bkp.py
38 lines (29 loc) · 1.09 KB
/
Crawl_bkp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from collections import deque

import networkx as nx

import Page
class Crawl:
    """Breadth-first link crawler that starts from a single URL.

    The constructor records the crawl configuration; ``crawl`` walks the
    links reported by ``Page.get_links`` in first-in, first-out order.

    NOTE(review): ``max_iter`` and ``max_time`` are stored on the instance
    but ``crawl`` never reads them, so the crawl is currently unbounded.
    Their intended units are not established in this file -- confirm
    before enforcing them.
    """

    def __init__(self, url_arg, max_iter_arg, max_time_arg):
        """Store the crawl settings and build the page for the start URL.

        Args:
            url_arg: URL at which the crawl starts.
            max_iter_arg: Iteration limit (stored only, see class note).
            max_time_arg: Time limit (stored only, see class note).
        """
        self.max_iter = max_iter_arg
        self.max_time = max_time_arg
        self.start_url = url_arg
        self.graph = None  # never assigned anything else in this class
        self.page_array = []  # never filled in this class
        # The meaning of the leading 0 is defined by Page -- TODO confirm.
        self.home_page = Page.Page(0, url_arg)

    def crawl(self):
        """Visit every page reachable from the home page and print progress.

        Each URL is fetched at most once. Every link that has not been
        visited yet at the moment it is discovered is queued and printed
        (so the same URL may be queued and printed more than once before
        it is finally visited). When the queue is exhausted the set of
        visited URLs is printed.

        Returns:
            None.
        """
        # deque gives O(1) removal from the front; list.pop(0) is O(n).
        page_queue = deque([self.home_page.url])
        visited_pages = set()

        while page_queue:
            current_page = page_queue.popleft()
            if current_page in visited_pages:
                # Already handled via an earlier queue entry.
                continue
            visited_pages.add(current_page)

            for local_url in Page.Page(0, current_page).get_links():
                if local_url not in visited_pages:
                    page_queue.append(local_url)
                    print(local_url)

        print(visited_pages)
if __name__ == '__main__':
    # Script entry point: crawl icm.hr, passing one million for both the
    # iteration and the time limit arguments.
    limit = 10**6
    crawler = Crawl('icm.hr', limit, limit)
    crawler.crawl()