dfs.py
import sys
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup, SoupStrainer


def dfs(url, callback, urls=None, depth=1, max_depth=10):
    """
    Recursive depth-first search for links.
    """
    # Avoid the mutable-default-argument pitfall: create one
    # visited set per crawl instead of sharing it across calls.
    if urls is None:
        urls = set()
    # Base case: stop at the depth limit or if already visited.
    if depth > max_depth or url in urls:
        return
    # Mark as visited so we do not revisit.
    urls.add(url)
    # HTTP GET request.
    root_html = urllib.request.urlopen(url).read()
    callback(url, root_html)
    # Parse only the 'a' tags and iterate over them.
    soup = BeautifulSoup(root_html, 'html.parser', parse_only=SoupStrainer('a'))
    for link in soup.find_all('a', href=True):
        try:
            # Ignore fragment and query-string links.
            if not (link['href'].startswith('#') or link['href'].startswith('?')):
                # Resolve to an absolute URL before recursing.
                next_url = urljoin(url, link['href'])
                dfs(next_url, callback, urls, depth + 1, max_depth)
        except Exception as e:
            print(e)


if __name__ == "__main__":
    # Crawls an Apache directory listing and prints all of the paths discovered.
    base_url = sys.argv[1]

    def cb(url, html):
        print(url.replace(base_url, ''))

    dfs(base_url, cb)
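
Usage sketch (the URL below is hypothetical; any Apache-style directory index should work, and bs4 must be installed via `pip install beautifulsoup4`):

    python dfs.py http://example.com/pub/

Each discovered path is printed relative to the base URL, one per line, in depth-first order.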