# Library o' code:
import re
import urllib2
import urlparse

class WordOccurrences:
    # Tracks how often each word appears on each URL,
    # as a mapping of word -> {url: count}.
    def __init__(self):
        self.occurrences = dict()

    def __repr__(self):
        output = ''
        for word in self.occurrences:
            output += '{}:\n'.format(word)
            for url in self.occurrences[word]:
                output += '  {}: {}\n'.format(url, self.occurrences[word][url])
        return output

    def record_word_occurrence(self, url, word):
        if word in self.occurrences:
            if url in self.occurrences[word]:
                self.occurrences[word][url] += 1
            else:
                self.occurrences[word][url] = 1
        else:
            self.occurrences[word] = {url: 1}

    def get_best_url_for_word(self, word):
        if word in self.occurrences:
            # Pick the URL with the highest count for this word.
            counts = self.occurrences[word]
            best_url = max(counts, key=counts.get)
            return "Best URL is " + best_url
        else:
            return "Word not available anywhere"
# Global store shared by all the functions below.
occurrences = WordOccurrences()

def get_web_page(url):
    print 'Getting webpage for:', url
    try:
        return urllib2.urlopen(url).read()
    except Exception:
        # Treat unreachable or malformed URLs as an empty page.
        return ''
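
# Note: urllib2.urlopen needs a full URL including the scheme, e.g.
# 'http://example.com/' rather than just 'example.com'.
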
def get_links(page):
    # Pull the quoted value out of every href="..." attribute.
    return re.compile(r'href\s*=\s*"\s*([^"]+)"').findall(page)
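
# For example (illustrative, not executed here):
# >>> get_links('<a href="http://example.com/page">a link</a>')
# ['http://example.com/page']
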
def crawl_for_links(url, extract_links_function, depth=0):
    # Recursively gather links, following pages at most two levels deep.
    if depth > 1:
        return []
    links = extract_links_function(url)
    sublinks = []
    for link in links:
        # Collect results in a separate list: appending to `links` while
        # iterating over it would also crawl every newly discovered link.
        sublinks += crawl_for_links(link, extract_links_function, depth + 1)
    return set(links + sublinks)
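
# Illustrative run against a stub link extractor (hypothetical page graph,
# not real URLs; set ordering may vary):
# >>> fake_links = {'a': ['b'], 'b': ['c'], 'c': ['d']}
# >>> crawl_for_links('a', lambda page: fake_links.get(page, []))
# set(['b', 'c'])
# 'd' is never reached because the crawl stops beyond depth 1.
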
# Fetch the content of a web page:
url = raw_input("give me a url: ")
webpage = get_web_page(url)
print 'Raw HTML from {}:'.format(url)
print webpage
print '------------------------------------'
# Identify how often each word occurs in the page:
def record_occurrences(url, webpage):
    # Naive whitespace tokenisation.
    words = webpage.split()
    for word in words:
        occurrences.record_word_occurrence(url, word)
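
# Note (illustrative): str.split() keeps punctuation attached, so
# 'Hello, world.'.split() yields ['Hello,', 'world.'] and "world." is
# counted separately from "world".
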
record_occurrences(url, webpage)
print occurrences
# Extract anchor targets from the document and make every link absolute:
def get_cleaned_links(url):
    webpage = get_web_page(url)
    links = get_links(webpage)
    cleaned_links = []
    for link in links:
        if link.startswith('http://'):
            cleaned_links.append(link)
        else:
            # Resolve relative links against the page's URL; plain string
            # concatenation breaks when the page URL has its own path.
            cleaned_links.append(urlparse.urljoin(url, link))
    return cleaned_links
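
# For example (illustrative):
# >>> urlparse.urljoin('http://example.com/docs/index.html', 'about.html')
# 'http://example.com/docs/about.html'
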
print 'Cleaned up links: '
print get_cleaned_links(url)
print '------------------------------------'
# Visit each link and record how often each word occurs:
all_links = crawl_for_links(url, get_cleaned_links)
for link in all_links:
    webpage = get_web_page(link)
    record_occurrences(link, webpage)
print occurrences
# Provide a user interface to request best page for a word:
word = raw_input("word to search for: ")
best_url = occurrences.get_best_url_for_word(word)
print best_url