-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
72 lines (56 loc) · 1.76 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# A web crawler for extracting email addresses from web pages.
#
# Takes a comma-separated string of URLs and requests each page, checks to
# see if we've found any emails, and prints each email it finds.
#
# Usage: python crawler.py --urls https://www.example.com/
import argparse
import re
import sys
import urllib2
class Crawler(object):
    '''
    A web crawler that extracts email addresses from web pages.

    Takes a comma-separated string of URLs, requests each page, and
    prints every email address found in the page contents.
    '''

    # TLDs we search for, in the order their matches are reported
    # (all .com matches first, then .es, then .org, then .net).
    TLDS = ('com', 'es', 'org', 'net')

    def __init__(self, urls):
        '''
        @urls: a string containing the (comma separated) URLs to crawl.
        '''
        self.urls = urls.split(',')

    def crawl(self):
        '''
        Iterate the list of URLs and request each page, then parse it and
        print the emails we find.
        '''
        for url in self.urls:
            data = self.request(url)
            for email in self.process(data):
                print(email)

    @staticmethod
    def request(url):
        '''
        Request @url and return the page contents.
        '''
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Close explicitly so we don't leak a socket per URL crawled.
            response.close()

    @staticmethod
    def process(data):
        '''
        Process @data and yield the emails we find in it.

        Matches are yielded grouped by TLD in TLDS order, matching the
        original four-loop behavior.
        '''
        # Tolerate a bytes payload (e.g. a raw HTTP response body).
        if isinstance(data, bytes):
            data = data.decode('utf-8', 'replace')
        # One pass per TLD instead of four copy-pasted findall loops.
        for tld in Crawler.TLDS:
            for email in re.findall(r'(\w+@\w+\.%s)' % tld, data):
                yield email
def main():
    '''
    Parse the command line and run the crawler over the given URLs.
    '''
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        '--urls', dest='urls', required=True,
        # Fixed help text: this flag takes URLs, not emails.
        help='A comma separated string of URLs to crawl.')
    parsed_args = argparser.parse_args()
    crawler = Crawler(parsed_args.urls)
    crawler.crawl()
# Script entry point; main() returns None, so sys.exit(None) exits with status 0.
if __name__ == '__main__':
    sys.exit(main())