# webpage-email-scrape.py
# Script to scrape all the e-mail addresses in a webpage.
# Importing required libraries and modules.
import re  # For regular expression operations
from collections import deque  # A list-like container with fast appends and pops
from urllib.parse import urlsplit  # For splitting a URL into its parts

import requests  # For sending HTTP requests to the server
import requests.exceptions  # For handling request exceptions
from bs4 import BeautifulSoup  # A Python package for parsing HTML and XML documents
# Enter the webpage to scrape as the original_url.
original_url = input("Enter the webpage url: ")

# A queue of URLs that still need to be scraped.
unprocessed_urls = deque([original_url])
# A set of already crawled URLs.
processed_urls = set()
# A set to collect the e-mail addresses found so far.
emails = set()
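# The deque yields URLs in first-in, first-out (breadth-first) order; the sets
# automatically de-duplicate crawled URLs and fetched e-mail addresses.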
# Process URLs one by one until the queue is exhausted.
while unprocessed_urls:
    # Move a URL from the queue to the set of processed URLs.
    url = unprocessed_urls.popleft()  # popleft() removes and returns an element from the left side of the deque.
    processed_urls.add(url)

    # Extract the base URL to resolve relative links, i.e. extract the different parts of the URL.
    # urlsplit() returns a 5-tuple: (addressing scheme, network location, path, query, fragment identifier),
    # from which we build the base and path parts of the website URL.
    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[: url.rfind("/") + 1] if "/" in parts.path else url
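    # For example (hypothetical URL), with url = "https://example.com/team/about.html":
    #   base_url -> "https://example.com"
    #   path     -> "https://example.com/team/"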
    # Send an HTTP GET request to the website.
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        # Ignore pages with errors and continue with the next URL.
        continue
    # Extract all e-mail addresses and add them to the resulting set.
    # The pattern matches a simple user@domain.tld shape, case-insensitively (re.I).
    new_emails = set(
        re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I)
    )
    emails.update(new_emails)
    print(emails)
    # Find all linked URLs in the page.
    # Create a BeautifulSoup object to parse the HTML document.
    soup = BeautifulSoup(response.text, "html.parser")
    # Once the document is parsed, find and process all the anchors
    # (i.e. linked URLs), since the linked pages may also contain e-mails.
    for anchor in soup.find_all("a"):
        # Extract the link URL from the anchor tag.
        link = anchor.attrs["href"] if "href" in anchor.attrs else ""
        # Resolve relative links (starting with /).
        if link.startswith("/"):
            link = base_url + link
        elif not link.startswith("http"):
            link = path + link
        # Add the new URL to the queue if it is in neither the unprocessed nor the processed list yet.
        if link not in unprocessed_urls and link not in processed_urls:
            unprocessed_urls.append(link)
# End of the script.
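
# Example (hypothetical) session, assuming the start page exposes one address:
#   Enter the webpage url: https://example.com
#   {'info@example.com'}
# Note: the crawl only stops once the queue is empty, so on a large site this
# script can run for a long time; interrupt it with Ctrl+C when needed.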