forked from HuskyChaos/pythonScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
webscraper.py
executable file
·38 lines (28 loc) · 966 Bytes
/
webscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from distutils.filelist import findall
import urllib.request
from bs4 import BeautifulSoup
from bs4 import Comment
import argparse
def urlCheck(url):
    """Ensure *url* has an HTTP(S) scheme, then scrape it via ``openUrl``.

    Args:
        url: Address to scrape, with or without a leading ``http://`` or
            ``https://``. When neither prefix is present, ``http://`` is
            prepended before the request is made.
    """
    # The scheme has to be tested as a *prefix*. A substring test of the form
    # "'http://' not in url or 'https://' not in url" is true for every
    # ordinary URL (none contains both), which turned "https://example.com"
    # into "http://https://example.com". Schemes are case-insensitive, hence
    # the lower().
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    openUrl(url)
def openUrl(url):
    """Fetch *url* and print its link targets and HTML comments to stdout.

    Args:
        url: Absolute URL handed directly to ``urllib.request.urlopen``.

    The output has two sections: "Links", with one ``href`` value per line,
    and "Comments", showing the list of comment nodes found in the page.
    Exceptions raised by ``urlopen`` are not handled here and propagate to
    the caller.
    """
    # Only the download needs the open connection; parse after it is closed.
    with urllib.request.urlopen(url) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser')

    print("========= Links =========")
    # href=True keeps only anchors that actually carry an href attribute.
    # Indexing a['href'] on a bare <a name="..."> raises KeyError, which would
    # abort the listing and skip the comments section entirely.
    for a in soup.find_all('a', href=True):
        print(a['href'])

    print("========= Comments =========")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    print(comments)
# Command-line entry point: read the target URL from -u/--url and scrape it.
try:
    parser = argparse.ArgumentParser(description='Fetch comments and anchor tags from a website')
    # required=True lets argparse reject a missing URL with a usage message.
    # Without it arg.url is None and the run ends with an unrelated
    # "NoneType" error message printed by the handler below.
    parser.add_argument('-u', '--url', type=str, required=True, help='URL to scrape')
    arg = parser.parse_args()
    urlCheck(arg.url)
except Exception as e:
    # Top-level boundary: show the failure (unreachable host, malformed URL,
    # HTTP error, ...) as a one-line message instead of a traceback.
    print(e)