-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathscrape.py
80 lines (67 loc) · 2.84 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Scrape metadata attributes from a requested URL."""
def get_title(html):
    """Scrape page title.

    Candidates are tried in order: the ``og:title`` meta tag, the
    ``twitter:title`` meta tag (declared with either the ``property`` or the
    ``name`` attribute), the ``<title>`` element and finally the first
    ``<h1>``. The first candidate that yields a non-empty value is returned.

    :param html: Parsed document exposing a ``find`` method (presumably a
        BeautifulSoup object; only ``find`` is used here).
    :returns: The title, or ``None`` when no candidate yields a value.
    """
    meta_lookups = (
        {"property": "og:title"},
        {"property": "twitter:title"},
        {"name": "twitter:title"},
    )
    for attrs in meta_lookups:
        tag = html.find("meta", attrs=attrs)
        content = tag.get("content") if tag is not None else None
        # A tag without usable content must not stop the search.
        if content:
            return content

    for tag_name in ("title", "h1"):
        tag = html.find(tag_name)
        # ``.string`` can be None even when the element exists, so keep
        # looking instead of returning None early.
        if tag is not None and tag.string:
            return tag.string
    return None
def get_description(html):
    """Scrape page description.

    Meta tags are tried in order: ``description`` (``property`` then
    ``name`` attribute), ``og:description`` and ``twitter:description``
    (``property`` then ``name`` attribute). The first one carrying a
    non-empty ``content`` value is returned. When none does, the contents of
    the first ``<p>`` element are used.

    :param html: Parsed document exposing a ``find`` method (presumably a
        BeautifulSoup object; only ``find`` is used here).
    :returns: The meta ``content`` value, or the ``contents`` attribute of
        the first ``<p>`` element, or ``None`` when nothing is found.
    """
    meta_lookups = (
        {"property": "description"},
        # The plain description meta tag is conventionally declared with the
        # ``name`` attribute, which the ``property`` lookup never matches.
        {"name": "description"},
        {"property": "og:description"},
        {"property": "twitter:description"},
        {"name": "twitter:description"},
    )
    for attrs in meta_lookups:
        tag = html.find("meta", attrs=attrs)
        content = tag.get("content") if tag is not None else None
        if content:
            return content

    paragraph = html.find("p")
    if paragraph is not None:
        # NOTE(review): this fallback returns the element's ``contents``
        # rather than a plain string; kept as-is so existing callers that
        # rely on that shape keep working.
        return paragraph.contents
    return None
def get_image(html):
    """Scrape share image.

    Meta tags are tried in order: ``image``, ``og:image`` and
    ``twitter:image`` (``property`` then ``name`` attribute). The first one
    carrying a non-empty ``content`` value is returned. When none does, the
    ``src`` of the first ``<img>`` element that actually has a ``src``
    attribute is used.

    :param html: Parsed document exposing a ``find`` method (presumably a
        BeautifulSoup object; only ``find`` is used here).
    :returns: The image reference, or ``None`` when nothing is found.
    """
    meta_lookups = (
        {"property": "image"},
        {"property": "og:image"},
        {"property": "twitter:image"},
        {"name": "twitter:image"},
    )
    for attrs in meta_lookups:
        tag = html.find("meta", attrs=attrs)
        content = tag.get("content") if tag is not None else None
        if content:
            return content

    # Filter on ``src`` in the same query that picks the element, so an
    # earlier <img> without a ``src`` cannot shadow a usable one.
    img = html.find("img", src=True)
    if img is not None:
        return img.get("src")
    return None
def get_site_name(html, url):
    """Scrape site name.

    The ``og:site_name`` meta tag is preferred, then ``twitter:site``
    (``property`` then ``name`` attribute). When no tag carries a non-empty
    ``content`` value, the name is derived from ``url``: the first
    dot-separated label of the host, skipping a leading ``www`` label.

    :param html: Parsed document exposing a ``find`` method (presumably a
        BeautifulSoup object; only ``find`` is used here).
    :param url: Address of the page; may be given with or without a scheme.
    :returns: The site name from the page metadata or derived from ``url``.
    """
    meta_lookups = (
        {"property": "og:site_name"},
        {"property": "twitter:site"},
        {"name": "twitter:site"},
    )
    for attrs in meta_lookups:
        tag = html.find("meta", attrs=attrs)
        content = tag.get("content") if tag is not None else None
        if content:
            return content

    # ``[-1]`` keeps the whole string when ``url`` has no ``//`` separator,
    # where indexing ``[1]`` would raise IndexError.
    host = url.split("//", 1)[-1].split("/")[0]
    labels = host.split(".")
    # Only skip ``www`` when another label follows it.
    if labels[0] == "www" and len(labels) > 1:
        return labels[1]
    return labels[0]
def get_favicon(html, url):
    """Scrape favicon.

    Looks for a ``<link>`` element whose ``rel`` is ``icon`` or
    ``shortcut icon`` and returns its ``href`` resolved against ``url``, so
    relative references become absolute. When no such link carries an
    ``href``, falls back to ``/favicon.ico`` at the root of the site.

    :param html: Parsed document exposing a ``find`` method (presumably a
        BeautifulSoup object; only ``find`` is used here).
    :param url: Address of the page the document was loaded from.
    :returns: Address of the favicon.
    """
    from urllib.parse import urljoin, urlsplit

    for rel in ("icon", "shortcut icon"):
        link = html.find("link", attrs={"rel": rel})
        href = link.get("href") if link is not None else None
        if href:
            # Absolute hrefs pass through unchanged; relative ones are
            # resolved against the page address.
            return urljoin(url, href)

    parts = urlsplit(url)
    if parts.scheme and parts.netloc:
        # Use the site root rather than the page path for the default icon.
        return f"{parts.scheme}://{parts.netloc}/favicon.ico"
    # ``url`` has no recognizable scheme/host: keep the plain suffix form.
    return f'{url.rstrip("/")}/favicon.ico'
def get_theme_color(html):
    """Scrape brand color.

    Reads the ``theme-color`` meta tag, looked up by its ``property``
    attribute and then by its ``name`` attribute.

    :param html: Parsed document exposing a ``find`` method (presumably a
        BeautifulSoup object; only ``find`` is used here).
    :returns: The tag's ``content`` value, or ``None`` when no tag carries
        one.
    """
    # ``theme-color`` is conventionally declared via the ``name`` attribute;
    # the ``property`` form is still checked first for pages that use it.
    for attrs in ({"property": "theme-color"}, {"name": "theme-color"}):
        tag = html.find("meta", attrs=attrs)
        color = tag.get("content") if tag is not None else None
        if color:
            return color
    return None