#!/usr/bin/env python3
import sys
import re
from urllib.parse import urlparse

from warcio.capture_http import capture_http
# The warcio documentation stresses that requests must be imported *after*
# capture_http so that its HTTP traffic is actually captured.
import requests
from requests_html import HTML
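
# Usage (inferred from the argument handling in __main__ below):
#   python3 create_warc.py <page_url> <output_name>
# e.g. python3 create_warc.py https://example.com example  ->  writes example.warc.gz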


def check_url_similarity(url_1, url_2):
    '''Compare two URLs to decide whether they point to the same resource.
    Returns bool: True/False based on the comparison.'''
    def check_path(path_1, path_2):
        # Handles cases where the paths are identical or only differ by a trailing /
        if path_1 == path_2:
            return True
        if path_1 == path_2 + '/' or \
                path_1 + '/' == path_2:
            return True
        else:
            return False

    if len(url_2) == len(url_1):
        # Equal-length URLs can only match if they are string-equal: the
        # trailing-slash and 'www.' variations below always change the length.
        if url_1 == url_2:
            return True
    else:
        url_1_struct = urlparse(url_1)
        url_2_struct = urlparse(url_2)
        # Same host: compare paths, ignoring a trailing slash
        if url_1_struct.netloc == url_2_struct.netloc:
            if check_path(url_1_struct.path, url_2_struct.path):
                return True
        # Hosts that only differ by a leading 'www.' count as the same host
        if url_1_struct.netloc == 'www.' + url_2_struct.netloc or \
                'www.' + url_1_struct.netloc == url_2_struct.netloc:
            if check_path(url_1_struct.path, url_2_struct.path):
                return True
    return False
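
# Illustrative behaviour of check_url_similarity (example calls added here for
# clarity; they are not part of the original script):
#   check_url_similarity('https://example.com/a', 'https://example.com/a/')    -> True
#   check_url_similarity('https://example.com/a', 'https://www.example.com/a') -> True
#   check_url_similarity('https://example.com/a', 'https://example.com/b')     -> False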


def find_js_urls(html):
    '''Find URLs of all JavaScript files used in the HTML.'''
    js_urls = [elem.attrs['src'] for elem in html.find('script') if 'src' in elem.attrs]
    print("Found {} URLs for JavaScript files".format(len(js_urls)))
    return js_urls


def find_img_urls(html):
    '''Find URLs of all images used in the HTML.'''
    img_urls = [elem.attrs['src'] for elem in html.find('img') if 'src' in elem.attrs]
    print("Found {} URLs for images".format(len(img_urls)))
    return img_urls


def find_css_urls(html):
    '''Find URLs of all stylesheets used in the HTML.'''
    css_urls = []
    # Linked CSS that uses a link tag.
    # Syntax: <link href="css_file.css" rel="stylesheet" type="text/css" media="all">
    for elem in html.find('link'):
        # 'rel' may be missing entirely, so default to an empty tuple
        if 'stylesheet' in elem.attrs.get('rel', ()):
            if 'href' in elem.attrs:
                css_urls.append(elem.attrs['href'])
    # Linked CSS that uses a style tag with a src attribute
    for elem in html.find('style'):
        if 'src' in elem.attrs:
            print('src inside style ', elem.html)
            css_urls.append(elem.attrs['src'])
    # Linked CSS that uses a style tag and @import.
    # Syntax: <style type="text/css" media="all">@import url("//www.mygov.in/modules/system/system.base.css?q7y7yf");</style>
    for elem in html.find('style'):
        for url in re.findall(r'url\((.*?)\)', elem.html):
            if url[0] == '"' or url[0] == "'":
                # Strip the surrounding quotes from the URL
                css_urls.append(url[1:-1])
            else:
                # Cover cases where the URL is not wrapped in quotes, e.g.
                # url(https://www.hindustantimes.com/images/app-images/2019/4/authors/footer-sprite.svg)
                # This seems to happen with HindustanTimes pages
                css_urls.append(url)
    print("Found {} URLs for CSS files".format(len(css_urls)))
    return css_urls


def get_all_static_urls(html):
    '''Collect the URLs of all static resources (JS, images, CSS) referenced by the page.'''
    static_urls = set.union(
        set(find_js_urls(html)),
        set(find_img_urls(html)),
        set(find_css_urls(html))
    )
    return static_urls


def filter_records(request, response, recorder):
    '''Only write request/response pairs whose response status is 200;
    returning (None, None) tells capture_http to skip the record.'''
    if response.http_headers.get_statuscode() != '200':
        print("skipping response with status {}".format(response.http_headers.get_statuscode()))
        return None, None
    return request, response


if __name__ == "__main__":
    # sys.argv[1]: URL of the page to archive; sys.argv[2]: output file name (without extension)
    warc_file = '{}.warc.gz'.format(sys.argv[2])
    with capture_http(warc_file, filter_records):
        response = requests.get(sys.argv[1])
        html = HTML(html=response.text, url=response.url)
        static_resource_urls = get_all_static_urls(html)
        for static_resource in static_resource_urls:
            # Skip relative URLs (no hostname)
            if not urlparse(static_resource).hostname:
                continue
            # Skip resources that point back to the page itself
            if check_url_similarity(static_resource, html.url):
                print('skipping: {}'.format(static_resource))
                continue
            # Protocol-relative URLs (//host/path) get an explicit https scheme
            if not urlparse(static_resource).scheme:
                static_resource = 'https:' + static_resource
            print('downloading resource: {}'.format(static_resource))
            requests.get(static_resource)
    print('Done getting archive of: {}'.format(sys.argv[1]))
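
    # Optional sanity check (a sketch added here; not part of the original script):
    # read the archive back with warcio's ArchiveIterator and list what was captured.
    from warcio.archiveiterator import ArchiveIterator
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            print(record.rec_type, record.rec_headers.get_header('WARC-Target-URI'))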