diff --git a/helpers/sri_helper.py b/helpers/sri_helper.py index a734b9e9..f8d1d2ad 100644 --- a/helpers/sri_helper.py +++ b/helpers/sri_helper.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=too-many-lines import base64 +import json import re import urllib import urllib.parse @@ -141,44 +142,49 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result): # https://www.srihash.org/ content = res['content']['text'] # TODO: Should we match all elements and give penalty when used wrong? - regex = ( - r'(?P<(?Plink|script) [^>]*?>)' - ) - - matches = re.finditer(regex, content, re.MULTILINE | re.IGNORECASE) - for _, match in enumerate(matches, start=1): - raw = match.group('raw') - name = match.group('name').lower() - - src = None - regex_src = r'(href|src)="(?P[^"\']+)["\']' - group_src = re.search(regex_src, raw, re.IGNORECASE) - if group_src is not None: - src = group_src.group('src') - src = url_2_host_source(src, req_domain) - - link_rel = None - regex_rel = r'(rel)="(?P[^"\']+)["\']' - group_rel = re.search(regex_rel, raw, re.IGNORECASE) - if group_rel is not None: - link_rel = group_rel.group('rel').lower() - - should_have_integrity = False - if name in ('link'): - if link_rel in ('stylesheet', 'preload', 'modulepreload'): - should_have_integrity = True - elif name in ('script') and src is not None: - should_have_integrity = True - - if should_have_integrity: - print('A', raw) - print('\tname:', name) - print('\tsrc/href:', src) - print('') - + candidates = get_sri_candidates(req_domain, content) + nice_candidates = json.dumps(candidates, indent=3) + print('Candidates', nice_candidates) # regex = ( # r'(?P<(?Plink|script)[^<]*? integrity=["\'](?P[^"\']+)["\'][^>]*?>)' # ) + found_sris = get_sris(req_domain, content) + nice_found_sris = json.dumps(found_sris, indent=3) + print('SRI', nice_found_sris) + + csp_findings_match = csp_findings_match or append_csp_data_for_linked_resources( + req_domain, + org_domain, + result, + content) + + regex = r'<(?Pstyle|script|form)>' + matches = re.finditer(regex, content, re.DOTALL | re.IGNORECASE | re.MULTILINE) + for _, match in enumerate(matches, start=1): + element_name = match.group('type').lower() + if element_name in ('style', 'script'): + key = f'\'unsafe-inline\'|{element_name}' + if key not in result[org_domain]['csp-findings']['quotes']: + result[org_domain]['csp-findings']['quotes'].append(key) + csp_findings_match = True + elif element_name == 'form': + element_url = url_2_host_source(req_url, req_domain) + o = urllib.parse.urlparse(element_url) + element_domain = o.hostname + if element_domain == org_domain: + key = f'\'self\'|{element_name}' + if key not in result[org_domain]['csp-findings']['quotes']: + result[org_domain]['csp-findings']['quotes'].append(key) + csp_findings_match = True + else: + key = f'{element_domain}|{element_name}' + if key not in result[org_domain]['csp-findings']['host-sources']: + result[org_domain]['csp-findings']['host-sources'].append(key) + csp_findings_match = True + return csp_findings_match + +def get_sris(req_domain, content): + sri_list = [] regex = ( r'(?P<(?P[a-z]+)[^<]*? integrity=["\'](?P[^"\']+)["\'][^>]*?>)' ) @@ -192,9 +198,14 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result): # - rel="stylesheet" # - rel="preload" # - rel="modulepreload" - print('B', raw) - print('\tname:', name) - print('\tintegrity:', integrity) + sri = { + 'raw': raw, + 'tag-name': name, + 'integrity': integrity + } + # print('B', raw) + # print('\tname:', name) + # print('\tintegrity:', integrity) src = None regex_src = r'(href|src)="(?P[^"\']+)["\']' @@ -202,7 +213,8 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result): if group_src is not None: src = group_src.group('src') src = url_2_host_source(src, req_domain) - print('\tsrc/href:', src) + sri['src'] = src + # print('\tsrc/href:', src) src_type = None if name == 'script': @@ -223,8 +235,10 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result): if src_type is None and link_rel in ('stylesheet'): src_type = 'style' - print('\ttype:', src_type) - print('\trel:', link_rel) + sri['type'] = src_type + sri['rel'] = link_rel + # print('\ttype:', src_type) + # print('\trel:', link_rel) if name in ('link'): if link_rel not in ('stylesheet', 'preload', 'modulepreload'): @@ -236,37 +250,53 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result): print('WEBSITE WARNING: USING integrity incorrectly!') print('') + sri_list.append(sri) - csp_findings_match = csp_findings_match or append_csp_data_for_linked_resources( - req_domain, - org_domain, - result, - content) + return sri_list - regex = r'<(?Pstyle|script|form)>' - matches = re.finditer(regex, content, re.DOTALL | re.IGNORECASE | re.MULTILINE) +def get_sri_candidates(req_domain, content): + candidates = [] + regex = ( + r'(?P<(?Plink|script) [^>]*?>)' + ) + + matches = re.finditer(regex, content, re.MULTILINE | re.IGNORECASE) for _, match in enumerate(matches, start=1): - element_name = match.group('type').lower() - if element_name in ('style', 'script'): - key = f'\'unsafe-inline\'|{element_name}' - if key not in result[org_domain]['csp-findings']['quotes']: - result[org_domain]['csp-findings']['quotes'].append(key) - csp_findings_match = True - elif element_name == 'form': - element_url = url_2_host_source(req_url, req_domain) - o = urllib.parse.urlparse(element_url) - element_domain = o.hostname - if element_domain == org_domain: - key = f'\'self\'|{element_name}' - if key not in result[org_domain]['csp-findings']['quotes']: - result[org_domain]['csp-findings']['quotes'].append(key) - csp_findings_match = True - else: - key = f'{element_domain}|{element_name}' - if key not in result[org_domain]['csp-findings']['host-sources']: - result[org_domain]['csp-findings']['host-sources'].append(key) - csp_findings_match = True - return csp_findings_match + raw = match.group('raw') + name = match.group('name').lower() + + src = None + regex_src = r'(href|src)="(?P[^"\']+)["\']' + group_src = re.search(regex_src, raw, re.IGNORECASE) + if group_src is not None: + src = group_src.group('src') + src = url_2_host_source(src, req_domain) + + link_rel = None + regex_rel = r'(rel)="(?P[^"\']+)["\']' + group_rel = re.search(regex_rel, raw, re.IGNORECASE) + if group_rel is not None: + link_rel = group_rel.group('rel').lower() + + should_have_integrity = False + if name in ('link'): + if link_rel in ('stylesheet', 'preload', 'modulepreload'): + should_have_integrity = True + elif name in ('script') and src is not None: + should_have_integrity = True + + if should_have_integrity: + # print('A', raw) + # print('\tname:', name) + # print('\tsrc/href:', src) + # print('') + candidates.append({ + 'raw': raw, + 'tag-name': name, + 'src': src + }) + + return candidates def append_csp_data_for_linked_resources(req_domain, org_domain, result, content): """