
Commit

Let's use lists
cockroacher committed Sep 27, 2024
1 parent 4b0952b commit 882597a
Showing 1 changed file with 99 additions and 69 deletions.
168 changes: 99 additions & 69 deletions helpers/sri_helper.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# pylint: disable=too-many-lines
import base64
import json
import re
import urllib
import urllib.parse
@@ -141,44 +142,49 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result):
# https://www.srihash.org/
content = res['content']['text']
# TODO: Should we match all elements and give penalty when used wrong?
regex = (
r'(?P<raw><(?P<name>link|script) [^>]*?>)'
)

matches = re.finditer(regex, content, re.MULTILINE | re.IGNORECASE)
for _, match in enumerate(matches, start=1):
raw = match.group('raw')
name = match.group('name').lower()

src = None
regex_src = r'(href|src)="(?P<src>[^"\']+)["\']'
group_src = re.search(regex_src, raw, re.IGNORECASE)
if group_src is not None:
src = group_src.group('src')
src = url_2_host_source(src, req_domain)

link_rel = None
regex_rel = r'(rel)="(?P<rel>[^"\']+)["\']'
group_rel = re.search(regex_rel, raw, re.IGNORECASE)
if group_rel is not None:
link_rel = group_rel.group('rel').lower()

should_have_integrity = False
if name in ('link'):
if link_rel in ('stylesheet', 'preload', 'modulepreload'):
should_have_integrity = True
elif name in ('script') and src is not None:
should_have_integrity = True

if should_have_integrity:
print('A', raw)
print('\tname:', name)
print('\tsrc/href:', src)
print('')

candidates = get_sri_candidates(req_domain, content)
nice_candidates = json.dumps(candidates, indent=3)
print('Candidates', nice_candidates)
# regex = (
# r'(?P<raw><(?P<name>link|script)[^<]*? integrity=["\'](?P<integrity>[^"\']+)["\'][^>]*?>)'
# )
found_sris = get_sris(req_domain, content)
nice_found_sris = json.dumps(found_sris, indent=3)
print('SRI', nice_found_sris)

csp_findings_match = csp_findings_match or append_csp_data_for_linked_resources(
req_domain,
org_domain,
result,
content)

regex = r'<(?P<type>style|script|form)>'
matches = re.finditer(regex, content, re.DOTALL | re.IGNORECASE | re.MULTILINE)
for _, match in enumerate(matches, start=1):
element_name = match.group('type').lower()
if element_name in ('style', 'script'):
key = f'\'unsafe-inline\'|{element_name}'
if key not in result[org_domain]['csp-findings']['quotes']:
result[org_domain]['csp-findings']['quotes'].append(key)
csp_findings_match = True
elif element_name == 'form':
element_url = url_2_host_source(req_url, req_domain)
o = urllib.parse.urlparse(element_url)
element_domain = o.hostname
if element_domain == org_domain:
key = f'\'self\'|{element_name}'
if key not in result[org_domain]['csp-findings']['quotes']:
result[org_domain]['csp-findings']['quotes'].append(key)
csp_findings_match = True
else:
key = f'{element_domain}|{element_name}'
if key not in result[org_domain]['csp-findings']['host-sources']:
result[org_domain]['csp-findings']['host-sources'].append(key)
csp_findings_match = True
return csp_findings_match

def get_sris(req_domain, content):
sri_list = []
regex = (
r'(?P<raw><(?P<name>[a-z]+)[^<]*? integrity=["\'](?P<integrity>[^"\']+)["\'][^>]*?>)'
)
@@ -192,17 +198,23 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result):
# - rel="stylesheet"
# - rel="preload"
# - rel="modulepreload"
print('B', raw)
print('\tname:', name)
print('\tintegrity:', integrity)
sri = {
'raw': raw,
'tag-name': name,
'integrity': integrity
}
# print('B', raw)
# print('\tname:', name)
# print('\tintegrity:', integrity)

src = None
regex_src = r'(href|src)="(?P<src>[^"\']+)["\']'
group_src = re.search(regex_src, raw, re.IGNORECASE)
if group_src is not None:
src = group_src.group('src')
src = url_2_host_source(src, req_domain)
print('\tsrc/href:', src)
sri['src'] = src
# print('\tsrc/href:', src)

src_type = None
if name == 'script':
@@ -223,8 +235,10 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result):
if src_type is None and link_rel in ('stylesheet'):
src_type = 'style'

print('\ttype:', src_type)
print('\trel:', link_rel)
sri['type'] = src_type
sri['rel'] = link_rel
# print('\ttype:', src_type)
# print('\trel:', link_rel)

if name in ('link'):
if link_rel not in ('stylesheet', 'preload', 'modulepreload'):
@@ -236,37 +250,53 @@ def append_sri_data_for_html(req_url, req_domain, res, org_domain, result):
print('WEBSITE WARNING: USING integrity incorrectly!')

print('')
sri_list.append(sri)

csp_findings_match = csp_findings_match or append_csp_data_for_linked_resources(
req_domain,
org_domain,
result,
content)
return sri_list

regex = r'<(?P<type>style|script|form)>'
matches = re.finditer(regex, content, re.DOTALL | re.IGNORECASE | re.MULTILINE)
def get_sri_candidates(req_domain, content):
candidates = []
regex = (
r'(?P<raw><(?P<name>link|script) [^>]*?>)'
)

matches = re.finditer(regex, content, re.MULTILINE | re.IGNORECASE)
for _, match in enumerate(matches, start=1):
element_name = match.group('type').lower()
if element_name in ('style', 'script'):
key = f'\'unsafe-inline\'|{element_name}'
if key not in result[org_domain]['csp-findings']['quotes']:
result[org_domain]['csp-findings']['quotes'].append(key)
csp_findings_match = True
elif element_name == 'form':
element_url = url_2_host_source(req_url, req_domain)
o = urllib.parse.urlparse(element_url)
element_domain = o.hostname
if element_domain == org_domain:
key = f'\'self\'|{element_name}'
if key not in result[org_domain]['csp-findings']['quotes']:
result[org_domain]['csp-findings']['quotes'].append(key)
csp_findings_match = True
else:
key = f'{element_domain}|{element_name}'
if key not in result[org_domain]['csp-findings']['host-sources']:
result[org_domain]['csp-findings']['host-sources'].append(key)
csp_findings_match = True
return csp_findings_match
raw = match.group('raw')
name = match.group('name').lower()

src = None
regex_src = r'(href|src)="(?P<src>[^"\']+)["\']'
group_src = re.search(regex_src, raw, re.IGNORECASE)
if group_src is not None:
src = group_src.group('src')
src = url_2_host_source(src, req_domain)

link_rel = None
regex_rel = r'(rel)="(?P<rel>[^"\']+)["\']'
group_rel = re.search(regex_rel, raw, re.IGNORECASE)
if group_rel is not None:
link_rel = group_rel.group('rel').lower()

should_have_integrity = False
if name in ('link'):
if link_rel in ('stylesheet', 'preload', 'modulepreload'):
should_have_integrity = True
elif name in ('script') and src is not None:
should_have_integrity = True

if should_have_integrity:
# print('A', raw)
# print('\tname:', name)
# print('\tsrc/href:', src)
# print('')
candidates.append({
'raw': raw,
'tag-name': name,
'src': src
})

return candidates

def append_csp_data_for_linked_resources(req_domain, org_domain, result, content):
"""
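For orientation, a minimal sketch of how the two list-returning helpers introduced above, get_sri_candidates and get_sris, might be exercised together. The sample markup, the import path, and the cross-check for tags lacking an integrity attribute are illustrative assumptions, not code from this commit.

# Illustrative usage only: the sample HTML, the import path and the
# missing-integrity comparison are assumptions for demonstration,
# not code from the repository.
from helpers.sri_helper import get_sri_candidates, get_sris

sample_html = (
    '<link rel="stylesheet" href="https://cdn.example.com/site.css">'
    '<script src="https://cdn.example.com/app.js" '
    'integrity="sha384-examplehashvalue"></script>'
)

# Tags that should carry an integrity attribute (link rel=stylesheet/preload/
# modulepreload, or script with a src), as collected by get_sri_candidates.
candidates = get_sri_candidates('example.com', sample_html)

# Tags that already declare an integrity attribute, as collected by get_sris.
found = get_sris('example.com', sample_html)

# Flag candidates that have no matching integrity-carrying tag.
covered = {item['raw'] for item in found}
for item in (c for c in candidates if c['raw'] not in covered):
    print('No integrity attribute on', item['tag-name'], item['src'])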
