new: Generate domhash when building tree
Rafiot committed Oct 23, 2024
1 parent 7f5b704 commit 84b81c1
Showing 3 changed files with 46 additions and 47 deletions.
52 changes: 13 additions & 39 deletions har2tree/helper.py
@@ -11,7 +11,6 @@

from base64 import b64decode
from collections import defaultdict
from functools import lru_cache
from io import BytesIO
from logging import LoggerAdapter
from typing import Mapping, MutableMapping, Any
@@ -20,7 +19,6 @@
import filetype # type: ignore

from bs4 import BeautifulSoup, Tag, MarkupResemblesLocatorWarning
from charset_normalizer import from_bytes

warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)

@@ -243,26 +241,10 @@ def _unpack_data_uri(data: str) -> tuple[str, str, BytesIO] | None:
return None


@lru_cache(maxsize=5)
def init_bs4(html_doc: bytes) -> BeautifulSoup | None:
# make BS4's life easier and avoid having it attempt to decode
doc_as_str = str(from_bytes(html_doc).best())
if not doc_as_str:
# no need to bother.
return None
if doc_as_str.startswith('<?xml'):
return BeautifulSoup(doc_as_str, 'lxml-xml')
else:
return BeautifulSoup(doc_as_str, 'lxml')


def find_identifiers(html_doc: bytes) -> dict[str, list[str]] | None:
def find_identifiers(soup: BeautifulSoup) -> dict[str, list[str]] | None:
''' Extracts the identifiers from the HTML blob.
The identifier we extract now is the recaptcha site key, but there will be more.
'''
soup = init_bs4(html_doc)
if soup is None:
return None
to_return: dict[str, list[str]] = defaultdict(list)

default_captchas = ['g-recaptcha', 'h-captcha', 'cf-turnstile']
@@ -280,15 +262,16 @@ def find_identifiers(html_doc: bytes) -> dict[str, list[str]] | None:
# This is beta and kinda fragile, but it's going to find (most) of the google tag IDs
# https://support.google.com/google-ads/answer/12326985?hl=en_us_us
# NOTE: the doc says 9 X, but all the examples I found have 10 X so we cannot trust it
if google_tag_ids := set(re.findall(rb"(?:G-|AW-|GA-|UA-)\w{9,13}", html_doc)):
blocklist = {b'UA-Compatible'}
if google_tag_ids := set(re.findall(r"(?:G-|AW-|GA-|UA-)\w{9,13}", str(soup))):
blocklist = {'UA-Compatible'}
google_tag_ids -= blocklist
to_return['google_tag_ids'] = [i.decode() for i in google_tag_ids]
if google_tag_ids:
to_return['google_tag_ids'] = list(google_tag_ids)

return to_return


def find_external_ressources(html_doc: bytes, base_url: str, all_requests: list[str], full_text_search: bool=True) -> tuple[dict[str, list[str]], dict[str, list[tuple[str, BytesIO]]]]:
def find_external_ressources(soup: BeautifulSoup, base_url: str, all_requests: list[str], full_text_search: bool=True) -> tuple[dict[str, list[str]], dict[str, list[tuple[str, BytesIO]]]]:
""" Get URLs to external contents out of an HTML blob."""
# Source: https://stackoverflow.com/questions/31666584/beutifulsoup-to-extract-all-external-resources-from-html
# Because this is awful.
@@ -311,8 +294,9 @@ def find_external_ressources(html_doc: bytes, base_url: str, all_requests: list[
'meta_refresh': []}

embedded_ressources: dict[str, list[tuple[str, BytesIO]]] = defaultdict(list)
soup = init_bs4(html_doc)
if soup is None:
string_soup = str(soup)
if not string_soup:
# Empty HTML document, nothing to do
return external_ressources, embedded_ressources
for link in soup.find_all(['img', 'script', 'video', 'audio', 'iframe', 'embed',
'source', 'link', 'object']):
@@ -359,12 +343,7 @@ def find_external_ressources(html_doc: bytes, base_url: str, all_requests: list[
external_ressources['meta_refresh'].append(content)

# external stuff loaded from css content, because reasons.
for u in re.findall(rb'url\((?:[\'"])?(.*?)(?:[\'"])?\)', html_doc):
try:
url = u.decode()
except UnicodeDecodeError as e:
logger.info(f'Unable to decode ressource in CSS {u[:20]}[...]: {e}')
continue
for url in re.findall(r'url\((?:[\'"])?(.*?)(?:[\'"])?\)', string_soup):
if url.startswith('data:'):
unpacked = _unpack_data_uri(url)
if unpacked:
@@ -375,18 +354,13 @@ def find_external_ressources(html_doc: bytes, base_url: str, all_requests: list[

# Javascript changing the current page
# I never found a website where it matched anything useful
for u in re.findall(b'(?:window|self|top).location(?:.*)\"(.*?)\"', html_doc):
try:
url = u.decode()
except UnicodeDecodeError as e:
logger.info(f'Unable to decode ressource in JS {u[:20]}[...]: {e}')
continue
for url in re.findall('(?:window|self|top).location(?:.*)\"(.*?)\"', string_soup):
external_ressources['javascript'].append(url)
# NOTE: we may want to extract calls to decodeURI and decodeURIComponent
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent
# Just in case, there is sometimes an unescape call in JS code
for to_unescape in re.findall(br'unescape\(\'(.*)\'\)', html_doc):
for to_unescape in re.findall(r'unescape\(\'(.*)\'\)', string_soup):
unescaped = unquote_to_bytes(to_unescape)
kind = filetype.guess(unescaped)
if kind:
@@ -399,7 +373,7 @@ def find_external_ressources(html_doc: bytes, base_url: str, all_requests: list[

if full_text_search:
# Just regex in the whole blob, because we can
external_ressources['full_regex'] = [url.decode() for url in re.findall(rb'(?:http[s]?:)?//(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html_doc)]
external_ressources['full_regex'] = re.findall(r'(?:http[s]?:)?//(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(soup))
# print("################ REGEXES ", external_ressources['full_regex'])
# NOTE: unescaping a potential URL as HTML content can make it unusable (example: (...)&ltime=(...>) => (...)<ime=(...))
return url_cleanup(external_ressources, base_url, all_requests), embedded_ressources
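For context, a minimal sketch of how the refactored helpers might be called after this change — the caller now parses the document once and passes the resulting BeautifulSoup object to each helper instead of handing raw bytes around. The sample document, base URL, and variable names are illustrative only, not part of the commit:

from bs4 import BeautifulSoup
from har2tree.helper import find_identifiers, find_external_ressources

html_doc = b'<html><body><div class="g-recaptcha" data-sitekey="EXAMPLE-KEY"></div></body></html>'
# Decoding and parsing now happen once in the caller (see _make_soup in nodes.py below).
soup = BeautifulSoup(html_doc.decode('utf-8'), 'lxml')

identifiers = find_identifiers(soup)
external, embedded = find_external_ressources(soup, 'http://example.com/', all_requests=[])

Sharing one parsed soup across the helpers avoids repeated charset detection and re-parsing of the same body, which is roughly what the removed module-level lru_cache on init_bs4 was approximating.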
39 changes: 32 additions & 7 deletions har2tree/nodes.py
@@ -12,8 +12,10 @@
import re

from base64 import b64decode
from charset_normalizer import from_bytes
from datetime import datetime, timedelta
from functools import lru_cache
from functools import lru_cache, cached_property, cache
from hashlib import sha256
from io import BytesIO
from pathlib import Path
from typing import MutableMapping, Any
@@ -81,10 +83,14 @@ def __init__(self, capture_uuid: str, **kwargs: Any):
self.features_to_skip.add('time_content_received')
self.features_to_skip.add('ip_address')

def _compute_domhash(self) -> str:
to_hash = "|".join(t.name for t in self.rendered_soup.findAll()).encode()
return sha256(to_hash).hexdigest()[:32]

def add_rendered_features(self, all_requests: list[str], rendered_html: BytesIO | None=None, downloaded_file: tuple[str, BytesIO | None] | None=None) -> None:
if rendered_html:
self.add_feature('rendered_html', rendered_html)
rendered_external, rendered_embedded = find_external_ressources(rendered_html.getvalue(), self.name, all_requests)
rendered_external, rendered_embedded = find_external_ressources(self.rendered_soup, self.name, all_requests)
if hasattr(self, 'external_ressources'):
# for the external ressources, the keys are always the same
self.external_ressources: dict[str, list[str]] = {initiator_type: urls + rendered_external[initiator_type] for initiator_type, urls in self.external_ressources.items()}
@@ -98,9 +104,13 @@ def add_rendered_features(self, all_requests: list[str], rendered_html: BytesIO
else:
self.add_feature('embedded_ressources', rendered_embedded)

if identifiers := find_identifiers(rendered_html.getvalue()):
if identifiers := find_identifiers(self.rendered_soup):
self.add_feature('identifiers', identifiers)

if domhash := self._compute_domhash():
print(domhash)
self.add_feature('domhash', domhash)

if downloaded_file:
downloaded_filename, downloaded_file_data = downloaded_file
self.add_feature('downloaded_file', downloaded_file_data)
@@ -355,7 +365,8 @@ def load_har_entry(self, har_entry: MutableMapping[str, Any], all_requests: list
if not hasattr(self, 'mimetype'):
self.add_feature('mimetype', '')

external_ressources, embedded_ressources = find_external_ressources(self.body.getvalue(), self.name, all_requests)
soup = self._make_soup(self.body.getvalue())
external_ressources, embedded_ressources = find_external_ressources(soup, self.name, all_requests)
self.add_feature('external_ressources', external_ressources)
self.add_feature('embedded_ressources', embedded_ressources)

@@ -534,18 +545,17 @@ def _sanitize(maybe_url: str) -> str | None:
if not hasattr(self, 'rendered_html') or not self.rendered_html:
raise Har2TreeError('Not the node of a page rendered, invalid request.')
urls: set[str] = set()
soup = BeautifulSoup(self.rendered_html.getvalue(), "lxml")

# The simple ones: the links.
for a_tag in soup.find_all(["a", "area"]):
for a_tag in self.rendered_soup.find_all(["a", "area"]):
href = a_tag.attrs.get("href")
if not href:
continue
if href := _sanitize(href):
urls.add(href)

# The rest of the mess
for tag in soup.find_all(True):
for tag in self.rendered_soup.find_all(True):
if tag.name in ["a", "area", 'img', 'script', 'video', 'audio', 'iframe', 'embed',
'source', 'link', 'object']:
# processed either above or as external resources
@@ -559,6 +569,21 @@ def _sanitize(maybe_url: str) -> str | None:

return sorted(urls)

@cache
def _make_soup(self, html: bytes) -> BeautifulSoup:
# make BS4's life easier and avoid having it attempt to decode
doc_as_str = str(from_bytes(html).best())
if doc_as_str.startswith('<?xml'):
return BeautifulSoup(doc_as_str, 'lxml-xml')
else:
return BeautifulSoup(doc_as_str, 'lxml')

@cached_property
def rendered_soup(self) -> BeautifulSoup:
if not hasattr(self, 'rendered_html') or not self.rendered_html:
raise Har2TreeError('Not the node of a page rendered, invalid request.')
return self._make_soup(self.rendered_html.getvalue())


class HostNode(HarTreeNode):

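As a rough illustration of what the new domhash captures — assuming nothing beyond what the diff shows — the value is a SHA-256 over the pipe-joined tag names of the rendered DOM, truncated to 32 hex characters, so two pages with the same element structure yield the same hash regardless of their text content. The standalone helper below mirrors the new _compute_domhash; the function name and sample documents are made up for the example:

from hashlib import sha256
from bs4 import BeautifulSoup

def domhash(html: str) -> str:
    # Same recipe as _compute_domhash: join every tag name in document order
    # with '|' and keep the first 32 hex characters of the SHA-256.
    soup = BeautifulSoup(html, 'lxml')
    to_hash = "|".join(t.name for t in soup.find_all()).encode()
    return sha256(to_hash).hexdigest()[:32]

# Structurally identical documents share a domhash even when the text differs.
assert domhash('<html><body><p>hello</p></body></html>') == domhash('<html><body><p>goodbye</p></body></html>')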
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "har2tree"
version = "1.26.0"
version = "1.26.1"
description = "HTTP Archive (HAR) to ETE Toolkit generator"
authors = ["Raphaël Vinot <[email protected]>"]
license = "BSD-3-Clause"
