Skip to content

Commit

Permalink
Add wappalyzer dom detection and unit test about it
Browse files Browse the repository at this point in the history
  • Loading branch information
Darkiros authored and fwininger committed Jan 24, 2023
1 parent 49a8626 commit 5327eb7
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 4 deletions.
52 changes: 52 additions & 0 deletions tests/attack/test_mod_wapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,58 @@ async def test_html_detection():
'["Development"], "groups": ["Web development"]}'
)

@pytest.mark.asyncio
@respx.mock
async def test_dom_detection():
# Test if application is detected using its dom regex
respx.get("http://perdu.com").mock(
return_value=httpx.Response(
200,
text="<html><head><title>Vous Etes Perdu ?</title> \
<link href=\"/wp-content/plugins/astra-widgets/test.css?ver=1.5.4\" rel=\"stylesheet\" >\
</head><body><h1>Perdu sur l'Internet ?</h1> \
<h2>Pas de panique, on va vous aider</h2> \
<strong><pre> * <----- vous &ecirc;tes ici</pre></strong> \
<input type=\"hidden\" name=\"_glpi_csrf_token\" value=\"b6db36a8c9fd4f3f5d244faa76247688\">\
<p id=\"mod-sellacious-cart\">test text</p> \
<p id=\"sm-page-footer-copyright\">SmugMug</p> \
<img src=\"www.afi-b.com\" /> \
<a href=\"/cart\">test</a> \
</body></html>"
)
)

persister = AsyncMock()
home_dir = os.getenv("HOME") or os.getenv("USERPROFILE") or "/home"
base_dir = os.path.join(home_dir, ".wapiti")
persister.CONFIG_DIR = os.path.join(base_dir, "config")

request = Request("http://perdu.com")
request.path_id = 1

crawler_configuration = CrawlerConfiguration(Request("http://perdu.com"))
async with AsyncCrawler.with_configuration(crawler_configuration) as crawler:
options = {"timeout": 10, "level": 2}

module = ModuleWapp(crawler, persister, options, Event(), crawler_configuration)

await module.attack(request)

assert persister.add_payload.call_count
expected_result = [
'{"name": "Astra Widgets", "versions": ["1.5.4"], "categories": ["WordPress plugins", "Widgets"], "groups": ["Add-ons", "Other"]}',
'{"name": "GLPI", "versions": [], "categories": ["Web frameworks", "CRM"], "groups": ["Web development", "Marketing", "Business tools"]}',
'{"name": "Sellacious", "versions": [], "categories": ["Ecommerce"], "groups": ["Sales"]}',
'{"name": "SmugMug", "versions": [], "categories": ["Photo galleries"], "groups": ["Content", "Media"]}',
'{"name": "Affiliate B", "versions": [], "categories": ["Affiliate programs", "Advertising"], "groups": ["Marketing"]}',
'{"name": "Cart Functionality", "versions": [], "categories": ["Ecommerce"], "groups": ["Sales"]}',
'{"name": "PHP", "versions": [], "categories": ["Programming languages"], "groups": ["Web development"]}'

]
for arg in persister.add_payload.call_args_list:
assert arg[1]["info"] in expected_result



@pytest.mark.asyncio
@respx.mock
Expand Down
118 changes: 114 additions & 4 deletions wapitiCore/wappalyzer/wappalyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import warnings
from typing import Set
from soupsieve.util import SelectorSyntaxError

from wapitiCore.net.crawler import Response
from wapitiCore.parsers.html_parser import Html
Expand Down Expand Up @@ -64,10 +65,21 @@ def normalize_applications(self):
# Ensure to not iterate on a string value
self.applications[application_name][list_field] = [self.applications[application_name][list_field]]

for dict_field in ["meta", "cookies", "headers"]:
for dict_field in ["meta", "cookies", "headers", "dom"]:
if dict_field not in self.applications[application_name]:
# Complete with empty elements if not already present
self.applications[application_name][dict_field] = {}
# To deal with dom
elif dict_field == "dom" and not isinstance(self.applications[application_name][dict_field], dict):
temp_dict = {}
dom = self.applications[application_name][dict_field]
if isinstance(dom, str):
temp_dict.update({dom: ""})
elif isinstance(dom, list):
for selector in dom:
temp_dict.update({selector: ""})
self.applications[application_name][dict_field] = temp_dict

elif not isinstance(self.applications[application_name][dict_field], dict):
# Raise an exception if the provided field is not a dict
raise ApplicationDataException(
Expand Down Expand Up @@ -130,6 +142,34 @@ def normalize_application_regex(self):
for i, pattern in enumerate(self.applications[application_name][dict_field][key]):
self.applications[application_name][dict_field][key][i] = self.normalize_regex(pattern)

# Format each dom of applications
tmp_dom = []
for (key, patterns) in self.applications[application_name]["dom"].items():
tmp_dom.append(self.normalize_application_regex_dom({key:patterns}))
self.applications[application_name]["dom"] = tmp_dom

@staticmethod
def normalize_application_regex_dom(dom : dict) -> dict:
"""
Convert the result of wappalyzer to a generic format used by Wapiti.
The goal is to match with the css selector if we don't have a regex.
In that case we set an empty value else we put the regex.
The list contains only one item same as the idct contains only one key to fit with the wapiti format.
ex input of the function :
{"link[href*='/wp-content/plugins/wp-statistics/']": ''}
{'[data-block-key]': {'attributes': {'data-block-key': '[a-z0-9]{5}'}}}
ex output :
{"link[href*='/wp-content/plugins/wp-statistics/']": {'exists': ''}}
{'[data-block-key]': {'attributes': {'data-block-key': '[a-z0-9]{5}'}}}
"""
css_selector = list(dom.keys())[0]
value = dom[css_selector]
if value == "":
return {css_selector : {"exists": ""}}
return dom

@staticmethod
def normalize_regex(pattern: str):
"""
Expand Down Expand Up @@ -290,11 +330,11 @@ def __init__(self, application_data: ApplicationData, web_content: Response, js:
self._url = web_content.url
self._html_code = web_content.content
# Copy some values to make sure they aren't processed more than once
html = Html(self._html_code, self._url)
self._scripts = html.scripts[:]
self.html = Html(self._html_code, self._url)
self._scripts = self.html.scripts[:]
self._cookies = dict(web_content.cookies)
self._headers = web_content.headers
self._metas = dict(html.metas)
self._metas = dict(self.html.metas)
self._js = js

def detect_application_versions(self, application: dict) -> Set[str]:
Expand All @@ -307,6 +347,7 @@ def detect_application_versions(self, application: dict) -> Set[str]:
"html": self._html_code,
"scriptSrc": self._scripts,
}

for element_name, data in elements_to_check.items():
versions.update(detect_versions_normalize_list(application[element_name], data))

Expand All @@ -318,8 +359,77 @@ def detect_application_versions(self, application: dict) -> Set[str]:
for element, data in elements_to_check.items():
versions.update(detect_versions_normalize_dict(application[element], data))

# Detect version of dom element
versions.update(self.detect_versions_normalize_dom(application))

return versions

def detect_versions_normalize_dom(self, application: dict) -> Set[str]:
versions = set()
soup = self.html.soup
for dom_raw in application["dom"]:
for css_selector, value in dom_raw.items():
self.check_dom_attribute(soup, versions, css_selector, value)
return versions

def check_dom_attribute(self, soup, versions : set, css_selector, value):
try:
match = soup.select(css_selector)
except SelectorSyntaxError as err:
warnings.warn(
f"Caught {err} while selecting css selector: {css_selector}")
return
for attribute, data in value.items():
if attribute == "exists":
self.check_dom_attribute_exists(match, versions)
elif attribute == "text":
self.check_dom_attribute_text(match, versions, data)

elif attribute == "attributes":
self.check_dom_attribute_others(match, versions, data)

@staticmethod
def check_dom_attribute_exists(match, versions : set):
# if attribute is "exists" we just want to match with the css selector
if match:
versions.add("__detected__")

@staticmethod
def check_dom_attribute_text(match, versions: set, data):
# if data is empty you just want to match with the css selector and check if the attribute exist
if data == "":
for match_html in match:
if match_html.getText():
versions.add("__detected__")
# Else we have to get value inside the attribute and it must match with the regex
else:
regex = ApplicationData.normalize_regex(data)
for match_html in match:
match_html_text = match_html.getText()
if match_html_text and re.search(regex["regex"], str(match_html_text)):
versions.add("__detected__")
versions.update(extract_version(regex, str(match)))

@staticmethod
def check_dom_attribute_others(match, versions: set, data):
for attribute, value in data.items():
# if data is empty you just want to match with the css selector and check if the attribute exist
if value == "":
for match_html in match:
if match_html.get(attribute):
versions.add("__detected__")
# Else we have to get value inside the attribute and it must match with the regex
else:
regex = ApplicationData.normalize_regex(value)
for match_html in match:
if attribute == "text":
match_html_text = match_html.getText()
else:
match_html_text = match_html.get(attribute)
if match_html_text and re.search(regex["regex"], str(match_html_text)):
versions.add("__detected__")
versions.update(extract_version(regex, str(match)))

def get_rec_implied_applications(self, detected_applications: Set[str]) -> set:
"""
Return the set of applications implied by the already detected applications.
Expand Down

0 comments on commit 5327eb7

Please sign in to comment.