From 87fdd6deb60aae0a94f5d8cdb8b0f9bc836fb3e0 Mon Sep 17 00:00:00 2001
From: Mariusz
Date: Mon, 26 Dec 2022 17:06:49 +0100
Subject: [PATCH 1/3] added a new feature to handle metadata from the site

---
 core/metadata.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 photon.py        |  9 ++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 core/metadata.py

diff --git a/core/metadata.py b/core/metadata.py
new file mode 100644
index 0000000..ac879c3
--- /dev/null
+++ b/core/metadata.py
@@ -0,0 +1,60 @@
+import extruct
+
+
+def extract_metadata(text, url):
+    """Extract all metadata present in the page and return a dictionary of metadata lists.
+
+    Args:
+        text (string): Response body of the page to parse.
+        url (string): Base URL used to resolve relative links in the metadata.
+
+    Returns:
+        metadata (dict): Dictionary of json-ld, microdata, and opengraph metadata lists.
+    """
+
+    metadata = extruct.extract(text,
+                               base_url=url,
+                               uniform=True,
+                               syntaxes=['json-ld',
+                                         'microdata',
+                                         'opengraph'])
+    return metadata
+
+
+def get_dictionary_by_key_value(dictionary, target_key, target_value):
+    """Return the first dictionary that contains the target key-value pair.
+
+    Args:
+        dictionary: Metadata dictionary containing lists of other dictionaries.
+        target_key: Target key to search for within each listed dictionary.
+        target_value: Target value to search for within each listed dictionary.
+
+    Returns:
+        target_dictionary: Dictionary containing the target key-value pair, or None.
+    """
+    result = None
+    for key in dictionary:
+        if len(dictionary[key]) > 0:
+            for item in dictionary[key]:
+                if item.get(target_key) == target_value:
+                    return item
+    return result
+
+
+def get_dictionary_by_key(dictionary, target_key):
+    """Return the first dictionary that contains the target key.
+
+    Args:
+        dictionary: Metadata dictionary containing lists of other dictionaries.
+        target_key: Target key to search for within each listed dictionary.
+
+    Returns:
+        target_dictionary: Dictionary containing the target key, or None.
+    """
+    result = None
+    for key in dictionary:
+        if len(dictionary[key]) > 0:
+            for item in dictionary[key]:
+                if target_key in item:
+                    return item
+    return result
\ No newline at end of file
diff --git a/photon.py b/photon.py
index 37b7285..d95fc3c 100644
--- a/photon.py
+++ b/photon.py
@@ -36,6 +36,7 @@
 from core.prompt import prompt
 from core.requester import requester
 from core.updater import updater
+from core.metadata import extract_metadata, get_dictionary_by_key_value, get_dictionary_by_key
 from core.utils import (luhn,
                         proxy_type,
                         is_good_proxy,
@@ -80,6 +81,7 @@
                     type=float)
 parser.add_argument('-p', '--proxy', help='Proxy server IP:PORT or DOMAIN:PORT',
                     dest='proxies', type=proxy_type)
+parser.add_argument('-m', '--metadata', help='extract page metadata', dest='metadata')
 
 # Switches
 parser.add_argument('--clone', help='clone the website locally', dest='clone',
@@ -142,6 +144,7 @@
 crawl_level = args.level or 2  # Crawling level
 thread_count = args.threads or 2  # Number of threads
 only_urls = bool(args.only_urls)  # Only URLs mode is off by default
+has_metadata = bool(args.metadata)  # Metadata extraction is off by default
 
 # Variables we are gonna use later to store stuff
 keys = set()  # High entropy strings, prolly secret keys
@@ -158,6 +161,7 @@
 processed = set(['dummy'])  # URLs that have been crawled
 # URLs that belong to the target i.e. in-scope
 internal = set(args.seeds)
+metadata = set()
 
 everything = []
 bad_scripts = set()  # Unclean javascript file urls
@@ -239,6 +243,11 @@ def remove_file(url):
 def extractor(url):
     """Extract details from the response body."""
     response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed)
+
+    # Store extracted metadata as a string, since dicts can't go in a set
+    if has_metadata:
+        metadata.add(str(extract_metadata(response, url)))
+
     if clone:
         mirror(url, response)
     matches = rhref.findall(response)

From 96c97dc89342a94875e3126fac47e90b00b18c67 Mon Sep 17 00:00:00 2001
From: Mariusz
Date: Mon, 26 Dec 2022 17:08:54 +0100
Subject: [PATCH 2/3] update response

---
 photon.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/photon.py b/photon.py
index d95fc3c..d9fb439 100644
--- a/photon.py
+++ b/photon.py
@@ -81,7 +81,7 @@
                     type=float)
 parser.add_argument('-p', '--proxy', help='Proxy server IP:PORT or DOMAIN:PORT',
                     dest='proxies', type=proxy_type)
-parser.add_argument('-m', '--metadata', help='extract page metadata', dest='metadata')
+parser.add_argument('-m', '--metadata', help='extract page metadata', dest='metadata', action='store_true')
 
 # Switches
 parser.add_argument('--clone', help='clone the website locally', dest='clone',
@@ -387,9 +387,9 @@ def jscanner(url):
     os.mkdir(output_dir)  # create a new directory
 
 datasets = [files, intel, robots, custom, failed, internal, scripts,
-            external, fuzzable, endpoints, keys]
+            external, fuzzable, endpoints, keys, metadata]
 dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal',
-                 'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
+                 'scripts', 'external', 'fuzzable', 'endpoints', 'keys', 'metadata']
 writer(datasets, dataset_names, output_dir)
 
 # Printing out results
@@ -408,7 +408,7 @@ def jscanner(url):
     'custom': list(custom), 'failed': list(failed),
     'internal': list(internal), 'scripts': list(scripts),
     'external': list(external), 'fuzzable': list(fuzzable),
     'endpoints': list(endpoints),
-    'keys': list(keys)
+    'keys': list(keys), 'metadata': list(metadata)
 }
 if args.dns:

From d879b7f7bb90990f544f7b930a3a3e0ceb3da224 Mon Sep 17 00:00:00 2001
From: Mariusz
Date: Mon, 26 Dec 2022 17:09:49 +0100
Subject: [PATCH 3/3] updated README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index bf1de88..25d6a15 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ Photon can extract the following data while crawling:
 - JavaScript files & Endpoints present in them
 - Strings matching custom regex pattern
 - Subdomains & DNS related data
+- Metadata from sites (JSON-LD, Microdata & OpenGraph)
 
 The extracted information is saved in an organized manner or can be [exported as json](https://github.com/s0md3v/Photon/wiki/Usage#export-formatted-result).
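
A minimal sketch of how the new core/metadata.py helpers compose outside the crawler, for anyone testing this series standalone. It assumes extruct and requests are installed; the URL and the 'Organization' type value are placeholders, not part of the patch:

    import requests

    from core.metadata import (extract_metadata, get_dictionary_by_key,
                               get_dictionary_by_key_value)

    # Fetch a page and extract its structured metadata (placeholder URL)
    url = 'https://example.com/'
    text = requests.get(url).text
    meta = extract_metadata(text, url)
    # meta looks like {'json-ld': [...], 'microdata': [...], 'opengraph': [...]}

    # First entry whose '@type' is 'Organization' (a common JSON-LD type),
    # or None if the page has no such entry
    org = get_dictionary_by_key_value(meta, '@type', 'Organization')

    # First entry carrying an 'og:title' key (OpenGraph, uniform=True keys),
    # or None
    titled = get_dictionary_by_key(meta, 'og:title')

    print(org)
    print(titled)

With the series applied, the same extraction should run during a crawl via the new switch, e.g. `python photon.py -u https://example.com -m`, with results written to metadata.txt alongside the other datasets or included in the JSON export.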