Pipe input and concurrent futures #13
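
This pull request, as reflected in the diff below, lets uddup read URLs piped through stdin in addition to the existing -u/--urls file argument, and processes the URL list in batches of 100 on a concurrent.futures.ThreadPoolExecutor, merging each batch's deduplicated set into the shared result set. When input arrives through a pipe (detected with sys.stdin.isatty()), silent mode is implied and the output and path-filter options stay unset, so a typical invocation would look something like cat urls.txt | python uddup/main.py.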

Open · wants to merge 1 commit into base: master
236 changes: 99 additions & 137 deletions uddup/main.py
@@ -5,13 +5,14 @@
import os
import re
from urllib.parse import urlparse
import concurrent.futures
import threading

# Check if we are running this on windows platform
is_windows = sys.platform.startswith('win')

# Console Colors
if is_windows:
# Windows deserves coloring too :D
G = '\033[92m' # green
Y = '\033[93m' # yellow
W = '\033[0m' # white
@@ -28,7 +29,7 @@


def banner():
print("""%s
print(r"""%s
_ _ ____ _
| | | | _ \ __| |_ _ _ __
| | | | | | |/ _` | | | | '_ \
@@ -41,95 +42,40 @@ def banner():


def file_arg(path):
# from os.path import exists
if not os.path.isfile(path):
raise ValueError # or TypeError, or `argparse.ArgumentTypeError
raise ValueError
return path


def get_ignored_suffixes():
return (
'css',
'js',
'gif',
'jpg',
'png',
'jpeg',
'svg',
'xml',
'txt',
'json',
'ico',
'webp',
'otf',
'ttf',
'woff',
'woff2',
'eot',
'swf',
'zip',
'pdf',
'doc',
'ppt',
'docx',
'xls',
'xlsx',
'ogg',
'mp4',
'mp3',
'mov'
'css', 'js', 'gif', 'jpg', 'png', 'jpeg', 'svg', 'xml', 'txt', 'json',
'ico', 'webp', 'otf', 'ttf', 'woff', 'woff2', 'eot', 'swf', 'zip',
'pdf', 'doc', 'ppt', 'docx', 'xls', 'xlsx', 'ogg', 'mp4', 'mp3', 'mov'
)


def get_web_suffixes():
return (
'htm',
'html',
'xhtml',
'shtml',
'jhtml',
'cfm',
'jsp',
'jspx',
'wss',
'action',
'php',
'php4',
'php5',
'py',
'rb',
'pl',
'do',
'xml',
'rss',
'cgi',
'axd',
'asx',
'asmx',
'ashx',
'asp',
'aspx',
'dll'
'htm', 'html', 'xhtml', 'shtml', 'jhtml', 'cfm', 'jsp', 'jspx',
'wss', 'action', 'php', 'php4', 'php5', 'py', 'rb', 'pl', 'do',
'xml', 'rss', 'cgi', 'axd', 'asx', 'asmx', 'ashx', 'asp', 'aspx', 'dll'
)


def get_existing_pattern_urls(purl, uurls):
results = []

url_path = get_url_path(purl)
path_parts = url_path.split('/')

# If there is only one path, return empty list.
if len(path_parts) == 1:
return results

url_pattern = '/'.join(path_parts[:-1])

url_schema = purl.scheme
url_hostname = purl.hostname

for uurl in uurls:
# Skip different hostname and schemes (they can't be a match).
if uurl.scheme != url_schema or uurl.hostname != url_hostname:
continue

@@ -145,7 +91,6 @@ def get_query_params_keys(parsed_url_query):
qparams = parsed_url_query.split('&')
for q in qparams:
keys.append(q.split('=')[0])

return keys


@@ -156,7 +101,6 @@ def is_all_params_exists(old_pattern, new_pattern):
for k in old_params_keys:
if k not in new_params_keys:
return False

return True


@@ -170,68 +114,76 @@ def get_url_path(purl):
return purl.path.strip('/')


def main(urls_file, output, silent, filter_path):
def process_url_batch(urls, web_suffixes, ignored_suffixes, filter_path):
thread_local_urls = set()

for url in urls:
url = url.rstrip()
if not url:
continue

parsed_url = urlparse(url)
url_path = get_url_path(parsed_url)

if not url_path:
thread_local_urls.add(parsed_url)
continue

if url_path.endswith(ignored_suffixes):
continue

if filter_path and re.search(filter_path, url_path):
continue

if url_path.endswith(web_suffixes):
thread_local_urls.add(parsed_url)
continue

existing_pattern_urls = get_existing_pattern_urls(parsed_url, thread_local_urls)
if not existing_pattern_urls:
thread_local_urls.add(parsed_url)
elif parsed_url.query:
for u in existing_pattern_urls:
if not u.query:
thread_local_urls.remove(u)
thread_local_urls.add(parsed_url)
continue

if is_all_params_exists(u, parsed_url):
if has_more_params(u, parsed_url):
thread_local_urls.remove(u)
thread_local_urls.add(parsed_url)
continue
else:
thread_local_urls.add(parsed_url)
continue

return thread_local_urls


def main(urls, output, silent, filter_path):
unique_urls = set()
lock = threading.Lock()

# Every tool needs a banner.
if not silent:
banner()

web_suffixes = get_web_suffixes()
ignored_suffixes = get_ignored_suffixes()
# Iterate over the given domains
with open(urls_file, 'r', encoding="utf-8") as f:
for url in f:
url = url.rstrip()
if not url:
continue

parsed_url = urlparse(url)

# @todo Reconsider the strip, since it can remove some interesting urls
url_path = get_url_path(parsed_url)

# If the URL doesn't have a path, just add it as is.
# @todo Some dups can still occur, handle it
if not url_path:
unique_urls.add(parsed_url)
continue

# Do not add paths to common files.
if url_path.endswith(ignored_suffixes):
continue

# Filter paths by custom Regex if set.
if filter_path and re.search(filter_path, url_path):
continue

# Add as-is paths that points to a specific web extension (e.g. html).
if url_path.endswith(web_suffixes):
unique_urls.add(parsed_url)
continue

# Do the more complicated ddup work.
# Get existing URL patterns from our unique patterns.
existing_pattern_urls = get_existing_pattern_urls(parsed_url, unique_urls)
if not existing_pattern_urls:
unique_urls.add(parsed_url)
elif parsed_url.query:
for u in existing_pattern_urls:
# Favor URL patterns with params over those without params.
if not u.query:
unique_urls.remove(u)
unique_urls.add(parsed_url)
continue

# Check if it has query params that are extra to the unique URL pattern.
if is_all_params_exists(u, parsed_url):
if has_more_params(u, parsed_url):
unique_urls.remove(u)
unique_urls.add(parsed_url)
continue
else:
unique_urls.add(parsed_url)
continue

with concurrent.futures.ThreadPoolExecutor() as executor:
# Process URLs in batches
batch_size = 100 # Tune this based on performance and task size
futures = [
executor.submit(process_url_batch, urls[i:i + batch_size], web_suffixes, ignored_suffixes, filter_path)
for i in range(0, len(urls), batch_size)
]

for future in concurrent.futures.as_completed(futures):
thread_local_urls = future.result()
# Lock only when updating the global unique_urls set
with lock:
unique_urls.update(thread_local_urls)

print_results(unique_urls, output)
return unique_urls
@@ -240,14 +192,11 @@ def main(urls_file, output, silent, filter_path):
def print_results(uurls, output):
if output:
try:
f = open(output, "w")

for url in sorted(uurls):
u = url.geturl()
f.write(u + "\n")
print(u)

f.close()
with open(output, "w") as f:
for url in sorted(uurls):
u = url.geturl()
f.write(u + "\n")
print(u)
except:
print('[X] Failed to save the output to a file.')
else:
@@ -257,16 +206,29 @@ def print_results(uurls, output):


def interactive():
parser = argparse.ArgumentParser(description='Remove URL pattern duplications..')
parser = argparse.ArgumentParser(description='Remove URL pattern duplications.')

# Detect if input is from pipe
if not sys.stdin.isatty():
urls = sys.stdin.read().splitlines()
silent = True
output = None
filter_path = None
else:
parser.add_argument('-u', '--urls', help='File with a list of URLs.', type=file_arg, dest='urls_file', required=True)
parser.add_argument('-o', '--output', help='Save results to a file.', dest='output')
parser.add_argument('-s', '--silent', help='Print only the result URLs.', action='store_true', dest='silent')
parser.add_argument('-fp', '--filter-path', help='Filter paths by a given Regex.', dest='filter_path')
args = parser.parse_args()

silent = args.silent
output = args.output
filter_path = args.filter_path

# Add the arguments
parser.add_argument('-u', '--urls', help='File with a list of urls.', type=file_arg, dest='urls_file', required=True)
parser.add_argument('-o', '--output', help='Save results to a file.', dest='output')
parser.add_argument('-s', '--silent', help='Print only the result URLs.', action='store_true', dest='silent')
parser.add_argument('-fp', '--filter-path', help='Filter paths by a given Regex.', dest='filter_path')
args = parser.parse_args()
with open(args.urls_file, 'r', encoding="utf-8") as f:
urls = f.readlines()

main(args.urls_file, args.output, args.silent, args.filter_path)
main(urls, output, silent, filter_path)


if __name__ == "__main__":
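
For anyone who wants to exercise the concurrency pattern in isolation, below is a minimal, self-contained sketch of the batch-and-merge approach used in this change. process_batch is a hypothetical stand-in for process_url_batch (it only deduplicates its slice, with no URL parsing or pattern logic), and the batch size of 100 mirrors the value in the diff.

import concurrent.futures


def process_batch(batch):
    # Stand-in for process_url_batch: deduplicate a single slice of the input.
    return set(batch)


def dedupe_concurrently(urls, batch_size=100):
    unique_urls = set()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One future per slice of batch_size URLs, same slicing as the diff.
        futures = [
            executor.submit(process_batch, urls[i:i + batch_size])
            for i in range(0, len(urls), batch_size)
        ]
        # Merge each batch's result into the shared set as futures complete.
        for future in concurrent.futures.as_completed(futures):
            unique_urls.update(future.result())
    return unique_urls


if __name__ == "__main__":
    sample = ["https://example.com/page/%d" % (i % 30) for i in range(250)]
    print(len(dedupe_concurrently(sample)))  # prints 30: 250 inputs, 30 unique URLs

The sketch skips the threading.Lock used in the diff because set.update here runs only on the thread iterating as_completed; the lock in the diff guards that same merge more defensively.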