Pipe input and concurrent futures #13
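
This pull request, as reflected in the diff below, lets uddup read URLs piped through stdin in addition to the existing -u/--urls file argument, and processes the URL list in batches of 100 on a concurrent.futures.ThreadPoolExecutor, merging each batch's deduplicated set into the shared result set. When input arrives through a pipe (detected with sys.stdin.isatty()), silent mode is implied and the output and path-filter options stay unset, so a typical invocation would look something like cat urls.txt | python uddup/main.py.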

Open · wants to merge 1 commit into base: master
236 changes: 99 additions & 137 deletions uddup/main.py
@@ -5,13 +5,14 @@
import os
import re
from urllib.parse import urlparse
import concurrent.futures
import threading

# Check if we are running this on windows platform
is_windows = sys.platform.startswith('win')

# Console Colors
if is_windows:
# Windows deserves coloring too :D
G = '\033[92m' # green
Y = '\033[93m' # yellow
W = '\033[0m' # white
@@ -28,7 +29,7 @@


def banner():
print("""%s
print(r"""%s
_ _ ____ _
| | | | _ \ __| |_ _ _ __
| | | | | | |/ _` | | | | '_ \
@@ -41,95 +42,40 @@ def banner():


def file_arg(path):
# from os.path import exists
if not os.path.isfile(path):
raise ValueError # or TypeError, or `argparse.ArgumentTypeError
raise ValueError
return path


def get_ignored_suffixes():
return (
'css',
'js',
'gif',
'jpg',
'png',
'jpeg',
'svg',
'xml',
'txt',
'json',
'ico',
'webp',
'otf',
'ttf',
'woff',
'woff2',
'eot',
'swf',
'zip',
'pdf',
'doc',
'ppt',
'docx',
'xls',
'xlsx',
'ogg',
'mp4',
'mp3',
'mov'
'css', 'js', 'gif', 'jpg', 'png', 'jpeg', 'svg', 'xml', 'txt', 'json',
'ico', 'webp', 'otf', 'ttf', 'woff', 'woff2', 'eot', 'swf', 'zip',
'pdf', 'doc', 'ppt', 'docx', 'xls', 'xlsx', 'ogg', 'mp4', 'mp3', 'mov'
)


def get_web_suffixes():
return (
'htm',
'html',
'xhtml',
'shtml',
'jhtml',
'cfm',
'jsp',
'jspx',
'wss',
'action',
'php',
'php4',
'php5',
'py',
'rb',
'pl',
'do',
'xml',
'rss',
'cgi',
'axd',
'asx',
'asmx',
'ashx',
'asp',
'aspx',
'dll'
'htm', 'html', 'xhtml', 'shtml', 'jhtml', 'cfm', 'jsp', 'jspx',
'wss', 'action', 'php', 'php4', 'php5', 'py', 'rb', 'pl', 'do',
'xml', 'rss', 'cgi', 'axd', 'asx', 'asmx', 'ashx', 'asp', 'aspx', 'dll'
)


def get_existing_pattern_urls(purl, uurls):
results = []

url_path = get_url_path(purl)
path_parts = url_path.split('/')

# If there is only one path, return empty list.
if len(path_parts) == 1:
return results

url_pattern = '/'.join(path_parts[:-1])

url_schema = purl.scheme
url_hostname = purl.hostname

for uurl in uurls:
# Skip different hostname and schemes (they can't be a match).
if uurl.scheme != url_schema or uurl.hostname != url_hostname:
continue

@@ -145,7 +91,6 @@ def get_query_params_keys(parsed_url_query):
qparams = parsed_url_query.split('&')
for q in qparams:
keys.append(q.split('=')[0])

return keys


@@ -156,7 +101,6 @@ def is_all_params_exists(old_pattern, new_pattern):
for k in old_params_keys:
if k not in new_params_keys:
return False

return True


@@ -170,68 +114,76 @@ def get_url_path(purl):
return purl.path.strip('/')


def main(urls_file, output, silent, filter_path):
def process_url_batch(urls, web_suffixes, ignored_suffixes, filter_path):
thread_local_urls = set()

for url in urls:
url = url.rstrip()
if not url:
continue

parsed_url = urlparse(url)
url_path = get_url_path(parsed_url)

if not url_path:
thread_local_urls.add(parsed_url)
continue

if url_path.endswith(ignored_suffixes):
continue

if filter_path and re.search(filter_path, url_path):
continue

if url_path.endswith(web_suffixes):
thread_local_urls.add(parsed_url)
continue

existing_pattern_urls = get_existing_pattern_urls(parsed_url, thread_local_urls)
if not existing_pattern_urls:
thread_local_urls.add(parsed_url)
elif parsed_url.query:
for u in existing_pattern_urls:
if not u.query:
thread_local_urls.remove(u)
thread_local_urls.add(parsed_url)
continue

if is_all_params_exists(u, parsed_url):
if has_more_params(u, parsed_url):
thread_local_urls.remove(u)
thread_local_urls.add(parsed_url)
continue
else:
thread_local_urls.add(parsed_url)
continue

return thread_local_urls


def main(urls, output, silent, filter_path):
unique_urls = set()
lock = threading.Lock()

# Every tool needs a banner.
if not silent:
banner()

web_suffixes = get_web_suffixes()
ignored_suffixes = get_ignored_suffixes()
# Iterate over the given domains
with open(urls_file, 'r', encoding="utf-8") as f:
for url in f:
url = url.rstrip()
if not url:
continue

parsed_url = urlparse(url)

# @todo Reconsider the strip, since it can remove some interesting urls
url_path = get_url_path(parsed_url)

# If the URL doesn't have a path, just add it as is.
# @todo Some dups can still occur, handle it
if not url_path:
unique_urls.add(parsed_url)
continue

# Do not add paths to common files.
if url_path.endswith(ignored_suffixes):
continue

# Filter paths by custom Regex if set.
if filter_path and re.search(filter_path, url_path):
continue

# Add as-is paths that points to a specific web extension (e.g. html).
if url_path.endswith(web_suffixes):
unique_urls.add(parsed_url)
continue

# Do the more complicated ddup work.
# Get existing URL patterns from our unique patterns.
existing_pattern_urls = get_existing_pattern_urls(parsed_url, unique_urls)
if not existing_pattern_urls:
unique_urls.add(parsed_url)
elif parsed_url.query:
for u in existing_pattern_urls:
# Favor URL patterns with params over those without params.
if not u.query:
unique_urls.remove(u)
unique_urls.add(parsed_url)
continue

# Check if it has query params that are extra to the unique URL pattern.
if is_all_params_exists(u, parsed_url):
if has_more_params(u, parsed_url):
unique_urls.remove(u)
unique_urls.add(parsed_url)
continue
else:
unique_urls.add(parsed_url)
continue

with concurrent.futures.ThreadPoolExecutor() as executor:
# Process URLs in batches
batch_size = 100 # Tune this based on performance and task size
futures = [
executor.submit(process_url_batch, urls[i:i + batch_size], web_suffixes, ignored_suffixes, filter_path)
for i in range(0, len(urls), batch_size)
]

for future in concurrent.futures.as_completed(futures):
thread_local_urls = future.result()
# Lock only when updating the global unique_urls set
with lock:
unique_urls.update(thread_local_urls)

print_results(unique_urls, output)
return unique_urls
@@ -240,14 +192,11 @@ def main(urls_file, output, silent, filter_path):
def print_results(uurls, output):
if output:
try:
f = open(output, "w")

for url in sorted(uurls):
u = url.geturl()
f.write(u + "\n")
print(u)

f.close()
with open(output, "w") as f:
for url in sorted(uurls):
u = url.geturl()
f.write(u + "\n")
print(u)
except:
print('[X] Failed to save the output to a file.')
else:
@@ -257,16 +206,29 @@ def print_results(uurls, output):


def interactive():
parser = argparse.ArgumentParser(description='Remove URL pattern duplications..')
parser = argparse.ArgumentParser(description='Remove URL pattern duplications.')

# Detect if input is from pipe
if not sys.stdin.isatty():
urls = sys.stdin.read().splitlines()
silent = True
output = None
filter_path = None
else:
parser.add_argument('-u', '--urls', help='File with a list of URLs.', type=file_arg, dest='urls_file', required=True)
parser.add_argument('-o', '--output', help='Save results to a file.', dest='output')
parser.add_argument('-s', '--silent', help='Print only the result URLs.', action='store_true', dest='silent')
parser.add_argument('-fp', '--filter-path', help='Filter paths by a given Regex.', dest='filter_path')
args = parser.parse_args()

silent = args.silent
output = args.output
filter_path = args.filter_path

# Add the arguments
parser.add_argument('-u', '--urls', help='File with a list of urls.', type=file_arg, dest='urls_file', required=True)
parser.add_argument('-o', '--output', help='Save results to a file.', dest='output')
parser.add_argument('-s', '--silent', help='Print only the result URLs.', action='store_true', dest='silent')
parser.add_argument('-fp', '--filter-path', help='Filter paths by a given Regex.', dest='filter_path')
args = parser.parse_args()
with open(args.urls_file, 'r', encoding="utf-8") as f:
urls = f.readlines()

main(args.urls_file, args.output, args.silent, args.filter_path)
main(urls, output, silent, filter_path)


if __name__ == "__main__":
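
For anyone who wants to exercise the concurrency pattern in isolation, below is a minimal, self-contained sketch of the batch-and-merge approach used in this change. process_batch is a hypothetical stand-in for process_url_batch (it only deduplicates its slice, with no URL parsing or pattern logic), and the batch size of 100 mirrors the value in the diff.

import concurrent.futures


def process_batch(batch):
    # Stand-in for process_url_batch: deduplicate a single slice of the input.
    return set(batch)


def dedupe_concurrently(urls, batch_size=100):
    unique_urls = set()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # One future per slice of batch_size URLs, same slicing as the diff.
        futures = [
            executor.submit(process_batch, urls[i:i + batch_size])
            for i in range(0, len(urls), batch_size)
        ]
        # Merge each batch's result into the shared set as futures complete.
        for future in concurrent.futures.as_completed(futures):
            unique_urls.update(future.result())
    return unique_urls


if __name__ == "__main__":
    sample = ["https://example.com/page/%d" % (i % 30) for i in range(250)]
    print(len(dedupe_concurrently(sample)))  # prints 30: 250 inputs, 30 unique URLs

The sketch skips the threading.Lock used in the diff because set.update here runs only on the thread iterating as_completed; the lock in the diff guards that same merge more defensively.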