From aee781567b567e2af146df802146494b45f4b5e2 Mon Sep 17 00:00:00 2001
From: Mitch Haddadi <12067255+dfirsec@users.noreply.github.com>
Date: Sat, 20 May 2023 08:10:32 -0400
Subject: [PATCH] Optimized & Refactored

---
 ioc_extractor.py      | 81 +++++++++++++++++--------------------------
 utils/logger.py       | 13 ++++---
 utils/regex_helper.py | 24 +++++--------
 3 files changed, 46 insertions(+), 72 deletions(-)

diff --git a/ioc_extractor.py b/ioc_extractor.py
index d469f97..8c95726 100644
--- a/ioc_extractor.py
+++ b/ioc_extractor.py
@@ -4,7 +4,6 @@
 import json
 import sys
 from pathlib import Path
-from typing import Dict, Tuple
 
 import chardet
 from rich.pretty import pprint
@@ -16,18 +15,14 @@
 
 
 def get_files(source: str, extensions: tuple) -> list:
-    """
-    Returns a list of all files in a given source directory with specific extensions.
+    """Returns a list of all files in a given directory with specific extensions.
 
     Args:
-        source (str):
-          A string representing the directory path where the files are located.
-        extensions (tuple):
-          A tuple of file extensions that the function should look for in the
-            specified source directory.
+        source (str): Path where the files are located.
+        extensions (tuple): Extensions to look for.
 
     Returns:
-        A list of file paths that match the given extensions in the specified source directory.
+        A list of file paths that match the extensions in the specified source directory.
     """
     all_files = []
     for ext in extensions:
@@ -35,18 +30,14 @@ def get_files(source: str, extensions: tuple) -> list:
     return all_files
 
 
-def process_line(regex: RegexHelper, line: str, patterns: dict) -> Dict[str, list]:
-    """
-    Processes a given line of text using `regex` and returns a dictionary of patterns found in the line.
+def process_line(regex: RegexHelper, line: str, patterns: dict) -> dict[str, list]:
+    """Processes a given line of text using `regex` and returns a dictionary of patterns found in the line.
 
     Args:
-        regex (RegexHelper):
-          A helper class that contains regular expression patterns for different types of data
+        regex (RegexHelper): Helper providing `regex` patterns for different types of data
             (e.g. IP addresses, email addresses, etc.)
-        line (str):
-          The input string that contains the text to be processed for patterns
-        patterns (dict):
-          A dictionary that stores the patterns found in the input line.
+        line (str): The text to be processed for patterns.
+        patterns (dict): Dictionary that stores the patterns found.
 
     Returns:
         A dictionary containing the patterns found in the input line, organized by their type (e.g.
@@ -72,19 +63,13 @@ def process_line(regex: RegexHelper, line: str, patterns: dict) -> Dict[str, lis
     return patterns
 
 
-def process_file(regex: RegexHelper, filename: str, patterns: dict) -> Tuple[str, dict]:
-    """
-    Reads a file, detects its encoding, and processes each line using `process_line` function.
+def process_file(regex: RegexHelper, filename: str, patterns: dict) -> tuple[str, dict]:
+    """Reads a file, detects its encoding, and processes each line using `process_line` function.
 
     Args:
-        regex (RegexHelper):
-          An instance of the RegexHelper class, which provides methods for working
-            with `regex` patterns.
-        filename (str):
-          The name of the file to be processed,
-        patterns (dict):
-          A dictionary containing `regex` patterns as keys and their corresponding
-            counts as values.
+        regex (RegexHelper): Methods for working with the `regex` patterns.
+        filename (str): The name of the file to be processed.
+        patterns (dict): A dictionary containing `regex` patterns.
 
     Returns:
         A tuple containing the filename and a dictionary of patterns.
@@ -107,44 +92,40 @@ def process_file(regex: RegexHelper, filename: str, patterns: dict) -> Tuple[str
 
 
 def save_to_file(data: dict, output_file: str) -> None:
-    """
-    Ssaves a dictionary object to a file in JSON format.
+    """Saves a dictionary object to a file in JSON format.
 
     Args:
-        data (dict):
-          A dictionary containing the data that needs to be saved to a file.
-        output_file (str):
-          Represents the file path and name where the data will be saved.
+        data (dict): Data that needs to be saved to a file.
+        output_file (str): Path and name where the data will be saved.
     """
     with open(output_file, "w", encoding="utf-8") as fileobj:
         json.dump(data, fileobj, indent=4)
 
 
-def main(source: str, extensions: tuple) -> Dict[str, Dict]:
-    """
-    Main function that processes files in a given directory and returns a dictionary of patterns found.
+def main(source: str, extensions: tuple) -> dict[str, dict]:
+    """Main function that processes the files.
 
     Args:
-        source (str):
-          The directory path where the files to be processed are located.
-        extensions (tuple):
-            A tuple of file extensions that the function should look for in the
-                specified source directory.
+        source (str): Directory path where the files are located.
+        extensions (tuple): File extensions to look for.
 
     Returns:
-        A dictionary where the keys are the file paths of the processed files (as strings) and the values
-        are dictionaries containing the patterns found in each file. If no files are found or no patterns
-        are found in the files, an empty dictionary is returned.
+        A dictionary mapping each processed file path (as a string) to the
+        patterns found in that file; empty if no files or patterns are found.
     """
     regex = RegexHelper()
     files = get_files(source, extensions)
 
     if files:
         all_patterns = {}
-        for filename in files:
-            file_path, patterns = process_file(regex, filename, {})
-            if patterns:
-                all_patterns[str(file_path)] = patterns  # Convert WindowsPath to string
+        try:
+            for filename in files:
+                print(f"Processing: {filename}")
+                file_path, patterns = process_file(regex, filename, {})
+                if patterns:
+                    all_patterns[str(file_path)] = patterns  # Convert WindowsPath to string
+        except KeyboardInterrupt:
+            sys.exit("\n\nUser aborted.")
         return all_patterns
 
     return {}
diff --git a/utils/logger.py b/utils/logger.py
index c6a0069..30dfa6a 100644
--- a/utils/logger.py
+++ b/utils/logger.py
@@ -7,20 +7,20 @@
 class LazyFileHandler(logging.FileHandler):
     """A file handler that is initialized only when an error is logged."""
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self: "LazyFileHandler", *args, **kwargs) -> None:
         """Initialization that stores the arguments and keyword arguments passed to it."""
         self._args = args
         self._kwargs = kwargs
         self._initialized = False
 
-    def _initialize(self):
+    def _initialize(self: "LazyFileHandler") -> None:
+        """Initializes the file handler if it has not been initialized yet."""
         if not self._initialized:
-            super().__init__(*self._args, **self._kwargs)  # noqa: WPS613
+            super().__init__(*self._args, **self._kwargs)
             self._initialized = True
 
     def emit(self, record: logging.LogRecord) -> None:
-        """
-        Emits a log record if its level is equal to or greater than ERROR.
+        """Emits a log record if its level is equal to or greater than ERROR.
 
         Args:
             record (logging.LogRecord):
@@ -32,8 +32,7 @@ def emit(self, record: logging.LogRecord) -> None:
 
 
 def logger() -> logging.Logger:
-    """
-    Sets up a logger with a file handler for errors, and returns the logger object.
+    """Sets up a logger with a file handler for errors, and returns the logger object.
 
     Returns:
         A logger object with a configured logging level, format, and handlers, including a file
diff --git a/utils/regex_helper.py b/utils/regex_helper.py
index 71da09b..be994ed 100644
--- a/utils/regex_helper.py
+++ b/utils/regex_helper.py
@@ -1,16 +1,14 @@
 """Desc: Helper class for regular expressions."""
 import re
 from pathlib import Path
-from typing import List
 
 import requests
 from utils.console import console
 from utils.logger import logger
 
 
-def get_valid_tlds() -> List[str]:
-    """
-    Uses a list of top-level domains (TLDs) and returns the list of TLDs.
+def get_valid_tlds() -> list[str]:
+    """Uses a list of top-level domains (TLDs) and returns the list of TLDs.
 
     Returns:
         A list of valid top-level domains (TLDs) either from a local file named "tlds.txt" or by
@@ -23,9 +21,8 @@ def get_valid_tlds() -> List[str]:
         return download_tlds()
 
 
-def download_tlds() -> List[str]:
-    """
-    Downloads a list of valid top-level domains (TLDs) and saves them to a file.
+def download_tlds() -> list[str]:
+    """Downloads a list of valid top-level domains (TLDs) and saves them to a file.
 
     Returns:
         A list of valid top-level domains (TLDs) that have been downloaded and
@@ -47,7 +44,7 @@ def download_tlds() -> List[str]:
     return valid_tlds
 
 
-class RegexHelper(object):
+class RegexHelper:
     """Helper class for regular expressions."""
 
     def __init__(self):
@@ -55,8 +52,7 @@ def __init__(self):
         self.tlds = get_valid_tlds()
 
     def regex(self, retype: str) -> re.Pattern:
-        """
-        Returns a compiled regular expression pattern based on the input type.
+        """Returns a compiled regular expression pattern based on the input type.
 
         Args:
             retype (str):
@@ -89,9 +85,8 @@ def regex(self, retype: str) -> re.Pattern:
         }
         return re.compile(pattern[retype], re.IGNORECASE)
 
-    def regex_iter(self, regex: re.Pattern, text: str) -> List[str]:
-        """
-        Returns a list of all non-overlapping matches of the regular expression in the string.
+    def regex_iter(self, regex: re.Pattern, text: str) -> list[str]:
+        """Returns a list of all non-overlapping matches of the regular expression in the string.
 
         Args:
             regex (re.Pattern):
@@ -105,8 +100,7 @@ def regex_iter(self, regex: re.Pattern, text: str) -> List[str]:
         return [re.group() for re in re.finditer(regex, text.lower())]
 
     def regex_patterns(self, text: str) -> dict:
-        """
-        Returns a dictionary containing regex patterns for domain, email, IPV4, MD5, SHA1, SHA256, and URL.
+        """Returns a dictionary containing regex patterns for domain, email, IPV4, MD5, SHA1, SHA256, and URL.
 
         Args:
             text (str):