diff --git a/ioc_extractor.py b/ioc_extractor.py index d469f97..8c95726 100644 --- a/ioc_extractor.py +++ b/ioc_extractor.py @@ -4,7 +4,6 @@ import json import sys from pathlib import Path -from typing import Dict, Tuple import chardet from rich.pretty import pprint @@ -16,18 +15,14 @@ def get_files(source: str, extensions: tuple) -> list: - """ - Returns a list of all files in a given source directory with specific extensions. + """Returns a list of all files in a given directory with specific extensions. Args: - source (str): - A string representing the directory path where the files are located. - extensions (tuple): - A tuple of file extensions that the function should look for in the - specified source directory. + source (str): Path where the files are located. + extensions (tuple): Extensions to look for. Returns: - A list of file paths that match the given extensions in the specified source directory. + A list of file paths that match the extensions in the specified source directory. """ all_files = [] for ext in extensions: @@ -35,18 +30,14 @@ def get_files(source: str, extensions: tuple) -> list: return all_files -def process_line(regex: RegexHelper, line: str, patterns: dict) -> Dict[str, list]: - """ - Processes a given line of text using `regex` and returns a dictionary of patterns found in the line. +def process_line(regex: RegexHelper, line: str, patterns: dict) -> dict[str, list]: + """Processes a given line of text using `regex` and returns a dictionary of patterns found in the line. Args: - regex (RegexHelper): - A helper class that contains regular expression patterns for different types of data + regex (RegexHelper): Methods for working with the `regex` patterns. (e.g. IP addresses, email addresses, etc.) - line (str): - The input string that contains the text to be processed for patterns - patterns (dict): - A dictionary that stores the patterns found in the input line. + line (str): The text to be processed for patterns + patterns (dict): Dictionary that stores the patterns found. Returns: A dictionary containing the patterns found in the input line, organized by their type (e.g. @@ -72,19 +63,13 @@ def process_line(regex: RegexHelper, line: str, patterns: dict) -> Dict[str, lis return patterns -def process_file(regex: RegexHelper, filename: str, patterns: dict) -> Tuple[str, dict]: - """ - Reads a file, detects its encoding, and processes each line using `process_line` function. +def process_file(regex: RegexHelper, filename: str, patterns: dict) -> tuple[str, dict]: + """Reads a file, detects its encoding, and processes each line using `process_line` function. Args: - regex (RegexHelper): - An instance of the RegexHelper class, which provides methods for working - with `regex` patterns. - filename (str): - The name of the file to be processed, - patterns (dict): - A dictionary containing `regex` patterns as keys and their corresponding - counts as values. + regex (RegexHelper): Methods for working with the `regex` patterns. + filename (str): The name of the file to be processed + patterns (dict): A dictionary containing `regex` patterns. Returns: A tuple containing the filename and a dictionary of patterns. @@ -107,44 +92,40 @@ def process_file(regex: RegexHelper, filename: str, patterns: dict) -> Tuple[str def save_to_file(data: dict, output_file: str) -> None: - """ - Ssaves a dictionary object to a file in JSON format. + """Saves a dictionary object to a file in JSON format. Args: - data (dict): - A dictionary containing the data that needs to be saved to a file. - output_file (str): - Represents the file path and name where the data will be saved. + data (dict): Data that needs to be saved to a file. + output_file (str): Path and name where the data will be saved. """ with open(output_file, "w", encoding="utf-8") as fileobj: json.dump(data, fileobj, indent=4) -def main(source: str, extensions: tuple) -> Dict[str, Dict]: - """ - Main function that processes files in a given directory and returns a dictionary of patterns found. +def main(source: str, extensions: tuple) -> dict[str, dict]: + """Main function that processes the files. Args: - source (str): - The directory path where the files to be processed are located. - extensions (tuple): - A tuple of file extensions that the function should look for in the - specified source directory. + source (str): Directory path where the files are located. + extensions (tuple): File extensions to look for. Returns: - A dictionary where the keys are the file paths of the processed files (as strings) and the values - are dictionaries containing the patterns found in each file. If no files are found or no patterns - are found in the files, an empty dictionary is returned. + A dictionary containing the patterns found in the input files, + organized by their type (e.g. "IPV4", "EMAIL", etc.). """ regex = RegexHelper() files = get_files(source, extensions) if files: all_patterns = {} - for filename in files: - file_path, patterns = process_file(regex, filename, {}) - if patterns: - all_patterns[str(file_path)] = patterns # Convert WindowsPath to string + try: + for filename in files: + print(f"Processing: {filename}") + file_path, patterns = process_file(regex, filename, {}) + if patterns: + all_patterns[str(file_path)] = patterns # Convert WindowsPath to string + except KeyboardInterrupt: + sys.exit("\n\nUser aborted.") return all_patterns return {} diff --git a/utils/logger.py b/utils/logger.py index c6a0069..30dfa6a 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -7,20 +7,20 @@ class LazyFileHandler(logging.FileHandler): """A file handler that is initialized only when an error is logged.""" - def __init__(self, *args, **kwargs): + def __init__(self: "LazyFileHandler", *args, **kwargs) -> None: """Initialization that stores the arguments and keyword arguments passed to it.""" self._args = args self._kwargs = kwargs self._initialized = False - def _initialize(self): + def _initialize(self: "LazyFileHandler") -> None: + """Initializes the file handler if it has not been initialized yet.""" if not self._initialized: - super().__init__(*self._args, **self._kwargs) # noqa: WPS613 + super().__init__(*self._args, **self._kwargs) self._initialized = True def emit(self, record: logging.LogRecord) -> None: - """ - Emits a log record if its level is equal to or greater than ERROR. + """Emits a log record if its level is equal to or greater than ERROR. Args: record (logging.LogRecord): @@ -32,8 +32,7 @@ def emit(self, record: logging.LogRecord) -> None: def logger() -> logging.Logger: - """ - Sets up a logger with a file handler for errors, and returns the logger object. + """Sets up a logger with a file handler for errors, and returns the logger object. Returns: A logger object with a configured logging level, format, and handlers, including a file diff --git a/utils/regex_helper.py b/utils/regex_helper.py index 71da09b..be994ed 100644 --- a/utils/regex_helper.py +++ b/utils/regex_helper.py @@ -1,16 +1,14 @@ """Desc: Helper class for regular expressions.""" import re from pathlib import Path -from typing import List import requests from utils.console import console from utils.logger import logger -def get_valid_tlds() -> List[str]: - """ - Uses a list of top-level domains (TLDs) and returns the list of TLDs. +def get_valid_tlds() -> list[str]: + """Uses a list of top-level domains (TLDs) and returns the list of TLDs. Returns: A list of valid top-level domains (TLDs) either from a local file named "tlds.txt" or by @@ -23,9 +21,8 @@ def get_valid_tlds() -> List[str]: return download_tlds() -def download_tlds() -> List[str]: - """ - Downloads a list of valid top-level domains (TLDs) and saves them to a file. +def download_tlds() -> list[str]: + """Downloads a list of valid top-level domains (TLDs) and saves them to a file. Returns: A list of valid top-level domains (TLDs) that have been downloaded and @@ -47,7 +44,7 @@ def download_tlds() -> List[str]: return valid_tlds -class RegexHelper(object): +class RegexHelper: """Helper class for regular expressions.""" def __init__(self): @@ -55,8 +52,7 @@ def __init__(self): self.tlds = get_valid_tlds() def regex(self, retype: str) -> re.Pattern: - """ - Returns a compiled regular expression pattern based on the input type. + """Returns a compiled regular expression pattern based on the input type. Args: retype (str): @@ -89,9 +85,8 @@ def regex(self, retype: str) -> re.Pattern: } return re.compile(pattern[retype], re.IGNORECASE) - def regex_iter(self, regex: re.Pattern, text: str) -> List[str]: - """ - Returns a list of all non-overlapping matches of the regular expression in the string. + def regex_iter(self, regex: re.Pattern, text: str) -> list[str]: + """Returns a list of all non-overlapping matches of the regular expression in the string. Args: regex (re.Pattern): @@ -105,8 +100,7 @@ def regex_iter(self, regex: re.Pattern, text: str) -> List[str]: return [re.group() for re in re.finditer(regex, text.lower())] def regex_patterns(self, text: str) -> dict: - """ - Returns a dictionary containing regex patterns for domain, email, IPV4, MD5, SHA1, SHA256, and URL. + """Returns a dictionary containing regex patterns for domain, email, IPV4, MD5, SHA1, SHA256, and URL. Args: text (str):