Skip to content

Commit

Permalink
Optimized & Refactored
Browse files Browse the repository at this point in the history
  • Loading branch information
dfirsec committed May 20, 2023
1 parent f4714dd commit aee7815
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 72 deletions.
81 changes: 31 additions & 50 deletions ioc_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import json
import sys
from pathlib import Path
from typing import Dict, Tuple

import chardet
from rich.pretty import pprint
Expand All @@ -16,37 +15,29 @@


def get_files(source: str, extensions: tuple) -> list:
    """Return the files in a directory that match any of the given glob patterns.

    Args:
        source (str): Path of the directory to search.
        extensions (tuple): Patterns handed to ``Path.glob`` one by one
            (e.g. ``("*.txt", "*.log")``).

    Returns:
        A list of ``pathlib.Path`` objects, one per matching regular file, in
        first-seen order. The list is empty when nothing matches.
    """
    root = Path(source)
    matches = (path for pattern in extensions for path in root.glob(pattern))
    # glob() also yields directories whose name fits the pattern; keep files only.
    # dict.fromkeys drops duplicates from overlapping patterns while preserving
    # the order in which paths were first found.
    return list(dict.fromkeys(path for path in matches if path.is_file()))


def process_line(regex: RegexHelper, line: str, patterns: dict) -> Dict[str, list]:
"""
Processes a given line of text using `regex` and returns a dictionary of patterns found in the line.
def process_line(regex: RegexHelper, line: str, patterns: dict) -> dict[str, list]:
"""Processes a given line of text using `regex` and returns a dictionary of patterns found in the line.
Args:
regex (RegexHelper):
A helper class that contains regular expression patterns for different types of data
regex (RegexHelper): Methods for working with the `regex` patterns.
(e.g. IP addresses, email addresses, etc.)
line (str):
The input string that contains the text to be processed for patterns
patterns (dict):
A dictionary that stores the patterns found in the input line.
line (str): The text to be processed for patterns
patterns (dict): Dictionary that stores the patterns found.
Returns:
A dictionary containing the patterns found in the input line, organized by their type (e.g.
Expand All @@ -72,19 +63,13 @@ def process_line(regex: RegexHelper, line: str, patterns: dict) -> Dict[str, lis
return patterns


def process_file(regex: RegexHelper, filename: str, patterns: dict) -> Tuple[str, dict]:
"""
Reads a file, detects its encoding, and processes each line using `process_line` function.
def process_file(regex: RegexHelper, filename: str, patterns: dict) -> tuple[str, dict]:
"""Reads a file, detects its encoding, and processes each line using `process_line` function.
Args:
regex (RegexHelper):
An instance of the RegexHelper class, which provides methods for working
with `regex` patterns.
filename (str):
The name of the file to be processed,
patterns (dict):
A dictionary containing `regex` patterns as keys and their corresponding
counts as values.
regex (RegexHelper): Methods for working with the `regex` patterns.
filename (str): The name of the file to be processed
patterns (dict): A dictionary containing `regex` patterns.
Returns:
A tuple containing the filename and a dictionary of patterns.
Expand All @@ -107,44 +92,40 @@ def process_file(regex: RegexHelper, filename: str, patterns: dict) -> Tuple[str


def save_to_file(data: dict, output_file: str) -> None:
    """Save a dictionary to a file in JSON format.

    The file is opened for writing as UTF-8 (replacing any existing content)
    and the data is serialized with a 4-space indent.

    Args:
        data (dict): Data that needs to be saved to a file.
        output_file (str): Path and name where the data will be saved.
    """
    with open(output_file, "w", encoding="utf-8") as fileobj:
        json.dump(data, fileobj, indent=4)


def main(source: str, extensions: tuple) -> dict[str, dict]:
    """Process every matching file in a directory and collect the patterns found.

    Args:
        source (str): Directory path where the files are located.
        extensions (tuple): File extensions to look for.

    Returns:
        A dictionary keyed by file path (as a string) whose values are the
        pattern dictionaries returned by ``process_file`` for that file.
        Files whose pattern dictionary is empty are left out. An empty
        dictionary is returned when no files are found.

    Raises:
        SystemExit: If the user interrupts processing (Ctrl+C).
    """
    regex = RegexHelper()
    files = get_files(source, extensions)

    # Guard clause: nothing to process.
    if not files:
        return {}

    all_patterns = {}
    try:
        for filename in files:
            print(f"Processing: {filename}")
            # A fresh dict per file keeps results from leaking between files.
            file_path, patterns = process_file(regex, filename, {})
            if patterns:
                all_patterns[str(file_path)] = patterns  # Convert WindowsPath to string
    except KeyboardInterrupt:
        sys.exit("\n\nUser aborted.")
    return all_patterns
Expand Down
13 changes: 6 additions & 7 deletions utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,20 @@
class LazyFileHandler(logging.FileHandler):
"""A file handler that is initialized only when an error is logged."""

def __init__(self, *args, **kwargs):
def __init__(self: "LazyFileHandler", *args, **kwargs) -> None:
"""Initialization that stores the arguments and keyword arguments passed to it."""
self._args = args
self._kwargs = kwargs
self._initialized = False

def _initialize(self):
def _initialize(self: "LazyFileHandler") -> None:
"""Initializes the file handler if it has not been initialized yet."""
if not self._initialized:
super().__init__(*self._args, **self._kwargs) # noqa: WPS613
super().__init__(*self._args, **self._kwargs)
self._initialized = True

def emit(self, record: logging.LogRecord) -> None:
"""
Emits a log record if its level is equal to or greater than ERROR.
"""Emits a log record if its level is equal to or greater than ERROR.
Args:
record (logging.LogRecord):
Expand All @@ -32,8 +32,7 @@ def emit(self, record: logging.LogRecord) -> None:


def logger() -> logging.Logger:
"""
Sets up a logger with a file handler for errors, and returns the logger object.
"""Sets up a logger with a file handler for errors, and returns the logger object.
Returns:
A logger object with a configured logging level, format, and handlers, including a file
Expand Down
24 changes: 9 additions & 15 deletions utils/regex_helper.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
"""Desc: Helper class for regular expressions."""
import re
from pathlib import Path
from typing import List

import requests
from utils.console import console
from utils.logger import logger


def get_valid_tlds() -> List[str]:
"""
Uses a list of top-level domains (TLDs) and returns the list of TLDs.
def get_valid_tlds() -> list[str]:
"""Uses a list of top-level domains (TLDs) and returns the list of TLDs.
Returns:
A list of valid top-level domains (TLDs) either from a local file named "tlds.txt" or by
Expand All @@ -23,9 +21,8 @@ def get_valid_tlds() -> List[str]:
return download_tlds()


def download_tlds() -> List[str]:
"""
Downloads a list of valid top-level domains (TLDs) and saves them to a file.
def download_tlds() -> list[str]:
"""Downloads a list of valid top-level domains (TLDs) and saves them to a file.
Returns:
A list of valid top-level domains (TLDs) that have been downloaded and
Expand All @@ -47,16 +44,15 @@ def download_tlds() -> List[str]:
return valid_tlds


class RegexHelper(object):
class RegexHelper:
"""Helper class for regular expressions."""

def __init__(self):
"""Load valid TLDs from file."""
self.tlds = get_valid_tlds()

def regex(self, retype: str) -> re.Pattern:
"""
Returns a compiled regular expression pattern based on the input type.
"""Returns a compiled regular expression pattern based on the input type.
Args:
retype (str):
Expand Down Expand Up @@ -89,9 +85,8 @@ def regex(self, retype: str) -> re.Pattern:
}
return re.compile(pattern[retype], re.IGNORECASE)

def regex_iter(self, regex: re.Pattern, text: str) -> List[str]:
"""
Returns a list of all non-overlapping matches of the regular expression in the string.
def regex_iter(self, regex: re.Pattern, text: str) -> list[str]:
"""Returns a list of all non-overlapping matches of the regular expression in the string.
Args:
regex (re.Pattern):
Expand All @@ -105,8 +100,7 @@ def regex_iter(self, regex: re.Pattern, text: str) -> List[str]:
return [re.group() for re in re.finditer(regex, text.lower())]

def regex_patterns(self, text: str) -> dict:
"""
Returns a dictionary containing regex patterns for domain, email, IPV4, MD5, SHA1, SHA256, and URL.
"""Returns a dictionary containing regex patterns for domain, email, IPV4, MD5, SHA1, SHA256, and URL.
Args:
text (str):
Expand Down

0 comments on commit aee7815

Please sign in to comment.