From 2b8b6ebfb5e7ab48e465cef9254a76bb3ef1ee30 Mon Sep 17 00:00:00 2001
From: Stefan Wieczorek
Date: Mon, 7 Feb 2022 21:56:38 +0100
Subject: [PATCH] Add monodon, a multi-threaded domain-squatting scanner

---
 config.ini                  |  42 +++++
 dump_db.py                  |  25 +++
 monodon.py                  | 309 ++++++++++++++++++++++++++++++++++++
 requirements.txt            |   2 +
 tlds/abused.txt             |  24 +++
 tlds/top15.txt              |  15 ++
 tlds/top5.txt               |   5 +
 utils/parser_checks.py      |  15 ++
 utils/squat_generator.py    | 130 +++++++++++++++
 utils/tld_generator.py      |  72 +++++++++
 utils/utils.py              |   3 +
 utils/wikipedia_wordlist.py |  30 ++++
 wordlists/country_codes.txt |  18 +++
 wordlists/phishing.txt      |  59 +++++++
 wordlists/similar_chars.txt | 107 +++++++++++++
 15 files changed, 856 insertions(+)
 create mode 100644 config.ini
 create mode 100755 dump_db.py
 create mode 100755 monodon.py
 create mode 100644 requirements.txt
 create mode 100644 tlds/abused.txt
 create mode 100644 tlds/top15.txt
 create mode 100644 tlds/top5.txt
 create mode 100644 utils/parser_checks.py
 create mode 100644 utils/squat_generator.py
 create mode 100644 utils/tld_generator.py
 create mode 100644 utils/utils.py
 create mode 100644 utils/wikipedia_wordlist.py
 create mode 100644 wordlists/country_codes.txt
 create mode 100644 wordlists/phishing.txt
 create mode 100644 wordlists/similar_chars.txt

diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000..5cf82a4
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,42 @@
+# In all sections, TLD definitions can either be explicit TLDs or one of the supplied TLD lists.
+# These lists are: all_tlds, all_tlds_incl_slds, abused, top5, top15
+
+[CHARS]
+# TLDs = abused
+
+
+[HOMO]
+# TLDs = abused
+
+
+[CCODES]
+# TLDs = abused
+
+
+[PHISHING]
+# TLDs = abused
+
+
+[NUMBERS]
+# TLDs = abused
+
+
+[WIKI]
+# TLDs = top5
+
+# Which Wikipedia articles should be used to generate the wordlist
+# Terms = cyber computer hacker malware software tiger
+
+# How many of these related terms should be scanned
+# Count = 750
+
+# Which Wikipedia language version should be used
+# Language = en
+
+
+[WORDLIST]
+# TLDs = abused
+
+# Additional wordlists - please specify the complete path
+# Wordlists = /home/fant/wordlist/animals.txt
+
diff --git a/dump_db.py b/dump_db.py
new file mode 100755
index 0000000..cffef36
--- /dev/null
+++ b/dump_db.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+import argparse
+import sqlite3
+
+parser = argparse.ArgumentParser(description="Dump the domain-squatting database")
+parser.add_argument("dbfile", type=str, help="Squatting database to load")
+parser.add_argument("--filter", nargs="+", type=str, help="Exclude rows whose master name contains any of these strings")
+args = parser.parse_args()
+
+# Open the database and print every non-filtered row as "host.tld<TAB>master"
+con = sqlite3.connect(args.dbfile)
+cur = con.cursor()
+for row in cur.execute("SELECT * FROM domains"):
+    filtered = False
+    if args.filter:
+        for f in args.filter:
+            if f in row[2]:
+                filtered = True
+                break
+
+    if not filtered:
+        print(f"{row[0]}.{row[1]}\t{row[2]}")
+
+con.close()
\ No newline at end of file
diff --git a/monodon.py b/monodon.py
new file mode 100755
index 0000000..e59d4c8
--- /dev/null
+++ b/monodon.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+
+import dns.resolver
+import string
+import time
+import copy
+import sys
+import argparse
+import configparser
+import sqlite3
+import queue
+import logging
+import threading
+
+from utils.parser_checks import parser_check_rate, parser_check_threads
+from utils.squat_generator import generate_char_simple, generate_homoglyphs, generate_numbers
+from utils.wikipedia_wordlist import generate_wikipedia_wordlist
+from utils.tld_generator import TLDGenerator
+from utils.utils import dedup
+
+URL_CHARS = list(string.ascii_lowercase) + list(string.digits) + ["-", "ä", "ö", "ü"]
+START_TIME = time.time()
+
+parser = argparse.ArgumentParser(description="Search for possible squatting domains")
+parser.add_argument("scanword", type=str, help="Which domain name / word to scan (without the TLD)")
+parser.add_argument("--config", type=str, default="config.ini", help="Config file to use")
+parser.add_argument("--all", default=False, action='store_true', help="Execute all scanning techniques")
+parser.add_argument("--tlds", default=False, action='store_true', help="Scan all TLDs")
+parser.add_argument("--slds", default=False, action='store_true', help="Scan all TLDs and known SLDs")
+parser.add_argument("--homo", default=False, action='store_true', help="Scan homoglyphs")
+parser.add_argument("--chars", default=False, action='store_true', help="Scan character replacements and additions")
+parser.add_argument("--numbers", default=False, action='store_true', help="Iterate numbers in the domain name")
+parser.add_argument("--phishing", default=False, action='store_true', help="Scan phishing wordlist")
+parser.add_argument("--ccodes", default=False, action='store_true', help="Scan two-letter country codes")
+parser.add_argument("--wiki", default=False, action='store_true', help="Scan Wikipedia-generated related word lists")
+parser.add_argument("--wikiterms", type=str, default=None, nargs="+", help="Wikipedia terms to scan instead of the terms from config.ini")
+parser.add_argument("--wordlist", default=False, action='store_true', help="Scan wordlists defined in the config file")
+parser.add_argument("--forcetlds", type=str, default=None, nargs="+", help="Override the scan TLDs set in the config.ini file")
+parser.add_argument("--tldfile", type=str, default=None, nargs="?", help="Instead of downloading a fresh copy from publicsuffix.org, use this file as the list of all TLDs and SLDs")
+parser.add_argument("--threads", type=parser_check_threads, default=5, help="Number of scan threads to start")
+parser.add_argument("--rate", type=parser_check_rate, default=10, help="Scans per second to aim for")
+
+args = parser.parse_args()
+
+config = configparser.ConfigParser()
+config.read(args.config)
+
+SCANWORD = args.scanword.lower()
+glob_scancounter = 0
+glob_found_domains = 0
+glob_scan_delay = 1.0
+glob_scanpool = queue.SimpleQueue()
+glob_known_hosts = {}
+
+con = sqlite3.connect(f"{SCANWORD}.db")
+cur = con.cursor()
+cur.execute("CREATE TABLE IF NOT EXISTS domains (host text, tld text, master text, first_seen text, last_seen text, accepts_anyhost bool)")
+con.commit()
+con.close()
+
+# Setup logging
+logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
+
+def load_wordlist_file(filename):
+    words = []
+    with open(filename, "r") as f:
+        for line in f.readlines():
+            if line.startswith("#"): continue  # Skip comment lines
+            words += line.lower().split()
+
+    return dedup(words)
+
+def scan_host(host, tlds):
+    global glob_known_hosts, glob_scanpool
+    if host in glob_known_hosts:
+        # We cannot remove anything from the queue, so we only enqueue the TLDs that are not already scheduled
+        remaining_tlds = [tld for tld in tlds if tld not in glob_known_hosts[host]]
+        if len(remaining_tlds) > 0:
+            glob_scanpool.put((host, remaining_tlds))
+            glob_known_hosts[host] += remaining_tlds
+    else:
+        glob_known_hosts[host] = list(tlds)  # Copy, so the += above never mutates the caller's list
+        glob_scanpool.put((host, tlds))
+
+
+def scan_wordlist(scanword, wordlist, tld_list):
+    for word in wordlist:
+        scan_host(f"{scanword}{word}", tld_list)
+        scan_host(f"{scanword}-{word}", tld_list)
+        scan_host(f"{word}{scanword}", tld_list)
+        scan_host(f"{word}-{scanword}", tld_list)
+
+
+class ScanThread(threading.Thread):
+    def _touch_domain(self, host, tld):
+        try:
+            soa_records = dns.resolver.resolve(".".join([host, tld]), "SOA")
+        except dns.resolver.NXDOMAIN:
+            return False
+        except Exception:
+            return False
+
+        # Search the SOA records for master names
+        master_names = []
+        for soa_record in soa_records.response.answer:
+            for rdata in soa_record:
+                try:
+                    master_names.append(rdata.mname.to_text())
+                except Exception:
+                    return False
+
+        return list(set(master_names))  # Deduplicate
+
+    def _note_domain(self, host, tld, master_name, accepts_anyhost, first_seen=None, last_seen=None):
+        # Default timestamps are taken at call time, not at function definition time
+        first_seen = time.time() if first_seen is None else first_seen
+        last_seen = time.time() if last_seen is None else last_seen
+
+        con = sqlite3.connect(f"{SCANWORD}.db")
+        domain_to_insert = (host, tld, master_name, str(first_seen), str(last_seen), accepts_anyhost)
+        sql = "INSERT INTO domains(host,tld,master,first_seen,last_seen,accepts_anyhost) VALUES (?, ?, ?, ?, ?, ?)"
+        con.execute(sql, domain_to_insert)
+        con.commit()
+        con.close()
+
+
+    def scan_tlds(self, to_scan):
+        global glob_scancounter, glob_scan_delay, glob_found_domains
+        host = to_scan[0]
+
+        logging.debug(f"Scanning {to_scan[0]} on {to_scan[1]}")
+
+        for tld in to_scan[1]:
+            glob_scancounter += 1
+            dns_result = self._touch_domain(host, tld)
+            if dns_result:
+                logging.warning(f"Found: {host}.{tld} on {dns_result[0]}")
+                # If a clearly nonexistent host also resolves, the zone answers for any host (wildcard)
+                accepts_anyhost = bool(self._touch_domain("jdwqnwqqnwdsauuwuwdnakkkasd", tld))
+                self._note_domain(host, tld, dns_result[0], accepts_anyhost)
+                glob_found_domains += 1
+
+            time.sleep(glob_scan_delay)
+
+
+    def run(self):
+        global glob_scan_delay, glob_scanpool
+        while True:
+            to_scan = glob_scanpool.get()  # Blocks until an item is available
+            if to_scan == "STOP":
+                logging.info(f"Scan thread {threading.get_ident()} finished")
+                break  # Terminate the thread
+            else:
+                self.scan_tlds(to_scan)
+
+
+    def __init__(self):
+        super(ScanThread, self).__init__()
+        self.busy = False
+
+
+class WatchThread(threading.Thread):
+    def run(self):
+        global glob_scan_delay, glob_scancounter, glob_scanpool, glob_known_hosts, glob_found_domains, START_TIME
+        last_scancounter = 0
+        i = 0
+        while True:
+            # Readjust the scan delay
+            current_scanrate = glob_scancounter - last_scancounter
+            if time.time() > START_TIME + 5:
+                adjustment_factor = current_scanrate / self.target_scanrate
+                glob_scan_delay *= adjustment_factor
+                glob_scan_delay = max(0.1, glob_scan_delay)  # Make sure we do not accidentally DDoS somebody
+                glob_scan_delay = min(20, glob_scan_delay)  # Make sure the delay does not oscillate too wildly
+
+            # Print the current status
+            if i % 30 == 10 and glob_scancounter > 0:
+                domains_to_scan = sum(map(lambda x: len(x), glob_known_hosts.values()))
+                remaining_scantime = round(domains_to_scan / (glob_scancounter / (time.time() - START_TIME)) / 3600, 2)
+                logging.info("")
+                logging.info(f"Running since {round((time.time()-START_TIME)/3600, 2)}h, about {remaining_scantime}h left")
+                logging.info(f"Scanned {glob_scancounter} of {domains_to_scan} ({round((glob_scancounter/domains_to_scan)*100, 2)}%), found {glob_found_domains} domains")
+                logging.info(f"Current scanrate is {current_scanrate} scans/sec, scan delay is {round(glob_scan_delay, 2)}s")
+                logging.info("")
+
+            last_scancounter = copy.copy(glob_scancounter)
+            i += 1
+            time.sleep(1)
+
+    def __init__(self, target_scanrate):
+        super(WatchThread, self).__init__()
+        self.target_scanrate = target_scanrate
+
+
+tld_gen = TLDGenerator(tldfile=args.tldfile, forcedtlds=args.forcetlds)  # Initialize the TLD generator
+
+# Start all threads
+watch_thread = WatchThread(args.rate)
+watch_thread.daemon = True
+watch_thread.start()
+
+threadpool = []
+for i in range(0, args.threads):
+    threadpool.append(ScanThread())
+    threadpool[-1].start()
+
+# Scan all TLDs and known SLDs
+if args.all or args.slds:
+    logging.info("Scanning TLDs and known SLDs")
+
+    # Split this task into smaller chunks to make it multi-threaded
+    tlds_to_scan = tld_gen.generate_tlds("all_tlds_incl_slds")
+    for i in range(0, len(tlds_to_scan), 10):
+        scan_host(SCANWORD, tlds_to_scan[i:i+10])
+
+# Scan all TLDs
+elif args.tlds:
+    logging.info("Scanning TLDs")
+
+    # Split this task into smaller chunks to make it multi-threaded
+    tlds_to_scan = tld_gen.generate_tlds("all_tlds")
+    for i in range(0, len(tlds_to_scan), 10):
+        scan_host(SCANWORD, tlds_to_scan[i:i+10])
+
+# Scan for character replacement and addition squatting
+if args.all or args.chars:
+    logging.info("Scanning simple char replacements")
+
+    for host in generate_char_simple(SCANWORD):
+        if host != SCANWORD:
+            scan_host(host, tld_gen.generate_tlds(config["CHARS"].get("TLDs", "abused")))
+
+# Scan homoglyphs
+if args.all or args.homo:
+    logging.info("Scanning homoglyphs")
+
+    for host in generate_homoglyphs(SCANWORD):
+        scan_host(host, tld_gen.generate_tlds(config["HOMO"].get("TLDs", "abused")))
+
+# Scan for all country codes
+if args.all or args.ccodes:
+    logging.info("Scanning country codes")
+    scan_wordlist(
+        SCANWORD,
+        load_wordlist_file("wordlists/country_codes.txt"),
+        tld_gen.generate_tlds(config["CCODES"].get("TLDs", "abused"))
+    )
+
+# Scan an often-used phishing wordlist
+if args.all or args.phishing:
+    logging.info("Scanning phishing wordlist")
+    scan_wordlist(
+        SCANWORD,
+        load_wordlist_file("wordlists/phishing.txt"),
+        tld_gen.generate_tlds(config["PHISHING"].get("TLDs", "abused"))
+    )
+
+# Scan numbers
+if args.all or args.numbers:
+    logging.info("Scanning numbers")
+
+    for host in generate_numbers(SCANWORD):
+        scan_host(host, tld_gen.generate_tlds(config["NUMBERS"].get("TLDs", "abused")))
+
+# Scan additional wordlists
+if args.all or args.wordlist:
+    logging.info("Scanning supplied wordlists")
+    for wordlist_path in config["WORDLIST"].get("Wordlists", "").split():
+        scan_wordlist(
+            SCANWORD,
+            load_wordlist_file(wordlist_path),
+            tld_gen.generate_tlds(config["WORDLIST"].get("TLDs", "abused"))
+        )
+
+# Scan Wikipedia wordlists
+if args.all or args.wiki:
+    # Generate and scan a wordlist of related terms
+    if args.wikiterms:
+        rt = args.wikiterms
+    else:
+        rt = config["WIKI"].get("Terms", "").split()
+
+    if rt == []:
+        logging.warning("Not scanning Wikipedia wordlist, since no terms were supplied")
+    else:
+        logging.info(f"Generating Wikipedia wordlist of the related terms {', '.join(rt)}")
+
+        related_terms = {}
+        for r in rt:
+            for term, relevance in generate_wikipedia_wordlist(config["WIKI"].get("Language", "en"), r):
+                if term in related_terms:
+                    related_terms[term] += relevance
+                else:
+                    related_terms[term] = relevance
+
+        sorted_related_terms = sorted(related_terms.items(), key=lambda x: x[1], reverse=True)[:config["WIKI"].getint("Count", 750)]
+
+        logging.info("Scanning generated Wikipedia wordlist")
+        scan_wordlist(
+            SCANWORD,
+            map(lambda x: x[0], sorted_related_terms),
+            tld_gen.generate_tlds(config["WIKI"].get("TLDs", "top5"))
+        )
+
+logging.warning(f"Scanning {sum(map(lambda x: len(x), glob_known_hosts.values()))} domains...")
+
+for i in range(0, args.threads):
+    glob_scanpool.put("STOP")  # Scan threads terminate when fetching this signal
+
+for t in threadpool:
+    t.join()
+
+logging.warning("All scans finished")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..94b497d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+dnspython >= 2.2.0
+requests >= 2.27.1
\ No newline at end of file
diff --git a/tlds/abused.txt b/tlds/abused.txt
new file mode 100644
index 0000000..256c7ec
--- /dev/null
+++ b/tlds/abused.txt
@@ -0,0 +1,24 @@
+eu
+cn
+de
+ga
+gq
+ml
+cf
+cc
+tk
+app
+ooo
+xyz
+top
+fit
+com
+net
+org
+pro
+info
+site
+work
+rest
+buzz
+online
\ No newline at end of file
diff --git a/tlds/top15.txt b/tlds/top15.txt
new file mode 100644
index 0000000..2d56fbc
--- /dev/null
+++ b/tlds/top15.txt
@@ -0,0 +1,15 @@
+com
+net
+org
+de
+icu
+uk
+ru
+info
+top
+xyz
+tk
+cn
+ga
+cf
+nl
\ No newline at end of file
diff --git a/tlds/top5.txt b/tlds/top5.txt
new file mode 100644
index 0000000..8b6d317
--- /dev/null
+++ b/tlds/top5.txt
@@ -0,0 +1,5 @@
+com
+net
+org
+de
+ru
diff --git a/utils/parser_checks.py b/utils/parser_checks.py
new file mode 100644
index 0000000..7c6b33f
--- /dev/null
+++ b/utils/parser_checks.py
@@ -0,0 +1,15 @@
+import argparse
+
+def parser_check_threads(value):
+    value = int(value)
+    if value <= 0:
+        raise argparse.ArgumentTypeError(f"Invalid thread count: {value}")
+    elif value > 64:
+        raise argparse.ArgumentTypeError("Maximum thread count is 64")
+    return value
+
+def parser_check_rate(value):
+    value = int(value)
+    if value <= 0:
+        raise argparse.ArgumentTypeError(f"Invalid target rate: {value}")
+    return value
\ No newline at end of file
diff --git a/utils/squat_generator.py b/utils/squat_generator.py
new file mode 100644
index 0000000..13af15c
--- /dev/null
+++ b/utils/squat_generator.py
@@ -0,0 +1,130 @@
+import string
+
+def generate_char_simple(scanword):
+    # Character omission
+    for i in range(len(scanword)):
+        yield scanword[:i] + scanword[i+1:]
+
+    # Character repeat
+    for i in range(len(scanword)):
+        yield scanword[:i] + scanword[i]*2 + scanword[i+1:]
+
+    # Swap adjacent characters
+    for i in range(len(scanword)-1):
+        yield scanword[:i] + scanword[i+1] + scanword[i] + scanword[i+2:]
+
+    # Insert dashes
+    for i in range(1, len(scanword)-1):
+        yield scanword[:i] + "-" + scanword[i:]
+
+    # Replace one character with a dash
+    for i in range(1, len(scanword)-1):
+        yield scanword[:i] + "-" + scanword[i+1:]
+
+    # Replace each character once with every letter and digit
+    for i in range(len(scanword)):
+        for repl in string.ascii_lowercase + string.digits:
+            yield scanword[:i] + repl + scanword[i+1:]
+
+    # Insert one char
+    for i in range(len(scanword)+1):
+        for repl in string.ascii_lowercase + string.digits:
+            yield scanword[:i] + repl + scanword[i:]
+
+def _load_homoglyphs(similar_chars_file):
+    # Currently only loads important homoglyphs marked with "!"
+
+    homoglyphs = {}
+
+    with open(similar_chars_file, "r") as f:
+        for line in f.readlines():
+            line = line.strip()
+            if "#" in line or line == "" or "!" not in line: continue  # Skip comments, empty and unmarked lines
+
+            # Build the index
+            for char in line.split():
+                if char not in homoglyphs and char != "!":
+                    homoglyphs[char] = []
+
+            # Add all chars
+            for char_index in line.split():
+                for char_add in line.split():
+                    if char_index != char_add and char_index != "!" and char_add != "!":
+                        homoglyphs[char_index].append(char_add)
+
+    return homoglyphs
+
+
+def _count_up(combination_bitmask, current_bitmask):
+    # Counts the mixed-radix number encoded in current_bitmask up to
+    # combination_bitmask, yielding every intermediate combination
+    while combination_bitmask != current_bitmask:
+        for i in range(len(combination_bitmask)):
+            if current_bitmask[i] < combination_bitmask[i]:
+                # Increase the current index and zero all positions before it
+                current_bitmask = [0]*i + [current_bitmask[i]+1] + current_bitmask[i+1:]
+                yield current_bitmask
+                break
+
+
+def generate_homoglyphs(scanword):
+    homoglyphs = _load_homoglyphs("wordlists/similar_chars.txt")
+    homoglyph_tree = []
+
+    # Build a 2D tree of possible replacements
+    for char in scanword:
+        if char in homoglyphs:
+            homoglyph_tree.append([char] + homoglyphs[char])
+        else:
+            homoglyph_tree.append([char])
+
+    # Build a bitmask of how many replacements exist per char
+    combination_bitmask = []
+    for char in homoglyph_tree:
+        combination_bitmask.append(len(char)-1)
+
+    # Iterate all combinations
+    current_bitmask = [0]*len(combination_bitmask)
+
+    for current_bitmask in _count_up(combination_bitmask, current_bitmask):
+        out = ""
+        for i in range(len(current_bitmask)):
+            out += homoglyph_tree[i][current_bitmask[i]]
+        yield out
+
+
+def _iterate_numbers(number_tree):
+    # NOTE: unfinished draft of a segment-wise number iterator; it is not
+    # called anywhere - generate_numbers() below implements the actual behavior
+    current_number_tree = []
+
+    for char in number_tree:
+        if type(char) == int:
+            current_number_tree.append(0)
+        else:
+            current_number_tree.append(char)
+
+    for i, char in enumerate(current_number_tree):
+        if type(char) == str:
+            continue
+
+        elif type(char) == int:
+            if char < int("9"*len(number_tree[i])):
+                current_number_tree = current_number_tree[:i]
+
+
+def generate_numbers(scanword):
+    # Count the digits in the word
+    contained_numbers = 0
+    for char in scanword:
+        if char in string.digits:
+            contained_numbers += 1
+
+    if contained_numbers > 0:
+        # Iterate all digit strings of that length, e.g. 00-99 for two digits
+        for i in range(0, int("9"*contained_numbers)+1):
+            current_number = str(i).zfill(contained_numbers)
+
+            outword = ""
+            index = 0
+            for char in scanword:
+                if char in string.digits:
+                    outword += current_number[index]
+                    index += 1
+                else:
+                    outword += char
+
+            yield outword
\ No newline at end of file
diff --git a/utils/tld_generator.py b/utils/tld_generator.py
new file mode 100644
index 0000000..ee4ceae
--- /dev/null
+++ b/utils/tld_generator.py
@@ -0,0 +1,72 @@
+import logging
+import requests
+
+class TLDGenerator():
+    def _load_tld_file(self, tld_file):
+        with open(tld_file, "r") as f:
+            return self._load_raw_tlds(f.read())
+
+    def _load_raw_tlds(self, raw_tld_list):
+        # Loads a list of top- or n-th-level domains
+        returnlist = []
+
+        for line in raw_tld_list.split("\n"):
+            line = line.strip()  # Trim whitespace
+            if line.startswith("/"): continue  # Ignore comments
+            elif line == "": continue  # Ignore empty lines
+
+            if line.startswith("*"): line = line[1:]  # Remove leading wildcards
+            if line.startswith("!"): line = line[1:]  # Remove leading exclamation marks
+
+            # Remove all leading dots
+            while line.startswith("."):
+                line = line[1:]
+
+            returnlist.append(line.lower().strip())
+
+        return returnlist
+
+    def generate_tlds(self, configuration_string):
+        out_tlds = []
+
+        if self.forcedtlds:
+            return self.forcedtlds
+
+        desired_tlds = configuration_string.split()
+        for desired_tld in desired_tlds:
+            if desired_tld == "all_tlds":
+                out_tlds += self.ALL_TLDS
+            elif desired_tld == "all_tlds_incl_slds":
+                out_tlds += self.ALL_TLDS_INCL_SLDS
+            elif desired_tld == "abused":
+                out_tlds += self.ABUSED_TLDS
+            elif desired_tld == "top5":
+                out_tlds += self.TOP5_TLDS
+            elif desired_tld == "top15":
+                out_tlds += self.TOP15_TLDS
+            elif desired_tld in self.ALL_TLDS_INCL_SLDS:
+                out_tlds.append(desired_tld)
+            else:
+                logging.warning(f"Top-level domain .{desired_tld} is not public; check for a typo. Scanning it anyway.")
+                out_tlds.append(desired_tld)
+
+        return out_tlds
+
+    def __init__(self, tldfile=None, forcedtlds=None):
+        self.forcedtlds = forcedtlds
+
+        if tldfile:
+            self.ALL_TLDS_INCL_SLDS = self._load_tld_file(tldfile)
+        elif forcedtlds:
+            return  # No need to load any files, domains are forced anyway
+        else:
+            raw_all_slds = requests.get("https://publicsuffix.org/list/public_suffix_list.dat").text
+            self.ALL_TLDS_INCL_SLDS = self._load_raw_tlds(raw_all_slds)
+            logging.info(f"Loaded {len(self.ALL_TLDS_INCL_SLDS)} domains from publicsuffix.org")
+
+        # Filter for a list that only contains TLDs (no dots)
+        self.ALL_TLDS = list(filter(lambda x: "." not in x, self.ALL_TLDS_INCL_SLDS))
+
+        # Load the abused, top5 and top15 TLD lists
+        self.ABUSED_TLDS = self._load_tld_file("tlds/abused.txt")
+        self.TOP5_TLDS = self._load_tld_file("tlds/top5.txt")
+        self.TOP15_TLDS = self._load_tld_file("tlds/top15.txt")
\ No newline at end of file
diff --git a/utils/utils.py b/utils/utils.py
new file mode 100644
index 0000000..ab412af
--- /dev/null
+++ b/utils/utils.py
@@ -0,0 +1,3 @@
+# Order-preserving list deduplication
+def dedup(list_to_dedup):
+    return list(dict.fromkeys(list_to_dedup))
\ No newline at end of file
diff --git a/utils/wikipedia_wordlist.py b/utils/wikipedia_wordlist.py
new file mode 100644
index 0000000..61af0fb
--- /dev/null
+++ b/utils/wikipedia_wordlist.py
@@ -0,0 +1,30 @@
+import requests
+import string
+
+def generate_wikipedia_wordlist(language_code, searchterm):
+    # Passing the searchterm via params ensures it is properly URL-encoded
+    r = requests.get(
+        f"https://{language_code}.wikipedia.org/w/api.php",
+        params={"action": "query", "format": "json", "titles": searchterm, "prop": "extracts", "explaintext": 1}
+    )
+    extract = list(r.json()["query"]["pages"].values())[0]["extract"]
+
+    relevant_words = {}
+
+    for word in extract.split():
+        word = word.strip().lower()
+
+        if len(word) <= 2: continue
+
+        # Skip words containing characters that cannot appear in a hostname
+        invalid_char = False
+        for letter in word:
+            if letter not in list(string.ascii_letters) + list(string.digits) + ["-", "ä", "ü", "ö"]:
+                invalid_char = True
+                break
+        if invalid_char: continue
+
+        if word in relevant_words:
+            relevant_words[word] += 1
+        else:
+            relevant_words[word] = 1
+
+    # Most frequent words first
+    return sorted(relevant_words.items(), key=lambda x: x[1], reverse=True)
\ No newline at end of file
diff --git a/wordlists/country_codes.txt b/wordlists/country_codes.txt
new file mode 100644
index 0000000..3fc7408
--- /dev/null
+++ b/wordlists/country_codes.txt
@@ -0,0 +1,18 @@
+# Country codes most used for domain squatting
+DE
+US
+FR
+GB
+NL
+AU
+CN
+HK
+TW
+KR
+CA
+BE
+ES
+SE
+CH
+TR
+AT
\ No newline at end of file
diff --git a/wordlists/phishing.txt b/wordlists/phishing.txt
new file mode 100644
index 0000000..e66cd96
--- /dev/null
+++ b/wordlists/phishing.txt
@@ -0,0 +1,59 @@
+access
+account
+admin
+auth
+authentication
+blue
+business
+calculation
+cdn
+claim
+click
+company
+connect
+copy
+delivery
+dhl
+document
+download
+fedex
+find
+group
+http
+https
+https-www
+http-www
+inside
+invoice
+label
+local
+login
+mail
+market
+marketplace
+mobile
+my
+online
+portal
+red
+register
+safe
+secure
+security
+service
+signin
+signup
+ssl
+support
+system
+ticket
+update
+user
+verification
+verify
+view
+web
+world
+worldwide
+ww
+www
\ No newline at end of file
diff --git a/wordlists/similar_chars.txt b/wordlists/similar_chars.txt
new file mode 100644
index 0000000..6422f89
--- /dev/null
+++ b/wordlists/similar_chars.txt
@@ -0,0 +1,107 @@
+# Homograph glyph list
+# The original letter comes first
+# ! marks an important swap
+
+# ASCII similarities
+! 1 l I
+
+# ASCII multi-letter similarities
+! m rn
+! A fi
+! d cl
+! w vv uu
+
+# L33tspeak
+! o 0
+! e 3
+! a 4
+! b 8
+
+# Phonetic similarities
+! sch sh
+! ss ß
+! s z
+! d t
+! q c
+! o u
+
+# German letters
+! a ä
+! o ö
+! u ü
+
+# Cyrillic lower letters (second letter is Cyrillic)
+a а
+c с
+e е
+o о
+p р
+x х
+y у
+3 З
+4 Ч
+6 б
+
+і i
+ј j
+ԛ q
+ѕ s
+ԝ w
+ä ӓ
+e ё
+i ї
+ö ӧ
+
+
+# Cyrillic upper letters (second letter is Cyrillic)
+A А
+B В
+C С
+E Е
+H Н
+I І
+J Ј
+K К
+M М
+O О
+P Р
+S Ѕ
+T Т
+X Х
+
+Y Y
+F Ғ
+G Ԍ
+
+
+# Greek lower letters (second letter is Greek)
+o ο
+v ν
+a α
+e ε
+i ι
+k κ
+n η
+p ρ
+t τ
+u υ
+w ω
+x χ
+y γ
+
+
+# Greek upper letters (second letter is Greek)
+A Α
+B Β
+E Ε
+H Η
+I Ι
+K Κ
+M Μ
+N Ν
+O Ο
+P Ρ
+T Τ
+X Χ
+Y Υ
+Z Ζ
\ No newline at end of file
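
As a quick smoke test of the generators introduced by this patch — a minimal sketch, not part of the patch itself; the scanword "paypal" and the flag values below are illustrative — the squat generators can be exercised offline from the repository root:

    #!/usr/bin/env python3
    # Enumerate typo candidates for one word (no network traffic involved)
    from utils.squat_generator import generate_char_simple, generate_homoglyphs

    candidates = list(generate_char_simple("paypal"))
    print(len(candidates), candidates[:5])

    # Homoglyph variants; reads wordlists/similar_chars.txt relative to the cwd
    for variant in list(generate_homoglyphs("paypal"))[:5]:
        print(variant)

A full scan combining several techniques would then look like: ./monodon.py paypal --chars --homo --rate 20, with results collected in paypal.db and readable via ./dump_db.py paypal.db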