From 7649f1deadcc1d8c1f4777090197fd560a084446 Mon Sep 17 00:00:00 2001 From: Indrajith Indraprastham Date: Thu, 13 Jun 2024 01:58:45 +0530 Subject: [PATCH] added pylintrc and format the files --- .github/workflows/pylint.yml | 2 +- .pylintrc | 637 +++++++++++++++++++++++++++++++++++ crawler/crawler.py | 200 +++++++++++ setup.py | 79 ++--- tests/test_crawler.py | 56 +-- tiny_web_crawler/crawler.py | 133 +++++--- 6 files changed, 997 insertions(+), 110 deletions(-) create mode 100644 .pylintrc create mode 100644 crawler/crawler.py diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index e8b5430..63ffc50 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -17,9 +17,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install pylint==3.0.2 pip install '.[dev]' pip install . - pip install pylint - name: Analysing the code with pylint run: | pylint $(git ls-files '*.py') diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..7efadbd --- /dev/null +++ b/.pylintrc @@ -0,0 +1,637 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint +# in a server-like mode. +clear-cache-post-run=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, +# it can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. 
The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.12 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +source-roots= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. 
Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type alias names. If left empty, type +# alias names will be checked with the set naming style. +#typealias-rgx= + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + asyncSetUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit + +# List of valid names for the first argument in a class method. 
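+# (Illustrative note, not part of the generated config: with the default `cls`,
+# a factory written as `@classmethod` / `def from_config(cls, config): ...`
+# satisfies this check; the method name here is hypothetical.)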
+valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=builtins.BaseException,builtins.Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow explicit reexports by alias from a package __init__. +allow-reexport-from-package=no + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. 
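+# (Illustrative note, not part of the generated config: with `old`, pylint checks
+# calls such as logging.info("Crawling %s", url); with `new`, it checks
+# logging.info("Crawling {}", url). These calls are examples, not from this codebase.)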
+logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + use-implicit-booleaness-not-comparison-to-string, + use-implicit-booleaness-not-comparison-to-zero, + missing-function-docstring, + missing-class-docstring, + missing-module-docstring + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable= + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are: text, parseable, colorized, +# json2 (improved json format), json (old json format) and msvs (visual +# studio). 
You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. No available dictionaries : You need to install +# both the python package and the system dependency for enchant to work. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. 
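+# (Illustrative note, not part of the generated config: a typo such as
+# `spider.crawl_reslt` would be reported with a hint suggesting `crawl_result`,
+# the closest existing member within the configured edit distance.)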
+missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/crawler/crawler.py b/crawler/crawler.py new file mode 100644 index 0000000..5bb114f --- /dev/null +++ b/crawler/crawler.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +import json +import urllib.parse +from typing import Dict, List, Optional, Set + +import requests +import validators +from bs4 import BeautifulSoup +from colorama import Fore, Style, init + +# Initialize colorama +init(autoreset=True) + + +class SpiderConfig: + def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None: + self.root_url: str = root_url + self.max_links: int = max_links + self.default_scheme: str = 'http://' + self.save_to_file: Optional[str] = save_to_file + self.scheme: str = self.default_scheme + + +class Spider: + """ + A simple web crawler class. + + Attributes: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + crawl_result (Dict[str, Dict[str, List[str]]]): The dictionary storing the crawl results. + crawl_set (Set[str]): A set of URLs to be crawled. + link_count (int): The current count of crawled links. + default_scheme (str): The default URL scheme (e.g., 'http://'). + save_to_file (Optional[str]): The file path to save the crawl results. + scheme (str): The current URL scheme being used. + """ + + def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None: + """ + Initializes the Spider class. + + Args: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + save_to_file (Optional[str]): The file to save the crawl results to. 
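+
+        Example (illustrative sketch, not part of the original patch):
+            spider = Spider("http://example.com", max_links=5, save_to_file="out.json")
+            spider.start()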
+ """ + self.root_url: str = root_url + self.max_links: int = max_links + self.crawl_result: Dict[str, Dict[str, List[str]]] = {} + self.crawl_set: Set[str] = set() + self.link_count: int = 0 + self.default_scheme: str = 'http://' + self.save_to_file: Optional[str] = save_to_file + self.scheme: str = self.default_scheme + + @staticmethod + def is_valid_url(url: str) -> bool: + """ + Returns True for a valid url, False for an invalid url. + """ + return bool(validators.url(url)) + + def fetch_url(self, url: str) -> Optional[BeautifulSoup]: + """ + Reads the content of a URL and parses it using BeautifulSoup with lxml parser. + + Args: + url (str): The URL to fetch and parse. + + Returns: + Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, + None otherwise. + """ + try: + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx) + data = response.text + return BeautifulSoup(data, 'lxml') + except requests.exceptions.HTTPError as http_err: + print(Fore.RED + f"HTTP error occurred: {http_err}") + except requests.exceptions.ConnectionError as conn_err: + print(Fore.RED + f"Connection error occurred: {conn_err}") + except requests.exceptions.Timeout as timeout_err: + print(Fore.RED + f"Timeout error occurred: {timeout_err}") + except requests.exceptions.RequestException as req_err: + print(Fore.RED + f"Request error occurred: {req_err}") + except Exception as e: # pylint: disable=broad-exception-caught + # Catch all other exceptions + print(Fore.RED + f"An unexpected error occurred: {e}") + return None + + def save_results(self) -> None: + """ + Saves the crawl results into a JSON file. + """ + if self.save_to_file: + with open(self.save_to_file, 'w', encoding='utf-8') as file: + json.dump(self.crawl_result, file, indent=4) + + def format_url(self, url: str, base_url: str) -> str: + """ + Formats a URL to ensure it is absolute and removes any query parameters or fragments. + + Args: + url (str): The URL to format. + base_url (str): The base URL to resolve relative URLs. + + Returns: + str: The formatted URL. + """ + parsed_url = urllib.parse.urlparse(url) + base_url = base_url.rstrip('/') + + if parsed_url.scheme: + self.scheme = parsed_url.scheme + + if not parsed_url.scheme and not parsed_url.netloc: + if self.is_valid_url(self.default_scheme + parsed_url.path): + return self.default_scheme + parsed_url.path + + if parsed_url.path.startswith('/'): + return base_url + parsed_url.path + else: + return f"{base_url}/{parsed_url.path}" + + return f"{self.scheme}://{parsed_url.netloc}{parsed_url.path}" + + def crawl(self, url: str) -> None: + """ + Crawls a given URL, extracts links, and adds them to the crawl results. + + Args: + url (str): The URL to crawl. 
+ """ + if not self.is_valid_url(url): + print(Fore.RED + f"Invalid url to crawl: {url}") + return + + if url in self.crawl_result: + print(Fore.YELLOW + f"URL already crawled: {url}") + return + + print(Fore.GREEN + f"Crawling: {url}") + soup = self.fetch_url(url) + if not soup: + return + + links = soup.body.find_all('a', href=True) + self.crawl_result[url] = {'urls': []} + + for link in links: + pretty_url = self.format_url(link['href'].lstrip(), url) + if not self.is_valid_url(pretty_url): + print(Fore.RED + f"Invalid url: {pretty_url}") + continue + + if pretty_url in self.crawl_result[url]['urls']: + continue + + self.crawl_result[url]['urls'].append(pretty_url) + self.crawl_set.add(pretty_url) + print(Fore.BLUE + f"Link found: {pretty_url}") + + if self.link_count < self.max_links: + self.link_count += 1 + print(Fore.GREEN + f"Links crawled: {self.link_count}") + + def start(self) -> Dict[str, Dict[str, List[str]]]: + """ + Starts the crawling process from the root URL. Crawls up to max_links URLs. + + Returns: + Dict[str, Dict[str, List[str]]]: The crawl results. + """ + self.crawl(self.root_url) + + while self.crawl_set and self.link_count < self.max_links: + self.crawl(self.crawl_set.pop()) + + if self.save_to_file: + self.save_results() + print(Style.BRIGHT + Fore.MAGENTA + "Exiting....") + return self.crawl_result + + +def main() -> None: + """ + The main function to initialize and start the crawler. + """ + root_url = 'http://github.com' + max_links = 2 + + crawler = Spider(root_url, max_links, save_to_file='out.json') + crawler.start() + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index cdf3cc8..641e0a0 100644 --- a/setup.py +++ b/setup.py @@ -1,57 +1,50 @@ from setuptools import setup, find_packages setup( - name='tiny-web-crawler', # PyPI package name - version='0.1.1', - author='Indrajith Indraprastham', - author_email='indr4jith@gmail.com', - description='A simple and efficient web crawler in Python.', - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - url='https://github.com/indrajithi/tiny-web-crawler', + name="tiny-web-crawler", # PyPI package name + version="0.1.1", + author="Indrajith Indraprastham", + author_email="indr4jith@gmail.com", + description="A simple and efficient web crawler in Python.", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/indrajithi/tiny-web-crawler", packages=find_packages( - include=['tiny_web_crawler', 'tiny_web_crawler.*', 'crawler', 'crawler.*']), - install_requires=[ - 'validators', - 'beautifulsoup4', - 'lxml', - 'colorama', - 'requests' - ], + include=["tiny_web_crawler", + "tiny_web_crawler.*", "crawler", "crawler.*"] + ), + install_requires=["validators", "beautifulsoup4", + "lxml", "colorama", "requests"], extras_require={ - 'dev': [ - 'mypy', - 'pytest', - 'responses' - ], + "dev": ["mypy", "pytest", "responses", "pylint"], }, classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 
'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Internet :: WWW/HTTP :: Indexing/Search' + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", ], - keywords='web crawler, scraping, web scraping, python crawler, SEO, data extraction', + keywords="web crawler, scraping, web scraping, python crawler, SEO, data extraction", project_urls={ - 'Documentation': 'https://github.com/indrajithi/tiny-web-crawler#readme', - 'Source': 'https://github.com/indrajithi/tiny-web-crawler', - 'Tracker': 'https://github.com/indrajithi/tiny-web-crawler/issues', + "Documentation": "https://github.com/indrajithi/tiny-web-crawler#readme", + "Source": "https://github.com/indrajithi/tiny-web-crawler", + "Tracker": "https://github.com/indrajithi/tiny-web-crawler/issues", }, - python_requires='>=3.6', + python_requires=">=3.6", entry_points={ - 'console_scripts': [ - 'tiny-web-crawler=tiny_web_crawler.crawler:main', + "console_scripts": [ + "tiny-web-crawler=tiny_web_crawler.crawler:main", ], }, ) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 404ff08..1578369 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,49 +1,58 @@ -import requests -import pytest -import urllib.request -from unittest.mock import patch, MagicMock, mock_open -from crawler.crawler import Spider +from unittest.mock import MagicMock, mock_open, patch + import responses +from crawler.crawler import Spider + def test_is_valid_url() -> None: assert Spider.is_valid_url("http://example.com") is True - # assert Spider.is_valid_url('invalid') is False + assert Spider.is_valid_url('invalid') is False def test_format_url() -> None: spider = Spider("http://example.com", 10) assert spider.format_url( "/test", "http://example.com") == "http://example.com/test" - assert spider.format_url("http://example.com/test", - "http://example.com") == "http://example.com/test" + assert ( + spider.format_url("http://example.com/test", "http://example.com") + == "http://example.com/test" + ) @responses.activate def test_fetch_url() -> None: - responses.add(responses.GET, 'http://example.com', - body="link", status=200) - spider = Spider(root_url='http://example.com', max_links=2) - resp = spider.fetch_url('http://example.com') + responses.add( + responses.GET, + "http://example.com", + body="link", + status=200, + ) + spider = Spider(root_url="http://example.com", max_links=2) + resp = spider.fetch_url("http://example.com") assert resp is not None - assert resp.text == 'link' + assert resp.text == "link" @responses.activate def test_crawl() -> None: # Mock HTTP response - responses.add(responses.GET, 'http://example.com', - body="link", - status=200, - content_type='text/html') + responses.add( + responses.GET, + "http://example.com", + body="link", + status=200, + content_type="text/html", + ) spider = Spider("http://example.com", 10) spider.crawl("http://example.com") assert "http://example.com" in 
spider.crawl_result assert spider.crawl_result["http://example.com"]["urls"] == [ - "http://example.com/test"] + "http://example.com/test" + ] @responses.activate @@ -57,16 +66,19 @@ def test_save_results() -> None: mocked_file.assert_called_once_with("out.json", "w") -@patch.object(Spider, 'crawl') -@patch.object(Spider, 'save_results') +@patch.object(Spider, "crawl") +@patch.object(Spider, "save_results") def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None: spider = Spider("http://example.com", 10) mock_crawl.side_effect = lambda url: spider.crawl_result.update( - {url: {'urls': ['http://example.com/test']}}) + {url: {"urls": ["http://example.com/test"]}} + ) + print(mock_save_results) spider.start() assert mock_crawl.call_count == 1 assert "http://example.com" in spider.crawl_result assert spider.crawl_result["http://example.com"]["urls"] == [ - 'http://example.com/test'] + "http://example.com/test" + ] diff --git a/tiny_web_crawler/crawler.py b/tiny_web_crawler/crawler.py index 97a1604..8df822b 100644 --- a/tiny_web_crawler/crawler.py +++ b/tiny_web_crawler/crawler.py @@ -1,71 +1,106 @@ -# -*- coding: utf-8 -*- -# filename: crawler.py -# Author: Indrajith Indraprastham -# License: GPL v3: http://www.gnu.org/licenses/ - -# -------------------------------------------------------------------------------- -# README -# -------------------------------------------------------------------------------- -# Install Requirements -# pip install validators beautifulsoup4 lxml colorama - -# Python version: Python 3.6.3 :: Anaconda, Inc. - from __future__ import annotations -from bs4 import BeautifulSoup -import requests import json import urllib.parse +from typing import Dict, List, Optional, Set + +import requests import validators -from colorama import init, Fore, Style -from typing import Optional, Set +from bs4 import BeautifulSoup +from colorama import Fore, Style, init -# Initialize colorama init(autoreset=True) -class Spider: - def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None: +DEFAULT_SCHEME: str = 'http://' + + +class Spider(): + """ + A simple web crawler class. + + Attributes: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + crawl_result (Dict[str, Dict[str, List[str]]]): The dictionary storing the crawl results. + crawl_set (Set[str]): A set of URLs to be crawled. + link_count (int): The current count of crawled links. + save_to_file (Optional[str]): The file path to save the crawl results. + """ + + def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str] = None) -> None: + """ + Initializes the Spider class. + + Args: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + save_to_file (Optional[str]): The file to save the crawl results to. + """ self.root_url: str = root_url self.max_links: int = max_links - self.crawl_result: dict[str, dict[str, list]] = {} - self.crawl_set: set = set() + self.crawl_result: Dict[str, Dict[str, List[str]]] = {} + self.crawl_set: Set[str] = set() self.link_count: int = 0 - self.default_scheme: str = 'http://' self.save_to_file: Optional[str] = save_to_file - self.scheme: str = self.default_scheme + self.scheme: str = DEFAULT_SCHEME def fetch_url(self, url: str) -> Optional[BeautifulSoup]: """ - Reads the content of a url, parses it using BeautifulSoup with lxml parser. 
+ Reads the content of a URL and parses it using BeautifulSoup with lxml parser. + + Args: + url (str): The URL to fetch and parse. + + Returns: + Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, None otherwise. """ + try: - with requests.get(url) as response: - data = response.text + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx) + data = response.text return BeautifulSoup(data, 'lxml') - except Exception as e: - print(Fore.RED + f"Unable to fetch url: {url}, Error: {e}") - return None + except requests.exceptions.HTTPError as http_err: + print(Fore.RED + f"HTTP error occurred: {http_err}") + except requests.exceptions.ConnectionError as conn_err: + print(Fore.RED + f"Connection error occurred: {conn_err}") + except requests.exceptions.Timeout as timeout_err: + print(Fore.RED + f"Timeout error occurred: {timeout_err}") + except requests.exceptions.RequestException as req_err: + print(Fore.RED + f"Request error occurred: {req_err}") + return None @staticmethod def is_valid_url(url: str) -> bool: """ - Returns True for a valid url, False for an invalid url. + Checks if the provided URL is valid. + + Args: + url (str): The URL to validate. + + Returns: + bool: True if the URL is valid, False otherwise. """ return bool(validators.url(url)) def save_results(self) -> None: """ - Saves results into a json file. + Saves the crawl results into a JSON file. """ if self.save_to_file: - with open(self.save_to_file, 'w') as file: + with open(self.save_to_file, 'w', encoding='utf-8') as file: json.dump(self.crawl_result, file, indent=4) def format_url(self, url: str, base_url: str) -> str: """ - Removes any query, params, tag-id reference in the urls. - Adds base_url to url if it is a relative link (link to the same domain). + Formats a URL to ensure it is absolute and removes any query parameters or fragments. + + Args: + url (str): The URL to format. + base_url (str): The base URL to resolve relative URLs. + + Returns: + str: The formatted URL. """ parsed_url = urllib.parse.urlparse(url) base_url = base_url.rstrip('/') @@ -74,17 +109,23 @@ def format_url(self, url: str, base_url: str) -> str: self.scheme = parsed_url.scheme if not parsed_url.scheme and not parsed_url.netloc: - if self.is_valid_url(self.default_scheme + parsed_url.path): - return self.default_scheme + parsed_url.path + if self.is_valid_url(DEFAULT_SCHEME + parsed_url.path): + return DEFAULT_SCHEME + parsed_url.path if parsed_url.path.startswith('/'): return base_url + parsed_url.path - else: - return f"{base_url}/{parsed_url.path}" + + return f"{base_url}/{parsed_url.path}" return f"{self.scheme}://{parsed_url.netloc}{parsed_url.path}" def crawl(self, url: str) -> None: + """ + Crawls a given URL, extracts links, and adds them to the crawl results. + + Args: + url (str): The URL to crawl. + """ if not self.is_valid_url(url): print(Fore.RED + f"Invalid url to crawl: {url}") return @@ -98,7 +139,7 @@ def crawl(self, url: str) -> None: if not soup: return - links = soup.body.find_all('a', href=True) if soup.body else [] + links = soup.body.find_all('a', href=True) self.crawl_result[url] = {'urls': []} for link in links: @@ -118,11 +159,12 @@ def crawl(self, url: str) -> None: self.link_count += 1 print(Fore.GREEN + f"Links crawled: {self.link_count}") - def start(self) -> dict[str, dict[str, list]]: + def start(self) -> Dict[str, Dict[str, List[str]]]: """ - Start crawling from the root_url. Crawls up to max_links urls. 
- After each crawl, urls found are added to the crawl_set, - next url to crawl is taken from this set. + Starts the crawling process from the root URL. Crawls up to max_links URLs. + + Returns: + Dict[str, Dict[str, List[str]]]: The crawl results. """ self.crawl(self.root_url) @@ -136,6 +178,9 @@ def start(self) -> dict[str, dict[str, list]]: def main() -> None: + """ + The main function to initialize and start the crawler. + """ root_url = 'http://github.com' max_links = 2
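
A brief usage sketch of the refactored crawler, mirroring the main() shown above; the import path follows the package layout and the tiny-web-crawler=tiny_web_crawler.crawler:main entry point in setup.py, and the output filename is illustrative:

    from tiny_web_crawler.crawler import Spider

    # Crawl from the root URL, follow at most 2 links, and write the
    # {url: {"urls": [...]}} result mapping to out.json via save_results().
    spider = Spider("http://github.com", max_links=2, save_to_file="out.json")
    results = spider.start()
    print(results)

To reproduce the CI lint step locally, the commands from the updated workflow can be run as-is: pip install pylint==3.0.2, pip install '.[dev]', pip install ., then pylint $(git ls-files '*.py'), with the new .pylintrc picked up from the repository root.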