From 7649f1deadcc1d8c1f4777090197fd560a084446 Mon Sep 17 00:00:00 2001 From: Indrajith Indraprastham Date: Thu, 13 Jun 2024 01:58:45 +0530 Subject: [PATCH] added pylintrc and format the files --- .github/workflows/pylint.yml | 2 +- .pylintrc | 637 +++++++++++++++++++++++++++++++++++ crawler/crawler.py | 200 +++++++++++ setup.py | 79 ++--- tests/test_crawler.py | 56 +-- tiny_web_crawler/crawler.py | 133 +++++--- 6 files changed, 997 insertions(+), 110 deletions(-) create mode 100644 .pylintrc create mode 100644 crawler/crawler.py diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index e8b5430..63ffc50 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -17,9 +17,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install pylint==3.0.2 pip install '.[dev]' pip install . - pip install pylint - name: Analysing the code with pylint run: | pylint $(git ls-files '*.py') diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..7efadbd --- /dev/null +++ b/.pylintrc @@ -0,0 +1,637 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint +# in a server-like mode. +clear-cache-post-run=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, +# it can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. 
The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.12 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +source-roots= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. 
Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type alias names. If left empty, type +# alias names will be checked with the set naming style. +#typealias-rgx= + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + asyncSetUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit + +# List of valid names for the first argument in a class method. 
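+# (Illustrative note, not part of the generated config: with the default `cls`,
+# a factory written as `@classmethod` / `def from_config(cls, config): ...`
+# satisfies this check; the method name here is hypothetical.)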
+valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=builtins.BaseException,builtins.Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow explicit reexports by alias from a package __init__. +allow-reexport-from-package=no + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. 
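+# (Illustrative note, not part of the generated config: with `old`, pylint checks
+# calls such as logging.info("Crawling %s", url); with `new`, it checks
+# logging.info("Crawling {}", url). These calls are examples, not from this codebase.)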
+logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + use-implicit-booleaness-not-comparison-to-string, + use-implicit-booleaness-not-comparison-to-zero, + missing-function-docstring, + missing-class-docstring, + missing-module-docstring + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable= + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are: text, parseable, colorized, +# json2 (improved json format), json (old json format) and msvs (visual +# studio). 
You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. No available dictionaries : You need to install +# both the python package and the system dependency for enchant to work. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. 
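+# (Illustrative note, not part of the generated config: a typo such as
+# `spider.crawl_reslt` would be reported with a hint suggesting `crawl_result`,
+# the closest existing member within the configured edit distance.)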
+missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/crawler/crawler.py b/crawler/crawler.py new file mode 100644 index 0000000..5bb114f --- /dev/null +++ b/crawler/crawler.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +import json +import urllib.parse +from typing import Dict, List, Optional, Set + +import requests +import validators +from bs4 import BeautifulSoup +from colorama import Fore, Style, init + +# Initialize colorama +init(autoreset=True) + + +class SpiderConfig: + def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None: + self.root_url: str = root_url + self.max_links: int = max_links + self.default_scheme: str = 'http://' + self.save_to_file: Optional[str] = save_to_file + self.scheme: str = self.default_scheme + + +class Spider: + """ + A simple web crawler class. + + Attributes: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + crawl_result (Dict[str, Dict[str, List[str]]]): The dictionary storing the crawl results. + crawl_set (Set[str]): A set of URLs to be crawled. + link_count (int): The current count of crawled links. + default_scheme (str): The default URL scheme (e.g., 'http://'). + save_to_file (Optional[str]): The file path to save the crawl results. + scheme (str): The current URL scheme being used. + """ + + def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None: + """ + Initializes the Spider class. + + Args: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + save_to_file (Optional[str]): The file to save the crawl results to. 
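+
+        Example (illustrative sketch, not part of the original patch):
+            spider = Spider("http://example.com", max_links=5, save_to_file="out.json")
+            spider.start()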
+ """ + self.root_url: str = root_url + self.max_links: int = max_links + self.crawl_result: Dict[str, Dict[str, List[str]]] = {} + self.crawl_set: Set[str] = set() + self.link_count: int = 0 + self.default_scheme: str = 'http://' + self.save_to_file: Optional[str] = save_to_file + self.scheme: str = self.default_scheme + + @staticmethod + def is_valid_url(url: str) -> bool: + """ + Returns True for a valid url, False for an invalid url. + """ + return bool(validators.url(url)) + + def fetch_url(self, url: str) -> Optional[BeautifulSoup]: + """ + Reads the content of a URL and parses it using BeautifulSoup with lxml parser. + + Args: + url (str): The URL to fetch and parse. + + Returns: + Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, + None otherwise. + """ + try: + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx) + data = response.text + return BeautifulSoup(data, 'lxml') + except requests.exceptions.HTTPError as http_err: + print(Fore.RED + f"HTTP error occurred: {http_err}") + except requests.exceptions.ConnectionError as conn_err: + print(Fore.RED + f"Connection error occurred: {conn_err}") + except requests.exceptions.Timeout as timeout_err: + print(Fore.RED + f"Timeout error occurred: {timeout_err}") + except requests.exceptions.RequestException as req_err: + print(Fore.RED + f"Request error occurred: {req_err}") + except Exception as e: # pylint: disable=broad-exception-caught + # Catch all other exceptions + print(Fore.RED + f"An unexpected error occurred: {e}") + return None + + def save_results(self) -> None: + """ + Saves the crawl results into a JSON file. + """ + if self.save_to_file: + with open(self.save_to_file, 'w', encoding='utf-8') as file: + json.dump(self.crawl_result, file, indent=4) + + def format_url(self, url: str, base_url: str) -> str: + """ + Formats a URL to ensure it is absolute and removes any query parameters or fragments. + + Args: + url (str): The URL to format. + base_url (str): The base URL to resolve relative URLs. + + Returns: + str: The formatted URL. + """ + parsed_url = urllib.parse.urlparse(url) + base_url = base_url.rstrip('/') + + if parsed_url.scheme: + self.scheme = parsed_url.scheme + + if not parsed_url.scheme and not parsed_url.netloc: + if self.is_valid_url(self.default_scheme + parsed_url.path): + return self.default_scheme + parsed_url.path + + if parsed_url.path.startswith('/'): + return base_url + parsed_url.path + else: + return f"{base_url}/{parsed_url.path}" + + return f"{self.scheme}://{parsed_url.netloc}{parsed_url.path}" + + def crawl(self, url: str) -> None: + """ + Crawls a given URL, extracts links, and adds them to the crawl results. + + Args: + url (str): The URL to crawl. 
+ """ + if not self.is_valid_url(url): + print(Fore.RED + f"Invalid url to crawl: {url}") + return + + if url in self.crawl_result: + print(Fore.YELLOW + f"URL already crawled: {url}") + return + + print(Fore.GREEN + f"Crawling: {url}") + soup = self.fetch_url(url) + if not soup: + return + + links = soup.body.find_all('a', href=True) + self.crawl_result[url] = {'urls': []} + + for link in links: + pretty_url = self.format_url(link['href'].lstrip(), url) + if not self.is_valid_url(pretty_url): + print(Fore.RED + f"Invalid url: {pretty_url}") + continue + + if pretty_url in self.crawl_result[url]['urls']: + continue + + self.crawl_result[url]['urls'].append(pretty_url) + self.crawl_set.add(pretty_url) + print(Fore.BLUE + f"Link found: {pretty_url}") + + if self.link_count < self.max_links: + self.link_count += 1 + print(Fore.GREEN + f"Links crawled: {self.link_count}") + + def start(self) -> Dict[str, Dict[str, List[str]]]: + """ + Starts the crawling process from the root URL. Crawls up to max_links URLs. + + Returns: + Dict[str, Dict[str, List[str]]]: The crawl results. + """ + self.crawl(self.root_url) + + while self.crawl_set and self.link_count < self.max_links: + self.crawl(self.crawl_set.pop()) + + if self.save_to_file: + self.save_results() + print(Style.BRIGHT + Fore.MAGENTA + "Exiting....") + return self.crawl_result + + +def main() -> None: + """ + The main function to initialize and start the crawler. + """ + root_url = 'http://github.com' + max_links = 2 + + crawler = Spider(root_url, max_links, save_to_file='out.json') + crawler.start() + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index cdf3cc8..641e0a0 100644 --- a/setup.py +++ b/setup.py @@ -1,57 +1,50 @@ from setuptools import setup, find_packages setup( - name='tiny-web-crawler', # PyPI package name - version='0.1.1', - author='Indrajith Indraprastham', - author_email='indr4jith@gmail.com', - description='A simple and efficient web crawler in Python.', - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - url='https://github.com/indrajithi/tiny-web-crawler', + name="tiny-web-crawler", # PyPI package name + version="0.1.1", + author="Indrajith Indraprastham", + author_email="indr4jith@gmail.com", + description="A simple and efficient web crawler in Python.", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/indrajithi/tiny-web-crawler", packages=find_packages( - include=['tiny_web_crawler', 'tiny_web_crawler.*', 'crawler', 'crawler.*']), - install_requires=[ - 'validators', - 'beautifulsoup4', - 'lxml', - 'colorama', - 'requests' - ], + include=["tiny_web_crawler", + "tiny_web_crawler.*", "crawler", "crawler.*"] + ), + install_requires=["validators", "beautifulsoup4", + "lxml", "colorama", "requests"], extras_require={ - 'dev': [ - 'mypy', - 'pytest', - 'responses' - ], + "dev": ["mypy", "pytest", "responses", "pylint"], }, classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 
'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Internet :: WWW/HTTP :: Indexing/Search' + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", ], - keywords='web crawler, scraping, web scraping, python crawler, SEO, data extraction', + keywords="web crawler, scraping, web scraping, python crawler, SEO, data extraction", project_urls={ - 'Documentation': 'https://github.com/indrajithi/tiny-web-crawler#readme', - 'Source': 'https://github.com/indrajithi/tiny-web-crawler', - 'Tracker': 'https://github.com/indrajithi/tiny-web-crawler/issues', + "Documentation": "https://github.com/indrajithi/tiny-web-crawler#readme", + "Source": "https://github.com/indrajithi/tiny-web-crawler", + "Tracker": "https://github.com/indrajithi/tiny-web-crawler/issues", }, - python_requires='>=3.6', + python_requires=">=3.6", entry_points={ - 'console_scripts': [ - 'tiny-web-crawler=tiny_web_crawler.crawler:main', + "console_scripts": [ + "tiny-web-crawler=tiny_web_crawler.crawler:main", ], }, ) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 404ff08..1578369 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -1,49 +1,58 @@ -import requests -import pytest -import urllib.request -from unittest.mock import patch, MagicMock, mock_open -from crawler.crawler import Spider +from unittest.mock import MagicMock, mock_open, patch + import responses +from crawler.crawler import Spider + def test_is_valid_url() -> None: assert Spider.is_valid_url("http://example.com") is True - # assert Spider.is_valid_url('invalid') is False + assert Spider.is_valid_url('invalid') is False def test_format_url() -> None: spider = Spider("http://example.com", 10) assert spider.format_url( "/test", "http://example.com") == "http://example.com/test" - assert spider.format_url("http://example.com/test", - "http://example.com") == "http://example.com/test" + assert ( + spider.format_url("http://example.com/test", "http://example.com") + == "http://example.com/test" + ) @responses.activate def test_fetch_url() -> None: - responses.add(responses.GET, 'http://example.com', - body="link", status=200) - spider = Spider(root_url='http://example.com', max_links=2) - resp = spider.fetch_url('http://example.com') + responses.add( + responses.GET, + "http://example.com", + body="link", + status=200, + ) + spider = Spider(root_url="http://example.com", max_links=2) + resp = spider.fetch_url("http://example.com") assert resp is not None - assert resp.text == 'link' + assert resp.text == "link" @responses.activate def test_crawl() -> None: # Mock HTTP response - responses.add(responses.GET, 'http://example.com', - body="link", - status=200, - content_type='text/html') + responses.add( + responses.GET, + "http://example.com", + body="link", + status=200, + content_type="text/html", + ) spider = Spider("http://example.com", 10) spider.crawl("http://example.com") assert "http://example.com" in 
spider.crawl_result assert spider.crawl_result["http://example.com"]["urls"] == [ - "http://example.com/test"] + "http://example.com/test" + ] @responses.activate @@ -57,16 +66,19 @@ def test_save_results() -> None: mocked_file.assert_called_once_with("out.json", "w") -@patch.object(Spider, 'crawl') -@patch.object(Spider, 'save_results') +@patch.object(Spider, "crawl") +@patch.object(Spider, "save_results") def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None: spider = Spider("http://example.com", 10) mock_crawl.side_effect = lambda url: spider.crawl_result.update( - {url: {'urls': ['http://example.com/test']}}) + {url: {"urls": ["http://example.com/test"]}} + ) + print(mock_save_results) spider.start() assert mock_crawl.call_count == 1 assert "http://example.com" in spider.crawl_result assert spider.crawl_result["http://example.com"]["urls"] == [ - 'http://example.com/test'] + "http://example.com/test" + ] diff --git a/tiny_web_crawler/crawler.py b/tiny_web_crawler/crawler.py index 97a1604..8df822b 100644 --- a/tiny_web_crawler/crawler.py +++ b/tiny_web_crawler/crawler.py @@ -1,71 +1,106 @@ -# -*- coding: utf-8 -*- -# filename: crawler.py -# Author: Indrajith Indraprastham -# License: GPL v3: http://www.gnu.org/licenses/ - -# -------------------------------------------------------------------------------- -# README -# -------------------------------------------------------------------------------- -# Install Requirements -# pip install validators beautifulsoup4 lxml colorama - -# Python version: Python 3.6.3 :: Anaconda, Inc. - from __future__ import annotations -from bs4 import BeautifulSoup -import requests import json import urllib.parse +from typing import Dict, List, Optional, Set + +import requests import validators -from colorama import init, Fore, Style -from typing import Optional, Set +from bs4 import BeautifulSoup +from colorama import Fore, Style, init -# Initialize colorama init(autoreset=True) -class Spider: - def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None: +DEFAULT_SCHEME: str = 'http://' + + +class Spider(): + """ + A simple web crawler class. + + Attributes: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + crawl_result (Dict[str, Dict[str, List[str]]]): The dictionary storing the crawl results. + crawl_set (Set[str]): A set of URLs to be crawled. + link_count (int): The current count of crawled links. + save_to_file (Optional[str]): The file path to save the crawl results. + """ + + def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str] = None) -> None: + """ + Initializes the Spider class. + + Args: + root_url (str): The root URL to start crawling from. + max_links (int): The maximum number of links to crawl. + save_to_file (Optional[str]): The file to save the crawl results to. + """ self.root_url: str = root_url self.max_links: int = max_links - self.crawl_result: dict[str, dict[str, list]] = {} - self.crawl_set: set = set() + self.crawl_result: Dict[str, Dict[str, List[str]]] = {} + self.crawl_set: Set[str] = set() self.link_count: int = 0 - self.default_scheme: str = 'http://' self.save_to_file: Optional[str] = save_to_file - self.scheme: str = self.default_scheme + self.scheme: str = DEFAULT_SCHEME def fetch_url(self, url: str) -> Optional[BeautifulSoup]: """ - Reads the content of a url, parses it using BeautifulSoup with lxml parser. 
+ Reads the content of a URL and parses it using BeautifulSoup with lxml parser. + + Args: + url (str): The URL to fetch and parse. + + Returns: + Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, None otherwise. """ + try: - with requests.get(url) as response: - data = response.text + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx) + data = response.text return BeautifulSoup(data, 'lxml') - except Exception as e: - print(Fore.RED + f"Unable to fetch url: {url}, Error: {e}") - return None + except requests.exceptions.HTTPError as http_err: + print(Fore.RED + f"HTTP error occurred: {http_err}") + except requests.exceptions.ConnectionError as conn_err: + print(Fore.RED + f"Connection error occurred: {conn_err}") + except requests.exceptions.Timeout as timeout_err: + print(Fore.RED + f"Timeout error occurred: {timeout_err}") + except requests.exceptions.RequestException as req_err: + print(Fore.RED + f"Request error occurred: {req_err}") + return None @staticmethod def is_valid_url(url: str) -> bool: """ - Returns True for a valid url, False for an invalid url. + Checks if the provided URL is valid. + + Args: + url (str): The URL to validate. + + Returns: + bool: True if the URL is valid, False otherwise. """ return bool(validators.url(url)) def save_results(self) -> None: """ - Saves results into a json file. + Saves the crawl results into a JSON file. """ if self.save_to_file: - with open(self.save_to_file, 'w') as file: + with open(self.save_to_file, 'w', encoding='utf-8') as file: json.dump(self.crawl_result, file, indent=4) def format_url(self, url: str, base_url: str) -> str: """ - Removes any query, params, tag-id reference in the urls. - Adds base_url to url if it is a relative link (link to the same domain). + Formats a URL to ensure it is absolute and removes any query parameters or fragments. + + Args: + url (str): The URL to format. + base_url (str): The base URL to resolve relative URLs. + + Returns: + str: The formatted URL. """ parsed_url = urllib.parse.urlparse(url) base_url = base_url.rstrip('/') @@ -74,17 +109,23 @@ def format_url(self, url: str, base_url: str) -> str: self.scheme = parsed_url.scheme if not parsed_url.scheme and not parsed_url.netloc: - if self.is_valid_url(self.default_scheme + parsed_url.path): - return self.default_scheme + parsed_url.path + if self.is_valid_url(DEFAULT_SCHEME + parsed_url.path): + return DEFAULT_SCHEME + parsed_url.path if parsed_url.path.startswith('/'): return base_url + parsed_url.path - else: - return f"{base_url}/{parsed_url.path}" + + return f"{base_url}/{parsed_url.path}" return f"{self.scheme}://{parsed_url.netloc}{parsed_url.path}" def crawl(self, url: str) -> None: + """ + Crawls a given URL, extracts links, and adds them to the crawl results. + + Args: + url (str): The URL to crawl. + """ if not self.is_valid_url(url): print(Fore.RED + f"Invalid url to crawl: {url}") return @@ -98,7 +139,7 @@ def crawl(self, url: str) -> None: if not soup: return - links = soup.body.find_all('a', href=True) if soup.body else [] + links = soup.body.find_all('a', href=True) self.crawl_result[url] = {'urls': []} for link in links: @@ -118,11 +159,12 @@ def crawl(self, url: str) -> None: self.link_count += 1 print(Fore.GREEN + f"Links crawled: {self.link_count}") - def start(self) -> dict[str, dict[str, list]]: + def start(self) -> Dict[str, Dict[str, List[str]]]: """ - Start crawling from the root_url. Crawls up to max_links urls. 
- After each crawl, urls found are added to the crawl_set, - next url to crawl is taken from this set. + Starts the crawling process from the root URL. Crawls up to max_links URLs. + + Returns: + Dict[str, Dict[str, List[str]]]: The crawl results. """ self.crawl(self.root_url) @@ -136,6 +178,9 @@ def start(self) -> dict[str, dict[str, list]]: def main() -> None: + """ + The main function to initialize and start the crawler. + """ root_url = 'http://github.com' max_links = 2
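
A brief usage sketch of the refactored crawler, mirroring the main() shown above; the import path follows the package layout and the tiny-web-crawler=tiny_web_crawler.crawler:main entry point in setup.py, and the output filename is illustrative:

    from tiny_web_crawler.crawler import Spider

    # Crawl from the root URL, follow at most 2 links, and write the
    # {url: {"urls": [...]}} result mapping to out.json via save_results().
    spider = Spider("http://github.com", max_links=2, save_to_file="out.json")
    results = spider.start()
    print(results)

To reproduce the CI lint step locally, the commands from the updated workflow can be run as-is: pip install pylint==3.0.2, pip install '.[dev]', pip install ., then pylint $(git ls-files '*.py'), with the new .pylintrc picked up from the repository root.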