diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..8e9a28a
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+extend-ignore = E131, E722, F401, F403
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index af76db2..819b132 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -48,11 +48,11 @@ jobs:
         # If you wish to specify custom queries, you can do so here or in a config file.
         # By default, queries listed here will override any specified in a config file.
         # Prefix the list here with "+" to use these queries and those in the config file.
-
+
         # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
         # queries: security-extended,security-and-quality
-
+
     # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
@@ -61,7 +61,7 @@
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
 
-    # If the Autobuild fails above, remove it and uncomment the following three lines. 
+    # If the Autobuild fails above, remove it and uncomment the following three lines.
     # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
 
     # - run: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..5cce972
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,32 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+  - id: check-added-large-files
+  - id: check-executables-have-shebangs
+  - id: check-shebang-scripts-are-executable
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+- repo: https://github.com/pre-commit/mirrors-mypy
+  rev: v1.4.0
+  hooks:
+  - id: mypy
+    additional_dependencies:
+    - types-requests
+- repo: https://github.com/pylint-dev/pylint
+  rev: v3.0.0a6
+  hooks:
+  - id: pylint
+    additional_dependencies:
+    - flask
+    - Flask-SQLAlchemy
+    - requests
+    - beautifulsoup4
+- repo: https://github.com/PyCQA/flake8
+  rev: 6.0.0
+  hooks:
+  - id: flake8
+    additional_dependencies:
+    - flake8-import-order
diff --git a/.pylintrc b/.pylintrc
index 9152531..f8b632c 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,8 +1,8 @@
 [GENERAL]
-fail-under=9.0
+fail-under=10.0
 
 [MESSAGES CONTROL]
-disable=R0903
+disable=C,R,W0511,W0702,W0718
 
 [TYPECHECK]
 generated-members=flask_sqlalchemy.SQLAlchemy.DateTime,
diff --git a/argostime.example.conf b/argostime.example.conf
index 914413b..e8b4ef7 100644
--- a/argostime.example.conf
+++ b/argostime.example.conf
@@ -6,4 +6,3 @@ user = argostime_user
 password = p@ssw0rd
 server = localhost
 database = argostime
-
diff --git a/argostime/__init__.py b/argostime/__init__.py
index 2cbbfc1..3c60c79 100644
--- a/argostime/__init__.py
+++ b/argostime/__init__.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     __init__.py
 
@@ -25,28 +24,32 @@ logging.basicConfig(
     filename="argostime.log",
     level=logging.DEBUG,
-    format="%(asctime)s - %(processName)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s"
+    format="%(asctime)s - %(processName)s - %(levelname)s - %(module)s - "
+           "%(funcName)s - %(message)s"
 )
 
-import os.path
+import configparser  # noqa: I100, I202, E402
+import os.path  # noqa: E402
 
-import configparser
+from flask import Flask  # noqa: E402
 
-from flask import Flask
-from flask_sqlalchemy import SQLAlchemy
+from flask_sqlalchemy import SQLAlchemy  # noqa: E402
 
 db: SQLAlchemy = SQLAlchemy()
 
+
 def get_current_commit() -> str:
     """Return the hexadecimal hash of the current running commit."""
-    git_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.git"))
-    with open(os.path.join(git_path, "HEAD"), "r") as file_head:
+    gp = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.git"))
+    with open(os.path.join(gp, "HEAD"), "r", encoding="utf-8") as file_head:
         hexsha = file_head.read().strip()
         while hexsha.startswith("ref: "):
-            with open(os.path.join(git_path, hexsha[5:])) as file_ref:
+            with open(os.path.join(gp, hexsha[5:]),
+                      "r", encoding="utf-8") as file_ref:
                 hexsha = file_ref.read().strip()
     return hexsha
 
+
 def create_app():
     """Return a flask object for argostime, initialize logger and db."""
     logging.getLogger("matplotlib.font_manager").disabled = True
@@ -58,7 +61,9 @@ def create_app():
     logging.debug("Found sections %s in config", config.sections())
 
     if "mariadb" in config:
-        app.config["SQLALCHEMY_DATABASE_URI"] = "mysql+pymysql://{user}:{password}@{server}/{database}?charset=utf8mb4".format(
+        app.config["SQLALCHEMY_DATABASE_URI"] = \
+            "mysql+pymysql://{user}:{password}@{server}/{database}" \
+            "?charset=utf8mb4".format(
             user=config["mariadb"]["user"],
             password=config["mariadb"]["password"],
             server=config["mariadb"]["server"],
@@ -74,6 +79,6 @@ def create_app():
     db.init_app(app)
 
     with app.app_context():
-        from . import routes
+        from . import routes  # pylint: disable=W0611
         db.create_all()
         return app
diff --git a/argostime/crawler/__init__.py b/argostime/crawler/__init__.py
index 702b5cb..27e9083 100644
--- a/argostime/crawler/__init__.py
+++ b/argostime/crawler/__init__.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     crawler/__init__.py
 
@@ -22,6 +21,6 @@
     along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
 """
 
-from argostime.crawler.crawl_utils import CrawlResult, enabled_shops
 from argostime.crawler.crawl_url import crawl_url
+from argostime.crawler.crawl_utils import CrawlResult, enabled_shops
 from argostime.crawler.shop import *
diff --git a/argostime/crawler/crawl_url.py b/argostime/crawler/crawl_url.py
index 8095e87..78a4247 100644
--- a/argostime/crawler/crawl_url.py
+++ b/argostime/crawler/crawl_url.py
@@ -1,9 +1,8 @@
-#!/usr/bin/env python3
 """
     crawler/crawl_url.py
 
-    Crawler function exposed to the rest of the system to get pricing and product
-    information from a given URL.
+    Crawler function exposed to the rest of the system to get pricing and
+    product information from a given URL.
 
     Copyright (c) 2022 Martijn
     Copyright (c) 2022 Kevin
@@ -27,9 +26,8 @@
 import logging
 import urllib.parse
 
-from argostime.exceptions import WebsiteNotImplementedException
-
 from argostime.crawler.crawl_utils import CrawlResult, enabled_shops
+from argostime.exceptions import WebsiteNotImplementedException
 
 
 def crawl_url(url: str) -> CrawlResult:
@@ -47,8 +45,9 @@ def crawl_url(url: str) -> CrawlResult:
     if hostname not in enabled_shops:
         raise WebsiteNotImplementedException(url)
 
-    # Note: This is a function call! The called function is the corresponding crawler
-    # registered using the "@register_crawler" decorator in the "shop" directory.
+    # Note: This is a function call! The called function is the corresponding
+    # crawler registered using the "@register_crawler" decorator in the "shop"
+    # directory.
     result: CrawlResult = enabled_shops[hostname]["crawler"](url)
 
     result.check()
diff --git a/argostime/crawler/crawl_utils.py b/argostime/crawler/crawl_utils.py
index bbefc4f..2b4c285 100644
--- a/argostime/crawler/crawl_utils.py
+++ b/argostime/crawler/crawl_utils.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     crawler/crawl_utils.py
 
@@ -36,7 +35,9 @@
 
 
 class CrawlResult:
-    """Data structure for returning the results of a crawler in a uniform way."""
+    """
+    Data structure for returning the results of a crawler in a uniform way.
+    """
 
     url: Optional[str]
     product_name: Optional[str]
@@ -50,15 +51,15 @@ class CrawlResult:
 
     def __init__(
             self,
-            url: Optional[str]=None,
-            product_name: Optional[str]=None,
-            product_description: Optional[str]=None,
-            product_code: Optional[str]=None,
-            normal_price: float=-1.0,
-            discount_price: float=-1.0,
-            on_sale: bool=False,
-            ean: Optional[int]=None,
-        ):
+            url: Optional[str] = None,
+            product_name: Optional[str] = None,
+            product_description: Optional[str] = None,
+            product_code: Optional[str] = None,
+            normal_price: float = -1.0,
+            discount_price: float = -1.0,
+            on_sale: bool = False,
+            ean: Optional[int] = None,
+            ):
         self.url = url
         self.product_name = product_name
         self.product_description = product_description
@@ -72,7 +73,8 @@ def __str__(self) -> str:
         string = f"CrawlResult(product_name={self.product_name},"\
             f"product_description={self.product_description},"\
             f"product_code={self.product_code},price={self.normal_price},"\
-            f"discount={self.discount_price},sale={self.on_sale},ean={self.ean}"
+            f"discount={self.discount_price},sale={self.on_sale}," \
+            f"ean={self.ean}"
 
         return string
 
@@ -100,24 +102,28 @@ def check(self) -> None:
         if self.discount_price < 0 and self.on_sale:
             raise CrawlerException("No discount price given for item on sale!")
         if self.normal_price < 0 and not self.on_sale:
-            raise CrawlerException("No normal price given for item not on sale!")
+            raise CrawlerException(
+                "No normal price given for item not on sale!")
 
 
 CrawlerFunc = Callable[[str], CrawlResult]
-ShopDict = TypedDict("ShopDict", {"name": str, "hostname": str, "crawler": CrawlerFunc})
+ShopDict = TypedDict("ShopDict", {"name": str, "hostname": str,
+                                  "crawler": CrawlerFunc})
 enabled_shops: Dict[str, ShopDict] = {}
 
 
-def register_crawler(name: str, host: str, use_www: bool = True) -> Callable[[CrawlerFunc], None]:
+def register_crawler(name: str, host: str, use_www: bool = True) \
+        -> Callable[[CrawlerFunc], None]:
     """Decorator to register a new crawler function."""
     def decorate(func: Callable[[str], CrawlResult]) -> None:
         """
-        This function will be called when you put the "@register_crawler" decorator above
-        a function defined in a file in the "shop" directory! The argument will be the
-        function above which you put the decorator.
+        This function will be called when you put the "@register_crawler"
+        decorator above a function defined in a file in the "shop" directory!
+        The argument will be the function above which you put the decorator.
""" - if "argostime" in __config and "disabled_shops" in __config["argostime"]: + if "argostime" in __config and \ + "disabled_shops" in __config["argostime"]: if host in __config["argostime"]["disabled_shops"]: logging.debug("Shop %s is disabled", host) return @@ -137,7 +143,9 @@ def decorate(func: Callable[[str], CrawlResult]) -> None: def parse_promotional_message(message: str, price: float) -> float: - """Parse a given promotional message, and returns the calculated effective price. + """ + Parse a given promotional message, and returns the calculated effective + price. For example "1+1 GRATIS" will be parsed to meaning a 50% discount. "2+1 GRATIS" will be parsed to mean a 33% discount, and will return 2/3. @@ -152,7 +160,8 @@ def parse_promotional_message(message: str, price: float) -> float: message_no_whitespace = message_no_whitespace.lower() - logging.debug("Promotion yielded sanitized input %s", message_no_whitespace) + logging.debug("Promotion yielded sanitized input %s", + message_no_whitespace) if message_no_whitespace == "1+1gratis": return 1/2 * price @@ -187,8 +196,9 @@ def parse_promotional_message(message: str, price: float) -> float: return float(msg_split[1]) return float(msg_split[1]) / float(msg_split[0]) except ArithmeticError as exception: - logging.error("Calculation error parsing %s %s", message_no_whitespace, exception) - except IndexError as exception: + logging.error("Calculation error parsing %s %s", + message_no_whitespace, exception) + except IndexError: logging.error("IndexError in message %s", message_no_whitespace) logging.error("Promotion text did not match any known promotion") diff --git a/argostime/crawler/shop/__init__.py b/argostime/crawler/shop/__init__.py index f2e6352..fe71a9b 100644 --- a/argostime/crawler/shop/__init__.py +++ b/argostime/crawler/shop/__init__.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/__init__.py @@ -22,10 +21,11 @@ along with Argostimè. If not, see . """ -from os.path import dirname, basename, isfile, join import glob +from os.path import basename, dirname, isfile, join -# Load all modules in the current directory, based on the answer from Anurag Uniyal: -# https://stackoverflow.com/questions/1057431/how-to-load-all-modules-in-a-folder +# Load all modules in the current directory, based on the answer from +# Anurag Uniyal: https://stackoverflow.com/q/1057431 modules = glob.glob(join(dirname(__file__), "*.py")) -__all__ = [basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')] +__all__ = [basename(f)[:-3] for f in modules if isfile(f) and + not f.endswith('__init__.py')] diff --git a/argostime/crawler/shop/ah.py b/argostime/crawler/shop/ah.py index c27e6ae..39ccf07 100644 --- a/argostime/crawler/shop/ah.py +++ b/argostime/crawler/shop/ah.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/ah.py @@ -22,33 +21,35 @@ along with Argostimè. If not, see . 
""" -from datetime import date import json import logging +from datetime import date -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import \ + CrawlResult, parse_promotional_message, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, parse_promotional_message, register_crawler +from bs4 import BeautifulSoup + +import requests @register_crawler("Albert Heijn", "ah.nl") def crawl_ah(url: str) -> CrawlResult: """Crawler for ah.nl""" - response: requests.Response = requests.get(url) + response: requests.Response = requests.get(url, timeout=10) if response.status_code != 200: - logging.error("Got status code %d while getting url %s", response.status_code, url) + logging.error("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") raw_json_match = soup.find( "script", - attrs={ "type": "application/ld+json", "data-react-helmet": "true"} + attrs={"type": "application/ld+json", "data-react-helmet": "true"} ) result: CrawlResult = CrawlResult(url=url) @@ -56,7 +57,8 @@ def crawl_ah(url: str) -> CrawlResult: try: product_dict = json.loads(raw_json_match.text) except json.decoder.JSONDecodeError as exception: - logging.error("Could not decode JSON %s, raising CrawlerException", raw_json_match) + logging.error("Could not decode JSON %s, raising CrawlerException", + raw_json_match) raise CrawlerException from exception except Exception as exception: logging.error( @@ -86,7 +88,8 @@ def crawl_ah(url: str) -> CrawlResult: try: offer = product_dict["offers"] except KeyError as exception: - logging.error("Could not find a valid offer in the json %s", product_dict) + logging.error("Could not find a valid offer in the json %s", + product_dict) raise CrawlerException from exception if "validFrom" in offer.keys(): @@ -111,13 +114,15 @@ def crawl_ah(url: str) -> CrawlResult: # Try to find a promotional message promo_text_matches = soup.find_all( "p", - attrs={ "class" :lambda x: x and x.startswith("promo-sticker-text") } + attrs={"class": lambda x: + x and x.startswith("promo-sticker-text")} ) if len(promo_text_matches) == 0: promo_text_matches = soup.find_all( "div", - attrs={ "class" :lambda x: x and x.startswith("promo-sticker_content") } + attrs={"class": lambda x: + x and x.startswith("promo-sticker_content")} ) promotion_message: str = "" @@ -130,8 +135,8 @@ def crawl_ah(url: str) -> CrawlResult: price: float = float(offer["price"]) - # If there is a mark with for example "25% Korting", this is already calculated into - # the price we got from the json. + # If there is a mark with for example "25% Korting", this is + # already calculated into the price we got from the json. 
if "korting" not in message_no_whitespace: promotion = parse_promotional_message(promotion_message, price) else: @@ -157,7 +162,8 @@ def crawl_ah(url: str) -> CrawlResult: try: result.normal_price = float(product_dict["offers"]["price"]) except KeyError as inner_exception: - logging.error("Couldn't even find a normal price in %s", product_dict) + logging.error("Couldn't even find a normal price in %s", + product_dict) raise CrawlerException from inner_exception return result diff --git a/argostime/crawler/shop/brandzaak.py b/argostime/crawler/shop/brandzaak.py index 050cc24..79f1f5c 100644 --- a/argostime/crawler/shop/brandzaak.py +++ b/argostime/crawler/shop/brandzaak.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/brandzaak.py @@ -23,34 +22,35 @@ along with Argostimè. If not, see . """ - import logging -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler +from bs4 import BeautifulSoup + +import requests @register_crawler("Brandzaak", "brandzaak.nl") def crawl_brandzaak(url: str) -> CrawlResult: """Parse a product from brandzaak.nl""" - response = requests.get(url) + response = requests.get(url, timeout=10) if response.status_code != 200: - logging.error("Got status code %d while getting url %s", response.status_code, url) + logging.error("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") result: CrawlResult = CrawlResult(url=url) - product_title = soup.find("meta", attrs={ "name": "title"}) - product_price = soup.find("meta", attrs={ "property": "product:price:amount"}) + product_title = soup.find("meta", attrs={"name": "title"}) + product_price = soup.find("meta", + attrs={"property": "product:price:amount"}) try: result.product_name = product_title['content'] diff --git a/argostime/crawler/shop/ekoplaza.py b/argostime/crawler/shop/ekoplaza.py index 3adb746..2f2721b 100644 --- a/argostime/crawler/shop/ekoplaza.py +++ b/argostime/crawler/shop/ekoplaza.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/ekoplaza.py @@ -24,12 +23,11 @@ import logging -import requests - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler +import requests @register_crawler("Ekoplaza", "ekoplaza.nl") diff --git a/argostime/crawler/shop/etos.py b/argostime/crawler/shop/etos.py index 0c002a5..3ab405d 100644 --- a/argostime/crawler/shop/etos.py +++ b/argostime/crawler/shop/etos.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/etos.py @@ -26,14 +25,14 @@ import logging from typing import Dict -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler +from argostime.crawler.crawl_utils import parse_promotional_message from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler -from argostime.crawler.crawl_utils import parse_promotional_message +from bs4 import BeautifulSoup + +import requests @register_crawler("Etos", "etos.nl") @@ -41,7 +40,8 @@ def 
crawl_etos(url: str) -> CrawlResult: """Crawler for etos.nl""" headers = { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "nl,en-US;q=0.7,en;q=0.3", "Cache-Control": "no-cache", @@ -53,13 +53,15 @@ def crawl_etos(url: str) -> CrawlResult: "Sec-Fetch-Site": "none", "Sec-Fetch-User": "?1", "Upgrade-Insecure-Requests": "1", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) " + "Gecko/20100101 Firefox/96.0" } response = requests.get(url, timeout=10, headers=headers) if response.status_code != 200: - logging.error("Got status code %d while getting url %s", response.status_code, url) + logging.error("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") @@ -69,9 +71,7 @@ def crawl_etos(url: str) -> CrawlResult: try: raw_product_json = soup.find( "div", - attrs= { - "class": "js-product-detail", - } + attrs={"class": "js-product-detail"} ).get("data-gtm-event") except AttributeError as exception: logging.error("Could not find a product detail json") @@ -80,7 +80,8 @@ def crawl_etos(url: str) -> CrawlResult: try: product_dict = json.loads(raw_product_json) except json.decoder.JSONDecodeError as exception: - logging.error("Could not decode JSON %s, raising CrawlerException", raw_product_json) + logging.error("Could not decode JSON %s, raising CrawlerException", + raw_product_json) raise CrawlerException from exception logging.debug(product_dict) @@ -90,7 +91,8 @@ def crawl_etos(url: str) -> CrawlResult: try: result.product_name = offer["name"] except KeyError as exception: - logging.error("No key name found in json %s parsed as %s", raw_product_json, product_dict) + logging.error("No key name found in json %s parsed as %s", + raw_product_json, product_dict) raise CrawlerException from exception try: @@ -111,9 +113,10 @@ def crawl_etos(url: str) -> CrawlResult: result.on_sale = True else: # Couldn't parse the promotion! 
- logging.info("Couldn't parse promotion %s, assuming no discount", promotion_message) + logging.info("Couldn't parse promotion %s, assuming no discount", + promotion_message) result.normal_price = price - except KeyError as exception: + except KeyError: logging.debug("No promotion found, assuming no discount") try: result.normal_price = price diff --git a/argostime/crawler/shop/hema.py b/argostime/crawler/shop/hema.py index 858ee1b..d5ede1b 100644 --- a/argostime/crawler/shop/hema.py +++ b/argostime/crawler/shop/hema.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/hema.py @@ -28,13 +27,13 @@ import re from typing import Optional -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler +from bs4 import BeautifulSoup + +import requests @register_crawler("HEMA", "hema.nl") @@ -44,7 +43,8 @@ def crawl_hema(url: str) -> CrawlResult: response: requests.Response = requests.get(url, timeout=10) if response.status_code != 200: - logging.error("Got status code %d while getting url %s", response.status_code, url) + logging.error("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") @@ -70,27 +70,34 @@ def crawl_hema(url: str) -> CrawlResult: try: product_dict = json.loads(raw_json) except json.decoder.JSONDecodeError as exception: - logging.error("Could not decode JSON %s, raising CrawlerException", raw_json) + logging.error("Could not decode JSON %s, raising CrawlerException", + raw_json) raise CrawlerException from exception logging.debug(product_dict) try: - result.product_name = product_dict["ecommerce"]["detail"]["products"][0]["name"] + result.product_name = \ + product_dict["ecommerce"]["detail"]["products"][0]["name"] except KeyError as exception: - logging.error("Could not find product name in %s via %s", raw_json, url) + logging.error("Could not find product name in %s via %s", + raw_json, url) raise CrawlerException from exception try: - result.product_code = product_dict["ecommerce"]["detail"]["products"][0]["id"] + result.product_code = \ + product_dict["ecommerce"]["detail"]["products"][0]["id"] except KeyError as exception: - logging.error("Could not find product code in %s via %s", raw_json, url) + logging.error("Could not find product code in %s via %s", + raw_json, url) raise CrawlerException from exception try: - result.normal_price = float(product_dict["ecommerce"]["detail"]["products"][0]["price"]) - except KeyError as exception: - logging.error("Could not find a valid price in %s via %s", raw_json, url) + result.normal_price = \ + float(product_dict["ecommerce"]["detail"]["products"][0]["price"]) + except KeyError: + logging.error("Could not find a valid price in %s via %s", + raw_json, url) result.normal_price = -1 return result diff --git a/argostime/crawler/shop/ikea.py b/argostime/crawler/shop/ikea.py index 88cf03b..72342e4 100644 --- a/argostime/crawler/shop/ikea.py +++ b/argostime/crawler/shop/ikea.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/ikea.py @@ -25,13 +24,13 @@ import logging import re -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import 
PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler +from bs4 import BeautifulSoup + +import requests @register_crawler("IKEA", "ikea.com") @@ -43,14 +42,15 @@ def crawl_ikea(url: str) -> CrawlResult: # pylint: disable=R0915 response: requests.Response = requests.get(url, timeout=10) if response.status_code != 200: - logging.error("Got status code %d while getting url %s", response.status_code, url) + logging.error("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") info_wrapper = soup.find( "div", - id= re.compile("buy-module-content") + id=re.compile("buy-module-content") ) try: @@ -61,35 +61,36 @@ def crawl_ikea(url: str) -> CrawlResult: # pylint: disable=R0915 try: result.product_name = info_wrapper.find( ["span", "div"], - class_= re.compile("header-section__title--big") + class_=re.compile("header-section__title--big") ).text except Exception as exception: - logging.error("Could not find a name in %s %s", info_wrapper, exception) + logging.error("Could not find a name in %s %s", + info_wrapper, exception) raise CrawlerException from exception try: result.product_description = info_wrapper.find( "span", - class_= re.compile("header-section__description-text") + class_=re.compile("header-section__description-text") ).text - except Exception as exception: + except: logging.error("Could not find a description in %s", info_wrapper) - try: result.product_code = soup.find( "span", - class_= re.compile("product-identifier__value") + class_=re.compile("product-identifier__value") ).text except Exception as exception: - logging.error("Could not find a product code in %s %s", info_wrapper, exception) + logging.error("Could not find a product code in %s %s", + info_wrapper, exception) raise CrawlerException from exception try: # Todo: Verify if this is needed with discounted product page... 
         price_tag_prev = info_wrapper.find(
             "div",
-            class_= re.compile("price-package__previous-price-hasStrikeThrough")
+            class_=re.compile("price-package__previous-price-hasStrikeThrough")
         )
 
         if not price_tag_prev:
@@ -110,7 +111,7 @@ def crawl_ikea(url: str) -> CrawlResult:  # pylint: disable=R0915
             decimals = float(
                 price_tag_prev.find(
                     "span",
-                    class_= re.compile("price__decimal")
+                    class_=re.compile("price__decimal")
                 ).text)
         except Exception as exception:
             logging.debug("No decimals found, assuming 0 %s", exception)
@@ -123,7 +124,7 @@ def crawl_ikea(url: str) -> CrawlResult:  # pylint: disable=R0915
     try:
         price_tag_curr = info_wrapper.find(
             # "div",
-            class_= re.compile("price-module__current-price")
+            class_=re.compile("price-module__current-price")
         )
 
         integers = float(
@@ -131,14 +132,14 @@ def crawl_ikea(url: str) -> CrawlResult:  # pylint: disable=R0915
                 ".-", "",
                 price_tag_curr.find(
                     "span",
-                    class_= re.compile("price__integer")
+                    class_=re.compile("price__integer")
                 ).text))
 
         try:
             decimals = float(
                 price_tag_curr.find(
                     "span",
-                    class_= re.compile("price__decimal")
+                    class_=re.compile("price__decimal")
                 ).text)
         except Exception as exception:
             logging.debug("No decimals found, assuming 0 %s", exception)
@@ -150,7 +151,8 @@ def crawl_ikea(url: str) -> CrawlResult:  # pylint: disable=R0915
         else:
             result.normal_price = integers + decimals
     except Exception as exception:
-        logging.error("No current price found in %s %s", info_wrapper, exception)
+        logging.error("No current price found in %s %s",
+                      info_wrapper, exception)
         raise CrawlerException from exception
 
     return result
diff --git a/argostime/crawler/shop/intergamma.py b/argostime/crawler/shop/intergamma.py
index 431bc13..50a81bc 100644
--- a/argostime/crawler/shop/intergamma.py
+++ b/argostime/crawler/shop/intergamma.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     crawler/shop/intergamma.py
 
@@ -24,13 +23,13 @@
 
 import logging
 
-import requests
-from bs4 import BeautifulSoup
-
+from argostime.crawler.crawl_utils import CrawlResult, register_crawler
 from argostime.exceptions import CrawlerException
 from argostime.exceptions import PageNotFoundException
 
-from argostime.crawler.crawl_utils import CrawlResult, register_crawler
+from bs4 import BeautifulSoup
+
+import requests
 
 
 def crawl_intergamma(url: str) -> CrawlResult:
@@ -38,7 +37,8 @@ def crawl_intergamma(url: str) -> CrawlResult:
     response: requests.Response = requests.get(url, timeout=10)
 
     if response.status_code != 200:
-        logging.error("Got status code %s while getting url %s", response.status_code, url)
+        logging.error("Got status code %s while getting url %s",
+                      response.status_code, url)
         raise PageNotFoundException(url)
 
     # Use UTF-8 encoding instead of ISO-8859-1
@@ -74,7 +74,8 @@ def crawl_intergamma(url: str) -> CrawlResult:
             itemtype="http://schema.org/Product"
         )["data-product-code"]
     except Exception as exception:
-        logging.error("Could not find a product code, raising CrawlerException")
+        logging.error("Could not find a product code, "
+                      "raising CrawlerException")
         logging.debug("Got exception: %s", exception)
         raise CrawlerException from exception
diff --git a/argostime/crawler/shop/jumbo.py b/argostime/crawler/shop/jumbo.py
index 2cf797a..c500f30 100644
--- a/argostime/crawler/shop/jumbo.py
+++ b/argostime/crawler/shop/jumbo.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     crawler/shop/jumbo.py
 
@@ -25,13 +24,13 @@
 import json
 import logging
 
-import requests
-from bs4 import BeautifulSoup
-
+from argostime.crawler.crawl_utils import CrawlResult, register_crawler
 from argostime.exceptions import CrawlerException
 from argostime.exceptions import PageNotFoundException
 
-from argostime.crawler.crawl_utils import CrawlResult, register_crawler
+from bs4 import BeautifulSoup
+
+import requests
 
 
 @register_crawler("Jumbo", "jumbo.com")
@@ -42,7 +41,8 @@ def crawl_jumbo(url: str) -> CrawlResult:
     """
     headers = {
         "Referer": "https://www.jumbo.com",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
+                  "image/avif,image/webp,*/*;q=0.8",
         "Accept-Encoding": "gzip, deflate, br",
         "Accept-Language": "nl,en-US;q=0.7,en;q=0.3",
         "Cache-Control": "no-cache",
@@ -54,17 +54,20 @@ def crawl_jumbo(url: str) -> CrawlResult:
         "Sec-Fetch-Site": "none",
         "Sec-Fetch-User": "?1",
         "Upgrade-Insecure-Requests": "1",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0"
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) "
+                      "Gecko/20100101 Firefox/96.0"
     }
 
     response = requests.get(url, timeout=10, headers=headers)
 
     if response.status_code != 200:
-        logging.error("Got status code %d while getting url %s", response.status_code, url)
+        logging.error("Got status code %d while getting url %s",
+                      response.status_code, url)
         raise PageNotFoundException(url)
 
     soup = BeautifulSoup(response.text, "html.parser")
 
-    product_json = soup.find("script", attrs={"type": "application/ld+json", "data-n-head": "ssr"})
+    product_json = soup.find("script", attrs={"type": "application/ld+json",
+                                              "data-n-head": "ssr"})
     raw_json = product_json.string
 
     result: CrawlResult = CrawlResult(url=url)
@@ -72,13 +75,15 @@ def crawl_jumbo(url: str) -> CrawlResult:
     try:
         product = json.loads(raw_json)
     except json.decoder.JSONDecodeError as exception:
-        logging.error("Could not decode JSON %s, raising CrawlerException", raw_json)
+        logging.error("Could not decode JSON %s, raising CrawlerException",
+                      raw_json)
         raise CrawlerException from exception
 
     if product["offers"]["@type"] == "AggregateOffer":
         offer = product["offers"]
     else:
-        logging.error("No price info available in %s, raising CrawlerException", raw_json)
+        logging.error("No price info available in %s, "
+                      "raising CrawlerException", raw_json)
         raise CrawlerException()
 
     try:
diff --git a/argostime/crawler/shop/pipashop.py b/argostime/crawler/shop/pipashop.py
index ebb0b9e..2f881a1 100644
--- a/argostime/crawler/shop/pipashop.py
+++ b/argostime/crawler/shop/pipashop.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     crawler/shop/pipashop.py
 
@@ -25,13 +24,13 @@
 import logging
 import re
 
-import requests
-from bs4 import BeautifulSoup
-
+from argostime.crawler.crawl_utils import CrawlResult, register_crawler
 from argostime.exceptions import CrawlerException
 from argostime.exceptions import PageNotFoundException
 
-from argostime.crawler.crawl_utils import CrawlResult, register_crawler
+from bs4 import BeautifulSoup
+
+import requests
 
 
 @register_crawler("Pipa Shop", "pipa-shop.nl")
@@ -46,7 +45,8 @@ def crawl_pipashop(url: str) -> CrawlResult:
     soup = BeautifulSoup(request.text, "html.parser")
 
     try:
-        price = re.sub(r"[^0-9.]", "", soup.select_one("div.product-price").text)
+        price = re.sub(r"[^0-9.]", "",
+                       soup.select_one("div.product-price").text)
         result.product_name = soup.select_one("div.product-title a").text
         result.product_code = url.split("/product/").pop().split("/")[0]
         result.normal_price = float(price)
diff --git a/argostime/crawler/shop/praxis.py b/argostime/crawler/shop/praxis.py
index 162cd4a..a056b17 100644
--- a/argostime/crawler/shop/praxis.py
+++ b/argostime/crawler/shop/praxis.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     crawler/shop/praxis.py
 
@@ -26,13 +25,13 @@
 import logging
 import re
 
-import requests
-from bs4 import BeautifulSoup
-
+from argostime.crawler.crawl_utils import CrawlResult, register_crawler
 from argostime.exceptions import CrawlerException
 from argostime.exceptions import PageNotFoundException
 
-from argostime.crawler.crawl_utils import CrawlResult, register_crawler
+from bs4 import BeautifulSoup
+
+import requests
 
 
 def __fix_bad_json(bad_json: str) -> str:
@@ -45,7 +44,8 @@ def crawl_praxis(url: str) -> CrawlResult:
     response: requests.Response = requests.get(url, timeout=10)
 
     if response.status_code != 200:
-        logging.error("Got status code %s while getting url %s", response.status_code, url)
+        logging.error("Got status code %s while getting url %s",
+                      response.status_code, url)
         raise PageNotFoundException(url)
 
     soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")
@@ -54,17 +54,20 @@ def crawl_praxis(url: str) -> CrawlResult:
     try:
         raw_product_json = soup.find(
             "script",
-            text=lambda value: value and value.startswith("window.__PRELOADED_STATE_productDetailsFragmentInfo__")
+            text=lambda value: value and value.startswith(
+                "window.__PRELOADED_STATE_productDetailsFragmentInfo__")
         ).text.split("=", maxsplit=1)[1].strip()
     except Exception as exception:
-        logging.error("Could not find a product detail JSON, raising CrawlerException")
+        logging.error("Could not find a product detail JSON, "
+                      "raising CrawlerException")
         raise CrawlerException from exception
 
     try:
         json_data = json.loads(__fix_bad_json(raw_product_json))
         product = json_data["productDetails"]
     except json.decoder.JSONDecodeError as exception:
-        logging.error("Could not decode JSON %s, raising CrawlerException", raw_product_json)
+        logging.error("Could not decode JSON %s, raising CrawlerException",
+                      raw_product_json)
         raise CrawlerException from exception
     except KeyError as exception:
         logging.error("No key productDetails found in JSON data")
@@ -90,13 +93,14 @@ def crawl_praxis(url: str) -> CrawlResult:
 
     try:
         result.ean = int(product["ean"])
-    except KeyError as exception:
+    except KeyError:
         # Don't raise an exception since EAN is not strictly necessary!
logging.error("No key ean found in JSON") try: if "discount" in product.keys() and \ - ("discountClass" not in product.keys() or product["discountClass"] != "excludedproducts"): + ("discountClass" not in product.keys() or + product["discountClass"] != "excludedproducts"): result.discount_price = float(product["discount"]["value"]) result.on_sale = True else: diff --git a/argostime/crawler/shop/simonlevelt.py b/argostime/crawler/shop/simonlevelt.py index 2bc0dbd..510ed56 100644 --- a/argostime/crawler/shop/simonlevelt.py +++ b/argostime/crawler/shop/simonlevelt.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/simonlevelt.py @@ -25,13 +24,13 @@ import locale import logging -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler +from bs4 import BeautifulSoup + +import requests @register_crawler("Simon Lévelt", "simonlevelt.nl") @@ -41,7 +40,8 @@ def crawl_simonlevelt(url: str) -> CrawlResult: response: requests.Response = requests.get(url, timeout=10) if response.status_code != 200: - logging.debug("Got status code %d while getting url %s", response.status_code, url) + logging.debug("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") @@ -49,9 +49,11 @@ def crawl_simonlevelt(url: str) -> CrawlResult: result = CrawlResult() try: - result.url = soup.find("meta", property="product:product_link").get("content") + result.url = soup.find("meta", + property="product:product_link").get("content") except Exception as exception: - logging.info("Couldn't find url in soup, using given instead %s", exception) + logging.info("Couldn't find url in soup, using given instead %s", + exception) result.url = url try: diff --git a/argostime/crawler/shop/steam.py b/argostime/crawler/shop/steam.py index ea7577e..6671a90 100644 --- a/argostime/crawler/shop/steam.py +++ b/argostime/crawler/shop/steam.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ crawler/shop/steam.py @@ -24,13 +23,13 @@ import logging -import requests -from bs4 import BeautifulSoup - +from argostime.crawler.crawl_utils import CrawlResult, register_crawler from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException -from argostime.crawler.crawl_utils import CrawlResult, register_crawler +from bs4 import BeautifulSoup + +import requests @register_crawler("Steam", "store.steampowered.com", False) @@ -42,7 +41,8 @@ def crawl_steam(url: str) -> CrawlResult: response: requests.Response = requests.get(url, timeout=10) if response.status_code != 200: - logging.error("Got status code %d while getting url %s", response.status_code, url) + logging.error("Got status code %d while getting url %s", + response.status_code, url) raise PageNotFoundException(url) soup = BeautifulSoup(response.text, "html.parser") @@ -85,7 +85,8 @@ def crawl_steam(url: str) -> CrawlResult: } ).get("value") except Exception as exception: - logging.error("Could not find a product code in %s %s", game_info, exception) + logging.error("Could not find a product code in %s %s", + game_info, exception) raise CrawlerException from exception try: @@ -96,10 +97,11 @@ def crawl_steam(url: str) -> CrawlResult: "game_purchase_discount" ).get("data-price-final")) / 100.0 result.on_sale = True - # 
There is info in the page about the normal price when there is a discount, - # it's just more of a hassle to find that information + # There is info in the page about the normal price when there is a + # discount, it's just more of a hassle to find that information except Exception as exception: - logging.info("No discount found, looking for normal price %s", exception) + logging.info("No discount found, looking for normal price %s", + exception) try: result.normal_price = float( game_info.find( @@ -107,7 +109,8 @@ def crawl_steam(url: str) -> CrawlResult: "game_purchase_price" ).get("data-price-final")) / 100.0 except Exception as inner_exception: - logging.error("No normal price found in %s %s", game_info, inner_exception) + logging.error("No normal price found in %s %s", + game_info, inner_exception) raise CrawlerException from inner_exception return result diff --git a/argostime/exceptions.py b/argostime/exceptions.py index 3eeeeeb..25c405b 100644 --- a/argostime/exceptions.py +++ b/argostime/exceptions.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ exceptions.py @@ -22,6 +21,7 @@ import logging + class PageNotFoundException(Exception): """Exception to throw when a request gets a 404 returned.""" def __init__(self, url: str): @@ -31,6 +31,7 @@ def __init__(self, url: str): super().__init__() + class WebsiteNotImplementedException(Exception): """Exception to throw if a certain website has no implemented scraper.""" @@ -41,8 +42,10 @@ def __init__(self, url: str): super().__init__() + class NoEffectivePriceAvailableException(Exception): """Exception to throw if a Price object has no valid price.""" + class CrawlerException(Exception): """Exception to throw if something goes wrong in the crawler.""" diff --git a/argostime/graphs.py b/argostime/graphs.py index 93da577..aeeed25 100644 --- a/argostime/graphs.py +++ b/argostime/graphs.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ graphs.py @@ -23,12 +22,13 @@ along with Argostimè. If not, see . """ -from datetime import datetime, timedelta import json +from datetime import datetime, timedelta from argostime import db from argostime.exceptions import NoEffectivePriceAvailableException -from argostime.models import ProductOffer, Price +from argostime.models import Price, ProductOffer + def generate_price_graph_data(offer: ProductOffer) -> str: """ @@ -51,7 +51,8 @@ def generate_price_graph_data(offer: ProductOffer) -> str: for price in prices: try: effective_prices.append(price.get_effective_price()) - dates.append(price.datetime.replace(hour=12, minute=0, second=0, microsecond=0)) + dates.append(price.datetime.replace( + hour=12, minute=0, second=0, microsecond=0)) if price.on_sale: if len(sales_index) == 0 or sales_index[-1][1] != (index - 1): @@ -90,7 +91,8 @@ def generate_price_graph_data(offer: ProductOffer) -> str: data = { "title": { - "text": f"Prijsontwikkeling van {offer.product.name} bij {offer.webshop.name}", + "text": f"Prijsontwikkeling van {offer.product.name} " + f"bij {offer.webshop.name}", "left": "center", "textStyle": { "color": "#000", diff --git a/argostime/models.py b/argostime/models.py index 8f195a9..b012db4 100644 --- a/argostime/models.py +++ b/argostime/models.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ models.py @@ -22,18 +21,19 @@ along with Argostimè. If not, see . 
""" -from datetime import datetime import logging import statistics +from datetime import datetime from sys import maxsize from typing import List -from argostime.crawler import crawl_url, CrawlResult -from argostime.exceptions import CrawlerException, WebsiteNotImplementedException -from argostime.exceptions import PageNotFoundException +from argostime import db +from argostime.crawler import CrawlResult, crawl_url +from argostime.exceptions import \ + CrawlerException, WebsiteNotImplementedException from argostime.exceptions import NoEffectivePriceAvailableException +from argostime.exceptions import PageNotFoundException -from argostime import db class Webshop(db.Model): # type: ignore """A webshop, which may offer products.""" @@ -41,12 +41,12 @@ class Webshop(db.Model): # type: ignore id = db.Column(db.Integer, primary_key=True) name = db.Column(db.Unicode(512), unique=True, nullable=False) hostname = db.Column(db.Unicode(512), unique=True, nullable=False) - products = db.relationship("ProductOffer", - backref="webshop", - lazy=True, cascade="all, delete", passive_deletes=True) + products = db.relationship("ProductOffer", backref="webshop", lazy=True, + cascade="all, delete", passive_deletes=True) def __str__(self) -> str: - return f"Webshop(id={self.id}, name={self.name}, hostname={self.hostname})" + return f"Webshop(id={self.id}, name={self.name}, " \ + f"hostname={self.hostname})" class Product(db.Model): # type: ignore @@ -57,13 +57,15 @@ class Product(db.Model): # type: ignore description = db.Column(db.Unicode(1024)) ean = db.Column(db.Integer) product_code = db.Column(db.Unicode(512), unique=True) - product_offers = db.relationship("ProductOffer", - backref="product", lazy=True, - cascade="all, delete", passive_deletes=True) + product_offers = db.relationship("ProductOffer", backref="product", + lazy=True, cascade="all, delete", + passive_deletes=True) def __str__(self) -> str: - return (f"Product(id={self.id}, name={self.name}, description={self.description}," - f"ean={self.ean}, product_code={self.product_code}, product_offers={self.product_offers})") + return (f"Product(id={self.id}, name={self.name}, " + f"description={self.description}, ean={self.ean}, " + f"product_code={self.product_code}, " + f"product_offers={self.product_offers})") class Price(db.Model): # type: ignore @@ -74,14 +76,17 @@ class Price(db.Model): # type: ignore discount_price = db.Column(db.Float) on_sale = db.Column(db.Boolean) datetime = db.Column(db.DateTime) - product_offer_id = db.Column(db.Integer, - db.ForeignKey("ProductOffer.id", ondelete="CASCADE"), - nullable=False) + product_offer_id = db.Column( + db.Integer, + db.ForeignKey("ProductOffer.id", ondelete="CASCADE"), + nullable=False + ) def __str__(self) -> str: return (f"Price(id={self.id}, normal_price={self.normal_price}," f"discount_price={self.discount_price}, on_sale={self.on_sale}" - f"datetime={self.datetime}, product_offer_id={self.product_offer_id})") + f"datetime={self.datetime}, " + f"product_offer_id={self.product_offer_id})") def get_effective_price(self) -> float: """Return the discounted price if on sale, else the normal price.""" @@ -99,9 +104,11 @@ class ProductOffer(db.Model): # type: ignore __tablename__ = "ProductOffer" id = db.Column(db.Integer, primary_key=True) product_id = db.Column(db.Integer, - db.ForeignKey("Product.id", ondelete="CASCADE"), nullable=False) + db.ForeignKey("Product.id", ondelete="CASCADE"), + nullable=False) shop_id = db.Column(db.Integer, - db.ForeignKey("Webshop.id", ondelete="CASCADE"), 
nullable=False) + db.ForeignKey("Webshop.id", ondelete="CASCADE"), + nullable=False) url = db.Column(db.Unicode(1024), unique=True, nullable=False) time_added = db.Column(db.DateTime) average_price = db.Column(db.Float) @@ -110,11 +117,12 @@ class ProductOffer(db.Model): # type: ignore # TODO: Memoize current price with reference to the most recent Price entry prices = db.relationship("Price", backref="product_offer", lazy=True, - cascade="all, delete", passive_deletes=True) + cascade="all, delete", passive_deletes=True) def __str__(self): - return (f"ProductOffer(id={self.id}, product_id={self.product_id}," - f"shop_id={self.shop_id}, url={self.url}, time_added={self.time_added})") + return (f"ProductOffer(id={self.id}, product_id={self.product_id}, " + f"shop_id={self.shop_id}, url={self.url}, " + f"time_added={self.time_added})") def get_current_price(self) -> Price: """Get the latest Price object related to this offer.""" @@ -129,7 +137,10 @@ def get_current_price(self) -> Price: return price def update_average_price(self) -> float: - """Calculate the average price of this offer and update ProductOffer.average_price.""" + """ + Calculate the average price of this offer and update + ProductOffer.average_price. + """ logging.debug("Updating average price for %s", self) effective_price_values: List[float] = [] @@ -142,7 +153,7 @@ def update_average_price(self) -> float: try: effective_price_values.append(price.get_effective_price()) except NoEffectivePriceAvailableException: - # Ignore price entries without a valid price in calculating the price. + # Ignore price entries without a valid price. pass try: avg: float = statistics.mean(effective_price_values) @@ -150,7 +161,8 @@ def update_average_price(self) -> float: db.session.commit() return avg except statistics.StatisticsError: - logging.debug("Called get_average_price for %s but no prices were found...", str(self)) + logging.debug("Called get_average_price for %s but no prices were " + "found...", str(self)) return -1 def get_average_price(self) -> float: @@ -175,8 +187,11 @@ def get_prices_since(self, since_time: datetime) -> list[Price]: return prices_since_list def get_lowest_price_since(self, since_time: datetime) -> float: - """Return the lowest effective price of this offer since a specific time.""" - logging.debug("Calculating lowest price since %s for %s", since_time, self) + """ + Return the lowest effective price of this offer since a specific time. + """ + logging.debug("Calculating lowest price since %s for %s", + since_time, self) min_price: float = maxsize price: Price @@ -207,8 +222,11 @@ def get_lowest_price(self) -> float: return self.minimum_price def get_highest_price_since(self, since_time: datetime) -> float: - """Return the highest effective price of this offer since a specific time.""" - logging.debug("Calculating highest price since %s for %s", since_time, self) + """ + Return the highest effective price of this offer since a specific time. + """ + logging.debug("Calculating highest price since %s for %s", + since_time, self) max_price: float = -1 price: Price @@ -237,8 +255,12 @@ def get_highest_price(self) -> float: """ return self.maximum_price - def get_price_standard_deviation_since(self, since_time: datetime) -> float: - """Return the standard deviation of the effective price of this offer since a given date.""" + def get_price_standard_deviation_since(self, since_time: datetime) \ + -> float: + """ + Return the standard deviation of the effective price of this offer + since a given date. 
+ """ effective_prices: List[float] = [] price: Price @@ -256,7 +278,9 @@ def get_price_standard_deviation_since(self, since_time: datetime) -> float: return 0.0 def get_price_standard_deviation(self) -> float: - """Return the standard deviation of the effective price of this offer.""" + """ + Return the standard deviation of the effective price of this offer. + """ return self.get_price_standard_deviation_since(self.time_added) def update_memoized_values(self) -> None: diff --git a/argostime/products.py b/argostime/products.py index 20a341c..cf890e6 100644 --- a/argostime/products.py +++ b/argostime/products.py @@ -1,9 +1,8 @@ -#!/usr/bin/env python3 """ products.py - Abstraction layer between the crawler & database on one hand, and the actual web interface - on the other. + Abstraction layer between the crawler & database on one hand, and the + actual web interface on the other. Copyright (c) 2022 Martijn @@ -23,15 +22,16 @@ along with Argostimè. If not, see . """ -from enum import Enum +import urllib.parse from datetime import datetime +from enum import Enum from typing import Tuple -import urllib.parse from argostime import db +from argostime.crawler import CrawlResult, crawl_url, enabled_shops from argostime.exceptions import WebsiteNotImplementedException -from argostime.models import Webshop, Price, Product, ProductOffer -from argostime.crawler import crawl_url, CrawlResult, enabled_shops +from argostime.models import Price, Product, ProductOffer, Webshop + class ProductOfferAddResult(Enum): """Enum to indicate the result of add_product_offer""" @@ -40,8 +40,12 @@ class ProductOfferAddResult(Enum): ALREADY_EXISTS = 2 FAILED_404_NOT_FOUND = 3 -def add_product_offer_from_url(url: str) -> Tuple[ProductOfferAddResult, ProductOffer]: - """Try to add a product offer to the database, add product and webshop if required. + +def add_product_offer_from_url(url: str) -> \ + Tuple[ProductOfferAddResult, ProductOffer]: + """ + Try to add a product offer to the database, add product and webshop if + required. Returns a ProductOfferAddResult enum """ diff --git a/argostime/routes.py b/argostime/routes.py index 3c1aa44..55f8555 100644 --- a/argostime/routes.py +++ b/argostime/routes.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ routes.py @@ -22,22 +21,24 @@ along with Argostimè. If not, see . 
""" -from datetime import datetime import logging -from typing import List, Dict import urllib.parse - -from flask import current_app as app -from flask import render_template, abort, request, redirect -from flask import Response +from datetime import datetime +from typing import Dict, List from argostime import db from argostime.exceptions import CrawlerException from argostime.exceptions import PageNotFoundException from argostime.exceptions import WebsiteNotImplementedException from argostime.graphs import generate_price_graph_data -from argostime.models import Webshop, Product, ProductOffer, Price -from argostime.products import ProductOfferAddResult, add_product_offer_from_url +from argostime.models import Price, Product, ProductOffer, Webshop +from argostime.products import \ + ProductOfferAddResult, add_product_offer_from_url + +from flask import Response +from flask import abort, redirect, render_template, request +from flask import current_app as app + def add_product_url(url): """Helper function for adding a product""" @@ -47,30 +48,38 @@ def add_product_url(url): hostname: str = urllib.parse.urlparse(url).netloc if len(hostname) == 0: hostname = url - return render_template("add_product_result.html.jinja", - result=f"Helaas wordt de website {hostname} nog niet ondersteund."), 400 + return render_template( + "add_product_result.html.jinja", + result=f"Helaas wordt de website {hostname} nog niet ondersteund." + ), 400 except PageNotFoundException: - return render_template("add_product_result.html.jinja", - result=f"De pagina {url} kon niet worden gevonden."), 404 + return render_template( + "add_product_result.html.jinja", + result=f"De pagina {url} kon niet worden gevonden." + ), 404 except CrawlerException as exception: logging.info( "Failed to add product from url %s, got CrawlerException %s", url, exception) - return render_template("add_product_result.html.jinja", - result=f"Het is niet gelukt om een product te vinden op de gegeven URL {url}." - " Verwijst de link wel naar een productpagina?") + return render_template( + "add_product_result.html.jinja", + result=f"Het is niet gelukt om een product te vinden op de " + f"gegeven URL {url}. Verwijst de link wel naar een " + f"productpagina?" 
+        )
 
     if (
         res == ProductOfferAddResult.ADDED
         or res == ProductOfferAddResult.ALREADY_EXISTS
        and offer is not None
-    ):
+            ):
         return redirect(f"/product/{offer.product.product_code}")
 
     return render_template("add_product.html.jinja", result=str(res))
 
+
 @app.route("/", methods=["GET", "POST"])
 def index():
     """Render home page"""
@@ -85,7 +94,7 @@ def index():
     discounts = db.session.scalars(
         db.select(Price).where(
             Price.datetime >= datetime.now().date(),
-            Price.on_sale == True  # pylint: disable=C0121
+            Price.on_sale.is_(True)
         )
     ).all()
@@ -101,6 +110,7 @@ def index():
         discounts=discounts,
         shops=shops)
 
+
 @app.route("/product/<product_code>")
 def product_page(product_code):
     """Show the page for a specific product, with all known product offers"""
@@ -110,7 +120,8 @@ def product_page(product_code):
         .where(Product.product_code == product_code)
     ).first()
 
-    logging.debug("Rendering product page for %s based on product code %s", product, product_code)
+    logging.debug("Rendering product page for %s based on product code %s",
+                  product, product_code)
 
     if product is None:
         abort(404)
@@ -126,6 +137,7 @@ def product_page(product_code):
         p=product,
         offers=offers)
 
+
 @app.route("/productoffer/<int:offer_id>/price_step_graph_data.json")
 def offer_price_json(offer_id):
     """Generate the price step graph data of a specific offer"""
@@ -140,6 +152,7 @@ def offer_price_json(offer_id):
     data: str = generate_price_graph_data(offer)
     return Response(data, mimetype="application/json")
 
+
 @app.route("/all_offers")
 def all_offers():
     """Generate an overview of all available offers"""
@@ -165,6 +178,7 @@ def all_offers():
         show_variance=show_variance
     )
 
+
 @app.route("/shop/<int:shop_id>")
 def webshop_page(shop_id):
     """Show a page with all the product offers of a specific webshop"""
@@ -199,6 +213,7 @@ def webshop_page(shop_id):
         show_variance=show_variance
     )
 
+
 @app.route("/add_url", methods=['GET'])
 def add_url():
     """GET request to allow users to add a URL using a booklet"""
@@ -208,6 +223,7 @@ def add_url():
         abort(404)
     return add_product_url(url)
 
+
 @app.errorhandler(404)
 def not_found(error):
     """Return the 404 page"""
diff --git a/argostime/static/stylesheet.css b/argostime/static/stylesheet.css
index 05cebb2..712cc62 100644
--- a/argostime/static/stylesheet.css
+++ b/argostime/static/stylesheet.css
@@ -1,5 +1,5 @@
 
-body { 
+body {
     font: 1.2em/1.62 sans-serif;
     margin: auto;
     padding: 20px;
@@ -31,7 +31,7 @@ b {
 }
 
 a {
-    color: #3273dc; 
+    color: #3273dc;
 }
 
 .sale {
diff --git a/argostime/static/table-sort.js b/argostime/static/table-sort.js
index a911d10..e547302 100644
--- a/argostime/static/table-sort.js
+++ b/argostime/static/table-sort.js
@@ -1,15 +1,15 @@
-/* 
+/*
 table-sort-js 1.6.8
 Author: Lee Wannacott
-Licence: MIT License Copyright (c) 2021 Lee Wannacott 
-    
+Licence: MIT License Copyright (c) 2021 Lee Wannacott
+
 GitHub Repository: https://github.com/LeeWannacott/table-sort-js
 npm package: https://www.npmjs.com/package/table-sort-js
 Demo: https://leewannacott.github.io/Portfolio/#/GitHub
 Install:
 Frontend: or
-Download this file and add to your HTML 
-Backend: npm install table-sort-js and use require("../node_modules/table-sort-js/table-sort.js") 
+Download this file and add to your HTML
+Backend: npm install table-sort-js and use require("../node_modules/table-sort-js/table-sort.js")
 Instructions:
 Add class="table-sort" to tables you'd like to make sortable
 Click on the table headers to sort them.
diff --git a/argostime_update_prices.py b/argostime_update_prices.py
old mode 100644
new mode 100755
index d1f2aeb..006e470
--- a/argostime_update_prices.py
+++ b/argostime_update_prices.py
@@ -22,12 +22,12 @@ along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
 """
 
-import random
 import logging
+import random
 import time
 
-from argostime.models import ProductOffer
 from argostime import create_app, db
+from argostime.models import ProductOffer
 
 app = create_app()
 app.app_context().push()
 
@@ -47,7 +47,8 @@
     try:
         offer.crawl_new_price()
     except Exception as exception:
-        logging.error("Received %s while updating price of %s, continuing...", exception, offer)
+        logging.error("Received %s while updating price of %s, continuing...",
+                      exception, offer)
 
     next_sleep_time: float = random.uniform(1, 180)
     logging.debug("Sleeping for %f seconds", next_sleep_time)
diff --git a/argostime_update_prices_parallel.py b/argostime_update_prices_parallel.py
index 1bbedd1..795332c 100755
--- a/argostime_update_prices_parallel.py
+++ b/argostime_update_prices_parallel.py
@@ -22,17 +22,18 @@ along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
 """
 
-import random
 import logging
-from multiprocessing import Process
+import random
 import time
+from multiprocessing import Process
 
-from argostime.models import ProductOffer, Webshop
 from argostime import create_app, db
+from argostime.models import ProductOffer, Webshop
 
 app = create_app()
 app.app_context().push()
 
+
 def update_shop_offers(shop_id: int) -> None:
     """Crawl all the offers of one shop"""
 
@@ -48,12 +49,14 @@ def update_shop_offers(shop_id: int) -> None:
         try:
             offer.crawl_new_price()
         except Exception as exception:
-            logging.error("Received %s while updating price of %s, continuing...", exception, offer)
+            logging.error("Received %s while updating price of %s, "
+                          "continuing...", exception, offer)
 
         next_sleep_time: float = random.uniform(1, 180)
         logging.debug("Sleeping for %f seconds", next_sleep_time)
         time.sleep(next_sleep_time)
 
+
 if __name__ == "__main__":
 
     shops: list[Webshop] = db.session.scalars(
diff --git a/check_url.py b/check_url.py
index 4816e18..8a06ca4 100755
--- a/check_url.py
+++ b/check_url.py
@@ -26,7 +26,6 @@
 import traceback
 
 from argostime.crawler.crawl_url import crawl_url
-
 from argostime.crawler.crawl_utils import CrawlResult
 
 # Print help message if needed...
@@ -37,7 +36,7 @@
 # Just call the crawler with the url given by the user
 try:
     result: CrawlResult = crawl_url(sys.argv[1])
-except Exception as exception:
+except:
     print("Exception thrown during crawling:", file=sys.stderr)
     traceback.print_exc()
     exit()
diff --git a/create_indexes.py b/create_indexes.py
index 4ef5794..a8ed09c 100755
--- a/create_indexes.py
+++ b/create_indexes.py
@@ -23,11 +23,10 @@
 """
 
 import logging
-from sqlalchemy import text
-from sqlalchemy.exc import OperationalError
-
 from argostime import create_app, db
-from argostime.models import ProductOffer, Product, Price, Webshop
+from argostime.models import Price, Product, ProductOffer, Webshop
+
+from sqlalchemy.exc import OperationalError
 
 app = create_app()
 app.app_context().push()
@@ -37,12 +36,12 @@
 indexes = [
     db.Index("idx_Price_datetime", Price.datetime),
     db.Index("idx_Price_product_offer", Price.product_offer_id),
-    db.Index("idx_Price_product_offer_id_datetime", Price.product_offer_id, Price.datetime),
+    db.Index("idx_Price_product_offer_id_datetime",
+             Price.product_offer_id, Price.datetime),
     db.Index("idx_ProductOffer_shop_id", ProductOffer.shop_id),
     db.Index("idx_ProductOffer_product_id", ProductOffer.product_id),
     db.Index("idx_Webshop_hostname", Webshop.hostname),
     db.Index("idx_Product_product_code", Product.product_code),
-
 ]
 
 for index in indexes:
diff --git a/manual_update.py b/manual_update.py
index 8498f7a..003dabc 100755
--- a/manual_update.py
+++ b/manual_update.py
@@ -2,7 +2,8 @@
 """
     manual_update.py
 
-    Standalone script to manually update the price of a product offer by product_offer_id.
+    Standalone script to manually update the price of a product offer by
+    product_offer_id.
 
     Copyright (c) 2022 Kevin
 
@@ -22,12 +23,13 @@ along with Argostimè. If not, see <https://www.gnu.org/licenses/>.
 """
 
-import sys
 import logging
+import sys
 
 from argostime import create_app, db
 from argostime.models import ProductOffer
 
+
 app = create_app()
 app.app_context().push()
 
@@ -37,7 +39,10 @@
     print("No number given")
     sys.exit(-1)
 
-offer: ProductOffer = db.session.execute(db.select(ProductOffer).where(ProductOffer.id == product_offer_id)).scalar_one()
+offer: ProductOffer = db.session.execute(
+    db.select(ProductOffer)
+    .where(ProductOffer.id == product_offer_id)
+).scalar_one()
 
 logging.debug("Found offer %s", product_offer_id)
 logging.debug("Manually updating ProductOffer %s", offer)
@@ -45,4 +50,5 @@
 try:
     offer.crawl_new_price()
 except Exception as exception:
-    logging.error("Received %s while updating price of %s, continuing...", exception, offer)
+    logging.error("Received %s while updating price of %s, continuing...",
+                  exception, offer)
diff --git a/migration_add_productoffer_avg_price_column.py b/migration_add_productoffer_avg_price_column.py
index 4a22ade..2a164f8 100755
--- a/migration_add_productoffer_avg_price_column.py
+++ b/migration_add_productoffer_avg_price_column.py
@@ -24,11 +24,12 @@
 
 import logging
 
+from argostime import create_app, db
+from argostime.models import Product, ProductOffer
+
 from sqlalchemy import text
 from sqlalchemy.exc import OperationalError
 
-from argostime import create_app, db
-from argostime.models import ProductOffer, Product
 
 app = create_app()
 app.app_context().push()
@@ -36,17 +37,20 @@
 logging.info("Adding average_price column")
 
 try:
-    db.session.execute(text('ALTER TABLE ProductOffer ADD COLUMN average_price float'))
+    db.session.execute(
+        text('ALTER TABLE ProductOffer ADD COLUMN average_price float'))
 except OperationalError:
     logging.info("Column already seems to exist, fine")
 
 try:
-    db.session.execute(text('ALTER TABLE ProductOffer ADD COLUMN minimum_price float'))
+    db.session.execute(
+        text('ALTER TABLE ProductOffer ADD COLUMN minimum_price float'))
 except OperationalError:
     logging.info("Column already seems to exist, fine")
 
 try:
-    db.session.execute(text('ALTER TABLE ProductOffer ADD COLUMN maximum_price float'))
+    db.session.execute(
+        text('ALTER TABLE ProductOffer ADD COLUMN maximum_price float'))
 except OperationalError:
     logging.info("Column already seems to exist, fine")
diff --git a/requirements.txt b/requirements.txt
index 4a0adae..3810bfd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ requests>=2.27.1
 beautifulsoup4>=4.10.0
 Flask-SQLAlchemy>=2.5.1
 gunicorn
-SQLAlchemy >= 2
+SQLAlchemy>=2
+pre-commit
diff --git a/requirements_development.txt b/requirements_development.txt
deleted file mode 100644
index a84e007..0000000
--- a/requirements_development.txt
+++ /dev/null
@@ -1,5 +0,0 @@
--r requirements.txt
-
-mypy >= 1.3.0
-pylint >= 2.17.4
-types-requests >= 2.31.0.1
\ No newline at end of file
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 36b6cda..d201cb4 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     test_crawler.py
 
@@ -8,11 +7,13 @@
 
 import unittest
 
-from argostime.crawler import ParseProduct
 import argostime.exceptions
+from argostime.crawler import crawl_url
+
 
 class ParseProductTestCases(unittest.TestCase):
 
     def test_not_implemented_website(self):
-        with self.assertRaises(argostime.exceptions.WebsiteNotImplementedException):
-            ParseProduct("https://example.com")
+        with self.assertRaises(
+                argostime.exceptions.WebsiteNotImplementedException):
+            crawl_url("https://example.com")
diff --git a/tests/test_products.py b/tests/test_products.py
index 8c24e85..cf5656b 100644
--- a/tests/test_products.py
+++ b/tests/test_products.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
     test_products.py
 
@@ -8,11 +7,14 @@
 
 import unittest
 
-import argostime.products
 import argostime.exceptions
+import argostime.products
+
 
 class ProductsTestCases(unittest.TestCase):
 
     def test_not_implemented_website(self):
-        with self.assertRaises(argostime.exceptions.WebsiteNotImplementedException):
-            argostime.products.add_product_offer_from_url("https://example.com")
+        with self.assertRaises(
+                argostime.exceptions.WebsiteNotImplementedException):
+            argostime.products.add_product_offer_from_url(
+                "https://example.com")
diff --git a/wsgi.py b/wsgi.py
old mode 100644
new mode 100755
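
The migration script above repeats one idempotent step three times: attempt an `ALTER TABLE ... ADD COLUMN` and treat the resulting OperationalError as "the column already exists". If more columns are ever added, the pattern folds naturally into a loop; a sketch under the same imports as the migration (the helper name add_column_if_missing is illustrative, not part of the codebase):

    import logging

    from argostime import create_app, db

    from sqlalchemy import text
    from sqlalchemy.exc import OperationalError

    app = create_app()
    app.app_context().push()


    def add_column_if_missing(table: str, column: str, sql_type: str) -> None:
        """Run ALTER TABLE, treating "duplicate column" errors as success."""
        try:
            db.session.execute(
                text(f'ALTER TABLE {table} ADD COLUMN {column} {sql_type}'))
        except OperationalError:
            # Same assumption as the script above: the backend reports an
            # already-existing column as an OperationalError.
            logging.info("Column %s.%s already seems to exist, fine",
                         table, column)


    for name in ("average_price", "minimum_price", "maximum_price"):
        add_column_if_missing("ProductOffer", name, "float")
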