diff --git a/bases/ecoindex/cli/app.py b/bases/ecoindex/cli/app.py
index a2e1afb..4ce67c7 100644
--- a/bases/ecoindex/cli/app.py
+++ b/bases/ecoindex/cli/app.py
@@ -1,4 +1,4 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from asyncio import run
 from datetime import datetime
 from multiprocessing import cpu_count
 from os.path import dirname
@@ -16,9 +16,9 @@
     get_window_sizes_from_args,
 )
 from ecoindex.cli.console_output import display_result_synthesis
-from ecoindex.cli.helper import run_page_analysis
 from ecoindex.cli.report import Report
 from ecoindex.models import ExportFormat, Language
+from ecoindex.scraper.helper import bulk_analysis
 from ecoindex.utils.files import write_results_to_file, write_urls_to_file
 from loguru import logger
 from rich.progress import (
@@ -165,7 +165,9 @@ def analyze(
             urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
         )
     elif sitemap:
-        secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
+        secho(
+            f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA
+        )
         urls = get_urls_from_sitemap(main_url=sitemap)
     (
         file_prefix,
@@ -220,47 +222,26 @@ def analyze(
         TextColumn("•"),
         TimeRemainingColumn(),
     ) as progress:
+        count_errors = 0
         task = progress.add_task("Processing", total=len(urls) * len(window_sizes))
 
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            future_to_analysis = {}
-
-            for url in urls:
-                for window_size in window_sizes:
-                    future_to_analysis[
-                        executor.submit(
-                            run_page_analysis,
-                            url,
-                            window_size,
-                            wait_after_scroll,
-                            wait_before_scroll,
-                            logger,
-                        )
-                    ] = (
-                        url,
-                        window_size,
-                        wait_after_scroll,
-                        wait_before_scroll,
-                        logger,
-                    )
-            count_errors = 0
-
-            for future in as_completed(future_to_analysis):
-                try:
-                    result, success = future.result()
-
-                    if not success:
-                        count_errors += 1
-
-                    else:
-                        results.append(result)
-
-                except Exception as e:
-                    count_errors += 1
-                    url, _, _, _, _ = future_to_analysis[future]
-                    logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-                progress.update(task, advance=1)
+        analysis_results = run(
+            bulk_analysis(
+                max_workers=max_workers,
+                urls=urls,
+                window_sizes=window_sizes,
+                wait_after_scroll=wait_after_scroll,
+                wait_before_scroll=wait_before_scroll,
+                logger=logger,
+            )
+        )
+
+        for result, success in analysis_results:
+            results.append(result)
+            if not success:
+                count_errors += 1
+
+            progress.update(task, advance=1)
 
     if count_errors > 0:
         secho(
diff --git a/bases/ecoindex/cli/helper.py b/bases/ecoindex/cli/helper.py
index 1e07fc9..37e6fb8 100644
--- a/bases/ecoindex/cli/helper.py
+++ b/bases/ecoindex/cli/helper.py
@@ -1,51 +1,14 @@
-from asyncio import run
 from ecoindex.config import Settings
-
-from ecoindex.models import Result, WindowSize, CliHost
-from ecoindex.scraper import EcoindexScraper
-
-
-def run_page_analysis(
-    url: str,
-    window_size: WindowSize,
-    wait_after_scroll: int = 3,
-    wait_before_scroll: int = 3,
-    logger=None,
-) -> tuple[Result, bool]:
-    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
-    scraper = EcoindexScraper(
-        url=str(url),
-        window_size=window_size,
-        wait_after_scroll=wait_after_scroll,
-        wait_before_scroll=wait_before_scroll,
-        page_load_timeout=20,
-    )
-    try:
-        return (run(scraper.get_page_analysis()), True)
-    except Exception as e:
-        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-    return (
-        Result(
-            url=url,
-            water=0,
-            width=window_size.width,
-            height=window_size.height,
-            size=0,
-            nodes=0,
-            requests=0,
-        ),
-        False,
-    )
+from ecoindex.models import CliHost
 
 
 def replace_localhost_with_hostdocker(netloc: str) -> CliHost:
     if Settings().DOCKER_CONTAINER and "localhost" in netloc:
         domain = "host.docker.internal"
         netloc = netloc.replace("localhost", domain)
-    elif "localhost" in netloc :
+    elif "localhost" in netloc:
         domain = "localhost"
-    else :
+    else:
         domain = netloc
 
     return CliHost(domain=domain, netloc=netloc)
diff --git a/components/ecoindex/scraper/helper.py b/components/ecoindex/scraper/helper.py
new file mode 100644
index 0000000..f139149
--- /dev/null
+++ b/components/ecoindex/scraper/helper.py
@@ -0,0 +1,74 @@
+from asyncio import run
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import AsyncGenerator
+
+from ecoindex.models.compute import Result, WindowSize
+from ecoindex.scraper.scrap import EcoindexScraper
+
+
+def run_page_analysis(
+    url: str,
+    window_size: WindowSize,
+    wait_after_scroll: int = 3,
+    wait_before_scroll: int = 3,
+    logger=None,
+) -> tuple[Result, bool]:
+    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
+    scraper = EcoindexScraper(
+        url=str(url),
+        window_size=window_size,
+        wait_after_scroll=wait_after_scroll,
+        wait_before_scroll=wait_before_scroll,
+        page_load_timeout=20,
+    )
+    try:
+        return (run(scraper.get_page_analysis()), True)
+    except Exception as e:
+        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
+
+    return (
+        Result(
+            url=url,
+            water=0,
+            width=window_size.width,
+            height=window_size.height,
+            size=0,
+            nodes=0,
+            requests=0,
+        ),
+        False,
+    )
+
+
+async def bulk_analysis(
+    max_workers,
+    urls,
+    window_sizes,
+    wait_after_scroll: int = 0,
+    wait_before_scroll: int = 0,
+    logger=None,
+) -> AsyncGenerator[tuple[Result, bool], None]:
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_analysis = {}
+
+        for url in urls:
+            for window_size in window_sizes:
+                future_to_analysis[
+                    executor.submit(
+                        run_page_analysis,
+                        url,
+                        window_size,
+                        wait_after_scroll,
+                        wait_before_scroll,
+                        logger,
+                    )
+                ] = (
+                    url,
+                    window_size,
+                    wait_after_scroll,
+                    wait_before_scroll,
+                    logger,
+                )
+
+        for future in as_completed(future_to_analysis):
+            yield future.result()
diff --git a/components/ecoindex/scraper/scrap.py b/components/ecoindex/scraper/scrap.py
index ecbf231..d149963 100644
--- a/components/ecoindex/scraper/scrap.py
+++ b/components/ecoindex/scraper/scrap.py
@@ -25,6 +25,7 @@ def __init__(
         screenshot_uid: int | None = None,
         screenshot_gid: int | None = None,
         page_load_timeout: int = 20,
+        headless: bool = True,
     ):
         self.url = url
         self.window_size = window_size
@@ -39,6 +40,7 @@ def __init__(
         self.har_temp_file_path = (
             f"/tmp/ecoindex-{self.now.strftime('%Y-%m-%d-%H-%M-%S-%f')}-{uuid4()}.har"
         )
+        self.headless = headless
 
     @deprecated("This method is useless with new version of EcoindexScraper")
     def init_chromedriver(self):
@@ -64,7 +66,7 @@ async def get_requests_by_category(self) -> MimetypeAggregation:
 
     async def scrap_page(self) -> PageMetrics:
         async with async_playwright() as p:
-            browser = await p.chromium.launch()
+            browser = await p.chromium.launch(headless=self.headless)
             self.page = await browser.new_page(
                 record_har_path=self.har_temp_file_path,
                 screen=self.window_size.model_dump(),
diff --git a/development/ecoindex_scraper.py b/development/ecoindex_scraper.py
index ca46ac9..eea0f98 100644
--- a/development/ecoindex_scraper.py
+++ b/development/ecoindex_scraper.py
@@ -1,6 +1,17 @@
 import asyncio
 from pprint import pprint
+from uuid import uuid1
 
+from ecoindex.models.compute import ScreenShot
 from ecoindex.scraper import EcoindexScraper
 
-pprint(asyncio.run(EcoindexScraper(url="http://ecoindex.fr").get_page_analysis()))
+scraper = EcoindexScraper(
+    url="https://www.kiabi.com",
+    screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
+)
+
+result = asyncio.run(scraper.get_page_analysis())
+all_requests = asyncio.run(scraper.get_all_requests())
+requests_by_category = asyncio.run(scraper.get_requests_by_category())
+
+pprint(result)
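
Usage sketch (not part of the patch): one way the new bulk_analysis helper could be consumed from a standalone script. It is an async generator, so it is drained with `async for`. This assumes the module layout introduced by the diff and that WindowSize accepts width and height fields; the URLs, window size, and worker count are placeholder values.

# consume_bulk_analysis.py -- illustrative sketch only, not part of the patch.
import asyncio

from ecoindex.models.compute import WindowSize
from ecoindex.scraper.helper import bulk_analysis
from loguru import logger


async def main() -> None:
    results = []
    errors = 0

    # Each yielded item is a (Result, success) tuple, mirroring run_page_analysis.
    async for result, success in bulk_analysis(
        max_workers=2,
        urls=["https://www.ecoindex.fr", "https://www.example.com"],
        window_sizes=[WindowSize(width=1920, height=1080)],
        wait_after_scroll=3,
        wait_before_scroll=3,
        logger=logger,
    ):
        if success:
            results.append(result)
        else:
            errors += 1

    print(f"{len(results)} pages analysed, {errors} errors")


asyncio.run(main())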
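
A similar sketch for the new headless flag on EcoindexScraper: the patch defaults it to True, and passing False opens a visible Chromium window for local debugging. This is illustrative only and assumes a display is available.

# headed_debug.py -- illustrative sketch only, not part of the patch.
import asyncio

from ecoindex.scraper import EcoindexScraper

# Run a single analysis with a visible browser window (headless defaults to True).
scraper = EcoindexScraper(url="https://www.ecoindex.fr", headless=False)
result = asyncio.run(scraper.get_page_analysis())
print(result)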