refactor(scraper): Create a bulk analysis helper for scraper and refactor cli #86

Merged 1 commit on Jul 12, 2024
refact(scraper): Create a bulk analysis helper for scraper and refactor cli
vvatelot committed Jul 12, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
commit 4b6cacb641ffb2a43729eb961acd680b9c2c32c9
65 changes: 23 additions & 42 deletions bases/ecoindex/cli/app.py
@@ -1,4 +1,4 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from asyncio import run
 from datetime import datetime
 from multiprocessing import cpu_count
 from os.path import dirname
@@ -16,9 +16,9 @@
     get_window_sizes_from_args,
 )
 from ecoindex.cli.console_output import display_result_synthesis
-from ecoindex.cli.helper import run_page_analysis
 from ecoindex.cli.report import Report
 from ecoindex.models import ExportFormat, Language
+from ecoindex.scraper.helper import bulk_analysis
 from ecoindex.utils.files import write_results_to_file, write_urls_to_file
 from loguru import logger
 from rich.progress import (
@@ -165,7 +165,9 @@ def analyze(
             urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
         )
     elif sitemap:
-        secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
+        secho(
+            f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA
+        )
         urls = get_urls_from_sitemap(main_url=sitemap)
         (
             file_prefix,
@@ -220,47 +222,26 @@ def analyze(
         TextColumn("•"),
         TimeRemainingColumn(),
     ) as progress:
+        count_errors = 0
         task = progress.add_task("Processing", total=len(urls) * len(window_sizes))

-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            future_to_analysis = {}
-
-            for url in urls:
-                for window_size in window_sizes:
-                    future_to_analysis[
-                        executor.submit(
-                            run_page_analysis,
-                            url,
-                            window_size,
-                            wait_after_scroll,
-                            wait_before_scroll,
-                            logger,
-                        )
-                    ] = (
-                        url,
-                        window_size,
-                        wait_after_scroll,
-                        wait_before_scroll,
-                        logger,
-                    )
-            count_errors = 0
-
-            for future in as_completed(future_to_analysis):
-                try:
-                    result, success = future.result()
-
-                    if not success:
-                        count_errors += 1
-
-                    else:
-                        results.append(result)
-
-                except Exception as e:
-                    count_errors += 1
-                    url, _, _, _, _ = future_to_analysis[future]
-                    logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-                progress.update(task, advance=1)
+        analysis_results = run(
+            bulk_analysis(
+                max_workers=max_workers,
+                urls=urls,
+                window_sizes=window_sizes,
+                wait_after_scroll=wait_after_scroll,
+                wait_before_scroll=wait_before_scroll,
+                logger=logger,
+            )
+        )
+
+        for result, success in analysis_results:
+            results.append(result)
+            if not success:
+                count_errors += 1
+
+            progress.update(task, advance=1)

     if count_errors > 0:
         secho(
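
Side note on the new call pattern: bulk_analysis is an async generator, and asyncio.run() expects a coroutine, so one way to drive it outside the CLI is to drain it from a small wrapper coroutine. A minimal sketch (the collect_results wrapper, the URL, and the window size values are hypothetical, not part of this PR):

from asyncio import run

from ecoindex.models import WindowSize
from ecoindex.scraper.helper import bulk_analysis
from loguru import logger


async def collect_results() -> list:
    # Drain the async generator into a list of (Result, success) tuples
    return [
        item
        async for item in bulk_analysis(
            max_workers=2,
            urls=["https://www.ecoindex.fr"],  # hypothetical URL
            window_sizes=[WindowSize(width=1920, height=1080)],  # assumed values
            logger=logger,
        )
    ]


analysis_results = run(collect_results())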
43 changes: 3 additions & 40 deletions bases/ecoindex/cli/helper.py
@@ -1,51 +1,14 @@
-from asyncio import run
 from ecoindex.config import Settings
-
-from ecoindex.models import Result, WindowSize, CliHost
-from ecoindex.scraper import EcoindexScraper
-
-
-def run_page_analysis(
-    url: str,
-    window_size: WindowSize,
-    wait_after_scroll: int = 3,
-    wait_before_scroll: int = 3,
-    logger=None,
-) -> tuple[Result, bool]:
-    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
-    scraper = EcoindexScraper(
-        url=str(url),
-        window_size=window_size,
-        wait_after_scroll=wait_after_scroll,
-        wait_before_scroll=wait_before_scroll,
-        page_load_timeout=20,
-    )
-    try:
-        return (run(scraper.get_page_analysis()), True)
-    except Exception as e:
-        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-    return (
-        Result(
-            url=url,
-            water=0,
-            width=window_size.width,
-            height=window_size.height,
-            size=0,
-            nodes=0,
-            requests=0,
-        ),
-        False,
-    )
+from ecoindex.models import CliHost


 def replace_localhost_with_hostdocker(netloc: str) -> CliHost:
     if Settings().DOCKER_CONTAINER and "localhost" in netloc:
         domain = "host.docker.internal"
         netloc = netloc.replace("localhost", domain)
-    elif "localhost" in netloc :
+    elif "localhost" in netloc:
         domain = "localhost"
-    else :
+    else:
         domain = netloc

     return CliHost(domain=domain, netloc=netloc)
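
The slimmed-down helper is easy to exercise on its own. A small usage sketch (the netloc value is hypothetical; the Docker branch assumes Settings().DOCKER_CONTAINER is set inside the container image):

from ecoindex.cli.helper import replace_localhost_with_hostdocker

# Outside Docker, localhost is kept as-is:
print(replace_localhost_with_hostdocker("localhost:8000"))
# CliHost(domain='localhost', netloc='localhost:8000')

# Inside a Docker container (Settings().DOCKER_CONTAINER set), the same call
# rewrites localhost so the browser can reach the host machine:
# CliHost(domain='host.docker.internal', netloc='host.docker.internal:8000')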
74 changes: 74 additions & 0 deletions components/ecoindex/scraper/helper.py
@@ -0,0 +1,74 @@
+from asyncio import run
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import AsyncGenerator
+
+from ecoindex.models.compute import Result, WindowSize
+from ecoindex.scraper.scrap import EcoindexScraper
+
+
+def run_page_analysis(
+    url: str,
+    window_size: WindowSize,
+    wait_after_scroll: int = 3,
+    wait_before_scroll: int = 3,
+    logger=None,
+) -> tuple[Result, bool]:
+    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
+    scraper = EcoindexScraper(
+        url=str(url),
+        window_size=window_size,
+        wait_after_scroll=wait_after_scroll,
+        wait_before_scroll=wait_before_scroll,
+        page_load_timeout=20,
+    )
+    try:
+        return (run(scraper.get_page_analysis()), True)
+    except Exception as e:
+        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
+
+    return (
+        Result(
+            url=url,
+            water=0,
+            width=window_size.width,
+            height=window_size.height,
+            size=0,
+            nodes=0,
+            requests=0,
+        ),
+        False,
+    )
+
+
+async def bulk_analysis(
+    max_workers,
+    urls,
+    window_sizes,
+    wait_after_scroll: int = 0,
+    wait_before_scroll: int = 0,
+    logger=None,
+) -> AsyncGenerator[tuple[Result, bool], None]:
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_analysis = {}
+
+        for url in urls:
+            for window_size in window_sizes:
+                future_to_analysis[
+                    executor.submit(
+                        run_page_analysis,
+                        url,
+                        window_size,
+                        wait_after_scroll,
+                        wait_before_scroll,
+                        logger,
+                    )
+                ] = (
+                    url,
+                    window_size,
+                    wait_after_scroll,
+                    wait_before_scroll,
+                    logger,
+                )
+
+        for future in as_completed(future_to_analysis):
+            yield future.result()
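
Because run_page_analysis now lives in the scraper component, it can be smoke-tested without going through the CLI. A hypothetical direct call (URL and window size are illustrative; the loguru logger mirrors what the CLI passes in):

from ecoindex.models.compute import WindowSize
from ecoindex.scraper.helper import run_page_analysis
from loguru import logger

result, success = run_page_analysis(
    url="https://www.ecoindex.fr",  # hypothetical URL
    window_size=WindowSize(width=1920, height=1080),  # assumed values
    logger=logger,
)
# On failure, success is False and result carries zeroed metrics for the URL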
4 changes: 3 additions & 1 deletion components/ecoindex/scraper/scrap.py
@@ -25,6 +25,7 @@ def __init__(
         screenshot_uid: int | None = None,
         screenshot_gid: int | None = None,
         page_load_timeout: int = 20,
+        headless: bool = True,
     ):
         self.url = url
         self.window_size = window_size
@@ -39,6 +40,7 @@ def __init__(
         self.har_temp_file_path = (
             f"/tmp/ecoindex-{self.now.strftime('%Y-%m-%d-%H-%M-%S-%f')}-{uuid4()}.har"
         )
+        self.headless = headless

     @deprecated("This method is useless with new version of EcoindexScraper")
     def init_chromedriver(self):
@@ -64,7 +66,7 @@ async def get_requests_by_category(self) -> MimetypeAggregation:

     async def scrap_page(self) -> PageMetrics:
         async with async_playwright() as p:
-            browser = await p.chromium.launch()
+            browser = await p.chromium.launch(headless=self.headless)
             self.page = await browser.new_page(
                 record_har_path=self.har_temp_file_path,
                 screen=self.window_size.model_dump(),
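
The new headless flag defaults to True, so existing callers keep the previous behavior; passing headless=False launches a visible Chromium window, which helps when debugging a scrape locally. A hypothetical example:

from asyncio import run

from ecoindex.scraper import EcoindexScraper

scraper = EcoindexScraper(url="https://www.ecoindex.fr", headless=False)
result = run(scraper.get_page_analysis())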
13 changes: 12 additions & 1 deletion development/ecoindex_scraper.py
@@ -1,6 +1,17 @@
 import asyncio
 from pprint import pprint
+from uuid import uuid1

+from ecoindex.models.compute import ScreenShot
 from ecoindex.scraper import EcoindexScraper

-pprint(asyncio.run(EcoindexScraper(url="http://ecoindex.fr").get_page_analysis()))
+scraper = EcoindexScraper(
+    url="https://www.kiabi.com",
+    screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
+)
+
+result = asyncio.run(scraper.get_page_analysis())
+all_requests = asyncio.run(scraper.get_all_requests())
+requests_by_category = asyncio.run(scraper.get_requests_by_category())
+
+pprint(result)