refactor(scraper): Create a bulk analysis helper for scraper and refactor cli #86

Merged 1 commit on Jul 12, 2024
refact(scraper): Create a bulk analysis helper for scraper and refactor cli
vvatelot committed Jul 12, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
commit 4b6cacb641ffb2a43729eb961acd680b9c2c32c9
65 changes: 23 additions & 42 deletions bases/ecoindex/cli/app.py
@@ -1,4 +1,4 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from asyncio import run
 from datetime import datetime
 from multiprocessing import cpu_count
 from os.path import dirname
@@ -16,9 +16,9 @@
     get_window_sizes_from_args,
 )
 from ecoindex.cli.console_output import display_result_synthesis
-from ecoindex.cli.helper import run_page_analysis
 from ecoindex.cli.report import Report
 from ecoindex.models import ExportFormat, Language
+from ecoindex.scraper.helper import bulk_analysis
 from ecoindex.utils.files import write_results_to_file, write_urls_to_file
 from loguru import logger
 from rich.progress import (
@@ -165,7 +165,9 @@ def analyze(
             urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
         )
     elif sitemap:
-        secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
+        secho(
+            f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA
+        )
         urls = get_urls_from_sitemap(main_url=sitemap)
         (
             file_prefix,
@@ -220,47 +222,26 @@ def analyze(
         TextColumn("•"),
         TimeRemainingColumn(),
     ) as progress:
+        count_errors = 0
         task = progress.add_task("Processing", total=len(urls) * len(window_sizes))

-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            future_to_analysis = {}
-
-            for url in urls:
-                for window_size in window_sizes:
-                    future_to_analysis[
-                        executor.submit(
-                            run_page_analysis,
-                            url,
-                            window_size,
-                            wait_after_scroll,
-                            wait_before_scroll,
-                            logger,
-                        )
-                    ] = (
-                        url,
-                        window_size,
-                        wait_after_scroll,
-                        wait_before_scroll,
-                        logger,
-                    )
-            count_errors = 0
-
-            for future in as_completed(future_to_analysis):
-                try:
-                    result, success = future.result()
-
-                    if not success:
-                        count_errors += 1
-
-                    else:
-                        results.append(result)
-
-                except Exception as e:
-                    count_errors += 1
-                    url, _, _, _, _ = future_to_analysis[future]
-                    logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-                progress.update(task, advance=1)
+        analysis_results = run(
+            bulk_analysis(
+                max_workers=max_workers,
+                urls=urls,
+                window_sizes=window_sizes,
+                wait_after_scroll=wait_after_scroll,
+                wait_before_scroll=wait_before_scroll,
+                logger=logger,
+            )
+        )
+
+        for result, success in analysis_results:
+            results.append(result)
+            if not success:
+                count_errors += 1
+
+            progress.update(task, advance=1)

     if count_errors > 0:
         secho(
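
Side note on the new call pattern: bulk_analysis is an async generator, and asyncio.run() expects a coroutine, so one way to drive it outside the CLI is to drain it from a small wrapper coroutine. A minimal sketch (the collect_results wrapper, the URL, and the window size values are hypothetical, not part of this PR):

from asyncio import run

from ecoindex.models import WindowSize
from ecoindex.scraper.helper import bulk_analysis
from loguru import logger


async def collect_results() -> list:
    # Drain the async generator into a list of (Result, success) tuples
    return [
        item
        async for item in bulk_analysis(
            max_workers=2,
            urls=["https://www.ecoindex.fr"],  # hypothetical URL
            window_sizes=[WindowSize(width=1920, height=1080)],  # assumed values
            logger=logger,
        )
    ]


analysis_results = run(collect_results())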
43 changes: 3 additions & 40 deletions bases/ecoindex/cli/helper.py
@@ -1,51 +1,14 @@
-from asyncio import run
 from ecoindex.config import Settings
-
-from ecoindex.models import Result, WindowSize, CliHost
-from ecoindex.scraper import EcoindexScraper
-
-
-def run_page_analysis(
-    url: str,
-    window_size: WindowSize,
-    wait_after_scroll: int = 3,
-    wait_before_scroll: int = 3,
-    logger=None,
-) -> tuple[Result, bool]:
-    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
-    scraper = EcoindexScraper(
-        url=str(url),
-        window_size=window_size,
-        wait_after_scroll=wait_after_scroll,
-        wait_before_scroll=wait_before_scroll,
-        page_load_timeout=20,
-    )
-    try:
-        return (run(scraper.get_page_analysis()), True)
-    except Exception as e:
-        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-    return (
-        Result(
-            url=url,
-            water=0,
-            width=window_size.width,
-            height=window_size.height,
-            size=0,
-            nodes=0,
-            requests=0,
-        ),
-        False,
-    )
+from ecoindex.models import CliHost


 def replace_localhost_with_hostdocker(netloc: str) -> CliHost:
     if Settings().DOCKER_CONTAINER and "localhost" in netloc:
         domain = "host.docker.internal"
         netloc = netloc.replace("localhost", domain)
-    elif "localhost" in netloc :
+    elif "localhost" in netloc:
         domain = "localhost"
-    else :
+    else:
         domain = netloc

     return CliHost(domain=domain, netloc=netloc)
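
The slimmed-down helper is easy to exercise on its own. A small usage sketch (the netloc value is hypothetical; the Docker branch assumes Settings().DOCKER_CONTAINER is set inside the container image):

from ecoindex.cli.helper import replace_localhost_with_hostdocker

# Outside Docker, localhost is kept as-is:
print(replace_localhost_with_hostdocker("localhost:8000"))
# CliHost(domain='localhost', netloc='localhost:8000')

# Inside a Docker container (Settings().DOCKER_CONTAINER set), the same call
# rewrites localhost so the browser can reach the host machine:
# CliHost(domain='host.docker.internal', netloc='host.docker.internal:8000')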
74 changes: 74 additions & 0 deletions components/ecoindex/scraper/helper.py
@@ -0,0 +1,74 @@
+from asyncio import run
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import AsyncGenerator
+
+from ecoindex.models.compute import Result, WindowSize
+from ecoindex.scraper.scrap import EcoindexScraper
+
+
+def run_page_analysis(
+    url: str,
+    window_size: WindowSize,
+    wait_after_scroll: int = 3,
+    wait_before_scroll: int = 3,
+    logger=None,
+) -> tuple[Result, bool]:
+    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
+    scraper = EcoindexScraper(
+        url=str(url),
+        window_size=window_size,
+        wait_after_scroll=wait_after_scroll,
+        wait_before_scroll=wait_before_scroll,
+        page_load_timeout=20,
+    )
+    try:
+        return (run(scraper.get_page_analysis()), True)
+    except Exception as e:
+        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
+
+    return (
+        Result(
+            url=url,
+            water=0,
+            width=window_size.width,
+            height=window_size.height,
+            size=0,
+            nodes=0,
+            requests=0,
+        ),
+        False,
+    )
+
+
+async def bulk_analysis(
+    max_workers,
+    urls,
+    window_sizes,
+    wait_after_scroll: int = 0,
+    wait_before_scroll: int = 0,
+    logger=None,
+) -> AsyncGenerator[tuple[Result, bool], None]:
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_analysis = {}
+
+        for url in urls:
+            for window_size in window_sizes:
+                future_to_analysis[
+                    executor.submit(
+                        run_page_analysis,
+                        url,
+                        window_size,
+                        wait_after_scroll,
+                        wait_before_scroll,
+                        logger,
+                    )
+                ] = (
+                    url,
+                    window_size,
+                    wait_after_scroll,
+                    wait_before_scroll,
+                    logger,
+                )
+
+        for future in as_completed(future_to_analysis):
+            yield future.result()
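
Because run_page_analysis now lives in the scraper component, it can be smoke-tested without going through the CLI. A hypothetical direct call (URL and window size are illustrative; the loguru logger mirrors what the CLI passes in):

from ecoindex.models.compute import WindowSize
from ecoindex.scraper.helper import run_page_analysis
from loguru import logger

result, success = run_page_analysis(
    url="https://www.ecoindex.fr",  # hypothetical URL
    window_size=WindowSize(width=1920, height=1080),  # assumed values
    logger=logger,
)
# On failure, success is False and result carries zeroed metrics for the URL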
4 changes: 3 additions & 1 deletion components/ecoindex/scraper/scrap.py
@@ -25,6 +25,7 @@ def __init__(
         screenshot_uid: int | None = None,
         screenshot_gid: int | None = None,
         page_load_timeout: int = 20,
+        headless: bool = True,
     ):
         self.url = url
         self.window_size = window_size
@@ -39,6 +40,7 @@ def __init__(
         self.har_temp_file_path = (
             f"/tmp/ecoindex-{self.now.strftime('%Y-%m-%d-%H-%M-%S-%f')}-{uuid4()}.har"
         )
+        self.headless = headless

     @deprecated("This method is useless with new version of EcoindexScraper")
     def init_chromedriver(self):
@@ -64,7 +66,7 @@ async def get_requests_by_category(self) -> MimetypeAggregation:

     async def scrap_page(self) -> PageMetrics:
         async with async_playwright() as p:
-            browser = await p.chromium.launch()
+            browser = await p.chromium.launch(headless=self.headless)
             self.page = await browser.new_page(
                 record_har_path=self.har_temp_file_path,
                 screen=self.window_size.model_dump(),
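
The new headless flag defaults to True, so existing callers keep the previous behavior; passing headless=False launches a visible Chromium window, which helps when debugging a scrape locally. A hypothetical example:

from asyncio import run

from ecoindex.scraper import EcoindexScraper

scraper = EcoindexScraper(url="https://www.ecoindex.fr", headless=False)
result = run(scraper.get_page_analysis())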
13 changes: 12 additions & 1 deletion development/ecoindex_scraper.py
@@ -1,6 +1,17 @@
 import asyncio
 from pprint import pprint
+from uuid import uuid1

+from ecoindex.models.compute import ScreenShot
 from ecoindex.scraper import EcoindexScraper

-pprint(asyncio.run(EcoindexScraper(url="http://ecoindex.fr").get_page_analysis()))
+scraper = EcoindexScraper(
+    url="https://www.kiabi.com",
+    screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
+)
+
+result = asyncio.run(scraper.get_page_analysis())
+all_requests = asyncio.run(scraper.get_all_requests())
+requests_by_category = asyncio.run(scraper.get_requests_by_category())
+
+pprint(result)