Skip to content

Commit

Permalink
feat(scraper): add random user agent
Browse files Browse the repository at this point in the history
  • Loading branch information
vvatelot committed Dec 11, 2023
1 parent e2b2eea commit 445355b
Show file tree
Hide file tree
Showing 11 changed files with 157 additions and 72 deletions.
6 changes: 5 additions & 1 deletion components/ecoindex/scraper/scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ecoindex.exceptions.scraper import EcoindexScraperStatusException
from ecoindex.models.compute import PageMetrics, Result, ScreenShot, WindowSize
from ecoindex.models.scraper import Requests
from ecoindex.scraper.user_agent import get_user_agent
from ecoindex.utils.screenshots import convert_screenshot_to_webp, set_screenshot_rights
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
Expand Down Expand Up @@ -58,11 +59,14 @@ async def get_page_analysis(self) -> Result:

async def scrap_page(self) -> PageMetrics:
async with async_playwright() as p:
browser = await p.chromium.launch()
browser = await p.chromium.launch(headless=False)
user_agent = await get_user_agent()
print(f"Make request to {self.url} with {user_agent}")
self.page = await browser.new_page(
record_har_path=self.har_temp_file_path,
screen=self.window_size.model_dump(),
ignore_https_errors=True,
user_agent=user_agent,
)
await stealth_async(self.page)
response = await self.page.goto(self.url)
Expand Down
13 changes: 13 additions & 0 deletions components/ecoindex/scraper/user_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from random_user_agent.params import OperatingSystem, SoftwareName
from random_user_agent.user_agent import UserAgent


async def get_user_agent() -> str:
    """Return a randomly selected user agent string.

    A fresh ``UserAgent`` rotator is built on every call, restricted to the
    Chrome software name and to the Windows and Linux operating systems, and
    one of its user agents is picked at random.

    The coroutine contains no ``await``; it is declared ``async`` so callers
    use it as ``await get_user_agent()``.

    Returns:
        The user agent string produced by
        ``UserAgent.get_random_user_agent()``.
    """
    # limit=100 is forwarded to the rotator as-is; presumably it caps how many
    # candidate user agents are loaded -- confirm against random_user_agent.
    return UserAgent(
        software_names=[SoftwareName.CHROME.value],
        operating_systems=[
            OperatingSystem.WINDOWS.value,
            OperatingSystem.LINUX.value,
        ],
        limit=100,
    ).get_random_user_agent()
4 changes: 2 additions & 2 deletions development/scraper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ def run_page_analysis(url: str, index: int):
with ThreadPoolExecutor(max_workers=8) as executor:
future_to_analysis = {}

url = "https://www.ecoindex.fr"
url = "https://www.decathlon.fr"

for i in range(1):
for i in range(10):
print(f"Starting ecoindex {i} analysis")
future_to_analysis[
executor.submit(
Expand Down
79 changes: 45 additions & 34 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 445355b

Please sign in to comment.