Commit

replace 'httpx' in Thumbnail-Pipeline with Scrapy Requests for Splash
- the screenshot and thumbnail pipeline previously ran in parallel to Scrapy's built-in scheduler, which came with its own set of problems (bypassing the Scrapy scheduler meant that we could not control the load/traffic in a responsible manner); see the sketch below for the request pattern this commit adopts
- ToDo / work in progress: find a way to schedule Playwright requests with Scrapy's scheduler (see the sketch at the end of this page for one possible direction)
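The core pattern this commit adopts looks roughly like the following minimal sketch (the helper name is illustrative, not the actual pipeline code; see the diff below for the real implementation). Routing requests through spider.crawler.engine means that Scrapy's concurrency limits, download delays and AutoThrottle settings now apply to thumbnail downloads as well:

import scrapy
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.defer import maybe_deferred_to_future

async def download_via_engine(spider: scrapy.Spider, url: str) -> scrapy.http.Response:
    # NO_CALLBACK signals that the request does not belong to any spider callback;
    # it is nevertheless scheduled by the engine and passes through the downloader middlewares
    request = scrapy.Request(url=url, callback=NO_CALLBACK)
    # engine.download() returns a Twisted Deferred; convert it into an awaitable Future
    return await maybe_deferred_to_future(spider.crawler.engine.download(request))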

Signed-off-by: criamos <[email protected]>
Criamos committed Dec 1, 2023
1 parent fdc127f commit f0d9754
Showing 1 changed file with 42 additions and 37 deletions.
converter/pipelines.py
@@ -2,33 +2,36 @@

 from __future__ import annotations

+import asyncio
 import base64
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import csv
 import datetime
 import logging
 import re
 import time
 from abc import ABCMeta
+from asyncio import Future
 from io import BytesIO
 from typing import BinaryIO, TextIO, Optional

 import dateparser
 import dateutil.parser
-import httpx
 import isodate
 import scrapy
 import scrapy.crawler
-from async_lru import alru_cache
 from PIL import Image
+from async_lru import alru_cache
 from itemadapter import ItemAdapter
 from scrapy import settings
 from scrapy.exceptions import DropItem
 from scrapy.exporters import JsonItemExporter
+from scrapy.http.request import NO_CALLBACK
+from scrapy.utils.defer import maybe_deferred_to_future
 from scrapy.utils.project import get_project_settings
+from twisted.internet.defer import Deferred

 from converter import env
 from converter.constants import *
@@ -340,7 +343,6 @@ class ProcessThumbnailPipeline(BasicPipeline):
     """
     generate thumbnails
     """
-    _client_async = httpx.AsyncClient()

     @staticmethod
     def scale_image(img, max_size):
@@ -365,7 +367,7 @@ async def process_item(self, raw_item, spider):
         -- alternatively, on-demand: use Playwright to take a screenshot, rescale and save (as above)
         """
         item = ItemAdapter(raw_item)
-        response = None
+        response: scrapy.http.Response | None = None
         url: str | None = None
         settings = get_settings_for_crawler(spider)
         # checking if the (optional) attribute WEB_TOOLS exists:
@@ -383,7 +385,16 @@ async def process_item(self, raw_item, spider):
         elif "thumbnail" in item:
             # a thumbnail (url) is given - we will try to fetch it from the url
             url: str = item["thumbnail"]
-            response = await self.download_thumbnail_url(url)
+            # log the time before and after the request to measure how long the thumbnail download takes
+            time_start = datetime.datetime.now()
+            response: scrapy.http.Response | None = await self.download_thumbnail_url(url, spider)
+            time_end = datetime.datetime.now()
+            log.debug(f"Loading thumbnail from {url} took {time_end - time_start}.")
+            # guard against None values: download_thumbnail_url() returns no response for invalid URLs
+            if response and response.status != 200:
+                log.debug(f"Thumbnail-Pipeline received unexpected response (status: {response.status}) from {url}")
+                # ToDo: Error-handling necessary
+                pass
             log.debug(f"Thumbnail-URL-Cache after trying to query {url}: {self.download_thumbnail_url.cache_info()}")
         # nothing was given, we try to screenshot the page either via Splash or Playwright
         elif (
@@ -393,23 +404,27 @@ async def process_item(self, raw_item, spider):
             and item["lom"]["technical"]["format"] == "text/html"
         ):
             if settings.get("SPLASH_URL") and web_tools == WebEngine.Splash:
-                response = await self._client_async.post(
-                    settings.get("SPLASH_URL") + "/render.png",
-                    json={
+                splash_url: str = f"{settings.get('SPLASH_URL')}/render.png"
+                splash_wait: str = f"{settings.get('SPLASH_WAIT')}"
+                request_splash = scrapy.FormRequest(
+                    url=splash_url,
+                    formdata={
                         "url": item["lom"]["technical"]["location"][0],
                         # since there can be multiple "technical.location"-values, the first URL is used for thumbnails
-                        "wait": settings.get("SPLASH_WAIT"),
-                        "html5_media": 1,
+                        "wait": splash_wait,
+                        "html5_media": str(1),
                         "headers": settings.get("SPLASH_HEADERS"),
                     },
-                    timeout=30,
+                    callback=NO_CALLBACK
                 )
+                response = await maybe_deferred_to_future(spider.crawler.engine.download(request_splash))
             if env.get("PLAYWRIGHT_WS_ENDPOINT") and web_tools == WebEngine.Playwright:
                 # if the attribute "WEB_TOOLS" doesn't exist as an attribute within a specific spider,
                 # it will default back to "splash"

                 # this edge-case is necessary for spiders that only need playwright to gather a screenshot,
                 # but don't use playwright within the spider itself (e.g. serlo_spider)
+                # ToDo: change to scrapy.FormRequest?
                 playwright_dict = await WebTools.getUrlData(url=item["lom"]["technical"]["location"][0],
                                                             engine=WebEngine.Playwright)
                 screenshot_bytes = playwright_dict.get("screenshot_bytes")
@@ -423,26 +438,27 @@ async def process_item(self, raw_item, spider):
         if response is None:
             if settings.get("DISABLE_SPLASH") is False:
                 log.error(
-                    "Neither thumbnail or technical.location (and technical.format) provided! Please provide at least one of them"
+                    "Neither thumbnail nor technical.location (and technical.format) provided! "
+                    "Please provide at least one of them"
                 )
         else:
             try:
                 if response.headers["Content-Type"] == "image/svg+xml":
-                    if len(response.content) > settings.get("THUMBNAIL_MAX_SIZE"):
+                    if len(response.body) > settings.get("THUMBNAIL_MAX_SIZE"):
                         raise Exception(
                             "SVG images can't be converted, and the given image exceeds the maximum allowed size ("
-                            + str(len(response.content))
+                            + str(len(response.body))
                             + " > "
                             + str(settings.get("THUMBNAIL_MAX_SIZE"))
                             + ")"
                         )
                     item["thumbnail"] = {}
                     item["thumbnail"]["mimetype"] = response.headers["Content-Type"]
                     item["thumbnail"]["small"] = base64.b64encode(
-                        response.content
+                        response.body
                     ).decode()
                 else:
-                    img = Image.open(BytesIO(response.content))
+                    img = Image.open(BytesIO(response.body))
                     self.create_thumbnails_from_image_bytes(img, item, settings)
             except Exception as e:
                 if url is not None:
@@ -459,12 +475,12 @@ async def process_item(self, raw_item, spider):
         else:
             # item['thumbnail']={}
             raise DropItem(
-                "No thumbnail provided or ressource was unavailable for fetching"
+                "No thumbnail provided or resource was unavailable for fetching"
            )
        return raw_item

     @alru_cache(maxsize=128)
-    async def download_thumbnail_url(self, url: str):
+    async def download_thumbnail_url(self, url: str, spider: scrapy.Spider):
         """
         Download a thumbnail URL and **cache** the result.
@@ -476,29 +492,18 @@ async def download_thumbnail_url(self, url: str):
         Some webhosters serve generic placeholder images as their default thumbnail.
         By caching the response of such URLs, we can save a significant amount of time and traffic.
+        :param spider: The spider process that collected the URL.
         :param url: URL of a thumbnail/image.
         :return: Response or None
         """
         try:
-            response = await self._client_async.get(url=url, follow_redirects=True, timeout=60)
-            log.debug(
-                "Loading thumbnail took " + str(response.elapsed.total_seconds()) + "s"
+            request = scrapy.Request(url=url, callback=NO_CALLBACK)
+            response: scrapy.http.Response = await maybe_deferred_to_future(
+                spider.crawler.engine.download(request)
             )
-            if response:
-                return response
-            else:
-                return None
-        except httpx.ConnectError:
-            # some website hosts are super slow or throttle connections
-            log.warning(f"Thumbnail-Pipeline failed to establish a connection with URL {url}")
-        except httpx.ReadError:
-            log.warning(f"Thumbnail-Pipeline could not read data from URL {url}")
-        except httpx.RemoteProtocolError:
-            log.warning(f"Thumbnail-Pipeline received a malformed HTTP Response from {url}")
-        except httpx.PoolTimeout:
-            # ToDo: don't over-saturate the connection pool (-> debug with fobizz_spider)
-            log.warning(f"Thumbnail-Pipeline reached connection pool limit while trying to download {url}")
-            raise
+            return response
+        except ValueError:
+            log.debug(f"Thumbnail-Pipeline received an invalid URL: {url}")

 # override the project settings with the given ones from the current spider
 # see PR 56 for details
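One subtlety the diff leaves untouched: Scrapy stores HTTP header values as bytes, so the unchanged comparison response.headers["Content-Type"] == "image/svg+xml" (a str) will no longer match once response is a scrapy.http.Response; with httpx, header values were str. A minimal sketch of the adjustment, assuming a scrapy.http.Response:

content_type: bytes | None = response.headers.get("Content-Type")  # Scrapy header values are bytes, e.g. b"image/svg+xml"
if content_type == b"image/svg+xml":
    ...  # handle the SVG case
item["thumbnail"]["mimetype"] = content_type.decode() if content_type else None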

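Regarding the ToDo in the commit message (scheduling Playwright requests through Scrapy's scheduler): one possible direction, not part of this commit, is the third-party scrapy-playwright package, which routes Playwright page loads through Scrapy's download handlers. A rough sketch, assuming scrapy-playwright is installed and enabled via DOWNLOAD_HANDLERS and TWISTED_REACTOR in settings.py (spider name and URL are purely illustrative):

import scrapy
from scrapy_playwright.page import PageMethod

class ScreenshotSpider(scrapy.Spider):
    name = "screenshot_example"  # hypothetical spider, for illustration only

    def start_requests(self):
        yield scrapy.Request(
            "https://example.org",
            meta={
                "playwright": True,
                # scrapy-playwright executes each PageMethod on the open page
                # and stores its return value in the PageMethod's .result attribute
                "playwright_page_methods": [PageMethod("screenshot", full_page=True)],
            },
            callback=self.parse,
        )

    def parse(self, response):
        screenshot_bytes: bytes = response.meta["playwright_page_methods"][0].result
        self.logger.info(f"received screenshot ({len(screenshot_bytes)} bytes) for {response.url}")

Because such requests are handled by Scrapy's engine, screenshot traffic would stay under the same scheduler control that this commit establishes for Splash.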