Commit

replace 'httpx' in Thumbnail-Pipeline with Scrapy Requests for Splash
- the screenshot and thumbnail pipeline previously ran in parallel to Scrapy's built-in scheduler, which came with its own set of problems (bypassing the Scrapy scheduler meant that we could not control the load/traffic in a responsible manner); see the sketch below for the request pattern this commit adopts
- ToDo / work in progress: find a way to schedule Playwright requests with Scrapy's scheduler (see the sketch at the end of this page for one possible direction)
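The core pattern this commit adopts looks roughly like the following minimal sketch (the helper name is illustrative, not the actual pipeline code; see the diff below for the real implementation). Routing requests through spider.crawler.engine means that Scrapy's concurrency limits, download delays and AutoThrottle settings now apply to thumbnail downloads as well:

import scrapy
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.defer import maybe_deferred_to_future

async def download_via_engine(spider: scrapy.Spider, url: str) -> scrapy.http.Response:
    # NO_CALLBACK signals that the request does not belong to any spider callback;
    # it is nevertheless scheduled by the engine and passes through the downloader middlewares
    request = scrapy.Request(url=url, callback=NO_CALLBACK)
    # engine.download() returns a Twisted Deferred; convert it into an awaitable Future
    return await maybe_deferred_to_future(spider.crawler.engine.download(request))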

Signed-off-by: criamos <[email protected]>
Criamos committed Dec 1, 2023
1 parent fdc127f commit f0d9754
Showing 1 changed file with 42 additions and 37 deletions.
converter/pipelines.py
@@ -2,33 +2,36 @@

 from __future__ import annotations

+import asyncio
 import base64
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 import csv
 import datetime
 import logging
 import re
 import time
 from abc import ABCMeta
+from asyncio import Future
 from io import BytesIO
 from typing import BinaryIO, TextIO, Optional

 import dateparser
 import dateutil.parser
-import httpx
 import isodate
 import scrapy
 import scrapy.crawler
-from async_lru import alru_cache
 from PIL import Image
+from async_lru import alru_cache
 from itemadapter import ItemAdapter
 from scrapy import settings
 from scrapy.exceptions import DropItem
 from scrapy.exporters import JsonItemExporter
+from scrapy.http.request import NO_CALLBACK
+from scrapy.utils.defer import maybe_deferred_to_future
 from scrapy.utils.project import get_project_settings
+from twisted.internet.defer import Deferred

 from converter import env
 from converter.constants import *
@@ -340,7 +343,6 @@ class ProcessThumbnailPipeline(BasicPipeline):
     """
     generate thumbnails
     """
-    _client_async = httpx.AsyncClient()

     @staticmethod
     def scale_image(img, max_size):
@@ -365,7 +367,7 @@ async def process_item(self, raw_item, spider):
         -- alternatively, on-demand: use Playwright to take a screenshot, rescale and save (as above)
         """
         item = ItemAdapter(raw_item)
-        response = None
+        response: scrapy.http.Response | None = None
         url: str | None = None
         settings = get_settings_for_crawler(spider)
         # checking if the (optional) attribute WEB_TOOLS exists:
@@ -383,7 +385,16 @@ async def process_item(self, raw_item, spider):
         elif "thumbnail" in item:
             # a thumbnail (url) is given - we will try to fetch it from the url
             url: str = item["thumbnail"]
-            response = await self.download_thumbnail_url(url)
+            # log the time before and after the request to measure how long the thumbnail download takes
+            time_start = datetime.datetime.now()
+            response: scrapy.http.Response | None = await self.download_thumbnail_url(url, spider)
+            time_end = datetime.datetime.now()
+            log.debug(f"Loading thumbnail from {url} took {time_end - time_start}.")
+            # guard against None values: download_thumbnail_url() returns no response for invalid URLs
+            if response and response.status != 200:
+                log.debug(f"Thumbnail-Pipeline received unexpected response (status: {response.status}) from {url}")
+                # ToDo: Error-handling necessary
+                pass
             log.debug(f"Thumbnail-URL-Cache after trying to query {url}: {self.download_thumbnail_url.cache_info()}")
         # nothing was given, we try to screenshot the page either via Splash or Playwright
         elif (
@@ -393,23 +404,27 @@ async def process_item(self, raw_item, spider):
             and item["lom"]["technical"]["format"] == "text/html"
         ):
             if settings.get("SPLASH_URL") and web_tools == WebEngine.Splash:
-                response = await self._client_async.post(
-                    settings.get("SPLASH_URL") + "/render.png",
-                    json={
+                splash_url: str = f"{settings.get('SPLASH_URL')}/render.png"
+                splash_wait: str = f"{settings.get('SPLASH_WAIT')}"
+                request_splash = scrapy.FormRequest(
+                    url=splash_url,
+                    formdata={
                         "url": item["lom"]["technical"]["location"][0],
                         # since there can be multiple "technical.location"-values, the first URL is used for thumbnails
-                        "wait": settings.get("SPLASH_WAIT"),
-                        "html5_media": 1,
+                        "wait": splash_wait,
+                        "html5_media": str(1),
                         "headers": settings.get("SPLASH_HEADERS"),
                     },
-                    timeout=30,
+                    callback=NO_CALLBACK
                 )
+                response = await maybe_deferred_to_future(spider.crawler.engine.download(request_splash))
             if env.get("PLAYWRIGHT_WS_ENDPOINT") and web_tools == WebEngine.Playwright:
                 # if the attribute "WEB_TOOLS" doesn't exist as an attribute within a specific spider,
                 # it will default back to "splash"

                 # this edge-case is necessary for spiders that only need playwright to gather a screenshot,
                 # but don't use playwright within the spider itself (e.g. serlo_spider)
+                # ToDo: change to scrapy.FormRequest?
                 playwright_dict = await WebTools.getUrlData(url=item["lom"]["technical"]["location"][0],
                                                             engine=WebEngine.Playwright)
                 screenshot_bytes = playwright_dict.get("screenshot_bytes")
@@ -423,26 +438,27 @@ async def process_item(self, raw_item, spider):
         if response is None:
             if settings.get("DISABLE_SPLASH") is False:
                 log.error(
-                    "Neither thumbnail or technical.location (and technical.format) provided! Please provide at least one of them"
+                    "Neither thumbnail nor technical.location (and technical.format) provided! "
+                    "Please provide at least one of them"
                 )
         else:
             try:
                 if response.headers["Content-Type"] == "image/svg+xml":
-                    if len(response.content) > settings.get("THUMBNAIL_MAX_SIZE"):
+                    if len(response.body) > settings.get("THUMBNAIL_MAX_SIZE"):
                         raise Exception(
                             "SVG images can't be converted, and the given image exceeds the maximum allowed size ("
-                            + str(len(response.content))
+                            + str(len(response.body))
                             + " > "
                             + str(settings.get("THUMBNAIL_MAX_SIZE"))
                             + ")"
                         )
                     item["thumbnail"] = {}
                     item["thumbnail"]["mimetype"] = response.headers["Content-Type"]
                     item["thumbnail"]["small"] = base64.b64encode(
-                        response.content
+                        response.body
                     ).decode()
                 else:
-                    img = Image.open(BytesIO(response.content))
+                    img = Image.open(BytesIO(response.body))
                     self.create_thumbnails_from_image_bytes(img, item, settings)
             except Exception as e:
                 if url is not None:
@@ -459,12 +475,12 @@ async def process_item(self, raw_item, spider):
         else:
             # item['thumbnail']={}
             raise DropItem(
-                "No thumbnail provided or ressource was unavailable for fetching"
+                "No thumbnail provided or resource was unavailable for fetching"
            )
        return raw_item

     @alru_cache(maxsize=128)
-    async def download_thumbnail_url(self, url: str):
+    async def download_thumbnail_url(self, url: str, spider: scrapy.Spider):
         """
         Download a thumbnail URL and **cache** the result.
@@ -476,29 +492,18 @@ async def download_thumbnail_url(self, url: str):
         Some webhosters serve generic placeholder images as their default thumbnail.
         By caching the response of such URLs, we can save a significant amount of time and traffic.
+        :param spider: The spider process that collected the URL.
         :param url: URL of a thumbnail/image.
         :return: Response or None
         """
         try:
-            response = await self._client_async.get(url=url, follow_redirects=True, timeout=60)
-            log.debug(
-                "Loading thumbnail took " + str(response.elapsed.total_seconds()) + "s"
+            request = scrapy.Request(url=url, callback=NO_CALLBACK)
+            response: scrapy.http.Response = await maybe_deferred_to_future(
+                spider.crawler.engine.download(request)
             )
-            if response:
-                return response
-            else:
-                return None
-        except httpx.ConnectError:
-            # some website hosts are super slow or throttle connections
-            log.warning(f"Thumbnail-Pipeline failed to establish a connection with URL {url}")
-        except httpx.ReadError:
-            log.warning(f"Thumbnail-Pipeline could not read data from URL {url}")
-        except httpx.RemoteProtocolError:
-            log.warning(f"Thumbnail-Pipeline received a malformed HTTP Response from {url}")
-        except httpx.PoolTimeout:
-            # ToDo: don't over-saturate the connection pool (-> debug with fobizz_spider)
-            log.warning(f"Thumbnail-Pipeline reached connection pool limit while trying to download {url}")
-            raise
+            return response
+        except ValueError:
+            log.debug(f"Thumbnail-Pipeline received an invalid URL: {url}")

 # override the project settings with the given ones from the current spider
 # see PR 56 for details
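One subtlety the diff leaves untouched: Scrapy stores HTTP header values as bytes, so the unchanged comparison response.headers["Content-Type"] == "image/svg+xml" (a str) will no longer match once response is a scrapy.http.Response; with httpx, header values were str. A minimal sketch of the adjustment, assuming a scrapy.http.Response:

content_type: bytes | None = response.headers.get("Content-Type")  # Scrapy header values are bytes, e.g. b"image/svg+xml"
if content_type == b"image/svg+xml":
    ...  # handle the SVG case
item["thumbnail"]["mimetype"] = content_type.decode() if content_type else None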

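Regarding the ToDo in the commit message (scheduling Playwright requests through Scrapy's scheduler): one possible direction, not part of this commit, is the third-party scrapy-playwright package, which routes Playwright page loads through Scrapy's download handlers. A rough sketch, assuming scrapy-playwright is installed and enabled via DOWNLOAD_HANDLERS and TWISTED_REACTOR in settings.py (spider name and URL are purely illustrative):

import scrapy
from scrapy_playwright.page import PageMethod

class ScreenshotSpider(scrapy.Spider):
    name = "screenshot_example"  # hypothetical spider, for illustration only

    def start_requests(self):
        yield scrapy.Request(
            "https://example.org",
            meta={
                "playwright": True,
                # scrapy-playwright executes each PageMethod on the open page
                # and stores its return value in the PageMethod's .result attribute
                "playwright_page_methods": [PageMethod("screenshot", full_page=True)],
            },
            callback=self.parse,
        )

    def parse(self, response):
        screenshot_bytes: bytes = response.meta["playwright_page_methods"][0].result
        self.logger.info(f"received screenshot ({len(screenshot_bytes)} bytes) for {response.url}")

Because such requests are handled by Scrapy's engine, screenshot traffic would stay under the same scheduler control that this commit establishes for Splash.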