feat/perf: use LRU cache for Thumbnail-Pipeline
- feat: implements a cache for "thumbnail"-URLs that discards the 'least recently used' items first
-- while debugging large crawlers, we observed that many website hosts serve generic placeholder-like images for items that don't have a unique thumbnail: by keeping the most frequently requested URLs (and their responses) in a cache, we should be able to significantly reduce the number of outgoing requests and the amount of traffic (and improve performance); see the minimal sketch after the ToDo list below

ToDos:
- observed PoolTimeout exceptions while debugging (WIP)
-- ToDo: don't flood the httpx client with hundreds of concurrent connections when a crawler yields items too quickly (fobizz_spider); a possible mitigation is sketched below the diff
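
For illustration, here is a minimal, self-contained sketch of the caching behavior this commit relies on (the async-lru library is the one imported in the diff below; the `fetch` coroutine and the placeholder URL are invented for this example and are not pipeline code):

import asyncio

from async_lru import alru_cache


@alru_cache(maxsize=128)  # discards the least recently used entries first
async def fetch(url: str) -> str:
    """Stand-in for an HTTP request; the body only runs on a cache miss."""
    await asyncio.sleep(0.1)
    return f"response for {url}"


async def main() -> None:
    await fetch("https://example.org/placeholder.png")  # cache miss: "request" happens
    await fetch("https://example.org/placeholder.png")  # cache hit: no second "request"
    # cache_info() mirrors functools.lru_cache:
    print(fetch.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)


asyncio.run(main())

Each distinct URL is awaited at most once while it stays in the cache; once more than 128 distinct URLs have been seen, the least recently used entry is evicted.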
Criamos committed Dec 1, 2023
1 parent fdbbd7f commit fdc127f
Showing 1 changed file with 44 additions and 14 deletions.
converter/pipelines.py (58 changes: 44 additions & 14 deletions)
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import base64
 # Define your item pipelines here
 #
@@ -21,6 +22,7 @@
 import isodate
 import scrapy
 import scrapy.crawler
+from async_lru import alru_cache
 from PIL import Image
 from itemadapter import ItemAdapter
 from scrapy import settings
@@ -364,7 +366,7 @@ async def process_item(self, raw_item, spider):
         """
         item = ItemAdapter(raw_item)
         response = None
-        url = None
+        url: str | None = None
         settings = get_settings_for_crawler(spider)
         # checking if the (optional) attribute WEB_TOOLS exists:
         web_tools = settings.get("WEB_TOOLS", WebEngine.Splash)
@@ -378,20 +380,11 @@ async def process_item(self, raw_item, spider):
             # The final BaseItem data model doesn't use screenshot_bytes.
             # Therefore, we delete it after we're done with processing it
             del item["screenshot_bytes"]
-
-        # a thumbnail (url) is given - we will try to fetch it from the url
         elif "thumbnail" in item:
-            url = item["thumbnail"]
-            try:
-                response = await self._client_async.get(url=url, follow_redirects=True, timeout=60)
-                log.debug(
-                    "Loading thumbnail took " + str(response.elapsed.total_seconds()) + "s"
-                )
-            except httpx.ConnectError:
-                # some website hosts are super slow or throttle connections
-                log.warning(f"Thumbnail-Pipeline failed to establish a connection with URL {url}")
-            except httpx.ReadError:
-                log.warning(f"Thumbnail-Pipeline could not read data from URL {url}")
+            # a thumbnail (url) is given - we will try to fetch it from the url
+            url: str = item["thumbnail"]
+            response = await self.download_thumbnail_url(url)
+            log.debug(f"Thumbnail-URL-Cache after trying to query {url}: {self.download_thumbnail_url.cache_info()}")
         # nothing was given, we try to screenshot the page either via Splash or Playwright
         elif (
             "location" in item["lom"]["technical"]
@@ -470,6 +463,43 @@ async def process_item(self, raw_item, spider):
             )
         return raw_item
 
+    @alru_cache(maxsize=128)
+    async def download_thumbnail_url(self, url: str):
+        """
+        Downloads a thumbnail URL and **caches** the result.
+        The cache works similarly to Python's built-in `functools.lru_cache` decorator and discards the
+        least recently used items first.
+        (see: https://github.com/aio-libs/async-lru)
+
+        Typical use case:
+        Some web hosts serve generic placeholder images as their default thumbnail.
+        By caching the response for such URLs, we can save a significant amount of time and traffic.
+
+        :param url: URL of a thumbnail/image.
+        :return: Response or None
+        """
+        try:
+            response = await self._client_async.get(url=url, follow_redirects=True, timeout=60)
+            log.debug(
+                "Loading thumbnail took " + str(response.elapsed.total_seconds()) + "s"
+            )
+            if response:
+                return response
+            else:
+                return None
+        except httpx.ConnectError:
+            # some website hosts are super slow or throttle connections
+            log.warning(f"Thumbnail-Pipeline failed to establish a connection with URL {url}")
+        except httpx.ReadError:
+            log.warning(f"Thumbnail-Pipeline could not read data from URL {url}")
+        except httpx.RemoteProtocolError:
+            log.warning(f"Thumbnail-Pipeline received a malformed HTTP response from {url}")
+        except httpx.PoolTimeout:
+            # ToDo: don't over-saturate the connection pool (-> debug with fobizz_spider)
+            log.warning(f"Thumbnail-Pipeline reached the connection pool limit while trying to download {url}")
+            raise
+
 # override the project settings with the given ones from the current spider
 # see PR 56 for details
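Regarding the PoolTimeout ToDo above, one possible mitigation (a sketch under assumptions, not part of this commit: the limit values and the fetch_thumbnail helper are invented for illustration) is to cap the number of in-flight downloads with an asyncio.Semaphore and to configure explicit httpx.Limits, so that fast spiders queue up instead of exhausting the connection pool:

import asyncio

import httpx

# assumed values for illustration; tune against real crawler behavior
semaphore = asyncio.Semaphore(10)  # at most 10 thumbnail downloads in flight
client = httpx.AsyncClient(
    limits=httpx.Limits(max_connections=20, max_keepalive_connections=10),
    timeout=httpx.Timeout(60.0, pool=None),  # pool=None: wait for a free connection
)


async def fetch_thumbnail(url: str) -> httpx.Response:
    # the semaphore queues excess callers instead of flooding the client
    async with semaphore:
        return await client.get(url, follow_redirects=True)

With pool=None a caller waits indefinitely for a free connection rather than raising httpx.PoolTimeout; whether waiting or failing fast is preferable would need to be verified against the fobizz_spider behavior.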
