feat/perf: use LRU cache for Thumbnail-Pipeline
- feat: implements a cache for "thumbnail"-URLs that discards the 'least recently used' items first
-- while debugging large crawlers, we observed that many website hosts serve generic placeholder-like images for items that don't have a unique thumbnail: by keeping the most frequently requested URLs (and their responses) in a cache, we should be able to significantly reduce the number of outgoing requests and the amount of traffic (and improve performance); see the minimal sketch after the ToDo list below

ToDos:
- observed PoolTimeout exceptions while debugging (WIP)
-- ToDo: don't flood the httpx client with hundreds of concurrent connections when a crawler yields items too quickly (fobizz_spider); a possible mitigation is sketched below the diff
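
For illustration, here is a minimal, self-contained sketch of the caching behavior this commit relies on (the async-lru library is the one imported in the diff below; the `fetch` coroutine and the placeholder URL are invented for this example and are not pipeline code):

import asyncio

from async_lru import alru_cache


@alru_cache(maxsize=128)  # discards the least recently used entries first
async def fetch(url: str) -> str:
    """Stand-in for an HTTP request; the body only runs on a cache miss."""
    await asyncio.sleep(0.1)
    return f"response for {url}"


async def main() -> None:
    await fetch("https://example.org/placeholder.png")  # cache miss: "request" happens
    await fetch("https://example.org/placeholder.png")  # cache hit: no second "request"
    # cache_info() mirrors functools.lru_cache:
    print(fetch.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)


asyncio.run(main())

Each distinct URL is awaited at most once while it stays in the cache; once more than 128 distinct URLs have been seen, the least recently used entry is evicted.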
Criamos committed Dec 1, 2023
1 parent fdbbd7f commit fdc127f
Showing 1 changed file with 44 additions and 14 deletions.
converter/pipelines.py (58 changes: 44 additions & 14 deletions)
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import base64
 # Define your item pipelines here
 #
@@ -21,6 +22,7 @@
 import isodate
 import scrapy
 import scrapy.crawler
+from async_lru import alru_cache
 from PIL import Image
 from itemadapter import ItemAdapter
 from scrapy import settings
@@ -364,7 +366,7 @@ async def process_item(self, raw_item, spider):
         """
         item = ItemAdapter(raw_item)
         response = None
-        url = None
+        url: str | None = None
         settings = get_settings_for_crawler(spider)
         # checking if the (optional) attribute WEB_TOOLS exists:
         web_tools = settings.get("WEB_TOOLS", WebEngine.Splash)
@@ -378,20 +380,11 @@ async def process_item(self, raw_item, spider):
             # The final BaseItem data model doesn't use screenshot_bytes.
             # Therefore, we delete it after we're done with processing it
             del item["screenshot_bytes"]
-
-        # a thumbnail (url) is given - we will try to fetch it from the url
         elif "thumbnail" in item:
-            url = item["thumbnail"]
-            try:
-                response = await self._client_async.get(url=url, follow_redirects=True, timeout=60)
-                log.debug(
-                    "Loading thumbnail took " + str(response.elapsed.total_seconds()) + "s"
-                )
-            except httpx.ConnectError:
-                # some website hosts are super slow or throttle connections
-                log.warning(f"Thumbnail-Pipeline failed to establish a connection with URL {url}")
-            except httpx.ReadError:
-                log.warning(f"Thumbnail-Pipeline could not read data from URL {url}")
+            # a thumbnail (url) is given - we will try to fetch it from the url
+            url: str = item["thumbnail"]
+            response = await self.download_thumbnail_url(url)
+            log.debug(f"Thumbnail-URL-Cache after trying to query {url}: {self.download_thumbnail_url.cache_info()}")
         # nothing was given, we try to screenshot the page either via Splash or Playwright
         elif (
             "location" in item["lom"]["technical"]
@@ -470,6 +463,43 @@ async def process_item(self, raw_item, spider):
             )
         return raw_item
 
+    @alru_cache(maxsize=128)
+    async def download_thumbnail_url(self, url: str):
+        """
+        Downloads a thumbnail URL and **caches** the result.
+        The cache works similarly to Python's built-in `functools.lru_cache` decorator and discards the
+        least recently used items first.
+        (see: https://github.com/aio-libs/async-lru)
+
+        Typical use case:
+        Some web hosts serve generic placeholder images as their default thumbnail.
+        By caching the response for such URLs, we can save a significant amount of time and traffic.
+
+        :param url: URL of a thumbnail/image.
+        :return: Response or None
+        """
+        try:
+            response = await self._client_async.get(url=url, follow_redirects=True, timeout=60)
+            log.debug(
+                "Loading thumbnail took " + str(response.elapsed.total_seconds()) + "s"
+            )
+            if response:
+                return response
+            else:
+                return None
+        except httpx.ConnectError:
+            # some website hosts are super slow or throttle connections
+            log.warning(f"Thumbnail-Pipeline failed to establish a connection with URL {url}")
+        except httpx.ReadError:
+            log.warning(f"Thumbnail-Pipeline could not read data from URL {url}")
+        except httpx.RemoteProtocolError:
+            log.warning(f"Thumbnail-Pipeline received a malformed HTTP response from {url}")
+        except httpx.PoolTimeout:
+            # ToDo: don't over-saturate the connection pool (-> debug with fobizz_spider)
+            log.warning(f"Thumbnail-Pipeline reached the connection pool limit while trying to download {url}")
+            raise
+
 # override the project settings with the given ones from the current spider
 # see PR 56 for details
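Regarding the PoolTimeout ToDo above, one possible mitigation (a sketch under assumptions, not part of this commit: the limit values and the fetch_thumbnail helper are invented for illustration) is to cap the number of in-flight downloads with an asyncio.Semaphore and to configure explicit httpx.Limits, so that fast spiders queue up instead of exhausting the connection pool:

import asyncio

import httpx

# assumed values for illustration; tune against real crawler behavior
semaphore = asyncio.Semaphore(10)  # at most 10 thumbnail downloads in flight
client = httpx.AsyncClient(
    limits=httpx.Limits(max_connections=20, max_keepalive_connections=10),
    timeout=httpx.Timeout(60.0, pool=None),  # pool=None: wait for a free connection
)


async def fetch_thumbnail(url: str) -> httpx.Response:
    # the semaphore queues excess callers instead of flooding the client
    async with semaphore:
        return await client.get(url, follow_redirects=True)

With pool=None a caller waits indefinitely for a free connection rather than raising httpx.PoolTimeout; whether waiting or failing fast is preferable would need to be verified against the fobizz_spider behavior.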
