Skip to content

Commit

Permalink
feat: improve Splash Error-Handling for unsupported URLs
Browse files Browse the repository at this point in the history
- feat: use 'scrapy.FormRequest'-objects to handle Splash Requests (Splash is queried within the Scrapy Scheduler from now on)
- feat: fallback to Playwright if Splash failed to retrieve Thumbnails
-- if Splash fails to render a website, it will set the splash_success flag to False and use Playwright instead
-- Splash will be DEPRECATED in the future since it has proven itself more and more unreliable
- fix: several warnings regarding shadowed variables

Signed-off-by: criamos <[email protected]>
  • Loading branch information
Criamos committed Dec 5, 2023
1 parent f0d9754 commit 6ee9280
Showing 1 changed file with 46 additions and 24 deletions.
70 changes: 46 additions & 24 deletions converter/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,16 +369,17 @@ async def process_item(self, raw_item, spider):
item = ItemAdapter(raw_item)
response: scrapy.http.Response | None = None
url: str | None = None
settings = get_settings_for_crawler(spider)
settings_crawler = get_settings_for_crawler(spider)
# checking if the (optional) attribute WEB_TOOLS exists:
web_tools = settings.get("WEB_TOOLS", WebEngine.Splash)
web_tools = settings_crawler.get("WEB_TOOLS", default=WebEngine.Splash)
_splash_success: bool = True # control flag flips to False if Splash can't handle a URL
# if screenshot_bytes is provided (the crawler has already a binary representation of the image
# the pipeline will convert/scale the given image
if "screenshot_bytes" in item:
# in case we are already using playwright in a spider, we can skip one additional HTTP Request by
# accessing the (temporary available) "screenshot_bytes"-field
img = Image.open(BytesIO(item["screenshot_bytes"]))
self.create_thumbnails_from_image_bytes(img, item, settings)
self.create_thumbnails_from_image_bytes(img, item, settings_crawler)
# The final BaseItem data model doesn't use screenshot_bytes.
# Therefore, we delete it after we're done with processing it
del item["screenshot_bytes"]
Expand All @@ -403,53 +404,74 @@ async def process_item(self, raw_item, spider):
and "format" in item["lom"]["technical"]
and item["lom"]["technical"]["format"] == "text/html"
):
if settings.get("SPLASH_URL") and web_tools == WebEngine.Splash:
splash_url: str = f"{settings.get('SPLASH_URL')}/render.png"
splash_wait: str = f"{settings.get('SPLASH_WAIT')}"
if settings_crawler.get("SPLASH_URL") and web_tools == WebEngine.Splash:
target_url: str = item["lom"]["technical"]["location"][0]
_splash_url: str = f"{settings_crawler.get('SPLASH_URL')}/render.png"
_splash_parameter_wait: str = f"{settings_crawler.get('SPLASH_WAIT')}"
_splash_parameter_html5media: str = str(1)
_splash_headers: dict = settings_crawler.get("SPLASH_HEADERS")
_splash_dict: dict = {
"url": target_url,
"wait": _splash_parameter_wait,
"html5_media": _splash_parameter_wait,
"headers": _splash_headers
}
request_splash = scrapy.FormRequest(
url=splash_url,
formdata={
"url": item["lom"]["technical"]["location"][0],
# since there can be multiple "technical.location"-values, the first URL is used for thumbnails
"wait": splash_wait,
"html5_media": str(1),
"headers": settings.get("SPLASH_HEADERS"),
},
url=_splash_url,
formdata=_splash_dict,
callback=NO_CALLBACK
)
response = await maybe_deferred_to_future(spider.crawler.engine.download(request_splash))
if env.get("PLAYWRIGHT_WS_ENDPOINT") and web_tools == WebEngine.Playwright:
splash_response: scrapy.http.Response = await maybe_deferred_to_future(
spider.crawler.engine.download(request_splash)
)
if splash_response and splash_response.status != 200:
log.debug(f"SPLASH could not handle the requested website. "
f"(Splash returned HTTP Status {splash_response.status} for {target_url} !)")
_splash_success = False
# ToDo: Error-Handling for unsupported URLs
if splash_response.status == 415:
log.debug(f"SPLASH (HTTP Status {splash_response.status} -> Unsupported Media Type): "
f"Could not render target url {target_url}")
elif splash_response:
response: scrapy.http.Response = splash_response
else:
# ToDo: if Splash error's out -> Fallback to Playwright?
log.debug(f"SPLASH returned {splash_response.status} for {target_url} ")

if (_splash_success is False and env.get("PLAYWRIGHT_WS_ENDPOINT")
or env.get("PLAYWRIGHT_WS_ENDPOINT") and web_tools == WebEngine.Playwright):
# if the attribute "WEB_TOOLS" doesn't exist as an attribute within a specific spider,
# it will default back to "splash"

# this edge-case is necessary for spiders that only need playwright to gather a screenshot,
# but don't use playwright within the spider itself (e.g. serlo_spider)
# ToDo: change to scrapy.FormRequest?
playwright_dict = await WebTools.getUrlData(url=item["lom"]["technical"]["location"][0],
target_url: str = item["lom"]["technical"]["location"][0]
playwright_dict = await WebTools.getUrlData(url=target_url,
engine=WebEngine.Playwright)
screenshot_bytes = playwright_dict.get("screenshot_bytes")
img = Image.open(BytesIO(screenshot_bytes))
self.create_thumbnails_from_image_bytes(img, item, settings)
self.create_thumbnails_from_image_bytes(img, item, settings_crawler)
else:
if settings.get("DISABLE_SPLASH") is False:
if settings_crawler.get("DISABLE_SPLASH") is False:
log.warning(
"No thumbnail provided and SPLASH_URL was not configured for screenshots!"
)
if response is None:
if settings.get("DISABLE_SPLASH") is False:
if settings_crawler.get("DISABLE_SPLASH") is False:
log.error(
"Neither thumbnail or technical.location (and technical.format) provided! "
"Please provide at least one of them"
)
else:
try:
if response.headers["Content-Type"] == "image/svg+xml":
if len(response.body) > settings.get("THUMBNAIL_MAX_SIZE"):
if len(response.body) > settings_crawler.get("THUMBNAIL_MAX_SIZE"):
raise Exception(
"SVG images can't be converted, and the given image exceeds the maximum allowed size ("
+ str(len(response.body))
+ " > "
+ str(settings.get("THUMBNAIL_MAX_SIZE"))
+ str(settings_crawler.get("THUMBNAIL_MAX_SIZE"))
+ ")"
)
item["thumbnail"] = {}
Expand All @@ -459,7 +481,7 @@ async def process_item(self, raw_item, spider):
).decode()
else:
img = Image.open(BytesIO(response.body))
self.create_thumbnails_from_image_bytes(img, item, settings)
self.create_thumbnails_from_image_bytes(img, item, settings_crawler)
except Exception as e:
if url is not None:
log.warning(
Expand Down Expand Up @@ -533,7 +555,7 @@ def create_thumbnails_from_image_bytes(self, image, item, settings):
).decode()


def get_settings_for_crawler(spider):
def get_settings_for_crawler(spider) -> scrapy.settings.Settings:
all_settings = get_project_settings()
crawler_settings = settings.BaseSettings(getattr(spider, "custom_settings") or {}, 'spider')
if type(crawler_settings) == dict:
Expand Down

0 comments on commit 6ee9280

Please sign in to comment.