Skip to content

Commit

Permalink
feat: improve Splash Error-Handling for unsupported URLs
Browse files Browse the repository at this point in the history
- feat: use 'scrapy.FormRequest'-objects to handle Splash Requests (Splash is queried within the Scrapy Scheduler from now on)
- feat: fallback to Playwright if Splash failed to retrieve Thumbnails
-- if Splash fails to render a website, it will set the splash_success flag to False and use Playwright instead
-- Splash will be DEPRECATED in the future since it has proven itself more and more unreliable
- fix: several warnings regarding shadowed variables

Signed-off-by: criamos <[email protected]>
  • Loading branch information
Criamos committed Dec 5, 2023
1 parent f0d9754 commit 6ee9280
Showing 1 changed file with 46 additions and 24 deletions.
70 changes: 46 additions & 24 deletions converter/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,16 +369,17 @@ async def process_item(self, raw_item, spider):
item = ItemAdapter(raw_item)
response: scrapy.http.Response | None = None
url: str | None = None
settings = get_settings_for_crawler(spider)
settings_crawler = get_settings_for_crawler(spider)
# checking if the (optional) attribute WEB_TOOLS exists:
web_tools = settings.get("WEB_TOOLS", WebEngine.Splash)
web_tools = settings_crawler.get("WEB_TOOLS", default=WebEngine.Splash)
_splash_success: bool = True # control flag flips to False if Splash can't handle a URL
# if screenshot_bytes is provided (the crawler has already a binary representation of the image
# the pipeline will convert/scale the given image
if "screenshot_bytes" in item:
# in case we are already using playwright in a spider, we can skip one additional HTTP Request by
# accessing the (temporary available) "screenshot_bytes"-field
img = Image.open(BytesIO(item["screenshot_bytes"]))
self.create_thumbnails_from_image_bytes(img, item, settings)
self.create_thumbnails_from_image_bytes(img, item, settings_crawler)
# The final BaseItem data model doesn't use screenshot_bytes.
# Therefore, we delete it after we're done with processing it
del item["screenshot_bytes"]
Expand All @@ -403,53 +404,74 @@ async def process_item(self, raw_item, spider):
and "format" in item["lom"]["technical"]
and item["lom"]["technical"]["format"] == "text/html"
):
if settings.get("SPLASH_URL") and web_tools == WebEngine.Splash:
splash_url: str = f"{settings.get('SPLASH_URL')}/render.png"
splash_wait: str = f"{settings.get('SPLASH_WAIT')}"
if settings_crawler.get("SPLASH_URL") and web_tools == WebEngine.Splash:
target_url: str = item["lom"]["technical"]["location"][0]
_splash_url: str = f"{settings_crawler.get('SPLASH_URL')}/render.png"
_splash_parameter_wait: str = f"{settings_crawler.get('SPLASH_WAIT')}"
_splash_parameter_html5media: str = str(1)
_splash_headers: dict = settings_crawler.get("SPLASH_HEADERS")
_splash_dict: dict = {
"url": target_url,
"wait": _splash_parameter_wait,
"html5_media": _splash_parameter_wait,
"headers": _splash_headers
}
request_splash = scrapy.FormRequest(
url=splash_url,
formdata={
"url": item["lom"]["technical"]["location"][0],
# since there can be multiple "technical.location"-values, the first URL is used for thumbnails
"wait": splash_wait,
"html5_media": str(1),
"headers": settings.get("SPLASH_HEADERS"),
},
url=_splash_url,
formdata=_splash_dict,
callback=NO_CALLBACK
)
response = await maybe_deferred_to_future(spider.crawler.engine.download(request_splash))
if env.get("PLAYWRIGHT_WS_ENDPOINT") and web_tools == WebEngine.Playwright:
splash_response: scrapy.http.Response = await maybe_deferred_to_future(
spider.crawler.engine.download(request_splash)
)
if splash_response and splash_response.status != 200:
log.debug(f"SPLASH could not handle the requested website. "
f"(Splash returned HTTP Status {splash_response.status} for {target_url} !)")
_splash_success = False
# ToDo: Error-Handling for unsupported URLs
if splash_response.status == 415:
log.debug(f"SPLASH (HTTP Status {splash_response.status} -> Unsupported Media Type): "
f"Could not render target url {target_url}")
elif splash_response:
response: scrapy.http.Response = splash_response
else:
# ToDo: if Splash error's out -> Fallback to Playwright?
log.debug(f"SPLASH returned {splash_response.status} for {target_url} ")

if (_splash_success is False and env.get("PLAYWRIGHT_WS_ENDPOINT")
or env.get("PLAYWRIGHT_WS_ENDPOINT") and web_tools == WebEngine.Playwright):
# if the attribute "WEB_TOOLS" doesn't exist as an attribute within a specific spider,
# it will default back to "splash"

# this edge-case is necessary for spiders that only need playwright to gather a screenshot,
# but don't use playwright within the spider itself (e.g. serlo_spider)
# ToDo: change to scrapy.FormRequest?
playwright_dict = await WebTools.getUrlData(url=item["lom"]["technical"]["location"][0],
target_url: str = item["lom"]["technical"]["location"][0]
playwright_dict = await WebTools.getUrlData(url=target_url,
engine=WebEngine.Playwright)
screenshot_bytes = playwright_dict.get("screenshot_bytes")
img = Image.open(BytesIO(screenshot_bytes))
self.create_thumbnails_from_image_bytes(img, item, settings)
self.create_thumbnails_from_image_bytes(img, item, settings_crawler)
else:
if settings.get("DISABLE_SPLASH") is False:
if settings_crawler.get("DISABLE_SPLASH") is False:
log.warning(
"No thumbnail provided and SPLASH_URL was not configured for screenshots!"
)
if response is None:
if settings.get("DISABLE_SPLASH") is False:
if settings_crawler.get("DISABLE_SPLASH") is False:
log.error(
"Neither thumbnail or technical.location (and technical.format) provided! "
"Please provide at least one of them"
)
else:
try:
if response.headers["Content-Type"] == "image/svg+xml":
if len(response.body) > settings.get("THUMBNAIL_MAX_SIZE"):
if len(response.body) > settings_crawler.get("THUMBNAIL_MAX_SIZE"):
raise Exception(
"SVG images can't be converted, and the given image exceeds the maximum allowed size ("
+ str(len(response.body))
+ " > "
+ str(settings.get("THUMBNAIL_MAX_SIZE"))
+ str(settings_crawler.get("THUMBNAIL_MAX_SIZE"))
+ ")"
)
item["thumbnail"] = {}
Expand All @@ -459,7 +481,7 @@ async def process_item(self, raw_item, spider):
).decode()
else:
img = Image.open(BytesIO(response.body))
self.create_thumbnails_from_image_bytes(img, item, settings)
self.create_thumbnails_from_image_bytes(img, item, settings_crawler)
except Exception as e:
if url is not None:
log.warning(
Expand Down Expand Up @@ -533,7 +555,7 @@ def create_thumbnails_from_image_bytes(self, image, item, settings):
).decode()


def get_settings_for_crawler(spider):
def get_settings_for_crawler(spider) -> scrapy.settings.Settings:
all_settings = get_project_settings()
crawler_settings = settings.BaseSettings(getattr(spider, "custom_settings") or {}, 'spider')
if type(crawler_settings) == dict:
Expand Down

0 comments on commit 6ee9280

Please sign in to comment.