From 3e31cb1168a0aad50ee9ba3850db240f47fca201 Mon Sep 17 00:00:00 2001
From: criamos <981166+Criamos@users.noreply.github.com>
Date: Fri, 25 Oct 2024 18:36:33 +0200
Subject: [PATCH] planet_n_spider v0.0.2

- feat: clean up strings with HTML entities for Planet-N's "description", "excerpt" and "title" values
  - Planet-N provides strings via their API that have HTML entities and trailing newlines
- fix: crawler init
- change: "LOM general description" prefers the longer description texts (if available) instead of the (mostly one-sentence) excerpts
---
 converter/spiders/planet_n_spider.py | 38 +++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/converter/spiders/planet_n_spider.py b/converter/spiders/planet_n_spider.py
index 4299093d..699ee249 100644
--- a/converter/spiders/planet_n_spider.py
+++ b/converter/spiders/planet_n_spider.py
@@ -1,7 +1,9 @@
+import html
 import re
 from typing import Any, Iterable
 
 import scrapy
+from bs4 import BeautifulSoup
 from scrapy import Request
 from scrapy.http import Response
 
@@ -26,7 +28,7 @@
 class PlanetNSpider(scrapy.Spider, LomBase):
     name = "planet_n_spider"
     friendlyName = "Planet N"
-    version = "0.0.1"
+    version = "0.0.2"
     custom_settings = {
         "AUTOTHROTTLE_ENABLED": True,
         "AUTOTHROTTLE_DEBUG": True,
@@ -34,6 +36,9 @@ class PlanetNSpider(scrapy.Spider, LomBase):
         "WEB_TOOLS": WebEngine.Playwright,
     }
 
+    def __init__(self, **kwargs):
+        LomBase.__init__(self, **kwargs)  # NOTE(review): scrapy.Spider.__init__ is not called here -- confirm intended
+
     def start_requests(self) -> Iterable[Request]:
         _api_first_page: str = self.build_wp_json_module_list_request_url()
         yield scrapy.Request(
@@ -171,6 +176,21 @@ def check_if_item_should_be_dropped(self, response: scrapy.http.Response, wp_ite
             _drop_item_flag = True
         return _drop_item_flag
 
+    def clean_up_string(self, text: str) -> str | None:
+        """Strip HTML markup/entities and surrounding whitespace from ``text``; ``None`` for empty/non-str input."""
+        _text_clean: str | None = None
+        if text and isinstance(text, str):
+            # strings from Planet-N will contain HTML entities; get_text() returns the parsed plain text
+            _soup = BeautifulSoup(text, "html.parser")
+            _text_soup: str = _soup.get_text()
+            if _text_soup:  # an empty result falls through to an implicit ``None`` return
+                # Planet-N's strings contain trailing whitespace or newlines; strip() trims both ends:
+                _text_clean = _text_soup.strip()
+                return _text_clean
+        else:
+            self.logger.warning(f"Received unhandled input: provided 'text'-parameter was of type '{type(text)}'")
+            return None
+
     def parse(self, response: Response, **kwargs: Any):
         try:
             # the JSON response from Planet-N's WordPress API:
@@ -197,16 +217,26 @@ def parse(self, response: Response, **kwargs: Any):
             wp_date_modified: str = wp_item["modified"]
         wp_description_long: str | None = None
         if "description" in wp_item:
+            # description text will contain HTML entities
             wp_description_long: str = wp_item["description"]
+            wp_description_long_clean: str | None = self.clean_up_string(text=wp_description_long)
+            if wp_description_long_clean:
+                wp_description_long = wp_description_long_clean
         wp_description_excerpt: str | None = None
         if "excerpt" in wp_item and "rendered" in wp_item["excerpt"]:
             wp_description_excerpt: str = wp_item["excerpt"]["rendered"]
+            wp_description_excerpt_clean: str | None = self.clean_up_string(text=wp_description_excerpt)
+            if wp_description_excerpt_clean:
+                wp_description_excerpt = wp_description_excerpt_clean
         wp_keywords: list[str] | None = None
         if "keywords" in wp_item:
             wp_keywords: list[str] = wp_item["keywords"]
         wp_title: str | None = None
         if "title" in wp_item:
             wp_title: str = wp_item["title"]
+            if wp_title and isinstance(wp_title, str):
+                # titles will contain HTML entities (long dashes, "&" etc.)
+                wp_title = html.unescape(wp_title)
         wp_location_url: str | None = None
         if "location" in wp_item:
             wp_location_url: str = wp_item["location"]
@@ -239,10 +269,10 @@ def parse(self, response: Response, **kwargs: Any):
             lom_general_itemloader.add_value("title", wp_title)
         if wp_keywords:
             lom_general_itemloader.add_value("keyword", wp_keywords)
-        if wp_description_excerpt:
-            lom_general_itemloader.add_value("description", wp_description_excerpt)
-        elif wp_description_long:
+        if wp_description_long:
             lom_general_itemloader.add_value("description", wp_description_long)
+        elif wp_description_excerpt:
+            lom_general_itemloader.add_value("description", wp_description_excerpt)
         if og_image:
             # we assume that the OpenGraph locale attribute is correct and tells us the language of the learning object
             lom_general_itemloader.add_value("language", og_locale)