From 3e31cb1168a0aad50ee9ba3850db240f47fca201 Mon Sep 17 00:00:00 2001 From: criamos <981166+Criamos@users.noreply.github.com> Date: Fri, 25 Oct 2024 18:36:33 +0200 Subject: [PATCH] planet_n_spider v0.0.2 - feat: clean up strings with HTML entities for Planet-N's "description", "excerpt" and "title" values - Planet-N provides strings via their API that have HTML entities and trailing newlines - fix: crawler init - change: "LOM general description" prefers the longer description texts (if available) instead of the (mostly one-sentence) excerpts --- converter/spiders/planet_n_spider.py | 38 +++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/converter/spiders/planet_n_spider.py b/converter/spiders/planet_n_spider.py index 4299093d..699ee249 100644 --- a/converter/spiders/planet_n_spider.py +++ b/converter/spiders/planet_n_spider.py @@ -1,7 +1,9 @@ +import html import re from typing import Any, Iterable import scrapy +from bs4 import BeautifulSoup from scrapy import Request from scrapy.http import Response @@ -26,7 +28,7 @@ class PlanetNSpider(scrapy.Spider, LomBase): name = "planet_n_spider" friendlyName = "Planet N" - version = "0.0.1" + version = "0.0.2" custom_settings = { "AUTOTHROTTLE_ENABLED": True, "AUTOTHROTTLE_DEBUG": True, @@ -34,6 +36,9 @@ class PlanetNSpider(scrapy.Spider, LomBase): "WEB_TOOLS": WebEngine.Playwright, } + def __init__(self, **kwargs): + LomBase.__init__(self, **kwargs) + def start_requests(self) -> Iterable[Request]: _api_first_page: str = self.build_wp_json_module_list_request_url() yield scrapy.Request( @@ -171,6 +176,21 @@ def check_if_item_should_be_dropped(self, response: scrapy.http.Response, wp_ite _drop_item_flag = True return _drop_item_flag + def clean_up_string(self, text: str) -> str | None: + # ToDo: DocString + _text_clean: str | None = None + if text and isinstance(text, str): + # strings from Planet-N will contain HTML entities + _soup = BeautifulSoup(text, "html.parser") + _text_soup: str = _soup.get_text() + if _text_soup: + # Planet-N's strings contain trailing whitespaces or newlines, which we have to strip first: + _text_clean = _text_soup.strip() + return _text_clean + else: + self.logger.warning(f"Received unhandled input: provided 'text'-parameter was of type '{type(text)}'") + return None + def parse(self, response: Response, **kwargs: Any): try: # the JSON response from Planet-N's WordPress API: @@ -197,16 +217,26 @@ def parse(self, response: Response, **kwargs: Any): wp_date_modified: str = wp_item["modified"] wp_description_long: str | None = None if "description" in wp_item: + # description text will contain HTML entities wp_description_long: str = wp_item["description"] + wp_description_long_clean: str | None = self.clean_up_string(text=wp_description_long) + if wp_description_long_clean: + wp_description_long = wp_description_long_clean wp_description_excerpt: str | None = None if "excerpt" in wp_item and "rendered" in wp_item["excerpt"]: wp_description_excerpt: str = wp_item["excerpt"]["rendered"] + wp_description_excerpt_clean: str | None = self.clean_up_string(text=wp_description_excerpt) + if wp_description_excerpt_clean: + wp_description_excerpt = wp_description_excerpt_clean wp_keywords: list[str] | None = None if "keywords" in wp_item: wp_keywords: list[str] = wp_item["keywords"] wp_title: str | None = None if "title" in wp_item: wp_title: str = wp_item["title"] + if wp_title and isinstance(wp_title, str): + # titles will contain HTML entities (long dashes, "&" etc.) + wp_title = html.unescape(wp_title) wp_location_url: str | None = None if "location" in wp_item: wp_location_url: str = wp_item["location"] @@ -239,10 +269,10 @@ def parse(self, response: Response, **kwargs: Any): lom_general_itemloader.add_value("title", wp_title) if wp_keywords: lom_general_itemloader.add_value("keyword", wp_keywords) - if wp_description_excerpt: - lom_general_itemloader.add_value("description", wp_description_excerpt) - elif wp_description_long: + if wp_description_long: lom_general_itemloader.add_value("description", wp_description_long) + elif wp_description_excerpt: + lom_general_itemloader.add_value("description", wp_description_excerpt) if og_image: # we assume that the OpenGraph locale attribute is correct and tells us the language of the learning object lom_general_itemloader.add_value("language", og_locale)