Merge recent changes into master (Planet-N / headless browser update / Poetry v2) #126

Merged · 9 commits · Jan 7, 2025
72 changes: 35 additions & 37 deletions converter/es_connector.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions converter/spiders/planet_n_spider.py
@@ -28,17 +28,23 @@
class PlanetNSpider(scrapy.Spider, LomBase):
name = "planet_n_spider"
friendlyName = "Planet-N"
version = "0.0.3"
version = "0.0.4"
+    playwright_cookies: list[dict] = [
+        {
+            "name": "SimpleCookieControl",
+            "value": "deny"
+        }
+    ]
+    # see: https://wordpress.com/plugins/simple-cookie-control
custom_settings = {
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_DEBUG": True,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
"WEB_TOOLS": WebEngine.Playwright,
"PLAYWRIGHT_COOKIES": playwright_cookies,
"PLAYWRIGHT_ADBLOCKER": True,
}

-    # ToDo: the Planet-N cookiebanner obstructs part of the website-screenshot
-    # -> reverse-engineer a method to hide the cookiebanner before the pipeline takes a screenshot

MODULE_SUBJECT_TO_DISCIPLINE_MAPPING = {
"bildende-kunst": "060", # Kunst
"biologie": "080", # Biologie
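The new playwright_cookies attribute is handed to the headless browser through the "PLAYWRIGHT_COOKIES" custom setting, so the SimpleCookieControl=deny cookie is already set before the first page load and the consent banner never renders. A minimal standalone sketch of the same idea (the target URL and the way the crawler pipeline consumes the setting are assumptions, not taken from this diff):

from playwright.sync_api import sync_playwright

# Hedged sketch: inject the consent cookie into a fresh Playwright context
# before navigating, so the cookie banner never appears in screenshots.
# "https://www.planet-n.de" is a placeholder URL.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context()
    # same dict shape as the spider's playwright_cookies setting;
    # "url" tells Playwright which site the cookie belongs to
    context.add_cookies(
        [{"name": "SimpleCookieControl", "value": "deny", "url": "https://www.planet-n.de"}]
    )
    page = context.new_page()
    page.goto("https://www.planet-n.de")
    page.screenshot(path="planet_n_without_banner.png")
    browser.close()
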
20 changes: 8 additions & 12 deletions converter/util/edu_sharing_precheck.py
@@ -1,8 +1,6 @@
-import logging

import requests

from converter import env
+from loguru import logger


class EduSharingPreCheck:
@@ -43,8 +41,6 @@ class EduSharingPreCheck:

payload = ""

-    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)

replication_source_id_list: list[str] = list()

def __init__(self):
@@ -59,20 +55,20 @@ def set_edu_sharing_url_from_dot_env(self):
"""
edu_sharing_url: str = env.get("EDU_SHARING_BASE_URL", True, None)
saved_search_node_id: str = env.get("EDU_SHARING_PRECHECK_SAVED_SEARCH_ID", True, None)
-        logging.info(
+        logger.info(
f"PreCheck utility warmup: Checking '.env'-file for EDU_SHARING_BASE_URL and "
f"EDU_SHARING_PRECHECK_SAVED_SEARCH_ID ..."
)
if edu_sharing_url and saved_search_node_id:
url_combined: str = f"{edu_sharing_url}{self.edu_sharing_rest_api_path}{saved_search_node_id}"
-            logging.info(
+            logger.info(
f"PreCheck utility: Recognized .env settings for CONTINUED crawl. Assembled URL string: "
f"{url_combined}"
)
self.edu_sharing_url = url_combined
self.saved_search_node_id = saved_search_node_id
else:
-            logging.error(
+            logger.error(
f"PreCheck utility: Could not retrieve valid .env settings for EDU_SHARING_BASE and "
f"EDU_SHARING_PRECHECK_SAVED_SEARCH_ID. Please make sure that both settings are valid if "
f"you want to COMPLETE/COMPLEMENT a previously aborted crawl."
@@ -95,7 +91,7 @@ def collect_replication_source_ids_from_nodes(self, response: requests.Response)
"""
json_response = response.json()
nodes: list[dict] = json_response["nodes"]
logging.info(f"Collecting 'ccm:replicationsourceid's from: {response.url}")
logger.info(f"Collecting 'ccm:replicationsourceid's from: {response.url}")
if nodes:
# as long as there are nodes, we haven't reached the final page of the API yet.
for node in nodes:
@@ -107,7 +103,7 @@
self.replication_source_id_list.append(replication_source_id)
self.query_next_page()
else:
-            logging.info(
+            logger.info(
f"Reached the last API page: {response.url} // \nTotal amount of ids collected: {len(self.replication_source_id_list)}"
)

@@ -131,14 +127,14 @@ def try_to_retrieve_replication_source_id_list(self) -> list[str] | None:

"""
if self.replication_source_id_list:
-            logging.info(
+            logger.info(
f"PreCheck utility: Successfully collected {len(self.replication_source_id_list)} "
f"'ccm:replicationsourceid'-strings."
)
self.replication_source_id_list.sort()
return self.replication_source_id_list
else:
-            logging.warning(
+            logger.warning(
f"PreCheck utility: The list of 'ccm:replicationsourceid'-strings appears to be empty. "
f"This might happen if the API Pagination is interrupted by connection problems to the "
f"edu-sharing repo."
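The change above is the pattern that repeats through the rest of this PR: module-level logging.basicConfig(...) calls and getLogger handles are removed, and loguru's ready-made logger is imported instead. A short before/after sketch of the usage difference (the sink reconfiguration at the end is an assumption about typical loguru usage, not code from this repo):

import sys

from loguru import logger

# stdlib logging needs explicit configuration before messages show up, e.g.:
#   logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)
# loguru writes to a preconfigured stderr sink out of the box:
logger.info("PreCheck utility warmup ...")

# verbosity is tuned per sink instead of via basicConfig:
logger.remove()                       # drop the default stderr sink
logger.add(sys.stderr, level="INFO")  # re-add it with an explicit level
logger.debug("this message is now filtered out")
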
44 changes: 20 additions & 24 deletions converter/util/edu_sharing_source_template_helper.py
@@ -2,11 +2,8 @@
from pprint import pp

import requests

from converter import env

-logging.basicConfig(level=logging.DEBUG)
-log = logging.getLogger(__name__)
+from loguru import logger


class EduSharingSourceTemplateHelper:
@@ -76,7 +73,7 @@ def _initiate_from_dotenv(self):
# (e.g., crawling against production)
self._set_edu_sharing_url(edu_sharing_base_url_from_dot_env)
else:
-            log.info(
+            logger.info(
f"Could not read '.env'-Setting 'EDU_SHARING_BASE_URL'. Please check your '.env'-file! "
f"(For additional help, see: oeh-search-etl/converter/.env.example )."
)
@@ -112,7 +109,7 @@ def _build_payload(self) -> dict | None:
self._payload = payload
return payload
else:
-            log.error(
+            logger.error(
f"Cannot build query payload without valid crawler_name. Please make sure that you instantiate "
f"EduSharingTemplateHelper with a valid 'crawler_name'-parameter!"
)
@@ -135,15 +132,15 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
try:
result_dict = response.json()
except requests.exceptions.JSONDecodeError as jde:
log.error(f"The edu-sharing response could not be parsed as JSON. Response:\n" f"{response.text}")
logger.error(f"The edu-sharing response could not be parsed as JSON. Response:\n" f"{response.text}")
raise jde

try:
pagination: dict = result_dict["pagination"]
pagination_total = pagination["total"]
pagination_count = pagination["count"]
except KeyError:
-            log.error(
+            logger.error(
f"Missing 'pagination'-object in edu-sharing response. "
f"Aborting EduSharingSourceTemplateHelper process..."
)
@@ -155,19 +152,19 @@
pass
else:
# unexpected API behavior -> abort here by returning None
-                log.error(
+                logger.error(
f"The edu-sharing API returned an unexpected number of crawler 'source template' results:\n"
f"Expected 'pagination.count': 1 (received: {pagination_count} ) // "
f"expected 'pagination.total': 1 (received: {pagination_total} )"
)
if pagination_count == 0 and pagination_total == 0:
-                    log.error(
+                    logger.error(
f"Please make sure that a 'source template' ('Quellen-Datensatz'-template) for crawler "
f"'{self._crawler_name}' exists within the specified edu-sharing repository "
f"{self._edu_sharing_base_url} !"
)
if pagination_count > 1 or pagination_total > 1:
-                    log.error(
+                    logger.error(
f"edu-sharing returned more than one 'crawler source template' for the specified "
f"crawler '{self._crawler_name}'. "
)
@@ -193,7 +190,7 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
# "ccm:oeh_languageLevel"
# ]
whitelist_keys: list[str] = nodes_properties[_oeh_cdi]
log.info(f"'{_oeh_cdi}' contains the following properties: \n" f"{whitelist_keys}")
logger.info(f"'{_oeh_cdi}' contains the following properties: \n" f"{whitelist_keys}")
if whitelist_keys and isinstance(whitelist_keys, list):
for whitelist_key in whitelist_keys:
# the values for each property need to be looked up separately
@@ -203,16 +200,16 @@
if whitelisted_property_value:
_whitelisted_properties.update({f"{whitelist_key}": whitelisted_property_value})
else:
-                    log.error(
+                    logger.error(
f"Received unexpected value type of metadata property '{_oeh_cdi}': "
f"{type(whitelist_keys)} . (Expected type: 'list[str]')"
)
else:
-                log.error(
+                logger.error(
f"Could not find '{_oeh_cdi}' in edu-sharing API response. "
f"Source template retrieval FAILED!"
)
-                log.debug(response.text)
+                logger.debug(response.text)
except KeyError as ke:
raise ke

@@ -229,7 +226,7 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:

return _whitelisted_properties
else:
-                log.error(
+                logger.error(
f"edu-sharing API returned an unexpected 'nodes'-object:"
f"Expected list[dict] of length 1, received length: {len(nodes_list)} .\n"
f"Please make sure that a 'source template' ('Quellendatensatz'-template) for crawler "
@@ -238,21 +235,21 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
return None
else:
# sad-case: we catch unexpected HTTP responses here
-            log.error(
+            logger.error(
f"Received unexpected HTTP response (status code: {status_code} ) from the edu-sharing "
f"repository while trying to retrieve whitelisted 'source template'-metadata-properties."
)
if status_code == 401:
# ToDo: specify exact edu-sharing version that provides the necessary API endpoint
-                log.error(
+                logger.error(
f"edu-sharing API returned HTTP Status Code {status_code}. "
f"(This might happen when the necessary API endpoint might not be available (yet) in the "
f"edu-sharing repository (edu-sharing v8.1+ required).)"
)
if status_code == 500:
# code 500 might be accompanied by 'java.lang.NullPointerException' -> print whole response
# happens when the payload of our submitted request was empty
log.error(f"edu-sharing API returned HTTP status code {status_code}:\n" f"{response.text}")
logger.error(f"edu-sharing API returned HTTP status code {status_code}:\n" f"{response.text}")
response.raise_for_status()
# ToDo: extend Error-Handling for additional edge-cases (as / if they occur)
return None
@@ -267,7 +264,7 @@ def get_whitelisted_metadata_properties(self) -> dict | None:
# check user-defined .env Setting first if 'crawler source dataset' should be ignored:
est_enabled: bool = env.get_bool(key="EDU_SHARING_SOURCE_TEMPLATE_ENABLED", allow_null=True, default=None)
if est_enabled:
-            log.info(
+            logger.info(
f".env setting 'EDU_SHARING_SOURCE_TEMPLATE_ENABLED' is ACTIVE. Trying to retrieve whitelisted "
f"properties..."
)
@@ -286,29 +283,28 @@ def get_whitelisted_metadata_properties(self) -> dict | None:
"Aborting crawl process..."
)
else:
-                log.error(
+                logger.error(
f"Could not build payload object to retrieve 'source template'-properties from "
f"edu-sharing repository. "
f"\nJSON Payload for crawler_name '{self._crawler_name}' was:\n"
f"{self._payload}"
f"\n(payload REQUIRES a valid 'crawler_name'!)"
)
-                log.info(
+                logger.info(
"Aborting crawl... (If you didn't mean to retrieve an edu-sharing 'source template', please "
"set the .env variable 'EDU_SHARING_SOURCE_TEMPLATE_ENABLED' to False!)"
)
return None
else:
# if the setting is explicitly disabled, do nothing -> continue with normal crawler behaviour
-            log.info(
+            logger.info(
f"Recognized '.env'-Setting EDU_SHARING_SOURCE_TEMPLATE_ENABLED: '{est_enabled}'.\n"
f"Crawler source dataset will be IGNORED. Continuing with default crawler behaviour..."
)
return None


if __name__ == "__main__":
log.setLevel("DEBUG")
crawler_name_for_testing: str = "zum_deutschlernen_spider"
# crawler_name_for_testing: str = "does_not_exist_spider"
est_helper: EduSharingSourceTemplateHelper = EduSharingSourceTemplateHelper(crawler_name=crawler_name_for_testing)
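The helper above treats anything other than exactly one "source template" node as a failure: pagination.count and pagination.total must both equal 1, the response must parse as JSON, and the status codes 401 and 500 get dedicated error messages. A condensed sketch of that validation flow (endpoint URL, payload, and function name are illustrative, not the repo's actual API wrapper):

import requests
from loguru import logger

def fetch_single_source_template(url: str, payload: dict) -> dict | None:
    # hypothetical request; mirrors the checks visible in the diff above
    response = requests.post(url, json=payload, timeout=30)
    if response.status_code != 200:
        logger.error(f"Unexpected HTTP status {response.status_code} from the edu-sharing repository.")
        response.raise_for_status()  # raises for 4xx/5xx
        return None
    try:
        result = response.json()
    except requests.exceptions.JSONDecodeError:
        logger.error(f"Response could not be parsed as JSON:\n{response.text}")
        raise
    pagination: dict = result.get("pagination", {})
    if pagination.get("count") == 1 and pagination.get("total") == 1:
        return result["nodes"][0]  # happy case: exactly one source template
    logger.error("Expected exactly one 'source template' node. Aborting.")
    return None
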
22 changes: 9 additions & 13 deletions converter/util/language_mapper.py
@@ -1,8 +1,8 @@
-import logging
import re

import babel
import langcodes
+from loguru import logger


class LanguageMapper:
@@ -11,10 +11,6 @@ class LanguageMapper:
def __init__(self, languages: list[str] = None):
self.languages = languages

-        logging.basicConfig(
-            format="%(asctime)s\t%(levelname)s: %(message)s",
-            level=logging.DEBUG,
-        )

@staticmethod
def _normalize_string_to_language_code(raw_string: str) -> str | None:
@@ -35,7 +31,7 @@ def _normalize_string_to_language_code(raw_string: str) -> str | None:
if "separator" in regex_result_dict:
separator: str = regex_result_dict["separator"]
else:
logging.debug(f"The raw string {raw_string} does not look like a typical Locale string.")
logger.debug(f"The raw string {raw_string} does not look like a typical Locale string.")

if regex_result and separator:
# this case happens when the raw string looks like "de_DE" or "en-US"
@@ -76,7 +72,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
if self.languages and isinstance(self.languages, str):
# since every step from here on expects a list of strings, typecasting to list[str] provides some minor
# Quality of Life
logging.debug(f"LanguageMapper was instantiated with a string, converting to Type list[str]...")
logger.debug(f"LanguageMapper was instantiated with a string, converting to Type list[str]...")
self.languages: list[str] = [self.languages]

if self.languages and isinstance(self.languages, list):
@@ -90,7 +86,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
language_item = language_item.strip()

if len(language_item) < 2:
-                    # logging.debug(
+                    # logger.debug(
# f"LanguageMapper detected an INVALID language string: '{language_item}' (string length is "
# f"too short to be valid. Dropping string...)"
# )
@@ -100,7 +96,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
if 2 <= len(language_item) <= 5 and len(language_item) != 4:
# this case covers the majority of pre-formatted language-codes, e.g.:
# "de", "EN", "de-DE", "de_DE", "en_US" or "sgn"
-                    # logging.debug(
+                    # logger.debug(
# f"LanguageMapper detected a potential 2-to-4-letter language code: '{language_item}'"
# )
normalized_str: str | None = self._normalize_string_to_language_code(language_item)
@@ -110,7 +106,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
edge_cases.add(language_item)
if len(language_item) == 4 or len(language_item) > 5:
# natural language edge-cases like "Deutsch", "german", "englisch" are handled here
-                    # logging.debug(
+                    # logger.debug(
# f"LanguageMapper detected a POTENTIALLY INVALID language string: '{language_item}'. "
# f"(String is too long to be a 2- or 4-letter-code). "
# f"Trying to match natural language string to language code..."
@@ -121,7 +117,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
# see: https://github.com/rspeer/langcodes/tree/master#recognizing-language-names-in-natural-language
if langcodes_result:
langcode_detected = langcodes_result.to_tag()
-                        # logging.debug(
+                        # logger.debug(
# f"Detected language code '{langcode_detected}' from string '{language_item}'."
# )
# ToDo - optional: maybe compare distance between 'langcodes' and 'babel' result?
@@ -135,7 +131,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
edge_cases.add(language_item)

if edge_cases:
-            logging.info(
+            logger.info(
f"LanguageMapper could NOT map the following edge-cases to a normalized language code: "
f"{list(edge_cases)}"
)
@@ -150,7 +146,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
# sad case: if not a single mapping was possible, our result set is empty
return None
else:
logging.warning(f"LanguageMapper expected list[str] but received unexpected type {type(self.languages)} ")
logger.warning(f"LanguageMapper expected list[str] but received unexpected type {type(self.languages)} ")
return None


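LanguageMapper combines two lookup strategies: locale-style strings such as "de_DE" or "en-US" are parsed directly, while natural-language names such as "Deutsch" or "german" fall through to a name lookup. A simplified sketch of that fallback chain (without the class's regex pre-parsing and deduplication; the function name is illustrative):

import babel
import langcodes

def to_language_code(raw: str) -> str | None:
    """Two-step lookup sketch: babel for locale strings, langcodes for names."""
    raw = raw.strip()
    try:
        # handles "de", "de_DE", "en-US", ...
        return babel.Locale.parse(raw, sep="-" if "-" in raw else "_").language
    except (ValueError, babel.UnknownLocaleError):
        pass
    try:
        # handles natural-language names such as "german" or "Deutsch"
        return langcodes.find(raw).to_tag()
    except LookupError:
        return None

print(to_language_code("de_DE"))    # -> "de"
print(to_language_code("Deutsch"))  # -> "de"
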
5 changes: 1 addition & 4 deletions converter/util/license_mapper.py
@@ -1,9 +1,7 @@
-import logging
import re

from converter.constants import Constants

-logger = logging.getLogger(__name__)
+from loguru import logger


class LicenseMapper:
@@ -18,7 +16,6 @@ class LicenseMapper:
Use this information to set 'internal' to 'CUSTOM' and save the string as a custom license description.
"""

-    logging.basicConfig(level=logging.DEBUG)  # ToDo: remove me after debugging

cc_pattern = re.compile(
r"(?<=c{2}.)(?P<CC_TYPE>by(.[acdns]{2}){0,3})"
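The cc_pattern regex (truncated in this view) pulls the Creative-Commons license type out of free-form license strings. An illustrative, simplified pattern in the same spirit (this is not the mapper's actual regex):

import re

# toy pattern: CC type ("by", "by-sa", "by-nc-nd", ...) plus an optional version
CC_SKETCH = re.compile(
    r"cc[\s_-]?(?P<type>by(?:[\s_-](?:nc|nd|sa)){0,2})[\s_-]?(?P<version>\d\.\d)?",
    re.IGNORECASE,
)

for raw in ("CC BY-SA 4.0", "cc_by_nc_nd", "Creative Commons CC BY 3.0"):
    match = CC_SKETCH.search(raw)
    if match:
        print(match.group("type"), match.group("version"))
# prints: BY-SA 4.0 / by_nc_nd None / BY 3.0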