
Merge pull request #126 from openeduhub/develop
Merge recent changes into master (Planet-N / headless browser update / Poetry v2)
Criamos authored Jan 7, 2025
2 parents 7318bd9 + 2a88cf7 commit 62370f0
Showing 10 changed files with 484 additions and 401 deletions.
72 changes: 35 additions & 37 deletions converter/es_connector.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions converter/spiders/planet_n_spider.py
@@ -28,17 +28,23 @@
class PlanetNSpider(scrapy.Spider, LomBase):
name = "planet_n_spider"
friendlyName = "Planet-N"
version = "0.0.3"
version = "0.0.4"
+playwright_cookies: list[dict] = [
+    {
+        "name": "SimpleCookieControl",
+        "value": "deny"
+    }
+]
+# see: https://wordpress.com/plugins/simple-cookie-control
custom_settings = {
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_DEBUG": True,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
"WEB_TOOLS": WebEngine.Playwright,
"PLAYWRIGHT_COOKIES": playwright_cookies,
"PLAYWRIGHT_ADBLOCKER": True,
}

# ToDo: the Planet-N cookiebanner obstructs part of the website-screenshot
# -> reverse-engineer a method to hide the cookiebanner before the pipeline takes a screenshot

MODULE_SUBJECT_TO_DISCIPLINE_MAPPING = {
"bildende-kunst": "060", # Kunst
"biologie": "080", # Biologie
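The new PLAYWRIGHT_COOKIES setting pre-seeds the headless browser with a consent cookie so that the "SimpleCookieControl" banner never opens in the first place. As a rough sketch (not part of this commit), the same steps with Playwright's plain Python API would look as follows; the target URL and cookie domain are assumptions:

# hedged sketch, not repository code: pre-seeding a consent cookie in Playwright
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    context = browser.new_context()
    # cookies must be added to the context BEFORE the page loads,
    # otherwise the banner is rendered anyway
    context.add_cookies(
        [
            {
                "name": "SimpleCookieControl",
                "value": "deny",
                "domain": "www.planet-n.de",  # assumption: the crawler's target domain
                "path": "/",
            }
        ]
    )
    page = context.new_page()
    page.goto("https://www.planet-n.de/")  # assumption: spider start URL
    page.screenshot(path="planet_n.png")
    browser.close()
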
20 changes: 8 additions & 12 deletions converter/util/edu_sharing_precheck.py
@@ -1,8 +1,6 @@
-import logging

import requests

from converter import env
+from loguru import logger


class EduSharingPreCheck:
@@ -43,8 +41,6 @@ class EduSharingPreCheck:

payload = ""

-logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)

replication_source_id_list: list[str] = list()

def __init__(self):
@@ -59,20 +55,20 @@ def set_edu_sharing_url_from_dot_env(self):
"""
edu_sharing_url: str = env.get("EDU_SHARING_BASE_URL", True, None)
saved_search_node_id: str = env.get("EDU_SHARING_PRECHECK_SAVED_SEARCH_ID", True, None)
-logging.info(
+logger.info(
f"PreCheck utility warmup: Checking '.env'-file for EDU_SHARING_BASE_URL and "
f"EDU_SHARING_PRECHECK_SAVED_SEARCH_ID ..."
)
if edu_sharing_url and saved_search_node_id:
url_combined: str = f"{edu_sharing_url}{self.edu_sharing_rest_api_path}{saved_search_node_id}"
-logging.info(
+logger.info(
f"PreCheck utility: Recognized .env settings for CONTINUED crawl. Assembled URL string: "
f"{url_combined}"
)
self.edu_sharing_url = url_combined
self.saved_search_node_id = saved_search_node_id
else:
-logging.error(
+logger.error(
f"PreCheck utility: Could not retrieve valid .env settings for EDU_SHARING_BASE and "
f"EDU_SHARING_PRECHECK_SAVED_SEARCH_ID. Please make sure that both settings are valid if "
f"you want to COMPLETE/COMPLEMENT a previously aborted crawl."
@@ -95,7 +91,7 @@ def collect_replication_source_ids_from_nodes(self, response: requests.Response)
"""
json_response = response.json()
nodes: list[dict] = json_response["nodes"]
logging.info(f"Collecting 'ccm:replicationsourceid's from: {response.url}")
logger.info(f"Collecting 'ccm:replicationsourceid's from: {response.url}")
if nodes:
# as long as there are nodes, we haven't reached the final page of the API yet.
for node in nodes:
@@ -107,7 +103,7 @@ def collect_replication_source_ids_from_nodes(self, response: requests.Response)
self.replication_source_id_list.append(replication_source_id)
self.query_next_page()
else:
-logging.info(
+logger.info(
f"Reached the last API page: {response.url} // \nTotal amount of ids collected: {len(self.replication_source_id_list)}"
)

@@ -131,14 +127,14 @@ def try_to_retrieve_replication_source_id_list(self) -> list[str] | None:
"""
if self.replication_source_id_list:
-logging.info(
+logger.info(
f"PreCheck utility: Successfully collected {len(self.replication_source_id_list)} "
f"'ccm:replicationsourceid'-strings."
)
self.replication_source_id_list.sort()
return self.replication_source_id_list
else:
-logging.warning(
+logger.warning(
f"PreCheck utility: The list of 'ccm:replicationsourceid'-strings appears to be empty. "
f"This might happen if the API Pagination is interrupted by connection problems to the "
f"edu-sharing repo."
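Throughout this PR, stdlib logging calls (and the per-module logging.basicConfig setup) are replaced by loguru's ready-made singleton logger. A minimal sketch of the difference: the loguru calls below are the library's actual API, the format string is only an example.

import sys

from loguru import logger

logger.info("PreCheck utility warmup ...")  # usable immediately, no handler setup

# optional one-time configuration, done centrally instead of in every module:
logger.remove()  # drop loguru's default stderr sink
logger.add(sys.stderr, level="DEBUG", format="{time} {level}: {message}")
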
44 changes: 20 additions & 24 deletions converter/util/edu_sharing_source_template_helper.py
@@ -2,11 +2,8 @@
from pprint import pp

import requests

from converter import env

-logging.basicConfig(level=logging.DEBUG)
-log = logging.getLogger(__name__)
+from loguru import logger


class EduSharingSourceTemplateHelper:
@@ -76,7 +73,7 @@ def _initiate_from_dotenv(self):
# (e.g., crawling against production)
self._set_edu_sharing_url(edu_sharing_base_url_from_dot_env)
else:
-log.info(
+logger.info(
f"Could not read '.env'-Setting 'EDU_SHARING_BASE_URL'. Please check your '.env'-file! "
f"(For additional help, see: oeh-search-etl/converter/.env.example )."
)
@@ -112,7 +109,7 @@ def _build_payload(self) -> dict | None:
self._payload = payload
return payload
else:
-log.error(
+logger.error(
f"Cannot build query payload without valid crawler_name. Please make sure that you instantiate "
f"EduSharingTemplateHelper with a valid 'crawler_name'-parameter!"
)
@@ -135,15 +132,15 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
try:
result_dict = response.json()
except requests.exceptions.JSONDecodeError as jde:
log.error(f"The edu-sharing response could not be parsed as JSON. Response:\n" f"{response.text}")
logger.error(f"The edu-sharing response could not be parsed as JSON. Response:\n" f"{response.text}")
raise jde

try:
pagination: dict = result_dict["pagination"]
pagination_total = pagination["total"]
pagination_count = pagination["count"]
except KeyError:
-log.error(
+logger.error(
f"Missing 'pagination'-object in edu-sharing response. "
f"Aborting EduSharingSourceTemplateHelper process..."
)
@@ -155,19 +152,19 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
pass
else:
# unexpected API behavior -> abort here by returning None
-log.error(
+logger.error(
f"The edu-sharing API returned an unexpected number of crawler 'source template' results:\n"
f"Expected 'pagination.count': 1 (received: {pagination_count} ) // "
f"expected 'pagination.total': 1 (received: {pagination_total} )"
)
if pagination_count == 0 and pagination_total == 0:
-log.error(
+logger.error(
f"Please make sure that a 'source template' ('Quellen-Datensatz'-template) for crawler "
f"'{self._crawler_name}' exists within the specified edu-sharing repository "
f"{self._edu_sharing_base_url} !"
)
if pagination_count > 1 or pagination_total > 1:
-log.error(
+logger.error(
f"edu-sharing returned more than one 'crawler source template' for the specified "
f"crawler '{self._crawler_name}'. "
)
@@ -193,7 +190,7 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
# "ccm:oeh_languageLevel"
# ]
whitelist_keys: list[str] = nodes_properties[_oeh_cdi]
log.info(f"'{_oeh_cdi}' contains the following properties: \n" f"{whitelist_keys}")
logger.info(f"'{_oeh_cdi}' contains the following properties: \n" f"{whitelist_keys}")
if whitelist_keys and isinstance(whitelist_keys, list):
for whitelist_key in whitelist_keys:
# the values for each property need to be looked up separately
@@ -203,16 +200,16 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
if whitelisted_property_value:
_whitelisted_properties.update({f"{whitelist_key}": whitelisted_property_value})
else:
-log.error(
+logger.error(
f"Received unexpected value type of metadata property '{_oeh_cdi}': "
f"{type(whitelist_keys)} . (Expected type: 'list[str]')"
)
else:
-log.error(
+logger.error(
f"Could not find '{_oeh_cdi}' in edu-sharing API response. "
f"Source template retrieval FAILED!"
)
-log.debug(response.text)
+logger.debug(response.text)
except KeyError as ke:
raise ke

@@ -229,7 +226,7 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:

return _whitelisted_properties
else:
-log.error(
+logger.error(
f"edu-sharing API returned an unexpected 'nodes'-object:"
f"Expected list[dict] of length 1, received length: {len(nodes_list)} .\n"
f"Please make sure that a 'source template' ('Quellendatensatz'-template) for crawler "
@@ -238,21 +235,21 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
return None
else:
# sad-case: we catch unexpected HTTP responses here
-log.error(
+logger.error(
f"Received unexpected HTTP response (status code: {status_code} ) from the edu-sharing "
f"repository while trying to retrieve whitelisted 'source template'-metadata-properties."
)
if status_code == 401:
# ToDo: specify exact edu-sharing version that provides the necessary API endpoint
-log.error(
+logger.error(
f"edu-sharing API returned HTTP Status Code {status_code}. "
f"(This might happen when the necessary API endpoint might not be available (yet) in the "
f"edu-sharing repository (edu-sharing v8.1+ required).)"
)
if status_code == 500:
# code 500 might be accompanied by 'java.lang.NullPointerException' -> print whole response
# happens when the payload of our submitted request was empty
log.error(f"edu-sharing API returned HTTP status code {status_code}:\n" f"{response.text}")
logger.error(f"edu-sharing API returned HTTP status code {status_code}:\n" f"{response.text}")
response.raise_for_status()
# ToDo: extend Error-Handling for additional edge-cases (as / if they occur)
return None
@@ -267,7 +264,7 @@ def get_whitelisted_metadata_properties(self) -> dict | None:
# check user-defined .env Setting first if 'crawler source dataset' should be ignored:
est_enabled: bool = env.get_bool(key="EDU_SHARING_SOURCE_TEMPLATE_ENABLED", allow_null=True, default=None)
if est_enabled:
-log.info(
+logger.info(
f".env setting 'EDU_SHARING_SOURCE_TEMPLATE_ENABLED' is ACTIVE. Trying to retrieve whitelisted "
f"properties..."
)
@@ -286,29 +283,28 @@ def get_whitelisted_metadata_properties(self) -> dict | None:
"Aborting crawl process..."
)
else:
-log.error(
+logger.error(
f"Could not build payload object to retrieve 'source template'-properties from "
f"edu-sharing repository. "
f"\nJSON Payload for crawler_name '{self._crawler_name}' was:\n"
f"{self._payload}"
f"\n(payload REQUIRES a valid 'crawler_name'!)"
)
-log.info(
+logger.info(
"Aborting crawl... (If you didn't mean to retrieve an edu-sharing 'source template', please "
"set the .env variable 'EDU_SHARING_SOURCE_TEMPLATE_ENABLED' to False!)"
)
return None
else:
# if the setting is explicitly disabled, do nothing -> continue with normal crawler behaviour
-log.info(
+logger.info(
f"Recognized '.env'-Setting EDU_SHARING_SOURCE_TEMPLATE_ENABLED: '{est_enabled}'.\n"
f"Crawler source dataset will be IGNORED. Continuing with default crawler behaviour..."
)
return None


if __name__ == "__main__":
log.setLevel("DEBUG")
crawler_name_for_testing: str = "zum_deutschlernen_spider"
# crawler_name_for_testing: str = "does_not_exist_spider"
est_helper: EduSharingSourceTemplateHelper = EduSharingSourceTemplateHelper(crawler_name=crawler_name_for_testing)
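The helper treats exactly one matching node as the only valid outcome: pagination.count and pagination.total must both equal 1, and the whitelisted property names are then read from the single node's properties. A condensed sketch of that validation flow (hypothetical helper, not repository code; the response shape follows the fields referenced in the diff above):

import requests

def extract_source_template(response: requests.Response) -> dict | None:
    """Return the template node's properties, or None on any unexpected result."""
    try:
        result: dict = response.json()
    except requests.exceptions.JSONDecodeError:
        return None  # the repository logs the raw response text at this point
    pagination: dict = result.get("pagination", {})
    if pagination.get("count") == 1 and pagination.get("total") == 1:
        nodes: list[dict] = result["nodes"]
        return nodes[0]["properties"]  # whitelist keys live in the node properties
    return None
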
22 changes: 9 additions & 13 deletions converter/util/language_mapper.py
@@ -1,8 +1,8 @@
-import logging
import re

import babel
import langcodes
+from loguru import logger


class LanguageMapper:
@@ -11,10 +11,6 @@ class LanguageMapper:
def __init__(self, languages: list[str] = None):
self.languages = languages

-logging.basicConfig(
-    format="%(asctime)s\t%(levelname)s: %(message)s",
-    level=logging.DEBUG,
-)

@staticmethod
def _normalize_string_to_language_code(raw_string: str) -> str | None:
@@ -35,7 +31,7 @@ def _normalize_string_to_language_code(raw_string: str) -> str | None:
if "separator" in regex_result_dict:
separator: str = regex_result_dict["separator"]
else:
logging.debug(f"The raw string {raw_string} does not look like a typical Locale string.")
logger.debug(f"The raw string {raw_string} does not look like a typical Locale string.")

if regex_result and separator:
# this case happens when the raw string looks like "de_DE" or "en-US"
@@ -76,7 +72,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
if self.languages and isinstance(self.languages, str):
# since every step from here on expects a list of strings, typecasting to list[str] provides some minor
# Quality of Life
logging.debug(f"LanguageMapper was instantiated with a string, converting to Type list[str]...")
logger.debug(f"LanguageMapper was instantiated with a string, converting to Type list[str]...")
self.languages: list[str] = [self.languages]

if self.languages and isinstance(self.languages, list):
@@ -90,7 +86,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
language_item = language_item.strip()

if len(language_item) < 2:
-# logging.debug(
+# logger.debug(
# f"LanguageMapper detected an INVALID language string: '{language_item}' (string length is "
# f"too short to be valid. Dropping string...)"
# )
@@ -100,7 +96,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
if 2 <= len(language_item) <= 5 and len(language_item) != 4:
# this case covers the majority of pre-formatted language-codes, e.g.:
# "de", "EN", "de-DE", "de_DE", "en_US" or "sgn"
-# logging.debug(
+# logger.debug(
# f"LanguageMapper detected a potential 2-to-4-letter language code: '{language_item}'"
# )
normalized_str: str | None = self._normalize_string_to_language_code(language_item)
Expand All @@ -110,7 +106,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
edge_cases.add(language_item)
if len(language_item) == 4 or len(language_item) > 5:
# natural language edge-cases like "Deutsch", "german", "englisch" are handled here
-# logging.debug(
+# logger.debug(
# f"LanguageMapper detected a POTENTIALLY INVALID language string: '{language_item}'. "
# f"(String is too long to be a 2- or 4-letter-code). "
# f"Trying to match natural language string to language code..."
@@ -121,7 +117,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
# see: https://github.com/rspeer/langcodes/tree/master#recognizing-language-names-in-natural-language
if langcodes_result:
langcode_detected = langcodes_result.to_tag()
-# logging.debug(
+# logger.debug(
# f"Detected language code '{langcode_detected}' from string '{language_item}'."
# )
# ToDo - optional: maybe compare distance between 'langcodes' and 'babel' result?
@@ -135,7 +131,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
edge_cases.add(language_item)

if edge_cases:
-logging.info(
+logger.info(
f"LanguageMapper could NOT map the following edge-cases to a normalized language code: "
f"{list(edge_cases)}"
)
@@ -150,7 +146,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
# sad case: if not a single mapping was possible, our result set is empty
return None
else:
logging.warning(f"LanguageMapper expected list[str] but received unexpected type {type(self.languages)} ")
logger.warning(f"LanguageMapper expected list[str] but received unexpected type {type(self.languages)} ")
return None


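LanguageMapper distinguishes two input shapes: locale-like codes such as "de_DE" or "en-US", and natural-language names such as "Deutsch" or "german". Both underlying libraries are imported at the top of the file; a brief sketch of the two lookup paths (the library calls are real, the mapper's exact fallback order is simplified here):

import babel
import langcodes

# path 1: locale-like strings, e.g. "de_DE" or "en-US"
locale = babel.Locale.parse("de_DE", sep="_")
print(locale.language)  # -> "de"

# path 2: natural-language names, e.g. "german" or "Deutsch"
match = langcodes.find("german")  # recognizes language names, not just tags
print(match.to_tag())  # -> "de"
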
5 changes: 1 addition & 4 deletions converter/util/license_mapper.py
@@ -1,9 +1,7 @@
-import logging
import re

from converter.constants import Constants

-logger = logging.getLogger(__name__)
+from loguru import logger


class LicenseMapper:
@@ -18,7 +16,6 @@ class LicenseMapper:
Use this information to set 'internal' to 'CUSTOM' and save the string as a custom license description.
"""

-logging.basicConfig(level=logging.DEBUG)  # ToDo: remove me after debugging

cc_pattern = re.compile(
r"(?<=c{2}.)(?P<CC_TYPE>by(.[acdns]{2}){0,3})"
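The cc_pattern excerpt above extracts the Creative Commons license type ("by", "by-sa", "by-nc-nd", ...) that follows a "cc" marker; the full pattern continues past the cut-off shown here. A small demonstration of just the visible fragment (lowercasing the input is an assumption about how LicenseMapper feeds it):

import re

cc_pattern = re.compile(r"(?<=c{2}.)(?P<CC_TYPE>by(.[acdns]{2}){0,3})")

for raw in ["CC BY-NC-SA 4.0", "cc_by_nd", "cc by"]:
    match = cc_pattern.search(raw.lower())
    if match:
        print(raw, "->", match.group("CC_TYPE"))
# CC BY-NC-SA 4.0 -> by-nc-sa
# cc_by_nd -> by-nd
# cc by -> by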