Merge recent changes into master (Planet-N / headless browser update / Poetry v2) #126

Merged · 9 commits · Jan 7, 2025
72 changes: 35 additions & 37 deletions converter/es_connector.py

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions converter/spiders/planet_n_spider.py
@@ -28,17 +28,23 @@
class PlanetNSpider(scrapy.Spider, LomBase):
name = "planet_n_spider"
friendlyName = "Planet-N"
version = "0.0.3"
version = "0.0.4"
+    playwright_cookies: list[dict] = [
+        {
+            "name": "SimpleCookieControl",
+            "value": "deny"
+        }
+    ]
+    # see: https://wordpress.com/plugins/simple-cookie-control
custom_settings = {
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_DEBUG": True,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
"WEB_TOOLS": WebEngine.Playwright,
"PLAYWRIGHT_COOKIES": playwright_cookies,
"PLAYWRIGHT_ADBLOCKER": True,
}

-    # ToDo: the Planet-N cookiebanner obstructs part of the website-screenshot
-    # -> reverse-engineer a method to hide the cookiebanner before the pipeline takes a screenshot

MODULE_SUBJECT_TO_DISCIPLINE_MAPPING = {
"bildende-kunst": "060", # Kunst
"biologie": "080", # Biologie
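The new playwright_cookies attribute is handed to the headless browser through the "PLAYWRIGHT_COOKIES" custom setting, so the SimpleCookieControl=deny cookie is already set before the first page load and the consent banner never renders. A minimal standalone sketch of the same idea (the target URL and the way the crawler pipeline consumes the setting are assumptions, not taken from this diff):

from playwright.sync_api import sync_playwright

# Hedged sketch: inject the consent cookie into a fresh Playwright context
# before navigating, so the cookie banner never appears in screenshots.
# "https://www.planet-n.de" is a placeholder URL.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context()
    # same dict shape as the spider's playwright_cookies setting;
    # "url" tells Playwright which site the cookie belongs to
    context.add_cookies(
        [{"name": "SimpleCookieControl", "value": "deny", "url": "https://www.planet-n.de"}]
    )
    page = context.new_page()
    page.goto("https://www.planet-n.de")
    page.screenshot(path="planet_n_without_banner.png")
    browser.close()
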
20 changes: 8 additions & 12 deletions converter/util/edu_sharing_precheck.py
@@ -1,8 +1,6 @@
-import logging

import requests

from converter import env
+from loguru import logger


class EduSharingPreCheck:
@@ -43,8 +41,6 @@ class EduSharingPreCheck:

payload = ""

-    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)

replication_source_id_list: list[str] = list()

def __init__(self):
@@ -59,20 +55,20 @@ def set_edu_sharing_url_from_dot_env(self):
"""
edu_sharing_url: str = env.get("EDU_SHARING_BASE_URL", True, None)
saved_search_node_id: str = env.get("EDU_SHARING_PRECHECK_SAVED_SEARCH_ID", True, None)
-        logging.info(
+        logger.info(
f"PreCheck utility warmup: Checking '.env'-file for EDU_SHARING_BASE_URL and "
f"EDU_SHARING_PRECHECK_SAVED_SEARCH_ID ..."
)
if edu_sharing_url and saved_search_node_id:
url_combined: str = f"{edu_sharing_url}{self.edu_sharing_rest_api_path}{saved_search_node_id}"
-            logging.info(
+            logger.info(
f"PreCheck utility: Recognized .env settings for CONTINUED crawl. Assembled URL string: "
f"{url_combined}"
)
self.edu_sharing_url = url_combined
self.saved_search_node_id = saved_search_node_id
else:
-            logging.error(
+            logger.error(
f"PreCheck utility: Could not retrieve valid .env settings for EDU_SHARING_BASE and "
f"EDU_SHARING_PRECHECK_SAVED_SEARCH_ID. Please make sure that both settings are valid if "
f"you want to COMPLETE/COMPLEMENT a previously aborted crawl."
@@ -95,7 +91,7 @@ def collect_replication_source_ids_from_nodes(self, response: requests.Response)
"""
json_response = response.json()
nodes: list[dict] = json_response["nodes"]
logging.info(f"Collecting 'ccm:replicationsourceid's from: {response.url}")
logger.info(f"Collecting 'ccm:replicationsourceid's from: {response.url}")
if nodes:
# as long as there are nodes, we haven't reached the final page of the API yet.
for node in nodes:
@@ -107,7 +103,7 @@
self.replication_source_id_list.append(replication_source_id)
self.query_next_page()
else:
-            logging.info(
+            logger.info(
f"Reached the last API page: {response.url} // \nTotal amount of ids collected: {len(self.replication_source_id_list)}"
)

@@ -131,14 +127,14 @@ def try_to_retrieve_replication_source_id_list(self) -> list[str] | None:

"""
if self.replication_source_id_list:
-            logging.info(
+            logger.info(
f"PreCheck utility: Successfully collected {len(self.replication_source_id_list)} "
f"'ccm:replicationsourceid'-strings."
)
self.replication_source_id_list.sort()
return self.replication_source_id_list
else:
-            logging.warning(
+            logger.warning(
f"PreCheck utility: The list of 'ccm:replicationsourceid'-strings appears to be empty. "
f"This might happen if the API Pagination is interrupted by connection problems to the "
f"edu-sharing repo."
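The change above is the pattern that repeats through the rest of this PR: module-level logging.basicConfig(...) calls and getLogger handles are removed, and loguru's ready-made logger is imported instead. A short before/after sketch of the usage difference (the sink reconfiguration at the end is an assumption about typical loguru usage, not code from this repo):

import sys

from loguru import logger

# stdlib logging needs explicit configuration before messages show up, e.g.:
#   logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)
# loguru writes to a preconfigured stderr sink out of the box:
logger.info("PreCheck utility warmup ...")

# verbosity is tuned per sink instead of via basicConfig:
logger.remove()                       # drop the default stderr sink
logger.add(sys.stderr, level="INFO")  # re-add it with an explicit level
logger.debug("this message is now filtered out")
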
44 changes: 20 additions & 24 deletions converter/util/edu_sharing_source_template_helper.py
@@ -2,11 +2,8 @@
from pprint import pp

import requests

from converter import env

-logging.basicConfig(level=logging.DEBUG)
-log = logging.getLogger(__name__)
+from loguru import logger


class EduSharingSourceTemplateHelper:
@@ -76,7 +73,7 @@ def _initiate_from_dotenv(self):
# (e.g., crawling against production)
self._set_edu_sharing_url(edu_sharing_base_url_from_dot_env)
else:
-            log.info(
+            logger.info(
f"Could not read '.env'-Setting 'EDU_SHARING_BASE_URL'. Please check your '.env'-file! "
f"(For additional help, see: oeh-search-etl/converter/.env.example )."
)
@@ -112,7 +109,7 @@ def _build_payload(self) -> dict | None:
self._payload = payload
return payload
else:
-            log.error(
+            logger.error(
f"Cannot build query payload without valid crawler_name. Please make sure that you instantiate "
f"EduSharingTemplateHelper with a valid 'crawler_name'-parameter!"
)
@@ -135,15 +132,15 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
try:
result_dict = response.json()
except requests.exceptions.JSONDecodeError as jde:
log.error(f"The edu-sharing response could not be parsed as JSON. Response:\n" f"{response.text}")
logger.error(f"The edu-sharing response could not be parsed as JSON. Response:\n" f"{response.text}")
raise jde

try:
pagination: dict = result_dict["pagination"]
pagination_total = pagination["total"]
pagination_count = pagination["count"]
except KeyError:
-            log.error(
+            logger.error(
f"Missing 'pagination'-object in edu-sharing response. "
f"Aborting EduSharingSourceTemplateHelper process..."
)
@@ -155,19 +152,19 @@
pass
else:
# unexpected API behavior -> abort here by returning None
-                log.error(
+                logger.error(
f"The edu-sharing API returned an unexpected number of crawler 'source template' results:\n"
f"Expected 'pagination.count': 1 (received: {pagination_count} ) // "
f"expected 'pagination.total': 1 (received: {pagination_total} )"
)
if pagination_count == 0 and pagination_total == 0:
-                    log.error(
+                    logger.error(
f"Please make sure that a 'source template' ('Quellen-Datensatz'-template) for crawler "
f"'{self._crawler_name}' exists within the specified edu-sharing repository "
f"{self._edu_sharing_base_url} !"
)
if pagination_count > 1 or pagination_total > 1:
-                    log.error(
+                    logger.error(
f"edu-sharing returned more than one 'crawler source template' for the specified "
f"crawler '{self._crawler_name}'. "
)
@@ -193,7 +190,7 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
# "ccm:oeh_languageLevel"
# ]
whitelist_keys: list[str] = nodes_properties[_oeh_cdi]
log.info(f"'{_oeh_cdi}' contains the following properties: \n" f"{whitelist_keys}")
logger.info(f"'{_oeh_cdi}' contains the following properties: \n" f"{whitelist_keys}")
if whitelist_keys and isinstance(whitelist_keys, list):
for whitelist_key in whitelist_keys:
# the values for each property need to be looked up separately
@@ -203,16 +200,16 @@
if whitelisted_property_value:
_whitelisted_properties.update({f"{whitelist_key}": whitelisted_property_value})
else:
-                    log.error(
+                    logger.error(
f"Received unexpected value type of metadata property '{_oeh_cdi}': "
f"{type(whitelist_keys)} . (Expected type: 'list[str]')"
)
else:
-                log.error(
+                logger.error(
f"Could not find '{_oeh_cdi}' in edu-sharing API response. "
f"Source template retrieval FAILED!"
)
-                log.debug(response.text)
+                logger.debug(response.text)
except KeyError as ke:
raise ke

@@ -229,7 +226,7 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:

return _whitelisted_properties
else:
-                log.error(
+                logger.error(
f"edu-sharing API returned an unexpected 'nodes'-object:"
f"Expected list[dict] of length 1, received length: {len(nodes_list)} .\n"
f"Please make sure that a 'source template' ('Quellendatensatz'-template) for crawler "
@@ -238,21 +235,21 @@ def _retrieve_whitelisted_metadata_properties(self) -> dict | None:
return None
else:
# sad-case: we catch unexpected HTTP responses here
-            log.error(
+            logger.error(
f"Received unexpected HTTP response (status code: {status_code} ) from the edu-sharing "
f"repository while trying to retrieve whitelisted 'source template'-metadata-properties."
)
if status_code == 401:
# ToDo: specify exact edu-sharing version that provides the necessary API endpoint
-                log.error(
+                logger.error(
f"edu-sharing API returned HTTP Status Code {status_code}. "
f"(This might happen when the necessary API endpoint might not be available (yet) in the "
f"edu-sharing repository (edu-sharing v8.1+ required).)"
)
if status_code == 500:
# code 500 might be accompanied by 'java.lang.NullPointerException' -> print whole response
# happens when the payload of our submitted request was empty
log.error(f"edu-sharing API returned HTTP status code {status_code}:\n" f"{response.text}")
logger.error(f"edu-sharing API returned HTTP status code {status_code}:\n" f"{response.text}")
response.raise_for_status()
# ToDo: extend Error-Handling for additional edge-cases (as / if they occur)
return None
@@ -267,7 +264,7 @@ def get_whitelisted_metadata_properties(self) -> dict | None:
# check user-defined .env Setting first if 'crawler source dataset' should be ignored:
est_enabled: bool = env.get_bool(key="EDU_SHARING_SOURCE_TEMPLATE_ENABLED", allow_null=True, default=None)
if est_enabled:
-            log.info(
+            logger.info(
f".env setting 'EDU_SHARING_SOURCE_TEMPLATE_ENABLED' is ACTIVE. Trying to retrieve whitelisted "
f"properties..."
)
@@ -286,29 +283,28 @@ def get_whitelisted_metadata_properties(self) -> dict | None:
"Aborting crawl process..."
)
else:
-                log.error(
+                logger.error(
f"Could not build payload object to retrieve 'source template'-properties from "
f"edu-sharing repository. "
f"\nJSON Payload for crawler_name '{self._crawler_name}' was:\n"
f"{self._payload}"
f"\n(payload REQUIRES a valid 'crawler_name'!)"
)
-                log.info(
+                logger.info(
"Aborting crawl... (If you didn't mean to retrieve an edu-sharing 'source template', please "
"set the .env variable 'EDU_SHARING_SOURCE_TEMPLATE_ENABLED' to False!)"
)
return None
else:
# if the setting is explicitly disabled, do nothing -> continue with normal crawler behaviour
-            log.info(
+            logger.info(
f"Recognized '.env'-Setting EDU_SHARING_SOURCE_TEMPLATE_ENABLED: '{est_enabled}'.\n"
f"Crawler source dataset will be IGNORED. Continuing with default crawler behaviour..."
)
return None


if __name__ == "__main__":
log.setLevel("DEBUG")
crawler_name_for_testing: str = "zum_deutschlernen_spider"
# crawler_name_for_testing: str = "does_not_exist_spider"
est_helper: EduSharingSourceTemplateHelper = EduSharingSourceTemplateHelper(crawler_name=crawler_name_for_testing)
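The helper above treats anything other than exactly one "source template" node as a failure: pagination.count and pagination.total must both equal 1, the response must parse as JSON, and the status codes 401 and 500 get dedicated error messages. A condensed sketch of that validation flow (endpoint URL, payload, and function name are illustrative, not the repo's actual API wrapper):

import requests
from loguru import logger

def fetch_single_source_template(url: str, payload: dict) -> dict | None:
    # hypothetical request; mirrors the checks visible in the diff above
    response = requests.post(url, json=payload, timeout=30)
    if response.status_code != 200:
        logger.error(f"Unexpected HTTP status {response.status_code} from the edu-sharing repository.")
        response.raise_for_status()  # raises for 4xx/5xx
        return None
    try:
        result = response.json()
    except requests.exceptions.JSONDecodeError:
        logger.error(f"Response could not be parsed as JSON:\n{response.text}")
        raise
    pagination: dict = result.get("pagination", {})
    if pagination.get("count") == 1 and pagination.get("total") == 1:
        return result["nodes"][0]  # happy case: exactly one source template
    logger.error("Expected exactly one 'source template' node. Aborting.")
    return None
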
22 changes: 9 additions & 13 deletions converter/util/language_mapper.py
@@ -1,8 +1,8 @@
-import logging
import re

import babel
import langcodes
+from loguru import logger


class LanguageMapper:
@@ -11,10 +11,6 @@ class LanguageMapper:
def __init__(self, languages: list[str] = None):
self.languages = languages

-        logging.basicConfig(
-            format="%(asctime)s\t%(levelname)s: %(message)s",
-            level=logging.DEBUG,
-        )

@staticmethod
def _normalize_string_to_language_code(raw_string: str) -> str | None:
@@ -35,7 +31,7 @@ def _normalize_string_to_language_code(raw_string: str) -> str | None:
if "separator" in regex_result_dict:
separator: str = regex_result_dict["separator"]
else:
logging.debug(f"The raw string {raw_string} does not look like a typical Locale string.")
logger.debug(f"The raw string {raw_string} does not look like a typical Locale string.")

if regex_result and separator:
# this case happens when the raw string looks like "de_DE" or "en-US"
@@ -76,7 +72,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
if self.languages and isinstance(self.languages, str):
# since every step from here on expects a list of strings, typecasting to list[str] provides some minor
# Quality of Life
logging.debug(f"LanguageMapper was instantiated with a string, converting to Type list[str]...")
logger.debug(f"LanguageMapper was instantiated with a string, converting to Type list[str]...")
self.languages: list[str] = [self.languages]

if self.languages and isinstance(self.languages, list):
@@ -90,7 +86,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
language_item = language_item.strip()

if len(language_item) < 2:
-                    # logging.debug(
+                    # logger.debug(
# f"LanguageMapper detected an INVALID language string: '{language_item}' (string length is "
# f"too short to be valid. Dropping string...)"
# )
@@ -100,7 +96,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
if 2 <= len(language_item) <= 5 and len(language_item) != 4:
# this case covers the majority of pre-formatted language-codes, e.g.:
# "de", "EN", "de-DE", "de_DE", "en_US" or "sgn"
-                    # logging.debug(
+                    # logger.debug(
# f"LanguageMapper detected a potential 2-to-4-letter language code: '{language_item}'"
# )
normalized_str: str | None = self._normalize_string_to_language_code(language_item)
@@ -110,7 +106,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
edge_cases.add(language_item)
if len(language_item) == 4 or len(language_item) > 5:
# natural language edge-cases like "Deutsch", "german", "englisch" are handled here
-                    # logging.debug(
+                    # logger.debug(
# f"LanguageMapper detected a POTENTIALLY INVALID language string: '{language_item}'. "
# f"(String is too long to be a 2- or 4-letter-code). "
# f"Trying to match natural language string to language code..."
@@ -121,7 +117,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
# see: https://github.com/rspeer/langcodes/tree/master#recognizing-language-names-in-natural-language
if langcodes_result:
langcode_detected = langcodes_result.to_tag()
-                        # logging.debug(
+                        # logger.debug(
# f"Detected language code '{langcode_detected}' from string '{language_item}'."
# )
# ToDo - optional: maybe compare distance between 'langcodes' and 'babel' result?
@@ -135,7 +131,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
edge_cases.add(language_item)

if edge_cases:
-            logging.info(
+            logger.info(
f"LanguageMapper could NOT map the following edge-cases to a normalized language code: "
f"{list(edge_cases)}"
)
@@ -150,7 +146,7 @@ def normalize_list_of_language_strings(self) -> list[str] | None:
# sad case: if not a single mapping was possible, our result set is empty
return None
else:
logging.warning(f"LanguageMapper expected list[str] but received unexpected type {type(self.languages)} ")
logger.warning(f"LanguageMapper expected list[str] but received unexpected type {type(self.languages)} ")
return None


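LanguageMapper combines two lookup strategies: locale-style strings such as "de_DE" or "en-US" are parsed directly, while natural-language names such as "Deutsch" or "german" fall through to a name lookup. A simplified sketch of that fallback chain (without the class's regex pre-parsing and deduplication; the function name is illustrative):

import babel
import langcodes

def to_language_code(raw: str) -> str | None:
    """Two-step lookup sketch: babel for locale strings, langcodes for names."""
    raw = raw.strip()
    try:
        # handles "de", "de_DE", "en-US", ...
        return babel.Locale.parse(raw, sep="-" if "-" in raw else "_").language
    except (ValueError, babel.UnknownLocaleError):
        pass
    try:
        # handles natural-language names such as "german" or "Deutsch"
        return langcodes.find(raw).to_tag()
    except LookupError:
        return None

print(to_language_code("de_DE"))    # -> "de"
print(to_language_code("Deutsch"))  # -> "de"
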
5 changes: 1 addition & 4 deletions converter/util/license_mapper.py
@@ -1,9 +1,7 @@
-import logging
import re

from converter.constants import Constants

-logger = logging.getLogger(__name__)
+from loguru import logger


class LicenseMapper:
@@ -18,7 +16,6 @@ class LicenseMapper:
Use this information to set 'internal' to 'CUSTOM' and save the string as a custom license description.
"""

-    logging.basicConfig(level=logging.DEBUG)  # ToDo: remove me after debugging

cc_pattern = re.compile(
r"(?<=c{2}.)(?P<CC_TYPE>by(.[acdns]{2}){0,3})"
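The cc_pattern regex (truncated in this view) pulls the Creative-Commons license type out of free-form license strings. An illustrative, simplified pattern in the same spirit (this is not the mapper's actual regex):

import re

# toy pattern: CC type ("by", "by-sa", "by-nc-nd", ...) plus an optional version
CC_SKETCH = re.compile(
    r"cc[\s_-]?(?P<type>by(?:[\s_-](?:nc|nd|sa)){0,2})[\s_-]?(?P<version>\d\.\d)?",
    re.IGNORECASE,
)

for raw in ("CC BY-SA 4.0", "cc_by_nc_nd", "Creative Commons CC BY 3.0"):
    match = CC_SKETCH.search(raw)
    if match:
        print(match.group("type"), match.group("version"))
# prints: BY-SA 4.0 / by_nc_nd None / BY 3.0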