feat: enable EduSharingTypeValidationPipeline

- the new edu-sharing API client does rigorous type checks before submitting items to the repository, which causes pydantic "ValidationError"s for some metadata properties that haven't been normalized previously
  - example 1: when a crawler collects a set[str] of keywords (to prevent duplicate entries), the pipeline will convert the set[str] to a list[str] before trying to submit it via the REST API
  - example 2: time- or age-related properties (e.g. "typicalLearningTime", "typicalAgeRange") might cause ValidationErrors when the crawler collects these values as integers, but the edu-sharing API expects the value to be wrapped in a string
  - (this pipeline will be expanded over time as more edge-cases arise)
openeduhub · Sep 3, 2024 · bb321bb · bb321bb
1 parent 1f74f23
commit bb321bb
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 0 deletions.
diff --git a/converter/pipelines.py b/converter/pipelines.py
@@ -948,6 +948,43 @@ def process_item(self, raw_item, spider):
                 # raise DropItem()
         return raw_item
 
+class EduSharingTypeValidationPipeline(BasicPipeline):
+    """
+    Rudimentary type-conversion before handing metadata properties off to the API client.
+    Wraps int values of "course_duration", "typicalLearningTime" and "typicalAgeRange" in strings and turns a set
+    of keywords into a list. The item is modified in place and returned; other values are left untouched.
+    """
+    # ToDo: if you notice pydantic "ValidationError"s during crawls, implement handling of those edge-cases here!
+    def process_item(self, item: scrapy.Item, spider: scrapy.Spider) -> Optional[scrapy.Item]:
+        item_adapter = ItemAdapter(item)
+        if "course" in item_adapter:
+            course_item: dict = item_adapter["course"]
+            if "course_duration" in course_item:
+                course_duration: int | str | None = course_item["course_duration"]
+                # 0 must be converted too (hence no truthiness check); bool subclasses int but is left untouched
+                if isinstance(course_duration, int) and not isinstance(course_duration, bool):
+                    course_item["course_duration"] = str(course_duration)
+        if "lom" in item_adapter:
+            if "educational" in item_adapter["lom"]:
+                lom_educational: dict = item_adapter["lom"]["educational"]
+                if "typicalLearningTime" in lom_educational:
+                    typical_learning_time: int | str | None = lom_educational["typicalLearningTime"]
+                    if isinstance(typical_learning_time, int) and not isinstance(typical_learning_time, bool):
+                        lom_educational["typicalLearningTime"] = str(typical_learning_time)
+                if "typicalAgeRange" in lom_educational:
+                    age_range: dict = lom_educational["typicalAgeRange"]
+                    for boundary in ("fromRange", "toRange"):
+                        value: int | str | None = age_range[boundary] if boundary in age_range else None
+                        if isinstance(value, int) and not isinstance(value, bool):  # an age of 0 is a valid boundary
+                            age_range[boundary] = str(value)
+            if "general" in item_adapter["lom"]:
+                lom_general: dict = item_adapter["lom"]["general"]
+                if "keyword" in lom_general:
+                    keywords: list[str] | set[str] | None = lom_general["keyword"]
+                    if isinstance(keywords, set):  # an empty set is converted as well
+                        lom_general["keyword"] = list(keywords)
+        return item
+
 
 class JSONStorePipeline(BasicPipeline, PipelineWithPerSpiderMethods):
     def __init__(self):

diff --git a/converter/settings.py b/converter/settings.py
@@ -133,6 +133,7 @@
     "converter.pipelines.ProcessValuespacePipeline": 250,
     "converter.pipelines.CourseItemPipeline": 275,
     "converter.pipelines.ProcessThumbnailPipeline": 300,
+    "converter.pipelines.EduSharingTypeValidationPipeline": 325,
     (
         "converter.pipelines.DummyPipeline"
         if storeMode == "None"