Skip to content

Commit

Permalink
feat: enable EduSharingTypeValidationPipeline
Browse files Browse the repository at this point in the history
- the new edu-sharing API client does rigorous Type-Checks before submitting items to the repository, which causes pydantic "ValidationError"s for some metadata properties which haven't been normalized previously
  - example 1: when a crawler collects a set[str] of keywords (to prevent duplicate entries), the pipeline will convert the set[str] to a list[str] before trying to submit it via the REST API
  - example 2: time- or age-related properties (e.g. "typicalLearningTime", "typicalAgeRange") might cause ValidationErrors when the crawler collects these values as Integers, but the edu-sharing API expects the value to be wrapped in a string
- (this pipeline will be expanded over time as more edge-cases arise)
  • Loading branch information
Criamos committed Sep 3, 2024
1 parent 1f74f23 commit bb321bb
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
37 changes: 37 additions & 0 deletions converter/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,43 @@ def process_item(self, raw_item, spider):
# raise DropItem()
return raw_item

class EduSharingTypeValidationPipeline(BasicPipeline):
    """
    Rudimentary type-conversion before handing metadata properties off to the API client.

    The following properties are normalized in place on the item:

    - ``course.course_duration``: ``int`` -> ``str``
    - ``lom.educational.typicalLearningTime``: ``int`` -> ``str``
    - ``lom.educational.typicalAgeRange.fromRange`` / ``.toRange``: ``int`` -> ``str``
    - ``lom.general.keyword``: ``set`` -> ``list``

    Properties that are absent, ``None`` or already of another type are left untouched.
    """

    # ToDo: if you notice pydantic "ValidationError"s during crawls, implement handling of those edge-cases here!

    @staticmethod
    def _get_nested(container, key: str):
        """Return ``container[key]``, or ``None`` if ``container`` is ``None`` or does not contain ``key``."""
        if container is None or key not in container:
            return None
        return container[key]

    @staticmethod
    def _stringify_int(container, key: str) -> None:
        """Replace an integer stored under ``key`` in ``container`` with its string form (in place)."""
        if container is None or key not in container:
            return
        value = container[key]
        # Zero is a legitimate value (e.g. an age range starting at 0), so the check is on the type only,
        # not on truthiness. ``bool`` is a subclass of ``int`` but is deliberately NOT stringified:
        # a stray True/False should not silently become the string "True"/"False".
        if isinstance(value, int) and not isinstance(value, bool):
            container[key] = str(value)

    def process_item(self, item: scrapy.Item, spider: scrapy.Spider) -> Optional[scrapy.Item]:
        """
        Normalize the types of selected metadata properties and return the (mutated) item.

        :param item: the item travelling through the pipeline; its nested containers are modified in place
        :param spider: the spider which produced the item (unused)
        :return: the same ``item`` object that was passed in
        """
        item_adapter = ItemAdapter(item)

        course_item = self._get_nested(item_adapter, "course")
        self._stringify_int(course_item, "course_duration")

        lom = self._get_nested(item_adapter, "lom")

        lom_educational = self._get_nested(lom, "educational")
        self._stringify_int(lom_educational, "typicalLearningTime")
        typical_age_range = self._get_nested(lom_educational, "typicalAgeRange")
        self._stringify_int(typical_age_range, "fromRange")
        self._stringify_int(typical_age_range, "toRange")

        lom_general = self._get_nested(lom, "general")
        keywords = self._get_nested(lom_general, "keyword")
        # An empty set is converted as well: it is just as much the wrong container type as a filled one.
        if isinstance(keywords, set):
            lom_general["keyword"] = list(keywords)

        return item


class JSONStorePipeline(BasicPipeline, PipelineWithPerSpiderMethods):
def __init__(self):
Expand Down
1 change: 1 addition & 0 deletions converter/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@
"converter.pipelines.ProcessValuespacePipeline": 250,
"converter.pipelines.CourseItemPipeline": 275,
"converter.pipelines.ProcessThumbnailPipeline": 300,
"converter.pipelines.EduSharingTypeValidationPipeline": 325,
(
"converter.pipelines.DummyPipeline"
if storeMode == "None"
Expand Down

0 comments on commit bb321bb

Please sign in to comment.