Skip to content

Commit

Permalink
Merge pull request #118 from openeduhub/feat_oer_pipeline
Browse files Browse the repository at this point in the history
feat: (optional) OER-Filter Pipeline
  • Loading branch information
Criamos authored Nov 19, 2024
2 parents f456d28 + 7f78868 commit 2ac115f
Show file tree
Hide file tree
Showing 6 changed files with 331 additions and 233 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ WORKDIR /

COPY entrypoint.sh entrypoint.sh
COPY edu_sharing_openapi/ edu_sharing_openapi/
COPY pyproject.toml poetry.lock ./
RUN pip3 install poetry
RUN poetry install
COPY pyproject.toml poetry.lock Readme.md ./
COPY scrapy.cfg scrapy.cfg
COPY setup.cfg setup.cfg
COPY converter/ converter/
COPY csv/ csv/
COPY valuespace_converter/ valuespace_converter/
RUN pip3 install poetry
RUN poetry install


ENTRYPOINT ["/entrypoint.sh"]
4 changes: 4 additions & 0 deletions converter/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ PYPPETEER_WS_ENDPOINT="ws://localhost:3000"
# Playwright Integration, as needed for the local container (https://hub.docker.com/r/browserless/chrome#playwright)
PLAYWRIGHT_WS_ENDPOINT="ws://localhost:3000"

# --- OER Filter:
# When set to True, only clearly OER-compatible items are kept (judged by their license.url / license.internal /
# valuespaces.price values); all other items are dropped.
OER_FILTER=False

# --- Thumbnail Pipeline settings:
# Enable / disable the fallback to website-screenshot, if no thumbnail URL was available / reachable
THUMBNAIL_FALLBACK="True" # set to "False" if you want to explicitly disable the fallback via Splash/Playwright
Expand Down
61 changes: 61 additions & 0 deletions converter/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ def process_item(self, raw_item, spider):
| Constants.LICENSE_CC_BY_SA_30
| Constants.LICENSE_CC_BY_SA_40
| Constants.LICENSE_CC_ZERO_10
| Constants.LICENSE_PDM
):
item["license"]["oer"] = OerType.ALL
case _:
Expand Down Expand Up @@ -254,6 +255,66 @@ def process_item(self, raw_item, spider):
return raw_item


class OERFilterPipeline(BasicPipeline):
    """
    Drop items that are not OER-compatible.

    OER compatible licenses are: CC BY, CC BY-SA, CC Zero and Public Domain.
    An item is kept only if its ``license.url`` or ``license.internal`` value matches one of the lists below
    AND its ``valuespaces.price`` value is not ``"yes"``.
    """

    # license URLs (``license.url``) that mark an item as OER-compatible
    OER_COMPATIBLE_LICENSES: list[str] = [
        # CC BY versions
        Constants.LICENSE_CC_BY_10,
        Constants.LICENSE_CC_BY_20,
        Constants.LICENSE_CC_BY_25,
        Constants.LICENSE_CC_BY_30,
        Constants.LICENSE_CC_BY_40,
        # CC BY-SA versions
        Constants.LICENSE_CC_BY_SA_10,
        Constants.LICENSE_CC_BY_SA_20,
        Constants.LICENSE_CC_BY_SA_25,
        Constants.LICENSE_CC_BY_SA_30,
        Constants.LICENSE_CC_BY_SA_40,
        # CC Zero and Public Domain
        Constants.LICENSE_CC_ZERO_10,
        Constants.LICENSE_PDM,
    ]
    # internal license identifiers (``license.internal``) that mark an item as OER-compatible
    OER_COMPATIBLE_INTERNAL_LICENSES: list[str] = [
        "CC_0",
        "CC_BY",
        "CC_BY_SA",
        "PDM",
    ]

    def process_item(self, raw_item: scrapy.Item, spider: scrapy.Spider) -> Optional[scrapy.Item]:
        """
        Checks if an item is OER-compatible by looking at its license values and the price of an item.

        :param raw_item: the ``scrapy.Item`` in question
        :param spider: the ``scrapy.Spider`` which crawled said item
        :return: the unchanged ``scrapy.Item`` if it is OER-compatible
        :raises DropItem: if neither ``license.url`` nor ``license.internal`` is OER-compatible,
            or if ``valuespaces.price`` is ``"yes"``
        """
        item = ItemAdapter(raw_item)
        # The sourceId is only needed for the log / drop messages. Looking it up with ``.get()`` makes sure that
        # an item without a sourceId is still dropped via ``DropItem`` instead of failing with a ``KeyError``
        # while the message is being formatted.
        source_id = item.get("sourceId")

        item_is_oer_compatible: bool = False
        if "license" in item:
            license_item = item["license"]
            # a match on either the license URL or the internal license identifier is sufficient
            if "url" in license_item and license_item["url"] in self.OER_COMPATIBLE_LICENSES:
                item_is_oer_compatible = True
            if "internal" in license_item and license_item["internal"] in self.OER_COMPATIBLE_INTERNAL_LICENSES:
                item_is_oer_compatible = True

        if "valuespaces" in item:
            valuespaces = item["valuespaces"]
            # NOTE(review): the price is compared against the literal string "yes" -- confirm that the value has
            # not been converted (e.g. into a list or vocabulary URI) by an earlier pipeline at this stage.
            if "price" in valuespaces and valuespaces["price"] == "yes":
                # a priced item is never OER-compatible, this overrides any license match from above
                item_is_oer_compatible = False
                log.info(f"Item {source_id} is not OER-compatible due to its price. Dropping item ...")

        if not item_is_oer_compatible:
            raise DropItem(f"Item {source_id} is not OER-compatible due to its license or price. "
                           f"Dropping item...")
        return raw_item

class ConvertTimePipeline(BasicPipeline):
"""
convert typicalLearningTime into an integer representing seconds
Expand Down
11 changes: 11 additions & 0 deletions converter/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,17 @@
else "converter.pipelines.EduSharingStorePipeline"
): 1000,
}
# Optional OER filter: when enabled, only OER-compatible items are processed any further.
# (Caution: items that cannot be clearly identified as OER materials are dropped!)
oer_filter_enabled = env.get_bool("OER_FILTER", allow_null=True, default=False)
if oer_filter_enabled:
    logging.info("OER-Filter Pipeline is ENABLED! Only OER-compatible items will be stored!")
    # priority 295: drop items before they reach the thumbnail pipeline to skip unnecessary HTTP requests
    ITEM_PIPELINES["converter.pipelines.OERFilterPipeline"] = 295

# add custom pipelines from the .env file, if any
ADDITIONAL_PIPELINES = env.get("CUSTOM_PIPELINES", True)
Expand Down
Loading

0 comments on commit 2ac115f

Please sign in to comment.