
Commit 88adb22
Merge pull request #122 from openeduhub/develop
Merge develop into master
Criamos authored Dec 10, 2024
2 parents 63aa77b + 1b3ac42 commit 88adb22
Showing 21 changed files with 2,563 additions and 1,037 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/publish.yaml
@@ -13,14 +13,15 @@ jobs:
build-and-publish:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
submodules: true
- uses: azure/docker-login@v1
with:
username: ${{ github.repository_owner }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- uses: rlespinasse/[email protected]
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4
- name: Build Docker image
run: docker build --tag ${{ github.repository }}:${{ env.GITHUB_REF_SLUG }} .
- name: Publish to DockerHub
6 changes: 3 additions & 3 deletions .github/workflows/python.yaml
@@ -15,16 +15,16 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
python-version: ["3.13"]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Cache pip
uses: actions/cache@v2
uses: actions/cache@v4
with:
# This path is specific to Ubuntu
path: ~/.cache/pip
26 changes: 26 additions & 0 deletions .run/planet_n_spider.run.xml
@@ -0,0 +1,26 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="planet_n_spider" type="PythonConfigurationType" factoryName="Python">
<output_file path="$PROJECT_DIR$/logs/planet_n_spider_console.log" is_save="true" />
<module name="oeh-search-etl" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="./.venv/bin/scrapy" />
<option name="PARAMETERS" value="crawl planet_n_spider -O &quot;../../logs/planet_n_spider.json&quot;" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
</component>
8 changes: 4 additions & 4 deletions Dockerfile
@@ -1,19 +1,19 @@
FROM python:3.12.5-slim-bookworm
FROM python:3.13-slim-bookworm

# ENV CRAWLER wirlernenonline_spider

WORKDIR /

COPY entrypoint.sh entrypoint.sh
COPY edu_sharing_openapi/ edu_sharing_openapi/
COPY pyproject.toml poetry.lock ./
RUN pip3 install poetry
RUN poetry install
COPY pyproject.toml poetry.lock Readme.md ./
COPY scrapy.cfg scrapy.cfg
COPY setup.cfg setup.cfg
COPY converter/ converter/
COPY csv/ csv/
COPY valuespace_converter/ valuespace_converter/
RUN pip3 install poetry
RUN poetry install


ENTRYPOINT ["/entrypoint.sh"]
15 changes: 9 additions & 6 deletions Readme.md
@@ -1,9 +1,9 @@
# Open Edu Hub Search ETL

## Step 1: Project Setup - Python 3.12 (manual approach)
## Step 1: Project Setup Python 3.13 (manual approach)

- make sure you have python3 installed (<https://docs.python-guide.org/starting/installation/>)
- (Python 3.12 or newer is required)
- (Python 3.13 is required)
- go to project root
- Run the following commands:

@@ -22,7 +22,7 @@ python3 -m venv .venv

## Step 1 (alternative): Project Setup - Python (automated, via `poetry`)

- Step 1: Make sure that you have [Poetry](https://python-poetry.org) v1.5.0+ installed
- Step 1: Make sure that you have [Poetry](https://python-poetry.org) [v1.8.4](https://github.com/python-poetry/poetry/releases/tag/1.8.4)+ installed
- for detailed instructions, please consult the [Poetry Installation Guide](https://python-poetry.org/docs/#installation)
- Step 2: Open your terminal **in the project root directory**:
- Step 2.1: If you want to have your `.venv` to be created inside the project root directory:
@@ -31,6 +31,7 @@ python3 -m venv .venv
- Step 3: **Install dependencies** (according to `pyproject.toml`) by running: `poetry install`

## Step 2: Project Setup - required Docker Containers

If you have Docker installed, use `docker-compose up` to start up the multi-container for `Splash` and `Playwright`-integration.

As a last step, set up your config variables by copying the `.env.example`-file and modifying it if necessary:
@@ -40,7 +41,7 @@ As a last step, set up your config variables by copying the `.env.example`-file
# Running crawlers

- A crawler can be run with `scrapy crawl <spider-name>`.
- (It assumes that you have an edu-sharing v6.0+ instance in your `.env` settings configured which can accept the data.)
- (It assumes that you have an edu-sharing v8.1+ instance in your `.env` settings configured which can accept the data.)
- If a crawler has [Scrapy Spider Contracts](https://docs.scrapy.org/en/latest/topics/contracts.html#spiders-contracts) implemented, you can test those by running `scrapy check <spider-name>`
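
For reference, a crawler can also be started programmatically instead of via the CLI. The following is a minimal sketch using Scrapy's `CrawlerProcess`; the spider name `sample_spider` is only an example, and it assumes the script is run from the project root so that the project settings are picked up:

```python
# Minimal sketch: starting a crawler from Python instead of "scrapy crawl <spider-name>".
# Run this from the project root so scrapy.cfg / the project settings are found.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("sample_spider")  # same identifier you would pass to "scrapy crawl"
process.start()                 # blocks until the crawl has finished
```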


@@ -60,8 +61,10 @@ docker compose up

- We use Scrapy as a framework. Please check out the guides for Scrapy spider (https://docs.scrapy.org/en/latest/intro/tutorial.html)
- To create a new spider, create a file inside `converter/spiders/<myname>_spider.py`
- We recommend inheriting the `LomBase` class in order to get out-of-the-box support for our metadata model
- You may also Inherit a Base Class for crawling data, if your site provides LRMI metadata, the `LrmiBase` is a good start. If your system provides an OAI interface, you may use the `OAIBase`
- We recommend inheriting the `LomBase` class to get out-of-the-box support for our metadata model
- You may also inherit a base class (see: `converter/spiders/base_classes/`) for crawling data.
- If your site provides LRMI metadata, the `LrmiBase` is a good start.
- If your system provides an OAI interface, you may use the `OAIBase`
- As a sample/template, please take a look at the `sample_spider.py` or `sample_spider_alternative.py`
- To learn more about the LOM standard we're using, you'll find useful information at https://en.wikipedia.org/wiki/Learning_object_metadata
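
To make the structure above concrete, here is a deliberately small, hypothetical spider skeleton. The import path and the `LomBase` hook names (`getId`, `getHash`) are assumptions based on the description above and should be checked against `converter/spiders/base_classes/` and `sample_spider.py` before use:

```python
# Hypothetical skeleton for converter/spiders/myname_spider.py.
# Check hook names and import paths against LomBase and sample_spider.py;
# this only sketches the structure described above.
import scrapy

from converter.spiders.base_classes import LomBase  # assumed import path


class MyNameSpider(scrapy.Spider, LomBase):
    name = "myname_spider"      # identifier used by "scrapy crawl <spider-name>"
    version = "0.0.1"           # bump whenever the parsing logic changes
    start_urls = ["https://example.org/"]  # placeholder source

    def parse(self, response):
        # delegate to LomBase so the shared metadata model / pipelines are used
        return LomBase.parse(self, response)

    def getId(self, response=None) -> str:
        # stable identifier of the crawled item (assumed LomBase hook)
        return response.url

    def getHash(self, response=None) -> str:
        # change-detection hash, e.g. crawler version combined with the item URL
        return f"{self.version}_{response.url}"
```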

8 changes: 8 additions & 0 deletions converter/.env.example
@@ -23,6 +23,14 @@ PYPPETEER_WS_ENDPOINT="ws://localhost:3000"
# Playwright Integration, as needed for the local container (https://hub.docker.com/r/browserless/chrome#playwright)
PLAYWRIGHT_WS_ENDPOINT="ws://localhost:3000"

# --- OER Filter:
# Parse only clearly OER-compatible items (according to their license.url / license.internal / valuespaces.price value)
OER_FILTER=False

# --- Thumbnail Pipeline settings:
# Enable / disable the fallback to website-screenshot, if no thumbnail URL was available / reachable
THUMBNAIL_FALLBACK="True" # set to "False" if you want to explicitly disable the fallback via Splash/Playwright

# --- Edu-Sharing instance that the crawlers should upload to
EDU_SHARING_BASE_URL="http://localhost:8080/edu-sharing/"
EDU_SHARING_USERNAME="admin"
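
As a hedged illustration of how such flags are typically consumed, the sketch below reads the two new settings through the `get_bool` helper from `converter/env.py` (shown in the next file); the actual call sites in the pipelines may differ:

```python
# Illustrative sketch only - the real pipelines may read these settings differently.
from converter import env

# OER_FILTER: only parse clearly OER-compatible items when explicitly enabled
oer_filter_enabled = env.get_bool("OER_FILTER", default=False)

# THUMBNAIL_FALLBACK: fall back to a website screenshot (Splash/Playwright)
# when no thumbnail URL was available or reachable
thumbnail_fallback_enabled = env.get_bool("THUMBNAIL_FALLBACK", default=True)
```
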
10 changes: 5 additions & 5 deletions converter/env.py
@@ -6,24 +6,24 @@
load_dotenv()


def get(key: str, allow_null: bool = False, default: str = None) -> str:
def get(key: str, allow_null: bool = False, default: str = None) -> str | None:
"""
Get environment variable by key.
Exits on undefined variable unless either `allow_null` or `default` is set.
"""
value = os.getenv(key, default)
if value != None:
if value is not None:
return value
elif allow_null:
return None
else:
_fail_on_missing_key(key)


def get_bool(key: str, allow_null: bool = False, default: bool = None) -> bool:
def get_bool(key: str, allow_null: bool = False, default: bool = None) -> bool | None:
value = os.getenv(key)
if value != None:
if value is not None:
if value.lower() in ["true", "1", "yes"]:
return True
elif value.lower() in ["false", "0", "no"]:
@@ -32,7 +32,7 @@ def get_bool(key: str, allow_null: bool = False, default: bool = None) -> bool:
raise RuntimeError(
"Failed to parse value for boolean variable {}: {}".format(key, value)
)
if default != None:
if default is not None:
return default
elif allow_null:
return None
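
A short usage sketch for the two helpers above; the keys other than `EDU_SHARING_BASE_URL` are examples only:

```python
# Usage sketch for converter/env.py.
from converter import env

# Required value: exits the process if EDU_SHARING_BASE_URL is undefined and no default is given.
base_url = env.get("EDU_SHARING_BASE_URL")

# Optional value with fallback: returns the default instead of failing.
log_level = env.get("LOG_LEVEL", default="INFO")  # example key

# Optional, nullable value: returns None when the variable is undefined.
proxy_url = env.get("HTTP_PROXY", allow_null=True)  # example key

# Boolean parsing: "true"/"1"/"yes" -> True, "false"/"0"/"no" -> False (case-insensitive).
dry_run = env.get_bool("DRY_RUN", default=False)  # example key
```
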
22 changes: 21 additions & 1 deletion converter/es_connector.py
@@ -146,7 +146,12 @@ def sync_node(self, spider, type, properties):
)
except ApiException as e:
# ToDo:
# - error-handling for code 500 ("java.util.concurrent.TimeoutException")
# - find a graceful way to handle http status 500 ("java.util.concurrent.TimeoutException"),
# e.g. when the edu-sharing repository is being restarted during an active crawl process
if e.status == 401:
# if edu-sharing "forgets" the current admin-session, we have to re-init the API client
self.init_api_client()
return None
try:
json_error: dict = json.loads(e.body)
if json_error["error"] == "java.lang.IllegalStateException":
@@ -578,6 +583,21 @@ def transform_item(self, uuid, spider, item):
spaces["ccm:educationaltypicalagerange_from"] = tar["fromRange"]
if "toRange" in tar:
spaces["ccm:educationaltypicalagerange_to"] = tar["toRange"]
if "typicalLearningTime" in item["lom"]["educational"]:
tlt: int | str | None = item["lom"]["educational"]["typicalLearningTime"]
if (
tlt and isinstance(tlt,str) and tlt.isnumeric()
or tlt and isinstance(tlt, int)
):
tlt_in_ms: int = int(tlt) * 1000
spaces["cclom:typicallearningtime"] = tlt_in_ms

if "ai_allow_usage" in item:
# this property is automatically filled by the RobotsTxtPipeline
if isinstance(item["ai_allow_usage"], bool):
_ai_allow_usage: bool = item["ai_allow_usage"]
# the edu-sharing API client expects the value to be of type string
spaces["ccm:ai_allow_usage"] = str(_ai_allow_usage)

if "course" in item:
if "course_availability_from" in item["course"]:
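
The 401 handling in `sync_node` above re-creates the API client once edu-sharing has dropped the admin session. A generic retry-once pattern around such calls could look roughly like the sketch below; `ApiException`'s import path and the helper name `call_with_reauth` are assumptions, and the existing `EduSharing` methods may already handle the retry themselves:

```python
# Hedged sketch of a retry-once wrapper mirroring the 401 handling shown above.
from edu_sharing_client.rest import ApiException  # assumed import path of the generated client


def call_with_reauth(es_connector, api_call, *args, **kwargs):
    """Run an edu-sharing API call; on HTTP 401, rebuild the client session and retry once."""
    try:
        return api_call(*args, **kwargs)
    except ApiException as e:
        if e.status == 401:
            # edu-sharing "forgot" the admin session -> re-init the API client and retry
            es_connector.init_api_client()
            return api_call(*args, **kwargs)
        raise
```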