Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix backend problems #449

Merged
merged 10 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ci:
skip: [eslint]
skip: [eslint, pyright]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
Expand All @@ -17,6 +17,12 @@ repos:
args: [--fix]
# Run the formatter.
- id: ruff-format
- repo: local
hooks:
- id: pyright
name: "Pyright"
entry: bash -c 'ENV_NAME=dats source backend/_activate_current_env.sh && pyright'
language: system
- repo: https://github.com/pre-commit/mirrors-eslint
rev: v9.11.0
hooks:
Expand Down
1 change: 1 addition & 0 deletions backend/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies:
- pre-commit=3.3.3
- psycopg2-binary=2.9
- pydantic=2.5.3
- pyright=1.1.385
- pytest=7.4.3
- python-jose=3.3
- python-magic=0.4
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""memo updated and created columns are now non-nullable
Revision ID: 7393d4227260
Revises: b2e6991379f5
Create Date: 2024-10-17 20:14:45.219331

"""

from typing import Sequence, Union

from sqlalchemy.dialects import postgresql

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "7393d4227260"
down_revision: Union[str, None] = "b2e6991379f5"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Make memo.created and memo.updated NOT NULL.

    Only the nullability changes; the existing TIMESTAMP type and the
    "now()" server default are left untouched.
    """
    # Same order as the auto-generated migration: created, then updated.
    for column_name in ("created", "updated"):
        op.alter_column(
            "memo",
            column_name,
            existing_type=postgresql.TIMESTAMP(),
            nullable=False,
            existing_server_default="now()",
        )


def downgrade() -> None:
    """Revert: allow NULLs again in memo.updated and memo.created."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Columns are reverted in the reverse order of upgrade().
    for column_name in ("updated", "created"):
        op.alter_column(
            "memo",
            column_name,
            existing_type=postgresql.TIMESTAMP(),
            nullable=True,
            existing_server_default="now()",
        )
    # ### end Alembic commands ###
47 changes: 39 additions & 8 deletions backend/src/app/celery/background_jobs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from pathlib import Path
from typing import Any, List

from celery import Task

from app.core.data.crawler.crawler_service import CrawlerService
from app.core.data.dto.crawler_job import CrawlerJobParameters, CrawlerJobRead
from app.core.data.dto.export_job import ExportJobParameters, ExportJobRead
Expand All @@ -15,6 +17,8 @@ def start_cota_refinement_job_async(
) -> None:
from app.celery.background_jobs.tasks import start_cota_refinement_job_task

assert isinstance(start_cota_refinement_job_task, Task), "Not a Celery Task"

start_cota_refinement_job_task.apply_async(kwargs={"cota_job_id": cota_job_id})


Expand All @@ -23,6 +27,8 @@ def start_trainer_job_async(
) -> None:
from app.celery.background_jobs.tasks import start_trainer_job_task

assert isinstance(start_trainer_job_task, Task), "Not a Celery Task"

start_trainer_job_task.apply_async(kwargs={"trainer_job_id": trainer_job_id})


Expand All @@ -31,6 +37,8 @@ def import_uploaded_archive_apply_async(
) -> Any:
from app.celery.background_jobs.tasks import import_uploaded_archive

assert isinstance(import_uploaded_archive, Task), "Not a Celery Task"

return import_uploaded_archive.apply_async(
kwargs={"archive_file_path_and_project_id": (archive_file_path, project_id)},
)
Expand All @@ -41,6 +49,8 @@ def prepare_and_start_export_job_async(
) -> ExportJobRead:
from app.celery.background_jobs.tasks import start_export_job

assert isinstance(start_export_job, Task), "Not a Celery Task"

exs: ExportService = ExportService()
ex_job = exs.prepare_export_job(export_params)
start_export_job.apply_async(kwargs={"export_job": ex_job})
Expand All @@ -55,16 +65,19 @@ def prepare_and_start_crawling_job_async(
start_crawler_job,
)

assert isinstance(start_crawler_job, Task), "Not a Celery Task"
assert isinstance(import_uploaded_archive, Task), "Not a Celery Task"

cs: CrawlerService = CrawlerService()
cj = cs.prepare_crawler_job(crawler_params)
start_export_job_chain = (
# crawl the data via scrapy and zip the data
start_crawler_job.signature(kwargs={"crawler_job": cj})
|
# import the zip
# TODO create a PPJ for the import
import_uploaded_archive.signature()
)

job1 = start_crawler_job.signature(kwargs={"crawler_job": cj})
job2 = import_uploaded_archive.signature()

assert job1 is not None, "Job 1 is None"
assert job2 is not None, "Job 2 is None"

start_export_job_chain = job1 | job2
start_export_job_chain.apply_async()

return cj
Expand All @@ -75,6 +88,8 @@ def prepare_and_start_llm_job_async(
) -> LLMJobRead:
from app.celery.background_jobs.tasks import start_llm_job

assert isinstance(start_llm_job, Task), "Not a Celery Task"

llms: LLMService = LLMService()
llm_job = llms.prepare_llm_job(llm_job_params)
start_llm_job.apply_async(kwargs={"llm_job": llm_job})
Expand All @@ -88,6 +103,10 @@ def execute_text_preprocessing_pipeline_apply_async(
execute_text_preprocessing_pipeline_task,
)

assert isinstance(
execute_text_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_text_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})

Expand All @@ -99,6 +118,10 @@ def execute_image_preprocessing_pipeline_apply_async(
execute_image_preprocessing_pipeline_task,
)

assert isinstance(
execute_image_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_image_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})

Expand All @@ -110,6 +133,10 @@ def execute_audio_preprocessing_pipeline_apply_async(
execute_audio_preprocessing_pipeline_task,
)

assert isinstance(
execute_audio_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_audio_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})

Expand All @@ -121,5 +148,9 @@ def execute_video_preprocessing_pipeline_apply_async(
execute_video_preprocessing_pipeline_task,
)

assert isinstance(
execute_video_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_video_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
4 changes: 2 additions & 2 deletions backend/src/app/core/analysis/analysis_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,14 +361,14 @@ def sample_sdocs_by_tags(
sample_fixed = (
df.groupby(by=list(groups))
.sample(n=min(n, min_count))
.groupby(by=list(groups))["sdoc"]
.groupby(by=list(groups))["sdoc"] # type: ignore
.apply(list)
.to_dict()
)
sample_relative = (
df.groupby(by=list(groups))
.sample(frac=frac)
.groupby(by=list(groups))["sdoc"]
.groupby(by=list(groups))["sdoc"] # type: ignore
.apply(list)
.to_dict()
)
Expand Down
2 changes: 1 addition & 1 deletion backend/src/app/core/analysis/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def timeline_analysis(

query = db.query(
sdoc_ids_agg,
*group_by.apply(subquery.c[1]), # type: ignore
*group_by.apply(subquery.c["date"]), # type: ignore
).join(subquery, SourceDocumentORM.id == subquery.c.id)

query = apply_filtering(
Expand Down
7 changes: 0 additions & 7 deletions backend/src/app/core/data/repo/repo_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,6 @@ def __init__(self, dst_path: Path):
)


class UnsupportedDocTypeForMimeType(Exception):
    """Signals that a MIME type does not map to any supported DocType."""

    def __init__(self, mime_type: str) -> None:
        super().__init__(
            "Unsupported DocType! "
            f"Cannot infer DocType from MimeType '{mime_type}'."
        )


class ErroneousArchiveException(Exception):
def __init__(self, archive_path: Path):
super().__init__(f"Error with Archive {archive_path}")
Expand Down
3 changes: 1 addition & 2 deletions backend/src/app/core/filters/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NumberOperator,
StringOperator,
)
from app.core.filters.types import FilterValue


class LogicalOperator(str, Enum):
Expand All @@ -36,8 +37,6 @@ def get_sqlalchemy_operator(self):

T = TypeVar("T", bound=AbstractColumns)

FilterValue = Union[bool, str, int, List[str], List[List[str]]]


class FilterExpression(BaseModel, Generic[T]):
id: str
Expand Down
2 changes: 1 addition & 1 deletion backend/src/app/core/filters/filtering_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from sqlalchemy import not_
from sqlalchemy.orm import QueryableAttribute

from app.core.filters.filtering import FilterValue
from app.core.filters.types import FilterValue


class FilterValueType(Enum):
Expand Down
3 changes: 3 additions & 0 deletions backend/src/app/core/filters/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from typing import List, Union

# Central alias for the values a filter expression can carry: scalars
# (bool/str/int), a list of strings, or a list of string-lists.
# NOTE(review): kept in its own module and imported by both filtering.py
# and filtering_operators.py — presumably to avoid an import cycle between
# those two modules; confirm against their import graph.
FilterValue = Union[bool, str, int, List[str], List[List[str]]]
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import re
from html.parser import HTMLParser
from itertools import accumulate
from typing import Dict, List, Union
from typing import List, Optional, TypedDict

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc


# A text fragment extracted from HTML together with its character offsets
# in the source document. Functional TypedDict form — equivalent to the
# class-based declaration for both runtime and type-checking purposes.
# NOTE(review): whether "end" is inclusive or exclusive is not visible
# here — confirm against the parser's text_end() logic.
Text = TypedDict("Text", {"text": str, "start": int, "end": int})


class CustomLineHTMLParser(HTMLParser):
result: List[Dict[str, Union[str, int]]]
result: List[Text]

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -23,7 +29,7 @@ def current_index(self):
line, char = self.getpos()
return self.line_lengths[line - 1] + char

def __call__(self, data: str) -> List[Dict[str, Union[str, int]]]:
def __call__(self, data: str) -> List[Text]:
self.reset()
self.line_lengths = [0] + list(
accumulate(len(line) for line in data.splitlines(keepends=True))
Expand All @@ -37,21 +43,13 @@ class HTMLTextMapper(CustomLineHTMLParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.result = []
self.text = {
"text": "",
"start": 0,
"end": 0,
}
self.text: Optional[Text] = None
self.end_spaces = 0

def reset(self):
super().reset()
self.result = []
self.text = {
"text": "",
"start": 0,
"end": 0,
}
self.text = None

def handle_data(self, data: str):
# only add text if it is not only whitespaces!
Expand All @@ -68,6 +66,7 @@ def handle_data(self, data: str):
self.text = {
"text": data.strip(),
"start": self.current_index + start_spaces,
"end": -1,
}

def handle_starttag(self, tag, attrs):
Expand All @@ -80,14 +79,11 @@ def handle_comment(self, data):
self.text_end()

def text_end(self):
self.text["end"] = self.current_index - self.end_spaces
self.result.append(self.text)
self.text = {
"text": "",
"start": 0,
"end": 0,
}
self.end_spaces = 0
if self.text:
self.text["end"] = self.current_index - self.end_spaces
self.result.append(self.text)
self.text = None
self.end_spaces = 0

def close(self):
super().close()
Expand Down
8 changes: 7 additions & 1 deletion backend/src/app/preprocessing/preprocessing_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from app.core.data.repo.repo_service import (
FileNotFoundInRepositoryError,
RepoService,
UnsupportedDocTypeForMimeType,
UnsupportedDocTypeForSourceDocument,
)
from app.core.db.sql_service import SQLService
Expand All @@ -40,6 +39,13 @@
from app.util.singleton_meta import SingletonMeta


class UnsupportedDocTypeForMimeType(Exception):
    """Raised when no DocType can be inferred from a file's MIME type."""

    def __init__(self, mime_type: str) -> None:
        message = (
            f"Unsupported DocType! Cannot infer DocType from MimeType '{mime_type}'."
        )
        super().__init__(message)


class PreprocessingService(metaclass=SingletonMeta):
def __new__(cls, *args, **kwargs):
cls.sqls: SQLService = SQLService()
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/api/openapi/models/ExportJobParameters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export type ExportJobParameters = {
/**
* The format of the exported data.
*/
export_format?: ExportFormat | null;
export_format?: ExportFormat;
/**
* Specific parameters for the export job w.r.t it's type
*/
Expand Down
Loading
Loading